数据库代码公开:单基因泛癌表达量差异分析
#示例基因为MKI67
geneName = "MKI67"
#读取表达数据
data = read.table(paste0("Expression.",geneName,".zscore.txt"),sep="\t",header=T,check.names=F) #读取箱线图输入文件
#定义分析差异的肿瘤类型
tumors = c("ESCA","STAD","COAD","READ",
"PAAD","LIHC",
"LUAD","LUSC",
"THCA","BRCA","PRAD",
"BLCA","KIRC","KIRP","KICH",
"CESC","UCEC",
"HNSC","GBM","PCPG","SARC")
#提取已定义肿瘤类型的数据
mydata1 = data[(data[,"CancerType"] %in% tumors),] #基于data文件中Type列中为Tumor的行被提取出来
#过滤zscore小于-3或大于3的离群值样本
mydata1$expr = mydata1[,geneName]
mydata1 = mydata1[mydata1$expr > -3,]
mydata1 = mydata1[mydata1$expr < 3,]
#最终纳入分析的肿瘤应该有3个及以上的正常组
tumors2 = c()
for (variable in unique(mydata1$CancerType)) {
sub_mydata1 = mydata1[mydata1$CancerType == variable,]
sub_mydata1_normal = sub_mydata1[sub_mydata1$Type == "Normal",]
if(length(unique(sub_mydata1$Type)) == 2 & nrow(sub_mydata1_normal) >= 3){
tumors2 = c(tumors2,variable)
}
}
#提取参与差异分析的肿瘤类型的数据
mydata1 <- mydata1[(mydata1[,"CancerType"] %in% tumors2),]
colnames(mydata1)[1] <- "Sample"
# 计算p value,此处采用wilcox.test
pvalues <- sapply(mydata1$CancerType, function(x){
res <- wilcox.test(as.numeric(expr) ~ Type,
data = subset(mydata1,
CancerType == x)
)
res$p.value
})
pv <- data.frame(gene = mydata1$CancerType,
pvalue = pvalues)
#将p值转化为星号
pv$sigcode <- cut(pv$pvalue,
c(0,0.0001, 0.001, 0.01, 0.05, 1),
labels=c('****','***', '**', '*', ''))