数据库代码公开:单基因泛癌表达量差异分析

作者:半步博导 时间:2024年6月29日 19:51 阅读量:142


#示例基因为MKI67
geneName = "MKI67"
#读取表达数据
data = read.table(paste0("Expression.",geneName,".zscore.txt"),sep="\t",header=T,check.names=F)   #读取箱线图输入文件
#定义分析差异的肿瘤类型
tumors = c("ESCA","STAD","COAD","READ",
            "PAAD","LIHC",
            "LUAD","LUSC",
            "THCA","BRCA","PRAD",
            "BLCA","KIRC","KIRP","KICH",
            "CESC","UCEC",
            "HNSC","GBM","PCPG","SARC")
#提取已定义肿瘤类型的数据
mydata1 = data[(data[,"CancerType"] %in% tumors),] #基于data文件中Type列中为Tumor的行被提取出来
#过滤zscore小于-3或大于3的离群值样本
mydata1$expr = mydata1[,geneName]
mydata1 = mydata1[mydata1$expr > -3,]
mydata1 = mydata1[mydata1$expr < 3,]
#最终纳入分析的肿瘤应该有3个及以上的正常组
tumors2 = c()
for (variable in unique(mydata1$CancerType)) {
  sub_mydata1 = mydata1[mydata1$CancerType == variable,]
  sub_mydata1_normal = sub_mydata1[sub_mydata1$Type == "Normal",]
  if(length(unique(sub_mydata1$Type)) == 2 & nrow(sub_mydata1_normal) >= 3){
    tumors2 = c(tumors2,variable)
  }
}
#提取参与差异分析的肿瘤类型的数据
mydata1 <- mydata1[(mydata1[,"CancerType"] %in% tumors2),]
colnames(mydata1)[1] <- "Sample"
# 计算p value,此处采用wilcox.test
pvalues <- sapply(mydata1$CancerType, function(x){
  res <- wilcox.test(as.numeric(expr) ~ Type, 
                     data = subset(mydata1, 
                                   CancerType == x)
                     ) 
  res$p.value
})
pv <- data.frame(gene = mydata1$CancerType, 
                 pvalue = pvalues)
#将p值转化为星号
pv$sigcode <- cut(pv$pvalue, 
                  c(0,0.0001, 0.001, 0.01, 0.05, 1),
                  labels=c('****','***', '**', '*', ''))