What if starts with DESeq2 normlized matrix?

  • 因为没拿到raw counts,拿到的是DESeq2 normlized matrix,为了有谱,拿airway数据用DESeq2处理两次,看下结果,比较一下是不是可行!

  • 可行性以及解释,各位看官,往下看;

纯代码:

Step1、2数据处理和两次差异分析

rm(list = ls())  
options(stringsAsFactors = F)
###matrix和phenodata提取
library(airway)
data(airway)
exprSet <- assay(airway)
group_list <- colData(airway)$dex
exprSet <- exprSet[apply(exprSet,1, function(x) sum(x>1) > 5),]
###Round1原始数据
suppressMessages(library(DESeq2))
colData <- data.frame(row.names=colnames(exprSet),group_list=group_list)
dds <- DESeqDataSetFromMatrix(countData = exprSet,colData = colData,
                              design = ~ group_list)
dds2 <- DESeq(dds)
res <- results(dds2,contrast=c("group_list","trt","untrt"))
resOrdered <- res[order(res$padj),]
DEG <- as.data.frame(resOrdered)
DESeq_DEG = na.omit(DEG)
nrDEG=DESeq2_DEG[,c(2,6)]
colnames(nrDEG)=c('log2FoldChange','pvalue')  
save(nrDEG, file = "nrDEG.Rdata")
###Round2-Normalize后的counts
exprSet <- floor(counts(dds2,normalize=T))
dds <- DESeqDataSetFromMatrix(countData = exprSet,colData = colData,
                              design = ~ group_list)
dds2 <- DESeq(dds)
res <- results(dds2,contrast=c("group_list","trt","untrt"))
resOrdered <- res[order(res$padj),]
DEG <- as.data.frame(resOrdered)
DESeq2_DEG = na.omit(DEG)
nrDEG2=DESeq2_DEG[,c(2,6)]
colnames(nrDEG)=c('log2FoldChange','pvalue')  
save(nrDEG2,file='DEG_nor_deg.Rdata')

Step3:差异分析和注释有否区别:

rm(list=ls())
load('nrDEG.Rdata')
load('DEG_nor_deg.Rdata')
#####构建提取差异基因的ENSEMBL函数
ENS_extract<- function(nrDEG){
  ENS<- rownames(nrDEG)[abs(nrDEG$log2FoldChange)>1.5&nrDEG$pvalue<0.05]
  return(ENS)
}
####ENSEMBL的提取
nrDEG_ens <- ENS_extract(nrDEG)
nrDEG2_ens <- ENS_extract(nrDEG2)
############
gene_inter<- intersect(rownames(nrDEG),rownames(nrDEG2))
nrDEG_inter<- nrDEG[gene_inter,]
nrDEG2_inter<- nrDEG2[gene_inter,]
####排序会有变化,看看富集结果会不会有变化?
tmp<- nrDEG_inter-nrDEG2_inter
#####GSEA
library(org.Hs.eg.db)
library(clusterProfiler)
####构建富集分析的函数
result_extract<- function(nrDEG){
gene <- bitr(rownames(nrDEG), fromType = "ENSEMBL",
             toType =  "ENTREZID",
             OrgDb = org.Hs.eg.db)
gene$logfc <- nrDEG$log2FoldChange[match(gene$ENSEMBL,rownames(nrDEG))]
geneList=nrDEG$log2FoldChange
names(geneList)=gene$ENTREZID
geneList=sort(geneList,decreasing = T)
head(geneList)
library(clusterProfiler)
kk_gse <- gseKEGG(geneList     = geneList,
                  organism     = 'hsa',
                  nPerm        = 1000,
                  minGSSize    = 10,
                  pvalueCutoff = 0.9, 
                  verbose      = FALSE)
kk_gse@result<- kk_gse@result[kk_gse@result$pvalue<0.05,]
return(kk_gse@result)}
#####执行富集分析
deg_1<- result_extract(nrDEG)
deg_2<- result_extract(nrDEG2)
#####维恩图直观看结果
library(VennDiagram)
library(grid)
data <- list(
  deg_1=as.character(deg_1$Description),
  deg_2=as.character(deg_2$Description)
)
data1 <- list(
  deg_ens_1=as.character(nrDEG_ens),
  deg_ens_2=as.character(nrDEG2_ens)
)
ven <- venn.diagram(data,filename = NULL,fill=c('red','yellow'))
ven1 <- venn.diagram(data1,filename = NULL,fill=c('red','yellow'))
grid.newpage()
grid.draw(ven)
grid.newpage()
grid.draw(ven1)

DEG_venn
GSEA_venn
下面这句是总结:
  • 从维恩图看出,差异分析的结果其实没多大影响;但对logfc等带来的差异,对GSEA是有一定影响的;

不懂没关系的续集:
  • 从Sizefactor看看究竟normalize做了什么?

  • 看着DESeq2的normalize和raw差异还是有的;所以究竟对于其他数据是什么情形,我觉得是难说的;

> estimateSizeFactorsForMatrix(counts(dds2))
SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516 SRR1039517 SRR1039520 
 1.0236444  0.8971063  1.1791977  0.6704799  1.1768762  1.3985387  0.9207209 
SRR1039521 
 0.9448575 
> estimateSizeFactorsForMatrix(counts(dds2))
SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516 SRR1039517 SRR1039520 
 1.0014312  1.0044482  1.0070364  0.9989561  1.0065358  1.0067121  1.0053661 
SRR1039521 
 1.0055917 

(0)

相关推荐