What if starts with DESeq2 normlized matrix?
因为没拿到raw counts,拿到的是DESeq2 normlized matrix,为了有谱,拿airway数据用DESeq2处理两次,看下结果,比较一下是不是可行!
可行性以及解释,各位看官,往下看;
纯代码:
Step1、2数据处理和两次差异分析
rm(list = ls())
options(stringsAsFactors = F)
###matrix和phenodata提取
library(airway)
data(airway)
exprSet <- assay(airway)
group_list <- colData(airway)$dex
exprSet <- exprSet[apply(exprSet,1, function(x) sum(x>1) > 5),]
###Round1原始数据
suppressMessages(library(DESeq2))
colData <- data.frame(row.names=colnames(exprSet),group_list=group_list)
dds <- DESeqDataSetFromMatrix(countData = exprSet,colData = colData,
design = ~ group_list)
dds2 <- DESeq(dds)
res <- results(dds2,contrast=c("group_list","trt","untrt"))
resOrdered <- res[order(res$padj),]
DEG <- as.data.frame(resOrdered)
DESeq_DEG = na.omit(DEG)
nrDEG=DESeq2_DEG[,c(2,6)]
colnames(nrDEG)=c('log2FoldChange','pvalue')
save(nrDEG, file = "nrDEG.Rdata")
###Round2-Normalize后的counts
exprSet <- floor(counts(dds2,normalize=T))
dds <- DESeqDataSetFromMatrix(countData = exprSet,colData = colData,
design = ~ group_list)
dds2 <- DESeq(dds)
res <- results(dds2,contrast=c("group_list","trt","untrt"))
resOrdered <- res[order(res$padj),]
DEG <- as.data.frame(resOrdered)
DESeq2_DEG = na.omit(DEG)
nrDEG2=DESeq2_DEG[,c(2,6)]
colnames(nrDEG)=c('log2FoldChange','pvalue')
save(nrDEG2,file='DEG_nor_deg.Rdata')
Step3:差异分析和注释有否区别:
rm(list=ls())
load('nrDEG.Rdata')
load('DEG_nor_deg.Rdata')
#####构建提取差异基因的ENSEMBL函数
ENS_extract<- function(nrDEG){
ENS<- rownames(nrDEG)[abs(nrDEG$log2FoldChange)>1.5&nrDEG$pvalue<0.05]
return(ENS)
}
####ENSEMBL的提取
nrDEG_ens <- ENS_extract(nrDEG)
nrDEG2_ens <- ENS_extract(nrDEG2)
############
gene_inter<- intersect(rownames(nrDEG),rownames(nrDEG2))
nrDEG_inter<- nrDEG[gene_inter,]
nrDEG2_inter<- nrDEG2[gene_inter,]
####排序会有变化,看看富集结果会不会有变化?
tmp<- nrDEG_inter-nrDEG2_inter
#####GSEA
library(org.Hs.eg.db)
library(clusterProfiler)
####构建富集分析的函数
result_extract<- function(nrDEG){
gene <- bitr(rownames(nrDEG), fromType = "ENSEMBL",
toType = "ENTREZID",
OrgDb = org.Hs.eg.db)
gene$logfc <- nrDEG$log2FoldChange[match(gene$ENSEMBL,rownames(nrDEG))]
geneList=nrDEG$log2FoldChange
names(geneList)=gene$ENTREZID
geneList=sort(geneList,decreasing = T)
head(geneList)
library(clusterProfiler)
kk_gse <- gseKEGG(geneList = geneList,
organism = 'hsa',
nPerm = 1000,
minGSSize = 10,
pvalueCutoff = 0.9,
verbose = FALSE)
kk_gse@result<- kk_gse@result[kk_gse@result$pvalue<0.05,]
return(kk_gse@result)}
#####执行富集分析
deg_1<- result_extract(nrDEG)
deg_2<- result_extract(nrDEG2)
#####维恩图直观看结果
library(VennDiagram)
library(grid)
data <- list(
deg_1=as.character(deg_1$Description),
deg_2=as.character(deg_2$Description)
)
data1 <- list(
deg_ens_1=as.character(nrDEG_ens),
deg_ens_2=as.character(nrDEG2_ens)
)
ven <- venn.diagram(data,filename = NULL,fill=c('red','yellow'))
ven1 <- venn.diagram(data1,filename = NULL,fill=c('red','yellow'))
grid.newpage()
grid.draw(ven)
grid.newpage()
grid.draw(ven1)
下面这句是总结:
从维恩图看出,差异分析的结果其实没多大影响;但对logfc等带来的差异,对GSEA是有一定影响的;
不懂没关系的续集:
从Sizefactor看看究竟normalize做了什么?
看着DESeq2的normalize和raw差异还是有的;所以究竟对于其他数据是什么情形,我觉得是难说的;
> estimateSizeFactorsForMatrix(counts(dds2))
SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516 SRR1039517 SRR1039520
1.0236444 0.8971063 1.1791977 0.6704799 1.1768762 1.3985387 0.9207209
SRR1039521
0.9448575
> estimateSizeFactorsForMatrix(counts(dds2))
SRR1039508 SRR1039509 SRR1039512 SRR1039513 SRR1039516 SRR1039517 SRR1039520
1.0014312 1.0044482 1.0070364 0.9989561 1.0065358 1.0067121 1.0053661
SRR1039521
1.0055917