GEO实操|limma分析差异基因 / 开普饭

欢迎来到医科研，这里是白介素2的读书笔记，跟我一起聊临床与科研的故事, 生物医学数据挖掘，R语言，TCGA、GEO数据挖掘。

---limma分析差异基因---

在经过了前两期中的数据下载，数据基本处理之后，解决了一个探针对应多个基因数的以及多个探针对应一个基因求平均值，在此基础上运用limma包分析差异基因；
除此以外，包括绘制火山图，热图，PCA等，都在本文中解决

数据载入

1if(T){ 2Sys.setlocale('LC_ALL','C') 3library(dplyr) 4## 5if(T){ 6load("expma.Rdata") 7load("probe.Rdata") 8} 9expma[1:5,1:5] 10boxplot(expma)##看下表达情况 11metdata[1:5,1:5] 12head(probe) 13 14## 查看Gene Symbol是否有重复 15table(duplicated(probe$`Gene Symbol`))##12549 FALSE 16 17## 整合注释信息到表达矩阵 18ID<-rownames(expma) 19expma<-apply(expma,1,function(x){log2(x+1)}) 20expma<-t(expma) 21eset<-expma[ID %in% probe$ID,] %>% cbind(probe) 22eset[1:5,1:5] 23colnames(eset) 24}

1## [1] "GSM188013" "GSM188014" "GSM188016" "GSM188018" 2## [5] "GSM188020" "GSM188022" "ID" "Gene Symbol" 3## [9] "ENTREZ_GENE_ID"

多个探针求平均值

1test1<-aggregate(x=eset[,1:6],by=list(eset$`Gene Symbol`),FUN=mean,na.rm=T) 2test1[1:5,1:5]##与去重结果相吻合 3## Group.1 GSM188013 GSM188014 GSM188016 GSM188018 4## 1 8.438846 8.368513 7.322442 7.813573 5## 2 A1CF 10.979025 10.616926 9.940773 10.413311 6## 3 A2M 6.565276 6.422112 8.142194 5.652593 7## 4 A4GALT 7.728628 7.818966 8.679885 7.048563 8## 5 A4GNT 10.243388 10.182382 9.391991 8.779887 9dim(test1)## 10## [1] 12549 7 11colnames(test1) 12## [1] "Group.1" "GSM188013" "GSM188014" "GSM188016" "GSM188018" "GSM188020" 13## [7] "GSM188022" 14rownames(test1)<-test1$Group.1 15test1<-test1[,-1] 16eset_dat<-test1

PCA plot

Principal Component Analysis (PCA)分析使用的是基于R语言的 prcomp() and princomp()函数.
完成PCA分析一般有两种方法：princomp()使用的是一种的spectral decomposition方法
prcomp() and PCA()[FactoMineR]使用的是SVD法

1data<-eset_dat 2data[1:5,1:5]##表达矩阵 3## GSM188013 GSM188014 GSM188016 GSM188018 GSM188020 4## 8.438846 8.368513 7.322442 7.813573 7.244615 5## A1CF 10.979025 10.616926 9.940773 10.413311 9.743305 6## A2M 6.565276 6.422112 8.142194 5.652593 5.550033 7## A4GALT 7.728628 7.818966 8.679885 7.048563 5.929258 8## A4GNT 10.243388 10.182382 9.391991 8.779887 9.431585 9metdata[1:5,1:5] 10## title 11## GSM188013 DMSO treated MCF7 breast cancer cells [HG-U133A] Exp 1 12## GSM188014 Dioxin treated MCF7 breast cancer cells [HG-U133A] Exp 1 13## GSM188016 DMSO treated MCF7 breast cancer cells [HG-U133A] Exp 2 14## GSM188018 Dioxin treated MCF7 breast cancer cells [HG-U133A] Exp 2 15## GSM188020 DMSO treated MCF7 breast cancer cells [HG-U133A] Exp 3 16## geo_accession status submission_date 17## GSM188013 GSM188013 Public on May 12 2007 May 08 2007 18## GSM188014 GSM188014 Public on May 12 2007 May 08 2007 19## GSM188016 GSM188016 Public on May 12 2007 May 08 2007 20## GSM188018 GSM188018 Public on May 12 2007 May 08 2007 21## GSM188020 GSM188020 Public on May 12 2007 May 08 2007 22## last_update_date 23## GSM188013 May 11 2007 24## GSM188014 May 11 2007 25## GSM188016 May 11 2007 26## GSM188018 May 11 2007 27## GSM188020 May 11 2007 28##构建group_list 29group_list<-rep(c("Treat","Control"),3) 30colnames(data)<-group_list 31library(factoextra) 32## Warning: package 'factoextra' was built under R version 3.5.3 33## Loading required package: ggplot2 34## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ 35## 计算PCA 36data<-t(data)##转换数据至行为sample,列为gene 37data<-as.data.frame(data)##注意数据要转换为数据框 38res.pca <- prcomp(data, scale = TRUE) 39##展示主成分对差异的贡献 40fviz_eig(res.pca)

1## 可视化结果 2fviz_pca_ind(res.pca, 3 col.ind = group_list, # 颜色对应group信息 4 palette = c("#00AFBB", "#FC4E07"), 5 addEllipses = TRUE, # Concentration ellipses 6 ellipse.type = "confidence", 7 legend.title = "Group",## Legend名称 8 repel = TRUE 9 )

层次聚类图-聚类结果也与PCA相似，结果并不好

聚类分析的结果也同样可以进一步美化，但这里不做
计算距离时同样需进行转置，但在前一步PCA分析中的data已经经过转置，故未重复

1dd <- dist(data, method = "euclidean")##data是经过行列转换的 2hc <- hclust(dd, method = "ward.D2") 3plot(hc)

1##对结果进行美化 2# Convert hclust into a dendrogram and plot 3hcd <- as.dendrogram(hc) 4# Define nodePar 5nodePar <- list(lab.cex = 0.6, pch = c(NA, 19), 6 cex = 0.7, col = "blue") 7# Customized plot; remove labels 8plot(hcd, ylab = "Height", nodePar = nodePar, leaflab = "none")

limma分析差异基因

limma包的具体用法参考 limma Users Guide
构建分组信息，构建好比较矩阵是关键
注意这里的表达矩阵信息 eset_dat是经过处理后的，为转置，行为gene，列为sample

1library(limma) 2library(dplyr) 3group_list 4## [1] "Treat" "Control" "Treat" "Control" "Treat" "Control" 5design <- model.matrix(~0+factor(group_list)) 6colnames(design)=levels(factor(group_list)) 7design 8## Control Treat 9## 1 0 1 10## 2 1 0 11## 3 0 1 12## 4 1 0 13## 5 0 1 14## 6 1 0 15## attr(,"assign") 16## [1] 1 1 17## attr(,"contrasts") 18## attr(,"contrasts")$`factor(group_list)` 19## [1] "contr.treatment" 20## 比较信息 21contrast.matrix<-makeContrasts("Treat-Control", 22 levels = design) 23contrast.matrix##查看比较矩阵的信息，这里我们设置的是Treat Vs. control 24## Contrasts 25## Levels Treat-Control 26## Control -1 27## Treat 1 28## 拟合模型 29fit <- lmFit(eset_dat,design) 30fit2 <- contrasts.fit(fit, contrast.matrix) 31fit2 <- eBayes(fit2) 32DEG<-topTable(fit2, coef=1, n=Inf) %>% na.omit() ## coef比较分组 n基因数 33head(DEG) 34## logFC AveExpr t P.Value adj.P.Val B 35## ALDH3A1 -3.227263 10.302323 -10.710306 4.482850e-05 0.3134585 -4.048355 36## CYP1B1 -3.033684 13.287607 -10.505888 4.995753e-05 0.3134585 -4.049713 37## CYP1A1 -9.003353 11.481268 -8.371476 1.762905e-04 0.7374232 -4.069681 38## HHLA2 -1.550587 6.595658 -7.443431 3.337672e-04 0.9308066 -4.083411 39## SLC7A5 -2.470333 13.628775 -7.298868 3.708688e-04 0.9308066 -4.085966 40## TIPARP -1.581274 12.764218 -7.024252 4.552834e-04 0.9522252 -4.091192 41dim(DEG) 42## [1] 12549 6 43save(DEG,file = "DEG_all.Rdata")

绘制火山图

火山图其实仅仅是一种可视化的方式，能够从整体上让我们对整体的差异分析情况有个了解
筛选到差异基因后，可以直接绘制出火山图
火山图的横坐标为logFC, 纵坐标为-log10(pvalue)，因此其实理论上讲plot即可完成火山图绘制

最简单原始的画法

1colnames(DEG) 2## [1] "logFC" "AveExpr" "t" "P.Value" "adj.P.Val" "B" 3plot(DEG$logFC,-log10(DEG$P.Value))

高级的画法

借助于有人开发的更高级的包，用于完成某些特殊的功能，或者更美观

1require(EnhancedVolcano) 2EnhancedVolcano(DEG, 3 4 lab = rownames(DEG), 5 6 x = "logFC", 7 8 y = "P.Value", 9 10 selectLab = rownames(DEG)[1:5], 11 12 xlab = bquote(~Log[2]~ "fold change"), 13 14 ylab = bquote(~-Log[10]~italic(P)), 15 16 pCutoff = 0.05,## pvalue阈值 17 18 FCcutoff = 1,## FC cutoff 19 20 xlim = c(-5,5), 21 22 transcriptPointSize = 1.8, 23 24 transcriptLabSize = 5.0, 25 26 colAlpha = 1, 27 28 legend=c("NS","Log2 FC"," p-value", 29 " p-value & Log2 FC"), 30 31 legendPosition = "bottom", 32 33 legendLabSize = 10, 34 35 legendIconSize = 3.0)

绘制热图

热图的使用比较频繁，得到差异基因后可以直接绘制热图
相对简单好用的要属pheatmap包了
管道中的常规提取需要加上特殊的占位符.

1## 首先提取出想要画的数据 2head(DEG) 3## 提取FC前50 4up_50<-DEG %>% as_tibble() %>% 5 mutate(genename=rownames(DEG)) %>% 6 dplyr::arrange(desc(logFC)) %>% 7 .$genename %>% .[1:50] ## 管道符中的提取 8## FC低前50 9down_50<-DEG %>% as_tibble() %>% 10 mutate(genename=rownames(DEG)) %>% 11 dplyr::arrange(logFC) %>% 12 .$genename %>% .[1:50] ## 管道符中的提取 13index<-c(up_50,down_50) 14 15## 开始绘图-最简单的图 16library(pheatmap) 17pheatmap(eset_dat[index,],show_colnames =F,show_rownames = F)

稍微调整细节

1index_matrix<-t(scale(t(eset_dat[index,])))##归一化 2index_matrix[index_matrix>1]=1 3index_matrix[index_matrix<-1]=-1 4head(index_matrix) 5 6## 添加注释 7anno=data.frame(group=group_list) 8rownames(anno)=colnames(index_matrix) 9anno##注释信息的数据框 10pheatmap(index_matrix, 11 show_colnames =F, 12 show_rownames = F, 13 cluster_cols = T, 14 annotation_col=anno) 15

本期内容就到这里，我是白介素2，下期再见

GEO实操|limma分析差异基因