Tutorial for the WGCNA package for R / 开普饭

五谷吃不完了22019.04.06 01:00:47字数 314阅读 1,306I. Network analysis of liver expression data from female mice: finding modules related to body weight参考1：Tutorials for the WGCNA package 参考2：一文学会WGCNA分析参考3:STEP6:WGCNA相关性分析第一步：Data input and cleaning1.1 数据的导入rm(list = ls())setwd('E:/gsj/RWD/WGCNA/')library(WGCNA)options(stringsAsFactors = FALSE)# 如果需要保存变量的话这一步不能省#读入文件femData = read.csv("LiverFemale3600.csv")#查看一下文件格式，如果不符合，就需要修改name(femData)head(femData)#进行WGCNA分析的时候，要求输入的表达矩阵，行名是样本，列名是基因datExpr0 = as.data.frame(t(femData[, -c(1:8)]))names(datExpr0) = femData$substanceBXHrownames(datExpr0) = names(femData)[-c(1:8)]dim(datExpr0)

1.2 检查是否有离群值#检查是否有缺失值，没问题就会返回TRUEgsg = goodSamplesGenes(datExpr0, verbose = 3)gsg$allOK#如果不是TREU，那么需要进行下面操作if (!gsg$allOK){# Optionally, print the gene and sample names that were removed:if (sum(!gsg$goodGenes)>0) printFlush(paste("Removing genes:", paste(names(datExpr0[!gsg$goodGenes], collapse = ", ")));if (sum(!gsg$goodSamples)>0) printFlush(paste("Removing samples:",paste(rownames(datExpr0)[!gsg$goodSamples], collapse = ", ")));# Remove the offending genes and samples from the data:datExpr0 = datExpr0[gsg$goodSamples, gsg$goodGenes]}#第二步，对样本进行聚类，判断有无outlier的样本sampleTree = hclust(dist(datExpr0), method = "average");sizeGrWindow(12,9)par(cex = 0.6);par(mar = c(0,4,2,0))plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 1.5,cex.axis = 1.5, cex.main = 2)#从下图可以看出sampleF2_221是一个离群值，要么手动删掉，要么设置一个阈值，剔除掉abline(h = 15, col = "red")#按照设定的高度clust = cutreeStatic(sampleTree, cutHeight = 15, minSize = 10)table(clust)#可以看出cluster1包含了我们最后所需要的样本keepSamples = (clust==1)datExpr = datExpr0[keepSamples, ]nGenes = ncol(datExpr)nSamples = nrow(datExpr)#通过以上的处理，最后得到的datExpr对象就可以用来进行后续的分析

缺失值检查

离群样本检查

cluster选择1.3 导入临床信息#导入临床信息traitData = read.csv("ClinicalTraits.csv");dim(traitData)names(traitData)#除去不相关的信息allTraits = traitData[, -c(31, 16)];allTraits = allTraits[, c(2, 11:36) ];dim(allTraits)names(allTraits)#每一个样本都会有对应的临床信息，包括体重，长度等view(allTraits)#将临床信息和表样本名结合起来femaleSamples = rownames(datExpr);traitRows = match(femaleSamples, allTraits$Mice);datTraits = allTraits[traitRows, -1];rownames(datTraits) = allTraits[traitRows, 1]view(datTraits)#最后在进行后续的网络构建和模块选择之前，我们可以看一下样本和临床信息之间的匹配度sampleTree2 = hclust(dist(datExpr), method = "average")#将特性与颜色相关联，白色表示低；红色表示高；灰色表示缺失traitColors = numbers2colors(datTraits, signed = FALSE);#出图，红色代表高的，白色代表低的，灰色代表缺失plotDendroAndColors(sampleTree2, traitColors,groupLabels = names(datTraits),main = "Sample dendrogram and trait heatmap")#没有问题的话就保存环境变量save(datExpr, datTraits, file = "FemaleLiver-01-dataInput.RData")

第二步：Automatic network construction and module detection2.0 构建R环境#######################################如果你关掉了R，那么就需要运行下面几步#rm(list = ls())#setwd('E:/gsj/RWD/WGCNA/')#library(WGCNA)#options(stringsAsFactors = FALSE)#这一步不能省#enableWGCNAThreads()#这一条语句指的是允许WGCNA使用多线程，但是如果是在本机上使用的话，这一步过程可以跳过#lnames = load(file = "FemaleLiver-01-dataInput.RData")#以上，成功导入第一步生成的datExpr和datTraits2.1 确定合适的阈值#由不同的方式构建基因网络，这里选择自动一步构建基因网络的方法#确定合适的阈值范围powers = c(c(1:10), seq(from = 12, to=20, by=2))#调用pickSoftThreshold函数分析出合适的阈值sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)#画图，结果展示sizeGrWindow(9, 5)par(mfrow = c(1,2));cex1 = 0.9;plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],xlab="Soft Threshold (power)", ylab="Scale Free Topology Model Fit,signed R^2",type="n",main = paste("Scale independence"))text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],labels=powers,cex=cex1,col="red")abline(h=0.90,col="green")plot(sft$fitIndices[,1], sft$fitIndices[,5],xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",main = paste("Mean connectivity"))text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")#以上会出来两张图，由此确定选取阈值6

阈值选择2.2 一步构建网络图和模块选择#一步构建网络和模块选择net = blockwiseModules(datExpr, power = 6,TOMType = "unsigned", minModuleSize = 30,reassignThreshold = 0, mergeCutHeight = 0.25, numericLabels = TRUE, pamRespectsDendro = FALSE,saveTOMs = TRUE,saveTOMFileBase = "femaleMouseTOM",verbose = 3)#参数mergeCutHeight为合并模块的一个阈值#上述的参数设置均为下限值，不同的数据类型有不同的参数设置#如果电脑的运行跟不上，建议参考大样本的那种网络构建方法#看一下有多少个模块以及模块当中所包含的基因#如下图展示的结果，一共有18个模块，从1到18模块，按照基因数递减排列，模块0表示没有分类的基因数table(net$colors)#层次聚类图的结果包含在net$dendrograms[[1]]对象中#用以下代码可以将树图和颜色分布整合，树图是对基因进行的聚类，下面不同颜色代表这个基因处于哪个模块#设置窗口的大小sizeGrWindow(12, 9)mergedColors = labels2colors(net$colors)#出图plotDendroAndColors(net$dendrograms[[1]], mergedColors[net$blockGenes[[1]]], "Module colors", dendroLabels = FALSE, hang = 0.03, addGuide = TRUE, guideHang = 0.05) #此外，如果用户想要修改一些参数，可以使用recutBlockwiseTrees这个函数 #保存数据moduleLabels = net$colorsmoduleColors = labels2colors(net$colors)MEs = net$MEs;geneTree = net$dendrograms[[1]];save(MEs, moduleLabels, moduleColors, geneTree, file = "FemaleLiver-02-networkConstruction-auto.RData")

第三步：relating modules to external information and identifying important genes3.0 构建R环境#rm(list = ls())#setwd('E:/gsj/RWD/WGCNA/')#library(WGCNA)#options(stringsAsFactors = FALSE)#这一步不能省#enableWGCNAThreads()#这一条语句指的是允许WGCNA使用多线程，但是如果是在本机上使用的话，这一步过程可以跳过#lnames = load(file = "FemaleLiver-01-dataInput.RData")#以上，成功导入第一步生成的datExpr和datTraits#lnames = load(file = "FemaleLiver-02-networkConstruction-auto.RData");#lnames#以上，成功导入第二步生成的参数3.1 计算模块和性状之间的相关性nGenes = ncol(datExpr);nSamples = nrow(datExpr);MEs0 = moduleEigengenes(datExpr, moduleColors)$eigengenesMEs = orderMEs(MEs0)##不同颜色的模块的ME值矩阵(样本vs模块)moduleTraitCor = cor(MEs, datTraits, use = "p");moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples);#这一步将会展示相关性和P值sizeGrWindow(10,6)textMatrix = paste(signif(moduleTraitCor, 2), "\n(", signif(moduleTraitPvalue, 1), ")", sep = "");dim(textMatrix) = dim(moduleTraitCor)par(mar = c(6, 8.5, 3, 3))#这一步将会在热图上展示相关系数labeledHeatmap(Matrix = moduleTraitCor, xLabels = names(datTraits), yLabels = names(MEs), ySymbols = names(MEs), colorLabels = FALSE, colors = greenWhiteRed(50), textMatrix = textMatrix, setStdMargins = FALSE, cex.text = 0.5, zlim = c(-1,1), main = paste("Module-trait relationships"))#以上可以通过这个热图发现和性状相关的基因，在这之后我们主要关注weight这一性状相关的基因

3.2 确定相关模块中的显著相关基因#性状跟模块虽然求出了相关性，可以挑选最相关的那些模块来分析，但是模块本身仍然包含非常多的基因，还需进一步的寻找最重要的基因#首先计算模块与基因的相关性矩阵weight = as.data.frame(datTraits$weight_g);names(weight) = "weight" #单独把weight这一列提出来，做一个data.framemodNames = substring(names(MEs), 3) #提出每一模块的颜色geneModuleMembership = as.data.frame(cor(datExpr, MEs, use = "p")); #这一步，模块与基因的相关性矩阵## 算出每个模块跟基因的皮尔森相关系数矩阵## MEs是每个模块在每个样本里面的值## datExpr是每个基因在每个样本的表达量MMPvalue = as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nSamples))names(geneModuleMembership) = paste("MM", modNames, sep="");names(MMPvalue) = paste("p.MM", modNames, sep="");#再计算性状与基因的相关性矩阵# 只有连续型性状才能只有计算geneTraitSignificance = as.data.frame(cor(datExpr, weight, use = "p")); #这一步，性状与基因的相关性矩阵GSPvalue = as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nSamples));names(geneTraitSignificance) = paste("GS.", names(weight), sep="");names(GSPvalue) = paste("p.GS.", names(weight), sep="");3.3 筛选出和性状以及模块相关性都很高的基因#这里可以从上图中看出weight这一个性状中，棕色的模块最相关module = "brown"column = match(module, modNames);moduleGenes = moduleColors==module;sizeGrWindow(7, 7);par(mfrow = c(1,1));verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]), abs(geneTraitSignificance[moduleGenes, 1]), xlab = paste("Module Membership in", module, "module"), ylab = "Gene significance for body weight", main = paste("Module membership vs. gene significance\n"), cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module) #这一张图显示的是在棕色模块中，基因先属性和模块的关系（类似于MEs） #可以看出，和性状高度相关的基因往往在和性状高度相关的模块中#经过上述操作，我们已经找到和感兴趣的性状最相关的模型，同时还得到了相关的基因

3.4 将网络分析结果输出#返回所有的probe IDnames(datExpr)#返回brown模块对应的基因names(datExpr)[moduleColors=="brown"]#可以提供一个转换ID的文件annot = read.csv(file = "GeneAnnotation.csv");dim(annot)names(annot)probes = names(datExpr)probes2annot = match(probes, annot$substanceBXH)#计算没有注释到的probe数量sum(is.na(probes2annot))#随后构建一个data.frame，描述probe ID、gene symbol、模块颜色、基因与weight模块的显著性、p值geneInfo0 = data.frame(substanceBXH = probes, geneSymbol = annot$gene_symbol[probes2annot], LocusLinkID = annot$LocusLinkID[probes2annot], moduleColor = moduleColors, geneTraitSignificance, GSPvalue)modOrder = order(-abs(cor(MEs, weight, use = "p")));for (mod in 1:ncol(geneModuleMembership)){ oldNames = names(geneInfo0) geneInfo0 = data.frame(geneInfo0, geneModuleMembership[, modOrder[mod]], MMPvalue[, modOrder[mod]]); names(geneInfo0) = c(oldNames, paste("MM.", modNames[modOrder[mod]], sep=""), paste("p.MM.", modNames[modOrder[mod]], sep=""))}geneOrder = order(geneInfo0$moduleColor, -abs(geneInfo0$GS.weight));geneInfo = geneInfo0[geneOrder, ]write.csv(geneInfo, file = "geneInfo.csv")

第四步：Interfacing network analysis with other data such as functional annotation and gene ontology4.1 选取感兴趣的基因进行功能注释或者其他分析# Read in the probe annotationannot = read.csv(file = "GeneAnnotation.csv");# Match probes in the data set to the probe IDs in the annotation fileprobes = names(datExpr)probes2annot = match(probes, annot$substanceBXH)# Get the corresponding Locuis Link IDsallLLIDs = annot$LocusLinkID[probes2annot];# $ Choose interesting modulesintModules = c("brown", "red", "salmon")for (module in intModules){ # Select module probes modGenes = (moduleColors==module) # Get their entrez ID codes modLLIDs = allLLIDs[modGenes]; # Write them into a file fileName = paste("LocusLinkIDs-", module, ".txt", sep=""); write.table(as.data.frame(modLLIDs), file = fileName,row.names = FALSE, col.names = FALSE)}# As background in the enrichment analysis, we will use all probes in the analysis.fileName = paste("LocusLinkIDs-all.txt", sep="");write.table(as.data.frame(allLLIDs), file = fileName,row.names = FALSE, col.names = FALSE)#经过上述步骤，将会生成下面的文件，后续可以对感兴趣的模块进行富集分析等

第五步：Network visualization using WGCNA functions5.0 构建R环境#rm(list = ls())#setwd('E:/gsj/RWD/WGCNA/')#library(WGCNA)#options(stringsAsFactors = FALSE)#这一步不能省#enableWGCNAThreads()#这一条语句指的是允许WGCNA使用多线程，但是如果是在本机上使用的话，这一步过程可以跳过#lnames = load(file = "FemaleLiver-01-dataInput.RData")#以上，成功导入第一步生成的datExpr和datTraits#lnames = load(file = "FemaleLiver-02-networkConstruction-auto.RData");#lnames#以上，成功导入第二步生成的参数5.1 可视化基因互作网络#网络可视化过程#对所有基因画热图#这一步速度很慢，不建议做dissTOM = 1-TOMsimilarityFromExpr(datExpr, power = 6);plotTOM = dissTOM^7;diag(plotTOM) = NA;sizeGrWindow(9,9)TOMplot(plotTOM, geneTree, moduleColors, main = "Network heatmap plot, all genes")#随机选取400个基因nSelect = 400# For reproducibility, we set the random seedset.seed(10);select = sample(nGenes, size = nSelect);selectTOM = dissTOM[select, select];# There’s no simple way of restricting a clustering tree to a subset of genes, so we must re-cluster.selectTree = hclust(as.dist(selectTOM), method = "average")selectColors = moduleColors[select];# Open a graphical windowsizeGrWindow(9,9)# Taking the dissimilarity to a power, say 10, makes the plot more informative by effectively changing# the color palette; setting the diagonal to NA also improves the clarity of the plotplotDiss = selectTOM^7;diag(plotDiss) = NA;TOMplot(plotDiss, selectTree, selectColors, main = "Network heatmap plot, selected genes")

5.2 可视化网络图中的eigenegenes#根据模块里面的eigenegenes（类似于主要的基因），对模块进行聚类以及热图，标注出想要看的模块# Recalculate module eigengenesMEs = moduleEigengenes(datExpr, moduleColors)$eigengenes## 只有连续型性状才能只有计算MEs = moduleEigengenes(datExpr, moduleColors)$eigengenesweight = as.data.frame(datTraits$weight_g);names(weight) = "weight"MET = orderMEs(cbind(MEs, weight))# 出图，模块与性状之间的关系#下面是将两张图分别画出的代码sizeGrWindow(5,7.5);par(cex = 0.9)plotEigengeneNetworks(MET, "", marDendro = c(0,4,1,2), marHeatmap = c(3,4,1,2), cex.lab = 0.8, xLabelsAngle= 90)# Plot the relationships among the eigengenes and the traitsizeGrWindow(5,7.5);par(cex = 0.9)plotEigengeneNetworks(MET, "", marDendro = c(0,4,1,2), marHeatmap = c(3,4,1,2), cex.lab = 0.8, xLabelsAngle= 90)# Plot the dendrogramsizeGrWindow(6,6);par(cex = 1.0)## 模块的聚类图plotEigengeneNetworks(MET, "Eigengene dendrogram", marDendro = c(0,4,2,0),plotHeatmaps = FALSE)# Plot the heatmap matrix (note: this plot will overwrite the dendrogram plot)par(cex = 1.0)## 性状与模块热图plotEigengeneNetworks(MET, "Eigengene adjacency heatmap", marHeatmap = c(3,4,2,2),plotDendrograms = FALSE, xLabelsAngle = 90)

第六步：Exporting a gene network to external visualization software6.0 构建R环境#rm(list = ls())#setwd('E:/gsj/RWD/WGCNA/')#library(WGCNA)#options(stringsAsFactors = FALSE)#这一步不能省#enableWGCNAThreads()#这一条语句指的是允许WGCNA使用多线程，但是如果是在本机上使用的话，这一步过程可以跳过#lnames = load(file = "FemaleLiver-01-dataInput.RData")#以上，成功导入第一步生成的datExpr和datTraits#lnames = load(file = "FemaleLiver-02-networkConstruction-auto.RData");#lnames#以上，成功导入第二步生成的参数6.1 将数据导出VisANTTOM = TOMsimilarityFromExpr(datExpr, power = 6);annot = read.csv(file = "GeneAnnotation.csv");#选择模块module = "brown"#选择模块的IDprobes = names(datExpr)inModule = (moduleColors==module);modProbes = probes[inModule];modTOM = TOM[inModule, inModule];dimnames(modTOM) = list(modProbes, modProbes)vis = exportNetworkToVisANT(modTOM, file = paste("VisANTInput-", module, ".txt", sep=""), weighted = TRUE, threshold = 0, probeToGene = data.frame(annot$substanceBXH, annot$gene_symbol) )#因为brown模块很大，筛选出top30的基因nTop = 30;IMConn = softConnectivity(datExpr[, modProbes]);top = (rank(-IMConn) <= nTop)vis = exportNetworkToVisANT(modTOM[top, top], file = paste("VisANTInput-", module, "-top30.txt", sep=""), weighted = TRUE, threshold = 0, probeToGene = data.frame(annot$substanceBXH, annot$gene_symbol) )6.2 导出Cytoscape# Recalculate topological overlap if neededTOM = TOMsimilarityFromExpr(datExpr, power = 6);# Read in the annotation fileannot = read.csv(file = "GeneAnnotation.csv");# Select modulesmodules = c("brown", "red");# Select module probesprobes = names(datExpr)inModule = is.finite(match(moduleColors, modules));modProbes = probes[inModule];modGenes = annot$gene_symbol[match(modProbes, annot$substanceBXH)];# Select the corresponding Topological OverlapmodTOM = TOM[inModule, inModule];dimnames(modTOM) = list(modProbes, modProbes)# Export the network into edge and node list files Cytoscape can readcyt = exportNetworkToCytoscape(modTOM, edgeFile = paste("CytoscapeInput-edges-", paste(modules, collapse="-"), ".txt", sep=""), nodeFile = paste("CytoscapeInput-nodes-", paste(modules, collapse="-"), ".txt", sep=""), weighted = TRUE, threshold = 0.02, nodeNames = modProbes, altNodeNames = modGenes, nodeAttr = moduleColors[inModule]);20人点赞转录组数据更多精彩内容下载简书APP"小礼物走一走，来简书关注我"还没有人赞赏，支持一下

五谷吃不完了总资产8 (约0.59元)共写了3163字获得108个赞共111个粉丝

Tutorial for the WGCNA package for R

相关推荐