七步走纯R代码通过数据挖掘复现一篇实验文章（第七步WGCNA） / 开普饭

下面看学徒表演

本期我们的任务是全代码复现一篇paper，标题为：Co-expression networks revealed potential core lncRNAs in the triple-negative breast cancer. PMID：27380926 （大家需要自行阅读文献才明白jimmy交给学徒的任务！）文章里面是自己测了8个TNBC病人的转录组然后分析，但是我们有TCGA数据库，所以可以复现，这就是为什么标题是七步走纯R代码通过数据挖掘复现一篇实验文章！

因为排版限制，前面的1-6步内容在本期推文的头条！

step7.WGCNA

WGCNA分析完全是照搬jimmy老师三年前的教程,要提醒大家的是要用别人的教程一定要看清楚别人数据格式，行名，列名，然后做成绝对一致的格式，这样会很大程度的减少报错！

rm(list = ls())

load(file = "../02_data/TCGA_TNBC_pair_annotation_result.Rdata")

## step1_构建WGCNA所需表达矩阵

# --------------------------------------------------------------------------------

exprSet_lncRNA[1:3,1:3]

##            TCGA-BH-A0B3-01A TCGA-BH-A0B3-11B TCGA-BH-A18V-01A

## IGF2-AS            4.201151         4.945464         3.167927

## AL136982.1         2.628749         5.484643         4.256068

## SHANK2-AS3         2.628749         2.628749         3.552230

exprSet_mRNA[1:3,1:3]

##        TCGA-BH-A0B3-01A TCGA-BH-A0B3-11B TCGA-BH-A18V-01A

## TNMD           4.443206         8.357049         5.006992

## DBNDD1        11.148636         8.672442        12.093123

## TFPI           8.780552        12.019202        10.205594

exprSet_lncRNA = as.data.frame(t(exprSet_lncRNA))

exprSet_mRNA = as.data.frame(t(exprSet_mRNA))

exprSet_lncRNA[1:3,1:3]

##                   IGF2-AS AL136982.1 SHANK2-AS3

## TCGA-BH-A0B3-01A 4.201151   2.628749   2.628749

## TCGA-BH-A0B3-11B 4.945464   5.484643   2.628749

## TCGA-BH-A18V-01A 3.167927   4.256068   3.552230

exprSet_mRNA[1:3,1:3]

##                      TNMD    DBNDD1      TFPI

## TCGA-BH-A0B3-01A 4.443206 11.148636  8.780552

## TCGA-BH-A0B3-11B 8.357049  8.672442 12.019202

## TCGA-BH-A18V-01A 5.006992 12.093123 10.205594

exprSet_mRNA$sample = rownames(exprSet_mRNA)

exprSet_lncRNA$sample = rownames(exprSet_lncRNA)

tmp = merge(exprSet_mRNA,exprSet_lncRNA, by = "sample")

tmp[1:3,1:3]

##             sample     TNMD    DBNDD1

## 1 TCGA-BH-A0B3-01A 4.443206 11.148636

## 2 TCGA-BH-A0B3-11B 8.357049  8.672442

## 3 TCGA-BH-A18V-01A 5.006992 12.093123

rownames(tmp) = tmp$sample

tmp = tmp[,-1]

tmp[1:3,1:3]

##                      TNMD    DBNDD1      TFPI

## TCGA-BH-A0B3-01A 4.443206 11.148636  8.780552

## TCGA-BH-A0B3-11B 8.357049  8.672442 12.019202

## TCGA-BH-A18V-01A 5.006992 12.093123 10.205594

# -----------------------------------------------------------------------------------

group_list=factor(ifelse(as.numeric(substr(rownames(exprSet_mRNA),14,15)) < 10,'tumor','normal'))

table(group_list)

## group_list

## normal  tumor 

##      8      8

datTraits <- data.frame(row.names=rownames(exprSet_mRNA),

                       subtype=group_list)

library(WGCNA)

rt = tmp

datExpr = rt

nSamples = nrow(datExpr)

# step2_找到合适的beta值 #--------------------------------------------------------------------------------------- powers1=c(seq(1,10,by=1),seq(12,30,by=2)) RpowerTable=pickSoftThreshold(datExpr, powerVector=powers1)[[2]] cex1=0.7 #pdf(file="softThresholding.pdf") par(mfrow=c(1,2)) plot(RpowerTable[,1], -sign(RpowerTable[,3])*RpowerTable[,2],xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n") text(RpowerTable[,1], -sign(RpowerTable[,3])*RpowerTable[,2], labels=powers1,cex=cex1,col="red") # this line corresponds to using an R^2 cut-off of h abline(h=0.85,col="red") plot(RpowerTable[,1], RpowerTable[,5],xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n") text(RpowerTable[,1], RpowerTable[,5], labels=powers1, cex=cex1,col="red")

img

beta1=14

# step3_对基因聚类成模块并且可视化 #--------------------------------------------------------------------------------------- net = blockwiseModules( datExpr, power = 14, maxBlockSize = 6000, TOMType = "unsigned", minModuleSize = 30, reassignThreshold = 0, mergeCutHeight = 0.25, numericLabels = TRUE, pamRespectsDendro = FALSE, saveTOMs = TRUE, saveTOMFileBase = "AS-green-FPKM-TOM", verbose = 3 ) ## Calculating module eigengenes block-wise from all genes table(net$colors) ## ## 0 1 2 3 ## 78 1433 1300 500 mergedColors = labels2colors(net$colors) table(mergedColors) ## mergedColors ## blue brown grey turquoise ## 1300 500 78 1433 # 将目标基因聚成几个模块 plotDendroAndColors(net$dendrograms[[1]], mergedColors[net$blockGenes[[1]]], "Module colors", dendroLabels = FALSE, hang = 0.03, addGuide = TRUE, guideHang = 0.05)

img

# step4_将样本信息添加进去,分析样本性状与模块的关系 # -------------------------------------------------------------------------- design=model.matrix(~0+ datTraits$subtype) design = as.data.frame(design) colnames(design)=levels(datTraits$subtype) moduleColors <- labels2colors(net$colors) # Recalculate MEs with color labels MEs0 = moduleEigengenes(datExpr, moduleColors)$eigengenes MEs = orderMEs(MEs0) moduleTraitCor = cor(MEs, design , use = "p") moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)


# Will display correlations and their p-values

textMatrix = paste(signif(moduleTraitCor, 2), "\n(",

                   signif(moduleTraitPvalue, 1), ")", sep = "")

dim(textMatrix) = dim(moduleTraitCor)
# Display the correlation values within a heatmap plot

labeledHeatmap(Matrix = moduleTraitCor,

               xLabels = colnames(design),

               yLabels = names(MEs),

               ySymbols = names(MEs),

               colorLabels = FALSE,

               colors = greenWhiteRed(50),

               textMatrix = textMatrix,

               setStdMargins = FALSE,

               cex.text = 0.5,

               zlim = c(-1,1),

               main = paste("Module-trait relationships"))
# step5_感兴趣性状的模块的具体基因分析

# ---------------------------------------------------------------------------------

## 首先计算模块与基因的相关性矩阵

modNames = substring(names(MEs), 3)

geneModuleMembership = as.data.frame(cor(datExpr, MEs, use = "p"));

MMPvalue = as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nSamples));

names(geneModuleMembership) = paste("MM", modNames, sep="");

names(MMPvalue) = paste("p.MM", modNames, sep="")
## 再计算性状与基因的相关性矩阵

tumor = as.data.frame(design[,2])

names(tumor) = "tumor"

geneTraitSignificance = as.data.frame(cor(datExpr, tumor, use = "p"))

GSPvalue = as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nSamples))

names(geneTraitSignificance) = paste("GS.", names(tumor), sep="")

names(GSPvalue) = paste("p.GS.", names(tumor), sep="")
## 最后把两个相关性矩阵联合起来,指定感兴趣模块进行分析

module = "blue"

module

## [1] "blue"

column = match(module, modNames);

moduleGenes = moduleColors==module;
# step6_网络的可视化

# -----------------------------------------------------------------------------------

## 首先针对所有基因画热图

nGenes = ncol(datExpr)

nSamples = nrow(datExpr)

geneTree = net$dendrograms[[1]]; 

dissTOM = 1-TOMsimilarityFromExpr(datExpr, power = 14); 

plotTOM = dissTOM^7; 

diag(plotTOM) = NA; 

# TOMplot(plotTOM, geneTree, moduleColors, main = "Network heatmap plot, all genes")
## 最后画模块和性状的关系

# Recalculate module eigengenes

MEs = moduleEigengenes(datExpr, moduleColors)$eigengenes
## 这里把是否属于 tumor 表型这个变量用0,1进行数值化。

tumor = as.data.frame(design[,2])

names(tumor) = "tumor"

# Add the weight to existing module eigengenes

MET = orderMEs(cbind(MEs, tumor))

# Plot the relationships among the eigengenes and the trait

plotEigengeneNetworks(MET, "", marDendro = c(0,4,1,2), marHeatmap = c(3,4,1,2), cex.lab = 0.8, xLabelsAngle = 90)

img

# Plot the dendrogram

## 模块与性状的聚类图 plotEigengeneNetworks(MET, "Eigengene dendrogram", marDendro = c(0,4,2,0), plotHeatmaps = FALSE)

img

# Plot the heatmap matrix (note: this plot will overwrite the dendrogram plot)

## 性状与模块热图 plotEigengeneNetworks(MET, "Eigengene adjacency heatmap", marHeatmap = c(3,4,2,2), plotDendrograms = FALSE, xLabelsAngle = 90)

img

# step7_提取指定模块的基因名 # -------------------------------------------------------------------------------------- # Select module module = "blue"

# Select module probes probes = colnames(datExpr) ## 我们例子里面的probe就是基因名 inModule = (moduleColors==module); modProbes = probes[inModule]; head(modProbes) ## [1] "SLC7A2" "TAC1" "DLX6" "DBF4" "TKTL1" "E2F2" # Step9: 模块的导出 # -------------------------------------------------------------------------------------- # Recalculate topological overlap TOM = TOMsimilarityFromExpr(datExpr, power = 14); module = "brown"; # Select module probes probes = colnames(datExpr) ## 我们例子里面的probe就是基因名 inModule = (moduleColors==module); modProbes = probes[inModule]; ## 也是提取指定模块的基因名 # Select the corresponding Topological Overlap modTOM = TOM[inModule, inModule]; dimnames(modTOM) = list(modProbes, modProbes) ## 模块对应的基因关系矩阵 cyt = exportNetworkToCytoscape( modTOM, edgeFile = paste("CytoscapeInput-edges-", paste(module, collapse="-"), ".txt", sep=""), nodeFile = paste("CytoscapeInput-nodes-", paste(module, collapse="-"), ".txt", sep=""), weighted = TRUE, threshold = 0.02, nodeNames = modProbes, nodeAttr = moduleColors[inModule] )

最后分享这篇教程的心得：遇到问题勤找Google~

后记

七步走纯R代码通过数据挖掘复现一篇实验文章（第七步WGCNA）

下面看学徒表演

step7.WGCNA

相关推荐