GEO数据挖掘-第一期-胶质母细胞瘤（GBM） / 开普饭

GEO数据挖掘系列文-第一期-胶质母细胞瘤

文章标题

lncRNAs PVT1 and HAR1A are prognosis biomarkers and indicate
therapy outcome for diffuse glioma patients

关键词

胶质母细胞瘤，lncRNA

疾病：神经胶质瘤

1. 星形胶质细胞（astrocytomas）

· Grade I

· Grade II：弥漫性星形细胞瘤

· Grade III：anaplastic astrocytoma

· Grade IV：胶质母细胞瘤（glioblastoma，GBM）

2. 少突胶质细胞（oligodendroglial）

· Grade II

· Grade III

◆ ◆ ◆ ◆ ◆

GEO数据库编号：GSE4290

研究对象：lncRNA

实验设计

实验组：77个神经胶质母细胞瘤样本
对照组：23个非肿瘤样本

结论：在神经胶质母细胞瘤中PVT1和CYTOR基因表达显著上调, HAR1A和MIAT基因表达显著下调。

◆ ◆ ◆ ◆ ◆

GEO数据挖掘过程

第一步下载R包

国外镜像下载速度很慢，所以下载之前一定要设置好国内镜像

bioPackages <-c( "stringi", # 处理字符串 "GEOquery", # 下载GEO数据 "limma", # 差异分析 "ggfortify", "ggplot2", "pheatmap", "ggstatsplot", "VennDiagram", # 作图 "clusterProfiler", "org.Hs.eg.db" # 注释 )


## 设置软件包的下载镜像

local({
# CRAN的镜像设置

  r <- getOption( "repos" ); 

  r[ "CRAN" ] <- "https://mirrors.tuna.tsinghua.edu.cn/CRAN/"; 

  options( repos = r )

  # bioconductor的镜像设置

  BioC <- getOption( "BioC_mirror" ); 

  BioC[ "BioC_mirror" ] <- "https://mirrors.ustc.edu.cn/bioc/"; 

  options( BioC_mirror = BioC )

})

## 安装未安装的软件包 source( "https://bioconductor.org/biocLite.R" ) #第一次使用bioconductor需要运行 lapply( bioPackages, function( bioPackage ){ if( !require( bioPackage, character.only = T ) ){ CRANpackages <- available.packages() if( bioPackage %in% rownames( CRANpackages) ){ install.packages( bioPackage ) }else{ BiocInstaller::biocLite( bioPackage, suppressUpdates = FALSE, ask = FALSE) } } } )

第二步得到geneID和gene类型的对应关系

这个关系可以从人类的GTF文件中提取出来，GTF可以在这里下载：https://asia.ensembl.org/info/data/ftp/index.html

下载后的文件经过下面的shell脚本处理，即可以得到基因与基因类型的对应关系

awk '{if(!NF || /^#/){next}}1' /public/reference/gtf/gencode/gencode.v25lift37.annotation.gtf | cut -f9 | sed 's/\"//g'| sed 's/;//g' | awk '{ print $4"\t"$8 }' |awk '{if(/^E/){next}}1'| awk '{ print $2"\t"$1 }' | sort -k 1 | uniq > gencode.v25lift37.annotation.gtf.gene2type

将处理好的文件导入R中进行后续处理，因为这篇文章只研究lncRNA，所以要去除编码蛋白的基因ID

{ gene2type = read.table( 'gencode.v25lift37.annotation.gtf.gene2type' ) colnames( gene2type ) = c( "gene", "type" ) }


dim( gene2type )

## [1] 58298     2

## 说明一共有五万多个基因的对应关系
head( gene2type )

##       gene           type

## 1  5S_rRNA           rRNA

## 2      7SK       misc_RNA

## 3 A1BG-AS1      antisense
sort( table( gene2type$type ) )
## lincRNA    processed_pseudogene    protein_coding

##    7601                   10197             20087
## 说明绝大多数的基因是编码蛋白的

## 剔除基因类型为“protein_coding”的对应关系 gene2type = gene2type[ gene2type[,2] != 'protein_coding', ] length( unique( gene2type$gene ) ) save( gene2type, file = 'Relationship_others_gene.Rdata' )

第三步下载GEO数据

如果getGEO下载失败，可以手动在项目主页进行下载

library( "GEOquery" ) GSE_name = 'GSE4290' options( 'download.file.method.GEOquery' = 'libcurl' ) #windows系统 gset <- getGEO( GSE_name, getGPL = T )


## getGEO函数会下载GSE项目下的所有子集，得到的gset对象是一个list，'GSE4290’只有一个项目，之后的实战会遇到多子集的情况
## 'getGPL = T’会直接下载注释平台，如果报错，本文最后会附上，其他进行平台注释的方法

save( gset, file = 'gset.Rdata' )

第四步数据集筛选

对样本进行不同分组，以及探针的选取对之后的差异分析结果都会有影响。

作者研究的是GBM样本和非肿瘤样本在lncRNA表达上的差异，所以先取出这180个样本中的77个GBM样本和23个非肿瘤样本

options( stringsAsFactors = F )


load( './gset.Rdata' )

library( "GEOquery" )
## 取表达矩阵和样本信息表

{

  gset = gset[[1]]

  exprSet = exprs( gset ) ## “GEOquery”包中的exprs函数用来取出表达矩阵

  pdata = pData( gset ) ## “GEOquery”包中的pData函数用来取出样本信息

  group_list = as.character( pdata[, 35] )

}

dim( exprSet )

## [1] 54613   180

exprSet[ 1:3, 1:5 ]

##           GSM97793 GSM97794 GSM97795 GSM97796 GSM97797

## 1007_s_at  10178.1  10122.9   7826.6  11098.4   8668.9

## 1053_at      388.2    517.5    352.4    609.9    430.1
## 现在就应该对得到的矩阵有这样一个印象

## 这个矩阵有180列，列名是样本编号，54613行，行名是探针编号

## 矩阵的内容就是各个样本对应探针检测到的表达量

## 探针本身是没有意义的，所以我们下一步就要将探针转为基因名
table( group_list )

## astrocytoma, grade 2       astrocytoma, grade 3      glioblastoma, grade 4                        

##                    7                         19                         77

## non-tumor    oligodendroglioma, grade 2     oligodendroglioma, grade 3

##        23                            38                             12
## 我们不能通过上面表达矩阵的样本名知道这个样本是肿瘤样本还是非肿瘤样本

## 而pdata记录了各个样本的临床信息，就可以通过pdata得到样本名和样本类型的对应关系
## 根据上面的group_list，取出研究样本
## 总结一下就是，这180个病人中astrocytoma分为了二三期病人

## glioblastoma是胶质母细胞瘤，属于恶性细胞瘤，只有第四期

## oligodendroglioma也分为二三期病人

## 其余样本为对照
{

  n_expr = exprSet[ , grep( "non-tumor",         group_list )]

  g_expr = exprSet[ , grep( "glioblastoma",      group_list )]

  a_expr = exprSet[ , grep( "astrocytoma",       group_list )]

  o_expr = exprSet[ , grep( "oligodendroglioma", group_list )]

  

}
## 样本分组，新的表达矩阵只有normal和gbm样本

{

 exprSet = cbind( n_expr, g_expr )

 group_list = c(rep( 'normal', ncol( n_expr ) ), 

                 rep( 'gbm',    ncol( g_expr ) ) )

}
dim( exprSet )

exprSet[ 1:5, 1:5 ]

table( group_list )

save( exprSet, group_list, file = 'exprSet_by_group.Rdata')

分组这一步就完成了，下面要去除没有注释的探针

并不是每一个探针都是对应lncRNA的，所以我们要用之前的基因和基因类型关系筛选一下，之后再去除没有注释的探针

## 筛选探针 GPL = gset@featureData@data ## 第三步getGEO函数下载数据时，直接下载了平台，GPL就是注释矩阵的平台数据 ## 也就是探针和基因的对应关系


colnames( GPL )

view( GPL )
## GPL的“ID”列是探针，'Gene Symbol”列是基因名，将这两列取出来

ids = GPL[ ,c( 1, 11 ) ]
## 'Gene Symbol”列格式有些特殊

## 一个探针对应了多个基因

a<-strsplit(as.character(ids[,2]), " /// ")

tmp <- mapply( cbind, ids[,1], a ) 

ID2gene <- as.data.frame( tmp )

colnames( ID2gene ) = c( "id", "gene" )

load( "./Relationship_others_gene.Rdata" )
## 下面一步就是根据gene2type去除平台中编码蛋白的基因

ID2gene$type = gene2type[ match( ID2gene[ , 2 ], gene2type[ , 1 ] ), 2 ]
dim(ID2gene)

## [1] 59255     3
save(ID2gene, file = 'ID2gene.Rdata')
load('./ID2gene.Rdata')

load('./exprSet_by_group.Rdata')
## 这一步根据ID2gene去除没有注释的探针

{

  exprSet = exprSet[ rownames(exprSet) %in% ID2gene[ , 1 ], ]

  ID2gene = ID2gene[ match(rownames(exprSet), ID2gene[ , 1 ] ), ]

}
## 因为有些探针对应同一个基因，要将exprSet，ID2gene的探针一致
## 即exprSet有的探针，ID2gene也一定有

## 即ID2gene有的探针，exprSet也一定有
dim( exprSet )

dim( ID2gene )

tail( sort( table( ID2gene[ , 2 ] ) ), n = 12L )
## 相同基因的表达数据取最大值，五万多个探针，这一步相对会运行较长时间

{

  MAX = by( exprSet, ID2gene[ , 2 ], 

              function(x) rownames(x)[ which.max( rowMeans(x) ) ] )

  MAX = as.character(MAX)

  exprSet = exprSet[ rownames(exprSet) %in% MAX , ]

  rownames( exprSet ) = ID2gene[ match( rownames( exprSet ), ID2gene[ , 1 ] ), 2 ]

}

exprSet = log(exprSet)

dim(exprSet)

exprSet[1:5,1:5]

save(exprSet, group_list, file = 'final_exprSet.Rdata')

进行上诉操作后，基本我们就得到了之后要进行差异分析的数据集了，我们可以先通过聚类分析看一下数据的聚类情况，是否与分组一致

library( "ggfortify" ) ## 聚类 { colnames( exprSet ) = paste( group_list, 1:ncol( exprSet ), sep = '_' ) nodePar <- list( lab.cex = 0.3, pch = c( NA, 19 ), cex = 0.3, col = "red" ) hc = hclust( dist( t( exprSet ) ) ) png('hclust.png', res = 250, height = 1800) plot( as.dendrogram( hc ), nodePar = nodePar, horiz = TRUE ) dev.off() } ## 样本过多，图就不放了

画个PCA图看一下，基本是分开的，少数样本界限不清。

如果样本量过少，对于主成分分析结果与分组不一致的样本，可以选择舍去

## PCA data = as.data.frame( t( exprSet ) ) data$group = group_list png( 'pca_plot.png', res=80 ) autoplot( prcomp( data[ , 1:( ncol( data ) - 1 ) ] ), data = data, colour = 'group', label =T, frame = T) + theme_bw() dev.off()

第五步差异分析-limma包

load( "./final_exprSet.Rdata" )

library( "limma" ) { design <- model.matrix( ~0 + factor( group_list ) ) colnames( design ) = levels( factor( group_list ) ) rownames( design ) = colnames( exprSet ) } design


contrast.matrix <- makeContrasts( "gbm-normal", levels = design )

contrast.matrix
## design和contrast.matrix都是为了下面的差异分析制作的

load( "./ID2gene.Rdata" ) { fit <- lmFit( exprSet, design ) fit2 <- contrasts.fit( fit, contrast.matrix ) fit2 <- eBayes( fit2 ) nrDEG = topTable( fit2, coef = 1, n = Inf ) write.table( nrDEG, file = "nrDEG.out") } head(nrDEG)

到这里差异分析就结束了，我们画个热图看一下效果

library( "pheatmap" ) { choose_gene = head( rownames( nrDEG ), 50 ) choose_matrix = exprSet[ choose_gene, ] choose_matrix = t( scale( t( exprSet ) ) ) annotation_col = data.frame( CellType = factor( group_list ) ) rownames( annotation_col ) = colnames( exprSet ) pheatmap( fontsize = 2, choose_matrix, annotation_col = annotation_col, show_rownames = F, annotation_legend = F, filename = "heatmap.png") }

画个火山图看一下

library( "ggplot2" ) logFC_cutoff <- with( nrDEG, mean( abs( logFC ) ) + 2 * sd( abs( logFC ) ) ) logFC_cutoff logFC_cutoff = 1 ## 文章中设置为1


{

  nrDEG$change = as.factor( ifelse( nrDEG$P.Value < 0.05 & abs(nrDEG$logFC) > logFC_cutoff,

                                  ifelse( nrDEG$logFC > logFC_cutoff , 'UP', 'DOWN' ), 'NOT' ) )
save( nrDEG, file = "nrDEG.Rdata" )
this_tile <- paste0( 'Cutoff for logFC is ', round( logFC_cutoff, 3 ),

                       '\nThe number of up gene is ', nrow(nrDEG[ nrDEG$change =='UP', ] ),

                       '\nThe number of down gene is ', nrow(nrDEG[ nrDEG$change =='DOWN', ] ) )

volcano = ggplot(data = nrDEG, aes( x = logFC, y = -log10(P.Value), color = change)) + geom_point( alpha = 0.4, size = 1.75) + theme_set( theme_set( theme_bw( base_size = 15 ) ) ) + xlab( "log2 fold change" ) + ylab( "-log10 p-value" ) + ggtitle( this_tile ) + theme( plot.title = element_text( size = 15, hjust = 0.5)) + scale_colour_manual( values = c('blue','black','red') ) print( volcano ) ggsave( volcano, filename = 'volcano.png' ) dev.off() }

第六步表达显著基因在不同肿瘤中的表达量

作者挑出PVT1 CYTOR HAR1A MIAT这四个基因，画出他们在不同的肿瘤中的表达量

library( "ggstatsplot" ) load( './final_exprSet.Rdata' ) special_gene = c( 'PVT1', 'LINC00152', 'HAR1A', 'MIAT' )


## CYTOR的别名是LINC00152

for( gene in special_gene ){ filename <- paste( gene, '.png', sep = '' ) TMP = exprSet[ rownames( exprSet ) == gene, ] data = as.data.frame(TMP) data$group = group_list p <- ggbetweenstats(data = data, x = group, y = TMP ) ggsave( p, filename = filename) }