转录组数据的基因表达变化情况探索

一般来说可以用CV或者MAD来衡量某基因在某些样本的表达变化情况。

标准差与平均数的比值称为变异系数,记为C.V(Coefficient of Variance)。 变异系数又称“标准差率”,是衡量资料中各观测值变异程度的另一个统计量。 当进行两个或多个资料变异程度的比较时,如果度量单位与平均数相同,可以直接利用标准差来比较。

平均绝对误差(Mean Absolute Deviation),又叫平均绝对离差,它是是所有单个观测值与算术平均值的偏差绝对值的平均

用下面的代码可以看看,标准差,平均数,变异系数, 平均绝对误差的关系,出图如下:

代码如下:

1library(airway)
2library(edgeR)
3library(DESeq2)
4
5data(airway)
6airway
7exprSet=assay(airway)
8geneLists=rownames(exprSet)
9keepGene=rowSums(cpm(exprSet)>0) >=2
10table(keepGene);dim(exprSet)
11dim(exprSet[keepGene,])
12exprSet=exprSet[keepGene,]
13rownames(exprSet)=geneLists[keepGene]
14
15boxplot(exprSet,las=2)
16# CPM normalized counts.
17exprSet=log2(cpm(exprSet)+1)
18boxplot(exprSet,las=2)
19
20mean_per_gene <- apply(exprSet, 1, mean, na.rm = TRUE)
21sd_per_gene <- apply(exprSet, 1, sd, na.rm = TRUE)
22mad_perl_gene <-   apply(exprSet, 1, mad, na.rm = TRUE)
23cv_per_gene <- data.frame(mean = mean_per_gene,
24                          sd = sd_per_gene,
25                          mad=mad_perl_gene,
26                          cv = sd_per_gene/mean_per_gene)
27rownames(cv_per_gene) <- rownames(exprSet)
28pairs(cv_per_gene)

很明显,这个CV可以衡量某基因的表达变化情况,但是没办法在基因与基因之间比较,因为不同基因的CV不同,大部分情况是因为它们的平均表达量不同而已。

根据表达量对CV值进行校正

1# https://jdblischak.github.io/singleCellSeq/analysis/cv-adjusted-wo-19098-r2.html
2library(zoo)
3# Compute a data-wide coefficient of variation on CPM normalized counts.
4cv <- apply(2^exprSet, 1, sd)/apply(2^exprSet, 1, mean)
5# Order of genes by mean expression levels
6order_gene <- order(apply(2^exprSet, 1, mean))
7# Rolling medians of log10 squared CV by mean expression levels
8roll_medians <- rollapply(log10(cv^2)[order_gene], width = 50, by = 25,
9                          FUN = median, fill = list("extend", "extend", "NA") )
10## then change the NA values in the roll_medians
11table(is.na(roll_medians))
12ii_na <- which( is.na(roll_medians) )
13roll_medians[ii_na] <- median( log10(cv^2)[order_gene][ii_na] )
14names(roll_medians) <- rownames(exprSet)[order_gene]
15
16
17# re-order rolling medians according to the expression matrix
18roll_medians <- roll_medians[  match(rownames(exprSet), names(roll_medians) ) ]
19stopifnot( all.equal(names(roll_medians), rownames(exprSet) ) )
20
21
22# adjusted coefficient of variation on log10 scale
23log10cv2_adj <-  log10( cv^2) - roll_medians
24plot(log10cv2_adj,mean_per_gene)
25#install.packages("basicTrendline")
26library(basicTrendline)
27trendline(log10cv2_adj,mean_per_gene,model="line2P")

出图如下:

可以看到这个校正后的cv已经是几乎不受基因表达量的影响了,所以可以比较不同基因的表达变化情况啦。

根据基因长度对CV进行校正

先去gencode数据库找到gtf文件,对每个基因计算外显子长度之和作为基因的长度,代码如下;

1## First, wecomputed gene lengths by taking the union of all exons within a gene based on the Ensembl annotation.
2
3#  cat ~/reference/gtf/gencode/gencode.v25.annotation.gtf|grep -v PAR_Y |perl -alne '{next if /^#/;if($F[2] eq "gene"){/(ENSG\d+)/;print $1} }'|sort |uniq -c |grep -w 2
4# cat ~/reference/gtf/gencode/gencode.v25.annotation.gtf |grep -v PAR_Y |perl -alne '{next if /^#/;if($F[2] eq "gene"){/(ENSG\d+)/;$gene=$1;undef %h} if($F[2] eq "exon"){$key="$F[3]\t$F[4]";$len=$F[4]-$F[3];$c{$gene}+=$len unless exists $h{$key};$h{$key}++} }END{print "$_\t$c{$_}" foreach keys %c}' >>human_ENSG_length
5# grep ENSG00000237094  ~/reference/gtf/gencode/gencode.v25.annotation.gtf|grep -w exon |cut -f 4-5|sort -u |awk '{print $2-$1}'|paste -sd+ - | bc
6# grep ENSG00000237094 human_ENSG_length
7#  cat gencode.vM12.annotation.gtf |perl -alne '{next if /^#/;if($F[2] eq "gene"){/(ENSMUSG\d+)/;$gene=$1;undef %h} if($F[2] eq "exon"){$key="$F[3]\t$F[4]";$len=$F[4]-$F[3];$c{$gene}+=$len unless exists $h{$key};$h{$key}++} }END{print "$_\t$c{$_}" foreach keys %c}' >mouse_ENSG_length

得到的长度文件如下:

1               V1    V2
21 ENSG00000252040   131
32 ENSG00000251770    82
43 ENSG00000261028   856
54 ENSG00000186844   421
65 ENSG00000234241  1682
76 ENSG00000144815 15589
1gen_l=read.table('human_ENSG_length',stringsAsFactors = F)
2head(gen_l)
3length_per_gene=gen_l[match(rownames(exprSet),gen_l[,1]),2]
4mean_per_gene <- apply(exprSet, 1, mean, na.rm = TRUE)
5sd_per_gene <- apply(exprSet, 1, sd, na.rm = TRUE)
6mad_perl_gene <-   apply(exprSet, 1, mad, na.rm = TRUE)
7cv_per_gene <- data.frame(mean = mean_per_gene,
8                          sd = sd_per_gene,
9                          mad=mad_perl_gene,
10                          len=log2(length_per_gene),
11                          cv = sd_per_gene/mean_per_gene)
12rownames(cv_per_gene) <- rownames(exprSet)
13cv_per_gene=na.omit(cv_per_gene)
14cor(cv_per_gene)
15pairs(cv_per_gene)
16
17fivenum(cv_per_gene$mean)
18## 假如去除低表达量基因
19cv_per_gene=cv_per_gene[cv_per_gene$mean < fivenum(cv_per_gene$mean)[2],]
20pairs(cv_per_gene)

出图如下:

可以看到基因长度的确是影响着CV值,而且并不独立于表达量,所以还是需要去除这个因素。

可以使用校正表达量的代码来校正长度:

1library(zoo)
2table(rownames(exprSet) %in% gen_l[,1])
3exprSet=exprSet[rownames(exprSet) %in% gen_l[,1],]
4cv <- apply(2^exprSet, 1, sd)/apply(2^exprSet, 1, mean)
5## firstly for mean values of exprSet
6order_gene <- order(apply(2^exprSet, 1, mean))
7roll_medians_mean <- rollapply(log10(cv^2)[order_gene], width = 50, by = 25,
8                          FUN = median, fill = list("extend", "extend", "NA") )
9## then change the NA values in the roll_medians_mean
10table(is.na(roll_medians_mean))
11ii_na <- which( is.na(roll_medians_mean) )
12roll_medians_mean[ii_na] <- median( log10(cv^2)[order_gene][ii_na] )
13names(roll_medians_mean) <- rownames(exprSet)[order_gene]
14roll_medians_mean <- roll_medians_mean[  match(rownames(exprSet), names(roll_medians_mean) ) ]
15stopifnot( all.equal(names(roll_medians_mean), rownames(exprSet) ) )
16log10cv2_adj <-  log10( cv^2) - roll_medians_mean
17
18mean_per_gene <- apply(exprSet, 1, mean, na.rm = TRUE)
19plot( log10( cv^2) ,mean_per_gene)
20plot(log10cv2_adj,mean_per_gene)
21length_per_gene=gen_l[match(rownames(exprSet),gen_l[,1]),2]
22
23## Then for gene length(log10)
24order_gene <- order( log10(length_per_gene) )
25cv=log10cv2_adj
26roll_medians_length <- rollapply(cv[order_gene], width = 50, by = 25,
27                               FUN = median, fill = list("extend", "extend", "NA") )
28## then change the NA values in the roll_medians_length
29table(is.na(roll_medians_length))
30ii_na <- which( is.na(roll_medians_length) )
31roll_medians_length[ii_na] <- median( cv[order_gene][ii_na] )
32names(roll_medians_length) <- rownames(exprSet)[order_gene]
33roll_medians_length <- roll_medians_length[  match(rownames(exprSet), names(roll_medians_length) ) ]
34stopifnot( all.equal(names(roll_medians_length), rownames(exprSet) ) )
35log10cv2_adj <-  cv -  roll_medians_length
36plot( log10( cv^2) ,log10(length_per_gene))
37plot(log10cv2_adj,log10(length_per_gene))
38plot(log10cv2_adj,log10(mean_per_gene))
39
40#install.packages("basicTrendline")
41library(basicTrendline)
42par(mfrow=c(1,2))
43trendline(log10(length_per_gene),log10cv2_adj,model="line2P")
44trendline(mean_per_gene,log10cv2_adj,model="line2P")

完全去除后,校正如下:

可以看到跟文章里面的非常 接近了,校正两次后的CV值,就是 DM值

这个计算公式参考: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4595712/#mmc1

(0)

相关推荐