单细胞转录组课程笔记
最近参加了群主的单细胞转录组课程,根据指示从GIthub下载单细胞转录组表达矩阵数据,https://github.com/jmzeng1314/scRNA_smart_seq2
课程说明在:https://mp.weixin.qq.com/s/AV2uTbsvJGBRq_zv7yDmNg
按流程一步一步来学习
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager") ##判断是否存在BiocManager包,不存在的话安装
library(BiocManager)
BiocManager::install("TxDb.Hsapiens.UCSC.hg38.knownGene", version = "3.8") #下载包(人类已知基因信息)
BiocManager::install(c('org.Mm.eg.db','scran'))
options()$repos ## 查看使用install.packages安装时的默认镜像
options()$BioC_mirror ##查看使用bioconductor的默认镜像
options(BioC_mirror="https://mirrors.ustc.edu.cn/bioc/") ##指定镜像,这个是中国科技大学镜像
options("repos" = c(CRAN="https://mirrors.tuna.tsinghua.edu.cn/CRAN/")) ##指定install.packages安装镜像,这个是清华镜像
options()$repos
options()$BioC_mirror
BiocManager::install(c( 'scran'))
if(F){
a=read.table('../GSE111229_Mammary_Tumor_fibroblasts_768samples_rawCounts.txt.gz',
header = T ,sep = '\t') ##把表达矩阵文件载入R,header=T :保留文件头部信息,seq='\t'以tap为分隔符
# 每次都要检测数据
a[1:4,1:4]
## 读取RNA-seq定量结果,表达矩阵需要进行简单的过滤
dat=a[apply(a,1, function(x) sum(x>1) > floor(ncol(a)/50)),] #筛选表达量合格的行,列数不变
dat[1:4,1:4]
dat=log2(edgeR::cpm(dat)+1) ##CPM(count-per-million,每百万碱基中每个转录本的count值)
###CPM只对read count相对总reads数做了数量的均一化。
dat[1:4,1:4]
dat[1:4,1:4]
exprSet=dat
mean_per_gene <- apply(exprSet, 1, mean, na.rm = TRUE) #对表达矩阵每行求均值
sd_per_gene <- apply(exprSet, 1, sd, na.rm = TRUE) #对表达矩阵每行求标准差
mad_perl_gene <- apply(exprSet, 1, mad, na.rm = TRUE) #对表达矩阵每行求绝对中位差
cv_per_gene <- data.frame(mean = mean_per_gene,
sd = sd_per_gene,
mad=mad_perl_gene,
cv = sd_per_gene/mean_per_gene)
rownames(cv_per_gene) <- rownames(exprSet)
head(cv_per_gene)
# pairs(cv_per_gene)
with(cv_per_gene,plot(log10(mean),log10(cv)))
和文章的图相比,图有点丑,而且趋势也不对
仔细看下y轴是log10(cv),而文章中的图y轴是log10(cv2),所有改成cv平方,执行下面代码看看。
#画图
with(cv_per_gene,plot(log10(mean),log10(cv^2)))
发现趋势没什么变化,还是不对,只是y轴有区别,仔细看文章发现画图使用的数据不对,文章中用的是,我们导入RPKM数据来试试。
a=read.table('../GSE111229_Mammary_Tumor_fibroblasts_768samples_rpkmNormalized.txt.gz',
header = T ,sep = '\t') ##把表达矩阵文件载入R,header=T :保留文件头部信息,seq='\t'以tap为分隔符
a[1:4,1:4]
exprSet=a
mean_per_gene <- apply(exprSet, 1, mean, na.rm = TRUE) #对表达矩阵每行求均值
sd_per_gene <- apply(exprSet, 1, sd, na.rm = TRUE) #对表达矩阵每行求标准差
mad_perl_gene <- apply(exprSet, 1, mad, na.rm = TRUE) #对表达矩阵每行求绝对中位差
cv_per_gene <- data.frame(mean = mean_per_gene,
sd = sd_per_gene,
mad=mad_perl_gene,
cv = sd_per_gene/mean_per_gene)
rownames(cv_per_gene) <- rownames(exprSet)
head(cv_per_gene)
library(ggplot2)
library(magrittr)
library(ggpubr)
x<- with(cv_per_gene,log10(cv_per_gene$mean))
y<- with(cv_per_gene,log10(cv_per_gene$cv ^2))
z <-as.data.frame(cbind(x,y))
names(z) <- c("RPKM","cv2")
#画图
with(cv_per_gene,plot(log10(mean),log10(cv^2)))
图的趋势对了,但比较丑,我们来调一下,按照群里教的使用ggpubr包来画图
x<- with(cv_per_gene,log10(cv_per_gene$mean))
y<- with(cv_per_gene,log10(cv_per_gene$cv ^2))
z <-as.data.frame(cbind(x,y))
names(z) <- c("RPKM","cv2")
ggscatter(z, x = "RPKM", y = "cv2",
color = "black", shape = 16, size = 1,
xlab = 'log10(mean)RPKM', ylab = "log10(cv^2)")
ggscatter(z, x = "RPKM", y = "cv2",
color = "black", shape = 16, size = 1, # Points color, shape and size
xlab = 'log10(mean)RPKM', ylab = "log10(cv^2)",
add = "loess", #添加拟合曲线
add.params = list(color = "red",fill = "lightgray"),
cor.coeff.args = list(method = "spearman"), label.x = 3,label.sep = "\n",
dot.size = 2,
ylim=c(-0.5, 3),
xlim=c(0,4)
)
虽然和文章的图还有点不太一样,但总算有几分神似了,起码趋势是对的。
■ ■ ■