使用作者代码重复结果 / 开普饭

课程笔记

粉丝：有单细胞线上课程吗？

小编：什么？我们的单细胞转录组分析线上课程已经上线好久了，你们竟然都不知道吗？每篇推文后面的课程推荐没人看的吗？小编已哭晕在厕所。

好了，戏演完了，下面郑重介绍下我们的单细胞线上课程：（详情戳下方链接）

全网第二个单细胞视频课程预售

这个课程笔记栏目记录了学员们学习单细胞转录组课程的学习笔记

希望大家能有所收获！

序言

第三单元第十二+十三讲：使用作者代码重复结果
课程链接在：http://jm.grazy.cn/index/mulitcourse/detail.html?cid=53

这一篇会是代码密集型，因为原文作者的代码真的写得很长！

下载作者的Github

https://github.com/KPLab/SCS_CAF

文件布局如下：

下载好以后，需要将那两个tar.gz文件解压缩

看第二个R脚本 Processing.R

读入表达量数据

1# 首先指定操作路径 2Path_Main<-"~/scrna/SCS_CAF-master" 3# 然后读入原始的第一个细胞板数据 4plate1_raw<-read.delim(paste(Path_Main,"/SS2_15_0048/counts.tab",sep=""),header=TRUE,check.names=FALSE,sep="\t") 5> plate1_raw[1:3,1:3] 6 gene A3 A6 71 Adora1 0 0 82 Sntg1 0 0 93 Prim2 0 0

作者这里考虑到重复基因名的问题

1# 的确存在重复基因名 2> length(as.character(plate1_raw$gene)) 3[1] 24490 4> length(unique(as.character(plate1_raw$gene))) 5[1] 23998 6> sum(duplicated(as.character(plate1_raw$gene))) 7[1] 492 8# 用这个查看 9as.character(plate1_raw$gene)[duplicated(as.character(plate1_raw$gene))] 10 11# 看一下make.unique的用法 12> make.unique(c("a", "a")) 13[1] "a" "a.1" 14# 将重复基因名变为唯一的名字 15plate1_raw$gene<-make.unique(as.character(plate1_raw$gene)) 16> sum(duplicated(as.character(plate1_raw$gene))) 17[1] 0 18# 对样本重新命名 19colnames(plate1_raw)[2:length(colnames(plate1_raw))]<-paste("SS2_15_0048_",colnames(plate1_raw)[2:length(colnames(plate1_raw))],sep="")

同样的，对于0049板，也是上述操作，最后将它们按照gene这一列进行合并，并把gene转为行名

1expr_raw<-merge(plate1_raw,plate2_raw,by="gene",all=TRUE) 2rownames(expr_raw)<-as.character(expr_raw$gene) 3expr_raw$gene<-NULL

最后计算一下dropout的比例（结果有点高）：

1# 计算dropout的比例 2sum(expr_raw==0)/(dim(expr_raw)[1]*dim(expr_raw)[2]) 3# 0.8305757

读入ERCC数据

1# 也是类似上面👆的操作 2# plate1 3plate1_raw_ercc<-read.delim(paste(Path_Main,"/SS2_15_0048/counts-ercc.tab",sep=""),header=TRUE,check.names=FALSE,sep="\t") 4plate1_raw_ercc$gene<-make.unique(as.character(plate1_raw_ercc$gene)) 5colnames(plate1_raw_ercc)[2:length(colnames(plate1_raw_ercc))]<-paste("SS2_15_0048_",colnames(plate1_raw_ercc)[2:length(colnames(plate1_raw_ercc))],sep="") 6 7# plate2 8plate2_raw_ercc<-read.delim(paste(Path_Main,"/SS2_15_0049/counts-ercc.tab",sep=""),header=TRUE,check.names=FALSE,sep="\t") 9plate2_raw_ercc$gene<-make.unique(as.character(plate2_raw_ercc$gene)) 10colnames(plate2_raw_ercc)[2:length(colnames(plate2_raw_ercc))]<-paste("SS2_15_0049_",colnames(plate2_raw_ercc)[2:length(colnames(plate2_raw_ercc))],sep="") 11 12# 最后合并、计算ERCC dropout 13expr_raw_ercc<-merge(plate1_raw_ercc,plate2_raw_ercc,by="gene",all=TRUE) 14rownames(expr_raw_ercc)<-as.character(expr_raw_ercc$gene) 15expr_raw_ercc$gene<-NULL 16 17sum(expr_raw_ercc==0)/(dim(expr_raw_ercc)[1]*dim(expr_raw_ercc)[2]) 18# 0.6267691

看一下ERCC在各个细胞的表达量分布：

1barplot(sort(as.numeric(colSums(expr_raw_ercc)),decreasing=TRUE),ylab="SPIKE LIBRARY SIZE",xlab="CELL INDEX")

然后对每个细胞的ERCC文库大小（取log2后）做一个直方图，看它的分布（breaks=20，即大约分成20个bin）：

1hist(log2(as.numeric(colSums(expr_raw_ercc))+1),col="brown", 2 main="Distribution of Spike Library Sizes", 3 xlab="Spike Library Size",breaks=20)

将内源基因与ERCC spike-in合并

先看看分别有多少：

1> print(paste0("There are ",nrow(expr_raw)," endogenous genes")) 2[1] "There are 24490 endogenous genes" 3> print(paste0("There are ",nrow(expr_raw_ercc)," spikes")) 4[1] "There are 92 spikes"

合并起来：

1all.counts.raw<-rbind(expr_raw,expr_raw_ercc) 2> dim(all.counts.raw) 3[1] 24582 768

然后重新计算dropout的比例：

1sum(all.counts.raw==0)/(dim(all.counts.raw)[1]*dim(all.counts.raw)[2]) 2# 0.8298129

一共有7153个基因在所有细胞中表达量均为0：

1dim(all.counts.raw[rowSums(all.counts.raw)==0,]) 2# 7153 768

关于原文去掉的52个细胞

根据一些指标去掉了52个细胞

作者也把这52个细胞的质控结果读入了R：

1cell_QC<-read.delim(paste(Path_Main,"/qc/qc_2plates.filtered_cells.txt",sep=""),row.names=1,header=TRUE,sep="\t") 2> dim(cell_QC) 3[1] 52 6

在原始矩阵中也要去掉这些细胞：

1rownames(cell_QC)<-gsub("__","_",rownames(cell_QC)) 2 3all.counts.raw<-subset(all.counts.raw,select=colnames(all.counts.raw)[!colnames(all.counts.raw)%in%rownames(cell_QC)]) 4> dim(all.counts.raw) 5[1] 24582 716

过滤细胞后，重新拆分成count矩阵和ERCC矩阵：

1# 得到原始count矩阵 2expr_raw<-subset(expr_raw,select=colnames(expr_raw)[!colnames(expr_raw)%in%rownames(cell_QC)]) 3# 得到ERCC矩阵 4expr_raw_ercc<-subset(expr_raw_ercc,select=colnames(expr_raw_ercc)[!colnames(expr_raw_ercc)%in%rownames(cell_QC)])

分别对count矩阵和ERCC矩阵过滤

1all.counts.raw<-all.counts.raw[rowMeans(all.counts.raw)>0,] 2expr_raw<-expr_raw[rowMeans(expr_raw)>=1,] 3# count矩阵过滤后只剩下10835个基因 4expr_raw_ercc<-expr_raw_ercc[rowMeans(expr_raw_ercc)>0,] 5# ERCC也由原来的92个变成了89个

然后画CV vs Mean图

1library("matrixStats") 2# 首先还是计算CV值 3mean_expr_raw<-as.numeric(rowMeans(expr_raw,na.rm=TRUE)) 4sd_expr_raw<-rowSds(as.matrix(expr_raw),na.rm=TRUE) 5cv_squared_expr_raw<-(sd_expr_raw/mean_expr_raw)^2 6# plot函数中使用(纵坐标~横坐标) 7plot(log10(cv_squared_expr_raw)~log10(mean_expr_raw), 8 pch=20,cex=0.5,xlab="log10 ( mean raw count )", 9 ylab="log10 ( CV^2)",main="RAW COUNTS") 10# 接下来添加ERCC的信息(画上红点) 11mean_expr_raw_ercc<-as.numeric(rowMeans(expr_raw_ercc,na.rm=TRUE)) 12sd_expr_raw_ercc<-rowSds(as.matrix(expr_raw_ercc),na.rm=TRUE) 13cv_squared_expr_raw_ercc<-(sd_expr_raw_ercc/mean_expr_raw_ercc)^2 14points(log10(cv_squared_expr_raw_ercc)~log10(mean_expr_raw_ercc),col="red",pch=20,cex=1.5) 15# 然后对ERCC添加loess拟合曲线 16fit_expr_raw_ercc<-loess(log10(cv_squared_expr_raw_ercc)[is.finite(log10(mean_expr_raw_ercc))]~log10(mean_expr_raw_ercc)[is.finite(log10(mean_expr_raw_ercc))],span=1) 17# 从小到大排个序 18j<-order(log10(mean_expr_raw_ercc)[is.finite(log10(mean_expr_raw_ercc))]) 19# 20lines(fit_expr_raw_ercc$fitted[j]~log10(mean_expr_raw_ercc)[is.finite(log10(mean_expr_raw_ercc))][j],col="red",lwd=3)

又根据ERCC的拟合结果，对每个内源基因按其平均表达量预测出期望的CV²，然后只保留实际CV²不低于该期望值的基因（即变异程度高于技术噪音的基因），最后只留下5316个基因：

1pred_expr_raw<-predict(fit_expr_raw_ercc,log10(mean_expr_raw)) 2filtered_expr_raw<-expr_raw[log10(cv_squared_expr_raw)>=pred_expr_raw,] 3filtered_expr_raw<-filtered_expr_raw[grepl("NA",rownames(filtered_expr_raw))==FALSE,] 4> dim(filtered_expr_raw) 5[1] 5316 716

可以看到，作者的过滤过程是：从原来的24490个基因开始，先过滤掉低表达（平均count小于1）的基因，剩10835个；然后又要求CV²不低于ERCC拟合的期望值，剩5316个。最后就是拿这5000多个基因做下游分析

看第三个R脚本 Dimensionality_reduction.R

这个脚本需要RPKM结果，因此需要先跑完上面第二个完整的脚本

降维主要使用tSNE，聚类使用dbscan（它的作用和hclust或者kmeans差不多）

上来先跑50次tSNE：

1library(Rtsne) 2N_tsne <- 50 3tsne_out <- list(length = N_tsne) 4KL <- vector(length = N_tsne) 5set.seed(1234) 6for(k in 1:N_tsne) 7{ 8 tsne_out[[k]]<-Rtsne(t(log10(RPKM+1)),initial_dims=30,verbose=FALSE,check_duplicates=FALSE, 9 perplexity=27, dims=2,max_iter=5000) 10 KL[k]<-tail(tsne_out[[k]]$itercosts,1) 11 print(paste0("FINISHED ",k," TSNE ITERATION")) 12} 13names(KL) <- c(1:N_tsne) 14# 可以看到这里选择最小的KL作为50次中效果最优的tSNE，然后主要关注tsne结果的itercosts 15opt_tsne <- tsne_out[[as.numeric(names(KL)[KL==min(KL)])]]$Y 16opt_tsne_full<-tsne_out[[as.numeric(names(KL)[KL==min(KL)])]] 17save(tsne_out,opt_tsne,opt_tsne_full,file="step3-tsne-out.Rdata")

然后使用dbscan聚类：

1library(dbscan) 2plot(opt_tsne, col=dbscan(opt_tsne,eps=3.1)$cluster, pch=19, xlab="tSNE dim 1", ylab="tSNE dim 2")

如果使用kmeans方法：

1plot(opt_tsne, col=kmeans(opt_tsne,centers = 4)$clust, pch=19, xlab="tSNE dim 1", ylab="tSNE dim 2")

看看这两种聚类方法的相关性:

1> table(kmeans(opt_tsne,centers = 4)$clust,dbscan(opt_tsne,eps=3.1)$cluster) 2 3 0 1 2 3 4 4 1 1 226 0 0 0 5 2 0 0 144 0 0 6 3 0 27 0 0 44 7 4 0 231 0 43 0 8# 左侧是kmeans，上方是dbscan。

发现有一个点是离群值，所以把它放到细胞数量最多的那个组：

1library(dbscan) 2CAFgroups<-dbscan(opt_tsne,eps=3.1)$cluster 3CAFgroups_full<-dbscan(opt_tsne,eps=3.1) 4CAFgroups[CAFgroups==0]<-1 5CAFgroups_full$cluster[CAFgroups_full$cluster==0]<-1 6plot(opt_tsne, col=CAFgroups, pch=19, xlab="tSNE dim 1", ylab="tSNE dim 2")

(补充)除了tSNE，还可以用PCA进行可视化：

1CAFgroups<-dbscan(opt_tsne,eps=3.1)$cluster 2CAFgroups_full<-dbscan(opt_tsne,eps=3.1) 3CAFgroups[CAFgroups==0]<-1 4CAFgroups_full$cluster[CAFgroups_full$cluster==0]<-1 5 6RPKM.PCA<-prcomp(log2(t(RPKM)+1), center=TRUE) 7plot(RPKM.PCA$x,main="first PCA", pch=19, col=CAFgroups)

其实有了上面这个tSNE聚类图，我们就能把基因的表达量映射上去，很像Seurat的FeaturePlot()做的那样。但是这里作者自己创造函数（参考第五个脚本：Plotting.R）

需要用到基因名、表达量矩阵、tsne坐标

1plot.feature2<-function(gene, tsne.output=tsne.out, DATAuse=DATA){ 2 plot.frame<-data.frame(x=tsne.output$Y[,1], y=tsne.output$Y[,2], log2expr=as.numeric(log2(DATAuse[gene,]+1))) 3 p<-ggplot(plot.frame,aes(x=x, y=y, col=log2expr))+ 4 geom_point(size=1) + 5 labs(title=paste(gene))+ 6 theme_classic()+ 7 scale_color_gradientn(colors = c("#FFFF00", "#FFD000","#FF0000","#360101"), limits=c(0,14))+ 8 theme(axis.title = element_blank())+ 9 theme(axis.text = element_blank())+ 10 theme(axis.line = element_blank())+ 11 theme(axis.ticks = element_blank())+ 12 theme(plot.title = element_text(size=20,face="italic"))+ 13 theme(legend.title = element_blank())+ 14 theme(legend.position = "none") 15 return(p) 16} 17 18library(ggplot2) 19opt_tsne <- tsne_out[[as.numeric(names(KL)[KL==min(KL)])]]$Y 20opt_tsne_full<-tsne_out[[as.numeric(names(KL)[KL==min(KL)])]] 21load(file='RPKM.full.Rdata') 22load(file='CAFgroups.Rdata') 23plot.feature2("Pdgfra", opt_tsne_full, RPKM.full)

另外小提琴图的代码更是长：它是用来绘制不同基因的表达量在不同聚类分组的差异

需要用到基因名、表达量矩阵、聚类分组结果（即dbscan的输出，函数中通过其中的cluster得到每个细胞所属的亚群）

1plot.violin2 <- function(gene, DATAuse, tsne.popus, axis=FALSE, legend_position="none", gene_name=FALSE){ 2 testframe<-data.frame(expression=as.numeric(DATAuse[paste(gene),]), Population=tsne.popus$cluster) 3 testframe$Population <- as.factor(testframe$Population) 4 colnames(testframe)<-c("expression", "Population") 5 6 col.mean<-vector() 7 for(i in levels(testframe$Population)){ 8 col.mean<-c(col.mean,mean(testframe$expression[which(testframe$Population ==i)])) 9 } 10 col.mean<-log2(col.mean+1) 11 col.means<-vector() 12 for(i in testframe$Population){ 13 col.means<-c(col.means,col.mean[as.numeric(i)]) 14 } 15 testframe$Mean<-col.means 16 testframe$expression<-log2(testframe$expression+1) 17 18 p <- ggplot(testframe, aes(x=Population, y=expression, fill= Mean, color=Mean))+ 19 geom_violin(scale="width") + 20 labs(title=paste(gene), y ="log2(expression)", x="Population")+ 21 theme_classic() + 22 23 scale_color_gradientn(colors = c("#FFFF00", "#FFD000","#FF0000","#360101"), limits=c(0,14))+ 24 scale_fill_gradientn(colors = c("#FFFF00", "#FFD000","#FF0000","#360101"), limits=c(0,14))+ 25 theme(axis.title.y = element_blank())+ 26 theme(axis.ticks.y = element_blank())+ 27 theme(axis.line.y = element_blank())+ 28 theme(axis.text.y = element_blank())+ 29 theme(axis.title.x = element_blank())+ 30 theme(legend.position=legend_position ) 31 32 if(axis == FALSE){ 33 p<-p+ 34 theme( axis.line.x=element_blank(), 35 axis.text.x = element_blank(), 36 axis.ticks.x = element_blank()) 37 } 38 if(gene_name == FALSE){ 39 p<-p+ theme(plot.title = element_blank()) 40 }else{ p<-p + theme(plot.title = element_text(size=10,face="bold"))} 41 p 42} 43 44# 例如 45plot.violin2(gene = "Pdgfra", DATAuse = RPKM.full, tsne.popus = CAFgroups_full)

看第四个R脚本 Differential_gene_expression.R

主要利用了ROTS包（Reproducibility-optimized test statistic），将每个亚群分别与其余几个亚群合并后的整体进行比较

差异分析重点就在：表达矩阵和分组信息

1library(ROTS) 2library(plyr) 3# 首先针对第一亚群和其他亚群比较(把其他亚群定义为234) 4groups<-CAFgroups 5groups[groups!=1]<-234 6 7ROTS_input<-RPKM.full[rowMeans(RPKM.full)>=1,] 8ROTS_input<-as.matrix(log2(ROTS_input+1)) 9# 运行代码很简单，重点就是data和group参数 10results_pop1 = ROTS(data = ROTS_input, groups = groups , B = 1000 , K = 500 , seed = 1234) 11# 最后根据FDR值得到第一组和其他组比较的差异基因 12summary_pop1<-data.frame(summary(results_pop1, fdr=1)) 13head(summary_pop1) 14## Row ROTS.statistic pvalue FDR 15## Rgs5 8345 -19.89479 0 0 16## Higd1b 4559 -17.49991 0 0 17## Abcc9 393 -16.44638 0 0 18## Pdpn 7193 16.02262 0 0 19## Fbln2 3635 15.80534 0 0 20## Rgs4 8344 -15.62123 0 0 21 22# 同理，对第2组可以与1、3、4合并组比较；对第3组可以和第1、2、4组比较；对第4组可以和第1、2、3组比较 23# 都得到以后，共同保存 24save(summary_pop1,summary_pop2,summary_pop3,summary_pop4, 25 file = 'ROTS_summary_pop.Rdata')

每个亚群可以挑top18基因绘制热图

1population_subset<-c(rownames(summary_pop1[summary_pop1$ROTS.statistic<0,])[1:18],rownames(summary_pop2[summary_pop2$ROTS.statistic<0,])[1:18],rownames(summary_pop3[summary_pop3$ROTS.statistic<0,])[1:18],rownames(summary_pop4[summary_pop4$ROTS.statistic<0,])[1:18]) 2RPKM_heatmap<-RPKM.full[population_subset,] 3 4RPKM_heatmap<-RPKM_heatmap[,order(CAFgroups_full$cluster)] 5RPKM_heatmap<-log2(RPKM_heatmap+1) 6 7popul.col<-sort(CAFgroups_full$cluster) 8popul.col<-replace(popul.col, popul.col==1,"#1C86EE" ) 9popul.col<-replace(popul.col, popul.col==2,"#00EE00" ) 10popul.col<-replace(popul.col, popul.col==3,"#FF9912" ) 11popul.col<-replace(popul.col, popul.col==4,"#FF3E96" ) 12library(gplots) 13 14#pdf("heatmap_genes_population.pdf") 15heatmap.2(as.matrix(RPKM_heatmap),ColSideColors = as.character(popul.col), tracecol = NA, dendrogram = "none",col=bluered, labCol = FALSE, scale="none", key = TRUE, symkey = F, symm=F, key.xlab = "", key.ylab = "", density.info = "density", key.title = "log2(RPKM+1)", keysize = 1.2, denscol="black", Colv=FALSE)

当然，原文还使用了其他几种差异分析方法，放在这里，可以做日后参考

1################################ 2####### 第一种：DESeq2 ########### 3################################ 4library("scran") 5library("limSolve") 6library(scater) 7library(DESeq2) 8ann<-data.frame(Plate = factor(unlist(lapply(strsplit(colnames(RPKM.full),"_"),function(x) x[3]))), Population = factor(gsub("(3|4)","2",as.character(CAFgroups)),levels=c("1","2"))) 9ann<-data.frame(Population = factor(gsub("(3|4)","2",as.character(CAFgroups)),levels=c("1","2"))) 10rownames(ann)<-colnames(RPKM.full) 11 12ddsFullCountTable <- DESeqDataSetFromMatrix( 13 countData = all.counts.raw[rownames(RPKM.full),], 14 colData = ann, 15 design = ~ Population) 16 17ddsFullCountTable<-DESeq(ddsFullCountTable) 18DESeq_result<-results(ddsFullCountTable) 19DESeq_result<-DESeq_result[order(DESeq_result$padj, DESeq_result$pvalue),] 20head(DESeq_result,30) 21write.table(DESeq_result[grep("ERCC", rownames(DESeq_result), invert=TRUE),], "DESeq_result.txt", col.names = TRUE, row.names = TRUE, quote = FALSE, sep="\t") 22 23################################ 24####### 第二种：EdgeR ############ 25################################ 26library("edgeR") 27edgeR_Data<-DGEList(counts=all.counts.raw[rownames(RPKM.full),], group=ann$Population) 28edgeR_Data<-estimateCommonDisp(edgeR_Data) 29edgeR_Data<-estimateTagwiseDisp(edgeR_Data) 30edgeR_result<-exactTest(edgeR_Data) 31edgeR_result_table<-edgeR_result$table 32edgeR_result_table<-edgeR_result_table[order(edgeR_result_table$PValue),] 33edgeR_result_table<-edgeR_result_table[grep("ERCC", rownames(edgeR_result_table), invert=TRUE),] 34write.table(edgeR_result_table, "EdgeR_result.txt", col.names = TRUE, row.names = TRUE, quote=FALSE, sep="\t") 35 36################################ 37###### 第三种：Wilcox ########## 38################################ 39NumPerm<-1000 40POP1_expr<-subset(RPKM.full,select=rownames(ann)[ann==1]) 41POP2_expr<-subset(RPKM.full,select=rownames(ann)[ann==2]) 42p_wilcox<-vector() 43p_t<-vector() 44p_perm<-vector() 45statistics<-vector() 
46median_POP1_expr<-vector() 47median_POP2_expr<-vector() 48a<-seq(from=0,to=length(rownames(RPKM.full)),by=1000) 49 50print("START DIFFERENTIAL GENE EXPRESSION BETWEEN POP1 AND POP2") 51for(i in 1:length(rownames(RPKM.full))) 52{ 53 p_wilcox<-append(p_wilcox,wilcox.test(as.numeric(POP1_expr[rownames(RPKM.full)[i],]),as.numeric(POP2_expr[rownames(RPKM.full)[i],]))$p.value) 54 statistics<-append(statistics,wilcox.test(as.numeric(POP1_expr[rownames(RPKM.full)[i],]),as.numeric(POP2_expr[rownames(RPKM.full)[i],]))$statistic) 55 p_t<-append(p_t,t.test(as.numeric(POP1_expr[rownames(RPKM.full)[i],]),as.numeric(POP2_expr[rownames(RPKM.full)[i],]))$p.value) 56 p_perm<-append(p_perm,PermTest_Median(as.numeric(POP1_expr[rownames(RPKM.full)[i],]),as.numeric(POP2_expr[rownames(RPKM.full)[i],]),NumPerm)) 57 median_POP1_expr<-append(median_POP1_expr,median(as.numeric(POP1_expr[rownames(RPKM.full)[i],]))) 58 median_POP2_expr<-append(median_POP2_expr,median(as.numeric(POP2_expr[rownames(RPKM.full)[i],]))) 59 if(i%in%a){print(paste("FINISHED ",i," GENES",sep=""))} 60} 61fold_change<-median_POP1_expr/median_POP2_expr 62log2_fold_change<-log2(fold_change) 63p_adj<-p.adjust(p_wilcox,method="fdr") 64output_wilcox<-data.frame(GENE=rownames(RPKM.full),POP1_EXPR=median_POP1_expr,POP234_EXPR=median_POP2_expr,FOLD_CHANGE=fold_change,LOG2FC=log2_fold_change,WILCOX_STAT=statistics,P_T_TEST=p_t,P_PERM=p_perm,P_WILCOX=p_wilcox,FDR=p_adj) 65output_wilcox<-output_wilcox[order(output_wilcox$P_PERM,output_wilcox$FDR,output_wilcox$P_WILCOX,output_wilcox$P_T_TEST,-abs(output_wilcox$LOG2FC)),] 66print(head(output_wilcox,20)) 67write.table(output_wilcox,file="Wilcox_Perm_de_results.txt",col.names=TRUE,row.names=FALSE,quote=FALSE,sep="\t") 68 69################################ 70####### 第四种：SCDE WORKFLOW 71################################ 72library("scde") 73# factor determining cell types 74sg<-factor(as.numeric(CAFgroups)) 75# the group factor should be named accordingly 76names(sg)<-colnames(expr_raw) 
77table(sg) 78# define two groups of cells 79#groups<-sg 80groups <- factor(gsub("(3|4)","2",as.character(sg)),levels=c("1","2")) 81table(groups) 82# calculate models 83 84cd<-apply(expr_raw,2,function(x) {storage.mode(x) <- 'integer'; x}) 85colnames(cd)<-colnames(expr_raw) 86o.ifm<-scde.error.models(counts=cd,groups=groups,n.cores=4,threshold.segmentation=TRUE,save.crossfit.plots=FALSE,save.model.plots=FALSE,verbose=1) 87print(head(o.ifm)) 88# filter out cells that don't show positive correlation with 89# the expected expression magnitudes (very poor fits) 90valid.cells<-o.ifm$corr.a > 0 91table(valid.cells) 92o.ifm<-o.ifm[valid.cells, ] 93# estimate gene expression prior 94o.prior<-scde.expression.prior(models=o.ifm,counts=cd,length.out=400,show.plot=FALSE) 95# run differential expression tests on all genes. 96ediff<-scde.expression.difference(o.ifm,cd,o.prior,groups=groups,n.randomizations=100,n.cores=4,verbose=1) #batch=batch 97# top upregulated genes 98ediff_order<-ediff[order(abs(ediff$Z),decreasing=TRUE), ] 99head(ediff_order,20) 100write.table(ediff_order,file="scde_de_results_1_vs_234.txt",col.names=TRUE,row.names=TRUE,quote=FALSE,sep="\t")

“

使用作者代码重复结果

序言