NC单细胞文章复现(五)：tSNE / 开普饭

我们昨日进行clustering之后，将1107个细胞分成了9个簇，今天学习tsne方面的知识。

##将unknown and undecided cells去除掉 unkund <- which(pd_norm$cell_types_cl_all %in% c("undecided", "unknown"))


#将已经再次细分好的细胞信息添加到sceset_final中，便于后续的分析

sceset_ct <- sceset_final[,-unkund]

pd_ct <- colData(sceset_ct)

mat_ct <- assays(sceset_ct)$exprs

mats_ct <- list()

pds_ct <- list()

for (i in 1:length(patients_now)) {

  mats_ct[[i]] <- mat_ct[,pd_ct$patient == patients_now[i]]

  pds_ct[[i]] <- pd_ct[pd_ct$patient == patients_now[i],]

}

names(mats_ct) <- patients_now

names(pds_ct) <- patients_now
#画6个样本的1107个细胞的细胞类型比例分布柱状图

match_celltype_levels <- c("epithelial", "stroma", "endothelial", "Tcell", "Bcell", "macrophage")

#将pd_ct转换为 tibble 类型

tbl_pd_ct <- tbl_df(pd_ct)

tbl_pd_ct <- tbl_pd_ct %>%

  group_by(patient) %>%

  mutate(cell_types_cl_all = factor(cell_types_cl_all, levels = match_celltype_levels)) %>%

  arrange(cell_types_cl_all)

  

#画Fig1.c

ggplot() +

  geom_bar(data = tbl_pd_ct, aes(x = patient, fill = factor(cell_types_cl_all)), position = position_fill(reverse = TRUE)) +

  scale_fill_manual(values = anno_colors$tsne) +

  labs(fill = "cell type", y = "fraction of cells")

Fig1.c: 可以看到，病人样本之间的细胞类型有很明显的异质性。

#先看不同病人的细胞周期比例分布情况 tbl_pd_cycle <- tbl_pd_ct %>% group_by(patient) %>% mutate(cycling_mel = factor(cycling_mel, levels = c("cycling", "non-cycling"))) %>% arrange(cycling_mel)


#画fig1d

ggplot() +

  geom_bar(data = tbl_pd_cycle, aes(x = patient, fill = factor(cycling_mel)), position = position_fill(reverse = TRUE)) +

  scale_fill_manual(values = anno_colors$cycling) +

  labs(fill = "cycling status", y = "fraction of cells")

Fig1.d:PT081样本中，cycling细胞占比（>34% ）最多。接着将细胞周期与细胞类型联系在一起。

#epithelial细胞比例 for (i in 1:length(patients_now)) { percent_epith <- length(intersect_all(which(pd_ct$patient == patients_now[i]), #取交集 which(pd_ct$cell_types_cl_all == "epithelial"), which(pd_ct$cycling_mel == "cycling")))/length(intersect_all( which(pd_ct$patient == patients_now[i]), which(pd_ct$cell_types_cl_all == "epithelial")))*100 #细胞周期比例 percent_all <- length(intersect_all(which(pd_ct$patient == patients_now[i]), which(pd_ct$cycling_mel == "cycling")))/length(which(pd_ct$patient == patients_now[i]))*100 print(ggplot(as.data.frame(pds_ct[[i]]), aes(x = mel_scores_g1s, y = mel_scores_g2m)) + geom_rect(ggplot2::aes(xmin = median(pd_ct$mel_scores_g1s) + 2 * mad(pd_ct$mel_scores_g1s),#以G1期cycling score中位数加其2MAD数值作为鉴定cycling cell的分界线 xmax = Inf, #Inf：无穷大 ymin = -Inf, ymax = Inf), fill = "gainsboro", alpha = 0.05) +#定义颜色、透明度 geom_rect(aes(ymin = median(pd_ct$mel_scores_g2m) + 2 * mad(pd_ct$mel_scores_g2m), ymax = Inf, xmin = -Inf, xmax = Inf),

fill = "gainsboro", alpha = 0.05) + geom_point(aes(col = factor(cell_types_cl_all, levels = names(anno_colors$tsne)), shape = factor(cycling_mel)), size = 5) + xlim(-0.15, 2) + ylim(-0.15, 2.8) + labs(col = "cell type", shape = "cycling", x = "G1S score", y = "G2M score", #注释 title = paste("patient ", patients_now[i], " (", round(percent_all), "% cycling cells)", sep = "")) + #round:四舍五入 scale_color_manual(values = anno_colors$tsne)) }

Fig1.e:对于PT126来说，大部分处于cycling的细胞都被鉴定为上皮细胞。

接着开始进行tSNE

## tsne on cell types #先对1112个细胞进行聚类 to_plot_ct <- unique(pd_ct$cell_types_cl_all) #mat_ct是已经处理好的1112个细胞和13280个基因德数据框，pd_ct是对应的细胞和样本的注释信息，pd_ct$cell_types_cl_all指每个细#胞对应的细胞类型。 #which函数中cell_types_cl_all等于#to_plot_ct的位置，然后提取mat_ct的表达谱，重新生成矩阵mat_short_ct mat_short_ct <- mat_ct[, which(pd_ct$cell_types_cl_all %in% to_plot_ct)] #同样的，提取注释信息pd_short_ct pd_short_ct <- pd_ct[which(pd_ct$cell_types_cl_all %in% to_plot_ct), ] #开始tsne tsne_short_ct <- Rtsne(t(mat_short_ct), perplexity = 30) #最终得到一个list，其中tsne_short_ct$Y存储画图的信息，给tsne_short_ct$Y适当添加对应的细胞类型等属性 colnames(tsne_short_ct$Y) <- c("col1", "col2") tsne_short_ct$Y <- as.data.frame(tsne_short_ct$Y) tsne_short_ct$Y$cell_types_cl_all <- pd_short_ct$cell_types_cl_all tsne_short_ct$Y$cell_types_markers <- pd_short_ct$cell_types_markers tsne_short_ct$Y$patient <- pd_short_ct$patient head(tsne_short_ct$Y) #这样子就得到了每个细胞的坐标，细胞类型，对应的marker和病人标本信息 > head(tsne_short_ct$Y) col1 col2 cell_types_cl_all cell_types_markers patient 1 16.12422 -20.896689773 epithelial epithelial PT089 2 15.71953 -20.986193941 epithelial epithelial PT089 3 14.95464 -20.609750926 epithelial epithelial PT089 4 -33.07589 -0.069714081 macrophage macrophage PT089 5 -32.66427 -0.007138231 macrophage macrophage PT089 6 15.48987 -21.197981561 epithelial epithelial PT089 #画图fig2a ggplot(tsne_short_ct$Y, aes(x = col1, y = col2, color = factor(cell_types_cl_all, levels = names(anno_colors$tsne)), shape = patient)) + geom_point(size = 4) + scale_color_manual(values = anno_colors$tsne) + labs(col = "patient", x = "tSNE dimension 1", y = "tSNE dimension 2", shape = "patient")

Fig2a：使用tSNE聚类对所有患者的1189个细胞进行分析，发现非上皮性细胞群和上皮细胞群之间有很大的分离，非上皮性细胞群被很好地分离成不同的簇，上皮细胞群则形成多个亚群。

#对上皮细胞群进行tsne to_plot_ct <- c("epithelial") mat_short_ct <- mat_ct[, which(pd_ct$cell_types_cl_all %in% to_plot_ct)] pd_short_ct <- pd_ct[which(pd_ct$cell_types_cl_all %in% to_plot_ct), ] tsne_short_ct <- Rtsne(t(mat_short_ct), perplexity = 30) colnames(tsne_short_ct$Y) <- c("col1", "col2") tsne_short_ct$Y <- as.data.frame(tsne_short_ct$Y) tsne_short_ct$Y$cell_types_cl_all <- pd_short_ct$cell_types_cl_all tsne_short_ct$Y$cell_types_markers <- pd_short_ct$cell_types_markers tsne_short_ct$Y$patient <- pd_short_ct$patient #画图fig2c ggplot(tsne_short_ct$Y, aes(x = col1, y = col2, color = factor(patient, levels = names(anno_colors$patient)), shape = cell_types_cl_all)) + geom_point(size = 4) + scale_color_manual(values = anno_colors$patient) + labs(col = "patient", x = "tSNE dimension 1", y = "tSNE dimension 2", shape = "cell type")

Fig2c：上皮细胞群通常被分成多个特异性簇(特别在PT039和PT081中更明显)，6个样本均有上皮细胞簇。

#对非上皮细胞群进行tsne to_plot_ct <- c("Bcell", "macrophage", "Tcell", "stroma", "endothelial") mat_short_ct <- mat_ct[, which(pd_ct$cell_types_cl_all %in% to_plot_ct)] pd_short_ct <- pd_ct[which(pd_ct$cell_types_cl_all %in% to_plot_ct), ] tsne_short_ct <- Rtsne(t(mat_short_ct), perplexity = 30) colnames(tsne_short_ct$Y) <- c("col1", "col2") tsne_short_ct$Y <- as.data.frame(tsne_short_ct$Y) tsne_short_ct$Y$cell_types_cl_all <- pd_short_ct$cell_types_cl_all tsne_short_ct$Y$cell_types_markers <- pd_short_ct$cell_types_markers tsne_short_ct$Y$patient <- pd_short_ct$patient


#画图fig2c

ggplot(tsne_short_ct$Y, aes(x = col1, y = col2, color = factor(cell_types_cl_all, levels = names(anno_colors$tsne)), 

                            shape = patient)) + 

  geom_point(size = 4) + 

  labs(col = "cell type", x = "tSNE dimension 1", y = "tSNE dimension 2") + 

  scale_color_manual(values = anno_colors$tsne)

先前对黑色素瘤和胶质母细胞瘤的单细胞分选研究显示，恶性细胞主要按患者样本进行聚集，这也说明了不同病人肿瘤之的异质性很显著。但是有其他乳腺癌的单细胞分析结果却与之相反，每个患者特有的和共有的恶性细胞可以存在一个簇中，暗示了乳腺癌中既有特定的瘤内异质性亚群细胞存在，也“共享”着一部分细胞亚群，而后者与作者的结果是符合的。

接着使用Monocle包对上皮细胞群进行clustering，并且对患者效应进行regressing out 。monocle_unsup_clust_plots是已经包装好的函数，这个函数采用了 2014Science上的⼀篇《Clustering by fast search and find of density peaks》文章的算法，这篇文献提供⼀种基于密度（density-based ）的聚类方法，关于单细胞聚类法方法的选择大家可以参考2017年发表在Molecular Aspects of Medicine上的文章《Identifying cell populations with scRNASeq 》，单细胞天地已经有对这篇文献进行过解读。单细胞转录组分析综述

## clustering of epithelial cells HSMM_allepith_clustering <- monocle_unsup_clust_plots(sceset_obj = sceset_ct[,which(colData(sceset_ct)$cell_types_cl_all == "epithelial")], mat_to_cluster = mat_ct[,which(colData(sceset_ct)$cell_types_cl_all == "epithelial")], anno_colors = anno_colors, name_in_phenodata = "cluster_allepith_regr_disp", disp_extra = 1, save_plots = 0, path_plots = NULL, type_pats = "allpats", regress_pat = 1, use_known_colors = 1, use_only_known_celltypes = 1) table(HSMM_allepith_clustering$Cluster) > table(HSMM_allepith_clustering$Cluster)

1 2 3 4 69 292 169 338

结果是将868个上皮细胞分成了4个clustering，与原文不一样。

作者是这么解释的，由于Monocle包的函数reduceDimension and clusterCells有所改变，因此要想重现图片，接下来作者建议使用他们已经做好的原始数据。

# due to changes in Monocle's functions (reduceDimension and clusterCells), the resulting clustering of #epithelial cells is slightly different from the original clustering from the paper. for reproducibility, we #read in the original clustering of epithelial cellsoriginal_clustering_epithelial <- readRDS(file= "data/original_clustering_epithelial.RDS") table(original_clustering_epithelial)

HSMM_allepith_clustering$Cluster <- original_clustering_epithelial clustering_allepith <- HSMM_allepith_clustering$Cluster #画图fig3a plot_cell_clusters(HSMM_allepith_clustering, 1, 2, color = "Cluster", cell_size = 2) + scale_color_manual(values = c("1" = "#ee204d", "2" = "#17806d", "3" = "#b2ec5d", "4" = "#cda4de", "5" = "#1974d2"))

#给6个样本的868个上皮细胞标记上对应的clusters编号 clusterings_sep_allepith <- list() for (i in patients_now) { clusterings_sep_allepith[[i]] <- clustering_allepith[which(HSMM_allepith_clustering$patient == i)] names(clusterings_sep_allepith[[i]]) <- colnames(HSMM_allepith_clustering)[which(HSMM_allepith_clustering$patient == i)] }


#看看每个cluster的的cycling 和non-cycling细胞比例

tbl_pd_cluster <- tbl_df(pData(HSMM_allepith_clustering))

tbl_pd_cluster <- tbl_pd_cluster %>%

  group_by(Cluster) %>%

  mutate(cycling_mel = factor(cycling_mel, levels = c("cycling", "non-cycling"))) %>%

  arrange(cycling_mel)

#画图figS6 ggplot() + geom_bar(data = tbl_pd_cluster, aes(x = Cluster, fill = factor(cycling_mel)), position = position_fill(reverse = TRUE)) + scale_fill_manual(values = anno_colors$cycling) + labs(fill = "cycling status", y = "fraction of cells")

## 将每一个cluster与全部的cluster进行差异分析 HSMM_for_DE <- HSMM_allepith_clustering diff_test_res <- list() #先进行cluster1跟全部的cluster差异分析 HSMM_for_DE$allvs1 <- clustering_allepith HSMM_for_DE$allvs1 <- as.numeric(HSMM_for_DE$allvs1) HSMM_for_DE$allvs1[which(HSMM_for_DE$allvs1 != 1)] <- 2 #差异分析采用monocle的differentialGeneTest函数 diff_test_res$allvs1 <- differentialGeneTest(HSMM_for_DE, fullModelFormulaStr = "~allvs1", cores = 3) diff_test_res$allvs1 <- diff_test_res$allvs1[order(diff_test_res$allvs1$qval),]#qval排序 diff_test_res$allvs1 <- diff_test_res$allvs1[which(diff_test_res$allvs1$qval <= 0.1),] #挑选qval <= 0.1的基因 head(diff_test_res$allvs1[,1:5], n = 10) > head(diff_test_res$allvs1[,1:5], n = 10) status family pval qval ensembl_gene_id HP OK negbinomial.size 5.727834e-52 4.811381e-48 ENSG00000257017 PRG4 OK negbinomial.size 4.208455e-26 1.767551e-22 ENSG00000116690 PLA2G2A OK negbinomial.size 1.305859e-24 3.656405e-21 ENSG00000188257 CFH OK negbinomial.size 1.307494e-23 2.745737e-20 ENSG00000000971 EGFL6 OK negbinomial.size 1.660370e-21 2.789421e-18 ENSG00000198759 CCL2 OK negbinomial.size 1.830853e-20 2.197023e-17 ENSG00000108691 ENPP2 OK negbinomial.size 1.715218e-20 2.197023e-17 ENSG00000136960 VTN OK negbinomial.size 3.867117e-19 4.060473e-16 ENSG00000109072 CLDN1 OK negbinomial.size 9.901388e-15 9.241296e-12 ENSG00000163347 BCHE OK negbinomial.size 3.048404e-13 2.560659e-10 ENSG00000114200 #接着就是cluster2了 HSMM_for_DE$allvs2 <- clustering_allepith HSMM_for_DE$allvs2 <- as.numeric(HSMM_for_DE$allvs2) HSMM_for_DE$allvs2[which(HSMM_for_DE$allvs2 != 2)] <- 3 diff_test_res$allvs2 <- differentialGeneTest(HSMM_for_DE, fullModelFormulaStr = "~allvs2", cores = 3) diff_test_res$allvs2 <- diff_test_res$allvs2[order(diff_test_res$allvs2$qval),] diff_test_res$allvs2 <- diff_test_res$allvs2[which(diff_test_res$allvs2$qval <= 0.1),] head(diff_test_res$allvs2[,1:5], n = 10) ......

接着就是cluster3,4和5，代码我不放上来了，下一节我们继续学习。

NC单细胞文章复现(五)：tSNE

相关推荐