单细胞层面解析TNFα通过Hippo信号通路参与肝癌发生 / 开普饭

考虑到咱们生信技能树粉丝对单细胞数据挖掘的需求，我开通了一个专栏《100个单细胞转录组数据降维聚类分群图表复现》，也亲自示范了几个，不过自己带娃，读博，时间精力有限，所以把剩余的90多个任务安排了学徒，实习生，学员。真的是太棒了，群策群力！

下面七月份学徒的投稿

今天分享的文章是2021年6月发表在Cancer Research 杂志的《A TNFR2–hnRNPK Axis Promotes Primary Liver Cancer Development via Activation of YAP Signaling in Hepatic Progenitor Cells》文章链接在：https://cancerres.aacrjournals.org/content/81/11/3036.long

文章的主要内容是：研究者利用HPCs移植大鼠原发肝癌模型联合肝癌患者单细胞转录组测序揭示了HPCs恶性转化过程，并证实了TNFR2对TNFα诱导的原发性肝癌至关重要。研究者将TNFR2 确定为 TNFα 下游的特异性受体，驱动 HPCs 介导的原发性肝癌的发生发展，并证明 TNFR2 通过 hnRNPK 调节 YAP 的转录活性。

数据链接在：https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE166635

包含两个10x样本，是标准的三个文件

下面开始复现

step1：读取表达矩阵文件，然后构建Seurat对象

在GSE166635_RAW 创建两个子文件夹，GSM5076749_HCC1和GSM5076750_HCC2 ,里面存放10x的标准文件

###### step1:导入数据 ###### rm(list=ls()) options(stringsAsFactors = F) library(Seurat) library(ggplot2) library(clustree) library(cowplot) library(dplyr) samples=list.files('./GSE166635_RAW/') samples sceList = lapply(samples,function(pro){ #pro=samples[1] folder=file.path('./GSE166635_RAW',pro) print(pro) print(folder) print(list.files(folder)) sce=CreateSeuratObject(counts = Read10X(folder), project = pro ) return(sce) }) names(sceList) = samples sce <- merge(sceList[[1]], y= sceList[ -1 ] , add.cell.ids=samples) #计算线粒体基因比例 sce=PercentageFeatureSet(sce, "^MT-", col.name = "percent_mito") #计算核糖体基因比例 sce=PercentageFeatureSet(sce, "^RP[SL]", col.name = "percent_ribo") #计算红血细胞基因比例 sce=PercentageFeatureSet(sce, "^HB[^(P)]", col.name = "percent_hb") #根据上述指标，过滤低质量细胞/基因 #过滤指标1:最少表达基因数的细胞&最少表达细胞数的基因 selected_c <- WhichCells(sce, expression = nFeature_RNA > 300) selected_f <- rownames(sce)[Matrix::rowSums(sce@assays$RNA@counts > 0 ) > 3] sce.all.filt <- subset(sce, features = selected_f, cells = selected_c) #过滤指标2:线粒体/核糖体基因比例(根据上面的violin图) selected_mito <- WhichCells(sce.all.filt, expression = percent_mito < 20) selected_ribo <- WhichCells(sce.all.filt, expression = percent_ribo > 3) selected_hb <- WhichCells(sce.all.filt, expression = percent_hb < 0.1) sce.all.filt <- subset(sce.all.filt, cells = selected_mito) sce.all.filt <- subset(sce.all.filt, cells = selected_ribo) sce.all.filt <- subset(sce.all.filt, cells = selected_hb) save(sce.all.filt,file = "sce.all.filt.Rdata")

step2：降维聚类分群

sce.all=sce.all.filt sce.all = NormalizeData(sce.all) sce.all <- FindVariableFeatures(sce.all, nfeatures = 3000) sce.all # 默认后续使用 HVGs 进行特征寻找 sce.all = ScaleData(sce.all #,vars.to.regress = c("nFeature_RNA", "percent_mito") ) #加上vars.to.regress参数后运行so slow!!! # 降维处理 sce.all <- RunPCA(sce.all, npcs = 30) sce.all <- RunTSNE(sce.all, dims = 1:30) sce.all <- RunUMAP(sce.all, dims = 1:30) # 聚类分析 sce.all <- FindNeighbors(sce.all, dims = 1:30) for (res in c(0.01, 0.05, 0.1, 0.2, 0.3, 0.5,0.8,1)) { sce.all <- FindClusters(sce.all, graph.name = "RNA_snn", resolution = res, algorithm = 1) } clustree(sce.all@meta.data, prefix = "RNA_snn_res.")

step3: 数据整合

######整合#### #拆分为个seurat子对象 sce.all.list <- SplitObject(sce.all.filt, split.by = "orig.ident") #数据整合之前要对每个样本的seurat对象进行数据标准化和选择高变基因 for (i in 1:length(sce.all.list)) { print(i) sce.all.list[[i]] <- NormalizeData(sce.all.list[[i]], verbose = FALSE) sce.all.list[[i]] <- FindVariableFeatures(sce.all.list[[i]], selection.method = "vst", nfeatures = 2000, verbose = FALSE) } ###以VariableFeatures为基础寻找锚点，运行时间较长 alldata.anchors <- FindIntegrationAnchors(object.list = sce.all.list, dims = 1:30, reduction = "cca") ##利用锚点整合数据，运行时间较长 sce.all.int <- IntegrateData(anchorset = alldata.anchors, dims = 1:30, new.assay.name = "CCA") sce.all.int=ScaleData(sce.all.int) sce.all.int=RunPCA(sce.all.int, npcs = 30) sce.all.int=RunTSNE(sce.all.int, dims = 1:30) sce.all.int=RunUMAP(sce.all.int, dims = 1:30) #整合前后对比 p4.compare=plot_grid(ncol = 3, DimPlot(sce.all, reduction = "pca", group.by = "orig.ident")+NoAxes()+ggtitle("PCA raw_data"), DimPlot(sce.all, reduction = "tsne", group.by = "orig.ident")+NoAxes()+ggtitle("tSNE raw_data"), DimPlot(sce.all, reduction = "umap", group.by = "orig.ident")+NoAxes()+ggtitle("UMAP raw_data"), DimPlot(sce.all.int, reduction = "pca", group.by = "orig.ident")+NoAxes()+ggtitle("PCA integrated"), DimPlot(sce.all.int, reduction = "tsne", group.by = "orig.ident")+NoAxes()+ggtitle("tSNE integrated"), DimPlot(sce.all.int, reduction = "umap", group.by = "orig.ident")+NoAxes()+ggtitle("UMAP integrated") ) p4.compare

step4：再次降维聚类分群和细胞注释

# 聚类分析 sce.all.int=FindNeighbors(sce.all.int, dims = 1:30, k.param = 60, prune.SNN = 1/15) #设置不同的分辨率，观察分群效果(选择哪一个？) for (res in c(0.01, 0.05, 0.1, 0.2, 0.3, 0.5,0.8,1)) { sce.all.int=FindClusters(sce.all.int, graph.name = "CCA_snn", resolution = res, algorithm = 1) } apply(sce.all.int@meta.data[,grep("CCA_snn_res",colnames(sce.all.int@meta.data))],2,table) p5_tree <- clustree(sce.all.int@meta.data, prefix = "CCA_snn_res.") p5_tree

#选取resolution 1 进行分析 p6<- DimPlot(sce.all.int, reduction = "tsne", group.by = "CCA_snn_res.1", label = T,label.box = T)+NoAxes() p6

#查看每个cluster高表达的10个基因 table(sce.all.int$seurat_clusters) markers_genes <- FindAllMarkers(sce.all.int, logfc.threshold = 0.1, test.use = "wilcox", min.pct = 0.1, min.diff.pct = 0.2, only.pos = TRUE, max.cells.per.ident = 50, assay = "RNA")

top10 <- markers_genes %>% group_by(cluster) %>% top_n(n=10, wt =p_val) top10 # 这个热图超级耗时，如果没有必要，可以忽略它。 DoHeatmap(sce.all.int,top10$gene,size=3 )

#手动注释 library(ggplot2) genes_to_check <- list( CD4=c("IL22","CCR6","CXCL13","PDCD1","LAG3","CD4","IL7R"), CD8=c("ICOS", "GZMB","PRF1","NKG7"), B_cell=c("CD79A", "SLAMF7", "BLNK", "FCRL5"), TAMs=c('CD14', 'CD163', 'C1QA'), Treg=c("IL10","FOXP3","IKZF2","CTLA4"), HPCs=c('EPCAM', 'KRT19', 'CD24'), DC=c('LILRA4',"CXCR3","IRF7"),#'FLT3','H2-AB1',"CD74","BST2" fibo=c('DCN','SFRP4','LUM','COL1A1', 'PCLAF','MKI67'), ISEC=c("LILRB2") ) genes_to_check library(stringr) p_markers <- DotPlot(sce.all.int,features = genes_to_check,assay='RNA') + theme(axis.text.x=element_text(angle=45,hjust=1.2,vjust = 1.1,size = 8)) p_markers

# 需要自行看图，定细胞亚群： celltype=data.frame(ClusterID=0:18, celltype='un') celltype[celltype$ClusterID %in% c( '3'),2]='CD4' celltype[celltype$ClusterID %in% c( '4','13' ),2]='CD8' celltype[celltype$ClusterID %in% c( '17' ),2]='B cells' celltype[celltype$ClusterID %in% c( '1','2','15' ),2]='TAM' celltype[celltype$ClusterID %in% c( '5' ),2]='Treg' celltype[celltype$ClusterID %in% c( '0','10','11',"12","18"),2]='HPCs' celltype[celltype$ClusterID %in% c( '14',"9" ),2]='TAFs' celltype[celltype$ClusterID %in% c( '6' ),2]='DC' celltype[celltype$ClusterID %in% c( '7','8'),2]='ISEC' head(celltype) celltype table(celltype$celltype) sce.all.int@meta.data$celltype = "NA" for(i in 1:nrow(celltype)){ sce.all.int@meta.data[which(sce.all.int@meta.data$CCA_snn_res.1 == celltype$ClusterID[i]),'celltype'] <- celltype$celltype[i]} table(sce.all.int@meta.data$celltype) DimPlot(sce.all.int, reduction = "tsne", group.by = "celltype", label = T, label.box = T)

虽然说上面的代码都是复制粘贴即可运行，但是如果要更好地完成上面的图表，通常是需要掌握5个R包，分别是: scater,monocle,Seurat,scran,M3Drop，需要熟练掌握它们的对象，：一些单细胞转录组R包的对象而且分析流程也大同小异：

step1: 创建对象
step2: 质量控制
step3: 表达量的标准化和归一化
step4: 去除干扰因素(多个样本整合)
step5: 判断重要的基因
step6: 多种降维算法
step7: 可视化降维结果
step8: 多种聚类算法
step9: 聚类后找每个细胞亚群的标志基因
step10: 继续分类

单细胞转录组数据分析的标准降维聚类分群，并且进行生物学注释后的结果。可以参考前面的例子：人人都能学会的单细胞聚类分群注释，我们演示了第一层次的分群。

如果你对单细胞数据分析还没有基础认知，可以看基础10讲：

写在文末

咱们现在这个专栏《100个单细胞转录组数据降维聚类分群图表复现》分享的代码是到此为止，但是一般来说单细胞文章数据分析还有很多进阶图表制作，比如inferCNV看肿瘤拷贝数变异，monocle看拟时序等等。如果你也需要，可以加入我们这个专栏《100个单细胞转录组数据降维聚类分群图表复现》创作团队，获取进阶指引哦！见：急！计划招募100个单细胞爱好者，免费学全套单细胞降维聚类分群和生物学亚群注释

单细胞层面解析TNFα通过Hippo信号通路参与肝癌发生