单细胞层面解析TNFα通过Hippo信号通路参与肝癌发生
考虑到咱们生信技能树粉丝对单细胞数据挖掘的需求,我开通了一个专栏《100个单细胞转录组数据降维聚类分群图表复现》,也亲自示范了几个,不过自己带娃,读博,时间精力有限,所以把剩余的90多个任务安排了学徒,实习生,学员。真的是太棒了,群策群力!
今天分享的文章是2021年6月发表在Cancer Research 杂志 的《A TNFR2–hnRNPK Axis Promotes Primary Liver Cancer Development via Activation of YAP Signaling in Hepatic Progenitor Cells》文章链接在:https://cancerres.aacrjournals.org/content/81/11/3036.long
文章的主要内容是:研究者利用HPCs移植大鼠原发肝癌模型联合肝癌患者单细胞转录组测序揭示了HPCs恶性转化过程,并证实了TNFR2对TNFα诱导的原发性肝癌至关重要。研究者将TNFR2 确定为 TNFα 下游的特异性受体,驱动 HPCs 介导的原发性肝癌的发生发展,并证明 TNFR2 通过 hnRNPK 调节 YAP 的转录活性。
数据链接在:https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE166635
包含两个10x样本,是标准的三个文件
下面开始复现
step1:读取表达矩阵文件,然后构建Seurat对象
在GSE166635_RAW
创建两个子文件夹,GSM5076749_HCC1
和GSM5076750_HCC2
,里面存放10x的标准文件
###### step1:导入数据 ######
rm(list=ls())
options(stringsAsFactors = F)
library(Seurat)
library(ggplot2)
library(clustree)
library(cowplot)
library(dplyr)
samples=list.files('./GSE166635_RAW/')
samples
sceList = lapply(samples,function(pro){
#pro=samples[1]
folder=file.path('./GSE166635_RAW',pro)
print(pro)
print(folder)
print(list.files(folder))
sce=CreateSeuratObject(counts = Read10X(folder),
project = pro )
return(sce)
})
names(sceList) = samples
sce <- merge(sceList[[1]], y= sceList[ -1 ] ,
add.cell.ids=samples)
#计算线粒体基因比例
sce=PercentageFeatureSet(sce, "^MT-", col.name = "percent_mito")
#计算核糖体基因比例
sce=PercentageFeatureSet(sce, "^RP[SL]", col.name = "percent_ribo")
#计算红血细胞基因比例
sce=PercentageFeatureSet(sce, "^HB[^(P)]", col.name = "percent_hb")
#根据上述指标,过滤低质量细胞/基因
#过滤指标1:最少表达基因数的细胞&最少表达细胞数的基因
selected_c <- WhichCells(sce, expression = nFeature_RNA > 300)
selected_f <- rownames(sce)[Matrix::rowSums(sce@assays$RNA@counts > 0 ) > 3]
sce.all.filt <- subset(sce, features = selected_f, cells = selected_c)
#过滤指标2:线粒体/核糖体基因比例(根据上面的violin图)
selected_mito <- WhichCells(sce.all.filt, expression = percent_mito < 20)
selected_ribo <- WhichCells(sce.all.filt, expression = percent_ribo > 3)
selected_hb <- WhichCells(sce.all.filt, expression = percent_hb < 0.1)
sce.all.filt <- subset(sce.all.filt, cells = selected_mito)
sce.all.filt <- subset(sce.all.filt, cells = selected_ribo)
sce.all.filt <- subset(sce.all.filt, cells = selected_hb)
save(sce.all.filt,file = "sce.all.filt.Rdata")
step2:降维聚类分群
sce.all=sce.all.filt
sce.all = NormalizeData(sce.all)
sce.all <- FindVariableFeatures(sce.all, nfeatures = 3000)
sce.all
# 默认后续使用 HVGs 进行 特征寻找
sce.all = ScaleData(sce.all
#,vars.to.regress = c("nFeature_RNA", "percent_mito")
) #加上vars.to.regress参数后运行so slow!!!
# 降维处理
sce.all <- RunPCA(sce.all, npcs = 30)
sce.all <- RunTSNE(sce.all, dims = 1:30)
sce.all <- RunUMAP(sce.all, dims = 1:30)
# 聚类分析
sce.all <- FindNeighbors(sce.all, dims = 1:30)
for (res in c(0.01, 0.05, 0.1, 0.2, 0.3, 0.5,0.8,1)) {
sce.all <- FindClusters(sce.all, graph.name = "RNA_snn", resolution = res, algorithm = 1)
}
clustree(sce.all@meta.data, prefix = "RNA_snn_res.")
step3: 数据整合
######整合####
#拆分为 个seurat子对象
sce.all.list <- SplitObject(sce.all.filt, split.by = "orig.ident")
#数据整合之前要对每个样本的seurat对象进行数据标准化和选择高变基因
for (i in 1:length(sce.all.list)) {
print(i)
sce.all.list[[i]] <- NormalizeData(sce.all.list[[i]], verbose = FALSE)
sce.all.list[[i]] <- FindVariableFeatures(sce.all.list[[i]], selection.method = "vst",
nfeatures = 2000, verbose = FALSE)
}
###以VariableFeatures为基础寻找锚点,运行时间较长
alldata.anchors <- FindIntegrationAnchors(object.list = sce.all.list, dims = 1:30,
reduction = "cca")
##利用锚点整合数据,运行时间较长
sce.all.int <- IntegrateData(anchorset = alldata.anchors, dims = 1:30, new.assay.name = "CCA")
sce.all.int=ScaleData(sce.all.int)
sce.all.int=RunPCA(sce.all.int, npcs = 30)
sce.all.int=RunTSNE(sce.all.int, dims = 1:30)
sce.all.int=RunUMAP(sce.all.int, dims = 1:30)
#整合前后对比
p4.compare=plot_grid(ncol = 3,
DimPlot(sce.all, reduction = "pca", group.by = "orig.ident")+NoAxes()+ggtitle("PCA raw_data"),
DimPlot(sce.all, reduction = "tsne", group.by = "orig.ident")+NoAxes()+ggtitle("tSNE raw_data"),
DimPlot(sce.all, reduction = "umap", group.by = "orig.ident")+NoAxes()+ggtitle("UMAP raw_data"),
DimPlot(sce.all.int, reduction = "pca", group.by = "orig.ident")+NoAxes()+ggtitle("PCA integrated"),
DimPlot(sce.all.int, reduction = "tsne", group.by = "orig.ident")+NoAxes()+ggtitle("tSNE integrated"),
DimPlot(sce.all.int, reduction = "umap", group.by = "orig.ident")+NoAxes()+ggtitle("UMAP integrated")
)
p4.compare
step4:再次降维聚类分群和细胞注释
# 聚类分析
sce.all.int=FindNeighbors(sce.all.int, dims = 1:30, k.param = 60, prune.SNN = 1/15)
#设置不同的分辨率,观察分群效果(选择哪一个?)
for (res in c(0.01, 0.05, 0.1, 0.2, 0.3, 0.5,0.8,1)) {
sce.all.int=FindClusters(sce.all.int, graph.name = "CCA_snn", resolution = res, algorithm = 1)
}
apply(sce.all.int@meta.data[,grep("CCA_snn_res",colnames(sce.all.int@meta.data))],2,table)
p5_tree <- clustree(sce.all.int@meta.data, prefix = "CCA_snn_res.")
p5_tree
#选取resolution 1 进行分析
p6<- DimPlot(sce.all.int, reduction = "tsne", group.by = "CCA_snn_res.1", label = T,label.box = T)+NoAxes()
p6
#查看每个cluster高表达的10个基因
table(sce.all.int$seurat_clusters)
markers_genes <- FindAllMarkers(sce.all.int, logfc.threshold = 0.1, test.use = "wilcox",
min.pct = 0.1, min.diff.pct = 0.2, only.pos = TRUE, max.cells.per.ident = 50,
assay = "RNA")
top10 <- markers_genes %>% group_by(cluster) %>% top_n(n=10, wt =p_val)
top10
# 这个热图超级耗时,如果没有必要,可以忽略它。
DoHeatmap(sce.all.int,top10$gene,size=3 )
#手动注释
library(ggplot2)
genes_to_check <- list(
CD4=c("IL22","CCR6","CXCL13","PDCD1","LAG3","CD4","IL7R"),
CD8=c("ICOS", "GZMB","PRF1","NKG7"),
B_cell=c("CD79A", "SLAMF7", "BLNK", "FCRL5"),
TAMs=c('CD14', 'CD163', 'C1QA'),
Treg=c("IL10","FOXP3","IKZF2","CTLA4"),
HPCs=c('EPCAM', 'KRT19', 'CD24'),
DC=c('LILRA4',"CXCR3","IRF7"),#'FLT3','H2-AB1',"CD74","BST2"
fibo=c('DCN','SFRP4','LUM','COL1A1',
'PCLAF','MKI67'),
ISEC=c("LILRB2")
)
genes_to_check
library(stringr)
p_markers <- DotPlot(sce.all.int,features = genes_to_check,assay='RNA') +
theme(axis.text.x=element_text(angle=45,hjust=1.2,vjust = 1.1,size = 8))
p_markers
# 需要自行看图,定细胞亚群:
celltype=data.frame(ClusterID=0:18,
celltype='un')
celltype[celltype$ClusterID %in% c( '3'),2]='CD4'
celltype[celltype$ClusterID %in% c( '4','13' ),2]='CD8'
celltype[celltype$ClusterID %in% c( '17' ),2]='B cells'
celltype[celltype$ClusterID %in% c( '1','2','15' ),2]='TAM'
celltype[celltype$ClusterID %in% c( '5' ),2]='Treg'
celltype[celltype$ClusterID %in% c( '0','10','11',"12","18"),2]='HPCs'
celltype[celltype$ClusterID %in% c( '14',"9" ),2]='TAFs'
celltype[celltype$ClusterID %in% c( '6' ),2]='DC'
celltype[celltype$ClusterID %in% c( '7','8'),2]='ISEC'
head(celltype)
celltype
table(celltype$celltype)
sce.all.int@meta.data$celltype = "NA"
for(i in 1:nrow(celltype)){
sce.all.int@meta.data[which(sce.all.int@meta.data$CCA_snn_res.1 == celltype$ClusterID[i]),'celltype'] <- celltype$celltype[i]}
table(sce.all.int@meta.data$celltype)
DimPlot(sce.all.int, reduction = "tsne", group.by = "celltype", label = T, label.box = T)
虽然说上面的代码都是复制粘贴即可运行,但是如果要更好地完成上面的图表,通常是需要掌握5个R包,分别是: scater,monocle,Seurat,scran,M3Drop,需要熟练掌握它们的对象,:一些单细胞转录组R包的对象 而且分析流程也大同小异:
step1: 创建对象 step2: 质量控制 step3: 表达量的标准化和归一化 step4: 去除干扰因素(多个样本整合) step5: 判断重要的基因 step6: 多种降维算法 step7: 可视化降维结果 step8: 多种聚类算法 step9: 聚类后找每个细胞亚群的标志基因 step10: 继续分类
单细胞转录组数据分析的标准降维聚类分群,并且进行生物学注释后的结果。可以参考前面的例子:人人都能学会的单细胞聚类分群注释 ,我们演示了第一层次的分群。
如果你对单细胞数据分析还没有基础认知,可以看基础10讲:
写在文末
咱们现在这个专栏《100个单细胞转录组数据降维聚类分群图表复现》分享的代码是到此为止,但是一般来说单细胞文章数据分析还有很多进阶图表制作,比如inferCNV看肿瘤拷贝数变异,monocle看拟时序等等。如果你也需要,可以加入我们这个专栏《100个单细胞转录组数据降维聚类分群图表复现》创作团队,获取进阶指引哦!见:急!计划招募100个单细胞爱好者,免费学全套单细胞降维聚类分群和生物学亚群注释