使用R实现批量方差分析(aov)和多重比较(LSD)
Tao Wen
2019年1月5日
新
年
快
乐
引子
由我的工作来看,coding存在的意义就是无序重复机械的分析过程,让自己的时间集中在科学问题上,然而实现这个过程却花费了更多的?时间。 我们可以轻松的使用各种软件实现方差检验和多重比较。但是在同时对几百甚至几千组数据做方差分析和多重比较的时候,coding的优势就提体现出来了。
##清空内存rm(list=ls()) #载入包
library("vegan")
library(dplyr)
library(tidyverse)
library(agricolae)
library(car)#修改工作目录
setwd("F:/")
##############全部峰值数据来做统计#########去除磷酸之后进行分析
RET_all = read.delim("./Data_processing/a0_RET_缺失值处理clean_root_exudates.txt", header=T, row.names= 1,check.names=F) head(RET_all)## CK2 CK5 CK3 CK1 CK4 RS1 RS2## RE1 26632.985 25224.034 30596.48 29812.76 25972.92 225065.33 75659.57## RE2 118030.479 60185.801 212033.07 105679.97 94223.07 0.00 0.00## RE3 92672.041 65451.785 96917.73 100051.53 77912.07 689247.38 232806.81## RE4 22620.908 4666.435 18808.13 9232.97 22456.49 11528.64 11517.85## RE5 22688.856 14983.546 17679.77 16747.14 16299.52 18650.75 19104.47## RE6 9271.438 8784.055 12884.40 10313.30 10313.30 0.00 0.00## RS3 RS5 RS4 B1 B2 B3 B4## RE1 31002.659 31484.553 26169.62 30109.59 31586.178 28028.22 30081.14## RE2 0.000 0.000 0.00 0.00 0.000 0.00 0.00## RE3 107402.115 101363.214 83245.85 151057.74 99776.930 101816.86 90471.41## RE4 4962.333 3686.371 14990.28 5080.92 8451.872 27181.51 23715.11## RE5 21119.811 14020.793 20534.05 18634.03 20593.487 23398.10 13077.19## RE6 0.000 0.000 0.00 0.00 0.000 0.00 0.00## B5 compound## RE1 28979.445 (-)-Dihydrocarveol## RE2 0.000 (2R,3S)-2-hydroxy-3-isopropylbutanedioic acid## RE3 74380.312 1,4-Cyclohexanedione 2## RE4 6579.112 11-beta-prostaglandin-F-2-alpha 1## RE5 17367.560 1-Hexadecanol## RE6 0.000 2,3-Dimethylsuccinic acid
RET = RET_all[1:dim(RET_all)[2]-1]head(RET)
#导入需要的作图文件design = read.table("./Data_processing/mapping_RET_all.txt", header=T, row.names= 1, sep="\t") head(design)
## group## CK5 CK## CK4 CK## CK3 CK## CK2 CK## CK1 CK## B5 B
design$ID = row.names(design)# 过滤数据并排序idx = rownames(design) %in% colnames(RET) sub_design = design[idx,]count = RET[ ,rownames(sub_design)]head(count)
comp_all = dim(count)head(count)
# 转换原始数据为百分比,norm = t(t(count)/colSums(count,na=T)) * 100 # normalization to total 100head(norm)
## CK5 CK4 CK3 CK2 CK1## RE1 0.005549065 0.013885418 0.011357306 0.009472478 0.011338921## RE2 0.013240346 0.050372725 0.078705924 0.041979565 0.040194100## RE3 0.014398817 0.041652675 0.035975518 0.032960401 0.038053388## RE4 0.001026575 0.012005496 0.006981510 0.008045514 0.003511648## RE5 0.003296248 0.008713910 0.006562666 0.008069681 0.006369571## RE6 0.001932415 0.005513607 0.004782645 0.003297546 0.003922539## B5 B4 B3 B2 B1## RE1 0.006645724 0.011490223 0.008876583 0.007449352 0.011995785## RE2 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000## RE3 0.017057299 0.034557751 0.032245560 0.023531604 0.060182030## RE4 0.001508758 0.009058563 0.008608428 0.001993307 0.002024260## RE5 0.003982824 0.004995150 0.007410215 0.004856812 0.007423875## RE6 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000## RS5 RS4 RS3 RS2 RS1## RE1 0.009075086 0.005696559 0.012207578 0.022859290 0.066628198## RE2 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000## RE3 0.029216864 0.018120817 0.042290558 0.070338732 0.204044354## RE4 0.001062557 0.003263060 0.001953964 0.003479927 0.003412932## RE5 0.004041344 0.004469817 0.008316117 0.005772101 0.005521356## RE6 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000
norm=as.data.frame(norm)a=normhead(a)
Pvalue<-c(rep(0,nrow(a)))pB<-c(rep(0,nrow(a)))pCK<-c(rep(0,nrow(a)))pRS<-c(rep(0,nrow(a)))str(pCK)
## num [1:604] 0 0 0 0 0 0 0 0 0 0 ...
# 2~4列是处理组1,5~7列是处理组2;#将使用循环对每一行进行方差检验#两组表达量都是0的基因,不检验;#每一行计算得到p value和log2FC将被加入原文件的后两列;#计算log2FC时,每组均值加0.001,是为了防止分母为0导致bug;# i= 2# ab = as.numeric(a[i,1:15])# b = sub_design$group# aa = data.frame(ab,b)# y=aov(ab~b,data = aa)# Pvalue[i]<-summary(y)[[1]][,5][1]# out <- LSD.test(y,"b", p.adj="none")# bb = as.data.frame(out$group)# bb$ID = row.names(bb)# bb<- arrange(bb, row.names(bb))# pB = bb[1,2]# pCK = bb[2,2]# pRS = bb[3,2]for(i in 1:nrow(a)){ if(sum(a[i,1:5])==0&&sum(a[i,6:10])==0&&sum(a[i,11:15])==0){ Pvalue[i] <- "NA" }else{ ab = as.numeric(a[i,1:15]) b = sub_design$group aa = data.frame(ab,b) y=aov(ab~b,data = aa) Pvalue[i]<-summary(y)[[1]][,5][1] out <- LSD.test(y,"b", p.adj="none") bb = as.data.frame(out$group) bb$ID = row.names(bb) bb<- arrange(bb, row.names(bb)) pCK[i] = bb$groups[2] pB[i] = as.character(bb[1,2]) pCK[i] = as.character(bb[2,2]) pRS[i] = as.character(bb[3,2]) pRS[i];pCK[i];pB[i] # Dunnett<-glht(y, linfct = mcp(type = "Dunnett"),alternative = "two.side") # out <- LSD.test(y,sub_design$group, p.adj="none")#进行多重比较,不矫正P值 # tuk=TukeyHSD(y) # out <-scheffe.test (y,"group") }}# Pvalueout<-cbind(a,Pvalue, pB,pCK,pRS)head(out)
## CK5 CK4 CK3 CK2 CK1## RE1 0.005549065 0.013885418 0.011357306 0.009472478 0.011338921## RE2 0.013240346 0.050372725 0.078705924 0.041979565 0.040194100## RE3 0.014398817 0.041652675 0.035975518 0.032960401 0.038053388## RE4 0.001026575 0.012005496 0.006981510 0.008045514 0.003511648## RE5 0.003296248 0.008713910 0.006562666 0.008069681 0.006369571## RE6 0.001932415 0.005513607 0.004782645 0.003297546 0.003922539## B5 B4 B3 B2 B1## RE1 0.006645724 0.011490223 0.008876583 0.007449352 0.011995785## RE2 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000## RE3 0.017057299 0.034557751 0.032245560 0.023531604 0.060182030## RE4 0.001508758 0.009058563 0.008608428 0.001993307 0.002024260## RE5 0.003982824 0.004995150 0.007410215 0.004856812 0.007423875## RE6 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000## RS5 RS4 RS3 RS2 RS1## RE1 0.009075086 0.005696559 0.012207578 0.022859290 0.066628198## RE2 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000## RE3 0.029216864 0.018120817 0.042290558 0.070338732 0.204044354## RE4 0.001062557 0.003263060 0.001953964 0.003479927 0.003412932## RE5 0.004041344 0.004469817 0.008316117 0.005772101 0.005521356## RE6 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000## Pvalue pB pCK pRS## RE1 2.796373e-01 a a a## RE2 2.278410e-04 b a b## RE3 3.121200e-01 a a a## RE4 2.608948e-01 a a a## RE5 6.511946e-01 a a a## RE6 5.103709e-06 b a b
RET_name = RET_all[dim(RET_all)[2]]head(RET_name)
## compound## RE1 (-)-Dihydrocarveol## RE2 (2R,3S)-2-hydroxy-3-isopropylbutanedioic acid## RE3 1,4-Cyclohexanedione 2## RE4 11-beta-prostaglandin-F-2-alpha 1## RE5 1-Hexadecanol## RE6 2,3-Dimethylsuccinic acid
rusult_out = merge(out ,RET_name, by="row.names",all=F)head(rusult_out)
## Row.names CK5 CK4 CK3 CK2 CK1## 1 RE1 0.005549065 0.013885418 0.011357306 0.009472478 0.011338921## 2 RE10 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000## 3 RE100 0.001650192 0.003067643 0.002522073 0.001038059 0.002182409## 4 RE101 0.006185346 0.017028862 0.029767109 0.012691608 0.072403533## 5 RE102 0.002229133 0.008797501 0.021920508 0.004811970 0.026383348## 6 RE103 0.001405335 0.002621616 0.005901619 0.002184498 0.011798804## B5 B4 B3 B2 B1 RS5## 1 0.006645724 0.011490223 0.008876583 0.007449352 0.0119957851 0.009075086## 2 0.001245465 0.001468414 0.001217488 0.001201888 0.0004006671 0.001499867## 3 0.000000000 0.000000000 0.000000000 0.000000000 0.0000000000 0.002260963## 4 0.026783700 0.070971462 0.011183080 0.019668387 0.0648061792 0.015967222## 5 0.021370834 0.030246691 0.009708425 0.010392458 0.0244686738 0.008926288## 6 0.009331215 0.009423304 0.002036022 0.002954869 0.0090469287 0.004238941## RS4 RS3 RS2 RS1 Pvalue pB pCK pRS## 1 0.005696559 0.012207578 0.022859290 0.0666281983 2.796373e-01 a a a## 2 0.001142296 0.001528522 0.001253334 0.0006690667 6.950709e-05 a b a## 3 0.001242421 0.003830482 0.002939149 0.0046277758 5.597853e-04 b a a## 4 0.007431094 0.023015901 0.019496753 0.0210254041 3.504130e-01 a a a## 5 0.005726721 0.007780721 0.012159299 0.0048272448 1.341319e-01 a ab b## 6 0.001370010 0.003022282 0.002371406 0.0034698880 2.604930e-01 a a a## compound## 1 (-)-Dihydrocarveol## 2 2-Amino-2-norbornanecarboxylic acid 3## 3 Analyte 172## 4 Analyte 177## 5 Analyte 178## 6 Analyte 179
dim(rusult_out)
## [1] 604 21
write.table(rusult_out,"./sub_result_and_script_allgroup//a3_RET_compounds_t检验all结果.txt",quote = FALSE,row.names = T, col.names = NA,sep = "\t")