质谱数据的组间差异检验

前言

上一节,我们通过对组间整体质谱数据分析后,发现NC和PC、PT的组间差异显著,那么具体到单个蛋白质水平的结果又是如何呢。

在单个蛋白质水平上做比较分析,我们可以使用limma包的函数进行分析。除了组间比较外,它还可以对某些可能潜在影响比较结果的因素如性别、年龄等进行校正处理(可以先期对组间临床表型进行组间差异比较,查看哪些指标是组间显著差异的,这些指标可能影响蛋白质的组间差异比较。它们应该作为协变量被校正掉).

除了使用基于线性模型的limma包外,还可以使用常用的t-test等,但默认情况下,我们会认为case和control组的数据都服从同一正态分布,具有相同的方差,这个时候我们会使用student-t test。可实际上,case和control组的分布可能不是同一个方差,我们这个时候应该选择Welch’s t-test。

导入数据

1
2
3
4
5
6
7
8
9
10
11
12
13
14
library(dplyr)
library(tibble)
library(ggplot2)
library(convert)
library(limma)

# rm(list = ls())
options(stringsAsFactors = F)
options(future.globals.maxSize = 1000 * 1024^2)

subgrp <- c("NC", "PC", "PT")
grp.col <- c("#568875", "#73FAFC", "#EE853D")

ExprSet_LOG2Impute <- readRDS("Mass_Proteins_filtered_Normal_LOG2Impute.RDS")

Differential Expression Analysis

  1. 如果数据存在配对情况,则可设置pair=T。

  2. scale参数是针对counts数据设置的。

  3. future.apply是对并行计算设置的。

  4. eBayes的结果中logFC有时候让人很困惑,需要注意该结果是否的正确性,因此设置datCoe判断富集方向,也可以通过内置的pl图发现富集方向。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
get_DiffProtein_limma <- function(dataset=ExprSet_LOG,
group_name=subgrp[1:2],
pair=FALSE,
scale=FALSE,
fc=1,
Pval=0.05){

# dataset=ExprSet_LOG
# group_name=subgrp[1:2]
# pair=FALSE
# scale=FALSE
# fc=1
# Pval=0.05

pheno <- pData(dataset) %>%
filter(SubGroup%in%group_name)
pheno$SubGroup <- factor(as.character(pheno$SubGroup), levels = group_name)
pheno$PID <- factor(as.character(pheno$PID))

if(pair){
# paired test
design <- model.matrix(~ pheno$SubGroup + pheno$PID)
rownames(design) <- rownames(pheno)
colnames(design) <- c("Intercept",
paste(group_col, collapse = "-"),
as.character(unique(pheno$PID)[-1]))
}else{
design <- model.matrix( ~ 0 + pheno$SubGroup)
rownames(design) <- rownames(pheno)
colnames(design) <- group_name
}

# show distribution
edata <- as.matrix(exprs(dataset))
exprSet <- edata[, colnames(edata)%in%rownames(pheno)]
boxplot(exprSet)
plotDensities(exprSet)

# Normalization: TMM
if(scale){
require(edgeR)
DGEList <- edgeR::DGEList(
counts = exprSet,
group = pheno$SubGroup)
exprSet_norm <- edgeR::calcNormFactors(DGEList, method = "TMM")
plotMDS(exprSet_norm, col=as.numeric(pheno$SubGroup))
}else{
exprSet_norm <- exprSet
}

# linear fitting
#limma_voom <- voom(exprSet_norm, design, plot = TRUE)
fit <- lmFit(exprSet_norm, design)

if(pair){
# eBayes
fit2 <- eBayes(fit)
qqt(fit2$t, df = fit2$df.prior+fit2$df.residual, pch = 16, cex = 0.2)
abline(0,1)

# differential features
diff_gene <- topTable(fit2, number = Inf, adjust.method = 'BH', coef = 2) %>%
rownames_to_column("GeneID")

# delta
require(future.apply)
plan(multiprocess, workers = 10)
delta_value <- future_apply(exprSet, 1, function(x, y){
# x = exprSet[1, ]
# y = pheno
dat <- data.frame(value=x, y) %>%
arrange(PID, SubGroup) %>%
dplyr::select(PID, SubGroup, value)
dat$SubGroup <- factor(dat$SubGroup, levels = group_name)

dat_delta <- dat %>% group_by(PID, SubGroup) %>%
summarise(mean_value=mean(value)) %>% # mean or median???
mutate(delta=dplyr::first(mean_value) - dplyr::last(mean_value)) %>%
ungroup()

delta <- mean(dat_delta$delta)
return(delta)

}, pheno) %>% data.frame() %>%
setNames("Delta") %>%
rownames_to_column("GeneID")
# stopCluster(cl)

# combine DEG and delta
diff_gene_delta <- inner_join(diff_gene, delta_value, by = "GeneID")

}else{
# contrast group for unpaired test
group <- paste(group_name, collapse = "-")
if(group%in%"NC-PC"){
contrast <- makeContrasts(contrasts = "NC-PC",
levels = design)
}else if(group%in%"NC-PT"){
contrast <- makeContrasts(contrasts = "NC-PT",
levels = design)
}else if(group%in%"PC-PT"){
contrast <- makeContrasts(contrasts = "PC-PT",
levels = design)
}
print(contrast)
# eBayes
fit2 <- contrasts.fit(fit, contrast)
fit2 <- eBayes(fit2)

qqt(fit2$t, df = fit2$df.prior+fit2$df.residual, pch = 16, cex = 0.2)
abline(0,1)

# differential features
diff_gene <- topTable(fit2, number = Inf, adjust.method = 'BH', coef = 1) %>%
rownames_to_column("GeneID")

# delta
require(future.apply)
plan(multiprocess, workers = 10)
delta_value <- future_apply(exprSet, 1, function(x, y){
# x = exprSet[1, ]
# y = pheno
dat <- data.frame(value=x, y) %>%
arrange(SubGroup) %>%
dplyr::select(SubGroup, value)
dat$Type <- factor(dat$SubGroup, levels = group_name)

dat_delta <- dat %>% group_by(SubGroup) %>%
summarise(mean_value=mean(value)) %>% # mean or median???
mutate(delta=dplyr::first(mean_value) - dplyr::last(mean_value)) %>%
ungroup()

delta <- mean(dat_delta$delta)
return(delta)

}, pheno) %>% data.frame() %>%
setNames("Delta") %>%
rownames_to_column("GeneID")

# stopCluster(cl)

# combine DEG and delta
diff_gene_delta <- inner_join(diff_gene, delta_value, by = "GeneID")
}

# validate the enriched directory
pl <- data.frame(edata)[rownames(data.frame(edata))%in%diff_gene_delta$GeneID[1], , F] %>%
t() %>% data.frame() %>%
setNames("Gene") %>%
rownames_to_column("SampleID") %>%
inner_join(pheno%>%rownames_to_column("SampleID"), by = "SampleID") %>%
ggplot(aes(x=SubGroup, y=Gene))+
geom_boxplot()+
labs(y=diff_gene$GeneID[1], x="")+
ggpubr::stat_compare_means(method = "wilcox.test",
paired = pair,
comparisons = list(group_name))+
theme_bw()
print(pl)

# enriched directory: It is sometimes useful to check things by hand to make sure you have the right interpretation.
for(i in 1:5){
datCoe <- fit$coefficients[diff_gene_delta$GeneID[i], ]
deltaMean <- as.numeric(datCoe[group_name[2]] - datCoe[group_name[1]])
logFC <- diff_gene_delta[diff_gene_delta$GeneID%in%diff_gene_delta$GeneID[i], "logFC"]
cat(paste0(diff_gene_delta$GeneID[i], ": ", paste(rev(group_name), collapse = "-"), " = ", signif(deltaMean, 3)))
cat("\n")
cat(paste0(diff_gene_delta$GeneID[i], ": ", "logFC = ", signif(logFC, 3)))
cat("\n")
}

if((deltaMean > 0 & logFC > 0) | (deltaMean < 0 & logFC < 0)){
diff_gene_delta[which(diff_gene_delta$logFC >= fc & diff_gene_delta$adj.P.Val < Pval), "Enrichment"] <- group_name[2]
diff_gene_delta[which(diff_gene_delta$logFC <= -fc & diff_gene_delta$adj.P.Val < Pval), "Enrichment"] <- group_name[1]
diff_gene_delta[which(abs(diff_gene_delta$logFC) < fc | diff_gene_delta$adj.P.Val >= Pval), "Enrichment"] <- "Nonsignif"
}else if((deltaMean > 0 & logFC < 0) | (deltaMean < 0 & logFC > 0)){
diff_gene_delta[which(diff_gene_delta$logFC >= fc & diff_gene_delta$adj.P.Val < Pval), "Enrichment"] <- group_name[1]
diff_gene_delta[which(diff_gene_delta$logFC <= -fc & diff_gene_delta$adj.P.Val < Pval), "Enrichment"] <- group_name[2]
diff_gene_delta[which(abs(diff_gene_delta$logFC) < fc | diff_gene_delta$adj.P.Val >= Pval), "Enrichment"] <- "Nonsignif"
}

# Number & Block
dat_status <- table(pheno$SubGroup)
dat_status_number <- as.numeric(dat_status)
dat_status_name <- names(dat_status)
diff_gene_delta$Block <- paste(paste(dat_status_number[1], dat_status_name[1], sep = "_"),
"vs",
paste(dat_status_number[2], dat_status_name[2], sep = "_"))

res <- diff_gene_delta %>% dplyr::select(GeneID, Block, logFC, adj.P.Val, Enrichment, everything()) %>%
arrange(adj.P.Val, logFC)

print(dim(res %>% filter(Enrichment != "Nonsignif")))

return(res)
}


NC_PC_LOG2Impute <- get_DiffProtein_limma(
dataset = ExprSet_LOG2Impute,
group_name = subgrp[1:2],
pair = FALSE,
scale = FALSE,
fc = 1,
Pval = 0.05)
write.csv(NC_PC_LOG2Impute, "NC_PC_limma_Mass_LOG2Impute.csv", row.names = F)

NC_PT_LOG2Impute <- get_DiffProtein_limma(
dataset = ExprSet_LOG2Impute,
group_name = subgrp[c(1,3)],
pair = FALSE,
scale = FALSE,
fc = 1,
Pval = 0.05)
write.csv(NC_PT_LOG2Impute, "NC_PT_limma_Mass_LOG2Impute.csv", row.names = F)

PC_PT_LOG2Impute <- get_DiffProtein_limma(
dataset = ExprSet_LOG2Impute,
group_name = subgrp[c(2, 3)],
pair = FALSE,
scale = FALSE,
fc = 1,
Pval = 0.05)
write.csv(PC_PT_LOG2Impute, "PC_PT_limma_Mass_LOG2Impute.csv", row.names = F)

Welch’s t-test

The independent samples t-test comes in two different forms:
the standard Student’s t-test, which assumes that the variance of the two groups are equal. the Welch’s t-test, which is less restrictive compared to the original Student’s test. This is the test where you do not assume that the variance is the same in the two groups, which results in the fractional degrees of freedom.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
Welch_ttest <- function(dataset=ExprSet_LOG2Impute,
group_name=subgrp[1:2],
Pval=0.05,
fc=1){

# dataset=ExprSet_LOG2Impute
# group_name=subgrp[1:2]
# Pval=0.05
# fc=1

pheno <- pData(dataset) %>%
filter(SubGroup%in%group_name) %>%
mutate(SubGroup=factor(SubGroup, levels = group_name))
edata <- exprs(dataset)[, rownames(pheno)]

require(rstatix)
Welch_res <- apply(edata, 1, function(x, y){
# x <- edata[1, ]
# y <- pheno$Group
dat <- data.frame(value=x, group=y)
mdn <- tapply(dat$value, dat$group, median) %>%
data.frame() %>% setNames("value") %>%
rownames_to_column("Group")
mdn1 <- with(mdn, mdn[Group%in%group_name[1], "value"])
mdn2 <- with(mdn, mdn[Group%in%group_name[2], "value"])
Log2FC <- log2(mdn2/mdn1)
rest <- t_test(data = dat, value ~ group)
return(c(Log2FC, rest$statistic, rest$p))
}, pheno$SubGroup) %>%
t() %>% data.frame() %>%
setNames(c("logFC", "rho", "P.value"))

res <- Welch_res[!is.nan(Welch_res$logFC), ] %>%
rownames_to_column("GeneID")
res$adj.P.Val <- p.adjust(as.numeric(res$P.value), method = "BH")
# Number & Block
dat_status <- table(pheno$SubGroup)
dat_status_number <- as.numeric(dat_status)
dat_status_name <- names(dat_status)
res$Block <- paste(paste(dat_status_number[1], dat_status_name[1], sep = "_"),
"vs",
paste(dat_status_number[2], dat_status_name[2], sep = "_"))
# Enrichment
res[which(res$logFC >= fc & res$adj.P.Val < Pval), "Enrichment"] <- group_name[2]
res[which(res$logFC <= -fc & res$adj.P.Val < Pval), "Enrichment"] <- group_name[1]
res[which(abs(res$logFC) < fc | res$adj.P.Val >= Pval), "Enrichment"] <- "Nonsignif"

res <- res %>% dplyr::select(GeneID, Block, logFC, adj.P.Val, Enrichment, everything()) %>%
arrange(adj.P.Val, logFC)

return(res)
}

NC_PC_LOG2Impute_WelchT <- Welch_ttest(
dataset = ExprSet_LOG2Impute,
group_name = subgrp[1:2],
fc = 1,
Pval = 0.05)
write.csv(NC_PC_LOG2Impute_WelchT, "NC_PC_WelchT_Mass_LOG2Impute.csv", row.names = F)

NC_PT_LOG2Impute_WelchT <- Welch_ttest(
dataset = ExprSet_LOG2Impute,
group_name = subgrp[c(1,3)],
fc = 1,
Pval = 0.05)
write.csv(NC_PT_LOG2Impute_WelchT, "NC_PT_WelchT_Mass_LOG2Impute.csv", row.names = F)

PC_PT_LOG2Impute_WelchT <- Welch_ttest(
dataset = ExprSet_LOG2Impute,
group_name = subgrp[c(2,3)],
fc = 1,
Pval = 0.05)
write.csv(PC_PT_LOG2Impute_WelchT, "PC_PT_WelchT_Mass_LOG2Impute.csv", row.names = F)

systemic information

1
sessionInfo()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
R version 4.0.2 (2020-06-22)
Platform: x86_64-conda_cos6-linux-gnu (64-bit)
Running under: CentOS Linux 8 (Core)

Matrix products: default
BLAS/LAPACK: /disk/share/anaconda3/lib/libopenblasp-r0.3.10.so

locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C

attached base packages:
[1] parallel stats graphics grDevices utils datasets methods base

other attached packages:
[1] rstatix_0.7.0 convert_1.64.0 marray_1.68.0 limma_3.46.0 Biobase_2.50.0 BiocGenerics_0.36.0
[7] ggplot2_3.3.3 tibble_3.1.0 dplyr_1.0.5

loaded via a namespace (and not attached):
[1] sass_0.3.1 edgeR_3.32.1 tidyr_1.1.3 jsonlite_1.7.2 carData_3.0-4 bslib_0.2.4
[7] assertthat_0.2.1 askpass_1.1 cellranger_1.1.0 yaml_2.2.1 pillar_1.6.0 backports_1.2.1
[13] lattice_0.20-41 glue_1.4.2 reticulate_1.18 digest_0.6.27 ggsignif_0.6.0 colorspace_2.0-0
[19] cowplot_1.1.0 htmltools_0.5.1.1 Matrix_1.3-2 plyr_1.8.6 pkgconfig_2.0.3 broom_0.7.6
[25] haven_2.3.1 purrr_0.3.4 scales_1.1.1 RSpectra_0.16-0 openxlsx_4.2.3 rio_0.5.16
[31] openssl_1.4.3 generics_0.1.0 farver_2.1.0 car_3.0-10 ellipsis_0.3.1 ggpubr_0.4.0
[37] withr_2.4.1 umap_0.2.7.0 magrittr_2.0.1 crayon_1.4.1 readxl_1.3.1 evaluate_0.14
[43] fansi_0.4.2 forcats_0.5.0 foreign_0.8-81 tools_4.0.2 data.table_1.14.0 hms_1.0.0
[49] lifecycle_1.0.0 stringr_1.4.0 munsell_0.5.0 locfit_1.5-9.4 zip_2.1.1 jquerylib_0.1.3
[55] compiler_4.0.2 tinytex_0.31 rlang_0.4.10 grid_4.0.2 labeling_0.4.2 rmarkdown_2.7
[61] gtable_0.3.0 abind_1.4-5 DBI_1.1.1 curl_4.3 reshape2_1.4.4 R6_2.5.0
[67] knitr_1.31 utf8_1.2.1 stringi_1.4.6 Rcpp_1.0.6 vctrs_0.3.7 tidyselect_1.1.0
[73] xfun_0.20

Reference

  1. Proteomics Data Analysis (2/3): Data Filtering and Missing Value Imputation

  2. Welch’s t-test: When to Use it + Examples

  3. T-TEST ESSENTIALS: DEFINITION, FORMULA AND CALCULATION

参考文章如引起任何侵权问题,可以与我联系,谢谢。


------------- The End Thanks for reading --------