高纬度数据降维方法的R实现

 前言

上一节,我们对质谱数据预处理后,获取得到可用于后续统计分析的数据。在常见的case-control的实验中,我们一般会研究组学数据与分组是否在整体上存在显著的差异。

一般通过降维算法得到低纬度如二维或三维的新坐标数据,再结合可视化技术去展示样本的在新坐标的空间分布,接着加上统计检验结果证实整体组学水平上组间的差异性。降维算法有基于线性模型的PCA,也有基于非线性的tSNE和UMAP等方法。

PCA是对原高纬度数据进行线性组合实现线性变换的算法,每个新的主成分均是原维度数据的线性组合。tSNE和UMAP是计算样本在空间上的概率密度分布,可有效地保留组内样本间距离也可以保留组间最大距离,是一类较好且适合非线性数据的方法。

导入数据

1
2
3
4
5
6
7
8
9
options(warn = 0)
library(dplyr)
library(tibble)
library(ggplot2)
options(warn = 0)

ExprSet_LOG2Impute <- readRDS("Mass_Proteins_filtered_Normal_LOG2Impute.RDS")
subgrp <- c("NC", "PC", "PT")
grp.col <- c("#568875", "#73FAFC", "#EE853D")

Principal Component Analysis

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
PCAFun <- function(dataset = ExprSet_LOG2Impute ){

# dataset = ExprSet_LOG2Impute

require(convert)
metadata <- pData(dataset)
profile <- exprs(dataset)

# pca
pca <- prcomp(scale(t(profile), center = T, scale = T))
require(factoextra)
eig <- get_eig(pca)
# explains variable
explains <- paste0(paste0("PC", seq(2)), "(", paste0(round(eig[1:2, 2], 2), "%"), ")")
# principal component score of each sample
score <- inner_join(pca$x %>% data.frame() %>%
dplyr::select(c(1:2)) %>%
rownames_to_column("SampleID"),
metadata %>% rownames_to_column("SampleID"),
by = "SampleID") %>%
mutate(Group=factor(Group, levels = grp))


# PERMANOVA
require(vegan)
set.seed(123)
if(any(profile < 0)){
res_adonis <- adonis(vegdist(t(profile), method = "manhattan") ~ metadata$SubGroup, permutations = 999)
}else{
res_adonis <- adonis(vegdist(t(profile), method = "bray") ~ metadata$SubGroup, permutations = 999)
}
adn_pvalue <- res_adonis[[1]][["Pr(>F)"]][1]
adn_rsquared <- round(res_adonis[[1]][["R2"]][1],3)
#use the bquote function to format adonis results to be annotated on the ordination plot.
signi_label <- paste(cut(adn_pvalue,breaks=c(-Inf, 0.001, 0.01, 0.05, Inf), label=c("***", "**", "*", ".")))
adn_res_format <- bquote(atop(atop("PERMANOVA",R^2==~.(adn_rsquared)),
atop("p-value="~.(adn_pvalue)~.(signi_label), phantom())))


pl <- ggplot(score, aes(x=PC1, y=PC2))+
geom_point(aes(fill=SubGroup), size=3.5, shape=21, stroke = .8, color = "black")+
stat_ellipse(aes(color=SubGroup), level = 0.95, linetype = 1, size = 1.5)+
labs(x=explains[1], y=explains[2])+
scale_color_manual(values = grp.col)+
scale_fill_manual(name = "Condition",
values = grp.col)+
annotate("text", x = max(score$PC1) - 8,
y = min(score$PC1),
label = adn_res_format,
size = 6)+
guides(color=F)+
theme_classic()+
theme(axis.title = element_text(size = 10, color = "black", face = "bold"),
axis.text = element_text(size = 9, color = "black"),
text = element_text(size = 8, color = "black", family = "serif"),
strip.text = element_text(size = 9, color = "black", face = "bold"),
panel.grid = element_blank(),
legend.title = element_text(size = 11, color = "black", family = "serif"),
legend.text = element_text(size = 10, color = "black", family = "serif"),
legend.position = c(0, 0),
legend.justification = c(0, 0),
legend.background = element_rect(color = "black", fill = "white", linetype = 2, size = 0.5))
return(pl)
}

PCA_LOG2Impute <- PCAFun(dataset = ExprSet_LOG2Impute)
PCA_LOG2Impute
ggsave("Mass_ProteinsLOG2Impute_PCA.pdf", PCA_LOG2Impute, width = 5, height = 5, dpi = 300)

Rtsne

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
RtsneFun <- function(dataset = ExprSet_LOG2Impute,
perpl = 10){

# dataset = ExprSet_LOG2Impute
# perpl = 10

require(convert)
metadata <- pData(dataset)
profile <- t(exprs(dataset))

# Rtsne
require(Rtsne)
#set.seed(123)
Rtsne <- Rtsne(profile,
dims=2,
perplexity=perpl,
verbose=TRUE,
max_iter=500,
eta=200)
point <- Rtsne$Y %>% data.frame() %>%
dplyr::select(c(1:2)) %>%
setNames(c("tSNE1", "tSNE2"))
rownames(point) <- rownames(profile)
score <- inner_join(point %>% rownames_to_column("SampleID"),
metadata %>% rownames_to_column("SampleID"),
by = "SampleID") %>%
mutate(Group=factor(Group, levels = grp))

# PERMANOVA
require(vegan)
set.seed(123)
if(any(profile < 0)){
res_adonis <- adonis(vegdist(profile, method = "manhattan") ~ metadata$SubGroup, permutations = 999)
}else{
res_adonis <- adonis(vegdist(profile, method = "bray") ~ metadata$SubGroup, permutations = 999)
}
adn_pvalue <- res_adonis[[1]][["Pr(>F)"]][1]
adn_rsquared <- round(res_adonis[[1]][["R2"]][1],3)
#use the bquote function to format adonis results to be annotated on the ordination plot.
signi_label <- paste(cut(adn_pvalue,breaks=c(-Inf, 0.001, 0.01, 0.05, Inf), label=c("***", "**", "*", ".")))
adn_res_format <- bquote(atop(atop("PERMANOVA",R^2==~.(adn_rsquared)),
atop("p-value="~.(adn_pvalue)~.(signi_label), phantom())))

pl <- ggplot(score, aes(x=tSNE1, y=tSNE2))+
geom_point(aes(fill=SubGroup), size=3.5, shape=21, stroke = .8, color = "black")+
stat_ellipse(aes(color=SubGroup), level = 0.95, linetype = 1, size = 1.5)+
scale_color_manual(values = grp.col)+
scale_fill_manual(name = "Condition",
values = grp.col)+
annotate("text", x = max(score$tSNE1) - 8,
y = max(score$tSNE2)-5,
label = adn_res_format,
size = 6)+
guides(color=F)+
theme_classic()+
theme(axis.title = element_text(size = 10, color = "black", face = "bold"),
axis.text = element_text(size = 9, color = "black"),
text = element_text(size = 8, color = "black", family = "serif"),
strip.text = element_text(size = 9, color = "black", face = "bold"),
panel.grid = element_blank(),
legend.title = element_text(size = 11, color = "black", family = "serif"),
legend.text = element_text(size = 10, color = "black", family = "serif"),
legend.position = c(0, 0),
legend.justification = c(0, 0),
legend.background = element_rect(color = "black", fill = "white", linetype = 2, size = 0.5))
return(pl)
}

Rtsne_LOG2Impute <- RtsneFun(dataset = ExprSet_LOG2Impute, perpl = 10)
Rtsne_LOG2Impute
ggsave("Mass_ProteinsLOG2Impute_Rtsne.pdf", Rtsne_LOG2Impute, width = 5, height = 5, dpi = 300)

UMAP: a non-linear dimensionality reduction algorithm

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
UMAPFun <- function(dataset = ExprSet_LOG2Impute){

# dataset = ExprSet_LOG2Impute

require(convert)
metadata <- pData(dataset)
profile <- t(exprs(dataset))

# umap
require(umap)
umap <- umap::umap(profile)

point <- umap$layout %>% data.frame() %>%
setNames(c("UMAP1", "UMAP2"))
rownames(point) <- rownames(profile)
score <- inner_join(point %>% rownames_to_column("SampleID"),
metadata %>% rownames_to_column("SampleID"),
by = "SampleID") %>%
mutate(Group=factor(Group, levels = grp))

# PERMANOVA
require(vegan)
set.seed(123)
if(any(profile < 0)){
res_adonis <- adonis(vegdist(profile, method = "manhattan") ~ metadata$SubGroup, permutations = 999)
}else{
res_adonis <- adonis(vegdist(profile, method = "bray") ~ metadata$SubGroup, permutations = 999)
}
adn_pvalue <- res_adonis[[1]][["Pr(>F)"]][1]
adn_rsquared <- round(res_adonis[[1]][["R2"]][1],3)
#use the bquote function to format adonis results to be annotated on the ordination plot.
signi_label <- paste(cut(adn_pvalue,breaks=c(-Inf, 0.001, 0.01, 0.05, Inf), label=c("***", "**", "*", ".")))
adn_res_format <- bquote(atop(atop("PERMANOVA",R^2==~.(adn_rsquared)),
atop("p-value="~.(adn_pvalue)~.(signi_label), phantom())))

pl <- ggplot(score, aes(x=UMAP1, y=UMAP2))+
geom_point(aes(fill=SubGroup), size=3.5, shape=21, stroke = .8, color = "black")+
stat_ellipse(aes(color=SubGroup), level = 0.95, linetype = 1, size = 1.5)+
scale_color_manual(values = grp.col)+
scale_fill_manual(name = "Condition",
values = grp.col)+
annotate("text", x = max(score$UMAP1),
y = min(score$UMAP2),
label = adn_res_format,
size = 6)+
guides(color=F)+
theme_classic()+
theme(axis.title = element_text(size = 10, color = "black", face = "bold"),
axis.text = element_text(size = 9, color = "black"),
text = element_text(size = 8, color = "black", family = "serif"),
strip.text = element_text(size = 9, color = "black", face = "bold"),
panel.grid = element_blank(),
legend.title = element_text(size = 11, color = "black", family = "serif"),
legend.text = element_text(size = 10, color = "black", family = "serif"),
legend.position = c(0, 0),
legend.justification = c(0, 0),
legend.background = element_rect(color = "black", fill = "white", linetype = 2, size = 0.5))
return(pl)
}

UMAP_LOG2Impute <- UMAPFun(dataset = ExprSet_LOG2Impute)
UMAP_LOG2Impute
ggsave("Mass_ProteinsLOG2Impute_UMAP.pdf", UMAP_LOG2Impute, width = 5, height = 5, dpi = 300)

Notes: 三组数据的整体性差异如果较为明显,一般可以在不同的降维方法上都有所体现。从分组看,NC组和PC、PT组均存在明显的差异,后续数据分析应该着重关注 NC vs PC, NC vs PT,但也可以对PC vs PT做比较分析。

Systemic information

1
sessionInfo()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
R version 4.0.2 (2020-06-22)
Platform: x86_64-conda_cos6-linux-gnu (64-bit)
Running under: CentOS Linux 8 (Core)

Matrix products: default
BLAS/LAPACK: /disk/share/anaconda3/lib/libopenblasp-r0.3.10.so

locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C

attached base packages:
[1] stats4 parallel stats graphics grDevices utils datasets methods base

other attached packages:
[1] umap_0.2.7.0 Rtsne_0.15
[3] vegan_2.5-6 lattice_0.20-41
[5] permute_0.9-5 factoextra_1.0.7
[7] IlluminaHumanMethylationEPICanno.ilm10b4.hg19_0.6.0 IlluminaHumanMethylationEPICmanifest_0.3.0
[9] minfi_1.36.0 bumphunter_1.32.0
[11] locfit_1.5-9.4 iterators_1.0.13
[13] foreach_1.5.1 Biostrings_2.58.0
[15] XVector_0.30.0 SummarizedExperiment_1.20.0
[17] MatrixGenerics_1.2.1 matrixStats_0.58.0
[19] GenomicRanges_1.42.0 GenomeInfoDb_1.26.4
[21] IRanges_2.24.1 S4Vectors_0.28.1
[23] eulerr_6.1.0 UpSetR_1.4.0
[25] ggplot2_3.3.3 DEP_1.12.0
[27] convert_1.64.0 marray_1.68.0
[29] limma_3.46.0 Biobase_2.50.0
[31] BiocGenerics_0.36.0 data.table_1.14.0
[33] tibble_3.1.0 dplyr_1.0.5

loaded via a namespace (and not attached):
[1] utf8_1.2.1 shinydashboard_0.7.1 reticulate_1.18 gmm_1.6-5
[5] tidyselect_1.1.0 RSQLite_2.2.5 AnnotationDbi_1.52.0 htmlwidgets_1.5.3
[9] grid_4.0.2 BiocParallel_1.24.1 norm_1.0-9.5 munsell_0.5.0
[13] codetools_0.2-18 preprocessCore_1.52.1 DT_0.18 withr_2.4.1
[17] colorspace_2.0-0 knitr_1.31 rstudioapi_0.13 mzID_1.28.0
[21] labeling_0.4.2 GenomeInfoDbData_1.2.4 farver_2.1.0 bit64_4.0.5
[25] rhdf5_2.34.0 vctrs_0.3.7 generics_0.1.0 xfun_0.20
[29] BiocFileCache_1.14.0 R6_2.5.0 doParallel_1.0.16 illuminaio_0.32.0
[33] clue_0.3-58 bitops_1.0-6 rhdf5filters_1.2.0 cachem_1.0.4
[37] reshape_0.8.8 DelayedArray_0.16.3 assertthat_0.2.1 promises_1.2.0.1
[41] scales_1.1.1 gtable_0.3.0 Cairo_1.5-12.2 affy_1.68.0
[45] sandwich_3.0-0 rlang_0.4.10 genefilter_1.72.0 mzR_2.24.1
[49] GlobalOptions_0.1.2 splines_4.0.2 rtracklayer_1.50.0 GEOquery_2.58.0
[53] impute_1.64.0 yaml_2.2.1 reshape2_1.4.4 BiocManager_1.30.12
[57] GenomicFeatures_1.42.2 httpuv_1.5.5 tools_4.0.2 nor1mix_1.3-0
[61] affyio_1.60.0 ellipsis_0.3.1 jquerylib_0.1.3 RColorBrewer_1.1-2
[65] siggenes_1.64.0 MSnbase_2.16.1 Rcpp_1.0.6 plyr_1.8.6
[69] sparseMatrixStats_1.2.1 progress_1.2.2 zlibbioc_1.36.0 purrr_0.3.4
[73] RCurl_1.98-1.3 prettyunits_1.1.1 openssl_1.4.3 GetoptLong_1.0.5
[77] cowplot_1.1.0 zoo_1.8-8 ggrepel_0.9.1.9999 cluster_2.1.0
[81] tinytex_0.31 magrittr_2.0.1 RSpectra_0.16-0 circlize_0.4.10
[85] pcaMethods_1.80.0 mvtnorm_1.1-1 ProtGenerics_1.22.0 evaluate_0.14
[89] hms_1.0.0 mime_0.10 xtable_1.8-4 XML_3.99-0.6
[93] mclust_5.4.7 gridExtra_2.3 shape_1.4.5 compiler_4.0.2
[97] biomaRt_2.46.3 ncdf4_1.17 crayon_1.4.1 htmltools_0.5.1.1
[101] mgcv_1.8-34 later_1.1.0.1 tidyr_1.1.3 DBI_1.1.1
[105] dbplyr_2.1.1 ComplexHeatmap_2.6.2 MASS_7.3-53.1 tmvtnorm_1.4-10
[109] rappdirs_0.3.3 Matrix_1.3-2 readr_1.4.0 cli_2.4.0
[113] vsn_3.58.0 imputeLCMD_2.0 quadprog_1.5-8 pkgconfig_2.0.3
[117] GenomicAlignments_1.26.0 MALDIquant_1.19.3 xml2_1.3.2 annotate_1.68.0
[121] bslib_0.2.4 rngtools_1.5 multtest_2.46.0 beanplot_1.2
[125] doRNG_1.8.2 scrime_1.3.5 stringr_1.4.0 digest_0.6.27
[129] rmarkdown_2.7 base64_2.0 DelayedMatrixStats_1.12.3 curl_4.3
[133] shiny_1.6.0 Rsamtools_2.6.0 rjson_0.2.20 jsonlite_1.7.2
[137] nlme_3.1-150 lifecycle_1.0.0 Rhdf5lib_1.12.1 askpass_1.1
[141] fansi_0.4.2 pillar_1.6.0 fastmap_1.1.0 httr_1.4.2
[145] survival_3.2-10 glue_1.4.2 png_0.1-7 bit_4.0.4
[149] sass_0.3.1 stringi_1.4.6 HDF5Array_1.18.1 blob_1.2.1
[153] memoise_2.0.0

Reference

  1. How to change Legend of ggplot2

  2. How to change ggplot facet labels

参考文章如引起任何侵权问题,可以与我联系,谢谢。


------------- The End Thanks for reading --------