How to download taxa profile of multiple samples
Data derived from curatedMetagenomicData
Table Of Contents
Introduction
R package curatedMetagenomicData
provides more than 20,000 metaphlan’s results of samples. However, to download all the results seems not easy when using simple R codes. Here, we give users codes with step by step to grap all data.
Since phyloseq object contains abundance table, taxa table and metadata etc, we convert the final metadata and profile including absolute and relative abundance into phyloseq object.
Loading R packages
suppressMessages(library(dplyr))
suppressMessages(library(tibble))
suppressMessages(library(curatedMetagenomicData))
library(phyloseq)
Studies in curatedMetagenomicData
All the studies’ information were stored in sampleMetadata. Therefore, we obtain the studies’ names by extracting the information from it.
data("sampleMetadata")
availablediseases <- pull(sampleMetadata, study_condition) %>%
table() %>%
sort(decreasing = TRUE)
studies <- lapply(names(availablediseases), function(x) {
dplyr::filter(sampleMetadata, study_condition %in% x) %>%
dplyr::pull(study_name) %>%
unique()
})
names(studies) <- names(availablediseases)
Acquiring absolute and relative abundance per study
Merging metadata without duplicated samples
Whether there is absolute or relative abundance profile per study
Whether the samples are identical between metadata and profile
metadata <- data.frame()
profile_counts <- data.frame()
profile_rb <- data.frame()
for (i in 1:length(studies)) {
disease_dir <- paste0("./dataset/", names(studies)[i])
if (!dir.exists(disease_dir)) {
dir.create(disease_dir, recursive = TRUE)
}
for (disease in studies[[i]]) {
rb_pattern <- paste0(disease, ".relative_abundance")
# absolute abundance
dat_counts_list <- curatedMetagenomicData(
pattern = rb_pattern,
dryrun = FALSE,
counts = TRUE,
rownames = "long")
# relative_abundance
dat_RB_list <- curatedMetagenomicData(
pattern = rb_pattern,
dryrun = FALSE,
counts = FALSE,
rownames = "long")
# whether there is a taxa profile
if (length(dat_counts_list) > 0) {
dat_counts <- dat_counts_list[[1]]
disease_path_counts <- paste0(disease_dir, "/", paste0(disease, "_counts.RData"))
save(dat_counts, file = disease_path_counts)
# metadata & profile
temp_metadata <- colData(dat_counts) %>%
data.frame()
temp_profile_counts <- assay(dat_counts) %>%
data.frame() %>%
tibble::rownames_to_column("TaxaID")
if (length(profile_counts) == 0) {
profile_counts <- temp_profile_counts
} else {
colnames(profile_counts) <- gsub("-", ".", colnames(profile_counts))
colnames(temp_profile_counts) <- gsub("-", ".", colnames(temp_profile_counts))
# the overlap samples
sid_count <- intersect(colnames(temp_profile_counts), colnames(profile_counts))
sid_count <- sid_count[-which(sid_count == "TaxaID")]
if (length(sid_count) > 0) {
profile_counts <- profile_counts[, !colnames(profile_counts)%in%sid_count]
}
profile_counts <- dplyr::full_join(temp_profile_counts,
profile_counts,
by = "TaxaID")
colnames(profile_counts) <- gsub("-", ".", colnames(profile_counts))
}
}
# whether there is a taxa profile
if (length(dat_RB_list) > 0) {
dat_RB <- dat_RB_list[[1]]
disease_path_RB <- paste0(disease_dir, "/", paste0(disease, "_RB.RData"))
save(dat_RB, file = disease_path_RB)
# metadata & profile
temp_metadata <- colData(dat_RB) %>%
data.frame()
temp_profile_rb <- assay(dat_RB) %>%
data.frame() %>%
tibble::rownames_to_column("TaxaID")
if (length(profile_rb) == 0) {
profile_rb <- temp_profile_rb
} else {
colnames(profile_rb) <- gsub("-", ".", colnames(profile_rb))
colnames(temp_profile_rb) <- gsub("-", ".", colnames(temp_profile_rb))
# the overlap samples
sid_rb <- intersect(colnames(temp_profile_rb), colnames(profile_rb))
sid_rb <- sid_rb[-which(sid_rb == "TaxaID")]
if (length(sid_rb) > 0) {
profile_rb <- profile_rb[, !colnames(profile_rb)%in%sid_rb]
}
profile_rb <- dplyr::full_join(temp_profile_rb,
profile_rb,
by = "TaxaID")
colnames(profile_rb) <- gsub("-", ".", colnames(profile_rb))
}
}
if (any(length(dat_counts_list) > 0, length(dat_RB_list) > 0)) {
if (length(metadata) == 0) {
metadata <- temp_metadata
} else {
# diff: metadata vs temp_metadata
diff1 <- setdiff(colnames(metadata), colnames(temp_metadata))
diff1_df <- data.frame(matrix(NA, nrow = nrow(temp_metadata), ncol = length(diff1)),
row.names = rownames(temp_metadata))
colnames(diff1_df) <- diff1
temp_metadata_new <- cbind(temp_metadata, diff1_df)
# diff: temp_metadata vs metadata
diff2 <- setdiff(colnames(temp_metadata), colnames(metadata))
diff2_df <- data.frame(matrix(NA, nrow = nrow(metadata), ncol = length(diff2)),
row.names = rownames(metadata))
colnames(diff2_df) <- diff2
metadata_new <- cbind(metadata, diff2_df)
rownames(metadata) <- gsub("-", ".", rownames(metadata))
rownames(temp_metadata) <- gsub("-", ".", rownames(temp_metadata))
# the overlap samples
sid <- intersect(rownames(temp_metadata), rownames(metadata))
if (length(sid) > 0) {
metadata <- metadata[!rownames(metadata)%in%sid, ]
}
metadata <- rbind(temp_metadata_new, metadata_new) %>%
dplyr::distinct()
rownames(metadata) <- gsub("-", ".", rownames(metadata))
}
}
}
}
Converting datasets into phyloseq
Extracting taxonomic levels from 1st column of profile
Removing prevalence of features less than 0
# extract taxnomic levels
import_metaphlan_taxa <- function(data_metaphlan2,
taxa_level = NULL) {
taxa_rank <- c("Kingdom", "Phylum", "Class",
"Order", "Family", "Genus",
"Species", "Strain")
# rename 1st column into "ID"
colnames(data_metaphlan2)[which(colnames(data_metaphlan2) == "ID" |
colnames(data_metaphlan2) == "clade_name")] <- "ID"
# remove the "NCBI_tax_id" column
if (any(colnames(data_metaphlan2) %in% "NCBI_tax_id")) {
data_metaphlan2 <- data_metaphlan2 %>% dplyr::select(-NCBI_tax_id)
}
# remove sample names with "metaphlan_bugs_list" suffix
colnames(data_metaphlan2) <- gsub("_metaphlan_bugs_list$", "", colnames(data_metaphlan2))
if (is.null(taxa_level)) {
taxa_level <- "Species"
}
ind_number <- match(taxa_level, taxa_rank)
# remove k__Archaea & k__Viruses & k__Eukaryota & Others which are not bacteria
if (length(grep("k__Bacteria", data_metaphlan2$ID)) > 0) {
taxa_bacteria <- data_metaphlan2 %>%
dplyr::slice(grep("k__Bacteria", ID)) %>%
tibble::column_to_rownames("ID")
} else {
stop("There are no bacteria identified in this profile")
}
# dividing by 100 to normalize the sum into 1
taxa_bac_per <- (taxa_bacteria / 100) %>%
tibble::rownames_to_column("ID")
taxa <- taxa_bac_per %>%
dplyr::group_by(ID) %>%
dplyr::mutate(Number=length(unlist(strsplit(ID, "|", fixed = TRUE)))) %>%
dplyr::select(ID, Number) %>%
dplyr::filter(Number == ind_number)
abundance_table <- taxa_bac_per %>%
dplyr::filter(ID%in%taxa$ID)
taxa_table <- lapply(abundance_table$ID, strsplit, split = "|", fixed = TRUE)
taxa_table <- lapply(taxa_table, unlist)
taxa_table <- do.call(rbind, taxa_table)
taxa_table <- data.frame(taxa_table)
names(taxa_table) <- taxa_rank[1:ind_number]
rownames(taxa_table) <- taxa_table[, ind_number]
abundance_table_res <- abundance_table %>%
tibble::column_to_rownames("ID")
rownames(abundance_table_res) <- rownames(taxa_table)
abundance_table_res <- abundance_table_res[match(rownames(taxa_table),
row.names(abundance_table_res)), ,
drop = FALSE]
res <- list(tax_tab=taxa_table,
abu_tab=abundance_table_res)
return(res)
}
# Import function to read the the output of metaphlan as phyloseq object
get_metaphlan_phyloseq <- function(otu_tab,
sam_tab,
tax_tab = NULL) {
if (!is.null(tax_tab)) {
tax_tab <- phyloseq::tax_table(as(tax_tab, "matrix"))
}
if (!is.null(sam_tab)) {
sam_tab <- phyloseq::sample_data(sam_tab %>% data.frame())
}
asv_tab <- phyloseq::otu_table(as(otu_tab, "matrix"),
taxa_are_rows = TRUE)
ps <- phyloseq::phyloseq(asv_tab, tax_tab, sam_tab)
return(ps)
}
# Trim samples or taxa in `otu_table` by Occurrences
run_trim <- function(object,
taxa_level = NULL,
cutoff = 0.1,
trim = c("none", "both", "feature", "sample")){
trim <- match.arg(trim, c("none", "both", "feature", "sample"))
if (inherits(object, "phyloseq")) {
# ps <- preprocess_ps(object)
if (!is.null(taxa_level)) {
ps_taxa <- summarize_taxa(object, taxa_level = taxa_level)
} else {
ps_taxa <- object
}
prf <- as(phyloseq::otu_table(ps_taxa), "matrix")
} else if (inherits(object, "environment")) {
prf <- as(object$.Data, "matrix")
} else {
prf <- object
}
if (trim == "feature") {
tmp1 <- trim_FeatureOrSample(prf, 1, cutoff)
remain_features <- rownames(tmp1)
remain_samples <- colnames(prf)
} else if (trim == "sample") {
tmp2 <- trim_FeatureOrSample(prf, 2, cutoff)
remain_features <- rownames(prf)
remain_samples <- rownames(tmp2)
} else if(trim == "both") {
tmp1 <- trim_FeatureOrSample(prf, 1, cutoff)
tmp2 <- trim_FeatureOrSample(prf, 2, cutoff)
remain_features <- rownames(tmp1)
remain_samples <- rownames(tmp2)
} else if(trim == "none") {
return(object)
}
if (length(remain_features) > 1 & length(remain_samples) > 1) {
prf_remain <- prf[remain_features, remain_samples]
} else if (length(remain_features) > 1 & length(remain_samples) == 1) {
prf_remain <- prf[remain_features, remain_samples] %>%
data.frame()
colnames(prf_remain) <- remain_samples
} else if (length(remain_features) == 1 & length(remain_samples) > 1) {
prf_remain <- prf[remain_features, remain_samples] %>%
data.frame()
rownames(prf_remain) <- remain_features
} else if (length(remain_features) == 1 & length(remain_samples) == 1) {
prf_remain <- prf[remain_features, remain_samples] %>%
data.frame()
colnames(prf_remain) <- remain_samples
rownames(prf_remain) <- remain_features
} else {
stop("No sample or taxa were remained, please reinput your cutoff")
}
if (inherits(object, "phyloseq")) {
otu_table(ps_taxa) <- phyloseq::otu_table(prf_remain, taxa_are_rows = taxa_are_rows(ps_taxa))
object <- ps_taxa
} else if (inherits(object, "environment")) {
object <- phyloseq::otu_table(prf_remain, taxa_are_rows = taxa_are_rows(object))
} else {
object <- prf_remain
}
return(object)
}
# the data is trimmed by threshold
trim_FeatureOrSample <- function(x, nRow, threshold) {
df_occ <- apply(x, nRow, function(x) {
length(x[c(which(!is.na(x) & x!=0))]) / length(x)
}) %>%
data.frame() %>% stats::setNames("Occ") %>%
tibble::rownames_to_column("type")
if(nRow == 1){
rownames(df_occ) <- rownames(x)
}else{
rownames(df_occ) <- colnames(x)
}
df_KEEP <- apply(df_occ > threshold, 1, all) %>%
data.frame() %>% stats::setNames("Status") %>%
dplyr::filter(Status)
return(df_KEEP)
}
# generate phyloseq
get_phyloseq <- function(y, x=metadata) {
# x = metadata
# y = profile_counts
phenotype <- x %>%
# dplyr::mutate(V1=gsub("\\.*", "", V1)) %>%
dplyr::distinct() %>%
tibble::column_to_rownames("V1")
profile <- y %>%
dplyr::select(-V1) %>%
dplyr::rename(ID=TaxaID)
# colnames(profile) <- gsub("\\.*", "", colnames(profile))
sid <- intersect(rownames(phenotype),
colnames(profile))
phen <- phenotype[rownames(phenotype) %in% sid, ]
prof <- profile %>%
dplyr::select(all_of(c("ID", rownames(phen))))
dat_res <- import_metaphlan_taxa(data_metaphlan2 = prof)
phy <- get_metaphlan_phyloseq(otu_tab = dat_res$abu_tab,
sam_tab = phen,
tax_tab = dat_res$tax_tab)
res <- run_trim(object = phy,
trim = "feature",
cutoff = 0)
return(res)
}
- Acquiring phyloseq object
## Counts
phy_counts <- get_phyloseq(y = profile_counts)
## Relative abundance
phy_rb <- get_phyloseq(y = profile_rb)
if (!dir.exists("./phyloseq/")) {
dir.create("./phyloseq/", recursive = T)
}
file_name_counts <- paste0("./phyloseq/curatedMetagenomic_", Sys.Date(), "_count.RDS")
saveRDS(phy_counts, file_name_counts, compress = TRUE)
file_name_rb <- paste0("./phyloseq/curatedMetagenomic_", Sys.Date(), "_relative.RDS")
saveRDS(phy_rb, file_name_rb, compress = TRUE)
Finally, taxonomic profile of 20,089 samples were downloaded from curatedMetagenomicData
.
Summary
Download taxa profile of multiple samples curatedMetagenomicData using R.
- Extracting Studies’ information.
- Downloading results in one batch.
- Converting metadata and profile into phyloseq object.