How to download taxa profile of multiple samples

Data derived from curatedMetagenomicData

Hua Zou

Last updated on Nov 11, 2022 7 min read Tool

Introduction

R package curatedMetagenomicData provides more than 20,000 metaphlan’s results of samples. However, to download all the results seems not easy when using simple R codes. Here, we give users codes with step by step to grap all data.

Since phyloseq object contains abundance table, taxa table and metadata etc, we convert the final metadata and profile including absolute and relative abundance into phyloseq object.

Loading R packages

suppressMessages(library(dplyr))
suppressMessages(library(tibble))
suppressMessages(library(curatedMetagenomicData))
library(phyloseq)

Studies in curatedMetagenomicData

All the studies’ information were stored in sampleMetadata. Therefore, we obtain the studies’ names by extracting the information from it.

data("sampleMetadata")
availablediseases <- pull(sampleMetadata, study_condition) %>%
  table() %>%
  sort(decreasing = TRUE)

studies <- lapply(names(availablediseases), function(x) {
  dplyr::filter(sampleMetadata, study_condition %in% x) %>%
    dplyr::pull(study_name) %>%
    unique()
})
names(studies) <- names(availablediseases)

Acquiring absolute and relative abundance per study

Merging metadata without duplicated samples
Whether there is absolute or relative abundance profile per study
Whether the samples are identical between metadata and profile

metadata <- data.frame()
profile_counts <- data.frame()
profile_rb <- data.frame()

for (i in 1:length(studies)) {
  disease_dir <- paste0("./dataset/", names(studies)[i])
  
  if (!dir.exists(disease_dir)) {
    dir.create(disease_dir, recursive = TRUE)
  }
  
  for (disease in studies[[i]]) {
    rb_pattern <- paste0(disease, ".relative_abundance")
    
    # absolute abundance
    dat_counts_list <- curatedMetagenomicData(
      pattern = rb_pattern,
      dryrun = FALSE,
      counts = TRUE,
      rownames = "long")
      
    # relative_abundance
    dat_RB_list <- curatedMetagenomicData(
      pattern = rb_pattern,
      dryrun = FALSE,
      counts = FALSE,
      rownames = "long")
    
    # whether there is a taxa profile
    if (length(dat_counts_list) > 0) {
      dat_counts <- dat_counts_list[[1]]
      disease_path_counts <- paste0(disease_dir, "/", paste0(disease, "_counts.RData"))
      save(dat_counts, file = disease_path_counts)
      
      # metadata & profile
      temp_metadata <- colData(dat_counts) %>%
        data.frame()
      temp_profile_counts <- assay(dat_counts) %>%
        data.frame() %>%
        tibble::rownames_to_column("TaxaID")
      
      if (length(profile_counts) == 0) {
        profile_counts <- temp_profile_counts
      } else {
        
        colnames(profile_counts) <- gsub("-", ".", colnames(profile_counts))
        colnames(temp_profile_counts) <- gsub("-", ".", colnames(temp_profile_counts))
        
        # the overlap samples
        sid_count <- intersect(colnames(temp_profile_counts), colnames(profile_counts))
        sid_count <- sid_count[-which(sid_count == "TaxaID")]
        if (length(sid_count) > 0) {
          profile_counts <- profile_counts[, !colnames(profile_counts)%in%sid_count]
        } 
        profile_counts <- dplyr::full_join(temp_profile_counts, 
                                           profile_counts, 
                                           by = "TaxaID")
        
        colnames(profile_counts) <- gsub("-", ".", colnames(profile_counts))
      }
    }
    # whether there is a taxa profile
    if (length(dat_RB_list) > 0) {
      dat_RB <- dat_RB_list[[1]]
      disease_path_RB <- paste0(disease_dir, "/", paste0(disease, "_RB.RData"))
      save(dat_RB, file = disease_path_RB) 
      
      # metadata & profile
      temp_metadata <- colData(dat_RB) %>%
        data.frame()
      temp_profile_rb <- assay(dat_RB) %>%
        data.frame() %>%
        tibble::rownames_to_column("TaxaID")
      
      if (length(profile_rb) == 0) {
        profile_rb <- temp_profile_rb
      } else {
        
        colnames(profile_rb) <- gsub("-", ".", colnames(profile_rb))
        colnames(temp_profile_rb) <- gsub("-", ".", colnames(temp_profile_rb))
        
        # the overlap samples
        sid_rb <- intersect(colnames(temp_profile_rb), colnames(profile_rb))
        sid_rb <- sid_rb[-which(sid_rb == "TaxaID")]
        if (length(sid_rb) > 0) {
          profile_rb <- profile_rb[, !colnames(profile_rb)%in%sid_rb]
        }        
        
        profile_rb <- dplyr::full_join(temp_profile_rb, 
                                       profile_rb, 
                                       by = "TaxaID")
        
        colnames(profile_rb) <- gsub("-", ".", colnames(profile_rb))
      }
      
    }    
    
    if (any(length(dat_counts_list) > 0, length(dat_RB_list) > 0)) {
      
      if (length(metadata) == 0) {
        metadata <- temp_metadata
      } else {
        # diff: metadata vs temp_metadata
        diff1 <- setdiff(colnames(metadata), colnames(temp_metadata))
        diff1_df <- data.frame(matrix(NA, nrow = nrow(temp_metadata), ncol = length(diff1)),
                               row.names = rownames(temp_metadata))
        colnames(diff1_df) <- diff1
        temp_metadata_new <- cbind(temp_metadata, diff1_df)
        
        # diff: temp_metadata vs metadata
        diff2 <- setdiff(colnames(temp_metadata), colnames(metadata))
        diff2_df <- data.frame(matrix(NA, nrow = nrow(metadata), ncol = length(diff2)),
                               row.names = rownames(metadata))
        colnames(diff2_df) <- diff2
        metadata_new <- cbind(metadata, diff2_df) 
        
        rownames(metadata) <- gsub("-", ".", rownames(metadata))
        rownames(temp_metadata) <- gsub("-", ".", rownames(temp_metadata))        
        
        # the overlap samples
        sid <- intersect(rownames(temp_metadata), rownames(metadata))
        if (length(sid) > 0) {
          metadata <- metadata[!rownames(metadata)%in%sid, ]
        }
        
        metadata <- rbind(temp_metadata_new, metadata_new) %>%
          dplyr::distinct()
        
        rownames(metadata) <- gsub("-", ".", rownames(metadata))
      }      
    }
  }
}

Converting datasets into phyloseq

Extracting taxonomic levels from 1st column of profile
Removing prevalence of features less than 0

# extract taxnomic levels
import_metaphlan_taxa <- function(data_metaphlan2,
                                  taxa_level = NULL) {

  taxa_rank <- c("Kingdom", "Phylum", "Class",
                 "Order", "Family", "Genus",
                 "Species", "Strain")

  # rename 1st column into "ID"
  colnames(data_metaphlan2)[which(colnames(data_metaphlan2) == "ID" |
                                    colnames(data_metaphlan2) == "clade_name")] <- "ID"

  # remove the "NCBI_tax_id" column
  if (any(colnames(data_metaphlan2) %in% "NCBI_tax_id")) {
    data_metaphlan2 <- data_metaphlan2 %>% dplyr::select(-NCBI_tax_id)
  }

  # remove sample names with "metaphlan_bugs_list" suffix
  colnames(data_metaphlan2) <- gsub("_metaphlan_bugs_list$", "", colnames(data_metaphlan2))

  if (is.null(taxa_level)) {
    taxa_level <- "Species"
  }
  ind_number <- match(taxa_level, taxa_rank)

  # remove k__Archaea & k__Viruses & k__Eukaryota & Others which are not bacteria
  if (length(grep("k__Bacteria", data_metaphlan2$ID)) > 0) {
    taxa_bacteria <- data_metaphlan2 %>%
      dplyr::slice(grep("k__Bacteria", ID)) %>%
      tibble::column_to_rownames("ID")
  } else {
    stop("There are no bacteria identified in this profile")
  }

  # dividing by 100 to normalize the sum into 1
  taxa_bac_per <- (taxa_bacteria / 100)  %>%
    tibble::rownames_to_column("ID")

  taxa <- taxa_bac_per %>%
    dplyr::group_by(ID) %>%
    dplyr::mutate(Number=length(unlist(strsplit(ID, "|", fixed = TRUE)))) %>%
    dplyr::select(ID, Number) %>%
    dplyr::filter(Number == ind_number)

  abundance_table <- taxa_bac_per %>%
    dplyr::filter(ID%in%taxa$ID)

  taxa_table <- lapply(abundance_table$ID, strsplit, split = "|", fixed = TRUE)
  taxa_table <- lapply(taxa_table, unlist)
  taxa_table <- do.call(rbind, taxa_table)
  taxa_table <- data.frame(taxa_table)
  names(taxa_table) <- taxa_rank[1:ind_number]
  rownames(taxa_table) <- taxa_table[, ind_number]

  abundance_table_res <- abundance_table %>%
    tibble::column_to_rownames("ID")
  rownames(abundance_table_res) <- rownames(taxa_table)
  abundance_table_res <- abundance_table_res[match(rownames(taxa_table),
                                                   row.names(abundance_table_res)), ,
                                             drop = FALSE]

  res <- list(tax_tab=taxa_table,
              abu_tab=abundance_table_res)

  return(res)
}

# Import function to read the the output of metaphlan as phyloseq object
get_metaphlan_phyloseq <- function(otu_tab,
                                   sam_tab,
                                   tax_tab = NULL) {

  if (!is.null(tax_tab)) {
    tax_tab <- phyloseq::tax_table(as(tax_tab, "matrix"))
  }

  if (!is.null(sam_tab)) {
    sam_tab <- phyloseq::sample_data(sam_tab %>% data.frame())
  }

  asv_tab <- phyloseq::otu_table(as(otu_tab, "matrix"),
                                 taxa_are_rows = TRUE)

  ps <- phyloseq::phyloseq(asv_tab, tax_tab, sam_tab)

  return(ps)
}

# Trim samples or taxa in `otu_table` by Occurrences
run_trim <- function(object,
                     taxa_level = NULL,
                     cutoff = 0.1,
                     trim = c("none", "both", "feature", "sample")){

  trim <- match.arg(trim, c("none", "both", "feature", "sample"))
  if (inherits(object, "phyloseq")) {
    # ps <- preprocess_ps(object)
    if (!is.null(taxa_level)) {
      ps_taxa <- summarize_taxa(object, taxa_level = taxa_level)
    } else {
      ps_taxa <- object
    }
    prf <- as(phyloseq::otu_table(ps_taxa), "matrix")
  } else if (inherits(object, "environment")) {
    prf <- as(object$.Data, "matrix")
  } else {
    prf <- object
  }

  if (trim == "feature") {
    tmp1 <- trim_FeatureOrSample(prf, 1, cutoff)
    remain_features <- rownames(tmp1)
    remain_samples <- colnames(prf)
  } else if (trim == "sample") {
    tmp2 <- trim_FeatureOrSample(prf, 2, cutoff)
    remain_features <- rownames(prf)
    remain_samples <- rownames(tmp2)
  } else if(trim == "both") {
    tmp1 <- trim_FeatureOrSample(prf, 1, cutoff)
    tmp2 <- trim_FeatureOrSample(prf, 2, cutoff)
    remain_features <- rownames(tmp1)
    remain_samples <- rownames(tmp2)
  } else if(trim == "none") {
    return(object)
  }

  if (length(remain_features) > 1 & length(remain_samples) > 1) {
    prf_remain <- prf[remain_features, remain_samples]
  } else if (length(remain_features) > 1 & length(remain_samples) == 1) {
    prf_remain <- prf[remain_features, remain_samples] %>%
      data.frame()
    colnames(prf_remain) <- remain_samples
  } else if (length(remain_features) == 1 & length(remain_samples) > 1) {
    prf_remain <- prf[remain_features, remain_samples] %>%
      data.frame()
    rownames(prf_remain) <- remain_features
  } else if (length(remain_features) == 1 & length(remain_samples) == 1) {
    prf_remain <- prf[remain_features, remain_samples] %>%
      data.frame()

    colnames(prf_remain) <- remain_samples
    rownames(prf_remain) <- remain_features
  } else {
    stop("No sample or taxa were remained, please reinput your cutoff")
  }

  if (inherits(object, "phyloseq")) {
    otu_table(ps_taxa) <- phyloseq::otu_table(prf_remain, taxa_are_rows = taxa_are_rows(ps_taxa))
    object <- ps_taxa
  } else if (inherits(object, "environment")) {
    object <- phyloseq::otu_table(prf_remain, taxa_are_rows = taxa_are_rows(object))
  } else {
    object <- prf_remain
  }

  return(object)
}

# the data is trimmed by threshold
trim_FeatureOrSample <- function(x, nRow, threshold) {

  df_occ <- apply(x, nRow, function(x) {
    length(x[c(which(!is.na(x) & x!=0))]) / length(x)
  }) %>%
    data.frame() %>% stats::setNames("Occ") %>%
    tibble::rownames_to_column("type")
  if(nRow == 1){
    rownames(df_occ) <- rownames(x)
  }else{
    rownames(df_occ) <- colnames(x)
  }
  df_KEEP <- apply(df_occ > threshold, 1, all) %>%
    data.frame() %>% stats::setNames("Status") %>%
    dplyr::filter(Status)

  return(df_KEEP)
}

# generate phyloseq
get_phyloseq <- function(y, x=metadata) {
  
  # x = metadata
  # y = profile_counts

  phenotype <- x %>%
    # dplyr::mutate(V1=gsub("\\.*", "", V1)) %>%
    dplyr::distinct() %>%
    tibble::column_to_rownames("V1")
  profile <- y %>%
    dplyr::select(-V1) %>%
    dplyr::rename(ID=TaxaID)
  # colnames(profile) <- gsub("\\.*", "", colnames(profile))
  
  sid <- intersect(rownames(phenotype),
                   colnames(profile))
  phen <- phenotype[rownames(phenotype) %in% sid, ]
  prof <- profile %>%
    dplyr::select(all_of(c("ID", rownames(phen))))
  
  dat_res <- import_metaphlan_taxa(data_metaphlan2 = prof)
  
  phy <- get_metaphlan_phyloseq(otu_tab = dat_res$abu_tab,
                         sam_tab = phen,
                         tax_tab = dat_res$tax_tab)
  
  res <- run_trim(object = phy, 
                  trim = "feature",
                  cutoff = 0)
  
  return(res)
}

Acquiring phyloseq object

## Counts 
phy_counts <- get_phyloseq(y = profile_counts)

## Relative abundance
phy_rb <- get_phyloseq(y = profile_rb)


if (!dir.exists("./phyloseq/")) {
  dir.create("./phyloseq/", recursive = T)
}

file_name_counts <- paste0("./phyloseq/curatedMetagenomic_", Sys.Date(), "_count.RDS")
saveRDS(phy_counts, file_name_counts, compress = TRUE)

file_name_rb <- paste0("./phyloseq/curatedMetagenomic_", Sys.Date(), "_relative.RDS")
saveRDS(phy_rb, file_name_rb, compress = TRUE)

Finally, taxonomic profile of 20,089 samples were downloaded from curatedMetagenomicData.

Summary

Download taxa profile of multiple samples curatedMetagenomicData using R.

Extracting Studies’ information.
Downloading results in one batch.
Converting metadata and profile into phyloseq object.

metaphlan microbiota