Machine Learning on gut microbiota of patients with Colorectal cancer (1)

Obtaining inpudata

Rmarkdown 1: Downloading data.

Downloading datasets using curatedMetagenomicData, which contains the HUMANN or Metaphlan results.

Loading packages

knitr::opts_chunk$set(warning = FALSE)
library(dplyr)
library(tibble)
library(curatedMetagenomicData)
#library(curatedMetagenomicAnalyses)

# rm(list = ls())
options(stringsAsFactors = F)
options(future.globals.maxSize = 1000 * 1024^2)

Investigating potential response variables

  • These are the 10 study conditions most commonly found in curatedMetagenomicData:
data("sampleMetadata")
availablediseases <- pull(sampleMetadata, study_condition) %>%
  table() %>%
  sort(decreasing = TRUE)

availablediseases

  • And the number of studies they are found in:
studies <- lapply(names(availablediseases), function(x){
  filter(sampleMetadata, study_condition %in% x) %>%
    pull(study_name) %>%
    unique()
})
names(studies) <- names(availablediseases)
studies <- studies[-grep("control", names(studies))] #get rid of controls
studies <- studies[sapply(studies, length) > 1] #available in more than one study

studies

  • Each of these datasets has six data types associated with it; for example:
curatedMetagenomicData(pattern = "YachidaS_2019.+",
                       dryrun = TRUE,
                       counts = TRUE,
                       rownames = "long") 

The metagenomics datasets contain more than 13 types data, which comprising taxonomic and functional profile with relative and absolute abundance matrix.

  • Relative abundance: storing into TreeSummarizedExperiment object
YachidaS_2019_dataset <- curatedMetagenomicData(pattern = "YachidaS_2019.+relative_abundance",
                       dryrun = FALSE,
                       counts = TRUE,
                       rownames = "long")

YachidaS_2019_RB_TSE <- YachidaS_2019_dataset$`2021-10-14.YachidaS_2019.relative_abundance`
YachidaS_2019_RB_TSE

Write relative abundance datasets to disk

if (0) {
for (i in seq_along(studies)){
  cond <- names(studies)[i]
  se <-
    curatedMetagenomicAnalyses::makeSEforCondition(cond, removestudies = "HMP_2019_ibdmdb", dataType = "relative_abundance")
  print(paste("Next study condition:", cond, " /// Body site: ", unique(colData(se)$body_site)))
  print(with(colData(se), table(study_name, study_condition)))
  cat("\n \n")
  save(se, file = paste0(cond, ".rda"))
  flattext <- select(as.data.frame(colData(se)), c("study_name", "study_condition", "subject_id"))
  rownames(flattext) <- colData(se)$sample_id
  flattext <- cbind(flattext, data.frame(t(assay(se))))
  write.csv(flattext, file = paste0(cond, ".csv"))
  system(paste0("gzip ", cond, ".csv"))
}
}

Preparing for machine learning

  • metadata

  • relative abundance profile

metadata <- colData(YachidaS_2019_RB_TSE) %>% 
  data.frame()
phenotype <- metadata %>% 
  dplyr::select(disease) %>%
  tibble::rownames_to_column("SampleID") %>%
  dplyr::filter(disease %in% c("CRC", "healthy"))

profile <- assay(YachidaS_2019_RB_TSE)

sid <- intersect(phenotype$SampleID, colnames(profile))

prof <- profile %>%
  data.frame() %>% 
  tibble::rownames_to_column("TaxaID") %>%
  dplyr::group_by(TaxaID) %>%
  dplyr::mutate(TaxaID_new = unlist(strsplit(TaxaID, "\\|"))[7]) %>%
  dplyr::select(TaxaID, TaxaID_new, all_of(sid)) %>%
  dplyr::ungroup() %>%
  dplyr::select(-TaxaID) %>%
  dplyr::rename(TaxaID = TaxaID_new)

phen <- phenotype %>%
  dplyr::filter(SampleID %in% sid)

output

if (!dir.exists("./dataset")) {
  dir.create("./dataset", recursive = TRUE)
}

write.csv(phen, "./dataset/metadata.csv", row.names = F)
write.table(prof, "./dataset/species.tsv", sep = "\t", quote = F, row.names = F)

Session info

devtools::session_info()

Reference

Previous
Next