Machine Learning on gut microbiota of patients with Colorectal cancer (5)

Differential Analysis

Notebook 5 Differential Analysis.

There are 152 species in our dataset. In order to clinically interpret the potential biomarkers identified by the next step in machine learning, we perform the differential analysis to figure out the significant species as the inputs for building models. The method and criterion are as follows:

  • T-test based on CLR tranformed data;
  • Pvalue or Adjusted-Pvalue is less than 0.05;
  • the absolute values of Log2Foldchange on Mean values is more than 1.

Goal

Find the significant features of the data and choose them for machine learning.

Loading data and essential libraries

import pandas as pd 
import numpy as np

data_df = pd.read_table('./dataset/MergeData_clr.tsv', sep="\t", index_col=0)
data = data_df.reset_index(drop=True)
data.head()

T test

Since profile are CLR transformed data, we choose t test to identify the significant species

from itertools import combinations
from scipy import stats as st


def all_pairwise(df, compare_col = 'disease'):

    decade_pairs = [(i, j) for i, j in combinations(df[compare_col].unique().tolist(), 2)]
    # or add a list of colnames to function signature
    cols = list(df.columns)
    cols.remove(compare_col)
    list_of_dfs = []
    for pair in decade_pairs:
        for col in cols:
            c1 = df[df[compare_col] == pair[0]][col]
            c2 = df[df[compare_col] == pair[1]][col]
            results = st.ttest_ind(c1, c2, nan_policy='omit')
            tmp = pd.DataFrame({'dec1': pair[0],
                                'dec2': pair[1],
                                'tstat': results.statistic,
                                'pvalue': results.pvalue}, index = [col])
            list_of_dfs.append(tmp)
    df_stats = pd.concat(list_of_dfs)

    return df_stats

ttest_res = all_pairwise(data)

ttest_res.head()

  • FDR correction
def correct_pvalues_for_multiple_testing(pvalues, correction_type = "Benjamini-Hochberg"):                
    """                                                                                                   
    consistent with R - print correct_pvalues_for_multiple_testing([0.0, 0.01, 0.029, 0.03, 0.031, 0.05, 0.069, 0.07, 0.071, 0.09, 0.1]) 
    """
    from numpy import array, empty                                                                        
    pvalues = array(pvalues) 
    #n = float(pvalues.shape[0])
    n = pvalues.shape[0]                                                                        
    new_pvalues = empty(n)
    if correction_type == "Bonferroni":                                                                   
        new_pvalues = n * pvalues
    elif correction_type == "Bonferroni-Holm":                                                            
        values = [ (pvalue, i) for i, pvalue in enumerate(pvalues) ]                                      
        values.sort()
        for rank, vals in enumerate(values):                                                              
            pvalue, i = vals
            new_pvalues[i] = (n-rank) * pvalue                                                            
    elif correction_type == "Benjamini-Hochberg":                                                         
        values = [ (pvalue, i) for i, pvalue in enumerate(pvalues) ]                                      
        values.sort()
        values.reverse()                                                                                  
        new_values = []
        for i, vals in enumerate(values):                                                                 
            rank = n - i
            pvalue, index = vals                                                                          
            new_values.append((n/rank) * pvalue)                                                          
        for i in range(0, int(n)-1):  
            if new_values[i] < new_values[i+1]:                                                           
                new_values[i+1] = new_values[i]                                                           
        for i, vals in enumerate(values):
            pvalue, index = vals
            new_pvalues[index] = new_values[i]
                                                                                                               
    return new_pvalues

ttest_res['Adjusted-pvalue'] = correct_pvalues_for_multiple_testing(ttest_res['pvalue'], correction_type = "Benjamini-Hochberg")
ttest_res_sort = ttest_res.sort_values(by='Adjusted-pvalue')
ttest_res_sort.head()

  • Significant species with pvalue less than 0.05
threshold = 0.05 # pvalue or Adjusted-pvalue
signif_species = ttest_res_sort.loc[ttest_res_sort['pvalue'] < threshold]
signif_species.shape
(40, 5)

There are 40 significant species identified by pvalue between CRC and healthy group

Output

Selecting the 40 species profile to downstream analysis

data_signif = pd.concat([data['disease'], data[signif_species.index.tolist()]], axis=1)

data_signif.to_csv('./dataset/MergeData_clr_signif.tsv',
            sep='\t', encoding='utf-8', index=True)

data_signif.head()

correlation matrix

# plot correlation matrix
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

plt.style.use('fivethirtyeight')
sns.set_style("white")

# Compute the correlation matrix
data_species = data_signif.iloc[:, 1:data_signif.shape[1]]
corr = data_species.corr(method="spearman")

# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype='bool')
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
fig, ax = plt.subplots(figsize=(20, 20))
plt.title('CRC Species Correlation')

# Generate a custom diverging colormap
cmap = sns.diverging_palette(260, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, 
            vmax=1.1, 
            square='square', 
            cmap=cmap, 
            mask=mask, 
            ax=ax,
            annot=True, 
            fmt='.1g',
            linewidths=2)

A Summary of the significant species here:

  1. t-test on microbiota data
  2. FDR correction on pvalue of t-test
  3. obtain significant species

Referenece

Previous
Next