Source code for njab.stats.ancova

"""Analysis of covariance using pingouin and statsmodels."""
from __future__ import annotations

import numpy as np
import pandas as pd
import pingouin as pg
import statsmodels


[docs] def ancova_pg( df_long: pd.DataFrame, feat_col: str, dv: str, between: str, covar: list[str] | str, ) -> pd.DataFrame: """ Analysis of covariance (ANCOVA) using pg.ancova https://pingouin-stats.org/generated/pingouin.ancova.html Adds multiple hypothesis testing correction by Benjamini-Hochberg (qvalue, rejected) Parameters ---------- df_long : pd.DataFrame should be long data format feat_col : str feature column (or index) name dv : str Name of column containing the dependant variable, passed to pg.ancova between : str Name of column containing the between factor, passed to pg.ancova covar : list, str Name(s) of column(s) containing the covariate, passed to pg.ancova fdr : float, optional FDR treshold to apply, by default 0.05 Returns ------- pd.DataFrame Columns: [ 'Source', 'SS', 'DF', 'F', 'p-unc', 'np2', '{feat_col}', '-Log10 pvalue', 'qvalue', 'rejected'] """ scores = [] # num_covar = len(covar) for feat_name, data_feat in df_long.groupby(feat_col): # ? drop duplicated colummns in long data format? ancova = pg.ancova(data=data_feat, dv=dv, between=between, covar=covar) ancova[feat_col] = feat_name scores.append(ancova) scores = pd.concat(scores) scores['-Log10 pvalue'] = -np.log10(scores['p-unc']) return scores
[docs] def add_fdr_scores(scores: pd.DataFrame, random_seed: int = None, alpha: float = 0.05, method: str = 'indep', p_val_column: str = 'p-unc') -> pd.DataFrame: """Add FDR scores based on p-values in p_val_column.""" if random_seed is not None: np.random.seed(random_seed) reject, qvalue = statsmodels.stats.multitest.fdrcorrection( scores[p_val_column], alpha=alpha, method=method) scores['qvalue'] = qvalue scores['rejected'] = reject return scores
[docs] class Ancova(): """Base Ancova class.""" def __init__(self, df_proteomics: pd.DataFrame, df_clinic: pd.DataFrame, target: str, covar: list[str], value_name: str = 'intensity'): """ Parameters ---------- df_proteomics : pd.DataFrame proteomic measurements in wide format df_clinic : pd.DataFrame clinical data, containing `target` and `covar` target : str Variable for stratification contained in `df_clinic` covar : list[str] List of control varialbles contained in `df_clinic` value_name : str Name to be used for protemics measurements in long-format, default "intensity" """ self.df_proteomics = df_proteomics self.df_clinic = df_clinic self.target = target self.covar = covar self.value_name = value_name
[docs] def get_scores(self): data = (self.df_proteomics.loc[self.df_clinic[ self.target].notna()].stack().to_frame(self.value_name).join( self.df_clinic)) feat_col = data.index.names[-1] scores = ancova_pg(data, feat_col=feat_col, dv=self.value_name, between=self.target, covar=self.covar) return scores.set_index(feat_col)
[docs] def ancova(self): raise NotImplementedError
[docs] def filter_residuals_from_scores(scores: pd.DataFrame, filter_for='Residual') -> pd.DataFrame: """Remove residual from pingouin ANCOVA list.""" scores = scores[scores.Source != filter_for] return scores
[docs] class AncovaAll(Ancova): """Ancova with FDR on all variables except the constant of the linear regression for each. """
[docs] def ancova(self, random_seed=123): scores = self.get_scores() scores = filter_residuals_from_scores(scores) # drop nan values (due to multicollinearity of features - i.e. duplicated features) scores = scores.dropna() scores = add_fdr_scores(scores, random_seed=random_seed) self.scores = scores return scores.set_index('Source', append=True)
[docs] def filter_all_covars_from_scores(scores: pd.DataFrame, filter_for: str) -> pd.DataFrame: """Only keep feature score from pingouin ANCOVA list.""" scores = scores[scores.Source == filter_for] return scores
[docs] class AncovaOnlyTarget(Ancova): """Ancova with FDR on only the target variables p-values in the set of hypothesis."""
[docs] def ancova(self, random_seed=123) -> pd.DataFrame: scores = self.get_scores() scores = filter_all_covars_from_scores(scores, filter_for=self.target) scores = add_fdr_scores(scores, random_seed=random_seed) self.scores = scores return scores.set_index('Source', append=True)