Source code for njab.stats.groups_comparision

"""Binomial test and t-test for group comparison."""
import logging
import pandas as pd
import pingouin as pg

from scipy.stats import binomtest as scipy_binomtest

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


def means_between_groups(
    df: pd.DataFrame,
    boolean_array: pd.Series,
    event_names: tuple[str, str] = ('1', '0'),
) -> pd.DataFrame:
    """Summarise each variable separately for the two groups of a boolean mask.

    Parameters
    ----------
    df : pd.DataFrame
        Observations in rows, variables in columns.
    boolean_array : pd.Series
        Boolean mask over the rows of `df`. Rows where it is True form the
        first group, the remaining rows form the second group.
    event_names : tuple[str, str]
        Labels of the first (mask is True) and second (mask is False) group.

    Returns
    -------
    pd.DataFrame
        One row per variable. The columns are a two-level index
        (``event``, ``stats``) holding, for each group, the first three rows
        of ``DataFrame.describe`` (count, mean and std for numeric columns).
    """
    per_group = []
    # Position 0 is the group selected by the mask, position 1 its complement.
    for position, mask in enumerate((boolean_array, ~boolean_array)):
        summary = df.loc[mask].describe().iloc[:3]
        summary['event'] = event_names[position]
        # Move the group label in front of the statistic name in the index.
        per_group.append(summary.set_index('event', append=True).swaplevel())
    combined = pd.concat(per_group)
    combined.columns.name = 'variable'
    combined.index.names = ('event', 'stats')
    return combined.T
def calc_stats(df: pd.DataFrame, boolean_array: pd.Series,
               vars: list[str]) -> pd.DataFrame:
    """Run a t-test for each variable in `vars` between two groups.

    Parameters
    ----------
    df : pd.DataFrame
        Observations in rows, variables in columns.
    boolean_array : pd.Series
        Boolean mask over the rows of `df`. Rows where it is True are
        compared against the rows where it is False.
    vars : list[str]
        Column labels of `df` to test. A ``pd.Index`` is accepted as well;
        its name is then kept as the name of the result's index.

    Returns
    -------
    pd.DataFrame
        One row per entry of `vars`. The columns are a two-level index
        (``test``, ``var``) whose first level is the constant ``'ttest'`` and
        whose second level holds the columns returned by ``pg.ttest``.
    """
    # Wrap the labels in an Index: `DataFrame.set_index` interprets a plain
    # list of strings as *column names* and would raise a KeyError.
    labels = pd.Index(vars)
    ttests = [
        pg.ttest(df.loc[boolean_array, var], df.loc[~boolean_array, var])
        for var in labels
    ]
    ret = pd.concat(ttests)
    # NOTE(review): assumes `pg.ttest` yields exactly one row per call, so
    # that the rows line up one-to-one with `labels`.
    ret = ret.set_index(labels)
    ret.columns = pd.MultiIndex.from_product([['ttest'], ret.columns],
                                             names=('test', 'var'))
    return ret
def diff_analysis(
    df: pd.DataFrame,
    boolean_array: pd.Series,
    event_names: tuple[str, str] = ('1', '0'),
    ttest_vars=("alternative", "p-val", "cohen-d"),
) -> pd.DataFrame:
    """Differential analysis between two groups defined by a boolean mask.

    Combines the per-group summary statistics of every variable with the
    selected columns of a t-test between the two groups.

    Parameters
    ----------
    df : pd.DataFrame
        Observations in rows, variables in columns.
    boolean_array : pd.Series
        Boolean mask over the rows of `df` separating the two groups.
    event_names : tuple[str, str]
        Labels of the group where the mask is True and where it is False.
    ttest_vars : tuple of str
        Names of the t-test result columns to append to the summary.

    Returns
    -------
    pd.DataFrame
        One row per variable: the group summaries followed by the requested
        t-test columns.
    """
    group_summary = means_between_groups(df,
                                         boolean_array=boolean_array,
                                         event_names=event_names)
    # The summary is indexed by variable, so its index lists what to test.
    ttest_table = calc_stats(df,
                             boolean_array=boolean_array,
                             vars=group_summary.index)
    # Keep only the requested columns on the second column level.
    wanted = pd.IndexSlice[:, ttest_vars]
    return group_summary.join(ttest_table.loc[:, wanted])
def binomtest(
    var: pd.Series,
    boolean_array: pd.Series,
    alternative='two-sided',
    event_names: tuple[str, str] = ('event', 'no-event'),
) -> pd.DataFrame:
    """Binomial test for a binary categorical variable between two groups.

    The share of the second category (category code 1) in the group where
    `boolean_array` is False serves as the hypothesised probability. The
    number of observations of that category in the group where
    `boolean_array` is True is tested against it.

    Parameters
    ----------
    var : pd.Series
        Binary variable. If it is not of categorical dtype it is converted
        and a warning is logged.
    boolean_array : pd.Series
        Boolean mask over `var` separating the two groups.
    alternative : str
        Passed on to ``scipy.stats.binomtest``.
    event_names : tuple[str, str]
        Labels of the group where the mask is True and where it is False.

    Returns
    -------
    pd.DataFrame
        A single row labelled by ``var.name`` with two-level columns: count
        and share ``p`` for each group, followed by the attributes of the
        scipy test result under ``'binomial test'``.

    Raises
    ------
    AssertionError
        If `var` does not have exactly two categories.
    """
    entry = {'variable': var.name}
    if var.dtype != 'category':
        logger.warning(
            "Passed non-categorical data (categorical was expected): %s",
            var.name)
        var = var.astype('category')
    categories = var.cat.categories
    # Kept as an assertion so that the raised exception type is unchanged.
    assert len(categories) == 2, (
        f"No binary variable, found {len(categories)} categories: "
        f"{list(categories)}")

    in_event = var.loc[boolean_array]
    in_no_event = var.loc[~boolean_array]
    # With two categories the codes are 0/1, so their mean is the share of
    # the category with code 1 among the non-missing observations.
    p_1 = in_event.dropna().cat.codes.mean()
    p_0 = in_no_event.dropna().cat.codes.mean()
    logger.debug(f"p cat==0: {p_0}, p cat==1: {p_1}")

    cat_at_pos_one = categories[1]
    logger.debug('Category with code 1: %s', cat_at_pos_one)

    counts = in_event.value_counts()
    k, n = counts.loc[cat_at_pos_one], counts.sum()
    entry[event_names[0]] = dict(count=n, p=p_1)
    entry[event_names[1]] = dict(count=in_no_event.value_counts().sum(),
                                 p=p_0)

    test_res = scipy_binomtest(k, n, p_0, alternative=alternative)
    # Turn the attributes of the result object into one row with
    # ('binomial test', <attribute>) columns.
    test_res = pd.Series(test_res.__dict__).to_frame('binomial test').unstack()
    test_res.name = entry['variable']
    test_res = test_res.to_frame().T

    # One row per variable, columns (<group label>, 'count' | 'p').
    entry = pd.DataFrame(entry).set_index('variable', append=True).unstack(0)
    return entry.join(test_res)