Source code for njab.sklearn

"""Scikit-learn related functionality.

Finds the a good set of features using minimum redundancy maximum relevance (MRMR)
for a logistic regression model a binary target variable.
"""
from typing import Optional

import pandas as pd
import sklearn
import sklearn.model_selection

from njab.sklearn import scoring
from njab.sklearn.pca import run_pca
from njab.sklearn.preprocessing import StandardScaler
from njab.sklearn.types import (AucRocCurve, PrecisionRecallCurve, Results,
                                ResultsSplit, Splits)

__all__ = [
    'run_model',
    'get_results_split',
    'find_n_best_features',
    'run_pca',
    'scoring',
    'StandardScaler',
]

RANDOM_STATE = 42

default_log_reg = sklearn.linear_model.LogisticRegression(
    random_state=RANDOM_STATE
    #   , solver='liblinear'
)


[docs] def run_model( splits: Splits, model: sklearn.base.BaseEstimator = default_log_reg, fit_params=None, n_feat_to_select=9, ) -> Results: """Fit a model on the training split and calculate performance metrics on both train and test split. """ from mrmr import mrmr_classif selected_features = mrmr_classif(X=splits.X_train, y=splits.y_train, K=n_feat_to_select) if fit_params is None: fit_params = {} model = model.fit(splits.X_train[selected_features], splits.y_train, **fit_params) pred_score_test = model.predict_proba(splits.X_test[selected_features])[:, 1] results_test = get_results_split(y_true=splits.y_test, y_score=pred_score_test) pred_score_train = model.predict_proba( splits.X_train[selected_features])[:, 1] results_train = get_results_split(y_true=splits.y_train, y_score=pred_score_train) ret = Results(model=model, selected_features=selected_features, train=results_train, test=results_test) return ret
[docs] def get_results_split(y_true, y_score): """Calculate metrics for a single set of samples.""" ret = ResultsSplit( auc=sklearn.metrics.roc_auc_score(y_true=y_true, y_score=y_score)) ret.roc = AucRocCurve( *sklearn.metrics.roc_curve(y_true=y_true, y_score=y_score)) ret.prc = PrecisionRecallCurve(*sklearn.metrics.precision_recall_curve( y_true=y_true, probas_pred=y_score)) ret.aps = sklearn.metrics.average_precision_score(y_true=y_true, y_score=y_score) return ret
default_log_reg = sklearn.linear_model.LogisticRegression( random_state=RANDOM_STATE, solver='liblinear')
[docs] def find_n_best_features( X: pd.DataFrame, y: pd.Series, name: str, model: sklearn.base.BaseEstimator = default_log_reg, groups=None, # ? Optional[array-like] n_features_max: int = 15, random_state: int = RANDOM_STATE, scoring: Optional[tuple] = ('precision', 'recall', 'f1', 'balanced_accuracy', 'roc_auc', 'average_precision'), return_train_score: bool = False, fit_params: Optional[dict] = None): """Create a summary of model performance on 10 times 5-fold cross-validation.""" from mrmr import mrmr_classif summary = [] cv = sklearn.model_selection.RepeatedStratifiedKFold( n_splits=5, n_repeats=10, random_state=random_state) in_both = y.index.intersection(X.index) # could have a warning in case _X = X.loc[in_both] _y = y.loc[in_both] n_features_max = min(n_features_max, X.shape[-1]) for n_features in range(1, n_features_max + 1): selected_features = mrmr_classif(_X, _y, K=n_features) _X_mrmr = _X[selected_features] scores = sklearn.model_selection.cross_validate( estimator=model, X=_X_mrmr, y=_y, groups=groups, scoring=scoring, cv=cv, return_train_score=return_train_score, params=fit_params, error_score='raise') scores['n_features'] = n_features scores['test_case'] = name scores['n_observations'] = _X.shape[0] results = pd.DataFrame(scores) summary.append(results) summary_n_features = pd.concat(summary) return summary_n_features
def transform_DataFrame(X: pd.DataFrame, fct: callable) -> pd.DataFrame: """Set index and columns of a DataFrame after applying a callable which might only return a numpy array. Parameters ---------- X : pd.DataFrame Original DataFrame to be transformed fct : callable Callable to be applied to every element in the DataFrame. Returns ------- pd.DataFrame Transformed DataFrame """ ret = fct(X) ret = pd.DataFrame(ret, index=X.index, columns=X.columns) return ret