Source code for njab.sklearn.scoring

import numpy as np
import pandas as pd
import sklearn.metrics as sklm

from njab.sklearn.types import Results



[docs]
class ConfusionMatrix():
    """Wrapper for `sklearn.metrics.confusion_matrix`.

    The raw matrix is stored in ``cm_`` (rows are true classes, columns are
    predicted classes, as returned by scikit-learn).
    """

    def __init__(self, y_true, y_pred):
        """Compute and store the confusion matrix of `y_true` vs. `y_pred`."""
        self.cm_ = sklm.confusion_matrix(y_true, y_pred)

    def as_dataframe(self, names=('true', 'pred')) -> pd.DataFrame:
        """Create pandas.DataFrame and return.
        Names rows and columns.

        Parameters
        ----------
        names : tuple of two labels
            ``(true_name, pred_name)``: name of the row index and label of
            the top level of the column MultiIndex.

        Returns
        -------
        pandas.DataFrame
            The confusion matrix. The frame is cached on ``self.df`` and
            re-used as long as the same `names` are requested.
        """
        true_name, pred_name = names
        requested = (true_name, pred_name)
        # Rebuild when nothing is cached yet or when the cached frame was
        # built with different names; otherwise a later call with other
        # names would silently return stale labels.
        if (not hasattr(self, 'df')
                or getattr(self, '_df_names', None) != requested):
            frame = pd.DataFrame(self.cm_)
            frame.index.name = true_name
            frame.columns = pd.MultiIndex.from_product([[pred_name],
                                                        frame.columns])
            self.df = frame
            self._df_names = requested
        return self.df

    def classification_label(self) -> dict:
        """Classification labels as dict.

        Returns
        -------
        dict
            Counts keyed by 'TN', 'FP', 'FN' and 'TP'.

        Raises
        ------
        ValueError
            If the stored matrix is not 2x2 (i.e. not a binary problem).
        """
        if self.cm_.shape != (2, 2):
            raise ValueError(
                "Classification labels are only defined for a binary (2x2) "
                f"confusion matrix, got shape {self.cm_.shape}.")
        tn, fp, fn, tp = self.cm_.ravel()
        return {'TN': tn, 'FP': fp, 'FN': fn, 'TP': tp}

    def as_classification_series(self) -> pd.Series:
        """Classification labels as pandas.Series."""
        return pd.Series(self.classification_label())

    @property
    def as_array(self):
        """Return sklearn.metrics.confusion_matrix array"""
        return self.cm_

    def __str__(self):
        """sklearn.metrics.confusion_matrix __str__"""
        return str(self.cm_)

    def __repr__(self):
        """sklearn.metrics.confusion_matrix __repr__"""
        return repr(self.cm_)




[docs]
def get_label_binary_classification(y_true: int, y_pred: int) -> str:
    """Get labels (TP, FN, TN, FP) for single case in binary classification.

    Parameters
    ----------
    y_true : int
        True class, either 0 or 1.
    y_pred : int
        Predicted class, either 0 or 1.

    Returns
    -------
    str
        One of 'TP', 'FN', 'TN' or 'FP'.

    Raises
    ------
    ValueError
        If `y_true` or `y_pred` is neither 0 nor 1.
    """
    # Validate y_true first so that it is reported when both are invalid.
    if y_true not in (0, 1):
        raise ValueError(f"Unknown `y_true`: {y_true} ({ type(y_true) })")
    if y_pred not in (0, 1):
        raise ValueError(f"Unknown `y_pred`: {y_pred} ({ type(y_pred) })")
    if y_true == 1:
        return 'TP' if y_pred == 1 else 'FN'
    return 'TN' if y_pred == 0 else 'FP'




[docs]
def get_score(clf, X: pd.DataFrame, pos=1) -> pd.Series:
    """Return the predicted probability of one class of a binary classifier.

    Calls ``clf.predict_proba(X)`` and picks column `pos` (by default
    column 1) of the result. The returned Series carries the index of `X`.
    More than two probability columns are not supported and raise
    ``NotImplementedError``.
    """
    proba = clf.predict_proba(X)
    # Guard clause: only the binary case is implemented.
    if proba.shape[-1] > 2:
        raise NotImplementedError
    return pd.Series(proba[:, pos], index=X.index)




[docs]
def get_pred(clf, X: pd.DataFrame) -> pd.Series:
    """Return ``clf.predict(X)`` as a Series indexed like `X`."""
    return pd.Series(clf.predict(X), index=X.index)




[docs]
def get_custom_pred(clf, X: pd.DataFrame, cutoff=0.5) -> pd.Series:
    """Predict the class of a binary classifier with a custom cutoff.

    A sample is assigned 1 if its score from `get_score` is strictly
    greater than `cutoff`, and 0 otherwise. The index of `X` is kept.
    """
    above_cutoff = get_score(clf, X).gt(cutoff)
    return above_cutoff.astype(int)




[docs]
def get_target_count_per_bin(score: pd.Series,
                             y: pd.Series,
                             n_bins: int = 10) -> pd.DataFrame:
    """Create pivot table with y summed per equally sized bin of scores.

    Parameters
    ----------
    score : pandas.Series
        Scores to bin. The bins split the range 0 to 1 into `n_bins`
        intervals of equal width.
    y : pandas.Series
        Values summed per score bin (column 'y==1' of the result).
    n_bins : int, default 10
        Number of bins.

    Returns
    -------
    pandas.DataFrame
        One row per bin (empty bins included, with sum 0) and a single
        integer column 'y==1'.
    """
    bin_edges = [edge / n_bins for edge in range(n_bins + 1)]
    # NOTE(review): pd.cut builds right-closed intervals, so a score of
    # exactly 0 falls outside the first bin -- confirm this is intended.
    pred_bins = pd.DataFrame({
        'score': pd.cut(score, bins=bin_edges),
        'y==1': y,
    })
    # `observed=False` keeps empty bins in the table regardless of the
    # pandas default for categorical groupers.
    pred_bins = pred_bins.groupby(by='score',
                                  observed=False).sum().astype(int)
    return pred_bins




[docs]
def get_lr_multiplicative_decomposition(results: Results, X: pd.DataFrame,
                                        prob: pd.Series,
                                        y: pd.Series) -> pd.DataFrame:
    """Multiplicative decomposition of odds at the base of the
    logistic regression model.

    Parameters
    ----------
    results : Results
        Object providing `selected_features` and a fitted `model` with
        `coef_` and `intercept_` attributes.
    X : pandas.DataFrame
        Data containing at least the selected features as columns.
    prob : pandas.Series
        Predicted probabilities, stored in column 'prob' and used to
        compute column 'odds' as ``prob / (1 - prob)``.
    y : pandas.Series
        Target values, added as a column named ``y.name``.

    Returns
    -------
    pandas.DataFrame
        Exponentiated feature contributions and intercept, plus the
        'odds', 'prob' and target columns, sorted by descending 'prob'.
    """
    components = X[results.selected_features].multiply(results.model.coef_)
    # `intercept_` may be a one-element array; extract the scalar explicitly
    # because calling float() on a 1-d array is deprecated in NumPy.
    components['intercept'] = float(
        np.asarray(results.model.intercept_).item())
    # Exponentiate before adding the columns that are not log-odds terms.
    components = np.exp(components)
    components['odds'] = prob / (1.0 - prob)
    components['prob'] = prob
    components[y.name] = y
    components = components.sort_values('prob', ascending=False)
    return components