Source code for njab.sklearn.scoring

import numpy as np
import pandas as pd
import sklearn.metrics as sklm

from njab.sklearn.types import Results


class ConfusionMatrix:
    """Wrapper for `sklearn.metrics.confusion_matrix`.

    The confusion matrix is computed once on construction and stored in
    the attribute ``cm_``; the methods only re-format that array.
    """

    def __init__(self, y_true, y_pred):
        """Compute the confusion matrix of `y_pred` against `y_true`."""
        self.cm_ = sklm.confusion_matrix(y_true, y_pred)

    def as_dataframe(self, names=('true', 'pred')) -> pd.DataFrame:
        """Create pandas.DataFrame and return. Names rows and columns.

        Parameters
        ----------
        names : pair of labels, default ('true', 'pred')
            First entry names the row index, second entry becomes the top
            level of the column MultiIndex.

        Returns
        -------
        pd.DataFrame
            The confusion matrix. The frame is cached in ``self.df`` and
            the same object is returned for repeated calls with the same
            `names`; a call with different `names` rebuilds the frame.
        """
        true_name, pred_name = names
        requested = (true_name, pred_name)
        # Only reuse the cached frame if it was built for the same names.
        # Previously any cached frame was returned, silently ignoring `names`.
        if hasattr(self, 'df') and getattr(self, '_df_names',
                                           None) == requested:
            return self.df
        df = pd.DataFrame(self.cm_)
        df.index.name = true_name
        df.columns = pd.MultiIndex.from_product([[pred_name], df.columns])
        self.df = df
        self._df_names = requested
        return self.df

    def classification_label(self) -> dict:
        """Classification labels as dict.

        Unpacks the flattened matrix into exactly four values, so this
        raises a ValueError if the matrix does not have four entries.
        """
        tn, fp, fn, tp = self.cm_.ravel()
        return {'TN': tn, 'FP': fp, 'FN': fn, 'TP': tp}

    def as_classification_series(self) -> pd.Series:
        """Classification labels as pandas.Series."""
        return pd.Series(self.classification_label())

    @property
    def as_array(self):
        """Return sklearn.metrics.confusion_matrix array"""
        return self.cm_

    def __str__(self):
        """sklearn.metrics.confusion_matrix __str__"""
        return str(self.cm_)

    def __repr__(self):
        """sklearn.metrics.confusion_matrix __repr__"""
        return repr(self.cm_)
def get_label_binary_classification(y_true: int, y_pred: int) -> str:
    """Get labels (TP, FN, TN, FP) for single case in binary classification.

    Parameters
    ----------
    y_true : int
        True class, must compare equal to 0 or 1.
    y_pred : int
        Predicted class, must compare equal to 0 or 1.

    Returns
    -------
    str
        One of 'TP', 'FN', 'TN', 'FP'.

    Raises
    ------
    ValueError
        If `y_true` or `y_pred` is neither 0 nor 1.
    """
    if y_true == 1:
        if y_pred == 1:
            return 'TP'
        elif y_pred == 0:
            return 'FN'
        else:
            # The exception was previously created but never raised, so the
            # function fell through and returned None for invalid `y_pred`.
            raise ValueError(
                f"Unknown `y_pred`: {y_pred} ({ type(y_pred) })")
    elif y_true == 0:
        if y_pred == 0:
            return 'TN'
        elif y_pred == 1:
            return 'FP'
        else:
            raise ValueError(
                f"Unknown `y_pred`: {y_pred} ({ type(y_pred) })")
    else:
        # Report the type of the offending argument (was `type(y_pred)`).
        raise ValueError(f"Unknown `y_true`: {y_true} ({ type(y_true) })")
def get_score(clf, X: pd.DataFrame, pos=1) -> pd.Series:
    """Return the predicted probability of one class of a binary classifier.

    Calls ``clf.predict_proba(X)`` and picks column `pos` (default 1, the
    target class). The result is a pandas.Series carrying the index of `X`.

    Raises
    ------
    NotImplementedError
        If ``predict_proba`` returns more than two columns.
    """
    probabilities = clf.predict_proba(X)
    # Only the binary case is supported.
    if probabilities.shape[-1] > 2:
        raise NotImplementedError
    return pd.Series(probabilities[:, pos], index=X.index)
def get_pred(clf, X: pd.DataFrame) -> pd.Series:
    """Predict classes with `clf` and return them indexed like `X`.

    The output of ``clf.predict(X)`` is wrapped in a pandas.Series which
    reuses the index of `X`.
    """
    return pd.Series(clf.predict(X), index=X.index)
def get_custom_pred(clf, X: pd.DataFrame, cutoff=0.5) -> pd.Series:
    """Predict classes of a binary classifier using a custom score cutoff.

    Scores are obtained with `get_score`; a sample is assigned class 1 if
    its score is strictly greater than `cutoff`, otherwise class 0. The
    returned pandas.Series keeps the index of `X`.
    """
    above_cutoff = get_score(clf, X) > cutoff
    return above_cutoff.astype(int)
def get_target_count_per_bin(score: pd.Series,
                             y: pd.Series,
                             n_bins: int = 10) -> pd.DataFrame:
    """Sum `y` within equally sized score bins.

    The unit interval is split into `n_bins` bins of equal width, each
    score is assigned to its bin using `pandas.cut`, and `y` is summed
    per bin. The result has the bins as index (named 'score') and one
    integer column 'y==1'.
    """
    # Bin edges 0, 1/n_bins, ..., 1
    edges = [step / n_bins for step in range(0, n_bins + 1)]
    binned_score = pd.cut(score, bins=edges)
    table = pd.DataFrame({'score': binned_score, 'y==1': y})
    per_bin = table.groupby(by='score').sum()
    return per_bin.astype(int)
def get_lr_multiplicative_decomposition(results: Results, X: pd.DataFrame,
                                        prob: pd.Series,
                                        y: pd.Series) -> pd.DataFrame:
    """Multiplicative decomposition of odds at the base of the logistic
    regression model.

    Parameters
    ----------
    results : Results
        Provides `selected_features` (columns of `X` to use) and `model`
        with attributes `coef_` and `intercept_` (presumably a fitted
        scikit-learn logistic regression -- confirm against caller).
    X : pd.DataFrame
        Feature data containing at least the selected features.
    prob : pd.Series
        Predicted probabilities, used to compute the odds.
    y : pd.Series
        Target values; added as a column named `y.name`.

    Returns
    -------
    pd.DataFrame
        One exponentiated column per selected feature plus 'intercept',
        followed by 'odds', 'prob' and the target column, sorted by 'prob'
        in descending order.
    """
    components = X[results.selected_features].multiply(results.model.coef_)
    # `intercept_` may be a length-one array. Calling float() directly on
    # such an array is deprecated in NumPy >= 1.25, so extract the single
    # element explicitly (raises if there is more than one value).
    intercept = np.asarray(results.model.intercept_).item()
    components['intercept'] = float(intercept)
    # Exponentiate the linear terms only; odds, prob and target are added
    # afterwards and therefore stay on their original scale.
    components = np.exp(components)
    components['odds'] = prob / (1.0 - prob)
    components['prob'] = prob
    components[y.name] = y
    components = components.sort_values('prob', ascending=False)
    return components