Source code for njab.sklearn.pca
from typing import Optional
import matplotlib
import pandas as pd
import sklearn.decomposition
[docs]
def run_pca(
df_wide: pd.DataFrame,
n_components: int = 2
) -> tuple[pd.DataFrame, sklearn.decomposition.PCA]:
"""Run PCA on DataFrame and return result.
Parameters
----------
df : pd.DataFrame
DataFrame in wide format to fit features on.
n_components : int, optional
Number of Principal Components to fit, by default 2
Returns
-------
Tuple[pd.DataFrame, PCA]
principal compoments of DataFrame with same indices as in original DataFrame,
and fitted PCA model of sklearn
"""
n_comp_max = None
if n_components is not None:
n_comp_max = min(df_wide.shape)
n_comp_max = min(n_comp_max, n_components)
pca = sklearn.decomposition.PCA(n_components=n_comp_max)
PCs = pca.fit_transform(df_wide)
cols = [
f'principal component {i+1} ({var_explained*100:.2f} %)'
for i, var_explained in enumerate(pca.explained_variance_ratio_)
]
PCs = pd.DataFrame(PCs, index=df_wide.index, columns=cols)
return PCs, pca
[docs]
def plot_explained_variance(
pca: sklearn.decomposition.PCA,
ax: Optional[matplotlib.axes.Axes] = None) -> matplotlib.axes.Axes:
"""Plot explained variance of PCA from scikit-learn."""
exp_var = pd.Series(
pca.explained_variance_ratio_).to_frame('explained variance')
exp_var.index += 1 # start at 1
exp_var["explained variance (cummulated)"] = exp_var[
'explained variance'].cumsum()
exp_var.index.name = 'PC'
ax = exp_var.plot(ax=ax)
return ax