Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

added option to use NaN values in categorical columns #305

Open
wants to merge 12 commits into
base: master
Choose a base branch
from
Open
18 changes: 16 additions & 2 deletions explainerdashboard/explainers.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,8 +241,12 @@ def __init__(
col for col in self.regular_cols if not is_numeric_dtype(self.X[col])
]
self.categorical_dict = {
col: sorted(self.X[col].unique().tolist()) for col in self.categorical_cols
col: sorted(self.X[col].dropna().unique().tolist()) for col in self.categorical_cols
}
#Add nan to list, as this is a valid option for encoders
for col in self.categorical_cols:
if self.X[col].isnull().values.any():
self.categorical_dict[col].append('NaN')
self.cat_cols = self.onehot_cols + self.categorical_cols
self.original_cols = self.X.columns
self.merged_cols = pd.Index(self.regular_cols + self.onehot_cols)
Expand Down Expand Up @@ -282,7 +286,7 @@ def __init__(
raise ValueError(
"y should be a pd.Series or np.ndarray not a pd.DataFrame!"
)

self.y = pd.Series(y.squeeze()).astype(precision)
self.y_missing = False
else:
Expand Down Expand Up @@ -773,6 +777,11 @@ def get_row_from_input(
df_merged = pd.DataFrame(dict(zip(cols, inputs)), index=[0]).fillna(
self.na_fill
)[self.merged_cols]
#Adjust categorical col to proper nan value instead of self.na_fill
for col, values in self.categorical_dict.items():
if 'NaN' in values:
df_merged[col] = df_merged[col].replace(self.na_fill, np.nan) #If the categorical feature comes from the existing data it will be nan
df_merged[col] = df_merged[col].replace('NaN', np.nan) #If the categorical feature is changed to NaN in the frontend it will be a string
if return_merged:
return df_merged
else:
Expand All @@ -781,6 +790,11 @@ def get_row_from_input(
elif len(inputs) == len(self.columns):
cols = self.columns
df = pd.DataFrame(dict(zip(cols, inputs)), index=[0]).fillna(self.na_fill)
#unsure if this is okay here for categorical defined values
for col, values in self.categorical_dict.items():
if 'NaN' in values:
df[col] = df[col].replace(self.na_fill, np.nan) #If the categorical feature comes from the existing data it will be nan
df[col] = df[col].replace('NaN', np.nan) #If the categorical feature is changed to NaN in the frontend it will be a string
if return_merged:
return merge_categorical_columns(df, self.onehot_dict, self.merged_cols)
else:
Expand Down
84 changes: 84 additions & 0 deletions tests/test_datasets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from explainerdashboard import ClassifierExplainer, ExplainerDashboard
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import os
import numpy as np

class CategoricalModelWrapper:
    """Adapter that lets a fitted classifier consume the raw test dataset.

    The wrapper removes the ``Name`` column before delegating to the wrapped
    model and, when ``categorical_label_test`` is truthy, translates the
    model's numeric predictions into "Survived" / "Not Survived" strings.

    Args:
        model: Object exposing ``predict`` and ``predict_proba``.
        categorical_label_test: When truthy, ``predict`` returns decoded
            string labels instead of the model's raw output.
    """

    def __init__(self, model, categorical_label_test) -> None:
        self._model = model
        self._categorical_label_test = categorical_label_test

    def _label_encoder(self):
        """Return a LabelEncoder fitted on the two survival labels."""
        label_enc = LabelEncoder()
        # LabelEncoder expects a 1-D array of labels, not a nested list.
        label_enc.fit(["Survived", "Not Survived"])
        return label_enc

    def _perform_label_encoding(self, y):
        """Encode a Series of survival strings as integers.

        The name and index of ``y`` are carried over to the result.
        """
        encoded = self._label_encoder().transform(y.values)
        return pd.Series(encoded, name=y.name, index=y.index)

    def _perform_label_decoding(self, y):
        """Decode integer labels back into survival strings.

        ``y`` may be a Series or a plain array (such as the output of
        ``model.predict``); ``name`` and ``index`` are preserved only when
        the input provides them.
        """
        decoded = self._label_encoder().inverse_transform(y)
        return pd.Series(
            decoded,
            name=getattr(y, "name", None),
            index=getattr(y, "index", None),
        )

    def _preprocessor(self, X):
        """Return ``X`` without the ``Name`` column."""
        return X.drop(["Name"], axis=1)

    def _postprocessor(self, y):
        """Decode predictions when the categorical-label mode is enabled."""
        if self._categorical_label_test:
            y = self._perform_label_decoding(y)
        return y

    def predict(self, X):
        """Predict on ``X`` after preprocessing, then post-process the result."""
        X = self._preprocessor(X)
        y = self._model.predict(X)
        return self._postprocessor(y)

    def predict_proba(self, X):
        """Return the wrapped model's raw probabilities for preprocessed ``X``."""
        X = self._preprocessor(X)
        return self._model.predict_proba(X)

def generate_categorical_dataset_model_wrapper(categorical_label_test=False):
    """Train a small random forest on the test CSV and wrap it.

    The CSV is read from ``tests/test_assets/data.csv`` relative to the
    current working directory.

    Args:
        categorical_label_test: When truthy, the ``Survival`` target is
            converted to "Survived" / "Not Survived" strings before the
            split (and re-encoded for fitting). Otherwise NaN values are
            injected into the ``Name`` column so that a categorical feature
            with missing values is exercised.

    Returns:
        A ``(wrapper, X_test, y_test)`` tuple. ``X_test`` is returned
        unprocessed, i.e. it still contains the ``Name`` column.
    """
    model = RandomForestClassifier(n_estimators=5, max_depth=2)
    wrapper = CategoricalModelWrapper(model, categorical_label_test)

    # Join the components individually so the path also works on
    # non-Windows platforms (the backslash literal only works on Windows).
    data_path = os.path.join(os.getcwd(), "tests", "test_assets", "data.csv")
    df = pd.read_csv(data_path)

    if categorical_label_test:
        # Convert the binary numeric target into categorical string labels.
        df["Survival"] = wrapper._perform_label_decoding(df["Survival"])
    else:
        # Only NaN in a categorical feature with a numerical target is tested.
        # A single .loc assignment replaces the chained df["Name"][i] writes,
        # which are unreliable and are no-ops under pandas copy-on-write.
        nan_rows = list(range(0, 90, 10))
        df.loc[nan_rows, "Name"] = np.nan

    X_train, X_test, y_train, y_test = train_test_split(
        df.drop(["Survival"], axis=1),
        df["Survival"],
        test_size=0.2,
        random_state=42,
    )

    X_train = wrapper._preprocessor(X_train)

    if categorical_label_test:
        y_train = wrapper._perform_label_encoding(y_train)

    model.fit(X_train, y_train)
    # The wrapper holds a reference to ``model``, so it now wraps the fitted
    # estimator; constructing a second wrapper is unnecessary.
    return wrapper, X_test, y_test

def test_NaN_containing_categorical_dataset():
    """Check that a categorical column with missing values lists "NaN".

    Builds an explainer (and a dashboard on top of it) from the dataset whose
    ``Name`` column has NaN injected, then verifies that "NaN" appears among
    the categories recorded for ``Name``.
    """
    model_wrapper, holdout_X, holdout_y = generate_categorical_dataset_model_wrapper()
    explainer = ClassifierExplainer(model_wrapper, holdout_X, holdout_y)
    # The dashboard object is not inspected; presumably it is constructed only
    # to confirm that construction succeeds with NaN categories.
    dashboard = ExplainerDashboard(explainer)
    name_categories = explainer.categorical_dict["Name"]
    assert "NaN" in name_categories


# def test_categorical_label():
# _wrapper, _test_X, _test_y = generate_categorical_dataset_model_wrapper(True)
# explainer = ClassifierExplainer(
# _wrapper, _test_X, _test_y)
# dashboard = ExplainerDashboard(explainer)
# assert "Survived" in explainer.labels