Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

added dropna to avoid crash on nan values #275

4 changes: 2 additions & 2 deletions explainerdashboard/explainer_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -572,7 +572,7 @@ def one_vs_all_metric(metric, pos_label, y_true, y_pred):
sign = 1 if greater_is_better else -1

def _scorer(clf, X, y):
y_pred = clf.predict_proba(X)
y_pred = clf.predict_proba(X.copy())
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this is needed, as we already call `.copy()` on line 654.

score = sign * partial_metric(y, y_pred)
return score

Expand Down Expand Up @@ -915,7 +915,7 @@ def get_pdp_df(
first_row = X_sample.iloc[[0]].values.astype("float32")
else:
first_row = X_sample.iloc[[0]]
n_labels = model.predict_proba(first_row).shape[1]
n_labels = model.predict_proba(first_row.copy()).shape[1]
if multiclass:
pdp_dfs = [pd.DataFrame() for i in range(n_labels)]
else:
Expand Down
2 changes: 1 addition & 1 deletion explainerdashboard/explainer_plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -2813,7 +2813,7 @@ def plotly_rf_trees(
"model": range(len(model.estimators_)),
"prediction": [
np.round(
100 * m.predict_proba(observation)[0, pos_label], round
100 * m.predict_proba(observation.copy())[0, pos_label], round
)
for m in model.estimators_
],
Expand Down
26 changes: 20 additions & 6 deletions explainerdashboard/explainers.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,8 +240,12 @@ def __init__(
col for col in self.regular_cols if not is_numeric_dtype(self.X[col])
]
self.categorical_dict = {
col: sorted(self.X[col].unique().tolist()) for col in self.categorical_cols
col: sorted(self.X[col].dropna().unique().tolist()) for col in self.categorical_cols
}
#Add nan to list, as this is a valid option for encoders
for col in self.categorical_cols:
if self.X[col].isnull().values.any():
self.categorical_dict[col].append('NaN')
self.cat_cols = self.onehot_cols + self.categorical_cols
self.original_cols = self.X.columns
self.merged_cols = pd.Index(self.regular_cols + self.onehot_cols)
Expand Down Expand Up @@ -757,6 +761,11 @@ def get_row_from_input(
df_merged = pd.DataFrame(dict(zip(cols, inputs)), index=[0]).fillna(
self.na_fill
)[self.merged_cols]
#Adjust categorical col to proper nan value instead of self.na_fill
for col, values in self.categorical_dict.items():
if 'NaN' in values:
df_merged[col] = df_merged[col].replace(self.na_fill, np.nan) #If the categorical feature comes from the existing data it will be nan
df_merged[col] = df_merged[col].replace('NaN', np.nan) #If the categorical feature is changed to NaN in the frontend it will be a string
if return_merged:
return df_merged
else:
Expand All @@ -765,6 +774,11 @@ def get_row_from_input(
elif len(inputs) == len(self.columns):
cols = self.columns
df = pd.DataFrame(dict(zip(cols, inputs)), index=[0]).fillna(self.na_fill)
#unsure if this is okay here for categorical defined values
for col, values in self.categorical_dict.items():
if 'NaN' in values:
df[col] = df[col].replace(self.na_fill, np.nan) #If the categorical feature comes from the existing data it will be nan
df[col] = df[col].replace('NaN', np.nan) #If the categorical feature is changed to NaN in the frontend it will be a string
if return_merged:
return merge_categorical_columns(df, self.onehot_dict, self.merged_cols)
else:
Expand Down Expand Up @@ -2561,11 +2575,11 @@ def pred_probas_raw(self):
self.model, "predict_proba"
), "model does not have a predict_proba method!"
if self.shap == "skorch":
self._pred_probas = self.model.predict_proba(self.X.values).astype(
self._pred_probas = self.model.predict_proba(self.X.copy().values).astype(
self.precision
)
else:
self._pred_probas = self.model.predict_proba(self.X).astype(
self._pred_probas = self.model.predict_proba(self.X.copy()).astype(
self.precision
)
return self._pred_probas
Expand Down Expand Up @@ -2766,7 +2780,7 @@ def shap_explainer(self):

def model_predict(data_asarray):
data_asframe = pd.DataFrame(data_asarray, columns=self.columns)
return self.model.predict_proba(data_asframe)
return self.model.predict_proba(data_asframe.copy())

self._shap_explainer = shap.KernelExplainer(
model_predict,
Expand Down Expand Up @@ -3249,7 +3263,7 @@ def get_cv_metrics(n_splits):
):
X_train, X_test = self.X.iloc[train_index], self.X.iloc[test_index]
y_train, y_test = self.y.iloc[train_index], self.y.iloc[test_index]
preds = clone(self.model).fit(X_train, y_train).predict_proba(X_test)
preds = clone(self.model).fit(X_train, y_train).predict_proba(X_test.copy())
for label in range(len(self.labels)):
for cut in np.linspace(1, 99, 99, dtype=int):
y_true = np.where(y_test == label, 1, 0)
Expand Down Expand Up @@ -3482,7 +3496,7 @@ def prediction_result_df(
X_row = X_cats_to_X(X_row, self.onehot_dict, self.X.columns)
if self.shap == "skorch":
X_row = X_row.values.astype("float32")
pred_probas = self.model.predict_proba(X_row)[0, :].squeeze()
pred_probas = self.model.predict_proba(X_row.copy())[0, :].squeeze()

preds_df = pd.DataFrame(dict(label=self.labels, probability=pred_probas))
if logodds and all(preds_df.probability < 1 - np.finfo(np.float64).eps):
Expand Down
83 changes: 83 additions & 0 deletions tests/test_datasets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from explainerdashboard import ClassifierExplainer, ExplainerDashboard
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import os
import numpy as np

class CategoricalModelWrapper:
    """Thin wrapper around a classifier used by the dataset tests.

    The wrapper drops the ``Name`` column before delegating to the wrapped
    model and, when ``categorical_label_test`` is truthy, converts the
    model's numeric predictions into the string labels ``"Survived"`` /
    ``"Not Survived"``.

    Args:
        model: Object exposing ``predict`` and ``predict_proba``.
        categorical_label_test: When truthy, ``predict`` returns decoded
            string labels instead of the raw model output.
    """

    # The two string labels known to the encoder.
    _LABELS = ("Survived", "Not Survived")

    def __init__(self, model, categorical_label_test) -> None:
        self._model = model
        self._categorical_label_test = categorical_label_test

    def _fitted_label_encoder(self):
        """Return a ``LabelEncoder`` fitted on the two survival labels."""
        label_enc = LabelEncoder()
        # Fit on a flat (1-D) list of labels; a nested column-shaped list
        # makes scikit-learn emit a DataConversionWarning.
        label_enc.fit(list(self._LABELS))
        return label_enc

    def _perform_label_encoding(self, y):
        """Encode a Series of string labels to integers, keeping name and index."""
        label_enc = self._fitted_label_encoder()
        return pd.Series(label_enc.transform(y.values), name=y.name, index=y.index)

    def _perform_label_decoding(self, y):
        """Decode integer labels back to their string form.

        ``y`` may be a ``pd.Series`` (name and index are carried over, which
        matches ``_perform_label_encoding``) or any other array-like such as
        the ndarray returned by ``model.predict`` (a default index is used).
        """
        decoded = self._fitted_label_encoder().inverse_transform(y)
        if isinstance(y, pd.Series):
            return pd.Series(decoded, name=y.name, index=y.index)
        # Plain arrays have neither ``name`` nor a pandas index.
        return pd.Series(decoded)

    def _preprocessor(self, X):
        """Return a copy of ``X`` without the ``Name`` column."""
        return X.drop(["Name"], axis=1)

    def _postprocessor(self, y):
        """Decode predictions when the categorical-label mode is enabled."""
        if self._categorical_label_test:
            y = self._perform_label_decoding(y)
        return y

    def predict(self, X):
        """Predict labels for ``X`` after dropping the ``Name`` column."""
        X = self._preprocessor(X)
        y = self._model.predict(X)
        return self._postprocessor(y)

    def predict_proba(self, X):
        """Return the wrapped model's class probabilities for ``X``."""
        X = self._preprocessor(X)
        return self._model.predict_proba(X)

def generate_categorical_dataset_model_wrapper(categorical_label_test=False):
    """Build a fitted wrapper plus held-out data for the dataset tests.

    Reads ``tests/test_assets/data.csv`` relative to the current working
    directory, fits a small random forest on 80% of the rows and returns
    the remaining 20% as test data.

    Args:
        categorical_label_test: When truthy, the ``Survival`` target is
            converted to the string labels ``"Survived"`` /
            ``"Not Survived"`` (and encoded back to integers for fitting).
            Otherwise NaN values are injected into the ``Name`` column.

    Returns:
        Tuple ``(wrapper, X_test, y_test)`` where ``wrapper`` is a
        ``CategoricalModelWrapper`` around the fitted model.
    """
    model = RandomForestClassifier(n_estimators=5, max_depth=2)
    wrapper = CategoricalModelWrapper(model, categorical_label_test)
    # Join the path components individually so the separator is chosen by
    # the OS; a literal backslash path only resolves on Windows.
    df = pd.read_csv(os.path.join(os.getcwd(), "tests", "test_assets", "data.csv"))
    if categorical_label_test:
        # Test for categorical label: convert the titanic binary numeric
        # label to categorical "Survived" / "Not Survived".
        df["Survival"] = wrapper._perform_label_decoding(df["Survival"])
    else:
        # We only test NaN in categorical features and numerical target.
        # A single .loc assignment writes into df itself; chained
        # df["Name"][i] = ... assignment may write to a temporary copy.
        nan_rows = list(range(0, 90, 10))  # row labels 0, 10, ..., 80
        df.loc[nan_rows, "Name"] = np.nan
    X_train, X_test, y_train, y_test = train_test_split(
        df.drop(["Survival"], axis=1),
        df["Survival"],
        test_size=0.2,
        random_state=42,
    )

    X_train = wrapper._preprocessor(X_train)

    if categorical_label_test:
        y_train = wrapper._perform_label_encoding(y_train)

    model.fit(X_train, y_train)
    # The wrapper already holds this model and flag, so return it directly.
    return wrapper, X_test, y_test

def test_NaN_containing_categorical_dataset():
    """Check that a categorical column containing NaN exposes a 'NaN' category."""
    model_wrapper, features, target = generate_categorical_dataset_model_wrapper()
    explainer = ClassifierExplainer(model_wrapper, features, target)
    # Also construct the dashboard; any exception raised here fails the test.
    _dashboard = ExplainerDashboard(explainer)
    name_categories = explainer.categorical_dict["Name"]
    assert "NaN" in name_categories

def test_categorical_label():
    """Check that string target labels show up in the explainer's labels."""
    model_wrapper, features, target = generate_categorical_dataset_model_wrapper(True)
    explainer = ClassifierExplainer(model_wrapper, features, target)
    # Also construct the dashboard; any exception raised here fails the test.
    _dashboard = ExplainerDashboard(explainer)
    known_labels = explainer.labels
    assert "Survived" in known_labels