Enable None sentinel for columns #207

Closed
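In short: this PR lets a feature's column selector be None, meaning "use every column of the input DataFrame". A minimal sketch of the intended usage, based on the diff and the new test below (the toy DataFrame here is illustrative, not from the PR):

```python
import pandas as pd
from sklearn_pandas import DataFrameMapper

df = pd.DataFrame({'a': [1, 2, 3], 'b': [4.0, 5.0, 6.0]})

# None as the column selector is the new sentinel for "all columns";
# with the transformer also set to None, the data passes through unchanged
# and (per the "nonecolstransforms" branch below) the original column
# names are kept.
mapper = DataFrameMapper([(None, None)], df_out=True)
out = mapper.fit_transform(df)
print(out.columns.tolist())  # expected per this PR: ['a', 'b']
```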
89 changes: 77 additions & 12 deletions sklearn_pandas/dataframe_mapper.py
@@ -37,10 +37,12 @@ def _build_feature(columns, transformers, options={}):
return (columns, _build_transformer(transformers), options)


def _get_feature_names(estimator):
def _get_feature_names(estimator, x):
"""
Attempt to extract feature names from the transformed output x (if it is a DataFrame) or from the given estimator
"""
if isinstance(x, pd.DataFrame):
return list(x.columns)
if hasattr(estimator, 'classes_'):
return estimator.classes_
elif hasattr(estimator, 'get_feature_names'):
@@ -75,7 +77,8 @@ def __init__(self, features, default=False, sparse=False, df_out=False,

features a list of tuples with features definitions.
The first element is the pandas column selector. This can
be a string (for one column) or a list of strings.
be a string (for one column), a list of strings, or None
(for all columns).
The second element is an object that supports
sklearn's transform interface, or a list of such objects.
The third element is optional and, if present, must be
@@ -162,13 +165,32 @@ def __setstate__(self, state):
self.built_default = state.get('built_default', self.default)
self.transformed_names_ = state.get('transformed_names_', [])

def _build_cols(self, X, cols):
"""
Build the column list, replacing the None sentinel with all columns of X.

X a Pandas dataframe; the table to select columns from
cols a string or list of strings representing the columns
to select. If None, it will be converted to a list of
all columns in X.

Returns the columns to select, with the None sentinel expanded to the list of all columns in X
"""
if cols is None:
if isinstance(X, DataWrapper):
cols = list(X.df.columns)
else:
cols = list(X.columns)
return cols

def _get_col_subset(self, X, cols, input_df=False):
"""
Get a subset of columns from the given table X.

X a Pandas dataframe; the table to select columns from
cols a string or list of strings representing the columns
to select
to select. If None, it will be converted to a list of
all columns in X.

Returns a numpy array with the data from the selected columns
"""
@@ -178,6 +200,9 @@ def _get_col_subset(self, X, cols, input_df=False):
else:
return_vector = False

# None is a sentinel to select all columns
cols = self._build_cols(X, cols)

# Needed when using the cross-validation compatibility
# layer for sklearn<0.16.0.
# Will be dropped on sklearn-pandas 2.0.
@@ -226,14 +251,18 @@ def fit(self, X, y=None):
_call_fit(self.built_default.fit, Xt, y)
return self

def get_names(self, columns, transformer, x, alias=None):
def get_names(self, columns, transformer, x, alias=None, mode=None):
"""
Return verbose names for the transformed columns.

columns name (or list of names) of the original column(s)
transformer transformer - can be a TransformerPipeline
x transformed columns (numpy.ndarray)
x transformed columns (numpy.ndarray or
pd.DataFrame)
alias base name to use for the selected columns
mode if not None, either "nonecols" (cols is None,
meaning all columns are selected) or
"nonecolstransforms" (both cols and transformer are None)
"""
if alias is not None:
name = alias
@@ -252,17 +281,40 @@ def get_names(self, columns, transformer, x, alias=None):
if isinstance(transformer, TransformerPipeline):
inverse_steps = transformer.steps[::-1]
estimators = (estimator for name, estimator in inverse_steps)
names_steps = (_get_feature_names(e) for e in estimators)
names_steps = (_get_feature_names(e, x) for e in estimators)
names = next((n for n in names_steps if n is not None), None)
# Otherwise use the only estimator present
else:
names = _get_feature_names(transformer)
if names is not None and len(names) == num_cols:
return ['%s_%s' % (name, o) for o in names]
# otherwise, return name concatenated with '_1', '_2', etc.
names = _get_feature_names(transformer, x)

if mode == "nonecolstransforms":
return columns
elif mode == "nonecols":
if names is not None and len(names) == num_cols:
return [str(o) for o in names]
else:
return [str(o) for o in range(num_cols)]
else:
return [name + '_' + str(o) for o in range(num_cols)]
if names is not None and len(names) == num_cols:
return ['%s_%s' % (name, o) for o in names]
# otherwise, return name concatenated with '_0', '_1', etc.
else:
return [name + '_' + str(o) for o in range(num_cols)]
else:
if isinstance(transformer, TransformerPipeline):
inverse_steps = transformer.steps[::-1]
estimators = (estimator for name, estimator in inverse_steps)
names_steps = (_get_feature_names(e, x) for e in estimators)
names = next((n for n in names_steps if n is not None), None)
# Otherwise use the only estimator present
else:
names = _get_feature_names(transformer, x)

if mode == "nonecols":
if names is not None and len(names) == num_cols:
return [str(o) for o in names]
else:
return [str(o) for o in range(num_cols)]
return [name]

def get_dtypes(self, extracted):
@@ -307,8 +359,14 @@ def _transform(self, X, y=None, do_fit=False):
extracted.append(_handle_feature(Xt))

alias = options.get('alias')
mode = None
if columns is None and transformers is None:
mode = "nonecolstransforms"
elif columns is None:
mode = "nonecols"
self.transformed_names_ += self.get_names(
columns, transformers, Xt, alias)
self._build_cols(X, columns), transformers, Xt, alias,
mode)

# handle features not explicitly selected
if self.built_default is not False:
@@ -363,6 +421,13 @@ def _transform(self, X, y=None, do_fit=False):
index=index)
# preserve types
for col, dtype in zip(self.transformed_names_, dtypes):
# this ensures that int types with null values are
# correctly cast to float
if ((np.issubdtype(df_out[col].values.dtype, np.floating) and
np.issubdtype(dtype, np.integer)) and
not np.isfinite(df_out[col].values).all()):
dtype = np.float64

df_out[col] = df_out[col].astype(dtype)
return df_out
else:
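A standalone sketch of why the dtype-preservation branch in the last hunk falls back to float64 (the Series and dtype below are made-up inputs; the condition mirrors the code above): an integer column that picked up NaNs during transformation comes back as float, and casting it back to an integer dtype would fail, so the remembered dtype is overridden.

```python
import numpy as np
import pandas as pd

col = pd.Series([1.0, np.nan, 3.0])   # originally int64, now holds a NaN
dtype = np.dtype('int64')             # the dtype remembered before transforming

# Same check as in _transform above: float data + an integer target dtype
# + non-finite values means we must stay with float64 to keep the NaN.
if (np.issubdtype(col.values.dtype, np.floating)
        and np.issubdtype(dtype, np.integer)
        and not np.isfinite(col.values).all()):
    dtype = np.float64

print(col.astype(dtype))  # stays float64; the NaN is preserved
```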
14 changes: 14 additions & 0 deletions tests/test_dataframe_mapper.py
@@ -257,6 +257,20 @@ def test_complex_df(complex_dataframe):
assert len(transformed[c]) == len(df[c])


def test_none_all_col_sentinel(complex_dataframe):
"""
Get a dataframe from a complex mapped dataframe, returning all
columns via the None sentinel (no explicit column spec).
"""
df = complex_dataframe
mapper = DataFrameMapper([(None, None)], df_out=True)
transformed = mapper.fit_transform(df)
print(transformed)
assert len(transformed) == len(complex_dataframe)
for c in df.columns:
assert len(transformed[c]) == len(df[c])


def test_numeric_column_names(complex_dataframe):
"""
Get a dataframe from a complex mapped dataframe with numeric column names
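Finally, a hypothetical sketch of the other new mode ("nonecols": None columns combined with a real transformer). StandardScaler and the numeric frame are assumptions for illustration; per get_names above, when the transformer exposes no feature names the output columns fall back to positional names.

```python
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn_pandas import DataFrameMapper

df = pd.DataFrame({'x': [1.0, 2.0, 3.0], 'y': [10.0, 20.0, 30.0]})

# None columns + an actual transformer: every column is fed to the scaler.
mapper = DataFrameMapper([(None, StandardScaler())], df_out=True)
out = mapper.fit_transform(df)

# StandardScaler exposes no get_feature_names / classes_, so the
# "nonecols" branch of get_names() should name the outputs '0', '1', ...
print(out.columns.tolist())
```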