Enable None sentinel for columns #207

Closed
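In short: this PR lets a feature's column selector be None, meaning "use every column of the input DataFrame". A minimal sketch of the intended usage, based on the diff and the new test below (the toy DataFrame here is illustrative, not from the PR):

```python
import pandas as pd
from sklearn_pandas import DataFrameMapper

df = pd.DataFrame({'a': [1, 2, 3], 'b': [4.0, 5.0, 6.0]})

# None as the column selector is the new sentinel for "all columns";
# with the transformer also set to None, the data passes through unchanged
# and (per the "nonecolstransforms" branch below) the original column
# names are kept.
mapper = DataFrameMapper([(None, None)], df_out=True)
out = mapper.fit_transform(df)
print(out.columns.tolist())  # expected per this PR: ['a', 'b']
```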
89 changes: 77 additions & 12 deletions sklearn_pandas/dataframe_mapper.py
@@ -37,10 +37,12 @@ def _build_feature(columns, transformers, options={}):
return (columns, _build_transformer(transformers), options)


def _get_feature_names(estimator):
def _get_feature_names(estimator, x):
"""
Attempt to extract feature names from the transformed output x (if it is a DataFrame) or from the given estimator
"""
if isinstance(x, pd.DataFrame):
return list(x.columns)
if hasattr(estimator, 'classes_'):
return estimator.classes_
elif hasattr(estimator, 'get_feature_names'):
@@ -75,7 +77,8 @@ def __init__(self, features, default=False, sparse=False, df_out=False,

features a list of tuples with features definitions.
The first element is the pandas column selector. This can
be a string (for one column) or a list of strings.
be a string (for one column), a list of strings, or None
(for all columns).
The second element is an object that supports
sklearn's transform interface, or a list of such objects.
The third element is optional and, if present, must be
@@ -162,13 +165,32 @@ def __setstate__(self, state):
self.built_default = state.get('built_default', self.default)
self.transformed_names_ = state.get('transformed_names_', [])

def _build_cols(self, X, cols):
"""
Build the column list, replacing the None sentinel with all columns of X.

X a Pandas dataframe; the table to select columns from
cols a string or list of strings representing the columns
to select. If None, it will be converted to a list of
all columns in X.

Returns the columns to select, with the None sentinel expanded to the list of all columns in X
"""
if cols is None:
if isinstance(X, DataWrapper):
cols = list(X.df.columns)
else:
cols = list(X.columns)
return cols

def _get_col_subset(self, X, cols, input_df=False):
"""
Get a subset of columns from the given table X.

X a Pandas dataframe; the table to select columns from
cols a string or list of strings representing the columns
to select
to select. If None, it will be converted to a list of
all columns in X.

Returns a numpy array with the data from the selected columns
"""
@@ -178,6 +200,9 @@ def _get_col_subset(self, X, cols, input_df=False):
else:
return_vector = False

# None is a sentinel to select all columns
cols = self._build_cols(X, cols)

# Needed when using the cross-validation compatibility
# layer for sklearn<0.16.0.
# Will be dropped on sklearn-pandas 2.0.
@@ -226,14 +251,18 @@ def fit(self, X, y=None):
_call_fit(self.built_default.fit, Xt, y)
return self

def get_names(self, columns, transformer, x, alias=None):
def get_names(self, columns, transformer, x, alias=None, mode=None):
"""
Return verbose names for the transformed columns.

columns name (or list of names) of the original column(s)
transformer transformer - can be a TransformerPipeline
x transformed columns (numpy.ndarray)
x transformed columns (numpy.ndarray or
pd.DataFrame)
alias base name to use for the selected columns
mode if not None, either "nonecols" (cols is None,
meaning all columns are selected) or
"nonecolstransforms" (both cols and transformer are None)
"""
if alias is not None:
name = alias
@@ -252,17 +281,40 @@ def get_names(self, columns, transformer, x, alias=None):
if isinstance(transformer, TransformerPipeline):
inverse_steps = transformer.steps[::-1]
estimators = (estimator for name, estimator in inverse_steps)
names_steps = (_get_feature_names(e) for e in estimators)
names_steps = (_get_feature_names(e, x) for e in estimators)
names = next((n for n in names_steps if n is not None), None)
# Otherwise use the only estimator present
else:
names = _get_feature_names(transformer)
if names is not None and len(names) == num_cols:
return ['%s_%s' % (name, o) for o in names]
# otherwise, return name concatenated with '_1', '_2', etc.
names = _get_feature_names(transformer, x)

if mode == "nonecolstransforms":
return columns
elif mode == "nonecols":
if names is not None and len(names) == num_cols:
return [str(o) for o in names]
else:
return [str(o) for o in range(num_cols)]
else:
return [name + '_' + str(o) for o in range(num_cols)]
if names is not None and len(names) == num_cols:
return ['%s_%s' % (name, o) for o in names]
# otherwise, return name concatenated with '_0', '_1', etc.
else:
return [name + '_' + str(o) for o in range(num_cols)]
else:
if isinstance(transformer, TransformerPipeline):
inverse_steps = transformer.steps[::-1]
estimators = (estimator for name, estimator in inverse_steps)
names_steps = (_get_feature_names(e, x) for e in estimators)
names = next((n for n in names_steps if n is not None), None)
# Otherwise use the only estimator present
else:
names = _get_feature_names(transformer, x)

if mode == "nonecols":
if names is not None and len(names) == num_cols:
return [str(o) for o in names]
else:
return [str(o) for o in range(num_cols)]
return [name]

def get_dtypes(self, extracted):
@@ -307,8 +359,14 @@ def _transform(self, X, y=None, do_fit=False):
extracted.append(_handle_feature(Xt))

alias = options.get('alias')
mode = None
if columns is None and transformers is None:
mode = "nonecolstransforms"
elif columns is None:
mode = "nonecols"
self.transformed_names_ += self.get_names(
columns, transformers, Xt, alias)
self._build_cols(X, columns), transformers, Xt, alias,
mode)

# handle features not explicitly selected
if self.built_default is not False:
@@ -363,6 +421,13 @@ def _transform(self, X, y=None, do_fit=False):
index=index)
# preserve types
for col, dtype in zip(self.transformed_names_, dtypes):
# this ensures that int types with null values are
# correctly cast to float
if ((np.issubdtype(df_out[col].values.dtype, np.floating) and
np.issubdtype(dtype, np.integer)) and
not np.isfinite(df_out[col].values).all()):
dtype = np.float64

df_out[col] = df_out[col].astype(dtype)
return df_out
else:
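A standalone sketch of why the dtype-preservation branch in the last hunk falls back to float64 (the Series and dtype below are made-up inputs; the condition mirrors the code above): an integer column that picked up NaNs during transformation comes back as float, and casting it back to an integer dtype would fail, so the remembered dtype is overridden.

```python
import numpy as np
import pandas as pd

col = pd.Series([1.0, np.nan, 3.0])   # originally int64, now holds a NaN
dtype = np.dtype('int64')             # the dtype remembered before transforming

# Same check as in _transform above: float data + an integer target dtype
# + non-finite values means we must stay with float64 to keep the NaN.
if (np.issubdtype(col.values.dtype, np.floating)
        and np.issubdtype(dtype, np.integer)
        and not np.isfinite(col.values).all()):
    dtype = np.float64

print(col.astype(dtype))  # stays float64; the NaN is preserved
```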
14 changes: 14 additions & 0 deletions tests/test_dataframe_mapper.py
@@ -257,6 +257,20 @@ def test_complex_df(complex_dataframe):
assert len(transformed[c]) == len(df[c])


def test_none_all_col_sentinel(complex_dataframe):
"""
Get a dataframe from a complex mapped dataframe, returning all
columns via the None sentinel (no explicit column spec).
"""
df = complex_dataframe
mapper = DataFrameMapper([(None, None)], df_out=True)
transformed = mapper.fit_transform(df)
print(transformed)
assert len(transformed) == len(complex_dataframe)
for c in df.columns:
assert len(transformed[c]) == len(df[c])


def test_numeric_column_names(complex_dataframe):
"""
Get a dataframe from a complex mapped dataframe with numeric column names
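Finally, a hypothetical sketch of the other new mode ("nonecols": None columns combined with a real transformer). StandardScaler and the numeric frame are assumptions for illustration; per get_names above, when the transformer exposes no feature names the output columns fall back to positional names.

```python
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn_pandas import DataFrameMapper

df = pd.DataFrame({'x': [1.0, 2.0, 3.0], 'y': [10.0, 20.0, 30.0]})

# None columns + an actual transformer: every column is fed to the scaler.
mapper = DataFrameMapper([(None, StandardScaler())], df_out=True)
out = mapper.fit_transform(df)

# StandardScaler exposes no get_feature_names / classes_, so the
# "nonecols" branch of get_names() should name the outputs '0', '1', ...
print(out.columns.tolist())
```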