From e15f24b6ecd844e1f86a182a96287723e2850df6 Mon Sep 17 00:00:00 2001 From: raynardj Date: Sat, 30 Apr 2022 18:41:20 +0800 Subject: [PATCH] filter dataframe --- forgebox/__init__.py | 2 +- forgebox/filter_df.py | 319 ++++++++++++ forgebox/widgets.py | 3 +- nbs/31_df_filter.ipynb | 1068 ++++------------------------------------ settings.ini | 2 +- 5 files changed, 430 insertions(+), 964 deletions(-) create mode 100644 forgebox/filter_df.py diff --git a/forgebox/__init__.py b/forgebox/__init__.py index 976498a..92192ee 100644 --- a/forgebox/__init__.py +++ b/forgebox/__init__.py @@ -1 +1 @@ -__version__ = "1.0.3" +__version__ = "1.0.4" diff --git a/forgebox/filter_df.py b/forgebox/filter_df.py new file mode 100644 index 0000000..8d33226 --- /dev/null +++ b/forgebox/filter_df.py @@ -0,0 +1,319 @@ +from ipywidgets import ( + VBox, Button,HTML, + FloatSlider, IntSlider, SelectionSlider, Dropdown, Label, Checkbox, + interact, Output, interact_manual) + +from forgebox.imports import * +from forgebox.html import DOM +from abc import ABC +from typing import List, Callable + +try: + display(HTML('''''')) +except: + display = print + + +def pct_to_float(x): + if type(x) != str: + return x + if "%" not in x: + return x + else: + return float(x[:-1])/100 + + +def ensure_pct(df): + for col in df: + if df[col].dtype.name == "object": + df[col] = df[col].apply(pct_to_float) + return df + + +def detect_number_column(df): + """ + Detect number columns in dataframe + """ + cols = df.columns + dtypes = [df[col].dtype.name for col in cols] + return pd.DataFrame({"cols": cols, "dtypes": dtypes}) + + +class DataFilter: + """ + Single column number filter + """ + + def __init__(self, df: pd.DataFrame, fix_pct=True): + """ + df: input dataframe + + data_filter = DataFilter(df) + + # start filtering + data_filter() + """ + self.df = df + + if fix_pct: + self.df = ensure_pct(self.df) + + def show_distribution(self, col_name): + """ + show distribution of a column, using plotly + """ + import plotly.express as px + fig = px.histogram(self.df, x=col_name, height=300, width=800) + return fig + + def create_filter(self, field: str) -> None: + big_boxes = [] + + dtype = self.df[field].dtype.name + if 'float' in dtype: + slider = FloatSlider + slide = slider( + min=self.df[field].min(), + max=self.df[field].max(), + step=0.001 + ) + elif 'int' in dtype: + slider = IntSlider + slide = slider( + min=self.df[field].min(), + max=self.df[field].max(),) + else: + print(f"filter of {dtype} not supported") + slider = SelectionSlider + slide = slider(options=sorted(map(str, set(list(self.df[field]))))) + + btn = Button(description="Run Filter") + btn.on_click(self.execute_filter) + + print(f"NaN count: {(self.df[field].isna()).sum()}") + + widget = VBox([ + Label(f"Range for {field}"), + Dropdown(options=["Larger Than or equal to", + "Smaller Than or equal to"]), + slide, + Checkbox(description="Remove NaN", value=True), + btn + ]) + self.widget = widget + widget.original_name = field + + display(widget) + + def execute_filter( + self, _) -> None: + """ + This function will be used as a callback + for ipywidgets.Button.on_click + """ + original_name = self.widget.original_name + label_, condi_, value_, remove_na_, btn_ = self.widget.children + label, condi, value, remove_na = label_.value, condi_.value, value_.value, remove_na_.value + condi = ">=" if condi == 'Larger Than or equal to' else "<=" + if type(value_) == SelectionSlider: + value = f"'{value}'" + expression = f"{original_name} {condi} {value}" + + if remove_na: + self.remove_na(original_name) + + print(f"Filter with query expression: {expression}") + before = len(self.df) + self.df = self.df.query(expression).reset_index(drop=True) + after = len(self.df) + print(f"[Before]: {before}, [After]: {after}") + + def remove_na(self, field): + """ + Remove nan value in a dataframe + """ + before = len(self.df) + self.df = self.df[~self.df[field].isna()] + after = len(self.df) + print(f"Remove NA on {field} [Before]: {before}, [After]: {after}") + + def __call__(self, columns=None): + """ + Execute an interact to filter things column by column + """ + columns = columns if columns else list(self.df.columns) + + @interact + def select_field(field=columns): + # visualize histogram + self.show_distribution(field).show() + + # create a filter execution interactive + self.create_filter(field) + +# export + + +class LayerTorch: + """ + information passon from layer to layer + """ + + def __init__(self, df, level=0, last_layer=None): + self.df = df + self.level = level + self.data = dict() + if last_layer is not None: + self.last_layer = last_layer + self.out = Output() + if last_layer is None: + self.axis = dict() + df.filter_layers = self.axis + else: + self.axis = last_layer.axis + self.axis[level] = self + + def __call__(self, **kwargs): + self.data.update(kwargs) + for k, v in kwargs.items(): + setattr(self, k, v) + + def __repr__(self): + return f"Level:{self.level}\n\t{self.data}" + + def next_layer(self): + new = LayerTorch(self.df, level=self.level+1, last_layer=self) + self.next_layer = new + return new + + @property + def layers(self) -> pd.DataFrame: + """ + Filter layers as a pandans dataframe + """ + return pd.DataFrame( + list(i.data for i in self.axis.values())) + + @property + def filter_chain(self) -> str: + query_list = list(i.data["query"] for i in self.axis.values()) + return ' and '.join(list(f"({q})" for q in query_list)) + + +class RecursiveFilterCore(ABC): + + def display_queries(self, this_layer): + if this_layer.level > 0: + DOM("Queried Filters", "h3", )() + display(this_layer.layers[:-1]) + + def handpick( + self, + chunk_callbacks: List[Callable] = [], + show_top: bool = True, + show_top_k: int = 20, + pick_value_top_k: int = 30, + from_last_layer: LayerTorch = None + ) -> None: + """ + Hand pick the portion of the data frame you liked + from filtering the column by value. + A function from enhanced pandas dataframe + + Inputs: + - chunk_callbacks: List[Callable]=[], + - show_top: bool, default True, do we should + the most frequent values of the current column + - show_top_k: int, the number of rows we show for + the most frequent values, when show_top=True, + default 20 + - pick_value_top_k: int, number of the most frequent + values in pick drop down default 30 + - from_last_layer: LayerTorch, default None, this + column doesn't mean for user configuration + """ + this_layer = LayerTorch( + self) if from_last_layer is None else from_last_layer.next_layer() + display(this_layer.out) + + with this_layer.out: + self.display_queries(this_layer) + + DOM("Select Filter Column", "h3")() + + @interact + def select_columns(column=self.columns): + + this_layer(column=column) + series = self[column] + vc = self.vc(column) + + if show_top: + display(vc.head(show_top_k)) + + this_layer.out.played = True + + top_values = list(vc.index) + if pick_value_top_k is not None: + top_values = top_values[:pick_value_top_k] + + DOM(f"'{column}' equals to ?", "h3", )() + + @interact() + def pick_value(picked=top_values): + query = f"`{column}`=='{picked}'" + sub = RecursiveFilter(self.query(query)) + + # keep record on this layer + this_layer( + query=query, + picked=picked, + before_rows=len(self), + after_rows=len(sub) + ) + + for cb in chunk_callbacks: + cb(sub) + + @interact_manual + def start_recursion(): + this_layer.out.clear_output() + + with this_layer.out: + + # Recursion + # Go on the filter to the next layer + sub.handpick( + chunk_callbacks=chunk_callbacks, + show_top=show_top, + show_top_k=show_top_k, + pick_value_top_k=pick_value_top_k, + from_last_layer=this_layer, + ) + + sub.paginate(10) + + +class RecursiveFilter(pd.DataFrame, RecursiveFilterCore): + """ + Interactive Pandas DataFrame Filter + df = RecursiveFilter(df) + df.handpick() + + Hand pick the portion of the data frame you liked + from filtering the column by value. + A function from enhanced pandas dataframe + + Inputs: + - chunk_callbacks: List[Callable]=[], + - show_top: bool, default True, do we should + the most frequent values of the current column + - show_top_k: int, the number of rows we show for + the most frequent values, when show_top=True, + default 20 + - pick_value_top_k: int, number of the most frequent + values in pick drop down default 30 + - from_last_layer: LayerTorch, default None, this + column doesn't mean for user configuration + """ + pass diff --git a/forgebox/widgets.py b/forgebox/widgets.py index 41f6973..ae03065 100644 --- a/forgebox/widgets.py +++ b/forgebox/widgets.py @@ -22,6 +22,7 @@ except: display = print + def display_df(df): display(df) @@ -228,8 +229,6 @@ def wrapper(*args, **kwargs): display(self.vbox) return f -# Cell - class Labeler: """ diff --git a/nbs/31_df_filter.ipynb b/nbs/31_df_filter.ipynb index 87f85be..edd5e7c 100644 --- a/nbs/31_df_filter.ipynb +++ b/nbs/31_df_filter.ipynb @@ -12,27 +12,39 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [], - "source": [ - "# default_exp df_filter" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "9b80848c8fa145a8b1e0280678df5b6f", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "HTML(value='')" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ - "# export\n", - "from abc import ABC\n", - "from typing import List, Callable\n", - "from forgebox.html import DOM\n", - "from forgebox.imports import pd, np" + "from forgebox.imports import *" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 2, "metadata": { "code_folding": [] }, @@ -47,16 +59,26 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/salvor/anaconda3/lib/python3.7/site-packages/sklearn/feature_extraction/image.py:167: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " dtype=np.int):\n" + ] + } + ], "source": [ "df = get_cal_housing()" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -92,7 +114,7 @@ " \n", " \n", " \n", - " 0\n", + " 0\n", " 8.3252\n", " 41.0\n", " 6.984127\n", @@ -103,7 +125,7 @@ " -122.23\n", " \n", " \n", - " 1\n", + " 1\n", " 8.3014\n", " 21.0\n", " 6.238137\n", @@ -114,7 +136,7 @@ " -122.22\n", " \n", " \n", - " 2\n", + " 2\n", " 7.2574\n", " 52.0\n", " 8.288136\n", @@ -125,7 +147,7 @@ " -122.24\n", " \n", " \n", - " 3\n", + " 3\n", " 5.6431\n", " 52.0\n", " 5.817352\n", @@ -136,7 +158,7 @@ " -122.25\n", " \n", " \n", - " 4\n", + " 4\n", " 3.8462\n", " 52.0\n", " 6.281853\n", @@ -146,119 +168,38 @@ " 37.85\n", " -122.25\n", " \n", - " \n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " \n", - " \n", - " 20635\n", - " 1.5603\n", - " 25.0\n", - " 5.045455\n", - " 1.133333\n", - " 845.0\n", - " 2.560606\n", - " 39.48\n", - " -121.09\n", - " \n", - " \n", - " 20636\n", - " 2.5568\n", - " 18.0\n", - " 6.114035\n", - " 1.315789\n", - " 356.0\n", - " 3.122807\n", - " 39.49\n", - " -121.21\n", - " \n", - " \n", - " 20637\n", - " 1.7000\n", - " 17.0\n", - " 5.205543\n", - " 1.120092\n", - " 1007.0\n", - " 2.325635\n", - " 39.43\n", - " -121.22\n", - " \n", - " \n", - " 20638\n", - " 1.8672\n", - " 18.0\n", - " 5.329513\n", - " 1.171920\n", - " 741.0\n", - " 2.123209\n", - " 39.43\n", - " -121.32\n", - " \n", - " \n", - " 20639\n", - " 2.3886\n", - " 16.0\n", - " 5.254717\n", - " 1.162264\n", - " 1387.0\n", - " 2.616981\n", - " 39.37\n", - " -121.24\n", - " \n", " \n", "\n", - "

20640 rows × 8 columns

\n", "" ], "text/plain": [ - " MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude \\\n", - "0 8.3252 41.0 6.984127 1.023810 322.0 2.555556 37.88 \n", - "1 8.3014 21.0 6.238137 0.971880 2401.0 2.109842 37.86 \n", - "2 7.2574 52.0 8.288136 1.073446 496.0 2.802260 37.85 \n", - "3 5.6431 52.0 5.817352 1.073059 558.0 2.547945 37.85 \n", - "4 3.8462 52.0 6.281853 1.081081 565.0 2.181467 37.85 \n", - "... ... ... ... ... ... ... ... \n", - "20635 1.5603 25.0 5.045455 1.133333 845.0 2.560606 39.48 \n", - "20636 2.5568 18.0 6.114035 1.315789 356.0 3.122807 39.49 \n", - "20637 1.7000 17.0 5.205543 1.120092 1007.0 2.325635 39.43 \n", - "20638 1.8672 18.0 5.329513 1.171920 741.0 2.123209 39.43 \n", - "20639 2.3886 16.0 5.254717 1.162264 1387.0 2.616981 39.37 \n", - "\n", - " Longitude \n", - "0 -122.23 \n", - "1 -122.22 \n", - "2 -122.24 \n", - "3 -122.25 \n", - "4 -122.25 \n", - "... ... \n", - "20635 -121.09 \n", - "20636 -121.21 \n", - "20637 -121.22 \n", - "20638 -121.32 \n", - "20639 -121.24 \n", + " MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude \\\n", + "0 8.3252 41.0 6.984127 1.023810 322.0 2.555556 37.88 \n", + "1 8.3014 21.0 6.238137 0.971880 2401.0 2.109842 37.86 \n", + "2 7.2574 52.0 8.288136 1.073446 496.0 2.802260 37.85 \n", + "3 5.6431 52.0 5.817352 1.073059 558.0 2.547945 37.85 \n", + "4 3.8462 52.0 6.281853 1.081081 565.0 2.181467 37.85 \n", "\n", - "[20640 rows x 8 columns]" + " Longitude \n", + "0 -122.23 \n", + "1 -122.22 \n", + "2 -122.24 \n", + "3 -122.25 \n", + "4 -122.25 " ] }, - "execution_count": 12, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df" + "df.head()" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -267,7 +208,7 @@ "(False, True)" ] }, - "execution_count": 15, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -278,7 +219,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -310,184 +251,52 @@ " AveOccup\n", " Latitude\n", " Longitude\n", - " TestPercent\n", " \n", " \n", " \n", " \n", - " 0\n", + " 0\n", " 8.3252\n", " 41.0\n", " 6.984127\n", - " 1.023810\n", + " 1.02381\n", " 322.0\n", " 2.555556\n", " 37.88\n", " -122.23\n", - " 88.91410755185981%\n", " \n", " \n", - " 1\n", + " 1\n", " 8.3014\n", " 21.0\n", " 6.238137\n", - " 0.971880\n", + " 0.97188\n", " 2401.0\n", " 2.109842\n", " 37.86\n", " -122.22\n", - " 93.83564895188734%\n", - " \n", - " \n", - " 2\n", - " 7.2574\n", - " 52.0\n", - " 8.288136\n", - " 1.073446\n", - " 496.0\n", - " 2.802260\n", - " 37.85\n", - " -122.24\n", - " 70.91076297605066%\n", - " \n", - " \n", - " 3\n", - " 5.6431\n", - " 52.0\n", - " 5.817352\n", - " 1.073059\n", - " 558.0\n", - " 2.547945\n", - " 37.85\n", - " -122.25\n", - " 21.56618642530764%\n", - " \n", - " \n", - " 4\n", - " 3.8462\n", - " 52.0\n", - " 6.281853\n", - " 1.081081\n", - " 565.0\n", - " 2.181467\n", - " 37.85\n", - " -122.25\n", - " 80.45282896672339%\n", - " \n", - " \n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " \n", - " \n", - " 20635\n", - " 1.5603\n", - " 25.0\n", - " 5.045455\n", - " 1.133333\n", - " 845.0\n", - " 2.560606\n", - " 39.48\n", - " -121.09\n", - " 36.93266496322409%\n", - " \n", - " \n", - " 20636\n", - " 2.5568\n", - " 18.0\n", - " 6.114035\n", - " 1.315789\n", - " 356.0\n", - " 3.122807\n", - " 39.49\n", - " -121.21\n", - " 75.0362361985378%\n", - " \n", - " \n", - " 20637\n", - " 1.7000\n", - " 17.0\n", - " 5.205543\n", - " 1.120092\n", - " 1007.0\n", - " 2.325635\n", - " 39.43\n", - " -121.22\n", - " 78.31525111633049%\n", - " \n", - " \n", - " 20638\n", - " 1.8672\n", - " 18.0\n", - " 5.329513\n", - " 1.171920\n", - " 741.0\n", - " 2.123209\n", - " 39.43\n", - " -121.32\n", - " 46.47093977393043%\n", - " \n", - " \n", - " 20639\n", - " 2.3886\n", - " 16.0\n", - " 5.254717\n", - " 1.162264\n", - " 1387.0\n", - " 2.616981\n", - " 39.37\n", - " -121.24\n", - " 42.4080878652766%\n", " \n", " \n", "\n", - "

20640 rows × 9 columns

\n", "" ], "text/plain": [ - " MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude \\\n", - "0 8.3252 41.0 6.984127 1.023810 322.0 2.555556 37.88 \n", - "1 8.3014 21.0 6.238137 0.971880 2401.0 2.109842 37.86 \n", - "2 7.2574 52.0 8.288136 1.073446 496.0 2.802260 37.85 \n", - "3 5.6431 52.0 5.817352 1.073059 558.0 2.547945 37.85 \n", - "4 3.8462 52.0 6.281853 1.081081 565.0 2.181467 37.85 \n", - "... ... ... ... ... ... ... ... \n", - "20635 1.5603 25.0 5.045455 1.133333 845.0 2.560606 39.48 \n", - "20636 2.5568 18.0 6.114035 1.315789 356.0 3.122807 39.49 \n", - "20637 1.7000 17.0 5.205543 1.120092 1007.0 2.325635 39.43 \n", - "20638 1.8672 18.0 5.329513 1.171920 741.0 2.123209 39.43 \n", - "20639 2.3886 16.0 5.254717 1.162264 1387.0 2.616981 39.37 \n", + " MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude \\\n", + "0 8.3252 41.0 6.984127 1.02381 322.0 2.555556 37.88 \n", + "1 8.3014 21.0 6.238137 0.97188 2401.0 2.109842 37.86 \n", "\n", - " Longitude TestPercent \n", - "0 -122.23 88.91410755185981% \n", - "1 -122.22 93.83564895188734% \n", - "2 -122.24 70.91076297605066% \n", - "3 -122.25 21.56618642530764% \n", - "4 -122.25 80.45282896672339% \n", - "... ... ... \n", - "20635 -121.09 36.93266496322409% \n", - "20636 -121.21 75.0362361985378% \n", - "20637 -121.22 78.31525111633049% \n", - "20638 -121.32 46.47093977393043% \n", - "20639 -121.24 42.4080878652766% \n", - "\n", - "[20640 rows x 9 columns]" + " Longitude \n", + "0 -122.23 \n", + "1 -122.22 " ] }, - "execution_count": 16, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df" + "df.head(2)" ] }, { @@ -499,390 +308,45 @@ }, { "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [], - "source": [ - "# export\n", - "from ipywidgets import VBox, Button,\\\n", - " FloatSlider, IntSlider, SelectionSlider,Dropdown, Label,Checkbox,\\\n", - " interact, Output, interact_manual\n", - "\n", - "\n", - "def pct_to_float(x):\n", - " if type(x)!=str: return x\n", - " if \"%\" not in x:\n", - " return x\n", - " else:\n", - " return float(x[:-1])/100\n", - " \n", - "def ensure_pct(df):\n", - " for col in df:\n", - " if df[col].dtype.name==\"object\":\n", - " df[col] = df[col].apply(pct_to_float)\n", - " return df\n", - "\n", - "def detect_number_column(df):\n", - " \"\"\"\n", - " Detect number columns in dataframe\n", - " \"\"\"\n", - " cols = df.columns\n", - " dtypes = [df[col].dtype.name for col in cols]\n", - " return pd.DataFrame({\"cols\":cols, \"dtypes\":dtypes})\n", - "\n", - "class DataFilter:\n", - " \"\"\"\n", - " Single column number filter\n", - " \"\"\"\n", - " def __init__(self, df: pd.DataFrame, fix_pct=True):\n", - " \"\"\"\n", - " df: input dataframe\n", - " \n", - " data_filter = DataFilter(df)\n", - " \n", - " # start filtering\n", - " data_filter()\n", - " \"\"\"\n", - " self.df = df\n", - " \n", - " if fix_pct:\n", - " self.df = ensure_pct(self.df)\n", - " \n", - " def show_distribution(self, col_name):\n", - " \"\"\"\n", - " show distribution of a column, using plotly\n", - " \"\"\"\n", - " import plotly.express as px\n", - " fig = px.histogram(self.df, x=col_name, height=300, width=800)\n", - " return fig\n", - " \n", - " def create_filter(self, field: str) -> None:\n", - " big_boxes = []\n", - " \n", - " dtype = self.df[field].dtype.name\n", - " if 'float' in dtype:\n", - " slider = FloatSlider\n", - " slide = slider(\n", - " min = self.df[field].min(),\n", - " max = self.df[field].max(),\n", - " step = 0.001\n", - " )\n", - " elif 'int' in dtype:\n", - " slider = IntSlider\n", - " slide = slider(\n", - " min = self.df[field].min(),\n", - " max = self.df[field].max(),)\n", - " else:\n", - " print(f\"filter of {dtype} not supported\")\n", - " slider = SelectionSlider\n", - " slide = slider(options=sorted(map(str, set(list(self.df[field])))))\n", - " \n", - " btn = Button(description=\"Run Filter\")\n", - " btn.on_click(self.execute_filter)\n", - " \n", - " print(f\"NaN count: {(self.df[field].isna()).sum()}\")\n", - " \n", - " widget = VBox([\n", - " Label(f\"Range for {field}\"),\n", - " Dropdown(options=[\"Larger Than or equal to\", \"Smaller Than or equal to\"]),\n", - " slide,\n", - " Checkbox(description=\"Remove NaN\", value=True),\n", - " btn\n", - " ])\n", - " self.widget = widget\n", - " widget.original_name = field\n", - " \n", - " display(widget)\n", - " \n", - " def execute_filter(\n", - " self, _) -> None:\n", - " \"\"\"\n", - " This function will be used as a callback\n", - " for ipywidgets.Button.on_click\n", - " \"\"\"\n", - " original_name = self.widget.original_name\n", - " label_,condi_,value_,remove_na_, btn_ = self.widget.children\n", - " label, condi, value, remove_na = label_.value ,condi_.value ,value_.value, remove_na_.value\n", - " condi = \">=\" if condi=='Larger Than or equal to' else \"<=\"\n", - " if type(value_)==SelectionSlider:\n", - " value = f\"'{value}'\"\n", - " expression = f\"{original_name} {condi} {value}\"\n", - " \n", - " if remove_na:\n", - " self.remove_na(original_name)\n", - " \n", - " print(f\"Filter with query expression: {expression}\")\n", - " before = len(self.df)\n", - " self.df = self.df.query(expression).reset_index(drop=True)\n", - " after = len(self.df)\n", - " print(f\"[Before]: {before}, [After]: {after}\")\n", - " \n", - " def remove_na(self, field):\n", - " \"\"\"\n", - " Remove nan value in a dataframe\n", - " \"\"\"\n", - " before = len(self.df)\n", - " self.df = self.df[~self.df[field].isna()]\n", - " after = len(self.df)\n", - " print(f\"Remove NA on {field} [Before]: {before}, [After]: {after}\")\n", - " \n", - " def __call__(self, columns=None):\n", - " \"\"\"\n", - " Execute an interact to filter things column by column\n", - " \"\"\"\n", - " columns = columns if columns else list(self.df.columns)\n", - " @interact\n", - " def select_field(field = columns):\n", - " # visualize histogram\n", - " self.show_distribution(field).show()\n", - " \n", - " # create a filter execution interactive\n", - " self.create_filter(field)" - ] - }, - { - "cell_type": "code", - "execution_count": 79, + "execution_count": 7, "metadata": {}, "outputs": [ { "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
MedIncHouseAgeAveRoomsAveBedrmsPopulationAveOccupLatitudeLongitudeTestPercent
08.325241.06.9841271.023810322.02.55555637.88-122.2388.91410755185981%
18.301421.06.2381370.9718802401.02.10984237.86-122.2293.83564895188734%
27.257452.08.2881361.073446496.02.80226037.85-122.2470.91076297605066%
35.643152.05.8173521.073059558.02.54794537.85-122.2521.56618642530764%
43.846252.06.2818531.081081565.02.18146737.85-122.2580.45282896672339%
..............................
206351.560325.05.0454551.133333845.02.56060639.48-121.0936.93266496322409%
206362.556818.06.1140351.315789356.03.12280739.49-121.2175.0362361985378%
206371.700017.05.2055431.1200921007.02.32563539.43-121.2278.31525111633049%
206381.867218.05.3295131.171920741.02.12320939.43-121.3246.47093977393043%
206392.388616.05.2547171.1622641387.02.61698139.37-121.2442.4080878652766%
\n", - "

20640 rows × 9 columns

\n", - "
" - ], + "text/html": [], "text/plain": [ - " MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude \\\n", - "0 8.3252 41.0 6.984127 1.023810 322.0 2.555556 37.88 \n", - "1 8.3014 21.0 6.238137 0.971880 2401.0 2.109842 37.86 \n", - "2 7.2574 52.0 8.288136 1.073446 496.0 2.802260 37.85 \n", - "3 5.6431 52.0 5.817352 1.073059 558.0 2.547945 37.85 \n", - "4 3.8462 52.0 6.281853 1.081081 565.0 2.181467 37.85 \n", - "... ... ... ... ... ... ... ... \n", - "20635 1.5603 25.0 5.045455 1.133333 845.0 2.560606 39.48 \n", - "20636 2.5568 18.0 6.114035 1.315789 356.0 3.122807 39.49 \n", - "20637 1.7000 17.0 5.205543 1.120092 1007.0 2.325635 39.43 \n", - "20638 1.8672 18.0 5.329513 1.171920 741.0 2.123209 39.43 \n", - "20639 2.3886 16.0 5.254717 1.162264 1387.0 2.616981 39.37 \n", - "\n", - " Longitude TestPercent \n", - "0 -122.23 88.91410755185981% \n", - "1 -122.22 93.83564895188734% \n", - "2 -122.24 70.91076297605066% \n", - "3 -122.25 21.56618642530764% \n", - "4 -122.25 80.45282896672339% \n", - "... ... ... \n", - "20635 -121.09 36.93266496322409% \n", - "20636 -121.21 75.0362361985378% \n", - "20637 -121.22 78.31525111633049% \n", - "20638 -121.32 46.47093977393043% \n", - "20639 -121.24 42.4080878652766% \n", - "\n", - "[20640 rows x 9 columns]" + "" ] }, - "execution_count": 79, "metadata": {}, - "output_type": "execute_result" + "output_type": "display_data" } ], "source": [ - "df" + "from forgebox.filter_df import DataFilter" ] }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "data_filter = DataFilter(df)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There should be interactive widget at this step" + ] + }, { "cell_type": "code", - "execution_count": 82, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "10278991ae7740ba874d891b644bec10", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "interactive(children=(Dropdown(description='field', options=('AveBedrms', 'Latitude', 'TestPercent'), value='A…" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "data_filter(columns=[\"AveBedrms\", \"Latitude\", \"TestPercent\"])" ] @@ -896,7 +360,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -932,168 +396,35 @@ " \n", " \n", " \n", - " 0\n", + " 0\n", " 8.3252\n", " 41.0\n", " 6.984127\n", - " 1.023810\n", + " 1.02381\n", " 322.0\n", " 2.555556\n", " 37.88\n", " -122.23\n", " \n", - " \n", - " 1\n", - " 8.3014\n", - " 21.0\n", - " 6.238137\n", - " 0.971880\n", - " 2401.0\n", - " 2.109842\n", - " 37.86\n", - " -122.22\n", - " \n", - " \n", - " 2\n", - " 7.2574\n", - " 52.0\n", - " 8.288136\n", - " 1.073446\n", - " 496.0\n", - " 2.802260\n", - " 37.85\n", - " -122.24\n", - " \n", - " \n", - " 3\n", - " 5.6431\n", - " 52.0\n", - " 5.817352\n", - " 1.073059\n", - " 558.0\n", - " 2.547945\n", - " 37.85\n", - " -122.25\n", - " \n", - " \n", - " 4\n", - " 3.8462\n", - " 52.0\n", - " 6.281853\n", - " 1.081081\n", - " 565.0\n", - " 2.181467\n", - " 37.85\n", - " -122.25\n", - " \n", - " \n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " \n", - " \n", - " 20635\n", - " 1.5603\n", - " 25.0\n", - " 5.045455\n", - " 1.133333\n", - " 845.0\n", - " 2.560606\n", - " 39.48\n", - " -121.09\n", - " \n", - " \n", - " 20636\n", - " 2.5568\n", - " 18.0\n", - " 6.114035\n", - " 1.315789\n", - " 356.0\n", - " 3.122807\n", - " 39.49\n", - " -121.21\n", - " \n", - " \n", - " 20637\n", - " 1.7000\n", - " 17.0\n", - " 5.205543\n", - " 1.120092\n", - " 1007.0\n", - " 2.325635\n", - " 39.43\n", - " -121.22\n", - " \n", - " \n", - " 20638\n", - " 1.8672\n", - " 18.0\n", - " 5.329513\n", - " 1.171920\n", - " 741.0\n", - " 2.123209\n", - " 39.43\n", - " -121.32\n", - " \n", - " \n", - " 20639\n", - " 2.3886\n", - " 16.0\n", - " 5.254717\n", - " 1.162264\n", - " 1387.0\n", - " 2.616981\n", - " 39.37\n", - " -121.24\n", - " \n", " \n", "\n", - "

20640 rows × 8 columns

\n", "" ], "text/plain": [ - " MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude \\\n", - "0 8.3252 41.0 6.984127 1.023810 322.0 2.555556 37.88 \n", - "1 8.3014 21.0 6.238137 0.971880 2401.0 2.109842 37.86 \n", - "2 7.2574 52.0 8.288136 1.073446 496.0 2.802260 37.85 \n", - "3 5.6431 52.0 5.817352 1.073059 558.0 2.547945 37.85 \n", - "4 3.8462 52.0 6.281853 1.081081 565.0 2.181467 37.85 \n", - "... ... ... ... ... ... ... ... \n", - "20635 1.5603 25.0 5.045455 1.133333 845.0 2.560606 39.48 \n", - "20636 2.5568 18.0 6.114035 1.315789 356.0 3.122807 39.49 \n", - "20637 1.7000 17.0 5.205543 1.120092 1007.0 2.325635 39.43 \n", - "20638 1.8672 18.0 5.329513 1.171920 741.0 2.123209 39.43 \n", - "20639 2.3886 16.0 5.254717 1.162264 1387.0 2.616981 39.37 \n", + " MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude \\\n", + "0 8.3252 41.0 6.984127 1.02381 322.0 2.555556 37.88 \n", "\n", - " Longitude \n", - "0 -122.23 \n", - "1 -122.22 \n", - "2 -122.24 \n", - "3 -122.25 \n", - "4 -122.25 \n", - "... ... \n", - "20635 -121.09 \n", - "20636 -121.21 \n", - "20637 -121.22 \n", - "20638 -121.32 \n", - "20639 -121.24 \n", - "\n", - "[20640 rows x 8 columns]" + " Longitude \n", + "0 -122.23 " ] }, - "execution_count": 12, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "data_filter.df" + "data_filter.df.head(1)" ] }, { @@ -1105,176 +436,16 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ - "# export\n", - "class LayerTorch:\n", - " \"\"\"\n", - " information passon from layer to layer\n", - " \"\"\"\n", - " def __init__(self, df, level=0,last_layer=None):\n", - " self.df = df\n", - " self.level = level\n", - " self.data = dict()\n", - " if last_layer is not None:\n", - " self.last_layer = last_layer\n", - " self.out = Output()\n", - " if last_layer is None:\n", - " self.axis = dict()\n", - " df.filter_layers = self.axis\n", - " else:\n", - " self.axis = last_layer.axis\n", - " self.axis[level] = self\n", - " \n", - " def __call__(self, **kwargs):\n", - " self.data.update(kwargs)\n", - " for k, v in kwargs.items():\n", - " setattr(self, k, v)\n", - " \n", - " def __repr__(self):\n", - " return f\"Level:{self.level}\\n\\t{self.data}\"\n", - " \n", - " def next_layer(self):\n", - " new = LayerTorch(self.df, level=self.level+1, last_layer=self)\n", - " self.next_layer = new\n", - " return new\n", - " \n", - " @property\n", - " def layers(self) -> pd.DataFrame:\n", - " \"\"\"\n", - " Filter layers as a pandans dataframe\n", - " \"\"\"\n", - " return pd.DataFrame(\n", - " list(i.data for i in self.axis.values()))\n", - " \n", - " @property\n", - " def filter_chain(self) -> str:\n", - " query_list = list(i.data[\"query\"] for i in self.axis.values())\n", - " return ' and '.join(list(f\"({q})\" for q in query_list))\n", - "\n", - "class RecursiveFilterCore(ABC):\n", - " \n", - " def display_queries(self, this_layer):\n", - " if this_layer.level > 0:\n", - " DOM(\"Queried Filters\",\"h3\", )()\n", - " display(this_layer.layers[:-1])\n", - " \n", - " def handpick(\n", - " self,\n", - " chunk_callbacks: List[Callable]=[],\n", - " show_top: bool=True,\n", - " show_top_k: int=20,\n", - " pick_value_top_k: int=30,\n", - " from_last_layer: LayerTorch=None\n", - " ) -> None:\n", - " \"\"\"\n", - " Hand pick the portion of the data frame you liked\n", - " from filtering the column by value.\n", - " A function from enhanced pandas dataframe\n", - " \n", - " Inputs:\n", - " - chunk_callbacks: List[Callable]=[],\n", - " - show_top: bool, default True, do we should\n", - " the most frequent values of the current column\n", - " - show_top_k: int, the number of rows we show for\n", - " the most frequent values, when show_top=True,\n", - " default 20\n", - " - pick_value_top_k: int, number of the most frequent\n", - " values in pick drop down default 30\n", - " - from_last_layer: LayerTorch, default None, this\n", - " column doesn't mean for user configuration\n", - " \"\"\"\n", - " this_layer = LayerTorch(self) if from_last_layer is None else from_last_layer.next_layer()\n", - " display(this_layer.out)\n", - " \n", - " with this_layer.out:\n", - " self.display_queries(this_layer)\n", - " \n", - " DOM(\"Select Filter Column\",\"h3\")()\n", - " @interact\n", - " def select_columns(column = self.columns):\n", - " \n", - " this_layer(column=column)\n", - " series = self[column]\n", - " vc = self.vc(column)\n", - "\n", - " if show_top:\n", - " display(vc.head(show_top_k))\n", - "\n", - " this_layer.out.played = True\n", - " \n", - " top_values = list(vc.index)\n", - " if pick_value_top_k is not None:\n", - " top_values = top_values[:pick_value_top_k]\n", - " \n", - " DOM(f\"'{column}' equals to ?\",\"h3\", )()\n", - " \n", - " @interact()\n", - " def pick_value(picked = top_values):\n", - " query = f\"`{column}`=='{picked}'\"\n", - " sub = RecursiveFilter(self.query(query))\n", - " \n", - " # keep record on this layer\n", - " this_layer(\n", - " query = query,\n", - " picked=picked,\n", - " before_rows=len(self),\n", - " after_rows=len(sub)\n", - " )\n", - "\n", - " for cb in chunk_callbacks:\n", - " cb(sub)\n", - " \n", - " @interact_manual\n", - " def start_recursion():\n", - " this_layer.out.clear_output()\n", - " \n", - " with this_layer.out:\n", - " \n", - " # Recursion\n", - " # Go on the filter to the next layer\n", - " sub.handpick(\n", - " chunk_callbacks=chunk_callbacks,\n", - " show_top=show_top,\n", - " show_top_k=show_top_k,\n", - " pick_value_top_k=pick_value_top_k,\n", - " from_last_layer=this_layer,\n", - " )\n", - " \n", - " sub.paginate(10)\n", - "\n", - " \n", - " \n", - "class RecursiveFilter(pd.DataFrame, RecursiveFilterCore):\n", - " \"\"\"\n", - " Interactive Pandas DataFrame Filter\n", - " df = RecursiveFilter(df)\n", - " df.handpick()\n", - " \n", - " Hand pick the portion of the data frame you liked\n", - " from filtering the column by value.\n", - " A function from enhanced pandas dataframe\n", - " \n", - " Inputs:\n", - " - chunk_callbacks: List[Callable]=[],\n", - " - show_top: bool, default True, do we should\n", - " the most frequent values of the current column\n", - " - show_top_k: int, the number of rows we show for\n", - " the most frequent values, when show_top=True,\n", - " default 20\n", - " - pick_value_top_k: int, number of the most frequent\n", - " values in pick drop down default 30\n", - " - from_last_layer: LayerTorch, default None, this\n", - " column doesn't mean for user configuration\n", - " \"\"\"\n", - " pass" + "from forgebox.filter_df import RecursiveFilter" ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -1282,35 +453,10 @@ ] }, { - "cell_type": "code", - "execution_count": 28, + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/xiaochen.zhang/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:15: UserWarning: Pandas doesn't allow columns to be created via a new attribute name - see https://pandas.pydata.org/pandas-docs/stable/indexing.html#attribute-access\n", - " from ipykernel import kernelapp as app\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "fde8a36d0aa34811974ccc591d594df3", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Output()" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], "source": [ - "df.handpick()" + "There should be interactive widget at this step" ] }, { @@ -1318,12 +464,14 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "df.handpick()" + ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, diff --git a/settings.ini b/settings.ini index 76b7aed..b49c171 100644 --- a/settings.ini +++ b/settings.ini @@ -7,7 +7,7 @@ author = xiaochen(ray) zhang author_email = b2ray2c@gmail.com copyright = xiaochen(ray) zhang branch = master -version = 1.0.3 +version = 1.0.4 min_python = 3.6 host = github audience = Developers