From 7d2d02170024623ad047bd5684def004c83a4c97 Mon Sep 17 00:00:00 2001 From: Tyler White <50381805+IndexSeek@users.noreply.github.com> Date: Sun, 1 Dec 2024 12:48:04 -0500 Subject: [PATCH] style(ruff): clear ruff check violations (#175) --- .pre-commit-config.yaml | 2 +- .../Preprocess your data with recipes.ipynb | 6 +- ... and DuckDB for a Kaggle competition.ipynb | 4195 ++++++++--------- ibis_ml/core.py | 2 +- ibis_ml/select.py | 2 +- ibis_ml/steps/_discretize.py | 2 +- ibis_ml/utils/_pprint.py | 2 +- pyproject.toml | 1 + tests/test_core.py | 11 +- tests/test_encode.py | 4 +- tests/test_generate_features.py | 2 +- tests/test_impute.py | 2 +- tests/test_pprint.py | 4 +- 13 files changed, 2114 insertions(+), 2121 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5e2b423..3034b48 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,7 +4,7 @@ repos: hooks: - id: prettier - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.3.2 + rev: v0.8.0 hooks: - id: ruff args: [--fix] diff --git a/examples/Preprocess your data with recipes.ipynb b/examples/Preprocess your data with recipes.ipynb index 87393d2..1f560a6 100644 --- a/examples/Preprocess your data with recipes.ipynb +++ b/examples/Preprocess your data with recipes.ipynb @@ -490,7 +490,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "id": "dc04f24e-c8cb-4580-b502-a9410c64a126", "metadata": {}, "outputs": [], @@ -510,7 +510,7 @@ " from skorch import NeuralNetClassifier\n", "\n", " class MyModule(nn.Module):\n", - " def __init__(self, num_units=10, nonlin=nn.ReLU()):\n", + " def __init__(self, num_units=10, nonlin=nn.ReLU()): # noqa: B008\n", " super().__init__()\n", "\n", " self.dense0 = nn.Linear(10, num_units)\n", @@ -525,7 +525,7 @@ " X = self.dropout(X)\n", " X = self.nonlin(self.dense1(X))\n", " X = self.softmax(self.output(X))\n", - " return X\n", + " return X # noqa: RET504\n", "\n", " mod = NeuralNetClassifier(\n", 
" MyModule,\n", diff --git a/examples/Using IbisML and DuckDB for a Kaggle competition.ipynb b/examples/Using IbisML and DuckDB for a Kaggle competition.ipynb index c96dea6..d889805 100644 --- a/examples/Using IbisML and DuckDB for a Kaggle competition.ipynb +++ b/examples/Using IbisML and DuckDB for a Kaggle competition.ipynb @@ -1,2118 +1,2111 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Introduction\n", - "In this post, we'll demonstrate how to use Ibis and [IbisML](https://github.com/ibis-project/ibis-ml)\n", - "end-to-end for the\n", - "[credit risk model stability Kaggle competition](https://www.kaggle.com/competitions/home-credit-credit-risk-model-stability).\n", - "\n", - "1. Load data and perform feature engineering on DuckDB backend using IbisML\n", - "2. Perform last-mile ML data preprocessing on DuckDB backend using IbisML\n", - "3. Train two models using different frameworks:\n", - " * An XGBoost model within a scikit-learn pipeline.\n", - " * A neural network with PyTorch and PyTorch Lightning.\n", - "\n", - "The aim of this competition is to predict which clients are more likely to default on their\n", - "loans by using both internal and external data sources.\n", - "\n", - "To get started with Ibis and IbisML, please refer to the websites:\n", - "\n", - "* [Ibis](https://ibis-project.org/): An open-source dataframe library that works with any data system.\n", - "* [IbisML](https://ibis-project.github.io/ibis-ml/): A library for building scalable ML pipelines.\n", - "\n", - "\n", - "## Prerequisites\n", - "To run this example, you'll need to download the data from Kaggle website with a Kaggle user account and install Ibis, IbisML, and the necessary modeling library.\n", - "\n", - "### Download data\n", - "You need a Kaggle account to download the data. If you do not have one,\n", - "feel free to register one.\n", - "\n", - "1. 
Option 1: Manual download\n", - " * Log into your Kaggle account and download all data from this\n", - " [link](https://www.kaggle.com/competitions/home-credit-credit-risk-model-stability/data),\n", - " unzip the files, and save them to your local disk.\n", - "2. Option 2: Kaggle API\n", - " * Go to your `Kaggle Account Settings`.\n", - " * Under the `API` section, click on `Create New API Token`. This will download the `kaggle.json`\n", - " file to your computer.\n", - " * Place the `kaggle.json` file in the correct directory, normally it is under your home directory\n", - " `~/.kaggle`:\n", - "\n", - " ```bash\n", - " mkdir ~/.kaggle\n", - " mv ~/Downloads/kaggle.json ~/.kaggle\n", - " ```\n", - " * Install Kaggle CLI and download the data:\n", - "\n", - " ```bash\n", - " pip install kaggle\n", - " kaggle competitions download -c home-credit-credit-risk-model-stability\n", - " unzip home-credit-credit-risk-model-stability.zip\n", - " ```\n", - "\n", - "### Install libraries\n", - "To use Ibis and IbisML with the DuckDB backend for building models, you'll need to install the\n", - "necessary packages. 
Depending on your preferred machine learning framework, you can choose\n", - "one of the following installation commands:\n", - "\n", - "For PyTorch-based models:\n", - "\n", - "```bash\n", - "pip install 'ibis-framework[duckdb]' ibis-ml torch pytorch-lightning\n", - "```\n", - "\n", - "For XGBoost and scikit-learn-based models:\n", - "\n", - "```bash\n", - "pip install 'ibis-framework[duckdb]' ibis-ml xgboost[scikit-learn]\n", - "```\n", - "\n", - "Import libraries:" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import ibis\n", - "import ibis.expr.datatypes as dt\n", - "from ibis import _\n", - "import ibis_ml as ml\n", - "from pathlib import Path\n", - "from glob import glob\n", - "\n", - "# enable interactive mode for ibis\n", - "ibis.options.interactive = True" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Set the backend for computing:" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "con = ibis.duckdb.connect()\n", - "# remove the black bars from duckdb's progress bar\n", - "con.raw_sql(\"set enable_progress_bar = false\")\n", - "# DuckDB is the default backend for Ibis\n", - "ibis.set_backend(con)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Set data path:" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "# change the root path to yours\n", - "ROOT = Path(\"/Users/jiting/Downloads/home-credit-credit-risk-model-stability\")\n", - "TRAIN_DIR = ROOT / \"parquet_files\" / \"train\"\n", - "TEST_DIR = ROOT / \"parquet_files\" / \"test\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Data loading and processing\n", - "We'll use Ibis to read the Parquet files and perform the necessary processing for the next step.\n", - "\n", - "### Directory structure and tables\n", - "Since there are 
many data files, let's start by examining the directory structure and\n", - "tables within the train directory:\n", - "\n", - "```bash\n", - "# change this to your directory\n", - "tree -L 2 ~/Downloads/home-credit-credit-risk-model-stability/parquet_files/train\n", - "```\n", - "Data directory:\n", - "\n", - "```bash\n", - "~/Downloads/home-credit-credit-risk-model-stability/parquet_files/train\n", - "├── train_applprev_1_0.parquet\n", - "├── train_applprev_1_1.parquet\n", - "├── train_applprev_2.parquet\n", - "├── train_base.parquet\n", - "├── train_credit_bureau_a_1_0.parquet\n", - "├── train_credit_bureau_a_1_1.parquet\n", - "├── train_credit_bureau_a_1_3.parquet\n", - "├── train_credit_bureau_a_2_0.parquet\n", - "├── train_credit_bureau_a_2_1.parquet\n", - "├── train_credit_bureau_a_2_10.parquet\n", - "├── train_credit_bureau_a_2_2.parquet\n", - "├── train_credit_bureau_a_2_3.parquet\n", - "├── train_credit_bureau_a_2_4.parquet\n", - "├── train_credit_bureau_a_2_5.parquet\n", - "├── train_credit_bureau_a_2_6.parquet\n", - "├── train_credit_bureau_a_2_7.parquet\n", - "├── train_credit_bureau_a_2_8.parquet\n", - "├── train_credit_bureau_a_2_9.parquet\n", - "├── train_credit_bureau_b_1.parquet\n", - "├── train_credit_bureau_b_2.parquet\n", - "├── train_debitcard_1.parquet\n", - "├── train_deposit_1.parquet\n", - "├── train_other_1.parquet\n", - "├── train_person_1.parquet\n", - "├── train_person_2.parquet\n", - "├── train_static_0_0.parquet\n", - "├── train_static_0_1.parquet\n", - "├── train_static_cb_0.parquet\n", - "├── train_tax_registry_a_1.parquet\n", - "├── train_tax_registry_b_1.parquet\n", - "└── train_tax_registry_c_1.parquet\n", - "```\n", - "\n", - "The `train_base.parquet` file is the base table, while the others are feature tables.\n", - "Let's take a quick look at these tables.\n", - "\n", - "#### Base table\n", - "The base table (`train_base.parquet`) contains the unique ID, a binary target flag\n", - "and other information for the training 
samples. This unique ID will serve as the\n", - "linking key for joining with other feature tables.\n", - "\n", - "* `case_id` - This is the unique ID for each loan. You'll need this ID to\n", - " join feature tables to the base table. There are about 1.5m unique loans.\n", - "* `date_decision` - This refers to the date when a decision was made regarding the\n", - " approval of the loan.\n", - "* `WEEK_NUM` - This is the week number used for aggregation. In the test sample,\n", - " `WEEK_NUM` continues sequentially from the last training value of `WEEK_NUM`.\n", - "* `MONTH` - This column represents the month when the approval decision was made.\n", - "* `target` - This is the binary target flag, determined after a certain period based on\n", - " whether or not the client defaulted on the specific loan.\n", - "\n", - "Here is several examples from the base table:" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
┏━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━┓\n",
-              "┃ case_id  date_decision  MONTH   WEEK_NUM  target ┃\n",
-              "┡━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━┩\n",
-              "│ int64stringint64int64int64  │\n",
-              "├─────────┼───────────────┼────────┼──────────┼────────┤\n",
-              "│       02019-01-03   20190100 │\n",
-              "│       12019-01-03   20190100 │\n",
-              "│       22019-01-04   20190100 │\n",
-              "│       32019-01-03   20190100 │\n",
-              "│       42019-01-04   20190101 │\n",
-              "└─────────┴───────────────┴────────┴──────────┴────────┘\n",
-              "
\n" - ], - "text/plain": [ - "┏━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━┓\n", - "┃\u001b[1m \u001b[0m\u001b[1mcase_id\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdate_decision\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mMONTH\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mWEEK_NUM\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mtarget\u001b[0m\u001b[1m \u001b[0m┃\n", - "┡━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━┩\n", - "│ \u001b[2mint64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mint64\u001b[0m │ \u001b[2mint64\u001b[0m │ \u001b[2mint64\u001b[0m │\n", - "├─────────┼───────────────┼────────┼──────────┼────────┤\n", - "│ \u001b[1;36m0\u001b[0m │ \u001b[32m2019-01-03 \u001b[0m │ \u001b[1;36m201901\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[1;36m0\u001b[0m │\n", - "│ \u001b[1;36m1\u001b[0m │ \u001b[32m2019-01-03 \u001b[0m │ \u001b[1;36m201901\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[1;36m0\u001b[0m │\n", - "│ \u001b[1;36m2\u001b[0m │ \u001b[32m2019-01-04 \u001b[0m │ \u001b[1;36m201901\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[1;36m0\u001b[0m │\n", - "│ \u001b[1;36m3\u001b[0m │ \u001b[32m2019-01-03 \u001b[0m │ \u001b[1;36m201901\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[1;36m0\u001b[0m │\n", - "│ \u001b[1;36m4\u001b[0m │ \u001b[32m2019-01-04 \u001b[0m │ \u001b[1;36m201901\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[1;36m1\u001b[0m │\n", - "└─────────┴───────────────┴────────┴──────────┴────────┘" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ibis.read_parquet(TRAIN_DIR / \"train_base.parquet\").head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Feature tables\n", - "The remaining files contain features, consisting of approximately 370 features from\n", - "previous loan applications and external data sources. 
Their definitions can be found in the feature\n", - "definition [file](https://www.kaggle.com/competitions/home-credit-credit-risk-model-stability/data)\n", - "from the competition website.\n", - "\n", - "There are several things we want to mention for the feature tables:\n", - "\n", - "* **Union datasets**: One dataset could be saved into multiple parquet files, such as\n", - "`train_applprev_1_0.parquet` and `train_applprev_1_1.parquet`, We need to union this data.\n", - "* **Dataset levels**: Datasets may have different levels, which we will explain as\n", - "follows:\n", - " * **Depth = 0**: Each row in the table is identified by a unique `case_id`.\n", - " In this case, you can directly join the features with the base table and use them as\n", - " features for further analysis or processing.\n", - " * **Depth > 0**: You will group the data based on the `case_id` and perform calculations\n", - " or aggregations within each group.\n", - "\n", - "Here are two examples of tables with different levels.\n", - "\n", - "Example of table with depth = 0, `case_id` is the row identifier, features can be directly joined\n", - " with the base table." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
┏━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┓\n",
-              "┃ case_id  assignmentdate_238D  assignmentdate_4527235D  assignmentdate_4955616D  birthdate_574D  contractssum_5085716L  dateofbirth_337D  dateofbirth_342D  days120_123L  days180_256L  days30_165L  days360_512L  days90_310L  description_5085714M  education_1103M  education_88M  firstquarter_103L  for3years_128L  for3years_504L  for3years_584L  formonth_118L  formonth_206L  formonth_535L  forquarter_1017L  forquarter_462L  forquarter_634L  fortoday_1092L  forweek_1077L  forweek_528L  forweek_601L  foryear_618L  foryear_818L  foryear_850L  fourthquarter_440L  maritalst_385M  maritalst_893M  numberofqueries_373L  pmtaverage_3A  pmtaverage_4527227A  pmtaverage_4955615A  pmtcount_4527229L  pmtcount_4955617L  pmtcount_693L  pmtscount_423L  pmtssum_45A  requesttype_4525192L  responsedate_1012D  responsedate_4527233D  responsedate_4917613D  riskassesment_302T  riskassesment_940T  secondquarter_766L  thirdquarter_1082L ┃\n",
-              "┡━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━┩\n",
-              "│ int64stringstringstringstringfloat64stringstringfloat64float64float64float64float64stringstringstringfloat64float64float64float64float64float64float64float64float64float64float64float64float64float64float64float64float64float64stringstringfloat64float64float64float64float64float64float64float64float64stringstringstringstringstringfloat64float64float64            │\n",
-              "├─────────┼─────────────────────┼─────────────────────────┼─────────────────────────┼────────────────┼───────────────────────┼──────────────────┼──────────────────┼──────────────┼──────────────┼─────────────┼──────────────┼─────────────┼──────────────────────┼─────────────────┼───────────────┼───────────────────┼────────────────┼────────────────┼────────────────┼───────────────┼───────────────┼───────────────┼──────────────────┼─────────────────┼─────────────────┼────────────────┼───────────────┼──────────────┼──────────────┼──────────────┼──────────────┼──────────────┼────────────────────┼────────────────┼────────────────┼──────────────────────┼───────────────┼─────────────────────┼─────────────────────┼───────────────────┼───────────────────┼───────────────┼────────────────┼─────────────┼──────────────────────┼────────────────────┼───────────────────────┼───────────────────────┼────────────────────┼────────────────────┼────────────────────┼────────────────────┤\n",
-              "│     357NULLNULLNULL1988-04-01    NULLNULLNULLNULLNULLNULLNULLNULLa55475b1            a55475b1       a55475b1     NULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLa55475b1      a55475b1      NULLNULLNULLNULLNULLNULLNULL6.06301.4000NULL2019-01-25        NULLNULLNULLNULLNULLNULL │\n",
-              "│     381NULLNULLNULL1973-11-01    NULLNULLNULLNULLNULLNULLNULLNULLa55475b1            a55475b1       a55475b1     NULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLa55475b1      a55475b1      NULLNULLNULLNULLNULLNULLNULL6.04019.6000NULL2019-01-25        NULLNULLNULLNULLNULLNULL │\n",
-              "│     388NULLNULLNULL1989-04-01    NULL1989-04-01      NULL6.08.02.010.04.0a55475b1            a55475b1       a55475b1     2.0NULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULL6.0a55475b1      a55475b1      10.0NULLNULLNULLNULLNULLNULL6.014548.0000NULL2019-01-28        NULLNULLNULLNULL3.05.0 │\n",
-              "│     405NULLNULLNULL1974-03-01    NULL1974-03-01      NULL0.00.00.01.00.0a55475b1            a55475b1       a55475b1     0.0NULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULL4.0a55475b1      a55475b1      1.0NULLNULLNULLNULLNULLNULL6.010498.2400NULL2019-01-21        NULLNULLNULLNULL2.00.0 │\n",
-              "│     409NULLNULLNULL1993-06-01    NULL1993-06-01      NULL2.03.00.03.01.0a55475b1            717ddd49       a55475b1     4.0NULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULL1.0a7fcb6e5      a55475b1      3.0NULLNULLNULLNULLNULLNULL7.06344.8804NULL2019-01-21        NULLNULLNULLNULL0.04.0 │\n",
-              "└─────────┴─────────────────────┴─────────────────────────┴─────────────────────────┴────────────────┴───────────────────────┴──────────────────┴──────────────────┴──────────────┴──────────────┴─────────────┴──────────────┴─────────────┴──────────────────────┴─────────────────┴───────────────┴───────────────────┴────────────────┴────────────────┴────────────────┴───────────────┴───────────────┴───────────────┴──────────────────┴─────────────────┴─────────────────┴────────────────┴───────────────┴──────────────┴──────────────┴──────────────┴──────────────┴──────────────┴────────────────────┴────────────────┴────────────────┴──────────────────────┴───────────────┴─────────────────────┴─────────────────────┴───────────────────┴───────────────────┴───────────────┴────────────────┴─────────────┴──────────────────────┴────────────────────┴───────────────────────┴───────────────────────┴────────────────────┴────────────────────┴────────────────────┴────────────────────┘\n",
-              "
\n" - ], - "text/plain": [ - "┏━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┓\n", - "┃\u001b[1m \u001b[0m\u001b[1mcase_id\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1massignmentdate_238D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1massignmentdate_4527235D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1massignmentdate_4955616D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mbirthdate_574D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcontractssum_5085716L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdateofbirth_337D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdateofbirth_342D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdays120_123L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdays180_256L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdays30_165L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdays360_512L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdays90_310L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m 
\u001b[0m\u001b[1mdescription_5085714M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1meducation_1103M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1meducation_88M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mfirstquarter_103L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mfor3years_128L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mfor3years_504L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mfor3years_584L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mformonth_118L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mformonth_206L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mformonth_535L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mforquarter_1017L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mforquarter_462L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mforquarter_634L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mfortoday_1092L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mforweek_1077L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mforweek_528L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mforweek_601L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mforyear_618L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mforyear_818L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mforyear_850L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mfourthquarter_440L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmaritalst_385M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmaritalst_893M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mnumberofqueries_373L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpmtaverage_3A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpmtaverage_4527227A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpmtaverage_4955615A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpmtcount_4527229L\u001b[0m\u001b[1m 
\u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpmtcount_4955617L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpmtcount_693L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpmtscount_423L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpmtssum_45A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mrequesttype_4525192L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mresponsedate_1012D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mresponsedate_4527233D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mresponsedate_4917613D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mriskassesment_302T\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mriskassesment_940T\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1msecondquarter_766L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mthirdquarter_1082L\u001b[0m\u001b[1m \u001b[0m┃\n", - "┡━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━┩\n", - "│ \u001b[2mint64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ 
\u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │\n", - 
"├─────────┼─────────────────────┼─────────────────────────┼─────────────────────────┼────────────────┼───────────────────────┼──────────────────┼──────────────────┼──────────────┼──────────────┼─────────────┼──────────────┼─────────────┼──────────────────────┼─────────────────┼───────────────┼───────────────────┼────────────────┼────────────────┼────────────────┼───────────────┼───────────────┼───────────────┼──────────────────┼─────────────────┼─────────────────┼────────────────┼───────────────┼──────────────┼──────────────┼──────────────┼──────────────┼──────────────┼────────────────────┼────────────────┼────────────────┼──────────────────────┼───────────────┼─────────────────────┼─────────────────────┼───────────────────┼───────────────────┼───────────────┼────────────────┼─────────────┼──────────────────────┼────────────────────┼───────────────────────┼───────────────────────┼────────────────────┼────────────────────┼────────────────────┼────────────────────┤\n", - "│ \u001b[1;36m357\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m1988-04-01 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ 
\u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m6.0\u001b[0m │ \u001b[1;36m6301.4000\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m2019-01-25 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │\n", - "│ \u001b[1;36m381\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m1973-11-01 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m6.0\u001b[0m │ \u001b[1;36m4019.6000\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m2019-01-25 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │\n", - "│ \u001b[1;36m388\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m1989-04-01 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m1989-04-01 \u001b[0m │ 
\u001b[2mNULL\u001b[0m │ \u001b[1;36m6.0\u001b[0m │ \u001b[1;36m8.0\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[1;36m10.0\u001b[0m │ \u001b[1;36m4.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m6.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m10.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m6.0\u001b[0m │ \u001b[1;36m14548.0000\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m2019-01-28 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m3.0\u001b[0m │ \u001b[1;36m5.0\u001b[0m │\n", - "│ \u001b[1;36m405\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m1974-03-01 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m1974-03-01 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ 
\u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m4.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m6.0\u001b[0m │ \u001b[1;36m10498.2400\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m2019-01-21 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │\n", - "│ \u001b[1;36m409\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m1993-06-01 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m1993-06-01 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[1;36m3.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m3.0\u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32m717ddd49 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m4.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[32ma7fcb6e5 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m3.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m7.0\u001b[0m │ \u001b[1;36m6344.8804\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m2019-01-21 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ 
\u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m4.0\u001b[0m │\n", - "└─────────┴─────────────────────┴─────────────────────────┴─────────────────────────┴────────────────┴───────────────────────┴──────────────────┴──────────────────┴──────────────┴──────────────┴─────────────┴──────────────┴─────────────┴──────────────────────┴─────────────────┴───────────────┴───────────────────┴────────────────┴────────────────┴────────────────┴───────────────┴───────────────┴───────────────┴──────────────────┴─────────────────┴─────────────────┴────────────────┴───────────────┴──────────────┴──────────────┴──────────────┴──────────────┴──────────────┴────────────────────┴────────────────┴────────────────┴──────────────────────┴───────────────┴─────────────────────┴─────────────────────┴───────────────────┴───────────────────┴───────────────┴────────────────┴─────────────┴──────────────────────┴────────────────────┴───────────────────────┴───────────────────────┴────────────────────┴────────────────────┴────────────────────┴────────────────────┘" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ibis.read_parquet(TRAIN_DIR / \"train_static_cb_0.parquet\").head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Example of a table with depth = 1, we need to aggregate the features and collect statistics\n", - "based on `case_id` then join with the base table." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
┏━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┓\n",
-              "┃ num_group1  case_id  amount_1115A  classificationofcontr_1114M  contractdate_551D  contractmaturitydate_151D  contractst_516M  contracttype_653M  credlmt_1052A  credlmt_228A  credlmt_3940954A  credor_3940957M  credquantity_1099L  credquantity_984L  debtpastduevalue_732A  debtvalue_227A  dpd_550P  dpd_733P  dpdmax_851P  dpdmaxdatemonth_804T  dpdmaxdateyear_742T  installmentamount_644A  installmentamount_833A  instlamount_892A  interesteffectiverate_369L  interestrateyearly_538L  lastupdate_260D  maxdebtpduevalodued_3940955A  numberofinstls_810L  overdueamountmax_950A  overdueamountmaxdatemonth_494T  overdueamountmaxdateyear_432T  periodicityofpmts_997L  periodicityofpmts_997M  pmtdaysoverdue_1135P  pmtmethod_731M  pmtnumpending_403L  purposeofcred_722M  residualamount_1093A  residualamount_127A  residualamount_3940956A  subjectrole_326M  subjectrole_43M  totalamount_503A  totalamount_881A ┃\n",
-              "┡━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━┩\n",
-              "│ int64int64float64stringstringstringstringstringfloat64float64float64stringfloat64float64float64float64float64float64float64float64float64float64float64float64float64float64stringfloat64float64float64float64float64stringstringfloat64stringfloat64stringfloat64float64float64stringstringfloat64float64          │\n",
-              "├────────────┼─────────┼──────────────┼─────────────────────────────┼───────────────────┼───────────────────────────┼─────────────────┼───────────────────┼───────────────┼──────────────┼──────────────────┼─────────────────┼────────────────────┼───────────────────┼───────────────────────┼────────────────┼──────────┼──────────┼─────────────┼──────────────────────┼─────────────────────┼────────────────────────┼────────────────────────┼──────────────────┼────────────────────────────┼─────────────────────────┼─────────────────┼──────────────────────────────┼─────────────────────┼───────────────────────┼────────────────────────────────┼───────────────────────────────┼────────────────────────┼────────────────────────┼──────────────────────┼────────────────┼────────────────────┼────────────────────┼──────────────────────┼─────────────────────┼─────────────────────────┼──────────────────┼─────────────────┼──────────────────┼──────────────────┤\n",
-              "│          0467NULLea6782cc                   2011-06-15       2031-06-13               7241344e       724be82a         3.000000e+0610000.03.000000e+06P164_34_168    2.01.0NULLNULL0.00.0NULLNULLNULL0.00.000NULLNULLNULL2019-01-20     NULLNULLNULLNULLNULLNULLa55475b1              NULLa55475b1      NULL96a8fdfe          0.00.0NULLfa4f56f1        ab3c25cf       3.000000e+0610000.0 │\n",
-              "│          1467NULLea6782cc                   2019-01-04       2021-08-04               7241344e       724be82a         NULLNULL1.303650e+05P164_34_168    1.02.0NULLNULL0.00.0NULLNULLNULL0.026571.969NULLNULLNULL2019-01-20     NULLNULLNULLNULLNULLNULLa55475b1              NULLa55475b1      NULL96a8fdfe          NULLNULLNULLab3c25cf        ab3c25cf       7.800000e+04960000.0 │\n",
-              "│          246778000.0ea6782cc                   2016-10-25       2019-10-25               7241344e       4257cbed         NULLNULLNULLc5a72b57       NULLNULL0.026571.969NULLNULL0.011.02016.0NULLNULL2898.76NULLNULL2019-01-10     0.036.00.011.02016.0NULLa0b598e4              0.0e914c86c      10.096a8fdfe          NULLNULLNULLa55475b1        a55475b1       NULLNULL │\n",
-              "│          01445NULLea6782cc                   2015-01-30       2021-01-30               7241344e       1c9c5356         4.000000e+05100000.07.400000e+04b619fa46       2.05.00.0NULL0.00.0200418.01.02018.00.00.000NULLNULLNULL2019-01-19     0.4NULL1.42.02018.0NULLa55475b1              0.0a55475b1      NULL60c73645          0.00.073044.18daf49a8a        ab3c25cf       4.000000e+05100000.0 │\n",
-              "│          11445NULL01f63ac8                   2014-09-12       2021-09-12               7241344e       724be82a         NULLNULL4.000000e+0574bd67a8       3.017.0NULLNULL0.00.0NULLNULLNULL0.0209617.770NULLNULLNULL2019-01-13     NULLNULLNULLNULLNULLNULLa55475b1              NULLa55475b1      NULL96a8fdfe          NULLNULLNULLab3c25cf        ab3c25cf       3.968006e+05184587.8 │\n",
-              "└────────────┴─────────┴──────────────┴─────────────────────────────┴───────────────────┴───────────────────────────┴─────────────────┴───────────────────┴───────────────┴──────────────┴──────────────────┴─────────────────┴────────────────────┴───────────────────┴───────────────────────┴────────────────┴──────────┴──────────┴─────────────┴──────────────────────┴─────────────────────┴────────────────────────┴────────────────────────┴──────────────────┴────────────────────────────┴─────────────────────────┴─────────────────┴──────────────────────────────┴─────────────────────┴───────────────────────┴────────────────────────────────┴───────────────────────────────┴────────────────────────┴────────────────────────┴──────────────────────┴────────────────┴────────────────────┴────────────────────┴──────────────────────┴─────────────────────┴─────────────────────────┴──────────────────┴─────────────────┴──────────────────┴──────────────────┘\n",
-              "
\n" - ], - "text/plain": [ - "┏━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┓\n", - "┃\u001b[1m \u001b[0m\u001b[1mnum_group1\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcase_id\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mamount_1115A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mclassificationofcontr_1114M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcontractdate_551D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcontractmaturitydate_151D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcontractst_516M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcontracttype_653M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcredlmt_1052A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcredlmt_228A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcredlmt_3940954A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcredor_3940957M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcredquantity_1099L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcredquantity_984L\u001b[0m\u001b[1m 
\u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdebtpastduevalue_732A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdebtvalue_227A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdpd_550P\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdpd_733P\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdpdmax_851P\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdpdmaxdatemonth_804T\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdpdmaxdateyear_742T\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1minstallmentamount_644A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1minstallmentamount_833A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1minstlamount_892A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1minteresteffectiverate_369L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1minterestrateyearly_538L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mlastupdate_260D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmaxdebtpduevalodued_3940955A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mnumberofinstls_810L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1moverdueamountmax_950A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1moverdueamountmaxdatemonth_494T\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1moverdueamountmaxdateyear_432T\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mperiodicityofpmts_997L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mperiodicityofpmts_997M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpmtdaysoverdue_1135P\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpmtmethod_731M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpmtnumpending_403L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpurposeofcred_722M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mresidualamount_1093A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mresidualamount_127A\u001b[0m\u001b[1m 
\u001b[0m┃\u001b[1m \u001b[0m\u001b[1mresidualamount_3940956A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1msubjectrole_326M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1msubjectrole_43M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mtotalamount_503A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mtotalamount_881A\u001b[0m\u001b[1m \u001b[0m┃\n", - "┡━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━┩\n", - "│ \u001b[2mint64\u001b[0m │ \u001b[2mint64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ 
\u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │\n", - "├────────────┼─────────┼──────────────┼─────────────────────────────┼───────────────────┼───────────────────────────┼─────────────────┼───────────────────┼───────────────┼──────────────┼──────────────────┼─────────────────┼────────────────────┼───────────────────┼───────────────────────┼────────────────┼──────────┼──────────┼─────────────┼──────────────────────┼─────────────────────┼────────────────────────┼────────────────────────┼──────────────────┼────────────────────────────┼─────────────────────────┼─────────────────┼──────────────────────────────┼─────────────────────┼───────────────────────┼────────────────────────────────┼───────────────────────────────┼────────────────────────┼────────────────────────┼──────────────────────┼────────────────┼────────────────────┼────────────────────┼──────────────────────┼─────────────────────┼─────────────────────────┼──────────────────┼─────────────────┼──────────────────┼──────────────────┤\n", - "│ \u001b[1;36m0\u001b[0m │ \u001b[1;36m467\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32mea6782cc \u001b[0m │ \u001b[32m2011-06-15 \u001b[0m │ \u001b[32m2031-06-13 \u001b[0m │ \u001b[32m7241344e \u001b[0m │ \u001b[32m724be82a \u001b[0m │ \u001b[1;36m3.000000e+06\u001b[0m │ \u001b[1;36m10000.0\u001b[0m │ \u001b[1;36m3.000000e+06\u001b[0m │ \u001b[32mP164_34_168 \u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[1;36m1.0\u001b[0m 
│ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.000\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m2019-01-20 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m96a8fdfe \u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32mfa4f56f1 \u001b[0m │ \u001b[32mab3c25cf \u001b[0m │ \u001b[1;36m3.000000e+06\u001b[0m │ \u001b[1;36m10000.0\u001b[0m │\n", - "│ \u001b[1;36m1\u001b[0m │ \u001b[1;36m467\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32mea6782cc \u001b[0m │ \u001b[32m2019-01-04 \u001b[0m │ \u001b[32m2021-08-04 \u001b[0m │ \u001b[32m7241344e \u001b[0m │ \u001b[32m724be82a \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m1.303650e+05\u001b[0m │ \u001b[32mP164_34_168 \u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m26571.969\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m2019-01-20 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m96a8fdfe \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32mab3c25cf 
\u001b[0m │ \u001b[32mab3c25cf \u001b[0m │ \u001b[1;36m7.800000e+04\u001b[0m │ \u001b[1;36m960000.0\u001b[0m │\n", - "│ \u001b[1;36m2\u001b[0m │ \u001b[1;36m467\u001b[0m │ \u001b[1;36m78000.0\u001b[0m │ \u001b[32mea6782cc \u001b[0m │ \u001b[32m2016-10-25 \u001b[0m │ \u001b[32m2019-10-25 \u001b[0m │ \u001b[32m7241344e \u001b[0m │ \u001b[32m4257cbed \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32mc5a72b57 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m26571.969\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m11.0\u001b[0m │ \u001b[1;36m2016.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m2898.76\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m2019-01-10 \u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m36.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m11.0\u001b[0m │ \u001b[1;36m2016.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma0b598e4 \u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[32me914c86c \u001b[0m │ \u001b[1;36m10.0\u001b[0m │ \u001b[32m96a8fdfe \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │\n", - "│ \u001b[1;36m0\u001b[0m │ \u001b[1;36m1445\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32mea6782cc \u001b[0m │ \u001b[32m2015-01-30 \u001b[0m │ \u001b[32m2021-01-30 \u001b[0m │ \u001b[32m7241344e \u001b[0m │ \u001b[32m1c9c5356 \u001b[0m │ \u001b[1;36m4.000000e+05\u001b[0m │ \u001b[1;36m100000.0\u001b[0m │ \u001b[1;36m7.400000e+04\u001b[0m │ \u001b[32mb619fa46 \u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[1;36m5.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m200418.0\u001b[0m │ \u001b[1;36m1.0\u001b[0m │ 
\u001b[1;36m2018.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.000\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m2019-01-19 \u001b[0m │ \u001b[1;36m0.4\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m1.4\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[1;36m2018.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m60c73645 \u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m73044.18\u001b[0m │ \u001b[32mdaf49a8a \u001b[0m │ \u001b[32mab3c25cf \u001b[0m │ \u001b[1;36m4.000000e+05\u001b[0m │ \u001b[1;36m100000.0\u001b[0m │\n", - "│ \u001b[1;36m1\u001b[0m │ \u001b[1;36m1445\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m01f63ac8 \u001b[0m │ \u001b[32m2014-09-12 \u001b[0m │ \u001b[32m2021-09-12 \u001b[0m │ \u001b[32m7241344e \u001b[0m │ \u001b[32m724be82a \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m4.000000e+05\u001b[0m │ \u001b[32m74bd67a8 \u001b[0m │ \u001b[1;36m3.0\u001b[0m │ \u001b[1;36m17.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m209617.770\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m2019-01-13 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m96a8fdfe \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32mab3c25cf \u001b[0m │ \u001b[32mab3c25cf \u001b[0m │ \u001b[1;36m3.968006e+05\u001b[0m │ \u001b[1;36m184587.8\u001b[0m │\n", - 
"└────────────┴─────────┴──────────────┴─────────────────────────────┴───────────────────┴───────────────────────────┴─────────────────┴───────────────────┴───────────────┴──────────────┴──────────────────┴─────────────────┴────────────────────┴───────────────────┴───────────────────────┴────────────────┴──────────┴──────────┴─────────────┴──────────────────────┴─────────────────────┴────────────────────────┴────────────────────────┴──────────────────┴────────────────────────────┴─────────────────────────┴─────────────────┴──────────────────────────────┴─────────────────────┴───────────────────────┴────────────────────────────────┴───────────────────────────────┴────────────────────────┴────────────────────────┴──────────────────────┴────────────────┴────────────────────┴────────────────────┴──────────────────────┴─────────────────────┴─────────────────────────┴──────────────────┴─────────────────┴──────────────────┴──────────────────┘" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ibis.read_parquet(TRAIN_DIR / \"train_credit_bureau_b_1.parquet\").relocate(\n", - " \"num_group1\"\n", - ").order_by([\"case_id\", \"num_group1\"]).head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For more details on features and its exploratory data analysis (EDA), you can refer to\n", - "feature definition and these Kaggle notebooks:\n", - "\n", - "* [Feature\n", - " definition](https://www.kaggle.com/competitions/home-credit-credit-risk-model-stability/data#:~:text=calendar_view_week-,feature_definitions,-.csv)\n", - "* [Home credit risk prediction\n", - " EDA](https://www.kaggle.com/code/loki97/home-credit-risk-prediction-eda)\n", - "* [Home credit CRMS 2024\n", - " EDA](https://www.kaggle.com/code/sergiosaharovskiy/home-credit-crms-2024-eda-and-submission)\n", - "\n", - "### Data loading and processing\n", - "We will perform the following data processing steps using Ibis and 
IbisML:\n", - "\n", - "* **Convert data types**: Ensure consistency by converting data types, as the same column\n", - " in different sub-files may have different types.\n", - "* **Aggregate features**: For tables with depth greater than 0, aggregate features based\n", - " on `case_id`, including statistics calculation. You can collect statistics such as mean,\n", - " median, mode, minimum, standard deviation, and others.\n", - "* **Union and join datasets**: Combine multiple sub-files of the same dataset into one\n", - " table, as some datasets are split into multiple sub-files with a common prefix. Afterward,\n", - " join these tables with the base table.\n", - "\n", - "#### Convert data types\n", - "We'll use IbisML to create a chain of `Cast` steps, forming a recipe for data type\n", - "conversion across the dataset. This conversion is based on the provided information\n", - "extracted from column names. Columns that have similar transformations are indicated by a\n", - "capital letter at the end of their names:\n", - "\n", - "* P - Transform DPD (Days past due)\n", - "* M - Masking categories\n", - "* A - Transform amount\n", - "* D - Transform date\n", - "* T - Unspecified Transform\n", - "* L - Unspecified Transform\n", - "\n", - "For example, we'll define a IbisML transformation step to convert columns ends with `P`\n", - "to floating number:" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "# convert columns ends with P to floating number\n", - "step_cast_P_to_float = ml.Cast(ml.endswith(\"P\"), dt.float64)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Next, let's define additional type conversion transformations based on the postfix of column names:" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "# convert columns ends with A to floating number\n", - "step_cast_A_to_float = 
ml.Cast(ml.endswith(\"A\"), dt.float64)\n", - "# convert columns ends with D to date\n", - "step_cast_D_to_date = ml.Cast(ml.endswith(\"D\"), dt.date)\n", - "# convert columns ends with M to str\n", - "step_cast_M_to_str = ml.Cast(ml.endswith(\"M\"), dt.str)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We'll construct the\n", - "[IbisML Recipe](https://ibis-project.github.io/ibis-ml/reference/core.html#ibis_ml.Recipe)\n", - "which chains together all the transformation steps." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Data format conversion recipe:\n", - "Recipe(Cast(endswith('P'), 'float64'),\n", - " Cast(endswith('D'), 'date'),\n", - " Cast(endswith('M'), 'string'),\n", - " Cast(endswith('A'), 'float64'),\n", - " Cast(cols(('date_decision',)), 'date'),\n", - " Cast(cols(('case_id', 'WEEK_NUM', 'num_group1', 'num_group2')), 'int64'),\n", - " Cast(cols(('cardtype_51L', 'credacc_status_367L', 'requesttype_4525192L', 'riskassesment_302T', 'max_periodicityofpmts_997L')),\n", - " 'string'),\n", - " Cast(cols(('isbidproductrequest_292L', 'isdebitcard_527L', 'equalityempfrom_62L')),\n", - " 'int64'))\n" - ] - } - ], - "source": [ - "data_type_recipes = ml.Recipe(\n", - " step_cast_P_to_float,\n", - " step_cast_D_to_date,\n", - " step_cast_M_to_str,\n", - " step_cast_A_to_float,\n", - " # cast some special columns\n", - " ml.Cast([\"date_decision\"], \"date\"),\n", - " ml.Cast([\"case_id\", \"WEEK_NUM\", \"num_group1\", \"num_group2\"], dt.int64),\n", - " ml.Cast(\n", - " [\n", - " \"cardtype_51L\",\n", - " \"credacc_status_367L\",\n", - " \"requesttype_4525192L\",\n", - " \"riskassesment_302T\",\n", - " \"max_periodicityofpmts_997L\",\n", - " ],\n", - " dt.str,\n", - " ),\n", - " ml.Cast(\n", - " [\n", - " \"isbidproductrequest_292L\",\n", - " \"isdebitcard_527L\",\n", - " \"equalityempfrom_62L\",\n", - " ],\n", - " 
dt.int64,\n", - " ),\n", - ")\n", - "print(f\"Data format conversion recipe:\\n{data_type_recipes}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "IbisML offers a powerful set of column selectors, allowing you to select columns based\n", - "on names, types, and patterns. For more information, you can refer to the IbisML column\n", - "selectors [documentation](https://ibis-project.github.io/ibis-ml/reference/selectors.html).\n", - "\n", - "\n", - "#### Aggregate features\n", - "For tables with a depth greater than 0 that can't be directly joined with the base table,\n", - "we need to aggregate the features by the `case_id`. You could compute the different statistics for numeric columns and\n", - "non-numeric columns.\n", - "\n", - "Here, we use the `maximum` as an example." - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "def agg_by_id(table):\n", - " return table.group_by(\"case_id\").agg(\n", - " [\n", - " table[col_name].max().name(f\"max_{col_name}\")\n", - " for col_name in table.columns\n", - " if col_name[-1] in (\"T\", \"L\", \"P\", \"A\", \"D\", \"M\")\n", - " ]\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "For better predicting power, you need to collect different statistics based on the meaning of features. For simplicity,\n", - "we'll only collect the maximum value of the features here.\n", - "\n", - "\n", - "#### Put them together\n", - "We'll put them together in a function reads parquet files, optionally handles regex patterns for\n", - "multiple sub-files, applies data type transformations defined by `data_type_recipes`, and\n", - "performs aggregation based on `case_id` if specified by the depth parameter." 
- ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "def read_and_process_files(file_path, depth=None, is_regex=False):\n", - " \"\"\"\n", - " Read and process Parquet files.\n", - "\n", - " Args:\n", - " file_path (str): Path to the file or regex pattern to match files.\n", - " depth (int, optional): Depth of processing. If 1 or 2, additional aggregation is performed.\n", - " is_regex (bool, optional): Whether the file_path is a regex pattern.\n", - "\n", - " Returns:\n", - " ibis.Table: The processed Ibis table.\n", - " \"\"\"\n", - " if is_regex:\n", - " # read and union multiple files\n", - " chunks = []\n", - " for path in glob(str(file_path)):\n", - " chunk = ibis.read_parquet(path)\n", - " # transform table using IbisML Recipe\n", - " chunk = data_type_recipes.fit(chunk).to_ibis(chunk)\n", - " chunks.append(chunk)\n", - " table = ibis.union(*chunks)\n", - " else:\n", - " # read a single file\n", - " table = ibis.read_parquet(file_path)\n", - " # transform table using IbisML\n", - " table = data_type_recipes.fit(table).to_ibis(table)\n", - "\n", - " # perform aggregation if depth is 1 or 2\n", - " if depth in [1, 2]:\n", - " table = agg_by_id(table)\n", - "\n", - " return table" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's define two dictionaries, `train_data_store` and `test_data_store`, that organize and\n", - "store processed datasets for training and testing datasets." 
- ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "train_data_store = {\n", - " \"df_base\": read_and_process_files(TRAIN_DIR / \"train_base.parquet\"),\n", - " \"depth_0\": [\n", - " read_and_process_files(TRAIN_DIR / \"train_static_cb_0.parquet\"),\n", - " read_and_process_files(TRAIN_DIR / \"train_static_0_*.parquet\", is_regex=True),\n", - " ],\n", - " \"depth_1\": [\n", - " read_and_process_files(\n", - " TRAIN_DIR / \"train_applprev_1_*.parquet\", 1, is_regex=True\n", - " ),\n", - " read_and_process_files(TRAIN_DIR / \"train_tax_registry_a_1.parquet\", 1),\n", - " read_and_process_files(TRAIN_DIR / \"train_tax_registry_b_1.parquet\", 1),\n", - " read_and_process_files(TRAIN_DIR / \"train_tax_registry_c_1.parquet\", 1),\n", - " read_and_process_files(TRAIN_DIR / \"train_credit_bureau_b_1.parquet\", 1),\n", - " read_and_process_files(TRAIN_DIR / \"train_other_1.parquet\", 1),\n", - " read_and_process_files(TRAIN_DIR / \"train_person_1.parquet\", 1),\n", - " read_and_process_files(TRAIN_DIR / \"train_deposit_1.parquet\", 1),\n", - " read_and_process_files(TRAIN_DIR / \"train_debitcard_1.parquet\", 1),\n", - " ],\n", - " \"depth_2\": [\n", - " read_and_process_files(TRAIN_DIR / \"train_credit_bureau_b_2.parquet\", 2),\n", - " ],\n", - "}\n", - "# we won't be submitting the predictions, so let's comment out the test data.\n", - "# test_data_store = {\n", - "# \"df_base\": read_and_process_files(TEST_DIR / \"test_base.parquet\"),\n", - "# \"depth_0\": [\n", - "# read_and_process_files(TEST_DIR / \"test_static_cb_0.parquet\"),\n", - "# read_and_process_files(TEST_DIR / \"test_static_0_*.parquet\", is_regex=True),\n", - "# ],\n", - "# \"depth_1\": [\n", - "# read_and_process_files(TEST_DIR / \"test_applprev_1_*.parquet\", 1, is_regex=True),\n", - "# read_and_process_files(TEST_DIR / \"test_tax_registry_a_1.parquet\", 1),\n", - "# read_and_process_files(TEST_DIR / \"test_tax_registry_b_1.parquet\", 1),\n", 
- "# read_and_process_files(TEST_DIR / \"test_tax_registry_c_1.parquet\", 1),\n", - "# read_and_process_files(TEST_DIR / \"test_credit_bureau_b_1.parquet\", 1),\n", - "# read_and_process_files(TEST_DIR / \"test_other_1.parquet\", 1),\n", - "# read_and_process_files(TEST_DIR / \"test_person_1.parquet\", 1),\n", - "# read_and_process_files(TEST_DIR / \"test_deposit_1.parquet\", 1),\n", - "# read_and_process_files(TEST_DIR / \"test_debitcard_1.parquet\", 1),\n", - "# ],\n", - "# \"depth_2\": [\n", - "# read_and_process_files(TEST_DIR / \"test_credit_bureau_b_2.parquet\", 2),\n", - "# ]\n", - "# }" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Join all features data to base table:" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "def join_data(df_base, depth_0, depth_1, depth_2):\n", - " for i, df in enumerate(depth_0 + depth_1 + depth_2):\n", - " df_base = df_base.join(\n", - " df, \"case_id\", how=\"left\", rname=\"{name}_right\" + f\"_{i}\"\n", - " )\n", - " return df_base" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Generate train and test datasets:" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "There is 1526659 rows and 377 columns\n" - ] - } - ], - "source": [ - "df_train = join_data(**train_data_store)\n", - "# df_test = join_data(**test_data_store)\n", - "total_rows = df_train.count().execute()\n", - "print(f\"There is {total_rows} rows and {len(df_train.columns)} columns\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Select features\n", - "Given the large number of features (~370), we'll focus on selecting just a few of the most\n", - "informative ones by name for demonstration purposes in this post:" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": 
[ - { - "data": { - "text/html": [ - "
┏━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┓\n",
-              "┃ case_id  date_decision  target  days30_165L  days360_512L  days90_310L  pmtscount_423L  pmtssum_45A  dateofbirth_337D  education_1103M  firstquarter_103L  secondquarter_766L  thirdquarter_1082L  fourthquarter_440L  maritalst_893M  numberofqueries_373L  requesttype_4525192L  responsedate_4527233D  actualdpdtolerance_344P  amtinstpaidbefduel24m_4187115A  annuity_780A  annuitynextmonth_57A  applicationcnt_361L  applications30d_658L  applicationscnt_1086L  avgdbddpdlast24m_3658932P  avgdbddpdlast3m_4187120P  max_contractmaturitydate_151D  max_credlmt_1052A  max_credquantity_1099L  max_dpdmaxdatemonth_804T  max_dpdmaxdateyear_742T  max_maxdebtpduevalodued_3940955A  max_overdueamountmax_950A  max_purposeofcred_722M  max_residualamount_3940956A  max_totalamount_503A  max_cancelreason_3545846M  max_childnum_21L  max_currdebt_94A  max_employedfrom_700D  max_mainoccupationinc_437A  max_profession_152M  max_rejectreason_755M  max_status_219L  max_amount_1115A  max_debtpastduevalue_732A  max_debtvalue_227A  max_installmentamount_833A  max_instlamount_892A  max_numberofinstls_810L  max_pmtnumpending_403L  max_last180dayaveragebalance_704A  max_last30dayturnover_651A  max_openingdate_857D  max_amount_416A  max_amtdebitincoming_4809443A  max_amtdebitoutgoing_4809440A  max_amtdepositbalance_4809441A  max_amtdepositincoming_4809444A  max_amtdepositoutgoing_4809442A  max_empl_industry_691L  max_gender_992L  max_housingtype_772L  max_mainoccupationinc_384A  max_incometype_1044T    ┃\n",
-              "┡━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━┩\n",
-              "│ int64dateint64float64float64float64float64float64datestringfloat64float64float64float64stringfloat64stringdatefloat64float64float64float64float64float64float64float64float64datefloat64float64float64float64float64float64stringfloat64float64stringfloat64float64datefloat64stringstringstringfloat64float64float64float64float64float64float64float64float64datefloat64float64float64float64float64float64stringstringstringfloat64string                  │\n",
-              "├─────────┼───────────────┼────────┼─────────────┼──────────────┼─────────────┼────────────────┼─────────────┼──────────────────┼─────────────────┼───────────────────┼────────────────────┼────────────────────┼────────────────────┼────────────────┼──────────────────────┼──────────────────────┼───────────────────────┼─────────────────────────┼────────────────────────────────┼──────────────┼──────────────────────┼─────────────────────┼──────────────────────┼───────────────────────┼───────────────────────────┼──────────────────────────┼───────────────────────────────┼───────────────────┼────────────────────────┼──────────────────────────┼─────────────────────────┼──────────────────────────────────┼───────────────────────────┼────────────────────────┼─────────────────────────────┼──────────────────────┼───────────────────────────┼──────────────────┼──────────────────┼───────────────────────┼────────────────────────────┼─────────────────────┼───────────────────────┼─────────────────┼──────────────────┼───────────────────────────┼────────────────────┼────────────────────────────┼──────────────────────┼─────────────────────────┼────────────────────────┼───────────────────────────────────┼────────────────────────────┼──────────────────────┼─────────────────┼───────────────────────────────┼───────────────────────────────┼────────────────────────────────┼─────────────────────────────────┼─────────────────────────────────┼────────────────────────┼─────────────────┼──────────────────────┼────────────────────────────┼─────────────────────────┤\n",
-              "│ 19159072020-09-0200.04.00.0NULLNULL1965-03-01a55475b1       5.02.01.03.0a55475b1      4.0NULLNULL0.039089.6003740.64886.20000.00.00.0-3.0-6.0NULLNULLNULLNULLNULLNULLNULLNULLNULLNULLa55475b1                 0.055290.2502006-09-15120000.0a55475b1           a55475b1             D              NULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULL22000.022000.00.00.00.0NULLNULLNULL60000.0EMPLOYED                │\n",
-              "│ 19165722020-09-0301.06.02.0NULLNULL1985-01-01a55475b1       2.02.01.02.0a55475b1      6.0NULLNULL0.0110432.0002400.07555.80030.00.00.0-5.0-10.0NULLNULLNULLNULLNULLNULLNULLNULLNULLNULLa55475b1                 0.045862.9342007-04-15194000.0a55475b1           a55475b1             T              NULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULL13353.413333.40.00.00.0NULLNULLNULL28000.0PRIVATE_SECTOR_EMPLOYEE │\n",
-              "│ 19167442020-09-0300.03.02.0NULLNULL1974-04-016b2ae0fa       5.09.07.05.0a55475b1      3.0NULLNULL0.086690.2004333.24199.80030.00.00.0-1.00.0NULLNULLNULLNULLNULLNULLNULLNULLNULLNULLa55475b1                 2.041992.0002007-03-15100000.0a55475b1           a55475b1             K              NULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULL0.00.081909.40.07152.0NULLNULLNULL100000.0SALARIED_GOVT           │\n",
-              "│ 19172122020-09-0300.02.00.0NULLNULL1981-10-01a55475b1       1.02.06.02.0a55475b1      2.0NULLNULL0.0160111.3301864.610964.00000.00.00.0-6.0-10.0NULLNULLNULLNULLNULLNULLNULLNULLNULLNULLa55475b1                 3.019254.0002000-01-1560000.0a55475b1           a55475b1             K              NULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULL2685.82660.0206.20.068.8NULLNULLNULL18000.0EMPLOYED                │\n",
-              "│ 19175522020-09-0300.01.00.0NULLNULL1984-12-01a55475b1       0.01.00.02.0a55475b1      1.0NULLNULL0.089029.8053788.02962.60000.00.00.0-33.0-6.0NULLNULLNULLNULLNULLNULLNULLNULLNULLNULLa55475b1                 0.010627.9372017-10-2647000.0a55475b1           a55475b1             K              NULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULL0.00.00.00.00.0NULLNULLNULL20000.0SALARIED_GOVT           │\n",
-              "└─────────┴───────────────┴────────┴─────────────┴──────────────┴─────────────┴────────────────┴─────────────┴──────────────────┴─────────────────┴───────────────────┴────────────────────┴────────────────────┴────────────────────┴────────────────┴──────────────────────┴──────────────────────┴───────────────────────┴─────────────────────────┴────────────────────────────────┴──────────────┴──────────────────────┴─────────────────────┴──────────────────────┴───────────────────────┴───────────────────────────┴──────────────────────────┴───────────────────────────────┴───────────────────┴────────────────────────┴──────────────────────────┴─────────────────────────┴──────────────────────────────────┴───────────────────────────┴────────────────────────┴─────────────────────────────┴──────────────────────┴───────────────────────────┴──────────────────┴──────────────────┴───────────────────────┴────────────────────────────┴─────────────────────┴───────────────────────┴─────────────────┴──────────────────┴───────────────────────────┴────────────────────┴────────────────────────────┴──────────────────────┴─────────────────────────┴────────────────────────┴───────────────────────────────────┴────────────────────────────┴──────────────────────┴─────────────────┴───────────────────────────────┴───────────────────────────────┴────────────────────────────────┴─────────────────────────────────┴─────────────────────────────────┴────────────────────────┴─────────────────┴──────────────────────┴────────────────────────────┴─────────────────────────┘\n",
-              "
\n" - ], - "text/plain": [ - "┏━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┓\n", - "┃\u001b[1m \u001b[0m\u001b[1mcase_id\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdate_decision\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mtarget\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdays30_165L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdays360_512L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdays90_310L\u001b[0m\u001b[1m 
\u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpmtscount_423L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpmtssum_45A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdateofbirth_337D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1meducation_1103M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mfirstquarter_103L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1msecondquarter_766L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mthirdquarter_1082L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mfourthquarter_440L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmaritalst_893M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mnumberofqueries_373L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mrequesttype_4525192L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mresponsedate_4527233D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mactualdpdtolerance_344P\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mamtinstpaidbefduel24m_4187115A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mannuity_780A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mannuitynextmonth_57A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mapplicationcnt_361L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mapplications30d_658L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mapplicationscnt_1086L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mavgdbddpdlast24m_3658932P\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mavgdbddpdlast3m_4187120P\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_contractmaturitydate_151D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_credlmt_1052A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_credquantity_1099L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_dpdmaxdatemonth_804T\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_dpdmaxdateyear_742T\u001b[0m\u001b[1m 
\u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_maxdebtpduevalodued_3940955A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_overdueamountmax_950A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_purposeofcred_722M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_residualamount_3940956A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_totalamount_503A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_cancelreason_3545846M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_childnum_21L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_currdebt_94A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_employedfrom_700D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_mainoccupationinc_437A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_profession_152M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_rejectreason_755M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_status_219L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_amount_1115A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_debtpastduevalue_732A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_debtvalue_227A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_installmentamount_833A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_instlamount_892A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_numberofinstls_810L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_pmtnumpending_403L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_last180dayaveragebalance_704A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_last30dayturnover_651A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_openingdate_857D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_amount_416A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_amtdebitincoming_4809443A\u001b[0m\u001b[1m 
\u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_amtdebitoutgoing_4809440A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_amtdepositbalance_4809441A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_amtdepositincoming_4809444A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_amtdepositoutgoing_4809442A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_empl_industry_691L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_gender_992L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_housingtype_772L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_mainoccupationinc_384A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_incometype_1044T\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\n", - "┡━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━
━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━┩\n", - "│ \u001b[2mint64\u001b[0m │ \u001b[2mdate\u001b[0m │ \u001b[2mint64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mdate\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mdate\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mdate\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mdate\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mdate\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ 
\u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │\n", - "├─────────┼───────────────┼────────┼─────────────┼──────────────┼─────────────┼────────────────┼─────────────┼──────────────────┼─────────────────┼───────────────────┼────────────────────┼────────────────────┼────────────────────┼────────────────┼──────────────────────┼──────────────────────┼───────────────────────┼─────────────────────────┼────────────────────────────────┼──────────────┼──────────────────────┼─────────────────────┼──────────────────────┼───────────────────────┼───────────────────────────┼──────────────────────────┼───────────────────────────────┼───────────────────┼────────────────────────┼──────────────────────────┼─────────────────────────┼──────────────────────────────────┼───────────────────────────┼────────────────────────┼─────────────────────────────┼──────────────────────┼───────────────────────────┼──────────────────┼──────────────────┼───────────────────────┼────────────────────────────┼─────────────────────┼───────────────────────┼─────────────────┼──────────────────┼───────────────────────────┼────────────────────┼────────────────────────────┼──────────────────────┼─────────────────────────┼────────────────────────┼───────────────────────────────────┼────────────────────────────┼──────────────────────┼─────────────────┼───────────────────────────────┼───────────────────────────────┼────────────────────────────────┼─────────────────────────────────┼─────────────────────────────────┼────────────────────────┼─────────────────┼──────────────────────┼────────────────────────────┼─────────────────────────┤\n", - "│ \u001b[1;36m1915907\u001b[0m │ \u001b[35m2020-09-02\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m4.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[35m1965-03-01\u001b[0m │ 
\u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m5.0\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[1;36m3.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m4.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m39089.600\u001b[0m │ \u001b[1;36m3740.6\u001b[0m │ \u001b[1;36m4886.2000\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m-3.0\u001b[0m │ \u001b[1;36m-6.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m55290.250\u001b[0m │ \u001b[35m2006-09-15\u001b[0m │ \u001b[1;36m120000.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32mD \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m22000.0\u001b[0m │ \u001b[1;36m22000.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m60000.0\u001b[0m │ \u001b[32mEMPLOYED \u001b[0m │\n", - "│ \u001b[1;36m1916572\u001b[0m │ \u001b[35m2020-09-03\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[1;36m6.0\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[35m1985-01-01\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ 
\u001b[1;36m6.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m110432.000\u001b[0m │ \u001b[1;36m2400.0\u001b[0m │ \u001b[1;36m7555.8003\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m-5.0\u001b[0m │ \u001b[1;36m-10.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m45862.934\u001b[0m │ \u001b[35m2007-04-15\u001b[0m │ \u001b[1;36m194000.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32mT \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m13353.4\u001b[0m │ \u001b[1;36m13333.4\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m28000.0\u001b[0m │ \u001b[32mPRIVATE_SECTOR_EMPLOYEE\u001b[0m │\n", - "│ \u001b[1;36m1916744\u001b[0m │ \u001b[35m2020-09-03\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m3.0\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[35m1974-04-01\u001b[0m │ \u001b[32m6b2ae0fa \u001b[0m │ \u001b[1;36m5.0\u001b[0m │ \u001b[1;36m9.0\u001b[0m │ \u001b[1;36m7.0\u001b[0m │ \u001b[1;36m5.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m3.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m86690.200\u001b[0m │ \u001b[1;36m4333.2\u001b[0m │ 
\u001b[1;36m4199.8003\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m-1.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[1;36m41992.000\u001b[0m │ \u001b[35m2007-03-15\u001b[0m │ \u001b[1;36m100000.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32mK \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m81909.4\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m7152.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m100000.0\u001b[0m │ \u001b[32mSALARIED_GOVT \u001b[0m │\n", - "│ \u001b[1;36m1917212\u001b[0m │ \u001b[35m2020-09-03\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[35m1981-10-01\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[1;36m6.0\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m160111.330\u001b[0m │ \u001b[1;36m1864.6\u001b[0m │ \u001b[1;36m10964.0000\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m-6.0\u001b[0m │ \u001b[1;36m-10.0\u001b[0m │ 
\u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m3.0\u001b[0m │ \u001b[1;36m19254.000\u001b[0m │ \u001b[35m2000-01-15\u001b[0m │ \u001b[1;36m60000.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32mK \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m2685.8\u001b[0m │ \u001b[1;36m2660.0\u001b[0m │ \u001b[1;36m206.2\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m68.8\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m18000.0\u001b[0m │ \u001b[32mEMPLOYED \u001b[0m │\n", - "│ \u001b[1;36m1917552\u001b[0m │ \u001b[35m2020-09-03\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[35m1984-12-01\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m89029.805\u001b[0m │ \u001b[1;36m3788.0\u001b[0m │ \u001b[1;36m2962.6000\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m-33.0\u001b[0m │ \u001b[1;36m-6.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ 
\u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m10627.937\u001b[0m │ \u001b[35m2017-10-26\u001b[0m │ \u001b[1;36m47000.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32mK \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m20000.0\u001b[0m │ \u001b[32mSALARIED_GOVT \u001b[0m │\n", - "└─────────┴───────────────┴────────┴─────────────┴──────────────┴─────────────┴────────────────┴─────────────┴──────────────────┴─────────────────┴───────────────────┴────────────────────┴────────────────────┴────────────────────┴────────────────┴──────────────────────┴──────────────────────┴───────────────────────┴─────────────────────────┴────────────────────────────────┴──────────────┴──────────────────────┴─────────────────────┴──────────────────────┴───────────────────────┴───────────────────────────┴──────────────────────────┴───────────────────────────────┴───────────────────┴────────────────────────┴──────────────────────────┴─────────────────────────┴──────────────────────────────────┴───────────────────────────┴────────────────────────┴─────────────────────────────┴──────────────────────┴───────────────────────────┴──────────────────┴──────────────────┴───────────────────────┴────────────────────────────┴─────────────────────┴───────────────────────┴─────────────────┴──────────────────┴───────────────────────────┴────────────────────┴────────────────────────────┴──────────────────────┴──────────────
───────────┴────────────────────────┴───────────────────────────────────┴────────────────────────────┴──────────────────────┴─────────────────┴───────────────────────────────┴───────────────────────────────┴────────────────────────────────┴─────────────────────────────────┴─────────────────────────────────┴────────────────────────┴─────────────────┴──────────────────────┴────────────────────────────┴─────────────────────────┘" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_train = df_train.select(\n", - " \"case_id\",\n", - " \"date_decision\",\n", - " \"target\",\n", - " # number of credit bureau queries for the last X days.\n", - " \"days30_165L\",\n", - " \"days360_512L\",\n", - " \"days90_310L\",\n", - " # number of tax deduction payments\n", - " \"pmtscount_423L\",\n", - " # sum of tax deductions for the client\n", - " \"pmtssum_45A\",\n", - " \"dateofbirth_337D\",\n", - " \"education_1103M\",\n", - " \"firstquarter_103L\",\n", - " \"secondquarter_766L\",\n", - " \"thirdquarter_1082L\",\n", - " \"fourthquarter_440L\",\n", - " \"maritalst_893M\",\n", - " \"numberofqueries_373L\",\n", - " \"requesttype_4525192L\",\n", - " \"responsedate_4527233D\",\n", - " \"actualdpdtolerance_344P\",\n", - " \"amtinstpaidbefduel24m_4187115A\",\n", - " \"annuity_780A\",\n", - " \"annuitynextmonth_57A\",\n", - " \"applicationcnt_361L\",\n", - " \"applications30d_658L\",\n", - " \"applicationscnt_1086L\",\n", - " # average days past or before due of payment during the last 24 months.\n", - " \"avgdbddpdlast24m_3658932P\",\n", - " # average days past or before due of payment during the last 3 months.\n", - " \"avgdbddpdlast3m_4187120P\",\n", - " # end date of active contract.\n", - " \"max_contractmaturitydate_151D\",\n", - " # credit limit of an active loan.\n", - " \"max_credlmt_1052A\",\n", - " # number of credits in credit bureau\n", - " \"max_credquantity_1099L\",\n", - " \"max_dpdmaxdatemonth_804T\",\n", - " 
\"max_dpdmaxdateyear_742T\",\n", - " \"max_maxdebtpduevalodued_3940955A\",\n", - " \"max_overdueamountmax_950A\",\n", - " \"max_purposeofcred_722M\",\n", - " \"max_residualamount_3940956A\",\n", - " \"max_totalamount_503A\",\n", - " \"max_cancelreason_3545846M\",\n", - " \"max_childnum_21L\",\n", - " \"max_currdebt_94A\",\n", - " \"max_employedfrom_700D\",\n", - " # client's main income amount in their previous application\n", - " \"max_mainoccupationinc_437A\",\n", - " \"max_profession_152M\",\n", - " \"max_rejectreason_755M\",\n", - " \"max_status_219L\",\n", - " # credit amount of the active contract provided by the credit bureau\n", - " \"max_amount_1115A\",\n", - " # amount of unpaid debt for existing contracts\n", - " \"max_debtpastduevalue_732A\",\n", - " \"max_debtvalue_227A\",\n", - " \"max_installmentamount_833A\",\n", - " \"max_instlamount_892A\",\n", - " \"max_numberofinstls_810L\",\n", - " \"max_pmtnumpending_403L\",\n", - " \"max_last180dayaveragebalance_704A\",\n", - " \"max_last30dayturnover_651A\",\n", - " \"max_openingdate_857D\",\n", - " \"max_amount_416A\",\n", - " \"max_amtdebitincoming_4809443A\",\n", - " \"max_amtdebitoutgoing_4809440A\",\n", - " \"max_amtdepositbalance_4809441A\",\n", - " \"max_amtdepositincoming_4809444A\",\n", - " \"max_amtdepositoutgoing_4809442A\",\n", - " \"max_empl_industry_691L\",\n", - " \"max_gender_992L\",\n", - " \"max_housingtype_772L\",\n", - " \"max_mainoccupationinc_384A\",\n", - " \"max_incometype_1044T\",\n", - ")\n", - "\n", - "df_train.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Univariate analysis:" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
┏━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┓\n",
-              "┃ name             pos    type     count    nulls   unique   mode      mean          std            min      p25          p50           p75           max          ┃\n",
-              "┡━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━┩\n",
-              "│ stringint16stringint64int64int64stringfloat64float64float64float64float64float64float64      │\n",
-              "├─────────────────┼───────┼─────────┼─────────┼────────┼─────────┼──────────┼──────────────┼───────────────┼─────────┼─────────────┼──────────────┼──────────────┼──────────────┤\n",
-              "│ case_id        0int64  152665901526659NULL1.286077e+06718946.5922850.0766197.50001.357358e+061.739022e+062.703454e+06 │\n",
-              "│ target         2int64  152665902NULL3.143728e-020.1744960.00.00000.000000e+000.000000e+001.000000e+00 │\n",
-              "│ days30_165L    3float64152665914096822NULL5.177078e-010.8992380.00.00000.000000e+001.000000e+002.200000e+01 │\n",
-              "│ days360_512L   4float64152665914096892NULL4.777066e+005.1688560.01.00003.000000e+006.500000e+001.150000e+02 │\n",
-              "│ days90_310L    5float64152665914096837NULL1.211420e+001.6559310.00.00001.000000e+002.000000e+004.100000e+01 │\n",
-              "│ pmtscount_423L 6float64152665995402166NULL5.839291e+004.1482640.03.00006.000000e+007.000000e+001.210000e+02 │\n",
-              "│ pmtssum_45A    7float641526659954021265229NULL1.319994e+0418117.2183120.03156.40018.391900e+031.699200e+044.768434e+05 │\n",
-              "│ education_1103M9string 1526659261835a55475b1NULLNULLNULLNULLNULLNULLNULL │\n",
-              "└─────────────────┴───────┴─────────┴─────────┴────────┴─────────┴──────────┴──────────────┴───────────────┴─────────┴─────────────┴──────────────┴──────────────┴──────────────┘\n",
-              "
\n" - ], - "text/plain": [ - "┏━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┓\n", - "┃\u001b[1m \u001b[0m\u001b[1mname\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpos\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mtype\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcount\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mnulls\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1munique\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmode\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmean\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mstd\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmin\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mp25\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mp50\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mp75\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\n", - "┡━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━┩\n", - "│ \u001b[2mstring\u001b[0m │ \u001b[2mint16\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mint64\u001b[0m │ \u001b[2mint64\u001b[0m │ \u001b[2mint64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │\n", - 
"├─────────────────┼───────┼─────────┼─────────┼────────┼─────────┼──────────┼──────────────┼───────────────┼─────────┼─────────────┼──────────────┼──────────────┼──────────────┤\n", - "│ \u001b[32mcase_id \u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[32mint64 \u001b[0m │ \u001b[1;36m1526659\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[1;36m1526659\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m1.286077e+06\u001b[0m │ \u001b[1;36m718946.592285\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m766197.5000\u001b[0m │ \u001b[1;36m1.357358e+06\u001b[0m │ \u001b[1;36m1.739022e+06\u001b[0m │ \u001b[1;36m2.703454e+06\u001b[0m │\n", - "│ \u001b[32mtarget \u001b[0m │ \u001b[1;36m2\u001b[0m │ \u001b[32mint64 \u001b[0m │ \u001b[1;36m1526659\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[1;36m2\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m3.143728e-02\u001b[0m │ \u001b[1;36m0.174496\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0000\u001b[0m │ \u001b[1;36m0.000000e+00\u001b[0m │ \u001b[1;36m0.000000e+00\u001b[0m │ \u001b[1;36m1.000000e+00\u001b[0m │\n", - "│ \u001b[32mdays30_165L \u001b[0m │ \u001b[1;36m3\u001b[0m │ \u001b[32mfloat64\u001b[0m │ \u001b[1;36m1526659\u001b[0m │ \u001b[1;36m140968\u001b[0m │ \u001b[1;36m22\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m5.177078e-01\u001b[0m │ \u001b[1;36m0.899238\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0000\u001b[0m │ \u001b[1;36m0.000000e+00\u001b[0m │ \u001b[1;36m1.000000e+00\u001b[0m │ \u001b[1;36m2.200000e+01\u001b[0m │\n", - "│ \u001b[32mdays360_512L \u001b[0m │ \u001b[1;36m4\u001b[0m │ \u001b[32mfloat64\u001b[0m │ \u001b[1;36m1526659\u001b[0m │ \u001b[1;36m140968\u001b[0m │ \u001b[1;36m92\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m4.777066e+00\u001b[0m │ \u001b[1;36m5.168856\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m1.0000\u001b[0m │ \u001b[1;36m3.000000e+00\u001b[0m │ \u001b[1;36m6.500000e+00\u001b[0m │ \u001b[1;36m1.150000e+02\u001b[0m │\n", - "│ \u001b[32mdays90_310L \u001b[0m │ 
\u001b[1;36m5\u001b[0m │ \u001b[32mfloat64\u001b[0m │ \u001b[1;36m1526659\u001b[0m │ \u001b[1;36m140968\u001b[0m │ \u001b[1;36m37\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m1.211420e+00\u001b[0m │ \u001b[1;36m1.655931\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0000\u001b[0m │ \u001b[1;36m1.000000e+00\u001b[0m │ \u001b[1;36m2.000000e+00\u001b[0m │ \u001b[1;36m4.100000e+01\u001b[0m │\n", - "│ \u001b[32mpmtscount_423L \u001b[0m │ \u001b[1;36m6\u001b[0m │ \u001b[32mfloat64\u001b[0m │ \u001b[1;36m1526659\u001b[0m │ \u001b[1;36m954021\u001b[0m │ \u001b[1;36m66\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m5.839291e+00\u001b[0m │ \u001b[1;36m4.148264\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m3.0000\u001b[0m │ \u001b[1;36m6.000000e+00\u001b[0m │ \u001b[1;36m7.000000e+00\u001b[0m │ \u001b[1;36m1.210000e+02\u001b[0m │\n", - "│ \u001b[32mpmtssum_45A \u001b[0m │ \u001b[1;36m7\u001b[0m │ \u001b[32mfloat64\u001b[0m │ \u001b[1;36m1526659\u001b[0m │ \u001b[1;36m954021\u001b[0m │ \u001b[1;36m265229\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m1.319994e+04\u001b[0m │ \u001b[1;36m18117.218312\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m3156.4001\u001b[0m │ \u001b[1;36m8.391900e+03\u001b[0m │ \u001b[1;36m1.699200e+04\u001b[0m │ \u001b[1;36m4.768434e+05\u001b[0m │\n", - "│ \u001b[32meducation_1103M\u001b[0m │ \u001b[1;36m9\u001b[0m │ \u001b[32mstring \u001b[0m │ \u001b[1;36m1526659\u001b[0m │ \u001b[1;36m26183\u001b[0m │ \u001b[1;36m5\u001b[0m │ \u001b[32ma55475b1\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │\n", - "└─────────────────┴───────┴─────────┴─────────┴────────┴─────────┴──────────┴──────────────┴───────────────┴─────────┴─────────────┴──────────────┴──────────────┴──────────────┘" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# take the 
first 10 columns\n", - "df_train[df_train.columns[:10]].describe()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Last-mile data preprocessing\n", - "We will perform the following transformation before feeding the data to models:\n", - "\n", - "* Missing value imputation\n", - "* Encoding categorical variables\n", - "* Handling date variables\n", - "* Handling outliers\n", - "* Scaling and normalization\n", - "\n", - "\n", - "IbisML provides a set of transformations. You can find the\n", - "[roadmap](https://github.com/ibis-project/ibis-ml/issues/32).\n", - "The [IbisML website](https://ibis-project.github.io/ibis-ml/) also includes tutorials and API documentation.\n", - "\n", - "### Impute features\n", - "Impute all numeric columns using the median. In real-life scenarios, it's important to\n", - "understand the meaning of each feature and apply the appropriate imputation method for\n", - "different features. For more imputations, please refer to this\n", - "[documentation](https://ibis-project.github.io/ibis-ml/reference/steps-imputation.html)." - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "step_impute_median = ml.ImputeMedian(ml.numeric())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Encode categorical features\n", - "Encode all categorical features using one-hot-encode. For more encoding steps,\n", - "please refer to this\n", - "[doc](https://ibis-project.github.io/ibis-ml/reference/steps-encoding.html)." 
- ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [ - "ohe_step = ml.OneHotEncode(\n", - " [\n", - " \"maritalst_893M\",\n", - " \"requesttype_4525192L\",\n", - " \"max_profession_152M\",\n", - " \"max_gender_992L\",\n", - " \"max_empl_industry_691L\",\n", - " \"max_housingtype_772L\",\n", - " \"max_incometype_1044T\",\n", - " \"max_cancelreason_3545846M\",\n", - " \"max_rejectreason_755M\",\n", - " \"education_1103M\",\n", - " \"max_status_219L\",\n", - " ]\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Handle date variables\n", - "Calculate all the days difference between any date columns and the column `date_decision`:" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [], - "source": [ - "date_cols = [col_name for col_name in df_train.columns if col_name[-1] == \"D\"]\n", - "days_to_decision_expr = {\n", - " # difference in days\n", - " f\"{col}_date_decision_diff\": (\n", - " _.date_decision.epoch_seconds() - getattr(_, col).epoch_seconds()\n", - " )\n", - " / (60 * 60 * 24)\n", - " for col in date_cols\n", - "}\n", - "days_to_decision_step = ml.Mutate(days_to_decision_expr)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Extract information from the date columns:" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [], - "source": [ - "# dow and month is set to catagoery\n", - "expand_date_step = ml.ExpandDate(ml.date(), [\"week\", \"day\"])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Handle outliers\n", - "Capping outliers using `z-score` method:" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [], - "source": [ - "step_handle_outliers = ml.HandleUnivariateOutliers(\n", - " [\"max_amount_1115A\", \"max_overdueamountmax_950A\"],\n", - " method=\"z-score\",\n", - " 
treatment=\"capping\",\n", - " deviation_factor=3,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Construct recipe\n", - "We'll construct the last mile preprocessing [recipe](https://ibis-project.github.io/ibis-ml/reference/core.html#ibis_ml.Recipe)\n", - "by chaining all transformation steps, which will be fitted to the training dataset and later applied test datasets." - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Last-mile preprocessing recipe: \n", - "Recipe(ExpandDate(date(), components=['week', 'day']),\n", - " Drop(date()),\n", - " OneHotEncode(cols(('maritalst_893M', 'requesttype_4525192L', 'max_profession_152M', 'max_gender_992L', 'max_empl_industry_691L', 'max_housingtype_772L', 'max_incometype_1044T', 'max_cancelreason_3545846M', 'max_rejectreason_755M', 'education_1103M', 'max_status_219L'))),\n", - " Drop(string()),\n", - " HandleUnivariateOutliers(cols(('max_amount_1115A', 'max_overdueamountmax_950A')),\n", - " method='z-score',\n", - " deviation_factor=3,\n", - " treatment='capping'),\n", - " ImputeMedian(numeric()),\n", - " ScaleMinMax(numeric()),\n", - " FillNA(numeric(), 0),\n", - " Cast(numeric(), 'float32'))\n" - ] - } - ], - "source": [ - "last_mile_preprocessing = ml.Recipe(\n", - " expand_date_step,\n", - " ml.Drop(ml.date()),\n", - " # handle string columns\n", - " ohe_step,\n", - " ml.Drop(ml.string()),\n", - " # handle numeric cols\n", - " # capping outliers\n", - " step_handle_outliers,\n", - " step_impute_median,\n", - " ml.ScaleMinMax(ml.numeric()),\n", - " # fill missing value\n", - " ml.FillNA(ml.numeric(), 0),\n", - " ml.Cast(ml.numeric(), \"float32\"),\n", - ")\n", - "print(f\"Last-mile preprocessing recipe: \\n{last_mile_preprocessing}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Modeling\n", - "After completing data preprocessing with Ibis 
and IbisML, we proceed to the modeling\n", - "phase. Here are two approaches:\n", - "\n", - "* Use IbisML as a independent data preprocessing component and hand off the data to downstream modeling\n", - "frameworks with various output formats:\n", - " - pandas Dataframe\n", - " - NumPy Array\n", - " - Polars Dataframe\n", - " - Dask Dataframe\n", - " - xgboost.DMatrix\n", - " - Pyarrow Table\n", - "* Use IbisML recipes as components within an sklearn Pipeline and\n", - "train models similarly to how you would do with sklearn pipeline.\n", - "\n", - "We will build an XGBoost model within a scikit-learn pipeline, and a neural network classifier using the\n", - "output transformed by IbisML recipes.\n", - "\n", - "### Train and test data splitting\n", - "We'll use hashing on the unique key to consistently split rows to different groups.\n", - "Hashing is robust to underlying changes in the data, such as adding, deleting, or\n", - "reordering rows. This deterministic process ensures that each data point is always\n", - "assigned to the same split, thereby enhancing reproducibility." - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jiting/anaconda3/envs/ibis-ml-dev/lib/python3.12/site-packages/ibis/expr/types/relations.py:685: FutureWarning: Selecting/filtering arbitrary expressions in `Table.__getitem__` is deprecated and will be removed in version 10.0. 
Please use `Table.select` or `Table.filter` instead.\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "train dataset size = 1145346 \n", - "test data size = 381313\n" - ] - } - ], - "source": [ - "\n", - "train_data, test_data = ml.train_test_split(\n", - " df_train,\n", - " unique_key=[\"case_id\"],\n", - " test_size=0.25,\n", - " random_seed=222,\n", - ")\n", - "\n", - "X_train = train_data.drop(\"target\")\n", - "y_train = train_data.target.cast(dt.float32).name(\"target\")\n", - "\n", - "X_test = test_data.drop(\"target\")\n", - "y_test = test_data.target.cast(dt.float32).name(\"target\")\n", - "\n", - "train_cnt = X_train.count().execute()\n", - "test_cnt = X_test.count().execute()\n", - "print(f\"train dataset size = {train_cnt} \\ntest data size = {test_cnt}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "Hashing provides a consistent but pseudo-random distribution of data, which\n", - "may not precisely align with the specified train/test ratio. While hash codes\n", - "ensure reproducibility, they don't guarantee an exact split. Due to statistical variance,\n", - "you might find a slight imbalance in the distribution, resulting in marginally more or\n", - "fewer samples in either the training or test dataset than the target percentage. This\n", - "minor deviation from the intended ratio is a normal consequence of hash-based\n", - "partitioning.\n", - "\n", - "\n", - "### XGBoost\n", - "In this section, we integrate XGBoost into a scikit-learn pipeline to create a\n", - "streamlined workflow for training and evaluating our model.\n", - "\n", - "We'll set up a pipeline that includes two components:\n", - "\n", - "* **Preprocessing**: This step applies the `last_mile_preprocessing` for final data preprocessing.\n", - "* **Modeling**: This step applies the `xgb.XGBClassifier()` to train the XGBoost model." 
- ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "last_mile_recipes\n", - "** y type = \n", - "** y type = \n", - "self._final_estimator = XGBClassifier(base_score=None, booster=None, callbacks=None,\n", - " colsample_bylevel=None, colsample_bynode=None,\n", - " colsample_bytree=0.8, device=None, early_stopping_rounds=None,\n", - " enable_categorical=False, eval_metric=None, feature_types=None,\n", - " gamma=None, grow_policy=None, importance_type=None,\n", - " interaction_constraints=None, learning_rate=0.05, max_bin=None,\n", - " max_cat_threshold=None, max_cat_to_onehot=None,\n", - " max_delta_step=None, max_depth=5, max_leaves=None,\n", - " min_child_weight=None, missing=nan, monotone_constraints=None,\n", - " multi_strategy=None, n_estimators=100, n_jobs=None,\n", - " num_parallel_tree=None, random_state=42, ...)\n", - "last_step_params = {'fit': {}, 'partial_fit': {}, 'predict': {}, 'predict_proba': {}, 'predict_log_proba': {}, 'decision_function': {}, 'score': {}, 'split': {}, 'transform': {}, 'inverse_transform': {}, 'fit_transform': {}, 'fit_predict': {}}\n" - ] - }, - { - "data": { - "text/html": [ - "
Pipeline(steps=[('last_mile_recipes',\n",
-              "                 Recipe(ExpandDate(date(), components=['week', 'day']),\n",
-              "                        Drop(date()),\n",
-              "                        OneHotEncode(cols(('maritalst_893M', 'requesttype_4525192L', 'max_profession_152M', 'max_gender_992L', 'max_empl_industry_691L', 'max_housingtype_772L', 'max_incometype_1044T', 'max_cancelreason_3545846M', 'max_rejectreason_755M', 'education_1103M', 'max_sta...\n",
-              "                               feature_types=None, gamma=None, grow_policy=None,\n",
-              "                               importance_type=None,\n",
-              "                               interaction_constraints=None, learning_rate=0.05,\n",
-              "                               max_bin=None, max_cat_threshold=None,\n",
-              "                               max_cat_to_onehot=None, max_delta_step=None,\n",
-              "                               max_depth=5, max_leaves=None,\n",
-              "                               min_child_weight=None, missing=nan,\n",
-              "                               monotone_constraints=None, multi_strategy=None,\n",
-              "                               n_estimators=100, n_jobs=None,\n",
-              "                               num_parallel_tree=None, random_state=42, ...))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" - ], - "text/plain": [ - "Pipeline(steps=[('last_mile_recipes',\n", - " Recipe(ExpandDate(date(), components=['week', 'day']),\n", - " Drop(date()),\n", - " OneHotEncode(cols(('maritalst_893M', 'requesttype_4525192L', 'max_profession_152M', 'max_gender_992L', 'max_empl_industry_691L', 'max_housingtype_772L', 'max_incometype_1044T', 'max_cancelreason_3545846M', 'max_rejectreason_755M', 'education_1103M', 'max_sta...\n", - " feature_types=None, gamma=None, grow_policy=None,\n", - " importance_type=None,\n", - " interaction_constraints=None, learning_rate=0.05,\n", - " max_bin=None, max_cat_threshold=None,\n", - " max_cat_to_onehot=None, max_delta_step=None,\n", - " max_depth=5, max_leaves=None,\n", - " min_child_weight=None, missing=nan,\n", - " monotone_constraints=None, multi_strategy=None,\n", - " n_estimators=100, n_jobs=None,\n", - " num_parallel_tree=None, random_state=42, ...))])" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Introduction\n", + "In this post, we'll demonstrate how to use Ibis and [IbisML](https://github.com/ibis-project/ibis-ml)\n", + "end-to-end for the\n", + "[credit risk model stability Kaggle competition](https://www.kaggle.com/competitions/home-credit-credit-risk-model-stability).\n", + "\n", + "1. Load data and perform feature engineering on DuckDB backend using IbisML\n", + "2. Perform last-mile ML data preprocessing on DuckDB backend using IbisML\n", + "3. 
Train two models using different frameworks:\n", + " * An XGBoost model within a scikit-learn pipeline.\n", + " * A neural network with PyTorch and PyTorch Lightning.\n", + "\n", + "The aim of this competition is to predict which clients are more likely to default on their\n", + "loans by using both internal and external data sources.\n", + "\n", + "To get started with Ibis and IbisML, please refer to the websites:\n", + "\n", + "* [Ibis](https://ibis-project.org/): An open-source dataframe library that works with any data system.\n", + "* [IbisML](https://ibis-project.github.io/ibis-ml/): A library for building scalable ML pipelines.\n", + "\n", + "\n", + "## Prerequisites\n", + "To run this example, you'll need to download the data from Kaggle website with a Kaggle user account and install Ibis, IbisML, and the necessary modeling library.\n", + "\n", + "### Download data\n", + "You need a Kaggle account to download the data. If you do not have one,\n", + "feel free to register one.\n", + "\n", + "1. Option 1: Manual download\n", + " * Log into your Kaggle account and download all data from this\n", + " [link](https://www.kaggle.com/competitions/home-credit-credit-risk-model-stability/data),\n", + " unzip the files, and save them to your local disk.\n", + "2. Option 2: Kaggle API\n", + " * Go to your `Kaggle Account Settings`.\n", + " * Under the `API` section, click on `Create New API Token`. 
This will download the `kaggle.json`\n", + " file to your computer.\n", + " * Place the `kaggle.json` file in the correct directory, normally it is under your home directory\n", + " `~/.kaggle`:\n", + "\n", + " ```bash\n", + " mkdir ~/.kaggle\n", + " mv ~/Downloads/kaggle.json ~/.kaggle\n", + " ```\n", + " * Install Kaggle CLI and download the data:\n", + "\n", + " ```bash\n", + " pip install kaggle\n", + " kaggle competitions download -c home-credit-credit-risk-model-stability\n", + " unzip home-credit-credit-risk-model-stability.zip\n", + " ```\n", + "\n", + "### Install libraries\n", + "To use Ibis and IbisML with the DuckDB backend for building models, you'll need to install the\n", + "necessary packages. Depending on your preferred machine learning framework, you can choose\n", + "one of the following installation commands:\n", + "\n", + "For PyTorch-based models:\n", + "\n", + "```bash\n", + "pip install 'ibis-framework[duckdb]' ibis-ml torch pytorch-lightning\n", + "```\n", + "\n", + "For XGBoost and scikit-learn-based models:\n", + "\n", + "```bash\n", + "pip install 'ibis-framework[duckdb]' ibis-ml xgboost[scikit-learn]\n", + "```\n", + "\n", + "Import libraries:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import ibis\n", + "import ibis.expr.datatypes as dt\n", + "from ibis import _\n", + "import ibis_ml as ml\n", + "from pathlib import Path\n", + "\n", + "# enable interactive mode for ibis\n", + "ibis.options.interactive = True" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set the backend for computing:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "con = ibis.duckdb.connect()\n", + "# remove the black bars from duckdb's progress bar\n", + "con.raw_sql(\"set enable_progress_bar = false\")\n", + "# DuckDB is the default backend for Ibis\n", + "ibis.set_backend(con)" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "Set data path:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# change the root path to yours\n", + "ROOT = Path(\"/Users/jiting/Downloads/home-credit-credit-risk-model-stability\")\n", + "TRAIN_DIR = ROOT / \"parquet_files\" / \"train\"\n", + "TEST_DIR = ROOT / \"parquet_files\" / \"test\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data loading and processing\n", + "We'll use Ibis to read the Parquet files and perform the necessary processing for the next step.\n", + "\n", + "### Directory structure and tables\n", + "Since there are many data files, let's start by examining the directory structure and\n", + "tables within the train directory:\n", + "\n", + "```bash\n", + "# change this to your directory\n", + "tree -L 2 ~/Downloads/home-credit-credit-risk-model-stability/parquet_files/train\n", + "```\n", + "Data directory:\n", + "\n", + "```bash\n", + "~/Downloads/home-credit-credit-risk-model-stability/parquet_files/train\n", + "├── train_applprev_1_0.parquet\n", + "├── train_applprev_1_1.parquet\n", + "├── train_applprev_2.parquet\n", + "├── train_base.parquet\n", + "├── train_credit_bureau_a_1_0.parquet\n", + "├── train_credit_bureau_a_1_1.parquet\n", + "├── train_credit_bureau_a_1_3.parquet\n", + "├── train_credit_bureau_a_2_0.parquet\n", + "├── train_credit_bureau_a_2_1.parquet\n", + "├── train_credit_bureau_a_2_10.parquet\n", + "├── train_credit_bureau_a_2_2.parquet\n", + "├── train_credit_bureau_a_2_3.parquet\n", + "├── train_credit_bureau_a_2_4.parquet\n", + "├── train_credit_bureau_a_2_5.parquet\n", + "├── train_credit_bureau_a_2_6.parquet\n", + "├── train_credit_bureau_a_2_7.parquet\n", + "├── train_credit_bureau_a_2_8.parquet\n", + "├── train_credit_bureau_a_2_9.parquet\n", + "├── train_credit_bureau_b_1.parquet\n", + "├── train_credit_bureau_b_2.parquet\n", + "├── 
train_debitcard_1.parquet\n", + "├── train_deposit_1.parquet\n", + "├── train_other_1.parquet\n", + "├── train_person_1.parquet\n", + "├── train_person_2.parquet\n", + "├── train_static_0_0.parquet\n", + "├── train_static_0_1.parquet\n", + "├── train_static_cb_0.parquet\n", + "├── train_tax_registry_a_1.parquet\n", + "├── train_tax_registry_b_1.parquet\n", + "└── train_tax_registry_c_1.parquet\n", + "```\n", + "\n", + "The `train_base.parquet` file is the base table, while the others are feature tables.\n", + "Let's take a quick look at these tables.\n", + "\n", + "#### Base table\n", + "The base table (`train_base.parquet`) contains the unique ID, a binary target flag\n", + "and other information for the training samples. This unique ID will serve as the\n", + "linking key for joining with other feature tables.\n", + "\n", + "* `case_id` - This is the unique ID for each loan. You'll need this ID to\n", + " join feature tables to the base table. There are about 1.5m unique loans.\n", + "* `date_decision` - This refers to the date when a decision was made regarding the\n", + " approval of the loan.\n", + "* `WEEK_NUM` - This is the week number used for aggregation. In the test sample,\n", + " `WEEK_NUM` continues sequentially from the last training value of `WEEK_NUM`.\n", + "* `MONTH` - This column represents the month when the approval decision was made.\n", + "* `target` - This is the binary target flag, determined after a certain period based on\n", + " whether or not the client defaulted on the specific loan.\n", + "\n", + "Here are several examples from the base table:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<pre style="white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace">
┏━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━┓\n",
+       "┃ case_id  date_decision  MONTH   WEEK_NUM  target ┃\n",
+       "┡━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━┩\n",
+       "│ int64stringint64int64int64  │\n",
+       "├─────────┼───────────────┼────────┼──────────┼────────┤\n",
+       "│       02019-01-03   20190100 │\n",
+       "│       12019-01-03   20190100 │\n",
+       "│       22019-01-04   20190100 │\n",
+       "│       32019-01-03   20190100 │\n",
+       "│       42019-01-04   20190101 │\n",
+       "└─────────┴───────────────┴────────┴──────────┴────────┘\n",
+       "
\n" ], - "source": [ - "from sklearn.pipeline import Pipeline\n", - "from sklearn.metrics import roc_auc_score\n", - "import xgboost as xgb\n", - "\n", - "model = xgb.XGBClassifier(\n", - " n_estimators=100,\n", - " max_depth=5,\n", - " learning_rate=0.05,\n", - " subsample=0.8,\n", - " colsample_bytree=0.8,\n", - " random_state=42,\n", - ")\n", - "# create the pipeline with the last mile ML recipes and the model\n", - "pipe = Pipeline([(\"last_mile_recipes\", last_mile_preprocessing), (\"model\", model)])\n", - "# fit the pipeline on the training data\n", - "pipe.fit(X_train, y_train)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's evaluate the model on the test data using Gini index:" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "gini_score for test dataset: 0.06491440835995244\n" - ] - } + "text/plain": [ + "┏━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━┓\n", + "┃\u001b[1m \u001b[0m\u001b[1mcase_id\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdate_decision\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mMONTH\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mWEEK_NUM\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mtarget\u001b[0m\u001b[1m \u001b[0m┃\n", + "┡━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━┩\n", + "│ \u001b[2mint64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mint64\u001b[0m │ \u001b[2mint64\u001b[0m │ \u001b[2mint64\u001b[0m │\n", + "├─────────┼───────────────┼────────┼──────────┼────────┤\n", + "│ \u001b[1;36m0\u001b[0m │ \u001b[32m2019-01-03 \u001b[0m │ \u001b[1;36m201901\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[1;36m0\u001b[0m │\n", + "│ \u001b[1;36m1\u001b[0m │ \u001b[32m2019-01-03 \u001b[0m │ \u001b[1;36m201901\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[1;36m0\u001b[0m │\n", + "│ \u001b[1;36m2\u001b[0m │ \u001b[32m2019-01-04 
\u001b[0m │ \u001b[1;36m201901\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[1;36m0\u001b[0m │\n", + "│ \u001b[1;36m3\u001b[0m │ \u001b[32m2019-01-03 \u001b[0m │ \u001b[1;36m201901\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[1;36m0\u001b[0m │\n", + "│ \u001b[1;36m4\u001b[0m │ \u001b[32m2019-01-04 \u001b[0m │ \u001b[1;36m201901\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[1;36m1\u001b[0m │\n", + "└─────────┴───────────────┴────────┴──────────┴────────┘" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ibis.read_parquet(TRAIN_DIR / \"train_base.parquet\").head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Feature tables\n", + "The remaining files contain features, consisting of approximately 370 features from\n", + "previous loan applications and external data sources. Their definitions can be found in the feature\n", + "definition [file](https://www.kaggle.com/competitions/home-credit-credit-risk-model-stability/data)\n", + "from the competition website.\n", + "\n", + "There are several things we want to mention for the feature tables:\n", + "\n", + "* **Union datasets**: One dataset could be saved into multiple parquet files, such as\n", + "`train_applprev_1_0.parquet` and `train_applprev_1_1.parquet`; we need to union this data.\n", + "* **Dataset levels**: Datasets may have different levels, which we will explain as\n", + "follows:\n", + " * **Depth = 0**: Each row in the table is identified by a unique `case_id`.\n", + " In this case, you can directly join the features with the base table and use them as\n", + " features for further analysis or processing.\n", + " * **Depth > 0**: You will group the data based on the `case_id` and perform calculations\n", + " or aggregations within each group.\n", + "\n", + "Here are two examples of tables with different levels.\n", + "\n", + "Example of table with depth = 0, `case_id` is the row identifier, features can be directly 
joined\n", + " with the base table." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
┏━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┓\n",
+       "┃ case_id  assignmentdate_238D  assignmentdate_4527235D  assignmentdate_4955616D  birthdate_574D  contractssum_5085716L  dateofbirth_337D  dateofbirth_342D  days120_123L  days180_256L  days30_165L  days360_512L  days90_310L  description_5085714M  education_1103M  education_88M  firstquarter_103L  for3years_128L  for3years_504L  for3years_584L  formonth_118L  formonth_206L  formonth_535L  forquarter_1017L  forquarter_462L  forquarter_634L  fortoday_1092L  forweek_1077L  forweek_528L  forweek_601L  foryear_618L  foryear_818L  foryear_850L  fourthquarter_440L  maritalst_385M  maritalst_893M  numberofqueries_373L  pmtaverage_3A  pmtaverage_4527227A  pmtaverage_4955615A  pmtcount_4527229L  pmtcount_4955617L  pmtcount_693L  pmtscount_423L  pmtssum_45A  requesttype_4525192L  responsedate_1012D  responsedate_4527233D  responsedate_4917613D  riskassesment_302T  riskassesment_940T  secondquarter_766L  thirdquarter_1082L ┃\n",
+       "┡━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━┩\n",
+       "│ int64stringstringstringstringfloat64stringstringfloat64float64float64float64float64stringstringstringfloat64float64float64float64float64float64float64float64float64float64float64float64float64float64float64float64float64float64stringstringfloat64float64float64float64float64float64float64float64float64stringstringstringstringstringfloat64float64float64            │\n",
+       "├─────────┼─────────────────────┼─────────────────────────┼─────────────────────────┼────────────────┼───────────────────────┼──────────────────┼──────────────────┼──────────────┼──────────────┼─────────────┼──────────────┼─────────────┼──────────────────────┼─────────────────┼───────────────┼───────────────────┼────────────────┼────────────────┼────────────────┼───────────────┼───────────────┼───────────────┼──────────────────┼─────────────────┼─────────────────┼────────────────┼───────────────┼──────────────┼──────────────┼──────────────┼──────────────┼──────────────┼────────────────────┼────────────────┼────────────────┼──────────────────────┼───────────────┼─────────────────────┼─────────────────────┼───────────────────┼───────────────────┼───────────────┼────────────────┼─────────────┼──────────────────────┼────────────────────┼───────────────────────┼───────────────────────┼────────────────────┼────────────────────┼────────────────────┼────────────────────┤\n",
+       "│     357NULLNULLNULL1988-04-01    NULLNULLNULLNULLNULLNULLNULLNULLa55475b1            a55475b1       a55475b1     NULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLa55475b1      a55475b1      NULLNULLNULLNULLNULLNULLNULL6.06301.4000NULL2019-01-25        NULLNULLNULLNULLNULLNULL │\n",
+       "│     381NULLNULLNULL1973-11-01    NULLNULLNULLNULLNULLNULLNULLNULLa55475b1            a55475b1       a55475b1     NULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLa55475b1      a55475b1      NULLNULLNULLNULLNULLNULLNULL6.04019.6000NULL2019-01-25        NULLNULLNULLNULLNULLNULL │\n",
+       "│     388NULLNULLNULL1989-04-01    NULL1989-04-01      NULL6.08.02.010.04.0a55475b1            a55475b1       a55475b1     2.0NULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULL6.0a55475b1      a55475b1      10.0NULLNULLNULLNULLNULLNULL6.014548.0000NULL2019-01-28        NULLNULLNULLNULL3.05.0 │\n",
+       "│     405NULLNULLNULL1974-03-01    NULL1974-03-01      NULL0.00.00.01.00.0a55475b1            a55475b1       a55475b1     0.0NULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULL4.0a55475b1      a55475b1      1.0NULLNULLNULLNULLNULLNULL6.010498.2400NULL2019-01-21        NULLNULLNULLNULL2.00.0 │\n",
+       "│     409NULLNULLNULL1993-06-01    NULL1993-06-01      NULL2.03.00.03.01.0a55475b1            717ddd49       a55475b1     4.0NULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULL1.0a7fcb6e5      a55475b1      3.0NULLNULLNULLNULLNULLNULL7.06344.8804NULL2019-01-21        NULLNULLNULLNULL0.04.0 │\n",
+       "└─────────┴─────────────────────┴─────────────────────────┴─────────────────────────┴────────────────┴───────────────────────┴──────────────────┴──────────────────┴──────────────┴──────────────┴─────────────┴──────────────┴─────────────┴──────────────────────┴─────────────────┴───────────────┴───────────────────┴────────────────┴────────────────┴────────────────┴───────────────┴───────────────┴───────────────┴──────────────────┴─────────────────┴─────────────────┴────────────────┴───────────────┴──────────────┴──────────────┴──────────────┴──────────────┴──────────────┴────────────────────┴────────────────┴────────────────┴──────────────────────┴───────────────┴─────────────────────┴─────────────────────┴───────────────────┴───────────────────┴───────────────┴────────────────┴─────────────┴──────────────────────┴────────────────────┴───────────────────────┴───────────────────────┴────────────────────┴────────────────────┴────────────────────┴────────────────────┘\n",
+       "
\n" ], - "source": [ - "y_pred_proba = pipe.predict_proba(X_test)[:, 1]\n", - "# calculate the AUC score\n", - "auc = roc_auc_score(y_test, y_pred_proba)\n", - "\n", - "# calculate the Gini score\n", - "gini_score = 2 * auc - 1\n", - "print(f\"gini_score for test dataset: {gini_score:,}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "The competition is evaluated using a Gini stability metric. For more information, see the\n", - "[evaluation guidelines](https://www.kaggle.com/competitions/home-credit-credit-risk-model-stability/overview/evaluation)\n", - "\n", - "\n", - "### Neural network classifier\n", - "Build a neural network classifier using PyTorch and PyTorch Lightning.\n", - "\n", - "It is not recommended to build a neural network classifier for this competition, we are building\n", - "it solely for demonstration purposes.\n", - "\n", - "\n", - "We'll demonstrate how to build a model by directly passing the data to it. IbisML recipes can output\n", - "data in various formats, making it compatible with different modeling frameworks.\n", - "Let's first train the recipe:" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Recipe(ExpandDate(date(), components=['week', 'day']),\n", - " Drop(date()),\n", - " OneHotEncode(cols(('maritalst_893M', 'requesttype_4525192L', 'max_profession_152M', 'max_gender_992L', 'max_empl_industry_691L', 'max_housingtype_772L', 'max_incometype_1044T', 'max_cancelreason_3545846M', 'max_rejectreason_755M', 'education_1103M', 'max_status_219L'))),\n", - " Drop(string()),\n", - " HandleUnivariateOutliers(cols(('max_amount_1115A', 'max_overdueamountmax_950A')),\n", - " method='z-score',\n", - " deviation_factor=3,\n", - " treatment='capping'),\n", - " ImputeMedian(numeric()),\n", - " ScaleMinMax(numeric()),\n", - " FillNA(numeric(), 0),\n", - " Cast(numeric(), 'float32'))" - ] - }, - "execution_count": 26, - "metadata": 
{}, - "output_type": "execute_result" - } + "text/plain": [ + "┏━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┓\n", + "┃\u001b[1m \u001b[0m\u001b[1mcase_id\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1massignmentdate_238D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1massignmentdate_4527235D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1massignmentdate_4955616D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mbirthdate_574D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcontractssum_5085716L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdateofbirth_337D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdateofbirth_342D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdays120_123L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdays180_256L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdays30_165L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdays360_512L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdays90_310L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m 
\u001b[0m\u001b[1mdescription_5085714M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1meducation_1103M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1meducation_88M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mfirstquarter_103L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mfor3years_128L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mfor3years_504L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mfor3years_584L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mformonth_118L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mformonth_206L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mformonth_535L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mforquarter_1017L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mforquarter_462L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mforquarter_634L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mfortoday_1092L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mforweek_1077L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mforweek_528L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mforweek_601L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mforyear_618L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mforyear_818L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mforyear_850L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mfourthquarter_440L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmaritalst_385M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmaritalst_893M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mnumberofqueries_373L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpmtaverage_3A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpmtaverage_4527227A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpmtaverage_4955615A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpmtcount_4527229L\u001b[0m\u001b[1m 
\u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpmtcount_4955617L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpmtcount_693L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpmtscount_423L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpmtssum_45A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mrequesttype_4525192L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mresponsedate_1012D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mresponsedate_4527233D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mresponsedate_4917613D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mriskassesment_302T\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mriskassesment_940T\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1msecondquarter_766L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mthirdquarter_1082L\u001b[0m\u001b[1m \u001b[0m┃\n", + "┡━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━┩\n", + "│ \u001b[2mint64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ 
\u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │\n", + 
"├─────────┼─────────────────────┼─────────────────────────┼─────────────────────────┼────────────────┼───────────────────────┼──────────────────┼──────────────────┼──────────────┼──────────────┼─────────────┼──────────────┼─────────────┼──────────────────────┼─────────────────┼───────────────┼───────────────────┼────────────────┼────────────────┼────────────────┼───────────────┼───────────────┼───────────────┼──────────────────┼─────────────────┼─────────────────┼────────────────┼───────────────┼──────────────┼──────────────┼──────────────┼──────────────┼──────────────┼────────────────────┼────────────────┼────────────────┼──────────────────────┼───────────────┼─────────────────────┼─────────────────────┼───────────────────┼───────────────────┼───────────────┼────────────────┼─────────────┼──────────────────────┼────────────────────┼───────────────────────┼───────────────────────┼────────────────────┼────────────────────┼────────────────────┼────────────────────┤\n", + "│ \u001b[1;36m357\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m1988-04-01 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ 
\u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m6.0\u001b[0m │ \u001b[1;36m6301.4000\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m2019-01-25 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │\n", + "│ \u001b[1;36m381\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m1973-11-01 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m6.0\u001b[0m │ \u001b[1;36m4019.6000\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m2019-01-25 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │\n", + "│ \u001b[1;36m388\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m1989-04-01 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m1989-04-01 \u001b[0m │ 
\u001b[2mNULL\u001b[0m │ \u001b[1;36m6.0\u001b[0m │ \u001b[1;36m8.0\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[1;36m10.0\u001b[0m │ \u001b[1;36m4.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m6.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m10.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m6.0\u001b[0m │ \u001b[1;36m14548.0000\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m2019-01-28 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m3.0\u001b[0m │ \u001b[1;36m5.0\u001b[0m │\n", + "│ \u001b[1;36m405\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m1974-03-01 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m1974-03-01 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ 
\u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m4.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m6.0\u001b[0m │ \u001b[1;36m10498.2400\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m2019-01-21 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │\n", + "│ \u001b[1;36m409\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m1993-06-01 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m1993-06-01 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[1;36m3.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m3.0\u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32m717ddd49 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m4.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[32ma7fcb6e5 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m3.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m7.0\u001b[0m │ \u001b[1;36m6344.8804\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m2019-01-21 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ 
\u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m4.0\u001b[0m │\n", + "└─────────┴─────────────────────┴─────────────────────────┴─────────────────────────┴────────────────┴───────────────────────┴──────────────────┴──────────────────┴──────────────┴──────────────┴─────────────┴──────────────┴─────────────┴──────────────────────┴─────────────────┴───────────────┴───────────────────┴────────────────┴────────────────┴────────────────┴───────────────┴───────────────┴───────────────┴──────────────────┴─────────────────┴─────────────────┴────────────────┴───────────────┴──────────────┴──────────────┴──────────────┴──────────────┴──────────────┴────────────────────┴────────────────┴────────────────┴──────────────────────┴───────────────┴─────────────────────┴─────────────────────┴───────────────────┴───────────────────┴───────────────┴────────────────┴─────────────┴──────────────────────┴────────────────────┴───────────────────────┴───────────────────────┴────────────────────┴────────────────────┴────────────────────┴────────────────────┘" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ibis.read_parquet(TRAIN_DIR / \"train_static_cb_0.parquet\").head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Example of a table with depth = 1, we need to aggregate the features and collect statistics\n", + "based on `case_id` then join with the base table." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
┏━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┓\n",
+       "┃ num_group1  case_id  amount_1115A  classificationofcontr_1114M  contractdate_551D  contractmaturitydate_151D  contractst_516M  contracttype_653M  credlmt_1052A  credlmt_228A  credlmt_3940954A  credor_3940957M  credquantity_1099L  credquantity_984L  debtpastduevalue_732A  debtvalue_227A  dpd_550P  dpd_733P  dpdmax_851P  dpdmaxdatemonth_804T  dpdmaxdateyear_742T  installmentamount_644A  installmentamount_833A  instlamount_892A  interesteffectiverate_369L  interestrateyearly_538L  lastupdate_260D  maxdebtpduevalodued_3940955A  numberofinstls_810L  overdueamountmax_950A  overdueamountmaxdatemonth_494T  overdueamountmaxdateyear_432T  periodicityofpmts_997L  periodicityofpmts_997M  pmtdaysoverdue_1135P  pmtmethod_731M  pmtnumpending_403L  purposeofcred_722M  residualamount_1093A  residualamount_127A  residualamount_3940956A  subjectrole_326M  subjectrole_43M  totalamount_503A  totalamount_881A ┃\n",
+       "┡━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━┩\n",
+       "│ int64int64float64stringstringstringstringstringfloat64float64float64stringfloat64float64float64float64float64float64float64float64float64float64float64float64float64float64stringfloat64float64float64float64float64stringstringfloat64stringfloat64stringfloat64float64float64stringstringfloat64float64          │\n",
+       "├────────────┼─────────┼──────────────┼─────────────────────────────┼───────────────────┼───────────────────────────┼─────────────────┼───────────────────┼───────────────┼──────────────┼──────────────────┼─────────────────┼────────────────────┼───────────────────┼───────────────────────┼────────────────┼──────────┼──────────┼─────────────┼──────────────────────┼─────────────────────┼────────────────────────┼────────────────────────┼──────────────────┼────────────────────────────┼─────────────────────────┼─────────────────┼──────────────────────────────┼─────────────────────┼───────────────────────┼────────────────────────────────┼───────────────────────────────┼────────────────────────┼────────────────────────┼──────────────────────┼────────────────┼────────────────────┼────────────────────┼──────────────────────┼─────────────────────┼─────────────────────────┼──────────────────┼─────────────────┼──────────────────┼──────────────────┤\n",
+       "│          0467NULLea6782cc                   2011-06-15       2031-06-13               7241344e       724be82a         3.000000e+0610000.03.000000e+06P164_34_168    2.01.0NULLNULL0.00.0NULLNULLNULL0.00.000NULLNULLNULL2019-01-20     NULLNULLNULLNULLNULLNULLa55475b1              NULLa55475b1      NULL96a8fdfe          0.00.0NULLfa4f56f1        ab3c25cf       3.000000e+0610000.0 │\n",
+       "│          1467NULLea6782cc                   2019-01-04       2021-08-04               7241344e       724be82a         NULLNULL1.303650e+05P164_34_168    1.02.0NULLNULL0.00.0NULLNULLNULL0.026571.969NULLNULLNULL2019-01-20     NULLNULLNULLNULLNULLNULLa55475b1              NULLa55475b1      NULL96a8fdfe          NULLNULLNULLab3c25cf        ab3c25cf       7.800000e+04960000.0 │\n",
+       "│          246778000.0ea6782cc                   2016-10-25       2019-10-25               7241344e       4257cbed         NULLNULLNULLc5a72b57       NULLNULL0.026571.969NULLNULL0.011.02016.0NULLNULL2898.76NULLNULL2019-01-10     0.036.00.011.02016.0NULLa0b598e4              0.0e914c86c      10.096a8fdfe          NULLNULLNULLa55475b1        a55475b1       NULLNULL │\n",
+       "│          01445NULLea6782cc                   2015-01-30       2021-01-30               7241344e       1c9c5356         4.000000e+05100000.07.400000e+04b619fa46       2.05.00.0NULL0.00.0200418.01.02018.00.00.000NULLNULLNULL2019-01-19     0.4NULL1.42.02018.0NULLa55475b1              0.0a55475b1      NULL60c73645          0.00.073044.18daf49a8a        ab3c25cf       4.000000e+05100000.0 │\n",
+       "│          11445NULL01f63ac8                   2014-09-12       2021-09-12               7241344e       724be82a         NULLNULL4.000000e+0574bd67a8       3.017.0NULLNULL0.00.0NULLNULLNULL0.0209617.770NULLNULLNULL2019-01-13     NULLNULLNULLNULLNULLNULLa55475b1              NULLa55475b1      NULL96a8fdfe          NULLNULLNULLab3c25cf        ab3c25cf       3.968006e+05184587.8 │\n",
+       "└────────────┴─────────┴──────────────┴─────────────────────────────┴───────────────────┴───────────────────────────┴─────────────────┴───────────────────┴───────────────┴──────────────┴──────────────────┴─────────────────┴────────────────────┴───────────────────┴───────────────────────┴────────────────┴──────────┴──────────┴─────────────┴──────────────────────┴─────────────────────┴────────────────────────┴────────────────────────┴──────────────────┴────────────────────────────┴─────────────────────────┴─────────────────┴──────────────────────────────┴─────────────────────┴───────────────────────┴────────────────────────────────┴───────────────────────────────┴────────────────────────┴────────────────────────┴──────────────────────┴────────────────┴────────────────────┴────────────────────┴──────────────────────┴─────────────────────┴─────────────────────────┴──────────────────┴─────────────────┴──────────────────┴──────────────────┘\n",
+       "
\n" ], - "source": [ - "# train preprocessing recipe using training dataset\n", - "last_mile_preprocessing.fit(X_train, y_train)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In the previous cell, we trained the recipe using the training dataset. Now, we will\n", - "transform both the train and test datasets using the same recipe. The default output format is a `NumPy array`" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "train data shape = (1145346, 980)\n", - "test data shape = (381313, 980)\n" - ] - } + "text/plain": [ + "┏━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┓\n", + "┃\u001b[1m \u001b[0m\u001b[1mnum_group1\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcase_id\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mamount_1115A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mclassificationofcontr_1114M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcontractdate_551D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m 
\u001b[0m\u001b[1mcontractmaturitydate_151D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcontractst_516M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcontracttype_653M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcredlmt_1052A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcredlmt_228A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcredlmt_3940954A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcredor_3940957M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcredquantity_1099L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcredquantity_984L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdebtpastduevalue_732A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdebtvalue_227A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdpd_550P\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdpd_733P\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdpdmax_851P\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdpdmaxdatemonth_804T\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdpdmaxdateyear_742T\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1minstallmentamount_644A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1minstallmentamount_833A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1minstlamount_892A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1minteresteffectiverate_369L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1minterestrateyearly_538L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mlastupdate_260D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmaxdebtpduevalodued_3940955A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mnumberofinstls_810L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1moverdueamountmax_950A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1moverdueamountmaxdatemonth_494T\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m 
\u001b[0m\u001b[1moverdueamountmaxdateyear_432T\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mperiodicityofpmts_997L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mperiodicityofpmts_997M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpmtdaysoverdue_1135P\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpmtmethod_731M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpmtnumpending_403L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpurposeofcred_722M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mresidualamount_1093A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mresidualamount_127A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mresidualamount_3940956A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1msubjectrole_326M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1msubjectrole_43M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mtotalamount_503A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mtotalamount_881A\u001b[0m\u001b[1m \u001b[0m┃\n", + 
"┡━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━┩\n", + "│ \u001b[2mint64\u001b[0m │ \u001b[2mint64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ 
\u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │\n", + "├────────────┼─────────┼──────────────┼─────────────────────────────┼───────────────────┼───────────────────────────┼─────────────────┼───────────────────┼───────────────┼──────────────┼──────────────────┼─────────────────┼────────────────────┼───────────────────┼───────────────────────┼────────────────┼──────────┼──────────┼─────────────┼──────────────────────┼─────────────────────┼────────────────────────┼────────────────────────┼──────────────────┼────────────────────────────┼─────────────────────────┼─────────────────┼──────────────────────────────┼─────────────────────┼───────────────────────┼────────────────────────────────┼───────────────────────────────┼────────────────────────┼────────────────────────┼──────────────────────┼────────────────┼────────────────────┼────────────────────┼──────────────────────┼─────────────────────┼─────────────────────────┼──────────────────┼─────────────────┼──────────────────┼──────────────────┤\n", + "│ \u001b[1;36m0\u001b[0m │ \u001b[1;36m467\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32mea6782cc \u001b[0m │ \u001b[32m2011-06-15 \u001b[0m │ \u001b[32m2031-06-13 \u001b[0m │ \u001b[32m7241344e \u001b[0m │ \u001b[32m724be82a \u001b[0m │ \u001b[1;36m3.000000e+06\u001b[0m │ \u001b[1;36m10000.0\u001b[0m │ \u001b[1;36m3.000000e+06\u001b[0m │ \u001b[32mP164_34_168 \u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.000\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m2019-01-20 \u001b[0m │ \u001b[2mNULL\u001b[0m │ 
\u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m96a8fdfe \u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32mfa4f56f1 \u001b[0m │ \u001b[32mab3c25cf \u001b[0m │ \u001b[1;36m3.000000e+06\u001b[0m │ \u001b[1;36m10000.0\u001b[0m │\n", + "│ \u001b[1;36m1\u001b[0m │ \u001b[1;36m467\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32mea6782cc \u001b[0m │ \u001b[32m2019-01-04 \u001b[0m │ \u001b[32m2021-08-04 \u001b[0m │ \u001b[32m7241344e \u001b[0m │ \u001b[32m724be82a \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m1.303650e+05\u001b[0m │ \u001b[32mP164_34_168 \u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m26571.969\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m2019-01-20 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m96a8fdfe \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32mab3c25cf \u001b[0m │ \u001b[32mab3c25cf \u001b[0m │ \u001b[1;36m7.800000e+04\u001b[0m │ \u001b[1;36m960000.0\u001b[0m │\n", + "│ \u001b[1;36m2\u001b[0m │ \u001b[1;36m467\u001b[0m │ \u001b[1;36m78000.0\u001b[0m │ \u001b[32mea6782cc \u001b[0m │ \u001b[32m2016-10-25 \u001b[0m │ \u001b[32m2019-10-25 \u001b[0m │ \u001b[32m7241344e \u001b[0m │ \u001b[32m4257cbed \u001b[0m │ 
\u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32mc5a72b57 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m26571.969\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m11.0\u001b[0m │ \u001b[1;36m2016.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m2898.76\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m2019-01-10 \u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m36.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m11.0\u001b[0m │ \u001b[1;36m2016.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma0b598e4 \u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[32me914c86c \u001b[0m │ \u001b[1;36m10.0\u001b[0m │ \u001b[32m96a8fdfe \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │\n", + "│ \u001b[1;36m0\u001b[0m │ \u001b[1;36m1445\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32mea6782cc \u001b[0m │ \u001b[32m2015-01-30 \u001b[0m │ \u001b[32m2021-01-30 \u001b[0m │ \u001b[32m7241344e \u001b[0m │ \u001b[32m1c9c5356 \u001b[0m │ \u001b[1;36m4.000000e+05\u001b[0m │ \u001b[1;36m100000.0\u001b[0m │ \u001b[1;36m7.400000e+04\u001b[0m │ \u001b[32mb619fa46 \u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[1;36m5.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m200418.0\u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[1;36m2018.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.000\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m2019-01-19 \u001b[0m │ \u001b[1;36m0.4\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m1.4\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[1;36m2018.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m 
│ \u001b[1;36m0.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m60c73645 \u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m73044.18\u001b[0m │ \u001b[32mdaf49a8a \u001b[0m │ \u001b[32mab3c25cf \u001b[0m │ \u001b[1;36m4.000000e+05\u001b[0m │ \u001b[1;36m100000.0\u001b[0m │\n", + "│ \u001b[1;36m1\u001b[0m │ \u001b[1;36m1445\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m01f63ac8 \u001b[0m │ \u001b[32m2014-09-12 \u001b[0m │ \u001b[32m2021-09-12 \u001b[0m │ \u001b[32m7241344e \u001b[0m │ \u001b[32m724be82a \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m4.000000e+05\u001b[0m │ \u001b[32m74bd67a8 \u001b[0m │ \u001b[1;36m3.0\u001b[0m │ \u001b[1;36m17.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m209617.770\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m2019-01-13 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m96a8fdfe \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32mab3c25cf \u001b[0m │ \u001b[32mab3c25cf \u001b[0m │ \u001b[1;36m3.968006e+05\u001b[0m │ \u001b[1;36m184587.8\u001b[0m │\n", + 
"└────────────┴─────────┴──────────────┴─────────────────────────────┴───────────────────┴───────────────────────────┴─────────────────┴───────────────────┴───────────────┴──────────────┴──────────────────┴─────────────────┴────────────────────┴───────────────────┴───────────────────────┴────────────────┴──────────┴──────────┴─────────────┴──────────────────────┴─────────────────────┴────────────────────────┴────────────────────────┴──────────────────┴────────────────────────────┴─────────────────────────┴─────────────────┴──────────────────────────────┴─────────────────────┴───────────────────────┴────────────────────────────────┴───────────────────────────────┴────────────────────────┴────────────────────────┴──────────────────────┴────────────────┴────────────────────┴────────────────────┴──────────────────────┴─────────────────────┴─────────────────────────┴──────────────────┴─────────────────┴──────────────────┴──────────────────┘" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ibis.read_parquet(TRAIN_DIR / \"train_credit_bureau_b_1.parquet\").relocate(\n", + " \"num_group1\"\n", + ").order_by([\"case_id\", \"num_group1\"]).head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For more details on features and its exploratory data analysis (EDA), you can refer to\n", + "feature definition and these Kaggle notebooks:\n", + "\n", + "* [Feature\n", + " definition](https://www.kaggle.com/competitions/home-credit-credit-risk-model-stability/data#:~:text=calendar_view_week-,feature_definitions,-.csv)\n", + "* [Home credit risk prediction\n", + " EDA](https://www.kaggle.com/code/loki97/home-credit-risk-prediction-eda)\n", + "* [Home credit CRMS 2024\n", + " EDA](https://www.kaggle.com/code/sergiosaharovskiy/home-credit-crms-2024-eda-and-submission)\n", + "\n", + "### Data loading and processing\n", + "We will perform the following data processing steps using Ibis and 
IbisML:\n", + "\n", + "* **Convert data types**: Ensure consistency by converting data types, as the same column\n", + " in different sub-files may have different types.\n", + "* **Aggregate features**: For tables with depth greater than 0, aggregate features based\n", + " on `case_id`, including statistics calculation. You can collect statistics such as mean,\n", + " median, mode, minimum, standard deviation, and others.\n", + "* **Union and join datasets**: Combine multiple sub-files of the same dataset into one\n", + " table, as some datasets are split into multiple sub-files with a common prefix. Afterward,\n", + " join these tables with the base table.\n", + "\n", + "#### Convert data types\n", + "We'll use IbisML to create a chain of `Cast` steps, forming a recipe for data type\n", + "conversion across the dataset. This conversion is based on the provided information\n", + "extracted from column names. Columns that have similar transformations are indicated by a\n", + "capital letter at the end of their names:\n", + "\n", + "* P - Transform DPD (Days past due)\n", + "* M - Masking categories\n", + "* A - Transform amount\n", + "* D - Transform date\n", + "* T - Unspecified Transform\n", + "* L - Unspecified Transform\n", + "\n", + "For example, we'll define an IbisML transformation step to convert columns ending with `P`\n", + "to floating-point numbers:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# convert columns ends with P to floating number\n", + "step_cast_P_to_float = ml.Cast(ml.endswith(\"P\"), dt.float64)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, let's define additional type conversion transformations based on the suffix of column names:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# convert columns ends with A to floating number\n", + "step_cast_A_to_float = 
ml.Cast(ml.endswith(\"A\"), dt.float64)\n", + "# convert columns ends with D to date\n", + "step_cast_D_to_date = ml.Cast(ml.endswith(\"D\"), dt.date)\n", + "# convert columns ends with M to str\n", + "step_cast_M_to_str = ml.Cast(ml.endswith(\"M\"), dt.str)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We'll construct the\n", + "[IbisML Recipe](https://ibis-project.github.io/ibis-ml/reference/core.html#ibis_ml.Recipe)\n", + "which chains together all the transformation steps." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Data format conversion recipe:\n", + "Recipe(Cast(endswith('P'), 'float64'),\n", + " Cast(endswith('D'), 'date'),\n", + " Cast(endswith('M'), 'string'),\n", + " Cast(endswith('A'), 'float64'),\n", + " Cast(cols(('date_decision',)), 'date'),\n", + " Cast(cols(('case_id', 'WEEK_NUM', 'num_group1', 'num_group2')), 'int64'),\n", + " Cast(cols(('cardtype_51L', 'credacc_status_367L', 'requesttype_4525192L', 'riskassesment_302T', 'max_periodicityofpmts_997L')),\n", + " 'string'),\n", + " Cast(cols(('isbidproductrequest_292L', 'isdebitcard_527L', 'equalityempfrom_62L')),\n", + " 'int64'))\n" + ] + } + ], + "source": [ + "data_type_recipes = ml.Recipe(\n", + " step_cast_P_to_float,\n", + " step_cast_D_to_date,\n", + " step_cast_M_to_str,\n", + " step_cast_A_to_float,\n", + " # cast some special columns\n", + " ml.Cast([\"date_decision\"], \"date\"),\n", + " ml.Cast([\"case_id\", \"WEEK_NUM\", \"num_group1\", \"num_group2\"], dt.int64),\n", + " ml.Cast(\n", + " [\n", + " \"cardtype_51L\",\n", + " \"credacc_status_367L\",\n", + " \"requesttype_4525192L\",\n", + " \"riskassesment_302T\",\n", + " \"max_periodicityofpmts_997L\",\n", + " ],\n", + " dt.str,\n", + " ),\n", + " ml.Cast(\n", + " [\"isbidproductrequest_292L\", \"isdebitcard_527L\", \"equalityempfrom_62L\"],\n", + " dt.int64,\n", + " ),\n", + ")\n", + 
"print(f\"Data format conversion recipe:\\n{data_type_recipes}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "IbisML offers a powerful set of column selectors, allowing you to select columns based\n", + "on names, types, and patterns. For more information, you can refer to the IbisML column\n", + "selectors [documentation](https://ibis-project.github.io/ibis-ml/reference/selectors.html).\n", + "\n", + "\n", + "#### Aggregate features\n", + "For tables with a depth greater than 0 that can't be directly joined with the base table,\n", + "we need to aggregate the features by the `case_id`. You can compute different statistics for numeric columns and\n", + "non-numeric columns.\n", + "\n", + "Here, we use the `maximum` as an example." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "def agg_by_id(table):\n", + " return table.group_by(\"case_id\").agg(\n", + " [\n", + " table[col_name].max().name(f\"max_{col_name}\")\n", + " for col_name in table.columns\n", + " if col_name[-1] in (\"T\", \"L\", \"P\", \"A\", \"D\", \"M\")\n", + " ]\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "For better predictive power, you need to collect different statistics based on the meaning of features. For simplicity,\n", + "we'll only collect the maximum value of the features here.\n", + "\n", + "\n", + "#### Put them together\n", + "We'll put them together in a function that reads Parquet files, optionally handles regex patterns for\n", + "multiple sub-files, applies data type transformations defined by `data_type_recipes`, and\n", + "performs aggregation based on `case_id` if specified by the depth parameter."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def read_and_process_files(file_path, depth=None, is_regex=False):\n", + " \"\"\"\n", + " Read and process Parquet files.\n", + "\n", + " Args:\n", + " file_path (str): Path to the file or regex pattern to match files.\n", + " depth (int, optional): Depth of processing. If 1 or 2, additional\n", + " aggregation is performed.\n", + " is_regex (bool, optional): Whether the file_path is a regex pattern.\n", + "\n", + " Returns:\n", + " ibis.Table: The processed Ibis table.\n", + " \"\"\"\n", + " if is_regex:\n", + " # read and union multiple files\n", + " chunks = []\n", + " for path in Path(file_path).glob(\"*\"):\n", + " chunk = ibis.read_parquet(path)\n", + " # transform table using IbisML Recipe\n", + " chunk = data_type_recipes.fit(chunk).to_ibis(chunk)\n", + " chunks.append(chunk)\n", + " table = ibis.union(*chunks)\n", + " else:\n", + " # read a single file\n", + " table = ibis.read_parquet(file_path)\n", + " # transform table using IbisML\n", + " table = data_type_recipes.fit(table).to_ibis(table)\n", + "\n", + " # perform aggregation if depth is 1 or 2\n", + " if depth in [1, 2]:\n", + " table = agg_by_id(table)\n", + "\n", + " return table" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's define two dictionaries, `train_data_store` and `test_data_store`, that organize and\n", + "store the processed training and testing datasets."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_data_store = {\n", + " \"df_base\": read_and_process_files(TRAIN_DIR / \"train_base.parquet\"),\n", + " \"depth_0\": [\n", + " read_and_process_files(TRAIN_DIR / \"train_static_cb_0.parquet\"),\n", + " read_and_process_files(TRAIN_DIR / \"train_static_0_*.parquet\", is_regex=True),\n", + " ],\n", + " \"depth_1\": [\n", + " read_and_process_files(\n", + " TRAIN_DIR / \"train_applprev_1_*.parquet\", 1, is_regex=True\n", + " ),\n", + " read_and_process_files(TRAIN_DIR / \"train_tax_registry_a_1.parquet\", 1),\n", + " read_and_process_files(TRAIN_DIR / \"train_tax_registry_b_1.parquet\", 1),\n", + " read_and_process_files(TRAIN_DIR / \"train_tax_registry_c_1.parquet\", 1),\n", + " read_and_process_files(TRAIN_DIR / \"train_credit_bureau_b_1.parquet\", 1),\n", + " read_and_process_files(TRAIN_DIR / \"train_other_1.parquet\", 1),\n", + " read_and_process_files(TRAIN_DIR / \"train_person_1.parquet\", 1),\n", + " read_and_process_files(TRAIN_DIR / \"train_deposit_1.parquet\", 1),\n", + " read_and_process_files(TRAIN_DIR / \"train_debitcard_1.parquet\", 1),\n", + " ],\n", + " \"depth_2\": [\n", + " read_and_process_files(TRAIN_DIR / \"train_credit_bureau_b_2.parquet\", 2)\n", + " ],\n", + "}\n", + "# we won't be submitting the predictions, so let's comment out the test data.\n", + "# test_data_store = {\n", + "# \"df_base\": read_and_process_files(TEST_DIR / \"test_base.parquet\"),\n", + "# \"depth_0\": [\n", + "# read_and_process_files(TEST_DIR / \"test_static_cb_0.parquet\"),\n", + "# read_and_process_files(TEST_DIR / \"test_static_0_*.parquet\", is_regex=True),\n", + "# ],\n", + "# \"depth_1\": [\n", + "# read_and_process_files(TEST_DIR / \"test_applprev_1_*.parquet\", 1, is_regex=True),\n", + "# read_and_process_files(TEST_DIR / \"test_tax_registry_a_1.parquet\", 1),\n", + "# read_and_process_files(TEST_DIR / \"test_tax_registry_b_1.parquet\", 1),\n", 
+ "# read_and_process_files(TEST_DIR / \"test_tax_registry_c_1.parquet\", 1),\n", + "# read_and_process_files(TEST_DIR / \"test_credit_bureau_b_1.parquet\", 1),\n", + "# read_and_process_files(TEST_DIR / \"test_other_1.parquet\", 1),\n", + "# read_and_process_files(TEST_DIR / \"test_person_1.parquet\", 1),\n", + "# read_and_process_files(TEST_DIR / \"test_deposit_1.parquet\", 1),\n", + "# read_and_process_files(TEST_DIR / \"test_debitcard_1.parquet\", 1),\n", + "# ],\n", + "# \"depth_2\": [\n", + "# read_and_process_files(TEST_DIR / \"test_credit_bureau_b_2.parquet\", 2),\n", + "# ]\n", + "# }" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Join all features data to base table:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "def join_data(df_base, depth_0, depth_1, depth_2):\n", + " for i, df in enumerate(depth_0 + depth_1 + depth_2):\n", + " df_base = df_base.join(\n", + " df, \"case_id\", how=\"left\", rname=\"{name}_right\" + f\"_{i}\"\n", + " )\n", + " return df_base" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Generate train and test datasets:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "There is 1526659 rows and 377 columns\n" + ] + } + ], + "source": [ + "df_train = join_data(**train_data_store)\n", + "# df_test = join_data(**test_data_store)\n", + "total_rows = df_train.count().execute()\n", + "print(f\"There is {total_rows} rows and {len(df_train.columns)} columns\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Select features\n", + "Given the large number of features (~370), we'll focus on selecting just a few of the most\n", + "informative ones by name for demonstration purposes in this post:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": 
[ + { + "data": { + "text/html": [ + "
┏━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┓\n",
+       "┃ case_id  date_decision  target  days30_165L  days360_512L  days90_310L  pmtscount_423L  pmtssum_45A  dateofbirth_337D  education_1103M  firstquarter_103L  secondquarter_766L  thirdquarter_1082L  fourthquarter_440L  maritalst_893M  numberofqueries_373L  requesttype_4525192L  responsedate_4527233D  actualdpdtolerance_344P  amtinstpaidbefduel24m_4187115A  annuity_780A  annuitynextmonth_57A  applicationcnt_361L  applications30d_658L  applicationscnt_1086L  avgdbddpdlast24m_3658932P  avgdbddpdlast3m_4187120P  max_contractmaturitydate_151D  max_credlmt_1052A  max_credquantity_1099L  max_dpdmaxdatemonth_804T  max_dpdmaxdateyear_742T  max_maxdebtpduevalodued_3940955A  max_overdueamountmax_950A  max_purposeofcred_722M  max_residualamount_3940956A  max_totalamount_503A  max_cancelreason_3545846M  max_childnum_21L  max_currdebt_94A  max_employedfrom_700D  max_mainoccupationinc_437A  max_profession_152M  max_rejectreason_755M  max_status_219L  max_amount_1115A  max_debtpastduevalue_732A  max_debtvalue_227A  max_installmentamount_833A  max_instlamount_892A  max_numberofinstls_810L  max_pmtnumpending_403L  max_last180dayaveragebalance_704A  max_last30dayturnover_651A  max_openingdate_857D  max_amount_416A  max_amtdebitincoming_4809443A  max_amtdebitoutgoing_4809440A  max_amtdepositbalance_4809441A  max_amtdepositincoming_4809444A  max_amtdepositoutgoing_4809442A  max_empl_industry_691L  max_gender_992L  max_housingtype_772L  max_mainoccupationinc_384A  max_incometype_1044T    ┃\n",
+       "┡━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━┩\n",
+       "│ int64dateint64float64float64float64float64float64datestringfloat64float64float64float64stringfloat64stringdatefloat64float64float64float64float64float64float64float64float64datefloat64float64float64float64float64float64stringfloat64float64stringfloat64float64datefloat64stringstringstringfloat64float64float64float64float64float64float64float64float64datefloat64float64float64float64float64float64stringstringstringfloat64string                  │\n",
+       "├─────────┼───────────────┼────────┼─────────────┼──────────────┼─────────────┼────────────────┼─────────────┼──────────────────┼─────────────────┼───────────────────┼────────────────────┼────────────────────┼────────────────────┼────────────────┼──────────────────────┼──────────────────────┼───────────────────────┼─────────────────────────┼────────────────────────────────┼──────────────┼──────────────────────┼─────────────────────┼──────────────────────┼───────────────────────┼───────────────────────────┼──────────────────────────┼───────────────────────────────┼───────────────────┼────────────────────────┼──────────────────────────┼─────────────────────────┼──────────────────────────────────┼───────────────────────────┼────────────────────────┼─────────────────────────────┼──────────────────────┼───────────────────────────┼──────────────────┼──────────────────┼───────────────────────┼────────────────────────────┼─────────────────────┼───────────────────────┼─────────────────┼──────────────────┼───────────────────────────┼────────────────────┼────────────────────────────┼──────────────────────┼─────────────────────────┼────────────────────────┼───────────────────────────────────┼────────────────────────────┼──────────────────────┼─────────────────┼───────────────────────────────┼───────────────────────────────┼────────────────────────────────┼─────────────────────────────────┼─────────────────────────────────┼────────────────────────┼─────────────────┼──────────────────────┼────────────────────────────┼─────────────────────────┤\n",
+       "│ 19159072020-09-0200.04.00.0NULLNULL1965-03-01a55475b1       5.02.01.03.0a55475b1      4.0NULLNULL0.039089.6003740.64886.20000.00.00.0-3.0-6.0NULLNULLNULLNULLNULLNULLNULLNULLNULLNULLa55475b1                 0.055290.2502006-09-15120000.0a55475b1           a55475b1             D              NULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULL22000.022000.00.00.00.0NULLNULLNULL60000.0EMPLOYED                │\n",
+       "│ 19165722020-09-0301.06.02.0NULLNULL1985-01-01a55475b1       2.02.01.02.0a55475b1      6.0NULLNULL0.0110432.0002400.07555.80030.00.00.0-5.0-10.0NULLNULLNULLNULLNULLNULLNULLNULLNULLNULLa55475b1                 0.045862.9342007-04-15194000.0a55475b1           a55475b1             T              NULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULL13353.413333.40.00.00.0NULLNULLNULL28000.0PRIVATE_SECTOR_EMPLOYEE │\n",
+       "│ 19167442020-09-0300.03.02.0NULLNULL1974-04-016b2ae0fa       5.09.07.05.0a55475b1      3.0NULLNULL0.086690.2004333.24199.80030.00.00.0-1.00.0NULLNULLNULLNULLNULLNULLNULLNULLNULLNULLa55475b1                 2.041992.0002007-03-15100000.0a55475b1           a55475b1             K              NULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULL0.00.081909.40.07152.0NULLNULLNULL100000.0SALARIED_GOVT           │\n",
+       "│ 19172122020-09-0300.02.00.0NULLNULL1981-10-01a55475b1       1.02.06.02.0a55475b1      2.0NULLNULL0.0160111.3301864.610964.00000.00.00.0-6.0-10.0NULLNULLNULLNULLNULLNULLNULLNULLNULLNULLa55475b1                 3.019254.0002000-01-1560000.0a55475b1           a55475b1             K              NULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULL2685.82660.0206.20.068.8NULLNULLNULL18000.0EMPLOYED                │\n",
+       "│ 19175522020-09-0300.01.00.0NULLNULL1984-12-01a55475b1       0.01.00.02.0a55475b1      1.0NULLNULL0.089029.8053788.02962.60000.00.00.0-33.0-6.0NULLNULLNULLNULLNULLNULLNULLNULLNULLNULLa55475b1                 0.010627.9372017-10-2647000.0a55475b1           a55475b1             K              NULLNULLNULLNULLNULLNULLNULLNULLNULLNULLNULL0.00.00.00.00.0NULLNULLNULL20000.0SALARIED_GOVT           │\n",
+       "└─────────┴───────────────┴────────┴─────────────┴──────────────┴─────────────┴────────────────┴─────────────┴──────────────────┴─────────────────┴───────────────────┴────────────────────┴────────────────────┴────────────────────┴────────────────┴──────────────────────┴──────────────────────┴───────────────────────┴─────────────────────────┴────────────────────────────────┴──────────────┴──────────────────────┴─────────────────────┴──────────────────────┴───────────────────────┴───────────────────────────┴──────────────────────────┴───────────────────────────────┴───────────────────┴────────────────────────┴──────────────────────────┴─────────────────────────┴──────────────────────────────────┴───────────────────────────┴────────────────────────┴─────────────────────────────┴──────────────────────┴───────────────────────────┴──────────────────┴──────────────────┴───────────────────────┴────────────────────────────┴─────────────────────┴───────────────────────┴─────────────────┴──────────────────┴───────────────────────────┴────────────────────┴────────────────────────────┴──────────────────────┴─────────────────────────┴────────────────────────┴───────────────────────────────────┴────────────────────────────┴──────────────────────┴─────────────────┴───────────────────────────────┴───────────────────────────────┴────────────────────────────────┴─────────────────────────────────┴─────────────────────────────────┴────────────────────────┴─────────────────┴──────────────────────┴────────────────────────────┴─────────────────────────┘\n",
+       "
\n" ], - "source": [ - "# transform train and test dataset using IbisML recipe\n", - "X_train_transformed = last_mile_preprocessing.transform(X_train)\n", - "X_test_transformed = last_mile_preprocessing.transform(X_test)\n", - "print(f\"train data shape = {X_train_transformed.shape}\")\n", - "print(f\"test data shape = {X_test_transformed.shape}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's define a neural network classifier using PyTorch and PyTorch Lighting:" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import torch\n", - "import torch.nn as nn\n", - "import torch.optim as optim\n", - "from torch.utils.data import DataLoader, TensorDataset\n", - "import pytorch_lightning as pl\n", - "from pytorch_lightning import Trainer\n", - "\n", - "\n", - "class NeuralNetClassifier(pl.LightningModule):\n", - " def __init__(self, input_dim, hidden_dim=8, output_dim=1):\n", - " super().__init__()\n", - " self.model = nn.Sequential(\n", - " nn.Linear(input_dim, hidden_dim),\n", - " nn.ReLU(),\n", - " nn.Linear(hidden_dim, output_dim),\n", - " )\n", - " self.loss = nn.BCEWithLogitsLoss()\n", - " self.sigmoid = nn.Sigmoid()\n", - "\n", - " def forward(self, x):\n", - " return self.model(x)\n", - "\n", - " def training_step(self, batch, batch_idx):\n", - " x, y = batch\n", - " y_hat = self(x)\n", - " loss = self.loss(y_hat.view(-1), y)\n", - " self.log(\"train_loss\", loss)\n", - " return loss\n", - "\n", - " def validation_step(self, batch, batch_idx):\n", - " x, y = batch\n", - " y_hat = self(x)\n", - " loss = self.loss(y_hat.view(-1), y)\n", - " self.log(\"val_loss\", loss)\n", - " return loss\n", - "\n", - " def configure_optimizers(self):\n", - " return optim.Adam(self.parameters(), lr=0.001)\n", - "\n", - " def predict_proba(self, x):\n", - " self.eval()\n", - " with torch.no_grad():\n", - " x = x.to(self.device)\n", - " return 
self.sigmoid(self(x))\n", - "\n", - "# initialize your Lightning Module\n", - "nn_classifier = NeuralNetClassifier(input_dim=X_train_transformed.shape[1])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, we'll create the PyTorch DataLoader using the output from IbisML:" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [], - "source": [ - "y_train_array = y_train.to_pandas().to_numpy().astype(np.float32)\n", - "x_train_tensor = torch.from_numpy(X_train_transformed)\n", - "y_train_tensor = torch.from_numpy(y_train_array)\n", - "train_dataset = TensorDataset(x_train_tensor, y_train_tensor)\n", - "\n", - "y_test_array = y_test.to_pandas().to_numpy().astype(np.float32)\n", - "X_test_tensor = torch.from_numpy(X_test_transformed)\n", - "y_test_tensor = torch.from_numpy(y_test_array)\n", - "val_dataset = TensorDataset(X_test_tensor, y_test_tensor)\n", - "\n", - "train_loader = DataLoader(train_dataset, batch_size=32, num_workers=13, shuffle=False)\n", - "val_loader = DataLoader(val_dataset, batch_size=32, num_workers=13, shuffle=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Initialize the PyTorch Lightning Trainer:" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "GPU available: True (mps), used: True\n", - "TPU available: False, using: 0 TPU cores\n", - "HPU available: False, using: 0 HPUs\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "NeuralNetClassifier(\n", - " (model): Sequential(\n", - " (0): Linear(in_features=980, out_features=8, bias=True)\n", - " (1): ReLU()\n", - " (2): Linear(in_features=8, out_features=1, bias=True)\n", - " )\n", - " (loss): BCEWithLogitsLoss()\n", - " (sigmoid): Sigmoid()\n", - ")\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - 
"/Users/jiting/anaconda3/envs/ibis-ml-dev/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default\n" - ] - } + "text/plain": [ + "┏━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━
━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┓\n", + "┃\u001b[1m \u001b[0m\u001b[1mcase_id\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdate_decision\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mtarget\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdays30_165L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdays360_512L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdays90_310L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpmtscount_423L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpmtssum_45A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdateofbirth_337D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1meducation_1103M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mfirstquarter_103L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1msecondquarter_766L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mthirdquarter_1082L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mfourthquarter_440L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmaritalst_893M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mnumberofqueries_373L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mrequesttype_4525192L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mresponsedate_4527233D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mactualdpdtolerance_344P\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mamtinstpaidbefduel24m_4187115A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mannuity_780A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mannuitynextmonth_57A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mapplicationcnt_361L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mapplications30d_658L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mapplicationscnt_1086L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m 
\u001b[0m\u001b[1mavgdbddpdlast24m_3658932P\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mavgdbddpdlast3m_4187120P\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_contractmaturitydate_151D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_credlmt_1052A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_credquantity_1099L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_dpdmaxdatemonth_804T\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_dpdmaxdateyear_742T\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_maxdebtpduevalodued_3940955A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_overdueamountmax_950A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_purposeofcred_722M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_residualamount_3940956A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_totalamount_503A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_cancelreason_3545846M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_childnum_21L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_currdebt_94A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_employedfrom_700D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_mainoccupationinc_437A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_profession_152M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_rejectreason_755M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_status_219L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_amount_1115A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_debtpastduevalue_732A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_debtvalue_227A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_installmentamount_833A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_instlamount_892A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m 
\u001b[0m\u001b[1mmax_numberofinstls_810L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_pmtnumpending_403L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_last180dayaveragebalance_704A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_last30dayturnover_651A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_openingdate_857D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_amount_416A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_amtdebitincoming_4809443A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_amtdebitoutgoing_4809440A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_amtdepositbalance_4809441A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_amtdepositincoming_4809444A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_amtdepositoutgoing_4809442A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_empl_industry_691L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_gender_992L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_housingtype_772L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_mainoccupationinc_384A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_incometype_1044T\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\n", + 
"┡━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━┩\n", + "│ \u001b[2mint64\u001b[0m │ \u001b[2mdate\u001b[0m │ \u001b[2mint64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mdate\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m 
│ \u001b[2mstring\u001b[0m │ \u001b[2mdate\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mdate\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mdate\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mdate\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │\n", + 
"├─────────┼───────────────┼────────┼─────────────┼──────────────┼─────────────┼────────────────┼─────────────┼──────────────────┼─────────────────┼───────────────────┼────────────────────┼────────────────────┼────────────────────┼────────────────┼──────────────────────┼──────────────────────┼───────────────────────┼─────────────────────────┼────────────────────────────────┼──────────────┼──────────────────────┼─────────────────────┼──────────────────────┼───────────────────────┼───────────────────────────┼──────────────────────────┼───────────────────────────────┼───────────────────┼────────────────────────┼──────────────────────────┼─────────────────────────┼──────────────────────────────────┼───────────────────────────┼────────────────────────┼─────────────────────────────┼──────────────────────┼───────────────────────────┼──────────────────┼──────────────────┼───────────────────────┼────────────────────────────┼─────────────────────┼───────────────────────┼─────────────────┼──────────────────┼───────────────────────────┼────────────────────┼────────────────────────────┼──────────────────────┼─────────────────────────┼────────────────────────┼───────────────────────────────────┼────────────────────────────┼──────────────────────┼─────────────────┼───────────────────────────────┼───────────────────────────────┼────────────────────────────────┼─────────────────────────────────┼─────────────────────────────────┼────────────────────────┼─────────────────┼──────────────────────┼────────────────────────────┼─────────────────────────┤\n", + "│ \u001b[1;36m1915907\u001b[0m │ \u001b[35m2020-09-02\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m4.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[35m1965-03-01\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m5.0\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[1;36m3.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ 
\u001b[1;36m4.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m39089.600\u001b[0m │ \u001b[1;36m3740.6\u001b[0m │ \u001b[1;36m4886.2000\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m-3.0\u001b[0m │ \u001b[1;36m-6.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m55290.250\u001b[0m │ \u001b[35m2006-09-15\u001b[0m │ \u001b[1;36m120000.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32mD \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m22000.0\u001b[0m │ \u001b[1;36m22000.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m60000.0\u001b[0m │ \u001b[32mEMPLOYED \u001b[0m │\n", + "│ \u001b[1;36m1916572\u001b[0m │ \u001b[35m2020-09-03\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[1;36m6.0\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[35m1985-01-01\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m6.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m110432.000\u001b[0m │ \u001b[1;36m2400.0\u001b[0m │ 
\u001b[1;36m7555.8003\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m-5.0\u001b[0m │ \u001b[1;36m-10.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m45862.934\u001b[0m │ \u001b[35m2007-04-15\u001b[0m │ \u001b[1;36m194000.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32mT \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m13353.4\u001b[0m │ \u001b[1;36m13333.4\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m28000.0\u001b[0m │ \u001b[32mPRIVATE_SECTOR_EMPLOYEE\u001b[0m │\n", + "│ \u001b[1;36m1916744\u001b[0m │ \u001b[35m2020-09-03\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m3.0\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[35m1974-04-01\u001b[0m │ \u001b[32m6b2ae0fa \u001b[0m │ \u001b[1;36m5.0\u001b[0m │ \u001b[1;36m9.0\u001b[0m │ \u001b[1;36m7.0\u001b[0m │ \u001b[1;36m5.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m3.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m86690.200\u001b[0m │ \u001b[1;36m4333.2\u001b[0m │ \u001b[1;36m4199.8003\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m-1.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ 
\u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[1;36m41992.000\u001b[0m │ \u001b[35m2007-03-15\u001b[0m │ \u001b[1;36m100000.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32mK \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m81909.4\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m7152.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m100000.0\u001b[0m │ \u001b[32mSALARIED_GOVT \u001b[0m │\n", + "│ \u001b[1;36m1917212\u001b[0m │ \u001b[35m2020-09-03\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[35m1981-10-01\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[1;36m6.0\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m160111.330\u001b[0m │ \u001b[1;36m1864.6\u001b[0m │ \u001b[1;36m10964.0000\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m-6.0\u001b[0m │ \u001b[1;36m-10.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ 
\u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m3.0\u001b[0m │ \u001b[1;36m19254.000\u001b[0m │ \u001b[35m2000-01-15\u001b[0m │ \u001b[1;36m60000.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32mK \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m2685.8\u001b[0m │ \u001b[1;36m2660.0\u001b[0m │ \u001b[1;36m206.2\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m68.8\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m18000.0\u001b[0m │ \u001b[32mEMPLOYED \u001b[0m │\n", + "│ \u001b[1;36m1917552\u001b[0m │ \u001b[35m2020-09-03\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[35m1984-12-01\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m89029.805\u001b[0m │ \u001b[1;36m3788.0\u001b[0m │ \u001b[1;36m2962.6000\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m-33.0\u001b[0m │ \u001b[1;36m-6.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m10627.937\u001b[0m │ 
\u001b[35m2017-10-26\u001b[0m │ \u001b[1;36m47000.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32mK \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m20000.0\u001b[0m │ \u001b[32mSALARIED_GOVT \u001b[0m │\n", + "└─────────┴───────────────┴────────┴─────────────┴──────────────┴─────────────┴────────────────┴─────────────┴──────────────────┴─────────────────┴───────────────────┴────────────────────┴────────────────────┴────────────────────┴────────────────┴──────────────────────┴──────────────────────┴───────────────────────┴─────────────────────────┴────────────────────────────────┴──────────────┴──────────────────────┴─────────────────────┴──────────────────────┴───────────────────────┴───────────────────────────┴──────────────────────────┴───────────────────────────────┴───────────────────┴────────────────────────┴──────────────────────────┴─────────────────────────┴──────────────────────────────────┴───────────────────────────┴────────────────────────┴─────────────────────────────┴──────────────────────┴───────────────────────────┴──────────────────┴──────────────────┴───────────────────────┴────────────────────────────┴─────────────────────┴───────────────────────┴─────────────────┴──────────────────┴───────────────────────────┴────────────────────┴────────────────────────────┴──────────────────────┴─────────────────────────┴────────────────────────┴───────────────────────────────────┴────────────────────────────┴──────────────────────┴─────────────────┴───────────────────────
────────┴───────────────────────────────┴────────────────────────────────┴─────────────────────────────────┴─────────────────────────────────┴────────────────────────┴─────────────────┴──────────────────────┴────────────────────────────┴─────────────────────────┘" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_train = df_train.select(\n", + " \"case_id\",\n", + " \"date_decision\",\n", + " \"target\",\n", + " # number of credit bureau queries for the last X days.\n", + " \"days30_165L\",\n", + " \"days360_512L\",\n", + " \"days90_310L\",\n", + " # number of tax deduction payments\n", + " \"pmtscount_423L\",\n", + " # sum of tax deductions for the client\n", + " \"pmtssum_45A\",\n", + " \"dateofbirth_337D\",\n", + " \"education_1103M\",\n", + " \"firstquarter_103L\",\n", + " \"secondquarter_766L\",\n", + " \"thirdquarter_1082L\",\n", + " \"fourthquarter_440L\",\n", + " \"maritalst_893M\",\n", + " \"numberofqueries_373L\",\n", + " \"requesttype_4525192L\",\n", + " \"responsedate_4527233D\",\n", + " \"actualdpdtolerance_344P\",\n", + " \"amtinstpaidbefduel24m_4187115A\",\n", + " \"annuity_780A\",\n", + " \"annuitynextmonth_57A\",\n", + " \"applicationcnt_361L\",\n", + " \"applications30d_658L\",\n", + " \"applicationscnt_1086L\",\n", + " # average days past or before due of payment during the last 24 months.\n", + " \"avgdbddpdlast24m_3658932P\",\n", + " # average days past or before due of payment during the last 3 months.\n", + " \"avgdbddpdlast3m_4187120P\",\n", + " # end date of active contract.\n", + " \"max_contractmaturitydate_151D\",\n", + " # credit limit of an active loan.\n", + " \"max_credlmt_1052A\",\n", + " # number of credits in credit bureau\n", + " \"max_credquantity_1099L\",\n", + " \"max_dpdmaxdatemonth_804T\",\n", + " \"max_dpdmaxdateyear_742T\",\n", + " \"max_maxdebtpduevalodued_3940955A\",\n", + " \"max_overdueamountmax_950A\",\n", + " \"max_purposeofcred_722M\",\n", + " 
\"max_residualamount_3940956A\",\n", + " \"max_totalamount_503A\",\n", + " \"max_cancelreason_3545846M\",\n", + " \"max_childnum_21L\",\n", + " \"max_currdebt_94A\",\n", + " \"max_employedfrom_700D\",\n", + " # client's main income amount in their previous application\n", + " \"max_mainoccupationinc_437A\",\n", + " \"max_profession_152M\",\n", + " \"max_rejectreason_755M\",\n", + " \"max_status_219L\",\n", + " # credit amount of the active contract provided by the credit bureau\n", + " \"max_amount_1115A\",\n", + " # amount of unpaid debt for existing contracts\n", + " \"max_debtpastduevalue_732A\",\n", + " \"max_debtvalue_227A\",\n", + " \"max_installmentamount_833A\",\n", + " \"max_instlamount_892A\",\n", + " \"max_numberofinstls_810L\",\n", + " \"max_pmtnumpending_403L\",\n", + " \"max_last180dayaveragebalance_704A\",\n", + " \"max_last30dayturnover_651A\",\n", + " \"max_openingdate_857D\",\n", + " \"max_amount_416A\",\n", + " \"max_amtdebitincoming_4809443A\",\n", + " \"max_amtdebitoutgoing_4809440A\",\n", + " \"max_amtdepositbalance_4809441A\",\n", + " \"max_amtdepositincoming_4809444A\",\n", + " \"max_amtdepositoutgoing_4809442A\",\n", + " \"max_empl_industry_691L\",\n", + " \"max_gender_992L\",\n", + " \"max_housingtype_772L\",\n", + " \"max_mainoccupationinc_384A\",\n", + " \"max_incometype_1044T\",\n", + ")\n", + "\n", + "df_train.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Univariate analysis:" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
┏━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┓\n",
+       "┃ name             pos    type     count    nulls   unique   mode      mean          std            min      p25          p50           p75           max          ┃\n",
+       "┡━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━┩\n",
+       "│ stringint16stringint64int64int64stringfloat64float64float64float64float64float64float64      │\n",
+       "├─────────────────┼───────┼─────────┼─────────┼────────┼─────────┼──────────┼──────────────┼───────────────┼─────────┼─────────────┼──────────────┼──────────────┼──────────────┤\n",
+       "│ case_id        0int64  152665901526659NULL1.286077e+06718946.5922850.0766197.50001.357358e+061.739022e+062.703454e+06 │\n",
+       "│ target         2int64  152665902NULL3.143728e-020.1744960.00.00000.000000e+000.000000e+001.000000e+00 │\n",
+       "│ days30_165L    3float64152665914096822NULL5.177078e-010.8992380.00.00000.000000e+001.000000e+002.200000e+01 │\n",
+       "│ days360_512L   4float64152665914096892NULL4.777066e+005.1688560.01.00003.000000e+006.500000e+001.150000e+02 │\n",
+       "│ days90_310L    5float64152665914096837NULL1.211420e+001.6559310.00.00001.000000e+002.000000e+004.100000e+01 │\n",
+       "│ pmtscount_423L 6float64152665995402166NULL5.839291e+004.1482640.03.00006.000000e+007.000000e+001.210000e+02 │\n",
+       "│ pmtssum_45A    7float641526659954021265229NULL1.319994e+0418117.2183120.03156.40018.391900e+031.699200e+044.768434e+05 │\n",
+       "│ education_1103M9string 1526659261835a55475b1NULLNULLNULLNULLNULLNULLNULL │\n",
+       "└─────────────────┴───────┴─────────┴─────────┴────────┴─────────┴──────────┴──────────────┴───────────────┴─────────┴─────────────┴──────────────┴──────────────┴──────────────┘\n",
+       "
\n" ], - "source": [ - "# initialize a Trainer\n", - "trainer = Trainer(max_epochs=2)\n", - "print(nn_classifier)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's train the classifier:" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - " | Name | Type | Params | Mode \n", - "------------------------------------------------------\n", - "0 | model | Sequential | 7.9 K | train\n", - "1 | loss | BCEWithLogitsLoss | 0 | train\n", - "2 | sigmoid | Sigmoid | 0 | train\n", - "------------------------------------------------------\n", - "7.9 K Trainable params\n", - "0 Non-trainable params\n", - "7.9 K Total params\n", - "0.031 Total estimated model params size (MB)\n", - "6 Modules in train mode\n", - "0 Modules in eval mode\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "2cf7098fcd4c41f286c6059b3b170828", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Sanity Checking: | | 0/? 
[00:00\n", + "** y type = \n", + "self._final_estimator = XGBClassifier(base_score=None, booster=None, callbacks=None,\n", + " colsample_bylevel=None, colsample_bynode=None,\n", + " colsample_bytree=0.8, device=None, early_stopping_rounds=None,\n", + " enable_categorical=False, eval_metric=None, feature_types=None,\n", + " gamma=None, grow_policy=None, importance_type=None,\n", + " interaction_constraints=None, learning_rate=0.05, max_bin=None,\n", + " max_cat_threshold=None, max_cat_to_onehot=None,\n", + " max_delta_step=None, max_depth=5, max_leaves=None,\n", + " min_child_weight=None, missing=nan, monotone_constraints=None,\n", + " multi_strategy=None, n_estimators=100, n_jobs=None,\n", + " num_parallel_tree=None, random_state=42, ...)\n", + "last_step_params = {'fit': {}, 'partial_fit': {}, 'predict': {}, 'predict_proba': {}, 'predict_log_proba': {}, 'decision_function': {}, 'score': {}, 'split': {}, 'transform': {}, 'inverse_transform': {}, 'fit_transform': {}, 'fit_predict': {}}\n" + ] + }, + { + "data": { + "text/html": [ + "
Pipeline(steps=[('last_mile_recipes',\n",
+       "                 Recipe(ExpandDate(date(), components=['week', 'day']),\n",
+       "                        Drop(date()),\n",
+       "                        OneHotEncode(cols(('maritalst_893M', 'requesttype_4525192L', 'max_profession_152M', 'max_gender_992L', 'max_empl_industry_691L', 'max_housingtype_772L', 'max_incometype_1044T', 'max_cancelreason_3545846M', 'max_rejectreason_755M', 'education_1103M', 'max_sta...\n",
+       "                               feature_types=None, gamma=None, grow_policy=None,\n",
+       "                               importance_type=None,\n",
+       "                               interaction_constraints=None, learning_rate=0.05,\n",
+       "                               max_bin=None, max_cat_threshold=None,\n",
+       "                               max_cat_to_onehot=None, max_delta_step=None,\n",
+       "                               max_depth=5, max_leaves=None,\n",
+       "                               min_child_weight=None, missing=nan,\n",
+       "                               monotone_constraints=None, multi_strategy=None,\n",
+       "                               n_estimators=100, n_jobs=None,\n",
+       "                               num_parallel_tree=None, random_state=42, ...))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], - "source": [ - "# train the model\n", - "trainer.fit(nn_classifier, train_loader, val_loader)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's use the trained model to make a prediction:" + "text/plain": [ + "Pipeline(steps=[('last_mile_recipes',\n", + " Recipe(ExpandDate(date(), components=['week', 'day']),\n", + " Drop(date()),\n", + " OneHotEncode(cols(('maritalst_893M', 'requesttype_4525192L', 'max_profession_152M', 'max_gender_992L', 'max_empl_industry_691L', 'max_housingtype_772L', 'max_incometype_1044T', 'max_cancelreason_3545846M', 'max_rejectreason_755M', 'education_1103M', 'max_sta...\n", + " feature_types=None, gamma=None, grow_policy=None,\n", + " importance_type=None,\n", + " interaction_constraints=None, learning_rate=0.05,\n", + " max_bin=None, max_cat_threshold=None,\n", + " max_cat_to_onehot=None, max_delta_step=None,\n", + " max_depth=5, max_leaves=None,\n", + " min_child_weight=None, missing=nan,\n", + " monotone_constraints=None, multi_strategy=None,\n", + " n_estimators=100, n_jobs=None,\n", + " num_parallel_tree=None, random_state=42, ...))])" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.pipeline import Pipeline\n", + "from sklearn.metrics import roc_auc_score\n", + "import xgboost as xgb\n", + "\n", + "model = xgb.XGBClassifier(\n", + " n_estimators=100,\n", + " max_depth=5,\n", + " learning_rate=0.05,\n", + " subsample=0.8,\n", + " colsample_bytree=0.8,\n", + " random_state=42,\n", + ")\n", + "# create the pipeline with the last mile ML recipes and the model\n", + "pipe = Pipeline([(\"last_mile_recipes\", last_mile_preprocessing), (\"model\", model)])\n", + "# fit the pipeline on the training data\n", + "pipe.fit(X_train, y_train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's evaluate the model on the test data using Gini index:" + ] + }, + { + "cell_type": "code", + "execution_count": 25, 
+ "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "gini_score for test dataset: 0.06491440835995244\n" + ] + } + ], + "source": [ + "y_pred_proba = pipe.predict_proba(X_test)[:, 1]\n", + "# calculate the AUC score\n", + "auc = roc_auc_score(y_test, y_pred_proba)\n", + "\n", + "# calculate the Gini score\n", + "gini_score = 2 * auc - 1\n", + "print(f\"gini_score for test dataset: {gini_score:,}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "The competition is evaluated using a Gini stability metric. For more information, see the\n", + "[evaluation guidelines](https://www.kaggle.com/competitions/home-credit-credit-risk-model-stability/overview/evaluation)\n", + "\n", + "\n", + "### Neural network classifier\n", + "Build a neural network classifier using PyTorch and PyTorch Lightning.\n", + "\n", + "It is not recommended to build a neural network classifier for this competition, we are building\n", + "it solely for demonstration purposes.\n", + "\n", + "\n", + "We'll demonstrate how to build a model by directly passing the data to it. 
IbisML recipes can output\n", + "data in various formats, making it compatible with different modeling frameworks.\n", + "Let's first train the recipe:" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Recipe(ExpandDate(date(), components=['week', 'day']),\n", + " Drop(date()),\n", + " OneHotEncode(cols(('maritalst_893M', 'requesttype_4525192L', 'max_profession_152M', 'max_gender_992L', 'max_empl_industry_691L', 'max_housingtype_772L', 'max_incometype_1044T', 'max_cancelreason_3545846M', 'max_rejectreason_755M', 'education_1103M', 'max_status_219L'))),\n", + " Drop(string()),\n", + " HandleUnivariateOutliers(cols(('max_amount_1115A', 'max_overdueamountmax_950A')),\n", + " method='z-score',\n", + " deviation_factor=3,\n", + " treatment='capping'),\n", + " ImputeMedian(numeric()),\n", + " ScaleMinMax(numeric()),\n", + " FillNA(numeric(), 0),\n", + " Cast(numeric(), 'float32'))" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# train preprocessing recipe using training dataset\n", + "last_mile_preprocessing.fit(X_train, y_train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the previous cell, we trained the recipe using the training dataset. Now, we will\n", + "transform both the train and test datasets using the same recipe. 
The default output format is a `NumPy array`" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "train data shape = (1145346, 980)\n", + "test data shape = (381313, 980)\n" + ] + } + ], + "source": [ + "# transform train and test dataset using IbisML recipe\n", + "X_train_transformed = last_mile_preprocessing.transform(X_train)\n", + "X_test_transformed = last_mile_preprocessing.transform(X_test)\n", + "print(f\"train data shape = {X_train_transformed.shape}\")\n", + "print(f\"test data shape = {X_test_transformed.shape}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's define a neural network classifier using PyTorch and PyTorch Lightning:" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import torch\n", + "import torch.nn as nn\n", + "import torch.optim as optim\n", + "from torch.utils.data import DataLoader, TensorDataset\n", + "import pytorch_lightning as pl\n", + "from pytorch_lightning import Trainer\n", + "\n", + "\n", + "class NeuralNetClassifier(pl.LightningModule):\n", + " def __init__(self, input_dim, hidden_dim=8, output_dim=1):\n", + " super().__init__()\n", + " self.model = nn.Sequential(\n", + " nn.Linear(input_dim, hidden_dim),\n", + " nn.ReLU(),\n", + " nn.Linear(hidden_dim, output_dim),\n", + " )\n", + " self.loss = nn.BCEWithLogitsLoss()\n", + " self.sigmoid = nn.Sigmoid()\n", + "\n", + " def forward(self, x):\n", + " return self.model(x)\n", + "\n", + " def training_step(self, batch, batch_idx):\n", + " x, y = batch\n", + " y_hat = self(x)\n", + " loss = self.loss(y_hat.view(-1), y)\n", + " self.log(\"train_loss\", loss)\n", + " return loss\n", + "\n", + " def validation_step(self, batch, batch_idx):\n", + " x, y = batch\n", + " y_hat = self(x)\n", + " loss = self.loss(y_hat.view(-1), y)\n", + " 
self.log(\"val_loss\", loss)\n", + " return loss\n", + "\n", + " def configure_optimizers(self):\n", + " return optim.Adam(self.parameters(), lr=0.001)\n", + "\n", + " def predict_proba(self, x):\n", + " self.eval()\n", + " with torch.no_grad():\n", + " x = x.to(self.device)\n", + " return self.sigmoid(self(x))\n", + "\n", + "\n", + "# initialize your Lightning Module\n", + "nn_classifier = NeuralNetClassifier(input_dim=X_train_transformed.shape[1])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, we'll create the PyTorch DataLoader using the output from IbisML:" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "y_train_array = y_train.to_pandas().to_numpy().astype(np.float32)\n", + "x_train_tensor = torch.from_numpy(X_train_transformed)\n", + "y_train_tensor = torch.from_numpy(y_train_array)\n", + "train_dataset = TensorDataset(x_train_tensor, y_train_tensor)\n", + "\n", + "y_test_array = y_test.to_pandas().to_numpy().astype(np.float32)\n", + "X_test_tensor = torch.from_numpy(X_test_transformed)\n", + "y_test_tensor = torch.from_numpy(y_test_array)\n", + "val_dataset = TensorDataset(X_test_tensor, y_test_tensor)\n", + "\n", + "train_loader = DataLoader(train_dataset, batch_size=32, num_workers=13, shuffle=False)\n", + "val_loader = DataLoader(val_dataset, batch_size=32, num_workers=13, shuffle=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Initialize the PyTorch Lightning Trainer:" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "GPU available: True (mps), used: True\n", + "TPU available: False, using: 0 TPU cores\n", + "HPU available: False, using: 0 HPUs\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "NeuralNetClassifier(\n", + " (model): Sequential(\n", + " (0): Linear(in_features=980, 
out_features=8, bias=True)\n", + " (1): ReLU()\n", + " (2): Linear(in_features=8, out_features=1, bias=True)\n", + " )\n", + " (loss): BCEWithLogitsLoss()\n", + " (sigmoid): Sigmoid()\n", + ")\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/jiting/anaconda3/envs/ibis-ml-dev/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default\n" + ] + } + ], + "source": [ + "# initialize a Trainer\n", + "trainer = Trainer(max_epochs=2)\n", + "print(nn_classifier)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's train the classifier:" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + " | Name | Type | Params | Mode \n", + "------------------------------------------------------\n", + "0 | model | Sequential | 7.9 K | train\n", + "1 | loss | BCEWithLogitsLoss | 0 | train\n", + "2 | sigmoid | Sigmoid | 0 | train\n", + "------------------------------------------------------\n", + "7.9 K Trainable params\n", + "0 Non-trainable params\n", + "7.9 K Total params\n", + "0.031 Total estimated model params size (MB)\n", + "6 Modules in train mode\n", + "0 Modules in eval mode\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "2cf7098fcd4c41f286c6059b3b170828", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Sanity Checking: | | 0/? [00:00