From 7d2d02170024623ad047bd5684def004c83a4c97 Mon Sep 17 00:00:00 2001 From: Tyler White <50381805+IndexSeek@users.noreply.github.com> Date: Sun, 1 Dec 2024 12:48:04 -0500 Subject: [PATCH] style(ruff): clear ruff check violations (#175) --- .pre-commit-config.yaml | 2 +- .../Preprocess your data with recipes.ipynb | 6 +- ... and DuckDB for a Kaggle competition.ipynb | 4195 ++++++++--------- ibis_ml/core.py | 2 +- ibis_ml/select.py | 2 +- ibis_ml/steps/_discretize.py | 2 +- ibis_ml/utils/_pprint.py | 2 +- pyproject.toml | 1 + tests/test_core.py | 11 +- tests/test_encode.py | 4 +- tests/test_generate_features.py | 2 +- tests/test_impute.py | 2 +- tests/test_pprint.py | 4 +- 13 files changed, 2114 insertions(+), 2121 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5e2b423..3034b48 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,7 +4,7 @@ repos: hooks: - id: prettier - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.3.2 + rev: v0.8.0 hooks: - id: ruff args: [--fix] diff --git a/examples/Preprocess your data with recipes.ipynb b/examples/Preprocess your data with recipes.ipynb index 87393d2..1f560a6 100644 --- a/examples/Preprocess your data with recipes.ipynb +++ b/examples/Preprocess your data with recipes.ipynb @@ -490,7 +490,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "id": "dc04f24e-c8cb-4580-b502-a9410c64a126", "metadata": {}, "outputs": [], @@ -510,7 +510,7 @@ " from skorch import NeuralNetClassifier\n", "\n", " class MyModule(nn.Module):\n", - " def __init__(self, num_units=10, nonlin=nn.ReLU()):\n", + " def __init__(self, num_units=10, nonlin=nn.ReLU()): # noqa: B008\n", " super().__init__()\n", "\n", " self.dense0 = nn.Linear(10, num_units)\n", @@ -525,7 +525,7 @@ " X = self.dropout(X)\n", " X = self.nonlin(self.dense1(X))\n", " X = self.softmax(self.output(X))\n", - " return X\n", + " return X # noqa: RET504\n", "\n", " mod = NeuralNetClassifier(\n", 
" MyModule,\n", diff --git a/examples/Using IbisML and DuckDB for a Kaggle competition.ipynb b/examples/Using IbisML and DuckDB for a Kaggle competition.ipynb index c96dea6..d889805 100644 --- a/examples/Using IbisML and DuckDB for a Kaggle competition.ipynb +++ b/examples/Using IbisML and DuckDB for a Kaggle competition.ipynb @@ -1,2118 +1,2111 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Introduction\n", - "In this post, we'll demonstrate how to use Ibis and [IbisML](https://github.com/ibis-project/ibis-ml)\n", - "end-to-end for the\n", - "[credit risk model stability Kaggle competition](https://www.kaggle.com/competitions/home-credit-credit-risk-model-stability).\n", - "\n", - "1. Load data and perform feature engineering on DuckDB backend using IbisML\n", - "2. Perform last-mile ML data preprocessing on DuckDB backend using IbisML\n", - "3. Train two models using different frameworks:\n", - " * An XGBoost model within a scikit-learn pipeline.\n", - " * A neural network with PyTorch and PyTorch Lightning.\n", - "\n", - "The aim of this competition is to predict which clients are more likely to default on their\n", - "loans by using both internal and external data sources.\n", - "\n", - "To get started with Ibis and IbisML, please refer to the websites:\n", - "\n", - "* [Ibis](https://ibis-project.org/): An open-source dataframe library that works with any data system.\n", - "* [IbisML](https://ibis-project.github.io/ibis-ml/): A library for building scalable ML pipelines.\n", - "\n", - "\n", - "## Prerequisites\n", - "To run this example, you'll need to download the data from Kaggle website with a Kaggle user account and install Ibis, IbisML, and the necessary modeling library.\n", - "\n", - "### Download data\n", - "You need a Kaggle account to download the data. If you do not have one,\n", - "feel free to register one.\n", - "\n", - "1. 
Option 1: Manual download\n", - " * Log into your Kaggle account and download all data from this\n", - " [link](https://www.kaggle.com/competitions/home-credit-credit-risk-model-stability/data),\n", - " unzip the files, and save them to your local disk.\n", - "2. Option 2: Kaggle API\n", - " * Go to your `Kaggle Account Settings`.\n", - " * Under the `API` section, click on `Create New API Token`. This will download the `kaggle.json`\n", - " file to your computer.\n", - " * Place the `kaggle.json` file in the correct directory, normally it is under your home directory\n", - " `~/.kaggle`:\n", - "\n", - " ```bash\n", - " mkdir ~/.kaggle\n", - " mv ~/Downloads/kaggle.json ~/.kaggle\n", - " ```\n", - " * Install Kaggle CLI and download the data:\n", - "\n", - " ```bash\n", - " pip install kaggle\n", - " kaggle competitions download -c home-credit-credit-risk-model-stability\n", - " unzip home-credit-credit-risk-model-stability.zip\n", - " ```\n", - "\n", - "### Install libraries\n", - "To use Ibis and IbisML with the DuckDB backend for building models, you'll need to install the\n", - "necessary packages. 
Depending on your preferred machine learning framework, you can choose\n", - "one of the following installation commands:\n", - "\n", - "For PyTorch-based models:\n", - "\n", - "```bash\n", - "pip install 'ibis-framework[duckdb]' ibis-ml torch pytorch-lightning\n", - "```\n", - "\n", - "For XGBoost and scikit-learn-based models:\n", - "\n", - "```bash\n", - "pip install 'ibis-framework[duckdb]' ibis-ml xgboost[scikit-learn]\n", - "```\n", - "\n", - "Import libraries:" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import ibis\n", - "import ibis.expr.datatypes as dt\n", - "from ibis import _\n", - "import ibis_ml as ml\n", - "from pathlib import Path\n", - "from glob import glob\n", - "\n", - "# enable interactive mode for ibis\n", - "ibis.options.interactive = True" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Set the backend for computing:" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "con = ibis.duckdb.connect()\n", - "# remove the black bars from duckdb's progress bar\n", - "con.raw_sql(\"set enable_progress_bar = false\")\n", - "# DuckDB is the default backend for Ibis\n", - "ibis.set_backend(con)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Set data path:" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "# change the root path to yours\n", - "ROOT = Path(\"/Users/jiting/Downloads/home-credit-credit-risk-model-stability\")\n", - "TRAIN_DIR = ROOT / \"parquet_files\" / \"train\"\n", - "TEST_DIR = ROOT / \"parquet_files\" / \"test\"" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Data loading and processing\n", - "We'll use Ibis to read the Parquet files and perform the necessary processing for the next step.\n", - "\n", - "### Directory structure and tables\n", - "Since there are 
many data files, let's start by examining the directory structure and\n", - "tables within the train directory:\n", - "\n", - "```bash\n", - "# change this to your directory\n", - "tree -L 2 ~/Downloads/home-credit-credit-risk-model-stability/parquet_files/train\n", - "```\n", - "Data directory:\n", - "\n", - "```bash\n", - "~/Downloads/home-credit-credit-risk-model-stability/parquet_files/train\n", - "├── train_applprev_1_0.parquet\n", - "├── train_applprev_1_1.parquet\n", - "├── train_applprev_2.parquet\n", - "├── train_base.parquet\n", - "├── train_credit_bureau_a_1_0.parquet\n", - "├── train_credit_bureau_a_1_1.parquet\n", - "├── train_credit_bureau_a_1_3.parquet\n", - "├── train_credit_bureau_a_2_0.parquet\n", - "├── train_credit_bureau_a_2_1.parquet\n", - "├── train_credit_bureau_a_2_10.parquet\n", - "├── train_credit_bureau_a_2_2.parquet\n", - "├── train_credit_bureau_a_2_3.parquet\n", - "├── train_credit_bureau_a_2_4.parquet\n", - "├── train_credit_bureau_a_2_5.parquet\n", - "├── train_credit_bureau_a_2_6.parquet\n", - "├── train_credit_bureau_a_2_7.parquet\n", - "├── train_credit_bureau_a_2_8.parquet\n", - "├── train_credit_bureau_a_2_9.parquet\n", - "├── train_credit_bureau_b_1.parquet\n", - "├── train_credit_bureau_b_2.parquet\n", - "├── train_debitcard_1.parquet\n", - "├── train_deposit_1.parquet\n", - "├── train_other_1.parquet\n", - "├── train_person_1.parquet\n", - "├── train_person_2.parquet\n", - "├── train_static_0_0.parquet\n", - "├── train_static_0_1.parquet\n", - "├── train_static_cb_0.parquet\n", - "├── train_tax_registry_a_1.parquet\n", - "├── train_tax_registry_b_1.parquet\n", - "└── train_tax_registry_c_1.parquet\n", - "```\n", - "\n", - "The `train_base.parquet` file is the base table, while the others are feature tables.\n", - "Let's take a quick look at these tables.\n", - "\n", - "#### Base table\n", - "The base table (`train_base.parquet`) contains the unique ID, a binary target flag\n", - "and other information for the training 
samples. This unique ID will serve as the\n", - "linking key for joining with other feature tables.\n", - "\n", - "* `case_id` - This is the unique ID for each loan. You'll need this ID to\n", - " join feature tables to the base table. There are about 1.5m unique loans.\n", - "* `date_decision` - This refers to the date when a decision was made regarding the\n", - " approval of the loan.\n", - "* `WEEK_NUM` - This is the week number used for aggregation. In the test sample,\n", - " `WEEK_NUM` continues sequentially from the last training value of `WEEK_NUM`.\n", - "* `MONTH` - This column represents the month when the approval decision was made.\n", - "* `target` - This is the binary target flag, determined after a certain period based on\n", - " whether or not the client defaulted on the specific loan.\n", - "\n", - "Here is several examples from the base table:" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
┏━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━┓\n", - "┃ case_id ┃ date_decision ┃ MONTH ┃ WEEK_NUM ┃ target ┃\n", - "┡━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━┩\n", - "│ int64 │ string │ int64 │ int64 │ int64 │\n", - "├─────────┼───────────────┼────────┼──────────┼────────┤\n", - "│ 0 │ 2019-01-03 │ 201901 │ 0 │ 0 │\n", - "│ 1 │ 2019-01-03 │ 201901 │ 0 │ 0 │\n", - "│ 2 │ 2019-01-04 │ 201901 │ 0 │ 0 │\n", - "│ 3 │ 2019-01-03 │ 201901 │ 0 │ 0 │\n", - "│ 4 │ 2019-01-04 │ 201901 │ 0 │ 1 │\n", - "└─────────┴───────────────┴────────┴──────────┴────────┘\n", - "\n" - ], - "text/plain": [ - "┏━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━┓\n", - "┃\u001b[1m \u001b[0m\u001b[1mcase_id\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdate_decision\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mMONTH\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mWEEK_NUM\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mtarget\u001b[0m\u001b[1m \u001b[0m┃\n", - "┡━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━┩\n", - "│ \u001b[2mint64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mint64\u001b[0m │ \u001b[2mint64\u001b[0m │ \u001b[2mint64\u001b[0m │\n", - "├─────────┼───────────────┼────────┼──────────┼────────┤\n", - "│ \u001b[1;36m0\u001b[0m │ \u001b[32m2019-01-03 \u001b[0m │ \u001b[1;36m201901\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[1;36m0\u001b[0m │\n", - "│ \u001b[1;36m1\u001b[0m │ \u001b[32m2019-01-03 \u001b[0m │ \u001b[1;36m201901\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[1;36m0\u001b[0m │\n", - "│ \u001b[1;36m2\u001b[0m │ \u001b[32m2019-01-04 \u001b[0m │ \u001b[1;36m201901\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[1;36m0\u001b[0m │\n", - "│ \u001b[1;36m3\u001b[0m │ \u001b[32m2019-01-03 \u001b[0m │ \u001b[1;36m201901\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[1;36m0\u001b[0m │\n", - "│ \u001b[1;36m4\u001b[0m │ \u001b[32m2019-01-04 \u001b[0m │ \u001b[1;36m201901\u001b[0m │ \u001b[1;36m0\u001b[0m 
│ \u001b[1;36m1\u001b[0m │\n", - "└─────────┴───────────────┴────────┴──────────┴────────┘" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ibis.read_parquet(TRAIN_DIR / \"train_base.parquet\").head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Feature tables\n", - "The remaining files contain features, consisting of approximately 370 features from\n", - "previous loan applications and external data sources. Their definitions can be found in the feature\n", - "definition [file](https://www.kaggle.com/competitions/home-credit-credit-risk-model-stability/data)\n", - "from the competition website.\n", - "\n", - "There are several things we want to mention for the feature tables:\n", - "\n", - "* **Union datasets**: One dataset could be saved into multiple parquet files, such as\n", - "`train_applprev_1_0.parquet` and `train_applprev_1_1.parquet`, We need to union this data.\n", - "* **Dataset levels**: Datasets may have different levels, which we will explain as\n", - "follows:\n", - " * **Depth = 0**: Each row in the table is identified by a unique `case_id`.\n", - " In this case, you can directly join the features with the base table and use them as\n", - " features for further analysis or processing.\n", - " * **Depth > 0**: You will group the data based on the `case_id` and perform calculations\n", - " or aggregations within each group.\n", - "\n", - "Here are two examples of tables with different levels.\n", - "\n", - "Example of table with depth = 0, `case_id` is the row identifier, features can be directly joined\n", - " with the base table." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
┏━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┓\n", - "┃ case_id ┃ assignmentdate_238D ┃ assignmentdate_4527235D ┃ assignmentdate_4955616D ┃ birthdate_574D ┃ contractssum_5085716L ┃ dateofbirth_337D ┃ dateofbirth_342D ┃ days120_123L ┃ days180_256L ┃ days30_165L ┃ days360_512L ┃ days90_310L ┃ description_5085714M ┃ education_1103M ┃ education_88M ┃ firstquarter_103L ┃ for3years_128L ┃ for3years_504L ┃ for3years_584L ┃ formonth_118L ┃ formonth_206L ┃ formonth_535L ┃ forquarter_1017L ┃ forquarter_462L ┃ forquarter_634L ┃ fortoday_1092L ┃ forweek_1077L ┃ forweek_528L ┃ forweek_601L ┃ foryear_618L ┃ foryear_818L ┃ foryear_850L ┃ fourthquarter_440L ┃ maritalst_385M ┃ maritalst_893M ┃ numberofqueries_373L ┃ pmtaverage_3A ┃ pmtaverage_4527227A ┃ pmtaverage_4955615A ┃ pmtcount_4527229L ┃ pmtcount_4955617L ┃ pmtcount_693L ┃ pmtscount_423L ┃ pmtssum_45A ┃ requesttype_4525192L ┃ responsedate_1012D ┃ responsedate_4527233D ┃ responsedate_4917613D ┃ riskassesment_302T ┃ riskassesment_940T ┃ secondquarter_766L ┃ thirdquarter_1082L ┃\n", - 
"┡━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━┩\n", - "│ int64 │ string │ string │ string │ string │ float64 │ string │ string │ float64 │ float64 │ float64 │ float64 │ float64 │ string │ string │ string │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ string │ string │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ string │ string │ string │ string │ string │ float64 │ float64 │ float64 │\n", - 
"├─────────┼─────────────────────┼─────────────────────────┼─────────────────────────┼────────────────┼───────────────────────┼──────────────────┼──────────────────┼──────────────┼──────────────┼─────────────┼──────────────┼─────────────┼──────────────────────┼─────────────────┼───────────────┼───────────────────┼────────────────┼────────────────┼────────────────┼───────────────┼───────────────┼───────────────┼──────────────────┼─────────────────┼─────────────────┼────────────────┼───────────────┼──────────────┼──────────────┼──────────────┼──────────────┼──────────────┼────────────────────┼────────────────┼────────────────┼──────────────────────┼───────────────┼─────────────────────┼─────────────────────┼───────────────────┼───────────────────┼───────────────┼────────────────┼─────────────┼──────────────────────┼────────────────────┼───────────────────────┼───────────────────────┼────────────────────┼────────────────────┼────────────────────┼────────────────────┤\n", - "│ 357 │ NULL │ NULL │ NULL │ 1988-04-01 │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ a55475b1 │ a55475b1 │ a55475b1 │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ a55475b1 │ a55475b1 │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ 6.0 │ 6301.4000 │ NULL │ 2019-01-25 │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │\n", - "│ 381 │ NULL │ NULL │ NULL │ 1973-11-01 │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ a55475b1 │ a55475b1 │ a55475b1 │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ a55475b1 │ a55475b1 │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ 6.0 │ 4019.6000 │ NULL │ 2019-01-25 │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │\n", - "│ 388 │ NULL │ NULL │ NULL │ 1989-04-01 │ NULL │ 1989-04-01 │ NULL │ 6.0 │ 8.0 │ 2.0 │ 10.0 │ 4.0 │ a55475b1 │ a55475b1 │ a55475b1 │ 2.0 │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ 
NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ 6.0 │ a55475b1 │ a55475b1 │ 10.0 │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ 6.0 │ 14548.0000 │ NULL │ 2019-01-28 │ NULL │ NULL │ NULL │ NULL │ 3.0 │ 5.0 │\n", - "│ 405 │ NULL │ NULL │ NULL │ 1974-03-01 │ NULL │ 1974-03-01 │ NULL │ 0.0 │ 0.0 │ 0.0 │ 1.0 │ 0.0 │ a55475b1 │ a55475b1 │ a55475b1 │ 0.0 │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ 4.0 │ a55475b1 │ a55475b1 │ 1.0 │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ 6.0 │ 10498.2400 │ NULL │ 2019-01-21 │ NULL │ NULL │ NULL │ NULL │ 2.0 │ 0.0 │\n", - "│ 409 │ NULL │ NULL │ NULL │ 1993-06-01 │ NULL │ 1993-06-01 │ NULL │ 2.0 │ 3.0 │ 0.0 │ 3.0 │ 1.0 │ a55475b1 │ 717ddd49 │ a55475b1 │ 4.0 │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ 1.0 │ a7fcb6e5 │ a55475b1 │ 3.0 │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ 7.0 │ 6344.8804 │ NULL │ 2019-01-21 │ NULL │ NULL │ NULL │ NULL │ 0.0 │ 4.0 │\n", - 
"└─────────┴─────────────────────┴─────────────────────────┴─────────────────────────┴────────────────┴───────────────────────┴──────────────────┴──────────────────┴──────────────┴──────────────┴─────────────┴──────────────┴─────────────┴──────────────────────┴─────────────────┴───────────────┴───────────────────┴────────────────┴────────────────┴────────────────┴───────────────┴───────────────┴───────────────┴──────────────────┴─────────────────┴─────────────────┴────────────────┴───────────────┴──────────────┴──────────────┴──────────────┴──────────────┴──────────────┴────────────────────┴────────────────┴────────────────┴──────────────────────┴───────────────┴─────────────────────┴─────────────────────┴───────────────────┴───────────────────┴───────────────┴────────────────┴─────────────┴──────────────────────┴────────────────────┴───────────────────────┴───────────────────────┴────────────────────┴────────────────────┴────────────────────┴────────────────────┘\n", - "\n" - ], - "text/plain": [ - "┏━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┓\n", - 
"┃\u001b[1m \u001b[0m\u001b[1mcase_id\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1massignmentdate_238D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1massignmentdate_4527235D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1massignmentdate_4955616D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mbirthdate_574D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcontractssum_5085716L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdateofbirth_337D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdateofbirth_342D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdays120_123L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdays180_256L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdays30_165L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdays360_512L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdays90_310L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdescription_5085714M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1meducation_1103M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1meducation_88M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mfirstquarter_103L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mfor3years_128L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mfor3years_504L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mfor3years_584L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mformonth_118L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mformonth_206L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mformonth_535L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mforquarter_1017L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mforquarter_462L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mforquarter_634L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mfortoday_1092L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m 
\u001b[0m\u001b[1mforweek_1077L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mforweek_528L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mforweek_601L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mforyear_618L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mforyear_818L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mforyear_850L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mfourthquarter_440L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmaritalst_385M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmaritalst_893M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mnumberofqueries_373L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpmtaverage_3A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpmtaverage_4527227A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpmtaverage_4955615A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpmtcount_4527229L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpmtcount_4955617L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpmtcount_693L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpmtscount_423L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpmtssum_45A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mrequesttype_4525192L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mresponsedate_1012D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mresponsedate_4527233D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mresponsedate_4917613D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mriskassesment_302T\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mriskassesment_940T\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1msecondquarter_766L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mthirdquarter_1082L\u001b[0m\u001b[1m \u001b[0m┃\n", - 
"┡━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━┩\n", - "│ \u001b[2mint64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ 
\u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │\n", - "├─────────┼─────────────────────┼─────────────────────────┼─────────────────────────┼────────────────┼───────────────────────┼──────────────────┼──────────────────┼──────────────┼──────────────┼─────────────┼──────────────┼─────────────┼──────────────────────┼─────────────────┼───────────────┼───────────────────┼────────────────┼────────────────┼────────────────┼───────────────┼───────────────┼───────────────┼──────────────────┼─────────────────┼─────────────────┼────────────────┼───────────────┼──────────────┼──────────────┼──────────────┼──────────────┼──────────────┼────────────────────┼────────────────┼────────────────┼──────────────────────┼───────────────┼─────────────────────┼─────────────────────┼───────────────────┼───────────────────┼───────────────┼────────────────┼─────────────┼──────────────────────┼────────────────────┼───────────────────────┼───────────────────────┼────────────────────┼────────────────────┼────────────────────┼────────────────────┤\n", - "│ \u001b[1;36m357\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m1988-04-01 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ 
\u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m6.0\u001b[0m │ \u001b[1;36m6301.4000\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m2019-01-25 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │\n", - "│ \u001b[1;36m381\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m1973-11-01 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ 
\u001b[1;36m6.0\u001b[0m │ \u001b[1;36m4019.6000\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m2019-01-25 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │\n", - "│ \u001b[1;36m388\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m1989-04-01 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m1989-04-01 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m6.0\u001b[0m │ \u001b[1;36m8.0\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[1;36m10.0\u001b[0m │ \u001b[1;36m4.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m6.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m10.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m6.0\u001b[0m │ \u001b[1;36m14548.0000\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m2019-01-28 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m3.0\u001b[0m │ \u001b[1;36m5.0\u001b[0m │\n", - "│ \u001b[1;36m405\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m1974-03-01 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m1974-03-01 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ 
\u001b[1;36m1.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m4.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m6.0\u001b[0m │ \u001b[1;36m10498.2400\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m2019-01-21 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │\n", - "│ \u001b[1;36m409\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m1993-06-01 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m1993-06-01 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[1;36m3.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m3.0\u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32m717ddd49 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m4.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ 
\u001b[1;36m1.0\u001b[0m │ \u001b[32ma7fcb6e5 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m3.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m7.0\u001b[0m │ \u001b[1;36m6344.8804\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m2019-01-21 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m4.0\u001b[0m │\n", - "└─────────┴─────────────────────┴─────────────────────────┴─────────────────────────┴────────────────┴───────────────────────┴──────────────────┴──────────────────┴──────────────┴──────────────┴─────────────┴──────────────┴─────────────┴──────────────────────┴─────────────────┴───────────────┴───────────────────┴────────────────┴────────────────┴────────────────┴───────────────┴───────────────┴───────────────┴──────────────────┴─────────────────┴─────────────────┴────────────────┴───────────────┴──────────────┴──────────────┴──────────────┴──────────────┴──────────────┴────────────────────┴────────────────┴────────────────┴──────────────────────┴───────────────┴─────────────────────┴─────────────────────┴───────────────────┴───────────────────┴───────────────┴────────────────┴─────────────┴──────────────────────┴────────────────────┴───────────────────────┴───────────────────────┴────────────────────┴────────────────────┴────────────────────┴────────────────────┘" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ibis.read_parquet(TRAIN_DIR / \"train_static_cb_0.parquet\").head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Example of a table with depth = 1, we need to aggregate the features and collect statistics\n", - "based on `case_id` then join with the base table." 
- ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
┏━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┓\n", - "┃ num_group1 ┃ case_id ┃ amount_1115A ┃ classificationofcontr_1114M ┃ contractdate_551D ┃ contractmaturitydate_151D ┃ contractst_516M ┃ contracttype_653M ┃ credlmt_1052A ┃ credlmt_228A ┃ credlmt_3940954A ┃ credor_3940957M ┃ credquantity_1099L ┃ credquantity_984L ┃ debtpastduevalue_732A ┃ debtvalue_227A ┃ dpd_550P ┃ dpd_733P ┃ dpdmax_851P ┃ dpdmaxdatemonth_804T ┃ dpdmaxdateyear_742T ┃ installmentamount_644A ┃ installmentamount_833A ┃ instlamount_892A ┃ interesteffectiverate_369L ┃ interestrateyearly_538L ┃ lastupdate_260D ┃ maxdebtpduevalodued_3940955A ┃ numberofinstls_810L ┃ overdueamountmax_950A ┃ overdueamountmaxdatemonth_494T ┃ overdueamountmaxdateyear_432T ┃ periodicityofpmts_997L ┃ periodicityofpmts_997M ┃ pmtdaysoverdue_1135P ┃ pmtmethod_731M ┃ pmtnumpending_403L ┃ purposeofcred_722M ┃ residualamount_1093A ┃ residualamount_127A ┃ residualamount_3940956A ┃ subjectrole_326M ┃ subjectrole_43M ┃ totalamount_503A ┃ totalamount_881A ┃\n", - 
"┡━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━┩\n", - "│ int64 │ int64 │ float64 │ string │ string │ string │ string │ string │ float64 │ float64 │ float64 │ string │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ string │ float64 │ float64 │ float64 │ float64 │ float64 │ string │ string │ float64 │ string │ float64 │ string │ float64 │ float64 │ float64 │ string │ string │ float64 │ float64 │\n", - 
"├────────────┼─────────┼──────────────┼─────────────────────────────┼───────────────────┼───────────────────────────┼─────────────────┼───────────────────┼───────────────┼──────────────┼──────────────────┼─────────────────┼────────────────────┼───────────────────┼───────────────────────┼────────────────┼──────────┼──────────┼─────────────┼──────────────────────┼─────────────────────┼────────────────────────┼────────────────────────┼──────────────────┼────────────────────────────┼─────────────────────────┼─────────────────┼──────────────────────────────┼─────────────────────┼───────────────────────┼────────────────────────────────┼───────────────────────────────┼────────────────────────┼────────────────────────┼──────────────────────┼────────────────┼────────────────────┼────────────────────┼──────────────────────┼─────────────────────┼─────────────────────────┼──────────────────┼─────────────────┼──────────────────┼──────────────────┤\n", - "│ 0 │ 467 │ NULL │ ea6782cc │ 2011-06-15 │ 2031-06-13 │ 7241344e │ 724be82a │ 3.000000e+06 │ 10000.0 │ 3.000000e+06 │ P164_34_168 │ 2.0 │ 1.0 │ NULL │ NULL │ 0.0 │ 0.0 │ NULL │ NULL │ NULL │ 0.0 │ 0.000 │ NULL │ NULL │ NULL │ 2019-01-20 │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ a55475b1 │ NULL │ a55475b1 │ NULL │ 96a8fdfe │ 0.0 │ 0.0 │ NULL │ fa4f56f1 │ ab3c25cf │ 3.000000e+06 │ 10000.0 │\n", - "│ 1 │ 467 │ NULL │ ea6782cc │ 2019-01-04 │ 2021-08-04 │ 7241344e │ 724be82a │ NULL │ NULL │ 1.303650e+05 │ P164_34_168 │ 1.0 │ 2.0 │ NULL │ NULL │ 0.0 │ 0.0 │ NULL │ NULL │ NULL │ 0.0 │ 26571.969 │ NULL │ NULL │ NULL │ 2019-01-20 │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ a55475b1 │ NULL │ a55475b1 │ NULL │ 96a8fdfe │ NULL │ NULL │ NULL │ ab3c25cf │ ab3c25cf │ 7.800000e+04 │ 960000.0 │\n", - "│ 2 │ 467 │ 78000.0 │ ea6782cc │ 2016-10-25 │ 2019-10-25 │ 7241344e │ 4257cbed │ NULL │ NULL │ NULL │ c5a72b57 │ NULL │ NULL │ 0.0 │ 26571.969 │ NULL │ NULL │ 0.0 │ 11.0 │ 2016.0 │ NULL │ NULL │ 2898.76 │ NULL │ NULL │ 2019-01-10 │ 0.0 │ 36.0 │ 
0.0 │ 11.0 │ 2016.0 │ NULL │ a0b598e4 │ 0.0 │ e914c86c │ 10.0 │ 96a8fdfe │ NULL │ NULL │ NULL │ a55475b1 │ a55475b1 │ NULL │ NULL │\n", - "│ 0 │ 1445 │ NULL │ ea6782cc │ 2015-01-30 │ 2021-01-30 │ 7241344e │ 1c9c5356 │ 4.000000e+05 │ 100000.0 │ 7.400000e+04 │ b619fa46 │ 2.0 │ 5.0 │ 0.0 │ NULL │ 0.0 │ 0.0 │ 200418.0 │ 1.0 │ 2018.0 │ 0.0 │ 0.000 │ NULL │ NULL │ NULL │ 2019-01-19 │ 0.4 │ NULL │ 1.4 │ 2.0 │ 2018.0 │ NULL │ a55475b1 │ 0.0 │ a55475b1 │ NULL │ 60c73645 │ 0.0 │ 0.0 │ 73044.18 │ daf49a8a │ ab3c25cf │ 4.000000e+05 │ 100000.0 │\n", - "│ 1 │ 1445 │ NULL │ 01f63ac8 │ 2014-09-12 │ 2021-09-12 │ 7241344e │ 724be82a │ NULL │ NULL │ 4.000000e+05 │ 74bd67a8 │ 3.0 │ 17.0 │ NULL │ NULL │ 0.0 │ 0.0 │ NULL │ NULL │ NULL │ 0.0 │ 209617.770 │ NULL │ NULL │ NULL │ 2019-01-13 │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ a55475b1 │ NULL │ a55475b1 │ NULL │ 96a8fdfe │ NULL │ NULL │ NULL │ ab3c25cf │ ab3c25cf │ 3.968006e+05 │ 184587.8 │\n", - "└────────────┴─────────┴──────────────┴─────────────────────────────┴───────────────────┴───────────────────────────┴─────────────────┴───────────────────┴───────────────┴──────────────┴──────────────────┴─────────────────┴────────────────────┴───────────────────┴───────────────────────┴────────────────┴──────────┴──────────┴─────────────┴──────────────────────┴─────────────────────┴────────────────────────┴────────────────────────┴──────────────────┴────────────────────────────┴─────────────────────────┴─────────────────┴──────────────────────────────┴─────────────────────┴───────────────────────┴────────────────────────────────┴───────────────────────────────┴────────────────────────┴────────────────────────┴──────────────────────┴────────────────┴────────────────────┴────────────────────┴──────────────────────┴─────────────────────┴─────────────────────────┴──────────────────┴─────────────────┴──────────────────┴──────────────────┘\n", - "\n" - ], - "text/plain": [ - 
"┏━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┓\n", - "┃\u001b[1m \u001b[0m\u001b[1mnum_group1\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcase_id\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mamount_1115A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mclassificationofcontr_1114M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcontractdate_551D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcontractmaturitydate_151D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcontractst_516M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcontracttype_653M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcredlmt_1052A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcredlmt_228A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcredlmt_3940954A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcredor_3940957M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcredquantity_1099L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcredquantity_984L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m 
\u001b[0m\u001b[1mdebtpastduevalue_732A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdebtvalue_227A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdpd_550P\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdpd_733P\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdpdmax_851P\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdpdmaxdatemonth_804T\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdpdmaxdateyear_742T\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1minstallmentamount_644A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1minstallmentamount_833A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1minstlamount_892A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1minteresteffectiverate_369L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1minterestrateyearly_538L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mlastupdate_260D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmaxdebtpduevalodued_3940955A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mnumberofinstls_810L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1moverdueamountmax_950A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1moverdueamountmaxdatemonth_494T\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1moverdueamountmaxdateyear_432T\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mperiodicityofpmts_997L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mperiodicityofpmts_997M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpmtdaysoverdue_1135P\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpmtmethod_731M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpmtnumpending_403L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpurposeofcred_722M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mresidualamount_1093A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mresidualamount_127A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m 
\u001b[0m\u001b[1mresidualamount_3940956A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1msubjectrole_326M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1msubjectrole_43M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mtotalamount_503A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mtotalamount_881A\u001b[0m\u001b[1m \u001b[0m┃\n", - "┡━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━┩\n", - "│ \u001b[2mint64\u001b[0m │ \u001b[2mint64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ 
\u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │\n", - "├────────────┼─────────┼──────────────┼─────────────────────────────┼───────────────────┼───────────────────────────┼─────────────────┼───────────────────┼───────────────┼──────────────┼──────────────────┼─────────────────┼────────────────────┼───────────────────┼───────────────────────┼────────────────┼──────────┼──────────┼─────────────┼──────────────────────┼─────────────────────┼────────────────────────┼────────────────────────┼──────────────────┼────────────────────────────┼─────────────────────────┼─────────────────┼──────────────────────────────┼─────────────────────┼───────────────────────┼────────────────────────────────┼───────────────────────────────┼────────────────────────┼────────────────────────┼──────────────────────┼────────────────┼────────────────────┼────────────────────┼──────────────────────┼─────────────────────┼─────────────────────────┼──────────────────┼─────────────────┼──────────────────┼──────────────────┤\n", - "│ \u001b[1;36m0\u001b[0m │ \u001b[1;36m467\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32mea6782cc \u001b[0m │ \u001b[32m2011-06-15 \u001b[0m │ \u001b[32m2031-06-13 \u001b[0m │ \u001b[32m7241344e \u001b[0m │ \u001b[32m724be82a \u001b[0m │ \u001b[1;36m3.000000e+06\u001b[0m │ \u001b[1;36m10000.0\u001b[0m │ \u001b[1;36m3.000000e+06\u001b[0m │ \u001b[32mP164_34_168 \u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ 
\u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.000\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m2019-01-20 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m96a8fdfe \u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32mfa4f56f1 \u001b[0m │ \u001b[32mab3c25cf \u001b[0m │ \u001b[1;36m3.000000e+06\u001b[0m │ \u001b[1;36m10000.0\u001b[0m │\n", - "│ \u001b[1;36m1\u001b[0m │ \u001b[1;36m467\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32mea6782cc \u001b[0m │ \u001b[32m2019-01-04 \u001b[0m │ \u001b[32m2021-08-04 \u001b[0m │ \u001b[32m7241344e \u001b[0m │ \u001b[32m724be82a \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m1.303650e+05\u001b[0m │ \u001b[32mP164_34_168 \u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m26571.969\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m2019-01-20 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m96a8fdfe \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32mab3c25cf \u001b[0m │ 
\u001b[32mab3c25cf \u001b[0m │ \u001b[1;36m7.800000e+04\u001b[0m │ \u001b[1;36m960000.0\u001b[0m │\n", - "│ \u001b[1;36m2\u001b[0m │ \u001b[1;36m467\u001b[0m │ \u001b[1;36m78000.0\u001b[0m │ \u001b[32mea6782cc \u001b[0m │ \u001b[32m2016-10-25 \u001b[0m │ \u001b[32m2019-10-25 \u001b[0m │ \u001b[32m7241344e \u001b[0m │ \u001b[32m4257cbed \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32mc5a72b57 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m26571.969\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m11.0\u001b[0m │ \u001b[1;36m2016.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m2898.76\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m2019-01-10 \u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m36.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m11.0\u001b[0m │ \u001b[1;36m2016.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma0b598e4 \u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[32me914c86c \u001b[0m │ \u001b[1;36m10.0\u001b[0m │ \u001b[32m96a8fdfe \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │\n", - "│ \u001b[1;36m0\u001b[0m │ \u001b[1;36m1445\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32mea6782cc \u001b[0m │ \u001b[32m2015-01-30 \u001b[0m │ \u001b[32m2021-01-30 \u001b[0m │ \u001b[32m7241344e \u001b[0m │ \u001b[32m1c9c5356 \u001b[0m │ \u001b[1;36m4.000000e+05\u001b[0m │ \u001b[1;36m100000.0\u001b[0m │ \u001b[1;36m7.400000e+04\u001b[0m │ \u001b[32mb619fa46 \u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[1;36m5.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m200418.0\u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[1;36m2018.0\u001b[0m │ 
\u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.000\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m2019-01-19 \u001b[0m │ \u001b[1;36m0.4\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m1.4\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[1;36m2018.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m60c73645 \u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m73044.18\u001b[0m │ \u001b[32mdaf49a8a \u001b[0m │ \u001b[32mab3c25cf \u001b[0m │ \u001b[1;36m4.000000e+05\u001b[0m │ \u001b[1;36m100000.0\u001b[0m │\n", - "│ \u001b[1;36m1\u001b[0m │ \u001b[1;36m1445\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m01f63ac8 \u001b[0m │ \u001b[32m2014-09-12 \u001b[0m │ \u001b[32m2021-09-12 \u001b[0m │ \u001b[32m7241344e \u001b[0m │ \u001b[32m724be82a \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m4.000000e+05\u001b[0m │ \u001b[32m74bd67a8 \u001b[0m │ \u001b[1;36m3.0\u001b[0m │ \u001b[1;36m17.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m209617.770\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m2019-01-13 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m96a8fdfe \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32mab3c25cf \u001b[0m │ \u001b[32mab3c25cf \u001b[0m │ \u001b[1;36m3.968006e+05\u001b[0m │ \u001b[1;36m184587.8\u001b[0m │\n", - 
"└────────────┴─────────┴──────────────┴─────────────────────────────┴───────────────────┴───────────────────────────┴─────────────────┴───────────────────┴───────────────┴──────────────┴──────────────────┴─────────────────┴────────────────────┴───────────────────┴───────────────────────┴────────────────┴──────────┴──────────┴─────────────┴──────────────────────┴─────────────────────┴────────────────────────┴────────────────────────┴──────────────────┴────────────────────────────┴─────────────────────────┴─────────────────┴──────────────────────────────┴─────────────────────┴───────────────────────┴────────────────────────────────┴───────────────────────────────┴────────────────────────┴────────────────────────┴──────────────────────┴────────────────┴────────────────────┴────────────────────┴──────────────────────┴─────────────────────┴─────────────────────────┴──────────────────┴─────────────────┴──────────────────┴──────────────────┘" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ibis.read_parquet(TRAIN_DIR / \"train_credit_bureau_b_1.parquet\").relocate(\n", - " \"num_group1\"\n", - ").order_by([\"case_id\", \"num_group1\"]).head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For more details on features and its exploratory data analysis (EDA), you can refer to\n", - "feature definition and these Kaggle notebooks:\n", - "\n", - "* [Feature\n", - " definition](https://www.kaggle.com/competitions/home-credit-credit-risk-model-stability/data#:~:text=calendar_view_week-,feature_definitions,-.csv)\n", - "* [Home credit risk prediction\n", - " EDA](https://www.kaggle.com/code/loki97/home-credit-risk-prediction-eda)\n", - "* [Home credit CRMS 2024\n", - " EDA](https://www.kaggle.com/code/sergiosaharovskiy/home-credit-crms-2024-eda-and-submission)\n", - "\n", - "### Data loading and processing\n", - "We will perform the following data processing steps using Ibis and 
IbisML:\n", - "\n", - "* **Convert data types**: Ensure consistency by converting data types, as the same column\n", - " in different sub-files may have different types.\n", - "* **Aggregate features**: For tables with depth greater than 0, aggregate features based\n", - " on `case_id`, including statistics calculation. You can collect statistics such as mean,\n", - " median, mode, minimum, standard deviation, and others.\n", - "* **Union and join datasets**: Combine multiple sub-files of the same dataset into one\n", - " table, as some datasets are split into multiple sub-files with a common prefix. Afterward,\n", - " join these tables with the base table.\n", - "\n", - "#### Convert data types\n", - "We'll use IbisML to create a chain of `Cast` steps, forming a recipe for data type\n", - "conversion across the dataset. This conversion is based on the provided information\n", - "extracted from column names. Columns that have similar transformations are indicated by a\n", - "capital letter at the end of their names:\n", - "\n", - "* P - Transform DPD (Days past due)\n", - "* M - Masking categories\n", - "* A - Transform amount\n", - "* D - Transform date\n", - "* T - Unspecified Transform\n", - "* L - Unspecified Transform\n", - "\n", - "For example, we'll define a IbisML transformation step to convert columns ends with `P`\n", - "to floating number:" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "# convert columns ends with P to floating number\n", - "step_cast_P_to_float = ml.Cast(ml.endswith(\"P\"), dt.float64)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Next, let's define additional type conversion transformations based on the postfix of column names:" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "# convert columns ends with A to floating number\n", - "step_cast_A_to_float = 
ml.Cast(ml.endswith(\"A\"), dt.float64)\n", - "# convert columns ends with D to date\n", - "step_cast_D_to_date = ml.Cast(ml.endswith(\"D\"), dt.date)\n", - "# convert columns ends with M to str\n", - "step_cast_M_to_str = ml.Cast(ml.endswith(\"M\"), dt.str)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We'll construct the\n", - "[IbisML Recipe](https://ibis-project.github.io/ibis-ml/reference/core.html#ibis_ml.Recipe)\n", - "which chains together all the transformation steps." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Data format conversion recipe:\n", - "Recipe(Cast(endswith('P'), 'float64'),\n", - " Cast(endswith('D'), 'date'),\n", - " Cast(endswith('M'), 'string'),\n", - " Cast(endswith('A'), 'float64'),\n", - " Cast(cols(('date_decision',)), 'date'),\n", - " Cast(cols(('case_id', 'WEEK_NUM', 'num_group1', 'num_group2')), 'int64'),\n", - " Cast(cols(('cardtype_51L', 'credacc_status_367L', 'requesttype_4525192L', 'riskassesment_302T', 'max_periodicityofpmts_997L')),\n", - " 'string'),\n", - " Cast(cols(('isbidproductrequest_292L', 'isdebitcard_527L', 'equalityempfrom_62L')),\n", - " 'int64'))\n" - ] - } - ], - "source": [ - "data_type_recipes = ml.Recipe(\n", - " step_cast_P_to_float,\n", - " step_cast_D_to_date,\n", - " step_cast_M_to_str,\n", - " step_cast_A_to_float,\n", - " # cast some special columns\n", - " ml.Cast([\"date_decision\"], \"date\"),\n", - " ml.Cast([\"case_id\", \"WEEK_NUM\", \"num_group1\", \"num_group2\"], dt.int64),\n", - " ml.Cast(\n", - " [\n", - " \"cardtype_51L\",\n", - " \"credacc_status_367L\",\n", - " \"requesttype_4525192L\",\n", - " \"riskassesment_302T\",\n", - " \"max_periodicityofpmts_997L\",\n", - " ],\n", - " dt.str,\n", - " ),\n", - " ml.Cast(\n", - " [\n", - " \"isbidproductrequest_292L\",\n", - " \"isdebitcard_527L\",\n", - " \"equalityempfrom_62L\",\n", - " ],\n", - " 
dt.int64,\n", - " ),\n", - ")\n", - "print(f\"Data format conversion recipe:\\n{data_type_recipes}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "IbisML offers a powerful set of column selectors, allowing you to select columns based\n", - "on names, types, and patterns. For more information, you can refer to the IbisML column\n", - "selectors [documentation](https://ibis-project.github.io/ibis-ml/reference/selectors.html).\n", - "\n", - "\n", - "#### Aggregate features\n", - "For tables with a depth greater than 0 that can't be directly joined with the base table,\n", - "we need to aggregate the features by the `case_id`. You could compute the different statistics for numeric columns and\n", - "non-numeric columns.\n", - "\n", - "Here, we use the `maximum` as an example." - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "def agg_by_id(table):\n", - " return table.group_by(\"case_id\").agg(\n", - " [\n", - " table[col_name].max().name(f\"max_{col_name}\")\n", - " for col_name in table.columns\n", - " if col_name[-1] in (\"T\", \"L\", \"P\", \"A\", \"D\", \"M\")\n", - " ]\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "For better predicting power, you need to collect different statistics based on the meaning of features. For simplicity,\n", - "we'll only collect the maximum value of the features here.\n", - "\n", - "\n", - "#### Put them together\n", - "We'll put them together in a function reads parquet files, optionally handles regex patterns for\n", - "multiple sub-files, applies data type transformations defined by `data_type_recipes`, and\n", - "performs aggregation based on `case_id` if specified by the depth parameter." 
- ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "def read_and_process_files(file_path, depth=None, is_regex=False):\n", - " \"\"\"\n", - " Read and process Parquet files.\n", - "\n", - " Args:\n", - " file_path (str): Path to the file or regex pattern to match files.\n", - " depth (int, optional): Depth of processing. If 1 or 2, additional aggregation is performed.\n", - " is_regex (bool, optional): Whether the file_path is a regex pattern.\n", - "\n", - " Returns:\n", - " ibis.Table: The processed Ibis table.\n", - " \"\"\"\n", - " if is_regex:\n", - " # read and union multiple files\n", - " chunks = []\n", - " for path in glob(str(file_path)):\n", - " chunk = ibis.read_parquet(path)\n", - " # transform table using IbisML Recipe\n", - " chunk = data_type_recipes.fit(chunk).to_ibis(chunk)\n", - " chunks.append(chunk)\n", - " table = ibis.union(*chunks)\n", - " else:\n", - " # read a single file\n", - " table = ibis.read_parquet(file_path)\n", - " # transform table using IbisML\n", - " table = data_type_recipes.fit(table).to_ibis(table)\n", - "\n", - " # perform aggregation if depth is 1 or 2\n", - " if depth in [1, 2]:\n", - " table = agg_by_id(table)\n", - "\n", - " return table" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's define two dictionaries, `train_data_store` and `test_data_store`, that organize and\n", - "store processed datasets for training and testing datasets." 
- ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "train_data_store = {\n", - " \"df_base\": read_and_process_files(TRAIN_DIR / \"train_base.parquet\"),\n", - " \"depth_0\": [\n", - " read_and_process_files(TRAIN_DIR / \"train_static_cb_0.parquet\"),\n", - " read_and_process_files(TRAIN_DIR / \"train_static_0_*.parquet\", is_regex=True),\n", - " ],\n", - " \"depth_1\": [\n", - " read_and_process_files(\n", - " TRAIN_DIR / \"train_applprev_1_*.parquet\", 1, is_regex=True\n", - " ),\n", - " read_and_process_files(TRAIN_DIR / \"train_tax_registry_a_1.parquet\", 1),\n", - " read_and_process_files(TRAIN_DIR / \"train_tax_registry_b_1.parquet\", 1),\n", - " read_and_process_files(TRAIN_DIR / \"train_tax_registry_c_1.parquet\", 1),\n", - " read_and_process_files(TRAIN_DIR / \"train_credit_bureau_b_1.parquet\", 1),\n", - " read_and_process_files(TRAIN_DIR / \"train_other_1.parquet\", 1),\n", - " read_and_process_files(TRAIN_DIR / \"train_person_1.parquet\", 1),\n", - " read_and_process_files(TRAIN_DIR / \"train_deposit_1.parquet\", 1),\n", - " read_and_process_files(TRAIN_DIR / \"train_debitcard_1.parquet\", 1),\n", - " ],\n", - " \"depth_2\": [\n", - " read_and_process_files(TRAIN_DIR / \"train_credit_bureau_b_2.parquet\", 2),\n", - " ],\n", - "}\n", - "# we won't be submitting the predictions, so let's comment out the test data.\n", - "# test_data_store = {\n", - "# \"df_base\": read_and_process_files(TEST_DIR / \"test_base.parquet\"),\n", - "# \"depth_0\": [\n", - "# read_and_process_files(TEST_DIR / \"test_static_cb_0.parquet\"),\n", - "# read_and_process_files(TEST_DIR / \"test_static_0_*.parquet\", is_regex=True),\n", - "# ],\n", - "# \"depth_1\": [\n", - "# read_and_process_files(TEST_DIR / \"test_applprev_1_*.parquet\", 1, is_regex=True),\n", - "# read_and_process_files(TEST_DIR / \"test_tax_registry_a_1.parquet\", 1),\n", - "# read_and_process_files(TEST_DIR / \"test_tax_registry_b_1.parquet\", 1),\n", 
- "# read_and_process_files(TEST_DIR / \"test_tax_registry_c_1.parquet\", 1),\n", - "# read_and_process_files(TEST_DIR / \"test_credit_bureau_b_1.parquet\", 1),\n", - "# read_and_process_files(TEST_DIR / \"test_other_1.parquet\", 1),\n", - "# read_and_process_files(TEST_DIR / \"test_person_1.parquet\", 1),\n", - "# read_and_process_files(TEST_DIR / \"test_deposit_1.parquet\", 1),\n", - "# read_and_process_files(TEST_DIR / \"test_debitcard_1.parquet\", 1),\n", - "# ],\n", - "# \"depth_2\": [\n", - "# read_and_process_files(TEST_DIR / \"test_credit_bureau_b_2.parquet\", 2),\n", - "# ]\n", - "# }" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Join all features data to base table:" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "def join_data(df_base, depth_0, depth_1, depth_2):\n", - " for i, df in enumerate(depth_0 + depth_1 + depth_2):\n", - " df_base = df_base.join(\n", - " df, \"case_id\", how=\"left\", rname=\"{name}_right\" + f\"_{i}\"\n", - " )\n", - " return df_base" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Generate train and test datasets:" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "There is 1526659 rows and 377 columns\n" - ] - } - ], - "source": [ - "df_train = join_data(**train_data_store)\n", - "# df_test = join_data(**test_data_store)\n", - "total_rows = df_train.count().execute()\n", - "print(f\"There is {total_rows} rows and {len(df_train.columns)} columns\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Select features\n", - "Given the large number of features (~370), we'll focus on selecting just a few of the most\n", - "informative ones by name for demonstration purposes in this post:" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": 
[ - { - "data": { - "text/html": [ - "
┏━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┓\n", - "┃ case_id ┃ date_decision ┃ target ┃ days30_165L ┃ days360_512L ┃ days90_310L ┃ pmtscount_423L ┃ pmtssum_45A ┃ dateofbirth_337D ┃ education_1103M ┃ firstquarter_103L ┃ secondquarter_766L ┃ thirdquarter_1082L ┃ fourthquarter_440L ┃ maritalst_893M ┃ numberofqueries_373L ┃ requesttype_4525192L ┃ responsedate_4527233D ┃ actualdpdtolerance_344P ┃ amtinstpaidbefduel24m_4187115A ┃ annuity_780A ┃ annuitynextmonth_57A ┃ applicationcnt_361L ┃ 
applications30d_658L ┃ applicationscnt_1086L ┃ avgdbddpdlast24m_3658932P ┃ avgdbddpdlast3m_4187120P ┃ max_contractmaturitydate_151D ┃ max_credlmt_1052A ┃ max_credquantity_1099L ┃ max_dpdmaxdatemonth_804T ┃ max_dpdmaxdateyear_742T ┃ max_maxdebtpduevalodued_3940955A ┃ max_overdueamountmax_950A ┃ max_purposeofcred_722M ┃ max_residualamount_3940956A ┃ max_totalamount_503A ┃ max_cancelreason_3545846M ┃ max_childnum_21L ┃ max_currdebt_94A ┃ max_employedfrom_700D ┃ max_mainoccupationinc_437A ┃ max_profession_152M ┃ max_rejectreason_755M ┃ max_status_219L ┃ max_amount_1115A ┃ max_debtpastduevalue_732A ┃ max_debtvalue_227A ┃ max_installmentamount_833A ┃ max_instlamount_892A ┃ max_numberofinstls_810L ┃ max_pmtnumpending_403L ┃ max_last180dayaveragebalance_704A ┃ max_last30dayturnover_651A ┃ max_openingdate_857D ┃ max_amount_416A ┃ max_amtdebitincoming_4809443A ┃ max_amtdebitoutgoing_4809440A ┃ max_amtdepositbalance_4809441A ┃ max_amtdepositincoming_4809444A ┃ max_amtdepositoutgoing_4809442A ┃ max_empl_industry_691L ┃ max_gender_992L ┃ max_housingtype_772L ┃ max_mainoccupationinc_384A ┃ max_incometype_1044T ┃\n", - 
"┡━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━┩\n", - "│ int64 │ date │ int64 │ float64 │ float64 │ float64 │ float64 │ float64 │ date │ string │ float64 │ float64 │ float64 │ float64 │ string │ float64 │ string │ date │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ date │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ string │ float64 │ float64 │ string │ float64 │ float64 │ date │ float64 │ string │ string │ string │ float64 │ 
float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ date │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ string │ string │ string │ float64 │ string │\n", - "├─────────┼───────────────┼────────┼─────────────┼──────────────┼─────────────┼────────────────┼─────────────┼──────────────────┼─────────────────┼───────────────────┼────────────────────┼────────────────────┼────────────────────┼────────────────┼──────────────────────┼──────────────────────┼───────────────────────┼─────────────────────────┼────────────────────────────────┼──────────────┼──────────────────────┼─────────────────────┼──────────────────────┼───────────────────────┼───────────────────────────┼──────────────────────────┼───────────────────────────────┼───────────────────┼────────────────────────┼──────────────────────────┼─────────────────────────┼──────────────────────────────────┼───────────────────────────┼────────────────────────┼─────────────────────────────┼──────────────────────┼───────────────────────────┼──────────────────┼──────────────────┼───────────────────────┼────────────────────────────┼─────────────────────┼───────────────────────┼─────────────────┼──────────────────┼───────────────────────────┼────────────────────┼────────────────────────────┼──────────────────────┼─────────────────────────┼────────────────────────┼───────────────────────────────────┼────────────────────────────┼──────────────────────┼─────────────────┼───────────────────────────────┼───────────────────────────────┼────────────────────────────────┼─────────────────────────────────┼─────────────────────────────────┼────────────────────────┼─────────────────┼──────────────────────┼────────────────────────────┼─────────────────────────┤\n", - "│ 1915907 │ 2020-09-02 │ 0 │ 0.0 │ 4.0 │ 0.0 │ NULL │ NULL │ 1965-03-01 │ a55475b1 │ 5.0 │ 2.0 │ 1.0 │ 3.0 │ a55475b1 │ 4.0 │ NULL │ NULL │ 0.0 │ 39089.600 │ 3740.6 │ 4886.2000 │ 0.0 │ 0.0 │ 0.0 │ -3.0 │ -6.0 │ NULL │ NULL │ NULL │ NULL 
│ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ a55475b1 │ 0.0 │ 55290.250 │ 2006-09-15 │ 120000.0 │ a55475b1 │ a55475b1 │ D │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ 22000.0 │ 22000.0 │ 0.0 │ 0.0 │ 0.0 │ NULL │ NULL │ NULL │ 60000.0 │ EMPLOYED │\n", - "│ 1916572 │ 2020-09-03 │ 0 │ 1.0 │ 6.0 │ 2.0 │ NULL │ NULL │ 1985-01-01 │ a55475b1 │ 2.0 │ 2.0 │ 1.0 │ 2.0 │ a55475b1 │ 6.0 │ NULL │ NULL │ 0.0 │ 110432.000 │ 2400.0 │ 7555.8003 │ 0.0 │ 0.0 │ 0.0 │ -5.0 │ -10.0 │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ a55475b1 │ 0.0 │ 45862.934 │ 2007-04-15 │ 194000.0 │ a55475b1 │ a55475b1 │ T │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ 13353.4 │ 13333.4 │ 0.0 │ 0.0 │ 0.0 │ NULL │ NULL │ NULL │ 28000.0 │ PRIVATE_SECTOR_EMPLOYEE │\n", - "│ 1916744 │ 2020-09-03 │ 0 │ 0.0 │ 3.0 │ 2.0 │ NULL │ NULL │ 1974-04-01 │ 6b2ae0fa │ 5.0 │ 9.0 │ 7.0 │ 5.0 │ a55475b1 │ 3.0 │ NULL │ NULL │ 0.0 │ 86690.200 │ 4333.2 │ 4199.8003 │ 0.0 │ 0.0 │ 0.0 │ -1.0 │ 0.0 │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ a55475b1 │ 2.0 │ 41992.000 │ 2007-03-15 │ 100000.0 │ a55475b1 │ a55475b1 │ K │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ 0.0 │ 0.0 │ 81909.4 │ 0.0 │ 7152.0 │ NULL │ NULL │ NULL │ 100000.0 │ SALARIED_GOVT │\n", - "│ 1917212 │ 2020-09-03 │ 0 │ 0.0 │ 2.0 │ 0.0 │ NULL │ NULL │ 1981-10-01 │ a55475b1 │ 1.0 │ 2.0 │ 6.0 │ 2.0 │ a55475b1 │ 2.0 │ NULL │ NULL │ 0.0 │ 160111.330 │ 1864.6 │ 10964.0000 │ 0.0 │ 0.0 │ 0.0 │ -6.0 │ -10.0 │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ a55475b1 │ 3.0 │ 19254.000 │ 2000-01-15 │ 60000.0 │ a55475b1 │ a55475b1 │ K │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ 2685.8 │ 2660.0 │ 206.2 │ 0.0 │ 68.8 │ NULL │ NULL │ NULL │ 18000.0 │ EMPLOYED │\n", - "│ 1917552 │ 2020-09-03 │ 0 │ 0.0 │ 1.0 │ 0.0 │ NULL │ NULL │ 1984-12-01 │ a55475b1 │ 0.0 │ 1.0 │ 0.0 │ 2.0 │ 
a55475b1 │ 1.0 │ NULL │ NULL │ 0.0 │ 89029.805 │ 3788.0 │ 2962.6000 │ 0.0 │ 0.0 │ 0.0 │ -33.0 │ -6.0 │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ a55475b1 │ 0.0 │ 10627.937 │ 2017-10-26 │ 47000.0 │ a55475b1 │ a55475b1 │ K │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ 0.0 │ 0.0 │ 0.0 │ 0.0 │ 0.0 │ NULL │ NULL │ NULL │ 20000.0 │ SALARIED_GOVT │\n", - "└─────────┴───────────────┴────────┴─────────────┴──────────────┴─────────────┴────────────────┴─────────────┴──────────────────┴─────────────────┴───────────────────┴────────────────────┴────────────────────┴────────────────────┴────────────────┴──────────────────────┴──────────────────────┴───────────────────────┴─────────────────────────┴────────────────────────────────┴──────────────┴──────────────────────┴─────────────────────┴──────────────────────┴───────────────────────┴───────────────────────────┴──────────────────────────┴───────────────────────────────┴───────────────────┴────────────────────────┴──────────────────────────┴─────────────────────────┴──────────────────────────────────┴───────────────────────────┴────────────────────────┴─────────────────────────────┴──────────────────────┴───────────────────────────┴──────────────────┴──────────────────┴───────────────────────┴────────────────────────────┴─────────────────────┴───────────────────────┴─────────────────┴──────────────────┴───────────────────────────┴────────────────────┴────────────────────────────┴──────────────────────┴─────────────────────────┴────────────────────────┴───────────────────────────────────┴────────────────────────────┴──────────────────────┴─────────────────┴───────────────────────────────┴───────────────────────────────┴────────────────────────────────┴─────────────────────────────────┴─────────────────────────────────┴────────────────────────┴─────────────────┴──────────────────────┴────────────────────────────┴─────────────────────────┘\n", - "\n" - ], - "text/plain": 
[ - "┏━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┓\n", - "┃\u001b[1m \u001b[0m\u001b[1mcase_id\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdate_decision\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mtarget\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdays30_165L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdays360_512L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdays90_310L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m 
\u001b[0m\u001b[1mpmtscount_423L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpmtssum_45A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdateofbirth_337D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1meducation_1103M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mfirstquarter_103L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1msecondquarter_766L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mthirdquarter_1082L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mfourthquarter_440L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmaritalst_893M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mnumberofqueries_373L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mrequesttype_4525192L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mresponsedate_4527233D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mactualdpdtolerance_344P\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mamtinstpaidbefduel24m_4187115A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mannuity_780A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mannuitynextmonth_57A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mapplicationcnt_361L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mapplications30d_658L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mapplicationscnt_1086L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mavgdbddpdlast24m_3658932P\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mavgdbddpdlast3m_4187120P\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_contractmaturitydate_151D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_credlmt_1052A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_credquantity_1099L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_dpdmaxdatemonth_804T\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_dpdmaxdateyear_742T\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m 
\u001b[0m\u001b[1mmax_maxdebtpduevalodued_3940955A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_overdueamountmax_950A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_purposeofcred_722M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_residualamount_3940956A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_totalamount_503A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_cancelreason_3545846M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_childnum_21L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_currdebt_94A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_employedfrom_700D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_mainoccupationinc_437A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_profession_152M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_rejectreason_755M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_status_219L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_amount_1115A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_debtpastduevalue_732A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_debtvalue_227A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_installmentamount_833A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_instlamount_892A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_numberofinstls_810L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_pmtnumpending_403L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_last180dayaveragebalance_704A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_last30dayturnover_651A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_openingdate_857D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_amount_416A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_amtdebitincoming_4809443A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m 
\u001b[0m\u001b[1mmax_amtdebitoutgoing_4809440A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_amtdepositbalance_4809441A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_amtdepositincoming_4809444A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_amtdepositoutgoing_4809442A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_empl_industry_691L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_gender_992L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_housingtype_772L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_mainoccupationinc_384A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_incometype_1044T\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\n", - "┡━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━
━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━┩\n", - "│ \u001b[2mint64\u001b[0m │ \u001b[2mdate\u001b[0m │ \u001b[2mint64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mdate\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mdate\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mdate\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mdate\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mdate\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ 
\u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │\n", - "├─────────┼───────────────┼────────┼─────────────┼──────────────┼─────────────┼────────────────┼─────────────┼──────────────────┼─────────────────┼───────────────────┼────────────────────┼────────────────────┼────────────────────┼────────────────┼──────────────────────┼──────────────────────┼───────────────────────┼─────────────────────────┼────────────────────────────────┼──────────────┼──────────────────────┼─────────────────────┼──────────────────────┼───────────────────────┼───────────────────────────┼──────────────────────────┼───────────────────────────────┼───────────────────┼────────────────────────┼──────────────────────────┼─────────────────────────┼──────────────────────────────────┼───────────────────────────┼────────────────────────┼─────────────────────────────┼──────────────────────┼───────────────────────────┼──────────────────┼──────────────────┼───────────────────────┼────────────────────────────┼─────────────────────┼───────────────────────┼─────────────────┼──────────────────┼───────────────────────────┼────────────────────┼────────────────────────────┼──────────────────────┼─────────────────────────┼────────────────────────┼───────────────────────────────────┼────────────────────────────┼──────────────────────┼─────────────────┼───────────────────────────────┼───────────────────────────────┼────────────────────────────────┼─────────────────────────────────┼─────────────────────────────────┼────────────────────────┼─────────────────┼──────────────────────┼────────────────────────────┼─────────────────────────┤\n", - "│ \u001b[1;36m1915907\u001b[0m │ \u001b[35m2020-09-02\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m4.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[35m1965-03-01\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ 
\u001b[1;36m5.0\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[1;36m3.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m4.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m39089.600\u001b[0m │ \u001b[1;36m3740.6\u001b[0m │ \u001b[1;36m4886.2000\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m-3.0\u001b[0m │ \u001b[1;36m-6.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m55290.250\u001b[0m │ \u001b[35m2006-09-15\u001b[0m │ \u001b[1;36m120000.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32mD \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m22000.0\u001b[0m │ \u001b[1;36m22000.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m60000.0\u001b[0m │ \u001b[32mEMPLOYED \u001b[0m │\n", - "│ \u001b[1;36m1916572\u001b[0m │ \u001b[35m2020-09-03\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[1;36m6.0\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[35m1985-01-01\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m6.0\u001b[0m │ 
\u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m110432.000\u001b[0m │ \u001b[1;36m2400.0\u001b[0m │ \u001b[1;36m7555.8003\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m-5.0\u001b[0m │ \u001b[1;36m-10.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m45862.934\u001b[0m │ \u001b[35m2007-04-15\u001b[0m │ \u001b[1;36m194000.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32mT \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m13353.4\u001b[0m │ \u001b[1;36m13333.4\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m28000.0\u001b[0m │ \u001b[32mPRIVATE_SECTOR_EMPLOYEE\u001b[0m │\n", - "│ \u001b[1;36m1916744\u001b[0m │ \u001b[35m2020-09-03\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m3.0\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[35m1974-04-01\u001b[0m │ \u001b[32m6b2ae0fa \u001b[0m │ \u001b[1;36m5.0\u001b[0m │ \u001b[1;36m9.0\u001b[0m │ \u001b[1;36m7.0\u001b[0m │ \u001b[1;36m5.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m3.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m86690.200\u001b[0m │ \u001b[1;36m4333.2\u001b[0m │ \u001b[1;36m4199.8003\u001b[0m 
│ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m-1.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[1;36m41992.000\u001b[0m │ \u001b[35m2007-03-15\u001b[0m │ \u001b[1;36m100000.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32mK \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m81909.4\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m7152.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m100000.0\u001b[0m │ \u001b[32mSALARIED_GOVT \u001b[0m │\n", - "│ \u001b[1;36m1917212\u001b[0m │ \u001b[35m2020-09-03\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[35m1981-10-01\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[1;36m6.0\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m160111.330\u001b[0m │ \u001b[1;36m1864.6\u001b[0m │ \u001b[1;36m10964.0000\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m-6.0\u001b[0m │ \u001b[1;36m-10.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ 
\u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m3.0\u001b[0m │ \u001b[1;36m19254.000\u001b[0m │ \u001b[35m2000-01-15\u001b[0m │ \u001b[1;36m60000.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32mK \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m2685.8\u001b[0m │ \u001b[1;36m2660.0\u001b[0m │ \u001b[1;36m206.2\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m68.8\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m18000.0\u001b[0m │ \u001b[32mEMPLOYED \u001b[0m │\n", - "│ \u001b[1;36m1917552\u001b[0m │ \u001b[35m2020-09-03\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[35m1984-12-01\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m89029.805\u001b[0m │ \u001b[1;36m3788.0\u001b[0m │ \u001b[1;36m2962.6000\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m-33.0\u001b[0m │ \u001b[1;36m-6.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ 
\u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m10627.937\u001b[0m │ \u001b[35m2017-10-26\u001b[0m │ \u001b[1;36m47000.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32mK \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m20000.0\u001b[0m │ \u001b[32mSALARIED_GOVT \u001b[0m │\n", - "└─────────┴───────────────┴────────┴─────────────┴──────────────┴─────────────┴────────────────┴─────────────┴──────────────────┴─────────────────┴───────────────────┴────────────────────┴────────────────────┴────────────────────┴────────────────┴──────────────────────┴──────────────────────┴───────────────────────┴─────────────────────────┴────────────────────────────────┴──────────────┴──────────────────────┴─────────────────────┴──────────────────────┴───────────────────────┴───────────────────────────┴──────────────────────────┴───────────────────────────────┴───────────────────┴────────────────────────┴──────────────────────────┴─────────────────────────┴──────────────────────────────────┴───────────────────────────┴────────────────────────┴─────────────────────────────┴──────────────────────┴───────────────────────────┴──────────────────┴──────────────────┴───────────────────────┴────────────────────────────┴─────────────────────┴───────────────────────┴─────────────────┴──────────────────┴───────────────────────────┴────────────────────┴────────────────────────────┴──────────────────────┴─────────────────────────┴─────────────
───────────┴───────────────────────────────────┴────────────────────────────┴──────────────────────┴─────────────────┴───────────────────────────────┴───────────────────────────────┴────────────────────────────────┴─────────────────────────────────┴─────────────────────────────────┴────────────────────────┴─────────────────┴──────────────────────┴────────────────────────────┴─────────────────────────┘" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df_train = df_train.select(\n", - " \"case_id\",\n", - " \"date_decision\",\n", - " \"target\",\n", - " # number of credit bureau queries for the last X days.\n", - " \"days30_165L\",\n", - " \"days360_512L\",\n", - " \"days90_310L\",\n", - " # number of tax deduction payments\n", - " \"pmtscount_423L\",\n", - " # sum of tax deductions for the client\n", - " \"pmtssum_45A\",\n", - " \"dateofbirth_337D\",\n", - " \"education_1103M\",\n", - " \"firstquarter_103L\",\n", - " \"secondquarter_766L\",\n", - " \"thirdquarter_1082L\",\n", - " \"fourthquarter_440L\",\n", - " \"maritalst_893M\",\n", - " \"numberofqueries_373L\",\n", - " \"requesttype_4525192L\",\n", - " \"responsedate_4527233D\",\n", - " \"actualdpdtolerance_344P\",\n", - " \"amtinstpaidbefduel24m_4187115A\",\n", - " \"annuity_780A\",\n", - " \"annuitynextmonth_57A\",\n", - " \"applicationcnt_361L\",\n", - " \"applications30d_658L\",\n", - " \"applicationscnt_1086L\",\n", - " # average days past or before due of payment during the last 24 months.\n", - " \"avgdbddpdlast24m_3658932P\",\n", - " # average days past or before due of payment during the last 3 months.\n", - " \"avgdbddpdlast3m_4187120P\",\n", - " # end date of active contract.\n", - " \"max_contractmaturitydate_151D\",\n", - " # credit limit of an active loan.\n", - " \"max_credlmt_1052A\",\n", - " # number of credits in credit bureau\n", - " \"max_credquantity_1099L\",\n", - " \"max_dpdmaxdatemonth_804T\",\n", - " 
\"max_dpdmaxdateyear_742T\",\n", - " \"max_maxdebtpduevalodued_3940955A\",\n", - " \"max_overdueamountmax_950A\",\n", - " \"max_purposeofcred_722M\",\n", - " \"max_residualamount_3940956A\",\n", - " \"max_totalamount_503A\",\n", - " \"max_cancelreason_3545846M\",\n", - " \"max_childnum_21L\",\n", - " \"max_currdebt_94A\",\n", - " \"max_employedfrom_700D\",\n", - " # client's main income amount in their previous application\n", - " \"max_mainoccupationinc_437A\",\n", - " \"max_profession_152M\",\n", - " \"max_rejectreason_755M\",\n", - " \"max_status_219L\",\n", - " # credit amount of the active contract provided by the credit bureau\n", - " \"max_amount_1115A\",\n", - " # amount of unpaid debt for existing contracts\n", - " \"max_debtpastduevalue_732A\",\n", - " \"max_debtvalue_227A\",\n", - " \"max_installmentamount_833A\",\n", - " \"max_instlamount_892A\",\n", - " \"max_numberofinstls_810L\",\n", - " \"max_pmtnumpending_403L\",\n", - " \"max_last180dayaveragebalance_704A\",\n", - " \"max_last30dayturnover_651A\",\n", - " \"max_openingdate_857D\",\n", - " \"max_amount_416A\",\n", - " \"max_amtdebitincoming_4809443A\",\n", - " \"max_amtdebitoutgoing_4809440A\",\n", - " \"max_amtdepositbalance_4809441A\",\n", - " \"max_amtdepositincoming_4809444A\",\n", - " \"max_amtdepositoutgoing_4809442A\",\n", - " \"max_empl_industry_691L\",\n", - " \"max_gender_992L\",\n", - " \"max_housingtype_772L\",\n", - " \"max_mainoccupationinc_384A\",\n", - " \"max_incometype_1044T\",\n", - ")\n", - "\n", - "df_train.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Univariate analysis:" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
┏━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┓\n", - "┃ name ┃ pos ┃ type ┃ count ┃ nulls ┃ unique ┃ mode ┃ mean ┃ std ┃ min ┃ p25 ┃ p50 ┃ p75 ┃ max ┃\n", - "┡━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━┩\n", - "│ string │ int16 │ string │ int64 │ int64 │ int64 │ string │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │\n", - "├─────────────────┼───────┼─────────┼─────────┼────────┼─────────┼──────────┼──────────────┼───────────────┼─────────┼─────────────┼──────────────┼──────────────┼──────────────┤\n", - "│ case_id │ 0 │ int64 │ 1526659 │ 0 │ 1526659 │ NULL │ 1.286077e+06 │ 718946.592285 │ 0.0 │ 766197.5000 │ 1.357358e+06 │ 1.739022e+06 │ 2.703454e+06 │\n", - "│ target │ 2 │ int64 │ 1526659 │ 0 │ 2 │ NULL │ 3.143728e-02 │ 0.174496 │ 0.0 │ 0.0000 │ 0.000000e+00 │ 0.000000e+00 │ 1.000000e+00 │\n", - "│ days30_165L │ 3 │ float64 │ 1526659 │ 140968 │ 22 │ NULL │ 5.177078e-01 │ 0.899238 │ 0.0 │ 0.0000 │ 0.000000e+00 │ 1.000000e+00 │ 2.200000e+01 │\n", - "│ days360_512L │ 4 │ float64 │ 1526659 │ 140968 │ 92 │ NULL │ 4.777066e+00 │ 5.168856 │ 0.0 │ 1.0000 │ 3.000000e+00 │ 6.500000e+00 │ 1.150000e+02 │\n", - "│ days90_310L │ 5 │ float64 │ 1526659 │ 140968 │ 37 │ NULL │ 1.211420e+00 │ 1.655931 │ 0.0 │ 0.0000 │ 1.000000e+00 │ 2.000000e+00 │ 4.100000e+01 │\n", - "│ pmtscount_423L │ 6 │ float64 │ 1526659 │ 954021 │ 66 │ NULL │ 5.839291e+00 │ 4.148264 │ 0.0 │ 3.0000 │ 6.000000e+00 │ 7.000000e+00 │ 1.210000e+02 │\n", - "│ pmtssum_45A │ 7 │ float64 │ 1526659 │ 954021 │ 265229 │ NULL │ 1.319994e+04 │ 18117.218312 │ 0.0 │ 3156.4001 │ 8.391900e+03 │ 1.699200e+04 │ 4.768434e+05 │\n", - "│ education_1103M │ 9 │ string │ 1526659 │ 26183 │ 5 │ a55475b1 │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │\n", 
- "└─────────────────┴───────┴─────────┴─────────┴────────┴─────────┴──────────┴──────────────┴───────────────┴─────────┴─────────────┴──────────────┴──────────────┴──────────────┘\n", - "\n" - ], - "text/plain": [ - "┏━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┓\n", - "┃\u001b[1m \u001b[0m\u001b[1mname\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpos\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mtype\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcount\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mnulls\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1munique\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmode\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmean\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mstd\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmin\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mp25\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mp50\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mp75\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\n", - "┡━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━┩\n", - "│ \u001b[2mstring\u001b[0m │ \u001b[2mint16\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mint64\u001b[0m │ \u001b[2mint64\u001b[0m │ \u001b[2mint64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ 
\u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │\n", - "├─────────────────┼───────┼─────────┼─────────┼────────┼─────────┼──────────┼──────────────┼───────────────┼─────────┼─────────────┼──────────────┼──────────────┼──────────────┤\n", - "│ \u001b[32mcase_id \u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[32mint64 \u001b[0m │ \u001b[1;36m1526659\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[1;36m1526659\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m1.286077e+06\u001b[0m │ \u001b[1;36m718946.592285\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m766197.5000\u001b[0m │ \u001b[1;36m1.357358e+06\u001b[0m │ \u001b[1;36m1.739022e+06\u001b[0m │ \u001b[1;36m2.703454e+06\u001b[0m │\n", - "│ \u001b[32mtarget \u001b[0m │ \u001b[1;36m2\u001b[0m │ \u001b[32mint64 \u001b[0m │ \u001b[1;36m1526659\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[1;36m2\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m3.143728e-02\u001b[0m │ \u001b[1;36m0.174496\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0000\u001b[0m │ \u001b[1;36m0.000000e+00\u001b[0m │ \u001b[1;36m0.000000e+00\u001b[0m │ \u001b[1;36m1.000000e+00\u001b[0m │\n", - "│ \u001b[32mdays30_165L \u001b[0m │ \u001b[1;36m3\u001b[0m │ \u001b[32mfloat64\u001b[0m │ \u001b[1;36m1526659\u001b[0m │ \u001b[1;36m140968\u001b[0m │ \u001b[1;36m22\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m5.177078e-01\u001b[0m │ \u001b[1;36m0.899238\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0000\u001b[0m │ \u001b[1;36m0.000000e+00\u001b[0m │ \u001b[1;36m1.000000e+00\u001b[0m │ \u001b[1;36m2.200000e+01\u001b[0m │\n", - "│ \u001b[32mdays360_512L \u001b[0m │ \u001b[1;36m4\u001b[0m │ \u001b[32mfloat64\u001b[0m │ \u001b[1;36m1526659\u001b[0m │ \u001b[1;36m140968\u001b[0m │ \u001b[1;36m92\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m4.777066e+00\u001b[0m │ \u001b[1;36m5.168856\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m1.0000\u001b[0m │ \u001b[1;36m3.000000e+00\u001b[0m │ 
\u001b[1;36m6.500000e+00\u001b[0m │ \u001b[1;36m1.150000e+02\u001b[0m │\n", - "│ \u001b[32mdays90_310L \u001b[0m │ \u001b[1;36m5\u001b[0m │ \u001b[32mfloat64\u001b[0m │ \u001b[1;36m1526659\u001b[0m │ \u001b[1;36m140968\u001b[0m │ \u001b[1;36m37\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m1.211420e+00\u001b[0m │ \u001b[1;36m1.655931\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0000\u001b[0m │ \u001b[1;36m1.000000e+00\u001b[0m │ \u001b[1;36m2.000000e+00\u001b[0m │ \u001b[1;36m4.100000e+01\u001b[0m │\n", - "│ \u001b[32mpmtscount_423L \u001b[0m │ \u001b[1;36m6\u001b[0m │ \u001b[32mfloat64\u001b[0m │ \u001b[1;36m1526659\u001b[0m │ \u001b[1;36m954021\u001b[0m │ \u001b[1;36m66\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m5.839291e+00\u001b[0m │ \u001b[1;36m4.148264\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m3.0000\u001b[0m │ \u001b[1;36m6.000000e+00\u001b[0m │ \u001b[1;36m7.000000e+00\u001b[0m │ \u001b[1;36m1.210000e+02\u001b[0m │\n", - "│ \u001b[32mpmtssum_45A \u001b[0m │ \u001b[1;36m7\u001b[0m │ \u001b[32mfloat64\u001b[0m │ \u001b[1;36m1526659\u001b[0m │ \u001b[1;36m954021\u001b[0m │ \u001b[1;36m265229\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m1.319994e+04\u001b[0m │ \u001b[1;36m18117.218312\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m3156.4001\u001b[0m │ \u001b[1;36m8.391900e+03\u001b[0m │ \u001b[1;36m1.699200e+04\u001b[0m │ \u001b[1;36m4.768434e+05\u001b[0m │\n", - "│ \u001b[32meducation_1103M\u001b[0m │ \u001b[1;36m9\u001b[0m │ \u001b[32mstring \u001b[0m │ \u001b[1;36m1526659\u001b[0m │ \u001b[1;36m26183\u001b[0m │ \u001b[1;36m5\u001b[0m │ \u001b[32ma55475b1\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │\n", - "└─────────────────┴───────┴─────────┴─────────┴────────┴─────────┴──────────┴──────────────┴───────────────┴─────────┴─────────────┴──────────────┴──────────────┴──────────────┘" - ] - }, - 
"execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# take the first 10 columns\n", - "df_train[df_train.columns[:10]].describe()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Last-mile data preprocessing\n", - "We will perform the following transformation before feeding the data to models:\n", - "\n", - "* Missing value imputation\n", - "* Encoding categorical variables\n", - "* Handling date variables\n", - "* Handling outliers\n", - "* Scaling and normalization\n", - "\n", - "\n", - "IbisML provides a set of transformations. You can find the\n", - "[roadmap](https://github.com/ibis-project/ibis-ml/issues/32).\n", - "The [IbisML website](https://ibis-project.github.io/ibis-ml/) also includes tutorials and API documentation.\n", - "\n", - "### Impute features\n", - "Impute all numeric columns using the median. In real-life scenarios, it's important to\n", - "understand the meaning of each feature and apply the appropriate imputation method for\n", - "different features. For more imputations, please refer to this\n", - "[documentation](https://ibis-project.github.io/ibis-ml/reference/steps-imputation.html)." - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "step_impute_median = ml.ImputeMedian(ml.numeric())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Encode categorical features\n", - "Encode all categorical features using one-hot-encode. For more encoding steps,\n", - "please refer to this\n", - "[doc](https://ibis-project.github.io/ibis-ml/reference/steps-encoding.html)." 
- ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [ - "ohe_step = ml.OneHotEncode(\n", - " [\n", - " \"maritalst_893M\",\n", - " \"requesttype_4525192L\",\n", - " \"max_profession_152M\",\n", - " \"max_gender_992L\",\n", - " \"max_empl_industry_691L\",\n", - " \"max_housingtype_772L\",\n", - " \"max_incometype_1044T\",\n", - " \"max_cancelreason_3545846M\",\n", - " \"max_rejectreason_755M\",\n", - " \"education_1103M\",\n", - " \"max_status_219L\",\n", - " ]\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Handle date variables\n", - "Calculate all the days difference between any date columns and the column `date_decision`:" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [], - "source": [ - "date_cols = [col_name for col_name in df_train.columns if col_name[-1] == \"D\"]\n", - "days_to_decision_expr = {\n", - " # difference in days\n", - " f\"{col}_date_decision_diff\": (\n", - " _.date_decision.epoch_seconds() - getattr(_, col).epoch_seconds()\n", - " )\n", - " / (60 * 60 * 24)\n", - " for col in date_cols\n", - "}\n", - "days_to_decision_step = ml.Mutate(days_to_decision_expr)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Extract information from the date columns:" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [], - "source": [ - "# dow and month is set to catagoery\n", - "expand_date_step = ml.ExpandDate(ml.date(), [\"week\", \"day\"])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Handle outliers\n", - "Capping outliers using `z-score` method:" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [], - "source": [ - "step_handle_outliers = ml.HandleUnivariateOutliers(\n", - " [\"max_amount_1115A\", \"max_overdueamountmax_950A\"],\n", - " method=\"z-score\",\n", - " 
treatment=\"capping\",\n", - " deviation_factor=3,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Construct recipe\n", - "We'll construct the last mile preprocessing [recipe](https://ibis-project.github.io/ibis-ml/reference/core.html#ibis_ml.Recipe)\n", - "by chaining all transformation steps, which will be fitted to the training dataset and later applied test datasets." - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Last-mile preprocessing recipe: \n", - "Recipe(ExpandDate(date(), components=['week', 'day']),\n", - " Drop(date()),\n", - " OneHotEncode(cols(('maritalst_893M', 'requesttype_4525192L', 'max_profession_152M', 'max_gender_992L', 'max_empl_industry_691L', 'max_housingtype_772L', 'max_incometype_1044T', 'max_cancelreason_3545846M', 'max_rejectreason_755M', 'education_1103M', 'max_status_219L'))),\n", - " Drop(string()),\n", - " HandleUnivariateOutliers(cols(('max_amount_1115A', 'max_overdueamountmax_950A')),\n", - " method='z-score',\n", - " deviation_factor=3,\n", - " treatment='capping'),\n", - " ImputeMedian(numeric()),\n", - " ScaleMinMax(numeric()),\n", - " FillNA(numeric(), 0),\n", - " Cast(numeric(), 'float32'))\n" - ] - } - ], - "source": [ - "last_mile_preprocessing = ml.Recipe(\n", - " expand_date_step,\n", - " ml.Drop(ml.date()),\n", - " # handle string columns\n", - " ohe_step,\n", - " ml.Drop(ml.string()),\n", - " # handle numeric cols\n", - " # capping outliers\n", - " step_handle_outliers,\n", - " step_impute_median,\n", - " ml.ScaleMinMax(ml.numeric()),\n", - " # fill missing value\n", - " ml.FillNA(ml.numeric(), 0),\n", - " ml.Cast(ml.numeric(), \"float32\"),\n", - ")\n", - "print(f\"Last-mile preprocessing recipe: \\n{last_mile_preprocessing}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Modeling\n", - "After completing data preprocessing with Ibis 
and IbisML, we proceed to the modeling\n", - "phase. Here are two approaches:\n", - "\n", - "* Use IbisML as a independent data preprocessing component and hand off the data to downstream modeling\n", - "frameworks with various output formats:\n", - " - pandas Dataframe\n", - " - NumPy Array\n", - " - Polars Dataframe\n", - " - Dask Dataframe\n", - " - xgboost.DMatrix\n", - " - Pyarrow Table\n", - "* Use IbisML recipes as components within an sklearn Pipeline and\n", - "train models similarly to how you would do with sklearn pipeline.\n", - "\n", - "We will build an XGBoost model within a scikit-learn pipeline, and a neural network classifier using the\n", - "output transformed by IbisML recipes.\n", - "\n", - "### Train and test data splitting\n", - "We'll use hashing on the unique key to consistently split rows to different groups.\n", - "Hashing is robust to underlying changes in the data, such as adding, deleting, or\n", - "reordering rows. This deterministic process ensures that each data point is always\n", - "assigned to the same split, thereby enhancing reproducibility." - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jiting/anaconda3/envs/ibis-ml-dev/lib/python3.12/site-packages/ibis/expr/types/relations.py:685: FutureWarning: Selecting/filtering arbitrary expressions in `Table.__getitem__` is deprecated and will be removed in version 10.0. 
Please use `Table.select` or `Table.filter` instead.\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "train dataset size = 1145346 \n", - "test data size = 381313\n" - ] - } - ], - "source": [ - "\n", - "train_data, test_data = ml.train_test_split(\n", - " df_train,\n", - " unique_key=[\"case_id\"],\n", - " test_size=0.25,\n", - " random_seed=222,\n", - ")\n", - "\n", - "X_train = train_data.drop(\"target\")\n", - "y_train = train_data.target.cast(dt.float32).name(\"target\")\n", - "\n", - "X_test = test_data.drop(\"target\")\n", - "y_test = test_data.target.cast(dt.float32).name(\"target\")\n", - "\n", - "train_cnt = X_train.count().execute()\n", - "test_cnt = X_test.count().execute()\n", - "print(f\"train dataset size = {train_cnt} \\ntest data size = {test_cnt}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "Hashing provides a consistent but pseudo-random distribution of data, which\n", - "may not precisely align with the specified train/test ratio. While hash codes\n", - "ensure reproducibility, they don't guarantee an exact split. Due to statistical variance,\n", - "you might find a slight imbalance in the distribution, resulting in marginally more or\n", - "fewer samples in either the training or test dataset than the target percentage. This\n", - "minor deviation from the intended ratio is a normal consequence of hash-based\n", - "partitioning.\n", - "\n", - "\n", - "### XGBoost\n", - "In this section, we integrate XGBoost into a scikit-learn pipeline to create a\n", - "streamlined workflow for training and evaluating our model.\n", - "\n", - "We'll set up a pipeline that includes two components:\n", - "\n", - "* **Preprocessing**: This step applies the `last_mile_preprocessing` for final data preprocessing.\n", - "* **Modeling**: This step applies the `xgb.XGBClassifier()` to train the XGBoost model." 
- ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "last_mile_recipes\n", - "** y type =
Pipeline(steps=[('last_mile_recipes',\n", - " Recipe(ExpandDate(date(), components=['week', 'day']),\n", - " Drop(date()),\n", - " OneHotEncode(cols(('maritalst_893M', 'requesttype_4525192L', 'max_profession_152M', 'max_gender_992L', 'max_empl_industry_691L', 'max_housingtype_772L', 'max_incometype_1044T', 'max_cancelreason_3545846M', 'max_rejectreason_755M', 'education_1103M', 'max_sta...\n", - " feature_types=None, gamma=None, grow_policy=None,\n", - " importance_type=None,\n", - " interaction_constraints=None, learning_rate=0.05,\n", - " max_bin=None, max_cat_threshold=None,\n", - " max_cat_to_onehot=None, max_delta_step=None,\n", - " max_depth=5, max_leaves=None,\n", - " min_child_weight=None, missing=nan,\n", - " monotone_constraints=None, multi_strategy=None,\n", - " n_estimators=100, n_jobs=None,\n", - " num_parallel_tree=None, random_state=42, ...))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Pipeline(steps=[('last_mile_recipes',\n", - " Recipe(ExpandDate(date(), components=['week', 'day']),\n", - " Drop(date()),\n", - " OneHotEncode(cols(('maritalst_893M', 'requesttype_4525192L', 'max_profession_152M', 'max_gender_992L', 'max_empl_industry_691L', 'max_housingtype_772L', 'max_incometype_1044T', 'max_cancelreason_3545846M', 'max_rejectreason_755M', 'education_1103M', 'max_sta...\n", - " feature_types=None, gamma=None, grow_policy=None,\n", - " importance_type=None,\n", - " interaction_constraints=None, learning_rate=0.05,\n", - " max_bin=None, max_cat_threshold=None,\n", - " max_cat_to_onehot=None, max_delta_step=None,\n", - " max_depth=5, max_leaves=None,\n", - " min_child_weight=None, missing=nan,\n", - " monotone_constraints=None, multi_strategy=None,\n", - " n_estimators=100, n_jobs=None,\n", - " num_parallel_tree=None, random_state=42, ...))])
Recipe(ExpandDate(date(), components=['week', 'day']),\n", - " Drop(date()),\n", - " OneHotEncode(cols(('maritalst_893M', 'requesttype_4525192L', 'max_profession_152M', 'max_gender_992L', 'max_empl_industry_691L', 'max_housingtype_772L', 'max_incometype_1044T', 'max_cancelreason_3545846M', 'max_rejectreason_755M', 'education_1103M', 'max_status_219L'))),\n", - " Drop(string()),\n", - " HandleUnivariateOutliers(cols(('max_amount_1115A', 'max_overdueamountmax_950A')),\n", - " method='z-score',\n", - " deviation_factor=3,\n", - " treatment='capping'),\n", - " ImputeMedian(numeric()),\n", - " ScaleMinMax(numeric()),\n", - " FillNA(numeric(), 0),\n", - " Cast(numeric(), 'float32'))
ExpandDate(date(), components=['week', 'day'])
Drop(date())
OneHotEncode(cols(('maritalst_893M', 'requesttype_4525192L', 'max_profession_152M', 'max_gender_992L', 'max_empl_industry_691L', 'max_housingtype_772L', 'max_incometype_1044T', 'max_cancelreason_3545846M', 'max_rejectreason_755M', 'education_1103M', 'max_status_219L')))
Drop(string())
HandleUnivariateOutliers(cols(('max_amount_1115A', 'max_overdueamountmax_950A')),\n", - " method='z-score',\n", - " deviation_factor=3,\n", - " treatment='capping')
ImputeMedian(numeric())
ScaleMinMax(numeric())
FillNA(numeric(), 0)
Cast(numeric(), 'float32')
XGBClassifier(base_score=None, booster=None, callbacks=None,\n", - " colsample_bylevel=None, colsample_bynode=None,\n", - " colsample_bytree=0.8, device=None, early_stopping_rounds=None,\n", - " enable_categorical=False, eval_metric=None, feature_types=None,\n", - " gamma=None, grow_policy=None, importance_type=None,\n", - " interaction_constraints=None, learning_rate=0.05, max_bin=None,\n", - " max_cat_threshold=None, max_cat_to_onehot=None,\n", - " max_delta_step=None, max_depth=5, max_leaves=None,\n", - " min_child_weight=None, missing=nan, monotone_constraints=None,\n", - " multi_strategy=None, n_estimators=100, n_jobs=None,\n", - " num_parallel_tree=None, random_state=42, ...)
┏━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━┓\n", + "┃ case_id ┃ date_decision ┃ MONTH ┃ WEEK_NUM ┃ target ┃\n", + "┡━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━┩\n", + "│ int64 │ string │ int64 │ int64 │ int64 │\n", + "├─────────┼───────────────┼────────┼──────────┼────────┤\n", + "│ 0 │ 2019-01-03 │ 201901 │ 0 │ 0 │\n", + "│ 1 │ 2019-01-03 │ 201901 │ 0 │ 0 │\n", + "│ 2 │ 2019-01-04 │ 201901 │ 0 │ 0 │\n", + "│ 3 │ 2019-01-03 │ 201901 │ 0 │ 0 │\n", + "│ 4 │ 2019-01-04 │ 201901 │ 0 │ 1 │\n", + "└─────────┴───────────────┴────────┴──────────┴────────┘\n", + "\n" ], - "source": [ - "from sklearn.pipeline import Pipeline\n", - "from sklearn.metrics import roc_auc_score\n", - "import xgboost as xgb\n", - "\n", - "model = xgb.XGBClassifier(\n", - " n_estimators=100,\n", - " max_depth=5,\n", - " learning_rate=0.05,\n", - " subsample=0.8,\n", - " colsample_bytree=0.8,\n", - " random_state=42,\n", - ")\n", - "# create the pipeline with the last mile ML recipes and the model\n", - "pipe = Pipeline([(\"last_mile_recipes\", last_mile_preprocessing), (\"model\", model)])\n", - "# fit the pipeline on the training data\n", - "pipe.fit(X_train, y_train)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's evaluate the model on the test data using Gini index:" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "gini_score for test dataset: 0.06491440835995244\n" - ] - } + "text/plain": [ + "┏━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━┓\n", + "┃\u001b[1m \u001b[0m\u001b[1mcase_id\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdate_decision\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mMONTH\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mWEEK_NUM\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mtarget\u001b[0m\u001b[1m \u001b[0m┃\n", + 
"┡━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━┩\n", + "│ \u001b[2mint64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mint64\u001b[0m │ \u001b[2mint64\u001b[0m │ \u001b[2mint64\u001b[0m │\n", + "├─────────┼───────────────┼────────┼──────────┼────────┤\n", + "│ \u001b[1;36m0\u001b[0m │ \u001b[32m2019-01-03 \u001b[0m │ \u001b[1;36m201901\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[1;36m0\u001b[0m │\n", + "│ \u001b[1;36m1\u001b[0m │ \u001b[32m2019-01-03 \u001b[0m │ \u001b[1;36m201901\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[1;36m0\u001b[0m │\n", + "│ \u001b[1;36m2\u001b[0m │ \u001b[32m2019-01-04 \u001b[0m │ \u001b[1;36m201901\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[1;36m0\u001b[0m │\n", + "│ \u001b[1;36m3\u001b[0m │ \u001b[32m2019-01-03 \u001b[0m │ \u001b[1;36m201901\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[1;36m0\u001b[0m │\n", + "│ \u001b[1;36m4\u001b[0m │ \u001b[32m2019-01-04 \u001b[0m │ \u001b[1;36m201901\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[1;36m1\u001b[0m │\n", + "└─────────┴───────────────┴────────┴──────────┴────────┘" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ibis.read_parquet(TRAIN_DIR / \"train_base.parquet\").head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Feature tables\n", + "The remaining files contain features, consisting of approximately 370 features from\n", + "previous loan applications and external data sources. 
Their definitions can be found in the feature\n", + "definition [file](https://www.kaggle.com/competitions/home-credit-credit-risk-model-stability/data)\n", + "from the competition website.\n", + "\n", + "There are several things we want to mention for the feature tables:\n", + "\n", + "* **Union datasets**: One dataset could be saved into multiple parquet files, such as\n", + "`train_applprev_1_0.parquet` and `train_applprev_1_1.parquet`, We need to union this data.\n", + "* **Dataset levels**: Datasets may have different levels, which we will explain as\n", + "follows:\n", + " * **Depth = 0**: Each row in the table is identified by a unique `case_id`.\n", + " In this case, you can directly join the features with the base table and use them as\n", + " features for further analysis or processing.\n", + " * **Depth > 0**: You will group the data based on the `case_id` and perform calculations\n", + " or aggregations within each group.\n", + "\n", + "Here are two examples of tables with different levels.\n", + "\n", + "Example of table with depth = 0, `case_id` is the row identifier, features can be directly joined\n", + " with the base table." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
┏━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┓\n", + "┃ case_id ┃ assignmentdate_238D ┃ assignmentdate_4527235D ┃ assignmentdate_4955616D ┃ birthdate_574D ┃ contractssum_5085716L ┃ dateofbirth_337D ┃ dateofbirth_342D ┃ days120_123L ┃ days180_256L ┃ days30_165L ┃ days360_512L ┃ days90_310L ┃ description_5085714M ┃ education_1103M ┃ education_88M ┃ firstquarter_103L ┃ for3years_128L ┃ for3years_504L ┃ for3years_584L ┃ formonth_118L ┃ formonth_206L ┃ formonth_535L ┃ forquarter_1017L ┃ forquarter_462L ┃ forquarter_634L ┃ fortoday_1092L ┃ forweek_1077L ┃ forweek_528L ┃ forweek_601L ┃ foryear_618L ┃ foryear_818L ┃ foryear_850L ┃ fourthquarter_440L ┃ maritalst_385M ┃ maritalst_893M ┃ numberofqueries_373L ┃ pmtaverage_3A ┃ pmtaverage_4527227A ┃ pmtaverage_4955615A ┃ pmtcount_4527229L ┃ pmtcount_4955617L ┃ pmtcount_693L ┃ pmtscount_423L ┃ pmtssum_45A ┃ requesttype_4525192L ┃ responsedate_1012D ┃ responsedate_4527233D ┃ responsedate_4917613D ┃ riskassesment_302T ┃ riskassesment_940T ┃ secondquarter_766L ┃ thirdquarter_1082L ┃\n", + 
"┡━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━┩\n", + "│ int64 │ string │ string │ string │ string │ float64 │ string │ string │ float64 │ float64 │ float64 │ float64 │ float64 │ string │ string │ string │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ string │ string │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ string │ string │ string │ string │ string │ float64 │ float64 │ float64 │\n", + 
"├─────────┼─────────────────────┼─────────────────────────┼─────────────────────────┼────────────────┼───────────────────────┼──────────────────┼──────────────────┼──────────────┼──────────────┼─────────────┼──────────────┼─────────────┼──────────────────────┼─────────────────┼───────────────┼───────────────────┼────────────────┼────────────────┼────────────────┼───────────────┼───────────────┼───────────────┼──────────────────┼─────────────────┼─────────────────┼────────────────┼───────────────┼──────────────┼──────────────┼──────────────┼──────────────┼──────────────┼────────────────────┼────────────────┼────────────────┼──────────────────────┼───────────────┼─────────────────────┼─────────────────────┼───────────────────┼───────────────────┼───────────────┼────────────────┼─────────────┼──────────────────────┼────────────────────┼───────────────────────┼───────────────────────┼────────────────────┼────────────────────┼────────────────────┼────────────────────┤\n", + "│ 357 │ NULL │ NULL │ NULL │ 1988-04-01 │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ a55475b1 │ a55475b1 │ a55475b1 │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ a55475b1 │ a55475b1 │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ 6.0 │ 6301.4000 │ NULL │ 2019-01-25 │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │\n", + "│ 381 │ NULL │ NULL │ NULL │ 1973-11-01 │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ a55475b1 │ a55475b1 │ a55475b1 │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ a55475b1 │ a55475b1 │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ 6.0 │ 4019.6000 │ NULL │ 2019-01-25 │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │\n", + "│ 388 │ NULL │ NULL │ NULL │ 1989-04-01 │ NULL │ 1989-04-01 │ NULL │ 6.0 │ 8.0 │ 2.0 │ 10.0 │ 4.0 │ a55475b1 │ a55475b1 │ a55475b1 │ 2.0 │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ 
NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ 6.0 │ a55475b1 │ a55475b1 │ 10.0 │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ 6.0 │ 14548.0000 │ NULL │ 2019-01-28 │ NULL │ NULL │ NULL │ NULL │ 3.0 │ 5.0 │\n", + "│ 405 │ NULL │ NULL │ NULL │ 1974-03-01 │ NULL │ 1974-03-01 │ NULL │ 0.0 │ 0.0 │ 0.0 │ 1.0 │ 0.0 │ a55475b1 │ a55475b1 │ a55475b1 │ 0.0 │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ 4.0 │ a55475b1 │ a55475b1 │ 1.0 │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ 6.0 │ 10498.2400 │ NULL │ 2019-01-21 │ NULL │ NULL │ NULL │ NULL │ 2.0 │ 0.0 │\n", + "│ 409 │ NULL │ NULL │ NULL │ 1993-06-01 │ NULL │ 1993-06-01 │ NULL │ 2.0 │ 3.0 │ 0.0 │ 3.0 │ 1.0 │ a55475b1 │ 717ddd49 │ a55475b1 │ 4.0 │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ 1.0 │ a7fcb6e5 │ a55475b1 │ 3.0 │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ 7.0 │ 6344.8804 │ NULL │ 2019-01-21 │ NULL │ NULL │ NULL │ NULL │ 0.0 │ 4.0 │\n", + 
"└─────────┴─────────────────────┴─────────────────────────┴─────────────────────────┴────────────────┴───────────────────────┴──────────────────┴──────────────────┴──────────────┴──────────────┴─────────────┴──────────────┴─────────────┴──────────────────────┴─────────────────┴───────────────┴───────────────────┴────────────────┴────────────────┴────────────────┴───────────────┴───────────────┴───────────────┴──────────────────┴─────────────────┴─────────────────┴────────────────┴───────────────┴──────────────┴──────────────┴──────────────┴──────────────┴──────────────┴────────────────────┴────────────────┴────────────────┴──────────────────────┴───────────────┴─────────────────────┴─────────────────────┴───────────────────┴───────────────────┴───────────────┴────────────────┴─────────────┴──────────────────────┴────────────────────┴───────────────────────┴───────────────────────┴────────────────────┴────────────────────┴────────────────────┴────────────────────┘\n", + "\n" ], - "source": [ - "y_pred_proba = pipe.predict_proba(X_test)[:, 1]\n", - "# calculate the AUC score\n", - "auc = roc_auc_score(y_test, y_pred_proba)\n", - "\n", - "# calculate the Gini score\n", - "gini_score = 2 * auc - 1\n", - "print(f\"gini_score for test dataset: {gini_score:,}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "The competition is evaluated using a Gini stability metric. For more information, see the\n", - "[evaluation guidelines](https://www.kaggle.com/competitions/home-credit-credit-risk-model-stability/overview/evaluation)\n", - "\n", - "\n", - "### Neural network classifier\n", - "Build a neural network classifier using PyTorch and PyTorch Lightning.\n", - "\n", - "It is not recommended to build a neural network classifier for this competition, we are building\n", - "it solely for demonstration purposes.\n", - "\n", - "\n", - "We'll demonstrate how to build a model by directly passing the data to it. 
IbisML recipes can output\n", - "data in various formats, making it compatible with different modeling frameworks.\n", - "Let's first train the recipe:" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Recipe(ExpandDate(date(), components=['week', 'day']),\n", - " Drop(date()),\n", - " OneHotEncode(cols(('maritalst_893M', 'requesttype_4525192L', 'max_profession_152M', 'max_gender_992L', 'max_empl_industry_691L', 'max_housingtype_772L', 'max_incometype_1044T', 'max_cancelreason_3545846M', 'max_rejectreason_755M', 'education_1103M', 'max_status_219L'))),\n", - " Drop(string()),\n", - " HandleUnivariateOutliers(cols(('max_amount_1115A', 'max_overdueamountmax_950A')),\n", - " method='z-score',\n", - " deviation_factor=3,\n", - " treatment='capping'),\n", - " ImputeMedian(numeric()),\n", - " ScaleMinMax(numeric()),\n", - " FillNA(numeric(), 0),\n", - " Cast(numeric(), 'float32'))" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } + "text/plain": [ + 
"┏━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┓\n", + "┃\u001b[1m \u001b[0m\u001b[1mcase_id\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1massignmentdate_238D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1massignmentdate_4527235D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1massignmentdate_4955616D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mbirthdate_574D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcontractssum_5085716L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdateofbirth_337D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdateofbirth_342D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdays120_123L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdays180_256L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdays30_165L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdays360_512L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdays90_310L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdescription_5085714M\u001b[0m\u001b[1m 
\u001b[0m┃\u001b[1m \u001b[0m\u001b[1meducation_1103M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1meducation_88M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mfirstquarter_103L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mfor3years_128L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mfor3years_504L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mfor3years_584L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mformonth_118L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mformonth_206L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mformonth_535L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mforquarter_1017L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mforquarter_462L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mforquarter_634L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mfortoday_1092L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mforweek_1077L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mforweek_528L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mforweek_601L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mforyear_618L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mforyear_818L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mforyear_850L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mfourthquarter_440L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmaritalst_385M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmaritalst_893M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mnumberofqueries_373L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpmtaverage_3A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpmtaverage_4527227A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpmtaverage_4955615A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpmtcount_4527229L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m 
\u001b[0m\u001b[1mpmtcount_4955617L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpmtcount_693L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpmtscount_423L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpmtssum_45A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mrequesttype_4525192L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mresponsedate_1012D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mresponsedate_4527233D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mresponsedate_4917613D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mriskassesment_302T\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mriskassesment_940T\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1msecondquarter_766L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mthirdquarter_1082L\u001b[0m\u001b[1m \u001b[0m┃\n", + "┡━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━┩\n", + "│ \u001b[2mint64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ 
\u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │\n", + 
"├─────────┼─────────────────────┼─────────────────────────┼─────────────────────────┼────────────────┼───────────────────────┼──────────────────┼──────────────────┼──────────────┼──────────────┼─────────────┼──────────────┼─────────────┼──────────────────────┼─────────────────┼───────────────┼───────────────────┼────────────────┼────────────────┼────────────────┼───────────────┼───────────────┼───────────────┼──────────────────┼─────────────────┼─────────────────┼────────────────┼───────────────┼──────────────┼──────────────┼──────────────┼──────────────┼──────────────┼────────────────────┼────────────────┼────────────────┼──────────────────────┼───────────────┼─────────────────────┼─────────────────────┼───────────────────┼───────────────────┼───────────────┼────────────────┼─────────────┼──────────────────────┼────────────────────┼───────────────────────┼───────────────────────┼────────────────────┼────────────────────┼────────────────────┼────────────────────┤\n", + "│ \u001b[1;36m357\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m1988-04-01 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ 
\u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m6.0\u001b[0m │ \u001b[1;36m6301.4000\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m2019-01-25 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │\n", + "│ \u001b[1;36m381\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m1973-11-01 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m6.0\u001b[0m │ \u001b[1;36m4019.6000\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m2019-01-25 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │\n", + "│ \u001b[1;36m388\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m1989-04-01 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m1989-04-01 \u001b[0m │ 
\u001b[2mNULL\u001b[0m │ \u001b[1;36m6.0\u001b[0m │ \u001b[1;36m8.0\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[1;36m10.0\u001b[0m │ \u001b[1;36m4.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m6.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m10.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m6.0\u001b[0m │ \u001b[1;36m14548.0000\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m2019-01-28 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m3.0\u001b[0m │ \u001b[1;36m5.0\u001b[0m │\n", + "│ \u001b[1;36m405\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m1974-03-01 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m1974-03-01 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ 
\u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m4.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m6.0\u001b[0m │ \u001b[1;36m10498.2400\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m2019-01-21 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │\n", + "│ \u001b[1;36m409\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m1993-06-01 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m1993-06-01 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[1;36m3.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m3.0\u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32m717ddd49 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m4.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[32ma7fcb6e5 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m3.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m7.0\u001b[0m │ \u001b[1;36m6344.8804\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m2019-01-21 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ 
\u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m4.0\u001b[0m │\n", + "└─────────┴─────────────────────┴─────────────────────────┴─────────────────────────┴────────────────┴───────────────────────┴──────────────────┴──────────────────┴──────────────┴──────────────┴─────────────┴──────────────┴─────────────┴──────────────────────┴─────────────────┴───────────────┴───────────────────┴────────────────┴────────────────┴────────────────┴───────────────┴───────────────┴───────────────┴──────────────────┴─────────────────┴─────────────────┴────────────────┴───────────────┴──────────────┴──────────────┴──────────────┴──────────────┴──────────────┴────────────────────┴────────────────┴────────────────┴──────────────────────┴───────────────┴─────────────────────┴─────────────────────┴───────────────────┴───────────────────┴───────────────┴────────────────┴─────────────┴──────────────────────┴────────────────────┴───────────────────────┴───────────────────────┴────────────────────┴────────────────────┴────────────────────┴────────────────────┘" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ibis.read_parquet(TRAIN_DIR / \"train_static_cb_0.parquet\").head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Example of a table with depth = 1, we need to aggregate the features and collect statistics\n", + "based on `case_id` then join with the base table." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
┏━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┓\n", + "┃ num_group1 ┃ case_id ┃ amount_1115A ┃ classificationofcontr_1114M ┃ contractdate_551D ┃ contractmaturitydate_151D ┃ contractst_516M ┃ contracttype_653M ┃ credlmt_1052A ┃ credlmt_228A ┃ credlmt_3940954A ┃ credor_3940957M ┃ credquantity_1099L ┃ credquantity_984L ┃ debtpastduevalue_732A ┃ debtvalue_227A ┃ dpd_550P ┃ dpd_733P ┃ dpdmax_851P ┃ dpdmaxdatemonth_804T ┃ dpdmaxdateyear_742T ┃ installmentamount_644A ┃ installmentamount_833A ┃ instlamount_892A ┃ interesteffectiverate_369L ┃ interestrateyearly_538L ┃ lastupdate_260D ┃ maxdebtpduevalodued_3940955A ┃ numberofinstls_810L ┃ overdueamountmax_950A ┃ overdueamountmaxdatemonth_494T ┃ overdueamountmaxdateyear_432T ┃ periodicityofpmts_997L ┃ periodicityofpmts_997M ┃ pmtdaysoverdue_1135P ┃ pmtmethod_731M ┃ pmtnumpending_403L ┃ purposeofcred_722M ┃ residualamount_1093A ┃ residualamount_127A ┃ residualamount_3940956A ┃ subjectrole_326M ┃ subjectrole_43M ┃ totalamount_503A ┃ totalamount_881A ┃\n", + 
"┡━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━┩\n", + "│ int64 │ int64 │ float64 │ string │ string │ string │ string │ string │ float64 │ float64 │ float64 │ string │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ string │ float64 │ float64 │ float64 │ float64 │ float64 │ string │ string │ float64 │ string │ float64 │ string │ float64 │ float64 │ float64 │ string │ string │ float64 │ float64 │\n", + 
"├────────────┼─────────┼──────────────┼─────────────────────────────┼───────────────────┼───────────────────────────┼─────────────────┼───────────────────┼───────────────┼──────────────┼──────────────────┼─────────────────┼────────────────────┼───────────────────┼───────────────────────┼────────────────┼──────────┼──────────┼─────────────┼──────────────────────┼─────────────────────┼────────────────────────┼────────────────────────┼──────────────────┼────────────────────────────┼─────────────────────────┼─────────────────┼──────────────────────────────┼─────────────────────┼───────────────────────┼────────────────────────────────┼───────────────────────────────┼────────────────────────┼────────────────────────┼──────────────────────┼────────────────┼────────────────────┼────────────────────┼──────────────────────┼─────────────────────┼─────────────────────────┼──────────────────┼─────────────────┼──────────────────┼──────────────────┤\n", + "│ 0 │ 467 │ NULL │ ea6782cc │ 2011-06-15 │ 2031-06-13 │ 7241344e │ 724be82a │ 3.000000e+06 │ 10000.0 │ 3.000000e+06 │ P164_34_168 │ 2.0 │ 1.0 │ NULL │ NULL │ 0.0 │ 0.0 │ NULL │ NULL │ NULL │ 0.0 │ 0.000 │ NULL │ NULL │ NULL │ 2019-01-20 │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ a55475b1 │ NULL │ a55475b1 │ NULL │ 96a8fdfe │ 0.0 │ 0.0 │ NULL │ fa4f56f1 │ ab3c25cf │ 3.000000e+06 │ 10000.0 │\n", + "│ 1 │ 467 │ NULL │ ea6782cc │ 2019-01-04 │ 2021-08-04 │ 7241344e │ 724be82a │ NULL │ NULL │ 1.303650e+05 │ P164_34_168 │ 1.0 │ 2.0 │ NULL │ NULL │ 0.0 │ 0.0 │ NULL │ NULL │ NULL │ 0.0 │ 26571.969 │ NULL │ NULL │ NULL │ 2019-01-20 │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ a55475b1 │ NULL │ a55475b1 │ NULL │ 96a8fdfe │ NULL │ NULL │ NULL │ ab3c25cf │ ab3c25cf │ 7.800000e+04 │ 960000.0 │\n", + "│ 2 │ 467 │ 78000.0 │ ea6782cc │ 2016-10-25 │ 2019-10-25 │ 7241344e │ 4257cbed │ NULL │ NULL │ NULL │ c5a72b57 │ NULL │ NULL │ 0.0 │ 26571.969 │ NULL │ NULL │ 0.0 │ 11.0 │ 2016.0 │ NULL │ NULL │ 2898.76 │ NULL │ NULL │ 2019-01-10 │ 0.0 │ 36.0 │ 
0.0 │ 11.0 │ 2016.0 │ NULL │ a0b598e4 │ 0.0 │ e914c86c │ 10.0 │ 96a8fdfe │ NULL │ NULL │ NULL │ a55475b1 │ a55475b1 │ NULL │ NULL │\n", + "│ 0 │ 1445 │ NULL │ ea6782cc │ 2015-01-30 │ 2021-01-30 │ 7241344e │ 1c9c5356 │ 4.000000e+05 │ 100000.0 │ 7.400000e+04 │ b619fa46 │ 2.0 │ 5.0 │ 0.0 │ NULL │ 0.0 │ 0.0 │ 200418.0 │ 1.0 │ 2018.0 │ 0.0 │ 0.000 │ NULL │ NULL │ NULL │ 2019-01-19 │ 0.4 │ NULL │ 1.4 │ 2.0 │ 2018.0 │ NULL │ a55475b1 │ 0.0 │ a55475b1 │ NULL │ 60c73645 │ 0.0 │ 0.0 │ 73044.18 │ daf49a8a │ ab3c25cf │ 4.000000e+05 │ 100000.0 │\n", + "│ 1 │ 1445 │ NULL │ 01f63ac8 │ 2014-09-12 │ 2021-09-12 │ 7241344e │ 724be82a │ NULL │ NULL │ 4.000000e+05 │ 74bd67a8 │ 3.0 │ 17.0 │ NULL │ NULL │ 0.0 │ 0.0 │ NULL │ NULL │ NULL │ 0.0 │ 209617.770 │ NULL │ NULL │ NULL │ 2019-01-13 │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ a55475b1 │ NULL │ a55475b1 │ NULL │ 96a8fdfe │ NULL │ NULL │ NULL │ ab3c25cf │ ab3c25cf │ 3.968006e+05 │ 184587.8 │\n", + "└────────────┴─────────┴──────────────┴─────────────────────────────┴───────────────────┴───────────────────────────┴─────────────────┴───────────────────┴───────────────┴──────────────┴──────────────────┴─────────────────┴────────────────────┴───────────────────┴───────────────────────┴────────────────┴──────────┴──────────┴─────────────┴──────────────────────┴─────────────────────┴────────────────────────┴────────────────────────┴──────────────────┴────────────────────────────┴─────────────────────────┴─────────────────┴──────────────────────────────┴─────────────────────┴───────────────────────┴────────────────────────────────┴───────────────────────────────┴────────────────────────┴────────────────────────┴──────────────────────┴────────────────┴────────────────────┴────────────────────┴──────────────────────┴─────────────────────┴─────────────────────────┴──────────────────┴─────────────────┴──────────────────┴──────────────────┘\n", + "\n" ], - "source": [ - "# train preprocessing recipe using training dataset\n", - 
"last_mile_preprocessing.fit(X_train, y_train)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In the previous cell, we trained the recipe using the training dataset. Now, we will\n", - "transform both the train and test datasets using the same recipe. The default output format is a `NumPy array`" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "train data shape = (1145346, 980)\n", - "test data shape = (381313, 980)\n" - ] - } + "text/plain": [ + "┏━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┓\n", + "┃\u001b[1m \u001b[0m\u001b[1mnum_group1\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcase_id\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mamount_1115A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mclassificationofcontr_1114M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcontractdate_551D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcontractmaturitydate_151D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m 
\u001b[0m\u001b[1mcontractst_516M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcontracttype_653M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcredlmt_1052A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcredlmt_228A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcredlmt_3940954A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcredor_3940957M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcredquantity_1099L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcredquantity_984L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdebtpastduevalue_732A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdebtvalue_227A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdpd_550P\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdpd_733P\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdpdmax_851P\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdpdmaxdatemonth_804T\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdpdmaxdateyear_742T\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1minstallmentamount_644A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1minstallmentamount_833A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1minstlamount_892A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1minteresteffectiverate_369L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1minterestrateyearly_538L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mlastupdate_260D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmaxdebtpduevalodued_3940955A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mnumberofinstls_810L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1moverdueamountmax_950A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1moverdueamountmaxdatemonth_494T\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1moverdueamountmaxdateyear_432T\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m 
\u001b[0m\u001b[1mperiodicityofpmts_997L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mperiodicityofpmts_997M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpmtdaysoverdue_1135P\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpmtmethod_731M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpmtnumpending_403L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpurposeofcred_722M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mresidualamount_1093A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mresidualamount_127A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mresidualamount_3940956A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1msubjectrole_326M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1msubjectrole_43M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mtotalamount_503A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mtotalamount_881A\u001b[0m\u001b[1m \u001b[0m┃\n", + "┡━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━┩\n", + "│ \u001b[2mint64\u001b[0m │ \u001b[2mint64\u001b[0m │ 
\u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │\n", + 
"├────────────┼─────────┼──────────────┼─────────────────────────────┼───────────────────┼───────────────────────────┼─────────────────┼───────────────────┼───────────────┼──────────────┼──────────────────┼─────────────────┼────────────────────┼───────────────────┼───────────────────────┼────────────────┼──────────┼──────────┼─────────────┼──────────────────────┼─────────────────────┼────────────────────────┼────────────────────────┼──────────────────┼────────────────────────────┼─────────────────────────┼─────────────────┼──────────────────────────────┼─────────────────────┼───────────────────────┼────────────────────────────────┼───────────────────────────────┼────────────────────────┼────────────────────────┼──────────────────────┼────────────────┼────────────────────┼────────────────────┼──────────────────────┼─────────────────────┼─────────────────────────┼──────────────────┼─────────────────┼──────────────────┼──────────────────┤\n", + "│ \u001b[1;36m0\u001b[0m │ \u001b[1;36m467\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32mea6782cc \u001b[0m │ \u001b[32m2011-06-15 \u001b[0m │ \u001b[32m2031-06-13 \u001b[0m │ \u001b[32m7241344e \u001b[0m │ \u001b[32m724be82a \u001b[0m │ \u001b[1;36m3.000000e+06\u001b[0m │ \u001b[1;36m10000.0\u001b[0m │ \u001b[1;36m3.000000e+06\u001b[0m │ \u001b[32mP164_34_168 \u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.000\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m2019-01-20 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[2mNULL\u001b[0m │ 
\u001b[32m96a8fdfe \u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32mfa4f56f1 \u001b[0m │ \u001b[32mab3c25cf \u001b[0m │ \u001b[1;36m3.000000e+06\u001b[0m │ \u001b[1;36m10000.0\u001b[0m │\n", + "│ \u001b[1;36m1\u001b[0m │ \u001b[1;36m467\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32mea6782cc \u001b[0m │ \u001b[32m2019-01-04 \u001b[0m │ \u001b[32m2021-08-04 \u001b[0m │ \u001b[32m7241344e \u001b[0m │ \u001b[32m724be82a \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m1.303650e+05\u001b[0m │ \u001b[32mP164_34_168 \u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m26571.969\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m2019-01-20 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m96a8fdfe \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32mab3c25cf \u001b[0m │ \u001b[32mab3c25cf \u001b[0m │ \u001b[1;36m7.800000e+04\u001b[0m │ \u001b[1;36m960000.0\u001b[0m │\n", + "│ \u001b[1;36m2\u001b[0m │ \u001b[1;36m467\u001b[0m │ \u001b[1;36m78000.0\u001b[0m │ \u001b[32mea6782cc \u001b[0m │ \u001b[32m2016-10-25 \u001b[0m │ \u001b[32m2019-10-25 \u001b[0m │ \u001b[32m7241344e \u001b[0m │ \u001b[32m4257cbed \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32mc5a72b57 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m26571.969\u001b[0m │ \u001b[2mNULL\u001b[0m │ 
\u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m11.0\u001b[0m │ \u001b[1;36m2016.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m2898.76\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m2019-01-10 \u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m36.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m11.0\u001b[0m │ \u001b[1;36m2016.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma0b598e4 \u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[32me914c86c \u001b[0m │ \u001b[1;36m10.0\u001b[0m │ \u001b[32m96a8fdfe \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │\n", + "│ \u001b[1;36m0\u001b[0m │ \u001b[1;36m1445\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32mea6782cc \u001b[0m │ \u001b[32m2015-01-30 \u001b[0m │ \u001b[32m2021-01-30 \u001b[0m │ \u001b[32m7241344e \u001b[0m │ \u001b[32m1c9c5356 \u001b[0m │ \u001b[1;36m4.000000e+05\u001b[0m │ \u001b[1;36m100000.0\u001b[0m │ \u001b[1;36m7.400000e+04\u001b[0m │ \u001b[32mb619fa46 \u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[1;36m5.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m200418.0\u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[1;36m2018.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.000\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m2019-01-19 \u001b[0m │ \u001b[1;36m0.4\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m1.4\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[1;36m2018.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m60c73645 \u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m73044.18\u001b[0m │ \u001b[32mdaf49a8a \u001b[0m │ 
\u001b[32mab3c25cf \u001b[0m │ \u001b[1;36m4.000000e+05\u001b[0m │ \u001b[1;36m100000.0\u001b[0m │\n", + "│ \u001b[1;36m1\u001b[0m │ \u001b[1;36m1445\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m01f63ac8 \u001b[0m │ \u001b[32m2014-09-12 \u001b[0m │ \u001b[32m2021-09-12 \u001b[0m │ \u001b[32m7241344e \u001b[0m │ \u001b[32m724be82a \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m4.000000e+05\u001b[0m │ \u001b[32m74bd67a8 \u001b[0m │ \u001b[1;36m3.0\u001b[0m │ \u001b[1;36m17.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m209617.770\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m2019-01-13 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32m96a8fdfe \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32mab3c25cf \u001b[0m │ \u001b[32mab3c25cf \u001b[0m │ \u001b[1;36m3.968006e+05\u001b[0m │ \u001b[1;36m184587.8\u001b[0m │\n", + 
"└────────────┴─────────┴──────────────┴─────────────────────────────┴───────────────────┴───────────────────────────┴─────────────────┴───────────────────┴───────────────┴──────────────┴──────────────────┴─────────────────┴────────────────────┴───────────────────┴───────────────────────┴────────────────┴──────────┴──────────┴─────────────┴──────────────────────┴─────────────────────┴────────────────────────┴────────────────────────┴──────────────────┴────────────────────────────┴─────────────────────────┴─────────────────┴──────────────────────────────┴─────────────────────┴───────────────────────┴────────────────────────────────┴───────────────────────────────┴────────────────────────┴────────────────────────┴──────────────────────┴────────────────┴────────────────────┴────────────────────┴──────────────────────┴─────────────────────┴─────────────────────────┴──────────────────┴─────────────────┴──────────────────┴──────────────────┘" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ibis.read_parquet(TRAIN_DIR / \"train_credit_bureau_b_1.parquet\").relocate(\n", + " \"num_group1\"\n", + ").order_by([\"case_id\", \"num_group1\"]).head(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For more details on features and its exploratory data analysis (EDA), you can refer to\n", + "feature definition and these Kaggle notebooks:\n", + "\n", + "* [Feature\n", + " definition](https://www.kaggle.com/competitions/home-credit-credit-risk-model-stability/data#:~:text=calendar_view_week-,feature_definitions,-.csv)\n", + "* [Home credit risk prediction\n", + " EDA](https://www.kaggle.com/code/loki97/home-credit-risk-prediction-eda)\n", + "* [Home credit CRMS 2024\n", + " EDA](https://www.kaggle.com/code/sergiosaharovskiy/home-credit-crms-2024-eda-and-submission)\n", + "\n", + "### Data loading and processing\n", + "We will perform the following data processing steps using Ibis and 
IbisML:\n", + "\n", + "* **Convert data types**: Ensure consistency by converting data types, as the same column\n", + " in different sub-files may have different types.\n", + "* **Aggregate features**: For tables with depth greater than 0, aggregate features based\n", + " on `case_id`, including statistics calculation. You can collect statistics such as mean,\n", + " median, mode, minimum, standard deviation, and others.\n", + "* **Union and join datasets**: Combine multiple sub-files of the same dataset into one\n", + " table, as some datasets are split into multiple sub-files with a common prefix. Afterward,\n", + " join these tables with the base table.\n", + "\n", + "#### Convert data types\n", + "We'll use IbisML to create a chain of `Cast` steps, forming a recipe for data type\n", + "conversion across the dataset. This conversion is based on the provided information\n", + "extracted from column names. Columns that have similar transformations are indicated by a\n", + "capital letter at the end of their names:\n", + "\n", + "* P - Transform DPD (Days past due)\n", + "* M - Masking categories\n", + "* A - Transform amount\n", + "* D - Transform date\n", + "* T - Unspecified Transform\n", + "* L - Unspecified Transform\n", + "\n", + "For example, we'll define an IbisML transformation step to convert columns ending with `P`\n", + "to floating-point numbers:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# convert columns ends with P to floating number\n", + "step_cast_P_to_float = ml.Cast(ml.endswith(\"P\"), dt.float64)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, let's define additional type conversion transformations based on the suffix of column names:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# convert columns ends with A to floating number\n", + "step_cast_A_to_float = 
ml.Cast(ml.endswith(\"A\"), dt.float64)\n", + "# convert columns ends with D to date\n", + "step_cast_D_to_date = ml.Cast(ml.endswith(\"D\"), dt.date)\n", + "# convert columns ends with M to str\n", + "step_cast_M_to_str = ml.Cast(ml.endswith(\"M\"), dt.str)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We'll construct the\n", + "[IbisML Recipe](https://ibis-project.github.io/ibis-ml/reference/core.html#ibis_ml.Recipe)\n", + "which chains together all the transformation steps." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Data format conversion recipe:\n", + "Recipe(Cast(endswith('P'), 'float64'),\n", + " Cast(endswith('D'), 'date'),\n", + " Cast(endswith('M'), 'string'),\n", + " Cast(endswith('A'), 'float64'),\n", + " Cast(cols(('date_decision',)), 'date'),\n", + " Cast(cols(('case_id', 'WEEK_NUM', 'num_group1', 'num_group2')), 'int64'),\n", + " Cast(cols(('cardtype_51L', 'credacc_status_367L', 'requesttype_4525192L', 'riskassesment_302T', 'max_periodicityofpmts_997L')),\n", + " 'string'),\n", + " Cast(cols(('isbidproductrequest_292L', 'isdebitcard_527L', 'equalityempfrom_62L')),\n", + " 'int64'))\n" + ] + } + ], + "source": [ + "data_type_recipes = ml.Recipe(\n", + " step_cast_P_to_float,\n", + " step_cast_D_to_date,\n", + " step_cast_M_to_str,\n", + " step_cast_A_to_float,\n", + " # cast some special columns\n", + " ml.Cast([\"date_decision\"], \"date\"),\n", + " ml.Cast([\"case_id\", \"WEEK_NUM\", \"num_group1\", \"num_group2\"], dt.int64),\n", + " ml.Cast(\n", + " [\n", + " \"cardtype_51L\",\n", + " \"credacc_status_367L\",\n", + " \"requesttype_4525192L\",\n", + " \"riskassesment_302T\",\n", + " \"max_periodicityofpmts_997L\",\n", + " ],\n", + " dt.str,\n", + " ),\n", + " ml.Cast(\n", + " [\"isbidproductrequest_292L\", \"isdebitcard_527L\", \"equalityempfrom_62L\"],\n", + " dt.int64,\n", + " ),\n", + ")\n", + 
"print(f\"Data format conversion recipe:\\n{data_type_recipes}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "IbisML offers a powerful set of column selectors, allowing you to select columns based\n", + "on names, types, and patterns. For more information, you can refer to the IbisML column\n", + "selectors [documentation](https://ibis-project.github.io/ibis-ml/reference/selectors.html).\n", + "\n", + "\n", + "#### Aggregate features\n", + "For tables with a depth greater than 0 that can't be directly joined with the base table,\n", + "we need to aggregate the features by the `case_id`. You can compute different statistics for numeric columns and\n", + "non-numeric columns.\n", + "\n", + "Here, we use the `maximum` as an example." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "def agg_by_id(table):\n", + " return table.group_by(\"case_id\").agg(\n", + " [\n", + " table[col_name].max().name(f\"max_{col_name}\")\n", + " for col_name in table.columns\n", + " if col_name[-1] in (\"T\", \"L\", \"P\", \"A\", \"D\", \"M\")\n", + " ]\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "For better predictive power, you need to collect different statistics based on the meaning of features. For simplicity,\n", + "we'll only collect the maximum value of the features here.\n", + "\n", + "\n", + "#### Put them together\n", + "We'll put them together in a function that reads Parquet files, optionally handles regex patterns for\n", + "multiple sub-files, applies data type transformations defined by `data_type_recipes`, and\n", + "performs aggregation based on `case_id` if specified by the depth parameter." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def read_and_process_files(file_path, depth=None, is_regex=False):\n", + " \"\"\"\n", + " Read and process Parquet files.\n", + "\n", + " Args:\n", + " file_path (str): Path to the file or regex pattern to match files.\n", + " depth (int, optional): Depth of processing. If 1 or 2, additional\n", + " aggregation is performed.\n", + " is_regex (bool, optional): Whether the file_path is a regex pattern.\n", + "\n", + " Returns:\n", + " ibis.Table: The processed Ibis table.\n", + " \"\"\"\n", + " if is_regex:\n", + " # read and union multiple files\n", + " chunks = []\n", + " for path in Path(file_path).glob(\"*\"):\n", + " chunk = ibis.read_parquet(path)\n", + " # transform table using IbisML Recipe\n", + " chunk = data_type_recipes.fit(chunk).to_ibis(chunk)\n", + " chunks.append(chunk)\n", + " table = ibis.union(*chunks)\n", + " else:\n", + " # read a single file\n", + " table = ibis.read_parquet(file_path)\n", + " # transform table using IbisML\n", + " table = data_type_recipes.fit(table).to_ibis(table)\n", + "\n", + " # perform aggregation if depth is 1 or 2\n", + " if depth in [1, 2]:\n", + " table = agg_by_id(table)\n", + "\n", + " return table" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's define two dictionaries, `train_data_store` and `test_data_store`, that organize and\n", + "store processed datasets for training and testing datasets." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_data_store = {\n", + " \"df_base\": read_and_process_files(TRAIN_DIR / \"train_base.parquet\"),\n", + " \"depth_0\": [\n", + " read_and_process_files(TRAIN_DIR / \"train_static_cb_0.parquet\"),\n", + " read_and_process_files(TRAIN_DIR / \"train_static_0_*.parquet\", is_regex=True),\n", + " ],\n", + " \"depth_1\": [\n", + " read_and_process_files(\n", + " TRAIN_DIR / \"train_applprev_1_*.parquet\", 1, is_regex=True\n", + " ),\n", + " read_and_process_files(TRAIN_DIR / \"train_tax_registry_a_1.parquet\", 1),\n", + " read_and_process_files(TRAIN_DIR / \"train_tax_registry_b_1.parquet\", 1),\n", + " read_and_process_files(TRAIN_DIR / \"train_tax_registry_c_1.parquet\", 1),\n", + " read_and_process_files(TRAIN_DIR / \"train_credit_bureau_b_1.parquet\", 1),\n", + " read_and_process_files(TRAIN_DIR / \"train_other_1.parquet\", 1),\n", + " read_and_process_files(TRAIN_DIR / \"train_person_1.parquet\", 1),\n", + " read_and_process_files(TRAIN_DIR / \"train_deposit_1.parquet\", 1),\n", + " read_and_process_files(TRAIN_DIR / \"train_debitcard_1.parquet\", 1),\n", + " ],\n", + " \"depth_2\": [\n", + " read_and_process_files(TRAIN_DIR / \"train_credit_bureau_b_2.parquet\", 2)\n", + " ],\n", + "}\n", + "# we won't be submitting the predictions, so let's comment out the test data.\n", + "# test_data_store = {\n", + "# \"df_base\": read_and_process_files(TEST_DIR / \"test_base.parquet\"),\n", + "# \"depth_0\": [\n", + "# read_and_process_files(TEST_DIR / \"test_static_cb_0.parquet\"),\n", + "# read_and_process_files(TEST_DIR / \"test_static_0_*.parquet\", is_regex=True),\n", + "# ],\n", + "# \"depth_1\": [\n", + "# read_and_process_files(TEST_DIR / \"test_applprev_1_*.parquet\", 1, is_regex=True),\n", + "# read_and_process_files(TEST_DIR / \"test_tax_registry_a_1.parquet\", 1),\n", + "# read_and_process_files(TEST_DIR / \"test_tax_registry_b_1.parquet\", 1),\n", 
+ "# read_and_process_files(TEST_DIR / \"test_tax_registry_c_1.parquet\", 1),\n", + "# read_and_process_files(TEST_DIR / \"test_credit_bureau_b_1.parquet\", 1),\n", + "# read_and_process_files(TEST_DIR / \"test_other_1.parquet\", 1),\n", + "# read_and_process_files(TEST_DIR / \"test_person_1.parquet\", 1),\n", + "# read_and_process_files(TEST_DIR / \"test_deposit_1.parquet\", 1),\n", + "# read_and_process_files(TEST_DIR / \"test_debitcard_1.parquet\", 1),\n", + "# ],\n", + "# \"depth_2\": [\n", + "# read_and_process_files(TEST_DIR / \"test_credit_bureau_b_2.parquet\", 2),\n", + "# ]\n", + "# }" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Join all features data to base table:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "def join_data(df_base, depth_0, depth_1, depth_2):\n", + " for i, df in enumerate(depth_0 + depth_1 + depth_2):\n", + " df_base = df_base.join(\n", + " df, \"case_id\", how=\"left\", rname=\"{name}_right\" + f\"_{i}\"\n", + " )\n", + " return df_base" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Generate train and test datasets:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "There is 1526659 rows and 377 columns\n" + ] + } + ], + "source": [ + "df_train = join_data(**train_data_store)\n", + "# df_test = join_data(**test_data_store)\n", + "total_rows = df_train.count().execute()\n", + "print(f\"There is {total_rows} rows and {len(df_train.columns)} columns\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Select features\n", + "Given the large number of features (~370), we'll focus on selecting just a few of the most\n", + "informative ones by name for demonstration purposes in this post:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": 
[ + { + "data": { + "text/html": [ + "
┏━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┓\n", + "┃ case_id ┃ date_decision ┃ target ┃ days30_165L ┃ days360_512L ┃ days90_310L ┃ pmtscount_423L ┃ pmtssum_45A ┃ dateofbirth_337D ┃ education_1103M ┃ firstquarter_103L ┃ secondquarter_766L ┃ thirdquarter_1082L ┃ fourthquarter_440L ┃ maritalst_893M ┃ numberofqueries_373L ┃ requesttype_4525192L ┃ responsedate_4527233D ┃ actualdpdtolerance_344P ┃ amtinstpaidbefduel24m_4187115A ┃ annuity_780A ┃ annuitynextmonth_57A ┃ applicationcnt_361L ┃ 
applications30d_658L ┃ applicationscnt_1086L ┃ avgdbddpdlast24m_3658932P ┃ avgdbddpdlast3m_4187120P ┃ max_contractmaturitydate_151D ┃ max_credlmt_1052A ┃ max_credquantity_1099L ┃ max_dpdmaxdatemonth_804T ┃ max_dpdmaxdateyear_742T ┃ max_maxdebtpduevalodued_3940955A ┃ max_overdueamountmax_950A ┃ max_purposeofcred_722M ┃ max_residualamount_3940956A ┃ max_totalamount_503A ┃ max_cancelreason_3545846M ┃ max_childnum_21L ┃ max_currdebt_94A ┃ max_employedfrom_700D ┃ max_mainoccupationinc_437A ┃ max_profession_152M ┃ max_rejectreason_755M ┃ max_status_219L ┃ max_amount_1115A ┃ max_debtpastduevalue_732A ┃ max_debtvalue_227A ┃ max_installmentamount_833A ┃ max_instlamount_892A ┃ max_numberofinstls_810L ┃ max_pmtnumpending_403L ┃ max_last180dayaveragebalance_704A ┃ max_last30dayturnover_651A ┃ max_openingdate_857D ┃ max_amount_416A ┃ max_amtdebitincoming_4809443A ┃ max_amtdebitoutgoing_4809440A ┃ max_amtdepositbalance_4809441A ┃ max_amtdepositincoming_4809444A ┃ max_amtdepositoutgoing_4809442A ┃ max_empl_industry_691L ┃ max_gender_992L ┃ max_housingtype_772L ┃ max_mainoccupationinc_384A ┃ max_incometype_1044T ┃\n", + 
"┡━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━┩\n", + "│ int64 │ date │ int64 │ float64 │ float64 │ float64 │ float64 │ float64 │ date │ string │ float64 │ float64 │ float64 │ float64 │ string │ float64 │ string │ date │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ date │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ string │ float64 │ float64 │ string │ float64 │ float64 │ date │ float64 │ string │ string │ string │ float64 │ 
float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ date │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ string │ string │ string │ float64 │ string │\n", + "├─────────┼───────────────┼────────┼─────────────┼──────────────┼─────────────┼────────────────┼─────────────┼──────────────────┼─────────────────┼───────────────────┼────────────────────┼────────────────────┼────────────────────┼────────────────┼──────────────────────┼──────────────────────┼───────────────────────┼─────────────────────────┼────────────────────────────────┼──────────────┼──────────────────────┼─────────────────────┼──────────────────────┼───────────────────────┼───────────────────────────┼──────────────────────────┼───────────────────────────────┼───────────────────┼────────────────────────┼──────────────────────────┼─────────────────────────┼──────────────────────────────────┼───────────────────────────┼────────────────────────┼─────────────────────────────┼──────────────────────┼───────────────────────────┼──────────────────┼──────────────────┼───────────────────────┼────────────────────────────┼─────────────────────┼───────────────────────┼─────────────────┼──────────────────┼───────────────────────────┼────────────────────┼────────────────────────────┼──────────────────────┼─────────────────────────┼────────────────────────┼───────────────────────────────────┼────────────────────────────┼──────────────────────┼─────────────────┼───────────────────────────────┼───────────────────────────────┼────────────────────────────────┼─────────────────────────────────┼─────────────────────────────────┼────────────────────────┼─────────────────┼──────────────────────┼────────────────────────────┼─────────────────────────┤\n", + "│ 1915907 │ 2020-09-02 │ 0 │ 0.0 │ 4.0 │ 0.0 │ NULL │ NULL │ 1965-03-01 │ a55475b1 │ 5.0 │ 2.0 │ 1.0 │ 3.0 │ a55475b1 │ 4.0 │ NULL │ NULL │ 0.0 │ 39089.600 │ 3740.6 │ 4886.2000 │ 0.0 │ 0.0 │ 0.0 │ -3.0 │ -6.0 │ NULL │ NULL │ NULL │ NULL 
│ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ a55475b1 │ 0.0 │ 55290.250 │ 2006-09-15 │ 120000.0 │ a55475b1 │ a55475b1 │ D │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ 22000.0 │ 22000.0 │ 0.0 │ 0.0 │ 0.0 │ NULL │ NULL │ NULL │ 60000.0 │ EMPLOYED │\n", + "│ 1916572 │ 2020-09-03 │ 0 │ 1.0 │ 6.0 │ 2.0 │ NULL │ NULL │ 1985-01-01 │ a55475b1 │ 2.0 │ 2.0 │ 1.0 │ 2.0 │ a55475b1 │ 6.0 │ NULL │ NULL │ 0.0 │ 110432.000 │ 2400.0 │ 7555.8003 │ 0.0 │ 0.0 │ 0.0 │ -5.0 │ -10.0 │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ a55475b1 │ 0.0 │ 45862.934 │ 2007-04-15 │ 194000.0 │ a55475b1 │ a55475b1 │ T │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ 13353.4 │ 13333.4 │ 0.0 │ 0.0 │ 0.0 │ NULL │ NULL │ NULL │ 28000.0 │ PRIVATE_SECTOR_EMPLOYEE │\n", + "│ 1916744 │ 2020-09-03 │ 0 │ 0.0 │ 3.0 │ 2.0 │ NULL │ NULL │ 1974-04-01 │ 6b2ae0fa │ 5.0 │ 9.0 │ 7.0 │ 5.0 │ a55475b1 │ 3.0 │ NULL │ NULL │ 0.0 │ 86690.200 │ 4333.2 │ 4199.8003 │ 0.0 │ 0.0 │ 0.0 │ -1.0 │ 0.0 │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ a55475b1 │ 2.0 │ 41992.000 │ 2007-03-15 │ 100000.0 │ a55475b1 │ a55475b1 │ K │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ 0.0 │ 0.0 │ 81909.4 │ 0.0 │ 7152.0 │ NULL │ NULL │ NULL │ 100000.0 │ SALARIED_GOVT │\n", + "│ 1917212 │ 2020-09-03 │ 0 │ 0.0 │ 2.0 │ 0.0 │ NULL │ NULL │ 1981-10-01 │ a55475b1 │ 1.0 │ 2.0 │ 6.0 │ 2.0 │ a55475b1 │ 2.0 │ NULL │ NULL │ 0.0 │ 160111.330 │ 1864.6 │ 10964.0000 │ 0.0 │ 0.0 │ 0.0 │ -6.0 │ -10.0 │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ a55475b1 │ 3.0 │ 19254.000 │ 2000-01-15 │ 60000.0 │ a55475b1 │ a55475b1 │ K │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ 2685.8 │ 2660.0 │ 206.2 │ 0.0 │ 68.8 │ NULL │ NULL │ NULL │ 18000.0 │ EMPLOYED │\n", + "│ 1917552 │ 2020-09-03 │ 0 │ 0.0 │ 1.0 │ 0.0 │ NULL │ NULL │ 1984-12-01 │ a55475b1 │ 0.0 │ 1.0 │ 0.0 │ 2.0 │ 
a55475b1 │ 1.0 │ NULL │ NULL │ 0.0 │ 89029.805 │ 3788.0 │ 2962.6000 │ 0.0 │ 0.0 │ 0.0 │ -33.0 │ -6.0 │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ a55475b1 │ 0.0 │ 10627.937 │ 2017-10-26 │ 47000.0 │ a55475b1 │ a55475b1 │ K │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ 0.0 │ 0.0 │ 0.0 │ 0.0 │ 0.0 │ NULL │ NULL │ NULL │ 20000.0 │ SALARIED_GOVT │\n", + "└─────────┴───────────────┴────────┴─────────────┴──────────────┴─────────────┴────────────────┴─────────────┴──────────────────┴─────────────────┴───────────────────┴────────────────────┴────────────────────┴────────────────────┴────────────────┴──────────────────────┴──────────────────────┴───────────────────────┴─────────────────────────┴────────────────────────────────┴──────────────┴──────────────────────┴─────────────────────┴──────────────────────┴───────────────────────┴───────────────────────────┴──────────────────────────┴───────────────────────────────┴───────────────────┴────────────────────────┴──────────────────────────┴─────────────────────────┴──────────────────────────────────┴───────────────────────────┴────────────────────────┴─────────────────────────────┴──────────────────────┴───────────────────────────┴──────────────────┴──────────────────┴───────────────────────┴────────────────────────────┴─────────────────────┴───────────────────────┴─────────────────┴──────────────────┴───────────────────────────┴────────────────────┴────────────────────────────┴──────────────────────┴─────────────────────────┴────────────────────────┴───────────────────────────────────┴────────────────────────────┴──────────────────────┴─────────────────┴───────────────────────────────┴───────────────────────────────┴────────────────────────────────┴─────────────────────────────────┴─────────────────────────────────┴────────────────────────┴─────────────────┴──────────────────────┴────────────────────────────┴─────────────────────────┘\n", + "\n" ], - "source": [ - 
"# transform train and test dataset using IbisML recipe\n", - "X_train_transformed = last_mile_preprocessing.transform(X_train)\n", - "X_test_transformed = last_mile_preprocessing.transform(X_test)\n", - "print(f\"train data shape = {X_train_transformed.shape}\")\n", - "print(f\"test data shape = {X_test_transformed.shape}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's define a neural network classifier using PyTorch and PyTorch Lighting:" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import torch\n", - "import torch.nn as nn\n", - "import torch.optim as optim\n", - "from torch.utils.data import DataLoader, TensorDataset\n", - "import pytorch_lightning as pl\n", - "from pytorch_lightning import Trainer\n", - "\n", - "\n", - "class NeuralNetClassifier(pl.LightningModule):\n", - " def __init__(self, input_dim, hidden_dim=8, output_dim=1):\n", - " super().__init__()\n", - " self.model = nn.Sequential(\n", - " nn.Linear(input_dim, hidden_dim),\n", - " nn.ReLU(),\n", - " nn.Linear(hidden_dim, output_dim),\n", - " )\n", - " self.loss = nn.BCEWithLogitsLoss()\n", - " self.sigmoid = nn.Sigmoid()\n", - "\n", - " def forward(self, x):\n", - " return self.model(x)\n", - "\n", - " def training_step(self, batch, batch_idx):\n", - " x, y = batch\n", - " y_hat = self(x)\n", - " loss = self.loss(y_hat.view(-1), y)\n", - " self.log(\"train_loss\", loss)\n", - " return loss\n", - "\n", - " def validation_step(self, batch, batch_idx):\n", - " x, y = batch\n", - " y_hat = self(x)\n", - " loss = self.loss(y_hat.view(-1), y)\n", - " self.log(\"val_loss\", loss)\n", - " return loss\n", - "\n", - " def configure_optimizers(self):\n", - " return optim.Adam(self.parameters(), lr=0.001)\n", - "\n", - " def predict_proba(self, x):\n", - " self.eval()\n", - " with torch.no_grad():\n", - " x = x.to(self.device)\n", - " return self.sigmoid(self(x))\n", - "\n", - "# 
initialize your Lightning Module\n", - "nn_classifier = NeuralNetClassifier(input_dim=X_train_transformed.shape[1])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, we'll create the PyTorch DataLoader using the output from IbisML:" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [], - "source": [ - "y_train_array = y_train.to_pandas().to_numpy().astype(np.float32)\n", - "x_train_tensor = torch.from_numpy(X_train_transformed)\n", - "y_train_tensor = torch.from_numpy(y_train_array)\n", - "train_dataset = TensorDataset(x_train_tensor, y_train_tensor)\n", - "\n", - "y_test_array = y_test.to_pandas().to_numpy().astype(np.float32)\n", - "X_test_tensor = torch.from_numpy(X_test_transformed)\n", - "y_test_tensor = torch.from_numpy(y_test_array)\n", - "val_dataset = TensorDataset(X_test_tensor, y_test_tensor)\n", - "\n", - "train_loader = DataLoader(train_dataset, batch_size=32, num_workers=13, shuffle=False)\n", - "val_loader = DataLoader(val_dataset, batch_size=32, num_workers=13, shuffle=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Initialize the PyTorch Lightning Trainer:" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "GPU available: True (mps), used: True\n", - "TPU available: False, using: 0 TPU cores\n", - "HPU available: False, using: 0 HPUs\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "NeuralNetClassifier(\n", - " (model): Sequential(\n", - " (0): Linear(in_features=980, out_features=8, bias=True)\n", - " (1): ReLU()\n", - " (2): Linear(in_features=8, out_features=1, bias=True)\n", - " )\n", - " (loss): BCEWithLogitsLoss()\n", - " (sigmoid): Sigmoid()\n", - ")\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - 
"/Users/jiting/anaconda3/envs/ibis-ml-dev/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default\n" - ] - } + "text/plain": [ + "┏━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━
━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┓\n", + "┃\u001b[1m \u001b[0m\u001b[1mcase_id\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdate_decision\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mtarget\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdays30_165L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdays360_512L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdays90_310L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpmtscount_423L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpmtssum_45A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mdateofbirth_337D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1meducation_1103M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mfirstquarter_103L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1msecondquarter_766L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mthirdquarter_1082L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mfourthquarter_440L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmaritalst_893M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mnumberofqueries_373L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mrequesttype_4525192L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mresponsedate_4527233D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mactualdpdtolerance_344P\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mamtinstpaidbefduel24m_4187115A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mannuity_780A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mannuitynextmonth_57A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mapplicationcnt_361L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mapplications30d_658L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mapplicationscnt_1086L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m 
\u001b[0m\u001b[1mavgdbddpdlast24m_3658932P\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mavgdbddpdlast3m_4187120P\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_contractmaturitydate_151D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_credlmt_1052A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_credquantity_1099L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_dpdmaxdatemonth_804T\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_dpdmaxdateyear_742T\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_maxdebtpduevalodued_3940955A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_overdueamountmax_950A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_purposeofcred_722M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_residualamount_3940956A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_totalamount_503A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_cancelreason_3545846M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_childnum_21L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_currdebt_94A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_employedfrom_700D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_mainoccupationinc_437A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_profession_152M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_rejectreason_755M\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_status_219L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_amount_1115A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_debtpastduevalue_732A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_debtvalue_227A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_installmentamount_833A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_instlamount_892A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m 
\u001b[0m\u001b[1mmax_numberofinstls_810L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_pmtnumpending_403L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_last180dayaveragebalance_704A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_last30dayturnover_651A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_openingdate_857D\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_amount_416A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_amtdebitincoming_4809443A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_amtdebitoutgoing_4809440A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_amtdepositbalance_4809441A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_amtdepositincoming_4809444A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_amtdepositoutgoing_4809442A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_empl_industry_691L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_gender_992L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_housingtype_772L\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_mainoccupationinc_384A\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax_incometype_1044T\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\n", + 
"┡━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━┩\n", + "│ \u001b[2mint64\u001b[0m │ \u001b[2mdate\u001b[0m │ \u001b[2mint64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mdate\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m 
│ \u001b[2mstring\u001b[0m │ \u001b[2mdate\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mdate\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mdate\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mdate\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mstring\u001b[0m │\n", + 
"├─────────┼───────────────┼────────┼─────────────┼──────────────┼─────────────┼────────────────┼─────────────┼──────────────────┼─────────────────┼───────────────────┼────────────────────┼────────────────────┼────────────────────┼────────────────┼──────────────────────┼──────────────────────┼───────────────────────┼─────────────────────────┼────────────────────────────────┼──────────────┼──────────────────────┼─────────────────────┼──────────────────────┼───────────────────────┼───────────────────────────┼──────────────────────────┼───────────────────────────────┼───────────────────┼────────────────────────┼──────────────────────────┼─────────────────────────┼──────────────────────────────────┼───────────────────────────┼────────────────────────┼─────────────────────────────┼──────────────────────┼───────────────────────────┼──────────────────┼──────────────────┼───────────────────────┼────────────────────────────┼─────────────────────┼───────────────────────┼─────────────────┼──────────────────┼───────────────────────────┼────────────────────┼────────────────────────────┼──────────────────────┼─────────────────────────┼────────────────────────┼───────────────────────────────────┼────────────────────────────┼──────────────────────┼─────────────────┼───────────────────────────────┼───────────────────────────────┼────────────────────────────────┼─────────────────────────────────┼─────────────────────────────────┼────────────────────────┼─────────────────┼──────────────────────┼────────────────────────────┼─────────────────────────┤\n", + "│ \u001b[1;36m1915907\u001b[0m │ \u001b[35m2020-09-02\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m4.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[35m1965-03-01\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m5.0\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[1;36m3.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ 
\u001b[1;36m4.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m39089.600\u001b[0m │ \u001b[1;36m3740.6\u001b[0m │ \u001b[1;36m4886.2000\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m-3.0\u001b[0m │ \u001b[1;36m-6.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m55290.250\u001b[0m │ \u001b[35m2006-09-15\u001b[0m │ \u001b[1;36m120000.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32mD \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m22000.0\u001b[0m │ \u001b[1;36m22000.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m60000.0\u001b[0m │ \u001b[32mEMPLOYED \u001b[0m │\n", + "│ \u001b[1;36m1916572\u001b[0m │ \u001b[35m2020-09-03\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[1;36m6.0\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[35m1985-01-01\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m6.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m110432.000\u001b[0m │ \u001b[1;36m2400.0\u001b[0m │ 
\u001b[1;36m7555.8003\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m-5.0\u001b[0m │ \u001b[1;36m-10.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m45862.934\u001b[0m │ \u001b[35m2007-04-15\u001b[0m │ \u001b[1;36m194000.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32mT \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m13353.4\u001b[0m │ \u001b[1;36m13333.4\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m28000.0\u001b[0m │ \u001b[32mPRIVATE_SECTOR_EMPLOYEE\u001b[0m │\n", + "│ \u001b[1;36m1916744\u001b[0m │ \u001b[35m2020-09-03\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m3.0\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[35m1974-04-01\u001b[0m │ \u001b[32m6b2ae0fa \u001b[0m │ \u001b[1;36m5.0\u001b[0m │ \u001b[1;36m9.0\u001b[0m │ \u001b[1;36m7.0\u001b[0m │ \u001b[1;36m5.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m3.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m86690.200\u001b[0m │ \u001b[1;36m4333.2\u001b[0m │ \u001b[1;36m4199.8003\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m-1.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ 
\u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[1;36m41992.000\u001b[0m │ \u001b[35m2007-03-15\u001b[0m │ \u001b[1;36m100000.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32mK \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m81909.4\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m7152.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m100000.0\u001b[0m │ \u001b[32mSALARIED_GOVT \u001b[0m │\n", + "│ \u001b[1;36m1917212\u001b[0m │ \u001b[35m2020-09-03\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[35m1981-10-01\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[1;36m6.0\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m160111.330\u001b[0m │ \u001b[1;36m1864.6\u001b[0m │ \u001b[1;36m10964.0000\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m-6.0\u001b[0m │ \u001b[1;36m-10.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ 
\u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m3.0\u001b[0m │ \u001b[1;36m19254.000\u001b[0m │ \u001b[35m2000-01-15\u001b[0m │ \u001b[1;36m60000.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32mK \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m2685.8\u001b[0m │ \u001b[1;36m2660.0\u001b[0m │ \u001b[1;36m206.2\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m68.8\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m18000.0\u001b[0m │ \u001b[32mEMPLOYED \u001b[0m │\n", + "│ \u001b[1;36m1917552\u001b[0m │ \u001b[35m2020-09-03\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[35m1984-12-01\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m2.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m1.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m89029.805\u001b[0m │ \u001b[1;36m3788.0\u001b[0m │ \u001b[1;36m2962.6000\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m-33.0\u001b[0m │ \u001b[1;36m-6.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m10627.937\u001b[0m │ 
\u001b[35m2017-10-26\u001b[0m │ \u001b[1;36m47000.0\u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32ma55475b1 \u001b[0m │ \u001b[32mK \u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m20000.0\u001b[0m │ \u001b[32mSALARIED_GOVT \u001b[0m │\n", + "└─────────┴───────────────┴────────┴─────────────┴──────────────┴─────────────┴────────────────┴─────────────┴──────────────────┴─────────────────┴───────────────────┴────────────────────┴────────────────────┴────────────────────┴────────────────┴──────────────────────┴──────────────────────┴───────────────────────┴─────────────────────────┴────────────────────────────────┴──────────────┴──────────────────────┴─────────────────────┴──────────────────────┴───────────────────────┴───────────────────────────┴──────────────────────────┴───────────────────────────────┴───────────────────┴────────────────────────┴──────────────────────────┴─────────────────────────┴──────────────────────────────────┴───────────────────────────┴────────────────────────┴─────────────────────────────┴──────────────────────┴───────────────────────────┴──────────────────┴──────────────────┴───────────────────────┴────────────────────────────┴─────────────────────┴───────────────────────┴─────────────────┴──────────────────┴───────────────────────────┴────────────────────┴────────────────────────────┴──────────────────────┴─────────────────────────┴────────────────────────┴───────────────────────────────────┴────────────────────────────┴──────────────────────┴─────────────────┴───────────────────────
────────┴───────────────────────────────┴────────────────────────────────┴─────────────────────────────────┴─────────────────────────────────┴────────────────────────┴─────────────────┴──────────────────────┴────────────────────────────┴─────────────────────────┘" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_train = df_train.select(\n", + " \"case_id\",\n", + " \"date_decision\",\n", + " \"target\",\n", + " # number of credit bureau queries for the last X days.\n", + " \"days30_165L\",\n", + " \"days360_512L\",\n", + " \"days90_310L\",\n", + " # number of tax deduction payments\n", + " \"pmtscount_423L\",\n", + " # sum of tax deductions for the client\n", + " \"pmtssum_45A\",\n", + " \"dateofbirth_337D\",\n", + " \"education_1103M\",\n", + " \"firstquarter_103L\",\n", + " \"secondquarter_766L\",\n", + " \"thirdquarter_1082L\",\n", + " \"fourthquarter_440L\",\n", + " \"maritalst_893M\",\n", + " \"numberofqueries_373L\",\n", + " \"requesttype_4525192L\",\n", + " \"responsedate_4527233D\",\n", + " \"actualdpdtolerance_344P\",\n", + " \"amtinstpaidbefduel24m_4187115A\",\n", + " \"annuity_780A\",\n", + " \"annuitynextmonth_57A\",\n", + " \"applicationcnt_361L\",\n", + " \"applications30d_658L\",\n", + " \"applicationscnt_1086L\",\n", + " # average days past or before due of payment during the last 24 months.\n", + " \"avgdbddpdlast24m_3658932P\",\n", + " # average days past or before due of payment during the last 3 months.\n", + " \"avgdbddpdlast3m_4187120P\",\n", + " # end date of active contract.\n", + " \"max_contractmaturitydate_151D\",\n", + " # credit limit of an active loan.\n", + " \"max_credlmt_1052A\",\n", + " # number of credits in credit bureau\n", + " \"max_credquantity_1099L\",\n", + " \"max_dpdmaxdatemonth_804T\",\n", + " \"max_dpdmaxdateyear_742T\",\n", + " \"max_maxdebtpduevalodued_3940955A\",\n", + " \"max_overdueamountmax_950A\",\n", + " \"max_purposeofcred_722M\",\n", + " 
\"max_residualamount_3940956A\",\n", + " \"max_totalamount_503A\",\n", + " \"max_cancelreason_3545846M\",\n", + " \"max_childnum_21L\",\n", + " \"max_currdebt_94A\",\n", + " \"max_employedfrom_700D\",\n", + " # client's main income amount in their previous application\n", + " \"max_mainoccupationinc_437A\",\n", + " \"max_profession_152M\",\n", + " \"max_rejectreason_755M\",\n", + " \"max_status_219L\",\n", + " # credit amount of the active contract provided by the credit bureau\n", + " \"max_amount_1115A\",\n", + " # amount of unpaid debt for existing contracts\n", + " \"max_debtpastduevalue_732A\",\n", + " \"max_debtvalue_227A\",\n", + " \"max_installmentamount_833A\",\n", + " \"max_instlamount_892A\",\n", + " \"max_numberofinstls_810L\",\n", + " \"max_pmtnumpending_403L\",\n", + " \"max_last180dayaveragebalance_704A\",\n", + " \"max_last30dayturnover_651A\",\n", + " \"max_openingdate_857D\",\n", + " \"max_amount_416A\",\n", + " \"max_amtdebitincoming_4809443A\",\n", + " \"max_amtdebitoutgoing_4809440A\",\n", + " \"max_amtdepositbalance_4809441A\",\n", + " \"max_amtdepositincoming_4809444A\",\n", + " \"max_amtdepositoutgoing_4809442A\",\n", + " \"max_empl_industry_691L\",\n", + " \"max_gender_992L\",\n", + " \"max_housingtype_772L\",\n", + " \"max_mainoccupationinc_384A\",\n", + " \"max_incometype_1044T\",\n", + ")\n", + "\n", + "df_train.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Univariate analysis:" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
┏━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┓\n", + "┃ name ┃ pos ┃ type ┃ count ┃ nulls ┃ unique ┃ mode ┃ mean ┃ std ┃ min ┃ p25 ┃ p50 ┃ p75 ┃ max ┃\n", + "┡━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━┩\n", + "│ string │ int16 │ string │ int64 │ int64 │ int64 │ string │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │\n", + "├─────────────────┼───────┼─────────┼─────────┼────────┼─────────┼──────────┼──────────────┼───────────────┼─────────┼─────────────┼──────────────┼──────────────┼──────────────┤\n", + "│ case_id │ 0 │ int64 │ 1526659 │ 0 │ 1526659 │ NULL │ 1.286077e+06 │ 718946.592285 │ 0.0 │ 766197.5000 │ 1.357358e+06 │ 1.739022e+06 │ 2.703454e+06 │\n", + "│ target │ 2 │ int64 │ 1526659 │ 0 │ 2 │ NULL │ 3.143728e-02 │ 0.174496 │ 0.0 │ 0.0000 │ 0.000000e+00 │ 0.000000e+00 │ 1.000000e+00 │\n", + "│ days30_165L │ 3 │ float64 │ 1526659 │ 140968 │ 22 │ NULL │ 5.177078e-01 │ 0.899238 │ 0.0 │ 0.0000 │ 0.000000e+00 │ 1.000000e+00 │ 2.200000e+01 │\n", + "│ days360_512L │ 4 │ float64 │ 1526659 │ 140968 │ 92 │ NULL │ 4.777066e+00 │ 5.168856 │ 0.0 │ 1.0000 │ 3.000000e+00 │ 6.500000e+00 │ 1.150000e+02 │\n", + "│ days90_310L │ 5 │ float64 │ 1526659 │ 140968 │ 37 │ NULL │ 1.211420e+00 │ 1.655931 │ 0.0 │ 0.0000 │ 1.000000e+00 │ 2.000000e+00 │ 4.100000e+01 │\n", + "│ pmtscount_423L │ 6 │ float64 │ 1526659 │ 954021 │ 66 │ NULL │ 5.839291e+00 │ 4.148264 │ 0.0 │ 3.0000 │ 6.000000e+00 │ 7.000000e+00 │ 1.210000e+02 │\n", + "│ pmtssum_45A │ 7 │ float64 │ 1526659 │ 954021 │ 265229 │ NULL │ 1.319994e+04 │ 18117.218312 │ 0.0 │ 3156.4001 │ 8.391900e+03 │ 1.699200e+04 │ 4.768434e+05 │\n", + "│ education_1103M │ 9 │ string │ 1526659 │ 26183 │ 5 │ a55475b1 │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │ NULL │\n", 
+ "└─────────────────┴───────┴─────────┴─────────┴────────┴─────────┴──────────┴──────────────┴───────────────┴─────────┴─────────────┴──────────────┴──────────────┴──────────────┘\n", + "\n" ], - "source": [ - "# initialize a Trainer\n", - "trainer = Trainer(max_epochs=2)\n", - "print(nn_classifier)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's train the classifier:" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - " | Name | Type | Params | Mode \n", - "------------------------------------------------------\n", - "0 | model | Sequential | 7.9 K | train\n", - "1 | loss | BCEWithLogitsLoss | 0 | train\n", - "2 | sigmoid | Sigmoid | 0 | train\n", - "------------------------------------------------------\n", - "7.9 K Trainable params\n", - "0 Non-trainable params\n", - "7.9 K Total params\n", - "0.031 Total estimated model params size (MB)\n", - "6 Modules in train mode\n", - "0 Modules in eval mode\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "2cf7098fcd4c41f286c6059b3b170828", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Sanity Checking: | | 0/? [00:00, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jiting/anaconda3/envs/ibis-ml-dev/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=13` in the `DataLoader` to improve performance.\n", - "/Users/jiting/anaconda3/envs/ibis-ml-dev/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. 
Consider increasing the value of the `num_workers` argument` to `num_workers=13` in the `DataLoader` to improve performance.\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "e77639d880a543a9a35d04c4306faf89", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Training: | | 0/? [00:00, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - } + "text/plain": [ + "┏━━━━━━━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┓\n", + "┃\u001b[1m \u001b[0m\u001b[1mname\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mpos\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mtype\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mcount\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mnulls\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1munique\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmode\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmean\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mstd\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmin\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mp25\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mp50\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mp75\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mmax\u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m┃\n", + "┡━━━━━━━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━┩\n", + "│ \u001b[2mstring\u001b[0m │ \u001b[2mint16\u001b[0m │ 
\u001b[2mstring\u001b[0m │ \u001b[2mint64\u001b[0m │ \u001b[2mint64\u001b[0m │ \u001b[2mint64\u001b[0m │ \u001b[2mstring\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │ \u001b[2mfloat64\u001b[0m │\n", + "├─────────────────┼───────┼─────────┼─────────┼────────┼─────────┼──────────┼──────────────┼───────────────┼─────────┼─────────────┼──────────────┼──────────────┼──────────────┤\n", + "│ \u001b[32mcase_id \u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[32mint64 \u001b[0m │ \u001b[1;36m1526659\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[1;36m1526659\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m1.286077e+06\u001b[0m │ \u001b[1;36m718946.592285\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m766197.5000\u001b[0m │ \u001b[1;36m1.357358e+06\u001b[0m │ \u001b[1;36m1.739022e+06\u001b[0m │ \u001b[1;36m2.703454e+06\u001b[0m │\n", + "│ \u001b[32mtarget \u001b[0m │ \u001b[1;36m2\u001b[0m │ \u001b[32mint64 \u001b[0m │ \u001b[1;36m1526659\u001b[0m │ \u001b[1;36m0\u001b[0m │ \u001b[1;36m2\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m3.143728e-02\u001b[0m │ \u001b[1;36m0.174496\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0000\u001b[0m │ \u001b[1;36m0.000000e+00\u001b[0m │ \u001b[1;36m0.000000e+00\u001b[0m │ \u001b[1;36m1.000000e+00\u001b[0m │\n", + "│ \u001b[32mdays30_165L \u001b[0m │ \u001b[1;36m3\u001b[0m │ \u001b[32mfloat64\u001b[0m │ \u001b[1;36m1526659\u001b[0m │ \u001b[1;36m140968\u001b[0m │ \u001b[1;36m22\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m5.177078e-01\u001b[0m │ \u001b[1;36m0.899238\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0000\u001b[0m │ \u001b[1;36m0.000000e+00\u001b[0m │ \u001b[1;36m1.000000e+00\u001b[0m │ \u001b[1;36m2.200000e+01\u001b[0m │\n", + "│ \u001b[32mdays360_512L \u001b[0m │ \u001b[1;36m4\u001b[0m │ \u001b[32mfloat64\u001b[0m │ \u001b[1;36m1526659\u001b[0m │ \u001b[1;36m140968\u001b[0m │ 
\u001b[1;36m92\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m4.777066e+00\u001b[0m │ \u001b[1;36m5.168856\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m1.0000\u001b[0m │ \u001b[1;36m3.000000e+00\u001b[0m │ \u001b[1;36m6.500000e+00\u001b[0m │ \u001b[1;36m1.150000e+02\u001b[0m │\n", + "│ \u001b[32mdays90_310L \u001b[0m │ \u001b[1;36m5\u001b[0m │ \u001b[32mfloat64\u001b[0m │ \u001b[1;36m1526659\u001b[0m │ \u001b[1;36m140968\u001b[0m │ \u001b[1;36m37\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m1.211420e+00\u001b[0m │ \u001b[1;36m1.655931\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m0.0000\u001b[0m │ \u001b[1;36m1.000000e+00\u001b[0m │ \u001b[1;36m2.000000e+00\u001b[0m │ \u001b[1;36m4.100000e+01\u001b[0m │\n", + "│ \u001b[32mpmtscount_423L \u001b[0m │ \u001b[1;36m6\u001b[0m │ \u001b[32mfloat64\u001b[0m │ \u001b[1;36m1526659\u001b[0m │ \u001b[1;36m954021\u001b[0m │ \u001b[1;36m66\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m5.839291e+00\u001b[0m │ \u001b[1;36m4.148264\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m3.0000\u001b[0m │ \u001b[1;36m6.000000e+00\u001b[0m │ \u001b[1;36m7.000000e+00\u001b[0m │ \u001b[1;36m1.210000e+02\u001b[0m │\n", + "│ \u001b[32mpmtssum_45A \u001b[0m │ \u001b[1;36m7\u001b[0m │ \u001b[32mfloat64\u001b[0m │ \u001b[1;36m1526659\u001b[0m │ \u001b[1;36m954021\u001b[0m │ \u001b[1;36m265229\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[1;36m1.319994e+04\u001b[0m │ \u001b[1;36m18117.218312\u001b[0m │ \u001b[1;36m0.0\u001b[0m │ \u001b[1;36m3156.4001\u001b[0m │ \u001b[1;36m8.391900e+03\u001b[0m │ \u001b[1;36m1.699200e+04\u001b[0m │ \u001b[1;36m4.768434e+05\u001b[0m │\n", + "│ \u001b[32meducation_1103M\u001b[0m │ \u001b[1;36m9\u001b[0m │ \u001b[32mstring \u001b[0m │ \u001b[1;36m1526659\u001b[0m │ \u001b[1;36m26183\u001b[0m │ \u001b[1;36m5\u001b[0m │ \u001b[32ma55475b1\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ \u001b[2mNULL\u001b[0m │ 
\u001b[2mNULL\u001b[0m │\n", + "└─────────────────┴───────┴─────────┴─────────┴────────┴─────────┴──────────┴──────────────┴───────────────┴─────────┴─────────────┴──────────────┴──────────────┴──────────────┘" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# take the first 10 columns\n", + "df_train[df_train.columns[:10]].describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Last-mile data preprocessing\n", + "We will perform the following transformation before feeding the data to models:\n", + "\n", + "* Missing value imputation\n", + "* Encoding categorical variables\n", + "* Handling date variables\n", + "* Handling outliers\n", + "* Scaling and normalization\n", + "\n", + "\n", + "IbisML provides a set of transformations. You can find the\n", + "[roadmap](https://github.com/ibis-project/ibis-ml/issues/32).\n", + "The [IbisML website](https://ibis-project.github.io/ibis-ml/) also includes tutorials and API documentation.\n", + "\n", + "### Impute features\n", + "Impute all numeric columns using the median. In real-life scenarios, it's important to\n", + "understand the meaning of each feature and apply the appropriate imputation method for\n", + "different features. For more imputations, please refer to this\n", + "[documentation](https://ibis-project.github.io/ibis-ml/reference/steps-imputation.html)." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "step_impute_median = ml.ImputeMedian(ml.numeric())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Encode categorical features\n", + "Encode all categorical features using one-hot-encode. For more encoding steps,\n", + "please refer to this\n", + "[doc](https://ibis-project.github.io/ibis-ml/reference/steps-encoding.html)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "ohe_step = ml.OneHotEncode(\n", + " [\n", + " \"maritalst_893M\",\n", + " \"requesttype_4525192L\",\n", + " \"max_profession_152M\",\n", + " \"max_gender_992L\",\n", + " \"max_empl_industry_691L\",\n", + " \"max_housingtype_772L\",\n", + " \"max_incometype_1044T\",\n", + " \"max_cancelreason_3545846M\",\n", + " \"max_rejectreason_755M\",\n", + " \"education_1103M\",\n", + " \"max_status_219L\",\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Handle date variables\n", + "Calculate the difference in days between each date column and the column `date_decision`:" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "date_cols = [col_name for col_name in df_train.columns if col_name[-1] == \"D\"]\n", + "days_to_decision_expr = {\n", + " # difference in days\n", + " f\"{col}_date_decision_diff\": (\n", + " _.date_decision.epoch_seconds() - getattr(_, col).epoch_seconds()\n", + " )\n", + " / (60 * 60 * 24)\n", + " for col in date_cols\n", + "}\n", + "days_to_decision_step = ml.Mutate(days_to_decision_expr)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Extract information from the date columns:" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "# dow and month are set to category\n", + "expand_date_step = ml.ExpandDate(ml.date(), [\"week\", \"day\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Handle outliers\n", + "Capping outliers using `z-score` method:" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "step_handle_outliers = ml.HandleUnivariateOutliers(\n", + " [\"max_amount_1115A\", \"max_overdueamountmax_950A\"],\n", + " method=\"z-score\",\n", + " 
treatment=\"capping\",\n", + " deviation_factor=3,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Construct recipe\n", + "We'll construct the last mile preprocessing [recipe](https://ibis-project.github.io/ibis-ml/reference/core.html#ibis_ml.Recipe)\n", + "by chaining all transformation steps, which will be fitted to the training dataset and later applied to test datasets." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Last-mile preprocessing recipe: \n", + "Recipe(ExpandDate(date(), components=['week', 'day']),\n", + " Drop(date()),\n", + " OneHotEncode(cols(('maritalst_893M', 'requesttype_4525192L', 'max_profession_152M', 'max_gender_992L', 'max_empl_industry_691L', 'max_housingtype_772L', 'max_incometype_1044T', 'max_cancelreason_3545846M', 'max_rejectreason_755M', 'education_1103M', 'max_status_219L'))),\n", + " Drop(string()),\n", + " HandleUnivariateOutliers(cols(('max_amount_1115A', 'max_overdueamountmax_950A')),\n", + " method='z-score',\n", + " deviation_factor=3,\n", + " treatment='capping'),\n", + " ImputeMedian(numeric()),\n", + " ScaleMinMax(numeric()),\n", + " FillNA(numeric(), 0),\n", + " Cast(numeric(), 'float32'))\n" + ] + } + ], + "source": [ + "last_mile_preprocessing = ml.Recipe(\n", + " expand_date_step,\n", + " ml.Drop(ml.date()),\n", + " # handle string columns\n", + " ohe_step,\n", + " ml.Drop(ml.string()),\n", + " # handle numeric cols\n", + " # capping outliers\n", + " step_handle_outliers,\n", + " step_impute_median,\n", + " ml.ScaleMinMax(ml.numeric()),\n", + " # fill missing value\n", + " ml.FillNA(ml.numeric(), 0),\n", + " ml.Cast(ml.numeric(), \"float32\"),\n", + ")\n", + "print(f\"Last-mile preprocessing recipe: \\n{last_mile_preprocessing}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Modeling\n", + "After completing data preprocessing with Ibis 
and IbisML, we proceed to the modeling\n", + "phase. Here are two approaches:\n", + "\n", + "* Use IbisML as an independent data preprocessing component and hand off the data to downstream modeling\n", + "frameworks with various output formats:\n", + " - pandas Dataframe\n", + " - NumPy Array\n", + " - Polars Dataframe\n", + " - Dask Dataframe\n", + " - xgboost.DMatrix\n", + " - Pyarrow Table\n", + "* Use IbisML recipes as components within an sklearn Pipeline and\n", + "train models just as you would with any sklearn pipeline.\n", + "\n", + "We will build an XGBoost model within a scikit-learn pipeline, and a neural network classifier using the\n", + "output transformed by IbisML recipes.\n", + "\n", + "### Train and test data splitting\n", + "We'll use hashing on the unique key to consistently split rows to different groups.\n", + "Hashing is robust to underlying changes in the data, such as adding, deleting, or\n", + "reordering rows. This deterministic process ensures that each data point is always\n", + "assigned to the same split, thereby enhancing reproducibility." + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/jiting/anaconda3/envs/ibis-ml-dev/lib/python3.12/site-packages/ibis/expr/types/relations.py:685: FutureWarning: Selecting/filtering arbitrary expressions in `Table.__getitem__` is deprecated and will be removed in version 10.0. 
Please use `Table.select` or `Table.filter` instead.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "train dataset size = 1145346 \n", + "test data size = 381313\n" + ] + } + ], + "source": [ + "train_data, test_data = ml.train_test_split(\n", + " df_train, unique_key=[\"case_id\"], test_size=0.25, random_seed=222\n", + ")\n", + "\n", + "X_train = train_data.drop(\"target\")\n", + "y_train = train_data.target.cast(dt.float32).name(\"target\")\n", + "\n", + "X_test = test_data.drop(\"target\")\n", + "y_test = test_data.target.cast(dt.float32).name(\"target\")\n", + "\n", + "train_cnt = X_train.count().execute()\n", + "test_cnt = X_test.count().execute()\n", + "print(f\"train dataset size = {train_cnt} \\ntest data size = {test_cnt}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "Hashing provides a consistent but pseudo-random distribution of data, which\n", + "may not precisely align with the specified train/test ratio. While hash codes\n", + "ensure reproducibility, they don't guarantee an exact split. Due to statistical variance,\n", + "you might find a slight imbalance in the distribution, resulting in marginally more or\n", + "fewer samples in either the training or test dataset than the target percentage. This\n", + "minor deviation from the intended ratio is a normal consequence of hash-based\n", + "partitioning.\n", + "\n", + "\n", + "### XGBoost\n", + "In this section, we integrate XGBoost into a scikit-learn pipeline to create a\n", + "streamlined workflow for training and evaluating our model.\n", + "\n", + "We'll set up a pipeline that includes two components:\n", + "\n", + "* **Preprocessing**: This step applies the `last_mile_preprocessing` for final data preprocessing.\n", + "* **Modeling**: This step applies the `xgb.XGBClassifier()` to train the XGBoost model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "last_mile_recipes\n", + "** y type =
Pipeline(steps=[('last_mile_recipes',\n", + " Recipe(ExpandDate(date(), components=['week', 'day']),\n", + " Drop(date()),\n", + " OneHotEncode(cols(('maritalst_893M', 'requesttype_4525192L', 'max_profession_152M', 'max_gender_992L', 'max_empl_industry_691L', 'max_housingtype_772L', 'max_incometype_1044T', 'max_cancelreason_3545846M', 'max_rejectreason_755M', 'education_1103M', 'max_sta...\n", + " feature_types=None, gamma=None, grow_policy=None,\n", + " importance_type=None,\n", + " interaction_constraints=None, learning_rate=0.05,\n", + " max_bin=None, max_cat_threshold=None,\n", + " max_cat_to_onehot=None, max_delta_step=None,\n", + " max_depth=5, max_leaves=None,\n", + " min_child_weight=None, missing=nan,\n", + " monotone_constraints=None, multi_strategy=None,\n", + " n_estimators=100, n_jobs=None,\n", + " num_parallel_tree=None, random_state=42, ...))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
Pipeline(steps=[('last_mile_recipes',\n", + " Recipe(ExpandDate(date(), components=['week', 'day']),\n", + " Drop(date()),\n", + " OneHotEncode(cols(('maritalst_893M', 'requesttype_4525192L', 'max_profession_152M', 'max_gender_992L', 'max_empl_industry_691L', 'max_housingtype_772L', 'max_incometype_1044T', 'max_cancelreason_3545846M', 'max_rejectreason_755M', 'education_1103M', 'max_sta...\n", + " feature_types=None, gamma=None, grow_policy=None,\n", + " importance_type=None,\n", + " interaction_constraints=None, learning_rate=0.05,\n", + " max_bin=None, max_cat_threshold=None,\n", + " max_cat_to_onehot=None, max_delta_step=None,\n", + " max_depth=5, max_leaves=None,\n", + " min_child_weight=None, missing=nan,\n", + " monotone_constraints=None, multi_strategy=None,\n", + " n_estimators=100, n_jobs=None,\n", + " num_parallel_tree=None, random_state=42, ...))])
Recipe(ExpandDate(date(), components=['week', 'day']),\n", + " Drop(date()),\n", + " OneHotEncode(cols(('maritalst_893M', 'requesttype_4525192L', 'max_profession_152M', 'max_gender_992L', 'max_empl_industry_691L', 'max_housingtype_772L', 'max_incometype_1044T', 'max_cancelreason_3545846M', 'max_rejectreason_755M', 'education_1103M', 'max_status_219L'))),\n", + " Drop(string()),\n", + " HandleUnivariateOutliers(cols(('max_amount_1115A', 'max_overdueamountmax_950A')),\n", + " method='z-score',\n", + " deviation_factor=3,\n", + " treatment='capping'),\n", + " ImputeMedian(numeric()),\n", + " ScaleMinMax(numeric()),\n", + " FillNA(numeric(), 0),\n", + " Cast(numeric(), 'float32'))
ExpandDate(date(), components=['week', 'day'])
Drop(date())
OneHotEncode(cols(('maritalst_893M', 'requesttype_4525192L', 'max_profession_152M', 'max_gender_992L', 'max_empl_industry_691L', 'max_housingtype_772L', 'max_incometype_1044T', 'max_cancelreason_3545846M', 'max_rejectreason_755M', 'education_1103M', 'max_status_219L')))
Drop(string())
HandleUnivariateOutliers(cols(('max_amount_1115A', 'max_overdueamountmax_950A')),\n", + " method='z-score',\n", + " deviation_factor=3,\n", + " treatment='capping')
ImputeMedian(numeric())
ScaleMinMax(numeric())
FillNA(numeric(), 0)
Cast(numeric(), 'float32')
XGBClassifier(base_score=None, booster=None, callbacks=None,\n", + " colsample_bylevel=None, colsample_bynode=None,\n", + " colsample_bytree=0.8, device=None, early_stopping_rounds=None,\n", + " enable_categorical=False, eval_metric=None, feature_types=None,\n", + " gamma=None, grow_policy=None, importance_type=None,\n", + " interaction_constraints=None, learning_rate=0.05, max_bin=None,\n", + " max_cat_threshold=None, max_cat_to_onehot=None,\n", + " max_delta_step=None, max_depth=5, max_leaves=None,\n", + " min_child_weight=None, missing=nan, monotone_constraints=None,\n", + " multi_strategy=None, n_estimators=100, n_jobs=None,\n", + " num_parallel_tree=None, random_state=42, ...)