diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0fc33f2 --- /dev/null +++ b/.gitignore @@ -0,0 +1,149 @@ +# Manual additions +dataset/ + + +# Created by https://www.gitignore.io/api/code,macos,python,jupyternotebook + +### Code ### +# Visual Studio Code - https://code.visualstudio.com/ +.settings/ +.vscode/ +tsconfig.json +jsconfig.json + +### JupyterNotebook ### +.ipynb_checkpoints +*/.ipynb_checkpoints/* + +# Remove previous ipynb_checkpoints +# git rm -r .ipynb_checkpoints/ +# +### macOS ### +*.DS_Store +.AppleDouble +.LSOverride + +# Icon must end with two \r +Icon + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +.pytest_cache/ +nosetests.xml +coverage.xml +*.cover +.hypothesis/ + +# Translations +*.mo +*.pot + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule.* + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ + + +# End of https://www.gitignore.io/api/code,macos,python,jupyternotebook + diff --git a/test.ipynb b/test.ipynb new file mode 100644 index 0000000..a4f0c7d --- /dev/null +++ b/test.ipynb @@ -0,0 +1,703 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "ExecuteTime": { + "end_time": "2018-03-28T20:59:25.682883Z", + "start_time": "2018-03-28T20:59:19.498143Z" + } + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "DUMMY_DATA_PATH = 'dataset/dummy/'\n", + "DUMMY_BANK_DATA = DUMMY_DATA_PATH+'BSA.csv'\n", + "DUMMY_MAIN_DATA = DUMMY_DATA_PATH+'data.csv'" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "ExecuteTime": { + "end_time": "2018-03-28T21:03:06.490558Z", + "start_time": "2018-03-28T21:03:06.465137Z" + } + }, + "outputs": [], + "source": [ + "main_df = pd.read_csv(DUMMY_MAIN_DATA)\n", + "bank_df = pd.read_csv(DUMMY_BANK_DATA)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "ExecuteTime": { + "end_time": "2018-03-28T21:03:06.752422Z", + "start_time": "2018-03-28T21:03:06.705442Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
addressads_cmpidads_creativeads_matchtypeads_networkads_targetidamountapplication_idbirthdatebrowser...registered_office_cityregistered_office_staterole_in_firmrole_on_applicationseo_citystateutm_mediumutm_sourceutm_termyear_of_incorporation
0NaNNaNNaNNaNNaNNaNNaN1024.0NaNOpera...NaNNaNNaN0NaNNaNNaNNaNNaNNaN
1At. Pandharbodi, GondiyaNaNNaNNaNNaNNaN300000.0716.023/10/1982Chrome...GondiaMAHARASHTRA1.04NaNMAHARASHTRANaNNaNNaN2014.0
2Near Heena Manjeel, Serve No. 53,, kale padal ...NaNNaNNaNNaNNaN200000.01031.008/09/1987Chrome...PUNEMAHARASHTRA1.04NaNMAHARASHTRANaNNaNNaN2016.0
3Mangasule Gali977169039.02.312225e+11e{google_search}kwd-11424241300000.02056.002/04/1982Chrome...PuneMAHARASHTRA1.04NaNKARNATAKAppcadwordsbusiness loans2014.0
4Near Pratiksha Building,, 1, Natraj Niwas, Ata...NaNNaNNaNNaNNaN500000.09047.013/04/1979Chrome...THANEMAHARASHTRA1.04NaNMAHARASHTRABannerFacebookCarousel-Ad2014.0
\n", + "

5 rows × 41 columns

\n", + "
" + ], + "text/plain": [ + " address ads_cmpid \\\n", + "0 NaN NaN \n", + "1 At. Pandharbodi, Gondiya NaN \n", + "2 Near Heena Manjeel, Serve No. 53,, kale padal ... NaN \n", + "3 Mangasule Gali 977169039.0 \n", + "4 Near Pratiksha Building,, 1, Natraj Niwas, Ata... NaN \n", + "\n", + " ads_creative ads_matchtype ads_network ads_targetid amount \\\n", + "0 NaN NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN 300000.0 \n", + "2 NaN NaN NaN NaN 200000.0 \n", + "3 2.312225e+11 e {google_search} kwd-11424241 300000.0 \n", + "4 NaN NaN NaN NaN 500000.0 \n", + "\n", + " application_id birthdate browser ... \\\n", + "0 1024.0 NaN Opera ... \n", + "1 716.0 23/10/1982 Chrome ... \n", + "2 1031.0 08/09/1987 Chrome ... \n", + "3 2056.0 02/04/1982 Chrome ... \n", + "4 9047.0 13/04/1979 Chrome ... \n", + "\n", + " registered_office_city registered_office_state role_in_firm \\\n", + "0 NaN NaN NaN \n", + "1 Gondia MAHARASHTRA 1.0 \n", + "2 PUNE MAHARASHTRA 1.0 \n", + "3 Pune MAHARASHTRA 1.0 \n", + "4 THANE MAHARASHTRA 1.0 \n", + "\n", + " role_on_application seo_city state utm_medium utm_source \\\n", + "0 0 NaN NaN NaN NaN \n", + "1 4 NaN MAHARASHTRA NaN NaN \n", + "2 4 NaN MAHARASHTRA NaN NaN \n", + "3 4 NaN KARNATAKA ppc adwords \n", + "4 4 NaN MAHARASHTRA Banner Facebook \n", + "\n", + " utm_term year_of_incorporation \n", + "0 NaN NaN \n", + "1 NaN 2014.0 \n", + "2 NaN 2016.0 \n", + "3 business loans 2014.0 \n", + "4 Carousel-Ad 2014.0 \n", + "\n", + "[5 rows x 41 columns]" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "main_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "ExecuteTime": { + "end_time": "2018-03-28T21:03:09.592675Z", + "start_time": "2018-03-28T21:03:09.515172Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ads_matchtypeads_networkamountapplication_idbirthdatebrowsercampaign_citycitycompany_sizecountry...registered_office_cityregistered_office_staterole_in_firmrole_on_applicationseo_citystateutm_mediumutm_sourceutm_termyear_of_incorporation
0NaNNaNNaN1024.0NaNOperaNaNNaNNaNNaN...NaNNaNNaN0NaNNaNNaNNaNNaNNaN
1NaNNaN300000.0716.023/10/1982ChromeNaNNaNNaNIndia...GondiaMAHARASHTRA1.04NaNMAHARASHTRANaNNaNNaN2014.0
2NaNNaN200000.01031.008/09/1987ChromeNaNNaNNaNIndia...PUNEMAHARASHTRA1.04NaNMAHARASHTRANaNNaNNaN2016.0
3e{google_search}300000.02056.002/04/1982ChromePuneMumbai5.0India...PuneMAHARASHTRA1.04NaNKARNATAKAppcadwordsbusiness loans2014.0
4NaNNaN500000.09047.013/04/1979ChromeNaNMumbaiNaNIndia...THANEMAHARASHTRA1.04NaNMAHARASHTRABannerFacebookCarousel-Ad2014.0
\n", + "

5 rows × 35 columns

\n", + "
" + ], + "text/plain": [ + " ads_matchtype ads_network amount application_id birthdate \\\n", + "0 NaN NaN NaN 1024.0 NaN \n", + "1 NaN NaN 300000.0 716.0 23/10/1982 \n", + "2 NaN NaN 200000.0 1031.0 08/09/1987 \n", + "3 e {google_search} 300000.0 2056.0 02/04/1982 \n", + "4 NaN NaN 500000.0 9047.0 13/04/1979 \n", + "\n", + " browser campaign_city city company_size country ... \\\n", + "0 Opera NaN NaN NaN NaN ... \n", + "1 Chrome NaN NaN NaN India ... \n", + "2 Chrome NaN NaN NaN India ... \n", + "3 Chrome Pune Mumbai 5.0 India ... \n", + "4 Chrome NaN Mumbai NaN India ... \n", + "\n", + " registered_office_city registered_office_state role_in_firm \\\n", + "0 NaN NaN NaN \n", + "1 Gondia MAHARASHTRA 1.0 \n", + "2 PUNE MAHARASHTRA 1.0 \n", + "3 Pune MAHARASHTRA 1.0 \n", + "4 THANE MAHARASHTRA 1.0 \n", + "\n", + " role_on_application seo_city state utm_medium utm_source \\\n", + "0 0 NaN NaN NaN NaN \n", + "1 4 NaN MAHARASHTRA NaN NaN \n", + "2 4 NaN MAHARASHTRA NaN NaN \n", + "3 4 NaN KARNATAKA ppc adwords \n", + "4 4 NaN MAHARASHTRA Banner Facebook \n", + "\n", + " utm_term year_of_incorporation \n", + "0 NaN NaN \n", + "1 NaN 2014.0 \n", + "2 NaN 2016.0 \n", + "3 business loans 2014.0 \n", + "4 Carousel-Ad 2014.0 \n", + "\n", + "[5 rows x 35 columns]" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "to_remove_cols = [\n", + " 'address', # textual, and hence not helpful\n", + " 'ads_cmpid', # unique key\n", + " 'ads_creative', # unique key\n", + "# 'ads_matchtype',\n", + "# 'ads_network',\n", + " 'ads_targetid', # unique key\n", + "# 'amount',\n", + "# 'application_id',\n", + "# 'birthdate',\n", + "# 'browser',\n", + "# 'campaign_city',\n", + "# 'city',\n", + "# 'company_size',\n", + " 'country', # all are india as-is, so no point keeping it..\n", + " 'created_date', # has no correlation on the model.. 
\n", + "# 'email',\n", + " 'firm_name', # each name is potentially unique, also string so can't do much\n", + " 'firm_pan', # unique for each company, NOTE: there is a pattern that can be used to extract features!! (TODO)\n", + "# 'firm_type',\n", + "# 'gender',\n", + "# 'industry',\n", + "# 'ip',\n", + "# 'last_fy_profit',\n", + "# 'latitude',\n", + "# 'loan_created',\n", + "# 'longitude',\n", + "# 'name',\n", + "# 'network',\n", + " 'pan', # unique for each individual, NOTE: there is a pattern that can be used to extract features!! (TODO)\n", + "# 'pincode',\n", + "# 'platform',\n", + "# 'registered_office_city',\n", + "# 'registered_office_state',\n", + "# 'role_in_firm',\n", + "# 'role_on_application',\n", + "# 'seo_city',\n", + "# 'state',\n", + "# 'utm_medium',\n", + "# 'utm_source',\n", + "# 'utm_term',\n", + "# 'year_of_incorporation',\n", + "]\n", + "\n", + "main_df = main_df.drop(columns=to_remove_cols)\n", + "main_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "ExecuteTime": { + "end_time": "2018-03-28T21:03:19.300420Z", + "start_time": "2018-03-28T21:03:19.294635Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "432\n" + ] + } + ], + "source": [ + "'''\n", + "- make 0 amounts as NaNs\n", + "- birthdate -> age\n", + "- browser: some really low counts\n", + "- campaign_city: some really low counts\n", + "- city: some really low counts\n", + "- email: publicly hosted email domain or personal email domain\n", + "- firm_type: is skewed (need to figure things out..)\n", + "\n", + "'''\n", + "print(len(main_df))" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "ExecuteTime": { + "end_time": "2018-03-28T21:11:05.008409Z", + "start_time": "2018-03-28T21:11:04.998455Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "Male 339\n", + "Female 36\n", + "Name: gender, dtype: int64" + ] + }, + "execution_count": 35, + "metadata": {}, + 
"output_type": "execute_result" + } + ], + "source": [ + "# list(main_df)\n", + "main_df['gender'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2018-03-28T00:03:50.014321Z", + "start_time": "2018-03-28T00:03:50.002483Z" + } + }, + "outputs": [], + "source": [ + "# only keep applications that are in both data sets\n", + "appln_id = pd.Series(list(set(main_df['application_id']) & set(bank_df['appl_id'])))\n", + "main_df = main_df.loc[main_df['application_id'].isin(appln_id)]\n", + "bank_df = bank_df.loc[bank_df['appl_id'].isin(appln_id)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2018-03-28T00:03:51.265344Z", + "start_time": "2018-03-28T00:03:50.999445Z" + } + }, + "outputs": [], + "source": [ + "def _aggregate_columns(df, application_id_col):\n", + " # group by application id and merge all rows into lists\n", + " new_df = pd.DataFrame()\n", + " g = bank_df.groupby(application_id_col)\n", + " for k in list(df):\n", + " if k == application_id_col:\n", + " continue\n", + " new_df = pd.concat([new_df, g[k].apply(list)], axis=1)\n", + " return new_df.reset_index()\n", + "\n", + "\n", + "def setup_aggregations(df, application_id_col):\n", + " '''\n", + " fix bank data (for applications with multiple rows)\n", + " - average the averages\n", + " - add high_credit_cp\n", + " - add invard returns\n", + " - max of all the maxs\n", + " - min of all the mins\n", + " - add outward_returns\n", + " - drop totals (because average is better and normalized)\n", + " '''\n", + " df = df.drop(columns=['total_bi_inflow', 'total_bi_outflow', 'total_inflow', 'total_outflow'])\n", + " df = _aggregate_columns(df, application_id_col)\n", + " new_df = pd.DataFrame()\n", + " for k in list(df):\n", + " if k == application_id_col:\n", + " new_df = pd.concat([new_df, df[k]], axis=1)\n", + " elif 'average' in k:\n", + " new_df = pd.concat([new_df, 
df[k].apply(np.average)], axis=1)\n", + " elif 'max' in k:\n", + " new_df = pd.concat([new_df, df[k].apply(np.max)], axis=1)\n", + " elif 'min' in k:\n", + " new_df = pd.concat([new_df, df[k].apply(np.min)], axis=1)\n", + " else:\n", + " new_df = pd.concat([new_df, df[k].apply(np.sum)], axis=1)\n", + " return new_df\n", + "\n", + "bank_df = setup_aggregations(bank_df, 'appl_id')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}