diff --git a/Data Cleanup.ipynb b/Data Cleanup.ipynb
new file mode 100644
index 0000000..2cbfed2
--- /dev/null
+++ b/Data Cleanup.ipynb
@@ -0,0 +1,581 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2018-03-29T02:29:10.637998Z",
+ "start_time": "2018-03-29T02:29:08.722805Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "\n",
# Locations of the dummy data set; both CSV files live in the same folder.
DUMMY_DATA_PATH = 'dataset/dummy/'
DUMMY_BANK_DATA = f'{DUMMY_DATA_PATH}BSA.csv'   # bank data, later grouped by application_id
DUMMY_MAIN_DATA = f'{DUMMY_DATA_PATH}data.csv'  # main data, one frame of application attributes
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2018-03-29T02:29:11.492651Z",
+ "start_time": "2018-03-29T02:29:11.354573Z"
+ }
+ },
+ "outputs": [],
+ "source": [
# Load the two raw CSV files into dataframes (main data first, then bank data).
main_df = pd.read_csv(filepath_or_buffer=DUMMY_MAIN_DATA)
bank_df = pd.read_csv(filepath_or_buffer=DUMMY_BANK_DATA)
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2018-03-29T02:29:11.997304Z",
+ "start_time": "2018-03-29T02:29:11.911392Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " ads_matchtype | \n",
+ " ads_network | \n",
+ " amount | \n",
+ " application_id | \n",
+ " birthdate | \n",
+ " browser | \n",
+ " campaign_city | \n",
+ " city | \n",
+ " company_size | \n",
+ " email | \n",
+ " ... | \n",
+ " network | \n",
+ " platform | \n",
+ " registered_office_city | \n",
+ " registered_office_state | \n",
+ " role_in_firm | \n",
+ " role_on_application | \n",
+ " state | \n",
+ " utm_medium | \n",
+ " utm_source | \n",
+ " year_of_incorporation | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1024.0 | \n",
+ " NaN | \n",
+ " Opera | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " Kaif1779@gmail.com | \n",
+ " ... | \n",
+ " Opera Software Americas LLC | \n",
+ " mobile:Pike v8.0 release 461 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 300000.0 | \n",
+ " 716.0 | \n",
+ " 23/10/1982 | \n",
+ " Chrome | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " vihanmarketing36@gmail.com | \n",
+ " ... | \n",
+ " Idea Cellular Limited | \n",
+ " Win32 | \n",
+ " Gondia | \n",
+ " MAHARASHTRA | \n",
+ " 1.0 | \n",
+ " 4 | \n",
+ " MAHARASHTRA | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2014.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 200000.0 | \n",
+ " 1031.0 | \n",
+ " 08/09/1987 | \n",
+ " Chrome | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " faijiyatoursandtravels@gmail.com | \n",
+ " ... | \n",
+ " Idea Cellular Limited | \n",
+ " mobile:Linux armv8l | \n",
+ " PUNE | \n",
+ " MAHARASHTRA | \n",
+ " 1.0 | \n",
+ " 4 | \n",
+ " MAHARASHTRA | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2016.0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " e | \n",
+ " {google_search} | \n",
+ " 300000.0 | \n",
+ " 2056.0 | \n",
+ " 02/04/1982 | \n",
+ " Chrome | \n",
+ " Pune | \n",
+ " Mumbai | \n",
+ " 5.0 | \n",
+ " sagarnk2008@gmail.com | \n",
+ " ... | \n",
+ " Reliance Jio Infocomm Limited | \n",
+ " mobile:Linux aarch64 | \n",
+ " Pune | \n",
+ " MAHARASHTRA | \n",
+ " 1.0 | \n",
+ " 4 | \n",
+ " KARNATAKA | \n",
+ " ppc | \n",
+ " adwords | \n",
+ " 2014.0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 500000.0 | \n",
+ " 9047.0 | \n",
+ " 13/04/1979 | \n",
+ " Chrome | \n",
+ " NaN | \n",
+ " Mumbai | \n",
+ " NaN | \n",
+ " natrajmoily@gmail.com | \n",
+ " ... | \n",
+ " Syscon Infoway Pvt. Ltd. | \n",
+ " mobile:Linux armv8l | \n",
+ " THANE | \n",
+ " MAHARASHTRA | \n",
+ " 1.0 | \n",
+ " 4 | \n",
+ " MAHARASHTRA | \n",
+ " Banner | \n",
+ " Facebook | \n",
+ " 2014.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 25 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " ads_matchtype ads_network amount application_id birthdate \\\n",
+ "0 NaN NaN NaN 1024.0 NaN \n",
+ "1 NaN NaN 300000.0 716.0 23/10/1982 \n",
+ "2 NaN NaN 200000.0 1031.0 08/09/1987 \n",
+ "3 e {google_search} 300000.0 2056.0 02/04/1982 \n",
+ "4 NaN NaN 500000.0 9047.0 13/04/1979 \n",
+ "\n",
+ " browser campaign_city city company_size \\\n",
+ "0 Opera NaN NaN NaN \n",
+ "1 Chrome NaN NaN NaN \n",
+ "2 Chrome NaN NaN NaN \n",
+ "3 Chrome Pune Mumbai 5.0 \n",
+ "4 Chrome NaN Mumbai NaN \n",
+ "\n",
+ " email ... \\\n",
+ "0 Kaif1779@gmail.com ... \n",
+ "1 vihanmarketing36@gmail.com ... \n",
+ "2 faijiyatoursandtravels@gmail.com ... \n",
+ "3 sagarnk2008@gmail.com ... \n",
+ "4 natrajmoily@gmail.com ... \n",
+ "\n",
+ " network platform \\\n",
+ "0 Opera Software Americas LLC mobile:Pike v8.0 release 461 \n",
+ "1 Idea Cellular Limited Win32 \n",
+ "2 Idea Cellular Limited mobile:Linux armv8l \n",
+ "3 Reliance Jio Infocomm Limited mobile:Linux aarch64 \n",
+ "4 Syscon Infoway Pvt. Ltd. mobile:Linux armv8l \n",
+ "\n",
+ " registered_office_city registered_office_state role_in_firm \\\n",
+ "0 NaN NaN NaN \n",
+ "1 Gondia MAHARASHTRA 1.0 \n",
+ "2 PUNE MAHARASHTRA 1.0 \n",
+ "3 Pune MAHARASHTRA 1.0 \n",
+ "4 THANE MAHARASHTRA 1.0 \n",
+ "\n",
+ " role_on_application state utm_medium utm_source \\\n",
+ "0 0 NaN NaN NaN \n",
+ "1 4 MAHARASHTRA NaN NaN \n",
+ "2 4 MAHARASHTRA NaN NaN \n",
+ "3 4 KARNATAKA ppc adwords \n",
+ "4 4 MAHARASHTRA Banner Facebook \n",
+ "\n",
+ " year_of_incorporation \n",
+ "0 NaN \n",
+ "1 2014.0 \n",
+ "2 2016.0 \n",
+ "3 2014.0 \n",
+ "4 2014.0 \n",
+ "\n",
+ "[5 rows x 25 columns]"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
# Columns discarded from the main dataframe, each with the reason it is dropped.
to_remove_cols = [
    'address',       # free text, and hence not helpful
    'ads_cmpid',     # unique key
    'ads_creative',  # unique key
    'ads_targetid',  # unique key
    'birthdate',     # use firm age instead (better fit to the model)
    'country',       # every row is India as-is, so there is no point keeping it
    'created_date',  # has no correlation with the model
    'firm_name',     # each name is potentially unique; also a string, so not much can be done
    'firm_pan',      # unique for each company. TODO: there is a pattern that can be used to extract features
    'ip',            # TODO: can we do something about IPs?
    'latitude',      # unclear how helpful coordinates are; city and state are available instead
    'longitude',     # see comment for 'latitude'
    'name',          # see comment for 'firm_name'
    'pan',           # unique for each individual. TODO: there is a pattern that can be used to extract features
    'pincode',
    'seo_city',      # only 1 value
    'utm_term',      # no clear use for this data yet
]
# Columns deliberately kept:
#   ads_matchtype, ads_network, amount, application_id, browser, campaign_city,
#   city, company_size, email, firm_type, gender, industry, last_fy_profit,
#   loan_created, network, platform, registered_office_city,
#   registered_office_state, role_in_firm, role_on_application, state,
#   utm_medium, utm_source, year_of_incorporation
# loan_created is the TARGET. It is skewed, so even a naive implementation
# reaches ~96% accuracy.

# main_df is reassigned below, so when this cell is executed again the columns
# are already gone. Drop only what is still present and report the rest
# explicitly instead of failing with a KeyError.
already_absent = [col for col in to_remove_cols if col not in main_df.columns]
if already_absent:
    print('skipping columns that are not in main_df:', already_absent)

main_df = main_df.drop(columns=[col for col in to_remove_cols if col in main_df.columns])

# NOTE(review): this preview includes the raw `email` column; clear the cell
# output before sharing the notebook.
main_df.head()
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2018-03-29T02:33:17.414224Z",
+ "start_time": "2018-03-29T02:33:17.363625Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "432 25\n"
+ ]
+ }
+ ],
+ "source": [
# Things to clean in the main dataframe ([x] = done, [ ] = still open):
# - [x] amount: turn 0 amounts into NaNs
# - [x] birthdate: calculate the age of the person (is this needed if we have
#       the age of the firm?) --> column removed: using firm age instead
# - [ ] browser: some really low counts
# - [ ] campaign_city: some really low counts
# - [ ] city: some really low counts
# - [ ] email: publicly hosted email domain or personal email domain
# - [ ] firm_type: is skewed (still needs to be figured out)
# - [ ] last_fy_profit: convert 0 to NaNs
# - [ ] platform: combine all the 'mobile:Linux' values?
# - [ ] registered_office_city: combine equal values (they differ only in
#       letter case and are therefore treated as separate values)
# - [ ] role_in_firm: categorical, so don't use the numbers as-is
# - [ ] role_on_application: categorical, so don't use the numbers as-is
# - [ ] year_of_incorporation: -> compute the age of the firm

# Number of rows and number of columns currently in the main dataframe.
print(len(main_df), len(list(main_df)))

# Treat an amount of 0 as a missing value.
# np.nan is the documented spelling of the constant; alias spellings such as
# np.NaN were removed from the main namespace in NumPy 2.0.
main_df.loc[main_df['amount'] == 0, 'amount'] = np.nan
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2018-03-29T02:33:33.797848Z",
+ "start_time": "2018-03-29T02:33:33.772428Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " ads_matchtype | \n",
+ " ads_network | \n",
+ " amount | \n",
+ " application_id | \n",
+ " birthdate | \n",
+ " browser | \n",
+ " campaign_city | \n",
+ " city | \n",
+ " company_size | \n",
+ " email | \n",
+ " ... | \n",
+ " network | \n",
+ " platform | \n",
+ " registered_office_city | \n",
+ " registered_office_state | \n",
+ " role_in_firm | \n",
+ " role_on_application | \n",
+ " state | \n",
+ " utm_medium | \n",
+ " utm_source | \n",
+ " year_of_incorporation | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
0 rows × 25 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ "Empty DataFrame\n",
+ "Columns: [ads_matchtype, ads_network, amount, application_id, birthdate, browser, campaign_city, city, company_size, email, firm_type, gender, industry, last_fy_profit, loan_created, network, platform, registered_office_city, registered_office_state, role_in_firm, role_on_application, state, utm_medium, utm_source, year_of_incorporation]\n",
+ "Index: []\n",
+ "\n",
+ "[0 rows x 25 columns]"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2018-03-28T21:41:02.961905Z",
+ "start_time": "2018-03-28T21:41:02.944388Z"
+ },
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
# How often each incorporation year occurs in the main dataframe.
# NOTE(review): value_counts() orders its result by frequency, so the
# sort_values() call likely has no visible effect - confirm whether
# `.value_counts().sort_index()` was intended.
(
    main_df['year_of_incorporation']
    .sort_values()
    .value_counts()
)
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2018-03-28T21:44:00.401287Z",
+ "start_time": "2018-03-28T21:44:00.382401Z"
+ }
+ },
+ "outputs": [],
+ "source": [
# Keep only the applications that are present in BOTH data sets.
appln_id = pd.Series(
    list(set(main_df['application_id']) & set(bank_df['application_id']))
)
main_df, bank_df = (
    frame.loc[frame['application_id'].isin(appln_id)]
    for frame in (main_df, bank_df)
)
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2018-03-28T21:44:00.579850Z",
+ "start_time": "2018-03-28T21:44:00.566154Z"
+ }
+ },
+ "outputs": [],
+ "source": [
# Column names of the bank dataframe.
bank_df.columns.tolist()
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2018-03-28T21:44:01.613870Z",
+ "start_time": "2018-03-28T21:44:01.144643Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "def _aggregate_columns(df, application_id_col):\n",
+ " # group by application id and merge all rows into lists\n",
+ " new_df = pd.DataFrame()\n",
+ " g = bank_df.groupby(application_id_col)\n",
+ " for k in list(df):\n",
+ " if k == application_id_col:\n",
+ " continue\n",
+ " new_df = pd.concat([new_df, g[k].apply(list)], axis=1)\n",
+ " return new_df.reset_index()\n",
+ "\n",
+ "\n",
def setup_aggregations(df, application_id_col):
    '''
    Fix the bank data for applications that span multiple rows.

    The four total columns are dropped (the averages are preferred because
    they are normalized), the remaining rows are merged per application by
    `_aggregate_columns`, and each merged column is reduced according to its
    name, checked in this order:
    - name contains 'average' -> average of the averages
    - name contains 'max'     -> max of all the maxs
    - name contains 'min'     -> min of all the mins
    - anything else           -> sum (e.g. high_credit_cp, inward returns,
      outward_returns)
    The column named `application_id_col` is carried over unchanged.

    :param df: bank dataframe, including the four total columns
    :param application_id_col: name of the application id column
    :return: dataframe holding the reduced columns
    '''
    def reducer_for(column_name):
        # Same precedence as the rule list in the docstring.
        if 'average' in column_name:
            return np.average
        if 'max' in column_name:
            return np.max
        if 'min' in column_name:
            return np.min
        return np.sum

    totals = ['total_business_inflow', 'total_business_outflow', 'total_inflow', 'total_outflow']
    merged = _aggregate_columns(df.drop(columns=totals), application_id_col)

    pieces = [
        merged[name] if name == application_id_col
        else merged[name].apply(reducer_for(name))
        for name in list(merged)
    ]
    # Start from an empty frame and attach every column side by side.
    return pd.concat([pd.DataFrame()] + pieces, axis=1)
+ "\n",
# Replace the raw bank data with its aggregated form.
bank_df = setup_aggregations(df=bank_df, application_id_col='application_id')
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2018-03-28T21:44:06.462793Z",
+ "start_time": "2018-03-28T21:44:06.428801Z"
+ }
+ },
+ "outputs": [],
+ "source": [
# Preview the first five rows of the bank dataframe.
bank_df.head(n=5)
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}