diff --git a/Data Cleanup.ipynb b/Data Cleanup.ipynb index 2cbfed2..74bc9d7 100644 --- a/Data Cleanup.ipynb +++ b/Data Cleanup.ipynb @@ -2,11 +2,11 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 24, "metadata": { "ExecuteTime": { - "end_time": "2018-03-29T02:29:10.637998Z", - "start_time": "2018-03-29T02:29:08.722805Z" + "end_time": "2018-03-30T01:25:57.804317Z", + "start_time": "2018-03-30T01:25:57.794581Z" } }, "outputs": [], @@ -16,16 +16,35 @@ "\n", "DUMMY_DATA_PATH = 'dataset/dummy/'\n", "DUMMY_BANK_DATA = DUMMY_DATA_PATH+'BSA.csv'\n", - "DUMMY_MAIN_DATA = DUMMY_DATA_PATH+'data.csv'" + "DUMMY_MAIN_DATA = DUMMY_DATA_PATH+'data.csv'\n", + "\n", + "FINAL_DATA = 'dataset/loan.csv'\n", + "TRAIN_DATA = 'dataset/loan_one_hot_encoded.csv'" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "ExecuteTime": { + "end_time": "2018-03-30T01:25:57.847620Z", + "start_time": "2018-03-30T01:25:57.808610Z" + } + }, + "outputs": [], + "source": [ + "PUBLIC_EMAIL_DOMAINS = ()\n", + "with open('public-email-domains.txt', 'r') as f:\n", + " PUBLIC_EMAIL_DOMAINS = tuple(d.strip() for d in f.readlines())" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 26, "metadata": { "ExecuteTime": { - "end_time": "2018-03-29T02:29:11.492651Z", - "start_time": "2018-03-29T02:29:11.354573Z" + "end_time": "2018-03-30T01:25:57.898990Z", + "start_time": "2018-03-30T01:25:57.852012Z" } }, "outputs": [], @@ -36,11 +55,11 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 27, "metadata": { "ExecuteTime": { - "end_time": "2018-03-29T02:29:11.997304Z", - "start_time": "2018-03-29T02:29:11.911392Z" + "end_time": "2018-03-30T01:25:58.013508Z", + "start_time": "2018-03-30T01:25:57.902458Z" } }, "outputs": [ @@ -69,14 +88,14 @@ " ads_network\n", " amount\n", " application_id\n", - " birthdate\n", " browser\n", " campaign_city\n", " city\n", " company_size\n", " email\n", + " firm_type\n", " ...\n", - " network\n", + " loan_created\n", " platform\n", " registered_office_city\n", " registered_office_state\n", @@ -95,14 +114,14 @@ " NaN\n", " NaN\n", " 1024.0\n", - " NaN\n", " Opera\n", " NaN\n", " NaN\n", " NaN\n", " Kaif1779@gmail.com\n", + " NaN\n", " ...\n", - " Opera Software Americas LLC\n", + " 0\n", " mobile:Pike v8.0 release 461\n", " NaN\n", " NaN\n", @@ -119,14 +138,14 @@ " NaN\n", " 300000.0\n", " 716.0\n", - " 23/10/1982\n", " Chrome\n", " NaN\n", " NaN\n", " NaN\n", " vihanmarketing36@gmail.com\n", + " Proprietorship\n", " ...\n", - " Idea Cellular Limited\n", + " 1\n", " Win32\n", " Gondia\n", " MAHARASHTRA\n", @@ -143,14 +162,14 @@ " NaN\n", " 200000.0\n", " 1031.0\n", - " 08/09/1987\n", " Chrome\n", " NaN\n", " NaN\n", " NaN\n", " faijiyatoursandtravels@gmail.com\n", + " Proprietorship\n", " ...\n", - " Idea Cellular Limited\n", + " 0\n", " mobile:Linux armv8l\n", " PUNE\n", " MAHARASHTRA\n", @@ -167,14 +186,14 @@ " {google_search}\n", " 300000.0\n", " 2056.0\n", - " 02/04/1982\n", " Chrome\n", " Pune\n", " Mumbai\n", " 5.0\n", " sagarnk2008@gmail.com\n", + " Proprietorship\n", " ...\n", - " Reliance Jio Infocomm Limited\n", + " 0\n", " mobile:Linux aarch64\n", " Pune\n", " MAHARASHTRA\n", @@ -191,14 +210,14 @@ " NaN\n", " 500000.0\n", " 9047.0\n", - " 13/04/1979\n", " Chrome\n", " NaN\n", " Mumbai\n", " NaN\n", " natrajmoily@gmail.com\n", + " Proprietorship\n", " ...\n", - " Syscon Infoway Pvt. Ltd.\n", + " 0\n", " mobile:Linux armv8l\n", " THANE\n", " MAHARASHTRA\n", @@ -211,63 +230,56 @@ " \n", " \n", "\n", - "

5 rows × 25 columns

\n", + "

5 rows × 23 columns

\n", "" ], "text/plain": [ - " ads_matchtype ads_network amount application_id birthdate \\\n", - "0 NaN NaN NaN 1024.0 NaN \n", - "1 NaN NaN 300000.0 716.0 23/10/1982 \n", - "2 NaN NaN 200000.0 1031.0 08/09/1987 \n", - "3 e {google_search} 300000.0 2056.0 02/04/1982 \n", - "4 NaN NaN 500000.0 9047.0 13/04/1979 \n", - "\n", - " browser campaign_city city company_size \\\n", - "0 Opera NaN NaN NaN \n", - "1 Chrome NaN NaN NaN \n", - "2 Chrome NaN NaN NaN \n", - "3 Chrome Pune Mumbai 5.0 \n", - "4 Chrome NaN Mumbai NaN \n", + " ads_matchtype ads_network amount application_id browser \\\n", + "0 NaN NaN NaN 1024.0 Opera \n", + "1 NaN NaN 300000.0 716.0 Chrome \n", + "2 NaN NaN 200000.0 1031.0 Chrome \n", + "3 e {google_search} 300000.0 2056.0 Chrome \n", + "4 NaN NaN 500000.0 9047.0 Chrome \n", "\n", - " email ... \\\n", - "0 Kaif1779@gmail.com ... \n", - "1 vihanmarketing36@gmail.com ... \n", - "2 faijiyatoursandtravels@gmail.com ... \n", - "3 sagarnk2008@gmail.com ... \n", - "4 natrajmoily@gmail.com ... \n", + " campaign_city city company_size email \\\n", + "0 NaN NaN NaN Kaif1779@gmail.com \n", + "1 NaN NaN NaN vihanmarketing36@gmail.com \n", + "2 NaN NaN NaN faijiyatoursandtravels@gmail.com \n", + "3 Pune Mumbai 5.0 sagarnk2008@gmail.com \n", + "4 NaN Mumbai NaN natrajmoily@gmail.com \n", "\n", - " network platform \\\n", - "0 Opera Software Americas LLC mobile:Pike v8.0 release 461 \n", - "1 Idea Cellular Limited Win32 \n", - "2 Idea Cellular Limited mobile:Linux armv8l \n", - "3 Reliance Jio Infocomm Limited mobile:Linux aarch64 \n", - "4 Syscon Infoway Pvt. Ltd. mobile:Linux armv8l \n", + " firm_type ... loan_created \\\n", + "0 NaN ... 0 \n", + "1 Proprietorship ... 1 \n", + "2 Proprietorship ... 0 \n", + "3 Proprietorship ... 0 \n", + "4 Proprietorship ... 0 \n", "\n", - " registered_office_city registered_office_state role_in_firm \\\n", - "0 NaN NaN NaN \n", - "1 Gondia MAHARASHTRA 1.0 \n", - "2 PUNE MAHARASHTRA 1.0 \n", - "3 Pune MAHARASHTRA 1.0 \n", - "4 THANE MAHARASHTRA 1.0 \n", + " platform registered_office_city \\\n", + "0 mobile:Pike v8.0 release 461 NaN \n", + "1 Win32 Gondia \n", + "2 mobile:Linux armv8l PUNE \n", + "3 mobile:Linux aarch64 Pune \n", + "4 mobile:Linux armv8l THANE \n", "\n", - " role_on_application state utm_medium utm_source \\\n", - "0 0 NaN NaN NaN \n", - "1 4 MAHARASHTRA NaN NaN \n", - "2 4 MAHARASHTRA NaN NaN \n", - "3 4 KARNATAKA ppc adwords \n", - "4 4 MAHARASHTRA Banner Facebook \n", + " registered_office_state role_in_firm role_on_application state \\\n", + "0 NaN NaN 0 NaN \n", + "1 MAHARASHTRA 1.0 4 MAHARASHTRA \n", + "2 MAHARASHTRA 1.0 4 MAHARASHTRA \n", + "3 MAHARASHTRA 1.0 4 KARNATAKA \n", + "4 MAHARASHTRA 1.0 4 MAHARASHTRA \n", "\n", - " year_of_incorporation \n", - "0 NaN \n", - "1 2014.0 \n", - "2 2016.0 \n", - "3 2014.0 \n", - "4 2014.0 \n", + " utm_medium utm_source year_of_incorporation \n", + "0 NaN NaN NaN \n", + "1 NaN NaN 2014.0 \n", + "2 NaN NaN 2016.0 \n", + "3 ppc adwords 2014.0 \n", + "4 Banner Facebook 2014.0 \n", "\n", - "[5 rows x 25 columns]" + "[5 rows x 23 columns]" ] }, - "execution_count": 3, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -301,7 +313,7 @@ "# 'loan_created', # THIS IS OUR TARGET! THIS IS SKEWED, SO A NAIVE IMPL WILL ALSO HAVE 96% ACCURACY :D \n", " 'longitude', # see comment for 'latitude'\n", " 'name', # see comment for firm name\n", - "# 'network',\n", + " 'network', # don't need it.. too many random values.. \n", " 'pan', # unique for each individual, NOTE: there is a pattern than can be used to extract features!! (TODO)\n", " 'pincode',\n", "# 'platform',\n", @@ -323,19 +335,20 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 28, "metadata": { "ExecuteTime": { - "end_time": "2018-03-29T02:33:17.414224Z", - "start_time": "2018-03-29T02:33:17.363625Z" - } + "end_time": "2018-03-30T01:25:58.293820Z", + "start_time": "2018-03-30T01:25:58.018503Z" + }, + "scrolled": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "432 25\n" + "432 23\n" ] } ], @@ -347,27 +360,208 @@ "- [ ] browser: some really low counts\n", "- [ ] campaign_city: some really low counts\n", "- [ ] city: some really low counts\n", - "- [ ] email: publicly hosted email domain or personal email domain\n", + "- [x] email: publicly hosted email domain or personal email domain\n", "- [ ] firm_type: is skewed (need to figure things out..)\n", - "- [ ] last_fy_profit: convert 0 to NaNs\n", - "- [ ] platform: combine all the 'mobile:Linux'?\n", - "- [ ] registered_office_city: combine same values (cases are different hence are treated as separate values)\n", + "- [x] last_fy_profit: convert 0 to NaNs\n", + "- [x] platform: combine all the 'mobile:Linux'?\n", + "- [x] registered_office_city: combine same values (cases are different hence are treated as separate values)\n", "- [ ] role_in_firm: categorical; so don't use the numbers as is..\n", "- [ ] role_on_application: categorical; so don't use the numbers as is..\n", - "- [ ] year_of_incorporation: -> compute age of firm\n", + "- [x] year_of_incorporation: -> compute age of firm\n", "'''\n", - "print(len(main_df), len(list(main_df)))\n", + "# remove 0 amounts\n", + "main_df.loc[main_df['amount'] == 0, 'amount'] = np.NAN\n", + "\n", + "# create a boolean column\n", + "main_df['private_email_domain'] = False\n", + "for index, row in main_df.iterrows():\n", + " email = row['email']\n", + " if email is not np.NaN and email.split('@')[1] not in PUBLIC_EMAIL_DOMAINS:\n", + " main_df.loc[index, 'private_email_domain'] = True\n", + "main_df = main_df.drop(columns=['email']) # drop email column\n", + "\n", + "# remove 0 last_fy_profit\n", + "main_df.loc[main_df['last_fy_profit'] == 0, 'last_fy_profit'] = np.NAN\n", + "\n", + "# make all text uppercase in registered_office_city\n", + "main_df['registered_office_city'] = main_df['registered_office_city'].str.upper()\n", + "\n", + "# compute age of firm\n", + "# main_df.loc[main_df['year_of_incorporation'] == 0, 'year_of_incorporation'] = np.NAN\n", + "main_df['age_of_firm'] = np.nan\n", + "def compute_age_of_firm(x):\n", + " if x is np.nan:\n", + " return np.nan\n", + " elif type(x) == str:\n", + " if '/' in x:\n", + " x = x.split('/')[1].split('.')[0].strip('., ')\n", + " x = x.split('.')[0].strip('., ')\n", + " x = int(x)\n", + " if 2018-x == 2018:\n", + " return np.nan\n", + " return 2018-x\n", + " else:\n", + " return x\n", + "main_df['age_of_firm'] = list(map(compute_age_of_firm, main_df['year_of_incorporation']))\n", + "main_df = main_df.drop(columns=['year_of_incorporation']) # drop email column\n", + "\n", + "# strip the redundant brace brakets around the ads_network\n", + "main_df['ads_network'] = list(map(lambda x: x.strip('}{') if type(x) == str else x, main_df['ads_network']))\n", + "\n", + "# platform: combine all the 'mobile:Linux'?\n", + "main_df['platform'] = list(map(lambda x: 'mobile:Linux' if type(x) == str and 'mobile:Linux' in x else x, main_df['platform']))\n", + "\n", + "print(len(main_df), len(list(main_df)))" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "ExecuteTime": { + "end_time": "2018-03-30T01:25:58.310947Z", + "start_time": "2018-03-30T01:25:58.301339Z" + } + }, + "outputs": [], + "source": [ + "# # only keep applications that are in both data sets\n", + "# appln_id = pd.Series(list(set(main_df['application_id']) & set(bank_df['application_id'])))\n", + "# main_df = main_df.loc[main_df['application_id'].isin(appln_id)]\n", + "# bank_df = bank_df.loc[bank_df['application_id'].isin(appln_id)]" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "ExecuteTime": { + "end_time": "2018-03-30T01:25:58.352141Z", + "start_time": "2018-03-30T01:25:58.317396Z" + } + }, + "outputs": [], + "source": [ + "# list(bank_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "ExecuteTime": { + "end_time": "2018-03-30T01:25:58.958612Z", + "start_time": "2018-03-30T01:25:58.357356Z" + } + }, + "outputs": [], + "source": [ + "def _aggregate_columns(df, application_id_col):\n", + " # group by application id and merge all rows into lists\n", + " new_df = pd.DataFrame()\n", + " g = bank_df.groupby(application_id_col)\n", + " for k in list(df):\n", + " if k == application_id_col:\n", + " continue\n", + " new_df = pd.concat([new_df, g[k].apply(list)], axis=1)\n", + " return new_df.reset_index()\n", + "\n", + "\n", + "def setup_aggregations(df, application_id_col):\n", + " '''\n", + " fix bank data (for applications with multiple rows)\n", + " - average the averages\n", + " - add high_credit_cp\n", + " - add invard returns\n", + " - max of all the maxs\n", + " - min of all the mins\n", + " - add outward_returns\n", + " - drop totals (because average is better and normalized)\n", + " '''\n", + " df = df.drop(columns=['total_business_inflow', 'total_business_outflow', 'total_inflow', 'total_outflow'])\n", + " df = _aggregate_columns(df, application_id_col)\n", + " new_df = pd.DataFrame()\n", + " for k in list(df):\n", + " if k == application_id_col:\n", + " new_df = pd.concat([new_df, df[k]], axis=1)\n", + " elif 'average' in k:\n", + " new_df = pd.concat([new_df, df[k].apply(np.average)], axis=1)\n", + " elif 'max' in k:\n", + " new_df = pd.concat([new_df, df[k].apply(np.max)], axis=1)\n", + " elif 'min' in k:\n", + " new_df = pd.concat([new_df, df[k].apply(np.min)], axis=1)\n", + " else:\n", + " new_df = pd.concat([new_df, df[k].apply(np.sum)], axis=1)\n", + " return new_df\n", "\n", - "main_df.loc[main_df['amount'] == 0, 'amount'] = np.NAN" + "bank_df = setup_aggregations(bank_df, 'application_id')" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "ExecuteTime": { + "end_time": "2018-03-30T01:25:58.971549Z", + "start_time": "2018-03-30T01:25:58.963296Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "230 16\n" + ] + } + ], + "source": [ + "print(len(bank_df), len(list(bank_df)))" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "ExecuteTime": { + "end_time": "2018-03-30T01:25:58.993300Z", + "start_time": "2018-03-30T01:25:58.977102Z" + } + }, + "outputs": [], + "source": [ + "df = pd.merge(main_df, bank_df, on='application_id')" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "ExecuteTime": { + "end_time": "2018-03-30T01:25:59.031417Z", + "start_time": "2018-03-30T01:25:58.998453Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "230 38\n" + ] + } + ], + "source": [ + "print(len(df), len(list(df)))" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 35, "metadata": { "ExecuteTime": { - "end_time": "2018-03-29T02:33:33.797848Z", - "start_time": "2018-03-29T02:33:33.772428Z" + "end_time": "2018-03-30T01:25:59.093342Z", + "start_time": "2018-03-30T01:25:59.039687Z" } }, "outputs": [ @@ -396,157 +590,2261 @@ " ads_network\n", " amount\n", " application_id\n", - " birthdate\n", " browser\n", " campaign_city\n", " city\n", " company_size\n", - " email\n", + " firm_type\n", + " gender\n", " ...\n", - " network\n", - " platform\n", - " registered_office_city\n", - " registered_office_state\n", - " role_in_firm\n", - " role_on_application\n", - " state\n", - " utm_medium\n", - " utm_source\n", - " year_of_incorporation\n", + " inward_returns\n", + " max_business_inflow\n", + " max_business_outflow\n", + " max_inflow\n", + " max_outflow\n", + " min_business_inflow\n", + " min_business_outflow\n", + " min_inflow\n", + " min_outflow\n", + " outward_returns\n", " \n", " \n", " \n", + " \n", + " 0\n", + " NaN\n", + " NaN\n", + " 200000.0\n", + " 1031\n", + " Chrome\n", + " NaN\n", + " NaN\n", + " NaN\n", + " Proprietorship\n", + " Male\n", + " ...\n", + " 2\n", + " 502725\n", + " 570348\n", + " 502725\n", + " 502725\n", + " 35981\n", + " 24331\n", + " 35981\n", + " 35981\n", + " 0\n", + " \n", + " \n", + " 1\n", + " e\n", + " google_search\n", + " 300000.0\n", + " 2056\n", + " Chrome\n", + " Pune\n", + " Mumbai\n", + " 5.0\n", + " Proprietorship\n", + " Male\n", + " ...\n", + " 0\n", + " 159971\n", + " 159356\n", + " 159971\n", + " 159971\n", + " 43826\n", + " 47157\n", + " 43826\n", + " 43826\n", + " 0\n", + " \n", + " \n", + " 2\n", + " NaN\n", + " NaN\n", + " 500000.0\n", + " 9047\n", + " Chrome\n", + " NaN\n", + " Mumbai\n", + " NaN\n", + " Proprietorship\n", + " Male\n", + " ...\n", + " 0\n", + " 134835\n", + " 133462\n", + " 134835\n", + " 134835\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", + " \n", + " \n", + " 3\n", + " NaN\n", + " google_display\n", + " 500000.0\n", + " 2068\n", + " Chrome\n", + " NaN\n", + " Mumbai\n", + " 5.0\n", + " Proprietorship\n", + " Male\n", + " ...\n", + " 0\n", + " 879035\n", + " 780395\n", + " 879035\n", + " 879035\n", + " 21\n", + " 31236\n", + " 21\n", + " 21\n", + " 0\n", + " \n", + " \n", + " 4\n", + " b\n", + " google_search\n", + " 500000.0\n", + " 2737\n", + " Chrome\n", + " Jaipur\n", + " Ajmer\n", + " 5.0\n", + " Proprietorship\n", + " Male\n", + " ...\n", + " 0\n", + " 373105\n", + " 285950\n", + " 373105\n", + " 373105\n", + " 22000\n", + " 30008\n", + " 22000\n", + " 22000\n", + " 3\n", + " \n", " \n", "\n", - "

0 rows × 25 columns

\n", + "

5 rows × 38 columns

\n", "" ], "text/plain": [ - "Empty DataFrame\n", - "Columns: [ads_matchtype, ads_network, amount, application_id, birthdate, browser, campaign_city, city, company_size, email, firm_type, gender, industry, last_fy_profit, loan_created, network, platform, registered_office_city, registered_office_state, role_in_firm, role_on_application, state, utm_medium, utm_source, year_of_incorporation]\n", - "Index: []\n", + " ads_matchtype ads_network amount application_id browser \\\n", + "0 NaN NaN 200000.0 1031 Chrome \n", + "1 e google_search 300000.0 2056 Chrome \n", + "2 NaN NaN 500000.0 9047 Chrome \n", + "3 NaN google_display 500000.0 2068 Chrome \n", + "4 b google_search 500000.0 2737 Chrome \n", + "\n", + " campaign_city city company_size firm_type gender ... \\\n", + "0 NaN NaN NaN Proprietorship Male ... \n", + "1 Pune Mumbai 5.0 Proprietorship Male ... \n", + "2 NaN Mumbai NaN Proprietorship Male ... \n", + "3 NaN Mumbai 5.0 Proprietorship Male ... \n", + "4 Jaipur Ajmer 5.0 Proprietorship Male ... \n", + "\n", + " inward_returns max_business_inflow max_business_outflow max_inflow \\\n", + "0 2 502725 570348 502725 \n", + "1 0 159971 159356 159971 \n", + "2 0 134835 133462 134835 \n", + "3 0 879035 780395 879035 \n", + "4 0 373105 285950 373105 \n", "\n", - "[0 rows x 25 columns]" + " max_outflow min_business_inflow min_business_outflow min_inflow \\\n", + "0 502725 35981 24331 35981 \n", + "1 159971 43826 47157 43826 \n", + "2 134835 0 0 0 \n", + "3 879035 21 31236 21 \n", + "4 373105 22000 30008 22000 \n", + "\n", + " min_outflow outward_returns \n", + "0 35981 0 \n", + "1 43826 0 \n", + "2 0 0 \n", + "3 21 0 \n", + "4 22000 3 \n", + "\n", + "[5 rows x 38 columns]" ] }, - "execution_count": 13, + "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "ExecuteTime": { - "end_time": "2018-03-28T21:41:02.961905Z", - "start_time": "2018-03-28T21:41:02.944388Z" - }, - "scrolled": true - }, - "outputs": [], "source": [ - "# list(main_df)\n", - "main_df['year_of_incorporation'].sort_values().value_counts()" + "df.head()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 36, "metadata": { "ExecuteTime": { - "end_time": "2018-03-28T21:44:00.401287Z", - "start_time": "2018-03-28T21:44:00.382401Z" + "end_time": "2018-03-30T01:25:59.120310Z", + "start_time": "2018-03-30T01:25:59.100882Z" } }, "outputs": [], "source": [ - "# only keep applications that are in both data sets\n", - "appln_id = pd.Series(list(set(main_df['application_id']) & set(bank_df['application_id'])))\n", - "main_df = main_df.loc[main_df['application_id'].isin(appln_id)]\n", - "bank_df = bank_df.loc[bank_df['application_id'].isin(appln_id)]" + "df.to_csv(FINAL_DATA, index=False)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 37, "metadata": { "ExecuteTime": { - "end_time": "2018-03-28T21:44:00.579850Z", - "start_time": "2018-03-28T21:44:00.566154Z" + "end_time": "2018-03-30T01:25:59.233405Z", + "start_time": "2018-03-30T01:25:59.126551Z" } }, "outputs": [], "source": [ - "list(bank_df)" + "categorical_cols = [\n", + " 'ads_matchtype',\n", + " 'ads_network',\n", + "# 'amount',\n", + "# 'application_id',\n", + " 'browser',\n", + " 'campaign_city',\n", + " 'city',\n", + "# 'company_size',\n", + " 'firm_type',\n", + " 'gender',\n", + " 'industry',\n", + "# 'last_fy_profit',\n", + "# 'loan_created',\n", + " 'platform',\n", + " 'registered_office_city',\n", + " 'registered_office_state',\n", + " 'role_in_firm',\n", + " 'role_on_application',\n", + " 'state',\n", + " 'utm_medium',\n", + " 'utm_source',\n", + " 'private_email_domain',\n", + "# 'age_of_firm',\n", + "# 'average_business_inflow',\n", + "# 'average_business_outflow',\n", + "# 'average_inflow',\n", + "# 'average_outflow',\n", + "# 'high_inflow_cp',\n", + "# 'inward_returns',\n", + "# 'max_business_inflow',\n", + "# 'max_business_outflow',\n", + "# 'max_inflow',\n", + "# 'max_outflow',\n", + "# 'min_business_inflow',\n", + "# 'min_business_outflow',\n", + "# 'min_inflow',\n", + "# 'min_outflow',\n", + "# 'outward_returns'\n", + "]\n", + "\n", + "for col in categorical_cols:\n", + " oh = pd.get_dummies(df[col], prefix=col)\n", + " df = df.join(oh)\n", + "df = df.drop(columns=categorical_cols)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 38, "metadata": { "ExecuteTime": { - "end_time": "2018-03-28T21:44:01.613870Z", - "start_time": "2018-03-28T21:44:01.144643Z" + "end_time": "2018-03-30T01:25:59.374441Z", + "start_time": "2018-03-30T01:25:59.237261Z" } }, - "outputs": [], - "source": [ - "def _aggregate_columns(df, application_id_col):\n", - " # group by application id and merge all rows into lists\n", - " new_df = pd.DataFrame()\n", - " g = bank_df.groupby(application_id_col)\n", - " for k in list(df):\n", - " if k == application_id_col:\n", - " continue\n", - " new_df = pd.concat([new_df, g[k].apply(list)], axis=1)\n", - " return new_df.reset_index()\n", - "\n", - "\n", - "def setup_aggregations(df, application_id_col):\n", - " '''\n", - " fix bank data (for applications with multiple rows)\n", - " - average the averages\n", - " - add high_credit_cp\n", - " - add invard returns\n", - " - max of all the maxs\n", - " - min of all the mins\n", - " - add outward_returns\n", - " - drop totals (because average is better and normalized)\n", - " '''\n", - " df = df.drop(columns=['total_business_inflow', 'total_business_outflow', 'total_inflow', 'total_outflow'])\n", - " df = _aggregate_columns(df, application_id_col)\n", - " new_df = pd.DataFrame()\n", - " for k in list(df):\n", - " if k == application_id_col:\n", - " new_df = pd.concat([new_df, df[k]], axis=1)\n", - " elif 'average' in k:\n", - " new_df = pd.concat([new_df, df[k].apply(np.average)], axis=1)\n", - " elif 'max' in k:\n", - " new_df = pd.concat([new_df, df[k].apply(np.max)], axis=1)\n", - " elif 'min' in k:\n", - " new_df = pd.concat([new_df, df[k].apply(np.min)], axis=1)\n", - " else:\n", - " new_df = pd.concat([new_df, df[k].apply(np.sum)], axis=1)\n", - " return new_df\n", - "\n", - "bank_df = setup_aggregations(bank_df, 'application_id')" + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
amountapplication_idcompany_sizelast_fy_profitloan_createdage_of_firmaverage_business_inflowaverage_business_outflowaverage_inflowaverage_outflow...state_TELANGANAstate_UTTAR PRADESHstate_WEST BENGALstate_madhya pradeshutm_medium_Bannerutm_medium_ppcutm_source_Facebookutm_source_adwordsprivate_email_domain_Falseprivate_email_domain_True
0200000.01031NaNNaN02.01.647250e+051.640400e+051.647250e+051.647250e+05...0000000010
1300000.020565.0341068.004.09.122000e+049.162500e+049.122000e+049.122000e+04...0000010110
2500000.09047NaNNaN04.04.212600e+044.729200e+044.212600e+044.212600e+04...0000101010
3500000.020685.0NaN04.02.466550e+052.644910e+052.466550e+052.466550e+05...0000010110
4500000.027375.0NaN04.09.678400e+049.549800e+049.678400e+049.678400e+04...0000010110
5400000.031265.0NaN05.06.500300e+046.717000e+046.500300e+046.500300e+04...0000010110
6600000.010815.0319713.00NaN1.118550e+051.106320e+051.118550e+051.118550e+05...0000010110
7300000.051785.0NaN0NaN4.463800e+044.341500e+044.463800e+044.463800e+04...0000010110
8500000.010845.0NaN02.02.366100e+042.257600e+042.366100e+042.366100e+04...0000010110
91000000.06281NaNNaN03.02.279537e+052.259997e+052.279537e+052.279537e+05...0010101001
101000000.0109011.0NaN0NaN3.209850e+053.235200e+053.209850e+053.209850e+05...0000010110
11500000.0110310.0462252.01NaN9.351550e+049.289650e+049.351550e+049.351550e+04...0000010110
12300000.09059NaN350000.003.01.135960e+051.457707e+051.135960e+051.135960e+05...0000010110
13300000.08834NaN400000.006.02.527170e+052.526390e+052.527170e+052.527170e+05...0010010110
14500000.02137NaNNaN02.02.401870e+052.389370e+052.401870e+052.401870e+05...0000000010
15500000.052115.0242780.003.03.173480e+053.285140e+053.173480e+053.173480e+05...0000010110
16500000.09061NaN2434080.003.03.447220e+053.387640e+053.447220e+053.447220e+05...0000010110
171000000.041995.0NaN04.02.652950e+042.744150e+042.652950e+042.652950e+04...0000010110
18300000.031785.0NaN02.06.871790e+056.869310e+056.871790e+056.871790e+05...0000010110
191000000.0114311.0NaN03.01.220400e+041.120500e+041.220400e+041.220400e+04...0000010110
20500000.042285.0NaN03.05.794920e+055.788810e+055.794920e+055.794920e+05...0000010110
211000000.0115811.0NaN07.06.861212e+066.900142e+066.861212e+066.861212e+06...0000010101
22300000.0115910.0NaN05.01.824030e+051.840570e+051.824030e+051.824030e+05...0000010110
23300000.042335.0NaN0NaN1.062137e+051.062613e+051.062137e+051.062137e+05...0000010110
241000000.0526810.01230412.00NaN9.640510e+059.628270e+059.640510e+059.640510e+05...0000010110
25500000.032295.0NaN03.03.089860e+053.088210e+053.089860e+053.089860e+05...0000010110
26500000.042555.0360000.003.02.321530e+052.316730e+052.321530e+052.321530e+05...0010010110
27200000.05289NaN411644.009.07.781100e+046.834200e+047.781100e+047.781100e+04...0000101010
28500000.036205.0553752.004.01.617640e+051.562800e+051.617640e+051.617640e+05...0000010110
291000000.0222411.0351478.002.01.128120e+051.126340e+051.128120e+051.128120e+05...0000010110
..................................................................
2001000000.0498610.0NaN010.01.327260e+051.325810e+051.327260e+051.327260e+05...0000101010
201500000.0134310.0NaN04.01.357450e+051.357450e+051.357450e+051.357450e+05...0000010110
202500000.039645.0NaN09.05.548440e+055.545230e+055.548440e+055.548440e+05...0000010110
2031000000.018565.0707174.006.05.661350e+055.719570e+055.661350e+055.661350e+05...0000010110
2041000000.07327NaNNaN04.01.855910e+051.858975e+051.855910e+051.855910e+05...0000010110
205300000.0487211.0273416.00NaN6.601200e+048.239350e+046.601200e+046.601200e+04...0000010110
206300000.09107NaN503006.0019.08.720900e+048.780200e+048.786700e+048.786700e+04...0000101010
207500000.09109NaN295850.0011.02.908670e+053.349475e+053.334265e+053.334265e+05...0000000010
208300000.09113NaN400000.005.01.731960e+051.831990e+051.834930e+051.834930e+05...0000101010
209500000.0297210.0NaN03.01.102355e+058.094250e+041.102355e+051.102355e+05...0000010110
210500000.0502610.0NaN0NaN2.854230e+052.833280e+052.854230e+052.854230e+05...0000010110
211500000.019565.0286850.003.01.728560e+051.749340e+051.728560e+051.728560e+05...0000010110
2121000000.019625.0320000.005.07.370620e+057.390440e+057.370620e+057.370620e+05...0000010110
213500000.09133NaNNaN04.01.629150e+051.596030e+051.629150e+051.629150e+05...0000010110
214100000.0425311.0NaN03.01.360880e+051.379060e+051.360880e+051.360880e+05...0000010110
2151000000.040205.0NaN012.02.546000e+042.454400e+042.546000e+042.546000e+04...0000010110
216300000.08862NaNNaN04.03.918100e+044.155500e+043.918100e+043.918100e+04...0000101010
2171000000.03714NaNNaN05.01.077650e+051.117710e+051.077650e+051.077650e+05...0000000010
2181000000.052795.0850814.0010.04.411120e+054.409760e+054.411120e+054.411120e+05...0000010110
219500000.040305.0NaN0NaN3.233950e+043.529950e+043.233950e+043.233950e+04...0000010110
220500000.09035NaN290775.001.02.203285e+052.202840e+052.203285e+052.203285e+05...0000101001
221500000.09102NaN458032.0010.05.495500e+045.502200e+045.495500e+045.495500e+04...0000101010
2221000000.044315.0NaN0NaN1.248270e+051.267550e+051.248270e+051.248270e+05...0000010110
223300000.040625.0NaN05.01.847560e+051.874110e+051.847560e+051.847560e+05...0000010110
224300000.020205.0496000.005.08.160400e+047.916300e+048.160400e+048.160400e+04...0000010110
225500000.01000NaN293960.00NaN3.969100e+043.969100e+043.969100e+043.969100e+04...0000000010
2261000000.030535.0NaN03.01.427360e+051.427310e+051.427360e+051.427360e+05...0000010110
227300000.0510411.0NaN04.05.825630e+055.905210e+055.825630e+055.825630e+05...0000010110
2281000000.05762NaNNaN03.01.829410e+051.827940e+051.829410e+051.829410e+05...0000101010
229NaN8874NaNNaN0NaN4.799900e+044.799900e+044.799900e+044.799900e+04...0000101010
\n", + "

230 rows × 243 columns

\n", + "
" + ], + "text/plain": [ + " amount application_id company_size last_fy_profit loan_created \\\n", + "0 200000.0 1031 NaN NaN 0 \n", + "1 300000.0 2056 5.0 341068.0 0 \n", + "2 500000.0 9047 NaN NaN 0 \n", + "3 500000.0 2068 5.0 NaN 0 \n", + "4 500000.0 2737 5.0 NaN 0 \n", + "5 400000.0 3126 5.0 NaN 0 \n", + "6 600000.0 1081 5.0 319713.0 0 \n", + "7 300000.0 5178 5.0 NaN 0 \n", + "8 500000.0 1084 5.0 NaN 0 \n", + "9 1000000.0 6281 NaN NaN 0 \n", + "10 1000000.0 1090 11.0 NaN 0 \n", + "11 500000.0 1103 10.0 462252.0 1 \n", + "12 300000.0 9059 NaN 350000.0 0 \n", + "13 300000.0 8834 NaN 400000.0 0 \n", + "14 500000.0 2137 NaN NaN 0 \n", + "15 500000.0 5211 5.0 242780.0 0 \n", + "16 500000.0 9061 NaN 2434080.0 0 \n", + "17 1000000.0 4199 5.0 NaN 0 \n", + "18 300000.0 3178 5.0 NaN 0 \n", + "19 1000000.0 1143 11.0 NaN 0 \n", + "20 500000.0 4228 5.0 NaN 0 \n", + "21 1000000.0 1158 11.0 NaN 0 \n", + "22 300000.0 1159 10.0 NaN 0 \n", + "23 300000.0 4233 5.0 NaN 0 \n", + "24 1000000.0 5268 10.0 1230412.0 0 \n", + "25 500000.0 3229 5.0 NaN 0 \n", + "26 500000.0 4255 5.0 360000.0 0 \n", + "27 200000.0 5289 NaN 411644.0 0 \n", + "28 500000.0 3620 5.0 553752.0 0 \n", + "29 1000000.0 2224 11.0 351478.0 0 \n", + ".. ... ... ... ... ... \n", + "200 1000000.0 4986 10.0 NaN 0 \n", + "201 500000.0 1343 10.0 NaN 0 \n", + "202 500000.0 3964 5.0 NaN 0 \n", + "203 1000000.0 1856 5.0 707174.0 0 \n", + "204 1000000.0 7327 NaN NaN 0 \n", + "205 300000.0 4872 11.0 273416.0 0 \n", + "206 300000.0 9107 NaN 503006.0 0 \n", + "207 500000.0 9109 NaN 295850.0 0 \n", + "208 300000.0 9113 NaN 400000.0 0 \n", + "209 500000.0 2972 10.0 NaN 0 \n", + "210 500000.0 5026 10.0 NaN 0 \n", + "211 500000.0 1956 5.0 286850.0 0 \n", + "212 1000000.0 1962 5.0 320000.0 0 \n", + "213 500000.0 9133 NaN NaN 0 \n", + "214 100000.0 4253 11.0 NaN 0 \n", + "215 1000000.0 4020 5.0 NaN 0 \n", + "216 300000.0 8862 NaN NaN 0 \n", + "217 1000000.0 3714 NaN NaN 0 \n", + "218 1000000.0 5279 5.0 850814.0 0 \n", + "219 500000.0 4030 5.0 NaN 0 \n", + "220 500000.0 9035 NaN 290775.0 0 \n", + "221 500000.0 9102 NaN 458032.0 0 \n", + "222 1000000.0 4431 5.0 NaN 0 \n", + "223 300000.0 4062 5.0 NaN 0 \n", + "224 300000.0 2020 5.0 496000.0 0 \n", + "225 500000.0 1000 NaN 293960.0 0 \n", + "226 1000000.0 3053 5.0 NaN 0 \n", + "227 300000.0 5104 11.0 NaN 0 \n", + "228 1000000.0 5762 NaN NaN 0 \n", + "229 NaN 8874 NaN NaN 0 \n", + "\n", + " age_of_firm average_business_inflow average_business_outflow \\\n", + "0 2.0 1.647250e+05 1.640400e+05 \n", + "1 4.0 9.122000e+04 9.162500e+04 \n", + "2 4.0 4.212600e+04 4.729200e+04 \n", + "3 4.0 2.466550e+05 2.644910e+05 \n", + "4 4.0 9.678400e+04 9.549800e+04 \n", + "5 5.0 6.500300e+04 6.717000e+04 \n", + "6 NaN 1.118550e+05 1.106320e+05 \n", + "7 NaN 4.463800e+04 4.341500e+04 \n", + "8 2.0 2.366100e+04 2.257600e+04 \n", + "9 3.0 2.279537e+05 2.259997e+05 \n", + "10 NaN 3.209850e+05 3.235200e+05 \n", + "11 NaN 9.351550e+04 9.289650e+04 \n", + "12 3.0 1.135960e+05 1.457707e+05 \n", + "13 6.0 2.527170e+05 2.526390e+05 \n", + "14 2.0 2.401870e+05 2.389370e+05 \n", + "15 3.0 3.173480e+05 3.285140e+05 \n", + "16 3.0 3.447220e+05 3.387640e+05 \n", + "17 4.0 2.652950e+04 2.744150e+04 \n", + "18 2.0 6.871790e+05 6.869310e+05 \n", + "19 3.0 1.220400e+04 1.120500e+04 \n", + "20 3.0 5.794920e+05 5.788810e+05 \n", + "21 7.0 6.861212e+06 6.900142e+06 \n", + "22 5.0 1.824030e+05 1.840570e+05 \n", + "23 NaN 1.062137e+05 1.062613e+05 \n", + "24 NaN 9.640510e+05 9.628270e+05 \n", + "25 3.0 3.089860e+05 3.088210e+05 \n", + "26 3.0 2.321530e+05 2.316730e+05 \n", + "27 9.0 7.781100e+04 6.834200e+04 \n", + "28 4.0 1.617640e+05 1.562800e+05 \n", + "29 2.0 1.128120e+05 1.126340e+05 \n", + ".. ... ... ... \n", + "200 10.0 1.327260e+05 1.325810e+05 \n", + "201 4.0 1.357450e+05 1.357450e+05 \n", + "202 9.0 5.548440e+05 5.545230e+05 \n", + "203 6.0 5.661350e+05 5.719570e+05 \n", + "204 4.0 1.855910e+05 1.858975e+05 \n", + "205 NaN 6.601200e+04 8.239350e+04 \n", + "206 19.0 8.720900e+04 8.780200e+04 \n", + "207 11.0 2.908670e+05 3.349475e+05 \n", + "208 5.0 1.731960e+05 1.831990e+05 \n", + "209 3.0 1.102355e+05 8.094250e+04 \n", + "210 NaN 2.854230e+05 2.833280e+05 \n", + "211 3.0 1.728560e+05 1.749340e+05 \n", + "212 5.0 7.370620e+05 7.390440e+05 \n", + "213 4.0 1.629150e+05 1.596030e+05 \n", + "214 3.0 1.360880e+05 1.379060e+05 \n", + "215 12.0 2.546000e+04 2.454400e+04 \n", + "216 4.0 3.918100e+04 4.155500e+04 \n", + "217 5.0 1.077650e+05 1.117710e+05 \n", + "218 10.0 4.411120e+05 4.409760e+05 \n", + "219 NaN 3.233950e+04 3.529950e+04 \n", + "220 1.0 2.203285e+05 2.202840e+05 \n", + "221 10.0 5.495500e+04 5.502200e+04 \n", + "222 NaN 1.248270e+05 1.267550e+05 \n", + "223 5.0 1.847560e+05 1.874110e+05 \n", + "224 5.0 8.160400e+04 7.916300e+04 \n", + "225 NaN 3.969100e+04 3.969100e+04 \n", + "226 3.0 1.427360e+05 1.427310e+05 \n", + "227 4.0 5.825630e+05 5.905210e+05 \n", + "228 3.0 1.829410e+05 1.827940e+05 \n", + "229 NaN 4.799900e+04 4.799900e+04 \n", + "\n", + " average_inflow average_outflow ... \\\n", + "0 1.647250e+05 1.647250e+05 ... \n", + "1 9.122000e+04 9.122000e+04 ... \n", + "2 4.212600e+04 4.212600e+04 ... \n", + "3 2.466550e+05 2.466550e+05 ... \n", + "4 9.678400e+04 9.678400e+04 ... \n", + "5 6.500300e+04 6.500300e+04 ... \n", + "6 1.118550e+05 1.118550e+05 ... \n", + "7 4.463800e+04 4.463800e+04 ... \n", + "8 2.366100e+04 2.366100e+04 ... \n", + "9 2.279537e+05 2.279537e+05 ... \n", + "10 3.209850e+05 3.209850e+05 ... \n", + "11 9.351550e+04 9.351550e+04 ... \n", + "12 1.135960e+05 1.135960e+05 ... \n", + "13 2.527170e+05 2.527170e+05 ... \n", + "14 2.401870e+05 2.401870e+05 ... \n", + "15 3.173480e+05 3.173480e+05 ... \n", + "16 3.447220e+05 3.447220e+05 ... \n", + "17 2.652950e+04 2.652950e+04 ... \n", + "18 6.871790e+05 6.871790e+05 ... \n", + "19 1.220400e+04 1.220400e+04 ... \n", + "20 5.794920e+05 5.794920e+05 ... \n", + "21 6.861212e+06 6.861212e+06 ... \n", + "22 1.824030e+05 1.824030e+05 ... \n", + "23 1.062137e+05 1.062137e+05 ... \n", + "24 9.640510e+05 9.640510e+05 ... \n", + "25 3.089860e+05 3.089860e+05 ... \n", + "26 2.321530e+05 2.321530e+05 ... \n", + "27 7.781100e+04 7.781100e+04 ... \n", + "28 1.617640e+05 1.617640e+05 ... \n", + "29 1.128120e+05 1.128120e+05 ... \n", + ".. ... ... ... \n", + "200 1.327260e+05 1.327260e+05 ... \n", + "201 1.357450e+05 1.357450e+05 ... \n", + "202 5.548440e+05 5.548440e+05 ... \n", + "203 5.661350e+05 5.661350e+05 ... \n", + "204 1.855910e+05 1.855910e+05 ... \n", + "205 6.601200e+04 6.601200e+04 ... \n", + "206 8.786700e+04 8.786700e+04 ... \n", + "207 3.334265e+05 3.334265e+05 ... \n", + "208 1.834930e+05 1.834930e+05 ... \n", + "209 1.102355e+05 1.102355e+05 ... \n", + "210 2.854230e+05 2.854230e+05 ... \n", + "211 1.728560e+05 1.728560e+05 ... \n", + "212 7.370620e+05 7.370620e+05 ... \n", + "213 1.629150e+05 1.629150e+05 ... \n", + "214 1.360880e+05 1.360880e+05 ... \n", + "215 2.546000e+04 2.546000e+04 ... \n", + "216 3.918100e+04 3.918100e+04 ... \n", + "217 1.077650e+05 1.077650e+05 ... \n", + "218 4.411120e+05 4.411120e+05 ... \n", + "219 3.233950e+04 3.233950e+04 ... \n", + "220 2.203285e+05 2.203285e+05 ... \n", + "221 5.495500e+04 5.495500e+04 ... \n", + "222 1.248270e+05 1.248270e+05 ... \n", + "223 1.847560e+05 1.847560e+05 ... \n", + "224 8.160400e+04 8.160400e+04 ... \n", + "225 3.969100e+04 3.969100e+04 ... \n", + "226 1.427360e+05 1.427360e+05 ... \n", + "227 5.825630e+05 5.825630e+05 ... \n", + "228 1.829410e+05 1.829410e+05 ... \n", + "229 4.799900e+04 4.799900e+04 ... \n", + "\n", + " state_TELANGANA state_UTTAR PRADESH state_WEST BENGAL \\\n", + "0 0 0 0 \n", + "1 0 0 0 \n", + "2 0 0 0 \n", + "3 0 0 0 \n", + "4 0 0 0 \n", + "5 0 0 0 \n", + "6 0 0 0 \n", + "7 0 0 0 \n", + "8 0 0 0 \n", + "9 0 0 1 \n", + "10 0 0 0 \n", + "11 0 0 0 \n", + "12 0 0 0 \n", + "13 0 0 1 \n", + "14 0 0 0 \n", + "15 0 0 0 \n", + "16 0 0 0 \n", + "17 0 0 0 \n", + "18 0 0 0 \n", + "19 0 0 0 \n", + "20 0 0 0 \n", + "21 0 0 0 \n", + "22 0 0 0 \n", + "23 0 0 0 \n", + "24 0 0 0 \n", + "25 0 0 0 \n", + "26 0 0 1 \n", + "27 0 0 0 \n", + "28 0 0 0 \n", + "29 0 0 0 \n", + ".. ... ... ... \n", + "200 0 0 0 \n", + "201 0 0 0 \n", + "202 0 0 0 \n", + "203 0 0 0 \n", + "204 0 0 0 \n", + "205 0 0 0 \n", + "206 0 0 0 \n", + "207 0 0 0 \n", + "208 0 0 0 \n", + "209 0 0 0 \n", + "210 0 0 0 \n", + "211 0 0 0 \n", + "212 0 0 0 \n", + "213 0 0 0 \n", + "214 0 0 0 \n", + "215 0 0 0 \n", + "216 0 0 0 \n", + "217 0 0 0 \n", + "218 0 0 0 \n", + "219 0 0 0 \n", + "220 0 0 0 \n", + "221 0 0 0 \n", + "222 0 0 0 \n", + "223 0 0 0 \n", + "224 0 0 0 \n", + "225 0 0 0 \n", + "226 0 0 0 \n", + "227 0 0 0 \n", + "228 0 0 0 \n", + "229 0 0 0 \n", + "\n", + " state_madhya pradesh utm_medium_Banner utm_medium_ppc \\\n", + "0 0 0 0 \n", + "1 0 0 1 \n", + "2 0 1 0 \n", + "3 0 0 1 \n", + "4 0 0 1 \n", + "5 0 0 1 \n", + "6 0 0 1 \n", + "7 0 0 1 \n", + "8 0 0 1 \n", + "9 0 1 0 \n", + "10 0 0 1 \n", + "11 0 0 1 \n", + "12 0 0 1 \n", + "13 0 0 1 \n", + "14 0 0 0 \n", + "15 0 0 1 \n", + "16 0 0 1 \n", + "17 0 0 1 \n", + "18 0 0 1 \n", + "19 0 0 1 \n", + "20 0 0 1 \n", + "21 0 0 1 \n", + "22 0 0 1 \n", + "23 0 0 1 \n", + "24 0 0 1 \n", + "25 0 0 1 \n", + "26 0 0 1 \n", + "27 0 1 0 \n", + "28 0 0 1 \n", + "29 0 0 1 \n", + ".. ... ... ... \n", + "200 0 1 0 \n", + "201 0 0 1 \n", + "202 0 0 1 \n", + "203 0 0 1 \n", + "204 0 0 1 \n", + "205 0 0 1 \n", + "206 0 1 0 \n", + "207 0 0 0 \n", + "208 0 1 0 \n", + "209 0 0 1 \n", + "210 0 0 1 \n", + "211 0 0 1 \n", + "212 0 0 1 \n", + "213 0 0 1 \n", + "214 0 0 1 \n", + "215 0 0 1 \n", + "216 0 1 0 \n", + "217 0 0 0 \n", + "218 0 0 1 \n", + "219 0 0 1 \n", + "220 0 1 0 \n", + "221 0 1 0 \n", + "222 0 0 1 \n", + "223 0 0 1 \n", + "224 0 0 1 \n", + "225 0 0 0 \n", + "226 0 0 1 \n", + "227 0 0 1 \n", + "228 0 1 0 \n", + "229 0 1 0 \n", + "\n", + " utm_source_Facebook utm_source_adwords private_email_domain_False \\\n", + "0 0 0 1 \n", + "1 0 1 1 \n", + "2 1 0 1 \n", + "3 0 1 1 \n", + "4 0 1 1 \n", + "5 0 1 1 \n", + "6 0 1 1 \n", + "7 0 1 1 \n", + "8 0 1 1 \n", + "9 1 0 0 \n", + "10 0 1 1 \n", + "11 0 1 1 \n", + "12 0 1 1 \n", + "13 0 1 1 \n", + "14 0 0 1 \n", + "15 0 1 1 \n", + "16 0 1 1 \n", + "17 0 1 1 \n", + "18 0 1 1 \n", + "19 0 1 1 \n", + "20 0 1 1 \n", + "21 0 1 0 \n", + "22 0 1 1 \n", + "23 0 1 1 \n", + "24 0 1 1 \n", + "25 0 1 1 \n", + "26 0 1 1 \n", + "27 1 0 1 \n", + "28 0 1 1 \n", + "29 0 1 1 \n", + ".. ... ... ... \n", + "200 1 0 1 \n", + "201 0 1 1 \n", + "202 0 1 1 \n", + "203 0 1 1 \n", + "204 0 1 1 \n", + "205 0 1 1 \n", + "206 1 0 1 \n", + "207 0 0 1 \n", + "208 1 0 1 \n", + "209 0 1 1 \n", + "210 0 1 1 \n", + "211 0 1 1 \n", + "212 0 1 1 \n", + "213 0 1 1 \n", + "214 0 1 1 \n", + "215 0 1 1 \n", + "216 1 0 1 \n", + "217 0 0 1 \n", + "218 0 1 1 \n", + "219 0 1 1 \n", + "220 1 0 0 \n", + "221 1 0 1 \n", + "222 0 1 1 \n", + "223 0 1 1 \n", + "224 0 1 1 \n", + "225 0 0 1 \n", + "226 0 1 1 \n", + "227 0 1 1 \n", + "228 1 0 1 \n", + "229 1 0 1 \n", + "\n", + " private_email_domain_True \n", + "0 0 \n", + "1 0 \n", + "2 0 \n", + "3 0 \n", + "4 0 \n", + "5 0 \n", + "6 0 \n", + "7 0 \n", + "8 0 \n", + "9 1 \n", + "10 0 \n", + "11 0 \n", + "12 0 \n", + "13 0 \n", + "14 0 \n", + "15 0 \n", + "16 0 \n", + "17 0 \n", + "18 0 \n", + "19 0 \n", + "20 0 \n", + "21 1 \n", + "22 0 \n", + "23 0 \n", + "24 0 \n", + "25 0 \n", + "26 0 \n", + "27 0 \n", + "28 0 \n", + "29 0 \n", + ".. ... \n", + "200 0 \n", + "201 0 \n", + "202 0 \n", + "203 0 \n", + "204 0 \n", + "205 0 \n", + "206 0 \n", + "207 0 \n", + "208 0 \n", + "209 0 \n", + "210 0 \n", + "211 0 \n", + "212 0 \n", + "213 0 \n", + "214 0 \n", + "215 0 \n", + "216 0 \n", + "217 0 \n", + "218 0 \n", + "219 0 \n", + "220 1 \n", + "221 0 \n", + "222 0 \n", + "223 0 \n", + "224 0 \n", + "225 0 \n", + "226 0 \n", + "227 0 \n", + "228 0 \n", + "229 0 \n", + "\n", + "[230 rows x 243 columns]" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 39, "metadata": { "ExecuteTime": { - "end_time": "2018-03-28T21:44:06.462793Z", - "start_time": "2018-03-28T21:44:06.428801Z" + "end_time": "2018-03-30T01:25:59.430729Z", + "start_time": "2018-03-30T01:25:59.378475Z" } }, "outputs": [], "source": [ - "bank_df.head()" + "df.to_csv(TRAIN_DATA, index=False)" ] }, {