diff --git a/Data Cleanup.ipynb b/Data Cleanup.ipynb new file mode 100644 index 0000000..2cbfed2 --- /dev/null +++ b/Data Cleanup.ipynb @@ -0,0 +1,581 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "ExecuteTime": { + "end_time": "2018-03-29T02:29:10.637998Z", + "start_time": "2018-03-29T02:29:08.722805Z" + } + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "DUMMY_DATA_PATH = 'dataset/dummy/'\n", + "DUMMY_BANK_DATA = DUMMY_DATA_PATH+'BSA.csv'\n", + "DUMMY_MAIN_DATA = DUMMY_DATA_PATH+'data.csv'" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2018-03-29T02:29:11.492651Z", + "start_time": "2018-03-29T02:29:11.354573Z" + } + }, + "outputs": [], + "source": [ + "main_df = pd.read_csv(DUMMY_MAIN_DATA)\n", + "bank_df = pd.read_csv(DUMMY_BANK_DATA)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "ExecuteTime": { + "end_time": "2018-03-29T02:29:11.997304Z", + "start_time": "2018-03-29T02:29:11.911392Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
[garbled HTML rendering of main_df.head() (5 rows × 25 columns) removed; the same data appears in the text/plain output below]
" + ], + "text/plain": [ + " ads_matchtype ads_network amount application_id birthdate \\\n", + "0 NaN NaN NaN 1024.0 NaN \n", + "1 NaN NaN 300000.0 716.0 23/10/1982 \n", + "2 NaN NaN 200000.0 1031.0 08/09/1987 \n", + "3 e {google_search} 300000.0 2056.0 02/04/1982 \n", + "4 NaN NaN 500000.0 9047.0 13/04/1979 \n", + "\n", + " browser campaign_city city company_size \\\n", + "0 Opera NaN NaN NaN \n", + "1 Chrome NaN NaN NaN \n", + "2 Chrome NaN NaN NaN \n", + "3 Chrome Pune Mumbai 5.0 \n", + "4 Chrome NaN Mumbai NaN \n", + "\n", + " email ... \\\n", + "0 Kaif1779@gmail.com ... \n", + "1 vihanmarketing36@gmail.com ... \n", + "2 faijiyatoursandtravels@gmail.com ... \n", + "3 sagarnk2008@gmail.com ... \n", + "4 natrajmoily@gmail.com ... \n", + "\n", + " network platform \\\n", + "0 Opera Software Americas LLC mobile:Pike v8.0 release 461 \n", + "1 Idea Cellular Limited Win32 \n", + "2 Idea Cellular Limited mobile:Linux armv8l \n", + "3 Reliance Jio Infocomm Limited mobile:Linux aarch64 \n", + "4 Syscon Infoway Pvt. Ltd. mobile:Linux armv8l \n", + "\n", + " registered_office_city registered_office_state role_in_firm \\\n", + "0 NaN NaN NaN \n", + "1 Gondia MAHARASHTRA 1.0 \n", + "2 PUNE MAHARASHTRA 1.0 \n", + "3 Pune MAHARASHTRA 1.0 \n", + "4 THANE MAHARASHTRA 1.0 \n", + "\n", + " role_on_application state utm_medium utm_source \\\n", + "0 0 NaN NaN NaN \n", + "1 4 MAHARASHTRA NaN NaN \n", + "2 4 MAHARASHTRA NaN NaN \n", + "3 4 KARNATAKA ppc adwords \n", + "4 4 MAHARASHTRA Banner Facebook \n", + "\n", + " year_of_incorporation \n", + "0 NaN \n", + "1 2014.0 \n", + "2 2016.0 \n", + "3 2014.0 \n", + "4 2014.0 \n", + "\n", + "[5 rows x 25 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "to_remove_cols = [\n", + " 'address', # textual, and hence not helpful\n", + " 'ads_cmpid', # unique key\n", + " 'ads_creative', # unique key\n", + "# 'ads_matchtype',\n", + "# 'ads_network',\n", + " 'ads_targetid', # unique key\n", + "# 'amount',\n", + "# 'application_id',\n", + " 'birthdate', # use firm age (better fit to the model)\n", + "# 'browser',\n", + "# 'campaign_city',\n", + "# 'city',\n", + "# 'company_size',\n", + " 'country', # all are india as-is, so no point keeping it..\n", + " 'created_date', # has no correlation on the model.. \n", + "# 'email',\n", + " 'firm_name', # each name is potentially unique, also string so can't do much\n", + " 'firm_pan', # unique for each company, NOTE: there is a pattern than can be used to extract features!! (TODO)\n", + "# 'firm_type',\n", + "# 'gender',\n", + "# 'industry',\n", + " 'ip', # can we do something about IPs? (TODO)\n", + "# 'last_fy_profit',\n", + " 'latitude', # IDK how helpful coordinates are.. We have the city and states; and so we can use that..\n", + "# 'loan_created', # THIS IS OUR TARGET! THIS IS SKEWED, SO A NAIVE IMPL WILL ALSO HAVE 96% ACCURACY :D \n", + " 'longitude', # see comment for 'latitude'\n", + " 'name', # see comment for firm name\n", + "# 'network',\n", + " 'pan', # unique for each individual, NOTE: there is a pattern than can be used to extract features!! (TODO)\n", + " 'pincode',\n", + "# 'platform',\n", + "# 'registered_office_city',\n", + "# 'registered_office_state',\n", + "# 'role_in_firm',\n", + "# 'role_on_application',\n", + " 'seo_city', # only 1 value.. \n", + "# 'state',\n", + "# 'utm_medium',\n", + "# 'utm_source',\n", + " 'utm_term', # idk what to do with this data.. 
¯\\_(ツ)_/¯\n", + "# 'year_of_incorporation',\n", + "]\n", + "\n", + "main_df = main_df.drop(columns=to_remove_cols)\n", + "main_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "ExecuteTime": { + "end_time": "2018-03-29T02:33:17.414224Z", + "start_time": "2018-03-29T02:33:17.363625Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "432 25\n" + ] + } + ], + "source": [ + "'''\n", + "Things to clean in main dataframe:\n", + "- [x] amount: make 0 amounts as NaNs\n", + "- [x] birthdate: calculate age of the person (do we need this if we have the age of firm?) --> removed col: using firm age\n", + "- [ ] browser: some really low counts\n", + "- [ ] campaign_city: some really low counts\n", + "- [ ] city: some really low counts\n", + "- [ ] email: publicly hosted email domain or personal email domain\n", + "- [ ] firm_type: is skewed (need to figure things out..)\n", + "- [ ] last_fy_profit: convert 0 to NaNs\n", + "- [ ] platform: combine all the 'mobile:Linux'?\n", + "- [ ] registered_office_city: combine same values (cases are different hence are treated as separate values)\n", + "- [ ] role_in_firm: categorical; so don't use the numbers as is..\n", + "- [ ] role_on_application: categorical; so don't use the numbers as is..\n", + "- [ ] year_of_incorporation: -> compute age of firm\n", + "'''\n", + "print(len(main_df), len(list(main_df)))\n", + "\n", + "main_df.loc[main_df['amount'] == 0, 'amount'] = np.NAN" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "ExecuteTime": { + "end_time": "2018-03-29T02:33:33.797848Z", + "start_time": "2018-03-29T02:33:33.772428Z" + } + }, + "outputs": [ + { + "data": { + "text/html": [ + "
[garbled HTML rendering of an empty DataFrame (0 rows × 25 columns) removed; see the text/plain output below]
" ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [ads_matchtype, ads_network, amount, application_id, birthdate, browser, campaign_city, city, company_size, email, firm_type, gender, industry, last_fy_profit, loan_created, network, platform, registered_office_city, registered_office_state, role_in_firm, role_on_application, state, utm_medium, utm_source, year_of_incorporation]\n", + "Index: []\n", + "\n", + "[0 rows x 25 columns]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2018-03-28T21:41:02.961905Z", + "start_time": "2018-03-28T21:41:02.944388Z" + }, + "scrolled": true + }, + "outputs": [], + "source": [ + "# list(main_df)\n", + "main_df['year_of_incorporation'].sort_values().value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2018-03-28T21:44:00.401287Z", + "start_time": "2018-03-28T21:44:00.382401Z" + } + }, + "outputs": [], + "source": [ + "# only keep applications that appear in both data sets\n", + "appln_id = pd.Series(list(set(main_df['application_id']) & set(bank_df['application_id'])))\n", + "main_df = main_df.loc[main_df['application_id'].isin(appln_id)]\n", + "bank_df = bank_df.loc[bank_df['application_id'].isin(appln_id)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2018-03-28T21:44:00.579850Z", + "start_time": "2018-03-28T21:44:00.566154Z" + } + }, + "outputs": [], + "source": [ + "list(bank_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": "2018-03-28T21:44:01.613870Z", + "start_time": "2018-03-28T21:44:01.144643Z" + } + }, + "outputs": [], + "source": [ + "def _aggregate_columns(df, application_id_col):\n", + "    # group by application id and merge all rows into lists\n", + "    new_df = pd.DataFrame()\n", + "    g = df.groupby(application_id_col)\n", + "    for k in list(df):\n", + "        if k == application_id_col:\n", + "            continue\n", + "        new_df = pd.concat([new_df, g[k].apply(list)], axis=1)\n", + "    return new_df.reset_index()\n", + "\n", + "\n", + "def setup_aggregations(df, application_id_col):\n", + "    '''\n", + "    fix bank data (for applications with multiple rows)\n", + "    - average the averages\n", + "    - add high_credit_cp\n", + "    - add inward returns\n", + "    - max of all the maxes\n", + "    - min of all the mins\n", + "    - add outward_returns\n", + "    - drop totals (because average is better and normalized)\n", + "    '''\n", + "    df = df.drop(columns=['total_business_inflow', 'total_business_outflow', 'total_inflow', 'total_outflow'])\n", + "    df = _aggregate_columns(df, application_id_col)\n", + "    new_df = pd.DataFrame()\n", + "    for k in list(df):\n", + "        if k == application_id_col:\n", + "            new_df = pd.concat([new_df, df[k]], axis=1)\n", + "        elif 'average' in k:\n", + "            new_df = pd.concat([new_df, df[k].apply(np.average)], axis=1)\n", + "        elif 'max' in k:\n", + "            new_df = pd.concat([new_df, df[k].apply(np.max)], axis=1)\n", + "        elif 'min' in k:\n", + "            new_df = pd.concat([new_df, df[k].apply(np.min)], axis=1)\n", + "        else:\n", + "            new_df = pd.concat([new_df, df[k].apply(np.sum)], axis=1)\n", + "    return new_df\n", + "\n", + "bank_df = setup_aggregations(bank_df, 'application_id')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "ExecuteTime": { + "end_time": 
"2018-03-28T21:44:06.462793Z", + "start_time": "2018-03-28T21:44:06.428801Z" + } + }, + "outputs": [], + "source": [ + "bank_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}