diff --git a/SVM test.ipynb b/SVM test.ipynb new file mode 100644 index 0000000..4a5b235 --- /dev/null +++ b/SVM test.ipynb @@ -0,0 +1,1331 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-19T20:41:13.476646Z", + "start_time": "2018-04-19T20:41:03.019958Z" + } + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "from sklearn.svm import SVC\n", + "from sklearn.decomposition import PCA\n", + "from sklearn.preprocessing import Imputer\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.model_selection import cross_val_score\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "DATA = 'dataset/loan_one_hot_encoded.csv'" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-19T20:41:13.770603Z", + "start_time": "2018-04-19T20:41:13.480032Z" + } + }, + "outputs": [], + "source": [ + "drop_cols = ['loan_created', 'application_id',\n", + "# 'firm_type_Proprietorship',\n", + " 'average_business_inflow'\n", + " ]\n", + "df = pd.read_csv(DATA)\n", + "Y = df['loan_created']\n", + "og_X = df.drop(drop_cols, axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Things to-do:\n", + "\n", + "- [ ] The data is missing values; use Imputer to fill mean/median of the column\n", + " - [ ] Create another column to denote whether the data was imputed or not; I've read that it seems to have better results\n", + "- [ ] Set class_weights in SVM\n", + "- [ ] Tune hyperparameters\n", + "- [ ] Specific kernel? 
\n", + "\n", + "What to do about skewed data:\n", + "- Treat it as an anomaly detection problem?\n", + "- Use class weights (for SVM)\n", + "- Remove some majority-class training data, i.e. undersample (less data anyway..:( )\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-19T20:41:14.741139Z", + "start_time": "2018-04-19T20:41:13.773038Z" + } + }, + "outputs": [], + "source": [ + "imp = Imputer()\n", + "imputed_X = imp.fit_transform(og_X)\n", + "\n", + "# X = imputed_X\n", + "scl = StandardScaler()\n", + "X = scl.fit_transform(imputed_X)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-19T20:41:14.752488Z", + "start_time": "2018-04-19T20:41:14.744609Z" + } + }, + "outputs": [], + "source": [ + "# pd.value_counts(og_X['firm_type_Proprietorship'])" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-19T20:41:14.776906Z", + "start_time": "2018-04-19T20:41:14.756636Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(230, 240)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-19T20:41:14.811622Z", + "start_time": "2018-04-19T20:41:14.780138Z" + } + }, + "outputs": [], + "source": [ + "reverse_Y = Y.apply(lambda x: 0 if x == 1 else 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-20T00:16:50.047193Z", + "start_time": "2018-04-20T00:16:50.038098Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(array([0, 1]), array([221, 9]))" + ] + }, + "execution_count": 83, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.unique(Y, return_counts=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, 
+ "source": [ + "### ensembles" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-20T06:46:16.056913Z", + "start_time": "2018-04-20T06:46:16.052278Z" + } + }, + "outputs": [], + "source": [ + "from sklearn.ensemble import BaggingClassifier\n", + "from sklearn.ensemble import AdaBoostClassifier" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-20T06:47:56.577213Z", + "start_time": "2018-04-20T06:47:56.500595Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])" + ] + }, + "execution_count": 95, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clf = SVC(**{'C': 10, 'class_weight': 'balanced', 'degree': 3, 'gamma': 'auto', 'kernel': 'sigmoid', 'probability': True})\n", + "ens_clf = BaggingClassifier(clf)\n", + "X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)\n", + "ens_clf.fit(X_train, y_train)\n", + "ens_clf.predict(X_test)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-20T06:48:07.149126Z", + "start_time": "2018-04-20T06:48:04.876784Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])" + ] + }, + "execution_count": 96, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clf = SVC(**{'C': 10, 'class_weight': 'balanced', 'degree': 
3, 'gamma': 'auto', 'kernel': 'sigmoid', 'probability': True})\n", + "ens_clf = AdaBoostClassifier(clf)\n", + "X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)\n", + "ens_clf.fit(X_train, y_train)\n", + "ens_clf.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### anomaly detection" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-19T20:48:31.710508Z", + "start_time": "2018-04-19T20:48:31.706267Z" + } + }, + "outputs": [], + "source": [ + "from sklearn.svm import OneClassSVM\n", + "from sklearn.metrics import accuracy_score\n", + "from sklearn.metrics import make_scorer" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-19T21:09:10.735686Z", + "start_time": "2018-04-19T21:09:10.656810Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0.51282051 0.43589744 0.71052632 0.39473684 0.55263158 0.34210526]\n", + "0.4914529914529915\n" + ] + } + ], + "source": [ + "ad_clf = OneClassSVM(kernel=\"rbf\")\n", + "scores = cross_val_score(ad_clf, X, [_ if _ == 1 else -1 for _ in Y], cv=k_fold, scoring=make_scorer(accuracy_score))\n", + "print(scores)\n", + "print(np.average(scores))" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-19T21:09:25.083572Z", + "start_time": "2018-04-19T21:09:25.064562Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.6578947368421053\n", + "[ 1 -1 -1 -1 -1 -1 -1 1 -1 1 -1 1 -1 -1 1 -1 -1 -1 1 -1 -1 -1 -1 -1\n", + " -1 -1 1 -1 -1 -1 -1 -1 1 1 -1 -1 -1 1 1 -1 1 -1 
1 -1 -1 1 -1 -1\n", + " -1 -1 -1 -1 1 -1 1 1 1 -1 1 -1 1 -1 -1 1 -1 -1 1 -1 -1 1 -1 -1\n", + " 1 -1 -1 1]\n" + ] + } + ], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)\n", + "ad_clf = OneClassSVM(nu=0.7)\n", + "ad_clf.fit(X_train, y_train)\n", + "y_predict = ad_clf.predict(X_test)\n", + "print(accuracy_score([x if x == 1 else -1 for x in y_test], y_predict))\n", + "print(y_predict)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-19T21:21:37.819707Z", + "start_time": "2018-04-19T21:21:37.814866Z" + } + }, + "outputs": [], + "source": [ + "from sklearn.model_selection import GridSearchCV" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-19T21:43:03.210818Z", + "start_time": "2018-04-19T21:43:02.417814Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "GridSearchCV(cv=5, error_score='raise',\n", + " estimator=OneClassSVM(cache_size=200, coef0=0.0, degree=3, gamma='auto', kernel='rbf',\n", + " max_iter=-1, nu=0.5, random_state=None, shrinking=True, tol=0.001,\n", + " verbose=False),\n", + " fit_params=None, iid=True, n_jobs=1,\n", + " param_grid=[{'nu': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]), 'gamma': ['auto'], 'kernel': ['rbf']}],\n", + " pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',\n", + " scoring=make_scorer(accuracy_score), verbose=0)" + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "param_grid = [\n", + " {'nu': np.arange(.1, 1.0, 0.1), 'gamma': ['auto'], 'kernel': ['rbf']},\n", + " ]\n", + "gs_cv = GridSearchCV(OneClassSVM(), param_grid=param_grid, scoring=make_scorer(accuracy_score), cv=5, refit=True)\n", + "gs_cv.fit(X, [_ if _ == 1 else -1 for _ 
in Y])" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-19T21:43:03.283239Z", + "start_time": "2018-04-19T21:43:03.217429Z" + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/sahil/anaconda3/lib/python3.6/site-packages/sklearn/utils/deprecation.py:122: FutureWarning: You are accessing a training score ('mean_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True\n", + " warnings.warn(*warn_args, **warn_kwargs)\n", + "/Users/sahil/anaconda3/lib/python3.6/site-packages/sklearn/utils/deprecation.py:122: FutureWarning: You are accessing a training score ('split0_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True\n", + " warnings.warn(*warn_args, **warn_kwargs)\n", + "/Users/sahil/anaconda3/lib/python3.6/site-packages/sklearn/utils/deprecation.py:122: FutureWarning: You are accessing a training score ('split1_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True\n", + " warnings.warn(*warn_args, **warn_kwargs)\n", + "/Users/sahil/anaconda3/lib/python3.6/site-packages/sklearn/utils/deprecation.py:122: FutureWarning: You are accessing a training score ('split2_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True\n", + " warnings.warn(*warn_args, **warn_kwargs)\n", + "/Users/sahil/anaconda3/lib/python3.6/site-packages/sklearn/utils/deprecation.py:122: FutureWarning: You are accessing a training score ('split3_train_score'), which will not be available by default any more in 0.21. 
If you need training scores, please set return_train_score=True\n", + " warnings.warn(*warn_args, **warn_kwargs)\n", + "/Users/sahil/anaconda3/lib/python3.6/site-packages/sklearn/utils/deprecation.py:122: FutureWarning: You are accessing a training score ('split4_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True\n", + " warnings.warn(*warn_args, **warn_kwargs)\n", + "/Users/sahil/anaconda3/lib/python3.6/site-packages/sklearn/utils/deprecation.py:122: FutureWarning: You are accessing a training score ('std_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True\n", + " warnings.warn(*warn_args, **warn_kwargs)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
mean_fit_timemean_score_timemean_test_scoremean_train_scoreparam_gammaparam_kernelparam_nuparamsrank_test_scoresplit0_test_score...split2_test_scoresplit2_train_scoresplit3_test_scoresplit3_train_scoresplit4_test_scoresplit4_train_scorestd_fit_timestd_score_timestd_test_scorestd_train_score
00.0056700.0014630.3565220.215217autorbf0.1{'gamma': 'auto', 'kernel': 'rbf', 'nu': 0.1}90.326087...0.5217390.2119570.3260870.2173910.2826090.1847830.0004230.0002020.0843070.017728
10.0059840.0014300.3652170.220652autorbf0.2{'gamma': 'auto', 'kernel': 'rbf', 'nu': 0.2}80.326087...0.5652170.2391300.3260870.2717390.2826090.1739130.0006360.0002070.1014080.033958
20.0065450.0012720.3956520.317391autorbf0.3{'gamma': 'auto', 'kernel': 'rbf', 'nu': 0.300...70.326087...0.5652170.3315220.3695650.2989130.2826090.3206520.0008860.0001290.0985720.015975
30.0076760.0015840.4304350.389130autorbf0.4{'gamma': 'auto', 'kernel': 'rbf', 'nu': 0.4}60.369565...0.5869570.3695650.4130430.3858700.3043480.4076090.0005990.0001950.0966350.012676
40.0083410.0019640.4826090.483696autorbf0.5{'gamma': 'auto', 'kernel': 'rbf', 'nu': 0.5}50.478261...0.6086960.5000000.4347830.4836960.4130430.4782610.0005980.0002300.0679150.009094
50.0093910.0020860.5782610.571739autorbf0.6{'gamma': 'auto', 'kernel': 'rbf', 'nu': 0.6}40.521739...0.6739130.5760870.5000000.5706520.5869570.5706520.0007470.0002120.0624030.004067
60.0094020.0019620.6695650.667391autorbf0.7{'gamma': 'auto', 'kernel': 'rbf', 'nu': 0.700...30.608696...0.7391300.6684780.5869570.6739130.6304350.6630430.0008200.0000880.0770440.006338
70.0107290.0023970.7826090.771739autorbf0.8{'gamma': 'auto', 'kernel': 'rbf', 'nu': 0.8}20.695652...0.8043480.7771740.7826090.7771740.8043480.7771740.0007920.0002440.0456000.006875
80.0108530.0028180.8739130.869565autorbf0.9{'gamma': 'auto', 'kernel': 'rbf', 'nu': 0.9}10.869565...0.9130430.8695650.8260870.8750000.8695650.8641300.0002560.0003630.0288400.004861
\n", + "

9 rows × 23 columns

\n", + "
" + ], + "text/plain": [ + " mean_fit_time mean_score_time mean_test_score mean_train_score \\\n", + "0 0.005670 0.001463 0.356522 0.215217 \n", + "1 0.005984 0.001430 0.365217 0.220652 \n", + "2 0.006545 0.001272 0.395652 0.317391 \n", + "3 0.007676 0.001584 0.430435 0.389130 \n", + "4 0.008341 0.001964 0.482609 0.483696 \n", + "5 0.009391 0.002086 0.578261 0.571739 \n", + "6 0.009402 0.001962 0.669565 0.667391 \n", + "7 0.010729 0.002397 0.782609 0.771739 \n", + "8 0.010853 0.002818 0.873913 0.869565 \n", + "\n", + " param_gamma param_kernel param_nu \\\n", + "0 auto rbf 0.1 \n", + "1 auto rbf 0.2 \n", + "2 auto rbf 0.3 \n", + "3 auto rbf 0.4 \n", + "4 auto rbf 0.5 \n", + "5 auto rbf 0.6 \n", + "6 auto rbf 0.7 \n", + "7 auto rbf 0.8 \n", + "8 auto rbf 0.9 \n", + "\n", + " params rank_test_score \\\n", + "0 {'gamma': 'auto', 'kernel': 'rbf', 'nu': 0.1} 9 \n", + "1 {'gamma': 'auto', 'kernel': 'rbf', 'nu': 0.2} 8 \n", + "2 {'gamma': 'auto', 'kernel': 'rbf', 'nu': 0.300... 7 \n", + "3 {'gamma': 'auto', 'kernel': 'rbf', 'nu': 0.4} 6 \n", + "4 {'gamma': 'auto', 'kernel': 'rbf', 'nu': 0.5} 5 \n", + "5 {'gamma': 'auto', 'kernel': 'rbf', 'nu': 0.6} 4 \n", + "6 {'gamma': 'auto', 'kernel': 'rbf', 'nu': 0.700... 3 \n", + "7 {'gamma': 'auto', 'kernel': 'rbf', 'nu': 0.8} 2 \n", + "8 {'gamma': 'auto', 'kernel': 'rbf', 'nu': 0.9} 1 \n", + "\n", + " split0_test_score ... split2_test_score split2_train_score \\\n", + "0 0.326087 ... 0.521739 0.211957 \n", + "1 0.326087 ... 0.565217 0.239130 \n", + "2 0.326087 ... 0.565217 0.331522 \n", + "3 0.369565 ... 0.586957 0.369565 \n", + "4 0.478261 ... 0.608696 0.500000 \n", + "5 0.521739 ... 0.673913 0.576087 \n", + "6 0.608696 ... 0.739130 0.668478 \n", + "7 0.695652 ... 0.804348 0.777174 \n", + "8 0.869565 ... 
0.913043 0.869565 \n", + "\n", + " split3_test_score split3_train_score split4_test_score \\\n", + "0 0.326087 0.217391 0.282609 \n", + "1 0.326087 0.271739 0.282609 \n", + "2 0.369565 0.298913 0.282609 \n", + "3 0.413043 0.385870 0.304348 \n", + "4 0.434783 0.483696 0.413043 \n", + "5 0.500000 0.570652 0.586957 \n", + "6 0.586957 0.673913 0.630435 \n", + "7 0.782609 0.777174 0.804348 \n", + "8 0.826087 0.875000 0.869565 \n", + "\n", + " split4_train_score std_fit_time std_score_time std_test_score \\\n", + "0 0.184783 0.000423 0.000202 0.084307 \n", + "1 0.173913 0.000636 0.000207 0.101408 \n", + "2 0.320652 0.000886 0.000129 0.098572 \n", + "3 0.407609 0.000599 0.000195 0.096635 \n", + "4 0.478261 0.000598 0.000230 0.067915 \n", + "5 0.570652 0.000747 0.000212 0.062403 \n", + "6 0.663043 0.000820 0.000088 0.077044 \n", + "7 0.777174 0.000792 0.000244 0.045600 \n", + "8 0.864130 0.000256 0.000363 0.028840 \n", + "\n", + " std_train_score \n", + "0 0.017728 \n", + "1 0.033958 \n", + "2 0.015975 \n", + "3 0.012676 \n", + "4 0.009094 \n", + "5 0.004067 \n", + "6 0.006338 \n", + "7 0.006875 \n", + "8 0.004861 \n", + "\n", + "[9 rows x 23 columns]" + ] + }, + "execution_count": 70, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "pd.DataFrame(gs_cv.cv_results_)" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-19T21:43:06.474561Z", + "start_time": "2018-04-19T21:43:06.467558Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8739130434782608" + ] + }, + "execution_count": 71, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gs_cv.best_score_" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-19T21:42:38.126146Z", + "start_time": "2018-04-19T21:42:38.120287Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'degree': 6, 
'gamma': 'auto', 'kernel': 'poly', 'nu': 0.4}" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gs_cv.best_params_" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### normal svm and kernel selection" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-19T20:41:14.823836Z", + "start_time": "2018-04-19T20:41:14.818107Z" + } + }, + "outputs": [], + "source": [ + "k_fold = 6" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-19T20:41:15.041037Z", + "start_time": "2018-04-19T20:41:14.827897Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Kernel: poly\n" + ] + }, + { + "data": { + "text/plain": [ + "0.9610825400299084" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kernel = 'poly'\n", + "print('Kernel: ', kernel)\n", + "clf = SVC(kernel=kernel, class_weight='balanced')\n", + "np.average(cross_val_score(clf, X, Y, cv=k_fold))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-19T20:41:18.808030Z", + "start_time": "2018-04-19T20:41:18.712105Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Kernel: rbf\n" + ] + }, + { + "data": { + "text/plain": [ + "0.9482620272093957" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kernel = 'rbf'\n", + "print('Kernel: ', kernel)\n", + "clf = SVC(kernel=kernel, class_weight='balanced')\n", + "np.average(cross_val_score(clf, X, Y, cv=k_fold))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "ExecuteTime": 
{ + "end_time": "2018-04-19T20:41:21.963007Z", + "start_time": "2018-04-19T20:41:21.848799Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Kernel: sigmoid\n" + ] + }, + { + "data": { + "text/plain": [ + "0.9002230975915185" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kernel = 'sigmoid'\n", + "print('Kernel: ', kernel)\n", + "clf = SVC(kernel=kernel, class_weight='balanced')\n", + "np.average(cross_val_score(clf, X, Y, cv=k_fold))" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-19T20:41:30.589630Z", + "start_time": "2018-04-19T20:41:30.585930Z" + } + }, + "outputs": [], + "source": [ + "# kernel = 'precomputed'\n", + "# print('Kernel: ', kernel)\n", + "# clf = SVC(kernel=kernel, class_weight='balanced')\n", + "# cross_val_score(clf, X, Y, cv=k_fold)" + ] + }, + { + "cell_type": "code", + "execution_count": 149, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-11T22:25:40.478665Z", + "start_time": "2018-04-11T22:25:40.470891Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([ 0.52036199, 12.77777778])" + ] + }, + "execution_count": 149, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# n_samples / (n_classes * np.bincount(y))\n", + "X.shape[0] / (2*np.bincount(Y))" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-20T04:04:04.519112Z", + "start_time": "2018-04-20T04:04:04.462909Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + " 0 0]\n" + ] + }, + { + "data": { + "text/plain": [ + "array([0.0415634 , 0.04365995, 0.00182686, 0.04664141, 0.04411105,\n", + " 
0.04331738, 0.0484596 , 0.00127521, 0.04523435, 0.0435793 ,\n", + " 0.04427737, 0.04421218, 0.0440134 , 0.04514256, 0.04492044,\n", + " 0.04163024, 0.01415224, 0.04695233, 0.04320325, 0.04420214,\n", + " 0.04390487, 0.04575226, 0.0464508 , 0.04284476, 0.04640284,\n", + " 0.04522823, 0.04429313, 0.04635346, 0.04310751, 0.04565618,\n", + " 0.04426294, 0.02688375, 0.03886473, 0.04365537, 0.04585414,\n", + " 0.04366811, 0.04500299, 0.04460703, 0.04352903, 0.04406892,\n", + " 0.04654779, 0.04447094, 0.04201015, 0.04326055, 0.04281197,\n", + " 0.0433026 , 0.04371573, 0.04417434, 0.04351211, 0.04698533,\n", + " 0.04435703, 0.04658198, 0.04442717, 0.04319824, 0.04359141,\n", + " 0.04380029, 0.04465192, 0.04487563, 0.04539705, 0.01931896,\n", + " 0.04345568, 0.04326056, 0.04322224, 0.04370086, 0.04355741,\n", + " 0.04479891, 0.04204287, 0.04371642, 0.04535695, 0.04569559,\n", + " 0.04160463, 0.04522647, 0.04433792, 0.04536438, 0.04332477,\n", + " 0.04426991])" + ] + }, + "execution_count": 86, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)\n", + "clf = SVC(kernel='poly', class_weight='balanced', probability=True)\n", + "clf.fit(X_train, y_train)\n", + "print(clf.predict(X_test))\n", + "clf.predict_proba(X_test)[:,1]" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-20T00:12:59.970768Z", + "start_time": "2018-04-20T00:12:59.949906Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])" + ] + }, + "execution_count": 81, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X_train, X_test, y_train, 
y_test = train_test_split(X, Y, test_size=0.33, random_state=42)\n", + "clf = SVC(kernel='poly', class_weight='balanced')\n", + "clf.fit(X_train, y_train)\n", + "clf.predict(X_test)\n", + "# clf.score(X_test, y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 195, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-12T19:34:57.901911Z", + "start_time": "2018-04-12T19:34:57.840845Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 14.0 14.0\n", + "5 9.4 23.4\n", + "10 7.9 31.3\n", + "15 6.3 37.6\n", + "20 5.4 43.0\n", + "25 4.8 47.8\n", + "30 4.3 52.1\n", + "35 4.0 56.1\n", + "40 3.6 59.7\n", + "45 3.5 63.2\n", + "50 3.1 66.3\n", + "55 3.0 69.3\n", + "60 2.8 72.1\n", + "65 2.5 74.6\n", + "70 2.5 77.1\n", + "75 2.5 79.6\n", + "80 2.1 81.7\n", + "85 2.0 83.7\n", + "90 2.0 85.7\n", + "95 2.0 87.7\n", + "100 2.0 89.7\n" + ] + } + ], + "source": [ + "p = PCA(n_components=160)\n", + "p.fit(X)\n", + "\n", + "percents = np.round((p.explained_variance_ratio_), 3)\n", + "_sum = 0\n", + "# print(np.sum(percents))\n", + "j = 0\n", + "for i in range(21):\n", + " s = np.sum(percents[j: j+5])*100\n", + " _sum += s\n", + " print(j, s, _sum)\n", + " j += 5\n" + ] + }, + { + "cell_type": "code", + "execution_count": 190, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-12T19:30:07.288063Z", + "start_time": "2018-04-12T19:30:07.185242Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 0.046 0.029 0.023 0.021 0.021 0.02 0.019 0.019 0.018 0.018\n", + " 0.017 0.016 0.016 0.015 0.015 0.014 0.013 0.012 0.012 0.012\n", + " 0.011 0.011 0.011 0.011 0.01 0.01 0.01 0.01 0.009 0.009\n", + " 0.009 0.009 0.009 0.008 0.008 0.008 0.008 0.008 0.008 0.008\n", + " 0.008 0.007 0.007 0.007 0.007 0.007 0.007 0.007 0.007 0.007\n", + " 0.007 0.006 0.006 0.006 0.006 0.006 0.006 0.006 0.006 0.006]\n", + "[ 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.\n", + " 1. 1. 1. 1. 1. 1. 1. 1. 1. 
1. 1. 1. 1. 1. 1. 1. 1. 1.\n", + " 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.\n", + " 1. 1. 1. 1. 1. 1.]\n" + ] + }, + { + "data": { + "text/plain": [ + "1 average_outflow\n", + "2 utm_medium_ppc\n", + "3 firm_type_Private Limited\n", + "4 campaign_city_Kolkata\n", + "5 state_TAMIL NADU\n", + "6 registered_office_state_ANDHRA PRADESH\n", + "7 city_Chennai\n", + "8 registered_office_state_MADHYA PRADESH\n", + "9 registered_office_state_KARNATAKA\n", + "10 registered_office_state_HARYANA\n", + "11 city_Indore\n", + "12 state_MADHYA PRADESH\n", + "13 registered_office_state_GUJARAT\n", + "14 registered_office_state_CHHATTISGARH\n", + "15 state_KARNATAKA\n", + "16 registered_office_state_TELANGANA\n", + "17 platform_mobile:Linux\n", + "18 registered_office_state_PUNJAB\n", + "19 role_in_firm_2.0\n", + "20 registered_office_city_DHAR\n", + "21 city_Kalyan\n", + "22 industry_132.0\n", + "23 campaign_city_Mumbai\n", + "24 industry_20.0\n", + "25 browser_Chrome\n", + "26 registered_office_city_THANE\n", + "27 registered_office_city_RAISEN\n", + "28 registered_office_city_NASHIK\n", + "29 industry_69.0\n", + "30 industry_35.0\n", + "31 city_Ahmedabad\n", + "32 industry_69.0\n", + "33 campaign_city_Nellore\n", + "34 industry_126.0\n", + "35 gender_Female\n", + "36 industry_69.0\n", + "37 registered_office_city_SATARA\n", + "38 city_Chandigarh\n", + "39 campaign_city_Nellore\n", + "40 campaign_city_Badlapur\n", + "41 registered_office_city_AURANGABAD\n", + "42 registered_office_city_K.V.RANGAREDDY\n", + "43 registered_office_city_AURANGABAD\n", + "44 industry_124.0\n", + "45 industry_128.0\n", + "46 industry_124.0\n", + "47 registered_office_city_SATARA\n", + "48 city_Badlapur\n", + "49 city_Bulandshahr\n", + "50 industry_9.0\n", + "51 industry_51.0\n", + "52 registered_office_city_TIRUVALLUR\n", + "53 industry_98.0\n", + "54 city_Ulhasnagar\n", + "55 city_Ghaziabad\n", + "56 role_on_application_1\n", + "57 browser_Netscape\n", + "58 city_Ulhasnagar\n", + "59 
registered_office_city_CHITTOOR\n", + "60 industry_88.0\n", + "dtype: object" + ] + }, + "execution_count": 190, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pca = PCA(n_components=60)\n", + "pca.fit(X)\n", + "print(np.round((pca.explained_variance_ratio_), 3))\n", + "\n", + "# pca.components_\n", + "# ['PC-1','PC-2','PC-3','PC-4','PC-5','PC-6']\n", + "\n", + "coef = pca.transform(np.eye(X.shape[1]))\n", + "print(np.linalg.norm(coef, axis=0))\n", + "\n", + "_p = pd.DataFrame(coef, columns=range(1, 61), index=og_X.columns)\n", + "\n", + "abs(_p).idxmax()\n", + "\n", + "# pd.value_counts(abs(_p).idxmax(axis=1))\n", + "\n", + "# clf = SVC(kernel='poly', class_weight='balanced')\n", + "# X_train, X_test, y_train, y_test = train_test_split(pca.transform(X), Y, test_size=0.33, random_state=42)\n", + "\n", + "# clf.fit(X_train, y_train)\n", + "\n", + "# clf.predict(X_test), clf.score(X_test, y_test)\n", + "\n", + "\n", + "# # cross_val_score(clf, pca.transform(X), Y, cv=k_fold)" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-11T21:53:55.310221Z", + "start_time": "2018-04-11T21:53:55.303191Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "PCA(copy=True, iterated_power='auto', n_components=6, random_state=None,\n", + " svd_solver='auto', tol=0.0, whiten=False)" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pca" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + 
}, + "nbformat": 4, + "nbformat_minor": 2 +}