diff --git a/SVM test.ipynb b/SVM test.ipynb new file mode 100644 index 0000000..4a5b235 --- /dev/null +++ b/SVM test.ipynb @@ -0,0 +1,1331 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-19T20:41:13.476646Z", + "start_time": "2018-04-19T20:41:03.019958Z" + } + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "from sklearn.svm import SVC\n", + "from sklearn.decomposition import PCA\n", + "from sklearn.preprocessing import Imputer\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.model_selection import cross_val_score\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "DATA = 'dataset/loan_one_hot_encoded.csv'" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-19T20:41:13.770603Z", + "start_time": "2018-04-19T20:41:13.480032Z" + } + }, + "outputs": [], + "source": [ + "drop_cols = ['loan_created', 'application_id',\n", + "# 'firm_type_Proprietorship',\n", + " 'average_business_inflow'\n", + " ]\n", + "df = pd.read_csv(DATA)\n", + "Y = df.loc[:, 'loan_created'] # target column\n", + "og_X = df.drop(labels=drop_cols, axis='columns') # features: every column not listed in drop_cols" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Things to-do:\n", + "\n", + "- [ ] The data is missing values; use Imputer to fill mean/median of the column\n", + " - [ ] Create another column to denote whether the data was imputed or not; I've read that it seems to have better results\n", + "- [ ] Set class_weights in SVM\n", + "- [ ] Tune hyperparameters\n", + "- [ ] Specific kernel? 
\n", + "\n", + "What to do about skewed data:\n", + "- Treat it as an anomaly detection problem?\n", + "- class weights (for SVM)\n", + "- Remove training data (less data anyway..:( )\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-19T20:41:14.741139Z", + "start_time": "2018-04-19T20:41:13.773038Z" + } + }, + "outputs": [], + "source": [ + "imp = Imputer()\n", + "imputed_X = imp.fit_transform(og_X)\n", + "\n", + "# X = imputed_X\n", + "scl = StandardScaler()\n", + "X = scl.fit_transform(imputed_X)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-19T20:41:14.752488Z", + "start_time": "2018-04-19T20:41:14.744609Z" + } + }, + "outputs": [], + "source": [ + "# pd.value_counts(og_X['firm_type_Proprietorship'])" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-19T20:41:14.776906Z", + "start_time": "2018-04-19T20:41:14.756636Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(230, 240)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-19T20:41:14.811622Z", + "start_time": "2018-04-19T20:41:14.780138Z" + } + }, + "outputs": [], + "source": [ + "reverse_Y = Y.apply(lambda x: 0 if x == 1 else 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-20T00:16:50.047193Z", + "start_time": "2018-04-20T00:16:50.038098Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(array([0, 1]), array([221, 9]))" + ] + }, + "execution_count": 83, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.unique(Y, return_counts=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, 
+ "source": [ + "### ensembles" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-20T06:46:16.056913Z", + "start_time": "2018-04-20T06:46:16.052278Z" + } + }, + "outputs": [], + "source": [ + "from sklearn.ensemble import BaggingClassifier\n", + "from sklearn.ensemble import AdaBoostClassifier" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-20T06:47:56.577213Z", + "start_time": "2018-04-20T06:47:56.500595Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])" + ] + }, + "execution_count": 95, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clf = SVC(C=10, class_weight='balanced', degree=3, gamma='auto', kernel='sigmoid', probability=True)\n", + "ens_clf = BaggingClassifier(clf, random_state=42) # seeded so the bootstrap draws are reproducible\n", + "X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42, stratify=Y) # stratify: only 9 of 230 rows are positive\n", + "ens_clf.fit(X_train, y_train)\n", + "ens_clf.predict(X_test)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-20T06:48:07.149126Z", + "start_time": "2018-04-20T06:48:04.876784Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])" + ] + }, + "execution_count": 96, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "clf = SVC(**{'C': 10, 'class_weight': 'balanced', 'degree': 
3, 'gamma': 'auto', 'kernel': 'sigmoid', 'probability': True})\n", + "ens_clf = AdaBoostClassifier(clf, random_state=42) # seeded for reproducible boosting rounds\n", + "X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42, stratify=Y) # stratify: only 9 of 230 rows are positive\n", + "ens_clf.fit(X_train, y_train)\n", + "ens_clf.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### anomaly detection" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-19T20:48:31.710508Z", + "start_time": "2018-04-19T20:48:31.706267Z" + } + }, + "outputs": [], + "source": [ + "from sklearn.svm import OneClassSVM\n", + "from sklearn.metrics import accuracy_score\n", + "from sklearn.metrics import make_scorer" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-19T21:09:10.735686Z", + "start_time": "2018-04-19T21:09:10.656810Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0.51282051 0.43589744 0.71052632 0.39473684 0.55263158 0.34210526]\n", + "0.4914529914529915\n" + ] + } + ], + "source": [ + "ad_clf = OneClassSVM(kernel=\"rbf\")\n", + "scores = cross_val_score(ad_clf, X, [-1 if label == 1 else 1 for label in Y], cv=k_fold, scoring=make_scorer(accuracy_score)) # OneClassSVM predicts -1 for outliers, so the rare positive class (9 of 230) is labelled -1\n", + "print(scores)\n", + "print(np.average(scores))" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-19T21:09:25.083572Z", + "start_time": "2018-04-19T21:09:25.064562Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.6578947368421053\n", + "[ 1 -1 -1 -1 -1 -1 -1 1 -1 1 -1 1 -1 -1 1 -1 -1 -1 1 -1 -1 -1 -1 -1\n", + " -1 -1 1 -1 -1 -1 -1 -1 1 1 -1 -1 -1 1 1 -1 1 -1 
1 -1 -1 1 -1 -1\n", + " -1 -1 -1 -1 1 -1 1 1 1 -1 1 -1 1 -1 -1 1 -1 -1 1 -1 -1 1 -1 -1\n", + " 1 -1 -1 1]\n" + ] + } + ], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42, stratify=Y) # stratify: only 9 of 230 rows are positive\n", + "ad_clf = OneClassSVM(nu=0.7)\n", + "ad_clf.fit(X_train) # unsupervised: OneClassSVM ignores any labels passed to fit\n", + "y_predict = ad_clf.predict(X_test)\n", + "print(accuracy_score([-1 if x == 1 else 1 for x in y_test], y_predict)) # predict() gives -1 for outliers, so the rare positive class maps to -1\n", + "print(y_predict)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-19T21:21:37.819707Z", + "start_time": "2018-04-19T21:21:37.814866Z" + } + }, + "outputs": [], + "source": [ + "from sklearn.model_selection import GridSearchCV" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-19T21:43:03.210818Z", + "start_time": "2018-04-19T21:43:02.417814Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "GridSearchCV(cv=5, error_score='raise',\n", + " estimator=OneClassSVM(cache_size=200, coef0=0.0, degree=3, gamma='auto', kernel='rbf',\n", + " max_iter=-1, nu=0.5, random_state=None, shrinking=True, tol=0.001,\n", + " verbose=False),\n", + " fit_params=None, iid=True, n_jobs=1,\n", + " param_grid=[{'nu': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]), 'gamma': ['auto'], 'kernel': ['rbf']}],\n", + " pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',\n", + " scoring=make_scorer(accuracy_score), verbose=0)" + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "param_grid = [\n", + " {'nu': np.arange(.1, 1.0, 0.1), 'gamma': ['auto'], 'kernel': ['rbf']},\n", + " ]\n", + "gs_cv = GridSearchCV(OneClassSVM(), param_grid=param_grid, scoring=make_scorer(accuracy_score), cv=5, refit=True, return_train_score=True)\n", + "gs_cv.fit(X, [-1 if _ == 1 else 1 for _ 
in Y])" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-19T21:43:03.283239Z", + "start_time": "2018-04-19T21:43:03.217429Z" + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/sahil/anaconda3/lib/python3.6/site-packages/sklearn/utils/deprecation.py:122: FutureWarning: You are accessing a training score ('mean_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True\n", + " warnings.warn(*warn_args, **warn_kwargs)\n", + "/Users/sahil/anaconda3/lib/python3.6/site-packages/sklearn/utils/deprecation.py:122: FutureWarning: You are accessing a training score ('split0_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True\n", + " warnings.warn(*warn_args, **warn_kwargs)\n", + "/Users/sahil/anaconda3/lib/python3.6/site-packages/sklearn/utils/deprecation.py:122: FutureWarning: You are accessing a training score ('split1_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True\n", + " warnings.warn(*warn_args, **warn_kwargs)\n", + "/Users/sahil/anaconda3/lib/python3.6/site-packages/sklearn/utils/deprecation.py:122: FutureWarning: You are accessing a training score ('split2_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True\n", + " warnings.warn(*warn_args, **warn_kwargs)\n", + "/Users/sahil/anaconda3/lib/python3.6/site-packages/sklearn/utils/deprecation.py:122: FutureWarning: You are accessing a training score ('split3_train_score'), which will not be available by default any more in 0.21. 
If you need training scores, please set return_train_score=True\n", + " warnings.warn(*warn_args, **warn_kwargs)\n", + "/Users/sahil/anaconda3/lib/python3.6/site-packages/sklearn/utils/deprecation.py:122: FutureWarning: You are accessing a training score ('split4_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True\n", + " warnings.warn(*warn_args, **warn_kwargs)\n", + "/Users/sahil/anaconda3/lib/python3.6/site-packages/sklearn/utils/deprecation.py:122: FutureWarning: You are accessing a training score ('std_train_score'), which will not be available by default any more in 0.21. If you need training scores, please set return_train_score=True\n", + " warnings.warn(*warn_args, **warn_kwargs)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + " | mean_fit_time | \n", + "mean_score_time | \n", + "mean_test_score | \n", + "mean_train_score | \n", + "param_gamma | \n", + "param_kernel | \n", + "param_nu | \n", + "params | \n", + "rank_test_score | \n", + "split0_test_score | \n", + "... | \n", + "split2_test_score | \n", + "split2_train_score | \n", + "split3_test_score | \n", + "split3_train_score | \n", + "split4_test_score | \n", + "split4_train_score | \n", + "std_fit_time | \n", + "std_score_time | \n", + "std_test_score | \n", + "std_train_score | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "0.005670 | \n", + "0.001463 | \n", + "0.356522 | \n", + "0.215217 | \n", + "auto | \n", + "rbf | \n", + "0.1 | \n", + "{'gamma': 'auto', 'kernel': 'rbf', 'nu': 0.1} | \n", + "9 | \n", + "0.326087 | \n", + "... | \n", + "0.521739 | \n", + "0.211957 | \n", + "0.326087 | \n", + "0.217391 | \n", + "0.282609 | \n", + "0.184783 | \n", + "0.000423 | \n", + "0.000202 | \n", + "0.084307 | \n", + "0.017728 | \n", + "
1 | \n", + "0.005984 | \n", + "0.001430 | \n", + "0.365217 | \n", + "0.220652 | \n", + "auto | \n", + "rbf | \n", + "0.2 | \n", + "{'gamma': 'auto', 'kernel': 'rbf', 'nu': 0.2} | \n", + "8 | \n", + "0.326087 | \n", + "... | \n", + "0.565217 | \n", + "0.239130 | \n", + "0.326087 | \n", + "0.271739 | \n", + "0.282609 | \n", + "0.173913 | \n", + "0.000636 | \n", + "0.000207 | \n", + "0.101408 | \n", + "0.033958 | \n", + "
2 | \n", + "0.006545 | \n", + "0.001272 | \n", + "0.395652 | \n", + "0.317391 | \n", + "auto | \n", + "rbf | \n", + "0.3 | \n", + "{'gamma': 'auto', 'kernel': 'rbf', 'nu': 0.300... | \n", + "7 | \n", + "0.326087 | \n", + "... | \n", + "0.565217 | \n", + "0.331522 | \n", + "0.369565 | \n", + "0.298913 | \n", + "0.282609 | \n", + "0.320652 | \n", + "0.000886 | \n", + "0.000129 | \n", + "0.098572 | \n", + "0.015975 | \n", + "
3 | \n", + "0.007676 | \n", + "0.001584 | \n", + "0.430435 | \n", + "0.389130 | \n", + "auto | \n", + "rbf | \n", + "0.4 | \n", + "{'gamma': 'auto', 'kernel': 'rbf', 'nu': 0.4} | \n", + "6 | \n", + "0.369565 | \n", + "... | \n", + "0.586957 | \n", + "0.369565 | \n", + "0.413043 | \n", + "0.385870 | \n", + "0.304348 | \n", + "0.407609 | \n", + "0.000599 | \n", + "0.000195 | \n", + "0.096635 | \n", + "0.012676 | \n", + "
4 | \n", + "0.008341 | \n", + "0.001964 | \n", + "0.482609 | \n", + "0.483696 | \n", + "auto | \n", + "rbf | \n", + "0.5 | \n", + "{'gamma': 'auto', 'kernel': 'rbf', 'nu': 0.5} | \n", + "5 | \n", + "0.478261 | \n", + "... | \n", + "0.608696 | \n", + "0.500000 | \n", + "0.434783 | \n", + "0.483696 | \n", + "0.413043 | \n", + "0.478261 | \n", + "0.000598 | \n", + "0.000230 | \n", + "0.067915 | \n", + "0.009094 | \n", + "
5 | \n", + "0.009391 | \n", + "0.002086 | \n", + "0.578261 | \n", + "0.571739 | \n", + "auto | \n", + "rbf | \n", + "0.6 | \n", + "{'gamma': 'auto', 'kernel': 'rbf', 'nu': 0.6} | \n", + "4 | \n", + "0.521739 | \n", + "... | \n", + "0.673913 | \n", + "0.576087 | \n", + "0.500000 | \n", + "0.570652 | \n", + "0.586957 | \n", + "0.570652 | \n", + "0.000747 | \n", + "0.000212 | \n", + "0.062403 | \n", + "0.004067 | \n", + "
6 | \n", + "0.009402 | \n", + "0.001962 | \n", + "0.669565 | \n", + "0.667391 | \n", + "auto | \n", + "rbf | \n", + "0.7 | \n", + "{'gamma': 'auto', 'kernel': 'rbf', 'nu': 0.700... | \n", + "3 | \n", + "0.608696 | \n", + "... | \n", + "0.739130 | \n", + "0.668478 | \n", + "0.586957 | \n", + "0.673913 | \n", + "0.630435 | \n", + "0.663043 | \n", + "0.000820 | \n", + "0.000088 | \n", + "0.077044 | \n", + "0.006338 | \n", + "
7 | \n", + "0.010729 | \n", + "0.002397 | \n", + "0.782609 | \n", + "0.771739 | \n", + "auto | \n", + "rbf | \n", + "0.8 | \n", + "{'gamma': 'auto', 'kernel': 'rbf', 'nu': 0.8} | \n", + "2 | \n", + "0.695652 | \n", + "... | \n", + "0.804348 | \n", + "0.777174 | \n", + "0.782609 | \n", + "0.777174 | \n", + "0.804348 | \n", + "0.777174 | \n", + "0.000792 | \n", + "0.000244 | \n", + "0.045600 | \n", + "0.006875 | \n", + "
8 | \n", + "0.010853 | \n", + "0.002818 | \n", + "0.873913 | \n", + "0.869565 | \n", + "auto | \n", + "rbf | \n", + "0.9 | \n", + "{'gamma': 'auto', 'kernel': 'rbf', 'nu': 0.9} | \n", + "1 | \n", + "0.869565 | \n", + "... | \n", + "0.913043 | \n", + "0.869565 | \n", + "0.826087 | \n", + "0.875000 | \n", + "0.869565 | \n", + "0.864130 | \n", + "0.000256 | \n", + "0.000363 | \n", + "0.028840 | \n", + "0.004861 | \n", + "
9 rows × 23 columns
\n", + "