diff --git a/SVM.ipynb b/SVM.ipynb index 991c162..fb71a62 100644 --- a/SVM.ipynb +++ b/SVM.ipynb @@ -1,12 +1,38 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Outline\n", + "\n", + "#### Data set versions:\n", + "- Original dataset\n", + "- Synthesized dataset using SMOTE\n", + "\n", + "#### Approaches:\n", + "- SVM + Grid Search\n", + "- SVM + Grid Search + Feature selection with F-score\n", + "- Anomaly detection SVM\n", + "- Ensemble SVM\n", + " - Og data\n", + " - Synthesized data\n", + " - F-score filtered data\n", + "\n", + "#### Evaluation Metrics:\n", + "- ROC\n", + "- F-score (Precision Recall)\n", + "- Confusion matrix\n", + "- Accuracy" + ] + }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 179, "metadata": { "ExecuteTime": { - "end_time": "2018-03-30T02:52:26.123962Z", - "start_time": "2018-03-30T02:52:20.457436Z" + "end_time": "2018-04-20T06:49:19.598232Z", + "start_time": "2018-04-20T06:49:19.591110Z" } }, "outputs": [], @@ -14,108 +40,687 @@ "import numpy as np\n", "import pandas as pd\n", "\n", + "from collections import defaultdict\n", + "\n", + "from imblearn.over_sampling import SMOTE\n", + "\n", "from sklearn.svm import SVC\n", + "from sklearn.svm import OneClassSVM\n", + "\n", "from sklearn.preprocessing import Imputer\n", + "from sklearn.preprocessing import StandardScaler\n", + "\n", + "from sklearn.ensemble import BaggingClassifier\n", + "from sklearn.ensemble import AdaBoostClassifier\n", + "\n", + "from sklearn.metrics import accuracy_score\n", + "from sklearn.metrics import auc\n", + "from sklearn.metrics import roc_curve\n", + "from sklearn.metrics import f1_score\n", + "from sklearn.metrics import precision_recall_curve\n", + "# from sklearn.metrics import precision_score\n", + "# from sklearn.metrics import recall_score\n", + "\n", + "from sklearn.model_selection import ParameterGrid\n", + "from sklearn.model_selection import StratifiedKFold\n", "from sklearn.model_selection 
import cross_val_score\n", + "from sklearn.model_selection import train_test_split\n", "\n", "DATA = 'dataset/loan_one_hot_encoded.csv'" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 204, "metadata": { "ExecuteTime": { - "end_time": "2018-03-30T02:52:26.217863Z", - "start_time": "2018-03-30T02:52:26.126714Z" + "end_time": "2018-04-20T07:12:40.152012Z", + "start_time": "2018-04-20T07:12:40.108731Z" } }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "((230, 240), (230,), (array([0, 1]), array([221, 9])))" + ] + }, + "execution_count": 204, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ + "drop_cols = ['loan_created', 'application_id',\n", + "# 'firm_type_Proprietorship',\n", + " 'average_business_inflow'\n", + " ]\n", "df = pd.read_csv(DATA)\n", "Y = df['loan_created']\n", - "X = df.drop('loan_created', axis=1)" + "og_X = df.drop(drop_cols, axis=1)\n", + "\n", + "imp = Imputer()\n", + "imputed_X = imp.fit_transform(og_X)\n", + "\n", + "# X = imputed_X\n", + "scl = StandardScaler()\n", + "X = scl.fit_transform(imputed_X)\n", + "\n", + "X.shape, Y.shape, np.unique(Y, return_counts=True)" ] }, { - "cell_type": "markdown", - "metadata": {}, + "cell_type": "code", + "execution_count": 205, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-20T07:12:40.639529Z", + "start_time": "2018-04-20T07:12:40.626047Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "((245, 240), (245,), (array([0, 1]), array([221, 24])))" + ] + }, + "execution_count": 205, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "Things to-do:\n", - "\n", - "- [ ] The data is missing values; use Imputer to fill mean/median of the column\n", - " - [ ] Create another column to denote whether the data was imputed or not; I've read that it seems to have better results\n", - "- [ ] Set class_weights in SVM\n", - "- [ ] Tune hyperparameters\n", - "- [ ] Specific kernel? 
class GridSearchCV:
    """Exhaustive hyper-parameter search that keeps per-fold diagnostics.

    Every parameter combination from ``param_grid`` is fitted on every
    stratified fold, and accuracy, F1, precision/recall curves, ROC curves
    and AUC are recorded per (parameter set, fold).

    Parameters
    ----------
    param_grid : dict or list of dict
        Passed unchanged to ``ParameterGrid``.
    clf : estimator
        Object exposing ``set_params``, ``fit`` and ``predict``.  It is
        re-configured and re-fitted in place for every combination and fold.
    pos_label : int or str
        Label treated as the positive class by F1 and the curve metrics.
    cv : int, default 5
        Number of stratified folds.

    Attributes (all indexed ``[param_idx][fold_idx]`` after ``fit``)
    ----------
    overfit_ : list of list of bool
        True where the model predicted one single value for the whole test
        fold.  The name is historical; it flags a degenerate (constant)
        prediction.
    accuracy_scores_, f1_scores_, auc_scores_ : float ndarray
        ``nan`` marks folds where the metric could not be computed.
    precision_, recall_, fpr_, tpr_ : object ndarray
        One 1-D curve per cell; an empty array where it is unavailable.
    average_accuracy_param_ : float ndarray
        Mean fold accuracy of each parameter combination.
    """

    _SCORE_FIELDS = ('overfit_', 'accuracy_scores_', 'precision_', 'recall_',
                     'f1_scores_', 'fpr_', 'tpr_', 'auc_scores_')

    def __init__(self, param_grid, clf, pos_label, cv=5):
        self.param_list_ = list(ParameterGrid(param_grid))
        self.cv = cv
        self.pos_label = pos_label
        self._clf = clf
        self._reset_scores()

    def _reset_scores(self):
        """Give every score field one empty list per parameter combination."""
        for field in self._SCORE_FIELDS:
            setattr(self, field, [[] for _ in self.param_list_])

    @staticmethod
    def _as_object_grid(rows):
        """Pack ragged per-fold curves into an (n_params, n_folds) object array."""
        n_cols = len(rows[0]) if rows else 0
        grid = np.empty((len(rows), n_cols), dtype=object)
        for i, row in enumerate(rows):
            for j, curve in enumerate(row):
                grid[i, j] = curve
        return grid

    def fit(self, X, y):
        """Fit every parameter combination on every stratified fold.

        Parameters
        ----------
        X : array-like, one row per sample
        y : array-like of labels (a pandas Series is accepted)

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If the estimator exposes ``predict_proba`` but ``pos_label`` is
            not one of its ``classes_``.
        """
        # Positional indexing below must not depend on a pandas index.
        X = np.asarray(X)
        y = np.asarray(y)
        # Use the estimator handed to the constructor, not a notebook global.
        clf = self._clf
        no_curve = np.array([])

        # Start from empty lists so fit() can be called more than once.
        self._reset_scores()

        skf = StratifiedKFold(n_splits=self.cv)
        for train_index, test_index in skf.split(X, y):
            # The fold split is the same for every parameter set: slice once.
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            for idx, params in enumerate(self.param_list_):
                clf.set_params(**params)
                clf.fit(X_train, y_train)
                y_pred = clf.predict(X_test)

                # Re-checked per parameter set: on SVC the attribute only
                # resolves when probability=True.
                has_proba = bool(getattr(clf, "predict_proba", None))
                if has_proba:
                    # Pick the probability column of pos_label explicitly
                    # instead of assuming it is column 1.
                    pos_col = list(clf.classes_).index(self.pos_label)
                    y_score = clf.predict_proba(X_test)[:, pos_col]
                else:
                    # Outlier detectors answer -1/+1; fold -1 onto label 0.
                    # NOTE(review): this scores +1 (inlier) as class 1 --
                    # confirm that is the intended class mapping.
                    y_pred[y_pred == -1] = 0
                    y_score = y_pred

                constant = bool(np.all(y_pred[0] == y_pred))
                self.overfit_[idx].append(constant)

                precision = recall = fpr = tpr = no_curve
                f1 = fold_auc = np.nan
                if not constant:
                    precision, recall, _ = precision_recall_curve(
                        y_test, y_score, pos_label=self.pos_label)
                    f1 = f1_score(y_test, y_pred, pos_label=self.pos_label)
                    if has_proba:
                        fpr, tpr, _ = roc_curve(
                            y_test, y_score, pos_label=self.pos_label)
                        fold_auc = auc(fpr, tpr)

                # Append to every list on every fold so all result arrays
                # end up with the same (n_params, n_folds) layout.
                self.precision_[idx].append(precision)
                self.recall_[idx].append(recall)
                self.f1_scores_[idx].append(f1)
                self.fpr_[idx].append(fpr)
                self.tpr_[idx].append(tpr)
                self.auc_scores_[idx].append(fold_auc)
                self.accuracy_scores_[idx].append(accuracy_score(y_test, y_pred))

        self.average_accuracy_param_ = np.array(
            [np.average(fold_scores) for fold_scores in self.accuracy_scores_])

        self.accuracy_scores_ = np.array(self.accuracy_scores_)
        self.f1_scores_ = np.array(self.f1_scores_)
        self.auc_scores_ = np.array(self.auc_scores_)
        # Curves differ in length from fold to fold: keep them as objects.
        self.precision_ = self._as_object_grid(self.precision_)
        self.recall_ = self._as_object_grid(self.recall_)
        self.fpr_ = self._as_object_grid(self.fpr_)
        self.tpr_ = self._as_object_grid(self.tpr_)
        return self

    def print_results(self, show_overfit=False):
        """Print a summary block for each parameter combination.

        Parameters
        ----------
        show_overfit : bool, default False
            When False, combinations that produced a constant prediction on
            at least one fold are skipped.
        """
        for _idx in range(len(self.average_accuracy_param_)):
            degenerate = np.any(self.overfit_[_idx])
            if degenerate and not show_overfit:
                continue
            print('-'*40)
            print('overfit: ', degenerate,
                  np.unique(self.overfit_[_idx], return_counts=True))
            print('params:', self.param_list_[_idx])
            print('avg acc: ', self.average_accuracy_param_[_idx])
            # Read this instance's scores, not a global named `gscv`.
            print('f1 score:', np.average(self.f1_scores_[_idx]))
            print('auc: ', np.average(self.auc_scores_[_idx]))
# Grid search over RBF / sigmoid SVCs on the SMOTE-oversampled data.
param_grid = [
    {'C': [.1, .5, 1, 5, 10],
     'gamma': ['auto'], 'kernel': ['rbf', 'sigmoid'],
     'probability': [True], 'class_weight': ['balanced'],
     # probability=True runs an internal shuffled cross-validation; pin the
     # seed (same value as the SMOTE cell) so the AUC is repeatable.
     'random_state': [44]},
]
clf = SVC()
gscv = GridSearchCV(param_grid, clf, pos_label=1, cv=10)
gscv.fit(X_os, Y_os)
gscv.print_results()
on og data" + ] + }, + { + "cell_type": "code", + "execution_count": 208, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-20T07:13:11.877136Z", + "start_time": "2018-04-20T07:12:55.785021Z" + }, + "scrolled": true }, "outputs": [ { - "data": { - "text/plain": [ - "(230, 242)" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "----------------------------------------\n", + "overfit: False (array([False]), array([9]))\n", + "params: {'C': 5, 'class_weight': 'balanced', 'degree': 2, 'gamma': 'auto', 'kernel': 'sigmoid', 'probability': True}\n", + "avg acc: 0.8439316239316239\n", + "f1 score: 0.05555555555555555\n", + "auc: 0.3429629629629629\n", + "----------------------------------------\n", + "overfit: False (array([False]), array([9]))\n", + "params: {'C': 5, 'class_weight': 'balanced', 'degree': 3, 'gamma': 'auto', 'kernel': 'sigmoid', 'probability': True}\n", + "avg acc: 0.8439316239316239\n", + "f1 score: 0.05555555555555555\n", + "auc: 0.3622222222222222\n", + "----------------------------------------\n", + "overfit: False (array([False]), array([9]))\n", + "params: {'C': 5, 'class_weight': 'balanced', 'degree': 4, 'gamma': 'auto', 'kernel': 'sigmoid', 'probability': True}\n", + "avg acc: 0.8439316239316239\n", + "f1 score: 0.05555555555555555\n", + "auc: 0.2974074074074074\n", + "----------------------------------------\n", + "overfit: False (array([False]), array([9]))\n", + "params: {'C': 5, 'class_weight': 'balanced', 'degree': 5, 'gamma': 'auto', 'kernel': 'sigmoid', 'probability': True}\n", + "avg acc: 0.8439316239316239\n", + "f1 score: 0.05555555555555555\n", + "auc: 0.3222222222222222\n", + "----------------------------------------\n", + "overfit: False (array([False]), array([9]))\n", + "params: {'C': 10, 'class_weight': 'balanced', 'degree': 2, 'gamma': 'auto', 'kernel': 'sigmoid', 'probability': True}\n", + "avg acc: 0.8654700854700855\n", + "f1 
# Grid search over polynomial / sigmoid SVCs on the original (imbalanced) data.
shared_grid = {
    'C': [5, 10, 20, 40, 80],
    'gamma': ['auto'],
    'probability': [True], 'class_weight': ['balanced'],
    # probability=True runs an internal shuffled cross-validation; pin the
    # seed so repeated runs report the same AUC.
    'random_state': [44],
}
param_grid = [
    # `degree` is only read by the polynomial kernel ...
    dict(shared_grid, kernel=['poly'], degree=[2, 3, 4, 5]),
    # ... so the sigmoid kernel is searched once per C instead of once per
    # (C, degree) pair, which only repeated the same model four times.
    dict(shared_grid, kernel=['sigmoid']),
]
clf = SVC()
gscv = GridSearchCV(param_grid, clf, pos_label=1, cv=9)
gscv.fit(X, Y)
gscv.print_results()
# One-class SVM (anomaly detection) grid search on the original data.
param_grid = [
    # Integer division keeps the grid at exactly 0.1 ... 0.9; np.arange with
    # a float step drifts (0.30000000000000004 showed up in the results).
    {'nu': [step / 10 for step in range(1, 10)],
     'gamma': ['auto'], 'kernel': ['poly'],
     'degree': [2, 3, 4, 5, 6, 7]},
]
clf = OneClassSVM()
gscv = GridSearchCV(param_grid, clf, pos_label=1, cv=5)
gscv.fit(X, Y)
# OneClassSVM has no predict_proba, so the reported AUC is nan.
gscv.print_results()
"outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## anomaly detection grid search synthesized og data" + ] + }, + { + "cell_type": "code", + "execution_count": 210, + "metadata": { + "ExecuteTime": { + "end_time": "2018-04-20T07:13:23.749534Z", + "start_time": "2018-04-20T07:13:19.559711Z" + }, + "scrolled": true }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "----------------------------------------\n", + "overfit: False (array([False]), array([5]))\n", + "params: {'degree': 2, 'gamma': 'auto', 'kernel': 'poly', 'nu': 0.1}\n", + "avg acc: 0.7923061224489796\n", + "f1 score: 0.35557354925775975\n", + "auc: nan\n", + "----------------------------------------\n", + "overfit: False (array([False]), array([5]))\n", + "params: {'degree': 2, 'gamma': 'auto', 'kernel': 'poly', 'nu': 0.2}\n", + "avg acc: 0.8164659863945578\n", + "f1 score: 0.35058823529411764\n", + "auc: nan\n", + "----------------------------------------\n", + "overfit: False (array([False]), array([5]))\n", + "params: {'degree': 2, 'gamma': 'auto', 'kernel': 'poly', 'nu': 0.30000000000000004}\n", + "avg acc: 0.8450442176870748\n", + "f1 score: 0.3717733247145012\n", + "auc: nan\n", + "----------------------------------------\n", + "overfit: False (array([False]), array([5]))\n", + "params: {'degree': 2, 'gamma': 'auto', 'kernel': 'poly', 'nu': 0.4}\n", + "avg acc: 0.8817006802721089\n", + "f1 score: 0.4522144522144522\n", + "auc: nan\n", + "----------------------------------------\n", + "overfit: False (array([False]), array([5]))\n", + "params: {'degree': 2, 'gamma': 'auto', 'kernel': 'poly', 'nu': 0.5}\n", + "avg acc: 0.8900340136054423\n", + "f1 score: 0.4242424242424242\n", + "auc: nan\n", + "----------------------------------------\n", + "overfit: False (array([False]), array([5]))\n", + "params: {'degree': 2, 'gamma': 'auto', 'kernel': 'poly', 'nu': 0.6}\n", + "avg acc: 0.8900340136054423\n", + "f1 score: 
0.4242424242424242\n", + "auc: nan\n", + "----------------------------------------\n", + "overfit: False (array([False]), array([5]))\n", + "params: {'degree': 2, 'gamma': 'auto', 'kernel': 'poly', 'nu': 0.7000000000000001}\n", + "avg acc: 0.9021122448979592\n", + "f1 score: 0.3371428571428572\n", + "auc: nan\n", + "----------------------------------------\n", + "overfit: False (array([False]), array([5]))\n", + "params: {'degree': 3, 'gamma': 'auto', 'kernel': 'poly', 'nu': 0.1}\n", + "avg acc: 0.9387619047619047\n", + "f1 score: 0.6433333333333333\n", + "auc: nan\n", + "----------------------------------------\n", + "overfit: False (array([False]), array([5]))\n", + "params: {'degree': 3, 'gamma': 'auto', 'kernel': 'poly', 'nu': 0.2}\n", + "avg acc: 0.9305986394557824\n", + "f1 score: 0.5597402597402598\n", + "auc: nan\n", + "----------------------------------------\n", + "overfit: False (array([False]), array([5]))\n", + "params: {'degree': 3, 'gamma': 'auto', 'kernel': 'poly', 'nu': 0.30000000000000004}\n", + "avg acc: 0.934765306122449\n", + "f1 score: 0.5577777777777777\n", + "auc: nan\n", + "----------------------------------------\n", + "overfit: False (array([False]), array([5]))\n", + "params: {'degree': 3, 'gamma': 'auto', 'kernel': 'poly', 'nu': 0.4}\n", + "avg acc: 0.9429319727891157\n", + "f1 score: 0.5968253968253968\n", + "auc: nan\n", + "----------------------------------------\n", + "overfit: False (array([False]), array([5]))\n", + "params: {'degree': 3, 'gamma': 'auto', 'kernel': 'poly', 'nu': 0.5}\n", + "avg acc: 0.9347687074829931\n", + "f1 score: 0.5119047619047619\n", + "auc: nan\n", + "----------------------------------------\n", + "overfit: False (array([False]), array([5]))\n", + "params: {'degree': 3, 'gamma': 'auto', 'kernel': 'poly', 'nu': 0.6}\n", + "avg acc: 0.930687074829932\n", + "f1 score: 0.47619047619047616\n", + "auc: nan\n", + "----------------------------------------\n", + "overfit: False (array([False]), array([5]))\n", + 
"params: {'degree': 4, 'gamma': 'auto', 'kernel': 'poly', 'nu': 0.1}\n", + "avg acc: 0.9470102040816327\n", + "f1 score: 0.6411111111111111\n", + "auc: nan\n", + "----------------------------------------\n", + "overfit: False (array([False]), array([5]))\n", + "params: {'degree': 4, 'gamma': 'auto', 'kernel': 'poly', 'nu': 0.2}\n", + "avg acc: 0.934765306122449\n", + "f1 score: 0.5542857142857144\n", + "auc: nan\n", + "----------------------------------------\n", + "overfit: False (array([False]), array([5]))\n", + "params: {'degree': 4, 'gamma': 'auto', 'kernel': 'poly', 'nu': 0.30000000000000004}\n", + "avg acc: 0.9430136054421769\n", + "f1 score: 0.6253968253968254\n", + "auc: nan\n", + "----------------------------------------\n", + "overfit: False (array([False]), array([5]))\n", + "params: {'degree': 5, 'gamma': 'auto', 'kernel': 'poly', 'nu': 0.1}\n", + "avg acc: 0.9510102040816328\n", + "f1 score: 0.6553968253968254\n", + "auc: nan\n", + "----------------------------------------\n", + "overfit: False (array([False]), array([5]))\n", + "params: {'degree': 5, 'gamma': 'auto', 'kernel': 'poly', 'nu': 0.2}\n", + "avg acc: 0.9428469387755101\n", + "f1 score: 0.5863492063492064\n", + "auc: nan\n", + "----------------------------------------\n", + "overfit: False (array([False]), array([5]))\n", + "params: {'degree': 5, 'gamma': 'auto', 'kernel': 'poly', 'nu': 0.30000000000000004}\n", + "avg acc: 0.9469319727891156\n", + "f1 score: 0.6277777777777778\n", + "auc: nan\n", + "----------------------------------------\n", + "overfit: False (array([False]), array([5]))\n", + "params: {'degree': 6, 'gamma': 'auto', 'kernel': 'poly', 'nu': 0.1}\n", + "avg acc: 0.9429285714285716\n", + "f1 score: 0.6077777777777778\n", + "auc: nan\n", + "----------------------------------------\n", + "overfit: False (array([False]), array([5]))\n", + "params: {'degree': 6, 'gamma': 'auto', 'kernel': 'poly', 'nu': 0.2}\n", + "avg acc: 0.9430136054421769\n", + "f1 score: 
# One-class SVM (anomaly detection) grid search on the SMOTE-oversampled data.
param_grid = [
    # Integer division keeps the grid at exactly 0.1 ... 0.9; np.arange with
    # a float step drifts (0.30000000000000004 showed up in the results).
    {'nu': [step / 10 for step in range(1, 10)],
     'gamma': ['auto'], 'kernel': ['poly'],
     'degree': [2, 3, 4, 5, 6, 7]},
]
clf = OneClassSVM()
gscv = GridSearchCV(param_grid, clf, pos_label=1, cv=5)
gscv.fit(X_os, Y_os)
# OneClassSVM has no predict_proba, so the reported AUC is nan.
gscv.print_results()
"cell_type": "markdown", + "metadata": {}, "source": [ - "reverse_Y = Y.apply(lambda x: 0 if x == 1 else 1)" + "## ensemble grid search on og data" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 215, "metadata": { "ExecuteTime": { - "end_time": "2018-03-30T02:59:30.858463Z", - "start_time": "2018-03-30T02:59:24.546697Z" + "end_time": "2018-04-20T07:14:59.015194Z", + "start_time": "2018-04-20T07:14:58.429111Z" } }, "outputs": [ @@ -123,54 +728,144 @@ "name": "stdout", "output_type": "stream", "text": [ - "[ 0.92307692 0.8974359 0.8974359 0.84210526 0.92105263 0.89189189]\n", - "[ 0.92307692 0.8974359 0.8974359 0.84210526 0.92105263 0.89189189]\n" + "----------------------------------------\n", + "overfit: True (array([ True]), array([5]))\n", + "params: {'n_estimators': 10}\n", + "avg acc: 0.9609579607359441\n", + "f1 score: nan\n", + "auc: nan\n", + "----------------------------------------\n" ] + }, + { + "data": { + "text/plain": [ + "{'param_list_': [{'n_estimators': 10}],\n", + " 'cv': 5,\n", + " 'pos_label': 1,\n", + " '_clf': BaggingClassifier(base_estimator=SVC(C=10, cache_size=200, class_weight='balanced', coef0=0.0,\n", + " decision_function_shape='ovr', degree=3, gamma='auto', kernel='sigmoid',\n", + " max_iter=-1, probability=True, random_state=None, shrinking=True,\n", + " tol=0.001, verbose=False),\n", + " bootstrap=True, bootstrap_features=False, max_features=1.0,\n", + " max_samples=1.0, n_estimators=10, n_jobs=1, oob_score=False,\n", + " random_state=None, verbose=0, warm_start=False),\n", + " 'overfit_': [[True, True, True, True, True]],\n", + " 'accuracy_scores_': array([[0.95744681, 0.95652174, 0.95652174, 0.95652174, 0.97777778]]),\n", + " 'precision_': array([], shape=(1, 5, 0), dtype=float64),\n", + " 'recall_': array([], shape=(1, 5, 0), dtype=float64),\n", + " 'f1_scores_': array([[nan, nan, nan, nan, nan]]),\n", + " 'fpr_': array([], shape=(1, 5, 0), dtype=float64),\n", + " 'tpr_': array([], shape=(1, 5, 0), 
# Bagged sigmoid SVCs evaluated on the original (imbalanced) data.
params = {'C': 10, 'class_weight': 'balanced', 'degree': 3, 'gamma': 'auto', 'kernel': 'sigmoid', 'probability': True}
# Seed the bootstrap sampling so the fold scores are repeatable.
clf = BaggingClassifier(SVC(**params), random_state=44)
# No fit on the full data here: the search re-fits the estimator on every
# training fold, so a prior fit would be thrown away.
param_grid = [{'n_estimators': [10]}]
gscv = GridSearchCV(param_grid, clf, pos_label=1, cv=5)
gscv.fit(X, Y)
gscv.print_results(show_overfit=True)
print('-'*40)
vars(gscv)
0.9055555555555556\n", + "auc: 0.9775303030303031\n", + "----------------------------------------\n" + ] + }, { "data": { "text/plain": [ - "array([ 0.52036199, 12.77777778])" + "{'param_list_': [{'n_estimators': 10}],\n", + " 'cv': 5,\n", + " 'pos_label': 1,\n", + " '_clf': BaggingClassifier(base_estimator=SVC(C=10, cache_size=200, class_weight='balanced', coef0=0.0,\n", + " decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',\n", + " max_iter=-1, probability=True, random_state=None, shrinking=True,\n", + " tol=0.001, verbose=False),\n", + " bootstrap=True, bootstrap_features=False, max_features=1.0,\n", + " max_samples=1.0, n_estimators=10, n_jobs=1, oob_score=False,\n", + " random_state=None, verbose=0, warm_start=False),\n", + " 'overfit_': [[False, False, False, False, False]],\n", + " 'accuracy_scores_': array([[0.96 , 0.97959184, 1. , 1. , 0.97916667]]),\n", + " 'precision_': array([[array([0.1, 1. , 1. ]),\n", + " array([0.10204082, 1. , 1. ]), array([1., 1.]),\n", + " array([1., 1.]), array([0.8, 1. ])]], dtype=object),\n", + " 'recall_': array([[array([1. , 0.6, 0. ]), array([1. , 0.8, 0. ]), array([1., 0.]),\n", + " array([1., 0.]), array([1., 0.])]], dtype=object),\n", + " 'f1_scores_': array([[0.75 , 0.88888889, 1. , 1. , 0.88888889]]),\n", + " 'fpr_': array([[array([0. , 0. , 0.04444444, 0.04444444, 0.48888889,\n", + " 0.48888889, 1. ]),\n", + " array([0., 0., 1.]), array([0., 0., 1.]), array([0., 0., 1.]),\n", + " array([0. , 0. , 0.02272727, 0.02272727, 1. ])]],\n", + " dtype=object),\n", + " 'tpr_': array([[array([0.2, 0.6, 0.6, 0.8, 0.8, 1. , 1. ]),\n", + " array([0.2, 1. , 1. ]), array([0.2, 1. , 1. ]),\n", + " array([0.2, 1. , 1. ]), array([0.25, 0.75, 0.75, 1. , 1. ])]],\n", + " dtype=object),\n", + " 'auc_scores_': array([[0.89333333, 1. , 1. , 1. 
# Bagged RBF SVCs evaluated on the SMOTE-oversampled data.
params = {'C': 10, 'class_weight': 'balanced', 'gamma': 'auto', 'kernel': 'rbf', 'probability': True}
# Seed the bootstrap sampling so the fold scores are repeatable.
clf = BaggingClassifier(SVC(**params), random_state=44)
# The earlier fit on (X, Y) is dropped: it trained on the original data in
# the synthesized-data section and was overwritten by the search anyway.
param_grid = [{'n_estimators': [10]}]
gscv = GridSearchCV(param_grid, clf, pos_label=1, cv=5)
gscv.fit(X_os, Y_os)
gscv.print_results(show_overfit=True)
print('-'*40)
vars(gscv)