regressors_pp_R.py
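# Trains meta-regressors that predict a classifier/preprocessor combination's
# performance scores from dataset meta-features, evaluating each regressor with
# 10-fold cross-validation and persisting the fitted models and their metrics.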
import sys
import os
import pandas as pd
import numpy as np
import pickle
from sklearn import svm, linear_model, discriminant_analysis, neighbors
from sklearn import tree, naive_bayes, ensemble, neural_network, gaussian_process
from sklearn.model_selection import cross_validate, KFold
from sklearn import metrics
from sklearn import preprocessing
import constants
from Default import Default
from Random import Random
from meta_db.db.DBHelper import DBHelper
from R_Model import *
# Importing utils from R
from rpy2.robjects.packages import importr
import rpy2.robjects.packages as rpackages
from rpy2.robjects.vectors import StrVector
from rpy2.robjects.packages import SignatureTranslatedAnonymousPackage
# For Formulae
from rpy2.robjects import IntVector, Formula
# For Pandas
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects import numpy2ri
from rpy2.robjects.conversion import localconverter
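# Database helper and the sklearn regression metrics computed on each test fold.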
db = DBHelper()
SCORE_COLUMNS = ["name"] + constants.CLASSIFIERS
SCORES = ["max_error", "mean_absolute_error", "r2_score", "median_absolute_error", "mean_squared_error"]
# SCORES = ["mean_squared_error"]
metadata = pd.DataFrame(db.get_all_metadata(), columns = db.metadata_columns()).drop("id", axis = 1)
models = pd.DataFrame(db.get_all_models(), columns = db.models_columns()).drop("id", axis = 1)
combinations = pd.DataFrame(db.get_all_combinations(), columns = db.combinations_columns())
preperformance = pd.DataFrame(db.get_all_preperformance(), columns = db.preperformance_columns()).drop("id", axis = 1)
# Keep only pre-performance rows with no missing values
preperformance = preperformance[~preperformance.isnull().any(axis = 1)]
preperformance = pd.merge(preperformance, combinations, left_on = "combination_id", right_on = "id").drop(["combination_id", "id", "num_preprocesses"], axis = 1)
models = models.rename(columns = {"model": "classifier"})
models["preprocesses"] = "None"
scores = pd.concat([models, preperformance], sort = False)
scores = scores[scores.preprocesses.isin(constants.PRE_PROCESSES + ["None"]) & scores.classifier.isin(constants.CLASSIFIERS)]
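# Impute missing meta-feature values with the per-column mean.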
metadata_means = {feature: np.mean(metadata[feature]) for feature in metadata.columns if feature != "name"}
metadata.fillna(value = metadata_means, inplace = True)
data = pd.merge(metadata, scores, on = "name")
data = data[data.preprocesses.isin(constants.PRE_PROCESSES + ["None"]) & data.classifier.isin(constants.CLASSIFIERS)]
combinations_strings = ["{}+{}".format(pp, clf) for pp in ["None"] + constants.PRE_PROCESSES
                                                for clf in constants.CLASSIFIERS]
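# Registry of regressors: R models (neuralnet, rpart, randomForest) wrapped by
# R_Model, plus SVR/KNN wrappers and the Random/Default baselines. The wrapper
# classes are assumed to come from the star-import of R_Model above.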
reg_models = {}
reg_models["ann"] = lambda: R_Model(neuralnet.neuralnet)
reg_models["cart"] = lambda: R_Model(rpart.rpart)
reg_models["randomForest"] = lambda: R_Model(randomForest.randomForest)
reg_models["svm"] = lambda: SVR()
reg_models["dwnn"] = lambda: KNN()
reg_models["random"] = lambda: Random(random_seed = constants.RANDOM_STATE)
reg_models["default"] = lambda: Default()
mean_scores = []
std_scores = []
for score in constants.CLASSIFIERS_SCORES:
    mean_scores.append(score + "_mean")
    std_scores.append(score + "_std")
if not os.path.exists("regressors"):
    os.makedirs("regressors")
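# 10-fold CV with a fixed seed so fold assignments are reproducible.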
divideFold = KFold(10, random_state = constants.RANDOM_STATE, shuffle = True)
targets = {}
for clf in constants.CLASSIFIERS:
    for preprocess in constants.PRE_PROCESSES:
        # Target values for this classifier/preprocessor combination.
        combination = combinations[(combinations.classifier == clf) & (combinations.preprocesses == preprocess)]
        # setdefault avoids wiping entries from earlier preprocess iterations.
        targets.setdefault(clf, {})
        targets[clf][preprocess] = data[(data.classifier == clf) & (data.preprocesses == preprocess)]
        for score in constants.CLASSIFIERS_SCORES:
            target = targets[clf][preprocess][score + "_mean"].values
            # Feature matrix: meta-features only, with identifiers and the
            # per-score CV statistics dropped.
            values = targets[clf][preprocess].drop(["name", "classifier", "preprocesses", *mean_scores, *std_scores], axis = 1)
            for reg in reg_models.keys():
                count_models = 0
                for train_indx, test_indx in divideFold.split(values):
                    model = reg_models[reg]()
                    model.fit(values.iloc[train_indx], target[train_indx])
                    # Predict once per fold and score the predictions with every metric.
                    predictions = model.predict(values.iloc[test_indx])
                    results = []
                    result_labels = []
                    result_labels.append("name"); results.append(reg)
                    # Assumes exactly one matching combination row.
                    result_labels.append("combination_id"); results.append(int(combination.id.iloc[0]))
                    result_labels.append("score"); results.append(score)
                    result_labels.append("model_id"); results.append(count_models)
                    for reg_score in SCORES:
                        result_labels.append(reg_score)
                        results.append(getattr(metrics, reg_score)(target[test_indx], predictions))
                    # Persist the fitted fold model alongside its database record.
                    with open("regressors/{}_{}_{}_{}_{}.pickle".format(
                            reg, clf, preprocess, score, count_models), "wb") as model_file:
                        pickle.dump(model, model_file)
                    count_models += 1
                    db.add_regressor_preperformance_record(result_labels, results)
                print("- Finished with {} {} {} {}".format(reg, score, clf, preprocess))