regressors_R.py
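
# Trains meta-regressors that predict each classifier's performance scores
# from dataset meta-features, evaluates every regressor with 10-fold
# cross-validation, pickles each fold's model under regressors/, and records
# the held-out mean squared error in the meta-database.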
import sys
import os
import pandas as pd
import numpy as np
import pickle
from sklearn import svm, linear_model, discriminant_analysis, neighbors
from sklearn import tree, naive_bayes, ensemble, neural_network, gaussian_process
from sklearn.model_selection import cross_validate, KFold
from sklearn import metrics
from sklearn import preprocessing
import constants
from Default import Default
from Random import Random
from meta_db.db.DBHelper import DBHelper
from R_Model import *
# Importing utils from R
from rpy2.robjects.packages import importr
import rpy2.robjects.packages as rpackages
from rpy2.robjects.vectors import StrVector
from rpy2.robjects.packages import SignatureTranslatedAnonymousPackage
# For Formulae
from rpy2.robjects import IntVector, Formula
# For Pandas
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects import numpy2ri
from rpy2.robjects.conversion import localconverter
db = DBHelper()
SCORE_COLUMNS = ["name"] + constants.CLASSIFIERS
SCORES = ["mean_squared_error"]
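
# Load the meta-database tables into DataFrames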
metadata = pd.DataFrame(db.get_all_metadata(), columns = db.metadata_columns()).drop("id", axis = 1)
models = pd.DataFrame(db.get_all_models(), columns = db.models_columns()).drop("id", axis = 1)
combinations = pd.DataFrame(db.get_all_combinations(), columns = db.combinations_columns())
preperformance = pd.DataFrame(db.get_all_preperformance(), columns = db.preperformance_columns()).drop("id", axis = 1)
# Drop preperformance rows that contain missing values
preperformance = preperformance[~preperformance.isnull().any(axis = 1)]
preperformance = pd.merge(preperformance, combinations, left_on = "combination_id", right_on = "id").drop(["combination_id", "id", "num_preprocesses"], axis = 1)
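
# Model results obtained without preprocessing are tagged "None" so they
# align with the preprocessed (preperformance) rows when concatenated below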
models = models.rename(columns = {"model": "classifier"})
models["preprocesses"] = "None"
scores = pd.concat([models, preperformance], sort = False)
scores = scores[scores.preprocesses.isin(constants.PRE_PROCESSES + ["None"]) & scores.classifier.isin(constants.CLASSIFIERS)]
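
# Impute missing meta-feature values with the per-column mean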
metadata_means = {feature: np.mean(metadata[feature]) for feature in metadata.columns if feature != "name"}
metadata.fillna(value = metadata_means, inplace = True)
data = pd.merge(metadata, scores, on = "name")
data = data[data.preprocesses.isin(constants.PRE_PROCESSES + ["None"]) & data.classifier.isin(constants.CLASSIFIERS)]
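
# Labels for every preprocessing+classifier combination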
combinations_strings = ["{}+{}".format(pp, clf) for pp in ["None"] + constants.PRE_PROCESSES
                                                for clf in constants.CLASSIFIERS]
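
# Candidate meta-regressors: R models wrapped through rpy2, the SVR and KNN
# (dwnn) wrappers from R_Model, and the Random and Default baselines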
reg_models = {
    "ann": lambda: R_Model(neuralnet.neuralnet),
    "cart": lambda: R_Model(rpart.rpart),
    "randomForest": lambda: R_Model(randomForest.randomForest),
    "svm": lambda: SVR(),
    "dwnn": lambda: KNN(),
    "random": lambda: Random(random_seed = constants.RANDOM_STATE),
    "default": lambda: Default(),
}
mean_scores = [score + "_mean" for score in constants.CLASSIFIERS_SCORES]
std_scores = [score + "_std" for score in constants.CLASSIFIERS_SCORES]
if not os.path.exists("regressors"):
    os.makedirs("regressors")
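
# 10-fold split with a fixed seed so runs are reproducible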
divideFold = KFold(10, random_state = constants.RANDOM_STATE, shuffle = True)
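
# For each classifier/score pair: train every regressor on the meta-features
# across the 10 folds, score it on the held-out fold, pickle the fold's
# model, and log the result in the database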
targets = {}
for clf in constants.CLASSIFIERS:
    # Target rows: this classifier's scores on the raw datasets (no preprocessing)
    targets[clf] = data[(data.classifier == clf) & (data.preprocesses == "None")]
    for score in constants.CLASSIFIERS_SCORES:
        target = targets[clf][score + "_mean"].values
        # Features are the dataset meta-features only; drop identifiers and score columns
        values = targets[clf].drop(["name", "classifier", "preprocesses", *mean_scores, *std_scores], axis = 1)
        for reg in reg_models:
            count_models = 0
            for train_indx, test_indx in divideFold.split(values):
                model = reg_models[reg]()
                model.fit(values.iloc[train_indx], target[train_indx])
                result_labels = ["name", "classifier", "score", "model_id"]
                results = [reg, clf, score, count_models]
                for reg_score in SCORES:
                    result_labels.append(reg_score)
                    result = getattr(metrics, reg_score)(target[test_indx], model.predict(values.iloc[test_indx]))
                    results.append(result)
                with open("regressors/{}_{}_{}_{}.pickle".format(
                        reg, clf, score, count_models), "wb") as model_file:
                    pickle.dump(model, model_file)
                count_models += 1
                db.add_regressor_record(result_labels, results)
            print("- Finished with {} {} {}".format(reg, score, clf))
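
# Usage sketch (an assumption, not part of this script): a saved regressor at
# "regressors/<reg>_<clf>_<score>_<fold>.pickle" can be reloaded with
# pickle.load(open(path, "rb")) and queried via model.predict(meta_features),
# where meta_features has the same columns as `values` above.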