# utils.py
import typing
from dataclasses import dataclass
from typing import Any, Dict, Literal, Union

import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer, load_diabetes, load_digits, load_iris
from sklearn.model_selection import train_test_split
from typing_extensions import override


def serialize_to_csv_formatted_bytes(
    data: typing.Union[pd.DataFrame, pd.Series, np.ndarray],
) -> bytes:
    if not isinstance(data, (pd.DataFrame, pd.Series, np.ndarray)):
        raise TypeError(f"({type(data)}) is not supported for serialization")
    if isinstance(data, np.ndarray):
        data = pd.DataFrame(data)
    # data is now a pd.DataFrame or pd.Series; both implement to_csv
    csv_bytes = data.to_csv(index=False).encode("utf-8")
    return csv_bytes
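
# Illustrative usage (not part of the original module); the expected bytes
# follow from pandas' to_csv with index=False:
#
#   >>> serialize_to_csv_formatted_bytes(pd.DataFrame({"a": [1, 2]}))
#   b'a\n1\n2\n'
#   >>> serialize_to_csv_formatted_bytes(np.array([[1, 2], [3, 4]]))
#   b'0,1\n1,2\n3,4\n'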


FileCategory = str
FileName = str
FileContent = bytes
FileUpload = typing.Tuple[FileCategory, FileName, FileContent]


def to_httpx_post_file_format(file_uploads: typing.List[FileUpload]) -> typing.Dict:
    # httpx expects a mapping from form field name to a (filename, content)
    # pair, while each upload arrives as a (category, filename, content) triple.
    ret = {}
    for file_category, filename, content in file_uploads:
        ret[file_category] = (filename, content)
    return ret
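
# Illustrative usage (not part of the original module); the field name
# "x_file" is a hypothetical example:
#
#   >>> to_httpx_post_file_format([("x_file", "x.csv", b"a\n1\n")])
#   {'x_file': ('x.csv', b'a\n1\n')}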


def to_oauth_request_form(username: str, password: str) -> typing.Dict[str, str]:
    return {"grant_type": "password", "username": username, "password": password}


class Singleton:
    """Marker class that can never be instantiated."""

    def __new__(cls):
        raise TypeError("Cannot instantiate this class. This is a singleton.")
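
# Illustrative usage (not part of the original module):
#
#   >>> Singleton()
#   Traceback (most recent call last):
#     ...
#   TypeError: Cannot instantiate this class. This is a singleton.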


def get_example_dataset(
    dataset_name: typing.Literal["iris", "breast_cancer", "digits", "diabetes"],
) -> typing.Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    load_dataset_fn = {
        "iris": load_iris,
        "breast_cancer": load_breast_cancer,
        "digits": load_digits,
        "diabetes": load_diabetes,
    }
    x_train, y_train = load_dataset_fn[dataset_name](return_X_y=True, as_frame=True)
    # Shuffle, then keep 10 examples. Shuffling is needed because a contiguous
    # slice might contain only one class; a fixed seed keeps it reproducible.
    rng = np.random.RandomState(46)
    indices = rng.permutation(len(x_train))[:10]
    x_train = x_train.iloc[indices]
    y_train = y_train.iloc[indices]
    x_train, x_test, y_train, y_test = train_test_split(
        x_train, y_train, test_size=0.33, random_state=42
    )
    return x_train, x_test, y_train, y_test
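
# Illustrative usage (not part of the original module). With 10 examples and
# test_size=0.33, sklearn rounds the test split up to 4 rows, leaving 6 for
# training; iris has 4 feature columns:
#
#   >>> x_train, x_test, y_train, y_test = get_example_dataset("iris")
#   >>> x_train.shape, x_test.shape
#   ((6, 4), (4, 4))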


def get_dataset_with_specific_size(
    num_examples: int = 10_000, num_columns: int = 100
) -> typing.Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    x_train = np.random.RandomState(42).rand(num_examples, num_columns)
    y_train = np.random.RandomState(42).randint(0, 2, size=num_examples)
    # The same arrays serve as both train and test split; only the size of the
    # data matters here, not generalization.
    return x_train, x_train, y_train, y_train
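
# Illustrative usage (not part of the original module):
#
#   >>> x_train, x_test, y_train, y_test = get_dataset_with_specific_size(100, 5)
#   >>> x_train.shape, y_train.shape
#   ((100, 5), (100,))
#   >>> x_train is x_test
#   True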


def assert_y_pred_proba_is_valid(x_test, y_pred_proba):
    if isinstance(y_pred_proba, list):
        y_pred_proba = np.array(y_pred_proba)
    proba_shape = y_pred_proba.shape
    # One row of class probabilities per test example, at least two classes,
    # and each row must sum to 1.
    assert proba_shape[0] == len(x_test)
    assert proba_shape[1] >= 2
    assert np.allclose(y_pred_proba.sum(axis=1), np.ones(proba_shape[0]))
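
# Illustrative usage (not part of the original module):
#
#   >>> assert_y_pred_proba_is_valid(
#   ...     x_test=np.zeros((2, 3)),
#   ...     y_pred_proba=[[0.9, 0.1], [0.2, 0.8]],
#   ... )  # passes silently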


@dataclass
class PreprocessorConfig:
    """Configuration for data preprocessors.

    Attributes:
        name: Name of the preprocessor.
        categorical_name:
            Name of the categorical encoding method.
            Options: "none", "numeric", "onehot", "ordinal", "ordinal_shuffled",
            "ordinal_very_common_categories_shuffled".
        append_original: Whether to append original features to the transformed features.
        subsample_features: Fraction of features to subsample. -1 means no subsampling.
        global_transformer_name: Name of the global transformer to use.
    """

    name: Literal[
        "per_feature",  # a different transformation for each feature
        "power",  # a standard sklearn power transformer
        "safepower",  # a power transformer that prevents some numerical issues
        "power_box",
        "safepower_box",
        "quantile_uni_coarse",  # quantile transformations with few quantiles up to many
        "quantile_norm_coarse",
        "quantile_uni",
        "quantile_norm",
        "quantile_uni_fine",
        "quantile_norm_fine",
        "robust",  # a standard sklearn robust scaler
        "kdi",
        "none",  # no transformation (only standardization in transformer)
        "kdi_random_alpha",
        "kdi_uni",
        "kdi_random_alpha_uni",
        "adaptive",
        "norm_and_kdi",
        # KDI with alpha collection
        "kdi_alpha_0.3_uni",
        "kdi_alpha_0.5_uni",
        "kdi_alpha_0.8_uni",
        "kdi_alpha_1.0_uni",
        "kdi_alpha_1.2_uni",
        "kdi_alpha_1.5_uni",
        "kdi_alpha_2.0_uni",
        "kdi_alpha_3.0_uni",
        "kdi_alpha_5.0_uni",
        "kdi_alpha_0.3",
        "kdi_alpha_0.5",
        "kdi_alpha_0.8",
        "kdi_alpha_1.0",
        "kdi_alpha_1.2",
        "kdi_alpha_1.5",
        "kdi_alpha_2.0",
        "kdi_alpha_3.0",
        "kdi_alpha_5.0",
    ]
    categorical_name: Literal[
        # "none": categorical features are treated as ordinal, just not resorted
        "none",
        # "numeric": categorical features are treated as numeric, which means
        # they are also power transformed, for example
        "numeric",
        # "onehot": categorical features are one-hot encoded
        "onehot",
        # "ordinal": categorical features are sorted and encoded as
        # integers from 0 to n_categories - 1
        "ordinal",
        # "ordinal_shuffled": categorical features are encoded as integers
        # from 0 to n_categories - 1 in a random order
        "ordinal_shuffled",
        "ordinal_very_common_categories_shuffled",
    ] = "none"
    append_original: bool = False
    subsample_features: float = -1
    global_transformer_name: Union[str, None] = None

    @override
    def __str__(self) -> str:
        return (
            f"{self.name}_cat:{self.categorical_name}"
            + ("_and_none" if self.append_original else "")
            + (
                f"_subsample_feats_{self.subsample_features}"
                if self.subsample_features > 0
                else ""
            )
            + (
                f"_global_transformer_{self.global_transformer_name}"
                if self.global_transformer_name is not None
                else ""
            )
        )

    def to_dict(self) -> Dict[str, Any]:
        """Convert the PreprocessorConfig instance to a dictionary.

        Returns:
            Dict[str, Any]: Dictionary containing the configuration parameters.
        """
        return {
            "name": self.name,
            "categorical_name": self.categorical_name,
            "append_original": self.append_original,
            "subsample_features": self.subsample_features,
            "global_transformer_name": self.global_transformer_name,
        }
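

# Illustrative usage (not part of the original module). With the defaults,
# only the preprocessor name and categorical encoding appear in the string
# form:
#
#   >>> str(PreprocessorConfig(name="power"))
#   'power_cat:none'
#   >>> PreprocessorConfig(name="robust", subsample_features=0.5).to_dict()
#   {'name': 'robust', 'categorical_name': 'none', 'append_original': False,
#    'subsample_features': 0.5, 'global_transformer_name': None}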