-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathreader.py
80 lines (59 loc) · 4.24 KB
/
reader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import pandas as pd
from typing import List, Tuple
def read_dataset(dataset: str, config: str) -> Tuple[List[str], List[List[float]], List[List[float]], List[float], List[float]]:
    """
    Read and process dataset files based on the specified dataset and configuration.

    The dataset name is validated *before* any file is opened, so an unsupported
    name always raises ``ValueError`` (as documented below) instead of surfacing
    as a ``FileNotFoundError`` from the CSV reader, and an unvalidated name is
    never interpolated into a file path.

    Args:
        dataset (str): The name of the dataset to read ('Insurance_Cost',
            'Admission_Chance', or 'Used_Car_Prices'). The files
            ``Datasets/<dataset>/incontext.csv`` and ``Datasets/<dataset>/test.csv``
            are read using these relative paths.
        config (str): Configuration string. If it contains the substring
            'Anonymized_Features', the feature names are replaced by
            'Feature0', 'Feature1', ... and the target name by 'Output'.

    Returns:
        Tuple[List[str], List[List[float]], List[List[float]], List[float], List[float]]:
            - Names: List of feature names and target variable name
            - X_incontext: List of feature vectors for in-context examples
            - X_test: List of feature vectors for test examples
            - Y_incontext: List of target values for in-context examples
            - Y_test: List of target values for test examples

    Raises:
        ValueError: If an unknown dataset name is provided.
    """
    dataset_processors = {
        "Insurance_Cost": process_insurance_cost,
        "Admission_Chance": process_admission_chance,
        "Used_Car_Prices": process_used_car_prices,
    }

    # Validate first: fail fast with the documented exception type, before
    # touching the filesystem.
    processor = dataset_processors.get(dataset)
    if processor is None:
        raise ValueError(f"Unknown dataset: {dataset}")

    df_incontext = pd.read_csv(f"Datasets/{dataset}/incontext.csv")
    df_test = pd.read_csv(f"Datasets/{dataset}/test.csv")

    Names, X_incontext, X_test, Y_incontext, Y_test = processor(df_incontext, df_test)

    if "Anonymized_Features" in config:
        # The last entry of Names is the target; every other entry is a feature.
        Names = [f"Feature{i}" for i in range(len(Names) - 1)] + ["Output"]

    return Names, X_incontext, X_test, Y_incontext, Y_test
def process_insurance_cost(
    df_incontext: pd.DataFrame, df_test: pd.DataFrame
) -> Tuple[List[str], List[List[float]], List[List[float]], List[float], List[float]]:
    """Process the Insurance Cost dataset.

    Args:
        df_incontext: Frame holding the in-context examples.
        df_test: Frame holding the test examples.

    Both frames are read through the columns 'smoker', 'bmi', 'age' and
    'charges'.

    Returns:
        A tuple ``(Names, X_incontext, X_test, Y_incontext, Y_test)`` where
        each feature vector is ``[smoker, bmi, age]`` (smoker is 1.0 when the
        column equals "yes", else 0.0) and each target is the 'charges' value
        as a float. ``Names`` lists the three feature names followed by the
        target description.
    """

    def feature_rows(frame: pd.DataFrame) -> List[List[float]]:
        # One [smoker-flag, bmi, age] vector per row, in row order.
        vectors = []
        for _, record in frame.iterrows():
            is_smoker = record['smoker'] == "yes"
            vectors.append([float(is_smoker), float(record['bmi']), float(record['age'])])
        return vectors

    def target_values(frame: pd.DataFrame) -> List[float]:
        # The billed charges of every row, in row order.
        charges = []
        for _, record in frame.iterrows():
            charges.append(float(record['charges']))
        return charges

    Names = ['smoker', 'bmi', 'age', "annual individual medical costs billed by health insurance in the USA"]

    X_incontext = feature_rows(df_incontext)
    Y_incontext = target_values(df_incontext)
    X_test = feature_rows(df_test)
    Y_test = target_values(df_test)

    return Names, X_incontext, X_test, Y_incontext, Y_test
def process_admission_chance(
    df_incontext: pd.DataFrame, df_test: pd.DataFrame
) -> Tuple[List[str], List[List[float]], List[List[float]], List[float], List[float]]:
    """Process the Admission Chance dataset.

    Args:
        df_incontext: Frame holding the in-context examples.
        df_test: Frame holding the test examples.

    Both frames are read through the columns 'cgpa', 'gre_score',
    'TOEFL Score' and 'Chance of Admit'.

    Returns:
        A tuple ``(Names, X_incontext, X_test, Y_incontext, Y_test)`` where
        each feature vector is ``[cgpa, gre_score, TOEFL Score]`` as floats.
        Test targets are rounded to two decimals; in-context targets are
        returned unrounded.
    """
    feature_columns = ('cgpa', 'gre_score', 'TOEFL Score')
    target_column = 'Chance of Admit'

    def feature_rows(frame: pd.DataFrame) -> List[List[float]]:
        # One float vector per row, columns taken in feature_columns order.
        vectors = []
        for _, record in frame.iterrows():
            vectors.append([float(record[column]) for column in feature_columns])
        return vectors

    # NOTE: the target label's spelling is kept exactly as the original emits it.
    Names = ['Cumulative GPA', 'GRE Score', 'TOEFL Score', "Chance of Admition"]

    X_incontext = feature_rows(df_incontext)
    Y_incontext = []
    for _, record in df_incontext.iterrows():
        Y_incontext.append(float(record[target_column]))

    X_test = feature_rows(df_test)
    Y_test = []
    for _, record in df_test.iterrows():
        # Only the test targets are rounded (two decimal places).
        Y_test.append(round(float(record[target_column]), 2))

    return Names, X_incontext, X_test, Y_incontext, Y_test
def process_used_car_prices(
    df_incontext: pd.DataFrame, df_test: pd.DataFrame
) -> Tuple[List[str], List[List[float]], List[List[float]], List[float], List[float]]:
    """Process the Used Car Prices dataset.

    Args:
        df_incontext: Frame holding the in-context examples.
        df_test: Frame holding the test examples.

    Both frames are read through the columns 'city_fuel_economy', 'mileage',
    'franchise_make', 'passenger_car' and 'price'. Every value is passed
    through ``float()`` as-is (presumably 'franchise_make' and 'passenger_car'
    are already numeric flags in the CSVs; verify against the data files).

    Returns:
        A tuple ``(Names, X_incontext, X_test, Y_incontext, Y_test)`` where
        each feature vector is
        ``[city_fuel_economy, mileage, franchise_make, passenger_car]`` and
        each target is the 'price' value as a float.
    """
    feature_columns = ('city_fuel_economy', 'mileage', 'franchise_make', 'passenger_car')

    def feature_rows(frame: pd.DataFrame) -> List[List[float]]:
        # One float vector per row, columns taken in feature_columns order.
        vectors = []
        for _, record in frame.iterrows():
            vectors.append([float(record[column]) for column in feature_columns])
        return vectors

    def prices(frame: pd.DataFrame) -> List[float]:
        # The 'price' of every row, in row order.
        values = []
        for _, record in frame.iterrows():
            values.append(float(record['price']))
        return values

    Names = [
        'City Fuel Economy',
        'Mileage',
        'Is Toyota',
        "Is Small (ex. Sedan is small and SUV is large)",
        "the price of a used car that can be either a Toyota or Maserati in 2019",
    ]

    X_incontext = feature_rows(df_incontext)
    Y_incontext = prices(df_incontext)
    X_test = feature_rows(df_test)
    Y_test = prices(df_test)

    return Names, X_incontext, X_test, Y_incontext, Y_test