Skip to content

Commit

Permalink
FIX: change dataset loading source from OpenML to Kaggle as OpenML is…
Browse files Browse the repository at this point in the history
… down as of 14/01/25
  • Loading branch information
Valentin-Laurent committed Jan 14, 2025
1 parent 7736558 commit 03232b4
Showing 1 changed file with 24 additions and 7 deletions.
31 changes: 24 additions & 7 deletions examples/regression/1-quickstart/plot_compare_conformity_scores.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
Here we use the OpenML house_prices dataset:
https://www.openml.org/search?type=data&sort=runs&id=42165&status=active.
Note: OpenML was down as of 14 January 2025, so the data is loaded from Kaggle instead.
The data is modelled by a Random Forest model
:class:`~sklearn.ensemble.RandomForestRegressor` with a fixed parameter set.
The prediction intervals are determined by means of the MAPIE regressor
Expand All @@ -31,7 +33,10 @@
"""
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import fetch_openml
import requests
import zipfile
import io
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

Expand All @@ -43,12 +48,14 @@

# Parameters
# NOTE(review): this diff view lists each feature twice — first spelled
# without spaces, then with spaces. Presumably the first five are the
# OpenML-era names being removed and the last five are the column names of
# the Kaggle CSV; confirm that only one set is present in the actual file.
features = [
"MSSubClass",
"LotArea",
"OverallQual",
"OverallCond",
"GarageArea",
"MS SubClass",
"Lot Area",
"Overall Qual",
"Overall Cond",
"Garage Area",
]
# Column of the loaded table that is used as the regression target ``y``.
target = "SalePrice"

# Presumably the miscoverage level handed to MAPIE — TODO confirm.
alpha = 0.05
# Keyword arguments for the random forest; ``random_state`` is defined
# outside the lines visible in this diff.
rf_kwargs = {"n_estimators": 10, "random_state": random_state}
model = RandomForestRegressor(**rf_kwargs)
Expand All @@ -63,7 +70,17 @@
# in such cases.
# Two sub datasets are extracted: the training and test ones.

X, y = fetch_openml(name="house_prices", return_X_y=True)
# The archive is downloaded from Kaggle's dataset endpoint and unpacked
# entirely in memory; nothing is written to disk.
dataset_url = (
    "https://www.kaggle.com"
    "/api/v1/datasets/download/shashanknecrothapa/ames-housing-dataset"
)
# A timeout keeps the example from hanging forever when the host is
# unreachable. The body is read in full right below, so streaming the
# response would bring nothing.
r = requests.get(dataset_url, timeout=30)
# Fail loudly on an HTTP error instead of passing an error page to
# ``zipfile``, which would only surface as an obscure ``BadZipFile``.
r.raise_for_status()
with zipfile.ZipFile(io.BytesIO(r.content)) as z:
    with z.open("AmesHousing.csv") as file:
        data = pd.read_csv(file)

# Keep only the selected feature columns and the target column.
X = data[features]
y = data[target]

X_train, X_test, y_train, y_test = train_test_split(
X[features], y, test_size=0.2, random_state=random_state
Expand Down

0 comments on commit 03232b4

Please sign in to comment.