From 03232b4cf1f31a78a2c0910000d16125b335ad40 Mon Sep 17 00:00:00 2001 From: Valentin Laurent Date: Tue, 14 Jan 2025 14:16:42 +0100 Subject: [PATCH] FIX: change dataset loading source from OpenML to Kaggle as OpenML is down as of 14/01/25 --- .../plot_compare_conformity_scores.py | 31 ++++++++++++++----- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/examples/regression/1-quickstart/plot_compare_conformity_scores.py b/examples/regression/1-quickstart/plot_compare_conformity_scores.py index e4b79c701..1dd0fc79a 100644 --- a/examples/regression/1-quickstart/plot_compare_conformity_scores.py +++ b/examples/regression/1-quickstart/plot_compare_conformity_scores.py @@ -9,6 +9,8 @@ We use here the OpenML house_prices dataset: https://www.openml.org/search?type=data&sort=runs&id=42165&status=active. +Note: OpenML is down as of 14/01/25, so we load the data from Kaggle instead. + The data is modelled by a Random Forest model :class:`~sklearn.ensemble.RandomForestRegressor` with a fixed parameter set. The prediction intervals are determined by means of the MAPIE regressor @@ -31,7 +33,10 @@ """ import matplotlib.pyplot as plt import numpy as np -from sklearn.datasets import fetch_openml +import requests +import zipfile +import io +import pandas as pd from sklearn.ensemble import RandomForestRegressor from sklearn.model_selection import train_test_split @@ -43,12 +48,14 @@ # Parameters features = [ - "MSSubClass", - "LotArea", - "OverallQual", - "OverallCond", - "GarageArea", + "MS SubClass", + "Lot Area", + "Overall Qual", + "Overall Cond", + "Garage Area", ] +target = "SalePrice" + alpha = 0.05 rf_kwargs = {"n_estimators": 10, "random_state": random_state} model = RandomForestRegressor(**rf_kwargs) @@ -63,7 +70,17 @@ # in such cases. # Two sub datasets are extracted: the training and test ones. 
-X, y = fetch_openml(name="house_prices", return_X_y=True) +dataset_url = ( + "https://www.kaggle.com" + + "/api/v1/datasets/download/shashanknecrothapa/ames-housing-dataset" +) +r = requests.get(dataset_url, stream=True) +with zipfile.ZipFile(io.BytesIO(r.content)) as z: + with z.open("AmesHousing.csv") as file: + data = pd.read_csv(file) + +X = data[features] +y = data[target] X_train, X_test, y_train, y_test = train_test_split( X[features], y, test_size=0.2, random_state=random_state