diff --git a/docs/refs.bib b/docs/refs.bib index c0aedacb8..005bc0112 100644 --- a/docs/refs.bib +++ b/docs/refs.bib @@ -107,6 +107,21 @@ @article{berrendero++_2022_functional keywords = {62J12,62R10,Functional data,Kernel methods in statistics,Logistic regression,Mathematics - Statistics Theory,Reproducing kernel Hilbert spaces} } +@article{borggaard+thodberg_1992_optimal, + title = {Optimal Minimal Neural Interpretation of Spectra}, + author = {Borggaard, {\relax Claus}. and Thodberg, Hans Henrik.}, + year = {1992}, + month = mar, + journal = {Analytical Chemistry}, + volume = {64}, + number = {5}, + pages = {545--551}, + issn = {0003-2700}, + doi = {10.1021/ac00029a018}, + url = {https://doi.org/10.1021/ac00029a018}, + urldate = {2018-12-09} +} + @article{cuesta-albertos++_2017_ddgclassifier, title = {The {{DDᴳ}}-Classifier in the Functional Setting}, author = {{Cuesta-Albertos}, J. A. and {Febrero-Bande}, M. and {Oviedo~de~la~Fuente}, M.}, @@ -293,6 +308,26 @@ @article{ghosh+chaudhuri_2005_maximum keywords = {Bayes risk,cross-validation,data depth,elliptic symmetry,kernel density estimation,location shift model,Mahalanobis distance,misclassification rate,Vapnik Chervonenkis dimension} } +@article{hastie++_1995_penalized, + title = {Penalized Discriminant Analysis}, + author = {Hastie, Trevor and Buja, Andreas and Tibshirani, Robert}, + year = {1995}, + month = feb, + journal = {The Annals of Statistics}, + volume = {23}, + number = {1}, + pages = {73--102}, + issn = {0090-5364, 2168-8966}, + doi = {10.1214/aos/1176324456}, + url = {https://projecteuclid.org/euclid.aos/1176324456}, + urldate = {2018-12-09}, + abstract = {Fisher's linear discriminant analysis (LDA) is a popular data-analytic tool for studying the relationship between a set of predictors and a categorical response. In this paper we describe a penalized version of LDA. It is designed for situations in which there are many highly correlated predictors, such as those obtained by discretizing a function, or the grey-scale values of the pixels in a series of images. In cases such as these it is natural, efficient and sometimes essential to impose a spatial smoothness constraint on the coefficients, both for improved prediction performance and interpretability. We cast the classification problem into a regression framework via optimal scoring. Using this, our proposal facilitates the use of any penalized regression technique in the classification setting. The technique is illustrated with examples in speech recognition and handwritten character recognition.}, + langid = {english}, + mrnumber = {MR1331657}, + zmnumber = {0821.62031}, + keywords = {discrimination,regularization,Signal and image classification} +} + @article{li++_2012_ddclassifier, title = {{{DD-Classifier}}: {{Nonparametric}} Classification Procedure Based on {{DD-Plot}}}, shorttitle = {{{DD-Classifier}}}, @@ -347,6 +382,14 @@ @article{marron++_2015_functional keywords = {alignment,dynamic time warping,elastic metric,Fisher–Rao metric,Functional data analysis,registration,warping} } +@misc{meteorologicalstateagencyofspainaemet_2009_meteorological, + title = {Meteorological Data of {{Spanish}} Weather Stations between Years 1980 and 2009.}, + author = {{Meteorological State Agency of Spain (AEMET)}}, + year = {2009}, + url = {http://www.aemet.es/}, + abstract = {The data were obtained from the FTP of AEMET in 2009.} +} + @article{pini++_2018_hotelling, title = {Hotelling's {{T2}} in Separable {{Hilbert}} Spaces}, author = {Pini, Alessia and Stamm, Aymeric and Vantini, Simone}, @@ -379,6 +422,20 @@ @incollection{pintado+romo_2005_depthbased isbn = {978-0-8218-3596-8} } +@inproceedings{ramos-carreno++_2022_scikitfda, + title = {{{scikit-fda}}: {{Computational}} Tools for Machine Learning with Functional Data}, + shorttitle = {Scikit-Fda}, + booktitle = {2022 {{IEEE}} 34th {{International Conference}} on {{Tools}} with {{Artificial Intelligence}} ({{ICTAI}})}, + author = {{Ramos-Carre{\~n}o}, Carlos and Torrecilla, Jos{\'e} Luis and Hong, Yujian and Su{\'a}rez, Alberto}, + year = {2022}, + month = oct, + pages = {213--218}, + issn = {2375-0197}, + doi = {10.1109/ICTAI56018.2022.00038}, + abstract = {Machine learning from functional data poses particular challenges that require specific computational tools that take into account their structure. In this work, we present scikit-fda, a Python library for functional data analysis, visualization, preprocessing, and machine learning. The library is designed for smooth integration in the Python scientific ecosystem. In particular, it complements and can be used in combination with scikit-learn, the reference Python library for machine learning. The functionality of scikit-fda is illustrated in clustering, regression, and classification problems from different areas of application.}, + keywords = {Data analysis,data visualization,Data visualization,Documentation,Ecosystems,Extrapolation,functional data analysis,Interpolation,machine learning,Machine learning,Python toolbox} +} + @inbook{ramsay+silverman_2005_functionala, title = {From Functional Data to Smooth Functions}, booktitle = {Functional Data Analysis}, @@ -436,6 +493,24 @@ @inbook{ramsay+silverman_2005_registration keywords = {Multivariate analysis} } +@incollection{romeo+marzoljaen_2014_analisis, + title = {{An\'alisis del viento y la niebla en el aeropuerto de Los Rodeos (Tenerife). Cambios y tendencias}}, + booktitle = {{Cambio clim\'atico y cambio global.}}, + author = {Romeo, V{\'i}ctor M. and Marzol Ja{\'e}n, M{\textordfeminine} Victoria}, + editor = {Fern{\'a}ndez Montes, Sonia and S{\'a}nchez Rodrigo, Fernando}, + year = {2014}, + series = {{Publicaciones de la Asociaci\'on Espa\~nola de Climatolog\'ia. Serie A.}}, + number = {9}, + pages = {325--334}, + publisher = {{Asociaci\'on Espa\~nola de Climatolog\'ia}}, + url = {https://repositorio.aemet.es/handle/20.500.11765/8173}, + urldate = {2022-07-28}, + abstract = {[ES]La particular localizaci\'on del aeropuerto Tenerife Norte-Los Rodeos, en el nordeste de la isla de Tenerife, es la causa de la modificaci\'on de la direcci\'on habitual de los vientos alisios del NE a vientos del NO, acompa\~nados frecuentemente por nubosidad. Esto ocasiona que en un n\'umero significativo de d\'ias al a\~no este aeropuerto est\'e cubierto por la niebla, lo que afecta de una forma muy importante a su operatividad. El objetivo del trabajo es caracterizar su r\'egimen de vientos y de la niebla durante los \'ultimos trece a\~nos (2000-2012) y comprobar si se han producido cambios significativos en ambas variables clim\'aticas con respecto a los periodos normales de 1961-1990 y 1971-2000. Los resultados indican que las dos direcciones dominantes del viento, NO y SE, mantienen sus frecuencias no as\'i la niebla que ha aumentado su presencia en los \'ultimos a\~nos.}, + copyright = {Licencia CC: Reconocimiento CC BY}, + isbn = {978-84-16027-69-9}, + langid = {spanish} +} + @article{ruiz-meana++_2003_cariporide, title = {Cariporide Preserves Mitochondrial Proton Gradient and Delays {{ATP}} Depletion in Cardiomyocytes during Ischemic Conditions}, author = {{Ruiz-Meana}, Marisol and {Garcia-Dorado}, David and Pina, Pilar and Inserte, Javier and Agull{\'o}, Luis and {Soler-Soler}, Jordi}, diff --git a/examples/full_examples/plot_aemet_unsupervised.py b/examples/full_examples/plot_aemet_unsupervised.py index ece1b0829..5d68b8fd0 100644 --- a/examples/full_examples/plot_aemet_unsupervised.py +++ b/examples/full_examples/plot_aemet_unsupervised.py @@ -30,31 +30,56 @@ from skfda.ml.clustering import KMeans from skfda.preprocessing.dim_reduction import FPCA -############################################################################## -# We will first load the AEMET dataset and plot it. +# %% +# In this example we explore the curves of daily temperatures at +# different weather stations from Spain, included in the AEMET dataset\ +# :footcite:p:`meteorologicalstateagencyofspainaemet_2009_meteorological`. +# +# We first show how the visualization tools of scikit-fda can be used to +# detect and interpret magnitude and shape outliers. +# We also explain how to employ a clustering method on these temperature +# curves using scikit-fda. +# Then, the location of each weather station is plotted into a map of Spain +# with a different color according to its cluster. +# This reveals a remarkable resemblance to a Spanish climate map. +# A posterior analysis using functional principal component analysis (FPCA) +# explains how the two first principal components are related with relevant +# meteorological concepts, and can be used to reconstruct and interpret +# the original clustering. +# +# This is one of the examples presented in the ICTAI conference\ +# :footcite:p:`ramos-carreno++_2022_scikitfda`. + +# %% +# We first load the AEMET dataset and plot it. X, _ = fetch_aemet(return_X_y=True) X = X.coordinates[0] X.plot() plt.show() -############################################################################## +# %% # A boxplot can show magnitude outliers, in this case Navacerrada. +# Here the temperatures are lower than in the other curves, as this +# weather station is at a high altitude, near a ski resort. Boxplot( X, depth_method=ModifiedBandDepth(), ).plot() plt.show() -############################################################################## +# %% # A magnitude-shape plot can be used to detect shape outliers, such as the # Canary islands, with a less steeper temperature curve. +# The Canary islands are at a lower latitude, closer to the equator. +# Thus, they have a subtropical climate which presents less temperature +# variation during the year. MagnitudeShapePlot( X, ).plot() plt.show() -############################################################################## +# %% # We now attempt to cluster the curves using functional k-means. n_clusters = 5 n_init = 10 @@ -67,7 +92,7 @@ ) fda_clusters = fda_kmeans.fit_predict(X) -############################################################################## +# %% # We want to plot the cluster of each station in the map of Spain. We need to # define first auxiliary variables and functions for plotting. coords_spain = (-10, 5, 34.98, 44.8) @@ -130,15 +155,36 @@ def plot_cluster_points( # Names of each climate (for this particular seed) climate_names = { - 0: "Cold-mountain", + 0: "Cold/mountain", 1: "Mediterranean", 2: "Atlantic", 3: "Subtropical", 4: "Continental", } -############################################################################## +# %% # We now plot the obtained clustering in the maps. +# +# It is possible to notice that each cluster seems to roughly correspond with +# a particular climate: +# +# - Red points, only present in the Canary islands, correspond to a subtropical +# climate. +# - Yellow points, in the Mediterranean coast, correspond to a Mediterranean +# climate. +# - Green points, in the Atlantic coast of mainland Spain, correspond to an +# Atlantic or oceanic climate. +# - Orange points, in the center of mainland Spain, correspond to a continental +# climate. +# - Finally the purple points are located at the coldest regions of Spain, such +# as in high mountain ranges. +# Thus, it can be associated with a cold or mountain climate. +# +# Notice that a couple of points in the Canary islands are not red. +# The purple one is easy to explain, as it is in mount Teide, the highest +# mountain of Spain. +# The yellow one is in the airport of Los Rodeos, which has its own +# microclimate\ :footcite:p:`romeo+marzoljaen_2014_analisis`. # Mainland fig_spain = create_map(coords_spain, figsize=(8, 6)) @@ -161,7 +207,7 @@ def plot_cluster_points( ) plt.show() -############################################################################## +# %% # We now can compute the first two principal components for interpretability, # and project the data over these directions. fpca = FPCA(n_components=2) @@ -173,12 +219,17 @@ def plot_cluster_points( X_red = fpca.transform(X) -############################################################################## +# %% # We now plot the first two principal components as perturbations over the # mean. # # The ``factor`` parameters is a number that multiplies each component in # order to make their effect more noticeable. +# +# It is possible to observe that the first component measures a global +# increase/decrease in temperature. +# The second component instead has the effect of increasing/decreasing +# the variability of the temperatures during the year. fig = plt.figure(figsize=(8, 4)) FPCAPlot( fpca.mean_, @@ -188,8 +239,15 @@ def plot_cluster_points( ).plot() plt.show() -############################################################################## -# We also plot the projections over the first two principal components. +# %% +# We also plot the projections over the first two principal components, +# keeping the cluster colors. +# Here we can easily observe the corresponding characteristics of each +# climate in terms of global temperature and annual variability. +# The two outliers of the Mediterranean climate correspond to the +# aforementioned airport of Los Rodeos, and to Tarifa, which is +# located at the strait of Gibraltar and thus receives also the cold +# waters of the Atlantic, explaining its lower annual variability. fig, ax = plt.subplots(1, 1) for cluster in range(n_clusters): selection = fda_clusters == cluster @@ -205,7 +263,7 @@ def plot_cluster_points( ax.legend() plt.show() -############################################################################## +# %% # We now attempt a multivariate clustering using only these projections. mv_kmeans = sklearn.cluster.KMeans( n_clusters=n_clusters, @@ -214,9 +272,12 @@ def plot_cluster_points( ) mv_clusters = mv_kmeans.fit_predict(X_red) -############################################################################## +# %% # We now plot the multivariate clustering in the maps. We define a different # color map to match cluster colors with the previously obtained ones. +# As you can see, the clustering using only the two first principal components +# matches almost perfectly with the original one, that used the complete +# temperature curves. mv_color_map = { 0: "yellow", @@ -246,3 +307,9 @@ def plot_cluster_points( ax=fig_canary.axes[0], ) plt.show() + +# %% +# References +# ---------- +# +# .. footbibliography:: diff --git a/examples/full_examples/plot_phonemes_classification.py b/examples/full_examples/plot_phonemes_classification.py index 75d0fd8e7..4b9eafd1e 100644 --- a/examples/full_examples/plot_phonemes_classification.py +++ b/examples/full_examples/plot_phonemes_classification.py @@ -24,9 +24,22 @@ from skfda.preprocessing.registration import FisherRaoElasticRegistration from skfda.preprocessing.smoothing import KernelSmoother -############################################################################## -# We will first load the (binary) Phoneme dataset, restricted to the first -# 150 variables, and plot the first 20 functions. +# %% +# This example uses the Phoneme dataset\ :footcite:`hastie++_1995_penalized` +# containing the frequency curves of some common phonemes as pronounced by +# different people. +# We illustrate with this data the preprocessing and +# classification techniques available in scikit-fda. +# +# This is one of the examples presented in the ICTAI conference\ +# :footcite:p:`ramos-carreno++_2022_scikitfda`. + +# %% +# We will first load the (binary) Phoneme dataset and plot the first 20 +# functions. +# We restrict the data to the first 150 variables, as done in +# :footcite:t:`ferraty+vieu_2006_computational`, because most of the +# useful information is in the lower frequencies. X, y = fetch_phoneme(return_X_y=True) X = X[(y == 0) | (y == 1)] @@ -47,8 +60,11 @@ X[:n_plot].plot(group=y) plt.show() -############################################################################## -# We now smooth and plot the data again, as well as the class means. +# %% +# As we just saw, the curves are very noisy. +# We can leverage the continuity of the trajectories by smoothing, using +# a Nadaraya-Watson estimator. +# We then plot the data again, as well as the class means. smoother = KernelSmoother( NadarayaWatsonHatMatrix( bandwidth=0.1, @@ -66,8 +82,23 @@ X_smooth_ao.mean().plot(fig=fig, color="red", linewidth=3) plt.show() -############################################################################## -# We now register the data per class. As Fisher-Rao elastic registration is +# %% +# The smoothed curves are easier to interpret. +# Now, it is possible to appreciate the characteristic landmarks of each class, +# such as maxima or minima. +# However, not all these landmarks appear at the same frequency for each +# trajectory. +# One way to solve it is by registering (aligning) the data. +# We use Fisher-Rao elastic registration, a state-of-the-art registration +# method to illustrate the effect of registration. +# Although this registration method achieves very good results, it +# attempts to align all the curves to a common template. +# Thus, in order to clearly view the specific landmarks of each class we +# have to register the data per class. +# This also means that if the we cannot use this method for a classification +# task if the landmarks of each class are very different, as it is not able +# to do per-class registration with unlabeled data. +# As Fisher-Rao elastic registration is # very slow, we only register the plotted curves as an approximation. reg = FisherRaoElasticRegistration( penalty=0.01, @@ -83,7 +114,7 @@ X_reg_ao.mean().plot(fig=fig, color="red", linewidth=3) plt.show() -############################################################################## +# %% # We now split the smoothed data in train and test datasets. # Note that there is no data leakage because no parameters are fitted in # the smoothing step, but normally you would want to do all preprocessing in @@ -97,7 +128,7 @@ stratify=y, ) -############################################################################## +# %% # We use a k-nn classifier with a functional analog to the Mahalanobis # distance and a fixed number of neighbors. n_neighbors = int(np.sqrt(X_smooth.n_samples)) @@ -115,7 +146,7 @@ score = accuracy_score(y_test, y_pred) print(score) -############################################################################## +# %% # If we wanted to optimize hyperparameters, we can use scikit-learn tools. pipeline = Pipeline([ ("smoother", smoother), @@ -138,3 +169,9 @@ # y_pred = grid_search.predict(X_test) # score = accuracy_score(y_test, y_pred) # print(score) + +# %% +# References +# ---------- +# +# .. footbibliography:: diff --git a/examples/full_examples/plot_tecator_regression.py b/examples/full_examples/plot_tecator_regression.py index 96333ce4a..3c19dfa3a 100644 --- a/examples/full_examples/plot_tecator_regression.py +++ b/examples/full_examples/plot_tecator_regression.py @@ -25,7 +25,19 @@ ) from skfda.representation.basis import BSplineBasis -############################################################################## +# %% +# This example uses the Tecator dataset\ +# :footcite:`borggaard+thodberg_1992_optimal` +# in order to illustrate the problems of functional regression +# and functional variable selection. +# This dataset contains the spectra of absorbances of several pieces of +# finely chopped meat, as well as the percent of its content in water, +# fat and protein. +# +# This is one of the examples presented in the ICTAI conference\ +# :footcite:p:`ramos-carreno++_2022_scikitfda`. + +# %% # We will first load the Tecator data, keeping only the fat content target, # and plot it. X, y = fetch_tecator(return_X_y=True) @@ -34,13 +46,17 @@ X.plot(gradient_criteria=y) plt.show() -############################################################################## -# We compute numerically the second derivative and plot it. +# %% +# For spectrometric data, the relevant information of the curves can often +# be found in the derivatives\ :footcite:`ferraty+vieu_2006_computational`. +# Thus, we compute numerically the second derivative and plot it. X_der = X.derivative(order=2) X_der.plot(gradient_criteria=y) plt.show() -############################################################################## +# %% +# We first apply a simple linear regression model to compute a baseline +# for our regression predictions. # In order to compute functional linear regression we first convert the data # to a basis expansion. basis = BSplineBasis( @@ -48,7 +64,7 @@ ) X_der_basis = X_der.to_basis(basis) -############################################################################## +# %% # We split the data in train and test, and compute the regression score using # the linear regression model. X_train, X_test, y_train, y_test = train_test_split( @@ -63,8 +79,15 @@ score = r2_score(y_test, y_pred) print(score) -############################################################################## -# Another approach is to use variable selection using maxima hunting. +# %% +# We now will take a different approach. +# It is possible to note from the plot of the derivatives that most information +# necessary for regression can be found at some particular "impact" points. +# Thus, we now apply a functional variable selection method to detect those +# points and use them with a multivariate classifier. +# The variable selection method that we employ here is maxima hunting\ +# :footcite:`berrendero++_2016_variable`, a filter method that computes a +# relevance score for each point of the curve and selects all the local maxima. var_sel = MaximaHunting( local_maxima_selector=RelativeLocalMaximaSelector(max_points=2), ) @@ -72,21 +95,21 @@ print(var_sel.indexes_) -############################################################################## +# %% # We can visualize the relevance function and the selected points. var_sel.dependence_.plot() for p in var_sel.indexes_: plt.axvline(X_der.grid_points[0][p], color="black") plt.show() -############################################################################## +# %% # We also can visualize the selected points on the curves. X_der.plot(gradient_criteria=y) for p in var_sel.indexes_: plt.axvline(X_der.grid_points[0][p], color="black") plt.show() -############################################################################## +# %% # We split the data again (using the same seed), but this time without the # basis expansion. X_train, X_test, y_train, y_test = train_test_split( @@ -95,7 +118,7 @@ random_state=0, ) -############################################################################## +# %% # We now make a pipeline with the variable selection and a multivariate linear # regression method for comparison. pipeline = Pipeline([ @@ -107,7 +130,7 @@ score = r2_score(y_test, y_predicted) print(score) -############################################################################## +# %% # We can use a tree regressor instead to improve both the score and the # interpretability. pipeline = Pipeline([ @@ -119,8 +142,14 @@ score = r2_score(y_test, y_predicted) print(score) -############################################################################## +# %% # We can plot the final version of the tree to explain every prediction. fig, ax = plt.subplots(figsize=(10, 10)) plot_tree(pipeline.named_steps["classifier"], precision=6, filled=True, ax=ax) plt.show() + +# %% +# References +# ---------- +# +# .. footbibliography:: diff --git a/skfda/datasets/_real_datasets.py b/skfda/datasets/_real_datasets.py index d345ebdcf..833e398fe 100644 --- a/skfda/datasets/_real_datasets.py +++ b/skfda/datasets/_real_datasets.py @@ -5,11 +5,12 @@ import numpy as np import pandas as pd -import rdata from pandas import DataFrame, Series from sklearn.utils import Bunch from typing_extensions import Literal +import rdata + from ..representation import FDataGrid from ..typing._numpy import NDArrayFloat, NDArrayInt @@ -292,6 +293,7 @@ def _fetch_fda_usc(name: str) -> Any: Discriminant Analysis. Ann. Statist. 23 (1995), no. 1, 73--102. doi:10.1214/aos/1176324456. https://projecteuclid.org/euclid.aos/1176324456 + """ @@ -886,14 +888,15 @@ def fetch_weather( Series of daily summaries of 73 spanish weather stations selected for the period 1980-2009. The dataset contains the geographic information of each station and the average for the period 1980-2009 of daily temperature, - daily precipitation and daily wind speed. Meteorological State Agency of - Spain (AEMET), http://www.aemet.es/. Government of Spain. - - Authors: - Manuel Febrero Bande, Manuel Oviedo de la Fuente + daily precipitation and daily wind speed. Source: + Meteorological State Agency of Spain (AEMET). (2009). + Meteorological data of Spanish weather stations between years 1980 + and 2009. [dataset]. http://www.aemet.es/ + The data were obtained from the FTP of AEMET in 2009. + """