Merge pull request #1065 from alan-turing-institute/dev

For a 0.20.1 release
JuliaAI · Oct 10, 2023 · 6e45c5d · 6e45c5d
2 parents 97a51d3 + 6e9f223
commit 6e45c5d
Show file tree

Hide file tree

Showing 12 changed files with 105 additions and 43 deletions.
diff --git a/Project.toml b/Project.toml
@@ -1,14 +1,15 @@
 name = "MLJ"
 uuid = "add582a8-e3ab-11e8-2d5e-e98b27df1bc7"
 authors = ["Anthony D. Blaom <[email protected]>"]
-version = "0.20.0"
+version = "0.20.1"
 
 [deps]
 CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
 ComputationalResources = "ed09eef8-17a6-5b46-8889-db040fac31e3"
 Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+MLJBalancing = "45f359ea-796d-4f51-95a5-deb1a414c586"
 MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
 MLJEnsembles = "50ed68f4-41fd-4504-931a-ed422449fee0"
 MLJFlow = "7b7b8358-b45c-48ea-a8ef-7ca328ad328f"
@@ -31,6 +32,7 @@ CategoricalArrays = "0.8,0.9, 0.10"
 ComputationalResources = "0.3"
 Distributions = "0.21,0.22,0.23, 0.24, 0.25"
 MLJBase = "1"
+MLJBalancing = "0.1"
 MLJEnsembles = "0.4"
 MLJFlow = "0.2"
 MLJIteration = "0.6"
@@ -40,8 +42,8 @@ OpenML = "0.2,0.3"
 ProgressMeter = "1.1"
 Reexport = "1.2"
 ScientificTypes = "3"
-StatsBase = "0.32,0.33, 0.34"
 StatisticalMeasures = "0.1"
+StatsBase = "0.32,0.33, 0.34"
 Tables = "0.2,1.0"
 julia = "1.6"
 

diff --git a/docs/ModelDescriptors.toml b/docs/ModelDescriptors.toml
@@ -10,17 +10,20 @@ AgglomerativeClustering_MLJScikitLearnInterface = ["clustering", "static_models"
 BM25Transformer_MLJText = ["encoders", "text_analysis"]
 BaggingClassifier_MLJScikitLearnInterface = ["classification", "ensemble_models"]
 BaggingRegressor_MLJScikitLearnInterface = ["regression", "ensemble_models"]
+BalancedBaggingClassifier_MLJBalancing = ["class_imbalance", "classification"]
 BayesianLDA_MultivariateStats = ["dimension_reduction", "classification", "Bayesian_models"]
 BayesianLDA_MLJScikitLearnInterface = ["dimension_reduction", "classification", "Bayesian_models"]
 BayesianQDA_MLJScikitLearnInterface = ["dimension_reduction", "classification", "Bayesian_models"]
 BayesianRidgeRegressor_MLJScikitLearnInterface = ["regression", "Bayesian_models"]
 BayesianSubspaceLDA_MultivariateStats = ["dimension_reduction", "classification", "Bayesian_models"]
 BernoulliNBClassifier_MLJScikitLearnInterface = ["classification", "Bayesian_models"]
 Birch_MLJScikitLearnInterface = ["clustering", "dimension_reduction", ]
+BorderlineSMOTE1_Imbalance = ["class_imbalance"]
 CatBoostClassifier_CatBoost = ["classification", "ensemble_models", "iterative_models"]
 CatBoostRegressor_CatBoost = ["regression", "ensemble_models", "iterative_models"]
 CBLOFDetector_OutlierDetectionPython = ["outlier_detection"]
 CDDetector_OutlierDetectionPython = ["outlier_detection"]
+ClusterUndersampler_Imbalance = ["class_imbalance"]
 COFDetector_OutlierDetectionNeighbors = ["outlier_detection"]
 COFDetector_OutlierDetectionPython = ["outlier_detection"]
 COPODDetector_OutlierDetectionPython = ["outlier_detection"]
@@ -46,6 +49,7 @@ ESADDetector_OutlierDetectionNetworks = ["outlier_detection"]
 ElasticNetCVRegressor_MLJScikitLearnInterface = ["regression"]
 ElasticNetRegressor_MLJLinearModels = ["regression"]
 ElasticNetRegressor_MLJScikitLearnInterface = ["regression"]
+ENNUndersampler_Imbalance = ["class_imbalance"]
 EpsilonSVR_LIBSVM = ["regression"]
 EvoLinearRegressor_EvoLinear = ["regression"]
 EvoTreeClassifier_EvoTrees = ["classification", "ensemble_models", "iterative_models"]
@@ -167,8 +171,12 @@ ProbabilisticNuSVC_LIBSVM = ["classification"]
 ProbabilisticSGDClassifier_MLJScikitLearnInterface = ["classification"]
 ProbabilisticSVC_LIBSVM = ["classification"]
 QuantileRegressor_MLJLinearModels = ["regression"]
+RandomOversampler_Imbalance = ["class_imbalance"]
+RandomUndersampler_Imbalance = ["class_imbalance"]
+RandomWalkOversampler_Imbalance = ["class_imbalance"]
 RANSACRegressor_MLJScikitLearnInterface = ["regression"]
 RODDetector_OutlierDetectionPython = ["outlier_detection"]
+ROSE_Imbalance = ["class_imbalance"]
 RandomForestClassifier_BetaML = ["classification", "ensemble_models", "iterative_models"]
 RandomForestClassifier_DecisionTree = ["classification", "ensemble_models", "iterative_models"]
 RandomForestClassifier_MLJScikitLearnInterface = ["classification", "ensemble_models", "iterative_models"]
@@ -186,6 +194,9 @@ RobustRegressor_MLJLinearModels = ["regression"]
 SelfOrganizingMap_SelfOrganizingMaps = ["dimension_reduction", "clustering"]
 SGDClassifier_MLJScikitLearnInterface = ["classification"]
 SGDRegressor_MLJScikitLearnInterface = ["regression"]
+SMOTE_Imbalance = ["class_imbalance"]
+SMOTEN_Imbalance = ["class_imbalance"]
+SMOTENC_Imbalance = ["class_imbalance"]
 SODDetector_OutlierDetectionPython = ["outlier_detection", "outlier_detection"]
 SOSDetector_OutlierDetectionPython = ["outlier_detection"]
 SRRegressor_SymbolicRegression = ["regression"]
@@ -204,6 +215,7 @@ SimpleImputer_BetaML = ["missing_value_imputation"]
 SpectralClustering_MLJScikitLearnInterface = ["clustering", "static_models"]
 Standardizer_MLJModels = ["encoders"]
 SubspaceLDA_MultivariateStats = ["classification", "dimension_reduction"]
+TomekUndersampler_Imbalance = ["class_imbalance"]
 TSVDTransformer_TSVD = ["dimension_reduction"]
 TfidfTransformer_MLJText = ["encoders", "text_analysis"]
 TheilSenRegressor_MLJScikitLearnInterface = ["regression"]

diff --git a/docs/Project.toml b/docs/Project.toml
@@ -4,26 +4,18 @@ CategoricalDistributions = "af321ab8-2d2e-40a6-b165-3d674595d28e"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
-EarlyStopping = "792122b4-ca99-40de-a6bc-6742525f08b6"
 EvoTrees = "f6006082-12f8-11e9-0c9c-0d5d367ab1e5"
 InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 IterationControl = "b3c1a2ee-3fec-4384-bf48-272ea71de57c"
 MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
 MLJClusteringInterface = "d354fa79-ed1c-40d4-88ef-b8c7bd1568af"
 MLJDecisionTreeInterface = "c6f25543-311c-4c74-83dc-3ea6d1015661"
-MLJEnsembles = "50ed68f4-41fd-4504-931a-ed422449fee0"
-MLJFlow = "7b7b8358-b45c-48ea-a8ef-7ca328ad328f"
 MLJGLMInterface = "caf8df21-4939-456d-ac9c-5fefbfb04c0c"
-MLJIteration = "614be32b-d00c-4edb-bd02-1eb411ab5e55"
 MLJLinearModels = "6ee0df7b-362f-4a72-a706-9e79364fb692"
-MLJModelInterface = "e80e1ace-859a-464e-9ed9-23947d8ae3ea"
-MLJModels = "d491faf4-2d78-11e9-2867-c94bc002c0b7"
 MLJMultivariateStatsInterface = "1b6a4a23-ba22-4f51-9698-8599985d3728"
-MLJTuning = "03970b2e-30c4-11ea-3135-d1576263f10f"
 Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28"
 NearestNeighborModels = "636a865e-7cf4-491e-846c-de09b730eb36"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
-ScientificTypes = "321657f4-b219-11e9-178b-2701a2544e81"
 ScientificTypesBase = "30f210dd-8aff-4c5f-94ba-8e64358c1161"
 StatisticalMeasures = "a19d573c-0a75-4610-95b3-7071388c7541"
 StatisticalMeasuresBase = "c062fc1d-0d66-479b-b6ac-8b44719de4cc"

diff --git a/docs/make.jl b/docs/make.jl
@@ -5,16 +5,16 @@ end
 using Pkg
 using Documenter
 using MLJ
-import MLJIteration
-import IterationControl
-import EarlyStopping
-import MLJBase
-import MLJTuning
-import MLJModels
-import MLJEnsembles
-import ScientificTypes
-import MLJModelInterface
-import ScientificTypes
+using MLJBase
+import MLJ.MLJBase.MLJModelInterface
+import MLJ.MLJIteration
+import MLJ.MLJIteration.IterationControl
+import MLJ.MLJIteration.IterationControl.EarlyStopping
+import MLJ.MLJTuning
+import MLJ.MLJModels
+import MLJ.MLJEnsembles
+import MLJ.ScientificTypes
+import MLJ.MLJBalancing
 import ScientificTypesBase
 import Distributions
 using CategoricalArrays
@@ -72,6 +72,7 @@ pages = [
     "Linear Pipelines" => "linear_pipelines.md",
     "Target Transformations" => "target_transformations.md",
     "Homogeneous Ensembles" => "homogeneous_ensembles.md",
+    "Correcting Class Imbalance" => "correcting_class_imbalance.md",
     "Model Stacking" => "model_stacking.md",
     "Learning Networks" => "learning_networks.md",
     "Controlling Iterative Models" => "controlling_iterative_models.md",
@@ -101,20 +102,23 @@ makedocs(
     doctest  = true,
     sitename = "MLJ",
     format   = Documenter.HTML(),
-    modules  = [MLJ,
-                MLJBase,
-                MLJTuning,
-                MLJModels,
-                MLJEnsembles,
-                ScientificTypes,
-                MLJModelInterface,
-                ScientificTypesBase,
-                StatisticalMeasures,
-                MLJIteration,
-                EarlyStopping,
-                IterationControl,
-                CategoricalDistributions,
-                StatisticalMeasures],
+    modules  = [
+        MLJ,
+        MLJBase,
+        MLJTuning,
+        MLJModels,
+        MLJEnsembles,
+        MLJBalancing,
+        MLJIteration,
+        ScientificTypes,
+        MLJModelInterface,
+        ScientificTypesBase,
+        StatisticalMeasures,
+        EarlyStopping,
+        IterationControl,
+        CategoricalDistributions,
+        StatisticalMeasures,
+    ],
     pages    = pages,
     warnonly = [:cross_references, :missing_docs],
 )

diff --git a/docs/model_docstring_tools.jl b/docs/model_docstring_tools.jl
@@ -62,7 +62,7 @@ const HANDLES = keys(DESCRIPTORS_GIVEN_HANDLE)
 """
     models_missing_descriptors()
 
-Return a list of handles for those models in the registry not have the corresponding
+Return a list of handles for those models in the registry not having the corresponding
 handle as key in /docs/ModelDescriptors.toml.
 
 """

diff --git a/docs/src/adding_models_for_general_use.md b/docs/src/adding_models_for_general_use.md
@@ -155,8 +155,15 @@ function RidgeRegressor(; lambda=0.0)
 end
 ```
 
-*Important.* The clean method must have the property that
-`clean!(clean!(model)) == clean!(model)` for any instance `model`.
+*Important.* Performing `clean!(model)` a second time should not mutate `model`. That is,
+this test should hold:
+
+```julia
+clean!(model)
+clone = deepcopy(model)
+clean!(model)
+@test model == clone
+```
 
 Although not essential, try to avoid `Union` types for model
 fields. For example, a field declaration `features::Vector{Symbol}`

diff --git a/docs/src/correcting_class_imbalance.md b/docs/src/correcting_class_imbalance.md
@@ -0,0 +1,25 @@
+# Correcting Class Imbalance
+
+## Oversampling and undersampling methods
+
+Models providing oversampling or undersampling methods, to correct for class imbalance,
+are listed under [Class Imbalance](@ref). In particular, several popular algorithms are
+provided by the [Imbalance.jl](https://github.com/JuliaAI/Imbalance.jl) package, which
+includes detailed documentation and tutorials.
+
+## Incorporating class imbalance in supervised learning pipelines
+
+One or more oversampling/undersampling algorithms can be fused with an MLJ classifier
+using the [`BalancedModel`](@ref) wrapper. This creates a new classifier which can be
+treated like any other; resampling to correct for class imbalance, relevant only for
+*training* of the atomic classifier, is then carried out internally. If, for example, one
+applies cross-validation to the wrapped classifier (using [`evaluate!`](@ref), say) then
+over/undersampling is automatically repeated for each training fold.
+
+Refer to the
+[MLJBalancing.jl](https://juliaai.github.io/Imbalance.jl/dev/algorithms/mlj_balancing/)
+documentation for further details.
+
+```@docs
+MLJBalancing.BalancedModel
+```
diff --git a/docs/src/index.md b/docs/src/index.md
@@ -49,7 +49,8 @@ To support MLJ development, please cite these works or star the repo:
 [Working with Categorical Data](@ref) | 
 [Preparing Data](@ref) |
 [Generating Synthetic Data](@ref) |
-[OpenML Integration](@ref)
+[OpenML Integration](@ref) |
+[Correcting Class Imbalance](@ref)
 
 ### Models
 [Model Search](@ref model_search) |
@@ -65,15 +66,18 @@ To support MLJ development, please cite these works or star the repo:
 [Evaluating Model Performance](@ref) |
 [Tuning Models](@ref) |
 [Controlling Iterative Models](@ref) |
-[Learning Curves](@ref)
+[Learning Curves](@ref) |
+[Correcting Class Imbalance](@ref)
 
 ### Composition
 [Composing Models](@ref) |
 [Linear Pipelines](@ref) |
 [Target Transformations](@ref) |
 [Homogeneous Ensembles](@ref) |
 [Model Stacking](@ref) |
-[Learning Networks](@ref)
+[Learning Networks](@ref) |
+[Correcting Class Imbalance](@ref)
+
 
 ### Integration 
 [Logging Workflows](@ref) |

diff --git a/docs/src/list_of_supported_models.md b/docs/src/list_of_supported_models.md
@@ -1,5 +1,8 @@
 # [List of Supported Models](@id model_list)
 
+For a list of models organized around function ("classification", "regression", etc.), see
+the [Model Browser](@ref).
+
 MLJ provides access to a wide variety of machine learning models.
 We are always looking for
 [help](https://github.com/alan-turing-institute/MLJ.jl/blob/master/CONTRIBUTING.md)
@@ -34,9 +37,11 @@ independent assessment.
 [EvoTrees.jl](https://github.com/Evovest/EvoTrees.jl) | - | EvoTreeRegressor, EvoTreeClassifier, EvoTreeCount, EvoTreeGaussian, EvoTreeMLE | medium | tree-based gradient boosting models
 [EvoLinear.jl](https://github.com/jeremiedb/EvoLinear.jl) | - | EvoLinearRegressor | medium | linear boosting models
 [GLM.jl](https://github.com/JuliaStats/GLM.jl) | [MLJGLMInterface.jl](https://github.com/JuliaAI/MLJGLMInterface.jl) | LinearRegressor, LinearBinaryClassifier, LinearCountRegressor | medium² |
+[Imbalance.jl](https://github.com/JuliaAI/Imbalance.jl) | - | RandomOversampler, RandomWalkOversampler, ROSE, SMOTE, BorderlineSMOTE1, SMOTEN, SMOTENC, RandomUndersampler, ClusterUndersampler, ENNUndersampler, TomekUndersampler | low | 
 [LIBSVM.jl](https://github.com/mpastell/LIBSVM.jl) | [MLJLIBSVMInterface.jl](https://github.com/JuliaAI/MLJLIBSVMInterface.jl) | LinearSVC, SVC, NuSVC, NuSVR, EpsilonSVR, OneClassSVM | high | also via ScikitLearn.jl
 [LightGBM.jl](https://github.com/IQVIA-ML/LightGBM.jl) | - | LGBMClassifier, LGBMRegressor | high | 
 [Flux.jl](https://github.com/FluxML/Flux.jl) | [MLJFlux.jl](https://github.com/FluxML/MLJFlux.jl) | NeuralNetworkRegressor, NeuralNetworkClassifier, MultitargetNeuralNetworkRegressor, ImageClassifier | low |
+[MLJBalancing.jl](https://github.com/JuliaAI/MLJBalancing.jl) | - | BalancedBaggingClassifier | low | 
 [MLJLinearModels.jl](https://github.com/JuliaAI/MLJLinearModels.jl) | - | LinearRegressor, RidgeRegressor, LassoRegressor, ElasticNetRegressor, QuantileRegressor, HuberRegressor, RobustRegressor, LADRegressor, LogisticClassifier, MultinomialClassifier | medium |
 [MLJModels.jl](https://github.com/JuliaAI/MLJModels.jl) (built-in) | - | ConstantClassifier, ConstantRegressor, ContinuousEncoder, DeterministicConstantClassifier, DeterministicConstantRegressor, FeatureSelector, FillImputer, InteractionTransformer, OneHotEncoder, Standardizer, UnivariateBoxCoxTransformer, UnivariateDiscretizer, UnivariateFillImputer, UnivariateTimeTypeToContinuous, Standardizer, BinaryThresholdPredictor | medium |
 [MLJText.jl](https://github.com/JuliaAI/MLJText.jl) | - | TfidfTransformer, BM25Transformer, CountTransformer | low |

diff --git a/docs/src/performance_measures.md b/docs/src/performance_measures.md
@@ -49,9 +49,8 @@ multi-target measure using this package.
 
 In MLJ, measures are specified:
 
-- when evaluating model performance using
-[`evaluate!`](@ref)/[`evaluate`](@ref) - see [Evaluating Model Performance](@ref)
-
+- when evaluating model performance using [`evaluate!`](@ref)/[`evaluate`](@ref); see
+  [Evaluating Model Performance](@ref)
 - when wrapping models using [`TunedModel`](@ref) - see [Tuning Models](@ref)
 - when wrapping iterative models using [`IteratedModel`](@ref) - see [Controlling Iterative Models](@ref)
 - when generating learning curves using [`learning_curve`](@ref) - see [Learning Curves](@ref)

diff --git a/src/MLJ.jl b/src/MLJ.jl
@@ -50,6 +50,8 @@ using MLJModels
 using OpenML
 @reexport using MLJFlow
 @reexport using StatisticalMeasures
+import MLJBalancing
+@reexport using MLJBalancing: BalancedModel
 using MLJIteration
 import MLJIteration.IterationControl
 

diff --git a/test/exported_names.jl b/test/exported_names.jl
@@ -12,6 +12,16 @@ IterationControl.with_state_do(Step(2))
 IteratedModel
 MLJIteration
 
+# MLJBalancing
+
+bmodel = @test_logs(
+    (:warn, r"^No balancer"),
+    BalancedModel(model=ConstantClassifier()),
+)
+
+@test bmodel isa Probabilistic
+
+
 # MLJSerialization
 
 Save()