Merge pull request #1060 from alan-turing-institute/dev
For a 0.20 release
ablaom authored Sep 29, 2023
2 parents 1313d4c + a421a6f commit 97a51d3
Showing 26 changed files with 485 additions and 664 deletions.
3 changes: 3 additions & 0 deletions ORGANIZATION.md
@@ -40,6 +40,9 @@ its conventional use, are marked with a ⟂ symbol:
readme](https://github.com/JuliaAI/MLJBase.jl) for a
detailed description of MLJBase's contents.

* [StatisticalMeasures.jl](https://github.com/JuliaAI/StatisticalMeasures.jl) provides
performance measures (metrics) such as losses and scores.

* [MLJModels.jl](https://github.com/JuliaAI/MLJModels.jl)
hosts the *MLJ model registry*, which contains metadata on all the
models the MLJ user can search and load from MLJ. Moreover, it
14 changes: 8 additions & 6 deletions Project.toml
@@ -1,7 +1,7 @@
name = "MLJ"
uuid = "add582a8-e3ab-11e8-2d5e-e98b27df1bc7"
authors = ["Anthony D. Blaom <[email protected]>"]
version = "0.19.5"
version = "0.20.0"

[deps]
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
@@ -21,6 +21,7 @@ ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
ScientificTypes = "321657f4-b219-11e9-178b-2701a2544e81"
StatisticalMeasures = "a19d573c-0a75-4610-95b3-7071388c7541"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
@@ -29,17 +30,18 @@ Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
CategoricalArrays = "0.8,0.9, 0.10"
ComputationalResources = "0.3"
Distributions = "0.21,0.22,0.23, 0.24, 0.25"
MLJBase = "0.21.14"
MLJEnsembles = "0.3"
MLJFlow = "0.1"
MLJIteration = "0.5"
MLJBase = "1"
MLJEnsembles = "0.4"
MLJFlow = "0.2"
MLJIteration = "0.6"
MLJModels = "0.16"
MLJTuning = "0.7"
MLJTuning = "0.8"
OpenML = "0.2,0.3"
ProgressMeter = "1.1"
Reexport = "1.2"
ScientificTypes = "3"
StatsBase = "0.32,0.33, 0.34"
StatisticalMeasures = "0.1"
Tables = "0.2,1.0"
julia = "1.6"

13 changes: 4 additions & 9 deletions docs/Project.toml
@@ -8,11 +8,11 @@ EarlyStopping = "792122b4-ca99-40de-a6bc-6742525f08b6"
EvoTrees = "f6006082-12f8-11e9-0c9c-0d5d367ab1e5"
InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
IterationControl = "b3c1a2ee-3fec-4384-bf48-272ea71de57c"
LossFunctions = "30fc2ffe-d236-52d8-8643-a9d8f7c094a7"
MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
MLJClusteringInterface = "d354fa79-ed1c-40d4-88ef-b8c7bd1568af"
MLJDecisionTreeInterface = "c6f25543-311c-4c74-83dc-3ea6d1015661"
MLJEnsembles = "50ed68f4-41fd-4504-931a-ed422449fee0"
MLJFlow = "7b7b8358-b45c-48ea-a8ef-7ca328ad328f"
MLJGLMInterface = "caf8df21-4939-456d-ac9c-5fefbfb04c0c"
MLJIteration = "614be32b-d00c-4edb-bd02-1eb411ab5e55"
MLJLinearModels = "6ee0df7b-362f-4a72-a706-9e79364fb692"
@@ -25,16 +25,11 @@ NearestNeighborModels = "636a865e-7cf4-491e-846c-de09b730eb36"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
ScientificTypes = "321657f4-b219-11e9-178b-2701a2544e81"
ScientificTypesBase = "30f210dd-8aff-4c5f-94ba-8e64358c1161"
StatisticalMeasures = "a19d573c-0a75-4610-95b3-7071388c7541"
StatisticalMeasuresBase = "c062fc1d-0d66-479b-b6ac-8b44719de4cc"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
TypedTables = "9d95f2ec-7b3d-5a63-8d20-e2491e220bb9"

[compat]
CategoricalDistributions = "0.1"
Documenter = "0.27"
MLJEnsembles = "0.3"
MLJIteration = "0.5"
MLJModels = "0.16"
MLJTuning = "0.7"
ScientificTypes = "3"
ScientificTypesBase = "3"
Documenter = "1"
julia = "1.6"
12 changes: 7 additions & 5 deletions docs/make.jl
@@ -14,11 +14,13 @@ import MLJModels
import MLJEnsembles
import ScientificTypes
import MLJModelInterface
import ScientificTypes
import ScientificTypesBase
import Distributions
using CategoricalArrays
using LossFunctions
import CategoricalDistributions
import StatisticalMeasures
import StatisticalMeasuresBase

const MMI = MLJModelInterface

@@ -87,9 +89,7 @@ pages = [
"Third Party Packages" => "third_party_packages.md",
"Glossary" => "glossary.md",
"MLJ Cheatsheet" => "mlj_cheatsheet.md",
"Known Issues" => "known_issues.md",
"FAQ" => "frequently_asked_questions.md",
"Julia BlogPost" => "julia_blogpost.md",
"Index of Methods" => "api.md",
]

@@ -109,12 +109,14 @@ makedocs(
ScientificTypes,
MLJModelInterface,
ScientificTypesBase,
StatisticalMeasures,
MLJIteration,
EarlyStopping,
IterationControl,
CategoricalDistributions],
CategoricalDistributions,
StatisticalMeasures],
pages = pages,
strict = Documenter.except(:cross_references, :missing_docs),
warnonly = [:cross_references, :missing_docs],
)

@info "`makedocs` has finished running. "
20 changes: 18 additions & 2 deletions docs/model_docstring_tools.jl
@@ -2,6 +2,22 @@

const PATH_TO_MODEL_DOCS = joinpath(@__DIR__, "src", "models")

"""
    remove_doc_refs(str::AbstractString)

Remove `@ref` references from `str`. For example, a substring of the form
"[`some.thing_like_this123!`](@ref)" is replaced with "`some.thing_like_this123!`".
"""
function remove_doc_refs(page)
regex = r"\[([\?'\.\d`\!\_a-zA-Z]*)\]\(\@ref\)"
while contains(page, regex)
# replace the first matched regex with the captured string
page = replace(page, regex => s"\1")
end
page
end
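
# Illustration only (hypothetical input; not used anywhere in the docs build):
#
#     remove_doc_refs("see [`fit!`](@ref) and [`predict`](@ref)")
#
# returns "see `fit!` and `predict`".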

demote_headings(str) = replace(str, "# "=>"## ")
handle(model) = model.name*"_"*model.package_name

@@ -25,7 +41,7 @@ function write_page(model; path=PATH_TO_MODEL_DOCS)
open(pagepath, "w") do stream
header = "# [$(model.name)](@id $id)\n\n"
md_page = doc(model.name, pkg=model.package_name)
page = header*demote_headings(string(md_page))
page = header*demote_headings(string(md_page)) |> remove_doc_refs
write(stream, page)
nothing
end
@@ -54,7 +70,7 @@ function models_missing_descriptors()
handles = handle.(models())
filter(handles) do h
!(h in HANDLES)
end
end
end

"""
2 changes: 0 additions & 2 deletions docs/src/about_mlj.md
@@ -221,8 +221,6 @@ Bugs, suggestions, and feature requests can be posted
Users are also welcome to join the `#mlj` Julia slack channel to ask
questions and make suggestions.

See also, [Known Issues](@ref)


## Installation

12 changes: 6 additions & 6 deletions docs/src/common_mlj_workflows.md
@@ -176,10 +176,10 @@ KNN = @load KNNRegressor
knn = KNN()
evaluate(knn, X, y,
resampling=CV(nfolds=5),
measure=[RootMeanSquaredError(), MeanAbsoluteError()])
measure=[RootMeanSquaredError(), LPLoss(1)])
```

Note `RootMeanSquaredError()` has alias `rms` and `MeanAbsoluteError()` has alias `mae`.
Note that `RootMeanSquaredError()` has the alias `rms`, and that `LPLoss(1)` has the aliases `l1` and `mae`.

Do `measures()` to list all losses and scores and their aliases.

@@ -220,7 +220,7 @@ Fit on the train data set and evaluate on the test data set:
```@example workflows
fit!(mach, rows=train)
yhat = predict(mach, X[test,:])
mean(LogLoss(tol=1e-4)(yhat, y[test]))
LogLoss(tol=1e-4)(yhat, y[test])
```

Note that `LogLoss()` has the aliases `log_loss` and `cross_entropy`.
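
The same metric can accordingly be computed using an alias. A sketch (note that
`cross_entropy` names the *default* instance, so the non-default `tol` used above does
not apply):

```julia
cross_entropy(yhat, y[test])  # ≈ LogLoss()(yhat, y[test]), with the default `tol`
```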
@@ -451,14 +451,14 @@ transformation/inverse transformation:
```@example workflows
X, y = @load_reduced_ames
KNN = @load KNNRegressor
knn_with_target = TransformedTargetModel(model=KNN(K=3), target=Standardizer())
knn_with_target = TransformedTargetModel(model=KNN(K=3), transformer=Standardizer())
pipe = (X -> coerce(X, :age=>Continuous)) |> OneHotEncoder() |> knn_with_target
```

Evaluating the pipeline (just as you would any other model):

```@example workflows
pipe.one_hot_encoder.drop_last = true
pipe.one_hot_encoder.drop_last = true # mutate a nested hyper-parameter
evaluate(pipe, X, y, resampling=Holdout(), measure=RootMeanSquaredError(), verbosity=2)
```

@@ -476,7 +476,7 @@ target transformation/inverse transformation:
```@example workflows
Tree = @load DecisionTreeRegressor pkg=DecisionTree verbosity=0
tree_with_target = TransformedTargetModel(model=Tree(),
target=y -> log.(y),
transformer=y -> log.(y),
inverse = z -> exp.(z))
pipe2 = (X -> coerce(X, :age=>Continuous)) |> OneHotEncoder() |> tree_with_target;
nothing # hide
65 changes: 35 additions & 30 deletions docs/src/evaluating_model_performance.md
@@ -45,31 +45,41 @@ machine potentially change. )

## Multiple measures

Multiple measures are specified as a vector:

```@repl evaluation_of_supervised_models
evaluate!(mach,
resampling=cv,
measure=[l1, rms, rmslp1], verbosity=0)
evaluate!(
mach,
resampling=cv,
measures=[l1, rms, rmslp1],
verbosity=0,
)
```
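
The object returned by `evaluate!` can also be inspected programmatically. A minimal
sketch, using two of the documented fields of `PerformanceEvaluation`:

```julia
e = evaluate!(mach, resampling=cv, measures=[l1, rms, rmslp1], verbosity=0)
e.measurement  # one aggregated value per measure
e.per_fold     # for each measure, a vector of per-fold values
```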

## Custom measures and weighted measures

```@repl evaluation_of_supervised_models
my_loss(yhat, y) = maximum((yhat - y).^2);
[Custom measures](@ref) can also be provided.

my_per_observation_loss(yhat, y) = abs.(yhat - y);
MLJ.reports_each_observation(::typeof(my_per_observation_loss)) = true;
## Specifying weights

my_weighted_score(yhat, y) = 1/mean(abs.(yhat - y));
my_weighted_score(yhat, y, w) = 1/mean(abs.((yhat - y).^w));
MLJ.supports_weights(::typeof(my_weighted_score)) = true;
MLJ.orientation(::typeof(my_weighted_score)) = :score;
Per-observation weights can be passed to measures. If a measure does not support weights,
the weights are ignored:

```@repl evaluation_of_supervised_models
holdout = Holdout(fraction_train=0.8)
weights = [1, 1, 2, 1, 1, 2, 3, 1, 1, 2, 3, 1];
evaluate!(mach,
resampling=CV(nfolds=3),
measure=[my_loss, my_per_observation_loss, my_weighted_score, l1],
weights=weights, verbosity=0)
evaluate!(
mach,
resampling=CV(nfolds=3),
measure=[l2, rsquared],
weights=weights,
)
```

In classification problems, use `class_weights=...` to specify a class weight dictionary.
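
For example (a sketch, assuming here that `mach` wraps a classifier with a binary
target, that the hypothetical labels `"yes"` and `"no"` are the levels of that target,
and that `MulticlassFScore` is used as a measure supporting class weights):

```julia
class_weights = Dict("yes" => 2.0, "no" => 1.0)  # hypothetical class labels
evaluate!(
    mach,
    resampling=CV(nfolds=3),
    measure=MulticlassFScore(),
    class_weights=class_weights,
)
```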

```@docs
MLJBase.evaluate!
MLJBase.evaluate
MLJBase.PerformanceEvaluation
```

## User-specified train/test sets
@@ -78,18 +88,20 @@ Users can either provide an explicit list of train/test pairs of row indices for

```@repl evaluation_of_supervised_models
fold1 = 1:6; fold2 = 7:12;
evaluate!(mach,
resampling = [(fold1, fold2), (fold2, fold1)],
measure=[l1, l2], verbosity=0)
evaluate!(
mach,
resampling = [(fold1, fold2), (fold2, fold1)],
measures=[l1, l2],
verbosity=0,
)
```

Or define their own re-usable `ResamplingStrategy` objects, - see
[Custom resampling strategies](@ref) below.
Or the user can define their own re-usable `ResamplingStrategy` objects; see [Custom
resampling strategies](@ref) below.


## Built-in resampling strategies


```@docs
MLJBase.Holdout
```
Expand Down Expand Up @@ -159,10 +171,3 @@ function train_test_pairs(holdout::Holdout, rows)
end
```
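
Once `train_test_pairs` is implemented, the custom strategy is passed to `evaluate!`
like any built-in one. A sketch, with `MyStrategy` standing in for a hypothetical
user-defined `ResamplingStrategy`:

```julia
evaluate!(mach, resampling=MyStrategy(), measure=l2, verbosity=0)
```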

## API

```@docs
MLJBase.evaluate!
MLJBase.evaluate
MLJBase.PerformanceEvaluation
```
4 changes: 4 additions & 0 deletions docs/src/generating_synthetic_data.md
@@ -1,5 +1,9 @@
# Generating Synthetic Data

Here *synthetic data* means artificially generated data, with no reference to a "real
world" data set. It is not to be confused with "fake data", obtained by resampling from
a distribution fit to some actual real data.

MLJ has a set of functions - `make_blobs`, `make_circles`,
`make_moons` and `make_regression` (closely resembling functions in
[scikit-learn](https://scikit-learn.org/stable/datasets/index.html#generated-datasets)
