Merge pull request #1060 from alan-turing-institute/dev
For a 0.20 release
ablaom authored Sep 29, 2023
2 parents 1313d4c + a421a6f commit 97a51d3
Showing 26 changed files with 485 additions and 664 deletions.
3 changes: 3 additions & 0 deletions ORGANIZATION.md
@@ -40,6 +40,9 @@ its conventional use, are marked with a ⟂ symbol:
readme](https://github.com/JuliaAI/MLJBase.jl) for a
detailed description of MLJBase's contents.

* [StatisticalMeasures.jl](https://github.com/JuliaAI/StatisticalMeasures.jl) provides
performance measures (metrics) such as losses and scores.

* [MLJModels.jl](https://github.com/JuliaAI/MLJModels.jl)
hosts the *MLJ model registry*, which contains metadata on all the
models the MLJ user can search and load from MLJ. Moreover, it
14 changes: 8 additions & 6 deletions Project.toml
@@ -1,7 +1,7 @@
name = "MLJ"
uuid = "add582a8-e3ab-11e8-2d5e-e98b27df1bc7"
authors = ["Anthony D. Blaom <[email protected]>"]
version = "0.19.5"
version = "0.20.0"

[deps]
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
@@ -21,6 +21,7 @@ ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
ScientificTypes = "321657f4-b219-11e9-178b-2701a2544e81"
StatisticalMeasures = "a19d573c-0a75-4610-95b3-7071388c7541"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
@@ -29,17 +30,18 @@ Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
CategoricalArrays = "0.8,0.9, 0.10"
ComputationalResources = "0.3"
Distributions = "0.21,0.22,0.23, 0.24, 0.25"
MLJBase = "0.21.14"
MLJEnsembles = "0.3"
MLJFlow = "0.1"
MLJIteration = "0.5"
MLJBase = "1"
MLJEnsembles = "0.4"
MLJFlow = "0.2"
MLJIteration = "0.6"
MLJModels = "0.16"
MLJTuning = "0.7"
MLJTuning = "0.8"
OpenML = "0.2,0.3"
ProgressMeter = "1.1"
Reexport = "1.2"
ScientificTypes = "3"
StatsBase = "0.32,0.33, 0.34"
StatisticalMeasures = "0.1"
Tables = "0.2,1.0"
julia = "1.6"

13 changes: 4 additions & 9 deletions docs/Project.toml
@@ -8,11 +8,11 @@ EarlyStopping = "792122b4-ca99-40de-a6bc-6742525f08b6"
EvoTrees = "f6006082-12f8-11e9-0c9c-0d5d367ab1e5"
InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
IterationControl = "b3c1a2ee-3fec-4384-bf48-272ea71de57c"
LossFunctions = "30fc2ffe-d236-52d8-8643-a9d8f7c094a7"
MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
MLJClusteringInterface = "d354fa79-ed1c-40d4-88ef-b8c7bd1568af"
MLJDecisionTreeInterface = "c6f25543-311c-4c74-83dc-3ea6d1015661"
MLJEnsembles = "50ed68f4-41fd-4504-931a-ed422449fee0"
MLJFlow = "7b7b8358-b45c-48ea-a8ef-7ca328ad328f"
MLJGLMInterface = "caf8df21-4939-456d-ac9c-5fefbfb04c0c"
MLJIteration = "614be32b-d00c-4edb-bd02-1eb411ab5e55"
MLJLinearModels = "6ee0df7b-362f-4a72-a706-9e79364fb692"
@@ -25,16 +25,11 @@ NearestNeighborModels = "636a865e-7cf4-491e-846c-de09b730eb36"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
ScientificTypes = "321657f4-b219-11e9-178b-2701a2544e81"
ScientificTypesBase = "30f210dd-8aff-4c5f-94ba-8e64358c1161"
StatisticalMeasures = "a19d573c-0a75-4610-95b3-7071388c7541"
StatisticalMeasuresBase = "c062fc1d-0d66-479b-b6ac-8b44719de4cc"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
TypedTables = "9d95f2ec-7b3d-5a63-8d20-e2491e220bb9"

[compat]
CategoricalDistributions = "0.1"
Documenter = "0.27"
MLJEnsembles = "0.3"
MLJIteration = "0.5"
MLJModels = "0.16"
MLJTuning = "0.7"
ScientificTypes = "3"
ScientificTypesBase = "3"
Documenter = "1"
julia = "1.6"
12 changes: 7 additions & 5 deletions docs/make.jl
@@ -14,11 +14,13 @@ import MLJModels
import MLJEnsembles
import ScientificTypes
import MLJModelInterface
import ScientificTypes
import ScientificTypesBase
import Distributions
using CategoricalArrays
using LossFunctions
import CategoricalDistributions
import StatisticalMeasures
import StatisticalMeasuresBase

const MMI = MLJModelInterface

@@ -87,9 +89,7 @@ pages = [
"Third Party Packages" => "third_party_packages.md",
"Glossary" => "glossary.md",
"MLJ Cheatsheet" => "mlj_cheatsheet.md",
"Known Issues" => "known_issues.md",
"FAQ" => "frequently_asked_questions.md",
"Julia BlogPost" => "julia_blogpost.md",
"Index of Methods" => "api.md",
]

@@ -109,12 +109,14 @@ makedocs(
ScientificTypes,
MLJModelInterface,
ScientificTypesBase,
StatisticalMeasures,
MLJIteration,
EarlyStopping,
IterationControl,
CategoricalDistributions],
CategoricalDistributions,
StatisticalMeasures],
pages = pages,
strict = Documenter.except(:cross_references, :missing_docs),
warnonly = [:cross_references, :missing_docs],
)

@info "`makedocs` has finished running. "
20 changes: 18 additions & 2 deletions docs/model_docstring_tools.jl
@@ -2,6 +2,22 @@

const PATH_TO_MODEL_DOCS = joinpath(@__DIR__, "src", "models")

"""
    remove_doc_refs(str::AbstractString)

Remove `@ref` references from `str`. For example, a substring of the form
"[`some.thing_like_this123!`](@ref)" is replaced with "`some.thing_like_this123!`".
"""
function remove_doc_refs(page)
regex = r"\[([\?'\.\d`\!\_a-zA-Z]*)\]\(\@ref\)"
while contains(page, regex)
# replace the first matched regex with the captured string
page = replace(page, regex => s"\1")
end
page
end
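
# Illustration only (hypothetical input; not used anywhere in the docs build):
#
#     remove_doc_refs("see [`fit!`](@ref) and [`predict`](@ref)")
#
# returns "see `fit!` and `predict`".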

demote_headings(str) = replace(str, "# "=>"## ")
handle(model) = model.name*"_"*model.package_name

@@ -25,7 +41,7 @@ function write_page(model; path=PATH_TO_MODEL_DOCS)
open(pagepath, "w") do stream
header = "# [$(model.name)](@id $id)\n\n"
md_page = doc(model.name, pkg=model.package_name)
page = header*demote_headings(string(md_page))
page = header*demote_headings(string(md_page)) |> remove_doc_refs
write(stream, page)
nothing
end
@@ -54,7 +70,7 @@ function models_missing_descriptors()
handles = handle.(models())
filter(handles) do h
!(h in HANDLES)
end
end
end

"""
2 changes: 0 additions & 2 deletions docs/src/about_mlj.md
@@ -221,8 +221,6 @@ Bugs, suggestions, and feature requests can be posted
Users are also welcome to join the `#mlj` Julia slack channel to ask
questions and make suggestions.

See also, [Known Issues](@ref)


## Installation

12 changes: 6 additions & 6 deletions docs/src/common_mlj_workflows.md
@@ -176,10 +176,10 @@ KNN = @load KNNRegressor
knn = KNN()
evaluate(knn, X, y,
resampling=CV(nfolds=5),
measure=[RootMeanSquaredError(), MeanAbsoluteError()])
measure=[RootMeanSquaredError(), LPLoss(1)])
```

Note `RootMeanSquaredError()` has alias `rms` and `MeanAbsoluteError()` has alias `mae`.
Note that `RootMeanSquaredError()` has the alias `rms`, and that `LPLoss(1)` has the aliases `l1` and `mae`.

Do `measures()` to list all losses and scores and their aliases.

@@ -220,7 +220,7 @@ Fit on the train data set and evaluate on the test data set:
```@example workflows
fit!(mach, rows=train)
yhat = predict(mach, X[test,:])
mean(LogLoss(tol=1e-4)(yhat, y[test]))
LogLoss(tol=1e-4)(yhat, y[test])
```

Note that `LogLoss()` has the aliases `log_loss` and `cross_entropy`.
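
The same metric can accordingly be computed using an alias. A sketch (note that
`cross_entropy` names the *default* instance, so the non-default `tol` used above does
not apply):

```julia
cross_entropy(yhat, y[test])  # ≈ LogLoss()(yhat, y[test]), with the default `tol`
```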
@@ -451,14 +451,14 @@ transformation/inverse transformation:
```@example workflows
X, y = @load_reduced_ames
KNN = @load KNNRegressor
knn_with_target = TransformedTargetModel(model=KNN(K=3), target=Standardizer())
knn_with_target = TransformedTargetModel(model=KNN(K=3), transformer=Standardizer())
pipe = (X -> coerce(X, :age=>Continuous)) |> OneHotEncoder() |> knn_with_target
```

Evaluating the pipeline (just as you would any other model):

```@example workflows
pipe.one_hot_encoder.drop_last = true
pipe.one_hot_encoder.drop_last = true # mutate a nested hyper-parameter
evaluate(pipe, X, y, resampling=Holdout(), measure=RootMeanSquaredError(), verbosity=2)
```

@@ -476,7 +476,7 @@ target transformation/inverse transformation:
```@example workflows
Tree = @load DecisionTreeRegressor pkg=DecisionTree verbosity=0
tree_with_target = TransformedTargetModel(model=Tree(),
target=y -> log.(y),
transformer=y -> log.(y),
inverse = z -> exp.(z))
pipe2 = (X -> coerce(X, :age=>Continuous)) |> OneHotEncoder() |> tree_with_target;
nothing # hide
65 changes: 35 additions & 30 deletions docs/src/evaluating_model_performance.md
@@ -45,31 +45,41 @@ machine potentially change. )

## Multiple measures

Multiple measures are specified as a vector:

```@repl evaluation_of_supervised_models
evaluate!(mach,
resampling=cv,
measure=[l1, rms, rmslp1], verbosity=0)
evaluate!(
mach,
resampling=cv,
measures=[l1, rms, rmslp1],
verbosity=0,
)
```
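
The object returned by `evaluate!` can also be inspected programmatically. A minimal
sketch, using two of the documented fields of `PerformanceEvaluation`:

```julia
e = evaluate!(mach, resampling=cv, measures=[l1, rms, rmslp1], verbosity=0)
e.measurement  # one aggregated value per measure
e.per_fold     # for each measure, a vector of per-fold values
```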

## Custom measures and weighted measures

```@repl evaluation_of_supervised_models
my_loss(yhat, y) = maximum((yhat - y).^2);
[Custom measures](@ref) can also be provided.

my_per_observation_loss(yhat, y) = abs.(yhat - y);
MLJ.reports_each_observation(::typeof(my_per_observation_loss)) = true;
## Specifying weights

my_weighted_score(yhat, y) = 1/mean(abs.(yhat - y));
my_weighted_score(yhat, y, w) = 1/mean(abs.((yhat - y).^w));
MLJ.supports_weights(::typeof(my_weighted_score)) = true;
MLJ.orientation(::typeof(my_weighted_score)) = :score;
Per-observation weights can be passed to measures. If a measure does not support weights,
the weights are ignored:

```@repl evaluation_of_supervised_models
holdout = Holdout(fraction_train=0.8)
weights = [1, 1, 2, 1, 1, 2, 3, 1, 1, 2, 3, 1];
evaluate!(mach,
resampling=CV(nfolds=3),
measure=[my_loss, my_per_observation_loss, my_weighted_score, l1],
weights=weights, verbosity=0)
evaluate!(
mach,
resampling=CV(nfolds=3),
measure=[l2, rsquared],
weights=weights,
)
```

In classification problems, use `class_weights=...` to specify a class weight dictionary.
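
For example (a sketch, assuming here that `mach` wraps a classifier with a binary
target, that the hypothetical labels `"yes"` and `"no"` are the levels of that target,
and that `MulticlassFScore` is used as a measure supporting class weights):

```julia
class_weights = Dict("yes" => 2.0, "no" => 1.0)  # hypothetical class labels
evaluate!(
    mach,
    resampling=CV(nfolds=3),
    measure=MulticlassFScore(),
    class_weights=class_weights,
)
```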

```@docs
MLJBase.evaluate!
MLJBase.evaluate
MLJBase.PerformanceEvaluation
```

## User-specified train/test sets
@@ -78,18 +88,20 @@ Users can either provide an explicit list of train/test pairs of row indices for

```@repl evaluation_of_supervised_models
fold1 = 1:6; fold2 = 7:12;
evaluate!(mach,
resampling = [(fold1, fold2), (fold2, fold1)],
measure=[l1, l2], verbosity=0)
evaluate!(
mach,
resampling = [(fold1, fold2), (fold2, fold1)],
measures=[l1, l2],
verbosity=0,
)
```

Or define their own re-usable `ResamplingStrategy` objects, - see
[Custom resampling strategies](@ref) below.
Or the user can define their own re-usable `ResamplingStrategy` objects; see [Custom
resampling strategies](@ref) below.


## Built-in resampling strategies


```@docs
MLJBase.Holdout
```
Expand Down Expand Up @@ -159,10 +171,3 @@ function train_test_pairs(holdout::Holdout, rows)
end
```
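
Once `train_test_pairs` is implemented, the custom strategy is passed to `evaluate!`
like any built-in one. A sketch, with `MyStrategy` standing in for a hypothetical
user-defined `ResamplingStrategy`:

```julia
evaluate!(mach, resampling=MyStrategy(), measure=l2, verbosity=0)
```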

## API

```@docs
MLJBase.evaluate!
MLJBase.evaluate
MLJBase.PerformanceEvaluation
```
4 changes: 4 additions & 0 deletions docs/src/generating_synthetic_data.md
@@ -1,5 +1,9 @@
# Generating Synthetic Data

Here *synthetic data* means artificially generated data, with no reference to a "real
world" data set. It is not to be confused with "fake data", obtained by resampling from
a distribution fit to some actual real data.

MLJ has a set of functions - `make_blobs`, `make_circles`,
`make_moons` and `make_regression` (closely resembling functions in
[scikit-learn](https://scikit-learn.org/stable/datasets/index.html#generated-datasets)
