diff --git a/ORGANIZATION.md b/ORGANIZATION.md index 2a7c690a1..fd6ec873e 100644 --- a/ORGANIZATION.md +++ b/ORGANIZATION.md @@ -40,6 +40,9 @@ its conventional use, are marked with a ⟂ symbol: readme](https://github.com/JuliaAI/MLJBase.jl) for a detailed description of MLJBase's contents. +* [StatisticalMeasures.jl](https://github.com/JuliaAI/StatisticalMeasures.jl) provides + performance measures (metrics) such as losses and scores. + * [MLJModels.jl](https://github.com/JuliaAI/MLJModels.jl) hosts the *MLJ model registry*, which contains metadata on all the models the MLJ user can search and load from MLJ. Moreover, it diff --git a/Project.toml b/Project.toml index 5a6b42914..e1b39ad64 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "MLJ" uuid = "add582a8-e3ab-11e8-2d5e-e98b27df1bc7" authors = ["Anthony D. Blaom "] -version = "0.19.5" +version = "0.20.0" [deps] CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" @@ -21,6 +21,7 @@ ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" ScientificTypes = "321657f4-b219-11e9-178b-2701a2544e81" +StatisticalMeasures = "a19d573c-0a75-4610-95b3-7071388c7541" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" @@ -29,17 +30,18 @@ Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" CategoricalArrays = "0.8,0.9, 0.10" ComputationalResources = "0.3" Distributions = "0.21,0.22,0.23, 0.24, 0.25" -MLJBase = "0.21.14" -MLJEnsembles = "0.3" -MLJFlow = "0.1" -MLJIteration = "0.5" +MLJBase = "1" +MLJEnsembles = "0.4" +MLJFlow = "0.2" +MLJIteration = "0.6" MLJModels = "0.16" -MLJTuning = "0.7" +MLJTuning = "0.8" OpenML = "0.2,0.3" ProgressMeter = "1.1" Reexport = "1.2" ScientificTypes = "3" StatsBase = "0.32,0.33, 0.34" +StatisticalMeasures = "0.1" Tables = "0.2,1.0" julia = "1.6" diff --git a/docs/Project.toml b/docs/Project.toml index ee3cd3f2e..a1f37bd96 100755 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -8,11 +8,11 @@ EarlyStopping = "792122b4-ca99-40de-a6bc-6742525f08b6" EvoTrees = "f6006082-12f8-11e9-0c9c-0d5d367ab1e5" InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" IterationControl = "b3c1a2ee-3fec-4384-bf48-272ea71de57c" -LossFunctions = "30fc2ffe-d236-52d8-8643-a9d8f7c094a7" MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d" MLJClusteringInterface = "d354fa79-ed1c-40d4-88ef-b8c7bd1568af" MLJDecisionTreeInterface = "c6f25543-311c-4c74-83dc-3ea6d1015661" MLJEnsembles = "50ed68f4-41fd-4504-931a-ed422449fee0" +MLJFlow = "7b7b8358-b45c-48ea-a8ef-7ca328ad328f" MLJGLMInterface = "caf8df21-4939-456d-ac9c-5fefbfb04c0c" MLJIteration = "614be32b-d00c-4edb-bd02-1eb411ab5e55" MLJLinearModels = "6ee0df7b-362f-4a72-a706-9e79364fb692" @@ -25,16 +25,11 @@ NearestNeighborModels = "636a865e-7cf4-491e-846c-de09b730eb36" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" ScientificTypes = "321657f4-b219-11e9-178b-2701a2544e81" ScientificTypesBase = "30f210dd-8aff-4c5f-94ba-8e64358c1161" +StatisticalMeasures = "a19d573c-0a75-4610-95b3-7071388c7541" +StatisticalMeasuresBase = "c062fc1d-0d66-479b-b6ac-8b44719de4cc" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" TypedTables = "9d95f2ec-7b3d-5a63-8d20-e2491e220bb9" [compat] -CategoricalDistributions = "0.1" -Documenter = "0.27" -MLJEnsembles = "0.3" -MLJIteration = "0.5" -MLJModels = "0.16" -MLJTuning = "0.7" -ScientificTypes = "3" -ScientificTypesBase = "3" +Documenter = "1" julia = "1.6"
diff --git a/docs/make.jl b/docs/make.jl index f646377a5..93a7dfd9a 100755 --- a/docs/make.jl +++ b/docs/make.jl @@ -14,11 +14,12 @@ import MLJModels import MLJEnsembles import ScientificTypes import MLJModelInterface import ScientificTypesBase import Distributions using CategoricalArrays -using LossFunctions import CategoricalDistributions +import StatisticalMeasures +import StatisticalMeasuresBase const MMI = MLJModelInterface @@ -87,9 +89,7 @@ pages = [ "Third Party Packages" => "third_party_packages.md", "Glossary" => "glossary.md", "MLJ Cheatsheet" => "mlj_cheatsheet.md", - "Known Issues" => "known_issues.md", "FAQ" => "frequently_asked_questions.md", - "Julia BlogPost" => "julia_blogpost.md", "Index of Methods" => "api.md", ] @@ -109,12 +109,13 @@ makedocs( ScientificTypes, MLJModelInterface, ScientificTypesBase, MLJIteration, EarlyStopping, IterationControl, - CategoricalDistributions], + CategoricalDistributions, + StatisticalMeasures], pages = pages, - strict = Documenter.except(:cross_references, :missing_docs), + warnonly = [:cross_references, :missing_docs], ) @info "`makedocs` has finished running. " diff --git a/docs/model_docstring_tools.jl b/docs/model_docstring_tools.jl index 0edf63f49..5885f7d4d 100644 --- a/docs/model_docstring_tools.jl +++ b/docs/model_docstring_tools.jl @@ -2,6 +2,22 @@ const PATH_TO_MODEL_DOCS = joinpath(@__DIR__, "src", "models") +""" + remove_doc_refs(str::AbstractString) + +Removes `@ref` references from `str`. For example, a substring of the form +"[`some.thing_like_this123!`](@ref)" is replaced with "`some.thing_like_this123!`". + +""" +function remove_doc_refs(page) + regex = r"\[([\?'\.\d`\!\_a-zA-Z]*)\]\(\@ref\)" + while contains(page, regex) + # replace the first matched regex with the captured string + page = replace(page, regex => s"\1") + end + page +end + demote_headings(str) = replace(str, "# "=>"## ") handle(model) = model.name*"_"*model.package_name @@ -25,7 +41,7 @@ function write_page(model; path=PATH_TO_MODEL_DOCS) open(pagepath, "w") do stream header = "# [$(model.name)](@id $id)\n\n" md_page = doc(model.name, pkg=model.package_name) - page = header*demote_headings(string(md_page)) + page = header*demote_headings(string(md_page)) |> remove_doc_refs write(stream, page) nothing end @@ -54,7 +70,7 @@ function models_missing_descriptors() handles = handle.(models()) filter(handles) do h !(h in HANDLES) - end + end end """ diff --git a/docs/src/about_mlj.md b/docs/src/about_mlj.md index 276cfefd5..f54896165 100755 --- a/docs/src/about_mlj.md +++ b/docs/src/about_mlj.md @@ -221,8 +221,6 @@ Bugs, suggestions, and feature requests can be posted Users are also welcome to join the `#mlj` Julia slack channel to ask questions and make suggestions. -See also, [Known Issues](@ref) - ## Installation diff --git a/docs/src/common_mlj_workflows.md b/docs/src/common_mlj_workflows.md index ce66037a6..2b7cfaec9 100644 --- a/docs/src/common_mlj_workflows.md +++ b/docs/src/common_mlj_workflows.md @@ -176,10 +176,10 @@ KNN = @load KNNRegressor knn = KNN() evaluate(knn, X, y, resampling=CV(nfolds=5), - measure=[RootMeanSquaredError(), MeanAbsoluteError()]) + measure=[RootMeanSquaredError(), LPLoss(1)]) ``` -Note `RootMeanSquaredError()` has alias `rms` and `MeanAbsoluteError()` has alias `mae`. +Note `RootMeanSquaredError()` has alias `rms` and `LPLoss(1)` has aliases `l1`, `mae`. Do `measures()` to list all losses and scores and their aliases.
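+For example, the evaluation above can be repeated using those aliases (a minimal sketch,
+re-using the `knn`, `X` and `y` defined above):
+
+```julia
+# `rms` and `l1` are instances of `RootMeanSquaredError()` and `LPLoss(1)`:
+evaluate(knn, X, y, resampling=CV(nfolds=5), measure=[rms, l1])
+```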
@@ -220,7 +220,7 @@ Fit on the train data set and evaluate on the test data set: ```@example workflows fit!(mach, rows=train) yhat = predict(mach, X[test,:]) -mean(LogLoss(tol=1e-4)(yhat, y[test])) +LogLoss(tol=1e-4)(yhat, y[test]) ``` Note `LogLoss()` has aliases `log_loss` and `cross_entropy`. @@ -451,14 +451,14 @@ transformation/inverse transformation: ```@example workflows X, y = @load_reduced_ames KNN = @load KNNRegressor -knn_with_target = TransformedTargetModel(model=KNN(K=3), target=Standardizer()) +knn_with_target = TransformedTargetModel(model=KNN(K=3), transformer=Standardizer()) pipe = (X -> coerce(X, :age=>Continuous)) |> OneHotEncoder() |> knn_with_target ``` Evaluating the pipeline (just as you would any other model): ```@example workflows -pipe.one_hot_encoder.drop_last = true +pipe.one_hot_encoder.drop_last = true # mutate a nested hyper-parameter evaluate(pipe, X, y, resampling=Holdout(), measure=RootMeanSquaredError(), verbosity=2) ``` @@ -476,7 +476,7 @@ target transformation/inverse transformation: ```@example workflows Tree = @load DecisionTreeRegressor pkg=DecisionTree verbosity=0 tree_with_target = TransformedTargetModel(model=Tree(), - target=y -> log.(y), + transformer=y -> log.(y), inverse = z -> exp.(z)) pipe2 = (X -> coerce(X, :age=>Continuous)) |> OneHotEncoder() |> tree_with_target; nothing # hide diff --git a/docs/src/evaluating_model_performance.md b/docs/src/evaluating_model_performance.md index 63476d14f..448283c57 100644 --- a/docs/src/evaluating_model_performance.md +++ b/docs/src/evaluating_model_performance.md @@ -45,31 +45,41 @@ machine potentially change. ) ## Multiple measures +Multiple measures are specified as a vector: + ```@repl evaluation_of_supervised_models -evaluate!(mach, - resampling=cv, - measure=[l1, rms, rmslp1], verbosity=0) +evaluate!( + mach, + resampling=cv, + measures=[l1, rms, rmslp1], + verbosity=0, +) ``` -## Custom measures and weighted measures - -```@repl evaluation_of_supervised_models -my_loss(yhat, y) = maximum((yhat - y).^2); +[Custom measures](@ref) can also be provided. -my_per_observation_loss(yhat, y) = abs.(yhat - y); -MLJ.reports_each_observation(::typeof(my_per_observation_loss)) = true; +## Specifying weights -my_weighted_score(yhat, y) = 1/mean(abs.(yhat - y)); -my_weighted_score(yhat, y, w) = 1/mean(abs.((yhat - y).^w)); -MLJ.supports_weights(::typeof(my_weighted_score)) = true; -MLJ.orientation(::typeof(my_weighted_score)) = :score; +Per-observation weights can be passed to measures. If a measure does not support weights, +the weights are ignored: +```@repl evaluation_of_supervised_models holdout = Holdout(fraction_train=0.8) weights = [1, 1, 2, 1, 1, 2, 3, 1, 1, 2, 3, 1]; -evaluate!(mach, - resampling=CV(nfolds=3), - measure=[my_loss, my_per_observation_loss, my_weighted_score, l1], - weights=weights, verbosity=0) +evaluate!( + mach, + resampling=CV(nfolds=3), + measure=[l2, rsquared], + weights=weights, +) +``` + +In classification problems, use `class_weights=...` to specify a class weight dictionary. + +```@docs +MLJBase.evaluate! 
+MLJBase.evaluate +MLJBase.PerformanceEvaluation ``` ## User-specified train/test sets @@ -78,18 +88,20 @@ Users can either provide an explicit list of train/test pairs of row indices for ```@repl evaluation_of_supervised_models fold1 = 1:6; fold2 = 7:12; -evaluate!(mach, - resampling = [(fold1, fold2), (fold2, fold1)], - measure=[l1, l2], verbosity=0) +evaluate!( + mach, + resampling = [(fold1, fold2), (fold2, fold1)], + measures=[l1, l2], + verbosity=0, +) ``` -Or define their own re-usable `ResamplingStrategy` objects, - see -[Custom resampling strategies](@ref) below. +Or the user can define their own re-usable `ResamplingStrategy` objects, - see [Custom +resampling strategies](@ref) below. ## Built-in resampling strategies - ```@docs MLJBase.Holdout ``` @@ -159,10 +171,3 @@ function train_test_pairs(holdout::Holdout, rows) end ``` -## API - -```@docs -MLJBase.evaluate! -MLJBase.evaluate -MLJBase.PerformanceEvaluation -``` diff --git a/docs/src/generating_synthetic_data.md b/docs/src/generating_synthetic_data.md index f999d91f9..0349f0861 100644 --- a/docs/src/generating_synthetic_data.md +++ b/docs/src/generating_synthetic_data.md @@ -1,5 +1,9 @@ # Generating Synthetic Data +Here *synthetic data* means artificially generated data, with no reference to a "real +world" data set. Not to be confused with "fake data" obtained by resampling from a distribution +fit to some actual real data. + MLJ has a set of functions - `make_blobs`, `make_circles`, `make_moons` and `make_regression` (closely resembling functions in [scikit-learn](https://scikit-learn.org/stable/datasets/index.html#generated-datasets) diff --git a/docs/src/getting_started.md b/docs/src/getting_started.md index bd104f182..5952cc6bb 100644 --- a/docs/src/getting_started.md +++ b/docs/src/getting_started.md @@ -5,7 +5,10 @@ For an outline of MLJ's **goals** and **features**, see This page introduces some MLJ basics, assuming some familiarity with machine learning. For a complete list of other MLJ learning resources, -see [Learning MLJ](@ref). +see [Learning MLJ](@ref). + +MLJ collects together the functionality provided by multiple packages. To learn how to +install components separately, run `using MLJ; @doc MLJ`. This section introduces only the most basic MLJ operations and concepts. It assumes MLJ has been successfully installed. See @@ -191,7 +194,7 @@ train, test = partition(eachindex(y), 0.7); # 70:30 split fit!(mach, rows=train); yhat = predict(mach, X[test,:]); yhat[3:5] -log_loss(yhat, y[test]) |> mean +log_loss(yhat, y[test]) ``` Note that `log_loss` and `cross_entropy` are aliases for `LogLoss()` @@ -265,17 +268,8 @@ evaluate!(mach, resampling=Holdout(fraction_train=0.7), ## Next steps -To learn a little more about what MLJ can do, browse [Common MLJ -Workflows](common_mlj_workflows.md) or [Data Science Tutorials in -Julia](https://alan-turing-institute.github.io/DataScienceTutorials.jl/) -or try the [JuliaCon2020 -Workshop](https://github.com/ablaom/MachineLearningInJulia2020) on MLJ -(recorded -[here](https://www.youtube.com/watch?time_continue=27&v=qSWbCn170HU&feature=emb_title)) -returning to the manual as needed. - -*Read at least the remainder of this page before considering serious -use of MLJ.* +For next steps, consult the [Learning MLJ](@ref) section. 
*At the least, we recommend you +read the remainder of this page before considering serious use of MLJ.* ## Data containers and scientific types @@ -341,7 +335,7 @@ scientific type `Table{K}`, where `K` depends on the scientific types of the col which can be individually inspected using `schema`: ```@repl doda -schema(X) +ScientificTypes.schema(X) ``` #### Matrix data @@ -350,23 +344,11 @@ MLJ models expecting a table do not generally accept a matrix instead. However, a matrix can be wrapped as a table, using [`MLJ.table`](@ref): -```julia -matrix_table = MLJ.table(rand(2,3)) +```@repl doda +matrix_table = MLJ.table(rand(2,3)); schema(matrix_table) ``` -``` -┌─────────┬─────────┬────────────┐ -│ _.names │ _.types │ _.scitypes │ -├─────────┼─────────┼────────────┤ -│ x1 │ Float64 │ Continuous │ -│ x2 │ Float64 │ Continuous │ -│ x3 │ Float64 │ Continuous │ -└─────────┴─────────┴────────────┘ -_.nrows = 2 - -``` - The matrix is *not* copied, only wrapped. To manifest a table as a matrix, use [`MLJ.matrix`](@ref). @@ -447,19 +429,9 @@ are the key features of that convention: - The scientific types of `nothing` and `missing` are `Nothing` and `Missing`, native types we also regard as scientific. -Use `coerce(v, OrderedFactor)` or `coerce(v, Multiclass)` to coerce a -vector `v` of integers, strings or characters to a vector with an -appropriate `Finite` (categorical) scitype. See [Working with -Categorical Data](@ref). - -For more on scitype coercion of arrays and tables, see [`coerce`](@ref), -[`autotype`](@ref) and [`unpack`](@ref) below and the examples at -[ScientificTypes.jl](https://JuliaAI.github.io/ScientificTypes.jl/dev/). +Use `coerce(v, OrderedFactor)` or `coerce(v, Multiclass)` to coerce a vector `v` of +integers, strings or characters to a vector with an appropriate `Finite` (categorical) +scitype. See also [Working with Categorical Data](@ref), and the +[ScientificTypes.jl](https://JuliaAI.github.io/ScientificTypes.jl/dev/) documentation. - -```@docs -scitype -coerce -autotype -``` diff --git a/docs/src/julia_blogpost.md b/docs/src/julia_blogpost.md deleted file mode 100644 index cad97e02a..000000000 --- a/docs/src/julia_blogpost.md +++ /dev/null @@ -1,227 +0,0 @@ -!!! warning "Old post" - - This post is quite old. For a newer overview of the design of MLJ, see [here](https://github.com/alan-turing-institute/MLJ.jl/blob/master/paper/paper.md) - - -# Beyond machine learning pipelines with MLJ - -Anthony Blaom, Diego Arenas, Franz Kiraly, Yiannis Simillides, Sebastian Vollmer - -**May 1st, 2019.** Blog post also posted on the [Julia Language Blog](https://julialang.org/blog/2019/05/beyond-ml-pipelines-with-mlj) - - - - -![](img/learningcurves.png) | ![](img/heatmap.png) ------------------------|-------------------------- -![](img/wrapped_ridge.png) | ![](img/MLPackages.png) - - -## Introducing MLJ - -[MLJ](https://github.com/alan-turing-institute/MLJ.jl) is an -open-source machine learning toolbox written in pure Julia. It -provides a uniform interface for interacting with supervised and -unsupervised learning models currently scattered in different Julia -packages. - -Building on a earlier proof-of-concept, development began in earnest -at [The Alan Turing Institute](https://www.turing.ac.uk) in -December 2018. In a short time interest grew and the project is now -the Institute's most starred software repository. - -After outlining MLJ's current functionality, this post introduces MLJ -**learning networks**, a super-charged pipelining feature for model -composition. 
- -**Quick links:** - -- [MLJ vs ScikitLearn.jl](https://alan-turing-institute.github.io/MLJ.jl/dev/frequently_asked_questions/) - -- Video from [London Julia User Group meetup in March 2019](https://www.youtube.com/watch?v=CfHkjNmj1eE) (skip to [demo at 21'39](https://youtu.be/CfHkjNmj1eE?t=21m39s))   - -- [Learning MLJ](@ref) - -- Implementing the MLJ interface for a [new model](https://alan-turing-institute.github.io/MLJ.jl/dev/adding_models_for_general_use/) - -- How to [contribute](https://github.com/alan-turing-institute/MLJ.jl/blob/master/CONTRIBUTE.md) - -- Julia [Slack](http://julialang.slack.com) channel: \#mlj. - -- Star'ing us to show support for [MLJ](https://github.com/alan-turing-institute/MLJ.jl) would be greatly appreciated! - - -## MLJ features - -MLJ already has substantial functionality: - -- **Learning networks.** Flexible model composition beyond traditional - pipelines (more on this below). - -- **Automatic tuning.** Automated tuning of hyperparameters, including - composite models. Tuning implemented as a model wrapper for - composition with other meta-algorithms. - -- **Homogeneous model ensembling.** - -- **Registry for model metadata.** Metadata available without loading - model code. Basis of a "task" interface and facilitates - model composition. - -- **Task interface.** Automatically match models to specified learning - tasks, to streamline benchmarking and model selection. - -- **Clean probabilistic API.** Improves support for Bayesian - statistics and probabilistic graphical models. - -- **Data container agnostic.** Present and manipulate data in your - favorite Tables.jl format. - -- **Universal adoption of categorical data types.** Enables model - implementations to properly account for classes seen in training but - not in evaluation. - -Enhancements planned for the near future include integration of -Flux.jl **deep learning** models, and **gradient descent tuning** of -continuous hyperparameters using automatic differentiation. - -While a relatively small number of machine learning models currently -implement the MLJ interface, work in progress aims to wrap models -supported by the popular python framework, scikit-learn, as a -temporary expedient. For a comparison of the MLJ's design with the -Julia wrap [ScitLearn.jl](https://github.com/cstjean/ScikitLearn.jl), -see this -[FAQ](https://github.com/alan-turing-institute/MLJ.jl/blob/master/docs/src/frequently_asked_questions.md). - - -## Learning networks - -MLJ's model composition interface is flexible enough to implement, for -example, the [model -stacks](https://www.kdnuggets.com/2017/02/stacking-models-imropved-predictions.html) -popular in data science competitions. To treat examples of this kind, -the interface design must account for the fact that information flow -in prediction and training modes is different. This can be seen from -the following schematic of a simple two-model stack, viewed as a -network: - -![](img/two_model_stack.png) - -## Building a simple network - -In MLJ, networks of models are built using a declarative syntax -already familiar from basic use of the package. 
For example, the -ordinary syntax for training a decision tree in MLJ, after one-hot -encoding the categorical features, looks like this: - -```julia -using MLJ -@load DecisionTreeRegressor - -# load some data: -task = load_reduced_ames(); -X, y = task(); - -# one-hot encode the inputs, X: -hot_model = OneHotEncoder() -hot = machine(hot_model, X) -fit!(hot) -Xt = transform(hot, X) - -# fit a decision tree to the transformed data: -tree_model = DecisionTreeRegressor() -tree = machine(tree_model, Xt, y) -fit!(tree, rows = 1:1300) -``` - -Note that a *model* in MLJ is just a struct containing -hyperparameters. Wrapping a model in data delivers a *machine* struct, -which will additionally record the results of training. - -Without a pipeline, each time we want to present new data for -prediction we must first apply one-hot encoding: - -```julia -Xnew = X[1301:1400,:]; -Xnewt = transform(hot, Xnew); -yhat = predict(tree, Xnewt); -yhat[1:3] - 3-element Array{Float64,1}: - 223956.9999999999 - 320142.85714285733 - 161227.49999999994 -``` - -To build a pipeline one simply wraps the supplied data in source nodes -and repeats similar declarations, omitting calls to -`fit!`. The difference now is that each "variable" (e.g., `Xt`, -`yhat`) is a node of our pipeline, instead of concrete data: - -```julia -Xs = source(X) -ys = source(y) - -hot = machine(hot_model, Xs) -Xt = transform(hot, Xs); - -tree = machine(tree_model, Xt, ys) -yhat = predict(tree, Xt) -``` - -If we like, we can think of a node as *dynamic data* - "data" because -it can be called (indexed) on rows, but "dynamic" because the result -depends on the outcome of training events, which in turn depend on -hyperparameter values. For example, after fitting the completed pipeline, -we can make new predictions like this: - -```julia -fit!(yhat, rows=1:1300) - [ Info: Training NodalMachine @ 1…51. - [ Info: Spawned 1300 sub-features to one-hot encode feature :Neighborhood. - [ Info: Spawned 1300 sub-features to one-hot encode feature :MSSubClass. - [ Info: Training NodalMachine @ 1…17. - Node @ 1…79 = predict(1…17, transform(1…51, 1…07)) - -yhat(rows=1301:1302) # to predict on rows of source node -yhat(Xnew) # to predict on new data -156-element Array{Float64,1}: - 223956.9999999999 - 320142.85714285733 - ... -``` - - -## Exporting and retraining - -Once a pipeline like this has been built and tested on sample data, it -can be exported as a stand-alone model, ready to be trained on any -dataset. For details, see the MLJ -[documentation](https://alan-turing-institute.github.io/MLJ.jl/dev/learning_networks/). In -the future, Julia macros will allow common architectures (e.g., linear -pipelines) to be built in a couple of lines. - -Finally, we mention that MLJ learning networks, and their exported -counterparts, are "smart" in the sense that changing a hyperparameter -does not trigger retraining of component models upstream of the -change: - -```julia -tree_model.max_depth = 4 -fit!(yhat, rows=1:1300) - [ Info: Not retraining NodalMachine @ 1…51. It is up-to-date. - [ Info: Updating NodalMachine @ 1…17. - Node @ 1…79 = predict(1…17, transform(1…51, 1…07)) -``` - - -## Just "Write the math!" - -Because of Julia's generic programming features, any kind of operation -you would normally apply to data (arithmetic, row selection, column -concatenation, etc) can be overloaded to work with nodes. In this way, -MLJ's network-building syntax is economical, intuitive and easy to -read. 
In this respect we have been inspired by [On Machine Learning -and Programming Languages](https://julialang.org/blog/2017/12/ml&pl). - -## Invitation to the community -We now invite the community to try out our newly registered packages, [MLJ](https://github.com/alan-turing-institute/MLJ.jl)alongside [MLJModels](https://github.com/JuliaAI/MLJModels.jl), and provide any feedback or suggestions you may have going forward. We are also particularly interested in hearing how you would use our package, and what features it may be lacking. diff --git a/docs/src/known_issues.md b/docs/src/known_issues.md deleted file mode 100644 index 807127dd9..000000000 --- a/docs/src/known_issues.md +++ /dev/null @@ -1,32 +0,0 @@ -# Known Issues - -Routine issues are posted -[here](https://github.com/alan-turing-institute/MLJ.jl/issues). Below -are some longer term issues and limitations. - -#### ScikitLearn/MKL issue - -For users of Mac OS using Julia 1.3 or higher, using ScikitLearn -models can lead to unexpected MKL errors due to an issue not related -to MLJ. See -[this Julia Discourse discussion](https://discourse.julialang.org/t/julia-1-3-1-4-on-macos-and-intel-mkl-error/36469/2) -and -[this issue](https://github.com/JuliaPackaging/BinaryBuilder.jl/issues/700) -for context. - -A temporary workaround for this issue is to force the installation of -an older version of the `OpenSpecFun_jll` library. To install an -appropriate version, activate your MLJ environment and run - -```julia - using Pkg; - Pkg.add(PackageSpec(url="https://github.com/tlienart/OpenSpecFun_jll.jl")) -``` - -#### Serialization for composite models with component models with custom serialization - -See -[here](https://github.com/alan-turing-institute/MLJ.jl/issues/678). Workaround: -Instead of `XGBoost` models (the chief known case) use models from the -pure Julia package `EvoTrees`. - diff --git a/docs/src/learning_curves.md b/docs/src/learning_curves.md index d49a1f9d1..42847171a 100644 --- a/docs/src/learning_curves.md +++ b/docs/src/learning_curves.md @@ -24,7 +24,7 @@ r_lambda = range(ensemble, :(model.lambda), lower=1e-1, upper=100, scale=:log10) curve = MLJ.learning_curve(mach; range=r_lambda, resampling=CV(nfolds=3), - measure=MeanAbsoluteError()) + measure=l1) ``` ```julia using Plots @@ -52,7 +52,7 @@ atom.lambda= 7.3 r_n = range(ensemble, :n, lower=1, upper=50) curves = MLJ.learning_curve(mach; range=r_n, - measure=MeanAbsoluteError(), + measure=l1, verbosity=0, rng_name=:rng, rngs=4) diff --git a/docs/src/learning_networks.md b/docs/src/learning_networks.md index a8f30398b..46e688941 100644 --- a/docs/src/learning_networks.md +++ b/docs/src/learning_networks.md @@ -240,7 +240,7 @@ data). 
We demonstrate the process by way of examples of increasing complexity: - [Example A - Mini-pipeline](@ref) - [More on replacing models with symbols](@ref) - [Example B - Multiple operations: transform and inverse transform](@ref) -- [Example C - Exposing internal network state in reports](@ref) +- [Example C - Blending predictions and exposing internal network state in reports](@ref) - [Example D - Multiple nodes pointing to the same machine](@ref) - [Example E - Coupling component model hyper-parameters](@ref) - [More on defining new nodes](@ref) @@ -428,7 +428,7 @@ W = transform(mach, X) @assert inverse_transform(mach, W) ≈ X ``` -### Example C - Exposing internal network state in reports +### Example C - Blending predictions and exposing internal network state in reports The code below defines a new composite model type `CompositeC` that predicts by taking the weighted average of two regressors, and additionally exposes, in the model's report, a diff --git a/docs/src/logging_workflows.md b/docs/src/logging_workflows.md index 7f4b468cd..8eda54f4f 100644 --- a/docs/src/logging_workflows.md +++ b/docs/src/logging_workflows.md @@ -5,8 +5,11 @@ [MLflow](https://mlflow.org) is a popular, language-agnostic, tool for externally logging the outcomes of machine learning experiments, including those carried out using MLJ. -This functionality is provided by the [MLJFlow.jl](https://github.com/JuliaAI/MLJFlow.jl) -package whose methods are automatically available to MLJ users. Refer to the package's -documentation for examples. +MLJ logging examples are given in the [MLJFlow.jl](https://github.com/JuliaAI/MLJFlow.jl) +documentation. MLJ includes and re-exports all the methods of MLJFlow.jl, so there is no +need to import MLJFlow.jl if `using MLJ`. +!!! warning + + MLJFlow.jl is a new package still under active development and should be regarded as experimental. At this time, breaking changes to MLJFlow.jl will not necessarily trigger new breaking releases of MLJ.jl. diff --git a/docs/src/machines.md b/docs/src/machines.md index 8bfc9cd2c..68eb9cddc 100644 --- a/docs/src/machines.md +++ b/docs/src/machines.md @@ -93,8 +93,8 @@ report(mach) ``` ```@docs -fitted_params -report +fitted_params(::Machine) +report(::Machine) ``` ### Training losses and feature importances @@ -166,12 +166,9 @@ machine(model, X, y, cache=false) ### Constructing machines in learning networks -Instead of data `X`, `y`, etc, the `machine` constructor is provided -`Node` or `Source` objects ("dynamic data") when building a learning -network. See [Composing Models](composing_models.md) for more on this -advanced feature. One also uses `machine` to wrap a machine -around a whole learning network; see [Learning network -machines](@ref). +Instead of data `X`, `y`, etc, the `machine` constructor is provided `Node` or `Source` +objects ("dynamic data") when building a learning network. See [Learning Networks](@ref) +for more on this advanced feature. ## Saving machines diff --git a/docs/src/performance_measures.md b/docs/src/performance_measures.md index 2d47c25e8..699bf261f 100644 --- a/docs/src/performance_measures.md +++ b/docs/src/performance_measures.md @@ -1,175 +1,198 @@ # Performance Measures -In MLJ loss functions, scoring rules, sensitivities, and so on, are -collectively referred to as *measures*. These include re-exported loss -functions from the -[LossFunctions.jl](https://github.com/JuliaML/LossFunctions.jl) -library, overloaded to behave the same way as the built-in measures. 
+## Quick links -To see the list of all measures, run `measures()`. Further measures for -probabilistic predictors, such as proper scoring rules, and for -constructing multi-target product measures, are planned. If you'd like -to see a measure added to MLJ, post a comment -[here](https://github.com/JuliaAI/MLJBase.jl/issues/299).g +- [List of aliases of all + measures](https://juliaai.github.io/StatisticalMeasures.jl/dev/auto_generated_list_of_measures/#aliases) -*Note for developers:* The measures interface and the built-in -measures described here are defined in MLJBase, but will ultimately live -in a separate package. +- [Migration guide for changes to measures in MLJBase 1.0](@ref) +## Introduction -## Using built-in measures +In MLJ loss functions, scoring rules, confusion matrices, sensitivities, etc, are +collectively referred to as *measures*. These measures are provided by the package +[StatisticalMeasures.jl](https://juliaai.github.io/StatisticalMeasures.jl/dev/) but are +immediately available to the MLJ user. Here's a simple example of direct application of +the `log_loss` measures to compute a training loss: -These measures all have the common calling syntax - -```julia -measure(ŷ, y) +```@example measures +using MLJ +X, y = @load_iris +DecisionTreeClassifier = @load DecisionTreeClassifier pkg=DecisionTree +tree = DecisionTreeClassifier(max_depth=2) +mach = machine(tree, X, y) |> fit! +yhat = predict(mach, X) +log_loss(yhat, y) ``` -or +For more examples of direct measure usage, see the StatisticalMeasures.jl +[tutorial](https://juliaai.github.io/StatisticalMeasures.jl/dev/examples_of_usage/). -```julia -measure(ŷ, y, w) -``` +A list of all measures, ready to use after running `using MLJ` or `using +StatisticalMeasures`, is +[here](https://juliaai.github.io/StatisticalMeasures.jl/dev/auto_generated_list_of_measures/). Alternatively, +call [`measures()`](@ref StatisticalMeasures.measures) (experimental) to generate a +dictionary keyed on available measure constructors, with measure metadata as values. -where `y` iterates over observations of some target variable, and `ŷ` -iterates over predictions (`Distribution` or `Sampler` objects in the -probabilistic case). Here `w` is an optional vector of sample weights, -or a dictionary of class weights, when these are supported by the -measure. -```@repl losses_and_scores -using MLJ -y = [1, 2, 3, 4]; -ŷ = [2, 3, 3, 3]; -w = [1, 2, 2, 1]; -rms(ŷ, y) # reports an aggregate loss -l2(ŷ, y, w) # reports per observation losses -y = coerce(["male", "female", "female"], Multiclass) -d = UnivariateFinite(["male", "female"], [0.55, 0.45], pool=y); -ŷ = [d, d, d]; -log_loss(ŷ, y) -``` +## Custom measures -The measures `rms`, `l2` and `log_loss` illustrated here are actually - instances of measure *types*. For, example, `l2 = LPLoss(p=2)` and -`log_loss = LogLoss() = LogLoss(tol=eps())`. Common aliases are -provided: +Any measure-like object with appropriate [calling +behavior](https://juliaai.github.io/StatisticalMeasuresBase.jl/dev/implementing_new_measures/#definitions) +can be used with MLJ. To quickly build custom measures, we recommend using the package +[StatisticalMeasuresBase.jl](https://juliaai.github.io/StatisticalMeasuresBase.jl/dev/), +which provides [this](https://juliaai.github.io/StatisticalMeasuresBase.jl/dev/tutorial/) +tutorial. Note, in particular, that an "atomic" measure can be transformed into a +multi-target measure using this package. 
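+For example, a bare-bones custom loss might look like this (a sketch only: `max_abs_error`
+is a made-up name, and the `orientation` declaration follows the StatisticalMeasuresBase.jl
+tutorial linked above):
+
+```julia
+import StatisticalMeasuresBase as SMB
+
+# any callable with the signature `measure(ŷ, y)` can serve as a measure:
+max_abs_error(yhat, y) = maximum(abs.(yhat .- y))
+
+# declare that smaller is better, so that tuning and other meta-algorithms
+# know to minimise, rather than maximise, this measure:
+SMB.orientation(::typeof(max_abs_error)) = SMB.Loss()
+```
+
+Such a measure can then be passed anywhere a built-in measure is expected, as in
+`evaluate(model, X, y, measure=max_abs_error)`.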
-```@repl losses_and_scores -cross_entropy -``` +## Uses of measures -## Traits and custom measures +In MLJ, measures are specified: -Notice that `l1` reports per-sample evaluations, while `rms` -only reports an aggregated result. This and other behavior can be -gleaned from measure *traits* which are summarized by the `info` -method: +- when evaluating model performance using +[`evaluate!`](@ref)/[`evaluate`](@ref) - see [Evaluating Model Performance](@ref) -```@repl losses_and_scores -info(l1) -``` +- when wrapping models using [`TunedModel`](@ref) - see [Tuning Models](@ref) +- when wrapping iterative models using [`IteratedModel`](@ref) - see [Controlling Iterative Models](@ref) +- when generating learning curves using [`learning_curve`](@ref) - see [Learning Curves](@ref) -Query the doc-string for a measure using the name of its type: +and elsewhere. -```@repl losses_and_scores -rms -@doc RootMeanSquaredError # same as `?RootMeanSqauredError -``` +## Using LossFunctions.jl + +In previous versions of MLJ, measures from LossFunctions.jl were also available. Now +measures from that package must be explicitly imported and wrapped, as described +[here](https://juliaai.github.io/StatisticalMeasures.jl/dev/examples_of_usage/#Using-losses-from-LossFunctions.jl). -Use `measures()` to list all measures, and `measures(conditions...)` to -search for measures with given traits (as you would [query -models](model_search.md)). The trait `instances` list the actual -callable instances of a given measure type (typically aliases for the -default instance). +## Receiver operator characteristics + +A related performance evaluation tool provided by StatisticalMeasures.jl, and hence by MLJ, is the `roc_curve` method: ```@docs -measures(conditions...) +StatisticalMeasures.roc_curve ``` -A user-defined measure in MLJ can be passed to the `evaluate!` -method, and elsewhere in MLJ, provided it is a function or callable -object conforming to the above syntactic conventions. By default, a -custom measure is understood to: - -- be a loss function (rather than a score) - -- report an aggregated value (rather than per-sample evaluations) - -- be feature-independent - -To override this behavior one simply overloads the appropriate trait, -as shown in the following examples: - -```@repl losses_and_scores -y = [1, 2, 3, 4]; -ŷ = [2, 3, 3, 3]; -w = [1, 2, 2, 1]; -my_loss(ŷ, y) = maximum((ŷ - y).^2); -my_loss(ŷ, y) -my_per_sample_loss(ŷ, y) = abs.(ŷ - y); -MLJ.reports_each_observation(::typeof(my_per_sample_loss)) = true; -my_per_sample_loss(ŷ, y) -my_weighted_score(ŷ, y) = 1/mean(abs.(ŷ - y)); -my_weighted_score(ŷ, y, w) = 1/mean(abs.((ŷ - y).^w)); -MLJ.supports_weights(::typeof(my_weighted_score)) = true; -MLJ.orientation(::typeof(my_weighted_score)) = :score; -my_weighted_score(ŷ, y) -X = (x=rand(4), penalty=[1, 2, 3, 4]); -my_feature_dependent_loss(ŷ, X, y) = sum(abs.(ŷ - y) .* X.penalty)/sum(X.penalty); -MLJ.is_feature_dependent(::typeof(my_feature_dependent_loss)) = true -my_feature_dependent_loss(ŷ, X, y) -``` +## Migration guide for changes to measures in MLJBase 1.0 -The possible signatures for custom measures are: `measure(ŷ, y)`, -`measure(ŷ, y, w)`, `measure(ŷ, X, y)` and `measure(ŷ, X, y, w)`, each -measure implementing one non-weighted version, and possibly a second -weighted version. 
+Prior to MLJBase.jl 1.0 (respectively, MLJ.jl version 0.19.6) measures were defined in +MLJBase.jl (a dependency of MLJ.jl) but now they are provided by the MLJ.jl dependency +[StatisticalMeasures](https://juliaai.github.io/StatisticalMeasures.jl/dev/). Effects +on users are detailed below: -The [LossFunctions.jl](https://github.com/JuliaML/LossFunctions.jl) -package includes "distance loss" functions for `Continuous` targets, -and "marginal loss" functions for `Finite{2}` (binary) targets. While the -LossFunctions.jl interface differs from the present one (for, example -binary observations must be +1 or -1), MLJ has overloaded instances -of the LossFunctions.jl types to behave the same as the built-in -types. +### Breaking behavior likely relevant to many users -Note that the "distance losses" in the package apply to deterministic -predictions, while the "marginal losses" apply to probabilistic -predictions. +- If `using MLJBase` without MLJ, then, in Julia 1.9 or higher, `StatisticalMeasures` must + be explicitly imported to use measures that were previously part of MLJBase. If `using + MLJ`, then all previous measures are still available, with the exception of those + corresponding to LossFunctions.jl (see below). +- All measures return a *single* aggregated measurement. In other words, measures + previously reporting a measurement *per-observation* (previously subtyping + `Unaggregated`) no longer do so. To get per-observation measurements, use the new method + `StatisticalMeasures.measurements(measure, ŷ, y[, weights, class_weights])`. -## List of measures +- The default measure for regression models (used in `evaluate/evaluate!` when `measures` + is unspecified) is changed from `rms` to `l2=LPLoss(2)` (mean sum of squares). -All measures listed below have a doc-string associated with the measure's -*type*. So, for example, do `?LPLoss` not `?l2`. +- `MeanAbsoluteError` has been removed and instead `mae` is an alias for `LPLoss(p=1)`. -```@setup losses_and_scores -using DataFrames -``` +- Measures that previously skipped `NaN` values will now (at least by default) propagate + those values. Missing value behavior is unchanged, except some measures that + previously did not support `missing` now do. -```@example losses_and_scores -ms = measures() -types = map(ms) do m - m.name -end -instance = map(ms) do m m.instances end -table = (type=types, instances=instance) -DataFrame(table) -``` +- Aliases for measure *types* have been removed. For example `RMSE` (alias for + `RootMeanSquaredError`) is gone. Aliases for instances, such as `rms` and + `cross_entropy` persist. The exception is `precision`, for which `ppv` can + be used in its place. (This is to avoid conflict with `Base.precision`, which was + previously pirated.) +- `info(measure)` has been decommissioned; query docstrings or access the new [measure + traits](https://juliaai.github.io/StatisticalMeasuresBase.jl/dev/methods/#Traits) + individually instead. These traits are now provided by StatisticalMeasures.jl and + are not exported. For example, to access the orientation of the measure `rms`, do + `import StatisticalMeasures as SM; SM.orientation(rms)`. -## Other performance-related tools +- Behavior of the `measures()` method, to list all measures and associated traits, has + changed. It now returns a dictionary instead of a vector of named tuples; + `measures(predicate)` is decommissioned, but `measures(needle)` is preserved. 
(This + method, owned by StatisticalMeasures.jl, has some other search options, but is + experimental.) -In MLJ one computes a confusion matrix by calling an instance of the -`ConfusionMatrix` measure type on the data: +- Measures that were wrappers of losses from LossFunctions.jl are no longer exposed by + MLJBase or MLJ. To use such a loss, you must explicitly `import LossFunctions` and wrap + the loss appropriately. See [Using losses from + LossFunctions.jl](https://juliaai.github.io/StatisticalMeasures.jl/dev/examples_of_usage/#Using-losses-from-LossFunctions.jl) + for examples. -```@docs -ConfusionMatrix -``` +- Some user-defined measures working in previous versions of MLJBase.jl may not work + without modification, as they must conform to the new [StatisticalMeasuresBase.jl + API](https://juliaai.github.io/StatisticalMeasuresBase.jl/dev/implementing_new_measures/#definitions). See + [this tutorial](https://juliaai.github.io/StatisticalMeasuresBase.jl/dev/tutorial/) on + how to define new measures. -```@docs -roc_curve -``` +- Measures with a "feature argument" `X`, as in `some_measure(ŷ, y, X)`, are no longer + supported. See [What is a + measure?](https://juliaai.github.io/StatisticalMeasuresBase.jl/dev/implementing_new_measures/#definitions) + for allowed signatures in measures. + +### Packages implementing the MLJ model interface + +The migration of measures is not expected to require any changes to the source code in +packages providing implementations of the MLJ model interface (MLJModelInterface.jl) such +as MLJDecisionTreeInterface.jl and MLJFlux.jl, and this is confirmed by extensive +integration tests. However, some current tests will fail if they use MLJBase +measures. The following should generally suffice to adapt such tests: + +- Add StatisticalMeasures as a test dependency, and add `using StatisticalMeasures` to your + `runtests.jl` (and/or included submodules). + +- If measures are qualified, as in `MLJBase.rms`, then the qualification must be removed + or changed to `StatisticalMeasures.rms`, etc. + +- Be aware that the default measure used in methods such as `evaluate!`, when `measure` is + not specified, is changed from `rms` to `l2` for regression models. + +- Be aware that all measures now report a single aggregated measurement, and never + a measurement for every observation. See the second point above. + +### Breaking behavior possibly relevant to some developers + +- The abstract measure types `Aggregated`, `Unaggregated`, `Measure` have been + decommissioned. (A measure is now defined purely by its [calling + behavior](https://juliaai.github.io/StatisticalMeasuresBase.jl/dev/implementing_new_measures/#definitions).) + +- What were previously exported as measure types are now only constructors. + +- `target_scitype(measure)` is decommissioned. Related is + `StatisticalMeasures.observation_scitype(measure)` which declares an upper bound on the + allowed scitype *of a single observation*. + +- `prediction_type(measure)` is decommissioned. Instead use + `StatisticalMeasures.kind_of_proxy(measure)`. + +- The trait `reports_each_observation` is decommissioned. Related is + `StatisticalMeasures.can_report_unaggregated`; if `false` the new `measurements` method + simply returns `n` copies of the aggregated measurement, where `n` is the number of + observations provided, instead of individual observation-dependent measurements. + +- `aggregation(measure)` has been decommissioned. Instead use + `StatisticalMeasures.external_mode_of_aggregation(measure)`. 
+- `instances(measure)` has been decommissioned; query docstrings for measure aliases, or + follow this example: `aliases = measures()[RootMeanSquaredError].aliases`. + +- `is_feature_dependent(measure)` has been decommissioned. Measures consuming feature data + are no longer supported; see above. + +- `distribution_type(measure)` has been decommissioned. + +- `docstring(measure)` has been decommissioned. + +- Behavior of `aggregate` [has changed](https://juliaai.github.io/StatisticalMeasuresBase.jl/dev/methods/#StatisticalMeasuresBase.aggregate). + +- The following traits, previously exported by MLJBase and MLJ, cannot be applied to + measures: `supports_weights`, `supports_class_weights`, `orientation`, + `human_name`. Instead use the traits with these names provided by + StatisticalMeasures.jl (they will need to be qualified, as in `import + StatisticalMeasures; StatisticalMeasures.orientation(measure)`). diff --git a/docs/src/target_transformations.md b/docs/src/target_transformations.md index d39aa0423..63203f780 100644 --- a/docs/src/target_transformations.md +++ b/docs/src/target_transformations.md @@ -53,11 +53,11 @@ meaningfully compare the corresponding mean absolute errors, which are indeed di this case. ```@example 123 -evaluate(ridge, X, y, measure=mae) +evaluate(ridge, X, y, measure=l1) ``` ```@example 123 -evaluate(ridge2, X, y, measure=mae) +evaluate(ridge2, X, y, measure=l1) ``` Ordinary functions can also be used in target transformations but an inverse must be explicitly specified: ```@example 123 ridge3 = TransformedTargetModel(ridge, transformer=y->log.(y), inverse=z->exp.(z)) X, y = @load_boston -evaluate(ridge3, X, y, measure=mae) +evaluate(ridge3, X, y, measure=l1) ``` -Without the log transform (ie, using `ridge`) we get the poorer -`mae` of 3.9. +Without the log transform (ie, using `ridge`) we get the poorer mean absolute error, +`l1`, of 3.9. ```@docs TransformedTargetModel ``` diff --git a/docs/src/transformers.md b/docs/src/transformers.md index 2bdae8e0c..f03cdb92f 100644 --- a/docs/src/transformers.md +++ b/docs/src/transformers.md @@ -42,7 +42,7 @@ MLJModels.UnivariateTimeTypeToContinuous A *static transformer* is a model for transforming data that does not generalize to new data (does not "learn") but which nevertheless has hyperparameters. For example, the `DBSAN` clustering model from Clustering.jl can assign labels to some collection of -observations, cannot directly assign a label to some new observation. +observations, but cannot directly assign a label to some new observation. The general user may define their own static models. The main use-case is insertion into a [Linear Pipelines](@ref) some parameter-dependent transformation. (If a static transformer @@ -73,86 +73,108 @@ Such static transformers with (unlearned) parameters can have arbitrarily many inputs, but only one output. In the single input case, an `inverse_transform` can also be defined. Since they have no real learned parameters, you bind a static transformer to a machine without -specifying training arguments. +specifying training arguments; there is no need to `fit!` the machine: ```@example boots -mach = machine(Averager(0.5)) |> fit! 
+mach = machine(Averager(0.5)) transform(mach, [1, 2, 3], [3, 2, 1]) ``` -Let's see how we can include our `Averager` in a learning network (see -[Composing Models](@ref)) to mix the predictions of two regressors, -with one-hot encoding of the inputs: +Let's see how we can include our `Averager` in a [learning network](@ref "Learning +Networks") to mix the predictions of two regressors, with one-hot encoding of the +inputs. Here's two regressors for mixing, and some dummy data for testing our learning +network: ```@example boots -X = source() -y = source() - ridge = (@load RidgeRegressor pkg=MultivariateStats)() knn = (@load KNNRegressor)() + +import Random.seed! +seed!(112) +X = ( + x1=coerce(rand("ab", 100), Multiclass), + x2=rand(100), +) +y = X.x2 + 0.05*rand(100) +schema(X) +``` + +And the learning network: + +```@example boots +Xs = source(X) +ys = source(y) + averager = Averager(0.5) -hotM = machine(OneHotEncoder(), X) -W = transform(hotM, X) # one-hot encode the input +mach0 = machine(OneHotEncoder(), Xs) +W = transform(mach0, Xs) # one-hot encode the input + +mach1 = machine(ridge, W, ys) +y1 = predict(mach1, W) -ridgeM = machine(ridge, W, y) -y1 = predict(ridgeM, W) +mach2 = machine(knn, W, ys) +y2 = predict(mach2, W) -knnM = machine(knn, W, y) -y2 = predict(knnM, W) +mach4= machine(averager) +yhat = transform(mach4, y1, y2) -averagerM= machine(averager) -yhat = transform(averagerM, y1, y2) +# test: +fit!(yhat) +Xnew = selectrows(X, 1:3) +yhat(Xnew) ``` -Now we export to obtain a `Deterministic` composite model and then -instantiate composite model +We next "export" the learning network as a standalone composite model type. First we need +a struct for the composite model. Since we are restricting to `Deterministic` component +regressors, the composite will also make deterministic predictions, and so gets the +supertype `DeterministicNetworkComposite`: -```julia -learning_mach = machine(Deterministic(), X, y; predict=yhat) -Machine{DeterministicSurrogate} @772 trained 0 times. 
- args: - 1: Source @415 ⏎ `Unknown` - 2: Source @389 ⏎ `Unknown` - - -@from_network learning_mach struct DoubleRegressor - regressor1=ridge - regressor2=knn - averager=averager - end - -composite = DoubleRegressor() -julia> composite = DoubleRegressor() -DoubleRegressor( - regressor1 = RidgeRegressor( - lambda = 1.0), - regressor2 = KNNRegressor( - K = 5, - algorithm = :kdtree, - metric = Distances.Euclidean(0.0), - leafsize = 10, - reorder = true, - weights = :uniform), - averager = Averager( - mix = 0.5)) @301 +```@example boots +mutable struct DoubleRegressor <: DeterministicNetworkComposite + regressor1 + regressor2 + averager +end +``` +As described in [Learning Networks](@ref), we next paste the learning network into a +`prefit` declaration, replace the component models with symbolic placeholders, and add a +learning network "interface": + +```@example boots +import MLJBase +function MLJBase.prefit(composite::DoubleRegressor, verbosity, X, y) + Xs = source(X) + ys = source(y) + + mach0 = machine(OneHotEncoder(), Xs) + W = transform(mach0, Xs) # one-hot encode the input + + mach1 = machine(:regressor1, W, ys) + y1 = predict(mach1, W) + + mach2 = machine(:regressor2, W, ys) + y2 = predict(mach2, W) + + mach4= machine(:averager) + yhat = transform(mach4, y1, y2) + + # learning network interface: + (; predict=yhat) +end ``` -which can be can be evaluated like any other model: +The new model type can be evaluated like any other supervised model: -```julia +```@example boots +X, y = @load_reduced_ames; +composite = DoubleRegressor(ridge, knn, Averager(0.5)) +``` + +```@example boots composite.averager.mix = 0.25 # adjust mix from default of 0.5 -julia> evaluate(composite, (@load_reduced_ames)..., measure=rms) -Evaluating over 6 folds: 100%[=========================] Time: 0:00:00 -┌───────────┬───────────────┬────────────────────────────────────────────────────────┐ -│ _.measure │ _.measurement │ _.per_fold │ -├───────────┼───────────────┼────────────────────────────────────────────────────────┤ -│ rms │ 26800.0 │ [21400.0, 23700.0, 26800.0, 25900.0, 30800.0, 30700.0] │ -└───────────┴───────────────┴────────────────────────────────────────────────────────┘ -_.per_observation = [missing] -_.fitted_params_per_fold = [ … ] -_.report_per_fold = [ … ] +evaluate(composite, X, y, measure=l1) ``` A static transformer can also expose byproducts of the transform computation in the report diff --git a/docs/src/tuning_models.md b/docs/src/tuning_models.md index 46c04c72e..e0c57c961 100644 --- a/docs/src/tuning_models.md +++ b/docs/src/tuning_models.md @@ -67,11 +67,13 @@ one-dimensional range object constructed using the `range` method: ```@example goof r = range(tree, :min_purity_increase, lower=0.001, upper=1.0, scale=:log); -self_tuning_tree = TunedModel(model=tree, - resampling=CV(nfolds=3), - tuning=Grid(resolution=10), - range=r, - measure=rms); +self_tuning_tree = TunedModel( + model=tree, + resampling=CV(nfolds=3), + tuning=Grid(resolution=10), + range=r, + measure=rms +); ``` Incidentally, a grid is generated internally "over the range" by calling the @@ -171,11 +173,13 @@ deterministic measure, such as `misclassification_rate` (which means **Case (i) - probabilistic measure**: ```@example goof -self_tuning_knn = TunedModel(model=knn, - resampling = CV(nfolds=4, rng=1234), - tuning = Grid(resolution=5), - range = K_range, - measure=BrierLoss()); +self_tuning_knn = TunedModel( + model=knn, + resampling = CV(nfolds=4, rng=1234), + tuning = Grid(resolution=5), + range = K_range, + 
measure=BrierLoss() +); mach = machine(self_tuning_knn, X, y); fit!(mach, verbosity=0); @@ -184,11 +188,13 @@ fit!(mach, verbosity=0); **Case (ii) - deterministic measure**: ```@example goof -self_tuning_knn = TunedModel(model=knn, - resampling = CV(nfolds=4, rng=1234), - tuning = Grid(resolution=5), - range = K_range, - measure=MisclassificationRate()) +self_tuning_knn = TunedModel( + model=knn, + resampling = CV(nfolds=4, rng=1234), + tuning = Grid(resolution=5), + range = K_range, + measure=MisclassificationRate() +) mach = machine(self_tuning_knn, X, y); fit!(mach, verbosity=0); @@ -215,32 +221,36 @@ predict(mach, rows=148:150) ### Specifying a custom measure -Users may specify a custom loss or scoring function. Suppose, for -example, we define a new scoring function `custom_accuracy` by +Users may specify a custom loss or scoring function, so long as it complies with the +StatisticalMeasuresBase.jl +[API](https://juliaai.github.io/StatisticalMeasuresBase.jl/dev/implementing_new_measures/#definitions) +and implements the appropriate `orientation` trait (`Score()` or `Loss()`) from that +package. For example, suppose we define a "new" scoring function `custom_accuracy` by ```@example goof -custom_accuracy(y,yhat) = mean(y .== yhat); +custom_accuracy(yhat, y) = mean(y .== yhat); # yhat - prediction, y - ground truth ``` -In tuning, scores are maximised, while losses are minimised. By -default, a custom measure is assumed to be a loss rather than a score, -so we must also declare +In tuning, scores are maximised, while losses are minimised. So here we declare ```@example goof -MLJ.orientation(::typeof(custom_accuracy)) = :score +import StatisticalMeasuresBase as SMB +SMB.orientation(::typeof(custom_accuracy)) = SMB.Score() ``` -For full details on constructing custom measures, see [Traits and custom -measures](@ref). +For full details on constructing custom measures, see +[StatisticalMeasuresBase.jl](https://juliaai.github.io/StatisticalMeasuresBase.jl/dev/). ```@example goof -self_tuning_knn = TunedModel(model=knn, - resampling = CV(nfolds=4), - tuning = Grid(resolution=5), - range = K_range, - measure = [custom_accuracy, MulticlassFScore()], - operation = predict_mode); +self_tuning_knn = TunedModel( + model=knn, + resampling = CV(nfolds=4), + tuning = Grid(resolution=5), + range = K_range, + measure = [custom_accuracy, MulticlassFScore()], + operation = predict_mode +); mach = machine(self_tuning_knn, X, y) fit!(mach, verbosity=0) @@ -268,11 +278,12 @@ points: ```@example goof r1 = range(forest, :(model.n_subfeatures), lower=1, upper=9); r2 = range(forest, :bagging_fraction, lower=0.4, upper=1.0); -self_tuning_forest = TunedModel(model=forest, - tuning=Grid(goal=30), - resampling=CV(nfolds=6), - range=[r1, r2], - measure=rms); +self_tuning_forest = TunedModel( + model=forest, + tuning=Grid(goal=30), + resampling=CV(nfolds=6), + range=[r1, r2], + measure=rms); X = MLJ.table(rand(100, 10)); y = 2X.x1 - X.x2 + 0.05*rand(100); @@ -300,12 +311,14 @@ be limited to 25. ```@example goof tuning = Grid(resolution=100, shuffle=true, rng=1234) -self_tuning_forest = TunedModel(model=forest, - tuning=tuning, - resampling=CV(nfolds=6), - range=[(r1, 3), r2], - measure=rms, - n=25); +self_tuning_forest = TunedModel( + model=forest, + tuning=tuning, + resampling=CV(nfolds=6), + range=[(r1, 3), r2], + measure=rms, + n=25 +); fit!(machine(self_tuning_forest, X, y), verbosity=0); ``` @@ -321,12 +334,14 @@ distribution by default, and all others using a (truncated) normal distribution. 
```@example goof -self_tuning_forest = TunedModel(model=forest, - tuning=RandomSearch(), - resampling=CV(nfolds=6), - range=[r1, r2], - measure=rms, - n=25); +self_tuning_forest = TunedModel( + model=forest, + tuning=RandomSearch(), + resampling=CV(nfolds=6), + range=[r1, r2], + measure=rms, + n=25 +); X = MLJ.table(rand(100, 10)); y = 2X.x1 - X.x2 + 0.05*rand(100); mach = machine(self_tuning_forest, X, y); @@ -373,12 +388,14 @@ For this illustration we'll add a third, nominal, hyper-parameter: ```@example goof r3 = range(forest, :(model.post_prune), values=[true, false]); -self_tuning_forest = TunedModel(model=forest, - tuning=latin, - resampling=CV(nfolds=6), - range=[r1, r2, r3], - measure=rms, - n=25); +self_tuning_forest = TunedModel( + model=forest, + tuning=latin, + resampling=CV(nfolds=6), + range=[r1, r2, r3], + measure=rms, + n=25 +); mach = machine(self_tuning_forest, X, y); fit!(mach, verbosity=0) ``` @@ -409,10 +426,12 @@ The following model is equivalent to the best in `models` by using 3-fold cross-validation: ```@example goof -multi_model = TunedModel(models=models, - resampling=CV(nfolds=3), - measure=log_loss, - check_measure=false) +multi_model = TunedModel( + models=models, + resampling=CV(nfolds=3), + measure=log_loss, + check_measure=false +) nothing # hide ``` @@ -424,10 +443,7 @@ evaluated 2 x 3 times): ```@example goof X, y = make_blobs() -e = evaluate(multi_model, X, y, - resampling=CV(nfolds=2), - measure=log_loss, - verbosity=6) +e = evaluate(multi_model, X, y, resampling=CV(nfolds=2), measure=log_loss, verbosity=6) ``` Now, for example, we can get the best model for the first fold out of @@ -451,7 +467,7 @@ For example, for the first fold of the outer loop and the second model: e.report_per_fold[2].history[1] ``` -## API +## Reference ```@docs MLJBase.range diff --git a/docs/src/weights.md b/docs/src/weights.md index 202d7dd5f..3789faf21 100644 --- a/docs/src/weights.md +++ b/docs/src/weights.md @@ -43,7 +43,7 @@ The model `model` supports class weights if ## Specifying weights in performance evaluation -When calling an MLJ measure (metric) that supports weights, provide the +When calling a measure (metric) that supports weights, provide the weights as the last argument, as in ```julia @@ -53,18 +53,12 @@ w = Dict("versicolor" => 1, "setosa" => 2, "virginica"=> 3) macro_f1score(ŷ, y, w) ``` -You can use `supports_weights` and `supports_class_weights` on -measures to check weight support. For example, to list all measures -supporting per observation weights, do +Some measures also support specification of a class weight dictionary. For details see the +StatisticalMeasures.jl +[tutorial](https://juliaai.github.io/StatisticalMeasures.jl/dev/examples_of_usage/). -```julia -measures() do m - m.supports_weights -end -``` +To pass weights to all the measures listed in an [`evaluate!`](@ref)/[`evaluate`](@ref) +call, use the keyword specifiers `weights=...` or `class_weights=...`. For details, see +[Evaluating Model Performance](@ref). -See also [Evaluating Model Performance](@ref). -To pass weights to all the measures listed in an `evaluate!/evaluate` -call, use the keyword specifiers `weights=...` or -`class_weights=...`. For details, see [`evaluate!`](@ref). 
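+For example (a sketch, re-using the class weight dictionary `w` defined above, and
+assuming `model` is some probabilistic classifier appropriate for the data `X`, `y`):
+
+```julia
+evaluate(model, X, y,
+         resampling=CV(nfolds=3),
+         measure=macro_f1score,
+         operation=predict_mode,
+         class_weights=w)
+```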
diff --git a/material/MLJ_stack.png b/material/MLJ_stack.png index 3ccd0eaa3..b849d48bb 100644 Binary files a/material/MLJ_stack.png and b/material/MLJ_stack.png differ diff --git a/material/MLJ_stack.svg b/material/MLJ_stack.svg index 5e6da55fc..c684176b7 100644 --- a/material/MLJ_stack.svg +++ b/material/MLJ_stack.svg @@ -1,3 +1,3 @@ -
[MLJ_stack.svg text labels omitted. Recoverable changes to the dependency diagram: the placeholder node "(StatisticalMetrics)" becomes "StatisticalMeasures"; MLJFlow and MLJBalancing gain nodes of their own; MLJFlux and MLJLinearModels remain; the CategoricalDistributions node and the audience key ("For general MLJ Users", "For model API implementation/testing", "Of interest outside MLJ", "For MLJ developers") are dropped; and the caption "Interfaces for third party packages administered by MLJ:" is reworded to "Model-providing pkgs/interfaces administered by MLJ include:". The remaining node labels (MLJModelInterface, ScientificTypesBase, ScientificTypes, StatisticalTraits, MLJBase, MLJTuning, MLJIteration, MLJEnsembles, MLJModels, the MLJ*Interface packages, MLJText, MLJTSVDInterface, MLJTestIntegration, OpenML, DataScienceTutorials, ThirdPartyModelPkg) and the arrow legend ("A → B means A depends on B") are unchanged. Truncated duplicate labels and the "Viewer does not support full SVG 1.1" line are drawio export artifacts.]
\ No newline at end of file diff --git a/material/MLJ_stack.xml b/material/MLJ_stack.xml index 6713f3853..463448878 100644 --- a/material/MLJ_stack.xml +++ b/material/MLJ_stack.xml @@ -1 +1 @@ -7V1bd6I8F/41Xev7LpzFWb2ste20Yw8zdWY6vZkVISotggW02l//JhyUQETAADqL3lRiiGHvZx+ys7M5Ey9mq2sbzKd3lgaNM4HTVmdi/0wQ+DYvoH+4Ze23KHLHb5jYuhZ02jY86Z8waOSC1oWuQYfo6FqW4epzslG1TBOqLtEGbNv6ILuNLYP81TmYwETDkwqMZOtvXXOnfmtHaG/bv0J9Mg1/mVe6/jczEHYOnsSZAs36iDSJl2fihW1Zrv9ptrqABiZeSBf/vqsd324mZkPTzXKDLYyt19vvywno/5BvX1fLb67SEoLZLoGxCJ44mK27DklgWwtTg3gU7kzsfUx1Fz7NgYq//UBMR21Td2agKx59BIY+MdFnA47RrHrJSQbzXkLbhatIUzDpa2jNoGuvUZfw2xAKIYJCgn5s+SHLQds0ygspaAQBBiabsbdkQh8CStGp9gButY/bp8/+q9B+/Hv/Yzm6GLR4xkQb64ZxYRmWja5Ny0Sdehpwpt7t+HvHta03GPY4E0TO+2ND3w5JXjR4grySSCFvlwF1374P//5+vrz++ef266r/e85PP60WTyOvYmA4jdCHiY8rv2FsoefG0h+SRnlfWH4HcTzuYCJFmvx77wa3np66MV1ojzFPgsHQbP3xyN9AzZHfjfEZEdklmUkyK2AnhcOhpKgQzwM1YJbpSPOcB1/MdE3DP0NFD4kv1ZrpavD7DCAhyG0CE5KUxASvUDAhlIYJkQ0mOE5RVDWJiSdVR+TSx7o6XM+h0wNOg4qEIu6SmkJuS1/kJC44sUpcyAnaQw0Z8ODSst2pNbFMYFxuW2NU2vYZWNY8INcrdN114I2AhWuRzIQr3X3Gt38R5ODyT+Sr/ioY2rtYhxcmeuDoXfj6T/TL7X3eVXijurCXG1Owk4+OtbBVmEYqKXCfgD2BbgZZw4RMxYUNDeDqS9JTYs9iqVLRb8Q+4X9lMAY0B6E8oe8UEHpCihhqgEIKIJf8HyDy7awizx+VyItFtHpZDM6h4g/nFDsGBLc+WrqnEkNhjnn7G88+HMKHSnBXjI2baWTirOu8fhusPn8Arv3xLn98Tgbt+5YoHg1n+Qhbt2LM2HYTyzhsT4Jp8RUJtsgcV1kFezAULnudBf8+unh+6V7d3PNfH1pFuM9QlNvlK2tWzhqFwbf3q1/8/Nf51Qqc2z/vbP7m/aHFXHF7t57bNlhHOsyxPnB2qxVRlki1wsViUbH+iiCn9Ucf/BkU1T2pNC0xrtCsGykOpMiR4BCVpAOpKBX6j2IyXHdi/mM+I3SAGupmdSBrszOp8z58zSiKGJoUUR8uTN2cNMIeDx2KCiHsQjsp7J0qF4sinyD8iQl7RbIecuDUZD2cd4myfoOkDD2JZW4GtimS3igAz7oL+xUAz1WqAY5nzVlQA7TZqICaFwkKRwYSRVlJXSSI7dT+5CKB8mvKF3KRIXQU8ol8lZiIdeydiCTGBmIXNKFrOGabXzs13EA3IbC9bdEmAh5XaZvsgjSVxpe07UVN2wj3RI5CpRXbA+sqYlSptbgv/B695l09QltHNMQIOdTfybwfVltwnM77YwqOV2DOiBBqBexmHzItZC03KWah9eukh9RkSUrrX05ITWC0N6uiP1pIbTjVbe0R2O7aM02Pb82CO26c2u0ajRMdFN1TV1BVrbgzb89267JA1OlI7SNicAV7OhVwOBTGY/ExCjC4lI3Zo96ko3H31bj6tnyQVq3n+cjhn4VrS31nv0mXlbnX7emLIN4un6fa89/B+d8PYQRbx6SecydXxGhYslhKXJ2MChPfj4FTFSnZ0thLJXCnDsdejIXBwrTandEnIa17SX49o61yjut2JY0acWoc+cQ2eQwXfNKRl0qKmzvD7y/v+o36+XIHJtfj2Xh5MzxVO1F3oEHM7NXXlmdNZfcxWZsTiivR2E2nb13cpuaNCXV6+LlWbQTECKbVkBBD43Y1eXmHbYEys+f0MxT/e3LRUzieJb1DM9ZV5/+NgY8ZeEkid8brP0ghFdECR6Xyq5J7KXNyzHEJvlR+csyl6cDZyGhOTiVPTon7Q/Nl5cJR17tKLUZ/I62EqG6jA+UHBCQxo/T6697ahJVZngd91d0HLvBOO6pwiNhk62Cb7NFktO0WY1KKaetyvtKUVumYVuaFzHZHVqLKACd/7FMHjLM/pE5tBr1Q1LAdQ2F4iGJn1LBLP3RRatgwJGqJ3saVsVg1GiqmoRQlQw5AWTm39Ao5Sb9zhjM4WhNoIhlGS0huvDBVnFON6OquM/ApO/1t6OifYOQNhakdyBYaV+6dyX08FlJoTnASkw0LOqS4barjVFExhX7SNkNlpwlC5Tz7w2+qbAW0PYsWsqIRpRXDpUypLbSpKhKlSvxUMjOqUBOWslElncrFaVUbLaiHpGK0yFXGaneZqv0nn9O5tV8M66OiwhZRO585t/R1CNnrJiWP5rTypdFpd6pgEVcgvNeZAzPetrH+52eYRAqYYaCaI2fudSq1idY2g8B0/PbgKfDENDiHpobb8bkirkc8I8WR2bTRHjnhCh2l21NQDWxELJ8aqBbe8n41gNh9jktSYmoawHGwC0iUATyMPHtXKNGCiRTahG2HlhghT920w1M4uU/d7Bmn5EolUoYij2X7T0osgMhTSl/yHcrReRa1GelYo20kHZXPIOV23QVKETtaokVpRBUKWsZ4lC/ChLARD9DyVzrY5vDcfOVbjR3mdHToKIThu0KzFjhvyQcwFby0H+6nA21nj437d41ZDnyGUp8x3liabUs95MEAdPmKrRzwgz4eZ0GR6jhazx9vMCVmcwPOEA+DA99XLnTcSKUHHChnIWhshjndEFhl6p3cdBcpyr5SYaL5iUUSLxiAx19IHJ3VeBhjGmKEIcnDy6OF6+gaDO1HszZiKQ9y3cYlvdp3ydYlGV042Lr4Po4Gl9Cw5qSj02jqPMhUKtTUq2thLd299FwTXH57sIyBqwJKimQpxyn21Z7cub6M7nGmZZyeXCIyIwu5IygYSVUc2kB3/4GlCOuTCPFC77RtG54iiqUdRaAlxuQMrlWaaJgzHSOTiGcu6n5Q9nk1MUKBBJgSB07WGGHs7Lvcjo3DLkZIJWqGrZ/ssCx2xOVIYcnc9DS4zIHLDEHaktXl6cCy0Zb7UJn7gKsYK5e254CrJMlp/Q9OVaMeya+5EnkhoVJEPipWWbImCcEpnkKZVtXgVDIopVgy0t4MSrmT1r8cWKYGQ5gkUDY1/uiHs2L5tZWmUNJrsNC8y+KltBI4Qli4MBYOYoRuTmjvwmsWw74142J1RBXaub1qyz/uPs7JChp9qOqObplDG8IGHCngIKFBebVBtdBIqYbNC
hrXg7sGETnURad2TBSLqObBxOCm9/SrgUVmRdHmageFUDoo7hYGcueBrQNEbBe4ToOPXfiQE9XK6/cyUsqOs0LIPUDLvR5YQwIasRzrBir7oFK/11EsuT8PVJ5U/U13BxDYZqNGcmCjfu+jWMZTHmw8X/csy3EbXGTHxRE4IMn4xoaB+FTM2EuncXEFcP/x8SlRbsN+O85YTV/Gm6LZYIjab2DijQy0mW7qOPIB8dAjPC5O28HpPfsyyYhf2QudEBIGHLtpgKjjxCofKzCvyBkTblgcpaMm3BR5qUUtMfn821EVvruTUnA0tu/TiTGQ0b5P/KWeMuM3EKTRsKxkoQu0dJlYNhbaPtIYtj5a4LTwJmcoYWGyHPUuq7gZ9V3D5bsdw6df/cbn2BnyEDpxn4N2fK0sn4OKifK3VIaYlQ0U9rqfVUKBWgutHmcj3MwnNvLLqoVGLyrv9zueUmhpsyzz/b3QXzBOyDd7/ksb4pQDyAnE7N4QD6vRpZU9CzN3KpHY3cWlmLh6D3No3g3+Rd19EA42YeDU90uxcezQpW1hLm1XCOgJp17KCmr8Dw== \ No newline at end of file +7V1be6I8EP41XtqHM3pZe9p27WG3frvd3uyTQtS0SCzgqb/+SxSshIiAnLoPvamEEMPMO28mk0lsyWeT5ZUDpuNbbEKrJQnmsiWftyRJ7Moy+UdLVpuSDr2iBSMHmX6lz4JH9AH9QsEvnSETuqGKHsaWh6bhQgPbNjS8UBlwHLwIVxtiK/ytUzCCkYJHA1jR0t/I9MbBW+if5d8gGo2Dbxa17ubOBASV/Tdxx8DEi50i+aIlnzkYe5tPk+UZtKjwArlsnrvcc3fbMQfaXpIHHGmIX29+zEfg/Kd687qcf/e0tuT3dg6smf/Gfm+9VSACB89sE9JWhJbcW4yRBx+nwKB3F0TppGzsTSxyJZKPwEIjm3y24JD0qhftpN/vOXQ8uNwp8jt9BfEEes6KVAnuBlDwEaRJ6uZ68akPVfXrjHd1ofiFwMfAaNv2p5jIB19SfKndgxtzcfP4cf4q6Q9/737OX876bTFnoQ2RZZ1hCzvk2sY2qdQzgTteP07vu56D32BQoyXJwvovH/l2wuIljUfEq8kc8XZzkO7bj8Hf308XV//9ufm2PP89FccfuC3yxKtZFE4v5MNog6tNwRCT96bWH4hGe5/hTQV5OOxQIe0UbZ697d+seera9qAzpDrxGyO93bQX/g5SvPO9jJ6JkL2wMsPK8tXJ0XBgKQak/SAFVGWIMM+pf2OCTJN+DRc9YXwZeIIM//tzgISk6iFMKEoUE6LGwYRUGCbkfDAhCJpmGFFMPBqIiAsNkTFYTaHbA26DiggRd8NMoerKSZSKRUEuExdqRPbQJAO4f4kdb4xH2AbWxWcpI6XPOn2Mp764XqHnrXxvBMw8HFYmXCLviT5+QsaizeWfnVvnS7/p9cUquLDJC+8+Ra//7N78fG59FTxozJz5dijYq0cXzxwDxolK8d0n4Iygl8DWqCBjceFAC3hoHvaU8lexUqrpN2Yf8b8SDAY8B6E4o+9kMPqQFeXIAJkIIJX9H2HyelKTF2tl8nIWVi9KwSko/nhN5acA/9EHjNaUGBgz4+0rMmOkG6j4TzFq3HbjCNPVC/ftG9+NQ+KyoIT0LmucWZ5WIofL0SnzF+PwdE7cEdTQTUriuXPIcaM0L7SUyW+TZQpNjqkPZjayR42xs9N3WQsZu6RHjb1TpsMmixHBfzFjL8nWAw18NVsP+l2grV8TKyNvgu1tww7H0hsCWI/u0mECEIWCGIC7zhBM4mvBANmCNnpX2SWBtnAibFlhDxGsrx6gg4gQKUSOZYfEEZx6TeeknCI4BvnjOf2DMXLMB+B4q3Vc/+GtcQlYRtBlZpWHxwhimZHbYOWxFoxQa58gcRCnWyurV/QaKVj/JzQcGGP5GuYP6hkUnJtOxcIVmtO6C0+7r9bl9/m9smw/TV9c8Um6wsZ7u7JB+0ofP0vyzfxpbD797Z/+XUgvsF0nei4lBJvYCpXKrPCnod6Bjzdj9KBdjxXvF564nWqtUKjCChnB56duriF0jlTu+tFTxwGrnQpTGmB3d1pm4vSyEF50E3UmuYup32azpJgHyIdNF7JG87nQq9E6XEnDe2HQ48fN9FpgrxuPPTWudjHAk7KQXjHIS+6HFIwg/uBQDYKUlAhShQowVJ+QVPG+bCXYq2bkZLHXiceeJnfi6h+NvVgZHh8v73YVkxsvb0JgkSVwBhhiNASmFBQT59qHVj3/bNkkDZdkCZvlyD9y4niYXAX/hL0hVYxnn0hOrVgC+8i5sc+ejEqPyNddm/0tBO7MabIqI2zU1hIE5EvNqlSk6vnoa2TkKIlX6eu1DqcUv0p/Ybtw8mI1Bh9No06wAldUUg43rpXF/SjK4zg5tJJ+hLHKCY1VqldKjZLbjif+FOEceGC99cGAA6ImBwFra7RNas1+M2YiFZxJhFiUGfP3okZJfUKzINojaEOHvLokDGe2QTOniFi9VQI1JRc/8ezQB3hZN0WF7TvHpF2111LPaVsEW+6GDXLTQNjF3m5ELWNzoue+fu8vP34CQV+8q4uPUV+/a8sJNlGPCCqnyd9+u6HdF25rd884TyqqxEwkJI5UuhypsBsAcpMKN+snmVTipZxdVpXJgpsLzcgi1Y7x/TvCKU37w6/IHznjtXXYDquTopYvova+c2rrC4cQu8mGBLEwOe3Pt8vi1gfPulNgs2Xb0f+0RUWkgQkFqv3iTteVCi3ilU0gsN1Nuf8WtGMmnELbpOU0fVjohd6R48hsy3ivHHGFaun2ZKSBrYmlo4Fy4a0epgGi7lN6+guVpgVcl7qAoRM3jhPPwTnC7tkkHNkEZcfu5gt7QnqwHT9oYTM5imzmi66ZHGgnv02Byytppdw+9zwbXHy/x1bfM0D70FiYU17AZ1TpiNQhd/Dj+R1dGx/Pt2B0NZwM59eDIAWnxjvr47qdepjgR30jQ8JOGHjgAOS5B8j2C7Bq3ktSbPSfc/bOdoaZ91SGCwle0CEltZYaxE0ZoU5k4olPz4gzqpqMEMysUGOBk3SE0Bmg6kw7+Y0QXKEmcPyTwzLb9p+awjL3oafBZQpc8hZSy6XLrwPLhi0PoTJ1qpPMnMxzIM1OUdS4+kcnG3B3NchRiyjzgK5MRqXJ4q5ZtYUT8dACVchwsu/7jNsYUsFycyZYKkxqQ3DWyP7ZZyeufjGwzOmMyZil8PVe1GYdnJ3zKMwKWg32pvO8y+y7kSM4Ilg4s2YuUQSyR7xDR5vJ8GY0E07C45PGO2muqE3KfGjsz5TLCxrn0EAuwvbAgbABRww4wtDgnF9VLjRijjzJCxpX/dsGESnoolM5JrJFVNNgon/de/zVwCIxUehC5aCQCgfF7cwi7jxwECDC9oDnNvjYhw9V0GrnZUj7U/3yQsgdINO9HljBEDSYFfYGKoegUr3XkS21Iw1UHg30hrw+BI7d0EgKbFTvffDyIvLFxtNVD2PXa3CRHBc1cECi8Y11bK
o9dfAcmesTVIXp28glraNAszG54Caax6WGAXOCbETDHUTIkvBCpRPej7onq4rT7oGvQrZhzUzyTqcpWz8IReZnjOqVAC2Fw/6awvkVDl5GlpoDwvoD6aLXmYnvL2dPz93L6zvx2327eOIZPP46b1hn76RH6rCsw0v/Lop1uJgoPqg6oKpsoHBwACoTCtydZlkO2shvp1loKS+nnWbc10y6TFfdRrO4bhd5TDvcuIyj8AHOzZLYdl1eDlksd1NZMMiXYrGdTOSdODP0fgrt237D3eyZANtfrIxZGy1qZxv/CJ/iI1OX1my5f67B3Yn6z6CFs/EoAqD9Tp/YZUd63kSgqJGej5fivb4+siFwwskVDW5S4EYVhJqhRqrTz31l+22AEg5jiznfL/EPhpTgY8Z2s0An89LCi3/Rs8zV1ktNtuLbeo1OPRPTGG15p57FHKZYhqFnyvlkT1GUhficT405xCF1feVATimTgs3UPzqnNFZHBbIcIRtgG//oj54dR3WdMAKLpDpy6WCqmk/0kDccr51SUvg/ \ No newline at end of file diff --git a/src/MLJ.jl b/src/MLJ.jl index 161c975eb..b2935eac1 100644 --- a/src/MLJ.jl +++ b/src/MLJ.jl @@ -1,3 +1,34 @@ +""" + MLJ + +[`MLJ`](https://alan-turing-institute.github.io/MLJ.jl/dev/) is a Machine Learning toolbox +for Julia. It collects together functionality from the following packages, which can be +loaded separately: + +- MLJBase.jl: The `machine` interface, tools to `partition` and `unpack` datasets, + `evaluate`/`evaluate!` for model performance, `|>` pipeline syntax, + `TransformedTargetModel` wrapper, general model composition syntax (learning networks), + synthetic data generators, `scitype` and `schema` methods (from ScientificTypes.jl) for + checking how MLJ interprets your data + +- StatisticalMeasures.jl: MLJ-compatible measures (metrics) for machine learning, + confusion matrices, ROC curves. + +- MLJModels.jl: Common transformers for data preprocessing, searching the model registry, + loading models with `@load` + +- MLJTuning.jl: Hyperparameter optimization via `TunedModel` wrapper + +- MLJIteration.jl: `IteratedModel` Wrapper for controlling iterative models + +- MLJEnsembles.jl: Homogeneous model ensembling, via the `EnsembleModel` wrapper + +- MLJBalancing.jl: Incorporation of oversampling/undersampling methods in pipelines, via + the `BalancedModel` wrapper + +- OpenML.jl: Tool for grabbing datasets from OpenML.org + +""" module MLJ @@ -18,6 +49,7 @@ using MLJTuning using MLJModels using OpenML @reexport using MLJFlow +@reexport using StatisticalMeasures using MLJIteration import MLJIteration.IterationControl @@ -108,16 +140,6 @@ for T in MLJBase.EXTENDED_ABSTRACT_MODEL_TYPES @eval(export $T) end -# MLJBase/measures: -# measure names: -for m in MLJBase.MEASURE_TYPES_ALIASES_AND_INSTANCES - :(export $m) |> eval -end -export measures, - aggregate, default_measure, skipinvalid, - roc_curve, roc, - no_avg, macro_avg, micro_avg - # re-export from MLJEnsembles: export EnsembleModel diff --git a/test/exported_names.jl b/test/exported_names.jl index fe95fc3e7..e49709972 100644 --- a/test/exported_names.jl +++ b/test/exported_names.jl @@ -27,4 +27,10 @@ Save() MLFlowLogger +# StatisticalMeasures + +rms +l2 +log_score + true
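Since `src/MLJ.jl` now carries `@reexport using StatisticalMeasures` and `test/exported_names.jl` exercises `rms`, `l2` and `log_score`, the user-facing effect can be sketched as follows; the numeric values are made up and serve only to show that the names are in scope after `using MLJ`.

```julia
# Sketch: measure aliases and constructors from StatisticalMeasures are available
# directly after `using MLJ`, with no separate `using StatisticalMeasures`.
using MLJ

yhat = [1.2, 2.1, 2.9]
y    = [1.0, 2.0, 3.0]

rms(yhat, y)          # alias for RootMeanSquaredError()
l2(yhat, y)           # alias for LPLoss(2)
LPLoss(1)(yhat, y)    # measure constructors are exported too
measures()            # list all measures, with their aliases and metadata
# log_score and the other probabilistic measures are likewise in scope.
```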