From b8ba45193d88f7d6037cb5444f64bb452e9e58a2 Mon Sep 17 00:00:00 2001 From: Adam Pocock Date: Fri, 7 Oct 2022 14:29:39 -0400 Subject: [PATCH] Release Tribuo 4.3 (#289) * Updating readme and adding v4.3 release notes. * Adding more to the release notes. * Adding the TF PR. * Bumping to 4.3. * Updating release notes. * Version bumps in readme and release notes. * Fixing the version number in the feature selection pom. * Updating tutorials for 4.3.0. * Update tribuo-v4-3-release-notes.md More spaces --- AnomalyDetection/Core/pom.xml | 2 +- AnomalyDetection/LibLinear/pom.xml | 2 +- AnomalyDetection/LibSVM/pom.xml | 2 +- AnomalyDetection/pom.xml | 2 +- Classification/Core/pom.xml | 2 +- Classification/DecisionTree/pom.xml | 2 +- Classification/Experiments/pom.xml | 2 +- Classification/Explanations/pom.xml | 2 +- Classification/FeatureSelection/pom.xml | 2 +- Classification/LibLinear/pom.xml | 2 +- Classification/LibSVM/pom.xml | 2 +- Classification/MultinomialNaiveBayes/pom.xml | 2 +- Classification/SGD/pom.xml | 2 +- Classification/XGBoost/pom.xml | 2 +- Classification/pom.xml | 2 +- Clustering/Core/pom.xml | 2 +- Clustering/Hdbscan/pom.xml | 2 +- Clustering/KMeans/pom.xml | 2 +- Clustering/pom.xml | 2 +- Common/LibLinear/pom.xml | 2 +- Common/LibSVM/pom.xml | 2 +- Common/NearestNeighbour/pom.xml | 2 +- Common/SGD/pom.xml | 2 +- Common/Trees/pom.xml | 2 +- Common/XGBoost/pom.xml | 2 +- Common/pom.xml | 2 +- Core/pom.xml | 2 +- Data/pom.xml | 2 +- Interop/Core/pom.xml | 2 +- Interop/ModelCard/pom.xml | 2 +- Interop/OCI/pom.xml | 2 +- Interop/ONNX/pom.xml | 2 +- Interop/Tensorflow/pom.xml | 2 +- Interop/pom.xml | 2 +- Json/pom.xml | 2 +- Math/pom.xml | 2 +- MultiLabel/Core/pom.xml | 2 +- MultiLabel/SGD/pom.xml | 2 +- MultiLabel/pom.xml | 2 +- README.md | 62 +-- Regression/Core/pom.xml | 2 +- Regression/LibLinear/pom.xml | 2 +- Regression/LibSVM/pom.xml | 2 +- Regression/RegressionTree/pom.xml | 2 +- Regression/SGD/pom.xml | 2 +- Regression/SLM/pom.xml | 2 +- 
Regression/XGBoost/pom.xml | 2 +- Regression/pom.xml | 2 +- Reproducibility/pom.xml | 2 +- Util/InformationTheory/pom.xml | 2 +- Util/ONNXExport/pom.xml | 2 +- Util/Tokenization/pom.xml | 2 +- Util/pom.xml | 2 +- distribution/pom.xml | 2 +- .../tribuo-v4-3-release-notes.md | 117 ++++++ pom.xml | 2 +- tests/pom.xml | 2 +- tutorials/anomaly-tribuo-v4.ipynb | 6 +- tutorials/clustering-hdbscan-tribuo-v4.ipynb | 22 +- tutorials/clustering-tribuo-v4.ipynb | 31 +- tutorials/columnar-tribuo-v4.ipynb | 19 +- tutorials/configuration-tribuo-v4.ipynb | 48 +-- .../document-classification-tribuo-v4.ipynb | 390 +++++++++--------- tutorials/external-models-tribuo-v4.ipynb | 6 +- tutorials/feature-selection-tribuo-v4.ipynb | 54 +-- tutorials/irises-tribuo-v4.ipynb | 86 ++-- tutorials/modelcard-tribuo-v4.ipynb | 56 +-- tutorials/multi-label-tribuo-v4.ipynb | 22 +- tutorials/onnx-export-tribuo-v4.ipynb | 78 ++-- tutorials/regression-tribuo-v4.ipynb | 18 +- tutorials/reproducibility-tribuo-v4.ipynb | 98 +++-- tutorials/tensorflow-tribuo-v4.ipynb | 66 +-- 72 files changed, 725 insertions(+), 564 deletions(-) create mode 100644 docs/release-notes/tribuo-v4-3-release-notes.md diff --git a/AnomalyDetection/Core/pom.xml b/AnomalyDetection/Core/pom.xml index 0e5897b38..75672a09b 100644 --- a/AnomalyDetection/Core/pom.xml +++ b/AnomalyDetection/Core/pom.xml @@ -21,7 +21,7 @@ org.tribuo tribuo-anomaly - 4.3.0-SNAPSHOT + 4.3.0 ../pom.xml AnomalyDetection-Core diff --git a/AnomalyDetection/LibLinear/pom.xml b/AnomalyDetection/LibLinear/pom.xml index 43736051a..07c3d2069 100644 --- a/AnomalyDetection/LibLinear/pom.xml +++ b/AnomalyDetection/LibLinear/pom.xml @@ -20,7 +20,7 @@ org.tribuo tribuo-anomaly - 4.3.0-SNAPSHOT + 4.3.0 ../pom.xml AnomalyDetection-LibLinear diff --git a/AnomalyDetection/LibSVM/pom.xml b/AnomalyDetection/LibSVM/pom.xml index a065d2b53..139421b45 100644 --- a/AnomalyDetection/LibSVM/pom.xml +++ b/AnomalyDetection/LibSVM/pom.xml @@ -21,7 +21,7 @@ org.tribuo tribuo-anomaly - 
4.3.0-SNAPSHOT + 4.3.0 ../pom.xml AnomalyDetection-LibSVM diff --git a/AnomalyDetection/pom.xml b/AnomalyDetection/pom.xml index 6c67d6f32..4f7ad4646 100644 --- a/AnomalyDetection/pom.xml +++ b/AnomalyDetection/pom.xml @@ -21,7 +21,7 @@ org.tribuo tribuo - 4.3.0-SNAPSHOT + 4.3.0 ../pom.xml tribuo-anomaly diff --git a/Classification/Core/pom.xml b/Classification/Core/pom.xml index 66aeaecba..8dfe61fa0 100644 --- a/Classification/Core/pom.xml +++ b/Classification/Core/pom.xml @@ -21,7 +21,7 @@ org.tribuo tribuo-classification - 4.3.0-SNAPSHOT + 4.3.0 ../pom.xml Classification-Core diff --git a/Classification/DecisionTree/pom.xml b/Classification/DecisionTree/pom.xml index bb8aefece..f8bf11b02 100644 --- a/Classification/DecisionTree/pom.xml +++ b/Classification/DecisionTree/pom.xml @@ -21,7 +21,7 @@ org.tribuo tribuo-classification - 4.3.0-SNAPSHOT + 4.3.0 ../pom.xml Classification-Tree diff --git a/Classification/Experiments/pom.xml b/Classification/Experiments/pom.xml index 1572b4a53..879d55aba 100644 --- a/Classification/Experiments/pom.xml +++ b/Classification/Experiments/pom.xml @@ -21,7 +21,7 @@ org.tribuo tribuo-classification - 4.3.0-SNAPSHOT + 4.3.0 ../pom.xml Classification-Experiments diff --git a/Classification/Explanations/pom.xml b/Classification/Explanations/pom.xml index bac01b775..ee6cfec24 100644 --- a/Classification/Explanations/pom.xml +++ b/Classification/Explanations/pom.xml @@ -20,7 +20,7 @@ org.tribuo tribuo-classification - 4.3.0-SNAPSHOT + 4.3.0 ../pom.xml Classification-Explanations diff --git a/Classification/FeatureSelection/pom.xml b/Classification/FeatureSelection/pom.xml index 040524c71..7923d322a 100644 --- a/Classification/FeatureSelection/pom.xml +++ b/Classification/FeatureSelection/pom.xml @@ -20,7 +20,7 @@ org.tribuo tribuo-classification - 4.3.0-SNAPSHOT + 4.3.0 ../pom.xml Classification-FeatureSelection diff --git a/Classification/LibLinear/pom.xml b/Classification/LibLinear/pom.xml index 04f586b41..9ed4d5d2b 100644 --- 
a/Classification/LibLinear/pom.xml +++ b/Classification/LibLinear/pom.xml @@ -20,7 +20,7 @@ org.tribuo tribuo-classification - 4.3.0-SNAPSHOT + 4.3.0 ../pom.xml Classification-LibLinear diff --git a/Classification/LibSVM/pom.xml b/Classification/LibSVM/pom.xml index ef1f5423e..356809703 100644 --- a/Classification/LibSVM/pom.xml +++ b/Classification/LibSVM/pom.xml @@ -21,7 +21,7 @@ org.tribuo tribuo-classification - 4.3.0-SNAPSHOT + 4.3.0 ../pom.xml Classification-LibSVM diff --git a/Classification/MultinomialNaiveBayes/pom.xml b/Classification/MultinomialNaiveBayes/pom.xml index 45463b446..e18b32866 100644 --- a/Classification/MultinomialNaiveBayes/pom.xml +++ b/Classification/MultinomialNaiveBayes/pom.xml @@ -22,7 +22,7 @@ org.tribuo tribuo-classification - 4.3.0-SNAPSHOT + 4.3.0 Classification-MultinomialNaiveBayes tribuo-classification-mnnaivebayes diff --git a/Classification/SGD/pom.xml b/Classification/SGD/pom.xml index 1da87d230..103b455d8 100644 --- a/Classification/SGD/pom.xml +++ b/Classification/SGD/pom.xml @@ -20,7 +20,7 @@ org.tribuo tribuo-classification - 4.3.0-SNAPSHOT + 4.3.0 ../pom.xml Classification-SGD diff --git a/Classification/XGBoost/pom.xml b/Classification/XGBoost/pom.xml index 62666e718..be381123d 100644 --- a/Classification/XGBoost/pom.xml +++ b/Classification/XGBoost/pom.xml @@ -21,7 +21,7 @@ org.tribuo tribuo-classification - 4.3.0-SNAPSHOT + 4.3.0 ../pom.xml Classification-XGBoost diff --git a/Classification/pom.xml b/Classification/pom.xml index 2cb681313..2eae3f95b 100644 --- a/Classification/pom.xml +++ b/Classification/pom.xml @@ -21,7 +21,7 @@ org.tribuo tribuo - 4.3.0-SNAPSHOT + 4.3.0 ../pom.xml tribuo-classification diff --git a/Clustering/Core/pom.xml b/Clustering/Core/pom.xml index de284dffc..048d58db7 100644 --- a/Clustering/Core/pom.xml +++ b/Clustering/Core/pom.xml @@ -21,7 +21,7 @@ org.tribuo tribuo-clustering - 4.3.0-SNAPSHOT + 4.3.0 ../pom.xml Clustering-Core diff --git a/Clustering/Hdbscan/pom.xml 
b/Clustering/Hdbscan/pom.xml index 446bd587d..d8a0ccb36 100644 --- a/Clustering/Hdbscan/pom.xml +++ b/Clustering/Hdbscan/pom.xml @@ -21,7 +21,7 @@ org.tribuo tribuo-clustering - 4.3.0-SNAPSHOT + 4.3.0 ../pom.xml Clustering-Hdbscan diff --git a/Clustering/KMeans/pom.xml b/Clustering/KMeans/pom.xml index 517b6b018..8e6250b52 100644 --- a/Clustering/KMeans/pom.xml +++ b/Clustering/KMeans/pom.xml @@ -21,7 +21,7 @@ org.tribuo tribuo-clustering - 4.3.0-SNAPSHOT + 4.3.0 ../pom.xml Clustering-KMeans diff --git a/Clustering/pom.xml b/Clustering/pom.xml index 3d321b91d..285d645f9 100644 --- a/Clustering/pom.xml +++ b/Clustering/pom.xml @@ -21,7 +21,7 @@ org.tribuo tribuo - 4.3.0-SNAPSHOT + 4.3.0 ../pom.xml tribuo-clustering diff --git a/Common/LibLinear/pom.xml b/Common/LibLinear/pom.xml index c1b2595a7..3e30d80b9 100644 --- a/Common/LibLinear/pom.xml +++ b/Common/LibLinear/pom.xml @@ -21,7 +21,7 @@ org.tribuo tribuo-common - 4.3.0-SNAPSHOT + 4.3.0 ../pom.xml Common-LibLinear diff --git a/Common/LibSVM/pom.xml b/Common/LibSVM/pom.xml index f567a4d3f..2e856b60b 100644 --- a/Common/LibSVM/pom.xml +++ b/Common/LibSVM/pom.xml @@ -21,7 +21,7 @@ org.tribuo tribuo-common - 4.3.0-SNAPSHOT + 4.3.0 ../pom.xml Common-LibSVM diff --git a/Common/NearestNeighbour/pom.xml b/Common/NearestNeighbour/pom.xml index 7e2fae4d4..30738e96e 100644 --- a/Common/NearestNeighbour/pom.xml +++ b/Common/NearestNeighbour/pom.xml @@ -21,7 +21,7 @@ org.tribuo tribuo-common - 4.3.0-SNAPSHOT + 4.3.0 ../pom.xml Common-NearestNeighbour diff --git a/Common/SGD/pom.xml b/Common/SGD/pom.xml index 2de721669..ea83850df 100644 --- a/Common/SGD/pom.xml +++ b/Common/SGD/pom.xml @@ -21,7 +21,7 @@ org.tribuo tribuo-common - 4.3.0-SNAPSHOT + 4.3.0 ../pom.xml Common-SGD diff --git a/Common/Trees/pom.xml b/Common/Trees/pom.xml index cea202cb9..8cf5bee12 100644 --- a/Common/Trees/pom.xml +++ b/Common/Trees/pom.xml @@ -21,7 +21,7 @@ org.tribuo tribuo-common - 4.3.0-SNAPSHOT + 4.3.0 ../pom.xml Common-Tree diff --git 
a/Common/XGBoost/pom.xml b/Common/XGBoost/pom.xml index 4b170ed99..bdd0039a1 100644 --- a/Common/XGBoost/pom.xml +++ b/Common/XGBoost/pom.xml @@ -21,7 +21,7 @@ org.tribuo tribuo-common - 4.3.0-SNAPSHOT + 4.3.0 ../pom.xml Common-XGBoost diff --git a/Common/pom.xml b/Common/pom.xml index 1192e611c..0f1618f5e 100644 --- a/Common/pom.xml +++ b/Common/pom.xml @@ -21,7 +21,7 @@ org.tribuo tribuo - 4.3.0-SNAPSHOT + 4.3.0 ../pom.xml tribuo-common diff --git a/Core/pom.xml b/Core/pom.xml index 2beac4a62..c8b2b287c 100644 --- a/Core/pom.xml +++ b/Core/pom.xml @@ -21,7 +21,7 @@ org.tribuo tribuo - 4.3.0-SNAPSHOT + 4.3.0 ../pom.xml Core diff --git a/Data/pom.xml b/Data/pom.xml index e8a1b2a4a..9a74e0372 100644 --- a/Data/pom.xml +++ b/Data/pom.xml @@ -21,7 +21,7 @@ org.tribuo tribuo - 4.3.0-SNAPSHOT + 4.3.0 ../pom.xml Data diff --git a/Interop/Core/pom.xml b/Interop/Core/pom.xml index 36f007685..fda41ed94 100644 --- a/Interop/Core/pom.xml +++ b/Interop/Core/pom.xml @@ -21,7 +21,7 @@ org.tribuo tribuo-interop - 4.3.0-SNAPSHOT + 4.3.0 ../pom.xml Interop-Core diff --git a/Interop/ModelCard/pom.xml b/Interop/ModelCard/pom.xml index 5ad71462e..174b692e0 100644 --- a/Interop/ModelCard/pom.xml +++ b/Interop/ModelCard/pom.xml @@ -5,7 +5,7 @@ tribuo-interop org.tribuo - 4.3.0-SNAPSHOT + 4.3.0 4.0.0 ModelCard diff --git a/Interop/OCI/pom.xml b/Interop/OCI/pom.xml index febb8cb8c..8d2ba3e01 100644 --- a/Interop/OCI/pom.xml +++ b/Interop/OCI/pom.xml @@ -20,7 +20,7 @@ org.tribuo tribuo-interop - 4.3.0-SNAPSHOT + 4.3.0 ../pom.xml OCI diff --git a/Interop/ONNX/pom.xml b/Interop/ONNX/pom.xml index d9c47d840..c8a55cf92 100644 --- a/Interop/ONNX/pom.xml +++ b/Interop/ONNX/pom.xml @@ -21,7 +21,7 @@ org.tribuo tribuo-interop - 4.3.0-SNAPSHOT + 4.3.0 ../pom.xml Interop-ONNX diff --git a/Interop/Tensorflow/pom.xml b/Interop/Tensorflow/pom.xml index cb3b7432d..05c403fa7 100644 --- a/Interop/Tensorflow/pom.xml +++ b/Interop/Tensorflow/pom.xml @@ -21,7 +21,7 @@ org.tribuo tribuo-interop - 
4.3.0-SNAPSHOT + 4.3.0 ../pom.xml Interop-Tensorflow diff --git a/Interop/pom.xml b/Interop/pom.xml index 0da938c66..4348ca943 100644 --- a/Interop/pom.xml +++ b/Interop/pom.xml @@ -21,7 +21,7 @@ org.tribuo tribuo - 4.3.0-SNAPSHOT + 4.3.0 ../pom.xml tribuo-interop diff --git a/Json/pom.xml b/Json/pom.xml index 9388a31fe..1f0b9b03a 100644 --- a/Json/pom.xml +++ b/Json/pom.xml @@ -21,7 +21,7 @@ org.tribuo tribuo - 4.3.0-SNAPSHOT + 4.3.0 4.0.0 Json diff --git a/Math/pom.xml b/Math/pom.xml index 0f4ea98c7..a1cbf34e7 100644 --- a/Math/pom.xml +++ b/Math/pom.xml @@ -20,7 +20,7 @@ org.tribuo tribuo - 4.3.0-SNAPSHOT + 4.3.0 ../pom.xml Math diff --git a/MultiLabel/Core/pom.xml b/MultiLabel/Core/pom.xml index 5c71cb8cd..c021ea69e 100644 --- a/MultiLabel/Core/pom.xml +++ b/MultiLabel/Core/pom.xml @@ -21,7 +21,7 @@ org.tribuo tribuo-multilabel - 4.3.0-SNAPSHOT + 4.3.0 ../pom.xml MultiLabel-Core diff --git a/MultiLabel/SGD/pom.xml b/MultiLabel/SGD/pom.xml index d0b3b260c..459fd5c5f 100644 --- a/MultiLabel/SGD/pom.xml +++ b/MultiLabel/SGD/pom.xml @@ -20,7 +20,7 @@ org.tribuo tribuo-multilabel - 4.3.0-SNAPSHOT + 4.3.0 ../pom.xml MultiLabel-SGD diff --git a/MultiLabel/pom.xml b/MultiLabel/pom.xml index ae0109646..c0f9f40fd 100644 --- a/MultiLabel/pom.xml +++ b/MultiLabel/pom.xml @@ -21,7 +21,7 @@ org.tribuo tribuo - 4.3.0-SNAPSHOT + 4.3.0 ../pom.xml tribuo-multilabel diff --git a/README.md b/README.md index 929af111b..d50aceeed 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@

Tribuo Logo

-# Tribuo - A Java prediction library (v4.2) +# Tribuo - A Java prediction library (v4.3) [Tribuo](https://tribuo.org) is a machine learning library in Java that provides multi-class classification, regression, clustering, anomaly detection @@ -13,10 +13,10 @@ Learning Research Group; we welcome community contributions. All trainers are configurable using the [OLCUT](https://github.com/oracle/olcut) configuration system. This allows a -user to define a trainer in an xml file and repeatably build models. Example -configurations for each of the supplied Trainers can be found in the config -folder of each package. These configuration files can also be written in json -or edn by using the appropriate OLCUT configuration dependency. Models and +user to define a trainer in an xml or json file and repeatably build models. +Example configurations for each of the supplied Trainers can be found in the +config folder of each package. These configuration files can also be written in +json or edn by using the appropriate OLCUT configuration dependency. Models and datasets are serializable using Java serialization. All models and evaluations include a serializable provenance object which @@ -37,15 +37,15 @@ architectures on Windows 10, macOS and Linux (RHEL/OL/CentOS 7+), as these are supported platforms for the native libraries with which we interface. If you're interested in another platform and wish to use one of the native library interfaces (ONNX Runtime, TensorFlow, and XGBoost), we recommend reaching out -to the developers of those libraries. Note the reproducibility package -requires Java 17, and as such is not part of the `tribuo-all` Maven Central -deployment. +to the developers of those libraries. Note the model card and reproducibility +packages require Java 17, and as such are not part of the `tribuo-all` Maven +Central deployment. 
## Documentation * [Library Architecture](docs/Architecture.md) * [Package Overview](docs/PackageOverview.md) -* Javadoc [4.2](https://tribuo.org/learn/4.2/javadoc), [4.1](https://tribuo.org/learn/4.1/javadoc/), [4.0](https://tribuo.org/learn/4.0/javadoc/) +* Javadoc [4.3](https://tribuo.org/learn/4.3/javadoc), [4.2](https://tribuo.org/learn/4.2/javadoc), [4.1](https://tribuo.org/learn/4.1/javadoc/), [4.0](https://tribuo.org/learn/4.0/javadoc/) * [Helper Programs](docs/HelperPrograms.md) * [Developer Documentation](docs/Internals.md) * [Roadmap](docs/Roadmap.md) @@ -58,9 +58,9 @@ Regression, Anomaly Detection, TensorFlow, document classification, columnar data loading, working with externally trained models, and the configuration system, can be found in the [tutorials](tutorials). These use the [IJava](https://github.com/SpencerPark/IJava) Jupyter notebook kernel, and work -with Java 10+, except the reproducibility tutorial which requires Java 17. To -convert the tutorials' code back to Java 8, in most cases simply replace the -`var` keyword with the appropriate types. +with Java 10+, except the model card & reproducibility tutorials which require +Java 17. To convert the tutorials' code back to Java 8, in most cases simply +replace the `var` keyword with the appropriate types. ## Algorithms @@ -101,6 +101,13 @@ Tribuo has implementations or interfaces for: Tribuo also supplies a linear chain CRF for sequence classification tasks. This CRF is trained via SGD using any of Tribuo's gradient optimizers. +Tribuo has a set of information theoretic feature selection algorithms which +can be applied to classification tasks. Feature inputs are automatically +discretised into equal width bins. At the moment this includes implementations +of mutual information maximisation (MIM), Conditional Mutual Information +Maximisation (CMIM), minimum Redundancy Maximum Relevancy (mRMR) and Joint +Mutual Information (JMI). 
+ To explain classifier predictions there is an implementation of the LIME algorithm. Tribuo's implementation allows the mixing of text and tabular data, along with the use of any sparse model as an explainer (e.g., regression trees, @@ -167,37 +174,38 @@ discuss how it would fit into Tribuo. Currently we have interfaces to: -* [LibLinear](https://github.com/bwaldvogel/liblinear-java) - via the LibLinear-java port of the original [LibLinear](https://www.csie.ntu.edu.tw/~cjlin/liblinear/) (v2.43). +* [LibLinear](https://github.com/bwaldvogel/liblinear-java) - via the LibLinear-java port of the original [LibLinear](https://www.csie.ntu.edu.tw/~cjlin/liblinear/) (v2.44). * [LibSVM](https://www.csie.ntu.edu.tw/~cjlin/libsvm/) - using the pure Java transformed version of the C++ implementation (v3.25). -* [ONNX Runtime](https://onnxruntime.ai) - via the Java API contributed by our group (v1.9.0). -* [TensorFlow](https://tensorflow.org) - Using [TensorFlow Java](https://github.com/tensorflow/java) v0.4.1 (based on TensorFlow v2.7.1). This allows the training and deployment of TensorFlow models entirely in Java. -* [XGBoost](https://xgboost.ai) - via the built in XGBoost4J API (v1.5.0). +* [ONNX Runtime](https://onnxruntime.ai) - via the Java API contributed by our group (v1.12.1). +* [TensorFlow](https://tensorflow.org) - Using [TensorFlow Java](https://github.com/tensorflow/java) v0.4.2 (based on TensorFlow v2.7.4). This allows the training and deployment of TensorFlow models entirely in Java. +* [XGBoost](https://xgboost.ai) - via the built in XGBoost4J API (v1.6.2). ## Binaries Binaries are available on Maven Central, using groupId `org.tribuo`. 
To pull -all of Tribuo, including the bindings for TensorFlow, ONNX Runtime and XGBoost -(which are native libraries), use: +all the Java 8 compatible components of Tribuo, including the bindings for +TensorFlow, ONNX Runtime and XGBoost (which are native libraries), use: Maven: ```xml org.tribuo tribuo-all - 4.2.1 + 4.3.0 pom ``` or from Gradle: ```groovy -implementation ("org.tribuo:tribuo-all:4.2.1@pom") { +implementation ("org.tribuo:tribuo-all:4.3.0@pom") { transitive = true // for build.gradle (i.e., Groovy) // isTransitive = true // for build.gradle.kts (i.e., Kotlin) } ``` The `tribuo-all` dependency is a pom which depends on all the Tribuo -subprojects except for the reproducibility project which requires Java 17. +subprojects except for the model card and reproducibility projects which +require Java 17. Most of Tribuo is pure Java and thus cross-platform, however some of the interfaces link to libraries which use native code. Those interfaces @@ -207,9 +215,11 @@ are supplied. If you need support for a specific platform, reach out to the maintainers of those projects. As of the 4.1 release these native packages all provide x86\_64 binaries for Windows, macOS and Linux. It is also possible to compile each package for macOS ARM64 (i.e., Apple Silicon), though there are no -binaries available on Maven Central for that platform. When developing on an -ARM platform you can select the `arm` profile in Tribuo's pom.xml to disable -the native library tests. +binaries available on Maven Central for that platform for TensorFlow or +XGBoost. As of the 4.3 release Tribuo now depends on a version of ONNX Runtime +which includes support for macOS ARM64 and Linux aarch64 platforms. When +developing on an ARM platform you can select the `arm` profile in Tribuo's +`pom.xml` to disable the native library tests. Individual jars are published for each Tribuo module. It is preferable to depend only on the modules necessary for the specific project. 
This prevents @@ -223,7 +233,8 @@ with the latest release. To build, simply run `mvn clean package`. All Tribuo's dependencies should be available on Maven Central. Please file an issue for build-related issues if you're having trouble (though do check if you're missing proxy settings for Maven first, as that's a common cause of build -failures, and out of our control). +failures, and out of our control). Note if you're building using Java 16 or +earlier the model card and reproducibility packages will be disabled. ## Repository Layout @@ -254,6 +265,7 @@ Tribuo is licensed under the [Apache 2.0 License](./LICENSE.txt). ## Release Notes: +- [v4.3.0](https://github.com/oracle/tribuo/blob/main/docs/release-notes/tribuo-v4-3-release-notes.md) - Model card support, feature selection for classification, protobuf serialization format, kd-tree for distance computations, speed improvements for sparse linear models. Version bumps for most dependencies, and various other small fixes and improvements. - [v4.2.1](https://github.com/oracle/tribuo/blob/main/docs/release-notes/tribuo-v4-2-1-release-notes.md) - Bug fixes for KMeans' multithreading, nondeterministic iteration orders affecting ONNX export and K-Means initialization, and upgraded TF-Java to 0.4.1. - [v4.2.0](https://github.com/oracle/tribuo/blob/main/docs/release-notes/tribuo-v4-2-release-notes.md) - Added factorization machines, classifier chains, HDBSCAN. Added ONNX export and OCI Data Science integration. Added reproducibility framework. Various other small fixes and improvements, including the regression fixes from v4.1.1. Filled out the remaining javadoc, added 4 new tutorials (onnx export, multi-label classification, reproducibility, hdbscan), expanded existing tutorials. - [v4.1.1](https://github.com/oracle/tribuo/blob/main/docs/release-notes/tribuo-v4-1-1-release-notes.md) - Bug fixes for multi-output regression, multi-label evaluation, KMeans & KNN with SecurityManager, and update TF-Java 0.4.0. 
diff --git a/Regression/Core/pom.xml b/Regression/Core/pom.xml index 98640b92e..57a7ce958 100644 --- a/Regression/Core/pom.xml +++ b/Regression/Core/pom.xml @@ -21,7 +21,7 @@ org.tribuo tribuo-regression - 4.3.0-SNAPSHOT + 4.3.0 ../pom.xml Regression-Core diff --git a/Regression/LibLinear/pom.xml b/Regression/LibLinear/pom.xml index 848c33031..251333abe 100644 --- a/Regression/LibLinear/pom.xml +++ b/Regression/LibLinear/pom.xml @@ -21,7 +21,7 @@ org.tribuo tribuo-regression - 4.3.0-SNAPSHOT + 4.3.0 ../pom.xml Regression-LibLinear diff --git a/Regression/LibSVM/pom.xml b/Regression/LibSVM/pom.xml index f99daa86b..55efa60a0 100644 --- a/Regression/LibSVM/pom.xml +++ b/Regression/LibSVM/pom.xml @@ -21,7 +21,7 @@ org.tribuo tribuo-regression - 4.3.0-SNAPSHOT + 4.3.0 ../pom.xml Regression-LibSVM diff --git a/Regression/RegressionTree/pom.xml b/Regression/RegressionTree/pom.xml index 9984cc7de..a70ad74f4 100644 --- a/Regression/RegressionTree/pom.xml +++ b/Regression/RegressionTree/pom.xml @@ -21,7 +21,7 @@ org.tribuo tribuo-regression - 4.3.0-SNAPSHOT + 4.3.0 ../pom.xml Regression-Tree diff --git a/Regression/SGD/pom.xml b/Regression/SGD/pom.xml index 1b9eed27e..f9ad1a88d 100644 --- a/Regression/SGD/pom.xml +++ b/Regression/SGD/pom.xml @@ -20,7 +20,7 @@ org.tribuo tribuo-regression - 4.3.0-SNAPSHOT + 4.3.0 ../pom.xml Regression-SGD diff --git a/Regression/SLM/pom.xml b/Regression/SLM/pom.xml index b95a8faf6..3222dadac 100644 --- a/Regression/SLM/pom.xml +++ b/Regression/SLM/pom.xml @@ -20,7 +20,7 @@ org.tribuo tribuo-regression - 4.3.0-SNAPSHOT + 4.3.0 ../pom.xml Regression-SLM diff --git a/Regression/XGBoost/pom.xml b/Regression/XGBoost/pom.xml index 217c800e3..a45d59ab6 100644 --- a/Regression/XGBoost/pom.xml +++ b/Regression/XGBoost/pom.xml @@ -21,7 +21,7 @@ org.tribuo tribuo-regression - 4.3.0-SNAPSHOT + 4.3.0 ../pom.xml Regression-XGBoost diff --git a/Regression/pom.xml b/Regression/pom.xml index 51103ce4b..d349b24a0 100644 --- a/Regression/pom.xml +++ 
b/Regression/pom.xml @@ -21,7 +21,7 @@ org.tribuo tribuo - 4.3.0-SNAPSHOT + 4.3.0 ../pom.xml tribuo-regression diff --git a/Reproducibility/pom.xml b/Reproducibility/pom.xml index 9b7dc416c..3a6702d52 100644 --- a/Reproducibility/pom.xml +++ b/Reproducibility/pom.xml @@ -21,7 +21,7 @@ org.tribuo tribuo - 4.3.0-SNAPSHOT + 4.3.0 4.0.0 Reproducibility diff --git a/Util/InformationTheory/pom.xml b/Util/InformationTheory/pom.xml index 8f486ba0c..692e34ffb 100644 --- a/Util/InformationTheory/pom.xml +++ b/Util/InformationTheory/pom.xml @@ -21,7 +21,7 @@ org.tribuo tribuo-util - 4.3.0-SNAPSHOT + 4.3.0 ../pom.xml InformationTheory diff --git a/Util/ONNXExport/pom.xml b/Util/ONNXExport/pom.xml index af044cd59..2bf8f77b6 100644 --- a/Util/ONNXExport/pom.xml +++ b/Util/ONNXExport/pom.xml @@ -21,7 +21,7 @@ org.tribuo tribuo-util - 4.3.0-SNAPSHOT + 4.3.0 ../pom.xml ONNXExport diff --git a/Util/Tokenization/pom.xml b/Util/Tokenization/pom.xml index 1301629d2..242c60919 100644 --- a/Util/Tokenization/pom.xml +++ b/Util/Tokenization/pom.xml @@ -21,7 +21,7 @@ org.tribuo tribuo-util - 4.3.0-SNAPSHOT + 4.3.0 ../pom.xml Tokenization diff --git a/Util/pom.xml b/Util/pom.xml index e8d61b3a7..46680b699 100644 --- a/Util/pom.xml +++ b/Util/pom.xml @@ -21,7 +21,7 @@ org.tribuo tribuo - 4.3.0-SNAPSHOT + 4.3.0 ../pom.xml tribuo-util diff --git a/distribution/pom.xml b/distribution/pom.xml index 8b6958d0c..278ce6694 100644 --- a/distribution/pom.xml +++ b/distribution/pom.xml @@ -21,7 +21,7 @@ org.tribuo tribuo - 4.3.0-SNAPSHOT + 4.3.0 ../pom.xml diff --git a/docs/release-notes/tribuo-v4-3-release-notes.md b/docs/release-notes/tribuo-v4-3-release-notes.md new file mode 100644 index 000000000..f586bc75c --- /dev/null +++ b/docs/release-notes/tribuo-v4-3-release-notes.md @@ -0,0 +1,117 @@ +# Tribuo v4.3 Release Notes + +Tribuo v4.3 adds feature selection for classification problems, support for +guided generation of model cards, and protobuf serialization for all +serializable classes. 
In addition there is a new interface for distance based +computations which can now use a kd-tree or brute force comparisons, the sparse +linear model package has been rewritten to use Tribuo's linear algebra system +improving the speed and reducing memory consumption, and we've added some more +tutorials. + +Note this is likely the last feature release of Tribuo to support Java 8. The +next major version of Tribuo will require Java 17. In addition, support for +using `java.io.Serializable` for serialization will be removed in the next +major release, and Tribuo will exclusively use protobuf based serialization. + +## Feature Selection + +In this release we've added support for feature selection algorithms to the +dataset and provenance systems, along with implementations of 4 information +theoretic feature selection algorithms for use in classification problems. The +algorithms (MIM, CMIM, mRMR and JMI) are described in this [review +paper](https://jmlr.org/papers/v13/brown12a.html). Continuous inputs are +discretised into a fixed number of equal width bins before the mutual +information is computed. These algorithms are a useful feature selection +baseline, and we welcome contributions to extend the set of supported +algorithms. + +- Feature selection algorithms [#254](https://github.com/oracle/tribuo/pull/254). + +## Model Card Support + +[Model Cards](https://dl.acm.org/doi/10.1145/3287560.3287596) are a popular way +of describing a model, its training data, expected applications and any use +cases that should be avoided. In this release we've added guided generation of +model cards, where many fields are automatically generated from the provenance +information inside each Tribuo model. Fields which require user input (such as +the expected use cases for a model, or its license) can be added via a CLI +program, and the resulting model card can be saved in json format. 
+ +At the moment, the automatic data extraction fails on some kinds of nested +ensemble models which are generated without using a Tribuo `Trainer` class; +in the future we'll look at improving the data extraction for this case. + +- Model card infrastructure ([#243](https://github.com/oracle/tribuo/pull/243), [#250](https://github.com/oracle/tribuo/pull/250), [#253](https://github.com/oracle/tribuo/pull/253)). + +## Protobuf Serialization + +In this release we've added [protocol +buffer](https://developers.google.com/protocol-buffers) definitions for +serializing all of Tribuo's serializable types, along with the necessary code +to interact with those definitions. This effort has improved the validation of +serialized data, and will allow Tribuo models to be upwards compatible across +major versions of Tribuo. Any serialized model or dataset from Tribuo v4.2 or +earlier can be loaded in and saved out into the new format which will ensure +compatibility with the next major version of Tribuo. + +- Protobuf support for core types ([#226](https://github.com/oracle/tribuo/pull/226), [#255](https://github.com/oracle/tribuo/pull/255), [#262](https://github.com/oracle/tribuo/pull/262), [#264](https://github.com/oracle/tribuo/pull/264)). +- Protobuf support for models (Multinomial Naive Bayes [#267](https://github.com/oracle/tribuo/pull/267), Sparse linear models [#269](https://github.com/oracle/tribuo/pull/269), XGBoost [#270](https://github.com/oracle/tribuo/pull/270), OCI, ONNX and TF [#271](https://github.com/oracle/tribuo/pull/271), LibSVM [#272](https://github.com/oracle/tribuo/pull/272), LibLinear [#273](https://github.com/oracle/tribuo/pull/273), SGD [#275](https://github.com/oracle/tribuo/pull/275), Clustering models [#276](https://github.com/oracle/tribuo/pull/276), Baseline models and ensembles [#277](https://github.com/oracle/tribuo/pull/277), Trees [#278](https://github.com/oracle/tribuo/pull/278)). 
+- Docs and supporting programs ([#279](https://github.com/oracle/tribuo/pull/279)). + +## Smaller improvements + +We added an interface for querying the nearest neighbours of a vector, and +updated HDBSCAN, K-Means and K-NN to use the new interface. The old +implementation has been renamed the "brute force" search operator, and a new +implementation which uses a kd-tree has been added. + +- Distance refactor ([#213](https://github.com/oracle/tribuo/pull/213), [#216](https://github.com/oracle/tribuo/pull/216), [#221](https://github.com/oracle/tribuo/pull/221), [#231](https://github.com/oracle/tribuo/pull/231), [#285](https://github.com/oracle/tribuo/pull/285)). + +We migrated off Apache Commons Math, which necessitated adding several methods +to Tribuo's math library. In the process we refactored the sparse linear model +code, removing redundant matrix operations and greatly improving the speed of +LASSO. + +- Refactor sparse linear models and remove Apache Commons Math ([#241](https://github.com/oracle/tribuo/pull/241)). + +The ONNX export support has been refactored to allow the use of different ONNX +opsets, and custom ONNX operations. This allows users of Tribuo's ONNX export +support to supply their own operations, and increases the flexibility of the +ONNX support on the JVM. + +- ONNX operator refactor ([#245](https://github.com/oracle/tribuo/pull/245)). + +ONNX Runtime has been upgraded to v1.12.1, which includes Linux ARM64 and macOS +ARM64 binaries. As a result we've removed the ONNX tests from the arm Maven +profile, and so those tests will execute on Linux & macOS ARM64 platforms. + +- ONNX Runtime upgrade ([#256](https://github.com/oracle/tribuo/pull/256)). + +## Small improvements + +- Improved the assignment to the noise cluster in HDBSCAN ([#222](https://github.com/oracle/tribuo/pull/222)). +- Upgrade liblinear-java to v2.44 ([#228](https://github.com/oracle/tribuo/pull/228)). 
+- Added accessors for the HDBSCAN cluster exemplars ([#229](https://github.com/oracle/tribuo/pull/229)). +- Improve validation of salts when hashing feature names ([#237](https://github.com/oracle/tribuo/pull/237)). +- Added accessors to TransformedModel for the wrapped model ([#244](https://github.com/oracle/tribuo/pull/244)). +- Added a regex text preprocessor ([#247](https://github.com/oracle/tribuo/pull/247)). +- Upgrade OpenCSV to v5.6 ([#259](https://github.com/oracle/tribuo/pull/259)). +- Added a builder to RowProcessor to make it less confusing ([#263](https://github.com/oracle/tribuo/pull/263)). +- Upgrade TF-Java to v0.4.2 ([#281](https://github.com/oracle/tribuo/pull/281)). +- Upgrade OCI Java SDK to v2.46.0, protobuf-java to 3.19.6, XGBoost to 1.6.2, jackson to 2.14.0-rc1 ([#288](https://github.com/oracle/tribuo/pull/288)). + +## Bug Fixes + +- Fix for HDBSCAN small cluster generation ([#236](https://github.com/oracle/tribuo/pull/236)). +- XGBoost provenance capture ([#239](https://github.com/oracle/tribuo/pull/239). 
+ +## Contributors + +- Adam Pocock ([@Craigacp](https://github.com/Craigacp)) +- Jack Sullivan ([@JackSullivan](https://github.com/JackSullivan)) +- Romina Mahinpei ([@rmahinpei](https://github.com/rmahinpei)) +- Philip Ogren ([@pogren](https://github.com/pogren)) +- Katie Younglove ([@katieyounglove](https://github.com/katieyounglove)) +- Jeffrey Alexander ([@jhalexand](https://github.com/jhalexand)) +- Geoff Stewart ([@geoffreydstewart](https://github.com/geoffreydstewart)) + diff --git a/pom.xml b/pom.xml index 44f2a065a..08fdb5c78 100644 --- a/pom.xml +++ b/pom.xml @@ -20,7 +20,7 @@ 4.0.0 org.tribuo tribuo - 4.3.0-SNAPSHOT + 4.3.0 pom Core diff --git a/tests/pom.xml b/tests/pom.xml index 676369b66..30dd07930 100644 --- a/tests/pom.xml +++ b/tests/pom.xml @@ -21,7 +21,7 @@ tribuo org.tribuo - 4.3.0-SNAPSHOT + 4.3.0 ../pom.xml 4.0.0 diff --git a/tutorials/anomaly-tribuo-v4.ipynb b/tutorials/anomaly-tribuo-v4.ipynb index 82051953f..3f3a12471 100644 --- a/tutorials/anomaly-tribuo-v4.ipynb +++ b/tutorials/anomaly-tribuo-v4.ipynb @@ -19,7 +19,7 @@ "metadata": {}, "outputs": [], "source": [ - "%jars ./tribuo-anomaly-libsvm-4.3.0-SNAPSHOT-jar-with-dependencies.jar" + "%jars ./tribuo-anomaly-libsvm-4.3.0-jar-with-dependencies.jar" ] }, { @@ -111,7 +111,7 @@ "obj = 289.5926348816893, rho = 3.144570476807895\n", "nSV = 296, nBSV = 114\n", "\n", - "Training took (00:00:00:147)\n" + "Training took (00:00:00:124)\n" ] } ], @@ -210,7 +210,7 @@ "mimetype": "text/x-java-source", "name": "Java", "pygments_lexer": "java", - "version": "17+35-LTS-2724" + "version": "12+33" } }, "nbformat": 4, diff --git a/tutorials/clustering-hdbscan-tribuo-v4.ipynb b/tutorials/clustering-hdbscan-tribuo-v4.ipynb index d0ddea3ca..bf903c03d 100644 --- a/tutorials/clustering-hdbscan-tribuo-v4.ipynb +++ b/tutorials/clustering-hdbscan-tribuo-v4.ipynb @@ -19,7 +19,7 @@ "metadata": {}, "outputs": [], "source": [ - "%jars ./tribuo-clustering-hdbscan-4.3.0-SNAPSHOT-jar-with-dependencies.jar\n", + "%jars 
./tribuo-clustering-hdbscan-4.3.0-jar-with-dependencies.jar\n", "%jars ./xchart-3.8.1.jar" ] }, @@ -168,9 +168,9 @@ "outputs": [ { "data": { - "image/png": "", + "image/png": "", "text/plain": [ - "BufferedImage@73685dc: type = 1 DirectColorModel: rmask=ff0000 gmask=ff00 bmask=ff amask=0 IntegerInterleavedRaster: width = 600 height = 400 #Bands = 3 xOff = 0 yOff = 0 dataOffset[0] 0" + "BufferedImage@3d9dd79d: type = 1 DirectColorModel: rmask=ff0000 gmask=ff00 bmask=ff amask=0 IntegerInterleavedRaster: width = 600 height = 400 #Bands = 3 xOff = 0 yOff = 0 dataOffset[0] 0" ] }, "execution_count": 8, @@ -271,9 +271,9 @@ "outputs": [ { "data": { - "image/png": "", + "image/png": "", "text/plain": [ - "BufferedImage@7fc7c755: type = 1 DirectColorModel: rmask=ff0000 gmask=ff00 bmask=ff amask=0 IntegerInterleavedRaster: width = 600 height = 400 #Bands = 3 xOff = 0 yOff = 0 dataOffset[0] 0" + "BufferedImage@3193ef05: type = 1 DirectColorModel: rmask=ff0000 gmask=ff00 bmask=ff amask=0 IntegerInterleavedRaster: width = 600 height = 400 #Bands = 3 xOff = 0 yOff = 0 dataOffset[0] 0" ] }, "execution_count": 12, @@ -374,9 +374,9 @@ "outputs": [ { "data": { - "image/png": "", + "image/png": "", "text/plain": [ - "BufferedImage@2b5e6079: type = 1 DirectColorModel: rmask=ff0000 gmask=ff00 bmask=ff amask=0 IntegerInterleavedRaster: width = 600 height = 400 #Bands = 3 xOff = 0 yOff = 0 dataOffset[0] 0" + "BufferedImage@2b53682e: type = 1 DirectColorModel: rmask=ff0000 gmask=ff00 bmask=ff amask=0 IntegerInterleavedRaster: width = 600 height = 400 #Bands = 3 xOff = 0 yOff = 0 dataOffset[0] 0" ] }, "execution_count": 16, @@ -448,9 +448,9 @@ "outputs": [ { "data": { - "image/png": "", + "image/png": "", "text/plain": [ - "BufferedImage@51650547: type = 1 DirectColorModel: rmask=ff0000 gmask=ff00 bmask=ff amask=0 IntegerInterleavedRaster: width = 600 height = 400 #Bands = 3 xOff = 0 yOff = 0 dataOffset[0] 0" + "BufferedImage@57a77532: type = 1 DirectColorModel: rmask=ff0000 gmask=ff00 
bmask=ff amask=0 IntegerInterleavedRaster: width = 600 height = 400 #Bands = 3 xOff = 0 yOff = 0 dataOffset[0] 0" ] }, "execution_count": 19, @@ -511,7 +511,7 @@ "outputs": [], "source": [ "var trainer = new HdbscanTrainer(5, // The minimum cluster size\n", - " DistanceType.L2, // The distance function \n", + " DistanceType.L2.getDistance(), // The distance function \n", " 5, // The number of neighbors to use to calculate the core-distance\n", " 4, // The number of compute threads\n", " NeighboursQueryFactoryType.BRUTE_FORCE // The nearest neighbour query algorithm\n", @@ -541,7 +541,7 @@ "mimetype": "text/x-java-source", "name": "Java", "pygments_lexer": "java", - "version": "11.0.12+8-LTS-237" + "version": "12+33" } }, "nbformat": 4, diff --git a/tutorials/clustering-tribuo-v4.ipynb b/tutorials/clustering-tribuo-v4.ipynb index c258ece5a..b72073e42 100644 --- a/tutorials/clustering-tribuo-v4.ipynb +++ b/tutorials/clustering-tribuo-v4.ipynb @@ -19,7 +19,7 @@ "metadata": {}, "outputs": [], "source": [ - "%jars ./tribuo-clustering-kmeans-4.3.0-SNAPSHOT-jar-with-dependencies.jar" + "%jars ./tribuo-clustering-kmeans-4.3.0-jar-with-dependencies.jar" ] }, { @@ -98,14 +98,15 @@ "name": "stdout", "output_type": "stream", "text": [ - "Training with 5 clusters took (00:00:00:147)\n" + "Training with 5 clusters took (00:00:00:071)\n" ] } ], "source": [ + "var l2Dist = DistanceType.L2.getDistance();\n", "var trainer = new KMeansTrainer(5, /* centroids */\n", " 10, /* iterations */\n", - " DistanceType.L2, /* distance function */\n", + " l2Dist, /* distance function */\n", " 1, /* number of compute threads */\n", " 1 /* RNG seed */\n", " );\n", @@ -180,12 +181,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "Training with 5 clusters took (00:00:00:071)\n" + "Training with 5 clusters took (00:00:00:038)\n" ] } ], "source": [ - "var plusplusTrainer = new KMeansTrainer(5,10,DistanceType.L2,Initialisation.PLUSPLUS,1,1);\n", + "var plusplusTrainer = new 
KMeansTrainer(5,10,l2Dist,Initialisation.PLUSPLUS,1,1);\n", "var startTime = System.currentTimeMillis();\n", "var plusplusModel = plusplusTrainer.train(data);\n", "var endTime = System.currentTimeMillis();\n", @@ -253,7 +254,7 @@ "text/plain": [ "Clustering Evaluation\n", "Normalized MI = 0.8128096132028937\n", - "Adjusted MI = 0.8113314999600718" + "Adjusted MI = 0.8113314999600724" ] }, "execution_count": 9, @@ -283,7 +284,7 @@ "text/plain": [ "Clustering Evaluation\n", "Normalized MI = 0.8154291916732408\n", - "Adjusted MI = 0.8139169342020222" + "Adjusted MI = 0.8139169341974347" ] }, "execution_count": 10, @@ -315,7 +316,7 @@ "text/plain": [ "Clustering Evaluation\n", "Normalized MI = 0.7881995472105396\n", - "Adjusted MI = 0.7864797287891366" + "Adjusted MI = 0.7864797287891137" ] }, "execution_count": 11, @@ -352,13 +353,13 @@ "name": "stdout", "output_type": "stream", "text": [ - "Training with 5 clusters on 4 threads took (00:00:00:119)\n" + "Training with 5 clusters on 4 threads took (00:00:00:053)\n" ] } ], "source": [ "var mtData = new MutableDataset<>(new GaussianClusterDataSource(2000, 1L));\n", - "var mtTrainer = new KMeansTrainer(5,10,DistanceType.L2,4,1);\n", + "var mtTrainer = new KMeansTrainer(5,10,l2Dist,4,1);\n", "var mtStartTime = System.currentTimeMillis();\n", "var mtModel = mtTrainer.train(mtData);\n", "var mtEndTime = System.currentTimeMillis();\n", @@ -381,12 +382,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "Training with 20 clusters on 4 threads took (00:00:00:099)\n" + "Training with 20 clusters on 4 threads took (00:00:00:034)\n" ] } ], "source": [ - "var overTrainer = new KMeansTrainer(20,10,DistanceType.L2,4,1);\n", + "var overTrainer = new KMeansTrainer(20,10,l2Dist,4,1);\n", "var overStartTime = System.currentTimeMillis();\n", "var overModel = overTrainer.train(mtData);\n", "var overEndTime = System.currentTimeMillis();\n", @@ -410,7 +411,7 @@ "text/plain": [ "Clustering Evaluation\n", "Normalized MI = 
0.8104463467727057\n", - "Adjusted MI = 0.8088941747451207" + "Adjusted MI = 0.8088941747417295" ] }, "execution_count": 14, @@ -440,7 +441,7 @@ "text/plain": [ "Clustering Evaluation\n", "Normalized MI = 0.8647317143685641\n", - "Adjusted MI = 0.860327445295668" + "Adjusted MI = 0.8603214693630152" ] }, "execution_count": 15, @@ -483,7 +484,7 @@ "mimetype": "text/x-java-source", "name": "Java", "pygments_lexer": "java", - "version": "11.0.12+8-LTS-237" + "version": "12+33" } }, "nbformat": 4, diff --git a/tutorials/columnar-tribuo-v4.ipynb b/tutorials/columnar-tribuo-v4.ipynb index 7b43c2ed5..82928d686 100644 --- a/tutorials/columnar-tribuo-v4.ipynb +++ b/tutorials/columnar-tribuo-v4.ipynb @@ -38,8 +38,8 @@ "metadata": {}, "outputs": [], "source": [ - "%jars ./tribuo-classification-experiments-4.3.0-SNAPSHOT-jar-with-dependencies.jar\n", - "%jars ./tribuo-json-4.3.0-SNAPSHOT-jar-with-dependencies.jar" + "%jars ./tribuo-classification-experiments-4.3.0-jar-with-dependencies.jar\n", + "%jars ./tribuo-json-4.3.0-jar-with-dependencies.jar" ] }, { @@ -123,7 +123,18 @@ "cell_type": "code", "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "true" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "var textPipeline = new BasicPipeline(new BreakIteratorTokenizer(Locale.US),2);\n", "var fieldProcessors = new ArrayList();\n", @@ -573,7 +584,7 @@ "mimetype": "text/x-java-source", "name": "Java", "pygments_lexer": "java", - "version": "17+35-LTS-2724" + "version": "12+33" } }, "nbformat": 4, diff --git a/tutorials/configuration-tribuo-v4.ipynb b/tutorials/configuration-tribuo-v4.ipynb index 6d06a20a6..0d875d1f8 100644 --- a/tutorials/configuration-tribuo-v4.ipynb +++ b/tutorials/configuration-tribuo-v4.ipynb @@ -32,8 +32,8 @@ "metadata": {}, "outputs": [], "source": [ - "%jars ./tribuo-classification-experiments-4.3.0-SNAPSHOT-jar-with-dependencies.jar\n", - "%jars 
./tribuo-json-4.3.0-SNAPSHOT-jar-with-dependencies.jar" + "%jars ./tribuo-classification-experiments-4.3.0-jar-with-dependencies.jar\n", + "%jars ./tribuo-json-4.3.0-jar-with-dependencies.jar" ] }, { @@ -487,7 +487,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Training logistic regression took (00:00:03:494)\n" + "Training logistic regression took (00:00:04:874)\n" ] } ], @@ -554,9 +554,9 @@ " \"export\" : \"false\",\n", " \"import\" : \"false\",\n", " \"properties\" : {\n", - " \"outputPath\" : \"/Users/apocock/Development/Tribuo/tutorials/train-labels-idx1-ubyte.gz\",\n", + " \"outputPath\" : \"/local/ExternalRepositories/tribuo/tutorials/train-labels-idx1-ubyte.gz\",\n", " \"outputFactory\" : \"labelfactory-4\",\n", - " \"featuresPath\" : \"/Users/apocock/Development/Tribuo/tutorials/train-images-idx3-ubyte.gz\"\n", + " \"featuresPath\" : \"/local/ExternalRepositories/tribuo/tutorials/train-images-idx3-ubyte.gz\"\n", " }\n", " }, {\n", " \"name\" : \"linearsgdtrainer-0\",\n", @@ -800,14 +800,14 @@ "\t\t\t\t\tclass-name = org.tribuo.MutableDataset\n", "\t\t\t\t\tdatasource = IDXDataSource(\n", "\t\t\t\t\t\t\tclass-name = org.tribuo.datasource.IDXDataSource\n", - "\t\t\t\t\t\t\toutputPath = /Users/apocock/Development/Tribuo/tutorials/train-labels-idx1-ubyte.gz\n", + "\t\t\t\t\t\t\toutputPath = /local/ExternalRepositories/tribuo/tutorials/train-labels-idx1-ubyte.gz\n", "\t\t\t\t\t\t\toutputFactory = LabelFactory(\n", "\t\t\t\t\t\t\t\t\tclass-name = org.tribuo.classification.LabelFactory\n", "\t\t\t\t\t\t\t\t)\n", - "\t\t\t\t\t\t\tfeaturesPath = /Users/apocock/Development/Tribuo/tutorials/train-images-idx3-ubyte.gz\n", + "\t\t\t\t\t\t\tfeaturesPath = /local/ExternalRepositories/tribuo/tutorials/train-images-idx3-ubyte.gz\n", "\t\t\t\t\t\t\tfeatures-file-modified-time = 2000-07-21T14:20:24-04:00\n", "\t\t\t\t\t\t\toutput-resource-hash = 3552534A0A558BBED6AED32B30C495CCA23D567EC52CAC8BE1A0730E8010255C\n", - "\t\t\t\t\t\t\tdatasource-creation-time = 
2021-12-18T20:07:51.388837-05:00\n", + "\t\t\t\t\t\t\tdatasource-creation-time = 2022-10-07T11:33:57.506314-04:00\n", "\t\t\t\t\t\t\toutput-file-modified-time = 2000-07-21T14:20:27-04:00\n", "\t\t\t\t\t\t\tidx-feature-type = UBYTE\n", "\t\t\t\t\t\t\tfeatures-resource-hash = 440FCABF73CC546FA21475E81EA370265605F56BE210A4024D2CA8F203523609\n", @@ -819,7 +819,7 @@ "\t\t\t\t\tnum-examples = 60000\n", "\t\t\t\t\tnum-features = 717\n", "\t\t\t\t\tnum-outputs = 10\n", - "\t\t\t\t\ttribuo-version = 4.2.0\n", + "\t\t\t\t\ttribuo-version = 4.3.0\n", "\t\t\t\t)\n", "\t\t\ttrainer = LinearSGDTrainer(\n", "\t\t\t\t\tclass-name = org.tribuo.classification.sgd.linear.LinearSGDTrainer\n", @@ -839,32 +839,32 @@ "\t\t\t\t\t\t\tclass-name = org.tribuo.classification.sgd.objectives.LogMulticlass\n", "\t\t\t\t\t\t\thost-short-name = LabelObjective\n", "\t\t\t\t\t\t)\n", - "\t\t\t\t\ttribuo-version = 4.2.0\n", + "\t\t\t\t\ttribuo-version = 4.3.0\n", "\t\t\t\t\ttrain-invocation-count = 0\n", "\t\t\t\t\tis-sequence = false\n", "\t\t\t\t\thost-short-name = Trainer\n", "\t\t\t\t)\n", - "\t\t\ttrained-at = 2021-12-18T20:07:55.508414-05:00\n", + "\t\t\ttrained-at = 2022-10-07T11:34:03.181752-04:00\n", "\t\t\tinstance-values = Map{\n", "\t\t\t\treconfigured-model=true\n", "\t\t\t}\n", - "\t\t\ttribuo-version = 4.2.0\n", - "\t\t\tjava-version = 17.0.1\n", - "\t\t\tos-name = Mac OS X\n", - "\t\t\tos-arch = x86_64\n", + "\t\t\ttribuo-version = 4.3.0\n", + "\t\t\tjava-version = 12\n", + "\t\t\tos-name = Linux\n", + "\t\t\tos-arch = amd64\n", "\t\t)\n", "\tdataset-provenance = MutableDataset(\n", "\t\t\tclass-name = org.tribuo.MutableDataset\n", "\t\t\tdatasource = IDXDataSource(\n", "\t\t\t\t\tclass-name = org.tribuo.datasource.IDXDataSource\n", - "\t\t\t\t\toutputPath = /Users/apocock/Development/Tribuo/tutorials/t10k-labels-idx1-ubyte.gz\n", + "\t\t\t\t\toutputPath = /local/ExternalRepositories/tribuo/tutorials/t10k-labels-idx1-ubyte.gz\n", "\t\t\t\t\toutputFactory = LabelFactory(\n", 
"\t\t\t\t\t\t\tclass-name = org.tribuo.classification.LabelFactory\n", "\t\t\t\t\t\t)\n", - "\t\t\t\t\tfeaturesPath = /Users/apocock/Development/Tribuo/tutorials/t10k-images-idx3-ubyte.gz\n", + "\t\t\t\t\tfeaturesPath = /local/ExternalRepositories/tribuo/tutorials/t10k-images-idx3-ubyte.gz\n", "\t\t\t\t\tfeatures-file-modified-time = 2000-07-21T14:19:56-04:00\n", "\t\t\t\t\toutput-resource-hash = F7AE60F92E00EC6DEBD23A6088C31DBD2371ECA3FFA0DEFAEFB259924204AEC6\n", - "\t\t\t\t\tdatasource-creation-time = 2021-12-18T20:07:41.373899-05:00\n", + "\t\t\t\t\tdatasource-creation-time = 2022-10-07T11:33:44.880399-04:00\n", "\t\t\t\t\toutput-file-modified-time = 2000-07-21T14:20:05-04:00\n", "\t\t\t\t\tidx-feature-type = UBYTE\n", "\t\t\t\t\tfeatures-resource-hash = 8D422C7B0A1C1C79245A5BCF07FE86E33EEAFEE792B84584AEC276F5A2DBC4E6\n", @@ -876,9 +876,9 @@ "\t\t\tnum-examples = 10000\n", "\t\t\tnum-features = 668\n", "\t\t\tnum-outputs = 10\n", - "\t\t\ttribuo-version = 4.2.0\n", + "\t\t\ttribuo-version = 4.3.0\n", "\t\t)\n", - "\ttribuo-version = 4.2.0\n", + "\ttribuo-version = 4.3.0\n", ")\n" ] } @@ -906,7 +906,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Training transformed logistic regression took (00:00:04:707)\n" + "Training transformed logistic regression took (00:00:06:555)\n" ] } ], @@ -1049,9 +1049,9 @@ " \"export\" : \"false\",\n", " \"import\" : \"false\",\n", " \"properties\" : {\n", - " \"outputPath\" : \"/Users/apocock/Development/Tribuo/tutorials/train-labels-idx1-ubyte.gz\",\n", + " \"outputPath\" : \"/local/ExternalRepositories/tribuo/tutorials/train-labels-idx1-ubyte.gz\",\n", " \"outputFactory\" : \"labelfactory-7\",\n", - " \"featuresPath\" : \"/Users/apocock/Development/Tribuo/tutorials/train-images-idx3-ubyte.gz\"\n", + " \"featuresPath\" : \"/local/ExternalRepositories/tribuo/tutorials/train-images-idx3-ubyte.gz\"\n", " }\n", " }, {\n", " \"name\" : \"transformationmap-1\",\n", @@ -1113,7 +1113,7 @@ "mimetype": "text/x-java-source", 
"name": "Java", "pygments_lexer": "java", - "version": "17+35-LTS-2724" + "version": "12+33" } }, "nbformat": 4, diff --git a/tutorials/document-classification-tribuo-v4.ipynb b/tutorials/document-classification-tribuo-v4.ipynb index ab08e7411..4d93b5b77 100644 --- a/tutorials/document-classification-tribuo-v4.ipynb +++ b/tutorials/document-classification-tribuo-v4.ipynb @@ -49,8 +49,8 @@ "metadata": {}, "outputs": [], "source": [ - "%jars ./tribuo-classification-experiments-4.3.0-SNAPSHOT-jar-with-dependencies.jar\n", - "%jars ./tribuo-onnx-4.3.0-SNAPSHOT-jar-with-dependencies.jar" + "%jars ./tribuo-classification-experiments-4.3.0-jar-with-dependencies.jar\n", + "%jars ./tribuo-onnx-4.3.0-jar-with-dependencies.jar" ] }, { @@ -198,34 +198,34 @@ "name": "stdout", "output_type": "stream", "text": [ - "Training the model on BoW features took (00:00:09:601)\n", + "Training the model on BoW features took (00:00:10:110)\n", "\n", "Class n tp fn fp recall prec f1\n", - "soc.religion.christian 398 352 46 110 0.884 0.762 0.819\n", - "rec.autos 396 344 52 63 0.869 0.845 0.857\n", - "talk.religion.misc 251 166 85 120 0.661 0.580 0.618\n", - "comp.windows.x 395 283 112 55 0.716 0.837 0.772\n", - "rec.sport.baseball 397 370 27 45 0.932 0.892 0.911\n", - "comp.graphics 389 293 96 143 0.753 0.672 0.710\n", - "talk.politics.mideast 376 283 93 11 0.753 0.963 0.845\n", - "comp.sys.ibm.pc.hardware 392 277 115 160 0.707 0.634 0.668\n", - "sci.med 396 323 73 43 0.816 0.883 0.848\n", - "comp.os.ms-windows.misc 394 272 122 87 0.690 0.758 0.722\n", - "sci.crypt 396 349 47 23 0.881 0.938 0.909\n", - "comp.sys.mac.hardware 385 283 102 96 0.735 0.747 0.741\n", - "misc.forsale 390 341 49 63 0.874 0.844 0.859\n", - "rec.motorcycles 398 364 34 23 0.915 0.941 0.927\n", - "talk.politics.misc 310 182 128 94 0.587 0.659 0.621\n", - "sci.electronics 393 272 121 135 0.692 0.668 0.680\n", - "rec.sport.hockey 399 367 32 24 0.920 0.939 0.929\n", - "sci.space 394 325 69 56 0.825 0.853 0.839\n", - 
"alt.atheism 319 243 76 75 0.762 0.764 0.763\n", - "talk.politics.guns 364 303 61 114 0.832 0.727 0.776\n", - "Total 7,532 5,992 1,540 1,540\n", - "Accuracy 0.796\n", - "Micro Average 0.796 0.796 0.796\n", - "Macro Average 0.790 0.795 0.791\n", - "Balanced Error Rate 0.210\n" + "soc.religion.christian 398 346 52 93 0.869 0.788 0.827\n", + "rec.autos 396 349 47 79 0.881 0.815 0.847\n", + "talk.religion.misc 251 154 97 109 0.614 0.586 0.599\n", + "comp.windows.x 395 293 102 66 0.742 0.816 0.777\n", + "rec.sport.baseball 397 368 29 45 0.927 0.891 0.909\n", + "talk.politics.mideast 376 286 90 22 0.761 0.929 0.836\n", + "comp.graphics 389 285 104 163 0.733 0.636 0.681\n", + "comp.sys.ibm.pc.hardware 392 291 101 165 0.742 0.638 0.686\n", + "sci.med 396 299 97 60 0.755 0.833 0.792\n", + "comp.os.ms-windows.misc 394 241 153 74 0.612 0.765 0.680\n", + "sci.crypt 396 346 50 45 0.874 0.885 0.879\n", + "comp.sys.mac.hardware 385 294 91 85 0.764 0.776 0.770\n", + "talk.politics.misc 310 170 140 96 0.548 0.639 0.590\n", + "rec.motorcycles 398 370 28 25 0.930 0.937 0.933\n", + "misc.forsale 390 344 46 67 0.882 0.837 0.859\n", + "sci.electronics 393 269 124 112 0.684 0.706 0.695\n", + "rec.sport.hockey 399 371 28 18 0.930 0.954 0.942\n", + "sci.space 394 324 70 44 0.822 0.880 0.850\n", + "alt.atheism 319 240 79 97 0.752 0.712 0.732\n", + "talk.politics.guns 364 308 56 119 0.846 0.721 0.779\n", + "Total 7,532 5,948 1,584 1,584\n", + "Accuracy 0.790\n", + "Micro Average 0.790 0.790 0.790\n", + "Macro Average 0.783 0.787 0.783\n", + "Balanced Error Rate 0.217\n" ] } ], @@ -244,7 +244,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We got a macro F1 score of 79.6%, which is a fairly good starting point and it's roughly what other linear models get on this task (e.g., scikit-learn's text classification tutorial gets 76.9% macro F1 when using a similar multinomial Naive Bayes model)." 
We got an accuracy (micro-averaged F1) of 79.0%, which is a fairly good starting point and it's roughly what other linear models get on this task (e.g., scikit-learn's text classification tutorial gets 76.9% macro F1 when using a similar multinomial Naive Bayes model).
395 291 104 59 0.737 0.831 0.781\n", + "rec.sport.baseball 397 361 36 43 0.909 0.894 0.901\n", + "talk.politics.mideast 376 274 102 27 0.729 0.910 0.809\n", + "comp.graphics 389 288 101 114 0.740 0.716 0.728\n", + "comp.sys.ibm.pc.hardware 392 279 113 141 0.712 0.664 0.687\n", + "sci.med 396 310 86 73 0.783 0.809 0.796\n", + "comp.os.ms-windows.misc 394 263 131 98 0.668 0.729 0.697\n", + "sci.crypt 396 344 52 54 0.869 0.864 0.866\n", + "comp.sys.mac.hardware 385 301 84 82 0.782 0.786 0.784\n", + "talk.politics.misc 310 168 142 121 0.542 0.581 0.561\n", + "rec.motorcycles 398 365 33 31 0.917 0.922 0.919\n", + "misc.forsale 390 321 69 70 0.823 0.821 0.822\n", + "sci.electronics 393 282 111 117 0.718 0.707 0.712\n", + "rec.sport.hockey 399 380 19 29 0.952 0.929 0.941\n", + "sci.space 394 323 71 35 0.820 0.902 0.859\n", + "alt.atheism 319 246 73 105 0.771 0.701 0.734\n", + "talk.politics.guns 364 300 64 103 0.824 0.744 0.782\n", + "Total 7,532 5,949 1,583 1,583\n", + "Accuracy 0.790\n", + "Micro Average 0.790 0.790 0.790\n", + "Macro Average 0.784 0.785 0.783\n", + "Balanced Error Rate 0.216\n" ] } ], @@ -336,7 +336,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We see that the logistic regression trained on unigrams gets about 80% accuracy, pretty much the same as the BoW baseline, and takes about the same amount of time to run. Both of these make sense, as the term count isn't necessarily that useful in this particular dataset, and we didn't change the number of features overall or inside each example by using term counting.\n", + "We see that the logistic regression trained on unigrams gets about 79% accuracy, pretty much the same as the BoW baseline, and takes about the same amount of time to run. 
Both of these make sense, as the term count isn't necessarily that useful in this particular dataset, and we didn't change the number of features overall or inside each example by using term counting.\n", "\n", "\n", "## N-grams as features\n", @@ -381,34 +381,34 @@ "name": "stdout", "output_type": "stream", "text": [ - "Training the model on Bigram features took (00:00:43:790)\n", + "Training the model on Bigram features took (00:00:32:704)\n", "\n", "Class n tp fn fp recall prec f1\n", - "soc.religion.christian 398 331 67 57 0.832 0.853 0.842\n", - "rec.autos 396 326 70 55 0.823 0.856 0.839\n", - "talk.religion.misc 251 167 84 106 0.665 0.612 0.637\n", - "comp.windows.x 395 297 98 57 0.752 0.839 0.793\n", - "rec.sport.baseball 397 357 40 52 0.899 0.873 0.886\n", - "comp.graphics 389 304 85 196 0.781 0.608 0.684\n", - "talk.politics.mideast 376 300 76 48 0.798 0.862 0.829\n", - "comp.sys.ibm.pc.hardware 392 244 148 104 0.622 0.701 0.659\n", - "sci.med 396 298 98 66 0.753 0.819 0.784\n", - "comp.os.ms-windows.misc 394 260 134 99 0.660 0.724 0.691\n", - "sci.crypt 396 327 69 37 0.826 0.898 0.861\n", - "comp.sys.mac.hardware 385 320 65 162 0.831 0.664 0.738\n", - "misc.forsale 390 352 38 102 0.903 0.775 0.834\n", - "rec.motorcycles 398 359 39 39 0.902 0.902 0.902\n", - "talk.politics.misc 310 185 125 93 0.597 0.665 0.629\n", - "sci.electronics 393 253 140 90 0.644 0.738 0.688\n", - "rec.sport.hockey 399 370 29 30 0.927 0.925 0.926\n", - "sci.space 394 336 58 40 0.853 0.894 0.873\n", - "alt.atheism 319 225 94 65 0.705 0.776 0.739\n", - "talk.politics.guns 364 309 55 114 0.849 0.730 0.785\n", - "Total 7,532 5,920 1,612 1,612\n", - "Accuracy 0.786\n", - "Micro Average 0.786 0.786 0.786\n", - "Macro Average 0.781 0.786 0.781\n", - "Balanced Error Rate 0.219\n" + "soc.religion.christian 398 328 70 38 0.824 0.896 0.859\n", + "rec.autos 396 328 68 53 0.828 0.861 0.844\n", + "talk.religion.misc 251 165 86 93 0.657 0.640 0.648\n", + "comp.windows.x 395 296 99 72 0.749 0.804 
0.776\n", + "rec.sport.baseball 397 357 40 51 0.899 0.875 0.887\n", + "talk.politics.mideast 376 292 84 38 0.777 0.885 0.827\n", + "comp.graphics 389 288 101 205 0.740 0.584 0.653\n", + "comp.sys.ibm.pc.hardware 392 281 111 168 0.717 0.626 0.668\n", + "sci.med 396 289 107 60 0.730 0.828 0.776\n", + "comp.os.ms-windows.misc 394 262 132 86 0.665 0.753 0.706\n", + "sci.crypt 396 343 53 65 0.866 0.841 0.853\n", + "comp.sys.mac.hardware 385 289 96 110 0.751 0.724 0.737\n", + "talk.politics.misc 310 176 134 63 0.568 0.736 0.641\n", + "rec.motorcycles 398 366 32 51 0.920 0.878 0.898\n", + "misc.forsale 390 341 49 71 0.874 0.828 0.850\n", + "sci.electronics 393 240 153 67 0.611 0.782 0.686\n", + "rec.sport.hockey 399 366 33 28 0.917 0.929 0.923\n", + "sci.space 394 334 60 51 0.848 0.868 0.858\n", + "alt.atheism 319 256 63 137 0.803 0.651 0.719\n", + "talk.politics.guns 364 300 64 128 0.824 0.701 0.758\n", + "Total 7,532 5,897 1,635 1,635\n", + "Accuracy 0.783\n", + "Micro Average 0.783 0.783 0.783\n", + "Macro Average 0.778 0.784 0.778\n", + "Balanced Error Rate 0.222\n" ] } ], @@ -426,7 +426,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Our performance decreased a little when using bigrams to 78%, and the runtime increased from 10s to 48s. This is because despite there being more information in the features, there are also many, many more features making it easier to confuse this simple linear model plus each example takes longer to process due to the greatly increased number of features. We could look at using a more complex model like boosted trees to exploit this additional information which may increase the performance back above our baseline. 
We could further increase number of n-gram features but we'll start to see diminishing returns even with more powerful models as the dimensionality of the feature space increases without a commensurate increase in training data.\n", + "Our performance decreased a little when using bigrams to 78%, and the runtime increased from 12s to 32s. This is because despite there being more information in the features, there are also many, many more features making it easier to confuse this simple linear model plus each example takes longer to process due to the greatly increased number of features. We could look at using a more complex model like boosted trees to exploit this additional information which may increase the performance back above our baseline. We could further increase number of n-gram features but we'll start to see diminishing returns even with more powerful models as the dimensionality of the feature space increases without a commensurate increase in training data.\n", "\n", "## TFIDF vectors\n", "\n", @@ -480,34 +480,34 @@ "name": "stdout", "output_type": "stream", "text": [ - "Training the model on TF-IDF features took (00:00:45:063)\n", + "Training the model on TF-IDF features took (00:00:33:661)\n", "\n", "Class n tp fn fp recall prec f1\n", - "soc.religion.christian 398 350 48 183 0.879 0.657 0.752\n", - "rec.autos 396 332 64 68 0.838 0.830 0.834\n", - "talk.religion.misc 251 155 96 111 0.618 0.583 0.600\n", - "comp.windows.x 395 290 105 58 0.734 0.833 0.781\n", - "rec.sport.baseball 397 345 52 26 0.869 0.930 0.898\n", - "comp.graphics 389 264 125 111 0.679 0.704 0.691\n", - "talk.politics.mideast 376 306 70 32 0.814 0.905 0.857\n", - "comp.sys.ibm.pc.hardware 392 285 107 170 0.727 0.626 0.673\n", - "sci.med 396 305 91 63 0.770 0.829 0.798\n", - "comp.os.ms-windows.misc 394 248 146 71 0.629 0.777 0.696\n", - "sci.crypt 396 340 56 47 0.859 0.879 0.868\n", - "comp.sys.mac.hardware 385 283 102 69 0.735 0.804 0.768\n", - "misc.forsale 390 340 50 79 0.872 
0.811 0.841\n", - "rec.motorcycles 398 359 39 36 0.902 0.909 0.905\n", - "talk.politics.misc 310 191 119 130 0.616 0.595 0.605\n", - "sci.electronics 393 292 101 112 0.743 0.723 0.733\n", - "rec.sport.hockey 399 376 23 32 0.942 0.922 0.932\n", - "sci.space 394 339 55 52 0.860 0.867 0.864\n", - "alt.atheism 319 226 93 57 0.708 0.799 0.751\n", - "talk.politics.guns 364 303 61 96 0.832 0.759 0.794\n", - "Total 7,532 5,929 1,603 1,603\n", - "Accuracy 0.787\n", - "Micro Average 0.787 0.787 0.787\n", - "Macro Average 0.781 0.787 0.782\n", - "Balanced Error Rate 0.219\n" + "soc.religion.christian 398 329 69 83 0.827 0.799 0.812\n", + "rec.autos 396 338 58 63 0.854 0.843 0.848\n", + "talk.religion.misc 251 171 80 83 0.681 0.673 0.677\n", + "comp.windows.x 395 317 78 86 0.803 0.787 0.794\n", + "rec.sport.baseball 397 356 41 31 0.897 0.920 0.908\n", + "talk.politics.mideast 376 310 66 28 0.824 0.917 0.868\n", + "comp.graphics 389 273 116 114 0.702 0.705 0.704\n", + "comp.sys.ibm.pc.hardware 392 268 124 128 0.684 0.677 0.680\n", + "sci.med 396 325 71 143 0.821 0.694 0.752\n", + "comp.os.ms-windows.misc 394 263 131 77 0.668 0.774 0.717\n", + "sci.crypt 396 338 58 60 0.854 0.849 0.851\n", + "comp.sys.mac.hardware 385 285 100 69 0.740 0.805 0.771\n", + "talk.politics.misc 310 181 129 55 0.584 0.767 0.663\n", + "rec.motorcycles 398 362 36 47 0.910 0.885 0.897\n", + "misc.forsale 390 331 59 64 0.849 0.838 0.843\n", + "sci.electronics 393 251 142 84 0.639 0.749 0.690\n", + "rec.sport.hockey 399 369 30 15 0.925 0.961 0.943\n", + "sci.space 394 350 44 113 0.888 0.756 0.817\n", + "alt.atheism 319 251 68 80 0.787 0.758 0.772\n", + "talk.politics.guns 364 315 49 126 0.865 0.714 0.783\n", + "Total 7,532 5,983 1,549 1,549\n", + "Accuracy 0.794\n", + "Micro Average 0.794 0.794 0.794\n", + "Macro Average 0.790 0.794 0.790\n", + "Balanced Error Rate 0.210\n" ] } ], @@ -570,34 +570,34 @@ "name": "stdout", "output_type": "stream", "text": [ - "Training the model on hashed features took 
(00:00:23:354)\n", + "Training the model on hashed features took (00:00:18:148)\n", "\n", "Class n tp fn fp recall prec f1\n", - "soc.religion.christian 398 306 92 125 0.769 0.710 0.738\n", - "rec.autos 396 324 72 77 0.818 0.808 0.813\n", - "talk.religion.misc 251 139 112 132 0.554 0.513 0.533\n", - "comp.windows.x 395 273 122 78 0.691 0.778 0.732\n", - "rec.sport.baseball 397 335 62 64 0.844 0.840 0.842\n", - "comp.graphics 389 238 151 135 0.612 0.638 0.625\n", - "talk.politics.mideast 376 265 111 35 0.705 0.883 0.784\n", - "comp.sys.ibm.pc.hardware 392 276 116 178 0.704 0.608 0.652\n", - "sci.med 396 251 145 125 0.634 0.668 0.650\n", - "comp.os.ms-windows.misc 394 254 140 109 0.645 0.700 0.671\n", - "sci.crypt 396 305 91 36 0.770 0.894 0.828\n", - "comp.sys.mac.hardware 385 259 126 97 0.673 0.728 0.699\n", - "misc.forsale 390 325 65 87 0.833 0.789 0.810\n", - "rec.motorcycles 398 341 57 75 0.857 0.820 0.838\n", - "talk.politics.misc 310 171 139 195 0.552 0.467 0.506\n", - "sci.electronics 393 243 150 159 0.618 0.604 0.611\n", - "rec.sport.hockey 399 353 46 59 0.885 0.857 0.871\n", - "sci.space 394 305 89 49 0.774 0.862 0.816\n", - "alt.atheism 319 215 104 100 0.674 0.683 0.678\n", - "talk.politics.guns 364 292 72 147 0.802 0.665 0.727\n", - "Total 7,532 5,470 2,062 2,062\n", - "Accuracy 0.726\n", - "Micro Average 0.726 0.726 0.726\n", - "Macro Average 0.721 0.726 0.721\n", - "Balanced Error Rate 0.279\n" + "soc.religion.christian 398 293 105 77 0.736 0.792 0.763\n", + "rec.autos 396 304 92 75 0.768 0.802 0.785\n", + "talk.religion.misc 251 159 92 177 0.633 0.473 0.542\n", + "comp.windows.x 395 293 102 76 0.742 0.794 0.767\n", + "rec.sport.baseball 397 343 54 108 0.864 0.761 0.809\n", + "talk.politics.mideast 376 267 109 34 0.710 0.887 0.789\n", + "comp.graphics 389 254 135 121 0.653 0.677 0.665\n", + "comp.sys.ibm.pc.hardware 392 253 139 138 0.645 0.647 0.646\n", + "sci.med 396 281 115 109 0.710 0.721 0.715\n", + "comp.os.ms-windows.misc 394 240 154 98 0.609 
0.710 0.656\n", + "sci.crypt 396 330 66 81 0.833 0.803 0.818\n", + "comp.sys.mac.hardware 385 271 114 117 0.704 0.698 0.701\n", + "talk.politics.misc 310 174 136 160 0.561 0.521 0.540\n", + "rec.motorcycles 398 336 62 41 0.844 0.891 0.867\n", + "misc.forsale 390 334 56 83 0.856 0.801 0.828\n", + "sci.electronics 393 245 148 132 0.623 0.650 0.636\n", + "rec.sport.hockey 399 343 56 32 0.860 0.915 0.886\n", + "sci.space 394 306 88 75 0.777 0.803 0.790\n", + "alt.atheism 319 225 94 105 0.705 0.682 0.693\n", + "talk.politics.guns 364 300 64 142 0.824 0.679 0.744\n", + "Total 7,532 5,551 1,981 1,981\n", + "Accuracy 0.737\n", + "Micro Average 0.737 0.737 0.737\n", + "Macro Average 0.733 0.735 0.732\n", + "Balanced Error Rate 0.267\n" ] } ], @@ -662,34 +662,34 @@ "name": "stdout", "output_type": "stream", "text": [ - "Training the model on trimmed TF-IDF features took (00:00:19:928)\n", + "Training the model on trimmed TF-IDF features took (00:00:14:750)\n", "\n", "Class n tp fn fp recall prec f1\n", - "soc.religion.christian 398 337 61 93 0.847 0.784 0.814\n", - "rec.autos 396 312 84 60 0.788 0.839 0.813\n", - "talk.religion.misc 251 172 79 143 0.685 0.546 0.608\n", - "comp.windows.x 395 290 105 56 0.734 0.838 0.783\n", - "rec.sport.baseball 397 344 53 37 0.866 0.903 0.884\n", - "comp.graphics 389 284 105 112 0.730 0.717 0.724\n", - "talk.politics.mideast 376 301 75 19 0.801 0.941 0.865\n", - "comp.sys.ibm.pc.hardware 392 286 106 217 0.730 0.569 0.639\n", - "sci.med 396 295 101 74 0.745 0.799 0.771\n", - "comp.os.ms-windows.misc 394 219 175 52 0.556 0.808 0.659\n", - "sci.crypt 396 322 74 49 0.813 0.868 0.840\n", - "comp.sys.mac.hardware 385 287 98 125 0.745 0.697 0.720\n", - "misc.forsale 390 320 70 62 0.821 0.838 0.829\n", - "rec.motorcycles 398 353 45 45 0.887 0.887 0.887\n", - "talk.politics.misc 310 191 119 131 0.616 0.593 0.604\n", - "sci.electronics 393 298 95 148 0.758 0.668 0.710\n", - "rec.sport.hockey 399 370 29 41 0.927 0.900 0.914\n", - "sci.space 394 336 58 
64 0.853 0.840 0.846\n", - "alt.atheism 319 218 101 59 0.683 0.787 0.732\n", - "talk.politics.guns 364 302 62 108 0.830 0.737 0.780\n", - "Total 7,532 5,837 1,695 1,695\n", - "Accuracy 0.775\n", - "Micro Average 0.775 0.775 0.775\n", - "Macro Average 0.771 0.778 0.771\n", - "Balanced Error Rate 0.229\n" + "soc.religion.christian 398 346 52 128 0.869 0.730 0.794\n", + "rec.autos 396 314 82 68 0.793 0.822 0.807\n", + "talk.religion.misc 251 162 89 121 0.645 0.572 0.607\n", + "comp.windows.x 395 275 120 55 0.696 0.833 0.759\n", + "rec.sport.baseball 397 333 64 26 0.839 0.928 0.881\n", + "talk.politics.mideast 376 295 81 28 0.785 0.913 0.844\n", + "comp.graphics 389 280 109 173 0.720 0.618 0.665\n", + "comp.sys.ibm.pc.hardware 392 277 115 186 0.707 0.598 0.648\n", + "sci.med 396 304 92 127 0.768 0.705 0.735\n", + "comp.os.ms-windows.misc 394 239 155 72 0.607 0.768 0.678\n", + "sci.crypt 396 343 53 70 0.866 0.831 0.848\n", + "comp.sys.mac.hardware 385 259 126 60 0.673 0.812 0.736\n", + "talk.politics.misc 310 193 117 68 0.623 0.739 0.676\n", + "rec.motorcycles 398 359 39 65 0.902 0.847 0.873\n", + "misc.forsale 390 339 51 84 0.869 0.801 0.834\n", + "sci.electronics 393 246 147 103 0.626 0.705 0.663\n", + "rec.sport.hockey 399 367 32 16 0.920 0.958 0.939\n", + "sci.space 394 335 59 86 0.850 0.796 0.822\n", + "alt.atheism 319 237 82 63 0.743 0.790 0.766\n", + "talk.politics.guns 364 310 54 120 0.852 0.721 0.781\n", + "Total 7,532 5,813 1,719 1,719\n", + "Accuracy 0.772\n", + "Micro Average 0.772 0.772 0.772\n", + "Macro Average 0.768 0.774 0.768\n", + "Balanced Error Rate 0.232\n" ] } ], @@ -734,9 +734,9 @@ "python -m transformers.convert_graph_to_onnx --framework pt --model bert-base-uncased bert-base-uncased.onnx\n", "```\n", "\n", - "You'll also need to download the `tokenizer.json` that goes with the BERT variant you are using, for `bert-base-uncased` that file is [here](https://huggingface.co/bert-base-uncased/blob/main/tokenizer.json). 
Assuming both of those files are now in the same directory as this tutorial, we can create the `BERTFeatureExtractor`. We're going to take the average token embedding across the whole input, as the `[CLS]` token which provides the sentence embedding tends to perform poorly unless it is fine-tuned on your task.\n", + "You'll also need to download the `tokenizer.json` that goes with the BERT variant you are using, for `bert-base-uncased` that file is [here](https://huggingface.co/bert-base-uncased/raw/main/tokenizer.json). Assuming both of those files are now in the same directory as this tutorial, we can create the `BERTFeatureExtractor`. We're going to take the average token embedding across the whole input, as the `[CLS]` token which provides the sentence embedding tends to perform poorly unless it is fine-tuned on your task.\n", "\n", - "Warning: this feature extraction step took more than a minute per newsgroup on a 2019 16\" 6-core MacBook Pro (using the default settings of ONNX Runtime i.e., using a single thead on the CPU provider) so around 55 minutes to extract the full train and test datasets. Your mileage may vary, and your laptop may get quite warm. We recommend not running it while your laptop is actually on your lap. At the moment Tribuo's `TextFeatureExtractor` interface doesn't batch up the inputs, which limits the performance of contextual feature extractors. We'll look at expanding that interface to support batching in a future release. The session options used can be controlled by the `BERTFeatureExtractor.reconfigureOrtSession(SessionOptions options)` method, which allows the use of whatever configuration is supported by your onnxruntime jar." + "Warning: this feature extraction step took more than a minute per newsgroup on a 2019 16\" 6-core MacBook Pro (using the default settings of ONNX Runtime i.e., using a single thread on the CPU provider) so around 55 minutes to extract the full train and test datasets. 
Your mileage may vary, and your laptop may get quite warm. We recommend not running it while your laptop is actually on your lap. At the moment Tribuo's `TextFeatureExtractor` interface doesn't batch up the inputs, which limits the performance of contextual feature extractors. We'll look at expanding that interface to support batching in a future release. The session options used can be controlled by the `BERTFeatureExtractor.reconfigureOrtSession(SessionOptions options)` method, which allows the use of whatever configuration is supported by your ONNX Runtime jar." ] }, { @@ -750,7 +750,7 @@ "text": [ "bert training data size = 11314, number of features = 768, number of classes = 20\n", "bert testing data size = 7532, number of features = 768, number of classes = 20\n", - "Extracting features with BERT took (00:38:37:476)\n" + "Extracting features with BERT took (00:32:03:647)\n" ] } ], @@ -789,33 +789,33 @@ "name": "stdout", "output_type": "stream", "text": [ - "Training a LR on BERT features took (00:00:06:082)\n", + "Training a LR on BERT features took (00:00:03:802)\n", "Class n tp fn fp recall prec f1\n", - "soc.religion.christian 398 353 45 111 0.887 0.761 0.819\n", - "rec.autos 396 332 64 99 0.838 0.770 0.803\n", - "talk.religion.misc 251 102 149 131 0.406 0.438 0.421\n", - "comp.windows.x 395 288 107 121 0.729 0.704 0.716\n", - "rec.sport.baseball 397 365 32 32 0.919 0.919 0.919\n", - "comp.graphics 389 257 132 183 0.661 0.584 0.620\n", - "talk.politics.mideast 376 289 87 26 0.769 0.917 0.836\n", - "comp.sys.ibm.pc.hardware 392 220 172 166 0.561 0.570 0.566\n", - "sci.med 396 320 76 34 0.808 0.904 0.853\n", - "comp.os.ms-windows.misc 394 247 147 187 0.627 0.569 0.597\n", - "sci.crypt 396 314 82 95 0.793 0.768 0.780\n", - "comp.sys.mac.hardware 385 134 251 32 0.348 0.807 0.486\n", - "misc.forsale 390 342 48 103 0.877 0.769 0.819\n", - "rec.motorcycles 398 308 90 75 0.774 0.804 0.789\n", - "talk.politics.misc 310 186 124 226 0.600 0.451 0.515\n", - 
"sci.electronics 393 252 141 197 0.641 0.561 0.599\n", - "rec.sport.hockey 399 381 18 21 0.955 0.948 0.951\n", - "sci.space 394 332 62 78 0.843 0.810 0.826\n", - "alt.atheism 319 163 156 121 0.511 0.574 0.541\n", - "talk.politics.guns 364 210 154 99 0.577 0.680 0.624\n", - "Total 7,532 5,395 2,137 2,137\n", - "Accuracy 0.716\n", - "Micro Average 0.716 0.716 0.716\n", - "Macro Average 0.706 0.715 0.704\n", - "Balanced Error Rate 0.294\n" + "soc.religion.christian 398 367 31 114 0.922 0.763 0.835\n", + "rec.autos 396 306 90 51 0.773 0.857 0.813\n", + "talk.religion.misc 251 62 189 48 0.247 0.564 0.343\n", + "comp.windows.x 395 305 90 186 0.772 0.621 0.688\n", + "rec.sport.baseball 397 357 40 20 0.899 0.947 0.922\n", + "talk.politics.mideast 376 292 84 29 0.777 0.910 0.838\n", + "comp.graphics 389 243 146 153 0.625 0.614 0.619\n", + "comp.sys.ibm.pc.hardware 392 190 202 111 0.485 0.631 0.548\n", + "sci.med 396 332 64 53 0.838 0.862 0.850\n", + "comp.os.ms-windows.misc 394 202 192 92 0.513 0.687 0.587\n", + "sci.crypt 396 306 90 74 0.773 0.805 0.789\n", + "comp.sys.mac.hardware 385 263 122 201 0.683 0.567 0.620\n", + "talk.politics.misc 310 164 146 168 0.529 0.494 0.511\n", + "rec.motorcycles 398 337 61 118 0.847 0.741 0.790\n", + "misc.forsale 390 317 73 56 0.813 0.850 0.831\n", + "sci.electronics 393 234 159 124 0.595 0.654 0.623\n", + "rec.sport.hockey 399 380 19 21 0.952 0.948 0.950\n", + "sci.space 394 338 56 104 0.858 0.765 0.809\n", + "alt.atheism 319 200 119 175 0.627 0.533 0.576\n", + "talk.politics.guns 364 278 86 161 0.764 0.633 0.692\n", + "Total 7,532 5,473 2,059 2,059\n", + "Accuracy 0.727\n", + "Micro Average 0.727 0.727 0.727\n", + "Macro Average 0.715 0.722 0.712\n", + "Balanced Error Rate 0.285\n" ] } ], @@ -852,7 +852,7 @@ "text": [ "DirectoryFileSource(\n", "\tclass-name = org.tribuo.data.text.DirectoryFileSource\n", - "\tdataDir = /Users/apocock/Development/Tribuo/tutorials/20news/20news-bydate-train\n", + "\tdataDir = 
/local/ExternalRepositories/tribuo/tutorials/20news/20news-bydate-train\n", "\tpreprocessors = List[\n", "\t\tNewsPreprocessor(\n", "\t\t\t\t\tclass-name = org.tribuo.data.text.impl.NewsPreprocessor\n", @@ -868,8 +868,8 @@ "\t\t\tclass-name = org.tribuo.interop.onnx.extractors.BERTFeatureExtractor\n", "\t\t\tuseCUDA = false\n", "\t\t\tpooling = MEAN\n", - "\t\t\tmodelPath = /Users/apocock/Development/Tribuo/tutorials/bert-base-uncased.onnx\n", - "\t\t\ttokenizerPath = /Users/apocock/Development/Tribuo/tutorials/tokenizer.json\n", + "\t\t\tmodelPath = /local/ExternalRepositories/tribuo/tutorials/bert-base-uncased.onnx\n", + "\t\t\ttokenizerPath = /local/ExternalRepositories/tribuo/tutorials/tokenizer.json\n", "\t\t\toutputFactory = LabelFactory(\n", "\t\t\t\t\tclass-name = org.tribuo.classification.LabelFactory\n", "\t\t\t\t)\n", @@ -880,7 +880,7 @@ "\t\t\tclass-name = org.tribuo.classification.LabelFactory\n", "\t\t)\n", "\tfile-modified-time = 2003-03-18T07:24:55-05:00\n", - "\tdatasource-creation-time = 2021-12-18T20:50:57.169758-05:00\n", + "\tdatasource-creation-time = 2022-10-07T12:14:14.770299736-04:00\n", ")\n" ] } @@ -918,7 +918,7 @@ "mimetype": "text/x-java-source", "name": "Java", "pygments_lexer": "java", - "version": "17+35-LTS-2724" + "version": "17.0.4.1+1-LTS-2" } }, "nbformat": 4, diff --git a/tutorials/external-models-tribuo-v4.ipynb b/tutorials/external-models-tribuo-v4.ipynb index 0a3b57548..840fb0b1c 100644 --- a/tutorials/external-models-tribuo-v4.ipynb +++ b/tutorials/external-models-tribuo-v4.ipynb @@ -24,8 +24,8 @@ "metadata": {}, "outputs": [], "source": [ - "%jars tribuo-classification-experiments-4.3.0-SNAPSHOT-jar-with-dependencies.jar\n", - "%jars tribuo-onnx-4.3.0-SNAPSHOT-jar-with-dependencies.jar" + "%jars tribuo-classification-experiments-4.3.0-jar-with-dependencies.jar\n", + "%jars tribuo-onnx-4.3.0-jar-with-dependencies.jar" ] }, { @@ -469,7 +469,7 @@ "mimetype": "text/x-java-source", "name": "Java", "pygments_lexer": "java", - 
"version": "19" + "version": "12+33" } }, "nbformat": 4, diff --git a/tutorials/feature-selection-tribuo-v4.ipynb b/tutorials/feature-selection-tribuo-v4.ipynb index 9ed5a014a..f5f820d13 100644 --- a/tutorials/feature-selection-tribuo-v4.ipynb +++ b/tutorials/feature-selection-tribuo-v4.ipynb @@ -25,8 +25,8 @@ "metadata": {}, "outputs": [], "source": [ - "%jars tribuo-classification-sgd-4.3.0-SNAPSHOT-jar-with-dependencies.jar\n", - "%jars tribuo-classification-fs-4.3.0-SNAPSHOT-jar-with-dependencies.jar" + "%jars tribuo-classification-sgd-4.3.0-jar-with-dependencies.jar\n", + "%jars tribuo-classification-fs-4.3.0-jar-with-dependencies.jar" ] }, { @@ -118,7 +118,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Training factorization machine on 783 features took (00:00:30:968)\n" + "Training factorization machine on 783 features took (00:00:31:395)\n" ] } ], @@ -158,21 +158,21 @@ "data": { "text/plain": [ "Class n tp fn fp recall prec f1\n", - "0 980 962 18 37 0.982 0.963 0.972\n", - "1 1,135 1,117 18 11 0.984 0.990 0.987\n", + "0 980 962 18 35 0.982 0.965 0.973\n", + "1 1,135 1,118 17 11 0.985 0.990 0.988\n", "2 1,032 992 40 63 0.961 0.940 0.951\n", - "3 1,010 968 42 45 0.958 0.956 0.957\n", - "4 982 934 48 39 0.951 0.960 0.955\n", - "5 892 861 31 50 0.965 0.945 0.955\n", - "6 958 921 37 27 0.961 0.972 0.966\n", - "7 1,028 981 47 36 0.954 0.965 0.959\n", - "8 974 915 59 40 0.939 0.958 0.949\n", - "9 1,009 953 56 48 0.944 0.952 0.948\n", - "Total 10,000 9,604 396 396\n", - "Accuracy 0.960\n", - "Micro Average 0.960 0.960 0.960\n", - "Macro Average 0.960 0.960 0.960\n", - "Balanced Error Rate 0.040" + "3 1,010 969 41 48 0.959 0.953 0.956\n", + "4 982 937 45 41 0.954 0.958 0.956\n", + "5 892 861 31 46 0.965 0.949 0.957\n", + "6 958 920 38 24 0.960 0.975 0.967\n", + "7 1,028 980 48 34 0.953 0.966 0.960\n", + "8 974 919 55 38 0.944 0.960 0.952\n", + "9 1,009 954 55 48 0.945 0.952 0.949\n", + "Total 10,000 9,612 388 388\n", + "Accuracy 0.961\n", + "Micro 
Average 0.961 0.961 0.961\n", + "Macro Average 0.961 0.961 0.961\n", + "Balanced Error Rate 0.039" ] }, "execution_count": 5, @@ -207,7 +207,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Selecting the top 100 features with MIM took (00:00:02:213)\n" + "Selecting the top 100 features with MIM took (00:00:02:861)\n" ] } ], @@ -244,14 +244,14 @@ "\t\t\tclass-name = org.tribuo.MutableDataset\n", "\t\t\tdatasource = IDXDataSource(\n", "\t\t\t\t\tclass-name = org.tribuo.datasource.IDXDataSource\n", - "\t\t\t\t\toutputPath = /Users/craigacp/Development/tribuo/tutorials/train-labels-idx1-ubyte.gz\n", + "\t\t\t\t\toutputPath = /local/ExternalRepositories/tribuo/tutorials/train-labels-idx1-ubyte.gz\n", "\t\t\t\t\toutputFactory = LabelFactory(\n", "\t\t\t\t\t\t\tclass-name = org.tribuo.classification.LabelFactory\n", "\t\t\t\t\t\t)\n", - "\t\t\t\t\tfeaturesPath = /Users/craigacp/Development/tribuo/tutorials/train-images-idx3-ubyte.gz\n", + "\t\t\t\t\tfeaturesPath = /local/ExternalRepositories/tribuo/tutorials/train-images-idx3-ubyte.gz\n", "\t\t\t\t\tfeatures-file-modified-time = 2000-07-21T14:20:24-04:00\n", "\t\t\t\t\toutput-resource-hash = 3552534A0A558BBED6AED32B30C495CCA23D567EC52CAC8BE1A0730E8010255C\n", - "\t\t\t\t\tdatasource-creation-time = 2022-10-01T21:29:52.192904-04:00\n", + "\t\t\t\t\tdatasource-creation-time = 2022-10-07T13:13:41.469063236-04:00\n", "\t\t\t\t\toutput-file-modified-time = 2000-07-21T14:20:27-04:00\n", "\t\t\t\t\tidx-feature-type = UBYTE\n", "\t\t\t\t\tfeatures-resource-hash = 440FCABF73CC546FA21475E81EA370265605F56BE210A4024D2CA8F203523609\n", @@ -263,7 +263,7 @@ "\t\t\tnum-examples = 60000\n", "\t\t\tnum-features = 717\n", "\t\t\tnum-outputs = 10\n", - "\t\t\ttribuo-version = 4.3.0-SNAPSHOT\n", + "\t\t\ttribuo-version = 4.3.0\n", "\t\t)\n", "\tfeature-selector = MIM(\n", "\t\t\tclass-name = org.tribuo.classification.fs.MIM\n", @@ -271,7 +271,7 @@ "\t\t\tk = 100\n", "\t\t\thost-short-name = FeatureSelector\n", "\t\t)\n", - 
"\ttribuo-version = 4.3.0-SNAPSHOT\n", + "\ttribuo-version = 4.3.0\n", ")\n" ] } @@ -331,7 +331,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Training factorization machine on 100 features took (00:00:10:029)\n" + "Training factorization machine on 100 features took (00:00:12:582)\n" ] } ], @@ -410,7 +410,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Selecting the top 100 features with JMI took (00:01:15:685)\n", + "Selecting the top 100 features with JMI took (00:01:33:192)\n", "JMI feature set: [378, 461, 409, 568, 350, 434, 542, 406, 489, 596, 401, 381, 433, 377, 569, 462, 437, 514, 405, 155, 428, 597, 436, 373, 515, 351, 541, 543, 429, 460, 154, 488, 625, 400, 464, 567, 374, 379, 570, 375, 345, 540, 487, 456, 376, 346, 408, 490, 457, 318, 156, 516, 539, 290, 513, 459, 372, 595, 153, 486, 402, 323, 354, 347, 430, 626, 517, 458, 317, 432, 326, 407, 512, 427, 656, 349, 485, 404, 455, 263, 624, 353, 523, 598, 484, 403, 463, 571, 382, 511, 322, 291, 183, 435, 655, 544, 431, 483, 465, 410]\n" ] } @@ -443,7 +443,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Training factorization machine on 100 features took (00:00:09:881)\n" + "Training factorization machine on 100 features took (00:00:12:409)\n" ] } ], @@ -531,7 +531,7 @@ "mimetype": "text/x-java-source", "name": "Java", "pygments_lexer": "java", - "version": "19" + "version": "17.0.4.1+1-LTS-2" } }, "nbformat": 4, diff --git a/tutorials/irises-tribuo-v4.ipynb b/tutorials/irises-tribuo-v4.ipynb index 9eac6c82b..ad63acbbf 100644 --- a/tutorials/irises-tribuo-v4.ipynb +++ b/tutorials/irises-tribuo-v4.ipynb @@ -27,8 +27,8 @@ "metadata": {}, "outputs": [], "source": [ - "%jars ./tribuo-classification-experiments-4.3.0-SNAPSHOT-jar-with-dependencies.jar\n", - "%jars ./tribuo-json-4.3.0-SNAPSHOT-jar-with-dependencies.jar" + "%jars ./tribuo-classification-experiments-4.3.0-jar-with-dependencies.jar\n", + "%jars ./tribuo-json-4.3.0-jar-with-dependencies.jar" ] }, { @@ -408,10 
+408,10 @@ "\t\t\t\t\tclass-name = org.tribuo.classification.LabelFactory\n", "\t\t\t\t)\n", "\t\t\tseparator = ,\n", - "\t\t\tdataPath = /Users/craigacp/Development/tribuo/tutorials/bezdekIris.data\n", + "\t\t\tdataPath = /local/ExternalRepositories/tribuo/tutorials/bezdekIris.data\n", "\t\t\tresource-hash = 0FED2A99DB77EC533A62DC66894D3EC6DF3B58B6A8F3CF4A6B47E4086B7F97DC\n", "\t\t\tfile-modified-time = 1999-12-14T15:12:39-05:00\n", - "\t\t\tdatasource-creation-time = 2022-10-02T17:16:22.164235-04:00\n", + "\t\t\tdatasource-creation-time = 2022-10-07T11:20:06.279351-04:00\n", "\t\t\thost-short-name = DataSource\n", "\t\t)\n", "\ttrain-proportion = 0.7\n", @@ -463,7 +463,7 @@ "\t\t\tclass-name = org.tribuo.classification.sgd.objectives.LogMulticlass\n", "\t\t\thost-short-name = LabelObjective\n", "\t\t)\n", - "\ttribuo-version = 4.3.0-SNAPSHOT\n", + "\ttribuo-version = 4.3.0\n", "\ttrain-invocation-count = 0\n", "\tis-sequence = false\n", "\thost-short-name = Trainer\n", @@ -524,7 +524,7 @@ " \"tribuo-version\" : {\n", " \"marshalled-class\" : \"com.oracle.labs.mlrg.olcut.provenance.io.SimpleMarshalledProvenance\",\n", " \"key\" : \"tribuo-version\",\n", - " \"value\" : \"4.3.0-SNAPSHOT\",\n", + " \"value\" : \"4.3.0\",\n", " \"provenance-class\" : \"com.oracle.labs.mlrg.olcut.provenance.primitives.StringProvenance\",\n", " \"additional\" : \"\",\n", " \"is-reference\" : false\n", @@ -532,7 +532,7 @@ " \"java-version\" : {\n", " \"marshalled-class\" : \"com.oracle.labs.mlrg.olcut.provenance.io.SimpleMarshalledProvenance\",\n", " \"key\" : \"java-version\",\n", - " \"value\" : \"19\",\n", + " \"value\" : \"12\",\n", " \"provenance-class\" : \"com.oracle.labs.mlrg.olcut.provenance.primitives.StringProvenance\",\n", " \"additional\" : \"\",\n", " \"is-reference\" : false\n", @@ -548,7 +548,7 @@ " \"os-arch\" : {\n", " \"marshalled-class\" : \"com.oracle.labs.mlrg.olcut.provenance.io.SimpleMarshalledProvenance\",\n", " \"key\" : \"os-arch\",\n", - " \"value\" : 
\"aarch64\",\n", + " \"value\" : \"amd64\",\n", " \"provenance-class\" : \"com.oracle.labs.mlrg.olcut.provenance.primitives.StringProvenance\",\n", " \"additional\" : \"\",\n", " \"is-reference\" : false\n", @@ -556,7 +556,7 @@ " \"trained-at\" : {\n", " \"marshalled-class\" : \"com.oracle.labs.mlrg.olcut.provenance.io.SimpleMarshalledProvenance\",\n", " \"key\" : \"trained-at\",\n", - " \"value\" : \"2022-10-02T17:16:22.729336-04:00\",\n", + " \"value\" : \"2022-10-07T11:20:06.643297-04:00\",\n", " \"provenance-class\" : \"com.oracle.labs.mlrg.olcut.provenance.primitives.DateTimeProvenance\",\n", " \"additional\" : \"\",\n", " \"is-reference\" : false\n", @@ -564,7 +564,7 @@ " \"os-name\" : {\n", " \"marshalled-class\" : \"com.oracle.labs.mlrg.olcut.provenance.io.SimpleMarshalledProvenance\",\n", " \"key\" : \"os-name\",\n", - " \"value\" : \"Mac OS X\",\n", + " \"value\" : \"Linux\",\n", " \"provenance-class\" : \"com.oracle.labs.mlrg.olcut.provenance.primitives.StringProvenance\",\n", " \"additional\" : \"\",\n", " \"is-reference\" : false\n", @@ -619,7 +619,7 @@ " \"tribuo-version\" : {\n", " \"marshalled-class\" : \"com.oracle.labs.mlrg.olcut.provenance.io.SimpleMarshalledProvenance\",\n", " \"key\" : \"tribuo-version\",\n", - " \"value\" : \"4.3.0-SNAPSHOT\",\n", + " \"value\" : \"4.3.0\",\n", " \"provenance-class\" : \"com.oracle.labs.mlrg.olcut.provenance.primitives.StringProvenance\",\n", " \"additional\" : \"\",\n", " \"is-reference\" : false\n", @@ -678,7 +678,7 @@ " \"tribuo-version\" : {\n", " \"marshalled-class\" : \"com.oracle.labs.mlrg.olcut.provenance.io.SimpleMarshalledProvenance\",\n", " \"key\" : \"tribuo-version\",\n", - " \"value\" : \"4.3.0-SNAPSHOT\",\n", + " \"value\" : \"4.3.0\",\n", " \"provenance-class\" : \"com.oracle.labs.mlrg.olcut.provenance.primitives.StringProvenance\",\n", " \"additional\" : \"\",\n", " \"is-reference\" : false\n", @@ -896,14 +896,14 @@ " \"provenance-class\" : 
\"org.tribuo.data.csv.CSVDataSource$CSVDataSourceProvenance\",\n", " \"map\" : {\n", " \"resource-hash\" : {\n", - " \"marshalled-class\" : \"com.oracle.labs.mlrg.olcut.provenance.io.SimpleMarshalledProvenance\",\n" + " \"marshalled-class\" : \"com.oracle.labs.mlrg.olcut.provenance.io.SimpleMarshalledProvenance\",\n", + " \"key\" : \"resource-hash\",\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - " \"key\" : \"resource-hash\",\n", " \"value\" : \"0FED2A99DB77EC533A62DC66894D3EC6DF3B58B6A8F3CF4A6B47E4086B7F97DC\",\n", " \"provenance-class\" : \"com.oracle.labs.mlrg.olcut.provenance.primitives.HashProvenance\",\n", " \"additional\" : \"SHA256\",\n", @@ -983,7 +983,7 @@ " \"datasource-creation-time\" : {\n", " \"marshalled-class\" : \"com.oracle.labs.mlrg.olcut.provenance.io.SimpleMarshalledProvenance\",\n", " \"key\" : \"datasource-creation-time\",\n", - " \"value\" : \"2022-10-02T17:16:22.164235-04:00\",\n", + " \"value\" : \"2022-10-07T11:20:06.279351-04:00\",\n", " \"provenance-class\" : \"com.oracle.labs.mlrg.olcut.provenance.primitives.DateTimeProvenance\",\n", " \"additional\" : \"\",\n", " \"is-reference\" : false\n", @@ -1023,7 +1023,7 @@ " \"dataPath\" : {\n", " \"marshalled-class\" : \"com.oracle.labs.mlrg.olcut.provenance.io.SimpleMarshalledProvenance\",\n", " \"key\" : \"dataPath\",\n", - " \"value\" : \"/Users/craigacp/Development/tribuo/tutorials/bezdekIris.data\",\n", + " \"value\" : \"/local/ExternalRepositories/tribuo/tutorials/bezdekIris.data\",\n", " \"provenance-class\" : \"com.oracle.labs.mlrg.olcut.provenance.primitives.FileProvenance\",\n", " \"additional\" : \"\",\n", " \"is-reference\" : false\n", @@ -1084,7 +1084,13 @@ " \"is-reference\" : true\n", " },\n", " \"weightExtractor\" : {\n", - " \"marshalled-class\" : \"com.oracle.labs.mlrg.olcut.provenance.io.SimpleMarshalledProvenance\",\n", + " \"marshalled-class\" : \"com.oracle.labs.mlrg.olcut.provenance.io.SimpleMarshalledProvenance\",\n" + ] + }, + { + "name": "stdout", + 
"output_type": "stream", + "text": [ " \"key\" : \"weightExtractor\",\n", " \"value\" : \"fieldextractor-14\",\n", " \"provenance-class\" : \"com.oracle.labs.mlrg.olcut.provenance.impl.NullConfiguredProvenance\",\n", @@ -1269,13 +1275,7 @@ " },\n", " \"class-name\" : {\n", " \"marshalled-class\" : \"com.oracle.labs.mlrg.olcut.provenance.io.SimpleMarshalledProvenance\",\n", - " \"key\" : \"class-name\",\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ + " \"key\" : \"class-name\",\n", " \"value\" : \"org.tribuo.data.columnar.processors.field.DoubleFieldProcessor\",\n", " \"provenance-class\" : \"com.oracle.labs.mlrg.olcut.provenance.primitives.StringProvenance\",\n", " \"additional\" : \"\",\n", @@ -1412,7 +1412,13 @@ " \"additional\" : \"\",\n", " \"is-reference\" : false\n", " }\n", - " }\n", + " }\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ "} ]\n" ] } @@ -1438,7 +1444,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "linear-sgd-model - Model(class-name=org.tribuo.classification.sgd.linear.LinearSGDModel,dataset=Dataset(class-name=org.tribuo.MutableDataset,datasource=SplitDataSourceProvenance(className=org.tribuo.evaluation.TrainTestSplitter,innerSourceProvenance=DataSource(class-name=org.tribuo.data.csv.CSVDataSource,headers=[sepalLength, sepalWidth, petalLength, petalWidth, species],rowProcessor=RowProcessor(class-name=org.tribuo.data.columnar.RowProcessor,metadataExtractors=[],fieldProcessorList=[FieldProcessor(class-name=org.tribuo.data.columnar.processors.field.DoubleFieldProcessor,fieldName=petalLength,onlyFieldName=true,throwOnInvalid=true,host-short-name=FieldProcessor), FieldProcessor(class-name=org.tribuo.data.columnar.processors.field.DoubleFieldProcessor,fieldName=petalWidth,onlyFieldName=true,throwOnInvalid=true,host-short-name=FieldProcessor), 
FieldProcessor(class-name=org.tribuo.data.columnar.processors.field.DoubleFieldProcessor,fieldName=sepalWidth,onlyFieldName=true,throwOnInvalid=true,host-short-name=FieldProcessor), FieldProcessor(class-name=org.tribuo.data.columnar.processors.field.DoubleFieldProcessor,fieldName=sepalLength,onlyFieldName=true,throwOnInvalid=true,host-short-name=FieldProcessor)],featureProcessors=[],responseProcessor=ResponseProcessor(class-name=org.tribuo.data.columnar.processors.response.FieldResponseProcessor,uppercase=false,fieldNames=[species],defaultValues=[],displayField=false,outputFactory=OutputFactory(class-name=org.tribuo.classification.LabelFactory),host-short-name=ResponseProcessor),weightExtractor=null,replaceNewlinesWithSpaces=true,regexMappingProcessors={},host-short-name=RowProcessor),quote=\",outputRequired=true,outputFactory=OutputFactory(class-name=org.tribuo.classification.LabelFactory),separator=,,dataPath=/Users/craigacp/Development/tribuo/tutorials/bezdekIris.data,resource-hash=SHA-256[0FED2A99DB77EC533A62DC66894D3EC6DF3B58B6A8F3CF4A6B47E4086B7F97DC],file-modified-time=1999-12-14T15:12:39-05:00,datasource-creation-time=2022-10-02T17:16:22.164235-04:00,host-short-name=DataSource),trainProportion=0.7,seed=1,size=150,isTrain=true),transformations=[],is-sequence=false,is-dense=true,num-examples=105,num-features=4,num-outputs=3,tribuo-version=4.3.0-SNAPSHOT),trainer=Trainer(class-name=org.tribuo.classification.sgd.linear.LogisticRegressionTrainer,seed=12345,minibatchSize=1,shuffle=true,epochs=5,optimiser=StochasticGradientOptimiser(class-name=org.tribuo.math.optimisers.AdaGrad,epsilon=0.1,initialLearningRate=1.0,initialValue=0.0,host-short-name=StochasticGradientOptimiser),loggingInterval=1000,objective=LabelObjective(class-name=org.tribuo.classification.sgd.objectives.LogMulticlass,host-short-name=LabelObjective),tribuo-version=4.3.0-SNAPSHOT,train-invocation-count=0,is-sequence=false,host-short-name=Trainer),trained-at=2022-10-02T17:16:22.729336-04:00,instance-v
alues={},tribuo-version=4.3.0-SNAPSHOT,java-version=19,os-name=Mac OS X,os-arch=aarch64)\n" + "linear-sgd-model - Model(class-name=org.tribuo.classification.sgd.linear.LinearSGDModel,dataset=Dataset(class-name=org.tribuo.MutableDataset,datasource=SplitDataSourceProvenance(className=org.tribuo.evaluation.TrainTestSplitter,innerSourceProvenance=DataSource(class-name=org.tribuo.data.csv.CSVDataSource,headers=[sepalLength, sepalWidth, petalLength, petalWidth, species],rowProcessor=RowProcessor(class-name=org.tribuo.data.columnar.RowProcessor,metadataExtractors=[],fieldProcessorList=[FieldProcessor(class-name=org.tribuo.data.columnar.processors.field.DoubleFieldProcessor,fieldName=petalLength,onlyFieldName=true,throwOnInvalid=true,host-short-name=FieldProcessor), FieldProcessor(class-name=org.tribuo.data.columnar.processors.field.DoubleFieldProcessor,fieldName=petalWidth,onlyFieldName=true,throwOnInvalid=true,host-short-name=FieldProcessor), FieldProcessor(class-name=org.tribuo.data.columnar.processors.field.DoubleFieldProcessor,fieldName=sepalWidth,onlyFieldName=true,throwOnInvalid=true,host-short-name=FieldProcessor), 
FieldProcessor(class-name=org.tribuo.data.columnar.processors.field.DoubleFieldProcessor,fieldName=sepalLength,onlyFieldName=true,throwOnInvalid=true,host-short-name=FieldProcessor)],featureProcessors=[],responseProcessor=ResponseProcessor(class-name=org.tribuo.data.columnar.processors.response.FieldResponseProcessor,uppercase=false,fieldNames=[species],defaultValues=[],displayField=false,outputFactory=OutputFactory(class-name=org.tribuo.classification.LabelFactory),host-short-name=ResponseProcessor),weightExtractor=null,replaceNewlinesWithSpaces=true,regexMappingProcessors={},host-short-name=RowProcessor),quote=\",outputRequired=true,outputFactory=OutputFactory(class-name=org.tribuo.classification.LabelFactory),separator=,,dataPath=/local/ExternalRepositories/tribuo/tutorials/bezdekIris.data,resource-hash=SHA-256[0FED2A99DB77EC533A62DC66894D3EC6DF3B58B6A8F3CF4A6B47E4086B7F97DC],file-modified-time=1999-12-14T15:12:39-05:00,datasource-creation-time=2022-10-07T11:20:06.279351-04:00,host-short-name=DataSource),trainProportion=0.7,seed=1,size=150,isTrain=true),transformations=[],is-sequence=false,is-dense=true,num-examples=105,num-features=4,num-outputs=3,tribuo-version=4.3.0),trainer=Trainer(class-name=org.tribuo.classification.sgd.linear.LogisticRegressionTrainer,seed=12345,minibatchSize=1,shuffle=true,epochs=5,optimiser=StochasticGradientOptimiser(class-name=org.tribuo.math.optimisers.AdaGrad,epsilon=0.1,initialLearningRate=1.0,initialValue=0.0,host-short-name=StochasticGradientOptimiser),loggingInterval=1000,objective=LabelObjective(class-name=org.tribuo.classification.sgd.objectives.LogMulticlass,host-short-name=LabelObjective),tribuo-version=4.3.0,train-invocation-count=0,is-sequence=false,host-short-name=Trainer),trained-at=2022-10-07T11:20:06.643297-04:00,instance-values={},tribuo-version=4.3.0,java-version=12,os-name=Linux,os-arch=amd64)\n" ] } ], @@ -1463,12 +1469,12 @@ "output_type": "stream", "text": [ "{\n", - " \"tribuo-version\" : \"4.3.0-SNAPSHOT\",\n", 
+ " \"tribuo-version\" : \"4.3.0\",\n", " \"dataset-provenance\" : {\n", " \"num-features\" : \"4\",\n", " \"num-examples\" : \"45\",\n", " \"num-outputs\" : \"3\",\n", - " \"tribuo-version\" : \"4.3.0-SNAPSHOT\",\n", + " \"tribuo-version\" : \"4.3.0\",\n", " \"datasource\" : {\n", " \"train-proportion\" : \"0.7\",\n", " \"seed\" : \"1\",\n", @@ -1526,14 +1532,14 @@ " \"file-modified-time\" : \"1999-12-14T15:12:39-05:00\",\n", " \"quote\" : \"\\\"\",\n", " \"outputRequired\" : \"true\",\n", - " \"datasource-creation-time\" : \"2022-10-02T17:16:22.164235-04:00\",\n", + " \"datasource-creation-time\" : \"2022-10-07T11:20:06.279351-04:00\",\n", " \"outputFactory\" : {\n", " \"class-name\" : \"org.tribuo.classification.LabelFactory\"\n", " },\n", " \"separator\" : \",\",\n", " \"host-short-name\" : \"DataSource\",\n", " \"class-name\" : \"org.tribuo.data.csv.CSVDataSource\",\n", - " \"dataPath\" : \"/Users/craigacp/Development/tribuo/tutorials/bezdekIris.data\"\n", + " \"dataPath\" : \"/local/ExternalRepositories/tribuo/tutorials/bezdekIris.data\"\n", " },\n", " \"class-name\" : \"org.tribuo.evaluation.TrainTestSplitter\",\n", " \"is-train\" : \"false\"\n", @@ -1546,11 +1552,11 @@ " \"class-name\" : \"org.tribuo.provenance.EvaluationProvenance\",\n", " \"model-provenance\" : {\n", " \"instance-values\" : { },\n", - " \"tribuo-version\" : \"4.3.0-SNAPSHOT\",\n", - " \"java-version\" : \"19\",\n", + " \"tribuo-version\" : \"4.3.0\",\n", + " \"java-version\" : \"12\",\n", " \"trainer\" : {\n", " \"seed\" : \"12345\",\n", - " \"tribuo-version\" : \"4.3.0-SNAPSHOT\",\n", + " \"tribuo-version\" : \"4.3.0\",\n", " \"minibatchSize\" : \"1\",\n", " \"train-invocation-count\" : \"0\",\n", " \"is-sequence\" : \"false\",\n", @@ -1571,14 +1577,14 @@ " \"class-name\" : \"org.tribuo.classification.sgd.objectives.LogMulticlass\"\n", " }\n", " },\n", - " \"os-arch\" : \"aarch64\",\n", - " \"trained-at\" : \"2022-10-02T17:16:22.729336-04:00\",\n", - " \"os-name\" : \"Mac OS X\",\n", + " 
\"os-arch\" : \"amd64\",\n", + " \"trained-at\" : \"2022-10-07T11:20:06.643297-04:00\",\n", + " \"os-name\" : \"Linux\",\n", " \"dataset\" : {\n", " \"num-features\" : \"4\",\n", " \"num-examples\" : \"105\",\n", " \"num-outputs\" : \"3\",\n", - " \"tribuo-version\" : \"4.3.0-SNAPSHOT\",\n", + " \"tribuo-version\" : \"4.3.0\",\n", " \"datasource\" : {\n", " \"train-proportion\" : \"0.7\",\n", " \"seed\" : \"1\",\n", @@ -1636,14 +1642,14 @@ " \"file-modified-time\" : \"1999-12-14T15:12:39-05:00\",\n", " \"quote\" : \"\\\"\",\n", " \"outputRequired\" : \"true\",\n", - " \"datasource-creation-time\" : \"2022-10-02T17:16:22.164235-04:00\",\n", + " \"datasource-creation-time\" : \"2022-10-07T11:20:06.279351-04:00\",\n", " \"outputFactory\" : {\n", " \"class-name\" : \"org.tribuo.classification.LabelFactory\"\n", " },\n", " \"separator\" : \",\",\n", " \"host-short-name\" : \"DataSource\",\n", " \"class-name\" : \"org.tribuo.data.csv.CSVDataSource\",\n", - " \"dataPath\" : \"/Users/craigacp/Development/tribuo/tutorials/bezdekIris.data\"\n", + " \"dataPath\" : \"/local/ExternalRepositories/tribuo/tutorials/bezdekIris.data\"\n", " },\n", " \"class-name\" : \"org.tribuo.evaluation.TrainTestSplitter\",\n", " \"is-train\" : \"true\"\n", @@ -1827,7 +1833,7 @@ "mimetype": "text/x-java-source", "name": "Java", "pygments_lexer": "java", - "version": "19" + "version": "12+33" } }, "nbformat": 4, diff --git a/tutorials/modelcard-tribuo-v4.ipynb b/tutorials/modelcard-tribuo-v4.ipynb index 04c75bc6d..fd61787a8 100644 --- a/tutorials/modelcard-tribuo-v4.ipynb +++ b/tutorials/modelcard-tribuo-v4.ipynb @@ -25,8 +25,8 @@ }, "outputs": [], "source": [ - "%jars ./tribuo-anomaly-libsvm-4.3.0-SNAPSHOT-jar-with-dependencies.jar\n", - "%jars ./tribuo-modelcard-4.3.0-SNAPSHOT-jar-with-dependencies.jar" + "%jars ./tribuo-anomaly-libsvm-4.3.0-jar-with-dependencies.jar\n", + "%jars ./tribuo-modelcard-4.3.0-jar-with-dependencies.jar" ] }, { @@ -129,7 +129,7 @@ "\t\t\tnum-examples = 2000\n", 
"\t\t\tnum-features = 5\n", "\t\t\tnum-outputs = 2\n", - "\t\t\ttribuo-version = 4.3.0-SNAPSHOT\n", + "\t\t\ttribuo-version = 4.3.0\n", "\t\t)\n", "\ttrainer = LibSVMAnomalyTrainer(\n", "\t\t\tclass-name = org.tribuo.anomaly.libsvm.LibSVMAnomalyTrainer\n", @@ -150,17 +150,17 @@ "\t\t\t\t\thost-short-name = SVMType\n", "\t\t\t\t)\n", "\t\t\tgamma = 1.0\n", - "\t\t\ttribuo-version = 4.3.0-SNAPSHOT\n", + "\t\t\ttribuo-version = 4.3.0\n", "\t\t\ttrain-invocation-count = 0\n", "\t\t\tis-sequence = false\n", "\t\t\thost-short-name = Trainer\n", "\t\t)\n", - "\ttrained-at = 2022-09-22T14:18:03.073938-04:00\n", + "\ttrained-at = 2022-10-07T12:03:06.539476091-04:00\n", "\tinstance-values = Map{}\n", - "\ttribuo-version = 4.3.0-SNAPSHOT\n", + "\ttribuo-version = 4.3.0\n", "\tjava-version = 17.0.4.1\n", - "\tos-name = Mac OS X\n", - "\tos-arch = x86_64\n", + "\tos-name = Linux\n", + "\tos-arch = amd64\n", ")\n", "\n", "EvaluationProvenance(\n", @@ -210,7 +210,7 @@ "\t\t\t\t\tnum-examples = 2000\n", "\t\t\t\t\tnum-features = 5\n", "\t\t\t\t\tnum-outputs = 2\n", - "\t\t\t\t\ttribuo-version = 4.3.0-SNAPSHOT\n", + "\t\t\t\t\ttribuo-version = 4.3.0\n", "\t\t\t\t)\n", "\t\t\ttrainer = LibSVMAnomalyTrainer(\n", "\t\t\t\t\tclass-name = org.tribuo.anomaly.libsvm.LibSVMAnomalyTrainer\n", @@ -231,17 +231,17 @@ "\t\t\t\t\t\t\thost-short-name = SVMType\n", "\t\t\t\t\t\t)\n", "\t\t\t\t\tgamma = 1.0\n", - "\t\t\t\t\ttribuo-version = 4.3.0-SNAPSHOT\n", + "\t\t\t\t\ttribuo-version = 4.3.0\n", "\t\t\t\t\ttrain-invocation-count = 0\n", "\t\t\t\t\tis-sequence = false\n", "\t\t\t\t\thost-short-name = Trainer\n", "\t\t\t\t)\n", - "\t\t\ttrained-at = 2022-09-22T14:18:03.073938-04:00\n", + "\t\t\ttrained-at = 2022-10-07T12:03:06.539476091-04:00\n", "\t\t\tinstance-values = Map{}\n", - "\t\t\ttribuo-version = 4.3.0-SNAPSHOT\n", + "\t\t\ttribuo-version = 4.3.0\n", "\t\t\tjava-version = 17.0.4.1\n", - "\t\t\tos-name = Mac OS X\n", - "\t\t\tos-arch = x86_64\n", + "\t\t\tos-name = Linux\n", + 
"\t\t\tos-arch = amd64\n", "\t\t)\n", "\tdataset-provenance = MutableDataset(\n", "\t\t\tclass-name = org.tribuo.MutableDataset\n", @@ -286,9 +286,9 @@ "\t\t\tnum-examples = 2000\n", "\t\t\tnum-features = 5\n", "\t\t\tnum-outputs = 2\n", - "\t\t\ttribuo-version = 4.3.0-SNAPSHOT\n", + "\t\t\ttribuo-version = 4.3.0\n", "\t\t)\n", - "\ttribuo-version = 4.3.0-SNAPSHOT\n", + "\ttribuo-version = 4.3.0\n", ")\n" ] } @@ -336,7 +336,7 @@ " \"schema-version\" : \"1.0\",\n", " \"model-type\" : \"LibSVMAnomalyModel\",\n", " \"model-package\" : \"org.tribuo.anomaly.libsvm.LibSVMAnomalyModel\",\n", - " \"tribuo-version\" : \"4.3.0-SNAPSHOT\",\n", + " \"tribuo-version\" : \"4.3.0\",\n", " \"java-version\" : \"17.0.4.1\",\n", " \"configured-parameters\" : {\n", " \"cost\" : \"1.0\",\n", @@ -359,13 +359,13 @@ " \"host-short-name\" : \"SVMType\",\n", " \"class-name\" : \"org.tribuo.anomaly.libsvm.SVMAnomalyType\"\n", " },\n", - " \"tribuo-version\" : \"4.3.0-SNAPSHOT\",\n", + " \"tribuo-version\" : \"4.3.0\",\n", " \"gamma\" : \"1.0\"\n", " }\n", " },\n", " \"TrainingDetails\" : {\n", " \"schema-version\" : \"1.0\",\n", - " \"training-time\" : \"2022-09-22T14:18:03.073938-04:00\",\n", + " \"training-time\" : \"2022-10-07T12:03:06.539476091-04:00\",\n", " \"training-set-size\" : 2000,\n", " \"num-features\" : 5,\n", " \"features-list\" : [ \"A\", \"B\", \"C\", \"D\", \"E\" ],\n", @@ -415,7 +415,7 @@ " \"schema-version\" : \"1.0\",\n", " \"model-type\" : \"LibSVMAnomalyModel\",\n", " \"model-package\" : \"org.tribuo.anomaly.libsvm.LibSVMAnomalyModel\",\n", - " \"tribuo-version\" : \"4.3.0-SNAPSHOT\",\n", + " \"tribuo-version\" : \"4.3.0\",\n", " \"java-version\" : \"17.0.4.1\",\n", " \"configured-parameters\" : {\n", " \"cost\" : \"1.0\",\n", @@ -438,13 +438,13 @@ " \"host-short-name\" : \"SVMType\",\n", " \"class-name\" : \"org.tribuo.anomaly.libsvm.SVMAnomalyType\"\n", " },\n", - " \"tribuo-version\" : \"4.3.0-SNAPSHOT\",\n", + " \"tribuo-version\" : \"4.3.0\",\n", " \"gamma\" : 
\"1.0\"\n", " }\n", " },\n", " \"TrainingDetails\" : {\n", " \"schema-version\" : \"1.0\",\n", - " \"training-time\" : \"2022-09-22T14:18:03.073938-04:00\",\n", + " \"training-time\" : \"2022-10-07T12:03:06.539476091-04:00\",\n", " \"training-set-size\" : 2000,\n", " \"num-features\" : 5,\n", " \"features-list\" : [ \"A\", \"B\", \"C\", \"D\", \"E\" ],\n", @@ -501,7 +501,7 @@ " \"schema-version\" : \"1.0\",\n", " \"model-type\" : \"LibSVMAnomalyModel\",\n", " \"model-package\" : \"org.tribuo.anomaly.libsvm.LibSVMAnomalyModel\",\n", - " \"tribuo-version\" : \"4.3.0-SNAPSHOT\",\n", + " \"tribuo-version\" : \"4.3.0\",\n", " \"java-version\" : \"17.0.4.1\",\n", " \"configured-parameters\" : {\n", " \"cost\" : \"1.0\",\n", @@ -524,13 +524,13 @@ " \"host-short-name\" : \"SVMType\",\n", " \"class-name\" : \"org.tribuo.anomaly.libsvm.SVMAnomalyType\"\n", " },\n", - " \"tribuo-version\" : \"4.3.0-SNAPSHOT\",\n", + " \"tribuo-version\" : \"4.3.0\",\n", " \"gamma\" : \"1.0\"\n", " }\n", " },\n", " \"TrainingDetails\" : {\n", " \"schema-version\" : \"1.0\",\n", - " \"training-time\" : \"2022-09-22T14:18:03.073938-04:00\",\n", + " \"training-time\" : \"2022-10-07T12:03:06.539476091-04:00\",\n", " \"training-set-size\" : 2000,\n", " \"num-features\" : 5,\n", " \"features-list\" : [ \"A\", \"B\", \"C\", \"D\", \"E\" ],\n", @@ -600,7 +600,7 @@ " \"schema-version\" : \"1.0\",\n", " \"model-type\" : \"LibSVMAnomalyModel\",\n", " \"model-package\" : \"org.tribuo.anomaly.libsvm.LibSVMAnomalyModel\",\n", - " \"tribuo-version\" : \"4.3.0-SNAPSHOT\",\n", + " \"tribuo-version\" : \"4.3.0\",\n", " \"java-version\" : \"17.0.4.1\",\n", " \"configured-parameters\" : {\n", " \"cost\" : \"1.0\",\n", @@ -623,13 +623,13 @@ " \"host-short-name\" : \"SVMType\",\n", " \"class-name\" : \"org.tribuo.anomaly.libsvm.SVMAnomalyType\"\n", " },\n", - " \"tribuo-version\" : \"4.3.0-SNAPSHOT\",\n", + " \"tribuo-version\" : \"4.3.0\",\n", " \"gamma\" : \"1.0\"\n", " }\n", " },\n", " \"TrainingDetails\" : 
{\n", " \"schema-version\" : \"1.0\",\n", - " \"training-time\" : \"2022-09-22T14:18:03.073938-04:00\",\n", + " \"training-time\" : \"2022-10-07T12:03:06.539476091-04:00\",\n", " \"training-set-size\" : 2000,\n", " \"num-features\" : 5,\n", " \"features-list\" : [ \"A\", \"B\", \"C\", \"D\", \"E\" ],\n", diff --git a/tutorials/multi-label-tribuo-v4.ipynb b/tutorials/multi-label-tribuo-v4.ipynb index 788bafb0c..92fa1c6eb 100644 --- a/tutorials/multi-label-tribuo-v4.ipynb +++ b/tutorials/multi-label-tribuo-v4.ipynb @@ -32,8 +32,8 @@ "metadata": {}, "outputs": [], "source": [ - "%jars ./tribuo-multilabel-sgd-4.3.0-SNAPSHOT-jar-with-dependencies.jar\n", - "%jars ./tribuo-classification-experiments-4.3.0-SNAPSHOT-jar-with-dependencies.jar" + "%jars ./tribuo-multilabel-sgd-4.3.0-jar-with-dependencies.jar\n", + "%jars ./tribuo-classification-experiments-4.3.0-jar-with-dependencies.jar" ] }, { @@ -172,7 +172,7 @@ "output_type": "stream", "text": [ "\n", - "Linear model training took (00:00:00:245)\n" + "Linear model training took (00:00:00:192)\n" ] } ], @@ -201,7 +201,7 @@ "output_type": "stream", "text": [ "\n", - "Tree model training took (00:00:03:499)\n" + "Tree model training took (00:00:03:188)\n" ] } ], @@ -256,7 +256,7 @@ "output_type": "stream", "text": [ "\n", - "Linear model evaluation took (00:00:00:073)\n", + "Linear model evaluation took (00:00:00:063)\n", "Class n tp fn fp recall prec f1\n", "(LabelSet={12}) 683 677 6 230 0.991 0.746 0.852\n", "(LabelSet={13}) 13 0 13 0 0.000 0.000 0.000\n", @@ -307,7 +307,7 @@ "output_type": "stream", "text": [ "\n", - "Tree model evaluation took (00:00:00:085)\n", + "Tree model evaluation took (00:00:00:094)\n", "Class n tp fn fp recall prec f1\n", "(LabelSet={12}) 683 607 76 201 0.889 0.751 0.814\n", "(LabelSet={13}) 13 0 13 2 0.000 0.000 0.000\n", @@ -387,8 +387,8 @@ "output_type": "stream", "text": [ "\n", - "Classifier Chain model training took (00:00:03:195)\n", - "Classifier Chain model evaluation took 
(00:00:00:146)\n", + "Classifier Chain model training took (00:00:02:893)\n", + "Classifier Chain model evaluation took (00:00:00:153)\n", "Class n tp fn fp recall prec f1\n", "(LabelSet={12}) 683 616 67 203 0.902 0.752 0.820\n", "(LabelSet={13}) 13 0 13 2 0.000 0.000 0.000\n", @@ -448,8 +448,8 @@ "output_type": "stream", "text": [ "\n", - "Classifier Chain Ensemble model training took (00:01:04:418)\n", - "Classifier Chain Ensemble model evaluation took (00:00:02:474)\n", + "Classifier Chain Ensemble model training took (00:00:54:230)\n", + "Classifier Chain Ensemble model evaluation took (00:00:02:249)\n", "Class n tp fn fp recall prec f1\n", "(LabelSet={12}) 683 629 54 216 0.921 0.744 0.823\n", "(LabelSet={13}) 13 0 13 1 0.000 0.000 0.000\n", @@ -514,7 +514,7 @@ "mimetype": "text/x-java-source", "name": "Java", "pygments_lexer": "java", - "version": "17+35-LTS-2724" + "version": "12+33" } }, "nbformat": 4, diff --git a/tutorials/onnx-export-tribuo-v4.ipynb b/tutorials/onnx-export-tribuo-v4.ipynb index 7d5057963..9fa494436 100644 --- a/tutorials/onnx-export-tribuo-v4.ipynb +++ b/tutorials/onnx-export-tribuo-v4.ipynb @@ -37,10 +37,10 @@ "metadata": {}, "outputs": [], "source": [ - "%jars ./tribuo-classification-experiments-4.3.0-SNAPSHOT-jar-with-dependencies.jar\n", - "%jars ./tribuo-oci-4.3.0-SNAPSHOT-jar-with-dependencies.jar\n", - "%jars ./tribuo-onnx-4.3.0-SNAPSHOT-jar-with-dependencies.jar\n", - "%jars ./tribuo-json-4.3.0-SNAPSHOT-jar-with-dependencies.jar" + "%jars ./tribuo-classification-experiments-4.3.0-jar-with-dependencies.jar\n", + "%jars ./tribuo-oci-4.3.0-jar-with-dependencies.jar\n", + "%jars ./tribuo-onnx-4.3.0-jar-with-dependencies.jar\n", + "%jars ./tribuo-json-4.3.0-jar-with-dependencies.jar" ] }, { @@ -150,7 +150,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Training factorization machine took (00:00:11:305)\n" + "Training factorization machine took (00:00:15:126)\n" ] } ], @@ -177,7 +177,7 @@ "name": "stdout", "output_type": 
"stream", "text": [ - "Scoring factorization machine took (00:00:00:412)\n", + "Scoring factorization machine took (00:00:00:379)\n", "Class n tp fn fp recall prec f1\n", "0 980 959 21 31 0.979 0.969 0.974\n", "1 1,135 1,120 15 22 0.987 0.981 0.984\n", @@ -352,7 +352,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Scoring ONNX factorization machine took (00:00:00:810)\n", + "Scoring ONNX factorization machine took (00:00:00:801)\n", "Class n tp fn fp recall prec f1\n", "0 980 959 21 31 0.979 0.969 0.974\n", "1 1,135 1,120 15 22 0.987 0.981 0.984\n", @@ -446,7 +446,7 @@ "\t\t\t\t\toutputFactory = LabelFactory(\n", "\t\t\t\t\t\t\tclass-name = org.tribuo.classification.LabelFactory\n", "\t\t\t\t\t\t)\n", - "\t\t\t\t\tdatasource-creation-time = 2021-12-18T20:36:37.266127-05:00\n", + "\t\t\t\t\tdatasource-creation-time = 2022-10-07T11:46:10.955196-04:00\n", "\t\t\t\t)\n", "\t\t\ttransformations = List[]\n", "\t\t\tis-sequence = false\n", @@ -454,27 +454,27 @@ "\t\t\tnum-examples = -1\n", "\t\t\tnum-features = 717\n", "\t\t\tnum-outputs = 10\n", - "\t\t\ttribuo-version = 4.2.0\n", + "\t\t\ttribuo-version = 4.3.0\n", "\t\t)\n", "\ttrainer = Trainer(\n", "\t\t\tclass-name = org.tribuo.Trainer\n", - "\t\t\tfileModifiedTime = 2021-12-18T20:36:36.445-05:00\n", - "\t\t\tmodelHash = 06071247AEDE7539B899A2D530508D8E2B43304B8A7884A257368AA2CF1C18ED\n", - "\t\t\tlocation = file:/Users/apocock/Development/Tribuo/tutorials/./fm-mnist.onnx\n", + "\t\t\tfileModifiedTime = 2022-10-07T11:46:10.476-04:00\n", + "\t\t\tmodelHash = 9DD2FABC436FB75BAD6A3E061BE51022A79F140FC491C6CA8B8033253F43CD5F\n", + "\t\t\tlocation = file:/local/ExternalRepositories/tribuo/tutorials/./fm-mnist.onnx\n", "\t\t)\n", - "\ttrained-at = 2021-12-18T20:36:37.263832-05:00\n", + "\ttrained-at = 2022-10-07T11:46:10.952607-04:00\n", "\tinstance-values = Map{\n", "\t\tmodel-domain=org.tribuo.tutorials.onnxexport.fm\n", "\t\tmodel-graphname=FMClassificationModel\n", - 
"\t\tmodel-description=factorization-machine-model - Model(class-name=org.tribuo.classification.sgd.fm.FMClassificationModel,dataset=Dataset(class-name=org.tribuo.MutableDataset,datasource=DataSource(class-name=org.tribuo.datasource.IDXDataSource,outputPath=/Users/apocock/Development/Tribuo/tutorials/train-labels-idx1-ubyte.gz,outputFactory=OutputFactory(class-name=org.tribuo.classification.LabelFactory),featuresPath=/Users/apocock/Development/Tribuo/tutorials/train-images-idx3-ubyte.gz,features-file-modified-time=2000-07-21T14:20:24-04:00,output-resource-hash=SHA-256[3552534A0A558BBED6AED32B30C495CCA23D567EC52CAC8BE1A0730E8010255C],datasource-creation-time=2021-12-18T20:36:23.109293-05:00,output-file-modified-time=2000-07-21T14:20:27-04:00,idx-feature-type=UBYTE,features-resource-hash=SHA-256[440FCABF73CC546FA21475E81EA370265605F56BE210A4024D2CA8F203523609],host-short-name=DataSource),transformations=[],is-sequence=false,is-dense=false,num-examples=60000,num-features=717,num-outputs=10,tribuo-version=4.2.0),trainer=Trainer(class-name=org.tribuo.classification.sgd.fm.FMClassificationTrainer,seed=12345,variance=0.1,minibatchSize=1,factorizedDimSize=6,shuffle=true,epochs=5,optimiser=StochasticGradientOptimiser(class-name=org.tribuo.math.optimisers.AdaGrad,epsilon=0.1,initialLearningRate=0.1,initialValue=0.0,host-short-name=StochasticGradientOptimiser),loggingInterval=30000,objective=LabelObjective(class-name=org.tribuo.classification.sgd.objectives.LogMulticlass,host-short-name=LabelObjective),tribuo-version=4.2.0,train-invocation-count=0,is-sequence=false,host-short-name=Trainer),trained-at=2021-12-18T20:36:35.640663-05:00,instance-values={},tribuo-version=4.2.0,java-version=17.0.1,os-name=Mac OS X,os-arch=x86_64)\n", + "\t\tmodel-description=factorization-machine-model - 
Model(class-name=org.tribuo.classification.sgd.fm.FMClassificationModel,dataset=Dataset(class-name=org.tribuo.MutableDataset,datasource=DataSource(class-name=org.tribuo.datasource.IDXDataSource,outputPath=/local/ExternalRepositories/tribuo/tutorials/train-labels-idx1-ubyte.gz,outputFactory=OutputFactory(class-name=org.tribuo.classification.LabelFactory),featuresPath=/local/ExternalRepositories/tribuo/tutorials/train-images-idx3-ubyte.gz,features-file-modified-time=2000-07-21T14:20:24-04:00,output-resource-hash=SHA-256[3552534A0A558BBED6AED32B30C495CCA23D567EC52CAC8BE1A0730E8010255C],datasource-creation-time=2022-10-07T11:45:53.253680-04:00,output-file-modified-time=2000-07-21T14:20:27-04:00,idx-feature-type=UBYTE,features-resource-hash=SHA-256[440FCABF73CC546FA21475E81EA370265605F56BE210A4024D2CA8F203523609],host-short-name=DataSource),transformations=[],is-sequence=false,is-dense=false,num-examples=60000,num-features=717,num-outputs=10,tribuo-version=4.3.0),trainer=Trainer(class-name=org.tribuo.classification.sgd.fm.FMClassificationTrainer,seed=12345,variance=0.1,minibatchSize=1,factorizedDimSize=6,shuffle=true,epochs=5,optimiser=StochasticGradientOptimiser(class-name=org.tribuo.math.optimisers.AdaGrad,epsilon=0.1,initialLearningRate=0.1,initialValue=0.0,host-short-name=StochasticGradientOptimiser),loggingInterval=30000,objective=LabelObjective(class-name=org.tribuo.classification.sgd.objectives.LogMulticlass,host-short-name=LabelObjective),tribuo-version=4.3.0,train-invocation-count=0,is-sequence=false,host-short-name=Trainer),trained-at=2022-10-07T11:46:09.759423-04:00,instance-values={},tribuo-version=4.3.0,java-version=12,os-name=Linux,os-arch=amd64)\n", "\t\tmodel-producer=Tribuo\n", "\t\tmodel-version=0\n", "\t\tinput-name=input\n", "\t}\n", - "\ttribuo-version = 4.2.0\n", - "\tjava-version = 17.0.1\n", - "\tos-name = Mac OS X\n", - "\tos-arch = x86_64\n", + "\ttribuo-version = 4.3.0\n", + "\tjava-version = 12\n", + "\tos-name = Linux\n", + "\tos-arch = 
amd64\n", ")\n" ] } @@ -511,11 +511,11 @@ "\t\t\t\t\toutputFactory = LabelFactory(\n", "\t\t\t\t\t\t\tclass-name = org.tribuo.classification.LabelFactory\n", "\t\t\t\t\t\t)\n", - "\t\t\t\t\toutputPath = /Users/apocock/Development/Tribuo/tutorials/train-labels-idx1-ubyte.gz\n", - "\t\t\t\t\tfeaturesPath = /Users/apocock/Development/Tribuo/tutorials/train-images-idx3-ubyte.gz\n", + "\t\t\t\t\toutputPath = /local/ExternalRepositories/tribuo/tutorials/train-labels-idx1-ubyte.gz\n", + "\t\t\t\t\tfeaturesPath = /local/ExternalRepositories/tribuo/tutorials/train-images-idx3-ubyte.gz\n", "\t\t\t\t\tfeatures-file-modified-time = 2000-07-21T14:20:24-04:00\n", "\t\t\t\t\toutput-resource-hash = 3552534A0A558BBED6AED32B30C495CCA23D567EC52CAC8BE1A0730E8010255C\n", - "\t\t\t\t\tdatasource-creation-time = 2021-12-18T20:36:23.109293-05:00\n", + "\t\t\t\t\tdatasource-creation-time = 2022-10-07T11:45:53.253680-04:00\n", "\t\t\t\t\toutput-file-modified-time = 2000-07-21T14:20:27-04:00\n", "\t\t\t\t\tidx-feature-type = UBYTE\n", "\t\t\t\t\tfeatures-resource-hash = 440FCABF73CC546FA21475E81EA370265605F56BE210A4024D2CA8F203523609\n", @@ -527,7 +527,7 @@ "\t\t\tnum-examples = 60000\n", "\t\t\tnum-features = 717\n", "\t\t\tnum-outputs = 10\n", - "\t\t\ttribuo-version = 4.2.0\n", + "\t\t\ttribuo-version = 4.3.0\n", "\t\t)\n", "\ttrainer = FMClassificationTrainer(\n", "\t\t\tclass-name = org.tribuo.classification.sgd.fm.FMClassificationTrainer\n", @@ -549,17 +549,17 @@ "\t\t\t\t\tclass-name = org.tribuo.classification.sgd.objectives.LogMulticlass\n", "\t\t\t\t\thost-short-name = LabelObjective\n", "\t\t\t\t)\n", - "\t\t\ttribuo-version = 4.2.0\n", + "\t\t\ttribuo-version = 4.3.0\n", "\t\t\ttrain-invocation-count = 0\n", "\t\t\tis-sequence = false\n", "\t\t\thost-short-name = Trainer\n", "\t\t)\n", - "\ttrained-at = 2021-12-18T20:36:35.640663-05:00\n", + "\ttrained-at = 2022-10-07T11:46:09.759423-04:00\n", "\tinstance-values = Map{}\n", - "\ttribuo-version = 4.2.0\n", - "\tjava-version = 
17.0.1\n", - "\tos-name = Mac OS X\n", - "\tos-arch = x86_64\n", + "\ttribuo-version = 4.3.0\n", + "\tjava-version = 12\n", + "\tos-name = Linux\n", + "\tos-arch = amd64\n", ")\n" ] } @@ -648,7 +648,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Scoring ensemble took (00:00:00:675)\n", + "Scoring ensemble took (00:00:00:611)\n", "Class n tp fn fp recall prec f1\n", "0 980 965 15 43 0.985 0.957 0.971\n", "1 1,135 1,119 16 34 0.986 0.971 0.978\n", @@ -725,7 +725,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Scoring ONNX ensemble took (00:00:01:021)\n", + "Scoring ONNX ensemble took (00:00:00:938)\n", "Predictions are equal - true\n" ] } @@ -774,7 +774,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ @@ -809,7 +809,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ @@ -827,7 +827,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -848,7 +848,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -863,19 +863,21 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "As `OCIModel` is a Tribuo model we can evaluate it using our standard tools." + "As `OCIModel` is a Tribuo model we can evaluate it using our standard tools.\n", + "\n", + "Note when running this notebook from scratch the OCI Model Deployment can take up to 15 minutes to fully instantiate, and the next cell will not execute correctly until that deployment has finished. You can monitor the status of the deployment in the OCI console." 
] }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Scoring OCI model took (00:01:06:960)\n", + "Scoring OCI model took (00:00:53:606)\n", "Class n tp fn fp recall prec f1\n", "0 980 959 21 31 0.979 0.969 0.974\n", "1 1,135 1,120 15 22 0.987 0.981 0.984\n", @@ -944,7 +946,7 @@ "mimetype": "text/x-java-source", "name": "Java", "pygments_lexer": "java", - "version": "19" + "version": "12+33" } }, "nbformat": 4, diff --git a/tutorials/regression-tribuo-v4.ipynb b/tutorials/regression-tribuo-v4.ipynb index c1dc5bedf..85f6f5cef 100644 --- a/tutorials/regression-tribuo-v4.ipynb +++ b/tutorials/regression-tribuo-v4.ipynb @@ -23,10 +23,10 @@ "metadata": {}, "outputs": [], "source": [ - "%jars ./tribuo-json-4.3.0-SNAPSHOT-jar-with-dependencies.jar\n", - "%jars ./tribuo-regression-sgd-4.3.0-SNAPSHOT-jar-with-dependencies.jar\n", - "%jars ./tribuo-regression-xgboost-4.3.0-SNAPSHOT-jar-with-dependencies.jar\n", - "%jars ./tribuo-regression-tree-4.3.0-SNAPSHOT-jar-with-dependencies.jar" + "%jars ./tribuo-json-4.3.0-jar-with-dependencies.jar\n", + "%jars ./tribuo-regression-sgd-4.3.0-jar-with-dependencies.jar\n", + "%jars ./tribuo-regression-xgboost-4.3.0-jar-with-dependencies.jar\n", + "%jars ./tribuo-regression-tree-4.3.0-jar-with-dependencies.jar" ] }, { @@ -264,7 +264,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Training Linear Regression (SGD) took (00:00:00:070)\n", + "Training Linear Regression (SGD) took (00:00:00:051)\n", "Evaluation (train):\n", " RMSE 0.979522\n", " MAE 0.741870\n", @@ -357,7 +357,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Training Linear Regression (AdaGrad) took (00:00:00:041)\n", + "Training Linear Regression (AdaGrad) took (00:00:00:024)\n", "Evaluation (train):\n", " RMSE 0.735311\n", " MAE 0.575096\n", @@ -403,7 +403,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Training CART took 
(00:00:00:071)\n", + "Training CART took (00:00:00:092)\n", "Evaluation (train):\n", " RMSE 0.544516\n", " MAE 0.405062\n", @@ -436,7 +436,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Training XGBoost took (00:00:00:263)\n", + "Training XGBoost took (00:00:00:194)\n", "Evaluation (train):\n", " RMSE 0.143871\n", " MAE 0.097167\n", @@ -477,7 +477,7 @@ "mimetype": "text/x-java-source", "name": "Java", "pygments_lexer": "java", - "version": "17+35-LTS-2724" + "version": "12+33" } }, "nbformat": 4, diff --git a/tutorials/reproducibility-tribuo-v4.ipynb b/tutorials/reproducibility-tribuo-v4.ipynb index 481868ecf..e86544fdd 100644 --- a/tutorials/reproducibility-tribuo-v4.ipynb +++ b/tutorials/reproducibility-tribuo-v4.ipynb @@ -21,10 +21,10 @@ "metadata": {}, "outputs": [], "source": [ - "%jars ./tribuo-classification-experiments-4.3.0-SNAPSHOT-jar-with-dependencies.jar\n", - "%jars ./tribuo-onnx-4.3.0-SNAPSHOT-jar-with-dependencies.jar\n", - "%jars ./tribuo-json-4.3.0-SNAPSHOT-jar-with-dependencies.jar\n", - "%jars ./tribuo-reproducibility-4.3.0-SNAPSHOT-jar-with-dependencies.jar" + "%jars ./tribuo-classification-experiments-4.3.0-jar-with-dependencies.jar\n", + "%jars ./tribuo-onnx-4.3.0-jar-with-dependencies.jar\n", + "%jars ./tribuo-json-4.3.0-jar-with-dependencies.jar\n", + "%jars ./tribuo-reproducibility-4.3.0-jar-with-dependencies.jar" ] }, { @@ -68,7 +68,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "linear-sgd-model - Model(class-name=org.tribuo.classification.sgd.linear.LinearSGDModel,dataset=Dataset(class-name=org.tribuo.MutableDataset,datasource=SplitDataSourceProvenance(className=org.tribuo.evaluation.TrainTestSplitter,innerSourceProvenance=DataSource(class-name=org.tribuo.data.csv.CSVDataSource,headers=[sepalLength, sepalWidth, petalLength, petalWidth, 
species],rowProcessor=RowProcessor(class-name=org.tribuo.data.columnar.RowProcessor,metadataExtractors=[],fieldProcessorList=[FieldProcessor(class-name=org.tribuo.data.columnar.processors.field.DoubleFieldProcessor,fieldName=petalLength,onlyFieldName=true,throwOnInvalid=true,host-short-name=FieldProcessor), FieldProcessor(class-name=org.tribuo.data.columnar.processors.field.DoubleFieldProcessor,fieldName=petalWidth,onlyFieldName=true,throwOnInvalid=true,host-short-name=FieldProcessor), FieldProcessor(class-name=org.tribuo.data.columnar.processors.field.DoubleFieldProcessor,fieldName=sepalWidth,onlyFieldName=true,throwOnInvalid=true,host-short-name=FieldProcessor), FieldProcessor(class-name=org.tribuo.data.columnar.processors.field.DoubleFieldProcessor,fieldName=sepalLength,onlyFieldName=true,throwOnInvalid=true,host-short-name=FieldProcessor)],featureProcessors=[],responseProcessor=ResponseProcessor(class-name=org.tribuo.data.columnar.processors.response.FieldResponseProcessor,uppercase=false,fieldNames=[species],defaultValues=[],displayField=false,outputFactory=OutputFactory(class-name=org.tribuo.classification.LabelFactory),host-short-name=ResponseProcessor),weightExtractor=null,replaceNewlinesWithSpaces=true,regexMappingProcessors={},host-short-name=RowProcessor),quote=\",outputRequired=true,outputFactory=OutputFactory(class-name=org.tribuo.classification.LabelFactory),separator=,,dataPath=/Users/apocock/Development/Tribuo/tutorials/bezdekIris.data,resource-hash=SHA-256[0FED2A99DB77EC533A62DC66894D3EC6DF3B58B6A8F3CF4A6B47E4086B7F97DC],file-modified-time=1999-12-14T15:12:39-05:00,datasource-creation-time=2021-12-18T20:31:02.286464-05:00,host-short-name=DataSource),trainProportion=0.7,seed=1,size=150,isTrain=true),transformations=[],is-sequence=false,is-dense=true,num-examples=105,num-features=4,num-outputs=3,tribuo-version=4.2.0),trainer=Trainer(class-name=org.tribuo.classification.sgd.linear.LogisticRegressionTrainer,seed=12345,minibatchSize=1,shuffle=true,epochs
=5,optimiser=StochasticGradientOptimiser(class-name=org.tribuo.math.optimisers.AdaGrad,epsilon=0.1,initialLearningRate=1.0,initialValue=0.0,host-short-name=StochasticGradientOptimiser),loggingInterval=1000,objective=LabelObjective(class-name=org.tribuo.classification.sgd.objectives.LogMulticlass,host-short-name=LabelObjective),tribuo-version=4.2.0,train-invocation-count=0,is-sequence=false,host-short-name=Trainer),trained-at=2021-12-18T20:31:02.707624-05:00,instance-values={},tribuo-version=4.2.0,java-version=17.0.1,os-name=Mac OS X,os-arch=x86_64)\n" + "linear-sgd-model - Model(class-name=org.tribuo.classification.sgd.linear.LinearSGDModel,dataset=Dataset(class-name=org.tribuo.MutableDataset,datasource=SplitDataSourceProvenance(className=org.tribuo.evaluation.TrainTestSplitter,innerSourceProvenance=DataSource(class-name=org.tribuo.data.csv.CSVDataSource,headers=[sepalLength, sepalWidth, petalLength, petalWidth, species],rowProcessor=RowProcessor(class-name=org.tribuo.data.columnar.RowProcessor,metadataExtractors=[],fieldProcessorList=[FieldProcessor(class-name=org.tribuo.data.columnar.processors.field.DoubleFieldProcessor,fieldName=petalLength,onlyFieldName=true,throwOnInvalid=true,host-short-name=FieldProcessor), FieldProcessor(class-name=org.tribuo.data.columnar.processors.field.DoubleFieldProcessor,fieldName=petalWidth,onlyFieldName=true,throwOnInvalid=true,host-short-name=FieldProcessor), FieldProcessor(class-name=org.tribuo.data.columnar.processors.field.DoubleFieldProcessor,fieldName=sepalWidth,onlyFieldName=true,throwOnInvalid=true,host-short-name=FieldProcessor), 
FieldProcessor(class-name=org.tribuo.data.columnar.processors.field.DoubleFieldProcessor,fieldName=sepalLength,onlyFieldName=true,throwOnInvalid=true,host-short-name=FieldProcessor)],featureProcessors=[],responseProcessor=ResponseProcessor(class-name=org.tribuo.data.columnar.processors.response.FieldResponseProcessor,uppercase=false,fieldNames=[species],defaultValues=[],displayField=false,outputFactory=OutputFactory(class-name=org.tribuo.classification.LabelFactory),host-short-name=ResponseProcessor),weightExtractor=null,replaceNewlinesWithSpaces=true,regexMappingProcessors={},host-short-name=RowProcessor),quote=\",outputRequired=true,outputFactory=OutputFactory(class-name=org.tribuo.classification.LabelFactory),separator=,,dataPath=/local/ExternalRepositories/tribuo/tutorials/bezdekIris.data,resource-hash=SHA-256[0FED2A99DB77EC533A62DC66894D3EC6DF3B58B6A8F3CF4A6B47E4086B7F97DC],file-modified-time=1999-12-14T15:12:39-05:00,datasource-creation-time=2022-10-07T11:20:06.279351-04:00,host-short-name=DataSource),trainProportion=0.7,seed=1,size=150,isTrain=true),transformations=[],is-sequence=false,is-dense=true,num-examples=105,num-features=4,num-outputs=3,tribuo-version=4.3.0),trainer=Trainer(class-name=org.tribuo.classification.sgd.linear.LogisticRegressionTrainer,seed=12345,minibatchSize=1,shuffle=true,epochs=5,optimiser=StochasticGradientOptimiser(class-name=org.tribuo.math.optimisers.AdaGrad,epsilon=0.1,initialLearningRate=1.0,initialValue=0.0,host-short-name=StochasticGradientOptimiser),loggingInterval=1000,objective=LabelObjective(class-name=org.tribuo.classification.sgd.objectives.LogMulticlass,host-short-name=LabelObjective),tribuo-version=4.3.0,train-invocation-count=0,is-sequence=false,host-short-name=Trainer),trained-at=2022-10-07T11:20:06.643297-04:00,instance-values={},tribuo-version=4.3.0,java-version=12,os-name=Linux,os-arch=amd64)\n" ] } ], @@ -192,10 +192,10 @@ "\t\t\t\t\t\t\tclass-name = org.tribuo.classification.LabelFactory\n", "\t\t\t\t\t\t)\n", 
"\t\t\t\t\tseparator = ,\n", - "\t\t\t\t\tdataPath = /Users/apocock/Development/Tribuo/tutorials/bezdekIris.data\n", + "\t\t\t\t\tdataPath = /local/ExternalRepositories/tribuo/tutorials/bezdekIris.data\n", "\t\t\t\t\tresource-hash = 0FED2A99DB77EC533A62DC66894D3EC6DF3B58B6A8F3CF4A6B47E4086B7F97DC\n", "\t\t\t\t\tfile-modified-time = 1999-12-14T15:12:39-05:00\n", - "\t\t\t\t\tdatasource-creation-time = 2021-12-18T20:38:43.398834-05:00\n", + "\t\t\t\t\tdatasource-creation-time = 2022-10-07T12:03:48.921236415-04:00\n", "\t\t\t\t\thost-short-name = DataSource\n", "\t\t\t\t)\n", "\t\t\ttrain-proportion = 0.7\n", @@ -209,7 +209,7 @@ "\tnum-examples = 105\n", "\tnum-features = 4\n", "\tnum-outputs = 3\n", - "\ttribuo-version = 4.2.0\n", + "\ttribuo-version = 4.3.0\n", ")\n" ] } @@ -254,7 +254,7 @@ "\t\t\tclass-name = org.tribuo.classification.sgd.objectives.LogMulticlass\n", "\t\t\thost-short-name = LabelObjective\n", "\t\t)\n", - "\ttribuo-version = 4.2.0\n", + "\ttribuo-version = 4.3.0\n", "\ttrain-invocation-count = 0\n", "\tis-sequence = false\n", "\thost-short-name = Trainer\n", @@ -305,15 +305,19 @@ " \"datasource\" : {\n", " \"source\" : {\n", " \"datasource-creation-time\" : {\n", - " \"original\" : \"2021-12-18T20:31:02.286464-05:00\",\n", - " \"reproduced\" : \"2021-12-18T20:38:43.398834-05:00\"\n", + " \"original\" : \"2022-10-07T11:20:06.279351-04:00\",\n", + " \"reproduced\" : \"2022-10-07T12:03:48.921236415-04:00\"\n", " }\n", " }\n", " }\n", " },\n", + " \"java-version\" : {\n", + " \"original\" : \"12\",\n", + " \"reproduced\" : \"17.0.4.1\"\n", + " },\n", " \"trained-at\" : {\n", - " \"original\" : \"2021-12-18T20:31:02.707624-05:00\",\n", - " \"reproduced\" : \"2021-12-18T20:38:43.655448-05:00\"\n", + " \"original\" : \"2022-10-07T11:20:06.643297-04:00\",\n", + " \"reproduced\" : \"2022-10-07T12:03:49.150931420-04:00\"\n", " }\n", "}\n" ] @@ -433,7 +437,7 @@ "\t\t\t\t\toutputFactory = LabelFactory(\n", "\t\t\t\t\t\t\tclass-name = 
org.tribuo.classification.LabelFactory\n", "\t\t\t\t\t\t)\n", - "\t\t\t\t\tdatasource-creation-time = 2021-12-18T20:38:52.702297-05:00\n", + "\t\t\t\t\tdatasource-creation-time = 2022-10-07T12:03:57.351723125-04:00\n", "\t\t\t\t)\n", "\t\t\ttransformations = List[]\n", "\t\t\tis-sequence = false\n", @@ -441,27 +445,27 @@ "\t\t\tnum-examples = -1\n", "\t\t\tnum-features = 717\n", "\t\t\tnum-outputs = 10\n", - "\t\t\ttribuo-version = 4.2.0\n", + "\t\t\ttribuo-version = 4.3.0\n", "\t\t)\n", "\ttrainer = Trainer(\n", "\t\t\tclass-name = org.tribuo.Trainer\n", - "\t\t\tfileModifiedTime = 2021-12-18T20:36:36.445-05:00\n", - "\t\t\tmodelHash = 06071247AEDE7539B899A2D530508D8E2B43304B8A7884A257368AA2CF1C18ED\n", - "\t\t\tlocation = file:/Users/apocock/Development/Tribuo/tutorials/./fm-mnist.onnx\n", + "\t\t\tfileModifiedTime = 2022-10-07T11:46:10.476-04:00\n", + "\t\t\tmodelHash = 9DD2FABC436FB75BAD6A3E061BE51022A79F140FC491C6CA8B8033253F43CD5F\n", + "\t\t\tlocation = file:/local/ExternalRepositories/tribuo/tutorials/./fm-mnist.onnx\n", "\t\t)\n", - "\ttrained-at = 2021-12-18T20:38:52.700329-05:00\n", + "\ttrained-at = 2022-10-07T12:03:57.349886186-04:00\n", "\tinstance-values = Map{\n", "\t\tmodel-domain=org.tribuo.tutorials.onnxexport.fm\n", "\t\tmodel-graphname=FMClassificationModel\n", - "\t\tmodel-description=factorization-machine-model - 
Model(class-name=org.tribuo.classification.sgd.fm.FMClassificationModel,dataset=Dataset(class-name=org.tribuo.MutableDataset,datasource=DataSource(class-name=org.tribuo.datasource.IDXDataSource,outputPath=/Users/apocock/Development/Tribuo/tutorials/train-labels-idx1-ubyte.gz,outputFactory=OutputFactory(class-name=org.tribuo.classification.LabelFactory),featuresPath=/Users/apocock/Development/Tribuo/tutorials/train-images-idx3-ubyte.gz,features-file-modified-time=2000-07-21T14:20:24-04:00,output-resource-hash=SHA-256[3552534A0A558BBED6AED32B30C495CCA23D567EC52CAC8BE1A0730E8010255C],datasource-creation-time=2021-12-18T20:36:23.109293-05:00,output-file-modified-time=2000-07-21T14:20:27-04:00,idx-feature-type=UBYTE,features-resource-hash=SHA-256[440FCABF73CC546FA21475E81EA370265605F56BE210A4024D2CA8F203523609],host-short-name=DataSource),transformations=[],is-sequence=false,is-dense=false,num-examples=60000,num-features=717,num-outputs=10,tribuo-version=4.2.0),trainer=Trainer(class-name=org.tribuo.classification.sgd.fm.FMClassificationTrainer,seed=12345,variance=0.1,minibatchSize=1,factorizedDimSize=6,shuffle=true,epochs=5,optimiser=StochasticGradientOptimiser(class-name=org.tribuo.math.optimisers.AdaGrad,epsilon=0.1,initialLearningRate=0.1,initialValue=0.0,host-short-name=StochasticGradientOptimiser),loggingInterval=30000,objective=LabelObjective(class-name=org.tribuo.classification.sgd.objectives.LogMulticlass,host-short-name=LabelObjective),tribuo-version=4.2.0,train-invocation-count=0,is-sequence=false,host-short-name=Trainer),trained-at=2021-12-18T20:36:35.640663-05:00,instance-values={},tribuo-version=4.2.0,java-version=17.0.1,os-name=Mac OS X,os-arch=x86_64)\n", + "\t\tmodel-description=factorization-machine-model - 
Model(class-name=org.tribuo.classification.sgd.fm.FMClassificationModel,dataset=Dataset(class-name=org.tribuo.MutableDataset,datasource=DataSource(class-name=org.tribuo.datasource.IDXDataSource,outputPath=/local/ExternalRepositories/tribuo/tutorials/train-labels-idx1-ubyte.gz,outputFactory=OutputFactory(class-name=org.tribuo.classification.LabelFactory),featuresPath=/local/ExternalRepositories/tribuo/tutorials/train-images-idx3-ubyte.gz,features-file-modified-time=2000-07-21T14:20:24-04:00,output-resource-hash=SHA-256[3552534A0A558BBED6AED32B30C495CCA23D567EC52CAC8BE1A0730E8010255C],datasource-creation-time=2022-10-07T11:45:53.253680-04:00,output-file-modified-time=2000-07-21T14:20:27-04:00,idx-feature-type=UBYTE,features-resource-hash=SHA-256[440FCABF73CC546FA21475E81EA370265605F56BE210A4024D2CA8F203523609],host-short-name=DataSource),transformations=[],is-sequence=false,is-dense=false,num-examples=60000,num-features=717,num-outputs=10,tribuo-version=4.3.0),trainer=Trainer(class-name=org.tribuo.classification.sgd.fm.FMClassificationTrainer,seed=12345,variance=0.1,minibatchSize=1,factorizedDimSize=6,shuffle=true,epochs=5,optimiser=StochasticGradientOptimiser(class-name=org.tribuo.math.optimisers.AdaGrad,epsilon=0.1,initialLearningRate=0.1,initialValue=0.0,host-short-name=StochasticGradientOptimiser),loggingInterval=30000,objective=LabelObjective(class-name=org.tribuo.classification.sgd.objectives.LogMulticlass,host-short-name=LabelObjective),tribuo-version=4.3.0,train-invocation-count=0,is-sequence=false,host-short-name=Trainer),trained-at=2022-10-07T11:46:09.759423-04:00,instance-values={},tribuo-version=4.3.0,java-version=12,os-name=Linux,os-arch=amd64)\n", "\t\tmodel-producer=Tribuo\n", "\t\tmodel-version=0\n", "\t\tinput-name=input\n", "\t}\n", - "\ttribuo-version = 4.2.0\n", - "\tjava-version = 17.0.1\n", - "\tos-name = Mac OS X\n", - "\tos-arch = x86_64\n", + "\ttribuo-version = 4.3.0\n", + "\tjava-version = 17.0.4.1\n", + "\tos-name = Linux\n", + "\tos-arch 
= amd64\n", ")\n" ] } @@ -497,11 +501,11 @@ "\t\t\t\t\toutputFactory = LabelFactory(\n", "\t\t\t\t\t\t\tclass-name = org.tribuo.classification.LabelFactory\n", "\t\t\t\t\t\t)\n", - "\t\t\t\t\toutputPath = /Users/apocock/Development/Tribuo/tutorials/train-labels-idx1-ubyte.gz\n", - "\t\t\t\t\tfeaturesPath = /Users/apocock/Development/Tribuo/tutorials/train-images-idx3-ubyte.gz\n", + "\t\t\t\t\toutputPath = /local/ExternalRepositories/tribuo/tutorials/train-labels-idx1-ubyte.gz\n", + "\t\t\t\t\tfeaturesPath = /local/ExternalRepositories/tribuo/tutorials/train-images-idx3-ubyte.gz\n", "\t\t\t\t\tfeatures-file-modified-time = 2000-07-21T14:20:24-04:00\n", "\t\t\t\t\toutput-resource-hash = 3552534A0A558BBED6AED32B30C495CCA23D567EC52CAC8BE1A0730E8010255C\n", - "\t\t\t\t\tdatasource-creation-time = 2021-12-18T20:36:23.109293-05:00\n", + "\t\t\t\t\tdatasource-creation-time = 2022-10-07T11:45:53.253680-04:00\n", "\t\t\t\t\toutput-file-modified-time = 2000-07-21T14:20:27-04:00\n", "\t\t\t\t\tidx-feature-type = UBYTE\n", "\t\t\t\t\tfeatures-resource-hash = 440FCABF73CC546FA21475E81EA370265605F56BE210A4024D2CA8F203523609\n", @@ -513,7 +517,7 @@ "\t\t\tnum-examples = 60000\n", "\t\t\tnum-features = 717\n", "\t\t\tnum-outputs = 10\n", - "\t\t\ttribuo-version = 4.2.0\n", + "\t\t\ttribuo-version = 4.3.0\n", "\t\t)\n", "\ttrainer = FMClassificationTrainer(\n", "\t\t\tclass-name = org.tribuo.classification.sgd.fm.FMClassificationTrainer\n", @@ -535,17 +539,17 @@ "\t\t\t\t\tclass-name = org.tribuo.classification.sgd.objectives.LogMulticlass\n", "\t\t\t\t\thost-short-name = LabelObjective\n", "\t\t\t\t)\n", - "\t\t\ttribuo-version = 4.2.0\n", + "\t\t\ttribuo-version = 4.3.0\n", "\t\t\ttrain-invocation-count = 0\n", "\t\t\tis-sequence = false\n", "\t\t\thost-short-name = Trainer\n", "\t\t)\n", - "\ttrained-at = 2021-12-18T20:36:35.640663-05:00\n", + "\ttrained-at = 2022-10-07T11:46:09.759423-04:00\n", "\tinstance-values = Map{}\n", - "\ttribuo-version = 4.2.0\n", - "\tjava-version = 
17.0.1\n", - "\tos-name = Mac OS X\n", - "\tos-arch = x86_64\n", + "\ttribuo-version = 4.3.0\n", + "\tjava-version = 12\n", + "\tos-name = Linux\n", + "\tos-arch = amd64\n", ")\n" ] } @@ -593,14 +597,18 @@ " \"dataset\" : {\n", " \"datasource\" : {\n", " \"datasource-creation-time\" : {\n", - " \"original\" : \"2021-12-18T20:36:23.109293-05:00\",\n", - " \"reproduced\" : \"2021-12-18T20:38:58.740193-05:00\"\n", + " \"original\" : \"2022-10-07T11:45:53.253680-04:00\",\n", + " \"reproduced\" : \"2022-10-07T12:04:03.138366189-04:00\"\n", " }\n", " }\n", " },\n", + " \"java-version\" : {\n", + " \"original\" : \"12\",\n", + " \"reproduced\" : \"17.0.4.1\"\n", + " },\n", " \"trained-at\" : {\n", - " \"original\" : \"2021-12-18T20:36:35.640663-05:00\",\n", - " \"reproduced\" : \"2021-12-18T20:39:09.831081-05:00\"\n", + " \"original\" : \"2022-10-07T11:46:09.759423-04:00\",\n", + " \"reproduced\" : \"2022-10-07T12:04:15.478400652-04:00\"\n", " }\n", "}\n" ] @@ -714,14 +722,18 @@ " \"dataset\" : {\n", " \"datasource\" : {\n", " \"datasource-creation-time\" : {\n", - " \"original\" : \"2021-12-18T20:36:23.109293-05:00\",\n", - " \"reproduced\" : \"2021-12-18T20:38:51.027212-05:00\"\n", + " \"original\" : \"2022-10-07T11:45:53.253680-04:00\",\n", + " \"reproduced\" : \"2022-10-07T12:03:56.006018468-04:00\"\n", " }\n", " }\n", " },\n", + " \"java-version\" : {\n", + " \"original\" : \"12\",\n", + " \"reproduced\" : \"17.0.4.1\"\n", + " },\n", " \"trained-at\" : {\n", - " \"original\" : \"2021-12-18T20:36:35.640663-05:00\",\n", - " \"reproduced\" : \"2021-12-18T20:39:18.280345-05:00\"\n", + " \"original\" : \"2022-10-07T11:46:09.759423-04:00\",\n", + " \"reproduced\" : \"2022-10-07T12:04:24.453627627-04:00\"\n", " },\n", " \"trainer\" : {\n", " \"class-name\" : {\n", @@ -787,7 +799,7 @@ "mimetype": "text/x-java-source", "name": "Java", "pygments_lexer": "java", - "version": "17+35-LTS-2724" + "version": "17.0.4.1+1-LTS-2" } }, "nbformat": 4, diff --git 
a/tutorials/tensorflow-tribuo-v4.ipynb b/tutorials/tensorflow-tribuo-v4.ipynb index f79d08693..215b5ee6b 100644 --- a/tutorials/tensorflow-tribuo-v4.ipynb +++ b/tutorials/tensorflow-tribuo-v4.ipynb @@ -41,7 +41,7 @@ "metadata": {}, "outputs": [], "source": [ - "%jars ./tribuo-tensorflow-4.3.0-SNAPSHOT-jar-with-dependencies.jar" + "%jars ./tribuo-tensorflow-4.3.0-jar-with-dependencies.jar" ] }, { @@ -274,7 +274,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Wine quality training took (00:00:01:891)\n" + "Wine quality training took (00:00:02:255)\n" ] } ], @@ -302,9 +302,9 @@ "output_type": "stream", "text": [ "Wine quality evaluation:\n", - " RMSE 0.651441\n", - " MAE 0.510348\n", - " R^2 0.347424\n", + " RMSE 0.649654\n", + " MAE 0.507282\n", + " R^2 0.351000\n", "\n" ] } @@ -394,7 +394,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "MNIST MLP training took (00:01:06:256)\n" + "MNIST MLP training took (00:01:08:593)\n" ] } ], @@ -497,7 +497,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "MNIST CNN training took (00:02:57:331)\n" + "MNIST CNN training took (00:01:59:597)\n" ] } ], @@ -538,32 +538,32 @@ "output_type": "stream", "text": [ "Class n tp fn fp recall prec f1\n", - "0 980 968 12 25 0.988 0.975 0.981\n", - "1 1,135 1,123 12 5 0.989 0.996 0.992\n", - "2 1,032 1,013 19 39 0.982 0.963 0.972\n", - "3 1,010 980 30 12 0.970 0.988 0.979\n", - "4 982 963 19 19 0.981 0.981 0.981\n", - "5 892 873 19 21 0.979 0.977 0.978\n", - "6 958 938 20 17 0.979 0.982 0.981\n", - "7 1,028 998 30 14 0.971 0.986 0.978\n", - "8 974 937 37 31 0.962 0.968 0.965\n", - "9 1,009 988 21 36 0.979 0.965 0.972\n", - "Total 10,000 9,781 219 219\n", - "Accuracy 0.978\n", - "Micro Average 0.978 0.978 0.978\n", - "Macro Average 0.978 0.978 0.978\n", - "Balanced Error Rate 0.022\n", + "0 980 968 12 22 0.988 0.978 0.983\n", + "1 1,135 1,127 8 15 0.993 0.987 0.990\n", + "2 1,032 1,011 21 59 0.980 0.945 0.962\n", + "3 1,010 959 51 14 0.950 0.986 0.967\n", + "4 
982 977 5 35 0.995 0.965 0.980\n", + "5 892 877 15 54 0.983 0.942 0.962\n", + "6 958 934 24 9 0.975 0.990 0.983\n", + "7 1,028 981 47 12 0.954 0.988 0.971\n", + "8 974 931 43 29 0.956 0.970 0.963\n", + "9 1,009 969 40 17 0.960 0.983 0.971\n", + "Total 10,000 9,734 266 266\n", + "Accuracy 0.973\n", + "Micro Average 0.973 0.973 0.973\n", + "Macro Average 0.973 0.973 0.973\n", + "Balanced Error Rate 0.027\n", " 0 1 2 3 4 5 6 7 8 9\n", - "0 968 0 0 0 0 0 6 0 5 1\n", - "1 0 1,123 1 3 0 2 1 0 5 0\n", - "2 3 1 1,013 2 2 0 3 2 5 1\n", - "3 1 0 9 980 0 9 0 4 2 5\n", - "4 1 0 1 0 963 0 4 1 3 9\n", - "5 1 2 0 6 0 873 1 1 3 5\n", - "6 10 1 1 1 2 2 938 0 3 0\n", - "7 0 1 13 0 4 0 0 998 4 8\n", - "8 7 0 13 0 0 7 2 1 937 7\n", - "9 2 0 1 0 11 1 0 5 1 988\n", + "0 968 0 3 0 0 0 2 2 4 1\n", + "1 0 1,127 3 0 1 1 1 0 1 1\n", + "2 3 2 1,011 5 5 0 1 1 4 0\n", + "3 0 1 14 959 0 28 0 1 5 2\n", + "4 0 0 1 0 977 0 1 0 0 3\n", + "5 1 0 1 4 0 877 3 1 4 1\n", + "6 10 3 1 1 3 5 934 0 1 0\n", + "7 0 2 25 1 9 2 0 981 3 5\n", + "8 8 3 10 1 3 11 1 2 931 4\n", + "9 0 4 1 2 14 7 0 5 7 969\n", "\n" ] } @@ -719,7 +719,7 @@ "mimetype": "text/x-java-source", "name": "Java", "pygments_lexer": "java", - "version": "17+35-LTS-2724" + "version": "17.0.4.1+1-LTS-2" } }, "nbformat": 4,