diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 33a6089..589f31a 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -14,10 +14,10 @@ "extensions": [ "/workspaces/vscode-quickstart/.devcontainer/extensions/zenml-vscode-quickstart-0.0.1.vsix" ] - }, - "codespaces": {} + } }, "containerEnv": { - "PYTHONPATH": "/root/.vscode-server/extensions/zenml.zenml-vscode-quickstart-0.0.1/zenmlQuickstart/quickstartModules:/root/.vscode-remote/extensions/zenml.zenml-vscode-quickstart-0.0.1/zenmlQuickstart/quickstartModules" - } + "PYTHONPATH": "/workspaces/vscode-quickstart/.devcontainer/zenmlQuickstart/quickstartModules" + }, + "workspaceFolder": "/workspaces/vscode-quickstart/.devcontainer/zenmlQuickstart/sections" } diff --git a/.devcontainer/extensions/zenml-vscode-quickstart-0.0.1.vsix b/.devcontainer/extensions/zenml-vscode-quickstart-0.0.1.vsix index 6e9df86..cdc4279 100644 Binary files a/.devcontainer/extensions/zenml-vscode-quickstart-0.0.1.vsix and b/.devcontainer/extensions/zenml-vscode-quickstart-0.0.1.vsix differ diff --git a/.devcontainer/zenmlQuickstart/assets/cloud_mcp.png b/.devcontainer/zenmlQuickstart/assets/cloud_mcp.png new file mode 100644 index 0000000..81197e9 Binary files /dev/null and b/.devcontainer/zenmlQuickstart/assets/cloud_mcp.png differ diff --git a/.devcontainer/zenmlQuickstart/assets/cloud_mcp_predictions.png b/.devcontainer/zenmlQuickstart/assets/cloud_mcp_predictions.png new file mode 100644 index 0000000..a6bf7c9 Binary files /dev/null and b/.devcontainer/zenmlQuickstart/assets/cloud_mcp_predictions.png differ diff --git a/.devcontainer/zenmlQuickstart/assets/cloud_mcp_screenshot.png b/.devcontainer/zenmlQuickstart/assets/cloud_mcp_screenshot.png new file mode 100644 index 0000000..8f56def Binary files /dev/null and b/.devcontainer/zenmlQuickstart/assets/cloud_mcp_screenshot.png differ diff --git a/.devcontainer/zenmlQuickstart/assets/feature_engineering_pipeline.png b/.devcontainer/zenmlQuickstart/assets/feature_engineering_pipeline.png new file mode 100644 index 0000000..db30191 Binary files /dev/null and b/.devcontainer/zenmlQuickstart/assets/feature_engineering_pipeline.png differ diff --git a/.devcontainer/zenmlQuickstart/assets/inference_pipeline.png b/.devcontainer/zenmlQuickstart/assets/inference_pipeline.png new file mode 100644 index 0000000..358d553 Binary files /dev/null and b/.devcontainer/zenmlQuickstart/assets/inference_pipeline.png differ diff --git a/.devcontainer/zenmlQuickstart/assets/pipeline_overview.png b/.devcontainer/zenmlQuickstart/assets/pipeline_overview.png new file mode 100644 index 0000000..609e97d Binary files /dev/null and b/.devcontainer/zenmlQuickstart/assets/pipeline_overview.png differ diff --git a/.devcontainer/zenmlQuickstart/assets/training_pipeline.png b/.devcontainer/zenmlQuickstart/assets/training_pipeline.png new file mode 100644 index 0000000..a2e6a7d Binary files /dev/null and b/.devcontainer/zenmlQuickstart/assets/training_pipeline.png differ diff --git a/.devcontainer/zenmlQuickstart/quickstartModules/configs/feature_engineering.yaml b/.devcontainer/zenmlQuickstart/quickstartModules/configs/feature_engineering.yaml new file mode 100644 index 0000000..d5ab212 --- /dev/null +++ b/.devcontainer/zenmlQuickstart/quickstartModules/configs/feature_engineering.yaml @@ -0,0 +1,10 @@ +# environment configuration +settings: + docker: + required_integrations: + - sklearn + requirements: + - pyarrow + +# pipeline configuration +test_size: 0.35 \ No newline at end 
of file diff --git a/.devcontainer/zenmlQuickstart/quickstartModules/configs/inference.yaml b/.devcontainer/zenmlQuickstart/quickstartModules/configs/inference.yaml new file mode 100644 index 0000000..1dcefe4 --- /dev/null +++ b/.devcontainer/zenmlQuickstart/quickstartModules/configs/inference.yaml @@ -0,0 +1,15 @@ +# environment configuration +settings: + docker: + required_integrations: + - sklearn + requirements: + - pyarrow + +# configuration of the Model Control Plane +model: + name: "breast_cancer_classifier" + version: "production" + license: Apache 2.0 + description: A breast cancer classifier + tags: ["breast_cancer", "classifier"] \ No newline at end of file diff --git a/.devcontainer/zenmlQuickstart/quickstartModules/configs/training_rf.yaml b/.devcontainer/zenmlQuickstart/quickstartModules/configs/training_rf.yaml new file mode 100644 index 0000000..8d75610 --- /dev/null +++ b/.devcontainer/zenmlQuickstart/quickstartModules/configs/training_rf.yaml @@ -0,0 +1,19 @@ +# environment configuration +settings: + docker: + required_integrations: + - sklearn + requirements: + - pyarrow + +# configuration of the Model Control Plane +model: + name: breast_cancer_classifier + version: rf + license: Apache 2.0 + description: A breast cancer classifier + tags: ["breast_cancer", "classifier"] + +# Configure the pipeline +parameters: + model_type: "rf" # Choose between rf/sgd diff --git a/.devcontainer/zenmlQuickstart/quickstartModules/configs/training_sgd.yaml b/.devcontainer/zenmlQuickstart/quickstartModules/configs/training_sgd.yaml new file mode 100644 index 0000000..857cdf7 --- /dev/null +++ b/.devcontainer/zenmlQuickstart/quickstartModules/configs/training_sgd.yaml @@ -0,0 +1,19 @@ +# environment configuration +settings: + docker: + required_integrations: + - sklearn + requirements: + - pyarrow + +# configuration of the Model Control Plane +model: + name: breast_cancer_classifier + version: sgd + license: Apache 2.0 + description: A breast cancer classifier + tags: ["breast_cancer", "classifier"] + +# Configure the pipeline +parameters: + model_type: "sgd" # Choose between rf/sgd \ No newline at end of file diff --git a/.devcontainer/zenmlQuickstart/quickstartModules/pipelines/__init__.py b/.devcontainer/zenmlQuickstart/quickstartModules/pipelines/__init__.py new file mode 100644 index 0000000..6d8015e --- /dev/null +++ b/.devcontainer/zenmlQuickstart/quickstartModules/pipelines/__init__.py @@ -0,0 +1,20 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2024. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from .feature_engineering import feature_engineering +from .inference import inference +from .training import training diff --git a/.devcontainer/zenmlQuickstart/quickstartModules/pipelines/feature_engineering.py b/.devcontainer/zenmlQuickstart/quickstartModules/pipelines/feature_engineering.py new file mode 100644 index 0000000..edd87b6 --- /dev/null +++ b/.devcontainer/zenmlQuickstart/quickstartModules/pipelines/feature_engineering.py @@ -0,0 +1,74 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2024. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from typing import List, Optional + +from steps import ( + data_loader, + data_preprocessor, + data_splitter, +) + +from zenml import pipeline +from zenml.logger import get_logger + +logger = get_logger(__name__) + + +@pipeline +def feature_engineering( + test_size: float = 0.2, + drop_na: Optional[bool] = None, + normalize: Optional[bool] = None, + drop_columns: Optional[List[str]] = None, + target: Optional[str] = "target", + random_state: int = 17, +): + """ + Feature engineering pipeline. + + This is a pipeline that loads the data, processes it and splits + it into train and test sets. + + Args: + test_size: Size of holdout set for training 0.0..1.0 + drop_na: If `True` NA values will be removed from dataset + normalize: If `True` dataset will be normalized with MinMaxScaler + drop_columns: List of columns to drop from dataset + target: Name of target column in dataset + random_state: Random state to configure the data loader + + Returns: + The processed datasets (dataset_trn, dataset_tst). + """ + # Link all the steps together by calling them and passing the output + # of one step as the input of the next step. + raw_data = data_loader(random_state=random_state, target=target) + dataset_trn, dataset_tst = data_splitter( + dataset=raw_data, + test_size=test_size, + ) + dataset_trn, dataset_tst, _ = data_preprocessor( + dataset_trn=dataset_trn, + dataset_tst=dataset_tst, + drop_na=drop_na, + normalize=normalize, + drop_columns=drop_columns, + target=target, + random_state=random_state, + ) + return dataset_trn, dataset_tst diff --git a/.devcontainer/zenmlQuickstart/quickstartModules/pipelines/inference.py b/.devcontainer/zenmlQuickstart/quickstartModules/pipelines/inference.py new file mode 100644 index 0000000..4b05c9e --- /dev/null +++ b/.devcontainer/zenmlQuickstart/quickstartModules/pipelines/inference.py @@ -0,0 +1,62 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2024. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +from steps import ( + data_loader, + inference_predict, + inference_preprocessor, +) + +from zenml import get_pipeline_context, pipeline +from zenml.logger import get_logger + +logger = get_logger(__name__) + + +@pipeline +def inference(random_state: int, target: str): + """ + Model inference pipeline. + + This is a pipeline that loads the inference data, processes it with + the same preprocessing pipeline used in training, and runs inference + with the trained model. + + Args: + random_state: Random state for reproducibility. + target: Name of target column in dataset. + """ + # Get the production model artifact + model = get_pipeline_context().model.get_artifact("sklearn_classifier") + + # Get the preprocess pipeline artifact associated with this version + preprocess_pipeline = get_pipeline_context().model.get_artifact( + "preprocess_pipeline" + ) + + # Link all the steps together by calling them and passing the output + # of one step as the input of the next step. + df_inference = data_loader(random_state=random_state, is_inference=True) + df_inference = inference_preprocessor( + dataset_inf=df_inference, + preprocess_pipeline=preprocess_pipeline, + target=target, + ) + inference_predict( + model=model, + dataset_inf=df_inference, + ) diff --git a/.devcontainer/zenmlQuickstart/quickstartModules/pipelines/training.py b/.devcontainer/zenmlQuickstart/quickstartModules/pipelines/training.py new file mode 100644 index 0000000..1e8410c --- /dev/null +++ b/.devcontainer/zenmlQuickstart/quickstartModules/pipelines/training.py @@ -0,0 +1,81 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2024. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from typing import Optional +from uuid import UUID + +from steps import model_evaluator, model_promoter, model_trainer + +from pipelines import ( + feature_engineering, +) +from zenml import pipeline +from zenml.client import Client +from zenml.logger import get_logger + +logger = get_logger(__name__) + + +@pipeline +def training( + train_dataset_id: Optional[UUID] = None, + test_dataset_id: Optional[UUID] = None, + target: Optional[str] = "target", + model_type: Optional[str] = "sgd", +): + """ + Model training pipeline. + + This is a pipeline that loads the data from a preprocessing pipeline, + trains a model on it and evaluates the model. If it is the first model + to be trained, it will be promoted to production. If not, it will be + promoted only if it has a higher accuracy than the current production + model version. + + Args: + train_dataset_id: ID of the train dataset produced by feature engineering. + test_dataset_id: ID of the test dataset produced by feature engineering. + target: Name of target column in dataset. + model_type: The type of model to train. + """ + # Link all the steps together by calling them and passing the output + # of one step as the input of the next step. 
+ + # Execute Feature Engineering Pipeline + if train_dataset_id is None or test_dataset_id is None: + dataset_trn, dataset_tst = feature_engineering() + else: + client = Client() + dataset_trn = client.get_artifact_version( + name_id_or_prefix=train_dataset_id + ) + dataset_tst = client.get_artifact_version( + name_id_or_prefix=test_dataset_id + ) + + model = model_trainer( + dataset_trn=dataset_trn, target=target, model_type=model_type + ) + + acc = model_evaluator( + model=model, + dataset_trn=dataset_trn, + dataset_tst=dataset_tst, + target=target, + ) + + model_promoter(accuracy=acc) diff --git a/.devcontainer/zenmlQuickstart/quickstartModules/steps/__init__.py b/.devcontainer/zenmlQuickstart/quickstartModules/steps/__init__.py new file mode 100644 index 0000000..ce6f59f --- /dev/null +++ b/.devcontainer/zenmlQuickstart/quickstartModules/steps/__init__.py @@ -0,0 +1,41 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2024. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from .data_loader import ( + data_loader, +) +from .data_preprocessor import ( + data_preprocessor, +) +from .data_splitter import ( + data_splitter, +) +from .inference_predict import ( + inference_predict, +) +from .inference_preprocessor import ( + inference_preprocessor, +) +from .model_evaluator import ( + model_evaluator, +) +from .model_promoter import ( + model_promoter, +) +from .model_trainer import ( + model_trainer, +) diff --git a/.devcontainer/zenmlQuickstart/quickstartModules/steps/data_loader.py b/.devcontainer/zenmlQuickstart/quickstartModules/steps/data_loader.py new file mode 100644 index 0000000..a034502 --- /dev/null +++ b/.devcontainer/zenmlQuickstart/quickstartModules/steps/data_loader.py @@ -0,0 +1,65 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2024. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import pandas as pd +from sklearn.datasets import load_breast_cancer +from typing_extensions import Annotated + +from zenml import step +from zenml.logger import get_logger + +logger = get_logger(__name__) + + +@step +def data_loader( + random_state: int, is_inference: bool = False, target: str = "target" +) -> Annotated[pd.DataFrame, "dataset"]: + """Dataset reader step. + + This is an example of a dataset reader step that load Breast Cancer dataset. + + This step is parameterized, which allows you to configure the step + independently of the step code, before running it in a pipeline. 
+ In this example, the step can be configured with number of rows and logic + to drop target column or not. See the documentation for more information: + + https://docs.zenml.io/how-to/build-pipelines/use-pipeline-step-parameters + + Args: + random_state: Random state for sampling + is_inference: If `True` subset will be returned and target column + will be removed from dataset. + target: Name of target columns in dataset. + + Returns: + The dataset artifact as Pandas DataFrame and name of target column. + """ + dataset = load_breast_cancer(as_frame=True) + inference_size = int(len(dataset.target) * 0.05) + dataset: pd.DataFrame = dataset.frame + inference_subset = dataset.sample( + inference_size, random_state=random_state + ) + if is_inference: + dataset = inference_subset + dataset.drop(columns=target, inplace=True) + else: + dataset.drop(inference_subset.index, inplace=True) + dataset.reset_index(drop=True, inplace=True) + logger.info(f"Dataset with {len(dataset)} records loaded!") + return dataset diff --git a/.devcontainer/zenmlQuickstart/quickstartModules/steps/data_preprocessor.py b/.devcontainer/zenmlQuickstart/quickstartModules/steps/data_preprocessor.py new file mode 100644 index 0000000..0cf9d3a --- /dev/null +++ b/.devcontainer/zenmlQuickstart/quickstartModules/steps/data_preprocessor.py @@ -0,0 +1,94 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2024. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from typing import List, Optional, Tuple + +import pandas as pd +from sklearn.pipeline import Pipeline +from sklearn.preprocessing import MinMaxScaler +from typing_extensions import Annotated +from utils.preprocess import ColumnsDropper, DataFrameCaster, NADropper + +from zenml import log_artifact_metadata, step + + +@step +def data_preprocessor( + random_state: int, + dataset_trn: pd.DataFrame, + dataset_tst: pd.DataFrame, + drop_na: Optional[bool] = None, + normalize: Optional[bool] = None, + drop_columns: Optional[List[str]] = None, + target: Optional[str] = "target", +) -> Tuple[ + Annotated[pd.DataFrame, "dataset_trn"], + Annotated[pd.DataFrame, "dataset_tst"], + Annotated[Pipeline, "preprocess_pipeline"], +]: + """Data preprocessor step. + + This is an example of a data processor step that prepares the data so that + it is suitable for model training. It takes in a dataset as an input step + artifact and performs any necessary preprocessing steps like cleaning, + feature engineering, feature selection, etc. It then returns the processed + dataset as a step output artifact. + + This step is parameterized, which allows you to configure the step + independently of the step code, before running it in a pipeline. + In this example, the step can be configured to drop NA values, drop some + columns and normalize numerical columns. See the documentation for more + information: + + https://docs.zenml.io/how-to/build-pipelines/use-pipeline-step-parameters + + Args: + random_state: Random state for sampling. + dataset_trn: The train dataset. 
+ dataset_tst: The test dataset. + drop_na: If `True` all NA rows will be dropped. + normalize: If `True` all numeric fields will be normalized. + drop_columns: List of column names to drop. + target: Name of target column in dataset. + + Returns: + The processed datasets (dataset_trn, dataset_tst) and fitted `Pipeline` object. + """ + # We use the sklearn pipeline to chain together multiple preprocessing steps + preprocess_pipeline = Pipeline([("passthrough", "passthrough")]) + if drop_na: + preprocess_pipeline.steps.append(("drop_na", NADropper())) + if drop_columns: + # Drop columns + preprocess_pipeline.steps.append( + ("drop_columns", ColumnsDropper(drop_columns)) + ) + if normalize: + # Normalize the data + preprocess_pipeline.steps.append(("normalize", MinMaxScaler())) + preprocess_pipeline.steps.append( + ("cast", DataFrameCaster(dataset_trn.columns)) + ) + dataset_trn = preprocess_pipeline.fit_transform(dataset_trn) + dataset_tst = preprocess_pipeline.transform(dataset_tst) + + # Log metadata so we can load it in the inference pipeline + log_artifact_metadata( + artifact_name="preprocess_pipeline", + metadata={"random_state": random_state, "target": target}, + ) + return dataset_trn, dataset_tst, preprocess_pipeline diff --git a/.devcontainer/zenmlQuickstart/quickstartModules/steps/data_splitter.py b/.devcontainer/zenmlQuickstart/quickstartModules/steps/data_splitter.py new file mode 100644 index 0000000..d777e02 --- /dev/null +++ b/.devcontainer/zenmlQuickstart/quickstartModules/steps/data_splitter.py @@ -0,0 +1,61 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2024. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from typing import Tuple + +import pandas as pd +from sklearn.model_selection import train_test_split +from typing_extensions import Annotated + +from zenml import step + + +@step +def data_splitter( + dataset: pd.DataFrame, test_size: float = 0.2 +) -> Tuple[ + Annotated[pd.DataFrame, "raw_dataset_trn"], + Annotated[pd.DataFrame, "raw_dataset_tst"], +]: + """Dataset splitter step. + + This is an example of a dataset splitter step that splits the data + into train and test set before passing it to ML model. + + This step is parameterized, which allows you to configure the step + independently of the step code, before running it in a pipeline. + In this example, the step can be configured to use different test + set sizes. See the documentation for more information: + + https://docs.zenml.io/how-to/build-pipelines/use-pipeline-step-parameters + + Args: + dataset: Dataset read from source. + test_size: 0.0..1.0 defining portion of test set. + + Returns: + The split dataset: dataset_trn, dataset_tst. 
+ """ + dataset_trn, dataset_tst = train_test_split( + dataset, + test_size=test_size, + random_state=42, + shuffle=True, + ) + dataset_trn = pd.DataFrame(dataset_trn, columns=dataset.columns) + dataset_tst = pd.DataFrame(dataset_tst, columns=dataset.columns) + return dataset_trn, dataset_tst diff --git a/.devcontainer/zenmlQuickstart/quickstartModules/steps/inference_predict.py b/.devcontainer/zenmlQuickstart/quickstartModules/steps/inference_predict.py new file mode 100644 index 0000000..1c2ff47 --- /dev/null +++ b/.devcontainer/zenmlQuickstart/quickstartModules/steps/inference_predict.py @@ -0,0 +1,57 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2023. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from typing import Any + +import pandas as pd +from typing_extensions import Annotated + +from zenml import step +from zenml.logger import get_logger + +logger = get_logger(__name__) + + +@step +def inference_predict( + model: Any, + dataset_inf: pd.DataFrame, +) -> Annotated[pd.Series, "predictions"]: + """Predictions step. + + This is an example of a predictions step that takes the data and model in + and returns predicted values. + + This step is parameterized, which allows you to configure the step + independently of the step code, before running it in a pipeline. + In this example, the step can be configured to use different input data. + See the documentation for more information: + + https://docs.zenml.io/how-to/build-pipelines/use-pipeline-step-parameters + + Args: + model: Trained model. + dataset_inf: The inference dataset. + + Returns: + The predictions as pandas series + """ + # run prediction from memory + predictions = model.predict(dataset_inf) + + predictions = pd.Series(predictions, name="predicted") + return predictions diff --git a/.devcontainer/zenmlQuickstart/quickstartModules/steps/inference_preprocessor.py b/.devcontainer/zenmlQuickstart/quickstartModules/steps/inference_preprocessor.py new file mode 100644 index 0000000..d484433 --- /dev/null +++ b/.devcontainer/zenmlQuickstart/quickstartModules/steps/inference_preprocessor.py @@ -0,0 +1,50 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2023. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import pandas as pd +from sklearn.pipeline import Pipeline +from typing_extensions import Annotated + +from zenml import step + + +@step +def inference_preprocessor( + dataset_inf: pd.DataFrame, + preprocess_pipeline: Pipeline, + target: str, +) -> Annotated[pd.DataFrame, "inference_dataset"]: + """Data preprocessor step. + + This is an example of a data processor step that prepares the data so that + it is suitable for model inference. It takes in a dataset as an input step + artifact and performs any necessary preprocessing steps based on pretrained + preprocessing pipeline. + + Args: + dataset_inf: The inference dataset. + preprocess_pipeline: Pretrained `Pipeline` to process dataset. + target: Name of target columns in dataset. + + Returns: + The processed dataframe: dataset_inf. + """ + # artificially adding `target` column to avoid Pipeline issues + dataset_inf[target] = pd.Series([1] * dataset_inf.shape[0]) + dataset_inf = preprocess_pipeline.transform(dataset_inf) + dataset_inf.drop(columns=[target], inplace=True) + return dataset_inf diff --git a/.devcontainer/zenmlQuickstart/quickstartModules/steps/model_evaluator.py b/.devcontainer/zenmlQuickstart/quickstartModules/steps/model_evaluator.py new file mode 100644 index 0000000..2a9b6ee --- /dev/null +++ b/.devcontainer/zenmlQuickstart/quickstartModules/steps/model_evaluator.py @@ -0,0 +1,105 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2024. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from typing import Optional + +import pandas as pd +from sklearn.base import ClassifierMixin + +from zenml import log_artifact_metadata, step +from zenml.logger import get_logger + +logger = get_logger(__name__) + + +@step +def model_evaluator( + model: ClassifierMixin, + dataset_trn: pd.DataFrame, + dataset_tst: pd.DataFrame, + min_train_accuracy: float = 0.0, + min_test_accuracy: float = 0.0, + target: Optional[str] = "target", +) -> float: + """Evaluate a trained model. + + This is an example of a model evaluation step that takes in a model artifact + previously trained by another step in your pipeline, and a training + and validation data set pair which it uses to evaluate the model's + performance. The model metrics are then returned as step output artifacts + (in this case, the model accuracy on the train and test set). + + The suggested step implementation also outputs some warnings if the model + performance does not meet some minimum criteria. This is just an example of + how you can use steps to monitor your model performance and alert you if + something goes wrong. As an alternative, you can raise an exception in the + step to force the pipeline run to fail early and all subsequent steps to + be skipped. + + This step is parameterized to configure the step independently of the step code, + before running it in a pipeline. 
In this example, the step can be configured + to use different values for the acceptable model performance thresholds and + to control whether the pipeline run should fail if the model performance + does not meet the minimum criteria. See the documentation for more + information: + + https://docs.zenml.io/how-to/build-pipelines/use-pipeline-step-parameters + + Args: + model: The pre-trained model artifact. + dataset_trn: The train dataset. + dataset_tst: The test dataset. + min_train_accuracy: Minimal acceptable training accuracy value. + min_test_accuracy: Minimal acceptable testing accuracy value. + target: Name of target column in dataset. + + Returns: + The model accuracy on the test set. + """ + # Calculate the model accuracy on the train and test set + trn_acc = model.score( + dataset_trn.drop(columns=[target]), + dataset_trn[target], + ) + tst_acc = model.score( + dataset_tst.drop(columns=[target]), + dataset_tst[target], + ) + logger.info(f"Train accuracy={trn_acc*100:.2f}%") + logger.info(f"Test accuracy={tst_acc*100:.2f}%") + + messages = [] + if trn_acc < min_train_accuracy: + messages.append( + f"Train accuracy {trn_acc*100:.2f}% is below {min_train_accuracy*100:.2f}% !" + ) + if tst_acc < min_test_accuracy: + messages.append( + f"Test accuracy {tst_acc*100:.2f}% is below {min_test_accuracy*100:.2f}% !" + ) + else: + for message in messages: + logger.warning(message) + + log_artifact_metadata( + metadata={ + "train_accuracy": float(trn_acc), + "test_accuracy": float(tst_acc), + }, + artifact_name="sklearn_classifier", + ) + return float(tst_acc) diff --git a/.devcontainer/zenmlQuickstart/quickstartModules/steps/model_promoter.py b/.devcontainer/zenmlQuickstart/quickstartModules/steps/model_promoter.py new file mode 100644 index 0000000..5204063 --- /dev/null +++ b/.devcontainer/zenmlQuickstart/quickstartModules/steps/model_promoter.py @@ -0,0 +1,74 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2024. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from zenml import get_step_context, step +from zenml.client import Client +from zenml.logger import get_logger + +logger = get_logger(__name__) + + +@step +def model_promoter(accuracy: float, stage: str = "production") -> bool: + """Model promoter step. + + This is an example of a step that conditionally promotes a model. It takes + in the accuracy of the model and the stage to promote the model to. If the + accuracy is below 80%, the model is not promoted. If it is above 80%, the + model is promoted to the stage indicated in the parameters. If there is + already a model in the indicated stage, the model with the higher accuracy + is promoted. + + Args: + accuracy: Accuracy of the model. + stage: Which stage to promote the model to. + + Returns: + Whether the model was promoted or not. + """ + is_promoted = False + + if accuracy < 0.8: + logger.info( + f"Model accuracy {accuracy*100:.2f}% is below 80% ! Not promoting model." 
+ ) + else: + logger.info(f"Model promoted to {stage}!") + is_promoted = True + + # Get the model in the current context + current_model = get_step_context().model + + # Get the model that is in the production stage + client = Client() + try: + stage_model = client.get_model_version(current_model.name, stage) + # We compare their metrics + prod_accuracy = ( + stage_model.get_artifact("sklearn_classifier") + .run_metadata["test_accuracy"] + .value + ) + if float(accuracy) > float(prod_accuracy): + # If current model has better metrics, we promote it + is_promoted = True + current_model.set_stage(stage, force=True) + except KeyError: + # If no such model exists, current one is promoted + is_promoted = True + current_model.set_stage(stage, force=True) + return is_promoted diff --git a/.devcontainer/zenmlQuickstart/quickstartModules/steps/model_trainer.py b/.devcontainer/zenmlQuickstart/quickstartModules/steps/model_trainer.py new file mode 100644 index 0000000..eeb24f3 --- /dev/null +++ b/.devcontainer/zenmlQuickstart/quickstartModules/steps/model_trainer.py @@ -0,0 +1,73 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2024. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from typing import Optional + +import pandas as pd +from sklearn.base import ClassifierMixin +from sklearn.ensemble import RandomForestClassifier +from sklearn.linear_model import SGDClassifier +from typing_extensions import Annotated + +from zenml import ArtifactConfig, step +from zenml.logger import get_logger + +logger = get_logger(__name__) + + +@step +def model_trainer( + dataset_trn: pd.DataFrame, + model_type: str = "sgd", + target: Optional[str] = "target", +) -> Annotated[ + ClassifierMixin, + ArtifactConfig(name="sklearn_classifier", is_model_artifact=True), +]: + """Configure and train a model on the training dataset. + + This is an example of a model training step that takes in a dataset artifact + previously loaded and pre-processed by other steps in your pipeline, then + configures and trains a model on it. The model is then returned as a step + output artifact. + + Args: + dataset_trn: The preprocessed train dataset. + model_type: The type of model to train. + target: The name of the target column in the dataset. + + Returns: + The trained model artifact. + + Raises: + ValueError: If the model type is not supported. + """ + # Initialize the model with the hyperparameters indicated in the step + # parameters and train it on the training set. 
+ if model_type == "sgd": + model = SGDClassifier() + elif model_type == "rf": + model = RandomForestClassifier() + else: + raise ValueError(f"Unknown model type {model_type}") + logger.info(f"Training model {model}...") + + model.fit( + dataset_trn.drop(columns=[target]), + dataset_trn[target], + ) + return model diff --git a/.devcontainer/zenmlQuickstart/quickstartModules/utils/__init__.py b/.devcontainer/zenmlQuickstart/quickstartModules/utils/__init__.py new file mode 100644 index 0000000..8d4e961 --- /dev/null +++ b/.devcontainer/zenmlQuickstart/quickstartModules/utils/__init__.py @@ -0,0 +1,16 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2024. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# diff --git a/.devcontainer/zenmlQuickstart/quickstartModules/utils/preprocess.py b/.devcontainer/zenmlQuickstart/quickstartModules/utils/preprocess.py new file mode 100644 index 0000000..df60bce --- /dev/null +++ b/.devcontainer/zenmlQuickstart/quickstartModules/utils/preprocess.py @@ -0,0 +1,56 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2024. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +from typing import Union + +import pandas as pd + + +class NADropper: + """Support class to drop NA values in sklearn Pipeline.""" + + def fit(self, *args, **kwargs): + return self + + def transform(self, X: Union[pd.DataFrame, pd.Series]): + return X.dropna() + + +class ColumnsDropper: + """Support class to drop specific columns in sklearn Pipeline.""" + + def __init__(self, columns): + self.columns = columns + + def fit(self, *args, **kwargs): + return self + + def transform(self, X: Union[pd.DataFrame, pd.Series]): + return X.drop(columns=self.columns) + + +class DataFrameCaster: + """Support class to cast type back to pd.DataFrame in sklearn Pipeline.""" + + def __init__(self, columns): + self.columns = columns + + def fit(self, *args, **kwargs): + return self + + def transform(self, X): + return pd.DataFrame(X, columns=self.columns) diff --git a/.devcontainer/zenmlQuickstart/sections/basicPipeline/code2.py b/.devcontainer/zenmlQuickstart/sections/basicPipeline/code2.py new file mode 100644 index 0000000..6a3a5f4 --- /dev/null +++ b/.devcontainer/zenmlQuickstart/sections/basicPipeline/code2.py @@ -0,0 +1,16 @@ +from zenml import pipeline, step + +@step +def first_step() -> int: + return 1 + +@step +def second_step(some_value: int) -> int: + return some_value * 2 + +@pipeline +def my_first_pipeline(): + int_val = first_step() + second_step(int_val) + +my_first_pipeline() diff --git a/.devcontainer/zenmlQuickstart/sections/basicPipeline/connectToZenmlCloud.html b/.devcontainer/zenmlQuickstart/sections/basicPipeline/connectToZenmlCloud.html new file mode 100644 index 0000000..17b6b93 --- /dev/null +++ b/.devcontainer/zenmlQuickstart/sections/basicPipeline/connectToZenmlCloud.html @@ -0,0 +1,23 @@ +
+<div>
+  <p>
+    If you are using
+    <a href="https://zenml.io/cloud">ZenML Cloud</a>, you can enter
+    your tenant URL in the input box below to connect to it. That way you can
+    see the result of all your runs in your dashboard.
+  </p>
+  <input type="text" placeholder="Your ZenML Cloud tenant URL" />
+  <button>Connect</button>
+  <p>
+    ZenML Cloud is a managed service that provides a hosted ZenML environment.
+    It allows you to run your pipelines on the cloud, manage your metadata, and
+    collaborate with your team. Sign up at
+    <a href="https://zenml.io/cloud">ZenML Cloud</a>
+    for a free trial and to get started!
+  </p>
+</div>
diff --git a/.devcontainer/zenmlQuickstart/sections/basicPipeline/connectToZenmlCloud.md b/.devcontainer/zenmlQuickstart/sections/basicPipeline/connectToZenmlCloud.md new file mode 100644 index 0000000..7950bfc --- /dev/null +++ b/.devcontainer/zenmlQuickstart/sections/basicPipeline/connectToZenmlCloud.md @@ -0,0 +1,3 @@ +# A Basic Pipeline ๐ŸŒฑ + +ADD SOMETHING HERE ABOUT THE SIMPLE PIPELINE + STEP diff --git a/.devcontainer/zenmlQuickstart/sections/basicPipeline/finish.md b/.devcontainer/zenmlQuickstart/sections/basicPipeline/finish.md new file mode 100644 index 0000000..aea09ad --- /dev/null +++ b/.devcontainer/zenmlQuickstart/sections/basicPipeline/finish.md @@ -0,0 +1,5 @@ +# A Basic Pipeline ๐ŸŒฑ + +Nice! You just ran your first ZenML pipeline. + +This pipeline doesn't actually do much beyond pass values between the two steps, but it should give you a high-level overview of the basics of how to write a pipeline. diff --git a/.devcontainer/zenmlQuickstart/sections/consumingModel/consumingModel.md b/.devcontainer/zenmlQuickstart/sections/consumingModel/consumingModel.md new file mode 100644 index 0000000..a80f1a6 --- /dev/null +++ b/.devcontainer/zenmlQuickstart/sections/consumingModel/consumingModel.md @@ -0,0 +1,13 @@ +# Consuming the Model in Production ๐Ÿš€ + +The batch inference pipeline simply takes the model marked as `production` and runs inference on it +with `live data`. The critical step here is the `inference_predict` step, where we load the model in memory +and generate predictions: + +Inference pipeline + +Apart from the loading the model, we must also load the preprocessing pipeline that we ran in feature engineering, +so that we can do the exact steps that we did on training time, in inference time. + +The way to load the right model is to pass in the `production` stage into the `Model` config this time. +This will ensure to always load the production model, decoupled from all other pipelines. 
diff --git a/.devcontainer/zenmlQuickstart/sections/consumingModel/consumingModel.py b/.devcontainer/zenmlQuickstart/sections/consumingModel/consumingModel.py new file mode 100644 index 0000000..2ef4f25 --- /dev/null +++ b/.devcontainer/zenmlQuickstart/sections/consumingModel/consumingModel.py @@ -0,0 +1,71 @@ +from typing_extensions import Annotated +import pandas as pd +from zenml import step, pipeline, Model, get_step_context +from zenml.client import Client +from uuid import UUID +from zenml import pipeline +from steps import ( + data_loader, + inference_preprocessor +) + +client = Client() +preprocessing_pipeline_artifact_version = client.get_artifact_version("preprocess_pipeline") + +@step +def inference_predict(dataset_inf: pd.DataFrame) -> Annotated[pd.Series, "predictions"]: + """Predictions step""" + # Get the model + model = get_step_context().model + + # run prediction from memory + predictor = model.load_artifact("sklearn_classifier") + predictions = predictor.predict(dataset_inf) + + predictions = pd.Series(predictions, name="predicted") + + return predictions + +@pipeline +def inference(preprocess_pipeline_id: UUID): + """Model batch inference pipeline""" + # random_state = client.get_artifact_version(name_id_or_prefix=preprocess_pipeline_id).metadata["random_state"].value + # target = client.get_artifact_version(name_id_or_prefix=preprocess_pipeline_id).run_metadata['target'].value + random_state = 42 + target = "target" + + df_inference = data_loader( + random_state=random_state, is_inference=True + ) + df_inference = inference_preprocessor( + dataset_inf=df_inference, + # We use the preprocess pipeline from the feature engineering pipeline + preprocess_pipeline=client.get_artifact_version(name_id_or_prefix=preprocess_pipeline_id), + target=target, + ) + inference_predict( + dataset_inf=df_inference, + ) + +pipeline_settings = {"enable_cache": False} + +# Lets add some metadata to the model to make it identifiable +pipeline_settings["model"] = Model( + name="breast_cancer_classifier", + version="production", # We can pass in the stage name here! + license="Apache 2.0", + description="A breast cancer classifier", + tags=["breast_cancer", "classifier"], +) + +# the `with_options` method allows us to pass in pipeline settings +# and returns a configured pipeline +inference_configured = inference.with_options(**pipeline_settings) + +# Let's run it again to make sure we have two versions +# We need to pass in the ID of the preprocessing done in the feature engineering pipeline +# in order to avoid training-serving skew + +inference_configured( + preprocess_pipeline_id=preprocessing_pipeline_artifact_version.id +) diff --git a/.devcontainer/zenmlQuickstart/sections/consumingModel/finish.md b/.devcontainer/zenmlQuickstart/sections/consumingModel/finish.md new file mode 100644 index 0000000..cc4ed77 --- /dev/null +++ b/.devcontainer/zenmlQuickstart/sections/consumingModel/finish.md @@ -0,0 +1,8 @@ +# Consuming the Model in Production ๐Ÿš€ + +ZenML automatically links all artifacts to the `production` model version as well, including the predictions +that were returned in the pipeline. 
This completes the MLOps loop of training to inference: + +You can also see all predictions ever created as a complete history in the dashboard: + +Model Control Plane diff --git a/.devcontainer/zenmlQuickstart/sections/featureEngineering/afterRunning1.md b/.devcontainer/zenmlQuickstart/sections/featureEngineering/afterRunning1.md new file mode 100644 index 0000000..ff95b0f --- /dev/null +++ b/.devcontainer/zenmlQuickstart/sections/featureEngineering/afterRunning1.md @@ -0,0 +1,7 @@ +# Feature Engineering ๐Ÿงช + +Something about the pipeline we just ran + + diff --git a/.devcontainer/zenmlQuickstart/sections/featureEngineering/featureEngineering1.md b/.devcontainer/zenmlQuickstart/sections/featureEngineering/featureEngineering1.md new file mode 100644 index 0000000..ec4d981 --- /dev/null +++ b/.devcontainer/zenmlQuickstart/sections/featureEngineering/featureEngineering1.md @@ -0,0 +1,5 @@ +# Feature Engineering ๐Ÿงช + +We're now at the right point to actually run the pipeline, since we've defined logic for the various steps. + +> Remember, creating a pipeline is as simple as adding a `@pipeline` decorator to a function. This specific pipeline doesn't return a value, but that option is available to you if you need. diff --git a/.devcontainer/zenmlQuickstart/sections/featureEngineering/featureEngineering1.py b/.devcontainer/zenmlQuickstart/sections/featureEngineering/featureEngineering1.py new file mode 100644 index 0000000..2bc5c4f --- /dev/null +++ b/.devcontainer/zenmlQuickstart/sections/featureEngineering/featureEngineering1.py @@ -0,0 +1,44 @@ +from zenml import pipeline +from zenml.client import Client + +from typing import Optional, List + +from zenml import pipeline + +from steps import ( + data_loader, + data_preprocessor, + data_splitter +) + +# Initialize the ZenML client to fetch objects from the ZenML Server +client = Client() +# as in Notion Doc +@pipeline +def feature_engineering( + test_size: float = 0.3, + drop_na: Optional[bool] = None, + normalize: Optional[bool] = None, + drop_columns: Optional[List[str]] = None, + target: Optional[str] = "target", + random_state: int = 17 +): + """Feature engineering pipeline.""" + # Link all the steps together by calling them and passing the output + # of one step as the input of the next step. + raw_data = data_loader(random_state=random_state, target=target) + dataset_trn, dataset_tst = data_splitter( + dataset=raw_data, + test_size=test_size, + ) + dataset_trn, dataset_tst, _ = data_preprocessor( + dataset_trn=dataset_trn, + dataset_tst=dataset_tst, + drop_na=drop_na, + normalize=normalize, + drop_columns=drop_columns, + target=target, + random_state=random_state, + ) + +feature_engineering() diff --git a/.devcontainer/zenmlQuickstart/sections/featureEngineering/runLocalDashboard.html b/.devcontainer/zenmlQuickstart/sections/featureEngineering/runLocalDashboard.html new file mode 100644 index 0000000..eb57869 --- /dev/null +++ b/.devcontainer/zenmlQuickstart/sections/featureEngineering/runLocalDashboard.html @@ -0,0 +1,23 @@ +
+<div>
+  <p>
+    If you want to view your pipeline runs in the ZenML Dashboard but aren't
+    using ZenML Cloud, you can run a local version of the
+    dashboard by running <code>zenml up</code> in the terminal or clicking
+    the button below.
+  </p>
+  <p>
+    If you are running this quickstart in codespaces, you'll get a pop-up in
+    the bottom right corner when your dashboard is ready. Follow the link to
+    open the dashboard in a new tab.
+  </p>
+  <p>
+    Otherwise, VSCode may open the browser tab automatically.
+  </p>
+  <p>
+    Log into the Dashboard using default credentials (username <code>default</code>
+    and password left blank) and follow the short onboarding flow.
+    From there you can inspect the pipeline or the
+    specific pipeline run.
+  </p>
+  <button>Run zenml up</button>
+</div>
diff --git a/.devcontainer/zenmlQuickstart/sections/featureEngineering2/afterRunning2.md b/.devcontainer/zenmlQuickstart/sections/featureEngineering2/afterRunning2.md new file mode 100644 index 0000000..a31eecc --- /dev/null +++ b/.devcontainer/zenmlQuickstart/sections/featureEngineering2/afterRunning2.md @@ -0,0 +1,5 @@ +# Feature Engineering with Cache ๐Ÿงช + +Notice the second time around, the data loader step was **cached**, while the rest of the pipeline was rerun. +This is because ZenML automatically determined that nothing had changed in the data loader step, +so it didn't need to rerun it. diff --git a/.devcontainer/zenmlQuickstart/sections/featureEngineering2/featureEngineering2.md b/.devcontainer/zenmlQuickstart/sections/featureEngineering2/featureEngineering2.md new file mode 100644 index 0000000..8538c25 --- /dev/null +++ b/.devcontainer/zenmlQuickstart/sections/featureEngineering2/featureEngineering2.md @@ -0,0 +1,3 @@ +# Feature Engineering with Cache ๐Ÿงช + +Let's run this again with a slightly different test size, to create more datasets. diff --git a/.devcontainer/zenmlQuickstart/sections/featureEngineering2/featureEngineering2.py b/.devcontainer/zenmlQuickstart/sections/featureEngineering2/featureEngineering2.py new file mode 100644 index 0000000..4ea148c --- /dev/null +++ b/.devcontainer/zenmlQuickstart/sections/featureEngineering2/featureEngineering2.py @@ -0,0 +1,44 @@ +from zenml import pipeline +from zenml.client import Client + +from typing import Optional, List + +from zenml import pipeline + +from steps import ( + data_loader, + data_preprocessor, + data_splitter +) + +# Initialize the ZenML client to fetch objects from the ZenML Server +client = Client() +# as in Notion Doc +@pipeline +def feature_engineering( + test_size: float = 0.3, + drop_na: Optional[bool] = None, + normalize: Optional[bool] = None, + drop_columns: Optional[List[str]] = None, + target: Optional[str] = "target", + random_state: int = 17 +): + """Feature engineering pipeline.""" + # Link all the steps together by calling them and passing the output + # of one step as the input of the next step. + raw_data = data_loader(random_state=random_state, target=target) + dataset_trn, dataset_tst = data_splitter( + dataset=raw_data, + test_size=test_size, + ) + dataset_trn, dataset_tst, _ = data_preprocessor( + dataset_trn=dataset_trn, + dataset_tst=dataset_tst, + drop_na=drop_na, + normalize=normalize, + drop_columns=drop_columns, + target=target, + random_state=random_state, + ) + +feature_engineering(test_size=0.25) diff --git a/.devcontainer/zenmlQuickstart/sections/loadingData/finish.md b/.devcontainer/zenmlQuickstart/sections/loadingData/finish.md new file mode 100644 index 0000000..8c22078 --- /dev/null +++ b/.devcontainer/zenmlQuickstart/sections/loadingData/finish.md @@ -0,0 +1,8 @@ +# Loading Your Data ๐Ÿ“ฌ + +Everything looks as we'd expect and the values are all in the right format ๐Ÿฅณ. + +We're now at the point where can bring this step (and some others) together into a single +pipeline, the top-level organising entity for code in ZenML. Creating such a pipeline is +as simple as adding a `@pipeline` decorator to a function. This specific +pipeline doesn't return a value, but that option is available to you if you need. 
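For reference, a minimal sketch of such a decorated pipeline; the step and pipeline names here are placeholders that simply mirror the basicPipeline example earlier in this diff.

```python
from zenml import pipeline, step


@step
def produce_value() -> int:
    return 1


@step
def double(value: int) -> int:
    return value * 2


# The @pipeline decorator is the only thing that turns this plain function,
# which chains the two steps, into a ZenML pipeline.
@pipeline
def my_minimal_pipeline():
    double(produce_value())


my_minimal_pipeline()
```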
diff --git a/.devcontainer/zenmlQuickstart/sections/loadingData/loadingData.md b/.devcontainer/zenmlQuickstart/sections/loadingData/loadingData.md new file mode 100644 index 0000000..cbccb9e --- /dev/null +++ b/.devcontainer/zenmlQuickstart/sections/loadingData/loadingData.md @@ -0,0 +1,33 @@ +# Loading Your Data ๐Ÿ“ฌ + +We'll start off by importing our data. In this quickstart we'll be working with +[the Breast Cancer](https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic) dataset +which is publicly available on the UCI Machine Learning Repository. The task is a classification +problem, to predict whether a patient is diagnosed with breast cancer or not. + +When you're getting started with a machine learning problem you'll want to do +something similar to this: import your data and get it in the right shape for +your training. ZenML mostly gets out of your way when you're writing your Python +code, as you'll see from when you run this code. + +Feature engineering pipeline + +The whole function is decorated with the `@step` decorator, which +tells ZenML to track this function as a step in the pipeline. This means that +ZenML will automatically version, track, and cache the data that is produced by +this function as an `artifact`. This is a very powerful feature, as it means that you can +reproduce your data at any point in the future, even if the original data source +changes or disappears. + +Note the use of the `typing` module's `Annotated` type hint in the output of the +step. We're using this to give a name to the output of the step, which will make +it possible to access it via a keyword later on. + +You'll also notice that we have included type hints for the outputs +to the function. These are not only useful for anyone reading your code, but +help ZenML process your data in a way appropriate to the specific data types. + +ZenML is built in a way that allows you to experiment with your data and build +your pipelines as you work, so if you want to call this function to see how it +works, you can just call it directly. Here we take a look at the first few rows +of your training dataset. 
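As a sketch of what accessing the output "via a keyword later on" can look like: once the step below has run inside a pipeline, its output can be fetched back by the name given in the `Annotated` hint. This assumes the output name `dataset` used in the step below and the standard ZenML client calls shown elsewhere in this diff.

```python
from zenml.client import Client

# After the data loader step has run inside a pipeline, the DataFrame it
# produced can be fetched back by the output name "dataset" from the
# Annotated return type, and loaded into memory for inspection.
client = Client()
dataset_artifact = client.get_artifact_version("dataset")
df = dataset_artifact.load()
print(df.head())
```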
diff --git a/.devcontainer/zenmlQuickstart/sections/loadingData/loadingData.py b/.devcontainer/zenmlQuickstart/sections/loadingData/loadingData.py new file mode 100644 index 0000000..0d6cb29 --- /dev/null +++ b/.devcontainer/zenmlQuickstart/sections/loadingData/loadingData.py @@ -0,0 +1,35 @@ +# Do the imports at the top +from typing_extensions import Annotated +from sklearn.datasets import load_breast_cancer + +import pandas as pd +from rich import print +from zenml import step +from zenml.client import Client +from zenml.logger import get_logger + +logger = get_logger(__name__) + +# Initialize the ZenML client to fetch objects from the ZenML Server +client = Client() + +@step +def data_loader_simplified( + random_state: int, is_inference: bool = False, target: str = "target" +) -> Annotated[pd.DataFrame, "dataset"]: # We name the dataset + """Dataset reader step.""" + dataset = load_breast_cancer(as_frame=True) + inference_size = int(len(dataset.target) * 0.05) + dataset: pd.DataFrame = dataset.frame + inference_subset = dataset.sample(inference_size, random_state=random_state) + if is_inference: + dataset = inference_subset + dataset.drop(columns=target, inplace=True) + else: + dataset.drop(inference_subset.index, inplace=True) + dataset.reset_index(drop=True, inplace=True) + logger.info(f"Dataset with {len(dataset)} records loaded!") + return dataset + +df = data_loader_simplified(random_state=42) +print(df.head()) diff --git a/.devcontainer/zenmlQuickstart/sections/modelControlPlane/finish.md b/.devcontainer/zenmlQuickstart/sections/modelControlPlane/finish.md new file mode 100644 index 0000000..73b6df4 --- /dev/null +++ b/.devcontainer/zenmlQuickstart/sections/modelControlPlane/finish.md @@ -0,0 +1,28 @@ +# Using the Model Control Plane ๐Ÿ‘ทโ€โ™‚๏ธ + +If you are a [ZenML Cloud](https://zenml.io/cloud) user, you can see all of this visualized in the dashboard: + +Model Control Plane + +There is a lot more you can do with ZenML models, including the ability to +track metrics by adding metadata to it, or having them persist in a model +registry. However, these topics can be explored more in the +[ZenML docs](https://docs.zenml.io). + +For now, we will use the ZenML model control plane to promote our best +model to `production`. You can do this by simply setting the `stage` of +your chosen model version to the `production` tag. + +# Set our best classifier to production + +```python +rf_zenml_model_version.set_stage("production", force=True) +``` + +Of course, normally one would only promote the model by comparing to all other model +versions and doing some other tests. But that's a bit more advanced use-case. See the +[e2e_batch example](https://github.com/zenml-io/zenml/tree/main/examples/e2e) to get +more insight into that sort of flow! + +Once the model is promoted, we can now consume the right model version in our +batch inference pipeline directly. Let's see how that works. diff --git a/.devcontainer/zenmlQuickstart/sections/modelControlPlane/modelControlPlane.md b/.devcontainer/zenmlQuickstart/sections/modelControlPlane/modelControlPlane.md new file mode 100644 index 0000000..fb0c377 --- /dev/null +++ b/.devcontainer/zenmlQuickstart/sections/modelControlPlane/modelControlPlane.md @@ -0,0 +1,8 @@ +# Using the Model Control Plane ๐Ÿ‘ทโ€โ™‚๏ธ + +You can see it is relatively easy to train ML models using ZenML pipelines. But it can be somewhat clunky to track all the models produced as you develop your experiments and use-cases. 
+
+You can easily create a ZenML Model and associate it with your pipelines using the `Model` object.
+
+This time, running the training pipeline twice will create two associated **model versions**.
+Let's configure the pipeline with a `Model` and run it once per classifier:
diff --git a/.devcontainer/zenmlQuickstart/sections/modelControlPlane/modelControlPlane.py b/.devcontainer/zenmlQuickstart/sections/modelControlPlane/modelControlPlane.py
new file mode 100644
index 0000000..99c2e40
--- /dev/null
+++ b/.devcontainer/zenmlQuickstart/sections/modelControlPlane/modelControlPlane.py
@@ -0,0 +1,45 @@
+from zenml import Model
+from zenml.client import Client
+from pipelines import training
+
+# Initialize the ZenML client to fetch objects from the ZenML Server
+client = Client()
+dataset_trn_artifact_version = client.get_artifact_version("dataset_trn")
+dataset_tst_artifact_version = client.get_artifact_version("dataset_tst")
+pipeline_settings = {}
+
+# Let's add some metadata to the model to make it identifiable
+pipeline_settings["model"] = Model(
+    name="breast_cancer_classifier",
+    license="Apache 2.0",
+    description="A breast cancer classifier",
+    tags=["breast_cancer", "classifier"],
+)
+
+# Let's train the SGD model and set the version name to "sgd"
+pipeline_settings["model"].version = "sgd"
+
+# The `with_options` method allows us to pass in pipeline settings
+# and returns a configured pipeline
+training_configured = training.with_options(**pipeline_settings)
+
+# We can now run this as usual
+training_configured(
+    model_type="sgd",
+    train_dataset_id=dataset_trn_artifact_version.id,
+    test_dataset_id=dataset_tst_artifact_version.id,
+)
+
+# Let's train the RF model and set the version name to "rf"
+pipeline_settings["model"].version = "rf"
+
+# The `with_options` method allows us to pass in pipeline settings
+# and returns a configured pipeline
+training_configured = training.with_options(**pipeline_settings)
+
+# Let's run it again to make sure we have two versions
+training_configured(
+    model_type="rf",
+    train_dataset_id=dataset_trn_artifact_version.id,
+    test_dataset_id=dataset_tst_artifact_version.id,
+)
diff --git a/.devcontainer/zenmlQuickstart/sections/modelControlPlane/modelControlPlane2.md b/.devcontainer/zenmlQuickstart/sections/modelControlPlane/modelControlPlane2.md
new file mode 100644
index 0000000..d90e6bb
--- /dev/null
+++ b/.devcontainer/zenmlQuickstart/sections/modelControlPlane/modelControlPlane2.md
@@ -0,0 +1,8 @@
+# Using the Model Control Plane 👷‍♂️
+
+This time, running both pipelines has created two associated **model versions**.
+
+The interesting part is that ZenML went ahead and linked all artifacts produced by the
+pipelines to the corresponding model versions, including the two pickle files that represent our
+SGD and RandomForest classifiers. We can see all artifacts directly from the model
+version object:
diff --git a/.devcontainer/zenmlQuickstart/sections/nextSteps/congratulations.md b/.devcontainer/zenmlQuickstart/sections/nextSteps/congratulations.md
new file mode 100644
index 0000000..ce0363c
--- /dev/null
+++ b/.devcontainer/zenmlQuickstart/sections/nextSteps/congratulations.md
@@ -0,0 +1,21 @@
+# Congratulations!
+
+You're a legit MLOps engineer now! You trained two models, evaluated them against
+a test set, registered the best one with the ZenML model control plane,
+and served some predictions. You also learned how to iterate on your models and
+data by using some of the ZenML utility abstractions. You saw how to view your
+artifacts and models via the client as well as the ZenML Dashboard.
+
+## Further exploration
+
+This was just the tip of the iceberg of what ZenML can do; check out the [**docs**](https://docs.zenml.io/) to learn more
+about the capabilities of ZenML. For example, you might want to:
+
+- [Deploy ZenML](https://docs.zenml.io/user-guide/production-guide/connect-deployed-zenml) to collaborate with your colleagues.
+- Run the same pipeline on a [cloud MLOps stack in production](https://docs.zenml.io/user-guide/production-guide/cloud-stack).
+- Track your metrics in an experiment tracker like [MLflow](https://docs.zenml.io/stacks-and-components/component-guide/experiment-trackers/mlflow).
+
+## What next?
+
+* If you have questions or feedback, join our [**Slack Community**](https://zenml.io/slack) and become part of the ZenML family!
+* If you want to quickly get started with ZenML, check out the [ZenML Cloud](https://zenml.io/cloud).
diff --git a/.devcontainer/zenmlQuickstart/sections/trainingPipeline/finish.md b/.devcontainer/zenmlQuickstart/sections/trainingPipeline/finish.md
new file mode 100644
index 0000000..6ee1394
--- /dev/null
+++ b/.devcontainer/zenmlQuickstart/sections/trainingPipeline/finish.md
@@ -0,0 +1,5 @@
+# Training Pipelines ⌚
+
+You can already see from the logs how our model training went: the
+`RandomForestClassifier` performed considerably better than the `SGDClassifier`.
+We can use the ZenML `Client` to verify this:
diff --git a/.devcontainer/zenmlQuickstart/sections/trainingPipeline/trainingPipeline1.md b/.devcontainer/zenmlQuickstart/sections/trainingPipeline/trainingPipeline1.md
new file mode 100644
index 0000000..08c6cee
--- /dev/null
+++ b/.devcontainer/zenmlQuickstart/sections/trainingPipeline/trainingPipeline1.md
@@ -0,0 +1,25 @@
+# Training Pipelines ⌚
+
+Now that we have our data, it makes sense to train some models to see how
+difficult the task is. The Breast Cancer dataset is large and complex enough
+that it's unlikely we'll be able to train a model that performs perfectly,
+but we can get a sense of what a reasonable baseline looks like.
+
+We'll start with two simple models, an SGD Classifier and a Random Forest
+Classifier, both available out of the box in `sklearn`. We'll train them both on the
+same data and then compare their performance.
+
+Training pipeline
+
+Our two training steps both return different kinds of `sklearn` classifier
+models, so we use the generic `ClassifierMixin` type hint for the return type.
+
+ZenML allows you to load any version of any dataset that is tracked by the framework
+directly into a pipeline using the `Client().get_artifact_version` interface. This is very convenient
+in this case, as we'd like to send our preprocessed dataset from the earlier pipeline directly
+into the training pipeline.
+
+The end goal of this quick baseline evaluation is to understand which of the two
+models performs better. We'll use the `model_evaluator` step to compare them.
+This step takes the model from the trainer step and computes its score
+over the test set.
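The `model_evaluator` step itself is imported from the quickstart's `steps` module and isn't shown in this diff. As a rough sketch of what such a step could look like (the actual implementation in the repository may differ in its details), it might be:

```python
# Hypothetical sketch of an evaluator step; the real `steps.model_evaluator`
# used in the quickstart may differ, e.g. in the metadata it logs.
import pandas as pd
from sklearn.base import ClassifierMixin
from zenml import step
from zenml.logger import get_logger

logger = get_logger(__name__)

@step
def model_evaluator(
    model: ClassifierMixin,
    dataset_trn: pd.DataFrame,
    dataset_tst: pd.DataFrame,
    min_train_accuracy: float = 0.0,
    min_test_accuracy: float = 0.0,
    target: str = "target",
) -> float:
    """Score the trained model on the train and test splits."""
    trn_acc = model.score(dataset_trn.drop(columns=[target]), dataset_trn[target])
    tst_acc = model.score(dataset_tst.drop(columns=[target]), dataset_tst[target])
    logger.info(f"Train accuracy={trn_acc * 100:.2f}%, test accuracy={tst_acc * 100:.2f}%")

    # Warn if the accuracies fall below the configured thresholds
    if trn_acc < min_train_accuracy:
        logger.warning(f"Train accuracy {trn_acc} is below {min_train_accuracy}!")
    if tst_acc < min_test_accuracy:
        logger.warning(f"Test accuracy {tst_acc} is below {min_test_accuracy}!")

    # Return the test accuracy so downstream code can compare runs
    return tst_acc
```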
diff --git a/.devcontainer/zenmlQuickstart/sections/trainingPipeline/trainingPipeline1.py b/.devcontainer/zenmlQuickstart/sections/trainingPipeline/trainingPipeline1.py
new file mode 100644
index 0000000..8c8e1f5
--- /dev/null
+++ b/.devcontainer/zenmlQuickstart/sections/trainingPipeline/trainingPipeline1.py
@@ -0,0 +1,93 @@
+import pandas as pd
+from sklearn.base import ClassifierMixin
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.linear_model import SGDClassifier
+from typing_extensions import Annotated
+from zenml import ArtifactConfig, step, pipeline
+from zenml.logger import get_logger
+from typing import Optional
+from uuid import UUID
+logger = get_logger(__name__)
+
+from pipelines import feature_engineering
+from steps import model_evaluator
+from zenml.client import Client
+
+# Initialize the ZenML client to fetch objects from the ZenML Server
+client = Client()
+dataset_trn_artifact_version = client.get_artifact_version("dataset_trn")
+dataset_tst_artifact_version = client.get_artifact_version("dataset_tst")
+@step
+def model_trainer(
+    dataset_trn: pd.DataFrame,
+    model_type: str = "sgd",
+) -> Annotated[ClassifierMixin, ArtifactConfig(name="sklearn_classifier", is_model_artifact=True)]:
+    """Configure and train a model on the training dataset."""
+    target = "target"
+    if model_type == "sgd":
+        model = SGDClassifier()
+    elif model_type == "rf":
+        model = RandomForestClassifier()
+    else:
+        raise ValueError(f"Unknown model type {model_type}")
+
+    logger.info(f"Training model {model}...")
+
+    model.fit(
+        dataset_trn.drop(columns=[target]),
+        dataset_trn[target],
+    )
+    return model
+
+@pipeline
+def training(
+    train_dataset_id: Optional[UUID] = None,
+    test_dataset_id: Optional[UUID] = None,
+    model_type: str = "sgd",
+    min_train_accuracy: float = 0.0,
+    min_test_accuracy: float = 0.0,
+):
+    """Model training pipeline."""
+    if train_dataset_id is None or test_dataset_id is None:
+        # If we don't pass the IDs, this will run the feature engineering pipeline
+        dataset_trn, dataset_tst = feature_engineering()
+    else:
+        # Load the datasets from an earlier pipeline run
+        dataset_trn = client.get_artifact_version(name_id_or_prefix=train_dataset_id)
+        dataset_tst = client.get_artifact_version(name_id_or_prefix=test_dataset_id)
+
+    trained_model = model_trainer(
+        dataset_trn=dataset_trn,
+        model_type=model_type,
+    )
+
+    model_evaluator(
+        model=trained_model,
+        dataset_trn=dataset_trn,
+        dataset_tst=dataset_tst,
+        min_train_accuracy=min_train_accuracy,
+        min_test_accuracy=min_test_accuracy,
+    )
+
+
+# Use a random forest model with the chosen datasets.
+# We need to pass the IDs of the datasets into the function
+training(
+    model_type="rf",
+    train_dataset_id=dataset_trn_artifact_version.id,
+    test_dataset_id=dataset_tst_artifact_version.id,
+)
+
+rf_run = client.get_pipeline("training").last_run
+# Use an SGD classifier
+training(
+    model_type="sgd",
+    train_dataset_id=dataset_trn_artifact_version.id,
+    test_dataset_id=dataset_tst_artifact_version.id,
+)
+
+sgd_run = client.get_pipeline("training").last_run
+
+# The evaluator returns a float value with the accuracy
+
+print(rf_run.steps["model_evaluator"].output.load() > sgd_run.steps["model_evaluator"].output.load())
diff --git a/.devcontainer/zenmlQuickstart/sections/welcome/code1.py b/.devcontainer/zenmlQuickstart/sections/welcome/code1.py
new file mode 100644
index 0000000..33428c4
--- /dev/null
+++ b/.devcontainer/zenmlQuickstart/sections/welcome/code1.py
@@ -0,0 +1 @@
+print("Hello from code example 1!")
diff --git a/.devcontainer/zenmlQuickstart/sections/welcome/finish.md b/.devcontainer/zenmlQuickstart/sections/welcome/finish.md
new file mode 100644
index 0000000..8574f0c
--- /dev/null
+++ b/.devcontainer/zenmlQuickstart/sections/welcome/finish.md
@@ -0,0 +1,3 @@
+# Welcome to the ZenML Quickstart Guide 👋
+
+Awesome! Well done! You are now ready to move on to the next section.
diff --git a/.devcontainer/zenmlQuickstart/sections/welcome/image.png b/.devcontainer/zenmlQuickstart/sections/welcome/image.png
new file mode 100644
index 0000000..6826d77
Binary files /dev/null and b/.devcontainer/zenmlQuickstart/sections/welcome/image.png differ
diff --git a/.devcontainer/zenmlQuickstart/sections/welcome/welcome.md b/.devcontainer/zenmlQuickstart/sections/welcome/welcome.md
new file mode 100644
index 0000000..0c662b6
--- /dev/null
+++ b/.devcontainer/zenmlQuickstart/sections/welcome/welcome.md
@@ -0,0 +1,15 @@
+# Welcome to the ZenML Quickstart Guide 👋
+
+This VS Code extension, running in GitHub Codespaces, gives you a sandboxed place to explore ZenML.
+
+This repository is a minimalistic MLOps project intended as a starting point to learn how to put ML workflows in production. It features:
+
+- A feature engineering pipeline that loads data and prepares it for training.
+- A training pipeline that loads the preprocessed dataset and trains a model.
+- A batch inference pipeline that runs predictions on the trained model with new data.
+
+Follow along with this guide to understand how you can use ZenML to productionize your ML workflows!
+
+![Pipeline Overview](/zenmlQuickstart/assets/pipeline_overview.png)
+
+Try running the code by clicking the `run code` button in the top right corner of this panel.
diff --git a/.gitignore b/.gitignore
index 439ea1a..7877556 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,51 @@
-.zen
\ No newline at end of file
+.zen
+__pycache__/
+*.py[cod]
+*$py.class
+sectionsBackup
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
\ No newline at end of file
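To round off the model promotion flow described in `modelControlPlane/finish.md` above, here is a rough sketch of how a downstream script or step might consume whichever model version currently sits in the `production` stage. This is an illustration only: the quickstart's actual batch inference pipeline is not part of this diff, and the artifact name `sklearn_classifier` simply mirrors the `ArtifactConfig` used in the training step.

```python
# Illustrative sketch (not part of this diff): load the classifier linked to
# whichever version of the model is currently in the "production" stage.
import pandas as pd
from sklearn.datasets import load_breast_cancer
from zenml import Model

# Reference the model by stage rather than by a specific version name
production_model = Model(name="breast_cancer_classifier", version="production")

# Fetch the stored sklearn classifier artifact and materialize it
classifier = production_model.get_artifact("sklearn_classifier").load()

# Score a few held-out rows as a stand-in for real batch inference data
batch = load_breast_cancer(as_frame=True).frame.drop(columns="target").sample(5, random_state=42)
print(classifier.predict(batch))
```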