diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 33a6089..589f31a 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -14,10 +14,10 @@ "extensions": [ "/workspaces/vscode-quickstart/.devcontainer/extensions/zenml-vscode-quickstart-0.0.1.vsix" ] - }, - "codespaces": {} + } }, "containerEnv": { - "PYTHONPATH": "/root/.vscode-server/extensions/zenml.zenml-vscode-quickstart-0.0.1/zenmlQuickstart/quickstartModules:/root/.vscode-remote/extensions/zenml.zenml-vscode-quickstart-0.0.1/zenmlQuickstart/quickstartModules" - } + "PYTHONPATH": "/workspaces/vscode-quickstart/.devcontainer/zenmlQuickstart/quickstartModules" + }, + "workspaceFolder": "/workspaces/vscode-quickstart/.devcontainer/zenmlQuickstart/sections" } diff --git a/.devcontainer/extensions/zenml-vscode-quickstart-0.0.1.vsix b/.devcontainer/extensions/zenml-vscode-quickstart-0.0.1.vsix index 6e9df86..cdc4279 100644 Binary files a/.devcontainer/extensions/zenml-vscode-quickstart-0.0.1.vsix and b/.devcontainer/extensions/zenml-vscode-quickstart-0.0.1.vsix differ diff --git a/.devcontainer/zenmlQuickstart/assets/cloud_mcp.png b/.devcontainer/zenmlQuickstart/assets/cloud_mcp.png new file mode 100644 index 0000000..81197e9 Binary files /dev/null and b/.devcontainer/zenmlQuickstart/assets/cloud_mcp.png differ diff --git a/.devcontainer/zenmlQuickstart/assets/cloud_mcp_predictions.png b/.devcontainer/zenmlQuickstart/assets/cloud_mcp_predictions.png new file mode 100644 index 0000000..a6bf7c9 Binary files /dev/null and b/.devcontainer/zenmlQuickstart/assets/cloud_mcp_predictions.png differ diff --git a/.devcontainer/zenmlQuickstart/assets/cloud_mcp_screenshot.png b/.devcontainer/zenmlQuickstart/assets/cloud_mcp_screenshot.png new file mode 100644 index 0000000..8f56def Binary files /dev/null and b/.devcontainer/zenmlQuickstart/assets/cloud_mcp_screenshot.png differ diff --git a/.devcontainer/zenmlQuickstart/assets/feature_engineering_pipeline.png 
b/.devcontainer/zenmlQuickstart/assets/feature_engineering_pipeline.png new file mode 100644 index 0000000..db30191 Binary files /dev/null and b/.devcontainer/zenmlQuickstart/assets/feature_engineering_pipeline.png differ diff --git a/.devcontainer/zenmlQuickstart/assets/inference_pipeline.png b/.devcontainer/zenmlQuickstart/assets/inference_pipeline.png new file mode 100644 index 0000000..358d553 Binary files /dev/null and b/.devcontainer/zenmlQuickstart/assets/inference_pipeline.png differ diff --git a/.devcontainer/zenmlQuickstart/assets/pipeline_overview.png b/.devcontainer/zenmlQuickstart/assets/pipeline_overview.png new file mode 100644 index 0000000..609e97d Binary files /dev/null and b/.devcontainer/zenmlQuickstart/assets/pipeline_overview.png differ diff --git a/.devcontainer/zenmlQuickstart/assets/training_pipeline.png b/.devcontainer/zenmlQuickstart/assets/training_pipeline.png new file mode 100644 index 0000000..a2e6a7d Binary files /dev/null and b/.devcontainer/zenmlQuickstart/assets/training_pipeline.png differ diff --git a/.devcontainer/zenmlQuickstart/quickstartModules/configs/feature_engineering.yaml b/.devcontainer/zenmlQuickstart/quickstartModules/configs/feature_engineering.yaml new file mode 100644 index 0000000..d5ab212 --- /dev/null +++ b/.devcontainer/zenmlQuickstart/quickstartModules/configs/feature_engineering.yaml @@ -0,0 +1,10 @@ +# environment configuration +settings: + docker: + required_integrations: + - sklearn + requirements: + - pyarrow + +# pipeline configuration +test_size: 0.35 \ No newline at end of file diff --git a/.devcontainer/zenmlQuickstart/quickstartModules/configs/inference.yaml b/.devcontainer/zenmlQuickstart/quickstartModules/configs/inference.yaml new file mode 100644 index 0000000..1dcefe4 --- /dev/null +++ b/.devcontainer/zenmlQuickstart/quickstartModules/configs/inference.yaml @@ -0,0 +1,15 @@ +# environment configuration +settings: + docker: + required_integrations: + - sklearn + requirements: + - pyarrow + +# 
configuration of the Model Control Plane +model: + name: "breast_cancer_classifier" + version: "production" + license: Apache 2.0 + description: A breast cancer classifier + tags: ["breast_cancer", "classifier"] \ No newline at end of file diff --git a/.devcontainer/zenmlQuickstart/quickstartModules/configs/training_rf.yaml b/.devcontainer/zenmlQuickstart/quickstartModules/configs/training_rf.yaml new file mode 100644 index 0000000..8d75610 --- /dev/null +++ b/.devcontainer/zenmlQuickstart/quickstartModules/configs/training_rf.yaml @@ -0,0 +1,19 @@ +# environment configuration +settings: + docker: + required_integrations: + - sklearn + requirements: + - pyarrow + +# configuration of the Model Control Plane +model: + name: breast_cancer_classifier + version: rf + license: Apache 2.0 + description: A breast cancer classifier + tags: ["breast_cancer", "classifier"] + +# Configure the pipeline +parameters: + model_type: "rf" # Choose between rf/sgd diff --git a/.devcontainer/zenmlQuickstart/quickstartModules/configs/training_sgd.yaml b/.devcontainer/zenmlQuickstart/quickstartModules/configs/training_sgd.yaml new file mode 100644 index 0000000..857cdf7 --- /dev/null +++ b/.devcontainer/zenmlQuickstart/quickstartModules/configs/training_sgd.yaml @@ -0,0 +1,19 @@ +# environment configuration +settings: + docker: + required_integrations: + - sklearn + requirements: + - pyarrow + +# configuration of the Model Control Plane +model: + name: breast_cancer_classifier + version: sgd + license: Apache 2.0 + description: A breast cancer classifier + tags: ["breast_cancer", "classifier"] + +# Configure the pipeline +parameters: + model_type: "sgd" # Choose between rf/sgd \ No newline at end of file diff --git a/.devcontainer/zenmlQuickstart/quickstartModules/pipelines/__init__.py b/.devcontainer/zenmlQuickstart/quickstartModules/pipelines/__init__.py new file mode 100644 index 0000000..6d8015e --- /dev/null +++ b/.devcontainer/zenmlQuickstart/quickstartModules/pipelines/__init__.py 
@@ -0,0 +1,20 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2024. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from .feature_engineering import feature_engineering +from .inference import inference +from .training import training diff --git a/.devcontainer/zenmlQuickstart/quickstartModules/pipelines/feature_engineering.py b/.devcontainer/zenmlQuickstart/quickstartModules/pipelines/feature_engineering.py new file mode 100644 index 0000000..edd87b6 --- /dev/null +++ b/.devcontainer/zenmlQuickstart/quickstartModules/pipelines/feature_engineering.py @@ -0,0 +1,74 @@ +# Apache Software License 2.0 +# +# Copyright (c) ZenML GmbH 2024. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
@pipeline
def feature_engineering(
    test_size: float = 0.2,
    drop_na: Optional[bool] = None,
    normalize: Optional[bool] = None,
    drop_columns: Optional[List[str]] = None,
    target: Optional[str] = "target",
    random_state: int = 17,
):
    """Feature engineering pipeline.

    Chains three steps: load the raw dataset, split it into train and test
    sets, then run both splits through the preprocessing step.

    Args:
        test_size: Fraction (0.0..1.0) of the data held out as the test set.
        drop_na: If `True`, NA values are removed from the dataset.
        normalize: If `True`, the dataset is normalized with MinMaxScaler.
        drop_columns: List of columns to drop from the dataset.
        target: Name of the target column in the dataset.
        random_state: Random state forwarded to the data loader and to the
            preprocessing step.

    Returns:
        The processed datasets (dataset_trn, dataset_tst).
    """
    # Each step consumes the outputs of the previous one.
    loaded = data_loader(random_state=random_state, target=target)
    raw_trn, raw_tst = data_splitter(dataset=loaded, test_size=test_size)
    # The preprocessor has a third output (the fitted preprocessing
    # pipeline); it is not part of this pipeline's return value.
    processed_trn, processed_tst, _ = data_preprocessor(
        dataset_trn=raw_trn,
        dataset_tst=raw_tst,
        drop_na=drop_na,
        normalize=normalize,
        drop_columns=drop_columns,
        target=target,
        random_state=random_state,
    )
    return processed_trn, processed_tst
# pipelines/inference.py
@pipeline
def inference(random_state: int, target: str):
    """Model inference pipeline.

    Fetches the trained classifier and the fitted preprocessing pipeline
    from the model attached to the pipeline context, loads the inference
    subset of the data, preprocesses it and runs the prediction step.

    Args:
        random_state: Random state for reproducibility.
        target: Name of target column in dataset.
    """
    # Both artifacts are looked up on the model of the pipeline context.
    classifier = get_pipeline_context().model.get_artifact(
        "sklearn_classifier"
    )
    fitted_preprocessing = get_pipeline_context().model.get_artifact(
        "preprocess_pipeline"
    )

    # Load -> preprocess -> predict, each step feeding the next one.
    raw_inference = data_loader(random_state=random_state, is_inference=True)
    prepared_inference = inference_preprocessor(
        dataset_inf=raw_inference,
        preprocess_pipeline=fitted_preprocessing,
        target=target,
    )
    inference_predict(model=classifier, dataset_inf=prepared_inference)


# pipelines/training.py
@pipeline
def training(
    train_dataset_id: Optional[UUID] = None,
    test_dataset_id: Optional[UUID] = None,
    target: Optional[str] = "target",
    model_type: Optional[str] = "sgd",
):
    """Model training pipeline.

    Obtains a train/test dataset pair, trains a model on the train set,
    evaluates it on both sets and hands the resulting accuracy to the
    promotion step.

    Args:
        train_dataset_id: ID of the train dataset produced by feature
            engineering.
        test_dataset_id: ID of the test dataset produced by feature
            engineering.
        target: Name of target column in dataset.
        model_type: The type of model to train.
    """
    ids_supplied = (
        train_dataset_id is not None and test_dataset_id is not None
    )
    if ids_supplied:
        # Reuse the dataset artifacts of an earlier run.
        client = Client()
        dataset_trn = client.get_artifact_version(
            name_id_or_prefix=train_dataset_id
        )
        dataset_tst = client.get_artifact_version(
            name_id_or_prefix=test_dataset_id
        )
    else:
        # At least one ID is missing: run feature engineering with its
        # default parameters to produce both datasets.
        dataset_trn, dataset_tst = feature_engineering()

    trained_model = model_trainer(
        dataset_trn=dataset_trn, target=target, model_type=model_type
    )

    test_accuracy = model_evaluator(
        model=trained_model,
        dataset_trn=dataset_trn,
        dataset_tst=dataset_tst,
        target=target,
    )

    model_promoter(accuracy=test_accuracy)
@step
def data_loader(
    random_state: int, is_inference: bool = False, target: str = "target"
) -> Annotated[pd.DataFrame, "dataset"]:
    """Dataset reader step.

    Loads the scikit-learn Breast Cancer dataset as a DataFrame and samples
    5% of its rows as the inference subset.

    This step is parameterized, which allows you to configure the step
    independently of the step code, before running it in a pipeline.
    See the documentation for more information:

        https://docs.zenml.io/how-to/build-pipelines/use-pipeline-step-parameters

    Args:
        random_state: Random state used to sample the inference subset.
        is_inference: If `True`, only the inference subset is returned and
            the target column is removed from it. Otherwise the inference
            subset is removed and the remaining rows are returned.
        target: Name of the target column in the dataset.

    Returns:
        The dataset as a pandas DataFrame with its index reset.
    """
    bunch = load_breast_cancer(as_frame=True)
    holdout_count = int(len(bunch.target) * 0.05)
    frame: pd.DataFrame = bunch.frame
    holdout = frame.sample(holdout_count, random_state=random_state)
    if not is_inference:
        # Training mode: keep everything except the inference rows.
        frame.drop(holdout.index, inplace=True)
    else:
        # Inference mode: keep only the sampled rows, without labels.
        frame = holdout
        frame.drop(columns=target, inplace=True)
    frame.reset_index(drop=True, inplace=True)
    logger.info(f"Dataset with {len(frame)} records loaded!")
    return frame
@step
def data_preprocessor(
    random_state: int,
    dataset_trn: pd.DataFrame,
    dataset_tst: pd.DataFrame,
    drop_na: Optional[bool] = None,
    normalize: Optional[bool] = None,
    drop_columns: Optional[List[str]] = None,
    target: Optional[str] = "target",
) -> Tuple[
    Annotated[pd.DataFrame, "dataset_trn"],
    Annotated[pd.DataFrame, "dataset_tst"],
    Annotated[Pipeline, "preprocess_pipeline"],
]:
    """Data preprocessor step.

    Assembles an sklearn `Pipeline` from the requested preprocessing stages,
    fits it on the train dataset and applies it to both the train and the
    test dataset.

    This step is parameterized, which allows you to configure the step
    independently of the step code, before running it in a pipeline.
    See the documentation for more information:

        https://docs.zenml.io/how-to/build-pipelines/use-pipeline-step-parameters

    Args:
        random_state: Random state; it is only recorded in the metadata of
            the `preprocess_pipeline` artifact by this step.
        dataset_trn: The train dataset.
        dataset_tst: The test dataset.
        drop_na: If `True` all NA rows will be dropped.
        normalize: If `True` all numeric fields will be normalized.
        drop_columns: List of column names to drop.
        target: Name of target column in dataset.

    Returns:
        The processed datasets (dataset_trn, dataset_tst) and fitted `Pipeline` object.
    """
    # Collect the stages first; the leading "passthrough" entry keeps the
    # pipeline non-empty even when no optional stage is requested.
    stages = [("passthrough", "passthrough")]
    if drop_na:
        stages.append(("drop_na", NADropper()))
    if drop_columns:
        stages.append(("drop_columns", ColumnsDropper(drop_columns)))
    if normalize:
        stages.append(("normalize", MinMaxScaler()))
    # The final stage is always the caster, built from the train columns.
    stages.append(("cast", DataFrameCaster(dataset_trn.columns)))
    preprocess_pipeline = Pipeline(stages)

    # Fit on the train split only, then reuse the fitted stages for test.
    processed_trn = preprocess_pipeline.fit_transform(dataset_trn)
    processed_tst = preprocess_pipeline.transform(dataset_tst)

    # Log metadata so we can load it in the inference pipeline
    log_artifact_metadata(
        artifact_name="preprocess_pipeline",
        metadata={"random_state": random_state, "target": target},
    )
    return processed_trn, processed_tst, preprocess_pipeline
# steps/data_splitter.py
@step
def data_splitter(
    dataset: pd.DataFrame, test_size: float = 0.2
) -> Tuple[
    Annotated[pd.DataFrame, "raw_dataset_trn"],
    Annotated[pd.DataFrame, "raw_dataset_tst"],
]:
    """Dataset splitter step.

    Shuffles the dataset and splits it into a train and a test set. The
    split uses a fixed random state (42), so it is reproducible.

    Args:
        dataset: Dataset read from source.
        test_size: 0.0..1.0 defining portion of test set.

    Returns:
        The split dataset: dataset_trn, dataset_tst.
    """
    trn_part, tst_part = train_test_split(
        dataset,
        test_size=test_size,
        random_state=42,
        shuffle=True,
    )
    columns = dataset.columns
    dataset_trn = pd.DataFrame(trn_part, columns=columns)
    dataset_tst = pd.DataFrame(tst_part, columns=columns)
    return dataset_trn, dataset_tst


# steps/inference_predict.py
@step
def inference_predict(
    model: Any,
    dataset_inf: pd.DataFrame,
) -> Annotated[pd.Series, "predictions"]:
    """Predictions step.

    Calls `model.predict` on the inference dataset and wraps the result in
    a pandas Series named "predicted".

    Args:
        model: Trained model.
        dataset_inf: The inference dataset.

    Returns:
        The predictions as pandas series
    """
    return pd.Series(model.predict(dataset_inf), name="predicted")


# steps/inference_preprocessor.py
@step
def inference_preprocessor(
    dataset_inf: pd.DataFrame,
    preprocess_pipeline: Pipeline,
    target: str,
) -> Annotated[pd.DataFrame, "inference_dataset"]:
    """Data preprocessor step for inference.

    Applies an already fitted preprocessing pipeline to the inference
    dataset. A placeholder target column is added before the transform and
    removed afterwards.

    Args:
        dataset_inf: The inference dataset. It is not modified.
        preprocess_pipeline: Pretrained `Pipeline` to process dataset.
        target: Name of target columns in dataset.

    Returns:
        The processed dataframe: dataset_inf.
    """
    # Work on a copy so the caller's frame is left untouched.
    prepared = dataset_inf.copy()
    # artificially adding `target` column to avoid Pipeline issues.
    # A scalar is assigned (rather than a freshly indexed Series) so every
    # row gets the placeholder even when the frame's index is not 0..n-1.
    prepared[target] = 1
    transformed = preprocess_pipeline.transform(prepared)
    return transformed.drop(columns=[target])


# steps/model_evaluator.py
@step
def model_evaluator(
    model: ClassifierMixin,
    dataset_trn: pd.DataFrame,
    dataset_tst: pd.DataFrame,
    min_train_accuracy: float = 0.0,
    min_test_accuracy: float = 0.0,
    target: Optional[str] = "target",
) -> float:
    """Evaluate a trained model.

    Scores the model on the train and the test dataset, logs both
    accuracies, emits a warning for every accuracy that is below its
    minimum threshold and records both accuracies as metadata of the
    `sklearn_classifier` artifact.

    This step is parameterized to configure the step independently of the
    step code, before running it in a pipeline. In this example, the step
    can be configured to use different values for the acceptable model
    performance thresholds. See the documentation for more information:

        https://docs.zenml.io/how-to/build-pipelines/use-pipeline-step-parameters

    Args:
        model: The pre-trained model artifact.
        dataset_trn: The train dataset.
        dataset_tst: The test dataset.
        min_train_accuracy: Minimal acceptable training accuracy value.
        min_test_accuracy: Minimal acceptable testing accuracy value.
        target: Name of target column in dataset.

    Returns:
        The model accuracy on the test set.
    """
    # Calculate the model accuracy on the train and test set
    trn_acc = model.score(
        dataset_trn.drop(columns=[target]),
        dataset_trn[target],
    )
    tst_acc = model.score(
        dataset_tst.drop(columns=[target]),
        dataset_tst[target],
    )
    logger.info(f"Train accuracy={trn_acc*100:.2f}%")
    logger.info(f"Test accuracy={tst_acc*100:.2f}%")

    messages = []
    if trn_acc < min_train_accuracy:
        messages.append(
            f"Train accuracy {trn_acc*100:.2f}% is below {min_train_accuracy*100:.2f}% !"
        )
    if tst_acc < min_test_accuracy:
        messages.append(
            f"Test accuracy {tst_acc*100:.2f}% is below {min_test_accuracy*100:.2f}% !"
        )
    # Emit every collected warning. This loop must not hang off the test
    # accuracy check, otherwise a failing test accuracy is never reported.
    for message in messages:
        logger.warning(message)

    log_artifact_metadata(
        metadata={
            "train_accuracy": float(trn_acc),
            "test_accuracy": float(tst_acc),
        },
        artifact_name="sklearn_classifier",
    )
    return float(tst_acc)
from zenml import get_step_context, step
from zenml.client import Client
from zenml.logger import get_logger

logger = get_logger(__name__)


@step
def model_promoter(accuracy: float, stage: str = "production") -> bool:
    """Conditionally promote the current model version to a stage.

    The model is promoted only if its accuracy is at least 80% and it is
    strictly more accurate than the model version currently occupying
    ``stage`` (compared against the ``test_accuracy`` run metadata of that
    version's ``sklearn_classifier`` artifact). If the lookup of the
    incumbent raises ``KeyError``, there is nothing to compare against and
    the current model is promoted.

    Args:
        accuracy: Accuracy of the model, as a fraction (0.8 == 80%).
        stage: Which stage to promote the model to.

    Returns:
        True if the current model version was moved to ``stage`` by this
        step, False otherwise.
    """
    if accuracy < 0.8:
        logger.info(
            f"Model accuracy {accuracy*100:.2f}% is below 80% ! Not promoting model."
        )
        return False

    # Model version attached to the currently running step.
    current_model = get_step_context().model

    # Look up the accuracy of the model version that currently holds `stage`.
    client = Client()
    try:
        stage_model = client.get_model_version(current_model.name, stage)
        prod_accuracy = float(
            stage_model.get_artifact("sklearn_classifier")
            .run_metadata["test_accuracy"]
            .value
        )
    except KeyError:
        # Same handling as before: a KeyError means there is no usable
        # incumbent to compare with, so the current model wins by default.
        # NOTE(review): this also covers an incumbent whose artifact has no
        # "test_accuracy" metadata - confirm that is intended.
        prod_accuracy = None

    # Only a strictly better model replaces the incumbent. Previously the step
    # logged "promoted" and returned True even on this path, although
    # set_stage() was never called.
    if prod_accuracy is not None and not (float(accuracy) > prod_accuracy):
        logger.info(
            f"Model accuracy {accuracy*100:.2f}% does not exceed the current "
            f"{stage} model accuracy {prod_accuracy*100:.2f}%. Not promoting model."
        )
        return False

    current_model.set_stage(stage, force=True)
    logger.info(f"Model promoted to {stage}!")
    return True
from typing import Optional

import pandas as pd
from sklearn.base import ClassifierMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from typing_extensions import Annotated

from zenml import ArtifactConfig, step
from zenml.logger import get_logger

logger = get_logger(__name__)


@step
def model_trainer(
    dataset_trn: pd.DataFrame,
    model_type: str = "sgd",
    target: Optional[str] = "target",
) -> Annotated[
    ClassifierMixin,
    ArtifactConfig(name="sklearn_classifier", is_model_artifact=True),
]:
    """Build a classifier of the requested type and fit it on the train set.

    The estimator is created with its default hyperparameters, fitted on
    every column of ``dataset_trn`` except ``target`` (features) against the
    ``target`` column (labels), and returned as the ``sklearn_classifier``
    step output artifact.

    Args:
        dataset_trn: The preprocessed train dataset.
        model_type: The type of model to train: ``"sgd"`` for
            ``SGDClassifier`` or ``"rf"`` for ``RandomForestClassifier``.
        target: The name of the target column in the dataset.

    Returns:
        The trained model artifact.

    Raises:
        ValueError: If the model type is not supported.
    """
    # Supported model types mapped to the estimator class to instantiate.
    estimators = {
        "sgd": SGDClassifier,
        "rf": RandomForestClassifier,
    }
    estimator_cls = estimators.get(model_type)
    if estimator_cls is None:
        raise ValueError(f"Unknown model type {model_type}")

    model = estimator_cls()
    logger.info(f"Training model {model}...")

    features = dataset_trn.drop(columns=[target])
    labels = dataset_trn[target]
    model.fit(features, labels)
    return model
# ---- quickstartModules/utils/preprocess.py ----

from typing import Union

import pandas as pd


class NADropper:
    """Pipeline-compatible transformer that removes entries containing NA."""

    def fit(self, *args, **kwargs):
        """Do nothing (stateless transformer) and return ``self``."""
        return self

    def transform(self, X: Union[pd.DataFrame, pd.Series]):
        """Return ``X.dropna()``."""
        without_na = X.dropna()
        return without_na


class ColumnsDropper:
    """Pipeline-compatible transformer that removes a fixed set of columns."""

    def __init__(self, columns):
        # Column labels handed to ``drop`` on every ``transform`` call.
        self.columns = columns

    def fit(self, *args, **kwargs):
        """Do nothing (stateless transformer) and return ``self``."""
        return self

    def transform(self, X: Union[pd.DataFrame, pd.Series]):
        """Return ``X`` with the configured columns dropped."""
        to_remove = self.columns
        return X.drop(columns=to_remove)


class DataFrameCaster:
    """Pipeline-compatible transformer that wraps its input in a DataFrame."""

    def __init__(self, columns):
        # Column labels passed to the ``pd.DataFrame`` constructor.
        self.columns = columns

    def fit(self, *args, **kwargs):
        """Do nothing (stateless transformer) and return ``self``."""
        return self

    def transform(self, X):
        """Return ``pd.DataFrame(X)`` built with the configured columns."""
        labels = self.columns
        return pd.DataFrame(X, columns=labels)


# ---- sections/basicPipeline/code2.py ----

from zenml import pipeline, step


@step
def first_step() -> int:
    """Produce the constant input value of the pipeline."""
    return 1


@step
def second_step(some_value: int) -> int:
    """Return twice the value received from the previous step."""
    return some_value * 2


@pipeline
def my_first_pipeline():
    """Wire the output of ``first_step`` into ``second_step``."""
    first_output = first_step()
    second_step(first_output)


# Calling the pipeline function triggers a run.
my_first_pipeline()
mode 100644 index 0000000..17b6b93 --- /dev/null +++ b/.devcontainer/zenmlQuickstart/sections/basicPipeline/connectToZenmlCloud.html @@ -0,0 +1,23 @@ +
+ If you are using + ZenML Cloud, you can enter + your tenant URL in the input box below to connect to it. That way you can + see the results of all your runs in your dashboard. +
+ ++ ZenML Cloud is a managed service that provides a hosted ZenML environment. + It allows you to run your pipelines on the cloud, manage your metadata, and + collaborate with your team. Sign up at + ZenML Cloud + for a free trial and to get started! +
+
+ If you want to view your pipeline runs in the ZenML Dashboard but aren't
+ using ZenML Cloud, you can run a local version of the
+ dashboard by running zenml up
in the terminal or clicking
+ the button below.
+
+ If you are running this quickstart in GitHub Codespaces, you'll get a pop-up in + the bottom-right corner when your dashboard is ready. Follow the link to + open the dashboard in a new tab. +
++ Otherwise, VS Code may open the browser tab automatically. +
+
+ Log in to the Dashboard using the default credentials (username default
+ and password left blank) and follow the short onboarding flow.
+ From there you can inspect the pipeline or the
+ specific pipeline run.
+