zenml-io · jonathan-hurd · Jul 29, 2024 · Jul 30, 2024 · Jul 30, 2024 · Jul 30, 2024
diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
@@ -14,10 +14,10 @@
       "extensions": [
         "/workspaces/vscode-quickstart/.devcontainer/extensions/zenml-vscode-quickstart-0.0.1.vsix"
       ]
-    },
-    "codespaces": {}
+    }
   },
   "containerEnv": {
-    "PYTHONPATH": "/root/.vscode-server/extensions/zenml.zenml-vscode-quickstart-0.0.1/zenmlQuickstart/quickstartModules:/root/.vscode-remote/extensions/zenml.zenml-vscode-quickstart-0.0.1/zenmlQuickstart/quickstartModules"
-  }
+    "PYTHONPATH": "/workspaces/vscode-quickstart/.devcontainer/zenmlQuickstart/quickstartModules"
+  },
+  "workspaceFolder": "/workspaces/vscode-quickstart/.devcontainer/zenmlQuickstart/sections"
 }
diff --git a/.devcontainer/extensions/zenml-vscode-quickstart-0.0.1.vsix b/.devcontainer/extensions/zenml-vscode-quickstart-0.0.1.vsix
diff --git a/.devcontainer/zenmlQuickstart/assets/cloud_mcp.png b/.devcontainer/zenmlQuickstart/assets/cloud_mcp.png
diff --git a/.devcontainer/zenmlQuickstart/assets/cloud_mcp_predictions.png b/.devcontainer/zenmlQuickstart/assets/cloud_mcp_predictions.png
diff --git a/.devcontainer/zenmlQuickstart/assets/cloud_mcp_screenshot.png b/.devcontainer/zenmlQuickstart/assets/cloud_mcp_screenshot.png
diff --git a/.devcontainer/zenmlQuickstart/assets/feature_engineering_pipeline.png b/.devcontainer/zenmlQuickstart/assets/feature_engineering_pipeline.png
diff --git a/.devcontainer/zenmlQuickstart/assets/inference_pipeline.png b/.devcontainer/zenmlQuickstart/assets/inference_pipeline.png
diff --git a/.devcontainer/zenmlQuickstart/assets/pipeline_overview.png b/.devcontainer/zenmlQuickstart/assets/pipeline_overview.png
diff --git a/.devcontainer/zenmlQuickstart/assets/training_pipeline.png b/.devcontainer/zenmlQuickstart/assets/training_pipeline.png
diff --git a/.devcontainer/zenmlQuickstart/quickstartModules/configs/feature_engineering.yaml b/.devcontainer/zenmlQuickstart/quickstartModules/configs/feature_engineering.yaml
@@ -0,0 +1,10 @@
+# environment configuration
+settings:
+  docker:
+    required_integrations:
+      - sklearn
+    requirements:
+      - pyarrow
+
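+# pipeline configuration (NOTE(review): the training configs nest pipeline inputs under `parameters:`; confirm this top-level key is actually applied)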
+test_size: 0.35
diff --git a/.devcontainer/zenmlQuickstart/quickstartModules/configs/inference.yaml b/.devcontainer/zenmlQuickstart/quickstartModules/configs/inference.yaml
@@ -0,0 +1,15 @@
+# environment configuration
+settings:
+  docker:
+    required_integrations:
+      - sklearn
+    requirements:
+      - pyarrow
+
+# configuration of the Model Control Plane
+model:
+  name: "breast_cancer_classifier"
+  version: "production"
+  license: Apache 2.0
+  description: A breast cancer classifier
+  tags: ["breast_cancer", "classifier"]
diff --git a/.devcontainer/zenmlQuickstart/quickstartModules/configs/training_rf.yaml b/.devcontainer/zenmlQuickstart/quickstartModules/configs/training_rf.yaml
@@ -0,0 +1,19 @@
+# environment configuration
+settings:
+  docker:
+    required_integrations:
+      - sklearn
+    requirements:
+      - pyarrow
+
+# configuration of the Model Control Plane
+model:
+  name: breast_cancer_classifier
+  version: rf
+  license: Apache 2.0
+  description: A breast cancer classifier
+  tags: ["breast_cancer", "classifier"]
+
+# Configure the pipeline
+parameters:
+  model_type: "rf"  # Choose between rf/sgd
diff --git a/.devcontainer/zenmlQuickstart/quickstartModules/configs/training_sgd.yaml b/.devcontainer/zenmlQuickstart/quickstartModules/configs/training_sgd.yaml
@@ -0,0 +1,19 @@
+# environment configuration
+settings:
+  docker:
+    required_integrations:
+      - sklearn
+    requirements:
+      - pyarrow
+
+# configuration of the Model Control Plane
+model:
+  name: breast_cancer_classifier
+  version: sgd
+  license: Apache 2.0
+  description: A breast cancer classifier
+  tags: ["breast_cancer", "classifier"]
+
+# Configure the pipeline
+parameters:
+  model_type: "sgd"  # Choose between rf/sgd
diff --git a/.devcontainer/zenmlQuickstart/quickstartModules/pipelines/__init__.py b/.devcontainer/zenmlQuickstart/quickstartModules/pipelines/__init__.py
@@ -0,0 +1,20 @@
+# Apache Software License 2.0
+#
+# Copyright (c) ZenML GmbH 2024. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from .feature_engineering import feature_engineering
+from .inference import inference
+from .training import training
diff --git a/.devcontainer/zenmlQuickstart/quickstartModules/pipelines/feature_engineering.py b/.devcontainer/zenmlQuickstart/quickstartModules/pipelines/feature_engineering.py
@@ -0,0 +1,74 @@
+# Apache Software License 2.0
+#
+# Copyright (c) ZenML GmbH 2024. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from typing import List, Optional
+
+from steps import (
+    data_loader,
+    data_preprocessor,
+    data_splitter,
+)
+
+from zenml import pipeline
+from zenml.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+@pipeline
+def feature_engineering(
+    test_size: float = 0.2,
+    drop_na: Optional[bool] = None,
+    normalize: Optional[bool] = None,
+    drop_columns: Optional[List[str]] = None,
+    target: Optional[str] = "target",
+    random_state: int = 17,
+):
+    """
+    Build the feature engineering pipeline.
+
+    Chains three steps: `data_loader`, then `data_splitter`, then
+    `data_preprocessor`, and hands back the two preprocessed datasets.
+
+    Args:
+        test_size: Forwarded to `data_splitter` as its `test_size`.
+        drop_na: Forwarded to `data_preprocessor` as its `drop_na`.
+        normalize: Forwarded to `data_preprocessor` as its `normalize`.
+        drop_columns: Forwarded to `data_preprocessor` as `drop_columns`.
+        target: Target column name, given to the loader and preprocessor.
+        random_state: Random state given to the loader and preprocessor.
+
+    Returns:
+        The preprocessed train and test datasets, in that order.
+    """
+    # Each step consumes the output of the previous one:
+    # load the data, split it, then preprocess both parts together.
+    loaded = data_loader(random_state=random_state, target=target)
+    split_trn, split_tst = data_splitter(
+        dataset=loaded,
+        test_size=test_size,
+    )
+    processed_trn, processed_tst, _ = data_preprocessor(
+        dataset_trn=split_trn,
+        dataset_tst=split_tst,
+        drop_na=drop_na,
+        normalize=normalize,
+        drop_columns=drop_columns,
+        target=target,
+        random_state=random_state,
+    )
+    return processed_trn, processed_tst
diff --git a/.devcontainer/zenmlQuickstart/quickstartModules/pipelines/inference.py b/.devcontainer/zenmlQuickstart/quickstartModules/pipelines/inference.py
@@ -0,0 +1,62 @@
+# Apache Software License 2.0
+#
+# Copyright (c) ZenML GmbH 2024. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from steps import (
+    data_loader,
+    inference_predict,
+    inference_preprocessor,
+)
+
+from zenml import get_pipeline_context, pipeline
+from zenml.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+@pipeline
+def inference(random_state: int, target: str):
+    """
+    Build the model inference pipeline.
+
+    Looks up two artifacts on the model attached to the pipeline context,
+    loads the inference data, preprocesses it with the stored preprocess
+    pipeline and feeds the result to the prediction step.
+
+    Args:
+        random_state: Random state handed to the data loader.
+        target: Name of the target column, handed to the preprocessor.
+    """
+    # Both artifacts come from the model configured for this pipeline.
+    pipeline_model = get_pipeline_context().model
+
+    # Classifier first, then the preprocess pipeline stored alongside it.
+    classifier = pipeline_model.get_artifact("sklearn_classifier")
+    fitted_preprocessor = pipeline_model.get_artifact(
+        "preprocess_pipeline"
+    )
+
+    # Wire the steps together: loader -> preprocessor -> predictor.
+    inference_data = data_loader(random_state=random_state, is_inference=True)
+    prepared_data = inference_preprocessor(
+        dataset_inf=inference_data,
+        preprocess_pipeline=fitted_preprocessor,
+        target=target,
+    )
+    inference_predict(
+        model=classifier,
+        dataset_inf=prepared_data,
+    )
diff --git a/.devcontainer/zenmlQuickstart/quickstartModules/pipelines/training.py b/.devcontainer/zenmlQuickstart/quickstartModules/pipelines/training.py
@@ -0,0 +1,81 @@
+# Apache Software License 2.0
+#
+# Copyright (c) ZenML GmbH 2024. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from typing import Optional
+from uuid import UUID
+
+from steps import model_evaluator, model_promoter, model_trainer
+
+from pipelines import (
+    feature_engineering,
+)
+from zenml import pipeline
+from zenml.client import Client
+from zenml.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+@pipeline
+def training(
+    train_dataset_id: Optional[UUID] = None,
+    test_dataset_id: Optional[UUID] = None,
+    target: Optional[str] = "target",
+    model_type: Optional[str] = "sgd",
+):
+    """
+    Model training pipeline.
+
+    Loads the datasets from a preprocessing pipeline, trains a model and
+    evaluates it. The first model trained is promoted to production; later
+    ones only if more accurate than the current production version. When a
+    dataset ID is missing, the nested `feature_engineering` pipeline is
+    run with the same `target` to produce the datasets.
+
+    Args:
+        train_dataset_id: ID of the train dataset produced by feature engineering.
+        test_dataset_id: ID of the test dataset produced by feature engineering.
+        target: Name of target column in dataset.
+        model_type: The type of model to train.
+    """
+    # Link all the steps together by calling them and passing the output
+    # of one step as the input of the next step.
+
+    # Both IDs are needed to reuse existing datasets; otherwise rebuild them.
+    if train_dataset_id is None or test_dataset_id is None:
+        dataset_trn, dataset_tst = feature_engineering(target=target)
+    else:
+        client = Client()
+        dataset_trn = client.get_artifact_version(
+            name_id_or_prefix=train_dataset_id
+        )
+        dataset_tst = client.get_artifact_version(
+            name_id_or_prefix=test_dataset_id
+        )
+
+    model = model_trainer(
+        dataset_trn=dataset_trn, target=target, model_type=model_type
+    )
+
+    acc = model_evaluator(
+        model=model,
+        dataset_trn=dataset_trn,
+        dataset_tst=dataset_tst,
+        target=target,
+    )
+
+    model_promoter(accuracy=acc)
diff --git a/.devcontainer/zenmlQuickstart/quickstartModules/steps/__init__.py b/.devcontainer/zenmlQuickstart/quickstartModules/steps/__init__.py
@@ -0,0 +1,41 @@
+# Apache Software License 2.0
+#
+# Copyright (c) ZenML GmbH 2024. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from .data_loader import (
+    data_loader,
+)
+from .data_preprocessor import (
+    data_preprocessor,
+)
+from .data_splitter import (
+    data_splitter,
+)
+from .inference_predict import (
+    inference_predict,
+)
+from .inference_preprocessor import (
+    inference_preprocessor,
+)
+from .model_evaluator import (
+    model_evaluator,
+)
+from .model_promoter import (
+    model_promoter,
+)
+from .model_trainer import (
+    model_trainer,
+)