[Project] Add log_dataset and log_model methods (mlrun#786)
Hedingber authored Mar 8, 2021
1 parent a5886ec commit 6e957a5
Showing 1 changed file with 168 additions and 1 deletion.
169 changes: 168 additions & 1 deletion mlrun/projects/project.py
@@ -14,9 +14,16 @@
import getpass
import shutil
import warnings
import typing

from ..db import get_run_db
from ..artifacts import ArtifactManager, ArtifactProducer, dict_to_artifact
from ..artifacts import (
    ArtifactManager,
    ArtifactProducer,
    dict_to_artifact,
    DatasetArtifact,
    ModelArtifact,
)
from ..secrets import SecretsStore
from ..model import ModelObj
import tarfile
@@ -37,6 +44,7 @@
    get_object,
    wait_for_pipeline_completion,
)
from ..features import Feature
import importlib.util as imputil
from urllib.parse import urlparse
from kfp import compiler
@@ -49,6 +57,7 @@
)
from ..runtimes.utils import add_code_metadata
import mlrun.api.schemas
import mlrun.errors
import mlrun.api.utils.projects.leader


@@ -798,6 +807,164 @@ def log_artifact(
        self.spec.set_artifact(item.key, item.base_dict())
        return item

    def log_dataset(
        self,
        key,
        df,
        tag="",
        local_path=None,
        artifact_path=None,
        upload=True,
        labels=None,
        format="",
        preview=None,
        stats=False,
        target_path="",
        extra_data=None,
        **kwargs,
    ):
"""
log a dataset artifact and optionally upload it to datastore
example::
raw_data = {
"first_name": ["Jason", "Molly", "Tina", "Jake", "Amy"],
"last_name": ["Miller", "Jacobson", "Ali", "Milner", "Cooze"],
"age": [42, 52, 36, 24, 73],
"testScore": [25, 94, 57, 62, 70],
}
df = pd.DataFrame(raw_data, columns=["first_name", "last_name", "age", "testScore"])
context.log_dataset("mydf", df=df, stats=True)
:param key: artifact key
:param df: dataframe object
:param local_path: path to the local file we upload, will also be use
as the destination subpath (under "artifact_path")
:param artifact_path: target artifact path (when not using the default)
to define a subpath under the default location use:
`artifact_path=context.artifact_subpath('data')`
:param tag: version tag
:param format: optional, format to use (e.g. csv, parquet, ..)
:param target_path: absolute target path (instead of using artifact_path + local_path)
:param preview: number of lines to store as preview in the artifact metadata
:param stats: calculate and store dataset stats in the artifact metadata
:param extra_data: key/value list of extra files/charts to link with this dataset
:param upload: upload to datastore (default is True)
:param labels: a set of key/value labels to tag the artifact with
:returns: artifact object
"""
        ds = DatasetArtifact(
            key,
            df,
            preview=preview,
            extra_data=extra_data,
            format=format,
            stats=stats,
            **kwargs,
        )

        item = self.log_artifact(
            ds,
            local_path=local_path,
            artifact_path=artifact_path or self.spec.artifact_path,
            target_path=target_path,
            tag=tag,
            upload=upload,
            labels=labels,
        )
        return item

    def log_model(
        self,
        key,
        body=None,
        framework="",
        tag="",
        model_dir=None,
        model_file=None,
        metrics=None,
        parameters=None,
        artifact_path=None,
        upload=True,
        labels=None,
        inputs: typing.List[Feature] = None,
        outputs: typing.List[Feature] = None,
        feature_vector: str = None,
        feature_weights: list = None,
        training_set=None,
        label_column=None,
        extra_data=None,
    ):
"""log a model artifact and optionally upload it to datastore
example::
context.log_model("model", body=dumps(model),
model_file="model.pkl",
metrics=context.results,
training_set=training_df,
label_column='label',
feature_vector=feature_vector_uri,
labels={"app": "fraud"})
:param key: artifact key or artifact class ()
:param body: will use the body as the artifact content
:param model_file: path to the local model file we upload (seel also model_dir)
:param model_dir: path to the local dir holding the model file and extra files
:param artifact_path: target artifact path (when not using the default)
to define a subpath under the default location use:
`artifact_path=context.artifact_subpath('data')`
:param framework: name of the ML framework
:param tag: version tag
:param metrics: key/value dict of model metrics
:param parameters: key/value dict of model parameters
:param inputs: ordered list of model input features (name, type, ..)
:param outputs: ordered list of model output/result elements (name, type, ..)
:param upload: upload to datastore (default is True)
:param labels: a set of key/value labels to tag the artifact with
:param feature_vector: feature store feature vector uri (store://feature-vectors/<project>/<name>[:tag])
:param feature_weights: list of feature weights, one per input column
:param training_set: training set dataframe, used to infer inputs & outputs
:param label_column: which columns in the training set are the label (target) columns
:param extra_data: key/value list of extra files/charts to link with this dataset
value can be abs/relative path string | bytes | artifact object
:returns: artifact object
"""

        if training_set is not None and inputs:
            raise mlrun.errors.MLRunInvalidArgumentError(
                "cannot specify inputs and training set together"
            )

        model = ModelArtifact(
            key,
            body,
            model_file=model_file,
            metrics=metrics,
            parameters=parameters,
            inputs=inputs,
            outputs=outputs,
            framework=framework,
            feature_vector=feature_vector,
            feature_weights=feature_weights,
            extra_data=extra_data,
        )
        if training_set is not None:
            model.infer_from_df(training_set, label_column)

        item = self.log_artifact(
            model,
            local_path=model_dir,
            artifact_path=artifact_path or self.spec.artifact_path,
            tag=tag,
            upload=upload,
            labels=labels,
        )
        return item

    def reload(self, sync=False):
        """reload the project and function objects from yaml/specs
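
For context, a minimal usage sketch of the new project-level log_dataset method. The project handle, project name, and dataframe contents below are illustrative assumptions, not part of the diff:

    import mlrun
    import pandas as pd

    # hypothetical project; any existing project object works the same way
    project = mlrun.new_project("my-project")

    raw_data = {
        "first_name": ["Jason", "Molly", "Tina"],
        "age": [42, 52, 36],
    }
    df = pd.DataFrame(raw_data)

    # log the dataframe as a dataset artifact with preview rows and computed stats
    dataset = project.log_dataset("mydf", df=df, format="csv", stats=True)

As the diff shows, the call wraps the dataframe in a DatasetArtifact and routes it through the existing log_artifact flow, so tag, upload, and labels behave as they do for any other artifact.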

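Similarly, a sketch of log_model with schema inference from a training set, reusing the hypothetical project handle from the previous sketch; the model, dataframe, and names here are assumptions for illustration:

    import pandas as pd
    from pickle import dumps
    from sklearn.linear_model import LogisticRegression

    # small illustrative training set; the target column is named "label"
    training_df = pd.DataFrame(
        {"x1": [0.1, 0.4, 0.9, 0.3], "x2": [1.0, 0.5, 0.2, 0.8], "label": [0, 0, 1, 1]}
    )
    model = LogisticRegression().fit(training_df[["x1", "x2"]], training_df["label"])

    # inputs/outputs feature schemas are inferred from training_set + label_column
    item = project.log_model(
        "my_model",
        body=dumps(model),
        model_file="model.pkl",
        framework="sklearn",
        training_set=training_df,
        label_column="label",
        labels={"app": "demo"},
    )

Note the guard added in this commit: passing both training_set and inputs raises MLRunInvalidArgumentError, since the inferred schema and an explicit one would conflict.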