feat: add kubeflow dockerfile (#996)
MartinBernstorff authored Sep 2, 2024
1 parent 45f9ee9 commit 4c4f1f0
Showing 26 changed files with 193 additions and 55 deletions.
18 changes: 16 additions & 2 deletions .dockerignore
@@ -185,12 +185,26 @@ psycop/tests/test_eval_results/*
psycop/**/*.md

# Wandb
/wandb
**/wandb
multirun

# Ignore the outputs for publishing folder
**/outputs_for_publishing
pytest.xml
joblib
/outputs
.git

# model checkpoints
/data

# logging
lightning_logs/*
logs/*
mlruns/*
# Allow mapping file to be uploaded
!**/diagnosis_code_mapping.json
!.devcontainer/devcontainer.json

# Ignore testmon database
.testmon*
*.db
50 changes: 50 additions & 0 deletions .github/workflows/kubeflow-publish.yml
@@ -0,0 +1,50 @@
name: kubeflow-publish
on:
push:
branches:
- "main"
pull_request:
branches:
- "main"
workflow_dispatch:

jobs:
release:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

- name: Set up QEMU
uses: docker/setup-qemu-action@v3

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3

- name: Login to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.repository_owner }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Docker meta
id: meta
uses: docker/metadata-action@v5
with:
# list of Docker images to use as base name for tags
images: |
ghcr.io/aarhus-psychiatry-research/psycop-common-kubeflow
# generate Docker tags based on the following events/attributes
tags: |
type=semver,pattern={{version}}
type=semver,pattern={{major}}.{{minor}}
type=semver,pattern={{major}}
- name: Build and push Docker image
uses: docker/build-push-action@v6
with:
context: .
push: ${{ github.ref == 'refs/heads/main' }}
platforms: linux/amd64
file: Kubeflow
tags: ghcr.io/aarhus-psychiatry-research/psycop-common-kubeflow:latest
5 changes: 2 additions & 3 deletions .github/workflows/lint.yml
@@ -21,12 +21,11 @@ jobs:
- uses: actions/setup-python@v4
with:
python-version: "3.10"
cache: "pip"

- name: Install pre-commit
run: pip install pre-commit
run: pip install pre-commit invoke

- name: Lint
id: pre_commit
run: |
pre-commit run --color always --all-files
inv lint
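The Lint step now runs through invoke (inv lint) instead of calling pre-commit directly. The repository's tasks.py is not part of this diff; a hypothetical sketch of an equivalent task, simply wrapping the previous CI command:

from invoke import task

@task
def lint(c):
    # Mirrors the old CI step; pre-commit runs all hooks across the repo.
    c.run("pre-commit run --color always --all-files")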
1 change: 0 additions & 1 deletion .github/workflows/type_check.yml
@@ -19,7 +19,6 @@ jobs:
id: setup_python
with:
python-version: "3.10"
cache: "pip"

- name: Install dependencies
shell: bash
3 changes: 2 additions & 1 deletion .pre-commit-config.yaml
@@ -2,12 +2,13 @@ default_stages: [commit]

repos:
- repo: https://github.com/charliermarsh/ruff-pre-commit
rev: v0.4.4
rev: v0.6.3
hooks:
- id: ruff
args:
[
"--fix",
"--unsafe-fixes",
]
- id: ruff-format

73 changes: 73 additions & 0 deletions Kubeflow
@@ -0,0 +1,73 @@
FROM kubeflownotebookswg/codeserver-python:v1.9.0

# Switch over to root for building the custom image
USER root

# Install system dependencies including odbc and FreeTDS driver
RUN apt-get update && apt-get install -y \
unixodbc-dev \
unixodbc \
tdsodbc \
curl \
gcc \
g++ \
build-essential \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

# Add FreeTDS driver to odbcinst
RUN cat <<EOF > /etc/odbcinst.ini
[FreeTDS]
Description = FreeTDS Driver
Driver = /usr/lib/x86_64-linux-gnu/odbc/libtdsodbc.so
Setup = /usr/lib/x86_64-linux-gnu/odbc/libtdsS.so
EOF

# Install msodbcsql18 and mssql-tools18
RUN \
if ! [[ "18.04 20.04 22.04 23.04 24.04" == *"$(lsb_release -rs)"* ]]; then \
echo "Ubuntu $(lsb_release -rs) is not currently supported."; \
exit; \
fi

RUN curl https://packages.microsoft.com/keys/microsoft.asc | tee /etc/apt/trusted.gpg.d/microsoft.asc \
&& curl https://packages.microsoft.com/config/ubuntu/$(lsb_release -rs)/prod.list | tee /etc/apt/sources.list.d/mssql-release.list \
&& apt-get update \
&& ACCEPT_EULA=Y apt-get install -y msodbcsql18 \
&& ACCEPT_EULA=Y apt-get install -y mssql-tools18 \
&& echo 'export PATH="$PATH:/opt/mssql-tools18/bin"' >> ~/.bashrc \
&& /bin/bash -c "source ~/.bashrc"

# Install Python packages
ENV UV_SYSTEM_PYTHON=1
RUN pip install uv

# Mount a cache dir for faster repeated installs. Only mounts during build.
# Skip byte-compiling the Python packages at install time; they are compiled at runtime instead.
# Heaviest requirements first, to preserve cache hits.
COPY gpu-requirements.txt .
RUN --mount=type=cache,target=/root/.cache/uv uv pip install -r gpu-requirements.txt --no-compile

# build-essential, installed above, is required to build psutil from source
COPY requirements.txt .
RUN --mount=type=cache,target=/root/.cache/uv uv pip install -r requirements.txt --no-compile

COPY test-requirements.txt .
RUN --mount=type=cache,target=/root/.cache/uv uv pip install -r test-requirements.txt --no-compile

COPY dev-requirements.txt .
RUN --mount=type=cache,target=/root/.cache/uv uv pip install -r dev-requirements.txt --no-compile

# Run pyright once so it fetches its node runtime at build time, since npm is not accessible at runtime
RUN pyright --help

# BI's SQL server is MsSQL 2016, which supports TLS <= 1.2. This is lower than the default TLS version of Ubuntu 22.04.
# Modify the OpenSSL configuration file, in system and conda, to set the minimum supported TLS version to TLSv1.2
RUN sed -i 's/^\(\[system_default_sect\]\)/\1\nMinProtocol = TLSv1.2/' /etc/ssl/openssl.cnf && \
sed -i 's/^CipherString = DEFAULT:@SECLEVEL=2/CipherString = DEFAULT@SECLEVEL=0/' /etc/ssl/openssl.cnf

RUN sed -i '/\[openssl_init\]/a ssl_conf = ssl_sect' /opt/conda/ssl/openssl.cnf && \
sed -i '$a\\n[ssl_sect]\nsystem_default = system_default_sect\n\n[system_default_sect]\nMinProtocol = TLSv1.2\nCipherString = DEFAULT@SECLEVEL=0' /opt/conda/ssl/openssl.cnf

# Switch back to the notebook user
USER $NB_USER
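For reference, the two sed commands above should leave the conda OpenSSL config (/opt/conda/ssl/openssl.cnf) with roughly the following additions; this is reconstructed from the commands themselves, not copied from a built image:

[openssl_init]
ssl_conf = ssl_sect

[ssl_sect]
system_default = system_default_sect

[system_default_sect]
MinProtocol = TLSv1.2
CipherString = DEFAULT@SECLEVEL=0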
2 changes: 1 addition & 1 deletion dev-requirements.txt
@@ -5,6 +5,6 @@ lefthook==0.1.2
pre-commit==3.4.0
pyright==1.1.368
pytest-sugar==0.9.7
ruff==0.4.4
ruff==0.6.3
pandas-stubs==2.1.1.230928
glances==3.4.0.5
4 changes: 2 additions & 2 deletions lefthook.yml
@@ -16,11 +16,11 @@ pre-commit:
commands:
format:
glob: "*.{py}"
run: ruff format . # Do not fail on pre-commit
run: ruff --version && ruff format . # Do not fail on pre-commit
stage_fixed: true
lint:
glob: "*.{py}"
run: ruff check --fix-only --unsafe-fixes . # Do not fail on pre-commit
run: ruff --version && ruff check --fix-only --unsafe-fixes . # Do not fail on pre-commit
stage_fixed: true

test:
@@ -3,9 +3,9 @@
from __future__ import annotations

import logging
import os
from typing import TYPE_CHECKING

import psutil
from timeseriesflattener import Flattener
from timeseriesflattener import PredictionTimeFrame as FlattenerPredictionTimeFrame
from timeseriesflattener.v1.flattened_dataset import TimeseriesFlattener
@@ -99,9 +99,13 @@ def create_flattened_dataset_tsflattener_v1(
FlattenedDataset: Flattened dataset.
"""

cpu_count = os.cpu_count()
if cpu_count is None:
cpu_count = 4

flattened_dataset = TimeseriesFlattener(
prediction_times_df=prediction_times_df,
n_workers=min(len(feature_specs), psutil.cpu_count(logical=True)),
n_workers=min(len(feature_specs), cpu_count),
cache=None,
drop_pred_times_with_insufficient_look_distance=drop_pred_times_with_insufficient_look_distance,
predictor_col_name_prefix=project_info.prefix.predictor,
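Context for the change above: os.cpu_count() returns None when the CPU count cannot be determined, hence the explicit fallback to 4; the change also drops the psutil dependency. A minimal sketch of the same guard using the or-idiom (feature_specs is a hypothetical placeholder list):

import os

feature_specs = ["spec_a", "spec_b", "spec_c"]  # hypothetical placeholder
# os.cpu_count() may return None; fall back to 4 workers as in the diff.
n_workers = min(len(feature_specs), os.cpu_count() or 4)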
@@ -19,7 +19,7 @@
from psycop.common.test_utils.str_to_df import str_to_df


@pytest.fixture()
@pytest.fixture
def synth_prediction_times() -> pd.DataFrame:
return str_to_df(
"""entity_id,timestamp
@@ -35,7 +35,7 @@ def synth_prediction_times() -> pd.DataFrame:
)


@pytest.fixture()
@pytest.fixture
def synth_predictor_1() -> pd.DataFrame:
return str_to_df(
"""entity_id,timestamp,value
@@ -49,7 +49,7 @@ def synth_predictor_1() -> pd.DataFrame:
)


@pytest.fixture()
@pytest.fixture
def synth_predictor_2() -> pd.DataFrame:
return str_to_df(
"""entity_id,timestamp,value
@@ -63,7 +63,7 @@ def synth_predictor_2() -> pd.DataFrame:
)


@pytest.fixture()
@pytest.fixture
def synth_project_info() -> ProjectInfo:
return ProjectInfo(
project_name="test",
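The fixture edits in this and the following files appear to track the ruff bump to 0.6.x, which flipped the default of the PT001 rule (flake8-pytest-style) to prefer @pytest.fixture without parentheses; the two forms behave identically in pytest. A minimal sketch of the equivalence, with illustrative names:

import pytest

@pytest.fixture  # same behavior as @pytest.fixture()
def value() -> int:
    return 42

def test_value(value: int) -> None:
    assert value == 42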
2 changes: 1 addition & 1 deletion psycop/common/feature_generation/loaders/raw/sql_load.py
@@ -40,7 +40,7 @@ def sql_load(
>>> sql = "SELECT * FROM [fct]." + view
>>> df = sql_load(sql, chunksize = None)
"""
driver = "SQL Server"
driver = "ODBC Driver 18 for SQL Server"
params = urllib.parse.quote(
f"DRIVER={driver};SERVER={server};DATABASE={database};Trusted_Connection=yes"
)
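For illustration, a quoted parameter string like the one built above is commonly handed to SQLAlchemy via its odbc_connect URL parameter; whether sql_load does exactly this is not visible in the diff. A sketch with placeholder server and database names (the real values come from sql_load's arguments):

import urllib.parse

from sqlalchemy import create_engine

driver = "ODBC Driver 18 for SQL Server"
# Placeholder names, not the project's real server/database.
params = urllib.parse.quote(
    f"DRIVER={driver};SERVER=example-server;DATABASE=example_db;Trusted_Connection=yes"
)
engine = create_engine(f"mssql+pyodbc:///?odbc_connect={params}")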
4 changes: 2 additions & 2 deletions psycop/common/feature_generation/utils_for_testing.py
@@ -56,13 +56,13 @@ def check_any_item_in_list_has_str(list_of_str: list[Any], str_: str) -> bool:
return any(str_ in item for item in list_of_str)


@pytest.fixture()
@pytest.fixture
def synth_prediction_times() -> pd.DataFrame:
"""Load the prediction times."""
return load_synth_prediction_times()


@pytest.fixture()
@pytest.fixture
def synth_outcome() -> pd.DataFrame:
"""Load the synth outcome times."""
return load_synth_outcome()
8 changes: 4 additions & 4 deletions psycop/common/model_training/data_loader/tests/conftest.py
@@ -4,7 +4,7 @@
from psycop.common.test_utils.str_to_df import str_to_df


@pytest.fixture()
@pytest.fixture
def base_feature_df() -> pd.DataFrame:
return str_to_df(
"""prediction_time_uuid,feature_name_1,dw_ek_borger,timestamp
@@ -15,7 +15,7 @@ def base_feature_df() -> pd.DataFrame:
)


@pytest.fixture()
@pytest.fixture
def feature_df_same_order_uuids() -> pd.DataFrame:
return str_to_df(
"""prediction_time_uuid,feature_name_2,dw_ek_borger,timestamp
@@ -26,7 +26,7 @@ def feature_df_same_order_uuids() -> pd.DataFrame:
)


@pytest.fixture()
@pytest.fixture
def feature_df_different_order_uuids() -> pd.DataFrame:
return str_to_df(
"""prediction_time_uuid,feature_name_2,dw_ek_borger,timestamp
@@ -37,7 +37,7 @@ def feature_df_different_order_uuids() -> pd.DataFrame:
)


@pytest.fixture()
@pytest.fixture
def feature_df_different_split() -> pd.DataFrame:
return str_to_df(
"""prediction_time_uuid,feature_name_1,dw_ek_borger,timestamp
@@ -11,7 +11,7 @@
## write test for hashing of uuids


@pytest.fixture()
@pytest.fixture
def dataloader() -> DataLoader:
data_cfg = DataSchema(dir=Path(), suffix="", splits_for_training=[""], n_training_samples=None)
return DataLoader(data_cfg=data_cfg)
@@ -16,7 +16,7 @@
from psycop.common.global_utils.paths import PSYCOP_PKG_ROOT


@pytest.fixture()
@pytest.fixture
def predictor_specs() -> list[PredictorSpec]:
return [
PredictorSpec(
@@ -29,7 +29,7 @@ def predictor_specs() -> list[PredictorSpec]:
]


@pytest.fixture()
@pytest.fixture
def static_specs() -> list[StaticSpec]:
return [
StaticSpec(
@@ -38,7 +38,7 @@ def static_specs() -> list[StaticSpec]:
]


@pytest.fixture()
@pytest.fixture
def outcome_specs() -> list[OutcomeSpec]:
return [
OutcomeSpec(
@@ -53,7 +53,7 @@ def outcome_specs() -> list[OutcomeSpec]:
]


@pytest.fixture()
@pytest.fixture
def df() -> pd.DataFrame:
"""Load the synthetic flattened data set."""
return pd.read_csv(
@@ -13,7 +13,7 @@
from psycop.common.test_utils.str_to_df import str_to_pl_df


@pytest.fixture()
@pytest.fixture
def mock_geography_data() -> pl.DataFrame:
return str_to_pl_df(
"""dw_ek_borger,timestamp,region