feat: add kubeflow dockerfile (#996)
MartinBernstorff authored Sep 2, 2024
1 parent 45f9ee9 commit 4c4f1f0
Showing 26 changed files with 193 additions and 55 deletions.
18 changes: 16 additions & 2 deletions .dockerignore
@@ -185,12 +185,26 @@ psycop/tests/test_eval_results/*
psycop/**/*.md

# Wandb
/wandb
**/wandb
multirun

# Ignore the outputs for publishing folder
**/outputs_for_publishing
pytest.xml
joblib
/outputs
.git

# model checkpoints
/data

# logging
lightning_logs/*
logs/*
mlruns/*
# Allow mapping file to be uploaded
!**/diagnosis_code_mapping.json
!.devcontainer/devcontainer.json

# Ignore testmon database
.testmon*
*.db
50 changes: 50 additions & 0 deletions .github/workflows/kubeflow-publish.yml
@@ -0,0 +1,50 @@
name: kubeflow-publish
on:
push:
branches:
- "main"
pull_request:
branches:
- "main"
workflow_dispatch:

jobs:
release:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

- name: Set up QEMU
uses: docker/setup-qemu-action@v3

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3

- name: Login to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.repository_owner }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Docker meta
id: meta
uses: docker/metadata-action@v5
with:
# list of Docker images to use as base name for tags
images: |
ghcr.io/aarhus-psychiatry-research/psycop-common-kubeflow
# generate Docker tags based on the following events/attributes
tags: |
type=semver,pattern={{version}}
type=semver,pattern={{major}}.{{minor}}
type=semver,pattern={{major}}
- name: Build and push Docker image
uses: docker/build-push-action@v6
with:
context: .
push: ${{ github.ref == 'refs/heads/main' }}
platforms: linux/amd64
file: Kubeflow
tags: ghcr.io/aarhus-psychiatry-research/psycop-common-kubeflow:latest
5 changes: 2 additions & 3 deletions .github/workflows/lint.yml
@@ -21,12 +21,11 @@ jobs:
- uses: actions/setup-python@v4
with:
python-version: "3.10"
cache: "pip"

- name: Install pre-commit
run: pip install pre-commit
run: pip install pre-commit invoke

- name: Lint
id: pre_commit
run: |
pre-commit run --color always --all-files
inv lint
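The Lint step now runs through invoke (inv lint) instead of calling pre-commit directly. The repository's tasks.py is not part of this diff; a hypothetical sketch of an equivalent task, simply wrapping the previous CI command:

from invoke import task

@task
def lint(c):
    # Mirrors the old CI step; pre-commit runs all hooks across the repo.
    c.run("pre-commit run --color always --all-files")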
1 change: 0 additions & 1 deletion .github/workflows/type_check.yml
@@ -19,7 +19,6 @@ jobs:
id: setup_python
with:
python-version: "3.10"
cache: "pip"

- name: Install dependencies
shell: bash
3 changes: 2 additions & 1 deletion .pre-commit-config.yaml
@@ -2,12 +2,13 @@ default_stages: [commit]

repos:
- repo: https://github.com/charliermarsh/ruff-pre-commit
rev: v0.4.4
rev: v0.6.3
hooks:
- id: ruff
args:
[
"--fix",
"--unsafe-fixes",
]
- id: ruff-format

73 changes: 73 additions & 0 deletions Kubeflow
@@ -0,0 +1,73 @@
FROM kubeflownotebookswg/codeserver-python:v1.9.0

# Switch over to root for building the custom image
USER root

# Install system dependencies including odbc and FreeTDS driver
RUN apt-get update && apt-get install -y \
unixodbc-dev \
unixodbc \
tdsodbc \
curl \
gcc \
g++ \
build-essential \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

# Add FreeTDS driver to odbcinst
RUN cat <<EOF > /etc/odbcinst.ini
[FreeTDS]
Description = FreeTDS Driver
Driver = /usr/lib/x86_64-linux-gnu/odbc/libtdsodbc.so
Setup = /usr/lib/x86_64-linux-gnu/odbc/libtdsS.so
EOF

# Install msodbcsql18 and mssql-tools18
RUN \
if ! [[ "18.04 20.04 22.04 23.04 24.04" == *"$(lsb_release -rs)"* ]]; then \
echo "Ubuntu $(lsb_release -rs) is not currently supported."; \
exit; \
fi

RUN curl https://packages.microsoft.com/keys/microsoft.asc | tee /etc/apt/trusted.gpg.d/microsoft.asc \
&& curl https://packages.microsoft.com/config/ubuntu/$(lsb_release -rs)/prod.list | tee /etc/apt/sources.list.d/mssql-release.list \
&& apt-get update \
&& ACCEPT_EULA=Y apt-get install -y msodbcsql18 \
&& ACCEPT_EULA=Y apt-get install -y mssql-tools18 \
&& echo 'export PATH="$PATH:/opt/mssql-tools18/bin"' >> ~/.bashrc \
&& /bin/bash -c "source ~/.bashrc"

# Install Python packages
ENV UV_SYSTEM_PYTHON=1
RUN pip install uv

# Mount a cache dir for faster repeated installs. Only mounts during build.
# Skip byte-compiling the Python packages at install time; they are compiled at runtime instead.
# Heaviest requirements first, to preserve cache hits.
COPY gpu-requirements.txt .
RUN --mount=type=cache,target=/root/.cache/uv uv pip install -r gpu-requirements.txt --no-compile

# build-essential, installed above, is required to build psutil from source
COPY requirements.txt .
RUN --mount=type=cache,target=/root/.cache/uv uv pip install -r requirements.txt --no-compile

COPY test-requirements.txt .
RUN --mount=type=cache,target=/root/.cache/uv uv pip install -r test-requirements.txt --no-compile

COPY dev-requirements.txt .
RUN --mount=type=cache,target=/root/.cache/uv uv pip install -r dev-requirements.txt --no-compile

# Run pyright once so it fetches its node runtime at build time, since npm is not accessible at runtime
RUN pyright --help

# BI's SQL server is MsSQL 2016, which supports TLS <= 1.2. This is lower than the default TLS version of Ubuntu 22.04.
# Modify the OpenSSL configuration file, in system and conda, to set the minimum supported TLS version to TLSv1.2
RUN sed -i 's/^\(\[system_default_sect\]\)/\1\nMinProtocol = TLSv1.2/' /etc/ssl/openssl.cnf && \
sed -i 's/^CipherString = DEFAULT:@SECLEVEL=2/CipherString = DEFAULT@SECLEVEL=0/' /etc/ssl/openssl.cnf

RUN sed -i '/\[openssl_init\]/a ssl_conf = ssl_sect' /opt/conda/ssl/openssl.cnf && \
sed -i '$a\\n[ssl_sect]\nsystem_default = system_default_sect\n\n[system_default_sect]\nMinProtocol = TLSv1.2\nCipherString = DEFAULT@SECLEVEL=0' /opt/conda/ssl/openssl.cnf

# Switch back to the notebook user
USER $NB_USER
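For reference, the two sed commands above should leave the conda OpenSSL config (/opt/conda/ssl/openssl.cnf) with roughly the following additions; this is reconstructed from the commands themselves, not copied from a built image:

[openssl_init]
ssl_conf = ssl_sect

[ssl_sect]
system_default = system_default_sect

[system_default_sect]
MinProtocol = TLSv1.2
CipherString = DEFAULT@SECLEVEL=0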
2 changes: 1 addition & 1 deletion dev-requirements.txt
@@ -5,6 +5,6 @@ lefthook==0.1.2
pre-commit==3.4.0
pyright==1.1.368
pytest-sugar==0.9.7
ruff==0.4.4
ruff==0.6.3
pandas-stubs==2.1.1.230928
glances==3.4.0.5
4 changes: 2 additions & 2 deletions lefthook.yml
@@ -16,11 +16,11 @@ pre-commit:
commands:
format:
glob: "*.{py}"
run: ruff format . # Do not fail on pre-commit
run: ruff --version && ruff format . # Do not fail on pre-commit
stage_fixed: true
lint:
glob: "*.{py}"
run: ruff check --fix-only --unsafe-fixes . # Do not fail on pre-commit
run: ruff --version && ruff check --fix-only --unsafe-fixes . # Do not fail on pre-commit
stage_fixed: true

test:
@@ -3,9 +3,9 @@
from __future__ import annotations

import logging
import os
from typing import TYPE_CHECKING

import psutil
from timeseriesflattener import Flattener
from timeseriesflattener import PredictionTimeFrame as FlattenerPredictionTimeFrame
from timeseriesflattener.v1.flattened_dataset import TimeseriesFlattener
@@ -99,9 +99,13 @@ def create_flattened_dataset_tsflattener_v1(
FlattenedDataset: Flattened dataset.
"""

cpu_count = os.cpu_count()
if cpu_count is None:
cpu_count = 4

flattened_dataset = TimeseriesFlattener(
prediction_times_df=prediction_times_df,
n_workers=min(len(feature_specs), psutil.cpu_count(logical=True)),
n_workers=min(len(feature_specs), cpu_count),
cache=None,
drop_pred_times_with_insufficient_look_distance=drop_pred_times_with_insufficient_look_distance,
predictor_col_name_prefix=project_info.prefix.predictor,
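Context for the change above: os.cpu_count() returns None when the CPU count cannot be determined, hence the explicit fallback to 4; the change also drops the psutil dependency. A minimal sketch of the same guard using the or-idiom (feature_specs is a hypothetical placeholder list):

import os

feature_specs = ["spec_a", "spec_b", "spec_c"]  # hypothetical placeholder
# os.cpu_count() may return None; fall back to 4 workers as in the diff.
n_workers = min(len(feature_specs), os.cpu_count() or 4)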
@@ -19,7 +19,7 @@
from psycop.common.test_utils.str_to_df import str_to_df


@pytest.fixture()
@pytest.fixture
def synth_prediction_times() -> pd.DataFrame:
return str_to_df(
"""entity_id,timestamp
@@ -35,7 +35,7 @@ def synth_prediction_times() -> pd.DataFrame:
)


@pytest.fixture()
@pytest.fixture
def synth_predictor_1() -> pd.DataFrame:
return str_to_df(
"""entity_id,timestamp,value
@@ -49,7 +49,7 @@ def synth_predictor_1() -> pd.DataFrame:
)


@pytest.fixture()
@pytest.fixture
def synth_predictor_2() -> pd.DataFrame:
return str_to_df(
"""entity_id,timestamp,value
@@ -63,7 +63,7 @@ def synth_predictor_2() -> pd.DataFrame:
)


@pytest.fixture()
@pytest.fixture
def synth_project_info() -> ProjectInfo:
return ProjectInfo(
project_name="test",
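The fixture edits in this and the following files appear to track the ruff bump to 0.6.x, which flipped the default of the PT001 rule (flake8-pytest-style) to prefer @pytest.fixture without parentheses; the two forms behave identically in pytest. A minimal sketch of the equivalence, with illustrative names:

import pytest

@pytest.fixture  # same behavior as @pytest.fixture()
def value() -> int:
    return 42

def test_value(value: int) -> None:
    assert value == 42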
2 changes: 1 addition & 1 deletion psycop/common/feature_generation/loaders/raw/sql_load.py
@@ -40,7 +40,7 @@ def sql_load(
>>> sql = "SELECT * FROM [fct]." + view
>>> df = sql_load(sql, chunksize = None)
"""
driver = "SQL Server"
driver = "ODBC Driver 18 for SQL Server"
params = urllib.parse.quote(
f"DRIVER={driver};SERVER={server};DATABASE={database};Trusted_Connection=yes"
)
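For illustration, a quoted parameter string like the one built above is commonly handed to SQLAlchemy via its odbc_connect URL parameter; whether sql_load does exactly this is not visible in the diff. A sketch with placeholder server and database names (the real values come from sql_load's arguments):

import urllib.parse

from sqlalchemy import create_engine

driver = "ODBC Driver 18 for SQL Server"
# Placeholder names, not the project's real server/database.
params = urllib.parse.quote(
    f"DRIVER={driver};SERVER=example-server;DATABASE=example_db;Trusted_Connection=yes"
)
engine = create_engine(f"mssql+pyodbc:///?odbc_connect={params}")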
4 changes: 2 additions & 2 deletions psycop/common/feature_generation/utils_for_testing.py
@@ -56,13 +56,13 @@ def check_any_item_in_list_has_str(list_of_str: list[Any], str_: str) -> bool:
return any(str_ in item for item in list_of_str)


@pytest.fixture()
@pytest.fixture
def synth_prediction_times() -> pd.DataFrame:
"""Load the prediction times."""
return load_synth_prediction_times()


@pytest.fixture()
@pytest.fixture
def synth_outcome() -> pd.DataFrame:
"""Load the synth outcome times."""
return load_synth_outcome()
8 changes: 4 additions & 4 deletions psycop/common/model_training/data_loader/tests/conftest.py
@@ -4,7 +4,7 @@
from psycop.common.test_utils.str_to_df import str_to_df


@pytest.fixture()
@pytest.fixture
def base_feature_df() -> pd.DataFrame:
return str_to_df(
"""prediction_time_uuid,feature_name_1,dw_ek_borger,timestamp
@@ -15,7 +15,7 @@ def base_feature_df() -> pd.DataFrame:
)


@pytest.fixture()
@pytest.fixture
def feature_df_same_order_uuids() -> pd.DataFrame:
return str_to_df(
"""prediction_time_uuid,feature_name_2,dw_ek_borger,timestamp
@@ -26,7 +26,7 @@ def feature_df_same_order_uuids() -> pd.DataFrame:
)


@pytest.fixture()
@pytest.fixture
def feature_df_different_order_uuids() -> pd.DataFrame:
return str_to_df(
"""prediction_time_uuid,feature_name_2,dw_ek_borger,timestamp
@@ -37,7 +37,7 @@ def feature_df_different_order_uuids() -> pd.DataFrame:
)


@pytest.fixture()
@pytest.fixture
def feature_df_different_split() -> pd.DataFrame:
return str_to_df(
"""prediction_time_uuid,feature_name_1,dw_ek_borger,timestamp
@@ -11,7 +11,7 @@
## write test for hashing of uuids


@pytest.fixture()
@pytest.fixture
def dataloader() -> DataLoader:
data_cfg = DataSchema(dir=Path(), suffix="", splits_for_training=[""], n_training_samples=None)
return DataLoader(data_cfg=data_cfg)
@@ -16,7 +16,7 @@
from psycop.common.global_utils.paths import PSYCOP_PKG_ROOT


@pytest.fixture()
@pytest.fixture
def predictor_specs() -> list[PredictorSpec]:
return [
PredictorSpec(
@@ -29,7 +29,7 @@ def predictor_specs() -> list[PredictorSpec]:
]


@pytest.fixture()
@pytest.fixture
def static_specs() -> list[StaticSpec]:
return [
StaticSpec(
@@ -38,7 +38,7 @@ def static_specs() -> list[StaticSpec]:
]


@pytest.fixture()
@pytest.fixture
def outcome_specs() -> list[OutcomeSpec]:
return [
OutcomeSpec(
@@ -53,7 +53,7 @@ def outcome_specs() -> list[OutcomeSpec]:
]


@pytest.fixture()
@pytest.fixture
def df() -> pd.DataFrame:
"""Load the synthetic flattened data set."""
return pd.read_csv(
@@ -13,7 +13,7 @@
from psycop.common.test_utils.str_to_df import str_to_pl_df


@pytest.fixture()
@pytest.fixture
def mock_geography_data() -> pl.DataFrame:
return str_to_pl_df(
"""dw_ek_borger,timestamp,region