Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FedML Adaptation for K8S Pod Solution #2247

Open
wants to merge 30 commits into
base: charlie/dev/v0.7.0
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
f08a1ab
Merge pull request #2244 from FedML-AI/charlie/dev/v0.7.0
charlieyl Dec 25, 2024
1620afb
add fedml k8s agent
charlieyl Dec 28, 2024
d1425ee
Create a symbolic link to the model storage directory
charlieyl Dec 28, 2024
e753bc4
update get gpu info from cmd(crictl inspect)
charlieyl Dec 28, 2024
2d9c237
Enhance FedML K8s agent configuration and logging capabilities
charlieyl Jan 1, 2025
dde6fe3
Enhance FedML K8s agent with SSH support and update Dockerfile and de…
charlieyl Jan 2, 2025
73a8bc8
[update]Refactor FedML K8s agent deployment and logging:
charlieyl Jan 3, 2025
d59bd31
[update]Resolve Circular Dependency Issues in Utility Classes
charlieyl Jan 3, 2025
b205def
[update]Resolve Circular Dependency Issues in Utility Classes
charlieyl Jan 3, 2025
968f98b
update logging
charlieyl Jan 4, 2025
82085af
remove logs
charlieyl Jan 4, 2025
51ac57d
[bugfix]ensure the current model directory exists before creating a s…
charlieyl Jan 4, 2025
b604981
[refactor] Replaced symbolic link creation with a file copy mechanism…
charlieyl Jan 4, 2025
168e165
[update] Enhance FedML K8s agent with crictl integration and configur…
charlieyl Jan 4, 2025
e51cd9b
[update] Enhanced logging in job monitoring and model deployment scri…
charlieyl Jan 4, 2025
9c2c719
[update] Updated file paths for model serving readiness and GPU info …
charlieyl Jan 4, 2025
5225a15
[update] handle log since datetime formatting in crictl_utils.py
charlieyl Jan 4, 2025
9dd0543
[update] add additional log volume mounts and fix log bug
charlieyl Jan 4, 2025
9d91e46
[update] modify setup.py to use flexible wandb version. The old versi…
charlieyl Jan 4, 2025
39834a3
[update] Pin wandb version to 0.13.2 in setup.py to ensure compatibility
charlieyl Jan 4, 2025
5a3b2fc
[update] Enhance FedML K8s agent by adding env variables for model in…
charlieyl Jan 4, 2025
1d1619d
[update] Enhance container metrics reporting with memory in GB. Added…
charlieyl Jan 5, 2025
0f5d9a6
[update] launch in k8s scheduler, we don't use containerize
charlieyl Jan 6, 2025
9dd2112
[update]Added constants for task type and refined logging for K8s dep…
charlieyl Jan 6, 2025
3e9a087
[update] Enhance FedML K8s agent with SSH setup and environment varia…
charlieyl Jan 13, 2025
47a69ac
[update] Adjust deployment templates to include separate SSH public …
charlieyl Jan 15, 2025
264f85d
Refactor SSH key setup in entrypoint.sh to prevent duplicate entries …
charlieyl Jan 15, 2025
93f6725
Fix indentation for memory resource limit in FedML K8s deployment tem…
charlieyl Jan 17, 2025
37ade35
Prevent non-deploy pods from registering deploy-related topics.
charlieyl Jan 17, 2025
1710185
update template: add shm memory
charlieyl Feb 5, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 93 additions & 0 deletions devops/k8s/fedml-k8s-agent/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
# docker build -t fedml/fedml-k8s-agent:202412271540 -f /root/fedml-k8s-agent/Dockerfile .
FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu22.04

# Build/runtime plumbing: keep apt non-interactive and put the Miniconda
# install location first on PATH so `conda` resolves in later RUN steps.
ENV DEBIAN_FRONTEND=noninteractive
ENV PATH=/root/miniconda3/bin:$PATH

# Default values for the settings consumed by entrypoint.sh.
# All of them can be overridden in the K8s deployment; an empty value means
# "not provided" (entrypoint.sh tests them with -z / -n).
ENV FEDML_API_KEY=""
ENV FEDML_ENV="release"
ENV FEDML_PROVIDER="false"
ENV FEDML_DEVICE_ID=""
ENV INFERENCE_GATEWAY_PORT=""
ENV INFERENCE_PROXY_PORT=""
ENV FEDML_CONNECTION_TYPE=""
# Legacy name, kept so existing deployments that set it keep building/running.
# NOTE(review): entrypoint.sh does not read SSH_PUBLIC_KEY; it only installs
# the two keys declared below.
ENV SSH_PUBLIC_KEY=""
# Public keys that entrypoint.sh appends to /root/.ssh/authorized_keys.
ENV SSH_PUBLIC_KEY_SYSTEM=""
ENV SSH_PUBLIC_KEY_USER=""

# Build argument used to bypass the layer cache (echoed by the FedML install
# step). NOTE(review): every RUN after an ARG sees it as an environment
# variable, so changing CACHEBUST presumably invalidates all following RUN
# layers, not only the FedML install -- confirm this is intended.
ARG CACHEBUST=1

# System packages needed by the agent image (kept in alphabetical order).
# The apt package index is removed in the same layer to keep the image small.
RUN apt-get update \
    && apt-get install -y \
        ca-certificates \
        curl \
        git \
        gnupg \
        iproute2 \
        lsb-release \
        net-tools \
        openssh-server \
        sudo \
        vim \
        wget \
    && rm -rf /var/lib/apt/lists/*

# Configure SSH base setup: create the sshd runtime directory and root's
# ~/.ssh, then write a key-only sshd_config (password login disabled).
# printf '%s\n' emits one config line per argument, so the result does not
# depend on whether the RUN shell's `echo` expands "\n" escapes.
RUN mkdir -p /var/run/sshd /root/.ssh \
    && chmod 700 /root/.ssh \
    && printf '%s\n' \
        'Port 22' \
        'PermitRootLogin prohibit-password' \
        'PubkeyAuthentication yes' \
        'AuthorizedKeysFile .ssh/authorized_keys' \
        'PasswordAuthentication no' \
        'ChallengeResponseAuthentication no' \
        'UsePAM yes' \
        'X11Forwarding yes' \
        'PrintMotd no' \
        'AcceptEnv LANG LC_*' \
        'Subsystem sftp /usr/lib/openssh/sftp-server' \
        'LogLevel INFO' \
        > /etc/ssh/sshd_config \
    && chmod 600 /etc/ssh/sshd_config

# Redis from the packages.redis.io apt repository: store the repository
# signing key as a dearmored keyring, register the repository for this
# Ubuntu release, then install the package and drop the apt index again.
RUN keyring=/usr/share/keyrings/redis-archive-keyring.gpg \
    && curl -fsSL https://packages.redis.io/gpg | gpg --dearmor -o "$keyring" \
    && echo "deb [signed-by=$keyring] https://packages.redis.io/deb $(lsb_release -cs) main" \
        | tee /etc/apt/sources.list.d/redis.list \
    && apt-get update \
    && apt-get install -y redis \
    && rm -rf /var/lib/apt/lists/*

# Install crictl (CRI command-line client) from the cri-tools GitHub release.
# The release tag is a build argument (default unchanged: v1.31.1) so it can
# be bumped with --build-arg instead of editing three places in the URL.
ARG CRICTL_VERSION=v1.31.1
# Download to /tmp and extract only the `crictl` binary straight into
# /usr/local/bin, so nothing is unpacked into the image root directory.
RUN wget -q "https://github.com/kubernetes-sigs/cri-tools/releases/download/${CRICTL_VERSION}/crictl-${CRICTL_VERSION}-linux-amd64.tar.gz" \
        -O /tmp/crictl.tar.gz \
    && tar -zxf /tmp/crictl.tar.gz -C /usr/local/bin crictl \
    && rm /tmp/crictl.tar.gz

# Miniconda: fetch the installer, run it in batch mode (-b) into
# /root/miniconda3 (-p), then delete the installer script.
RUN installer=/tmp/miniconda.sh \
    && wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O "$installer" \
    && bash "$installer" -b -p /root/miniconda3 \
    && rm "$installer"

# Setup crictl configuration: point both the runtime and image endpoints at
# the containerd socket under /run/containerd.
# printf '%s\n' writes one YAML line per argument, so the file layout does
# not depend on the RUN shell's `echo` expanding "\n" escapes.
RUN printf '%s\n' \
        'runtime-endpoint: unix:///run/containerd/containerd.sock' \
        'image-endpoint: unix:///run/containerd/containerd.sock' \
        'timeout: 30' \
        'debug: false' \
        > /etc/crictl.yaml

# Copy the entrypoint script from the build context to the image root and
# mark it executable; it is the image's ENTRYPOINT (declared at the end of
# this Dockerfile).
COPY entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh

# Create the `fedml` conda environment (Python 3.10) and install FedML into it.
# - `conda init bash` is kept so bash sessions in the container get conda set up.
# - pip is invoked through `conda run -n fedml` instead of sourcing ~/.bashrc
#   and calling `conda activate`: RUN executes under /bin/sh, where the bash
#   startup file and conda's shell function are not reliably available.
# - Echoing CACHEBUST makes this step re-run when the build argument changes.
RUN echo "Cache bust: $CACHEBUST" \
    && conda init bash \
    && conda create -y -n fedml python=3.10 \
    && conda run -n fedml --no-capture-output pip install --upgrade gevent multiprocess \
    && conda run -n fedml --no-capture-output pip install "git+https://github.com/FedML-AI/FedML.git@k8s/dev/v0.7.0#egg=fedml&subdirectory=python"

ENTRYPOINT ["/entrypoint.sh"]
67 changes: 67 additions & 0 deletions devops/k8s/fedml-k8s-agent/entrypoint.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
#!/bin/bash

# Load conda's bash integration into this shell: evaluate the hook emitted by
# the Miniconda binary, (re)run `conda init bash`, and source root's bashrc.
conda_bin="/root/miniconda3/bin/conda"
conda_hook="$("$conda_bin" shell.bash hook)"
eval "$conda_hook"
conda init bash
. ~/.bashrc

# Switch to the environment the image installs FedML into.
conda activate fedml

#######################################
# Run `fedml login` with options derived from the container environment.
# Globals (read):
#   FEDML_PROVIDER          - when exactly "true", `-p` is placed before the key
#   FEDML_API_KEY           - API key passed to `fedml login`
#   FEDML_ENV               - value passed with `-v`
#   INFERENCE_GATEWAY_PORT  - passed with `-mgp` when non-empty
#   INFERENCE_PROXY_PORT    - passed with `-wpp` when non-empty
#   FEDML_CONNECTION_TYPE   - passed with `-wct` when non-empty
# Arguments: none
# Returns:   exit status of the `fedml login` command
#######################################
fedml_login() {
  local -a login_args=(login)

  if [ -z "${FEDML_API_KEY:-}" ]; then
    # Best effort: still attempt the login exactly as before, but make the
    # likely cause of a failure visible in the container log.
    echo "WARNING: FEDML_API_KEY is empty; 'fedml login' will probably fail" >&2
  fi

  if [ "${FEDML_PROVIDER:-}" = "true" ]; then
    login_args+=(-p)
  fi
  login_args+=("${FEDML_API_KEY:-}" -v "${FEDML_ENV:-}")

  # Each optional flag is added only when its variable has a value, so a
  # partially configured deployment never passes an empty string as a flag
  # argument (the all-empty and all-set cases are unchanged).
  if [ -n "${INFERENCE_GATEWAY_PORT:-}" ]; then
    login_args+=(-mgp "$INFERENCE_GATEWAY_PORT")
  fi
  if [ -n "${INFERENCE_PROXY_PORT:-}" ]; then
    login_args+=(-wpp "$INFERENCE_PROXY_PORT")
  fi
  if [ -n "${FEDML_CONNECTION_TYPE:-}" ]; then
    login_args+=(-wct "$FEDML_CONNECTION_TYPE")
  fi

  fedml "${login_args[@]}"
}

#######################################
# Print all environment variables, sorted, between banner lines.
# Values of variables whose NAME contains API_KEY, SECRET, TOKEN or PASSWORD
# are replaced by "<redacted>" so credentials such as FEDML_API_KEY do not
# end up in the container log.
# Limitation: redaction works per output line, so only the first line of a
# multi-line value is masked.
# Outputs: writes the listing to stdout
#######################################
print_environment() {
  echo "=== Environment Variables ==="
  printenv | sort \
    | sed -E 's/^([^=]*(API_KEY|SECRET|TOKEN|PASSWORD)[^=]*)=.*/\1=<redacted>/'
  echo "==========================="
}

print_environment

# When FEDML_DEVICE_ID is provided, write it (without a trailing newline) to
# devices.id under the FedML client's runner_infos directory, creating the
# directory first if needed.
if [[ -n "$FEDML_DEVICE_ID" ]]; then
  runner_info_dir="/root/.fedml/fedml-client/fedml/data/runner_infos"
  mkdir -p "$runner_info_dir"
  echo -n "$FEDML_DEVICE_ID" > "$runner_info_dir/devices.id"
fi

# Setup SSH keys if provided
sshDir="/root/.ssh"
mkdir -p "$sshDir"

#######################################
# Append a public key to an authorized_keys file unless it is already there.
# Globals:   sshDir (read) - directory holding the default authorized_keys
# Arguments: $1 - public key text; an empty value is a no-op
#            $2 - target file (optional, default: $sshDir/authorized_keys)
# Returns:   0 when the key is present afterwards or nothing had to be done
#######################################
add_authorized_key() {
  local key=${1:-}
  local keys_file=${2:-$sshDir/authorized_keys}

  [ -n "$key" ] || return 0

  # Fixed-string match; a missing file simply means "not present yet".
  if grep -qF -- "$key" "$keys_file" 2>/dev/null; then
    return 0
  fi

  # If the existing file does not end in a newline, terminate it first so the
  # new key is not glued onto the previous entry (which would break both).
  if [ -s "$keys_file" ] && [ -n "$(tail -c 1 -- "$keys_file")" ]; then
    printf '\n' >> "$keys_file"
  fi

  printf '%s\n' "$key" >> "$keys_file"
  chmod 600 "$keys_file"
}

add_authorized_key "${SSH_PUBLIC_KEY_SYSTEM:-}"
add_authorized_key "${SSH_PUBLIC_KEY_USER:-}"

# Start SSH server: try systemd, then the SysV service wrapper, and finally
# launch sshd directly (the usual case inside a container).
systemctl start ssh || service ssh start || /usr/sbin/sshd

# Start Redis server in the background
redis-server --daemonize yes

# Login to FedML. A failed login is reported but does not stop the container,
# so the pod stays reachable (e.g. over SSH) for troubleshooting.
if ! fedml_login; then
  echo "WARNING: fedml login failed; container stays up without a FedML session" >&2
fi

# Keep container running.
# This script is the image ENTRYPOINT, so it normally runs as PID 1, which
# only receives SIGTERM/SIGINT when a handler is installed. The trap lets the
# pod stop promptly on termination instead of waiting for the kill timeout;
# `tail` runs in the background because bash only runs traps once the
# foreground command returns, whereas `wait` is interrupted by the signal.
trap 'exit 0' TERM INT
tail -f /dev/null &
wait $!
Loading