Skip to content

Commit

Permalink
feat: add l40s gpu type task decorator
Browse files Browse the repository at this point in the history
  • Loading branch information
AidanAbd committed Dec 13, 2024
1 parent d7694f5 commit 118105e
Show file tree
Hide file tree
Showing 4 changed files with 93 additions and 1 deletion.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,12 @@ Types of changes

# Latch SDK Changelog

## 2.54.11 - 2024-12-13

### Added

* Add `l40s_gpu_task` task decorator to request NVIDIA L40S GPU instances

## 2.54.10 - 2024-12-06

### Updated
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ include = ["src/**/*.py", "src/latch_cli/services/init/*"]

[project]
name = "latch"
version = "2.54.10"
version = "2.54.11"
description = "The Latch SDK"
authors = [{ name = "Kenny Workman", email = "[email protected]" }]
maintainers = [
Expand Down
1 change: 1 addition & 0 deletions src/latch/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from latch.resources.tasks import (
custom_memory_optimized_task,
custom_task,
l40s_gpu_task,
large_gpu_task,
large_task,
medium_task,
Expand Down
85 changes: 85 additions & 0 deletions src/latch/resources/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -627,3 +627,88 @@ def nextflow_runtime_task(cpu: int, memory: int, storage_gib: int = 50):
]

return functools.partial(task, task_config=task_config)


@dataclass(frozen=True)
class _L40sGPUInstanceSpec:
    """Capacity of one AWS ``g6e`` (NVIDIA L40S) instance type.

    Frozen because instances are shared, module-level constants.
    """

    # EC2 instance type name; also used as the value of the `ng` toleration
    type: str
    # Number of vCPUs on the instance
    cpu: int
    # Memory on the instance, in GiB
    memory: int
    # Number of L40S GPUs on the instance
    gpus: int


# Ordered so that the first spec satisfying a request is the smallest fit:
# by GPU count first (all single-GPU types precede the 4-GPU types), then by
# vCPUs / memory. Note that g6e.12xlarge carries 4 GPUs while the larger
# g6e.16xlarge carries only 1, so ordering by vCPUs alone would be wrong.
_L40S_INSTANCE_SPECS = (
    _L40sGPUInstanceSpec(type="g6e.xlarge", cpu=4, memory=32, gpus=1),
    _L40sGPUInstanceSpec(type="g6e.2xlarge", cpu=8, memory=64, gpus=1),
    _L40sGPUInstanceSpec(type="g6e.4xlarge", cpu=16, memory=128, gpus=1),
    _L40sGPUInstanceSpec(type="g6e.8xlarge", cpu=32, memory=256, gpus=1),
    _L40sGPUInstanceSpec(type="g6e.16xlarge", cpu=64, memory=512, gpus=1),
    _L40sGPUInstanceSpec(type="g6e.12xlarge", cpu=48, memory=384, gpus=4),
    _L40sGPUInstanceSpec(type="g6e.24xlarge", cpu=96, memory=768, gpus=4),
    # todo(aidan): add 48xlarge instance (need to debug why does not join cluster)
    # _L40sGPUInstanceSpec(type="g6e.48xlarge", cpu=192, memory=1536, gpus=8),
)


def _select_l40s_instance(cpu: int, memory: int, gpus: int) -> _L40sGPUInstanceSpec:
    """Return the first (smallest) spec that covers the requested resources.

    Raises:
        ValueError: if no known instance type satisfies all three requirements.
    """
    match = next(
        (
            spec
            for spec in _L40S_INSTANCE_SPECS
            if cpu <= spec.cpu and memory <= spec.memory and gpus <= spec.gpus
        ),
        None,
    )

    if match is None:
        raise ValueError(
            f"No instance type available for requested resources: "
            f"{cpu} vCPUs, {memory} GiB RAM, {gpus} GPUs. "
            f"Maximum available: 96 vCPUs, 768 GiB RAM, 4 GPUs"
        )

    return match


def l40s_gpu_task(cpu: int, memory: int, gpus: int, **kwargs):
    """Create a task decorator that runs the task on an L40S GPU instance.

    The smallest instance type that satisfies all three requirements is
    chosen, and the entire instance is assigned to the task: the container's
    limits equal the instance's full capacity, regardless of how much less
    was requested.

    Args:
        cpu: Number of vCPUs requested (at most 96)
        memory: Memory in GiB requested (at most 768)
        gpus: Number of GPUs requested (at most 4; instances have 1 or 4)
        **kwargs: Forwarded unchanged to the underlying `task` decorator

    Returns:
        A partial application of `task` with the pod configuration bound as
        `task_config`.

    Raises:
        ValueError: if no available instance type satisfies the request.
    """
    selected_instance = _select_l40s_instance(cpu, memory, gpus)

    primary_container = V1Container(name="primary")
    primary_container.resources = V1ResourceRequirements(
        # Requests sit slightly below the instance capacity (2 vCPUs and
        # 4 GiB less) -- presumably headroom for node-level daemons so the
        # pod can still be scheduled onto the node.
        requests={
            "cpu": str(selected_instance.cpu - 2),
            "memory": f"{selected_instance.memory - 4}Gi",
            "nvidia.com/gpu": str(selected_instance.gpus),
            "ephemeral-storage": "4500Gi",
        },
        limits={
            "cpu": str(selected_instance.cpu),
            "memory": f"{selected_instance.memory}Gi",
            "nvidia.com/gpu": str(selected_instance.gpus),
            "ephemeral-storage": "5000Gi",
        },
    )

    pod_config = Pod(
        pod_spec=V1PodSpec(
            containers=[primary_container],
            # Tolerate the taint keyed `ng` whose value is the instance type
            # (presumably the node group dedicated to that instance type).
            tolerations=[
                V1Toleration(
                    effect="NoSchedule",
                    key="ng",
                    value=selected_instance.type,
                )
            ],
        ),
        primary_container_name="primary",
        annotations={
            "io.kubernetes.cri-o.userns-mode": (
                "private:uidmapping=0:1048576:65536;gidmapping=0:1048576:65536"
            ),
            "cluster-autoscaler.kubernetes.io/safe-to-evict": "false",
        },
    )

    return functools.partial(task, task_config=pod_config, **kwargs)

0 comments on commit 118105e

Please sign in to comment.