-
Notifications
You must be signed in to change notification settings - Fork 64
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add vLLM+HPA support to ChatQnA Helm chart (#610)
* Add monitoring support for the vLLM component Signed-off-by: Eero Tamminen <[email protected]> * Initial vLLM support for ChatQnA For now vLLM replaces just TGI, but as it also supports embedding, TEI-embed/-rerank may also be replaceable later on. Signed-off-by: Eero Tamminen <[email protected]> * Fix HPA comments in tgi/tei/teirerank values files Signed-off-by: Eero Tamminen <[email protected]> * Add HPA scaling support for ChatQnA / vLLM Signed-off-by: Eero Tamminen <[email protected]> * Adapt to latest vllm changes - Remove --enforce-eager on hpu to improve performance - Refactor to the upstream docker entrypoint changes Fixes issue #631. Signed-off-by: Lianhao Lu <[email protected]> * Clean up ChatQnA vLLM Gaudi parameters Signed-off-by: Eero Tamminen <[email protected]> --------- Signed-off-by: Eero Tamminen <[email protected]> Signed-off-by: Lianhao Lu <[email protected]> Co-authored-by: Lianhao Lu <[email protected]>
- Loading branch information
Showing
25 changed files
with
253 additions
and
22 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
gaudi-tgi-values.yaml |
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
gaudi-vllm-values.yaml |
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# Accelerate inferencing in heaviest components to improve performance
# by overriding their subchart values

# TGI is switched off because vLLM (below) is enabled as the LLM backend.
tgi:
  enabled: false

# vLLM on Gaudi: one accelerator device per pod.
vllm:
  enabled: true
  accelDevice: "gaudi"
  image:
    repository: opea/vllm-gaudi
    tag: "latest"
  resources:
    limits:
      habana.ai/gaudi: 1
  # Startup may be retried for ~10 minutes (120 failures x 5 s period,
  # after a 5 s initial delay) before the container is considered failed.
  startupProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 1
    failureThreshold: 120
  readinessProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 1
  livenessProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 1

  # NOTE(review): presumably passed to the container as environment
  # variables by the vllm subchart -- confirm against its templates.
  # Values are quoted so that they stay strings.
  PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
  OMPI_MCA_btl_vader_single_copy_mechanism: "none"

  # Extra vLLM command line arguments; each flag is followed by its value.
  extraCmdArgs:
    - "--tensor-parallel-size"
    - "1"
    - "--block-size"
    - "128"
    - "--max-num-seqs"
    - "256"
    - "--max-seq-len-to-capture"
    - "2048"

# Reranking: second largest bottleneck when reranking is in use
# (i.e. query context docs have been uploaded with data-prep)
#
# TODO: could vLLM be used also for reranking / embedding?
teirerank:
  accelDevice: "gaudi"
  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
  MAX_WARMUP_SEQUENCE_LENGTH: "512"
  image:
    repository: ghcr.io/huggingface/tei-gaudi
    # quoted like the vLLM tag above so the version always stays a string
    tag: "1.5.0"
  resources:
    limits:
      habana.ai/gaudi: 1
  securityContext:
    readOnlyRootFilesystem: false
  livenessProbe:
    timeoutSeconds: 1
  readinessProbe:
    timeoutSeconds: 1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
57 changes: 57 additions & 0 deletions
57
helm-charts/common/vllm/templates/horizontal-pod-autoscaler.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# HorizontalPodAutoscaler for the vLLM Deployment. Rendered only when both
# .Values.global.monitoring and .Values.autoscaling.enabled are set.
{{- if and .Values.global.monitoring .Values.autoscaling.enabled }}
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: {{ include "vllm.fullname" . }}
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: {{ include "vllm.fullname" . }}
  minReplicas: 1
  maxReplicas: {{ .Values.autoscaling.maxReplicas }}
  metrics:
    - type: Object
      object:
        describedObject:
          apiVersion: v1
          # get metric for named object of given type (in same namespace)
          kind: Service
          name: {{ include "vllm.fullname" . }}
        target:
          # Metric is sum from all pods. "AverageValue" divides value returned from
          # the custom metrics API by the number of Pods before comparing to the target:
          # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
          # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale-walkthrough/#autoscaling-on-multiple-metrics-and-custom-metrics
          type: AverageValue
          {{- if .Values.accelDevice }}
          # stricter latency target when an accelerator device is configured
          averageValue: 0.1
          {{- else }}
          # allow larger latencies with unaccelerated service
          averageValue: 1.0
          {{- end }}
        metric:
          name: {{ include "vllm.metricPrefix" . }}_token_latency
  behavior:
    scaleDown:
      # remove at most 25% of the pods per 90 s, after a 180 s stabilization window
      stabilizationWindowSeconds: 180
      policies:
        - type: Percent
          value: 25
          periodSeconds: 90
    scaleUp:
      selectPolicy: Max
      stabilizationWindowSeconds: 0
      policies:
        # Slow linear ramp-up in case additional CPU pods go to same node
        # (i.e. interfere with each other)
        - type: Pods
          value: 1
          periodSeconds: 90
        # - type: Percent
        #   value: 25
        #   periodSeconds: 90
{{- end }}
Oops, something went wrong.