diff --git a/helm-charts/chatqna/Chart.yaml b/helm-charts/chatqna/Chart.yaml
index 5a255bf3e..7c52d1318 100644
--- a/helm-charts/chatqna/Chart.yaml
+++ b/helm-charts/chatqna/Chart.yaml
@@ -18,6 +18,11 @@ dependencies:
   - name: tgi
     version: 0-latest
     repository: "file://../common/tgi"
+    condition: tgi.enabled
+  - name: vllm
+    version: 0-latest
+    repository: "file://../common/vllm"
+    condition: vllm.enabled
   - name: tei
     version: 0-latest
     repository: "file://../common/tei"
diff --git a/helm-charts/chatqna/README.md b/helm-charts/chatqna/README.md
index d3b8097c9..eaf17cdc7 100644
--- a/helm-charts/chatqna/README.md
+++ b/helm-charts/chatqna/README.md
@@ -11,6 +11,7 @@ Helm chart for deploying ChatQnA service. ChatQnA depends on the following servi
 - [teirerank](../common/teirerank/README.md)
 - [llm-uservice](../common/llm-uservice/README.md)
 - [tgi](../common/tgi/README.md)
+- [vllm](../common/vllm/README.md)

 ## Installing the Chart

@@ -26,13 +27,15 @@
 export MODELNAME="Intel/neural-chat-7b-v3-3"
 # If you would like to use the traditional UI, please change the image as well as the containerport within the values
 # append these at the end of the command "--set chatqna-ui.image.repository=opea/chatqna-ui,chatqna-ui.image.tag=latest,chatqna-ui.containerPort=5173"
 helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set tgi.LLM_MODEL_ID=${MODELNAME}
-# To use Gaudi device
-#helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set tgi.LLM_MODEL_ID=${MODELNAME} -f chatqna/gaudi-values.yaml
+# To use Gaudi device with TGI
+#helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set tgi.LLM_MODEL_ID=${MODELNAME} -f chatqna/gaudi-tgi-values.yaml
+# To use Gaudi device with vLLM
+#helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set vllm.LLM_MODEL_ID=${MODELNAME} -f chatqna/gaudi-vllm-values.yaml
 # To use Nvidia GPU
 #helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set tgi.LLM_MODEL_ID=${MODELNAME} -f chatqna/nv-values.yaml
-# To include guardrail component in chatqna on Xeon
+# To include guardrail component in chatqna on Xeon with TGI
 #helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} -f chatqna/guardrails-values.yaml
-# To include guardrail component in chatqna on Gaudi
+# To include guardrail component in chatqna on Gaudi with TGI
 #helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} -f chatqna/guardrails-gaudi-values.yaml
 ```
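Reviewer note (illustrative commands, not part of the patch): the new `tgi.enabled` / `vllm.enabled` conditions can be sanity-checked by rendering the chart with the vLLM values file and confirming that only one LLM backend is templated. Assumes the commands run from the `helm-charts` directory of this repo:

```console
$ helm dependency update chatqna
$ helm template chatqna chatqna -f chatqna/gaudi-vllm-values.yaml | grep -E 'name: chatqna-(tgi|vllm)'
```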
diff --git a/helm-charts/chatqna/ci-gaudi-tgi-values.yaml b/helm-charts/chatqna/ci-gaudi-tgi-values.yaml
new file mode 120000
index 000000000..8702c8f68
--- /dev/null
+++ b/helm-charts/chatqna/ci-gaudi-tgi-values.yaml
@@ -0,0 +1 @@
+gaudi-tgi-values.yaml
\ No newline at end of file
diff --git a/helm-charts/chatqna/ci-gaudi-values.yaml b/helm-charts/chatqna/ci-gaudi-values.yaml
deleted file mode 120000
index 7243d31b2..000000000
--- a/helm-charts/chatqna/ci-gaudi-values.yaml
+++ /dev/null
@@ -1 +0,0 @@
-gaudi-values.yaml
\ No newline at end of file
diff --git a/helm-charts/chatqna/ci-gaudi-vllm-values.yaml b/helm-charts/chatqna/ci-gaudi-vllm-values.yaml
new file mode 120000
index 000000000..d9ab8c698
--- /dev/null
+++ b/helm-charts/chatqna/ci-gaudi-vllm-values.yaml
@@ -0,0 +1 @@
+gaudi-vllm-values.yaml
\ No newline at end of file
diff --git a/helm-charts/chatqna/gaudi-values.yaml b/helm-charts/chatqna/gaudi-tgi-values.yaml
similarity index 100%
rename from helm-charts/chatqna/gaudi-values.yaml
rename to helm-charts/chatqna/gaudi-tgi-values.yaml
diff --git a/helm-charts/chatqna/gaudi-vllm-values.yaml b/helm-charts/chatqna/gaudi-vllm-values.yaml
new file mode 100644
index 000000000..3b1873330
--- /dev/null
+++ b/helm-charts/chatqna/gaudi-vllm-values.yaml
@@ -0,0 +1,63 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Accelerate inferencing in heaviest components to improve performance
+# by overriding their subchart values
+
+tgi:
+  enabled: false
+
+vllm:
+  enabled: true
+  accelDevice: "gaudi"
+  image:
+    repository: opea/vllm-gaudi
+    tag: "latest"
+  resources:
+    limits:
+      habana.ai/gaudi: 1
+  startupProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 1
+    failureThreshold: 120
+  readinessProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 1
+  livenessProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 1
+
+  PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
+  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+
+  extraCmdArgs: [
+    "--tensor-parallel-size", "1",
+    "--block-size", "128",
+    "--max-num-seqs", "256",
+    "--max-seq_len-to-capture", "2048"
+  ]
+
+
+# Reranking: second largest bottleneck when reranking is in use
+# (i.e. query context docs have been uploaded with data-prep)
+#
+# TODO: could vLLM be used also for reranking / embedding?
+teirerank:
+  accelDevice: "gaudi"
+  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+  MAX_WARMUP_SEQUENCE_LENGTH: "512"
+  image:
+    repository: ghcr.io/huggingface/tei-gaudi
+    tag: 1.5.0
+  resources:
+    limits:
+      habana.ai/gaudi: 1
+  securityContext:
+    readOnlyRootFilesystem: false
+  livenessProbe:
+    timeoutSeconds: 1
+  readinessProbe:
+    timeoutSeconds: 1
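Illustrative smoke test for the new Gaudi vLLM values (not part of the patch). It assumes the default `chatqna` release name, the default `Intel/neural-chat-7b-v3-3` model, and that the vLLM Service exposes port 80 in front of container port 2080:

```console
$ kubectl port-forward svc/chatqna-vllm 8080:80 &
$ curl http://localhost:8080/v1/completions -H 'Content-Type: application/json' \
  -d '{"model": "Intel/neural-chat-7b-v3-3", "prompt": "What is Deep Learning?", "max_tokens": 32}'
```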
diff --git a/helm-charts/chatqna/hpa-values.yaml b/helm-charts/chatqna/hpa-values.yaml
index a374991f1..4881b2c67 100644
--- a/helm-charts/chatqna/hpa-values.yaml
+++ b/helm-charts/chatqna/hpa-values.yaml
@@ -4,7 +4,7 @@
 # Enable HorizontalPodAutoscaler (HPA)
 #
 # That will overwrite named PrometheusAdapter configMap with ChatQnA specific
-# custom metric queries for embedding, reranking, tgi services.
+# custom metric queries for embedding, reranking, and LLM services.
 #
 # Default upstream configMap is in:
 # - https://github.com/kubernetes-sigs/prometheus-adapter/blob/master/deploy/manifests/config-map.yaml
@@ -15,6 +15,10 @@ autoscaling:

 # Override values in specific subcharts
 # Enabling "autoscaling" for any of the subcharts requires enabling it also above!
+vllm:
+  autoscaling:
+    maxReplicas: 4
+    enabled: true
 tgi:
   autoscaling:
     maxReplicas: 4
diff --git a/helm-charts/chatqna/templates/custom-metrics-configmap.yaml b/helm-charts/chatqna/templates/custom-metrics-configmap.yaml
index 440a4019e..416b8910b 100644
--- a/helm-charts/chatqna/templates/custom-metrics-configmap.yaml
+++ b/helm-charts/chatqna/templates/custom-metrics-configmap.yaml
@@ -13,10 +13,27 @@ metadata:
 data:
   config.yaml: |
     rules:
-    {{- if .Values.tgi.autoscaling.enabled }}
+    {{- if and .Values.vllm.enabled .Values.vllm.autoscaling.enabled }}
     # check metric with:
     # kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/ | jq
     #
+    - seriesQuery: '{__name__="vllm:time_per_output_token_seconds_sum",service="{{ include "vllm.fullname" .Subcharts.vllm }}"}'
+      # Average output token latency from vLLM histograms, over 1 min
+      # (interval should be at least 4x serviceMonitor query interval,
+      # 0.001 divider add is to make sure there's always a valid value)
+      metricsQuery: 'rate(vllm:time_per_output_token_seconds_sum{service="{{ include "vllm.fullname" .Subcharts.vllm }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(vllm:time_per_output_token_seconds_count{service="{{ include "vllm.fullname" .Subcharts.vllm }}",<<.LabelMatchers>>}[1m]))'
+      name:
+        matches: ^vllm:time_per_output_token_seconds_sum
+        as: "{{ include "vllm.metricPrefix" .Subcharts.vllm }}_token_latency"
+      resources:
+        # HPA needs both namespace + suitable object resource for its query paths:
+        # /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/
+        # (pod is not suitable object type for matching as each instance has different name)
+        overrides:
+          namespace: {resource: "namespace"}
+          service: {resource: "service"}
+    {{- end }}
+    {{- if and .Values.tgi.enabled .Values.tgi.autoscaling.enabled }}
     {{- if .Values.tgi.accelDevice }}
     - seriesQuery: '{__name__="tgi_queue_size",service="{{ include "tgi.fullname" .Subcharts.tgi }}"}'
       # TGI instances queue_size sum
@@ -27,16 +44,12 @@ data:
     {{- else }}
     - seriesQuery: '{__name__="tgi_request_inference_duration_sum",service="{{ include "tgi.fullname" .Subcharts.tgi }}"}'
       # Average request latency from TGI histograms, over 1 min
-      # (0.001 divider add is to make sure there's always a valid value)
      metricsQuery: 'rate(tgi_request_inference_duration_sum{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(tgi_request_inference_duration_count{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>}[1m]))'
      name:
        matches: ^tgi_request_inference_duration_sum
        as: "{{ include "tgi.metricPrefix" .Subcharts.tgi }}_request_latency"
    {{- end }}
      resources:
-        # HPA needs both namespace + suitable object resource for its query paths:
-        # /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/
-        # (pod is not suitable object type for matching as each instance has different name)
        overrides:
          namespace: {resource: "namespace"}
          service: {resource: "service"}
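Illustrative check (not part of the patch): once the PrometheusAdapter has reloaded this rule, the renamed metric can be queried directly from the custom metrics API. The metric name below assumes the default `chatqna` release, i.e. a `chatqna_vllm` prefix from `vllm.metricPrefix`:

```console
$ kubectl get --raw '/apis/custom.metrics.k8s.io/v1beta1/namespaces/default/services/*/chatqna_vllm_token_latency' | jq
```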
diff --git a/helm-charts/chatqna/templates/deployment.yaml b/helm-charts/chatqna/templates/deployment.yaml
index 6f19fee10..4a1a1e31c 100644
--- a/helm-charts/chatqna/templates/deployment.yaml
+++ b/helm-charts/chatqna/templates/deployment.yaml
@@ -35,11 +35,19 @@ spec:
         - name: {{ .Release.Name }}
           env:
             - name: LLM_SERVER_HOST_IP
+              {{- if .Values.vllm.enabled }}
+              value: {{ .Release.Name }}-vllm
+              {{- else }}
              value: {{ .Release.Name }}-tgi
+              {{- end }}
            - name: LLM_SERVER_PORT
              value: "80"
            - name: LLM_MODEL
+              {{- if .Values.vllm.enabled }}
+              value: {{ .Values.vllm.LLM_MODEL_ID | quote }}
+              {{- else }}
              value: {{ .Values.tgi.LLM_MODEL_ID | quote }}
+              {{- end }}
            - name: RERANK_SERVER_HOST_IP
              value: {{ .Release.Name }}-teirerank
            - name: RERANK_SERVER_PORT
diff --git a/helm-charts/chatqna/values.yaml b/helm-charts/chatqna/values.yaml
index 5558cf62b..c939f9b9d 100644
--- a/helm-charts/chatqna/values.yaml
+++ b/helm-charts/chatqna/values.yaml
@@ -67,6 +67,10 @@

 # Override values in specific subcharts
 tgi:
+  enabled: true
+  LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
+vllm:
+  enabled: false
   LLM_MODEL_ID: Intel/neural-chat-7b-v3-3

 # disable guardrails-usvc by default
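For illustration only (not part of the patch): with the conditional env wiring above, the backend a deployed release was actually wired to can be listed straight from the rendered Deployment. The deployment name assumes the default `chatqna` release:

```console
$ kubectl set env deployment/chatqna --list | grep -E '^LLM_(SERVER_HOST_IP|SERVER_PORT|MODEL)='
```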
diff --git a/helm-charts/common/agent/values.yaml b/helm-charts/common/agent/values.yaml
index 4e602d960..0ebfd4c31 100644
--- a/helm-charts/common/agent/values.yaml
+++ b/helm-charts/common/agent/values.yaml
@@ -14,7 +14,7 @@ tgi:
 vllm:
   enabled: false
   LLM_MODEL_ID: "mistralai/Mistral-7B-Instruct-v0.3"
-  extraCmdArgs: ["/bin/bash", "-c", "python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model mistralai/Mistral-7B-Instruct-v0.3 --tensor-parallel-size 1 --host 0.0.0.0 --port 2080 --download-dir /data --block-size 128 --max-num-seqs 4096 --max-seq_len-to-capture 8192 --enable-auto-tool-choice --tool-call-parser mistral"]
+  extraCmdArgs: ["--tensor-parallel-size", "1", "--block-size", "128", "--max-num-seqs", "4096", "--max-seq_len-to-capture", "8192", "--enable-auto-tool-choice", "--tool-call-parser", "mistral"]

 replicaCount: 1
 llm_endpoint_url: ""
diff --git a/helm-charts/common/llm-uservice/ci-vllm-gaudi-values.yaml b/helm-charts/common/llm-uservice/ci-vllm-gaudi-values.yaml
index 2438eaed9..0f1170f36 100644
--- a/helm-charts/common/llm-uservice/ci-vllm-gaudi-values.yaml
+++ b/helm-charts/common/llm-uservice/ci-vllm-gaudi-values.yaml
@@ -13,7 +13,7 @@ vllm:
     tag: "latest"
   LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
   OMPI_MCA_btl_vader_single_copy_mechanism: none
-  extraCmdArgs: ["--enforce-eager","--tensor-parallel-size","1","--block-size","128","--max-num-seqs","256","--max-seq_len-to-capture","2048"]
+  extraCmdArgs: ["--tensor-parallel-size","1","--block-size","128","--max-num-seqs","256","--max-seq_len-to-capture","2048"]
   resources:
     limits:
       habana.ai/gaudi: 1
diff --git a/helm-charts/common/tei/values.yaml b/helm-charts/common/tei/values.yaml
index 547c0a91d..cdef58b2b 100644
--- a/helm-charts/common/tei/values.yaml
+++ b/helm-charts/common/tei/values.yaml
@@ -9,7 +9,7 @@ replicaCount: 1

 # Enabling HPA will:
 # - Ignore above replica count, as it will be controlled by HPA
-# - Add example HPA scaling rules with thresholds suitable for Xeon deployments
+# - Add example HPA scaling rules with custom metrics thresholds
 # - Require custom metrics ConfigMap available in the main application chart
 autoscaling:
   maxReplicas: 2
diff --git a/helm-charts/common/teirerank/values.yaml b/helm-charts/common/teirerank/values.yaml
index 254f4d169..745d71849 100644
--- a/helm-charts/common/teirerank/values.yaml
+++ b/helm-charts/common/teirerank/values.yaml
@@ -9,7 +9,7 @@ replicaCount: 1

 # Enabling HPA will:
 # - Ignore above replica count, as it will be controlled by HPA
-# - Add example HPA scaling rules with thresholds suitable for Xeon deployments
+# - Add example HPA scaling rules with custom metrics thresholds
 # - Require custom metrics ConfigMap available in the main application chart
 autoscaling:
   maxReplicas: 3
diff --git a/helm-charts/common/tgi/values.yaml b/helm-charts/common/tgi/values.yaml
index dff938e65..cfc8e7f61 100644
--- a/helm-charts/common/tgi/values.yaml
+++ b/helm-charts/common/tgi/values.yaml
@@ -9,7 +9,7 @@ replicaCount: 1

 # Enabling HPA will:
 # - Ignore above replica count, as it will be controlled by HPA
-# - Add example HPA scaling rules with thresholds suitable for Xeon deployments
+# - Add example HPA scaling rules with custom metrics thresholds
 # - Require custom metrics ConfigMap available in the main application chart
 autoscaling:
   maxReplicas: 4
diff --git a/helm-charts/common/vllm/README.md b/helm-charts/common/vllm/README.md
index 0235a7443..1807cb2ea 100644
--- a/helm-charts/common/vllm/README.md
+++ b/helm-charts/common/vllm/README.md
@@ -51,3 +51,5 @@ curl http://localhost:2080/v1/completions \
 | global.modelUseHostPath | string | `""` | Cached models directory, vllm will not download if the model is cached here. The host path "modelUseHostPath" will be mounted to container as /data directory. Set this to null/empty will force it to download model. |
 | image.repository | string | `"opea/vllm"` | |
 | image.tag | string | `"latest"` | |
+| autoscaling.enabled | bool | `false` | Enable HPA autoscaling for the service deployment based on metrics it provides. See [HPA instructions](../../HPA.md) before enabling! |
+| global.monitoring | bool | `false` | Enable usage metrics for the service. Required for HPA. See [monitoring instructions](../../monitoring.md) before enabling! |
diff --git a/helm-charts/common/vllm/gaudi-values.yaml b/helm-charts/common/vllm/gaudi-values.yaml
index 65e622044..08f4db145 100644
--- a/helm-charts/common/vllm/gaudi-values.yaml
+++ b/helm-charts/common/vllm/gaudi-values.yaml
@@ -5,15 +5,15 @@
 # This is a YAML-formatted file.
 # Declare variables to be passed into your templates.

+accelDevice: "gaudi"
+
 image:
   repository: opea/vllm-gaudi
   tag: "latest"

 # VLLM_CPU_KVCACHE_SPACE: "40"
 OMPI_MCA_btl_vader_single_copy_mechanism: none
-extraCmdArgs: ["--enforce-eager","--tensor-parallel-size","1","--block-size","128","--max-num-seqs","256","--max-seq_len-to-capture","2048"]
-# Workaround for current HPU image with start command /bin/bash
-# extraCmdArgs: ["/bin/bash","-c","python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model Intel/neural-chat-7b-v3-3 --tensor-parallel-size 1 --host 0.0.0.0 --port 2080 --download-dir /data --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048"]
+extraCmdArgs: ["--tensor-parallel-size","1","--block-size","128","--max-num-seqs","256","--max-seq_len-to-capture","2048"]
 resources:
   limits:
     habana.ai/gaudi: 1
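Illustrative mapping of the new README rows to a standalone install (not part of the patch). HPA additionally needs the custom-metrics ConfigMap from the parent application chart, so only monitoring is enabled here; the release name is arbitrary:

```console
$ helm install myvllm common/vllm -f common/vllm/gaudi-values.yaml --set global.monitoring=true
```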
diff --git a/helm-charts/common/vllm/templates/_helpers.tpl b/helm-charts/common/vllm/templates/_helpers.tpl
index 63ec9e61d..3dd629e1f 100644
--- a/helm-charts/common/vllm/templates/_helpers.tpl
+++ b/helm-charts/common/vllm/templates/_helpers.tpl
@@ -30,6 +30,13 @@ Create chart name and version as used by the chart label.
 {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
 {{- end }}

+{{/*
+Convert chart name to a string suitable as metric prefix
+*/}}
+{{- define "vllm.metricPrefix" -}}
+{{- include "vllm.fullname" . | replace "-" "_" | regexFind "[a-zA-Z_:][a-zA-Z0-9_:]*" }}
+{{- end }}
+
 {{/*
 Common labels
 */}}
diff --git a/helm-charts/common/vllm/templates/configmap.yaml b/helm-charts/common/vllm/templates/configmap.yaml
index 14a8ba240..5fbbf6b79 100644
--- a/helm-charts/common/vllm/templates/configmap.yaml
+++ b/helm-charts/common/vllm/templates/configmap.yaml
@@ -25,6 +25,9 @@ data:
   {{- if .Values.VLLM_CPU_KVCACHE_SPACE }}
   VLLM_CPU_KVCACHE_SPACE: {{ .Values.VLLM_CPU_KVCACHE_SPACE | quote}}
   {{- end }}
+  {{- if .Values.PT_HPU_ENABLE_LAZY_COLLECTIVES }}
+  PT_HPU_ENABLE_LAZY_COLLECTIVES: {{ .Values.PT_HPU_ENABLE_LAZY_COLLECTIVES | quote }}
+  {{- end }}
   {{- if .Values.OMPI_MCA_btl_vader_single_copy_mechanism }}
   OMPI_MCA_btl_vader_single_copy_mechanism: {{ .Values.OMPI_MCA_btl_vader_single_copy_mechanism | quote}}
   {{- end }}
diff --git a/helm-charts/common/vllm/templates/deployment.yaml b/helm-charts/common/vllm/templates/deployment.yaml
index afa559cd6..30f736b6b 100644
--- a/helm-charts/common/vllm/templates/deployment.yaml
+++ b/helm-charts/common/vllm/templates/deployment.yaml
@@ -8,7 +8,10 @@ metadata:
   labels:
     {{- include "vllm.labels" . | nindent 4 }}
 spec:
+  {{- if ne (int .Values.replicaCount) 1 }}
+  # remove if replica count should not be reset on pod update (e.g. with HPA)
   replicas: {{ .Values.replicaCount }}
+  {{- end }}
   selector:
     matchLabels:
       {{- include "vllm.selectorLabels" . | nindent 6 }}
@@ -159,3 +162,7 @@ spec:
               matchLabels:
                 {{- include "vllm.selectorLabels" . | nindent 14 }}
       {{- end }}
+      {{- if not .Values.accelDevice }}
+      # extra time to finish processing buffered requests on CPU before pod is forcibly terminated
+      terminationGracePeriodSeconds: 120
+      {{- end }}
diff --git a/helm-charts/common/vllm/templates/horizontal-pod-autoscaler.yaml b/helm-charts/common/vllm/templates/horizontal-pod-autoscaler.yaml
new file mode 100644
index 000000000..c52861631
--- /dev/null
+++ b/helm-charts/common/vllm/templates/horizontal-pod-autoscaler.yaml
@@ -0,0 +1,57 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+{{- if and .Values.global.monitoring .Values.autoscaling.enabled }}
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: {{ include "vllm.fullname" . }}
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: {{ include "vllm.fullname" . }}
+  minReplicas: 1
+  maxReplicas: {{ .Values.autoscaling.maxReplicas }}
+  metrics:
+  - type: Object
+    object:
+      describedObject:
+        apiVersion: v1
+        # get metric for named object of given type (in same namespace)
+        kind: Service
+        name: {{ include "vllm.fullname" . }}
+      target:
+        # Metric is sum from all pods. "AverageValue" divides value returned from
+        # the custom metrics API by the number of Pods before comparing to the target:
+        # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
+        # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale-walkthrough/#autoscaling-on-multiple-metrics-and-custom-metrics
+        type: AverageValue
+{{- if .Values.accelDevice }}
+        averageValue: 0.1
+{{- else }}
+        # allow larger latencies with unaccelerated service
+        averageValue: 1.0
+{{- end }}
+      metric:
+        name: {{ include "vllm.metricPrefix" . }}_token_latency
+  behavior:
+    scaleDown:
+      stabilizationWindowSeconds: 180
+      policies:
+      - type: Percent
+        value: 25
+        periodSeconds: 90
+    scaleUp:
+      selectPolicy: Max
+      stabilizationWindowSeconds: 0
+      policies:
+      # Slow linear rampup in case additional CPU pods go to same node
+      # (i.e. interfere with each other)
+      - type: Pods
+        value: 1
+        periodSeconds: 90
+      #- type: Percent
+      #  value: 25
+      #  periodSeconds: 90
+{{- end }}
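Illustrative follow-up (not part of the patch): after deploying with `autoscaling.enabled=true` and `global.monitoring=true`, the resulting HPA and its token-latency target can be observed; the object name assumes the default `chatqna` release:

```console
$ kubectl get hpa chatqna-vllm --watch
$ kubectl describe hpa chatqna-vllm | grep -A4 'Metrics:'
```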
diff --git a/helm-charts/common/vllm/templates/servicemonitor.yaml b/helm-charts/common/vllm/templates/servicemonitor.yaml
new file mode 100644
index 000000000..d1a8a2c76
--- /dev/null
+++ b/helm-charts/common/vllm/templates/servicemonitor.yaml
@@ -0,0 +1,23 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+# Dashboard for the exposed vLLM metrics:
+# - https://github.com/vllm-project/vllm/tree/main/examples/production_monitoring/
+# Metric descriptions:
+# - https://docs.vllm.ai/en/stable/serving/metrics.html
+
+{{- if .Values.global.monitoring }}
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: {{ include "vllm.fullname" . }}
+  labels:
+    release: {{ .Values.global.prometheusRelease }}
+spec:
+  selector:
+    matchLabels:
+      {{- include "vllm.selectorLabels" . | nindent 6 }}
+  endpoints:
+  - port: vllm
+    interval: 5s
+{{- end }}
diff --git a/helm-charts/common/vllm/values.yaml b/helm-charts/common/vllm/values.yaml
index c8958e3e0..6e9e6c9c2 100644
--- a/helm-charts/common/vllm/values.yaml
+++ b/helm-charts/common/vllm/values.yaml
@@ -7,6 +7,17 @@

 replicaCount: 1

+# Enabling HPA will:
+# - Ignore above replica count, as it will be controlled by HPA
+# - Add example HPA scaling rules with custom metrics thresholds
+# - Require custom metrics ConfigMap available in the main application chart
+autoscaling:
+  maxReplicas: 4
+  enabled: false
+
+# empty for CPU (longer latencies are tolerated before HPA scaling unaccelerated service)
+accelDevice: ""
+
 port: 2080
 shmSize: 1Gi
 image:
@@ -62,7 +73,7 @@ resources: {}
 #   cpu: 100m
 #   memory: 128Mi

-extraCmdArgs: ["--enforce-eager", "--dtype", "auto"]
+extraCmdArgs: []

 livenessProbe:
   httpGet:
@@ -92,6 +103,11 @@ tolerations: []
 affinity: {}

 LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
+
+# Environment variables for vLLM (set in configmap):
+# https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html#environment-variables
+OMPI_MCA_btl_vader_single_copy_mechanism: ""
+PT_HPU_ENABLE_LAZY_COLLECTIVES: ""
 VLLM_CPU_KVCACHE_SPACE: ""

 global:
@@ -113,3 +129,9 @@ global:
   # By default, both var are set to empty, the model will be downloaded and saved to a tmp volume.
   modelUseHostPath: ""
   modelUsePVC: ""
+
+  # Install Prometheus serviceMonitor for service
+  monitoring: false
+
+  # Prometheus Helm install release name for serviceMonitor
+  prometheusRelease: prometheus-stack
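Illustrative scrape-target check (not part of the patch) before moving on to the Prometheus-side queries below; names assume the default `chatqna` release and a vLLM Service port 80 in front of container port 2080:

```console
$ kubectl get servicemonitors chatqna-vllm
$ kubectl port-forward svc/chatqna-vllm 8080:80 &
$ curl -s http://localhost:8080/metrics | grep '^vllm:' | head
```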
diff --git a/helm-charts/monitoring.md b/helm-charts/monitoring.md
index 09c1ec37e..011711d0c 100644
--- a/helm-charts/monitoring.md
+++ b/helm-charts/monitoring.md
@@ -75,7 +75,16 @@ $ prom_url=http://$(kubectl -n $prom_ns get -o jsonpath="{.spec.clusterIP}:{.spe
 $ curl --no-progress-meter $prom_url/metrics | grep scrape_pool_targets.*$chart
 ```

-Check that Prometheus metrics from TGI inference component are available:
+Then check that Prometheus metrics from a relevant LLM inferencing service are available.
+
+For vLLM:
+
+```console
+$ curl --no-progress-meter $prom_url/api/v1/query? \
+  --data-urlencode 'query=vllm:cache_config_info{service="'$chart'-vllm"}' | jq
+```
+
+Or TGI:

 ```console
 $ curl --no-progress-meter $prom_url/api/v1/query? \
@@ -83,4 +92,4 @@ $ curl --no-progress-meter $prom_url/api/v1/query? \
 ```

 **NOTE**: services provide metrics only after they've processed their first request.
-And reranking service will be used only after context data has been uploaded!
+And ChatQnA uses the (TEI) reranking service only after query context data has been uploaded!
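As a final illustrative check (not part of the patch), the same `rate()` expression that the PrometheusAdapter rule turns into the HPA's token-latency metric can be queried directly, reusing the `$prom_url` and `$chart` variables from monitoring.md:

```console
$ curl --no-progress-meter $prom_url/api/v1/query? --data-urlencode \
  'query=rate(vllm:time_per_output_token_seconds_sum{service="'$chart'-vllm"}[1m]) / (0.001+rate(vllm:time_per_output_token_seconds_count{service="'$chart'-vllm"}[1m]))' | jq
```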