diff --git a/helm-charts/chatqna/Chart.yaml b/helm-charts/chatqna/Chart.yaml
index 7d22d2811..4c690badc 100644
--- a/helm-charts/chatqna/Chart.yaml
+++ b/helm-charts/chatqna/Chart.yaml
@@ -18,6 +18,11 @@ dependencies:
   - name: tgi
     version: 1.0.0
     repository: "file://../common/tgi"
+    condition: tgi.enabled
+  - name: vllm
+    version: 1.0.0
+    repository: "file://../common/vllm"
+    condition: vllm.enabled
   - name: tei
     version: 1.0.0
     repository: "file://../common/tei"
diff --git a/helm-charts/chatqna/README.md b/helm-charts/chatqna/README.md
index d3b8097c9..eaf17cdc7 100644
--- a/helm-charts/chatqna/README.md
+++ b/helm-charts/chatqna/README.md
@@ -11,6 +11,7 @@ Helm chart for deploying ChatQnA service. ChatQnA depends on the following servi
 - [teirerank](../common/teirerank/README.md)
 - [llm-uservice](../common/llm-uservice/README.md)
 - [tgi](../common/tgi/README.md)
+- [vllm](../common/vllm/README.md)

 ## Installing the Chart

@@ -26,13 +27,15 @@ export MODELNAME="Intel/neural-chat-7b-v3-3"
 # If you would like to use the traditional UI, please change the image as well as the containerport within the values
 # append these at the end of the command "--set chatqna-ui.image.repository=opea/chatqna-ui,chatqna-ui.image.tag=latest,chatqna-ui.containerPort=5173"
 helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set tgi.LLM_MODEL_ID=${MODELNAME}
-# To use Gaudi device
-#helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set tgi.LLM_MODEL_ID=${MODELNAME} -f chatqna/gaudi-values.yaml
+# To use Gaudi device with TGI
+#helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set tgi.LLM_MODEL_ID=${MODELNAME} -f chatqna/gaudi-tgi-values.yaml
+# To use Gaudi device with vLLM
+#helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set vllm.LLM_MODEL_ID=${MODELNAME} -f chatqna/gaudi-vllm-values.yaml
 # To use Nvidia GPU
 #helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set tgi.LLM_MODEL_ID=${MODELNAME} -f chatqna/nv-values.yaml
-# To include guardrail component in chatqna on Xeon
+# To include guardrail component in chatqna on Xeon with TGI
 #helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} -f chatqna/guardrails-values.yaml
-# To include guardrail component in chatqna on Gaudi
+# To include guardrail component in chatqna on Gaudi with TGI
 #helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} -f chatqna/guardrails-gaudi-values.yaml
 ```

diff --git a/helm-charts/chatqna/ci-gaudi-tgi-values.yaml b/helm-charts/chatqna/ci-gaudi-tgi-values.yaml
new file mode 120000
index 000000000..8702c8f68
--- /dev/null
+++ b/helm-charts/chatqna/ci-gaudi-tgi-values.yaml
@@ -0,0 +1 @@
+gaudi-tgi-values.yaml
\ No newline at end of file
diff --git a/helm-charts/chatqna/ci-gaudi-values.yaml b/helm-charts/chatqna/ci-gaudi-values.yaml
deleted file mode 120000
index 7243d31b2..000000000
--- a/helm-charts/chatqna/ci-gaudi-values.yaml
+++ /dev/null
@@ -1 +0,0 @@
-gaudi-values.yaml
\ No newline at end of file
diff --git a/helm-charts/chatqna/ci-gaudi-vllm-values.yaml b/helm-charts/chatqna/ci-gaudi-vllm-values.yaml
new file mode 120000
index 000000000..d9ab8c698
--- /dev/null
+++ b/helm-charts/chatqna/ci-gaudi-vllm-values.yaml
@@ -0,0 +1 @@
+gaudi-vllm-values.yaml
\ No newline at end of file
diff --git a/helm-charts/chatqna/gaudi-values.yaml b/helm-charts/chatqna/gaudi-tgi-values.yaml
similarity index 100%
rename from helm-charts/chatqna/gaudi-values.yaml
rename to helm-charts/chatqna/gaudi-tgi-values.yaml
diff --git a/helm-charts/chatqna/gaudi-vllm-values.yaml b/helm-charts/chatqna/gaudi-vllm-values.yaml
new file mode 100644
index 000000000..27eb3c9e3
--- /dev/null
+++ b/helm-charts/chatqna/gaudi-vllm-values.yaml
@@ -0,0 +1,66 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Accelerate inferencing in heaviest components to improve performance
+# by overriding their subchart values
+
+tgi:
+  enabled: false
+
+vllm:
+  enabled: true
+  image:
+    repository: opea/vllm-gaudi
+    tag: "latest"
+  resources:
+    limits:
+      habana.ai/gaudi: 1
+  startupProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 1
+    failureThreshold: 120
+  readinessProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 1
+  livenessProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 1
+
+  # TODO: these are taken from GenAIExamples HPU manifest as-is
+  # vLLM chart needs to adopt / apply relevant ones
+  HABANA_LOGS: "/tmp/habana_logs"
+  NUMBA_CACHE_DIR: "/tmp"
+  PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
+  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+  HF_HOME: "/tmp/.cache/huggingface"
+  GPU_MEMORY_UTILIZATION: "0.5"
+  DTYPE: "auto"
+  TENSOR_PARALLEL_SIZE: "1"
+  BLOCK_SIZE: "128"
+  MAX_NUM_SEQS: "256"
+  MAX_SEQ_LEN_TO_CAPTURE: "2048"
+
+
+# Reranking: second largest bottleneck when reranking is in use
+# (i.e. query context docs have been uploaded with data-prep)
+#
+# TODO: could vLLM be used also for reranking / embedding?
+teirerank:
+  accelDevice: "gaudi"
+  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+  MAX_WARMUP_SEQUENCE_LENGTH: "512"
+  image:
+    repository: ghcr.io/huggingface/tei-gaudi
+    tag: 1.5.0
+  resources:
+    limits:
+      habana.ai/gaudi: 1
+  securityContext:
+    readOnlyRootFilesystem: false
+  livenessProbe:
+    timeoutSeconds: 1
+  readinessProbe:
+    timeoutSeconds: 1
diff --git a/helm-charts/chatqna/templates/deployment.yaml b/helm-charts/chatqna/templates/deployment.yaml
index 812d38486..d7381f1b1 100644
--- a/helm-charts/chatqna/templates/deployment.yaml
+++ b/helm-charts/chatqna/templates/deployment.yaml
@@ -34,11 +34,19 @@ spec:
         - name: {{ .Release.Name }}
           env:
             - name: LLM_SERVER_HOST_IP
+              {{- if .Values.vllm.enabled }}
+              value: {{ .Release.Name }}-vllm
+              {{- else }}
               value: {{ .Release.Name }}-tgi
+              {{- end }}
             - name: LLM_SERVER_PORT
               value: "80"
             - name: LLM_MODEL
+              {{- if .Values.vllm.enabled }}
+              value: {{ .Values.vllm.LLM_MODEL_ID | quote }}
+              {{- else }}
               value: {{ .Values.tgi.LLM_MODEL_ID | quote }}
+              {{- end }}
             - name: RERANK_SERVER_HOST_IP
               value: {{ .Release.Name }}-teirerank
             - name: RERANK_SERVER_PORT
diff --git a/helm-charts/chatqna/values.yaml b/helm-charts/chatqna/values.yaml
index 0cd82d7ff..761bb9ce0 100644
--- a/helm-charts/chatqna/values.yaml
+++ b/helm-charts/chatqna/values.yaml
@@ -46,7 +46,25 @@ autoscaling:

 # Override values in specific subcharts
 tgi:
+  enabled: true
   LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
+vllm:
+  enabled: false
+  # TODO: manifest in GenAIExamples uses "meta-llama/Meta-Llama-3-8B-Instruct" instead?
+  LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
+  # TODO: these are the non-redundant/non-broken options used by the Agent component,
+  # but I think their values should be handled inside the vLLM component, with the
+  # deployment applying numbers set in a configMap, based on values YAML file
+  # variables.
+  extraCmdArgs: [
+    "--enforce-eager",
+    "--tensor-parallel-size", "1",
+    "--dtype", "auto",
+    "--block-size", "128",
+    "--max-num-seqs", "256",
+    "--max-seq_len-to-capture", "2048",
+    "--gpu-memory-utilization", "0.5"
+  ]

 # disable guardrails-usvc by default
 # See guardrails-values.yaml for guardrail related options
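
For reviewers who want to exercise the new vLLM path end to end, a minimal smoke test is sketched below. It only relies on what this diff shows (the `gaudi-vllm-values.yaml` override file, the `vllm.LLM_MODEL_ID` value, and the `{{ .Release.Name }}-vllm` host the gateway is pointed at); the `chatqna-vllm` deployment/service names, the port-forward port, and the OpenAI-compatible `/v1/completions` route served by `opea/vllm-gaudi` are assumptions, not something this change confirms.

```console
# Sketch: install ChatQnA on Gaudi with vLLM enabled and smoke-test the LLM backend.
# Assumes release name "chatqna"; resource names derived from it are guesses.
helm install chatqna chatqna \
  --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} \
  --set global.modelUseHostPath=${MODELDIR} \
  --set vllm.LLM_MODEL_ID=${MODELNAME} \
  -f chatqna/gaudi-vllm-values.yaml

# The startup probe above allows up to 120 x 5s, so give the rollout ~10 minutes.
kubectl rollout status deployment/chatqna-vllm --timeout=600s

# Query the vLLM service directly; an OpenAI-compatible API on service port 80 is assumed
# (the gateway itself only needs LLM_SERVER_HOST_IP/LLM_SERVER_PORT set as in deployment.yaml).
kubectl port-forward svc/chatqna-vllm 8080:80 &
curl http://localhost:8080/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "'"${MODELNAME}"'", "prompt": "What is deep learning?", "max_tokens": 32}'
```

If the completion comes back, the `LLM_SERVER_HOST_IP` switch in `templates/deployment.yaml` can then be verified through the ChatQnA gateway itself, the same way as for the TGI path.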