diff --git a/helm-charts/chatqna/Chart.yaml b/helm-charts/chatqna/Chart.yaml
index 5a255bf3e..7c52d1318 100644
--- a/helm-charts/chatqna/Chart.yaml
+++ b/helm-charts/chatqna/Chart.yaml
@@ -18,6 +18,11 @@ dependencies:
   - name: tgi
     version: 0-latest
     repository: "file://../common/tgi"
+    condition: tgi.enabled
+  - name: vllm
+    version: 0-latest
+    repository: "file://../common/vllm"
+    condition: vllm.enabled
   - name: tei
     version: 0-latest
     repository: "file://../common/tei"
diff --git a/helm-charts/chatqna/README.md b/helm-charts/chatqna/README.md
index d3b8097c9..eaf17cdc7 100644
--- a/helm-charts/chatqna/README.md
+++ b/helm-charts/chatqna/README.md
@@ -11,6 +11,7 @@ Helm chart for deploying ChatQnA service. ChatQnA depends on the following servi
 - [teirerank](../common/teirerank/README.md)
 - [llm-uservice](../common/llm-uservice/README.md)
 - [tgi](../common/tgi/README.md)
+- [vllm](../common/vllm/README.md)

 ## Installing the Chart

@@ -26,13 +27,15 @@
 export MODELNAME="Intel/neural-chat-7b-v3-3"
 # If you would like to use the traditional UI, please change the image as well as the containerport within the values
 # append these at the end of the command "--set chatqna-ui.image.repository=opea/chatqna-ui,chatqna-ui.image.tag=latest,chatqna-ui.containerPort=5173"
 helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set tgi.LLM_MODEL_ID=${MODELNAME}
-# To use Gaudi device
-#helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set tgi.LLM_MODEL_ID=${MODELNAME} -f chatqna/gaudi-values.yaml
+# To use Gaudi device with TGI
+#helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set tgi.LLM_MODEL_ID=${MODELNAME} -f chatqna/gaudi-tgi-values.yaml
+# To use Gaudi device with vLLM
+#helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set vllm.LLM_MODEL_ID=${MODELNAME} -f chatqna/gaudi-vllm-values.yaml
 # To use Nvidia GPU
 #helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} --set tgi.LLM_MODEL_ID=${MODELNAME} -f chatqna/nv-values.yaml
-# To include guardrail component in chatqna on Xeon
+# To include guardrail component in chatqna on Xeon with TGI
 #helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} -f chatqna/guardrails-values.yaml
-# To include guardrail component in chatqna on Gaudi
+# To include guardrail component in chatqna on Gaudi with TGI
 #helm install chatqna chatqna --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --set global.modelUseHostPath=${MODELDIR} -f chatqna/guardrails-gaudi-values.yaml
 ```
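Reviewer note (illustrative commands, not part of the patch): the new `tgi.enabled` / `vllm.enabled` conditions can be sanity-checked by rendering the chart with the vLLM values file and confirming that only one LLM backend is templated. Assumes the commands run from the `helm-charts` directory of this repo:

```console
$ helm dependency update chatqna
$ helm template chatqna chatqna -f chatqna/gaudi-vllm-values.yaml | grep -E 'name: chatqna-(tgi|vllm)'
```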
diff --git a/helm-charts/chatqna/ci-gaudi-tgi-values.yaml b/helm-charts/chatqna/ci-gaudi-tgi-values.yaml
new file mode 120000
index 000000000..8702c8f68
--- /dev/null
+++ b/helm-charts/chatqna/ci-gaudi-tgi-values.yaml
@@ -0,0 +1 @@
+gaudi-tgi-values.yaml
\ No newline at end of file
diff --git a/helm-charts/chatqna/ci-gaudi-values.yaml b/helm-charts/chatqna/ci-gaudi-values.yaml
deleted file mode 120000
index 7243d31b2..000000000
--- a/helm-charts/chatqna/ci-gaudi-values.yaml
+++ /dev/null
@@ -1 +0,0 @@
-gaudi-values.yaml
\ No newline at end of file
diff --git a/helm-charts/chatqna/ci-gaudi-vllm-values.yaml b/helm-charts/chatqna/ci-gaudi-vllm-values.yaml
new file mode 120000
index 000000000..d9ab8c698
--- /dev/null
+++ b/helm-charts/chatqna/ci-gaudi-vllm-values.yaml
@@ -0,0 +1 @@
+gaudi-vllm-values.yaml
\ No newline at end of file
diff --git a/helm-charts/chatqna/gaudi-values.yaml b/helm-charts/chatqna/gaudi-tgi-values.yaml
similarity index 100%
rename from helm-charts/chatqna/gaudi-values.yaml
rename to helm-charts/chatqna/gaudi-tgi-values.yaml
diff --git a/helm-charts/chatqna/gaudi-vllm-values.yaml b/helm-charts/chatqna/gaudi-vllm-values.yaml
new file mode 100644
index 000000000..3b1873330
--- /dev/null
+++ b/helm-charts/chatqna/gaudi-vllm-values.yaml
@@ -0,0 +1,63 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Accelerate inferencing in heaviest components to improve performance
+# by overriding their subchart values
+
+tgi:
+  enabled: false
+
+vllm:
+  enabled: true
+  accelDevice: "gaudi"
+  image:
+    repository: opea/vllm-gaudi
+    tag: "latest"
+  resources:
+    limits:
+      habana.ai/gaudi: 1
+  startupProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 1
+    failureThreshold: 120
+  readinessProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 1
+  livenessProbe:
+    initialDelaySeconds: 5
+    periodSeconds: 5
+    timeoutSeconds: 1
+
+  PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
+  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+
+  extraCmdArgs: [
+    "--tensor-parallel-size", "1",
+    "--block-size", "128",
+    "--max-num-seqs", "256",
+    "--max-seq_len-to-capture", "2048"
+  ]
+
+
+# Reranking: second largest bottleneck when reranking is in use
+# (i.e. query context docs have been uploaded with data-prep)
+#
+# TODO: could vLLM be used also for reranking / embedding?
+teirerank:
+  accelDevice: "gaudi"
+  OMPI_MCA_btl_vader_single_copy_mechanism: "none"
+  MAX_WARMUP_SEQUENCE_LENGTH: "512"
+  image:
+    repository: ghcr.io/huggingface/tei-gaudi
+    tag: 1.5.0
+  resources:
+    limits:
+      habana.ai/gaudi: 1
+  securityContext:
+    readOnlyRootFilesystem: false
+  livenessProbe:
+    timeoutSeconds: 1
+  readinessProbe:
+    timeoutSeconds: 1
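Illustrative smoke test for the new Gaudi vLLM values (not part of the patch). It assumes the default `chatqna` release name, the default `Intel/neural-chat-7b-v3-3` model, and that the vLLM Service exposes port 80 in front of container port 2080:

```console
$ kubectl port-forward svc/chatqna-vllm 8080:80 &
$ curl http://localhost:8080/v1/completions -H 'Content-Type: application/json' \
  -d '{"model": "Intel/neural-chat-7b-v3-3", "prompt": "What is Deep Learning?", "max_tokens": 32}'
```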
diff --git a/helm-charts/chatqna/hpa-values.yaml b/helm-charts/chatqna/hpa-values.yaml
index a374991f1..4881b2c67 100644
--- a/helm-charts/chatqna/hpa-values.yaml
+++ b/helm-charts/chatqna/hpa-values.yaml
@@ -4,7 +4,7 @@
 # Enable HorizontalPodAutoscaler (HPA)
 #
 # That will overwrite named PrometheusAdapter configMap with ChatQnA specific
-# custom metric queries for embedding, reranking, tgi services.
+# custom metric queries for embedding, reranking, and LLM services.
 #
 # Default upstream configMap is in:
 # - https://github.com/kubernetes-sigs/prometheus-adapter/blob/master/deploy/manifests/config-map.yaml
@@ -15,6 +15,10 @@ autoscaling:

 # Override values in specific subcharts
 # Enabling "autoscaling" for any of the subcharts requires enabling it also above!
+vllm:
+  autoscaling:
+    maxReplicas: 4
+    enabled: true
 tgi:
   autoscaling:
     maxReplicas: 4
diff --git a/helm-charts/chatqna/templates/custom-metrics-configmap.yaml b/helm-charts/chatqna/templates/custom-metrics-configmap.yaml
index 440a4019e..416b8910b 100644
--- a/helm-charts/chatqna/templates/custom-metrics-configmap.yaml
+++ b/helm-charts/chatqna/templates/custom-metrics-configmap.yaml
@@ -13,10 +13,27 @@ metadata:
 data:
   config.yaml: |
     rules:
-    {{- if .Values.tgi.autoscaling.enabled }}
+    {{- if and .Values.vllm.enabled .Values.vllm.autoscaling.enabled }}
     # check metric with:
     # kubectl get --raw /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/ | jq
     #
+    - seriesQuery: '{__name__="vllm:time_per_output_token_seconds_sum",service="{{ include "vllm.fullname" .Subcharts.vllm }}"}'
+      # Average output token latency from vLLM histograms, over 1 min
+      # (interval should be at least 4x serviceMonitor query interval,
+      # 0.001 divider add is to make sure there's always a valid value)
+      metricsQuery: 'rate(vllm:time_per_output_token_seconds_sum{service="{{ include "vllm.fullname" .Subcharts.vllm }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(vllm:time_per_output_token_seconds_count{service="{{ include "vllm.fullname" .Subcharts.vllm }}",<<.LabelMatchers>>}[1m]))'
+      name:
+        matches: ^vllm:time_per_output_token_seconds_sum
+        as: "{{ include "vllm.metricPrefix" .Subcharts.vllm }}_token_latency"
+      resources:
+        # HPA needs both namespace + suitable object resource for its query paths:
+        # /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/
+        # (pod is not suitable object type for matching as each instance has different name)
+        overrides:
+          namespace: {resource: "namespace"}
+          service: {resource: "service"}
+    {{- end }}
+    {{- if and .Values.tgi.enabled .Values.tgi.autoscaling.enabled }}
     {{- if .Values.tgi.accelDevice }}
     - seriesQuery: '{__name__="tgi_queue_size",service="{{ include "tgi.fullname" .Subcharts.tgi }}"}'
       # TGI instances queue_size sum
@@ -27,16 +44,12 @@ data:
     {{- else }}
     - seriesQuery: '{__name__="tgi_request_inference_duration_sum",service="{{ include "tgi.fullname" .Subcharts.tgi }}"}'
       # Average request latency from TGI histograms, over 1 min
-      # (0.001 divider add is to make sure there's always a valid value)
      metricsQuery: 'rate(tgi_request_inference_duration_sum{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>}[1m]) / (0.001+rate(tgi_request_inference_duration_count{service="{{ include "tgi.fullname" .Subcharts.tgi }}",<<.LabelMatchers>>}[1m]))'
      name:
        matches: ^tgi_request_inference_duration_sum
        as: "{{ include "tgi.metricPrefix" .Subcharts.tgi }}_request_latency"
    {{- end }}
      resources:
-        # HPA needs both namespace + suitable object resource for its query paths:
-        # /apis/custom.metrics.k8s.io/v1beta1/namespaces/default/service/*/
-        # (pod is not suitable object type for matching as each instance has different name)
        overrides:
          namespace: {resource: "namespace"}
          service: {resource: "service"}
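Illustrative check (not part of the patch): once the PrometheusAdapter has reloaded this rule, the renamed metric can be queried directly from the custom metrics API. The metric name below assumes the default `chatqna` release, i.e. a `chatqna_vllm` prefix from `vllm.metricPrefix`:

```console
$ kubectl get --raw '/apis/custom.metrics.k8s.io/v1beta1/namespaces/default/services/*/chatqna_vllm_token_latency' | jq
```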
diff --git a/helm-charts/chatqna/templates/deployment.yaml b/helm-charts/chatqna/templates/deployment.yaml
index 6f19fee10..4a1a1e31c 100644
--- a/helm-charts/chatqna/templates/deployment.yaml
+++ b/helm-charts/chatqna/templates/deployment.yaml
@@ -35,11 +35,19 @@ spec:
         - name: {{ .Release.Name }}
           env:
             - name: LLM_SERVER_HOST_IP
+              {{- if .Values.vllm.enabled }}
+              value: {{ .Release.Name }}-vllm
+              {{- else }}
              value: {{ .Release.Name }}-tgi
+              {{- end }}
            - name: LLM_SERVER_PORT
              value: "80"
            - name: LLM_MODEL
+              {{- if .Values.vllm.enabled }}
+              value: {{ .Values.vllm.LLM_MODEL_ID | quote }}
+              {{- else }}
              value: {{ .Values.tgi.LLM_MODEL_ID | quote }}
+              {{- end }}
            - name: RERANK_SERVER_HOST_IP
              value: {{ .Release.Name }}-teirerank
            - name: RERANK_SERVER_PORT
diff --git a/helm-charts/chatqna/values.yaml b/helm-charts/chatqna/values.yaml
index 5558cf62b..c939f9b9d 100644
--- a/helm-charts/chatqna/values.yaml
+++ b/helm-charts/chatqna/values.yaml
@@ -67,6 +67,10 @@

 # Override values in specific subcharts
 tgi:
+  enabled: true
+  LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
+vllm:
+  enabled: false
   LLM_MODEL_ID: Intel/neural-chat-7b-v3-3

 # disable guardrails-usvc by default
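For illustration only (not part of the patch): with the conditional env wiring above, the backend a deployed release was actually wired to can be listed straight from the rendered Deployment. The deployment name assumes the default `chatqna` release:

```console
$ kubectl set env deployment/chatqna --list | grep -E '^LLM_(SERVER_HOST_IP|SERVER_PORT|MODEL)='
```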
diff --git a/helm-charts/common/agent/values.yaml b/helm-charts/common/agent/values.yaml
index 4e602d960..0ebfd4c31 100644
--- a/helm-charts/common/agent/values.yaml
+++ b/helm-charts/common/agent/values.yaml
@@ -14,7 +14,7 @@ tgi:
 vllm:
   enabled: false
   LLM_MODEL_ID: "mistralai/Mistral-7B-Instruct-v0.3"
-  extraCmdArgs: ["/bin/bash", "-c", "python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model mistralai/Mistral-7B-Instruct-v0.3 --tensor-parallel-size 1 --host 0.0.0.0 --port 2080 --download-dir /data --block-size 128 --max-num-seqs 4096 --max-seq_len-to-capture 8192 --enable-auto-tool-choice --tool-call-parser mistral"]
+  extraCmdArgs: ["--tensor-parallel-size", "1", "--block-size", "128", "--max-num-seqs", "4096", "--max-seq_len-to-capture", "8192", "--enable-auto-tool-choice", "--tool-call-parser", "mistral"]

 replicaCount: 1
 llm_endpoint_url: ""
diff --git a/helm-charts/common/llm-uservice/ci-vllm-gaudi-values.yaml b/helm-charts/common/llm-uservice/ci-vllm-gaudi-values.yaml
index 2438eaed9..0f1170f36 100644
--- a/helm-charts/common/llm-uservice/ci-vllm-gaudi-values.yaml
+++ b/helm-charts/common/llm-uservice/ci-vllm-gaudi-values.yaml
@@ -13,7 +13,7 @@ vllm:
     tag: "latest"
   LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
   OMPI_MCA_btl_vader_single_copy_mechanism: none
-  extraCmdArgs: ["--enforce-eager","--tensor-parallel-size","1","--block-size","128","--max-num-seqs","256","--max-seq_len-to-capture","2048"]
+  extraCmdArgs: ["--tensor-parallel-size","1","--block-size","128","--max-num-seqs","256","--max-seq_len-to-capture","2048"]
   resources:
     limits:
       habana.ai/gaudi: 1
diff --git a/helm-charts/common/tei/values.yaml b/helm-charts/common/tei/values.yaml
index 547c0a91d..cdef58b2b 100644
--- a/helm-charts/common/tei/values.yaml
+++ b/helm-charts/common/tei/values.yaml
@@ -9,7 +9,7 @@ replicaCount: 1

 # Enabling HPA will:
 # - Ignore above replica count, as it will be controlled by HPA
-# - Add example HPA scaling rules with thresholds suitable for Xeon deployments
+# - Add example HPA scaling rules with custom metrics thresholds
 # - Require custom metrics ConfigMap available in the main application chart
 autoscaling:
   maxReplicas: 2
diff --git a/helm-charts/common/teirerank/values.yaml b/helm-charts/common/teirerank/values.yaml
index 254f4d169..745d71849 100644
--- a/helm-charts/common/teirerank/values.yaml
+++ b/helm-charts/common/teirerank/values.yaml
@@ -9,7 +9,7 @@ replicaCount: 1

 # Enabling HPA will:
 # - Ignore above replica count, as it will be controlled by HPA
-# - Add example HPA scaling rules with thresholds suitable for Xeon deployments
+# - Add example HPA scaling rules with custom metrics thresholds
 # - Require custom metrics ConfigMap available in the main application chart
 autoscaling:
   maxReplicas: 3
diff --git a/helm-charts/common/tgi/values.yaml b/helm-charts/common/tgi/values.yaml
index dff938e65..cfc8e7f61 100644
--- a/helm-charts/common/tgi/values.yaml
+++ b/helm-charts/common/tgi/values.yaml
@@ -9,7 +9,7 @@ replicaCount: 1

 # Enabling HPA will:
 # - Ignore above replica count, as it will be controlled by HPA
-# - Add example HPA scaling rules with thresholds suitable for Xeon deployments
+# - Add example HPA scaling rules with custom metrics thresholds
 # - Require custom metrics ConfigMap available in the main application chart
 autoscaling:
   maxReplicas: 4
diff --git a/helm-charts/common/vllm/README.md b/helm-charts/common/vllm/README.md
index 0235a7443..1807cb2ea 100644
--- a/helm-charts/common/vllm/README.md
+++ b/helm-charts/common/vllm/README.md
@@ -51,3 +51,5 @@ curl http://localhost:2080/v1/completions \
 | global.modelUseHostPath | string | `""` | Cached models directory, vllm will not download if the model is cached here. The host path "modelUseHostPath" will be mounted to container as /data directory. Set this to null/empty will force it to download model. |
 | image.repository | string | `"opea/vllm"` | |
 | image.tag | string | `"latest"` | |
+| autoscaling.enabled | bool | `false` | Enable HPA autoscaling for the service deployment based on metrics it provides. See [HPA instructions](../../HPA.md) before enabling! |
+| global.monitoring | bool | `false` | Enable usage metrics for the service. Required for HPA. See [monitoring instructions](../../monitoring.md) before enabling! |
diff --git a/helm-charts/common/vllm/gaudi-values.yaml b/helm-charts/common/vllm/gaudi-values.yaml
index 65e622044..08f4db145 100644
--- a/helm-charts/common/vllm/gaudi-values.yaml
+++ b/helm-charts/common/vllm/gaudi-values.yaml
@@ -5,15 +5,15 @@
 # This is a YAML-formatted file.
 # Declare variables to be passed into your templates.

+accelDevice: "gaudi"
+
 image:
   repository: opea/vllm-gaudi
   tag: "latest"

 # VLLM_CPU_KVCACHE_SPACE: "40"
 OMPI_MCA_btl_vader_single_copy_mechanism: none
-extraCmdArgs: ["--enforce-eager","--tensor-parallel-size","1","--block-size","128","--max-num-seqs","256","--max-seq_len-to-capture","2048"]
-# Workaround for current HPU image with start command /bin/bash
-# extraCmdArgs: ["/bin/bash","-c","python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model Intel/neural-chat-7b-v3-3 --tensor-parallel-size 1 --host 0.0.0.0 --port 2080 --download-dir /data --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048"]
+extraCmdArgs: ["--tensor-parallel-size","1","--block-size","128","--max-num-seqs","256","--max-seq_len-to-capture","2048"]
 resources:
   limits:
     habana.ai/gaudi: 1
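Illustrative mapping of the new README rows to a standalone install (not part of the patch). HPA additionally needs the custom-metrics ConfigMap from the parent application chart, so only monitoring is enabled here; the release name is arbitrary:

```console
$ helm install myvllm common/vllm -f common/vllm/gaudi-values.yaml --set global.monitoring=true
```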
diff --git a/helm-charts/common/vllm/templates/_helpers.tpl b/helm-charts/common/vllm/templates/_helpers.tpl
index 63ec9e61d..3dd629e1f 100644
--- a/helm-charts/common/vllm/templates/_helpers.tpl
+++ b/helm-charts/common/vllm/templates/_helpers.tpl
@@ -30,6 +30,13 @@ Create chart name and version as used by the chart label.
 {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
 {{- end }}

+{{/*
+Convert chart name to a string suitable as metric prefix
+*/}}
+{{- define "vllm.metricPrefix" -}}
+{{- include "vllm.fullname" . | replace "-" "_" | regexFind "[a-zA-Z_:][a-zA-Z0-9_:]*" }}
+{{- end }}
+
 {{/*
 Common labels
 */}}
diff --git a/helm-charts/common/vllm/templates/configmap.yaml b/helm-charts/common/vllm/templates/configmap.yaml
index 14a8ba240..5fbbf6b79 100644
--- a/helm-charts/common/vllm/templates/configmap.yaml
+++ b/helm-charts/common/vllm/templates/configmap.yaml
@@ -25,6 +25,9 @@ data:
   {{- if .Values.VLLM_CPU_KVCACHE_SPACE }}
   VLLM_CPU_KVCACHE_SPACE: {{ .Values.VLLM_CPU_KVCACHE_SPACE | quote}}
   {{- end }}
+  {{- if .Values.PT_HPU_ENABLE_LAZY_COLLECTIVES }}
+  PT_HPU_ENABLE_LAZY_COLLECTIVES: {{ .Values.PT_HPU_ENABLE_LAZY_COLLECTIVES | quote }}
+  {{- end }}
   {{- if .Values.OMPI_MCA_btl_vader_single_copy_mechanism }}
   OMPI_MCA_btl_vader_single_copy_mechanism: {{ .Values.OMPI_MCA_btl_vader_single_copy_mechanism | quote}}
   {{- end }}
diff --git a/helm-charts/common/vllm/templates/deployment.yaml b/helm-charts/common/vllm/templates/deployment.yaml
index afa559cd6..30f736b6b 100644
--- a/helm-charts/common/vllm/templates/deployment.yaml
+++ b/helm-charts/common/vllm/templates/deployment.yaml
@@ -8,7 +8,10 @@ metadata:
   labels:
     {{- include "vllm.labels" . | nindent 4 }}
 spec:
+  {{- if ne (int .Values.replicaCount) 1 }}
+  # remove if replica count should not be reset on pod update (e.g. with HPA)
   replicas: {{ .Values.replicaCount }}
+  {{- end }}
   selector:
     matchLabels:
       {{- include "vllm.selectorLabels" . | nindent 6 }}
@@ -159,3 +162,7 @@ spec:
               matchLabels:
                 {{- include "vllm.selectorLabels" . | nindent 14 }}
       {{- end }}
+      {{- if not .Values.accelDevice }}
+      # extra time to finish processing buffered requests on CPU before pod is forcibly terminated
+      terminationGracePeriodSeconds: 120
+      {{- end }}
diff --git a/helm-charts/common/vllm/templates/horizontal-pod-autoscaler.yaml b/helm-charts/common/vllm/templates/horizontal-pod-autoscaler.yaml
new file mode 100644
index 000000000..c52861631
--- /dev/null
+++ b/helm-charts/common/vllm/templates/horizontal-pod-autoscaler.yaml
@@ -0,0 +1,57 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+{{- if and .Values.global.monitoring .Values.autoscaling.enabled }}
+apiVersion: autoscaling/v2
+kind: HorizontalPodAutoscaler
+metadata:
+  name: {{ include "vllm.fullname" . }}
+spec:
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: {{ include "vllm.fullname" . }}
+  minReplicas: 1
+  maxReplicas: {{ .Values.autoscaling.maxReplicas }}
+  metrics:
+  - type: Object
+    object:
+      describedObject:
+        apiVersion: v1
+        # get metric for named object of given type (in same namespace)
+        kind: Service
+        name: {{ include "vllm.fullname" . }}
+      target:
+        # Metric is sum from all pods. "AverageValue" divides value returned from
+        # the custom metrics API by the number of Pods before comparing to the target:
+        # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/#algorithm-details
+        # https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale-walkthrough/#autoscaling-on-multiple-metrics-and-custom-metrics
+        type: AverageValue
+{{- if .Values.accelDevice }}
+        averageValue: 0.1
+{{- else }}
+        # allow larger latencies with unaccelerated service
+        averageValue: 1.0
+{{- end }}
+      metric:
+        name: {{ include "vllm.metricPrefix" . }}_token_latency
+  behavior:
+    scaleDown:
+      stabilizationWindowSeconds: 180
+      policies:
+      - type: Percent
+        value: 25
+        periodSeconds: 90
+    scaleUp:
+      selectPolicy: Max
+      stabilizationWindowSeconds: 0
+      policies:
+      # Slow linear rampup in case additional CPU pods go to same node
+      # (i.e. interfere with each other)
+      - type: Pods
+        value: 1
+        periodSeconds: 90
+      #- type: Percent
+      #  value: 25
+      #  periodSeconds: 90
+{{- end }}
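Illustrative follow-up (not part of the patch): after deploying with `autoscaling.enabled=true` and `global.monitoring=true`, the resulting HPA and its token-latency target can be observed; the object name assumes the default `chatqna` release:

```console
$ kubectl get hpa chatqna-vllm --watch
$ kubectl describe hpa chatqna-vllm | grep -A4 'Metrics:'
```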
diff --git a/helm-charts/common/vllm/templates/servicemonitor.yaml b/helm-charts/common/vllm/templates/servicemonitor.yaml
new file mode 100644
index 000000000..d1a8a2c76
--- /dev/null
+++ b/helm-charts/common/vllm/templates/servicemonitor.yaml
@@ -0,0 +1,23 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+#
+# Dashboard for the exposed vLLM metrics:
+# - https://github.com/vllm-project/vllm/tree/main/examples/production_monitoring/
+# Metric descriptions:
+# - https://docs.vllm.ai/en/stable/serving/metrics.html
+
+{{- if .Values.global.monitoring }}
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: {{ include "vllm.fullname" . }}
+  labels:
+    release: {{ .Values.global.prometheusRelease }}
+spec:
+  selector:
+    matchLabels:
+      {{- include "vllm.selectorLabels" . | nindent 6 }}
+  endpoints:
+  - port: vllm
+    interval: 5s
+{{- end }}
diff --git a/helm-charts/common/vllm/values.yaml b/helm-charts/common/vllm/values.yaml
index c8958e3e0..6e9e6c9c2 100644
--- a/helm-charts/common/vllm/values.yaml
+++ b/helm-charts/common/vllm/values.yaml
@@ -7,6 +7,17 @@

 replicaCount: 1

+# Enabling HPA will:
+# - Ignore above replica count, as it will be controlled by HPA
+# - Add example HPA scaling rules with custom metrics thresholds
+# - Require custom metrics ConfigMap available in the main application chart
+autoscaling:
+  maxReplicas: 4
+  enabled: false
+
+# empty for CPU (longer latencies are tolerated before HPA scaling unaccelerated service)
+accelDevice: ""
+
 port: 2080
 shmSize: 1Gi
 image:
@@ -62,7 +73,7 @@ resources: {}
 #   cpu: 100m
 #   memory: 128Mi

-extraCmdArgs: ["--enforce-eager", "--dtype", "auto"]
+extraCmdArgs: []

 livenessProbe:
   httpGet:
@@ -92,6 +103,11 @@ tolerations: []
 affinity: {}

 LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
+
+# Environment variables for vLLM (set in configmap):
+# https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html#environment-variables
+OMPI_MCA_btl_vader_single_copy_mechanism: ""
+PT_HPU_ENABLE_LAZY_COLLECTIVES: ""
 VLLM_CPU_KVCACHE_SPACE: ""

 global:
@@ -113,3 +129,9 @@ global:
   # By default, both var are set to empty, the model will be downloaded and saved to a tmp volume.
   modelUseHostPath: ""
   modelUsePVC: ""
+
+  # Install Prometheus serviceMonitor for service
+  monitoring: false
+
+  # Prometheus Helm install release name for serviceMonitor
+  prometheusRelease: prometheus-stack
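Illustrative scrape-target check (not part of the patch) before moving on to the Prometheus-side queries below; names assume the default `chatqna` release and a vLLM Service port 80 in front of container port 2080:

```console
$ kubectl get servicemonitors chatqna-vllm
$ kubectl port-forward svc/chatqna-vllm 8080:80 &
$ curl -s http://localhost:8080/metrics | grep '^vllm:' | head
```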
diff --git a/helm-charts/monitoring.md b/helm-charts/monitoring.md
index 09c1ec37e..011711d0c 100644
--- a/helm-charts/monitoring.md
+++ b/helm-charts/monitoring.md
@@ -75,7 +75,16 @@ $ prom_url=http://$(kubectl -n $prom_ns get -o jsonpath="{.spec.clusterIP}:{.spe
 $ curl --no-progress-meter $prom_url/metrics | grep scrape_pool_targets.*$chart
 ```

-Check that Prometheus metrics from TGI inference component are available:
+Then check that Prometheus metrics from a relevant LLM inferencing service are available.
+
+For vLLM:
+
+```console
+$ curl --no-progress-meter $prom_url/api/v1/query? \
+  --data-urlencode 'query=vllm:cache_config_info{service="'$chart'-vllm"}' | jq
+```
+
+Or TGI:

 ```console
 $ curl --no-progress-meter $prom_url/api/v1/query? \
@@ -83,4 +92,4 @@ $ curl --no-progress-meter $prom_url/api/v1/query? \
 ```

 **NOTE**: services provide metrics only after they've processed their first request.
-And reranking service will be used only after context data has been uploaded!
+And ChatQnA uses the (TEI) reranking service only after query context data has been uploaded!
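As a final illustrative check (not part of the patch), the same `rate()` expression that the PrometheusAdapter rule turns into the HPA's token-latency metric can be queried directly, reusing the `$prom_url` and `$chart` variables from monitoring.md:

```console
$ curl --no-progress-meter $prom_url/api/v1/query? --data-urlencode \
  'query=rate(vllm:time_per_output_token_seconds_sum{service="'$chart'-vllm"}[1m]) / (0.001+rate(vllm:time_per_output_token_seconds_count{service="'$chart'-vllm"}[1m]))' | jq
```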