Commit 3a3d1c5

build: Update for 25.01, TRTLLM v0.17.0.post1, and fix HF_HOME setting (#104)

Co-authored-by: Ryan McCormick <[email protected]>
Authored by richardhuo-nv and rmccorm4 on Feb 14, 2025 · 1 parent 758dec4

Showing 18 changed files with 1,494 additions and 236 deletions.
62 changes: 48 additions & 14 deletions README.md
@@ -1,3 +1,30 @@
<!--
# Copyright 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -->

# Triton Command Line Interface (Triton CLI)
> [!NOTE]
> Triton CLI is currently in BETA. Its features and functionality are likely
@@ -22,8 +49,8 @@ and running the CLI from within the latest corresponding `tritonserver`
container image, which should have all necessary system dependencies installed.

For vLLM and TRT-LLM, you can use their respective images:
- `nvcr.io/nvidia/tritonserver:24.10-vllm-python-py3`
- `nvcr.io/nvidia/tritonserver:24.10-trtllm-python-py3`
- `nvcr.io/nvidia/tritonserver:25.01-vllm-python-py3`
- `nvcr.io/nvidia/tritonserver:25.01-trtllm-python-py3`
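
For example, to fetch these ahead of time (a minimal sketch; the tags are exactly those listed above):

```bash
# Pull the 25.01 vLLM and TRT-LLM images referenced above.
docker pull nvcr.io/nvidia/tritonserver:25.01-vllm-python-py3
docker pull nvcr.io/nvidia/tritonserver:25.01-trtllm-python-py3
```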

If you decide to run the CLI on the host or in a custom image, please
see this list of [additional dependencies](#additional-dependencies-for-custom-environments)
@@ -38,6 +65,7 @@ matrix below:

| Triton CLI Version | TRT-LLM Version | Triton Container Tag |
|:------------------:|:---------------:|:--------------------:|
| 0.1.2 | v0.17.0.post1 | 25.01 |
| 0.1.1 | v0.14.0 | 24.10 |
| 0.1.0 | v0.13.0 | 24.09 |
| 0.0.11 | v0.12.0 | 24.08 |
@@ -60,7 +88,7 @@ It is also possible to install from a specific branch name, a commit hash
or a tag name. For example, to install `triton_cli` at a specific tag:

```bash
GIT_REF="0.1.1"
GIT_REF="0.1.2"
pip install git+https://github.com/triton-inference-server/triton_cli.git@${GIT_REF}
```

@@ -95,7 +123,7 @@ triton -h
triton import -m gpt2

# Start server pointing at the default model repository
triton start --image nvcr.io/nvidia/tritonserver:24.10-vllm-python-py3
triton start --image nvcr.io/nvidia/tritonserver:25.01-vllm-python-py3

# Infer with CLI
triton infer -m gpt2 --prompt "machine learning is"
@@ -120,6 +148,12 @@ minutes.
> Also, use of certain restricted models like the Llama models requires authentication
> with Hugging Face, either via `huggingface-cli login` or by setting the `HF_TOKEN`
> environment variable.
>
> If your Hugging Face cache is not located at `${HOME}/.cache/huggingface`,
> point the CLI at it by setting `HF_HOME`, e.g.:
>
> `export HF_HOME=path/to/your/huggingface/cache`
>
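
Put together, a minimal environment setup might look like this (the token value and cache path are placeholders):

```bash
# Placeholders: substitute your own token and cache location.
export HF_TOKEN=<your-huggingface-token>
export HF_HOME=/mnt/data/huggingface
```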
### Model Sources

@@ -175,26 +209,26 @@ docker run -ti \
--shm-size=1g --ulimit memlock=-1 \
-v ${HOME}/models:/root/models \
-v ${HOME}/.cache/huggingface:/root/.cache/huggingface \
nvcr.io/nvidia/tritonserver:24.10-vllm-python-py3
nvcr.io/nvidia/tritonserver:25.01-vllm-python-py3

# Install the Triton CLI
pip install git+https://github.com/triton-inference-server/[email protected].1
pip install git+https://github.com/triton-inference-server/[email protected].2

# Authenticate with huggingface for restricted models like Llama-2 and Llama-3
huggingface-cli login

# Generate a Triton model repository containing a vLLM model config
triton remove -m all
triton import -m llama-3-8b-instruct --backend vllm
triton import -m llama-3.1-8b-instruct --backend vllm

# Start Triton pointing at the default model repository
triton start

# Interact with model
triton infer -m llama-3-8b-instruct --prompt "machine learning is"
triton infer -m llama-3.1-8b-instruct --prompt "machine learning is"

# Profile model with GenAI-Perf
triton profile -m llama-3-8b-instruct --backend vllm
triton profile -m llama-3.1-8b-instruct --backend vllm
```
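
Once the server is up, you can also query it directly over HTTP; a sketch using Triton's generate extension (the default port 8000 and the vLLM backend's `text_input` field are assumptions based on the defaults):

```bash
# Send a prompt to the generate endpoint (default HTTP port 8000).
curl -s -X POST localhost:8000/v2/models/llama-3.1-8b-instruct/generate \
  -d '{"text_input": "machine learning is", "parameters": {"stream": false}}'
```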

### Serving a TRT-LLM Model
@@ -240,26 +274,26 @@ docker run -ti \
-v /tmp:/tmp \
-v ${HOME}/models:/root/models \
-v ${HOME}/.cache/huggingface:/root/.cache/huggingface \
nvcr.io/nvidia/tritonserver:24.10-trtllm-python-py3
nvcr.io/nvidia/tritonserver:25.01-trtllm-python-py3

# Install the Triton CLI
pip install git+https://github.com/triton-inference-server/[email protected].0
pip install git+https://github.com/triton-inference-server/[email protected].2

# Authenticate with huggingface for restricted models like Llama-2 and Llama-3
huggingface-cli login

# Build TRT LLM engine and generate a Triton model repository pointing at it
triton remove -m all
triton import -m llama-3-8b-instruct --backend tensorrtllm
triton import -m llama-3.1-8b-instruct --backend tensorrtllm

# Start Triton pointing at the default model repository
triton start

# Interact with model
triton infer -m llama-3-8b-instruct --prompt "machine learning is"
triton infer -m llama-3.1-8b-instruct --prompt "machine learning is"

# Profile model with GenAI-Perf
triton profile -m llama-3-8b-instruct --backend tensorrtllm
triton profile -m llama-3.1-8b-instruct --backend tensorrtllm
```
## Additional Dependencies for Custom Environments

4 changes: 2 additions & 2 deletions pyproject.toml
@@ -1,4 +1,4 @@
# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -51,7 +51,7 @@ dependencies = [
"grpcio>=1.67.0",
# Use explicit client version matching genai-perf version for tagged release
"tritonclient[all] == 2.51",
"genai-perf @ git+https://github.com/triton-inference-server/perf_analyzer.git@r24.10#subdirectory=genai-perf",
"genai-perf @ git+https://github.com/triton-inference-server/perf_analyzer.git@r25.01#subdirectory=genai-perf",
# Misc deps
"directory-tree == 0.0.4", # may remove in future
# https://github.com/docker/docker-py/issues/3256#issuecomment-2376439000
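These pins take effect on a source install, e.g. (a sketch from a fresh checkout):

```bash
git clone https://github.com/triton-inference-server/triton_cli.git
cd triton_cli
pip install .
```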
4 changes: 2 additions & 2 deletions src/triton_cli/__init__.py
@@ -1,4 +1,4 @@
# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -24,4 +24,4 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

__version__ = "0.1.1"
__version__ = "0.1.2"
4 changes: 2 additions & 2 deletions src/triton_cli/common.py
@@ -1,5 +1,5 @@
#!/usr/bin/env python3
# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -47,5 +47,5 @@ class TritonCLIException(Exception):
# Model Repository
DEFAULT_MODEL_REPO: Path = Path.home() / "models"
DEFAULT_HF_CACHE: Path = Path.home() / ".cache" / "huggingface"
HF_CACHE: Path = Path(os.environ.get("TRANSFORMERS_CACHE", DEFAULT_HF_CACHE))
HF_CACHE: Path = Path(os.environ.get("HF_HOME", DEFAULT_HF_CACHE))
SUPPORTED_BACKENDS: set = {"vllm", "tensorrtllm"}
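
For illustration, a minimal sketch of how this lookup resolves (the non-default cache path is hypothetical):

```python
import os
from pathlib import Path

DEFAULT_HF_CACHE: Path = Path.home() / ".cache" / "huggingface"

# Without HF_HOME set, the default cache location is used.
os.environ.pop("HF_HOME", None)
print(Path(os.environ.get("HF_HOME", DEFAULT_HF_CACHE)))  # ~/.cache/huggingface

# With HF_HOME set, it takes precedence (hypothetical path).
os.environ["HF_HOME"] = "/mnt/data/huggingface"
print(Path(os.environ.get("HF_HOME", DEFAULT_HF_CACHE)))  # /mnt/data/huggingface
```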
32 changes: 29 additions & 3 deletions src/triton_cli/docker/Dockerfile
@@ -1,11 +1,37 @@
# Copyright 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# TRT-LLM image contains engine building and runtime dependencies
FROM nvcr.io/nvidia/tritonserver:24.10-trtllm-python-py3
FROM nvcr.io/nvidia/tritonserver:25.01-trtllm-python-py3

# Setup vLLM Triton backend
RUN mkdir -p /opt/tritonserver/backends/vllm && \
git clone -b r24.10 https://github.com/triton-inference-server/vllm_backend.git /tmp/vllm_backend && \
git clone -b r25.01 https://github.com/triton-inference-server/vllm_backend.git /tmp/vllm_backend && \
cp -r /tmp/vllm_backend/src/* /opt/tritonserver/backends/vllm && \
rm -r /tmp/vllm_backend

# vLLM runtime dependencies
RUN pip install "vllm==0.5.3.post1" "setuptools==74.0.0"
RUN pip install "vllm==0.6.3.post1" "setuptools==74.0.0"
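
To build an image from this Dockerfile, something like the following should work (the tag name is just an example):

```bash
# Run from the repository root; "triton-cli:25.01" is an arbitrary tag.
docker build -t triton-cli:25.01 -f src/triton_cli/docker/Dockerfile .
```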
6 changes: 5 additions & 1 deletion src/triton_cli/repository.py
@@ -1,4 +1,4 @@
# Copyright 2023-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2023-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -372,6 +372,10 @@ def __build_trtllm_engine(self, huggingface_id: str, engines_path: Path):
# TODO: Investigate if LLM is internally saving a copy to a temp dir
engine.save(str(engines_path))

# TRT-LLM v0.17.0+ requires explicitly calling shutdown() to stop the
# blocking MPI thread, otherwise the engine process won't exit.
engine.shutdown()

def __create_model_repository(
self, name: str, version: int = 1, backend: str = None
):
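A toy sketch of the build-save-shutdown pattern this change enforces, assuming the `tensorrt_llm` high-level `LLM` API (the model id and engine path are placeholders):

```python
from tensorrt_llm import LLM

# Build an engine from a Hugging Face checkpoint (placeholder id).
engine = LLM(model="meta-llama/Llama-3.1-8B-Instruct")
engine.save("/root/engines/llama-3.1-8b-instruct")

# v0.17.0+: explicitly stop the blocking MPI thread so the process exits.
engine.shutdown()
```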
8 changes: 1 addition & 7 deletions src/triton_cli/templates/trt_llm/postprocessing/1/model.py
@@ -132,13 +132,7 @@ def execute(self, requests):
for batch_idx, beam_tokens in enumerate(token_batch):
for beam_idx, tokens in enumerate(beam_tokens):
seq_len = sequence_lengths[idx][batch_idx][beam_idx]
# Exclude fake ids in multimodal models
fake_id_len = 0
for i in range(seq_len):
if tokens[i] < self.tokenizer.vocab_size:
fake_id_len = i
break
list_of_tokens.append(tokens[fake_id_len:seq_len])
list_of_tokens.append(tokens[:seq_len])
req_idx_offset += 1

req_idx_offsets.append(req_idx_offset)
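For context, a toy illustration of the simplified trimming (all values are made up):

```python
# Each beam's padded token buffer is cut at its true sequence length;
# the multimodal fake-id scan removed above is no longer applied here.
tokens = [101, 2023, 2003, 0, 0]  # hypothetical padded beam output
seq_len = 3
list_of_tokens = []
list_of_tokens.append(tokens[:seq_len])
print(list_of_tokens)  # [[101, 2023, 2003]]
```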
@@ -67,4 +67,4 @@ instance_group [
count: ${postprocessing_instance_count}
kind: KIND_CPU
}
]
]