From fffb00025bbe7fea3085b03a32b013fb3a5a381d Mon Sep 17 00:00:00 2001 From: Kenzo Lobos-Tsunekawa Date: Mon, 27 Jan 2025 14:07:12 +0900 Subject: [PATCH] feat: moved from personal repository https://github.com/knzo25/bevfusion_ros2 Signed-off-by: Kenzo Lobos-Tsunekawa --- .../autoware_lidar_bevfusion/CMakeLists.txt | 213 +++++++ .../config/bevfusion.param.yaml | 23 + .../config/bevfusion_ml_package.param.yaml | 25 + .../detection_class_remapper.param.yaml | 38 ++ .../include/autoware/bev_ops/bev_pool_cuda.h | 12 + .../lidar_bevfusion/bevfusion_config.hpp | 195 +++++++ .../lidar_bevfusion/bevfusion_trt.hpp | 150 +++++ .../detection_class_remapper.hpp | 48 ++ .../lidar_bevfusion/lidar_bevfusion_node.hpp | 97 ++++ .../postprocess/circle_nms_kernel.hpp | 34 ++ .../postprocess/non_maximum_suppression.hpp | 82 +++ .../postprocess/postprocess_kernel.hpp | 47 ++ .../lidar_bevfusion/preprocess/point_type.hpp | 35 ++ .../preprocess/pointcloud_densification.hpp | 104 ++++ .../preprocess/precomputed_features.hpp | 89 +++ .../preprocess/preprocess_kernel.hpp | 76 +++ .../preprocess/voxel_generator.hpp | 64 +++ .../autoware/lidar_bevfusion/ros_utils.hpp | 67 +++ .../autoware/lidar_bevfusion/utils.hpp | 57 ++ .../lidar_bevfusion/visibility_control.hpp | 37 ++ .../get_indice_pairs_implicit_gemm_plugin.hpp | 140 +++++ ...ice_pairs_implicit_gemm_plugin_creator.hpp | 65 +++ .../tensorrt_plugins/implicit_gemm_plugin.hpp | 138 +++++ .../implicit_gemm_plugin_creator.hpp | 59 ++ .../tensorrt_plugins/plugin_registration.hpp | 29 + .../tensorrt_plugins/plugin_utils.hpp | 35 ++ .../quick_cumsum_cuda_plugin.hpp | 123 ++++ .../quick_cumsum_cuda_plugin_creator.hpp | 62 ++ .../launch/lidar_bevfusion.launch.xml | 124 ++++ .../lib/bevfusion_trt.cpp | 532 ++++++++++++++++++ .../lib/detection_class_remapper.cpp | 74 +++ .../lib/postprocess/circle_nms_kernel.cu | 147 +++++ .../postprocess/non_maximum_suppression.cpp | 114 ++++ .../lib/postprocess/postprocess_kernel.cu | 141 +++++ .../preprocess/pointcloud_densification.cpp | 111 ++++ .../lib/preprocess/precomputed_features.cpp | 376 +++++++++++++ .../lib/preprocess/preprocess_kernel.cu | 228 ++++++++ .../lib/preprocess/voxel_generator.cpp | 89 +++ .../lib/ros_utils.cpp | 126 +++++ .../autoware_lidar_bevfusion/package.xml | 36 ++ .../schema/bevfusion.schema.dummy.json | 121 ++++ .../schema/bevfusion_ml_package.schema.json | 71 +++ .../detection_class_remapper.schema.json | 72 +++ .../src/bev_ops/bev_pool_cuda.cu | 55 ++ .../src/lidar_bevfusion_node.cpp | 342 +++++++++++ .../get_indice_pairs_implicit_gemm_plugin.cpp | 421 ++++++++++++++ ...ice_pairs_implicit_gemm_plugin_creator.cpp | 324 +++++++++++ .../tensorrt_plugins/implicit_gemm_plugin.cpp | 314 +++++++++++ .../implicit_gemm_plugin_creator.cpp | 174 ++++++ .../tensorrt_plugins/plugin_registration.cpp | 73 +++ .../src/tensorrt_plugins/plugin_utils.cpp | 57 ++ .../quick_cumsum_cuda_plugin.cpp | 245 ++++++++ .../quick_cumsum_cuda_plugin_creator.cpp | 143 +++++ .../autoware/cuda_utils/cuda_utils.hpp | 36 ++ 54 files changed, 6690 insertions(+) create mode 100644 perception/autoware_lidar_bevfusion/CMakeLists.txt create mode 100644 perception/autoware_lidar_bevfusion/config/bevfusion.param.yaml create mode 100644 perception/autoware_lidar_bevfusion/config/bevfusion_ml_package.param.yaml create mode 100644 perception/autoware_lidar_bevfusion/config/detection_class_remapper.param.yaml create mode 100644 perception/autoware_lidar_bevfusion/include/autoware/bev_ops/bev_pool_cuda.h create mode 100644 
perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/bevfusion_config.hpp create mode 100644 perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/bevfusion_trt.hpp create mode 100644 perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/detection_class_remapper.hpp create mode 100644 perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/lidar_bevfusion_node.hpp create mode 100644 perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/postprocess/circle_nms_kernel.hpp create mode 100644 perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/postprocess/non_maximum_suppression.hpp create mode 100644 perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/postprocess/postprocess_kernel.hpp create mode 100644 perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/preprocess/point_type.hpp create mode 100644 perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/preprocess/pointcloud_densification.hpp create mode 100644 perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/preprocess/precomputed_features.hpp create mode 100644 perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/preprocess/preprocess_kernel.hpp create mode 100644 perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/preprocess/voxel_generator.hpp create mode 100644 perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/ros_utils.hpp create mode 100644 perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/utils.hpp create mode 100644 perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/visibility_control.hpp create mode 100644 perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/get_indice_pairs_implicit_gemm_plugin.hpp create mode 100644 perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/get_indice_pairs_implicit_gemm_plugin_creator.hpp create mode 100644 perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/implicit_gemm_plugin.hpp create mode 100644 perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/implicit_gemm_plugin_creator.hpp create mode 100644 perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/plugin_registration.hpp create mode 100644 perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/plugin_utils.hpp create mode 100644 perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/quick_cumsum_cuda_plugin.hpp create mode 100644 perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/quick_cumsum_cuda_plugin_creator.hpp create mode 100644 perception/autoware_lidar_bevfusion/launch/lidar_bevfusion.launch.xml create mode 100644 perception/autoware_lidar_bevfusion/lib/bevfusion_trt.cpp create mode 100644 perception/autoware_lidar_bevfusion/lib/detection_class_remapper.cpp create mode 100644 perception/autoware_lidar_bevfusion/lib/postprocess/circle_nms_kernel.cu create mode 100644 perception/autoware_lidar_bevfusion/lib/postprocess/non_maximum_suppression.cpp create mode 100644 perception/autoware_lidar_bevfusion/lib/postprocess/postprocess_kernel.cu create mode 100644 perception/autoware_lidar_bevfusion/lib/preprocess/pointcloud_densification.cpp create mode 100644 perception/autoware_lidar_bevfusion/lib/preprocess/precomputed_features.cpp create mode 100644 perception/autoware_lidar_bevfusion/lib/preprocess/preprocess_kernel.cu create mode 100644 
perception/autoware_lidar_bevfusion/lib/preprocess/voxel_generator.cpp create mode 100644 perception/autoware_lidar_bevfusion/lib/ros_utils.cpp create mode 100644 perception/autoware_lidar_bevfusion/package.xml create mode 100644 perception/autoware_lidar_bevfusion/schema/bevfusion.schema.dummy.json create mode 100644 perception/autoware_lidar_bevfusion/schema/bevfusion_ml_package.schema.json create mode 100644 perception/autoware_lidar_bevfusion/schema/detection_class_remapper.schema.json create mode 100644 perception/autoware_lidar_bevfusion/src/bev_ops/bev_pool_cuda.cu create mode 100644 perception/autoware_lidar_bevfusion/src/lidar_bevfusion_node.cpp create mode 100644 perception/autoware_lidar_bevfusion/src/tensorrt_plugins/get_indice_pairs_implicit_gemm_plugin.cpp create mode 100644 perception/autoware_lidar_bevfusion/src/tensorrt_plugins/get_indice_pairs_implicit_gemm_plugin_creator.cpp create mode 100644 perception/autoware_lidar_bevfusion/src/tensorrt_plugins/implicit_gemm_plugin.cpp create mode 100644 perception/autoware_lidar_bevfusion/src/tensorrt_plugins/implicit_gemm_plugin_creator.cpp create mode 100644 perception/autoware_lidar_bevfusion/src/tensorrt_plugins/plugin_registration.cpp create mode 100644 perception/autoware_lidar_bevfusion/src/tensorrt_plugins/plugin_utils.cpp create mode 100644 perception/autoware_lidar_bevfusion/src/tensorrt_plugins/quick_cumsum_cuda_plugin.cpp create mode 100644 perception/autoware_lidar_bevfusion/src/tensorrt_plugins/quick_cumsum_cuda_plugin_creator.cpp create mode 100644 sensing/autoware_cuda_utils/include/autoware/cuda_utils/cuda_utils.hpp diff --git a/perception/autoware_lidar_bevfusion/CMakeLists.txt b/perception/autoware_lidar_bevfusion/CMakeLists.txt new file mode 100644 index 0000000000000..64fd656713537 --- /dev/null +++ b/perception/autoware_lidar_bevfusion/CMakeLists.txt @@ -0,0 +1,213 @@ +cmake_minimum_required(VERSION 3.14) +project(autoware_lidar_bevfusion) + +find_package(autoware_cmake REQUIRED) +autoware_package() + +add_compile_options(-Wno-deprecated-declarations) + +option(CUDA_VERBOSE "Verbose output of CUDA modules" OFF) + +# set flags for CUDA availability +option(CUDA_AVAIL "CUDA available" OFF) +find_package(CUDA) +if(CUDA_FOUND) + find_library(CUBLAS_LIBRARIES cublas HINTS + ${CUDA_TOOLKIT_ROOT_DIR}/lib64 + ${CUDA_TOOLKIT_ROOT_DIR}/lib + ) + if(CUDA_VERBOSE) + message("CUDA is available!") + message("CUDA Libs: ${CUDA_LIBRARIES}") + message("CUDA Headers: ${CUDA_INCLUDE_DIRS}") + endif() + # Note: cublas_device was deprecated in CUDA version 9.2 + # https://forums.developer.nvidia.com/t/where-can-i-find-libcublas-device-so-or-libcublas-device-a/67251/4 + # In LibTorch, CUDA_cublas_device_LIBRARY is used.
+ unset(CUDA_cublas_device_LIBRARY CACHE) + set(CUDA_AVAIL ON) +else() + message("CUDA NOT FOUND") + set(CUDA_AVAIL OFF) +endif() + +# set flags for TensorRT availability +option(TRT_AVAIL "TensorRT available" OFF) +# try to find the tensorRT modules +find_library(NVINFER nvinfer) +find_library(NVONNXPARSER nvonnxparser) +if(NVINFER AND NVONNXPARSER) + if(CUDA_VERBOSE) + message("TensorRT is available!") + message("NVINFER: ${NVINFER}") + message("NVONNXPARSER: ${NVONNXPARSER}") + endif() + set(TRT_AVAIL ON) +else() + message("TensorRT is NOT Available") + set(TRT_AVAIL OFF) +endif() + +# set flags for CUDNN availability +option(CUDNN_AVAIL "CUDNN available" OFF) +# try to find the CUDNN module +find_library(CUDNN_LIBRARY +NAMES libcudnn.so${__cudnn_ver_suffix} libcudnn${__cudnn_ver_suffix}.dylib ${__cudnn_lib_win_name} +PATHS $ENV{LD_LIBRARY_PATH} ${__libpath_cudart} ${CUDNN_ROOT_DIR} ${PC_CUDNN_LIBRARY_DIRS} ${CMAKE_INSTALL_PREFIX} +PATH_SUFFIXES lib lib64 bin +DOC "CUDNN library." +) +if(CUDNN_LIBRARY) + if(CUDA_VERBOSE) + message(STATUS "CUDNN is available!") + message(STATUS "CUDNN_LIBRARY: ${CUDNN_LIBRARY}") + endif() + set(CUDNN_AVAIL ON) +else() + message("CUDNN is NOT Available") + set(CUDNN_AVAIL OFF) +endif() + +# set flags for spconv availability +option(SPCONV_AVAIL "spconv available" OFF) +# try to find spconv +find_package(cumm) +find_package(spconv) +if(${cumm_FOUND} AND ${spconv_FOUND}) + message("spconv is available!") + set(SPCONV_AVAIL ON) +else() + message("spconv is NOT Available") + set(SPCONV_AVAIL OFF) +endif() + +if(TRT_AVAIL AND CUDA_AVAIL AND CUDNN_AVAIL AND SPCONV_AVAIL) + find_package(ament_cmake_auto REQUIRED) + ament_auto_find_build_dependencies() + + include_directories( + include + ${CUDA_INCLUDE_DIRS} + ) + + if(CMAKE_BUILD_TYPE STREQUAL "Debug") + set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-g -G") + endif() + + # Find Eigen3 + find_package(Eigen3 3.3 REQUIRED NO_MODULE) + + add_definitions("-DTV_CUDA") + + list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr -diag-suppress 1675 --extended-lambda") + + cuda_add_library(bev_ops SHARED + src/bev_ops/bev_pool_cuda.cu + ) + + add_library(autoware_tensorrt_plugins SHARED + src/tensorrt_plugins/plugin_utils.cpp + src/tensorrt_plugins/quick_cumsum_cuda_plugin.cpp + src/tensorrt_plugins/quick_cumsum_cuda_plugin_creator.cpp + src/tensorrt_plugins/get_indice_pairs_implicit_gemm_plugin_creator.cpp + src/tensorrt_plugins/get_indice_pairs_implicit_gemm_plugin.cpp + src/tensorrt_plugins/implicit_gemm_plugin_creator.cpp + src/tensorrt_plugins/implicit_gemm_plugin.cpp + + src/tensorrt_plugins/plugin_registration.cpp + ) + + target_compile_definitions(autoware_tensorrt_plugins PRIVATE _GLIBCXX_USE_CXX11_ABI=1) + + target_link_libraries(autoware_tensorrt_plugins PRIVATE + ${NVINFER} + CUDA::cudart + bev_ops + spconv::spconv + ) + + cuda_add_library(${PROJECT_NAME}_cuda_lib SHARED + lib/postprocess/circle_nms_kernel.cu + lib/postprocess/postprocess_kernel.cu + lib/preprocess/preprocess_kernel.cu + ) + + target_link_libraries(${PROJECT_NAME}_cuda_lib + spconv::spconv + ) + + target_include_directories(${PROJECT_NAME}_cuda_lib + SYSTEM PUBLIC + ${autoware_cuda_utils_INCLUDE_DIRS} + ) + + ament_auto_add_library(${PROJECT_NAME}_lib SHARED + lib/detection_class_remapper.cpp + lib/postprocess/non_maximum_suppression.cpp + lib/preprocess/voxel_generator.cpp + lib/preprocess/pointcloud_densification.cpp + lib/preprocess/precomputed_features.cpp + lib/ros_utils.cpp + lib/bevfusion_trt.cpp + ) + + 
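+  # NOTE: autoware_tensorrt_plugins is deliberately not linked into the node
+  # libraries below. It is installed under share/${PROJECT_NAME}/plugins and
+  # loaded at runtime through the plugins_path parameter in
+  # config/bevfusion.param.yaml, so the spconv-dependent TensorRT plugins only
+  # need to be visible when the engine is built and deserialized.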
target_compile_definitions(${PROJECT_NAME}_lib PRIVATE + TENSORRT_VERSION_MAJOR=${TENSORRT_VERSION_MAJOR} + ) + + target_link_libraries(${PROJECT_NAME}_lib + ${NVINFER} + ${NVONNXPARSER} + ${CUDA_LIBRARIES} + ${CUBLAS_LIBRARIES} + ${CUDA_curand_LIBRARY} + ${CUDNN_LIBRARY} + ${PROJECT_NAME}_cuda_lib + ) + + # To suppress unknown-pragmas error. The root cause is the CUB library in CUDA 11.6. + # This issue was fixed by https://github.com/NVIDIA/cub/commit/7d608bf1dc14553e2fb219eabeed80b76621b6fe + target_include_directories(${PROJECT_NAME}_lib + SYSTEM PUBLIC + ${CUDA_INCLUDE_DIRS} + ${autoware_point_types_INCLUDE_DIRS} + ) + + ament_auto_add_library(${PROJECT_NAME}_component SHARED + src/lidar_bevfusion_node.cpp + ) + + target_link_libraries(${PROJECT_NAME}_component + ${PROJECT_NAME}_lib + ) + + rclcpp_components_register_node(${PROJECT_NAME}_component + PLUGIN "autoware::lidar_bevfusion::LidarBEVFusionNode" + EXECUTABLE ${PROJECT_NAME}_node + ) + + install( + TARGETS ${PROJECT_NAME}_cuda_lib + DESTINATION lib + ) + + install( + TARGETS autoware_tensorrt_plugins + DESTINATION share/${PROJECT_NAME}/plugins + ) + + ament_auto_package( + INSTALL_TO_SHARE + launch + config + ) + +else() + find_package(ament_cmake_auto REQUIRED) + ament_auto_find_build_dependencies() + + ament_auto_package( + INSTALL_TO_SHARE + launch + ) +endif() diff --git a/perception/autoware_lidar_bevfusion/config/bevfusion.param.yaml b/perception/autoware_lidar_bevfusion/config/bevfusion.param.yaml new file mode 100644 index 0000000000000..3958123c76086 --- /dev/null +++ b/perception/autoware_lidar_bevfusion/config/bevfusion.param.yaml @@ -0,0 +1,23 @@ +/**: + ros__parameters: + # modality + sensor_fusion: true + # non-network params + max_camera_lidar_delay: 0.12 + # plugins + plugins_path: $(find-pkg-share autoware_lidar_bevfusion)/plugins/libautoware_tensorrt_plugins.so + # network + trt_precision: fp32 + cloud_capacity: 2000000 + onnx_path: "$(var model_path)/bevfusion_camera_lidar_v2.onnx" + engine_path: "$(var model_path)/bevfusion_camera_lidar_v2.engine" + # pre-process params + densification_num_past_frames: 0 + densification_world_frame_id: map + # post-process params + circle_nms_dist_threshold: 0.5 + iou_nms_target_class_names: ["CAR"] + iou_nms_search_distance_2d: 10.0 + iou_nms_threshold: 0.1 + yaw_norm_thresholds: [0.3, 0.3, 0.3, 0.3, 0.0] # refers to the class_names + score_threshold: 0.1 diff --git a/perception/autoware_lidar_bevfusion/config/bevfusion_ml_package.param.yaml b/perception/autoware_lidar_bevfusion/config/bevfusion_ml_package.param.yaml new file mode 100644 index 0000000000000..48e30a5d2b879 --- /dev/null +++ b/perception/autoware_lidar_bevfusion/config/bevfusion_ml_package.param.yaml @@ -0,0 +1,25 @@ +/**: + ros__parameters: + class_names: ["CAR", "TRUCK", "BUS", "BICYCLE", "PEDESTRIAN"] + voxels_num: [1, 128000, 256000] # [min, opt, max] + point_cloud_range: [-122.4, -122.4, -3.0, 122.4, 122.4, 5.0] # [x_min, y_min, z_min, x_max, y_max, z_max] + voxel_size: [0.17, 0.17, 0.2] # [x, y, z] + num_proposals: 500 + out_size_factor: 8 + max_points_per_voxel: 10 + + dbound: [1.0, 166.2, 1.4] + xbound: [-122.4, 122.4, 0.68] + ybound: [-122.4, 122.4, 0.68] + zbound: [-10.0, 10.0, 20.0] + num_cameras: 6 + raw_image_height: 1080 + raw_image_width: 1440 + img_aug_scale_x: 0.489 + img_aug_scale_y: 0.489 + img_aug_offset_y: -262.0 + roi_height: 384 + roi_width: 704 + features_height: 48 + features_width: 88 + num_depth_features: 118 diff --git
a/perception/autoware_lidar_bevfusion/config/detection_class_remapper.param.yaml b/perception/autoware_lidar_bevfusion/config/detection_class_remapper.param.yaml new file mode 100644 index 0000000000000..40275d750f0d9 --- /dev/null +++ b/perception/autoware_lidar_bevfusion/config/detection_class_remapper.param.yaml @@ -0,0 +1,38 @@ +/**: + ros__parameters: + allow_remapping_by_area_matrix: + # NOTE(knzo25): We turn all vehicles into trailers if they go over 3x12 [m^2]. + # NOTE(knzo25): We turn cars into trucks if they have an area between 2.2 x 5.5 and 3.0 * 12.0 [m^2] + # row: original class. column: class to remap to + #UNKNOWN, CAR, TRUCK, BUS, TRAILER, MOTORBIKE, BICYCLE,PEDESTRIAN + [0, 0, 0, 0, 0, 0, 0, 0, #UNKNOWN + 0, 0, 1, 0, 1, 0, 0, 0, #CAR + 0, 0, 0, 0, 1, 0, 0, 0, #TRUCK + 0, 0, 0, 0, 1, 0, 0, 0, #BUS + 0, 0, 0, 0, 0, 0, 0, 0, #TRAILER + 0, 0, 0, 0, 0, 0, 0, 0, #MOTORBIKE + 0, 0, 0, 0, 0, 0, 0, 0, #BICYCLE + 0, 0, 0, 0, 0, 0, 0, 0] #PEDESTRIAN + + min_area_matrix: + #UNKNOWN, CAR, TRUCK, BUS, TRAILER, MOTORBIKE, BICYCLE, PEDESTRIAN + [ 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, #UNKNOWN + 0.000, 0.000, 12.100, 0.000, 36.000, 0.000, 0.000, 0.000, #CAR + 0.000, 0.000, 0.000, 0.000, 36.000, 0.000, 0.000, 0.000, #TRUCK + 0.000, 0.000, 0.000, 0.000, 36.000, 0.000, 0.000, 0.000, #BUS + 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, #TRAILER + 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, #MOTORBIKE + 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, #BICYCLE + 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000] #PEDESTRIAN + + + max_area_matrix: + #UNKNOWN, CAR, TRUCK, BUS, TRAILER, MOTORBIKE, BICYCLE, PEDESTRIAN + [ 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, #UNKNOWN + 0.000, 0.000, 36.000, 0.000, 999.999, 0.000, 0.000, 0.000, #CAR + 0.000, 0.000, 0.000, 0.000, 999.999, 0.000, 0.000, 0.000, #TRUCK + 0.000, 0.000, 0.000, 0.000, 999.999, 0.000, 0.000, 0.000, #BUS + 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, #TRAILER + 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, #MOTORBIKE + 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, #BICYCLE + 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000] #PEDESTRIAN diff --git a/perception/autoware_lidar_bevfusion/include/autoware/bev_ops/bev_pool_cuda.h b/perception/autoware_lidar_bevfusion/include/autoware/bev_ops/bev_pool_cuda.h new file mode 100644 index 0000000000000..f169d8f240ce7 --- /dev/null +++ b/perception/autoware_lidar_bevfusion/include/autoware/bev_ops/bev_pool_cuda.h @@ -0,0 +1,12 @@ + +#ifndef AUTOWARE__BEV_OPS__BEV_POOL_CUDA_H_ +#define AUTOWARE__BEV_OPS__BEV_POOL_CUDA_H_ + +#include + +void bev_pool( + int b, int d, int h, int w, int n, int c, int n_intervals, const float * x, + const int * geom_feats, const int * interval_starts, const int * interval_lengths, float * out, + cudaStream_t & stream); + +#endif // AUTOWARE__BEV_OPS__BEV_POOL_CUDA_H_ diff --git a/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/bevfusion_config.hpp b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/bevfusion_config.hpp new file mode 100644 index 0000000000000..5be7f08f71b9d --- /dev/null +++ b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/bevfusion_config.hpp @@ -0,0 +1,195 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef AUTOWARE__LIDAR_BEVFUSION__BEVFUSION_CONFIG_HPP_ +#define AUTOWARE__LIDAR_BEVFUSION__BEVFUSION_CONFIG_HPP_ + +#include +#include +#include +#include +#include + +namespace autoware::lidar_bevfusion +{ + +class BEVFusionConfig +{ +public: + // cSpell:ignore dbound, xbound, ybound, zbound + BEVFusionConfig( + const bool sensor_fusion, const std::string & plugins_path, const std::int64_t out_size_factor, + const std::int64_t cloud_capacity, const std::int64_t max_points_per_voxel, + const std::vector & voxels_num, const std::vector & point_cloud_range, + const std::vector & voxel_size, const std::vector & dbound, + const std::vector & xbound, const std::vector & ybound, + const std::vector & zbound, const std::int64_t num_cameras, + const std::int64_t raw_image_height, const std::int64_t raw_image_width, + const float img_aug_scale_x, const float img_aug_scale_y, const std::int64_t roi_height, + const std::int64_t roi_width, const std::int64_t features_height, + const std::int64_t features_width, const std::int64_t num_depth_features, + const std::int64_t num_proposals, const float circle_nms_dist_threshold, + const std::vector & yaw_norm_thresholds, const float score_threshold) + { + sensor_fusion_ = sensor_fusion; + plugins_path_ = plugins_path; + + out_size_factor_ = out_size_factor; + + cloud_capacity_ = cloud_capacity; + max_points_per_voxel_ = max_points_per_voxel; + + if (voxels_num.size() == 3) { + min_num_voxels_ = voxels_num[0]; + max_num_voxels_ = voxels_num[2]; + + voxels_num_[0] = voxels_num[0]; + voxels_num_[1] = voxels_num[1]; + voxels_num_[2] = voxels_num[2]; + } + if (point_cloud_range.size() == 6) { + min_x_range_ = point_cloud_range[0]; + min_y_range_ = point_cloud_range[1]; + min_z_range_ = point_cloud_range[2]; + max_x_range_ = point_cloud_range[3]; + max_y_range_ = point_cloud_range[4]; + max_z_range_ = point_cloud_range[5]; + } + if (voxel_size.size() == 3) { + voxel_x_size_ = voxel_size[0]; + voxel_y_size_ = voxel_size[1]; + voxel_z_size_ = voxel_size[2]; + } + if (dbound.size() == 3 && xbound.size() == 3 && ybound.size() == 3 && zbound.size() == 3) { + dbound_ = dbound; + xbound_ = xbound; + ybound_ = ybound; + zbound_ = zbound; + } + + num_cameras_ = num_cameras; + raw_image_height_ = raw_image_height; + raw_image_width_ = raw_image_width; + img_aug_scale_x_ = img_aug_scale_x; + img_aug_scale_y_ = img_aug_scale_y; + roi_height_ = roi_height; + roi_width_ = roi_width; + features_height_ = features_height; + features_width_ = features_width; + num_depth_features_ = num_depth_features; + resized_height_ = raw_image_height_ * img_aug_scale_y_; + resized_width_ = raw_image_width_ * img_aug_scale_x_; + + if (num_proposals > 0) { + num_proposals_ = num_proposals; + } + if (score_threshold > 0.0) { + score_threshold_ = score_threshold; + } + if (circle_nms_dist_threshold > 0.0) { + circle_nms_dist_threshold_ = circle_nms_dist_threshold; + } + yaw_norm_thresholds_ = + std::vector(yaw_norm_thresholds.begin(), yaw_norm_thresholds.end()); + for (auto & yaw_norm_threshold : yaw_norm_thresholds_) { + yaw_norm_threshold = + (yaw_norm_threshold >= 0.0 && 
yaw_norm_threshold < 1.0) ? yaw_norm_threshold : 0.0; + } + grid_x_size_ = static_cast((max_x_range_ - min_x_range_) / voxel_x_size_); + grid_y_size_ = static_cast((max_y_range_ - min_y_range_) / voxel_y_size_); + grid_z_size_ = static_cast((max_z_range_ - min_z_range_) / voxel_z_size_); + } + + ///// MODALITY ///// + bool sensor_fusion_{}; + + // CUDA parameters + const std::uint32_t threads_per_block_{256}; // threads number for a block + + // TensorRT parameters + std::string plugins_path_{}; + + ///// NETWORK PARAMETERS ///// + + // Common network parameters + std::int64_t out_size_factor_{}; + + std::int64_t cloud_capacity_{}; + std::int64_t min_num_voxels_{}; + std::int64_t max_num_voxels_{}; + std::int64_t max_points_per_voxel_; + const std::int64_t num_point_feature_size_{5}; // x, y, z, intensity, lag + + // Pointcloud range in meters + float min_x_range_{}; + float max_x_range_{}; + float min_y_range_{}; + float max_y_range_{}; + float min_z_range_{}; + float max_z_range_{}; + + // Voxel size in meters + float voxel_x_size_{}; + float voxel_y_size_{}; + float voxel_z_size_{}; + + // Grid size + std::int64_t grid_x_size_{}; + std::int64_t grid_y_size_{}; + std::int64_t grid_z_size_{}; + + // Camera branch parameters + std::vector dbound_{}; + std::vector xbound_{}; + std::vector ybound_{}; + std::vector zbound_{}; + + std::int64_t num_cameras_{}; + std::int64_t raw_image_height_{}; + std::int64_t raw_image_width_{}; + + float img_aug_scale_x_{}; + float img_aug_scale_y_{}; + + std::int64_t roi_height_{}; + std::int64_t roi_width_{}; + + std::int64_t resized_height_{}; + std::int64_t resized_width_{}; + + std::int64_t features_height_{}; + std::int64_t features_width_{}; + std::int64_t num_depth_features_{}; + + // Head parameters + std::int64_t num_proposals_{}; + const std::size_t num_classes_{5}; + + // Post processing parameters + + // the score threshold for classification + float score_threshold_{}; + + float circle_nms_dist_threshold_{}; + std::vector yaw_norm_thresholds_{}; + // the detected boxes result decode by (x, y, z, w, l, h, yaw, vx, vy) + const std::int64_t num_box_values_{10}; + + ///// RUNTIME DIMENSIONS ///// + std::array voxels_num_{}; +}; + +} // namespace autoware::lidar_bevfusion + +#endif // AUTOWARE__LIDAR_BEVFUSION__BEVFUSION_CONFIG_HPP_ diff --git a/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/bevfusion_trt.hpp b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/bevfusion_trt.hpp new file mode 100644 index 0000000000000..9a20a90ba39fc --- /dev/null +++ b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/bevfusion_trt.hpp @@ -0,0 +1,150 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
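As a cross-check on the constructor above, the derived members evaluate as follows with the values shipped in bevfusion_ml_package.param.yaml. The sketch below is illustrative only; it just repeats the constructor's float-to-integer truncation:

#include <cstdint>
#include <iostream>

int main()
{
  // Grid size: (max_range - min_range) / voxel_size, truncated as in the constructor.
  const float grid_x = (122.4f - (-122.4f)) / 0.17f;
  // Resized image size: raw size scaled by the augmentation factor, truncated.
  const float resized_w = 1440 * 0.489f;
  const float resized_h = 1080 * 0.489f;
  std::cout << static_cast<std::int64_t>(grid_x) << "\n"     // 1440 (BEV grid, x)
            << static_cast<std::int64_t>(resized_w) << "\n"  // 704, equals roi_width
            << static_cast<std::int64_t>(resized_h) << "\n"; // 528; the 384-row ROI is then
                                                             // cropped out (cf. roi_height,
                                                             // img_aug_offset_y)
}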
+ +#ifndef AUTOWARE__LIDAR_BEVFUSION__BEVFUSION_TRT_HPP_ +#define AUTOWARE__LIDAR_BEVFUSION__BEVFUSION_TRT_HPP_ + +#include "autoware/lidar_bevfusion/postprocess/postprocess_kernel.hpp" +#include "autoware/lidar_bevfusion/preprocess/pointcloud_densification.hpp" +#include "autoware/lidar_bevfusion/preprocess/preprocess_kernel.hpp" +#include "autoware/lidar_bevfusion/preprocess/voxel_generator.hpp" +#include "autoware/lidar_bevfusion/utils.hpp" +#include "autoware/lidar_bevfusion/visibility_control.hpp" + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace autoware::lidar_bevfusion +{ + +using autoware::cuda_utils::CudaUniquePtr; + +class NetworkParam +{ +public: + NetworkParam(std::string onnx_path, std::string engine_path, std::string trt_precision) + : onnx_path_(std::move(onnx_path)), + engine_path_(std::move(engine_path)), + trt_precision_(std::move(trt_precision)) + { + } + + std::string onnx_path() const { return onnx_path_; } + std::string engine_path() const { return engine_path_; } + std::string trt_precision() const { return trt_precision_; } + +private: + std::string onnx_path_; + std::string engine_path_; + std::string trt_precision_; +}; + +class LIDAR_BEVFUSION_PUBLIC BEVFusionTRT +{ +public: + using Matrix4fRowM = Eigen::Matrix; + + explicit BEVFusionTRT( + const tensorrt_common::TrtCommonConfig & trt_config, + const DensificationParam & densification_param, const BEVFusionConfig & config); + virtual ~BEVFusionTRT(); + + bool detect( + const sensor_msgs::msg::PointCloud2::ConstSharedPtr & msg, + const std::vector & image_msgs, + const std::vector & camera_masks, const tf2_ros::Buffer & tf_buffer, + std::vector & det_boxes3d, std::unordered_map & proc_timing); + + void setIntrinsicsExtrinsics( + std::vector & camera_info_vector, + std::vector & lidar2camera_vector); + +protected: + void initPtr(); + void initTrt(const tensorrt_common::TrtCommonConfig & trt_config); + + bool preProcess( + const sensor_msgs::msg::PointCloud2::ConstSharedPtr & pc_msg, + const std::vector & image_msgs, + const std::vector & camera_masks, const tf2_ros::Buffer & tf_buffer); + + bool inference(); + + bool postProcess(std::vector & det_boxes3d); + + std::unique_ptr network_trt_ptr_{nullptr}; + std::unique_ptr vg_ptr_{nullptr}; + std::unique_ptr> stop_watch_ptr_{ + nullptr}; + std::unique_ptr pre_ptr_{nullptr}; + std::unique_ptr post_ptr_{nullptr}; + cudaStream_t stream_{nullptr}; + std::vector camera_streams_{}; + + BEVFusionConfig config_; + std::vector roi_start_y_vector_; + + // pre-process inputs + + unsigned int voxel_features_size_{0}; + unsigned int voxel_coords_size_{0}; + unsigned int bbox_pred_size_{0}; + + // lidar buffers + CudaUniquePtr points_d_{nullptr}; + CudaUniquePtr voxel_features_d_{nullptr}; + CudaUniquePtr voxel_coords_d_{nullptr}; + CudaUniquePtr num_points_per_voxel_d_{nullptr}; + + // pre computed tensors + std::int64_t num_geom_feats_{}; + std::int64_t num_kept_{}; + std::int64_t num_ranks_{}; + std::int64_t num_indices_{}; + CudaUniquePtr lidar2image_d_{}; + CudaUniquePtr geom_feats_d_{}; + CudaUniquePtr kept_d_{}; + CudaUniquePtr ranks_d_{}; + CudaUniquePtr indices_d_{}; + + // image buffers + CudaUniquePtr roi_tensor_d_{nullptr}; + std::vector> image_buffers_d_{}; + CudaUniquePtr camera_masks_d_{nullptr}; + + // output buffers + CudaUniquePtr label_pred_output_d_{nullptr}; + CudaUniquePtr bbox_pred_output_d_{nullptr}; + CudaUniquePtr 
score_output_d_{nullptr}; +}; + +} // namespace autoware::lidar_bevfusion + +#endif // AUTOWARE__LIDAR_BEVFUSION__BEVFUSION_TRT_HPP_ diff --git a/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/detection_class_remapper.hpp b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/detection_class_remapper.hpp new file mode 100644 index 0000000000000..5658684f9790e --- /dev/null +++ b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/detection_class_remapper.hpp @@ -0,0 +1,48 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef AUTOWARE__LIDAR_BEVFUSION__DETECTION_CLASS_REMAPPER_HPP_ +#define AUTOWARE__LIDAR_BEVFUSION__DETECTION_CLASS_REMAPPER_HPP_ + +#include + +#include +#include +#include +#include + +#include +#include + +namespace autoware::lidar_bevfusion +{ + +class DetectionClassRemapper +{ +public: + void setParameters( + const std::vector & allow_remapping_by_area_matrix, + const std::vector & min_area_matrix, const std::vector & max_area_matrix); + void mapClasses(autoware_perception_msgs::msg::DetectedObjects & msg); + +protected: + Eigen::Matrix allow_remapping_by_area_matrix_; + Eigen::MatrixXd min_area_matrix_; + Eigen::MatrixXd max_area_matrix_; + int num_labels_; +}; + +} // namespace autoware::lidar_bevfusion + +#endif // AUTOWARE__LIDAR_BEVFUSION__DETECTION_CLASS_REMAPPER_HPP_ diff --git a/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/lidar_bevfusion_node.hpp b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/lidar_bevfusion_node.hpp new file mode 100644 index 0000000000000..1ee0d2c0071cf --- /dev/null +++ b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/lidar_bevfusion_node.hpp @@ -0,0 +1,97 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +//    http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
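Tying DetectionClassRemapper above back to config/detection_class_remapper.param.yaml: each row of the three matrices is indexed by the original class and each column by the candidate class, so with the shipped values a CAR whose bird's-eye-view footprint lies between 12.1 m^2 and 36 m^2 becomes a TRUCK, and vehicles above 36 m^2 become TRAILER. A minimal sketch of that rule (hypothetical free function, not the class's actual implementation; it assumes the first matching target class wins):

#include <Eigen/Core>

// Sketch: area-gated label remapping as encoded by the three matrices.
int remapLabel(
  int label, double bev_area, const Eigen::MatrixXi & allow,
  const Eigen::MatrixXd & min_area, const Eigen::MatrixXd & max_area)
{
  for (int target = 0; target < allow.cols(); ++target) {
    if (
      allow(label, target) == 1 && bev_area >= min_area(label, target) &&
      bev_area <= max_area(label, target)) {
      return target;  // remap to the first matching target class
    }
  }
  return label;  // no rule matched: keep the original class
}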
+ +#ifndef AUTOWARE__LIDAR_BEVFUSION__LIDAR_BEVFUSION_NODE_HPP_ +#define AUTOWARE__LIDAR_BEVFUSION__LIDAR_BEVFUSION_NODE_HPP_ + +#include "autoware/lidar_bevfusion/bevfusion_trt.hpp" +#include "autoware/lidar_bevfusion/detection_class_remapper.hpp" +#include "autoware/lidar_bevfusion/postprocess/non_maximum_suppression.hpp" +#include "autoware/lidar_bevfusion/preprocess/pointcloud_densification.hpp" +#include "autoware/lidar_bevfusion/visibility_control.hpp" + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace autoware::lidar_bevfusion +{ + +class LIDAR_BEVFUSION_PUBLIC LidarBEVFusionNode : public rclcpp::Node +{ +public: + using Matrix4f = Eigen::Matrix; + + explicit LidarBEVFusionNode(const rclcpp::NodeOptions & options); + +private: + void cloudCallback(const sensor_msgs::msg::PointCloud2::ConstSharedPtr msg); + void imageCallback(const sensor_msgs::msg::Image::ConstSharedPtr msg, std::size_t camera_id); + void cameraInfoCallback(const sensor_msgs::msg::CameraInfo & msg, std::size_t camera_id); + + rclcpp::Subscription::ConstSharedPtr cloud_sub_{nullptr}; + std::vector::ConstSharedPtr> image_subs_; + std::vector::ConstSharedPtr> camera_info_subs_; + rclcpp::Publisher::SharedPtr objects_pub_{ + nullptr}; + + tf2_ros::Buffer tf_buffer_; + tf2_ros::TransformListener tf_listener_{tf_buffer_}; + + DetectionClassRemapper detection_class_remapper_; + float score_threshold_{0.0}; + std::vector class_names_; + std::optional lidar_frame_; + float max_camera_lidar_delay_; + + std::vector image_msgs_; + std::vector camera_masks_; + std::vector> camera_info_msgs_; + std::vector> lidar2camera_extrinsics_; + + bool sensor_fusion_{false}; + bool images_available_{false}; + bool intrinsics_available_{false}; + bool extrinsics_available_{false}; + bool intrinsics_extrinsics_precomputed_{false}; + + NonMaximumSuppression iou_bev_nms_; + + std::unique_ptr detector_ptr_{nullptr}; + + // debugger + std::unique_ptr> stop_watch_ptr_{ + nullptr}; + std::unique_ptr debug_publisher_ptr_{nullptr}; + std::unique_ptr published_time_pub_{nullptr}; +}; +} // namespace autoware::lidar_bevfusion + +#endif // AUTOWARE__LIDAR_BEVFUSION__LIDAR_BEVFUSION_NODE_HPP_ diff --git a/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/postprocess/circle_nms_kernel.hpp b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/postprocess/circle_nms_kernel.hpp new file mode 100644 index 0000000000000..f6b00c19c5f06 --- /dev/null +++ b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/postprocess/circle_nms_kernel.hpp @@ -0,0 +1,34 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
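One detail worth noting in the node header above: imageCallback() and cameraInfoCallback() take a camera_id, and the subscriptions are stored in vectors, which implies one subscription per camera with the index bound into the callback. A sketch of that wiring (the topic naming scheme and QoS are assumptions, not taken from this patch):

// Sketch: one image subscription per camera, binding camera_id via a lambda.
for (std::size_t camera_id = 0; camera_id < num_cameras; ++camera_id) {
  image_subs_.push_back(create_subscription<sensor_msgs::msg::Image>(
    "~/input/image" + std::to_string(camera_id), rclcpp::SensorDataQoS{}.keep_last(1),
    [this, camera_id](const sensor_msgs::msg::Image::ConstSharedPtr msg) {
      imageCallback(msg, camera_id);
    }));
}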
+ + #ifndef AUTOWARE__LIDAR_BEVFUSION__POSTPROCESS__CIRCLE_NMS_KERNEL_HPP_ +#define AUTOWARE__LIDAR_BEVFUSION__POSTPROCESS__CIRCLE_NMS_KERNEL_HPP_ + +#include "autoware/lidar_bevfusion/utils.hpp" + +#include + +#include + +namespace autoware::lidar_bevfusion +{ +// Non-maximum suppression (NMS) uses the distance on the xy plane instead of +// intersection over union (IoU) to suppress overlapped objects. +std::size_t circleNMS( + thrust::device_vector & boxes3d, const float distance_threshold, + thrust::device_vector & keep_mask, cudaStream_t stream); + +} // namespace autoware::lidar_bevfusion + +#endif // AUTOWARE__LIDAR_BEVFUSION__POSTPROCESS__CIRCLE_NMS_KERNEL_HPP_ diff --git a/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/postprocess/non_maximum_suppression.hpp b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/postprocess/non_maximum_suppression.hpp new file mode 100644 index 0000000000000..c8d64a76b41c6 --- /dev/null +++ b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/postprocess/non_maximum_suppression.hpp @@ -0,0 +1,82 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef AUTOWARE__LIDAR_BEVFUSION__POSTPROCESS__NON_MAXIMUM_SUPPRESSION_HPP_ +#define AUTOWARE__LIDAR_BEVFUSION__POSTPROCESS__NON_MAXIMUM_SUPPRESSION_HPP_ + +#include "autoware/lidar_bevfusion/ros_utils.hpp" + +#include + +#include + +#include +#include +#include + +namespace autoware::lidar_bevfusion +{ +using autoware_perception_msgs::msg::DetectedObject; + +// TODO(knzo25): Update this implementation to match changes from centerpoint +enum class NMS_TYPE { + IoU_BEV + // IoU_3D + // Distance_2D + // Distance_3D +}; + +struct NMSParams +{ + NMS_TYPE nms_type_{}; + std::vector target_class_names_{}; + double search_distance_2d_{}; + double iou_threshold_{}; + // double distance_threshold_{}; +}; + +inline std::vector classNamesToBooleanMask(const std::vector & class_names) +{ + std::vector mask; + constexpr std::size_t num_object_classification = 8; + mask.resize(num_object_classification); + for (const auto & class_name : class_names) { + const auto semantic_type = getSemanticType(class_name); + mask.at(semantic_type) = true; + } + + return mask; +} + +class NonMaximumSuppression +{ +public: + void setParameters(const NMSParams &); + + std::vector apply(const std::vector &); + +private: + bool isTargetLabel(const std::uint8_t); + + bool isTargetPairObject(const DetectedObject &, const DetectedObject &); + + Eigen::MatrixXd generateIoUMatrix(const std::vector &); + + NMSParams params_{}; + std::vector target_class_mask_{}; +}; + +} // namespace autoware::lidar_bevfusion + +#endif // AUTOWARE__LIDAR_BEVFUSION__POSTPROCESS__NON_MAXIMUM_SUPPRESSION_HPP_ diff --git a/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/postprocess/postprocess_kernel.hpp b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/postprocess/postprocess_kernel.hpp new file mode 100644 index
0000000000000..4a7c8845e85ee --- /dev/null +++ b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/postprocess/postprocess_kernel.hpp @@ -0,0 +1,47 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef AUTOWARE__LIDAR_BEVFUSION__POSTPROCESS__POSTPROCESS_KERNEL_HPP_ +#define AUTOWARE__LIDAR_BEVFUSION__POSTPROCESS__POSTPROCESS_KERNEL_HPP_ + +#include "autoware/lidar_bevfusion/bevfusion_config.hpp" +#include "autoware/lidar_bevfusion/utils.hpp" + +#include +#include + +#include + +namespace autoware::lidar_bevfusion +{ + +class PostprocessCuda +{ +public: + explicit PostprocessCuda(const BEVFusionConfig & config, cudaStream_t & stream); + + cudaError_t generateDetectedBoxes3D_launch( + const std::int64_t * label_pred_output, const float * bbox_pred_output, + const float * score_output, std::vector & det_boxes3d, cudaStream_t stream); + +private: + BEVFusionConfig config_; + cudaStream_t stream_; + cudaStream_t stream_event_; + cudaEvent_t start_, stop_; +}; + +} // namespace autoware::lidar_bevfusion + +#endif // AUTOWARE__LIDAR_BEVFUSION__POSTPROCESS__POSTPROCESS_KERNEL_HPP_ diff --git a/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/preprocess/point_type.hpp b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/preprocess/point_type.hpp new file mode 100644 index 0000000000000..35cf87e7d6c3d --- /dev/null +++ b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/preprocess/point_type.hpp @@ -0,0 +1,35 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
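For readers unfamiliar with circle NMS (declared in circle_nms_kernel.hpp earlier in this patch): it suppresses detections by center distance on the BEV plane rather than by IoU. A CPU reference of the computation, assuming the boxes arrive sorted by score in descending order (illustrative only; the shipped version is a CUDA kernel writing a keep mask):

#include <cstddef>
#include <vector>

// CPU reference: keep the highest-scoring box, drop any later box whose
// BEV center lies within distance_threshold of an already-kept box.
std::vector<bool> circleNMSCpu(const std::vector<Box3D> & boxes, float distance_threshold)
{
  std::vector<bool> keep(boxes.size(), true);
  const float threshold_sq = distance_threshold * distance_threshold;
  for (std::size_t i = 0; i < boxes.size(); ++i) {
    if (!keep[i]) continue;
    for (std::size_t j = i + 1; j < boxes.size(); ++j) {
      const float dx = boxes[i].x - boxes[j].x;
      const float dy = boxes[i].y - boxes[j].y;
      if (dx * dx + dy * dy < threshold_sq) keep[j] = false;
    }
  }
  return keep;
}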
+ +#ifndef AUTOWARE__LIDAR_BEVFUSION__PREPROCESS__POINT_TYPE_HPP_ +#define AUTOWARE__LIDAR_BEVFUSION__PREPROCESS__POINT_TYPE_HPP_ + +#include + +namespace autoware::lidar_bevfusion +{ + +struct InputPointType +{ + float x; + float y; + float z; + std::uint8_t intensity; + std::uint8_t return_type; + std::uint16_t channel; +}; + +} // namespace autoware::lidar_bevfusion + +#endif // AUTOWARE__LIDAR_BEVFUSION__PREPROCESS__POINT_TYPE_HPP_ diff --git a/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/preprocess/pointcloud_densification.hpp b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/preprocess/pointcloud_densification.hpp new file mode 100644 index 0000000000000..c02ba430568cf --- /dev/null +++ b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/preprocess/pointcloud_densification.hpp @@ -0,0 +1,104 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef AUTOWARE__LIDAR_BEVFUSION__PREPROCESS__POINTCLOUD_DENSIFICATION_HPP_ +#define AUTOWARE__LIDAR_BEVFUSION__PREPROCESS__POINTCLOUD_DENSIFICATION_HPP_ + +#include "autoware/lidar_bevfusion/preprocess/point_type.hpp" + +#include +#include + +#include + +#include +#include + +#include +#include +#include +#include + +namespace autoware::lidar_bevfusion +{ + +using autoware::cuda_utils::CudaUniquePtr; + +class DensificationParam +{ +public: + DensificationParam(const std::string & world_frame_id, const unsigned int num_past_frames) + : world_frame_id_(std::move(world_frame_id)), + pointcloud_cache_size_(num_past_frames + /*current frame*/ 1) + { + } + + std::string world_frame_id() const { return world_frame_id_; } + unsigned int pointcloud_cache_size() const { return pointcloud_cache_size_; } + +private: + std::string world_frame_id_; + unsigned int pointcloud_cache_size_{1}; +}; + +struct PointCloudWithTransform +{ + CudaUniquePtr data_d{nullptr}; + std_msgs::msg::Header header; + std::size_t num_points{0}; + Eigen::Affine3f affine_past2world; +}; + +class PointCloudDensification +{ +public: + explicit PointCloudDensification(const DensificationParam & param, cudaStream_t & stream); + + bool enqueuePointCloud( + const sensor_msgs::msg::PointCloud2 & msg, const tf2_ros::Buffer & tf_buffer); + + double getCurrentTimestamp() const { return current_timestamp_; } + Eigen::Affine3f getAffineWorldToCurrent() const { return affine_world2current_; } + std::list::iterator getPointCloudCacheIter() + { + return pointcloud_cache_.begin(); + } + bool isCacheEnd(std::list::iterator iter) + { + return iter == pointcloud_cache_.end(); + } + std::size_t getIdx(std::list::iterator iter) + { + return std::distance(pointcloud_cache_.begin(), iter); + } + std::size_t getCacheSize() + { + return std::distance(pointcloud_cache_.begin(), pointcloud_cache_.end()); + } + unsigned int pointcloud_cache_size() const { return param_.pointcloud_cache_size(); } + +private: + void enqueue(const sensor_msgs::msg::PointCloud2 & msg, const Eigen::Affine3f & affine); + void 
dequeue(); + + DensificationParam param_; + double current_timestamp_{0.0}; + Eigen::Affine3f affine_world2current_; + std::list pointcloud_cache_; + cudaStream_t stream_; +}; + +} // namespace autoware::lidar_bevfusion + +#endif // AUTOWARE__LIDAR_BEVFUSION__PREPROCESS__POINTCLOUD_DENSIFICATION_HPP_ diff --git a/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/preprocess/precomputed_features.hpp b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/preprocess/precomputed_features.hpp new file mode 100644 index 0000000000000..5962c0b0e6620 --- /dev/null +++ b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/preprocess/precomputed_features.hpp @@ -0,0 +1,89 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef AUTOWARE__LIDAR_BEVFUSION__PREPROCESS__PRECOMPUTED_FEATURES_HPP_ +#define AUTOWARE__LIDAR_BEVFUSION__PREPROCESS__PRECOMPUTED_FEATURES_HPP_ + +#include "autoware/lidar_bevfusion/bevfusion_config.hpp" + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +namespace autoware::lidar_bevfusion +{ + +using Tensor1D = Eigen::Tensor; +using Tensor2D = Eigen::Tensor; +using Tensor3D = Eigen::Tensor; +using Tensor4D = Eigen::Tensor; +using Tensor5D = Eigen::Tensor; +using Tensor6D = Eigen::Tensor; +using Tensor7D = Eigen::Tensor; + +using TensorMap1D = Eigen::TensorMap; +using TensorMap2D = Eigen::TensorMap; +using TensorMap3D = Eigen::TensorMap; +using TensorMap4D = Eigen::TensorMap; +using TensorMap5D = Eigen::TensorMap; +using TensorMap6D = Eigen::TensorMap; +using TensorMap7D = Eigen::TensorMap; + +using Matrix4fRowM = Eigen::Matrix; +using Matrix3fRowM = Eigen::Matrix; + +using Vector3fRowM = Eigen::Matrix; + +Tensor4D create_frustum(const BEVFusionConfig & config); + +Tensor5D get_geometry( + const Tensor4D & frustum, // [D, H, W, 3] + const Tensor3D & camera2lidar_rots, // [N, 3, 3] + const Tensor2D & camera2lidar_trans, // [N, 3] + const Tensor3D & intrins_inverse, // [N, 3, 3] + const Tensor3D & post_rots_inverse, // [N, 3, 3] + const Tensor2D & post_trans // [N, 3] +); + +// Define the function +std::tuple< + Eigen::Matrix, + Eigen::Matrix, + Eigen::Matrix, + Eigen::Matrix > +bev_pool_aux(const Tensor5D & geom_feats_input, const BEVFusionConfig & config); + +std::tuple< + Eigen::VectorXf, // lidar2image + Eigen::Matrix, + Eigen::Matrix, + Eigen::Matrix, + Eigen::Matrix > +precompute_features( + const std::vector & lidar2camera_transforms, + const std::vector & camera_aug_matrices, + const std::vector & camera_info_vector, + const BEVFusionConfig & config); + +} // namespace autoware::lidar_bevfusion + +#endif // AUTOWARE__LIDAR_BEVFUSION__PREPROCESS__PRECOMPUTED_FEATURES_HPP_ diff --git a/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/preprocess/preprocess_kernel.hpp b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/preprocess/preprocess_kernel.hpp new file mode 100644 index 
0000000000000..c14ee72cfd1c3 --- /dev/null +++ b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/preprocess/preprocess_kernel.hpp @@ -0,0 +1,76 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/* + * SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES. + * All rights reserved. SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef AUTOWARE__LIDAR_BEVFUSION__PREPROCESS__PREPROCESS_KERNEL_HPP_ +#define AUTOWARE__LIDAR_BEVFUSION__PREPROCESS__PREPROCESS_KERNEL_HPP_ + +#include "autoware/lidar_bevfusion/bevfusion_config.hpp" +#include "autoware/lidar_bevfusion/preprocess/point_type.hpp" +#include "autoware/lidar_bevfusion/utils.hpp" + +#include + +#include +#include + +#include +#include + +namespace autoware::lidar_bevfusion +{ + +class PreprocessCuda +{ +public: + PreprocessCuda(const BEVFusionConfig & config, cudaStream_t & stream, bool allocate_buffers); + + cudaError_t generateSweepPoints_launch( + const InputPointType * input_data, std::size_t points_size, float time_lag, + const float * transform, float * output_points); + + std::size_t generateVoxels( + const float * points, unsigned int points_size, float * voxel_features, + std::int32_t * voxel_coords, std::int32_t * num_points_per_voxel); + + cudaError_t resize_and_extract_roi_launch( + const std::uint8_t * input_img, std::uint8_t * output_img, int H, int W, int H2, int W2, int H3, + int W3, int y_start, int x_start, cudaStream_t stream); + +private: + BEVFusionConfig config_; + cudaStream_t stream_; + + tv::Tensor hash_key_value_; + tv::Tensor point_indice_data_; + tv::Tensor points_voxel_id_; +}; +} // namespace autoware::lidar_bevfusion + +#endif // AUTOWARE__LIDAR_BEVFUSION__PREPROCESS__PREPROCESS_KERNEL_HPP_ diff --git a/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/preprocess/voxel_generator.hpp b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/preprocess/voxel_generator.hpp new file mode 100644 index 0000000000000..7b90c4e86de06 --- /dev/null +++ b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/preprocess/voxel_generator.hpp @@ -0,0 +1,64 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef AUTOWARE__LIDAR_BEVFUSION__PREPROCESS__VOXEL_GENERATOR_HPP_ +#define AUTOWARE__LIDAR_BEVFUSION__PREPROCESS__VOXEL_GENERATOR_HPP_ + +#include "autoware/lidar_bevfusion/bevfusion_config.hpp" +#include "autoware/lidar_bevfusion/preprocess/pointcloud_densification.hpp" +#include "autoware/lidar_bevfusion/preprocess/preprocess_kernel.hpp" +#include "autoware/lidar_bevfusion/ros_utils.hpp" + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +namespace autoware::lidar_bevfusion +{ + +class VoxelGenerator +{ +public: + explicit VoxelGenerator( + const DensificationParam & densification_param, const BEVFusionConfig & config, + cudaStream_t & stream); + std::size_t generateSweepPoints(CudaUniquePtr & points_d); + bool enqueuePointCloud( + const sensor_msgs::msg::PointCloud2 & msg, const tf2_ros::Buffer & tf_buffer); + void initCloudInfo(const sensor_msgs::msg::PointCloud2 & msg); + std::tuple getFieldInfo( + const sensor_msgs::msg::PointCloud2 & msg, const std::string & field_name); + +private: + std::unique_ptr pd_ptr_{nullptr}; + std::unique_ptr pre_ptr_{nullptr}; + BEVFusionConfig config_; + CudaUniquePtr affine_past2current_d_{nullptr}; + std::vector points_; + cudaStream_t stream_; +}; + +} // namespace autoware::lidar_bevfusion + +#endif // AUTOWARE__LIDAR_BEVFUSION__PREPROCESS__VOXEL_GENERATOR_HPP_ diff --git a/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/ros_utils.hpp b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/ros_utils.hpp new file mode 100644 index 0000000000000..62128024a15aa --- /dev/null +++ b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/ros_utils.hpp @@ -0,0 +1,67 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
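VoxelGenerator above feeds PreprocessCuda::generateVoxels(); the core index computation for assigning a point to a grid cell, consistent with the ranges and voxel sizes held in BEVFusionConfig, is sketched here (the function name and the (z, y, x) output ordering are assumptions):

#include <array>
#include <cstdint>
#include <optional>

// Sketch: point -> voxel coordinate; points outside the range are dropped.
std::optional<std::array<std::int32_t, 3>> voxelIndex(
  float x, float y, float z, const BEVFusionConfig & config)
{
  const auto ix = static_cast<std::int32_t>((x - config.min_x_range_) / config.voxel_x_size_);
  const auto iy = static_cast<std::int32_t>((y - config.min_y_range_) / config.voxel_y_size_);
  const auto iz = static_cast<std::int32_t>((z - config.min_z_range_) / config.voxel_z_size_);
  if (
    ix < 0 || ix >= config.grid_x_size_ || iy < 0 || iy >= config.grid_y_size_ || iz < 0 ||
    iz >= config.grid_z_size_) {
    return std::nullopt;
  }
  return std::array<std::int32_t, 3>{iz, iy, ix};
}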
+ + #ifndef AUTOWARE__LIDAR_BEVFUSION__ROS_UTILS_HPP_ +#define AUTOWARE__LIDAR_BEVFUSION__ROS_UTILS_HPP_ + +#include "autoware/lidar_bevfusion/preprocess/point_type.hpp" +#include "autoware/lidar_bevfusion/utils.hpp" + +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +#define CHECK_OFFSET(structure1, structure2, field) \ static_assert( \ offsetof(structure1, field) == offsetof(structure2, field), \ "Offset of " #field " in " #structure1 " does not match the one in " #structure2 ".") +#define CHECK_TYPE(structure1, structure2, field) \ static_assert( \ std::is_same_v, \ "Type of " #field " in " #structure1 " and " #structure2 " have different types.") +#define CHECK_FIELD(structure1, structure2, field) \ CHECK_OFFSET(structure1, structure2, field); \ CHECK_TYPE(structure1, structure2, field) + +namespace autoware::lidar_bevfusion +{ +using sensor_msgs::msg::PointField; + +CHECK_FIELD(InputPointType, autoware::point_types::PointXYZIRC, x); +CHECK_FIELD(InputPointType, autoware::point_types::PointXYZIRC, y); +CHECK_FIELD(InputPointType, autoware::point_types::PointXYZIRC, z); +CHECK_FIELD(InputPointType, autoware::point_types::PointXYZIRC, intensity); +static_assert(sizeof(InputPointType) == sizeof(autoware::point_types::PointXYZIRC)); + +// TODO(knzo25): will move this from the pointcloud preprocessor to autoware_point_types after +// this is merged +bool is_data_layout_compatible_with_point_xyzirc(const sensor_msgs::msg::PointCloud2 & input); + +void box3DToDetectedObject( + const Box3D & box3d, const std::vector & class_names, + autoware_perception_msgs::msg::DetectedObject & obj); + +uint8_t getSemanticType(const std::string & class_name); + +} // namespace autoware::lidar_bevfusion + +#endif // AUTOWARE__LIDAR_BEVFUSION__ROS_UTILS_HPP_ diff --git a/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/utils.hpp b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/utils.hpp new file mode 100644 index 0000000000000..e0cbe9a7ebabf --- /dev/null +++ b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/utils.hpp @@ -0,0 +1,57 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
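To make the macros above concrete, CHECK_FIELD(InputPointType, autoware::point_types::PointXYZIRC, x) expands to the pair of static_asserts below. The decltype comparison in the type check is an assumption, since the template arguments of std::is_same_v are not visible in this excerpt:

static_assert(
  offsetof(InputPointType, x) == offsetof(autoware::point_types::PointXYZIRC, x),
  "Offset of x in InputPointType does not match the one in autoware::point_types::PointXYZIRC.");
static_assert(
  std::is_same_v<
    decltype(InputPointType::x), decltype(autoware::point_types::PointXYZIRC::x)>,
  "Type of x in InputPointType and autoware::point_types::PointXYZIRC have different types.");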
+ +#ifndef AUTOWARE__LIDAR_BEVFUSION__UTILS_HPP_ +#define AUTOWARE__LIDAR_BEVFUSION__UTILS_HPP_ + +#include +#include +#include +#include + +namespace autoware::lidar_bevfusion +{ + +struct Box3D +{ + int label; + float score; + float x; + float y; + float z; + float width; + float length; + float height; + float yaw; + float vx; + float vy; +}; + +// cspell: ignore divup +template <typename T1, typename T2> +unsigned int divup(const T1 a, const T2 b) +{ + if (a == 0) { + throw std::runtime_error("The dividend of divup is zero."); + } + if (b == 0) { + throw std::runtime_error("The divisor of divup is zero."); + } + + return (a + b - 1) / b; +} + +} // namespace autoware::lidar_bevfusion + +#endif // AUTOWARE__LIDAR_BEVFUSION__UTILS_HPP_ diff --git a/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/visibility_control.hpp b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/visibility_control.hpp new file mode 100644 index 0000000000000..96e64e770ded3 --- /dev/null +++ b/perception/autoware_lidar_bevfusion/include/autoware/lidar_bevfusion/visibility_control.hpp @@ -0,0 +1,37 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef AUTOWARE__LIDAR_BEVFUSION__VISIBILITY_CONTROL_HPP_ +#define AUTOWARE__LIDAR_BEVFUSION__VISIBILITY_CONTROL_HPP_ + +//////////////////////////////////////////////////////////////////////////////// +#if defined(__WIN32) +#if defined(LIDAR_BEVFUSION_BUILDING_DLL) || defined(LIDAR_BEVFUSION_EXPORTS) +#define LIDAR_BEVFUSION_PUBLIC __declspec(dllexport) +#define LIDAR_BEVFUSION_LOCAL +#else // defined(LIDAR_BEVFUSION_BUILDING_DLL) || defined(LIDAR_BEVFUSION_EXPORTS) +#define LIDAR_BEVFUSION_PUBLIC __declspec(dllimport) +#define LIDAR_BEVFUSION_LOCAL +#endif // defined(LIDAR_BEVFUSION_BUILDING_DLL) || defined(LIDAR_BEVFUSION_EXPORTS) +#elif defined(__linux__) +#define LIDAR_BEVFUSION_PUBLIC __attribute__((visibility("default"))) +#define LIDAR_BEVFUSION_LOCAL __attribute__((visibility("hidden"))) +#elif defined(__APPLE__) +#define LIDAR_BEVFUSION_PUBLIC __attribute__((visibility("default"))) +#define LIDAR_BEVFUSION_LOCAL __attribute__((visibility("hidden"))) +#else +#error "Unsupported Build Configuration" +#endif + +#endif // AUTOWARE__LIDAR_BEVFUSION__VISIBILITY_CONTROL_HPP_ diff --git a/perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/get_indice_pairs_implicit_gemm_plugin.hpp b/perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/get_indice_pairs_implicit_gemm_plugin.hpp new file mode 100644 index 0000000000000..9e43e8fa8c894 --- /dev/null +++ b/perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/get_indice_pairs_implicit_gemm_plugin.hpp @@ -0,0 +1,140 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef AUTOWARE__TENSORRT_PLUGINS__GET_INDICE_PAIRS_IMPLICIT_GEMM_PLUGIN_HPP_ +#define AUTOWARE__TENSORRT_PLUGINS__GET_INDICE_PAIRS_IMPLICIT_GEMM_PLUGIN_HPP_ + +#include +#include +#include + +#include +#include +#include +#include +#include + +constexpr char const * const kGET_INDICE_PAIRS_IMPLICIT_GEMM_PLUGIN_NAME{ + "GetIndicePairsImplicitGemm"}; +constexpr char const * const kGET_INDICE_PAIRS_IMPLICIT_GEMM_PLUGIN_VERSION{"1"}; +constexpr char const * const kGET_INDICE_PAIRS_IMPLICIT_GEMM_PLUGIN_NAMESPACE{""}; + +namespace nvinfer1 +{ +namespace plugin +{ + +struct GetIndicePairsImplicitGemmParameters +{ + std::int32_t batch_size; + std::int32_t algo; + std::int32_t is_train; + std::vector dilation; + std::vector ksize; + std::vector out_padding; + std::vector padding; + std::vector spatial_shape; + std::vector stride; + std::int32_t subm; + std::int32_t transpose; + + nvinfer1::Dims dilation_dims; + nvinfer1::Dims ksize_dims; + nvinfer1::Dims out_padding_dims; + nvinfer1::Dims padding_dims; + nvinfer1::Dims spatial_shape_dims; + nvinfer1::Dims stride_dims; +}; + +class GetIndicePairsImplicitGemmPlugin : public IPluginV3, + public IPluginV3OneCore, + public IPluginV3OneBuild, + public IPluginV3OneRuntime +{ +public: + GetIndicePairsImplicitGemmPlugin( + const std::string & name, GetIndicePairsImplicitGemmParameters const & params); + + ~GetIndicePairsImplicitGemmPlugin() override = default; + + // IPluginV3 Methods + + IPluginCapability * getCapabilityInterface(PluginCapabilityType type) noexcept override; + + IPluginV3 * clone() noexcept override; + + // IPluginV3OneCore Methods + + char const * getPluginName() const noexcept override; + + char const * getPluginVersion() const noexcept override; + + char const * getPluginNamespace() const noexcept override; + + // IPluginV3OneBuild Methods + + std::int32_t getNbOutputs() const noexcept override; + + std::int32_t configurePlugin( + DynamicPluginTensorDesc const * in, std::int32_t num_inputs, + DynamicPluginTensorDesc const * out, std::int32_t num_outputs) noexcept override; + + bool supportsFormatCombination( + std::int32_t pos, DynamicPluginTensorDesc const * in_out, std::int32_t num_inputs, + std::int32_t num_outputs) noexcept override; + + std::int32_t getOutputDataTypes( + DataType * output_types, std::int32_t num_outputs, DataType const * input_types, + std::int32_t num_inputs) const noexcept override; + + std::int32_t getOutputShapes( + DimsExprs const * inputs, std::int32_t num_inputs, DimsExprs const * shape_inputs, + std::int32_t num_shape_inputs, DimsExprs * outputs, std::int32_t num_outputs, + IExprBuilder & expr_builder) noexcept override; + + // IPluginV3OneRuntime Methods + + std::int32_t enqueue( + PluginTensorDesc const * input_desc, PluginTensorDesc const * output_desc, + void const * const * inputs, void * const * outputs, void * workspace, + cudaStream_t stream) noexcept override; + + std::int32_t onShapeChange( + PluginTensorDesc const * in, std::int32_t num_inputs, PluginTensorDesc const * out, + std::int32_t num_outputs) noexcept override; + + IPluginV3 * attachToContext(IPluginResourceContext 
* context) noexcept override; + + PluginFieldCollection const * getFieldsToSerialize() noexcept override; + + std::size_t getWorkspaceSize( + DynamicPluginTensorDesc const * inputs, std::int32_t num_inputs, + DynamicPluginTensorDesc const * outputs, std::int32_t num_outputs) const noexcept override; + +private: + void initFieldsToSerialize(); + + // upper bound of number of output indices. needed to bound memory usage. + static constexpr int out_inds_num_limit_{256000}; + + std::string layer_name_; + GetIndicePairsImplicitGemmParameters params_; + std::vector data_to_serialize_; + nvinfer1::PluginFieldCollection fc_to_serialize_; +}; + +} // namespace plugin +} // namespace nvinfer1 + +#endif // AUTOWARE__TENSORRT_PLUGINS__GET_INDICE_PAIRS_IMPLICIT_GEMM_PLUGIN_HPP_ diff --git a/perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/get_indice_pairs_implicit_gemm_plugin_creator.hpp b/perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/get_indice_pairs_implicit_gemm_plugin_creator.hpp new file mode 100644 index 0000000000000..7b5eeb8cc1e9a --- /dev/null +++ b/perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/get_indice_pairs_implicit_gemm_plugin_creator.hpp @@ -0,0 +1,65 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef AUTOWARE__TENSORRT_PLUGINS__GET_INDICE_PAIRS_IMPLICIT_GEMM_PLUGIN_CREATOR_HPP_ +#define AUTOWARE__TENSORRT_PLUGINS__GET_INDICE_PAIRS_IMPLICIT_GEMM_PLUGIN_CREATOR_HPP_ + +#include "autoware/tensorrt_plugins/get_indice_pairs_implicit_gemm_plugin.hpp" + +#include + +#include + +namespace nvinfer1 +{ +namespace plugin +{ + +// Plugin factory class. 
+class GetIndicePairsImplicitGemmPluginCreator : public nvinfer1::IPluginCreatorV3One +{ +public: + GetIndicePairsImplicitGemmPluginCreator(); + + ~GetIndicePairsImplicitGemmPluginCreator() override = default; + + char const * getPluginNamespace() const noexcept override + { + return kGET_INDICE_PAIRS_IMPLICIT_GEMM_PLUGIN_NAMESPACE; + } + + char const * getPluginName() const noexcept override + { + return kGET_INDICE_PAIRS_IMPLICIT_GEMM_PLUGIN_NAME; + } + + char const * getPluginVersion() const noexcept override + { + return kGET_INDICE_PAIRS_IMPLICIT_GEMM_PLUGIN_VERSION; + } + + nvinfer1::PluginFieldCollection const * getFieldNames() noexcept override; + + IPluginV3 * createPlugin( + char const * name, PluginFieldCollection const * fc, TensorRTPhase phase) noexcept override; + +private: + nvinfer1::PluginFieldCollection fc_; + std::vector plugin_attributes_; +}; + +} // namespace plugin +} // namespace nvinfer1 + +#endif // AUTOWARE__TENSORRT_PLUGINS__GET_INDICE_PAIRS_IMPLICIT_GEMM_PLUGIN_CREATOR_HPP_ diff --git a/perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/implicit_gemm_plugin.hpp b/perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/implicit_gemm_plugin.hpp new file mode 100644 index 0000000000000..4ddca60e6795f --- /dev/null +++ b/perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/implicit_gemm_plugin.hpp @@ -0,0 +1,138 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
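A creator like the one above receives its parameters as a PluginFieldCollection when TensorRT builds or deserializes an engine. A sketch of the building-side call, assuming field names that mirror the GetIndicePairsImplicitGemmParameters struct (the authoritative field list lives in the creator's .cpp, not in this header):

#include <NvInferRuntime.h>

#include <array>
#include <cstdint>

// Hypothetical builder-side helper, illustration only.
nvinfer1::IPluginV3 * make_get_indice_pairs_plugin(nvinfer1::IPluginCreatorV3One & creator)
{
  const std::int32_t batch_size = 1;
  const std::array<std::int32_t, 3> ksize{3, 3, 3};

  std::array<nvinfer1::PluginField, 2> fields{
    nvinfer1::PluginField{"batch_size", &batch_size, nvinfer1::PluginFieldType::kINT32, 1},
    nvinfer1::PluginField{"ksize", ksize.data(), nvinfer1::PluginFieldType::kINT32, 3}};

  nvinfer1::PluginFieldCollection fc;
  fc.nbFields = static_cast<std::int32_t>(fields.size());
  fc.fields = fields.data();

  // TensorRTPhase::kBUILD marks plugin creation during network construction,
  // as opposed to kRUNTIME during engine deserialization.
  return creator.createPlugin("get_indice_pairs", &fc, nvinfer1::TensorRTPhase::kBUILD);
}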
+ +#ifndef AUTOWARE__TENSORRT_PLUGINS__IMPLICIT_GEMM_PLUGIN_HPP_ +#define AUTOWARE__TENSORRT_PLUGINS__IMPLICIT_GEMM_PLUGIN_HPP_ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +constexpr char const * const kIMPLICIT_GEMM_PLUGIN_NAME{"ImplicitGemm"}; +constexpr char const * const kIMPLICIT_GEMM_PLUGIN_VERSION{"1"}; +constexpr char const * const kIMPLICIT_GEMM_PLUGIN_NAMESPACE{""}; + +namespace nvinfer1 +{ +namespace plugin +{ + +struct ImplicitGemmParameters +{ + float act_alpha; + float act_beta; + std::int64_t is_subm; + std::int64_t is_train; + float output_add_scale; + float output_scale; +}; + +class ImplicitGemmPlugin : public IPluginV3, + public IPluginV3OneCore, + public IPluginV3OneBuild, + public IPluginV3OneRuntime +{ +public: + using ConvTunerSimple = spconvlib::spconv::csrc::sparse::convops::spops::ConvTuner; + ImplicitGemmPlugin(const std::string & name, ImplicitGemmParameters const & params); + + ~ImplicitGemmPlugin() override = default; + + // IPluginV3 Methods + + IPluginCapability * getCapabilityInterface(PluginCapabilityType type) noexcept override; + + IPluginV3 * clone() noexcept override; + + // IPluginV3OneCore Methods + + char const * getPluginName() const noexcept override; + + char const * getPluginVersion() const noexcept override; + + char const * getPluginNamespace() const noexcept override; + + // IPluginV3OneBuild Methods + + std::int32_t getNbOutputs() const noexcept override; + + std::int32_t configurePlugin( + DynamicPluginTensorDesc const * in, std::int32_t num_inputs, + DynamicPluginTensorDesc const * out, std::int32_t num_outputs) noexcept override; + + bool supportsFormatCombination( + std::int32_t pos, DynamicPluginTensorDesc const * in_out, std::int32_t num_inputs, + std::int32_t num_outputs) noexcept override; + + std::int32_t getOutputDataTypes( + DataType * output_types, std::int32_t num_outputs, DataType const * input_types, + std::int32_t num_inputs) const noexcept override; + + std::int32_t getOutputShapes( + DimsExprs const * inputs, std::int32_t num_inputs, DimsExprs const * shape_inputs, + std::int32_t num_shape_inputs, DimsExprs * outputs, std::int32_t num_outputs, + IExprBuilder & expr_builder) noexcept override; + + // IPluginV3OneRuntime Methods + + std::int32_t enqueue( + PluginTensorDesc const * input_desc, PluginTensorDesc const * output_desc, + void const * const * inputs, void * const * outputs, void * workspace, + cudaStream_t stream) noexcept override; + + std::int32_t onShapeChange( + PluginTensorDesc const * in, std::int32_t num_inputs, PluginTensorDesc const * out, + std::int32_t num_outputs) noexcept override; + + IPluginV3 * attachToContext(IPluginResourceContext * context) noexcept override; + + PluginFieldCollection const * getFieldsToSerialize() noexcept override; + + std::size_t getWorkspaceSize( + DynamicPluginTensorDesc const * inputs, std::int32_t num_inputs, + DynamicPluginTensorDesc const * outputs, std::int32_t num_outputs) const noexcept override; + +private: + static constexpr std::int32_t INOUT_IN_FEATURES_INDEX = 0; + static constexpr std::int32_t INOUT_FILTERS_INDEX = 1; + static constexpr std::int32_t INOUT_PAIR_FWD_INDEX = 2; + static constexpr std::int32_t INOUT_PAIR_MASK_FWD_SPLITS_INDEX = 3; + static constexpr std::int32_t INOUT_MASK_ARGSORT_FWD_SPLITS_INDEX = 4; + static constexpr std::int32_t INOUT_OUT_FEATURES_INDEX = 5; + + void initFieldsToSerialize(); + + std::string layer_name_; + ImplicitGemmParameters params_; + 
std::tuple arch_; + std::vector data_to_serialize_; + nvinfer1::PluginFieldCollection fc_to_serialize_; + + std::unique_ptr tunner_ptr_{}; +}; + +} // namespace plugin +} // namespace nvinfer1 + +#endif // AUTOWARE__TENSORRT_PLUGINS__IMPLICIT_GEMM_PLUGIN_HPP_ diff --git a/perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/implicit_gemm_plugin_creator.hpp b/perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/implicit_gemm_plugin_creator.hpp new file mode 100644 index 0000000000000..c6450b2e9b1ab --- /dev/null +++ b/perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/implicit_gemm_plugin_creator.hpp @@ -0,0 +1,59 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef AUTOWARE__TENSORRT_PLUGINS__IMPLICIT_GEMM_PLUGIN_CREATOR_HPP_ +#define AUTOWARE__TENSORRT_PLUGINS__IMPLICIT_GEMM_PLUGIN_CREATOR_HPP_ + +#include "autoware/tensorrt_plugins/implicit_gemm_plugin.hpp" + +#include + +#include + +namespace nvinfer1 +{ +namespace plugin +{ + +// Plugin factory class. +class ImplicitGemmPluginCreator : public nvinfer1::IPluginCreatorV3One +{ +public: + ImplicitGemmPluginCreator(); + + ~ImplicitGemmPluginCreator() override = default; + + char const * getPluginNamespace() const noexcept override + { + return kIMPLICIT_GEMM_PLUGIN_NAMESPACE; + } + + char const * getPluginName() const noexcept override { return kIMPLICIT_GEMM_PLUGIN_NAME; } + + char const * getPluginVersion() const noexcept override { return kIMPLICIT_GEMM_PLUGIN_VERSION; } + + nvinfer1::PluginFieldCollection const * getFieldNames() noexcept override; + + IPluginV3 * createPlugin( + char const * name, PluginFieldCollection const * fc, TensorRTPhase phase) noexcept override; + +private: + nvinfer1::PluginFieldCollection fc_; + std::vector plugin_attributes_; +}; + +} // namespace plugin +} // namespace nvinfer1 + +#endif // AUTOWARE__TENSORRT_PLUGINS__IMPLICIT_GEMM_PLUGIN_CREATOR_HPP_ diff --git a/perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/plugin_registration.hpp b/perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/plugin_registration.hpp new file mode 100644 index 0000000000000..4ce6e28a205aa --- /dev/null +++ b/perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/plugin_registration.hpp @@ -0,0 +1,29 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
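Both plugin classes above follow the same IPluginV3 serialization idiom: the parameter struct is flattened once into a vector of PluginFields, and getFieldsToSerialize() returns the cached collection. A representative sketch of that idiom with a toy parameter struct (assumes a TensorRT 10 header providing PluginFieldType::kINT64; the patch's real implementations live in the corresponding .cpp files):

#include <NvInferRuntime.h>

#include <cstdint>
#include <vector>

// Toy parameter struct standing in for e.g. ImplicitGemmParameters.
struct ToyParams
{
  float act_alpha;
  std::int64_t is_subm;
};

class ToyPluginState
{
public:
  explicit ToyPluginState(const ToyParams & params) : params_(params)
  {
    // One PluginField per scalar; the data pointers reference params_, which
    // lives as long as this object, so the collection stays valid.
    fields_.emplace_back("act_alpha", &params_.act_alpha, nvinfer1::PluginFieldType::kFLOAT32, 1);
    fields_.emplace_back("is_subm", &params_.is_subm, nvinfer1::PluginFieldType::kINT64, 1);
    fc_.nbFields = static_cast<std::int32_t>(fields_.size());
    fc_.fields = fields_.data();
  }

  nvinfer1::PluginFieldCollection const * getFieldsToSerialize() const { return &fc_; }

private:
  ToyParams params_;
  std::vector<nvinfer1::PluginField> fields_;
  nvinfer1::PluginFieldCollection fc_{};
};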
+ +#ifndef AUTOWARE__TENSORRT_PLUGINS__PLUGIN_REGISTRATION_HPP_ +#define AUTOWARE__TENSORRT_PLUGINS__PLUGIN_REGISTRATION_HPP_ + +#include + +#include + +// These are the functions that TensorRT library will call at the runtime. + +extern "C" void setLoggerFinder(nvinfer1::ILoggerFinder * finder); + +extern "C" nvinfer1::IPluginCreatorInterface * const * getPluginCreators( + std::int32_t & num_creators); + +#endif // AUTOWARE__TENSORRT_PLUGINS__PLUGIN_REGISTRATION_HPP_ diff --git a/perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/plugin_utils.hpp b/perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/plugin_utils.hpp new file mode 100644 index 0000000000000..c368f796b714d --- /dev/null +++ b/perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/plugin_utils.hpp @@ -0,0 +1,35 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef AUTOWARE__TENSORRT_PLUGINS__PLUGIN_UTILS_HPP_ +#define AUTOWARE__TENSORRT_PLUGINS__PLUGIN_UTILS_HPP_ + +#include + +#include +#include + +void caughtError(std::exception const & e); + +void logDebug(char const * msg); + +void logInfo(char const * msg); + +#define PLUGIN_ASSERT(val) reportAssertion((val), #val, __FILE__, __LINE__) +void reportAssertion(bool success, char const * msg, char const * file, std::int32_t line); + +#define PLUGIN_VALIDATE(val) reportValidation((val), #val, __FILE__, __LINE__) +void reportValidation(bool success, char const * msg, char const * file, std::int32_t line); + +#endif // AUTOWARE__TENSORRT_PLUGINS__PLUGIN_UTILS_HPP_ diff --git a/perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/quick_cumsum_cuda_plugin.hpp b/perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/quick_cumsum_cuda_plugin.hpp new file mode 100644 index 0000000000000..e488b677ce90f --- /dev/null +++ b/perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/quick_cumsum_cuda_plugin.hpp @@ -0,0 +1,123 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
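The two extern "C" symbols declared in plugin_registration.hpp above are the entry points TensorRT resolves when it loads this plugin library at runtime: getPluginCreators() hands back the creator table, and setLoggerFinder() wires the library to the caller's logger. A loading-side sketch, assuming a TensorRT version whose IPluginRegistry exposes loadLibrary() (this package instead forwards plugins_path_ through TrtCommon in bevfusion_trt.cpp):

#include <NvInferRuntime.h>

// Make the creators above visible to TensorRT before building or
// deserializing an engine that references these custom layers.
void load_bevfusion_plugins(char const * plugins_path)
{
  // loadLibrary() opens the shared object, calls its getPluginCreators()
  // entry point, and registers every creator that it returns.
  getPluginRegistry()->loadLibrary(plugins_path);
}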
+ +#ifndef AUTOWARE__TENSORRT_PLUGINS__QUICK_CUMSUM_CUDA_PLUGIN_HPP_ +#define AUTOWARE__TENSORRT_PLUGINS__QUICK_CUMSUM_CUDA_PLUGIN_HPP_ + +#include +#include +#include + +#include +#include +#include +#include +#include + +constexpr char const * const kQUICK_CUMSUM_CUDA_PLUGIN_NAME{"QuickCumsumCuda"}; +constexpr char const * const kQUICK_CUMSUM_CUDA_PLUGIN_VERSION{"1"}; +constexpr char const * const kQUICK_CUMSUM_CUDA_PLUGIN_NAMESPACE{""}; + +namespace nvinfer1 +{ +namespace plugin +{ + +struct QuickCumsumCudaParameters +{ + std::int32_t batch_size; + std::int32_t dimension; + std::int32_t height; + std::int32_t width; +}; + +class QuickCumsumCudaPlugin : public IPluginV3, + public IPluginV3OneCore, + public IPluginV3OneBuild, + public IPluginV3OneRuntime +{ +public: + QuickCumsumCudaPlugin(const std::string & name, QuickCumsumCudaParameters const & params); + + ~QuickCumsumCudaPlugin() override = default; + + // IPluginV3 Methods + + IPluginCapability * getCapabilityInterface(PluginCapabilityType type) noexcept override; + + IPluginV3 * clone() noexcept override; + + // IPluginV3OneCore Methods + + char const * getPluginName() const noexcept override; + + char const * getPluginVersion() const noexcept override; + + char const * getPluginNamespace() const noexcept override; + + // IPluginV3OneBuild Methods + + std::int32_t getNbOutputs() const noexcept override; + + std::int32_t configurePlugin( + DynamicPluginTensorDesc const * in, std::int32_t num_inputs, + DynamicPluginTensorDesc const * out, std::int32_t num_outputs) noexcept override; + + bool supportsFormatCombination( + int32_t pos, DynamicPluginTensorDesc const * in_out, std::int32_t num_inputs, + std::int32_t num_outputs) noexcept override; + + std::int32_t getOutputDataTypes( + DataType * output_types, std::int32_t num_outputs, DataType const * input_types, + std::int32_t num_inputs) const noexcept override; + + std::int32_t getOutputShapes( + DimsExprs const * inputs, std::int32_t num_inputs, DimsExprs const * shape_inputs, + std::int32_t num_shape_inputs, DimsExprs * outputs, std::int32_t num_outputs, + IExprBuilder & expr_builder) noexcept override; + + // IPluginV3OneRuntime Methods + + std::int32_t enqueue( + PluginTensorDesc const * input_desc, PluginTensorDesc const * output_desc, + void const * const * inputs, void * const * outputs, void * workspace, + cudaStream_t stream) noexcept override; + + std::int32_t onShapeChange( + PluginTensorDesc const * in, std::int32_t num_inputs, PluginTensorDesc const * out, + std::int32_t num_outputs) noexcept override; + + IPluginV3 * attachToContext(IPluginResourceContext * context) noexcept override; + + PluginFieldCollection const * getFieldsToSerialize() noexcept override; + + std::size_t getWorkspaceSize( + DynamicPluginTensorDesc const * inputs, std::int32_t num_inputs, + DynamicPluginTensorDesc const * outputs, std::int32_t num_outputs) const noexcept override; + +private: + // TensorRT plugin parameters. 
+ std::string layer_name_; + QuickCumsumCudaParameters params_; + + void initFieldsToSerialize(); + + std::vector data_to_serialize_; + nvinfer1::PluginFieldCollection fc_to_serialize_; +}; + +} // namespace plugin +} // namespace nvinfer1 + +#endif // AUTOWARE__TENSORRT_PLUGINS__QUICK_CUMSUM_CUDA_PLUGIN_HPP_ diff --git a/perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/quick_cumsum_cuda_plugin_creator.hpp b/perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/quick_cumsum_cuda_plugin_creator.hpp new file mode 100644 index 0000000000000..c7ddc4e6f9f0d --- /dev/null +++ b/perception/autoware_lidar_bevfusion/include/autoware/tensorrt_plugins/quick_cumsum_cuda_plugin_creator.hpp @@ -0,0 +1,62 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef AUTOWARE__TENSORRT_PLUGINS__QUICK_CUMSUM_CUDA_PLUGIN_CREATOR_HPP_ +#define AUTOWARE__TENSORRT_PLUGINS__QUICK_CUMSUM_CUDA_PLUGIN_CREATOR_HPP_ + +#include "autoware/tensorrt_plugins/quick_cumsum_cuda_plugin.hpp" + +#include + +#include + +namespace nvinfer1 +{ +namespace plugin +{ + +// Plugin factory class. +class QuickCumsumCudaPluginCreator : public nvinfer1::IPluginCreatorV3One +{ +public: + QuickCumsumCudaPluginCreator(); + + ~QuickCumsumCudaPluginCreator() override = default; + + char const * getPluginNamespace() const noexcept override + { + return kQUICK_CUMSUM_CUDA_PLUGIN_NAMESPACE; + } + + char const * getPluginName() const noexcept override { return kQUICK_CUMSUM_CUDA_PLUGIN_NAME; } + + char const * getPluginVersion() const noexcept override + { + return kQUICK_CUMSUM_CUDA_PLUGIN_VERSION; + } + + nvinfer1::PluginFieldCollection const * getFieldNames() noexcept override; + + IPluginV3 * createPlugin( + char const * name, PluginFieldCollection const * fc, TensorRTPhase phase) noexcept override; + +private: + nvinfer1::PluginFieldCollection fc_; + std::vector plugin_attributes_; +}; + +} // namespace plugin +} // namespace nvinfer1 + +#endif // AUTOWARE__TENSORRT_PLUGINS__QUICK_CUMSUM_CUDA_PLUGIN_CREATOR_HPP_ diff --git a/perception/autoware_lidar_bevfusion/launch/lidar_bevfusion.launch.xml b/perception/autoware_lidar_bevfusion/launch/lidar_bevfusion.launch.xml new file mode 100644 index 0000000000000..a2cecb2f72f62 --- /dev/null +++ b/perception/autoware_lidar_bevfusion/launch/lidar_bevfusion.launch.xml @@ -0,0 +1,124 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/perception/autoware_lidar_bevfusion/lib/bevfusion_trt.cpp b/perception/autoware_lidar_bevfusion/lib/bevfusion_trt.cpp new file mode 100644 index 0000000000000..a7a00771d6b71 --- /dev/null +++ b/perception/autoware_lidar_bevfusion/lib/bevfusion_trt.cpp @@ -0,0 +1,532 @@ +// Copyright 2025 TIER IV, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "autoware/lidar_bevfusion/bevfusion_trt.hpp" + +#include "autoware/lidar_bevfusion/bevfusion_config.hpp" +#include "autoware/lidar_bevfusion/preprocess/point_type.hpp" +#include "autoware/lidar_bevfusion/preprocess/precomputed_features.hpp" +#include "autoware/lidar_bevfusion/preprocess/preprocess_kernel.hpp" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace autoware::lidar_bevfusion +{ + +BEVFusionTRT::BEVFusionTRT( + const tensorrt_common::TrtCommonConfig & trt_config, + const DensificationParam & densification_param, const BEVFusionConfig & config) +: config_(config) +{ + vg_ptr_ = std::make_unique(densification_param, config_, stream_); + + stop_watch_ptr_ = + std::make_unique>(); + stop_watch_ptr_->tic("processing/inner"); + + initPtr(); + initTrt(trt_config); + + CHECK_CUDA_ERROR(cudaStreamCreate(&stream_)); + + camera_streams_.resize(config_.num_cameras_); + for (std::int64_t i = 0; i < config_.num_cameras_; i++) { + CHECK_CUDA_ERROR(cudaStreamCreate(&camera_streams_[i])); + } +} + +BEVFusionTRT::~BEVFusionTRT() +{ + if (stream_) { + cudaStreamSynchronize(stream_); + cudaStreamDestroy(stream_); + } + + for (std::int64_t i = 0; i < config_.num_cameras_; i++) { + if (camera_streams_[i]) { + cudaStreamSynchronize(camera_streams_[i]); + cudaStreamDestroy(camera_streams_[i]); + } + } +} + +void BEVFusionTRT::initPtr() +{ + // point cloud to voxels + voxel_features_size_ = + config_.max_num_voxels_ * config_.max_points_per_voxel_ * config_.num_point_feature_size_; + voxel_coords_size_ = 3 * config_.max_num_voxels_; + + // output of TRT -- input of post-process + bbox_pred_size_ = config_.num_proposals_ * config_.num_box_values_; + label_pred_output_d_ = autoware::cuda_utils::make_unique(config_.num_proposals_); + bbox_pred_output_d_ = autoware::cuda_utils::make_unique(bbox_pred_size_); + score_output_d_ = autoware::cuda_utils::make_unique(config_.num_proposals_); + + // lidar branch + voxel_features_d_ = autoware::cuda_utils::make_unique(voxel_features_size_); + voxel_coords_d_ = autoware::cuda_utils::make_unique(voxel_coords_size_); + num_points_per_voxel_d_ = + autoware::cuda_utils::make_unique(config_.max_num_voxels_); + points_d_ = autoware::cuda_utils::make_unique( + config_.cloud_capacity_ * config_.num_point_feature_size_); + + // pre computed tensors + if (config_.sensor_fusion_) { + lidar2image_d_ = autoware::cuda_utils::make_unique(config_.num_cameras_ * 4 * 4); + std::int64_t num_geom_feats = config_.num_cameras_ * config_.features_height_ * + config_.features_width_ * config_.num_depth_features_; + geom_feats_d_ = autoware::cuda_utils::make_unique(4 * num_geom_feats); + kept_d_ = autoware::cuda_utils::make_unique(num_geom_feats); + ranks_d_ = autoware::cuda_utils::make_unique(num_geom_feats); + indices_d_ = autoware::cuda_utils::make_unique(num_geom_feats); + + // image branch + roi_tensor_d_ = 
autoware::cuda_utils::make_unique( + config_.num_cameras_ * config_.roi_height_ * config_.roi_width_ * 3); + for (std::int64_t camera_id = 0; camera_id < config_.num_cameras_; camera_id++) { + image_buffers_d_.emplace_back(autoware::cuda_utils::make_unique( + config_.raw_image_height_ * config_.raw_image_width_ * 3)); + } + camera_masks_d_ = autoware::cuda_utils::make_unique(config_.num_cameras_); + } + + pre_ptr_ = std::make_unique(config_, stream_, true); + post_ptr_ = std::make_unique(config_, stream_); +} + +void BEVFusionTRT::initTrt(const tensorrt_common::TrtCommonConfig & trt_config) +{ + std::vector network_io; + + // Lidar branch + network_io.emplace_back( + "voxels", + nvinfer1::Dims{3, {-1, config_.max_points_per_voxel_, config_.num_point_feature_size_}}); + network_io.emplace_back("num_points_per_voxel", nvinfer1::Dims{1, {-1}}); + network_io.emplace_back("coors", nvinfer1::Dims{2, {-1, 3}}); + + // Camera branch + if (config_.sensor_fusion_) { + network_io.emplace_back("points", nvinfer1::Dims{2, {-1, 5}}); + network_io.emplace_back("camera_mask", nvinfer1::Dims{1, {-1}}); + network_io.emplace_back( + "imgs", nvinfer1::Dims{4, {-1, 3, config_.roi_height_, config_.roi_width_}}); + network_io.emplace_back("lidar2image", nvinfer1::Dims{3, {-1, 4, 4}}); // 4x4 matrix + + network_io.emplace_back("geom_feats", nvinfer1::Dims{2, {-1, 4}}); + network_io.emplace_back("kept", nvinfer1::Dims{1, {-1}}); + network_io.emplace_back("ranks", nvinfer1::Dims{1, {-1}}); + network_io.emplace_back("indices", nvinfer1::Dims{1, {-1}}); + } + + // Outputs + network_io.emplace_back( + "bbox_pred", nvinfer1::Dims{2, {config_.num_box_values_, config_.num_proposals_}}); + network_io.emplace_back("score", nvinfer1::Dims{1, {config_.num_proposals_}}); + network_io.emplace_back("label_pred", nvinfer1::Dims{1, {config_.num_proposals_}}); + + std::vector profile_dims; + + // Lidar branch + profile_dims.emplace_back( + "voxels", + nvinfer1::Dims{ + 3, {config_.voxels_num_[0], config_.max_points_per_voxel_, config_.num_point_feature_size_}}, + nvinfer1::Dims{ + 3, {config_.voxels_num_[1], config_.max_points_per_voxel_, config_.num_point_feature_size_}}, + nvinfer1::Dims{ + 3, {config_.voxels_num_[2], config_.max_points_per_voxel_, config_.num_point_feature_size_}}); + + profile_dims.emplace_back( + "num_points_per_voxel", nvinfer1::Dims{1, {config_.voxels_num_[0]}}, + nvinfer1::Dims{1, {config_.voxels_num_[1]}}, nvinfer1::Dims{1, {config_.voxels_num_[2]}}); + + profile_dims.emplace_back( + "coors", nvinfer1::Dims{2, {config_.voxels_num_[0], 3}}, + nvinfer1::Dims{2, {config_.voxels_num_[1], 3}}, nvinfer1::Dims{2, {config_.voxels_num_[2], 3}}); + + // Camera branch + if (config_.sensor_fusion_) { + profile_dims.emplace_back( + "points", nvinfer1::Dims{2, {config_.voxels_num_[0], 5}}, + nvinfer1::Dims{2, {config_.voxels_num_[1], 5}}, + nvinfer1::Dims{2, {config_.voxels_num_[2], 5}}); + + profile_dims.emplace_back( + "camera_mask", nvinfer1::Dims{1, {1}}, nvinfer1::Dims{1, {config_.num_cameras_}}, + nvinfer1::Dims{1, {config_.num_cameras_}}); + + profile_dims.emplace_back( + "imgs", nvinfer1::Dims{4, {1, 3, config_.roi_height_, config_.roi_width_}}, + nvinfer1::Dims{4, {config_.num_cameras_, 3, config_.roi_height_, config_.roi_width_}}, + nvinfer1::Dims{4, {config_.num_cameras_, 3, config_.roi_height_, config_.roi_width_}}); + profile_dims.emplace_back( + "lidar2image", nvinfer1::Dims{3, {1, 4, 4}}, nvinfer1::Dims{3, {config_.num_cameras_, 4, 4}}, + nvinfer1::Dims{3, {config_.num_cameras_, 4, 4}}); + + const 
std::int64_t num_geom_feats = config_.num_cameras_ * config_.features_height_ * + config_.features_width_ * config_.num_depth_features_; + + profile_dims.emplace_back( + "geom_feats", nvinfer1::Dims{2, {0, 4}}, nvinfer1::Dims{2, {num_geom_feats, 4}}, + nvinfer1::Dims{2, {num_geom_feats, 4}}); + + profile_dims.emplace_back( + "kept", nvinfer1::Dims{1, {0}}, nvinfer1::Dims{1, {num_geom_feats}}, + nvinfer1::Dims{1, {num_geom_feats}}); + + profile_dims.emplace_back( + "ranks", nvinfer1::Dims{1, {0}}, nvinfer1::Dims{1, {num_geom_feats}}, + nvinfer1::Dims{1, {num_geom_feats}}); + + profile_dims.emplace_back( + "indices", nvinfer1::Dims{1, {0}}, nvinfer1::Dims{1, {num_geom_feats}}, + nvinfer1::Dims{1, {num_geom_feats}}); + } + + auto network_io_ptr = + std::make_unique>(network_io); + auto profile_dims_ptr = + std::make_unique>(profile_dims); + + network_trt_ptr_ = std::make_unique( + trt_config, std::make_shared(), + std::vector{config_.plugins_path_}); + + if (!network_trt_ptr_->setup(std::move(profile_dims_ptr), std::move(network_io_ptr))) { + throw std::runtime_error("Failed to setup TRT engine." + config_.plugins_path_); + } + + if (config_.sensor_fusion_) { + /* nvinfer1::Dims input_imgs_shape; + input_imgs_shape.nbDims = 4; + input_imgs_shape.d[0] = config_.num_cameras_; + input_imgs_shape.d[1] = 3; + input_imgs_shape.d[2] = config_.roi_height_; + input_imgs_shape.d[3] = config_.roi_width_; + + nvinfer1::Dims input_lidar2image_shape; + input_lidar2image_shape.nbDims = 3; + input_lidar2image_shape.d[0] = config_.num_cameras_; + input_lidar2image_shape.d[1] = 4; + input_lidar2image_shape.d[2] = 4; + + nvinfer1::Dims input_geom_feats_shape; + input_geom_feats_shape.nbDims = 2; + input_geom_feats_shape.d[0] = num_ranks_; + input_geom_feats_shape.d[1] = 4; + + nvinfer1::Dims input_kept_shape; + input_kept_shape.nbDims = 1; + input_kept_shape.d[0] = num_kept_; + + nvinfer1::Dims input_ranks_shape; + input_ranks_shape.nbDims = 1; + input_ranks_shape.d[0] = num_ranks_; + + nvinfer1::Dims input_indices_shape; + input_indices_shape.nbDims = 1; + input_indices_shape.d[0] = num_indices_; */ + + std::vector lidar2image_host(config_.num_cameras_ * 4 * 4); + cudaMemcpy( + lidar2image_host.data(), lidar2image_d_.get(), config_.num_cameras_ * 4 * 4 * sizeof(float), + cudaMemcpyDeviceToHost); + + network_trt_ptr_->setInputShape("camera_mask", nvinfer1::Dims{1, {config_.num_cameras_}}); + network_trt_ptr_->setInputShape( + "imgs", + nvinfer1::Dims{4, {config_.num_cameras_, 3, config_.roi_height_, config_.roi_width_}}); + network_trt_ptr_->setInputShape("lidar2image", nvinfer1::Dims{3, {config_.num_cameras_, 4, 4}}); + network_trt_ptr_->setInputShape("geom_feats", nvinfer1::Dims{2, {num_ranks_, 4}}); + network_trt_ptr_->setInputShape("kept", nvinfer1::Dims{1, {num_kept_}}); + network_trt_ptr_->setInputShape("ranks", nvinfer1::Dims{1, {num_ranks_}}); + network_trt_ptr_->setInputShape("indices", nvinfer1::Dims{1, {num_indices_}}); + } + + network_trt_ptr_->setTensorAddress("voxels", voxel_features_d_.get()); + network_trt_ptr_->setTensorAddress("num_points_per_voxel", num_points_per_voxel_d_.get()); + network_trt_ptr_->setTensorAddress("coors", voxel_coords_d_.get()); + + if (config_.sensor_fusion_) { + network_trt_ptr_->setTensorAddress("points", points_d_.get()); + network_trt_ptr_->setTensorAddress("camera_mask", camera_masks_d_.get()); + network_trt_ptr_->setTensorAddress("imgs", roi_tensor_d_.get()); + network_trt_ptr_->setTensorAddress("lidar2image", lidar2image_d_.get()); + 
network_trt_ptr_->setTensorAddress("geom_feats", geom_feats_d_.get()); + network_trt_ptr_->setTensorAddress("kept", kept_d_.get()); + network_trt_ptr_->setTensorAddress("ranks", ranks_d_.get()); + network_trt_ptr_->setTensorAddress("indices", indices_d_.get()); + } + + network_trt_ptr_->setTensorAddress("label_pred", label_pred_output_d_.get()); + network_trt_ptr_->setTensorAddress("bbox_pred", bbox_pred_output_d_.get()); + network_trt_ptr_->setTensorAddress("score", score_output_d_.get()); +} + +bool BEVFusionTRT::detect( + const sensor_msgs::msg::PointCloud2::ConstSharedPtr & pc_msg, + const std::vector & image_msgs, + const std::vector & camera_masks, const tf2_ros::Buffer & tf_buffer, + std::vector & det_boxes3d, std::unordered_map & proc_timing) +{ + stop_watch_ptr_->toc("processing/inner", true); + if (!preProcess(pc_msg, image_msgs, camera_masks, tf_buffer)) { + RCLCPP_WARN_STREAM( + rclcpp::get_logger("lidar_bevfusion"), "Pre-process failed. Skipping detection."); + return false; + } + proc_timing.emplace( + "debug/processing_time/preprocess_ms", stop_watch_ptr_->toc("processing/inner", true)); + + if (!inference()) { + RCLCPP_WARN_STREAM( + rclcpp::get_logger("lidar_bevfusion"), "Inference failed. Skipping detection."); + return false; + } + proc_timing.emplace( + "debug/processing_time/inference_ms", stop_watch_ptr_->toc("processing/inner", true)); + + if (!postProcess(det_boxes3d)) { + RCLCPP_WARN_STREAM( + rclcpp::get_logger("lidar_bevfusion"), "Post-process failed. Skipping detection"); + return false; + } + proc_timing.emplace( + "debug/processing_time/postprocess_ms", stop_watch_ptr_->toc("processing/inner", true)); + + return true; +} + +void BEVFusionTRT::setIntrinsicsExtrinsics( + std::vector & camera_info_vector, + std::vector & lidar2camera_vector) +{ + roi_start_y_vector_.clear(); + std::vector img_aug_matrices; + + for (std::int64_t i = 0; i < config_.num_cameras_; i++) { + float fx = camera_info_vector[i].p[0]; + float fy = camera_info_vector[i].p[5]; + float cx = camera_info_vector[i].p[2]; + float cy = camera_info_vector[i].p[6]; + + Matrix4fRowM camera2lidar_matrix = lidar2camera_vector[i].inverse(); + float r31 = camera2lidar_matrix(2, 0); + float r32 = camera2lidar_matrix(2, 1); + float r33 = camera2lidar_matrix(2, 2); + + float yl = cy + cx * (fy / fx) * (r31 / r32) - fy * (r33 / r32); + float yr = + cy + (cx - config_.raw_image_width_ + 1) * (fy / fx) * (r31 / r32) - fy * (r33 / r32); + float yh = std::max(0.0f, std::min(yr, yl)); + float yh_resized = yh * config_.img_aug_scale_y_; + int crop_h = static_cast( + std::min(yh_resized, static_cast(config_.resized_height_ - config_.roi_height_))); + + Matrix4fRowM img_aug_matrix = Matrix4fRowM::Identity(); + img_aug_matrix(0, 0) = config_.img_aug_scale_x_; + img_aug_matrix(1, 1) = config_.img_aug_scale_y_; + img_aug_matrix(1, 3) = -static_cast(crop_h); + + img_aug_matrices.push_back(img_aug_matrix); + roi_start_y_vector_.push_back(crop_h); + } + + auto [lidar2images_flattened, geom_feats, kept, ranks, indices] = + precompute_features(lidar2camera_vector, img_aug_matrices, camera_info_vector, config_); + + assert(static_cast(lidar2images_flattened.size()) == config_.num_cameras_ * 4 * 4); + + assert( + static_cast(geom_feats.size()) <= + config_.num_cameras_ * 4 * config_.features_height_ * config_.features_width_ * + config_.num_depth_features_); + assert( + static_cast(kept.size()) == config_.num_cameras_ * config_.features_height_ * + config_.features_width_ * + config_.num_depth_features_); + assert( + 
static_cast(ranks.size()) <= config_.num_cameras_ * config_.features_height_ * + config_.features_width_ * + config_.num_depth_features_); + assert( + static_cast(indices.size()) <= config_.num_cameras_ * config_.features_height_ * + config_.features_width_ * + config_.num_depth_features_); + + num_geom_feats_ = static_cast(geom_feats.size()); + num_kept_ = static_cast(kept.size()); + num_ranks_ = static_cast(ranks.size()); + num_indices_ = static_cast(indices.size()); + + assert(num_geom_feats_ == 4 * num_ranks_); + assert(num_ranks_ == num_indices_); + + cudaMemcpy( + lidar2image_d_.get(), lidar2images_flattened.data(), + config_.num_cameras_ * 4 * 4 * sizeof(float), cudaMemcpyHostToDevice); + cudaMemcpy( + geom_feats_d_.get(), geom_feats.data(), num_geom_feats_ * sizeof(std::int32_t), + cudaMemcpyHostToDevice); + cudaMemcpy(kept_d_.get(), kept.data(), num_kept_ * sizeof(std::uint8_t), cudaMemcpyHostToDevice); + cudaMemcpy( + ranks_d_.get(), ranks.data(), num_ranks_ * sizeof(std::int64_t), cudaMemcpyHostToDevice); + cudaMemcpy( + indices_d_.get(), indices.data(), num_indices_ * sizeof(std::int64_t), cudaMemcpyHostToDevice); +} + +bool BEVFusionTRT::preProcess( + const sensor_msgs::msg::PointCloud2::ConstSharedPtr & pc_msg, + const std::vector & image_msgs, + const std::vector & camera_masks, const tf2_ros::Buffer & tf_buffer) +{ + using autoware::cuda_utils::clear_async; + + if (!is_data_layout_compatible_with_point_xyzirc(*pc_msg)) { + RCLCPP_WARN_STREAM( + rclcpp::get_logger("lidar_bevfusion"), "Invalid point type. Skipping detection."); + return false; + } + + if (!vg_ptr_->enqueuePointCloud(*pc_msg, tf_buffer)) { + return false; + } + + // TODO(knzo25): these should be able to be removed as they are filled by TensorRT + clear_async(label_pred_output_d_.get(), config_.num_proposals_, stream_); + clear_async(bbox_pred_output_d_.get(), bbox_pred_size_, stream_); + clear_async(score_output_d_.get(), config_.num_proposals_, stream_); + + clear_async(voxel_features_d_.get(), voxel_features_size_, stream_); + clear_async(voxel_coords_d_.get(), voxel_coords_size_, stream_); + clear_async(num_points_per_voxel_d_.get(), config_.max_num_voxels_, stream_); + clear_async(points_d_.get(), config_.cloud_capacity_ * config_.num_point_feature_size_, stream_); + CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); + + // TODO(knzo25): move this to each image callback + if (config_.sensor_fusion_) { + int start_x = + std::max(0, static_cast(config_.resized_width_) - static_cast(config_.roi_width_)) / + 2; + + for (std::int64_t camera_id = 0; camera_id < config_.num_cameras_; camera_id++) { + int start_y = roi_start_y_vector_[camera_id]; + cudaMemcpyAsync( + image_buffers_d_[camera_id].get(), image_msgs[camera_id]->data.data(), + config_.raw_image_height_ * config_.raw_image_width_ * 3, cudaMemcpyHostToDevice, stream_); + + pre_ptr_->resize_and_extract_roi_launch( + image_buffers_d_[camera_id].get(), + &roi_tensor_d_[camera_id * config_.roi_height_ * config_.roi_width_ * 3], + config_.raw_image_height_, config_.raw_image_width_, config_.resized_height_, + config_.resized_width_, config_.roi_height_, config_.roi_width_, start_y, start_x, + camera_streams_[camera_id]); + } + + cudaMemcpyAsync( + camera_masks_d_.get(), camera_masks.data(), config_.num_cameras_ * sizeof(float), + cudaMemcpyHostToDevice, stream_); + } + + const auto num_points = vg_ptr_->generateSweepPoints(points_d_); + + CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); + + auto num_voxels = static_cast(pre_ptr_->generateVoxels( + 
points_d_.get(), num_points, voxel_features_d_.get(), voxel_coords_d_.get(), + num_points_per_voxel_d_.get())); + + CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); + + if (num_voxels < config_.min_num_voxels_) { + RCLCPP_WARN_STREAM( + rclcpp::get_logger("lidar_bevfusion"), + "Too few voxels (" << num_voxels << ") for the actual optimization profile (" + << config_.min_num_voxels_ << ")"); + return false; + } + if (num_voxels > config_.max_num_voxels_) { + RCLCPP_ERROR_STREAM( + rclcpp::get_logger("lidar_bevfusion"), + "Actual number of voxels (" << num_voxels + << ") is over the limit for the actual optimization profile (" + << config_.max_num_voxels_ << "). Clipping to the limit."); + num_voxels = config_.max_num_voxels_; + } + + network_trt_ptr_->setInputShape( + "voxels", nvinfer1::Dims{ + 3, {num_voxels, config_.max_points_per_voxel_, config_.num_point_feature_size_}}); + network_trt_ptr_->setInputShape("num_points_per_voxel", nvinfer1::Dims{1, {num_voxels}}); + network_trt_ptr_->setInputShape("coors", nvinfer1::Dims{2, {num_voxels, 3}}); + + if (config_.sensor_fusion_) { + network_trt_ptr_->setInputShape( + "points", nvinfer1::Dims{2, {static_cast(num_points), 5}}); + network_trt_ptr_->setInputShape("geom_feats", nvinfer1::Dims{2, {num_ranks_, 4}}); + network_trt_ptr_->setInputShape("kept", nvinfer1::Dims{1, {num_kept_}}); + network_trt_ptr_->setInputShape("ranks", nvinfer1::Dims{1, {num_ranks_}}); + network_trt_ptr_->setInputShape("indices", nvinfer1::Dims{1, {num_indices_}}); + } + + for (std::int64_t i = 0; i < config_.num_cameras_; i++) { + cudaStreamSynchronize(camera_streams_[i]); + } + + return true; +} + +bool BEVFusionTRT::inference() +{ + auto status = network_trt_ptr_->enqueueV3(stream_); + CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); + + if (!status) { + RCLCPP_WARN_STREAM( + rclcpp::get_logger("lidar_bevfusion"), "Fail to enqueue and skip to detect."); + return false; + } + + return true; +} + +bool BEVFusionTRT::postProcess(std::vector & det_boxes3d) +{ + CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); + + CHECK_CUDA_ERROR(post_ptr_->generateDetectedBoxes3D_launch( + label_pred_output_d_.get(), bbox_pred_output_d_.get(), score_output_d_.get(), det_boxes3d, + stream_)); + CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); + return true; +} + +} // namespace autoware::lidar_bevfusion diff --git a/perception/autoware_lidar_bevfusion/lib/detection_class_remapper.cpp b/perception/autoware_lidar_bevfusion/lib/detection_class_remapper.cpp new file mode 100644 index 0000000000000..e16aa0f0177b0 --- /dev/null +++ b/perception/autoware_lidar_bevfusion/lib/detection_class_remapper.cpp @@ -0,0 +1,74 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
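DetectionClassRemapper::setParameters below receives its three NxN matrices flattened row-major, exactly as they come from the ROS parameter file: entry (i, j) states whether an object currently labeled i may be remapped to label j, and between which bird's-eye-view areas the remap applies. A sketch of the corresponding parameter layout for a hypothetical three-class setup (element types follow the setParameters signature: int64 flags, double areas):

#include <cstdint>
#include <vector>

// Hypothetical 3-class example (CAR=0, TRUCK=1, BUS=2): allow CAR -> TRUCK
// when the bird's-eye-view area of the box lies in [8.0, 40.0] m^2.
// Row = current label, column = candidate label, flattened row-major.
const std::vector<std::int64_t> allow_remapping_by_area_matrix{
  0, 1, 0,   // CAR may become TRUCK
  0, 0, 0,   // TRUCK keeps its label
  0, 0, 0};  // BUS keeps its label
const std::vector<double> min_area_matrix{0.0, 8.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
const std::vector<double> max_area_matrix{0.0, 40.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};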
+ +#include + +#include +#include +#include +namespace autoware::lidar_bevfusion +{ + +void DetectionClassRemapper::setParameters( + const std::vector & allow_remapping_by_area_matrix, + const std::vector & min_area_matrix, const std::vector & max_area_matrix) +{ + assert(allow_remapping_by_area_matrix.size() == min_area_matrix.size()); + assert(allow_remapping_by_area_matrix.size() == max_area_matrix.size()); + assert( + std::pow(static_cast(std::sqrt(min_area_matrix.size())), 2) == min_area_matrix.size()); + + num_labels_ = static_cast(std::sqrt(min_area_matrix.size())); + + Eigen::Map> + allow_remapping_by_area_matrix_tmp( + allow_remapping_by_area_matrix.data(), num_labels_, num_labels_); + allow_remapping_by_area_matrix_ = allow_remapping_by_area_matrix_tmp.transpose() + .cast(); // Eigen is column major by default + + Eigen::Map min_area_matrix_tmp( + min_area_matrix.data(), num_labels_, num_labels_); + min_area_matrix_ = min_area_matrix_tmp.transpose(); // Eigen is column major by default + + Eigen::Map max_area_matrix_tmp( + max_area_matrix.data(), num_labels_, num_labels_); + max_area_matrix_ = max_area_matrix_tmp.transpose(); // Eigen is column major by default + + min_area_matrix_ = min_area_matrix_.unaryExpr( + [](double v) { return std::isfinite(v) ? v : std::numeric_limits::max(); }); + max_area_matrix_ = max_area_matrix_.unaryExpr( + [](double v) { return std::isfinite(v) ? v : std::numeric_limits::max(); }); +} + +void DetectionClassRemapper::mapClasses(autoware_perception_msgs::msg::DetectedObjects & msg) +{ + for (auto & object : msg.objects) { + const float bev_area = object.shape.dimensions.x * object.shape.dimensions.y; + + for (auto & classification : object.classification) { + auto & label = classification.label; + + for (int i = 0; i < num_labels_; ++i) { + if ( + allow_remapping_by_area_matrix_(label, i) && bev_area >= min_area_matrix_(label, i) && + bev_area <= max_area_matrix_(label, i)) { + label = i; + break; + } + } + } + } +} + +} // namespace autoware::lidar_bevfusion diff --git a/perception/autoware_lidar_bevfusion/lib/postprocess/circle_nms_kernel.cu b/perception/autoware_lidar_bevfusion/lib/postprocess/circle_nms_kernel.cu new file mode 100644 index 0000000000000..d30bc06dfbe3d --- /dev/null +++ b/perception/autoware_lidar_bevfusion/lib/postprocess/circle_nms_kernel.cu @@ -0,0 +1,147 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Modified from +// https://github.com/open-mmlab/OpenPCDet/blob/master/pcdet/ops/iou3d_nms/src/iou3d_nms_kernel.cu + +/* +3D IoU Calculation and Rotated NMS(modified from 2D NMS written by others) +Written by Shaoshuai Shi +All Rights Reserved 2019-2020. 
+*/ + +#include "autoware/lidar_bevfusion/postprocess/circle_nms_kernel.hpp" +#include "autoware/lidar_bevfusion/utils.hpp" + +#include + +#include + +#include + +namespace +{ +const std::size_t THREADS_PER_BLOCK_NMS = 16; +} // namespace + +namespace autoware::lidar_bevfusion +{ + +__device__ inline float dist2dPow(const Box3D * a, const Box3D * b) +{ + return powf(a->x - b->x, 2) + powf(a->y - b->y, 2); +} + +// cspell: ignore divup +__global__ void circleNMS_Kernel( + const Box3D * __restrict__ boxes, const std::size_t num_boxes3d, const std::size_t col_blocks, + const float dist2d_pow_threshold, std::uint64_t * __restrict__ mask) +{ + // params: boxes (N,) + // params: mask (N, divup(N/THREADS_PER_BLOCK_NMS)) + + const auto row_start = blockIdx.y; + const auto col_start = blockIdx.x; + + if (row_start > col_start) return; + + const std::size_t row_size = + fminf(num_boxes3d - row_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS); + const std::size_t col_size = + fminf(num_boxes3d - col_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS); + + __shared__ Box3D block_boxes[THREADS_PER_BLOCK_NMS]; + + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x] = boxes[THREADS_PER_BLOCK_NMS * col_start + threadIdx.x]; + } + __syncthreads(); + + if (threadIdx.x < row_size) { + const std::size_t cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x; + const Box3D * cur_box = boxes + cur_box_idx; + + std::uint64_t t = 0; + std::size_t start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; + } + for (std::size_t i = start; i < col_size; i++) { + if (dist2dPow(cur_box, block_boxes + i) < dist2d_pow_threshold) { + t |= 1ULL << i; + } + } + mask[cur_box_idx * col_blocks + col_start] = t; + } +} + +cudaError_t circleNMS_launch( + const thrust::device_vector & boxes3d, const std::size_t num_boxes3d, + std::size_t col_blocks, const float distance_threshold, + thrust::device_vector & mask, cudaStream_t stream) +{ + const float dist2d_pow_thres = powf(distance_threshold, 2); + + dim3 blocks(col_blocks, col_blocks); + dim3 threads(THREADS_PER_BLOCK_NMS); + circleNMS_Kernel<<>>( + thrust::raw_pointer_cast(boxes3d.data()), num_boxes3d, col_blocks, dist2d_pow_thres, + thrust::raw_pointer_cast(mask.data())); + + return cudaGetLastError(); +} + +std::size_t circleNMS( + thrust::device_vector & boxes3d, const float distance_threshold, + thrust::device_vector & keep_mask, cudaStream_t stream) +{ + const auto num_boxes3d = boxes3d.size(); + const auto col_blocks = divup(num_boxes3d, THREADS_PER_BLOCK_NMS); + thrust::device_vector mask_d(num_boxes3d * col_blocks); + + CHECK_CUDA_ERROR( + circleNMS_launch(boxes3d, num_boxes3d, col_blocks, distance_threshold, mask_d, stream)); + + // memcpy device to host + thrust::host_vector mask_h(mask_d.size()); + thrust::copy(mask_d.begin(), mask_d.end(), mask_h.begin()); + CHECK_CUDA_ERROR(cudaStreamSynchronize(stream)); + + // generate keep_mask + std::vector remv_h(col_blocks); + thrust::host_vector keep_mask_h(keep_mask.size()); + std::size_t num_to_keep = 0; + for (std::size_t i = 0; i < num_boxes3d; i++) { + auto nblock = i / THREADS_PER_BLOCK_NMS; + auto inblock = i % THREADS_PER_BLOCK_NMS; + + if (!(remv_h[nblock] & (1ULL << inblock))) { + keep_mask_h[i] = true; + num_to_keep++; + std::uint64_t * p = &mask_h[0] + i * col_blocks; + for (std::size_t j = nblock; j < col_blocks; j++) { + remv_h[j] |= p[j]; + } + } else { + keep_mask_h[i] = false; + } + } + + // memcpy host to device + keep_mask = keep_mask_h; + + return num_to_keep; +} + +} // 
namespace autoware::lidar_bevfusion diff --git a/perception/autoware_lidar_bevfusion/lib/postprocess/non_maximum_suppression.cpp b/perception/autoware_lidar_bevfusion/lib/postprocess/non_maximum_suppression.cpp new file mode 100644 index 0000000000000..dd64a73890247 --- /dev/null +++ b/perception/autoware_lidar_bevfusion/lib/postprocess/non_maximum_suppression.cpp @@ -0,0 +1,114 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "autoware/lidar_bevfusion/postprocess/non_maximum_suppression.hpp" + +#include +#include +#include + +#include +#include +#include + +namespace autoware::lidar_bevfusion +{ + +void NonMaximumSuppression::setParameters(const NMSParams & params) +{ + assert(params.search_distance_2d_ >= 0.0); + assert(params.iou_threshold_ >= 0.0 && params.iou_threshold_ <= 1.0); + + params_ = params; + target_class_mask_ = classNamesToBooleanMask(params.target_class_names_); +} + +bool NonMaximumSuppression::isTargetLabel(const std::uint8_t label) +{ + if (label >= target_class_mask_.size()) { + return false; + } + + assert(label < target_class_mask_.size()); + + return target_class_mask_.at(label); +} + +bool NonMaximumSuppression::isTargetPairObject( + const DetectedObject & object1, const DetectedObject & object2) +{ + const auto label1 = + autoware::object_recognition_utils::getHighestProbLabel(object1.classification); + const auto label2 = + autoware::object_recognition_utils::getHighestProbLabel(object2.classification); + + if (isTargetLabel(label1) && isTargetLabel(label2)) { + return true; + } + + const auto search_sqr_dist_2d = params_.search_distance_2d_ * params_.search_distance_2d_; + const auto sqr_dist_2d = autoware::universe_utils::calcSquaredDistance2d( + autoware::object_recognition_utils::getPose(object1), + autoware::object_recognition_utils::getPose(object2)); + return sqr_dist_2d <= search_sqr_dist_2d; +} + +Eigen::MatrixXd NonMaximumSuppression::generateIoUMatrix( + const std::vector & input_objects) +{ + // NOTE: row = target objects to be suppressed, col = source objects to be compared + Eigen::MatrixXd triangular_matrix = + Eigen::MatrixXd::Zero(input_objects.size(), input_objects.size()); + for (std::size_t target_i = 0; target_i < input_objects.size(); ++target_i) { + for (std::size_t source_i = 0; source_i < target_i; ++source_i) { + const auto & target_obj = input_objects.at(target_i); + const auto & source_obj = input_objects.at(source_i); + if (!isTargetPairObject(target_obj, source_obj)) { + continue; + } + + if (params_.nms_type_ == NMS_TYPE::IoU_BEV) { + const double iou = autoware::object_recognition_utils::get2dIoU(target_obj, source_obj); + triangular_matrix(target_i, source_i) = iou; + // NOTE: If the target object has any objects with iou > iou_threshold, it + // will be suppressed regardless of later results. 
+        if (iou > params_.iou_threshold_) {
+          break;
+        }
+      }
+    }
+  }
+
+  return triangular_matrix;
+}
+
+std::vector<DetectedObject> NonMaximumSuppression::apply(
+  const std::vector<DetectedObject> & input_objects)
+{
+  Eigen::MatrixXd iou_matrix = generateIoUMatrix(input_objects);
+
+  std::vector<DetectedObject> output_objects;
+  output_objects.reserve(input_objects.size());
+  for (std::size_t i = 0; i < input_objects.size(); ++i) {
+    const auto value = iou_matrix.row(i).maxCoeff();
+    if (params_.nms_type_ == NMS_TYPE::IoU_BEV) {
+      if (value <= params_.iou_threshold_) {
+        output_objects.emplace_back(input_objects.at(i));
+      }
+    }
+  }
+
+  return output_objects;
+}
+}  // namespace autoware::lidar_bevfusion
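A minimal configuration sketch for this class (the values mirror the defaults in bevfusion.schema.dummy.json further below; `detections` is a hypothetical vector of DetectedObject):

  // Deduplicate overlapping CAR detections within 10 m of each other at IoU > 0.1.
  autoware::lidar_bevfusion::NMSParams params;
  params.nms_type_ = autoware::lidar_bevfusion::NMS_TYPE::IoU_BEV;
  params.target_class_names_ = {"CAR"};
  params.search_distance_2d_ = 10.0;
  params.iou_threshold_ = 0.1;

  autoware::lidar_bevfusion::NonMaximumSuppression nms;
  nms.setParameters(params);
  // const auto deduplicated = nms.apply(detections);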
diff --git a/perception/autoware_lidar_bevfusion/lib/postprocess/postprocess_kernel.cu b/perception/autoware_lidar_bevfusion/lib/postprocess/postprocess_kernel.cu
new file mode 100644
index 0000000000000..9ca15ba19e46d
--- /dev/null
+++ b/perception/autoware_lidar_bevfusion/lib/postprocess/postprocess_kernel.cu
@@ -0,0 +1,141 @@
+// Copyright 2025 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "autoware/lidar_bevfusion/postprocess/circle_nms_kernel.hpp"
+#include "autoware/lidar_bevfusion/postprocess/postprocess_kernel.hpp"
+
+#include <thrust/copy.h>
+#include <thrust/count.h>
+#include <thrust/execution_policy.h>
+#include <thrust/sort.h>
+
+#include <vector>
+
+namespace autoware::lidar_bevfusion
+{
+struct is_score_greater
+{
+  is_score_greater(float t) : t_(t) {}
+
+  __device__ bool operator()(const Box3D & b) { return b.score > t_; }
+
+private:
+  float t_{0.0};
+};
+
+struct is_kept
+{
+  __device__ bool operator()(const bool keep) { return keep; }
+};
+
+struct score_greater
+{
+  __device__ bool operator()(const Box3D & lb, const Box3D & rb) { return lb.score > rb.score; }
+};
+
+__device__ inline float sigmoid(float x)
+{
+  return 1.0f / (1.0f + expf(-x));
+}
+
+__global__ void generateBoxes3D_kernel(
+  const std::int64_t * __restrict__ label_pred_output, const float * __restrict__ bbox_pred_output,
+  const float * __restrict__ score_output, const float voxel_size_x, const float voxel_size_y,
+  const float min_x_range, const float min_y_range, const int num_proposals,
+  const float out_size_factor, const float * __restrict__ yaw_norm_thresholds,
+  Box3D * __restrict__ det_boxes3d)
+{
+  int point_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (point_idx >= num_proposals) {
+    return;
+  }
+
+  const float yaw_sin = bbox_pred_output[6 * num_proposals + point_idx];
+  const float yaw_cos = bbox_pred_output[7 * num_proposals + point_idx];
+  const float yaw_norm = sqrtf(yaw_sin * yaw_sin + yaw_cos * yaw_cos);
+  const int label = static_cast<int>(label_pred_output[point_idx]);
+
+  det_boxes3d[point_idx].label = label;
+  det_boxes3d[point_idx].score =
+    yaw_norm >= yaw_norm_thresholds[label] ? score_output[point_idx] : 0.f;
+  det_boxes3d[point_idx].x =
+    bbox_pred_output[0 * num_proposals + point_idx] * out_size_factor * voxel_size_x + min_x_range;
+  det_boxes3d[point_idx].y =
+    bbox_pred_output[1 * num_proposals + point_idx] * out_size_factor * voxel_size_y + min_y_range;
+  det_boxes3d[point_idx].z = bbox_pred_output[2 * num_proposals + point_idx];
+  det_boxes3d[point_idx].length = expf(bbox_pred_output[3 * num_proposals + point_idx]);
+  det_boxes3d[point_idx].width = expf(bbox_pred_output[4 * num_proposals + point_idx]);
+  det_boxes3d[point_idx].height = expf(bbox_pred_output[5 * num_proposals + point_idx]);
+  det_boxes3d[point_idx].yaw = atan2f(yaw_sin, yaw_cos);
+  det_boxes3d[point_idx].vx = bbox_pred_output[8 * num_proposals + point_idx];
+  det_boxes3d[point_idx].vy = bbox_pred_output[9 * num_proposals + point_idx];
+}
+
+PostprocessCuda::PostprocessCuda(const BEVFusionConfig & config, cudaStream_t & stream)
+: config_(config), stream_(stream)
+{
+}
+
+// cspell: ignore divup
+cudaError_t PostprocessCuda::generateDetectedBoxes3D_launch(
+  const std::int64_t * label_pred_output, const float * bbox_pred_output,
+  const float * score_output, std::vector<Box3D> & det_boxes3d, cudaStream_t stream)
+{
+  dim3 threads = {config_.threads_per_block_};
+  dim3 blocks = {divup(config_.num_proposals_, threads.x)};
+
+  auto boxes3d_d = thrust::device_vector<Box3D>(config_.num_proposals_);
+  auto yaw_norm_thresholds_d = thrust::device_vector<float>(
+    config_.yaw_norm_thresholds_.begin(), config_.yaw_norm_thresholds_.end());
+
+  generateBoxes3D_kernel<<<blocks, threads, 0, stream>>>(
+    label_pred_output, bbox_pred_output, score_output, config_.voxel_x_size_,
+    config_.voxel_y_size_, config_.min_x_range_, config_.min_y_range_, config_.num_proposals_,
+    config_.out_size_factor_, thrust::raw_pointer_cast(yaw_norm_thresholds_d.data()),
+    thrust::raw_pointer_cast(boxes3d_d.data()));
+
+  // suppress by score
+  const auto num_det_boxes3d = thrust::count_if(
+    thrust::device, boxes3d_d.begin(), boxes3d_d.end(),
+    is_score_greater(config_.score_threshold_));
+
+  if (num_det_boxes3d == 0) {
+    return cudaGetLastError();
+  }
+
+  thrust::device_vector<Box3D> det_boxes3d_d(num_det_boxes3d);
+  thrust::copy_if(
+    thrust::device, boxes3d_d.begin(), boxes3d_d.end(), det_boxes3d_d.begin(),
+    is_score_greater(config_.score_threshold_));
+
+  // sort by score
+  thrust::sort(det_boxes3d_d.begin(), det_boxes3d_d.end(), score_greater());
+
+  // suppress by NMS
+  thrust::device_vector<bool> final_keep_mask_d(num_det_boxes3d);
+  const auto num_final_det_boxes3d =
+    circleNMS(det_boxes3d_d, config_.circle_nms_dist_threshold_, final_keep_mask_d, stream);
+
+  thrust::device_vector<Box3D> final_det_boxes3d_d(num_final_det_boxes3d);
+  thrust::copy_if(
+    thrust::device, det_boxes3d_d.begin(), det_boxes3d_d.end(), final_keep_mask_d.begin(),
+    final_det_boxes3d_d.begin(), is_kept());
+
+  // memcpy device to host
+  det_boxes3d.resize(num_final_det_boxes3d);
+  thrust::copy(final_det_boxes3d_d.begin(), final_det_boxes3d_d.end(), det_boxes3d.begin());
+
+  return cudaGetLastError();
+}
+
+}  // namespace autoware::lidar_bevfusion
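For reference, the decoding above maps head outputs from feature-grid units back into metric coordinates as x = bbox_pred[0] * out_size_factor * voxel_size_x + min_x_range. With illustrative values in the style of this package's configs (voxel_size_x = 0.3 m, out_size_factor = 8, min_x_range = -76.8 m), a raw prediction of 32.0 decodes to 32.0 * 8 * 0.3 - 76.8 = 0.0 m, the center of the grid; length, width and height are passed through expf, so the head predicts them in log space.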
diff --git a/perception/autoware_lidar_bevfusion/lib/preprocess/pointcloud_densification.cpp b/perception/autoware_lidar_bevfusion/lib/preprocess/pointcloud_densification.cpp
new file mode 100644
index 0000000000000..1b25ac1ce3577
--- /dev/null
+++ b/perception/autoware_lidar_bevfusion/lib/preprocess/pointcloud_densification.cpp
@@ -0,0 +1,111 @@
+// Copyright 2025 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "autoware/lidar_bevfusion/preprocess/pointcloud_densification.hpp"
+
+#include <pcl_ros/transforms.hpp>
+#include <rclcpp/rclcpp.hpp>
+
+#include <boost/optional.hpp>
+
+#include <tf2_eigen/tf2_eigen.hpp>
+
+#include <string>
+#include <utility>
+
+namespace
+{
+
+boost::optional<geometry_msgs::msg::Transform> getTransform(
+  const tf2_ros::Buffer & tf_buffer, const std::string & target_frame_id,
+  const std::string & source_frame_id, const rclcpp::Time & time)
+{
+  try {
+    geometry_msgs::msg::TransformStamped transform_stamped;
+    transform_stamped = tf_buffer.lookupTransform(
+      target_frame_id, source_frame_id, time, rclcpp::Duration::from_seconds(0.5));
+    return transform_stamped.transform;
+  } catch (tf2::TransformException & ex) {
+    RCLCPP_WARN_STREAM(rclcpp::get_logger("lidar_bevfusion"), ex.what());
+    return boost::none;
+  }
+}
+
+Eigen::Affine3f transformToEigen(const geometry_msgs::msg::Transform & t)
+{
+  Eigen::Affine3f a;
+  a.matrix() = tf2::transformToEigen(t).matrix().cast<float>();
+  return a;
+}
+
+}  // namespace
+
+namespace autoware::lidar_bevfusion
+{
+
+PointCloudDensification::PointCloudDensification(
+  const DensificationParam & param, cudaStream_t & stream)
+: param_(param), stream_(stream)
+{
+}
+
+bool PointCloudDensification::enqueuePointCloud(
+  const sensor_msgs::msg::PointCloud2 & pointcloud_msg, const tf2_ros::Buffer & tf_buffer)
+{
+  const auto header = pointcloud_msg.header;
+
+  if (param_.pointcloud_cache_size() > 1) {
+    auto transform_world2current =
+      getTransform(tf_buffer, header.frame_id, param_.world_frame_id(), header.stamp);
+    if (!transform_world2current) {
+      return false;
+    }
+    auto affine_world2current = transformToEigen(transform_world2current.get());
+
+    enqueue(pointcloud_msg, affine_world2current);
+  } else {
+    enqueue(pointcloud_msg, Eigen::Affine3f::Identity());
+  }
+
+  dequeue();
+
+  return true;
+}
+
+void PointCloudDensification::enqueue(
+  const sensor_msgs::msg::PointCloud2 & msg, const Eigen::Affine3f & affine_world2current)
+{
+  affine_world2current_ = affine_world2current;
+  current_timestamp_ = rclcpp::Time(msg.header.stamp).seconds();
+
+  auto data_d = autoware::cuda_utils::make_unique<InputPointType[]>(msg.width * msg.height);
+
+  CHECK_CUDA_ERROR(cudaMemcpyAsync(
+    data_d.get(), msg.data.data(), sizeof(std::uint8_t) * msg.width * msg.height * msg.point_step,
+    cudaMemcpyHostToDevice, stream_));
+
+  PointCloudWithTransform pointcloud = {
+    std::move(data_d), msg.header, msg.width * msg.height, affine_world2current.inverse()};
+
+  pointcloud_cache_.push_front(std::move(pointcloud));
+}
+
+void PointCloudDensification::dequeue()
+{
+  if (pointcloud_cache_.size() > param_.pointcloud_cache_size()) {
+    pointcloud_cache_.pop_back();
+  }
+}
+
+}  // namespace autoware::lidar_bevfusion
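A minimal construction sketch (the frame id and frame count mirror the defaults in bevfusion.schema.dummy.json further below; `stream` and `tf_buffer` are assumed to already exist):

  // Keep the current frame plus one past frame, re-projected into the current
  // sensor frame via the "map" frame.
  autoware::lidar_bevfusion::DensificationParam param("map", /*num_past_frames=*/1);
  autoware::lidar_bevfusion::PointCloudDensification densification(param, stream);
  // For every incoming cloud: densification.enqueuePointCloud(msg, tf_buffer);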
diff --git a/perception/autoware_lidar_bevfusion/lib/preprocess/precomputed_features.cpp b/perception/autoware_lidar_bevfusion/lib/preprocess/precomputed_features.cpp
new file mode 100644
index 0000000000000..044f032cfab23
--- /dev/null
+++ b/perception/autoware_lidar_bevfusion/lib/preprocess/precomputed_features.cpp
@@ -0,0 +1,376 @@
+// Copyright 2025 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "autoware/lidar_bevfusion/preprocess/precomputed_features.hpp"
+
+#include <Eigen/Dense>
+#include <unsupported/Eigen/CXX11/Tensor>
+
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+namespace autoware::lidar_bevfusion
+{
+
+Tensor4D create_frustum(const BEVFusionConfig & config)
+{
+  const float dbound_start = config.dbound_[0];
+  const float dbound_end = config.dbound_[1];
+  const float dbound_step = config.dbound_[2];
+  const int roi_height = static_cast<int>(config.roi_height_);            // Image height
+  const int roi_width = static_cast<int>(config.roi_width_);              // Image width
+  const int features_height = static_cast<int>(config.features_height_);  // Feature height
+  const int features_width = static_cast<int>(config.features_width_);    // Feature width
+
+  // Compute D (number of depth layers)
+  int D = static_cast<int>(std::floor((dbound_end - dbound_start) / dbound_step));
+  assert(static_cast<std::int64_t>(D) == config.num_depth_features_);
+
+  Tensor1D ds_vec(D);
+  for (Eigen::Index i = 0; i < D; ++i) {
+    ds_vec(i) = dbound_start + i * dbound_step;
+  }
+
+  // Create xs_lin and ys_lin tensors
+  Tensor1D xs_lin(features_width);
+  for (Eigen::Index i = 0; i < features_width; ++i) {
+    xs_lin(i) = static_cast<float>(i) * (roi_width - 1) / (features_width - 1);
+  }
+
+  Tensor1D ys_lin(features_height);
+  for (Eigen::Index i = 0; i < features_height; ++i) {
+    ys_lin(i) = static_cast<float>(i) * (roi_height - 1) / (features_height - 1);
+  }
+
+  // Reshape and broadcast ds_vec to (D, features_height, features_width)
+  Tensor3D ds_tensor = ds_vec.reshape(Eigen::array<Eigen::Index, 3>{D, 1, 1});
+  Tensor3D ds_broadcast =
+    ds_tensor.broadcast(Eigen::array<Eigen::Index, 3>{1, features_height, features_width});
+
+  // Reshape and broadcast xs_lin to (1, 1, features_width)
+  Tensor3D xs_tensor = xs_lin.reshape(Eigen::array<Eigen::Index, 3>{1, 1, features_width});
+  Tensor3D xs_broadcast =
+    xs_tensor.broadcast(Eigen::array<Eigen::Index, 3>{D, features_height, 1});
+
+  // Reshape and broadcast ys_lin to (1, features_height, 1)
+  Tensor3D ys_tensor = ys_lin.reshape(Eigen::array<Eigen::Index, 3>{1, features_height, 1});
+  Tensor3D ys_broadcast =
+    ys_tensor.broadcast(Eigen::array<Eigen::Index, 3>{D, 1, features_width});
+
+  // Stack xs, ys, ds along the last dimension to form the frustum
+  Tensor4D frustum(D, features_height, features_width, 3);
+  frustum.chip(0, 3) = xs_broadcast;
+  frustum.chip(1, 3) = ys_broadcast;
+  frustum.chip(2, 3) = ds_broadcast;
+
+  return frustum;
+}
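+
+// Worked example of the shape above (values illustrative, in the style of
+// this package's ml configs): dbound = [1.0, 60.0, 0.5] gives
+// D = floor((60.0 - 1.0) / 0.5) = 118 depth bins, so with a 32x88 feature
+// map the frustum tensor is [118, 32, 88, 3], each entry holding
+// (pixel x, pixel y, depth) in the ROI image frame.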
+
+Tensor5D get_geometry(
+  const Tensor4D & frustum,             // [D, H, W, 3]
+  const Tensor3D & camera2lidar_rots,   // [N, 3, 3]
+  const Tensor2D & camera2lidar_trans,  // [N, 3]
+  const Tensor3D & intrins_inverse,     // [N, 3, 3]
+  const Tensor3D & post_rots_inverse,   // [N, 3, 3]
+  const Tensor2D & post_trans           // [N, 3]
+)
+{
+  // Get dimensions
+  int N = camera2lidar_trans.dimension(0);
+  int D = frustum.dimension(0);
+  int H = frustum.dimension(1);
+  int W = frustum.dimension(2);
+
+  // Reshape and broadcast frustum to [N, D, H, W, 3]
+  Tensor5D frustum_broadcast = frustum.reshape(Eigen::array<Eigen::Index, 5>{1, D, H, W, 3})
+                                 .broadcast(Eigen::array<Eigen::Index, 5>{N, 1, 1, 1, 1});
+
+  // Reshape post_trans to [N, 1, 1, 1, 3]
+  Tensor5D post_trans_broadcast =
+    post_trans.reshape(Eigen::array<Eigen::Index, 5>{N, 1, 1, 1, 3})
+      .broadcast(Eigen::array<Eigen::Index, 5>{1, D, H, W, 1});
+
+  // Subtract post_trans from the frustum
+  Tensor5D points = frustum_broadcast - post_trans_broadcast;
+
+  // Unsqueeze points to [N, D, H, W, 3, 1]
+  Tensor6D points_unsqueezed = points.reshape(Eigen::array<Eigen::Index, 6>{N, D, H, W, 3, 1});
+
+  Tensor5D points_rotated(N, D, H, W, 3);
+
+  for (Eigen::Index camera_index = 0; camera_index < N; camera_index++) {
+    Tensor2D post_rot_inverse = post_rots_inverse.chip(camera_index, 0);
+    Tensor4D points_sliced = points.chip(camera_index, 0);
+
+    Eigen::array<Eigen::IndexPair<Eigen::Index>, 1> contract_dims = {
+      Eigen::IndexPair<Eigen::Index>(3, 0)};
+    Eigen::array<int, 2> shuffling({1, 0});
+    Tensor2D post_rot_inverse_transposed = post_rot_inverse.shuffle(shuffling);
+
+    Tensor4D points_sliced_rotated =
+      points_sliced.contract(post_rot_inverse_transposed, contract_dims);
+    points_rotated.chip(camera_index, 0) = points_sliced_rotated;
+  }
+
+  // Remove the last dimension (size 1)
+  Tensor5D points_squeezed = points_rotated;
+
+  Tensor5D points_xy = points_squeezed.slice(
+    Eigen::array<Eigen::Index, 5>{0, 0, 0, 0, 0}, Eigen::array<Eigen::Index, 5>{N, D, H, W, 2});
+  Tensor5D points_z = points_squeezed.slice(
+    Eigen::array<Eigen::Index, 5>{0, 0, 0, 0, 2}, Eigen::array<Eigen::Index, 5>{N, D, H, W, 1});
+  Tensor5D points_xy_scaled =
+    points_xy * points_z.broadcast(Eigen::array<Eigen::Index, 5>{1, 1, 1, 1, 2});
+
+  // Concatenate points_xy_scaled and points_z along the last dimension
+  Tensor5D points_cat(points_squeezed.dimensions());
+  points_cat.slice(
+    Eigen::array<Eigen::Index, 5>{0, 0, 0, 0, 0},
+    Eigen::array<Eigen::Index, 5>{N, D, H, W, 2}) = points_xy_scaled;
+  points_cat.slice(
+    Eigen::array<Eigen::Index, 5>{0, 0, 0, 0, 2},
+    Eigen::array<Eigen::Index, 5>{N, D, H, W, 1}) = points_z;
+
+  Tensor3D combine(intrins_inverse.dimensions());
+
+  for (Eigen::Index camera_index = 0; camera_index < N; camera_index++) {
+    Tensor2D camera2lidar_rot = camera2lidar_rots.chip(camera_index, 0);
+    Tensor2D intrins_inv = intrins_inverse.chip(camera_index, 0);
+
+    Eigen::array<Eigen::IndexPair<Eigen::Index>, 1> contract_dims = {
+      Eigen::IndexPair<Eigen::Index>(1, 0)};
+
+    Tensor2D combine_sliced = camera2lidar_rot.contract(intrins_inv, contract_dims);
+    combine.chip(camera_index, 0) = combine_sliced;
+  }
+
+  // Reshape combine to [N, 1, 1, 1, 3, 3]
+  Tensor6D combine_reshaped = combine.reshape(Eigen::array<Eigen::Index, 6>{N, 1, 1, 1, 3, 3});
+
+  Tensor5D points_transformed(points_cat.dimensions());
+
+  for (Eigen::Index camera_index = 0; camera_index < N; camera_index++) {
+    Tensor2D combine_sliced = combine.chip(camera_index, 0);
+    Tensor4D points_sliced = points_cat.chip(camera_index, 0);
+
+    Eigen::array<Eigen::IndexPair<Eigen::Index>, 1> contract_dims = {
+      Eigen::IndexPair<Eigen::Index>(3, 0)};
+    Eigen::array<int, 2> shuffling({1, 0});
+    Tensor2D combined_transposed = combine_sliced.shuffle(shuffling);
+
+    Tensor4D points_transformed_sliced =
+      points_sliced.contract(combined_transposed, contract_dims);
+    points_transformed.chip(camera_index, 0) = points_transformed_sliced;
+  }
+
+  // Squeeze the last dimension
+  Tensor5D points_squeezed_final =
+    points_transformed.reshape(Eigen::array<Eigen::Index, 5>{N, D, H, W, 3});
+
+  // Add camera2lidar_trans
+  Tensor5D camera2lidar_trans_broadcast =
+    camera2lidar_trans.reshape(Eigen::array<Eigen::Index, 5>{N, 1, 1, 1, 3})
+      .broadcast(Eigen::array<Eigen::Index, 5>{1, D, H, W, 1});
+  Tensor5D points_final = points_squeezed_final + camera2lidar_trans_broadcast;
+
+  return points_final;
+}
+
+// Define the function
+std::tuple<
+  Eigen::Matrix<std::int32_t, Eigen::Dynamic, 4, Eigen::RowMajor>,  // geom_feats
+  Eigen::Matrix<bool, Eigen::Dynamic, 1>,                           // kept
+  Eigen::Matrix<std::int64_t, Eigen::Dynamic, 1>,                   // ranks
+  Eigen::Matrix<std::int64_t, Eigen::Dynamic, 1>                    // indices
+  >
+bev_pool_aux(const Tensor5D & geom_feats_input, const BEVFusionConfig & config)
+{
+  Eigen::Vector3f dx(config.xbound_[2], config.ybound_[2], config.zbound_[2]);
+  Eigen::Vector3f bx(
+    config.xbound_[0] + config.xbound_[2] / 2.0, config.ybound_[0] + config.ybound_[2] / 2.0,
+    config.zbound_[0] + config.zbound_[2] / 2.0);
+  Eigen::Vector3i nx(
+    static_cast<int>((config.xbound_[1] - config.xbound_[0]) / config.xbound_[2]),
+    static_cast<int>((config.ybound_[1] - config.ybound_[0]) / config.ybound_[2]),
+    static_cast<int>((config.zbound_[1] - config.zbound_[0]) / config.zbound_[2]));
+
+  // Get dimensions
+  Eigen::Index N = geom_feats_input.dimension(0);
+  Eigen::Index D = geom_feats_input.dimension(1);
+  Eigen::Index H = geom_feats_input.dimension(2);
+  Eigen::Index W = geom_feats_input.dimension(3);
+  Eigen::Index C = geom_feats_input.dimension(4);
+  assert(C == 3);
+
+  Eigen::Index Nprime = N * D * H * W;
+
+  TensorMap1D dx_map(dx.data(), 3);
+  TensorMap1D bx_map(bx.data(), 3);
+
+  Tensor5D dx_broadcast = dx_map.reshape(Eigen::array<Eigen::Index, 5>{1, 1, 1, 1, C})
+                            .broadcast(Eigen::array<Eigen::Index, 5>{N, D, H, W, 1});
+
+  Tensor5D bx_broadcast = bx_map.reshape(Eigen::array<Eigen::Index, 5>{1, 1, 1, 1, 3})
+                            .broadcast(Eigen::array<Eigen::Index, 5>{N, D, H, W, 1});
+
+  Tensor5D geom_feats_aux =
+    (geom_feats_input - (bx_broadcast - 0.5f * dx_broadcast)) / dx_broadcast;
+
+  Eigen::Map<Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
+    geom_feats_map(geom_feats_aux.data(), Nprime, C);
+  Eigen::Matrix<std::int32_t, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> geom_feats_int =
+    geom_feats_map.cast<std::int32_t>();
+
+  // Concatenate geom_feats_int and batch_ix along columns
+  Eigen::Matrix<std::int32_t, Eigen::Dynamic, 4, Eigen::RowMajor> geom_feats_cat(Nprime, 4);
+  geom_feats_cat.block(0, 0, Nprime, C) = geom_feats_int;
+  geom_feats_cat.col(3).setZero();
+
+  // Filter out points that are outside the box
+  Eigen::Matrix<bool, Eigen::Dynamic, 1> kept(Nprime);
+  for (Eigen::Index i = 0; i < Nprime; ++i) {
+    std::int32_t x = geom_feats_cat(i, 0);
+    std::int32_t y = geom_feats_cat(i, 1);
+    std::int32_t z = geom_feats_cat(i, 2);
+
+    bool keep = (x >= 0) && (x < nx(0)) && (y >= 0) && (y < nx(1)) && (z >= 0) && (z < nx(2));
+    kept(i) = keep;
+  }
+
+  // Collect indices where kept is true
+  std::vector<Eigen::Index> kept_indices;
+  for (Eigen::Index i = 0; i < Nprime; ++i) {
+    if (kept(i)) {
+      kept_indices.push_back(i);
+    }
+  }
+
+  // Create geom_feats_kept
+  Eigen::Index N_kept = kept_indices.size();
+  Eigen::Matrix<std::int32_t, Eigen::Dynamic, 4, Eigen::RowMajor> geom_feats_kept(N_kept, 4);
+  for (Eigen::Index i = 0; i < N_kept; ++i) {
+    std::int64_t idx = kept_indices[i];
+    geom_feats_kept.row(i) = geom_feats_cat.row(idx);
+  }
+
+  // Compute ranks
+  Eigen::Matrix<std::int64_t, Eigen::Dynamic, 1> ranks(N_kept);
+  std::int64_t factor0 = static_cast<std::int64_t>(W) * D;
+  std::int64_t factor1 = static_cast<std::int64_t>(D);
+  std::int64_t factor2 = 1;
+
+  for (Eigen::Index i = 0; i < N_kept; ++i) {
+    std::int32_t x = geom_feats_kept(i, 0);
+    std::int32_t y = geom_feats_kept(i, 1);
+    std::int32_t z = geom_feats_kept(i, 2);
+    std::int64_t batch = geom_feats_kept(i, 3);
+
+    std::int64_t rank = x * factor0 + y * factor1 + z * factor2 + batch;
+    ranks(i) = rank;
+  }
+
+  // Sort ranks and keep the permutation indices
+  std::vector<std::pair<std::int64_t, std::int64_t>> rank_idx_pairs(N_kept);
+  for (Eigen::Index i = 0; i < N_kept; ++i) {
+    rank_idx_pairs[i] = std::make_pair(ranks(i), i);
+  }
+  std::sort(rank_idx_pairs.begin(), rank_idx_pairs.end());
+
+  // Create sorted ranks and indices
+  Eigen::Matrix<std::int64_t, Eigen::Dynamic, 1> ranks_sorted(N_kept);
+  Eigen::Matrix<std::int64_t, Eigen::Dynamic, 1> indices(N_kept);
+  Eigen::Matrix<std::int32_t, Eigen::Dynamic, 4, Eigen::RowMajor> geom_feats_sorted(N_kept, 4);
+  for (Eigen::Index i = 0; i < N_kept; ++i) {
+    std::int64_t rank = rank_idx_pairs[i].first;
+    std::int64_t idx = rank_idx_pairs[i].second;
+
+    ranks_sorted(i) = rank;
+    indices(i) = idx;
+    geom_feats_sorted.row(i) = geom_feats_kept.row(idx);
+  }
+
+  // Return geom_feats_sorted, kept, ranks_sorted, indices
+  return std::make_tuple(geom_feats_sorted, kept, ranks_sorted, indices);
+}
+
+std::tuple<
+  Eigen::VectorXf, Eigen::Matrix<std::int32_t, Eigen::Dynamic, 4, Eigen::RowMajor>,
+  Eigen::Matrix<bool, Eigen::Dynamic, 1>, Eigen::Matrix<std::int64_t, Eigen::Dynamic, 1>,
+  Eigen::Matrix<std::int64_t, Eigen::Dynamic, 1>>
+precompute_features(
+  const std::vector<Matrix4fRowM> & lidar2camera_transforms,
+  const std::vector<Matrix4fRowM> & camera_aug_matrices,
+  const std::vector<sensor_msgs::msg::CameraInfo> & camera_info_vector,
+  const BEVFusionConfig & config)
+{
+  Eigen::VectorXf lidar2images_flat(config.num_cameras_ * 4 * 4);
+
+  Tensor4D frustum = autoware::lidar_bevfusion::create_frustum(config);
+
+  Tensor3D camera2lidar_rotations(config.num_cameras_, 3, 3);
+  Tensor2D camera2lidar_translations(config.num_cameras_, 3);
+  Tensor3D intrins_inverse(config.num_cameras_, 3, 3);
+  Tensor3D post_rots_inverse(config.num_cameras_, 3, 3);
+  Tensor2D post_trans(config.num_cameras_, 3);
+
+  for (std::int64_t camera_id = 0; camera_id < config.num_cameras_; camera_id++) {
+    Matrix4fRowM cam2image = Matrix4fRowM::Identity();
+    cam2image(0, 0) = camera_info_vector[camera_id].p[0];
+    cam2image(1, 1) = camera_info_vector[camera_id].p[5];
+    cam2image(0, 2) = camera_info_vector[camera_id].p[2];
+    cam2image(1, 2) = camera_info_vector[camera_id].p[6];
+
+    Matrix4fRowM img_aug_matrix = camera_aug_matrices[camera_id];
+    Matrix4fRowM img_aug_matrix_inverse = img_aug_matrix.inverse().eval();
+
+    Matrix4fRowM lidar2camera = lidar2camera_transforms[camera_id];
+    Matrix4fRowM camera2lidar = lidar2camera.inverse().eval();
+
+    Matrix4fRowM lidar2image = cam2image * lidar2camera;
+    Eigen::VectorXf lidar2image_flat(Eigen::Map<Eigen::VectorXf>(lidar2image.data(), 16));
+    lidar2images_flat.segment(16 * camera_id, 16) = lidar2image_flat;
+
+    Matrix3fRowM camera2lidar_rot = camera2lidar.block<3, 3>(0, 0);
+    Vector3fRowM camera2lidar_trans = camera2lidar.block<3, 1>(0, 3);
+    Matrix3fRowM intrins_inv = cam2image.inverse().eval().block<3, 3>(0, 0);
+    Matrix3fRowM post_rots_inv = img_aug_matrix_inverse.block<3, 3>(0, 0);
+    Vector3fRowM post_trans_vec = img_aug_matrix.block<3, 1>(0, 3);
+
+    TensorMap2D camera2lidar_rot_tensor(camera2lidar_rot.data(), 3, 3);
+    TensorMap1D camera2lidar_trans_tensor(camera2lidar_trans.data(), 3);
+    TensorMap2D intrins_inv_tensor(intrins_inv.data(), 3, 3);
+    TensorMap2D post_rots_inv_tensor(post_rots_inv.data(), 3, 3);
+    TensorMap1D post_trans_tensor(post_trans_vec.data(), 3);
+
+    camera2lidar_rotations.chip(camera_id, 0) = camera2lidar_rot_tensor;
+    camera2lidar_translations.chip(camera_id, 0) = camera2lidar_trans_tensor;
+    intrins_inverse.chip(camera_id, 0) = intrins_inv_tensor;
+    post_rots_inverse.chip(camera_id, 0) = post_rots_inv_tensor;
+    post_trans.chip(camera_id, 0) = post_trans_tensor;
+  }
+
+  Tensor5D geometry = get_geometry(
+    frustum,                    // [D, H, W, 3]
+    camera2lidar_rotations,     // [N, 3, 3]
+    camera2lidar_translations,  // [N, 3]
+    intrins_inverse,            // [N, 3, 3]
+    post_rots_inverse,          // [N, 3, 3]
+    post_trans                  // [N, 3]
+  );
+
+  auto [geom_feats, kept, ranks, indices] = bev_pool_aux(geometry, config);
+
+  return std::make_tuple(lidar2images_flat, geom_feats, kept, ranks, indices);
+}
+
+}  // namespace autoware::lidar_bevfusion
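The ranks computed above give every frustum point a scalar key such that points landing in the same BEV cell become contiguous after sorting; the pooling kernel later sums each contiguous run. A host-side sketch of that grouping step for orientation only (the actual interval extraction happens inside the QuickCumsumCuda TensorRT plugin added by this patch; the struct and function names here are illustrative):

  #include <cstdint>
  #include <vector>

  struct Interval { std::int32_t start; std::int32_t length; };

  // Derive [start, start + length) intervals from sorted ranks so that each
  // interval covers all points that fall into the same BEV cell.
  std::vector<Interval> make_intervals(const std::vector<std::int64_t> & sorted_ranks)
  {
    std::vector<Interval> intervals;
    for (std::int32_t i = 0; i < static_cast<std::int32_t>(sorted_ranks.size()); ++i) {
      if (i == 0 || sorted_ranks[i] != sorted_ranks[i - 1]) {
        intervals.push_back({i, 1});  // a new BEV cell starts here
      } else {
        intervals.back().length++;    // same cell: extend the current run
      }
    }
    return intervals;
  }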
diff --git a/perception/autoware_lidar_bevfusion/lib/preprocess/preprocess_kernel.cu b/perception/autoware_lidar_bevfusion/lib/preprocess/preprocess_kernel.cu
new file mode 100644
index 0000000000000..27baf7cd45b8f
--- /dev/null
+++ b/perception/autoware_lidar_bevfusion/lib/preprocess/preprocess_kernel.cu
@@ -0,0 +1,228 @@
+// Copyright 2025 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2021 NVIDIA CORPORATION & AFFILIATES.
+ * All rights reserved. SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "autoware/lidar_bevfusion/preprocess/point_type.hpp"
+#include "autoware/lidar_bevfusion/preprocess/preprocess_kernel.hpp"
+
+#include <spconvlib/spconv/csrc/sparse/all/ops3d/Point2Voxel.h>
+
+#include <tensorview/tensor.h>
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+
+namespace autoware::lidar_bevfusion
+{
+
+PreprocessCuda::PreprocessCuda(
+  const BEVFusionConfig & config, cudaStream_t & stream, bool allocate_buffers)
+: stream_(stream), config_(config)
+{
+  if (allocate_buffers) {
+    hash_key_value_ = tv::empty({config.cloud_capacity_ * 2}, tv::custom128, 0);
+    point_indice_data_ = tv::empty({config.cloud_capacity_}, tv::int64, 0);
+    points_voxel_id_ = tv::empty({config.cloud_capacity_}, tv::int64, 0);
+  }
+}
+
+__global__ void generateSweepPoints_kernel(
+  const InputPointType * __restrict__ input_points, std::size_t points_size, float time_lag,
+  const float * transform_array, int num_features, float * __restrict__ output_points)
+{
+  int point_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (point_idx >= points_size) return;
+
+  const InputPointType * input_point = &input_points[point_idx];
+  float input_x = input_point->x;
+  float input_y = input_point->y;
+  float input_z = input_point->z;
+  float input_intensity = static_cast<float>(input_point->intensity);
+
+  output_points[point_idx * num_features] = transform_array[0] * input_x +
+                                            transform_array[4] * input_y +
+                                            transform_array[8] * input_z + transform_array[12];
+  output_points[point_idx * num_features + 1] = transform_array[1] * input_x +
+                                                transform_array[5] * input_y +
+                                                transform_array[9] * input_z + transform_array[13];
+  output_points[point_idx * num_features + 2] = transform_array[2] * input_x +
+                                                transform_array[6] * input_y +
+                                                transform_array[10] * input_z +
+                                                transform_array[14];
+  output_points[point_idx * num_features + 3] = input_intensity;
+  output_points[point_idx * num_features + 4] = time_lag;
+}
+
+cudaError_t PreprocessCuda::generateSweepPoints_launch(
+  const InputPointType * input_data, std::size_t points_size, float time_lag,
+  const float * transform_array, float * output_points)
+{
+  dim3 blocks(divup(points_size, config_.threads_per_block_));
+  dim3 threads(config_.threads_per_block_);
+
+  generateSweepPoints_kernel<<<blocks, threads, 0, stream_>>>(
+    input_data, points_size, time_lag, transform_array, config_.num_point_feature_size_,
+    output_points);
+
+  cudaError_t err = cudaGetLastError();
+  return err;
+}
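+
+// Note on the indexing above: transform_array is a 4x4 matrix stored
+// column-major (Eigen's default; see the static_assert in voxel_generator.cpp),
+// so element (row r, col c) lives at transform_array[4 * c + r]. As a worked
+// check, the output x is m(0,0)*x + m(0,1)*y + m(0,2)*z + m(0,3)
+// = transform_array[0]*x + transform_array[4]*y + transform_array[8]*z
+//   + transform_array[12].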
+
+std::size_t PreprocessCuda::generateVoxels(
+  const float * points, unsigned int points_size, float * voxel_features,
+  std::int32_t * voxel_coords, std::int32_t * num_points_per_voxel)
+{
+  using Point2VoxelGPU3D = spconvlib::spconv::csrc::sparse::all::ops3d::Point2Voxel;
+
+  std::array<float, 3> vsize_xyz{
+    config_.voxel_z_size_, config_.voxel_y_size_, config_.voxel_x_size_};
+
+  std::array<std::int32_t, 3> grid_size{
+    static_cast<std::int32_t>(config_.grid_z_size_),
+    static_cast<std::int32_t>(config_.grid_y_size_),
+    static_cast<std::int32_t>(config_.grid_x_size_)};
+
+  std::array<std::int64_t, 3> grid_stride{
+    static_cast<std::int64_t>(config_.grid_y_size_ * config_.grid_x_size_),
+    static_cast<std::int64_t>(config_.grid_x_size_), 1};
+
+  std::array<float, 6> coors_range{config_.min_z_range_, config_.min_y_range_,
+                                   config_.min_x_range_, config_.max_z_range_,
+                                   config_.max_y_range_, config_.max_x_range_};
+
+  tv::Tensor pc = tv::from_blob(
+    points, {static_cast<std::int64_t>(points_size), config_.num_point_feature_size_},
+    tv::float32, 0);
+
+  auto point_limit = pc.dim(0);
+
+  tv::Tensor voxels_padded = tv::from_blob(
+    voxel_features, {config_.max_num_voxels_, config_.max_points_per_voxel_, pc.dim(1)},
+    tv::float32, 0);
+
+  tv::Tensor indices_padded_no_batch =
+    tv::from_blob(voxel_coords, {config_.max_num_voxels_, 3}, tv::int32, 0);
+
+  tv::Tensor num_points_per_voxel_tensor =
+    tv::from_blob(num_points_per_voxel, {config_.max_num_voxels_}, tv::int32, 0);
+
+  auto p2v_res = Point2VoxelGPU3D::point_to_voxel_hash_static(
+    pc, voxels_padded, indices_padded_no_batch, num_points_per_voxel_tensor, hash_key_value_,
+    point_indice_data_, points_voxel_id_, vsize_xyz, grid_size, grid_stride, coors_range, true,
+    false, reinterpret_cast<std::uintptr_t>(stream_));
+
+  std::size_t real_num_voxels = static_cast<std::size_t>(std::get<0>(p2v_res).dim(0));
+
+  return real_num_voxels;
+}
+
+__global__ void resize_and_extract_roi(
+  const std::uint8_t * __restrict__ input_img, std::uint8_t * __restrict__ output_img, int H,
+  int W,                     // Original image dimensions (Height, Width)
+  int H2, int W2,            // Resized image dimensions (Height, Width)
+  int H3, int W3,            // ROI dimensions (Height, Width)
+  int y_start, int x_start)  // ROI top-left coordinates in the resized image
+{
+  // Calculate the global thread indices
+  int i = blockIdx.x * blockDim.x + threadIdx.x;  // Width index in the output (ROI) image
+  int j = blockIdx.y * blockDim.y + threadIdx.y;  // Height index in the output (ROI) image
+
+  // Check if the thread corresponds to a valid pixel in the ROI
+  if (i >= W3 || j >= H3) return;
+
+  // Compute the scaling factors from the original to the resized image
+  float scale_y = static_cast<float>(H) / H2;
+  float scale_x = static_cast<float>(W) / W2;
+
+  // Map the output pixel (i, j) in the ROI to a position in the resized image
+  int i_resized = i + x_start;
+  int j_resized = j + y_start;
+
+  // Map the position in the resized image back to a position in the original image
+  float i_original = (i_resized + 0.5f) * scale_x - 0.5f;
+  float j_original = (j_resized + 0.5f) * scale_y - 0.5f;
+
+  // Compute the coordinates used for bilinear interpolation
+  int i0 = static_cast<int>(floorf(i_original));
+  int j0 = static_cast<int>(floorf(j_original));
+  int i1 = i0 + 1;
+  int j1 = j0 + 1;
+
+  // Compute the interpolation weights
+  float di = i_original - i0;
+  float dj = j_original - j0;
+
+  float w00 = (1.0f - di) * (1.0f - dj);
+  float w01 = (1.0f - di) * dj;
+  float w10 = di * (1.0f - dj);
+  float w11 = di * dj;
+
+  // Loop over the three color channels
+  for (int c = 0; c < 3; ++c) {
+    float v00 = 0.0f, v01 = 0.0f, v10 = 0.0f, v11 = 0.0f;
+
+    // Boundary checks for each neighboring pixel
+    if (i0 >= 0 && i0 < W && j0 >= 0 && j0 < H)
+      v00 = static_cast<float>(input_img[(j0 * W + i0) * 3 + c]);
+    if (i0 >= 0 && i0 < W && j1 >= 0 && j1 < H)
+      v01 = static_cast<float>(input_img[(j1 * W + i0) * 3 + c]);
+    if (i1 >= 0 && i1 < W && j0 >= 0 && j0 < H)
+      v10 = static_cast<float>(input_img[(j0 * W + i1) * 3 + c]);
+    if (i1 >= 0 && i1 < W && j1 >= 0 && j1 < H)
+      v11 = static_cast<float>(input_img[(j1 * W + i1) * 3 + c]);
+
+    // Compute the interpolated pixel value
+    float value = w00 * v00 + w01 * v01 + w10 * v10 + w11 * v11;
+
+    // Store the result in the output ROI image
+    output_img[(j * W3 + i) * 3 + c] = static_cast<std::uint8_t>(value);
+  }
+}
+
+cudaError_t PreprocessCuda::resize_and_extract_roi_launch(
+  const std::uint8_t * input_img, std::uint8_t * output_img, int H,
+  int W,                     // Original image dimensions
+  int H2, int W2,            // Resized image dimensions
+  int H3, int W3,            // ROI dimensions
+  int y_start, int x_start,  // ROI top-left coordinates in the resized image
+  cudaStream_t stream)
+{
+  // Define the block and grid dimensions
+  dim3 threads(16, 16);
+  dim3 blocks(divup(W3, threads.x), divup(H3, threads.y));
+
+  // Launch the kernel
+  resize_and_extract_roi<<<blocks, threads, 0, stream>>>(
+    input_img, output_img, H, W, H2, W2, H3, W3, y_start, x_start);
+
+  // Check for errors
+  return cudaGetLastError();
+}
+
+}  // namespace autoware::lidar_bevfusion
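A usage sketch for the launcher (dimensions and the device buffer names `input_rgb_d` / `roi_rgb_d` are illustrative; both point to interleaved RGB data, 3 bytes per pixel, and `preprocess` is a PreprocessCuda instance):

  // Resize a 1920x1080 camera image to 960x540, then crop a 704x256 ROI whose
  // top-left corner in the resized image is (x = 128, y = 284).
  cudaError_t err = preprocess.resize_and_extract_roi_launch(
    input_rgb_d, roi_rgb_d,
    /*H=*/1080, /*W=*/1920,  // original image
    /*H2=*/540, /*W2=*/960,  // resized image
    /*H3=*/256, /*W3=*/704,  // ROI
    /*y_start=*/284, /*x_start=*/128, stream);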
Used " + << pd_ptr_->getIdx(pc_cache_iter) << " out of " + << pd_ptr_->getCacheSize() << " sweep(s)"); + break; + } + + auto affine_past2current = + pd_ptr_->getAffineWorldToCurrent() * pc_cache_iter->affine_past2world; + static_assert(std::is_same::value); + static_assert(!Eigen::Matrix4f::IsRowMajor, "matrices should be col-major."); + + float time_lag = static_cast( + pd_ptr_->getCurrentTimestamp() - rclcpp::Time(pc_cache_iter->header.stamp).seconds()); + + CHECK_CUDA_ERROR(cudaMemcpyAsync( + affine_past2current_d_.get(), affine_past2current.data(), + Eigen::Affine3f::MatrixType::SizeAtCompileTime * sizeof(float), cudaMemcpyHostToDevice, + stream_)); + CHECK_CUDA_ERROR(cudaStreamSynchronize(stream_)); + + pre_ptr_->generateSweepPoints_launch( + pc_cache_iter->data_d.get(), sweep_num_points, time_lag, affine_past2current_d_.get(), + points_d.get() + output_offset); + point_counter += sweep_num_points; + } + + return point_counter; +} + +} // namespace autoware::lidar_bevfusion diff --git a/perception/autoware_lidar_bevfusion/lib/ros_utils.cpp b/perception/autoware_lidar_bevfusion/lib/ros_utils.cpp new file mode 100644 index 0000000000000..22a396a067d0a --- /dev/null +++ b/perception/autoware_lidar_bevfusion/lib/ros_utils.cpp @@ -0,0 +1,126 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "autoware/lidar_bevfusion/ros_utils.hpp" + +#include +#include +#include + +#include +#include +#include + +namespace autoware::lidar_bevfusion +{ + +using Label = autoware_perception_msgs::msg::ObjectClassification; + +bool is_data_layout_compatible_with_point_xyzirc(const sensor_msgs::msg::PointCloud2 & input) +{ + using PointIndex = autoware::point_types::PointXYZIRCIndex; + using autoware::point_types::PointXYZIRC; + if (input.fields.size() < 6) { + return false; + } + bool same_layout = true; + const auto & field_x = input.fields.at(static_cast(PointIndex::X)); + same_layout &= field_x.name == "x"; + same_layout &= field_x.offset == offsetof(PointXYZIRC, x); + same_layout &= field_x.datatype == sensor_msgs::msg::PointField::FLOAT32; + same_layout &= field_x.count == 1; + const auto & field_y = input.fields.at(static_cast(PointIndex::Y)); + same_layout &= field_y.name == "y"; + same_layout &= field_y.offset == offsetof(PointXYZIRC, y); + same_layout &= field_y.datatype == sensor_msgs::msg::PointField::FLOAT32; + same_layout &= field_y.count == 1; + const auto & field_z = input.fields.at(static_cast(PointIndex::Z)); + same_layout &= field_z.name == "z"; + same_layout &= field_z.offset == offsetof(PointXYZIRC, z); + same_layout &= field_z.datatype == sensor_msgs::msg::PointField::FLOAT32; + same_layout &= field_z.count == 1; + const auto & field_intensity = input.fields.at(static_cast(PointIndex::Intensity)); + same_layout &= field_intensity.name == "intensity"; + same_layout &= field_intensity.offset == offsetof(PointXYZIRC, intensity); + same_layout &= field_intensity.datatype == sensor_msgs::msg::PointField::UINT8; + same_layout &= field_intensity.count == 1; + const auto & field_return_type = input.fields.at(static_cast(PointIndex::ReturnType)); + same_layout &= field_return_type.name == "return_type"; + same_layout &= field_return_type.offset == offsetof(PointXYZIRC, return_type); + same_layout &= field_return_type.datatype == sensor_msgs::msg::PointField::UINT8; + same_layout &= field_return_type.count == 1; + const auto & field_ring = input.fields.at(static_cast(PointIndex::Channel)); + same_layout &= field_ring.name == "channel"; + same_layout &= field_ring.offset == offsetof(PointXYZIRC, channel); + same_layout &= field_ring.datatype == sensor_msgs::msg::PointField::UINT16; + same_layout &= field_ring.count == 1; + + return same_layout; +} + +void box3DToDetectedObject( + const Box3D & box3d, const std::vector & class_names, + autoware_perception_msgs::msg::DetectedObject & obj) +{ + obj.existence_probability = box3d.score; + + // classification + autoware_perception_msgs::msg::ObjectClassification classification; + classification.probability = 1.0f; + if (box3d.label >= 0 && static_cast(box3d.label) < class_names.size()) { + classification.label = getSemanticType(class_names[box3d.label]); + } else { + classification.label = Label::UNKNOWN; + RCLCPP_WARN_STREAM(rclcpp::get_logger("lidar_bevfusion"), "Unexpected label: UNKNOWN is set."); + } + + if (autoware::object_recognition_utils::isCarLikeVehicle(classification.label)) { + obj.kinematics.orientation_availability = + autoware_perception_msgs::msg::DetectedObjectKinematics::SIGN_UNKNOWN; + } + + obj.classification.emplace_back(classification); + + // pose and shape + obj.kinematics.pose_with_covariance.pose.position = + autoware::universe_utils::createPoint(box3d.x, box3d.y, box3d.z); + obj.kinematics.pose_with_covariance.pose.orientation = + autoware::universe_utils::createQuaternionFromYaw(box3d.yaw); + 
diff --git a/perception/autoware_lidar_bevfusion/package.xml b/perception/autoware_lidar_bevfusion/package.xml
new file mode 100644
index 0000000000000..eb38aa9a07f99
--- /dev/null
+++ b/perception/autoware_lidar_bevfusion/package.xml
@@ -0,0 +1,36 @@
+<?xml version="1.0"?>
+<package format="3">
+  <name>autoware_lidar_bevfusion</name>
+  <version>1.0.0</version>
+  <description>The autoware_lidar_bevfusion package</description>
+  <maintainer email="kenzo.lobos@tier4.jp">Kenzo Lobos-Tsunekawa</maintainer>
+  <license>Apache License 2.0</license>
+
+  <buildtool_depend>ament_cmake_auto</buildtool_depend>
+  <buildtool_depend>autoware_cmake</buildtool_depend>
+
+  <depend>autoware_cuda_utils</depend>
+  <depend>autoware_object_recognition_utils</depend>
+  <depend>autoware_perception_msgs</depend>
+  <depend>autoware_point_types</depend>
+  <depend>autoware_tensorrt_common</depend>
+  <depend>autoware_universe_utils</depend>
+  <depend>launch_ros</depend>
+  <depend>pcl_ros</depend>
+  <depend>rclcpp</depend>
+  <depend>rclcpp_components</depend>
+  <depend>tf2_eigen</depend>
+  <depend>tf2_geometry_msgs</depend>
+  <depend>tf2_sensor_msgs</depend>
+
+  <exec_depend>autoware_image_transport_decompressor</exec_depend>
+
+  <test_depend>ament_cmake_ros</test_depend>
+  <test_depend>ament_lint_auto</test_depend>
+  <test_depend>autoware_lint_common</test_depend>
+
+  <export>
+    <build_type>ament_cmake</build_type>
+  </export>
+</package>
diff --git a/perception/autoware_lidar_bevfusion/schema/bevfusion.schema.dummy.json b/perception/autoware_lidar_bevfusion/schema/bevfusion.schema.dummy.json
new file mode 100644
index 0000000000000..a3af7a6da7e82
--- /dev/null
+++ b/perception/autoware_lidar_bevfusion/schema/bevfusion.schema.dummy.json
@@ -0,0 +1,121 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "title": "Parameters for LiDAR BEVFusion Node",
+  "$comment": "This schema is not considered in CI workflow. See https://github.com/autowarefoundation/autoware.universe/pull/8205#issuecomment-2255074224.",
+  "type": "object",
+  "definitions": {
+    "bevfusion": {
+      "type": "object",
+      "properties": {
+        "trt_precision": {
+          "type": "string",
+          "description": "The precision used to build the TensorRT engine.",
+          "default": "fp16",
+          "enum": ["fp16", "fp32"]
+        },
+        "cloud_capacity": {
+          "type": "integer",
+          "description": "Capacity of the point cloud buffer (should be set to at least the maximum theoretical number of points).",
+          "default": 2000000,
+          "minimum": 1
+        },
+        "onnx_path": {
+          "type": "string",
+          "description": "The path to the ONNX model file.",
+          "default": "$(var model_path)/bevfusion.onnx",
+          "pattern": "\\.onnx$"
+        },
+        "engine_path": {
+          "type": "string",
+          "description": "The path to the TensorRT engine file.",
+          "default": "$(var model_path)/bevfusion.engine",
+          "pattern": "\\.engine$"
+        },
+        "densification_num_past_frames": {
+          "type": "integer",
+          "description": "The number of past frames to be concatenated into the current input frame.",
+          "default": 1,
+          "minimum": 0
+        },
+        "densification_world_frame_id": {
+          "type": "string",
+          "description": "The frame id of the world coordinate system used for densification.",
+          "default": "map"
+        },
+        "circle_nms_dist_threshold": {
+          "type": "number",
+          "description": "The distance threshold between detection centers used in circle NMS.",
+          "default": 0.5,
+          "minimum": 0.0
+        },
+        "iou_nms_target_class_names": {
+          "type": "array",
+          "items": {
+            "type": "string"
+          },
+          "description": "An array of class names targeted by IoU-based NMS.",
+          "default": ["CAR"],
+          "uniqueItems": true
+        },
+        "iou_nms_search_distance_2d": {
+          "type": "number",
+          "description": "The maximum 2D distance within which object pairs are compared during NMS.",
+          "default": 10.0,
+          "minimum": 0.0
+        },
+        "iou_nms_threshold": {
+          "type": "number",
+          "description": "The IoU threshold above which an object is suppressed.",
+          "default": 0.1,
+          "minimum": 0.0,
+          "maximum": 1.0
+        },
+        "yaw_norm_thresholds": {
+          "type": "array",
+          "items": {
+            "type": "number",
+            "minimum": 0.0,
+            "maximum": 1.0
+          },
+          "description": "Per-class thresholds on the norm of the predicted direction vector; objects whose norm is below the threshold are ignored.",
+          "default": [0.3, 0.3, 0.3, 0.3, 0.0]
+        },
+        "score_threshold": {
+          "type": "number",
+          "description": "The confidence score threshold; objects with a score below it are ignored.",
+          "default": 0.2,
+          "minimum": 0.0
+        }
+      },
+      "required": [
+        "trt_precision",
+        "cloud_capacity",
+        "onnx_path",
+        "engine_path",
+        "densification_num_past_frames",
+        "densification_world_frame_id",
+        "circle_nms_dist_threshold",
+        "iou_nms_target_class_names",
+        "iou_nms_search_distance_2d",
+        "iou_nms_threshold",
+        "yaw_norm_thresholds",
+        "score_threshold"
+      ],
+      "additionalProperties": false
+    }
+  },
+  "properties": {
+    "/**": {
+      "type": "object",
+      "properties": {
+        "ros__parameters": {
+          "$ref": "#/definitions/bevfusion"
+        }
+      },
+      "required": ["ros__parameters"],
+      "additionalProperties": false
+    }
+  },
+  "required": ["/**"],
+  "additionalProperties": false
+}
"definitions": { + "bevfusion": { + "type": "object", + "properties": { + "class_names": { + "type": "array", + "items": { + "type": "string" + }, + "description": "An array of class names will be predicted.", + "default": ["CAR", "TRUCK", "BUS", "BICYCLE", "PEDESTRIAN"], + "uniqueItems": true + }, + "voxels_num": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "A maximum number of voxels [min, opt, max].", + "default": [5000, 30000, 60000] + }, + "point_cloud_range": { + "type": "array", + "items": { + "type": "number" + }, + "description": "An array of distance ranges of each class.", + "default": [-76.8, -76.8, -3.0, 76.8, 76.8, 5.0], + "minItems": 6, + "maxItems": 6 + }, + "voxel_size": { + "type": "array", + "items": { + "type": "number" + }, + "description": "Voxels size [x, y, z].", + "default": [0.3, 0.3, 8.0], + "minItems": 3, + "maxItems": 3 + }, + "num_proposals": { + "type": "integer", + "description": "Number of proposals at TransHead.", + "default": 500, + "minimum": 1 + } + }, + "required": ["class_names", "voxels_num", "point_cloud_range", "voxel_size", "num_proposals"], + "additionalProperties": false + } + }, + "properties": { + "/**": { + "type": "object", + "properties": { + "ros__parameters": { + "$ref": "#/definitions/bevfusion" + } + }, + "required": ["ros__parameters"], + "additionalProperties": false + } + }, + "required": ["/**"], + "additionalProperties": false +} diff --git a/perception/autoware_lidar_bevfusion/schema/detection_class_remapper.schema.json b/perception/autoware_lidar_bevfusion/schema/detection_class_remapper.schema.json new file mode 100644 index 0000000000000..8aaefa295fcc0 --- /dev/null +++ b/perception/autoware_lidar_bevfusion/schema/detection_class_remapper.schema.json @@ -0,0 +1,72 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "Parameters for Detection Class Remapper", + "type": "object", + "definitions": { + "detection_class_remapper": { + "type": "object", + "properties": { + "allow_remapping_by_area_matrix": { + "type": "array", + "items": { + "type": "integer" + }, + "description": "Whether to allow remapping of classes. 
diff --git a/perception/autoware_lidar_bevfusion/schema/detection_class_remapper.schema.json b/perception/autoware_lidar_bevfusion/schema/detection_class_remapper.schema.json
new file mode 100644
index 0000000000000..8aaefa295fcc0
--- /dev/null
+++ b/perception/autoware_lidar_bevfusion/schema/detection_class_remapper.schema.json
@@ -0,0 +1,72 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "title": "Parameters for Detection Class Remapper",
+  "type": "object",
+  "definitions": {
+    "detection_class_remapper": {
+      "type": "object",
+      "properties": {
+        "allow_remapping_by_area_matrix": {
+          "type": "array",
+          "items": {
+            "type": "integer"
+          },
+          "description": "Whether to allow remapping of classes. The order of the 8x8 matrix classes comes from the ObjectClassification msg.",
+          "default": [
+            0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 1, 0, 1, 0, 0, 0,
+            0, 0, 0, 0, 1, 0, 0, 0,
+            0, 0, 0, 0, 1, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 0, 0,
+            0, 0, 0, 0, 0, 0, 0, 0
+          ],
+          "minItems": 64,
+          "maxItems": 64
+        },
+        "min_area_matrix": {
+          "type": "array",
+          "items": {
+            "type": "number"
+          },
+          "description": "Minimum area for a specific class to be considered for class remapping.",
+          "default": [
+            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+            0.0, 0.0, 12.1, 0.0, 36.0, 0.0, 0.0, 0.0,
+            0.0, 0.0, 0.0, 0.0, 36.0, 0.0, 0.0, 0.0,
+            0.0, 0.0, 0.0, 0.0, 36.0, 0.0, 0.0, 0.0,
+            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
+          ],
+          "minItems": 64,
+          "maxItems": 64
+        },
+        "max_area_matrix": {
+          "type": "array",
+          "items": {
+            "type": "number"
+          },
+          "description": "Maximum area for a specific class to be considered for class remapping.",
+          "default": [
+            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+            0.0, 0.0, 36.0, 0.0, 999.999, 0.0, 0.0, 0.0,
+            0.0, 0.0, 0.0, 0.0, 999.999, 0.0, 0.0, 0.0,
+            0.0, 0.0, 0.0, 0.0, 999.999, 0.0, 0.0, 0.0,
+            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
+          ],
+          "minItems": 64,
+          "maxItems": 64
+        }
+      },
+      "required": ["allow_remapping_by_area_matrix", "min_area_matrix", "max_area_matrix"],
+      "additionalProperties": false
+    }
+  },
+  "properties": {
+    "/**": {
+      "type": "object",
+      "properties": {
+        "ros__parameters": {
+          "$ref": "#/definitions/detection_class_remapper"
+        }
+      },
+      "required": ["ros__parameters"],
+      "additionalProperties": false
+    }
+  },
+  "required": ["/**"],
+  "additionalProperties": false
+}
diff --git a/perception/autoware_lidar_bevfusion/src/bev_ops/bev_pool_cuda.cu b/perception/autoware_lidar_bevfusion/src/bev_ops/bev_pool_cuda.cu
new file mode 100644
index 0000000000000..2f65d1aee3e9b
--- /dev/null
+++ b/perception/autoware_lidar_bevfusion/src/bev_ops/bev_pool_cuda.cu
@@ -0,0 +1,55 @@
+// Taken from
+// https://github.com/mit-han-lab/bevfusion/blob/main/mmdet3d/ops/bev_pool/src/bev_pool_cuda.cu
+// Available under the Apache-2.0 license
+
+#include <stdio.h>
+#include <stdlib.h>
+
+/*
+  Function: pillar pooling
+  Args:
+    b                : batch size
+    d                : depth of the feature map
+    h                : height of the pooled feature map
+    w                : width of the pooled feature map
+    n                : number of input points
+    c                : number of channels
+    n_intervals      : number of unique points
+    x                : input features, FloatTensor[n, c]
+    geom_feats       : input coordinates, IntTensor[n, 4]
+    interval_starts  : starting position of each pooled point, IntTensor[n_intervals]
+    interval_lengths : how many points each pooled point aggregates, IntTensor[n_intervals]
+    out              : output features, FloatTensor[b, d, h, w, c]
+*/
+__global__ void bev_pool_kernel(
+  int b, int d, int h, int w, int n, int c, int n_intervals, const float * __restrict__ x,
+  const int * __restrict__ geom_feats, const int * __restrict__ interval_starts,
+  const int * __restrict__ interval_lengths, float * __restrict__ out)
+{
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  int index = idx / c;
+  int cur_c = idx % c;
+  if (index >= n_intervals) return;
+  int interval_start = interval_starts[index];
+  int interval_length = interval_lengths[index];
+
+  const int * cur_geom_feats = geom_feats + interval_start * 4;
+  const float * cur_x = x + interval_start * c + cur_c;
+  float * cur_out = out + cur_geom_feats[3] * d * h * w * c + cur_geom_feats[2] * h * w * c +
+                    cur_geom_feats[0] * w * c + cur_geom_feats[1] * c + cur_c;
+  float psum = 0;
+  for (int i = 0; i < interval_length; i++) {
+    psum += cur_x[i * c];
+  }
+
+  *cur_out = psum;
+}
+
+void bev_pool(
+  int b, int d, int h, int w, int n, int c, int n_intervals, const float * x,
+  const int * geom_feats, const int * interval_starts, const int * interval_lengths, float * out,
+  cudaStream_t & stream)
+{
+  bev_pool_kernel<<<(int)ceil(((double)n_intervals * c / 256)), 256, 0, stream>>>(
+    b, d, h, w, n, c, n_intervals, x, geom_feats, interval_starts, interval_lengths, out);
+}
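A worked check of the kernel's output indexing (sizes illustrative): with b = 1, d = 1, h = w = 360 and c = 80, a pooled point whose geom_feats row is (x = 2, y = 3, z = 0, batch = 0) writes channel ch to out[2 * 360 * 80 + 3 * 80 + ch] = out[57840 + ch]. One thread handles one (interval, channel) pair, which is why the launch uses ceil(n_intervals * c / 256) blocks of 256 threads.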
diff --git a/perception/autoware_lidar_bevfusion/src/lidar_bevfusion_node.cpp b/perception/autoware_lidar_bevfusion/src/lidar_bevfusion_node.cpp
new file mode 100644
index 0000000000000..413912e0c3292
--- /dev/null
+++ b/perception/autoware_lidar_bevfusion/src/lidar_bevfusion_node.cpp
@@ -0,0 +1,342 @@
+// Copyright 2025 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "autoware/lidar_bevfusion/lidar_bevfusion_node.hpp"
+
+#include "autoware/lidar_bevfusion/utils.hpp"
+
+#include <tf2_eigen/tf2_eigen.hpp>
+
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace autoware::lidar_bevfusion
+{
+
+LidarBEVFusionNode::LidarBEVFusionNode(const rclcpp::NodeOptions & options)
+: Node("lidar_bevfusion", options), tf_buffer_(this->get_clock())
+{
+  auto descriptor = rcl_interfaces::msg::ParameterDescriptor{}.set__read_only(true);
+
+  // Modality
+  sensor_fusion_ = this->declare_parameter<bool>("sensor_fusion", descriptor);
+
+  // Non network parameters
+  max_camera_lidar_delay_ =
+    this->declare_parameter<double>("max_camera_lidar_delay", descriptor);
+
+  // TensorRT parameters
+  const std::string plugins_path =
+    this->declare_parameter<std::string>("plugins_path", descriptor);
+
+  // Network parameters
+  const std::string onnx_path = this->declare_parameter<std::string>("onnx_path", descriptor);
+  const std::string engine_path = this->declare_parameter<std::string>("engine_path", descriptor);
+  const std::string trt_precision =
+    this->declare_parameter<std::string>("trt_precision", descriptor);
+
+  // Common parameters
+  const auto out_size_factor =
+    this->declare_parameter<std::int64_t>("out_size_factor", descriptor);
+
+  auto to_float_vector = [](const auto & v) -> std::vector<float> {
+    return std::vector<float>(v.begin(), v.end());
+  };
+
+  // Lidar branch parameters
+  const auto cloud_capacity = this->declare_parameter<std::int64_t>("cloud_capacity", descriptor);
+  const auto max_points_per_voxel =
+    this->declare_parameter<std::int64_t>("max_points_per_voxel", descriptor);
+  const auto voxels_num =
+    this->declare_parameter<std::vector<std::int64_t>>("voxels_num", descriptor);
+  const auto point_cloud_range = to_float_vector(
+    this->declare_parameter<std::vector<double>>("point_cloud_range", descriptor));
+  const auto voxel_size =
+    to_float_vector(this->declare_parameter<std::vector<double>>("voxel_size", descriptor));
+
+  // Camera branch parameters
+  // cSpell:ignore dbound xbound ybound zbound
+  const auto dbound =
+    to_float_vector(this->declare_parameter<std::vector<double>>("dbound", descriptor));
+  const auto xbound =
+    to_float_vector(this->declare_parameter<std::vector<double>>("xbound", descriptor));
+  const auto ybound =
+    to_float_vector(this->declare_parameter<std::vector<double>>("ybound", descriptor));
+  const auto zbound =
+    to_float_vector(this->declare_parameter<std::vector<double>>("zbound", descriptor));
+  const auto num_cameras = this->declare_parameter<std::int64_t>("num_cameras", descriptor);
+  const auto raw_image_height =
+    this->declare_parameter<std::int64_t>("raw_image_height", descriptor);
+  const auto raw_image_width =
+    this->declare_parameter<std::int64_t>("raw_image_width", descriptor);
+  const auto img_aug_scale_x = this->declare_parameter<double>("img_aug_scale_x", descriptor);
+  const auto img_aug_scale_y = this->declare_parameter<double>("img_aug_scale_y", descriptor);
+  const auto roi_height = this->declare_parameter<std::int64_t>("roi_height", descriptor);
+  const auto roi_width = this->declare_parameter<std::int64_t>("roi_width", descriptor);
+  const auto features_height =
+    this->declare_parameter<std::int64_t>("features_height", descriptor);
+  const auto features_width =
+    this->declare_parameter<std::int64_t>("features_width", descriptor);
+  const auto num_depth_features =
+    this->declare_parameter<std::int64_t>("num_depth_features", descriptor);
+
+  // Head parameters
+  const auto num_proposals = this->declare_parameter<std::int64_t>("num_proposals", descriptor);
+  class_names_ = this->declare_parameter<std::vector<std::string>>("class_names", descriptor);
+
+  if (point_cloud_range.size() != 6) {
+    RCLCPP_WARN_STREAM(
+      rclcpp::get_logger("lidar_bevfusion"),
+      "The size of point_cloud_range != 6: using the default parameters.");
+  }
+  if (voxel_size.size() != 3) {
+    RCLCPP_WARN_STREAM(
+      rclcpp::get_logger("lidar_bevfusion"),
+      "The size of voxel_size != 3: using the default parameters.");
+  }
+
+  // pre-process
+  const std::string densification_world_frame_id =
+    this->declare_parameter<std::string>("densification_world_frame_id", descriptor);
+  const int densification_num_past_frames =
+    this->declare_parameter<int>("densification_num_past_frames", descriptor);
+
+  // post-process
+  const float circle_nms_dist_threshold =
+    static_cast<float>(this->declare_parameter<double>("circle_nms_dist_threshold", descriptor));
+  {  // IoU NMS
+    NMSParams p;
+    p.nms_type_ = NMS_TYPE::IoU_BEV;
+    p.target_class_names_ =
+      this->declare_parameter<std::vector<std::string>>("iou_nms_target_class_names", descriptor);
+    p.search_distance_2d_ =
+      this->declare_parameter<double>("iou_nms_search_distance_2d", descriptor);
+    p.iou_threshold_ = this->declare_parameter<double>("iou_nms_threshold", descriptor);
+    iou_bev_nms_.setParameters(p);
+  }
+  const auto yaw_norm_thresholds =
+    this->declare_parameter<std::vector<double>>("yaw_norm_thresholds", descriptor);
+  const float score_threshold =
+    static_cast<float>(this->declare_parameter<double>("score_threshold", descriptor));
+
+  DensificationParam densification_param(
+    densification_world_frame_id, densification_num_past_frames);
+
+  BEVFusionConfig config(
+    sensor_fusion_, plugins_path, out_size_factor, cloud_capacity, max_points_per_voxel,
+    voxels_num, point_cloud_range, voxel_size, dbound, xbound, ybound, zbound, num_cameras,
+    raw_image_height, raw_image_width, img_aug_scale_x, img_aug_scale_y, roi_height, roi_width,
+    features_height, features_width, num_depth_features, num_proposals,
+    circle_nms_dist_threshold, yaw_norm_thresholds, score_threshold);
+
+  const auto allow_remapping_by_area_matrix =
+    this->declare_parameter<std::vector<std::int64_t>>(
+      "allow_remapping_by_area_matrix", descriptor);
+  const auto min_area_matrix =
+    this->declare_parameter<std::vector<double>>("min_area_matrix", descriptor);
+  const auto max_area_matrix =
+    this->declare_parameter<std::vector<double>>("max_area_matrix", descriptor);
+  detection_class_remapper_.setParameters(
+    allow_remapping_by_area_matrix, min_area_matrix, max_area_matrix);
trt_config =
+    tensorrt_common::TrtCommonConfig(onnx_path, trt_precision, engine_path, 1ULL << 32U);
+  detector_ptr_ = std::make_unique<BEVFusionTRT>(trt_config, densification_param, config);
+
+  cloud_sub_ = this->create_subscription<sensor_msgs::msg::PointCloud2>(
+    "~/input/pointcloud", rclcpp::SensorDataQoS{}.keep_last(1),
+    std::bind(&LidarBEVFusionNode::cloudCallback, this, std::placeholders::_1));
+
+  objects_pub_ = this->create_publisher<autoware_perception_msgs::msg::DetectedObjects>(
+    "~/output/objects", rclcpp::QoS(1));
+
+  if (sensor_fusion_) {
+    image_subs_.resize(num_cameras);
+    camera_info_subs_.resize(num_cameras);
+    image_msgs_.resize(num_cameras);
+    camera_info_msgs_.resize(num_cameras);
+    lidar2camera_extrinsics_.resize(num_cameras);
+
+    for (std::int64_t camera_id = 0; camera_id < num_cameras; ++camera_id) {
+      image_subs_[camera_id] = this->create_subscription<sensor_msgs::msg::Image>(
+        "~/input/image" + std::to_string(camera_id), rclcpp::SensorDataQoS{}.keep_last(1),
+        [this, camera_id](const sensor_msgs::msg::Image::ConstSharedPtr msg) {
+          this->imageCallback(msg, camera_id);
+        });
+
+      camera_info_subs_[camera_id] = this->create_subscription<sensor_msgs::msg::CameraInfo>(
+        "~/input/camera_info" + std::to_string(camera_id), rclcpp::SensorDataQoS{}.keep_last(1),
+        [this, camera_id](const sensor_msgs::msg::CameraInfo & msg) {
+          this->cameraInfoCallback(msg, camera_id);
+        });
+    }
+  }
+
+  published_time_pub_ = std::make_unique<autoware::universe_utils::PublishedTimePublisher>(this);
+
+  // initialize debug tool
+  {
+    using autoware::universe_utils::DebugPublisher;
+    using autoware::universe_utils::StopWatch;
+    stop_watch_ptr_ = std::make_unique<StopWatch<std::chrono::milliseconds>>();
+    debug_publisher_ptr_ = std::make_unique<DebugPublisher>(this, this->get_name());
+    stop_watch_ptr_->tic("cyclic");
+    stop_watch_ptr_->tic("processing/total");
+  }
+
+  if (this->declare_parameter("build_only", false, descriptor)) {
+    RCLCPP_INFO(this->get_logger(), "TensorRT engine was built. Shutting down the node.");
+    rclcpp::shutdown();
+  }
+}
+
+void LidarBEVFusionNode::cloudCallback(const sensor_msgs::msg::PointCloud2::ConstSharedPtr pc_msg)
+{
+  lidar_frame_ = pc_msg->header.frame_id;
+
+  if (sensor_fusion_ && (!extrinsics_available_ || !images_available_ || !intrinsics_available_)) {
+    return;
+  }
+
+  if (sensor_fusion_ && !intrinsics_extrinsics_precomputed_) {
+    std::vector<sensor_msgs::msg::CameraInfo> camera_info_msgs;
+    std::vector<Matrix4f> lidar2camera_extrinsics;
+
+    std::transform(
+      camera_info_msgs_.begin(), camera_info_msgs_.end(), std::back_inserter(camera_info_msgs),
+      [](const auto & opt) { return *opt; });
+
+    std::transform(
+      lidar2camera_extrinsics_.begin(), lidar2camera_extrinsics_.end(),
+      std::back_inserter(lidar2camera_extrinsics), [](const auto & opt) { return *opt; });
+
+    detector_ptr_->setIntrinsicsExtrinsics(camera_info_msgs, lidar2camera_extrinsics);
+    intrinsics_extrinsics_precomputed_ = true;
+  }
+
+  const auto objects_sub_count =
+    objects_pub_->get_subscription_count() + objects_pub_->get_intra_process_subscription_count();
+  if (objects_sub_count < 1) {
+    return;
+  }
+
+  if (stop_watch_ptr_) {
+    stop_watch_ptr_->toc("processing/total", true);
+  }
+
+  double lidar_stamp = rclcpp::Time(pc_msg->header.stamp).seconds();
+  camera_masks_.resize(camera_info_msgs_.size());
+  for (std::size_t i = 0; i < camera_masks_.size(); ++i) {
+    camera_masks_[i] =
+      (lidar_stamp - rclcpp::Time(image_msgs_[i]->header.stamp).seconds()) <
+          max_camera_lidar_delay_
+        ? 1.f
+        : 0.f;
+  }
+
+  std::vector<Box3D> det_boxes3d;
+  std::unordered_map<std::string, double> proc_timing;
+  bool is_success =
+    detector_ptr_->detect(pc_msg, image_msgs_, camera_masks_, tf_buffer_, det_boxes3d, proc_timing);
+  if (!is_success) {
+    return;
+  }
+
+  std::vector<autoware_perception_msgs::msg::DetectedObject> raw_objects;
+  raw_objects.reserve(det_boxes3d.size());
+  for (const auto & box3d : det_boxes3d) {
+    autoware_perception_msgs::msg::DetectedObject obj;
+    box3DToDetectedObject(box3d, class_names_, obj);
+    raw_objects.emplace_back(obj);
+  }
+
+  autoware_perception_msgs::msg::DetectedObjects output_msg;
+  output_msg.header = pc_msg->header;
+  output_msg.objects = iou_bev_nms_.apply(raw_objects);
+
+  detection_class_remapper_.mapClasses(output_msg);
+
+  if (objects_sub_count > 0) {
+    objects_pub_->publish(output_msg);
+    published_time_pub_->publish_if_subscribed(objects_pub_, output_msg.header.stamp);
+  }
+
+  // add processing time for debug
+  if (debug_publisher_ptr_ && stop_watch_ptr_) {
+    const double cyclic_time_ms = stop_watch_ptr_->toc("cyclic", true);
+    const double processing_time_ms = stop_watch_ptr_->toc("processing/total", true);
+    const double pipeline_latency_ms =
+      std::chrono::duration<double, std::milli>(
+        std::chrono::nanoseconds(
+          (this->get_clock()->now() - output_msg.header.stamp).nanoseconds()))
+        .count();
+    debug_publisher_ptr_->publish(
+      "debug/cyclic_time_ms", cyclic_time_ms);
+    debug_publisher_ptr_->publish(
+      "debug/pipeline_latency_ms", pipeline_latency_ms);
+    debug_publisher_ptr_->publish(
+      "debug/processing_time/total_ms", processing_time_ms);
+    for (const auto & [topic, time_ms] : proc_timing) {
+      debug_publisher_ptr_->publish(topic, time_ms);
+    }
+  }
+}
+
+void LidarBEVFusionNode::imageCallback(
+  const sensor_msgs::msg::Image::ConstSharedPtr msg, std::size_t camera_id)
+{
+  image_msgs_[camera_id] = msg;
+
+  std::size_t num_valid_images = std::count_if(
+    image_msgs_.begin(), image_msgs_.end(),
+    [](const auto & image_msg) { return image_msg != nullptr; });
+
+  images_available_ = num_valid_images == image_msgs_.size();
+}
+
+void LidarBEVFusionNode::cameraInfoCallback(
+  const sensor_msgs::msg::CameraInfo & msg, std::size_t camera_id)
+{
+  camera_info_msgs_[camera_id] = msg;
+
+  std::size_t num_valid_intrinsics = std::count_if(
+    camera_info_msgs_.begin(), camera_info_msgs_.end(),
+    [](const auto & opt) { return opt.has_value(); });
+
+  intrinsics_available_ = num_valid_intrinsics == camera_info_msgs_.size();
+
+  if (
+    lidar2camera_extrinsics_[camera_id].has_value() || !lidar_frame_.has_value() ||
+    extrinsics_available_) {
+    return;
+  }
+
+  try {
+    geometry_msgs::msg::TransformStamped transform_stamped;
+    transform_stamped =
+      tf_buffer_.lookupTransform(msg.header.frame_id, *lidar_frame_, msg.header.stamp);
+
+    Eigen::Matrix4f lidar2camera_transform =
+      tf2::transformToEigen(transform_stamped.transform).matrix().cast<float>();
+
+    Matrix4f lidar2camera_rowmajor_transform = lidar2camera_transform.eval();
+    lidar2camera_extrinsics_[camera_id] = lidar2camera_rowmajor_transform;
+  } catch (tf2::TransformException & ex) {
+    RCLCPP_WARN(this->get_logger(), "%s", ex.what());
+    return;
+  }
+
+  std::size_t num_valid_extrinsics = std::count_if(
+    lidar2camera_extrinsics_.begin(), lidar2camera_extrinsics_.end(),
+    [](const auto & opt) { return opt.has_value(); });
+
+  extrinsics_available_ = num_valid_extrinsics == lidar2camera_extrinsics_.size();
+}
+
+}  // namespace autoware::lidar_bevfusion
+
+#include "rclcpp_components/register_node_macro.hpp"
+
+RCLCPP_COMPONENTS_REGISTER_NODE(autoware::lidar_bevfusion::LidarBEVFusionNode) diff --git a/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/get_indice_pairs_implicit_gemm_plugin.cpp b/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/get_indice_pairs_implicit_gemm_plugin.cpp new file mode 100644 index 0000000000000..b69cb4e9710a0 --- /dev/null +++ b/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/get_indice_pairs_implicit_gemm_plugin.cpp @@ -0,0 +1,421 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "autoware/tensorrt_plugins/get_indice_pairs_implicit_gemm_plugin.hpp" + +#include "autoware/tensorrt_plugins/plugin_utils.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// GetIndicePairsImplicitGemm + +namespace nvinfer1 +{ +namespace plugin +{ + +GetIndicePairsImplicitGemmPlugin::GetIndicePairsImplicitGemmPlugin( + const std::string & name, GetIndicePairsImplicitGemmParameters const & params) +: layer_name_{name}, params_{params} +{ + initFieldsToSerialize(); +} + +void GetIndicePairsImplicitGemmPlugin::initFieldsToSerialize() +{ + data_to_serialize_.clear(); + data_to_serialize_.emplace_back("batch_size", ¶ms_.batch_size, PluginFieldType::kINT32, 1); + data_to_serialize_.emplace_back("algo", ¶ms_.algo, PluginFieldType::kINT32, 1); + data_to_serialize_.emplace_back("is_train", ¶ms_.is_train, PluginFieldType::kINT32, 1); + data_to_serialize_.emplace_back( + "dilation_dims", ¶ms_.dilation_dims, PluginFieldType::kDIMS, 1); + data_to_serialize_.emplace_back("ksize_dims", ¶ms_.ksize_dims, PluginFieldType::kDIMS, 1); + data_to_serialize_.emplace_back( + "out_padding_dims", ¶ms_.out_padding_dims, PluginFieldType::kDIMS, 1); + data_to_serialize_.emplace_back("padding_dims", ¶ms_.padding_dims, PluginFieldType::kDIMS, 1); + data_to_serialize_.emplace_back( + "spatial_shape_dims", ¶ms_.spatial_shape_dims, PluginFieldType::kDIMS, 1); + data_to_serialize_.emplace_back("stride_dims", ¶ms_.stride_dims, PluginFieldType::kDIMS, 1); + data_to_serialize_.emplace_back("subm", ¶ms_.subm, PluginFieldType::kINT32, 1); + data_to_serialize_.emplace_back("transpose", ¶ms_.transpose, PluginFieldType::kINT32, 1); + + fc_to_serialize_.nbFields = data_to_serialize_.size(); + fc_to_serialize_.fields = data_to_serialize_.data(); +} + +IPluginCapability * GetIndicePairsImplicitGemmPlugin::getCapabilityInterface( + PluginCapabilityType type) noexcept +{ + try { + if (type == PluginCapabilityType::kBUILD) { + return static_cast(this); + } + if (type == PluginCapabilityType::kRUNTIME) { + return static_cast(this); + } + PLUGIN_ASSERT(type == PluginCapabilityType::kCORE); + return static_cast(this); + } catch (std::exception const & e) { + caughtError(e); + } + return nullptr; +} + +IPluginV3 * GetIndicePairsImplicitGemmPlugin::clone() noexcept +{ + try { + IPluginV3 * const plugin{new 
GetIndicePairsImplicitGemmPlugin{layer_name_, params_}}; + return plugin; + } catch (std::exception const & e) { + caughtError(e); + } + return nullptr; +} + +char const * GetIndicePairsImplicitGemmPlugin::getPluginName() const noexcept +{ + return kGET_INDICE_PAIRS_IMPLICIT_GEMM_PLUGIN_NAME; +} + +char const * GetIndicePairsImplicitGemmPlugin::getPluginVersion() const noexcept +{ + return kGET_INDICE_PAIRS_IMPLICIT_GEMM_PLUGIN_VERSION; +} + +char const * GetIndicePairsImplicitGemmPlugin::getPluginNamespace() const noexcept +{ + return kGET_INDICE_PAIRS_IMPLICIT_GEMM_PLUGIN_NAMESPACE; +} + +std::int32_t GetIndicePairsImplicitGemmPlugin::getNbOutputs() const noexcept +{ + return 5; +} + +std::int32_t GetIndicePairsImplicitGemmPlugin::configurePlugin( + DynamicPluginTensorDesc const * in, std::int32_t num_inputs, DynamicPluginTensorDesc const * out, + std::int32_t num_outputs) noexcept +{ + // Validate input arguments. + PLUGIN_ASSERT(num_inputs == 1); + PLUGIN_ASSERT(num_outputs == 5); + PLUGIN_ASSERT(in[0].desc.dims.nbDims == 2); + PLUGIN_ASSERT(out[0].desc.dims.nbDims == 2); + PLUGIN_ASSERT(out[1].desc.dims.nbDims == 2); + PLUGIN_ASSERT(out[2].desc.dims.nbDims == 2); + PLUGIN_ASSERT(out[3].desc.dims.nbDims == 1); + PLUGIN_ASSERT(out[4].desc.dims.nbDims == 0); + + std::int64_t kernel_volume = 1; + for (const std::int64_t ksize : params_.ksize) { + kernel_volume *= ksize; + } + + PLUGIN_ASSERT(in[0].desc.dims.d[1] == 4); // coords + 1 + + PLUGIN_ASSERT(out[0].desc.dims.d[1] == 4); // coords + 1 + PLUGIN_ASSERT(out[0].desc.type == in[0].desc.type); + + PLUGIN_ASSERT(out[1].desc.dims.d[0] == kernel_volume); + PLUGIN_ASSERT(out[1].desc.type == in[0].desc.type); + + PLUGIN_ASSERT(out[2].desc.dims.d[1] == 1); + PLUGIN_ASSERT(out[2].desc.type == in[0].desc.type); + + PLUGIN_ASSERT(out[3].desc.type == in[0].desc.type); + + PLUGIN_ASSERT(out[4].desc.type == in[0].desc.type); + + return 0; +} + +bool GetIndicePairsImplicitGemmPlugin::supportsFormatCombination( + std::int32_t pos, DynamicPluginTensorDesc const * in_out, std::int32_t num_inputs, + std::int32_t num_outputs) noexcept +{ + PLUGIN_ASSERT(num_inputs == 1); + PLUGIN_ASSERT(num_outputs == 5); + + return ( + in_out[pos].desc.format == nvinfer1::TensorFormat::kLINEAR && + in_out[pos].desc.type == nvinfer1::DataType::kINT32); +} + +std::int32_t GetIndicePairsImplicitGemmPlugin::getOutputDataTypes( + DataType * output_types, std::int32_t num_outputs, DataType const * input_types, + std::int32_t num_inputs) const noexcept +{ + PLUGIN_ASSERT(num_inputs == 1); + PLUGIN_ASSERT(num_outputs == 5); + + output_types[0] = input_types[0]; + output_types[1] = input_types[0]; + output_types[2] = input_types[0]; + output_types[3] = input_types[0]; + output_types[4] = input_types[0]; + + return 0; +} + +std::int32_t GetIndicePairsImplicitGemmPlugin::getOutputShapes( + DimsExprs const * inputs, std::int32_t num_inputs, + [[maybe_unused]] DimsExprs const * shape_inputs, [[maybe_unused]] std::int32_t num_shape_inputs, + DimsExprs * outputs, std::int32_t num_outputs, IExprBuilder & expr_builder) noexcept +{ + PLUGIN_ASSERT(num_inputs == 1); + PLUGIN_ASSERT(num_outputs == 5); + PLUGIN_ASSERT(inputs[0].nbDims == 2); + + std::int64_t kernel_volume = 1; + + for (std::size_t i = 0; i < params_.ksize.size(); ++i) { + kernel_volume *= params_.ksize[i]; + } + + if (params_.subm) { + outputs[0] = inputs[0]; + outputs[0].d[1] = inputs[0].d[1]; + + outputs[1].nbDims = 2; + outputs[1].d[0] = expr_builder.constant(kernel_volume); + outputs[1].d[1] = inputs[0].d[0]; + + 
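+    // Submanifold branch: the output indices coincide with the input indices,
+    // so outputs 0-3 are expressed directly in terms of inputs[0].d[0] and
+    // stay data-independent. The non-subm branch below instead declares
+    // data-dependent extents via declareSizeTensor(), with output 4
+    // (num_activate_out) acting as the size tensor and out_inds_num_limit_ as
+    // its build-time upper bound.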
outputs[2].nbDims = 2; + outputs[2].d[0] = inputs[0].d[0]; + outputs[2].d[1] = expr_builder.constant(1); + + outputs[3].nbDims = 1; + outputs[3].d[0] = inputs[0].d[0]; + + } else { + auto opt_value = expr_builder.operation( + DimensionOperation::kCEIL_DIV, *inputs[0].d[0], *expr_builder.constant(2)); + + outputs[0].nbDims = 2; + outputs[0].d[0] = + expr_builder.declareSizeTensor(4, *opt_value, *expr_builder.constant(out_inds_num_limit_)); + outputs[0].d[1] = inputs[0].d[1]; + + outputs[1].nbDims = 2; + outputs[1].d[0] = expr_builder.constant(kernel_volume); + outputs[1].d[1] = + expr_builder.declareSizeTensor(4, *opt_value, *expr_builder.constant(out_inds_num_limit_)); + + outputs[2].nbDims = 2; + outputs[2].d[0] = + expr_builder.declareSizeTensor(4, *opt_value, *expr_builder.constant(out_inds_num_limit_)); + outputs[2].d[1] = expr_builder.constant(1); + + outputs[3].nbDims = 1; + outputs[3].d[0] = + expr_builder.declareSizeTensor(4, *opt_value, *expr_builder.constant(out_inds_num_limit_)); + } + + // num_activate_out + outputs[4].nbDims = 0; + + return 0; +} + +std::int32_t GetIndicePairsImplicitGemmPlugin::enqueue( + PluginTensorDesc const * input_desc, [[maybe_unused]] PluginTensorDesc const * output_desc, + void const * const * inputs, void * const * outputs, [[maybe_unused]] void * workspace, + cudaStream_t stream) noexcept +{ + using SpconvOps = spconvlib::spconv::csrc::sparse::all::SpconvOps; + using StaticAllocator = spconvlib::spconv::csrc::sparse::alloc::StaticAllocator; + + const bool is_subm = params_.subm; + const bool direct_table = true; + const bool use_direct_table = direct_table && !is_subm; + const int static_num_act_in = out_inds_num_limit_; + + std::vector ksize(params_.ksize.begin(), params_.ksize.end()); + std::vector stride(params_.stride.begin(), params_.stride.end()); + std::vector padding(params_.padding.begin(), params_.padding.end()); + std::vector dilation(params_.dilation.begin(), params_.dilation.end()); + std::vector input_dims(params_.spatial_shape.begin(), params_.spatial_shape.end()); + + auto out_dims = SpconvOps::get_conv_output_size(input_dims, ksize, stride, padding, dilation); + std::vector output_dims_i64(out_dims.begin(), out_dims.end()); + std::int64_t out_spatial_volume = std::accumulate( + output_dims_i64.begin(), output_dims_i64.end(), static_cast(1), + std::multiplies()); + + bool use_int64_hash_k = + out_spatial_volume >= static_cast(std::numeric_limits::max()); + + int kernel_volume = 1; + for (const auto & ksize : params_.ksize) { + kernel_volume *= ksize; + } + + auto max_act_out_theory = SpconvOps::get_handcrafted_max_act_out( + input_desc[0].dims.d[0], ksize, stride, padding, dilation); + + auto ws_tensors = SpconvOps::get_indice_gen_tensors_from_workspace( + reinterpret_cast(workspace), kernel_volume, out_inds_num_limit_, + is_subm ? out_inds_num_limit_ : out_inds_num_limit_, max_act_out_theory, is_subm, + use_int64_hash_k, use_direct_table); + + int pair_fwd_size_padded = is_subm ? input_desc[0].dims.d[0] : out_inds_num_limit_; + tv::Tensor pair_fwd_padded = + tv::from_blob(outputs[1], {kernel_volume, pair_fwd_size_padded}, tv::int32, 0); + + bool is_split_mask = + params_.algo == static_cast(tv::gemm::SparseConvAlgo::kMaskSplitImplicitGemm); + int mask_count = is_split_mask ? 
2 : 1; + + tv::Tensor pair_mask_fwd_padded = + tv::from_blob(outputs[2], {mask_count, pair_fwd_size_padded}, tv::int32, 0); + tv::Tensor mask_argsort_fwd_padded = + tv::from_blob(outputs[3], {mask_count, pair_fwd_size_padded}, tv::int32, 0); + tv::Tensor out_inds = tv::from_blob( + outputs[0], {is_subm ? input_desc[0].dims.d[0] : out_inds_num_limit_, 4}, tv::int32, 0); + tv::Tensor indices_kernel_num = tv::zeros({kernel_volume}, tv::int32, 0); + + tv::Tensor input_indices = tv::from_blob(inputs[0], {input_desc[0].dims.d[0], 4}, tv::int32, 0); + + std::tuple pair_res; + + tv::Context ctx; + ctx.set_cuda_stream_int(reinterpret_cast(stream)); + + if (is_subm) { + out_inds.copy_(input_indices, ctx); + + ws_tensors.insert({SPCONV_ALLOC_PAIR_FWD, pair_fwd_padded}); + ws_tensors.insert({SPCONV_ALLOC_PAIR_MASK, pair_mask_fwd_padded}); + ws_tensors.insert({SPCONV_ALLOC_MASK_ARG_SORT, mask_argsort_fwd_padded}); + ws_tensors.insert({SPCONV_ALLOC_OUT_INDICES, out_inds}); + ws_tensors.insert({SPCONV_ALLOC_INDICE_NUM_PER_LOC, indices_kernel_num}); + StaticAllocator alloc(ws_tensors); + + pair_res = SpconvOps::get_indice_pairs_implicit_gemm( + alloc, input_indices, params_.batch_size, input_dims, static_cast(params_.algo), ksize, + stride, padding, dilation, {0, 0, 0}, params_.subm, params_.transpose, false /*is_train*/, + reinterpret_cast(stream), out_inds_num_limit_, tv::CUDAKernelTimer(false), + use_direct_table); + + } else { + tv::Tensor pair_bwd_padded = tv::empty({kernel_volume, static_num_act_in}, tv::int32, 0); + tv::Tensor pair_mask_bwd_padded = tv::empty({mask_count, static_num_act_in}, tv::int32, 0); + tv::Tensor mask_argsort_bwd_padded = tv::empty({mask_count, static_num_act_in}, tv::int32, 0); + + ws_tensors.insert({SPCONV_ALLOC_PAIR_FWD, pair_fwd_padded}); + ws_tensors.insert({SPCONV_ALLOC_PAIR_BWD, pair_bwd_padded}); + + ws_tensors.insert({SPCONV_ALLOC_PAIR_MASK, pair_mask_fwd_padded}); + ws_tensors.insert({SPCONV_ALLOC_PAIR_MASK_BWD, pair_mask_bwd_padded}); + + ws_tensors.insert({SPCONV_ALLOC_MASK_ARG_SORT, mask_argsort_fwd_padded}); + ws_tensors.insert({SPCONV_ALLOC_MASK_ARG_SORT_BWD, mask_argsort_bwd_padded}); + + ws_tensors.insert({SPCONV_ALLOC_OUT_INDICES, out_inds}); + ws_tensors.insert({SPCONV_ALLOC_INDICE_NUM_PER_LOC, indices_kernel_num}); + + StaticAllocator alloc(ws_tensors); + + pair_res = SpconvOps::get_indice_pairs_implicit_gemm( + alloc, input_indices, params_.batch_size, input_dims, static_cast(params_.algo), ksize, + stride, padding, dilation, {0, 0, 0}, params_.subm, params_.transpose, false /*is_train*/, + reinterpret_cast(stream), out_inds_num_limit_, tv::CUDAKernelTimer(false), + use_direct_table); + } + + std::int32_t num_act_out_real = std::get<1>(pair_res); + std::int32_t * num_act_out_data = static_cast(outputs[4]); + + cudaError_t const status = cudaMemcpyAsync( + num_act_out_data, &num_act_out_real, sizeof(std::int32_t), cudaMemcpyHostToDevice, stream); + + return status; +} + +std::int32_t GetIndicePairsImplicitGemmPlugin::onShapeChange( + [[maybe_unused]] PluginTensorDesc const * in, [[maybe_unused]] std::int32_t num_inputs, + [[maybe_unused]] PluginTensorDesc const * out, [[maybe_unused]] std::int32_t num_outputs) noexcept +{ + return 0; +} + +IPluginV3 * GetIndicePairsImplicitGemmPlugin::attachToContext( + [[maybe_unused]] IPluginResourceContext * context) noexcept +{ + return clone(); +} + +PluginFieldCollection const * GetIndicePairsImplicitGemmPlugin::getFieldsToSerialize() noexcept +{ + return &fc_to_serialize_; +} + +std::size_t 
GetIndicePairsImplicitGemmPlugin::getWorkspaceSize( + [[maybe_unused]] DynamicPluginTensorDesc const * inputs, [[maybe_unused]] std::int32_t num_inputs, + [[maybe_unused]] DynamicPluginTensorDesc const * outputs, + [[maybe_unused]] std::int32_t num_outputs) const noexcept +{ + using SpconvOps = spconvlib::spconv::csrc::sparse::all::SpconvOps; + + bool is_subm = params_.subm; + const bool direct_table = true; + const bool use_direct_table = direct_table && !is_subm; + + std::vector ksize(params_.ksize.begin(), params_.ksize.end()); + std::vector stride(params_.stride.begin(), params_.stride.end()); + std::vector padding(params_.padding.begin(), params_.padding.end()); + std::vector dilation(params_.dilation.begin(), params_.dilation.end()); + std::vector input_dims(params_.spatial_shape.begin(), params_.spatial_shape.end()); + + auto out_dims = SpconvOps::get_conv_output_size(input_dims, ksize, stride, padding, dilation); + std::vector output_dims_i64(out_dims.begin(), out_dims.end()); + std::int64_t out_spatial_volume = std::accumulate( + output_dims_i64.begin(), output_dims_i64.end(), static_cast(1), + std::multiplies()); + bool use_int64_hash_k = + out_spatial_volume >= static_cast(std::numeric_limits::max()); + + int kernel_volume = 1; + for (const auto & ksize : params_.ksize) { + kernel_volume *= ksize; + } + + // query workspace size. + int workspace_size = SpconvOps::get_indice_gen_workspace_size( + kernel_volume, out_inds_num_limit_, out_inds_num_limit_, out_inds_num_limit_, is_subm, + use_int64_hash_k, use_direct_table); + + return static_cast(workspace_size); +} + +} // namespace plugin +} // namespace nvinfer1 diff --git a/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/get_indice_pairs_implicit_gemm_plugin_creator.cpp b/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/get_indice_pairs_implicit_gemm_plugin_creator.cpp new file mode 100644 index 0000000000000..2ddb213cbd265 --- /dev/null +++ b/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/get_indice_pairs_implicit_gemm_plugin_creator.cpp @@ -0,0 +1,324 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
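getWorkspaceSize above runs at engine-build time, so the spconv scratch estimate may depend only on build-time constants such as out_inds_num_limit_, never on the runtime voxel count. A minimal sketch of that contract (bound and names illustrative, not this package's API):

#include <cstddef>
#include <cstdint>

// Worst-case scratch for one int32 pair table of kernel_volume x max_indices
// entries; a fixed bound lets TensorRT allocate the workspace once up front.
std::size_t worst_case_workspace_bytes(std::int64_t kernel_volume, std::int64_t max_indices)
{
  return sizeof(std::int32_t) * static_cast<std::size_t>(kernel_volume) *
         static_cast<std::size_t>(max_indices);
}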
+ +#include "autoware/tensorrt_plugins//get_indice_pairs_implicit_gemm_plugin_creator.hpp" + +#include "autoware/tensorrt_plugins//get_indice_pairs_implicit_gemm_plugin.hpp" +#include "autoware/tensorrt_plugins/plugin_utils.hpp" + +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace nvinfer1 +{ +namespace plugin +{ + +REGISTER_TENSORRT_PLUGIN(GetIndicePairsImplicitGemmPluginCreator); + +GetIndicePairsImplicitGemmPluginCreator::GetIndicePairsImplicitGemmPluginCreator() +{ + std::cout << "GetIndicePairsImplicitGemmPluginCreator::GetIndicePairsImplicitGemmPluginCreator" + << std::endl + << std::flush; + + plugin_attributes_.clear(); + plugin_attributes_.emplace_back( + nvinfer1::PluginField("algo", nullptr, PluginFieldType::kINT32, 1)); + plugin_attributes_.emplace_back( + nvinfer1::PluginField("batch_size", nullptr, PluginFieldType::kINT32, 1)); + plugin_attributes_.emplace_back( + nvinfer1::PluginField("dilation", nullptr, PluginFieldType::kINT32, 3)); + plugin_attributes_.emplace_back( + nvinfer1::PluginField("is_train", nullptr, PluginFieldType::kINT32, 1)); + plugin_attributes_.emplace_back( + nvinfer1::PluginField("ksize", nullptr, PluginFieldType::kINT32, 3)); + plugin_attributes_.emplace_back( + nvinfer1::PluginField("out_padding", nullptr, PluginFieldType::kINT32, 3)); + plugin_attributes_.emplace_back( + nvinfer1::PluginField("padding", nullptr, PluginFieldType::kINT32, 3)); + plugin_attributes_.emplace_back( + nvinfer1::PluginField("spatial_shape", nullptr, PluginFieldType::kINT32, 3)); + plugin_attributes_.emplace_back( + nvinfer1::PluginField("stride", nullptr, PluginFieldType::kINT32, 3)); + plugin_attributes_.emplace_back( + nvinfer1::PluginField("subm", nullptr, PluginFieldType::kINT32, 1)); + plugin_attributes_.emplace_back( + nvinfer1::PluginField("transpose", nullptr, PluginFieldType::kINT32, 1)); + + fc_.nbFields = plugin_attributes_.size(); + fc_.fields = plugin_attributes_.data(); +} + +nvinfer1::PluginFieldCollection const * +GetIndicePairsImplicitGemmPluginCreator::getFieldNames() noexcept +{ + // This is only used in the build phase. + return &fc_; +} + +IPluginV3 * GetIndicePairsImplicitGemmPluginCreator::createPlugin( + char const * name, PluginFieldCollection const * fc, TensorRTPhase phase) noexcept +{ + // The build phase and the deserialization phase are handled differently. + if (phase == TensorRTPhase::kBUILD || phase == TensorRTPhase::kRUNTIME) { + // The attributes from the ONNX node will be parsed and passed via fc. 
+ try { + nvinfer1::PluginField const * fields{fc->fields}; + std::int32_t num_fields{fc->nbFields}; + + PLUGIN_VALIDATE(num_fields == 11); + + GetIndicePairsImplicitGemmParameters parameters; + + for (std::int32_t i{0}; i < num_fields; ++i) { + const std::string attr_name = fields[i].name; + const nvinfer1::PluginFieldType type = fields[i].type; + + if (attr_name == "batch_size") { + PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kINT32); + parameters.batch_size = static_cast(fields[i].data)[0]; + } + if (attr_name == "algo") { + PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kINT32); + parameters.algo = static_cast(fields[i].data)[0]; + } + if (attr_name == "is_train") { + PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kINT32); + parameters.is_train = static_cast(fields[i].data)[0]; + } + if (attr_name == "dilation") { + PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kINT32); + std::int32_t const * const dilation_data{ + static_cast(fields[i].data)}; + + parameters.dilation_dims.nbDims = fields[i].length; + for (std::int32_t j{0}; j < fields[i].length; ++j) { + parameters.dilation.push_back(dilation_data[j]); + parameters.dilation_dims.d[j] = static_cast(dilation_data[j]); + } + } + if (attr_name == "dilation_dims") { + PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kDIMS); + parameters.dilation_dims = static_cast(fields[i].data)[0]; + for (std::int32_t j{0}; j < parameters.dilation_dims.nbDims; ++j) { + parameters.dilation.push_back(static_cast(parameters.dilation_dims.d[j])); + } + } + if (attr_name == "ksize") { + PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kINT32); + std::int32_t const * const ksize_data{static_cast(fields[i].data)}; + parameters.ksize_dims.nbDims = fields[i].length; + for (std::int32_t j{0}; j < fields[i].length; ++j) { + parameters.ksize.push_back(ksize_data[j]); + parameters.ksize_dims.d[j] = static_cast(ksize_data[j]); + } + } + if (attr_name == "ksize_dims") { + PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kDIMS); + parameters.ksize_dims = static_cast(fields[i].data)[0]; + for (std::int32_t j{0}; j < parameters.ksize_dims.nbDims; ++j) { + parameters.ksize.push_back(static_cast(parameters.ksize_dims.d[j])); + } + } + if (attr_name == "out_padding") { + PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kINT32); + std::int32_t const * const out_padding_data{ + static_cast(fields[i].data)}; + parameters.out_padding_dims.nbDims = fields[i].length; + for (std::int32_t j{0}; j < fields[i].length; ++j) { + parameters.out_padding.push_back(out_padding_data[j]); + parameters.out_padding_dims.d[j] = static_cast(out_padding_data[j]); + } + } + if (attr_name == "out_padding_dims") { + PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kDIMS); + parameters.out_padding_dims = static_cast(fields[i].data)[0]; + for (std::int32_t j{0}; j < parameters.out_padding_dims.nbDims; ++j) { + parameters.out_padding.push_back( + static_cast(parameters.out_padding_dims.d[j])); + } + } + if (attr_name == "padding") { + PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kINT32); + std::int32_t const * const padding_data{ + static_cast(fields[i].data)}; + parameters.padding_dims.nbDims = fields[i].length; + for (std::int32_t j{0}; j < fields[i].length; ++j) { + parameters.padding.push_back(padding_data[j]); + parameters.padding_dims.d[j] = static_cast(padding_data[j]); + } + } + if (attr_name == "padding_dims") { + PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kDIMS); + parameters.padding_dims = static_cast(fields[i].data)[0]; + for (std::int32_t j{0}; j 
< parameters.padding_dims.nbDims; ++j) { + parameters.padding.push_back(static_cast(parameters.padding_dims.d[j])); + } + } + if (attr_name == "spatial_shape") { + PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kINT32); + std::int32_t const * const spatial_shape_data{ + static_cast(fields[i].data)}; + parameters.spatial_shape_dims.nbDims = fields[i].length; + for (std::int32_t j{0}; j < fields[i].length; ++j) { + parameters.spatial_shape.push_back(spatial_shape_data[j]); + parameters.spatial_shape_dims.d[j] = static_cast(spatial_shape_data[j]); + } + } + if (attr_name == "spatial_shape_dims") { + PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kDIMS); + parameters.spatial_shape_dims = static_cast(fields[i].data)[0]; + for (std::int32_t j{0}; j < parameters.spatial_shape_dims.nbDims; ++j) { + parameters.spatial_shape.push_back( + static_cast(parameters.spatial_shape_dims.d[j])); + } + } + if (attr_name == "stride") { + PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kINT32); + std::int32_t const * const stride_data{static_cast(fields[i].data)}; + parameters.stride_dims.nbDims = fields[i].length; + for (std::int32_t j{0}; j < fields[i].length; ++j) { + parameters.stride.push_back(stride_data[j]); + parameters.stride_dims.d[j] = static_cast(stride_data[j]); + } + } + if (attr_name == "stride_dims") { + PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kDIMS); + parameters.stride_dims = static_cast(fields[i].data)[0]; + for (std::int32_t j{0}; j < parameters.stride_dims.nbDims; ++j) { + parameters.stride.push_back(static_cast(parameters.stride_dims.d[j])); + } + } + if (attr_name == "subm") { + PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kINT32); + parameters.subm = static_cast(fields[i].data)[0]; + } + if (attr_name == "transpose") { + PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kINT32); + parameters.transpose = static_cast(fields[i].data)[0]; + } + } + + // Log the attributes parsed from ONNX node. 
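+      // Both the "ksize"-style and "ksize_dims"-style attributes are handled
+      // above because the two phases encode the same data differently: the
+      // ONNX importer emits raw kINT32 arrays, while getFieldsToSerialize()
+      // stores the kDIMS mirrors, and either form repopulates both
+      // representations.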
+      std::stringstream ss;
+      ss << name << " plugin Attributes:";
+      logDebug(ss.str().c_str());
+
+      ss.str("");
+      ss << "batch_size: " << parameters.batch_size;
+      logDebug(ss.str().c_str());
+
+      ss.str("");
+      ss << "algo: " << parameters.algo;
+      logDebug(ss.str().c_str());
+
+      ss.str("");
+      ss << "is_train: " << parameters.is_train;
+      logDebug(ss.str().c_str());
+
+      ss.str("");
+      ss << "dilation: ";
+      for (auto const & val : parameters.dilation) {
+        ss << val << " ";
+      }
+      logDebug(ss.str().c_str());
+
+      ss.str("");
+      ss << "ksize: ";
+      for (auto const & val : parameters.ksize) {
+        ss << val << " ";
+      }
+      logDebug(ss.str().c_str());
+
+      ss.str("");
+      ss << "out_padding: ";
+      for (auto const & val : parameters.out_padding) {
+        ss << val << " ";
+      }
+      logDebug(ss.str().c_str());
+
+      ss.str("");
+      ss << "padding: ";
+      for (auto const & val : parameters.padding) {
+        ss << val << " ";
+      }
+      logDebug(ss.str().c_str());
+
+      ss.str("");
+      ss << "spatial_shape: ";
+      for (auto const & val : parameters.spatial_shape) {
+        ss << val << " ";
+      }
+      logDebug(ss.str().c_str());
+
+      ss.str("");
+      ss << "stride: ";
+      for (auto const & val : parameters.stride) {
+        ss << val << " ";
+      }
+      logDebug(ss.str().c_str());
+
+      ss.str("");
+      ss << "subm: " << parameters.subm;
+      logDebug(ss.str().c_str());
+
+      ss.str("");
+      ss << "transpose: " << parameters.transpose;
+      logDebug(ss.str().c_str());
+
+      GetIndicePairsImplicitGemmPlugin * const plugin{
+        new GetIndicePairsImplicitGemmPlugin{std::string(name), parameters}};
+      return plugin;
+    } catch (std::exception const & e) {
+      caughtError(e);
+    }
+    return nullptr;
+  } else if (phase == TensorRTPhase::kRUNTIME) {
+    // The attributes from the serialized plugin will be passed via fc.
+    try {
+      nvinfer1::PluginField const * fields{fc->fields};
+      std::int32_t num_fields{fc->nbFields};
+      PLUGIN_VALIDATE(num_fields == 1);
+
+      char const * attr_name = fields[0].name;
+      PLUGIN_VALIDATE(!strcmp(attr_name, "parameters"));
+      PLUGIN_VALIDATE(fields[0].type == nvinfer1::PluginFieldType::kUNKNOWN);
+      PLUGIN_VALIDATE(fields[0].length == sizeof(GetIndicePairsImplicitGemmParameters));
+      GetIndicePairsImplicitGemmParameters params{
+        *(static_cast<GetIndicePairsImplicitGemmParameters const *>(fields[0].data))};
+
+      GetIndicePairsImplicitGemmPlugin * const plugin{
+        new GetIndicePairsImplicitGemmPlugin{std::string(name), params}};
+      return plugin;
+    } catch (std::exception const & e) {
+      caughtError(e);
+    }
+    return nullptr;
+  } else {
+    return nullptr;
+  }
+}
+
+}  // namespace plugin
+}  // namespace nvinfer1
diff --git a/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/implicit_gemm_plugin.cpp b/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/implicit_gemm_plugin.cpp
new file mode 100644
index 0000000000000..fc535fe3534b8
--- /dev/null
+++ b/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/implicit_gemm_plugin.cpp
@@ -0,0 +1,314 @@
+// Copyright 2025 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
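Because the first branch of createPlugin above accepts both kBUILD and kRUNTIME, the later `else if (phase == TensorRTPhase::kRUNTIME)` blob path is unreachable; deserialization goes through the same named fields that getFieldsToSerialize() emits. A small lookup helper in that spirit, a sketch rather than this package's API:

#include <NvInferRuntime.h>

#include <cstdint>
#include <cstring>

// Finds a named attribute in a PluginFieldCollection; returning nullptr for
// absent fields lets one parser serve both the build and runtime phases.
nvinfer1::PluginField const * find_field(
  nvinfer1::PluginFieldCollection const * fc, char const * name)
{
  for (std::int32_t i = 0; i < fc->nbFields; ++i) {
    if (std::strcmp(fc->fields[i].name, name) == 0) {
      return &fc->fields[i];
    }
  }
  return nullptr;
}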
+ +#include "autoware/tensorrt_plugins/implicit_gemm_plugin.hpp" + +#include "autoware/tensorrt_plugins/plugin_utils.hpp" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// ImplicitGemm + +namespace nvinfer1 +{ +namespace plugin +{ + +ImplicitGemmPlugin::ImplicitGemmPlugin( + const std::string & name, ImplicitGemmParameters const & params) +: layer_name_{name}, params_{params} +{ + using ConvGemmOps = spconvlib::spconv::csrc::sparse::convops::spops::ConvGemmOps; + using ConvMain = spconvlib::cumm::conv::main::ConvMainUnitTest; + + initFieldsToSerialize(); + + arch_ = ConvGemmOps::get_compute_capability(); + tunner_ptr_ = std::make_unique(ConvMain::get_all_conv_algo_desp()); +} + +void ImplicitGemmPlugin::initFieldsToSerialize() +{ + data_to_serialize_.clear(); + data_to_serialize_.emplace_back("act_alpha", ¶ms_.act_alpha, PluginFieldType::kFLOAT32, 1); + data_to_serialize_.emplace_back("act_alpha", ¶ms_.act_beta, PluginFieldType::kFLOAT32, 1); + + data_to_serialize_.emplace_back("is_subm", ¶ms_.is_subm, PluginFieldType::kINT32, 1); + data_to_serialize_.emplace_back("is_train", ¶ms_.is_train, PluginFieldType::kINT32, 1); + + data_to_serialize_.emplace_back( + "output_add_scale", ¶ms_.output_add_scale, PluginFieldType::kFLOAT32, 1); + data_to_serialize_.emplace_back( + "output_scale", ¶ms_.output_scale, PluginFieldType::kFLOAT32, 1); + + fc_to_serialize_.nbFields = data_to_serialize_.size(); + fc_to_serialize_.fields = data_to_serialize_.data(); +} + +IPluginCapability * ImplicitGemmPlugin::getCapabilityInterface(PluginCapabilityType type) noexcept +{ + try { + if (type == PluginCapabilityType::kBUILD) { + return static_cast(this); + } + if (type == PluginCapabilityType::kRUNTIME) { + return static_cast(this); + } + PLUGIN_ASSERT(type == PluginCapabilityType::kCORE); + return static_cast(this); + } catch (std::exception const & e) { + caughtError(e); + } + return nullptr; +} + +IPluginV3 * ImplicitGemmPlugin::clone() noexcept +{ + try { + IPluginV3 * const plugin{new ImplicitGemmPlugin{layer_name_, params_}}; + return plugin; + } catch (std::exception const & e) { + caughtError(e); + } + return nullptr; +} + +char const * ImplicitGemmPlugin::getPluginName() const noexcept +{ + return kIMPLICIT_GEMM_PLUGIN_NAME; +} + +char const * ImplicitGemmPlugin::getPluginVersion() const noexcept +{ + return kIMPLICIT_GEMM_PLUGIN_VERSION; +} + +char const * ImplicitGemmPlugin::getPluginNamespace() const noexcept +{ + return kIMPLICIT_GEMM_PLUGIN_NAMESPACE; +} + +std::int32_t ImplicitGemmPlugin::getNbOutputs() const noexcept +{ + return 1; +} + +std::int32_t ImplicitGemmPlugin::configurePlugin( + DynamicPluginTensorDesc const * in, std::int32_t num_inputs, DynamicPluginTensorDesc const * out, + std::int32_t num_outputs) noexcept +{ + // Validate input arguments. 
+ PLUGIN_ASSERT(num_inputs == 5); + PLUGIN_ASSERT(num_outputs == 1); + PLUGIN_ASSERT(in[INOUT_IN_FEATURES_INDEX].desc.dims.nbDims == 2); + PLUGIN_ASSERT(in[INOUT_FILTERS_INDEX].desc.dims.nbDims == 5); + PLUGIN_ASSERT(in[INOUT_PAIR_FWD_INDEX].desc.dims.nbDims == 2); + PLUGIN_ASSERT(in[INOUT_PAIR_MASK_FWD_SPLITS_INDEX].desc.dims.nbDims == 2); + PLUGIN_ASSERT(in[INOUT_MASK_ARGSORT_FWD_SPLITS_INDEX].desc.dims.nbDims == 1); + PLUGIN_ASSERT(out[0].desc.dims.nbDims == 2); + PLUGIN_ASSERT( + in[INOUT_FILTERS_INDEX].desc.dims.d[4] == in[INOUT_IN_FEATURES_INDEX].desc.dims.d[1]); + PLUGIN_ASSERT( + in[INOUT_PAIR_MASK_FWD_SPLITS_INDEX].desc.dims.d[0] == in[INOUT_PAIR_FWD_INDEX].desc.dims.d[1]); + PLUGIN_ASSERT(in[INOUT_PAIR_MASK_FWD_SPLITS_INDEX].desc.dims.d[1] == 1); + PLUGIN_ASSERT( + in[INOUT_MASK_ARGSORT_FWD_SPLITS_INDEX].desc.dims.d[0] == + in[INOUT_PAIR_FWD_INDEX].desc.dims.d[1]); + PLUGIN_ASSERT(in[INOUT_IN_FEATURES_INDEX].desc.type == in[INOUT_FILTERS_INDEX].desc.type); + PLUGIN_ASSERT(in[INOUT_IN_FEATURES_INDEX].desc.type == out[0].desc.type); + PLUGIN_ASSERT( + in[INOUT_PAIR_FWD_INDEX].desc.type == in[INOUT_PAIR_MASK_FWD_SPLITS_INDEX].desc.type); + PLUGIN_ASSERT( + in[INOUT_PAIR_FWD_INDEX].desc.type == in[INOUT_MASK_ARGSORT_FWD_SPLITS_INDEX].desc.type); + + return 0; +} + +bool ImplicitGemmPlugin::supportsFormatCombination( + std::int32_t pos, DynamicPluginTensorDesc const * in_out, std::int32_t num_inputs, + std::int32_t num_outputs) noexcept +{ + PLUGIN_ASSERT(num_inputs == 5); + PLUGIN_ASSERT(num_outputs == 1); + + bool supported = in_out[pos].desc.format == nvinfer1::TensorFormat::kLINEAR; + + switch (pos) { + case INOUT_IN_FEATURES_INDEX: + case INOUT_FILTERS_INDEX: + case INOUT_OUT_FEATURES_INDEX: + supported &= in_out[pos].desc.type == nvinfer1::DataType::kFLOAT; + break; + case INOUT_PAIR_FWD_INDEX: + case INOUT_PAIR_MASK_FWD_SPLITS_INDEX: + case INOUT_MASK_ARGSORT_FWD_SPLITS_INDEX: + supported &= in_out[pos].desc.type == nvinfer1::DataType::kINT32; + break; + default: + supported = false; + break; + } + + return supported; +} + +std::int32_t ImplicitGemmPlugin::getOutputDataTypes( + DataType * output_types, std::int32_t num_outputs, DataType const * input_types, + std::int32_t num_inputs) const noexcept +{ + PLUGIN_ASSERT(num_inputs == 5); + PLUGIN_ASSERT(num_outputs == 1); + + output_types[0] = input_types[INOUT_IN_FEATURES_INDEX]; + + return 0; +} + +std::int32_t ImplicitGemmPlugin::getOutputShapes( + DimsExprs const * inputs, std::int32_t num_inputs, + [[maybe_unused]] DimsExprs const * shape_inputs, [[maybe_unused]] std::int32_t num_shape_inputs, + DimsExprs * outputs, std::int32_t num_outputs, + [[maybe_unused]] IExprBuilder & expr_builder) noexcept +{ + PLUGIN_ASSERT(num_inputs == 5); + PLUGIN_ASSERT(num_outputs == 1); + PLUGIN_ASSERT(inputs[0].nbDims == 2); + + outputs[0].nbDims = 2; + outputs[0].d[0] = inputs[3].d[0]; + outputs[0].d[1] = inputs[1].d[0]; + + return 0; +} + +std::int32_t ImplicitGemmPlugin::enqueue( + PluginTensorDesc const * input_desc, [[maybe_unused]] PluginTensorDesc const * output_desc, + void const * const * inputs, void * const * outputs, [[maybe_unused]] void * workspace, + cudaStream_t stream) noexcept +{ + using StaticAllocator = spconvlib::spconv::csrc::sparse::alloc::StaticAllocator; + using ConvGemmOps = spconvlib::spconv::csrc::sparse::convops::spops::ConvGemmOps; + + std::int64_t num_act_in = input_desc[INOUT_IN_FEATURES_INDEX].dims.d[0]; + std::int64_t num_in_features = input_desc[INOUT_IN_FEATURES_INDEX].dims.d[1]; + // std::int64_t 
kernel_volume = input_desc[INOUT_PAIR_FWD_INDEX].dims.d[0]; + std::int64_t num_act_out = input_desc[INOUT_PAIR_FWD_INDEX].dims.d[1]; + std::int64_t num_out_features = input_desc[INOUT_FILTERS_INDEX].dims.d[0]; + + tv::Tensor input_features = + tv::from_blob(inputs[INOUT_IN_FEATURES_INDEX], {num_act_in, num_in_features}, tv::float32, 0); + + tv::Tensor weights = tv::from_blob( + inputs[INOUT_FILTERS_INDEX], + {input_desc[INOUT_FILTERS_INDEX].dims.d[0], input_desc[INOUT_FILTERS_INDEX].dims.d[1], + input_desc[INOUT_FILTERS_INDEX].dims.d[2], input_desc[INOUT_FILTERS_INDEX].dims.d[3], + input_desc[INOUT_FILTERS_INDEX].dims.d[4]}, + tv::float32, 0); + + tv::Tensor pair_fwd = tv::from_blob( + inputs[INOUT_PAIR_FWD_INDEX], + {input_desc[INOUT_PAIR_FWD_INDEX].dims.d[0], input_desc[INOUT_PAIR_FWD_INDEX].dims.d[1]}, + tv::int32, 0); + + tv::Tensor pair_mask_fwd_splits = tv::from_blob( + inputs[INOUT_PAIR_MASK_FWD_SPLITS_INDEX], + {1, input_desc[INOUT_PAIR_MASK_FWD_SPLITS_INDEX].dims.d[0]}, tv::int32, 0); + + tv::Tensor mask_argsort_fwd_splits = tv::from_blob( + inputs[INOUT_MASK_ARGSORT_FWD_SPLITS_INDEX], + { + 1, + input_desc[INOUT_MASK_ARGSORT_FWD_SPLITS_INDEX].dims.d[0], + }, + tv::int32, 0); + + PLUGIN_ASSERT( + input_desc[INOUT_PAIR_FWD_INDEX].dims.d[1] == + input_desc[INOUT_MASK_ARGSORT_FWD_SPLITS_INDEX].dims.d[0]); + PLUGIN_ASSERT(input_desc[INOUT_MASK_ARGSORT_FWD_SPLITS_INDEX].dims.nbDims == 1); + + tv::Tensor out_features = + tv::from_blob(outputs[0], {num_act_out, num_out_features}, tv::float32, 0); + + tv::Tensor mask_tensor = tv::zeros({1}, tv::uint32, -1); + + auto mask_tensor_ptr = mask_tensor.data_ptr(); + mask_tensor_ptr[0] = 0xffffffff; + + std::vector pair_mask_splits; + std::vector mask_argsort_splits; + + pair_mask_splits.push_back(pair_mask_fwd_splits); + mask_argsort_splits.push_back(mask_argsort_fwd_splits); + + std::unordered_map tensor_dict{ + {SPCONV_ALLOC_FEATURES, input_features}, + {SPCONV_ALLOC_FILTERS, weights}, + {SPCONV_ALLOC_OUT_FEATURES, out_features}}; + StaticAllocator alloc2(tensor_dict); + + auto conv_run_status = ConvGemmOps::implicit_gemm( + alloc2, *tunner_ptr_, input_features, weights, pair_fwd, pair_mask_splits, mask_argsort_splits, + num_act_out, mask_tensor, arch_, false, params_.is_subm, + reinterpret_cast(stream), tv::CUDAKernelTimer(false), true, false, tv::Tensor(), + 0.0, 0.0, tv::gemm::Activation::kNone, false, 1.0, tv::Tensor(), tv::Tensor(), 0.0, 0); + + return 0; +} + +std::int32_t ImplicitGemmPlugin::onShapeChange( + [[maybe_unused]] PluginTensorDesc const * in, [[maybe_unused]] std::int32_t num_inputs, + [[maybe_unused]] PluginTensorDesc const * out, [[maybe_unused]] std::int32_t num_outputs) noexcept +{ + return 0; +} + +IPluginV3 * ImplicitGemmPlugin::attachToContext( + [[maybe_unused]] IPluginResourceContext * context) noexcept +{ + return clone(); +} + +PluginFieldCollection const * ImplicitGemmPlugin::getFieldsToSerialize() noexcept +{ + return &fc_to_serialize_; +} + +std::size_t ImplicitGemmPlugin::getWorkspaceSize( + [[maybe_unused]] DynamicPluginTensorDesc const * inputs, [[maybe_unused]] std::int32_t num_inputs, + [[maybe_unused]] DynamicPluginTensorDesc const * outputs, + [[maybe_unused]] std::int32_t num_outputs) const noexcept +{ + return 0; +} + +} // namespace plugin +} // namespace nvinfer1 diff --git a/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/implicit_gemm_plugin_creator.cpp b/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/implicit_gemm_plugin_creator.cpp new file mode 100644 index 
0000000000000..9c77b04d883d2 --- /dev/null +++ b/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/implicit_gemm_plugin_creator.cpp @@ -0,0 +1,174 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "autoware/tensorrt_plugins/implicit_gemm_plugin_creator.hpp" + +#include "autoware/tensorrt_plugins/implicit_gemm_plugin.hpp" +#include "autoware/tensorrt_plugins/plugin_utils.hpp" + +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace nvinfer1 +{ +namespace plugin +{ + +REGISTER_TENSORRT_PLUGIN(ImplicitGemmPluginCreator); + +ImplicitGemmPluginCreator::ImplicitGemmPluginCreator() +{ + std::cout << "ImplicitGemmPluginCreator::ImplicitGemmPluginCreator" << std::endl << std::flush; + + plugin_attributes_.clear(); + plugin_attributes_.emplace_back( + nvinfer1::PluginField("act_alpha", nullptr, PluginFieldType::kFLOAT32, 1)); + plugin_attributes_.emplace_back( + nvinfer1::PluginField("act_beta", nullptr, PluginFieldType::kFLOAT32, 1)); + plugin_attributes_.emplace_back( + nvinfer1::PluginField("is_subm", nullptr, PluginFieldType::kINT32, 1)); + plugin_attributes_.emplace_back( + nvinfer1::PluginField("is_train", nullptr, PluginFieldType::kINT32, 1)); + plugin_attributes_.emplace_back( + nvinfer1::PluginField("output_add_scale", nullptr, PluginFieldType::kFLOAT32, 1)); + plugin_attributes_.emplace_back( + nvinfer1::PluginField("output_scale", nullptr, PluginFieldType::kFLOAT32, 1)); + + fc_.nbFields = plugin_attributes_.size(); + fc_.fields = plugin_attributes_.data(); +} + +nvinfer1::PluginFieldCollection const * ImplicitGemmPluginCreator::getFieldNames() noexcept +{ + // This is only used in the build phase. + return &fc_; +} + +IPluginV3 * ImplicitGemmPluginCreator::createPlugin( + char const * name, PluginFieldCollection const * fc, TensorRTPhase phase) noexcept +{ + // The build phase and the deserialization phase are handled differently. + if (phase == TensorRTPhase::kBUILD || phase == TensorRTPhase::kRUNTIME) { + // The attributes from the ONNX node will be parsed and passed via fc. 
+ try { + nvinfer1::PluginField const * fields{fc->fields}; + std::int32_t num_fields{fc->nbFields}; + + PLUGIN_VALIDATE(num_fields == 6); + + ImplicitGemmParameters parameters; + + for (std::int32_t i{0}; i < num_fields; ++i) { + const std::string attr_name = fields[i].name; + const nvinfer1::PluginFieldType type = fields[i].type; + + if (attr_name == "act_alpha") { + PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kFLOAT32); + parameters.act_alpha = static_cast(fields[i].data)[0]; + } + + if (attr_name == "act_beta") { + PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kFLOAT32); + parameters.act_beta = static_cast(fields[i].data)[0]; + } + + if (attr_name == "is_subm") { + PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kINT32); + parameters.is_subm = static_cast(fields[i].data)[0]; + } + + if (attr_name == "is_train") { + PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kINT32); + parameters.is_train = static_cast(fields[i].data)[0]; + } + + if (attr_name == "output_add_scale") { + PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kFLOAT32); + parameters.output_add_scale = static_cast(fields[i].data)[0]; + } + + if (attr_name == "output_scale") { + PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kFLOAT32); + parameters.output_scale = static_cast(fields[i].data)[0]; + } + } + + // Log the attributes parsed from ONNX node. + std::stringstream ss; + ss << name << " plugin Attributes:"; + logDebug(ss.str().c_str()); + + ss.str(""); + ss << "act_alpha: " << parameters.act_alpha; + logDebug(ss.str().c_str()); + + ss.str(""); + ss << "act_beta: " << parameters.act_beta; + logDebug(ss.str().c_str()); + + ss.str(""); + ss << "is_subm: " << parameters.is_subm; + logDebug(ss.str().c_str()); + + ss.str(""); + ss << "is_train: " << parameters.is_train; + logDebug(ss.str().c_str()); + + ss.str(""); + ss << "output_add_scale: " << parameters.output_add_scale; + logDebug(ss.str().c_str()); + + ss.str(""); + ss << "output_scale: " << parameters.output_scale; + logDebug(ss.str().c_str()); + + ImplicitGemmPlugin * const plugin{new ImplicitGemmPlugin{std::string(name), parameters}}; + return plugin; + } catch (std::exception const & e) { + caughtError(e); + } + return nullptr; + } else if (phase == TensorRTPhase::kRUNTIME) { + // The attributes from the serialized plugin will be passed via fc. + try { + nvinfer1::PluginField const * fields{fc->fields}; + std::int32_t num_fields{fc->nbFields}; + PLUGIN_VALIDATE(num_fields == 1); + + char const * attr_name = fields[0].name; + PLUGIN_VALIDATE(!strcmp(attr_name, "parameters")); + PLUGIN_VALIDATE(fields[0].type == nvinfer1::PluginFieldType::kUNKNOWN); + PLUGIN_VALIDATE(fields[0].length == sizeof(ImplicitGemmParameters)); + ImplicitGemmParameters params{*(static_cast(fields[0].data))}; + + ImplicitGemmPlugin * const plugin{new ImplicitGemmPlugin{std::string(name), params}}; + return plugin; + } catch (std::exception const & e) { + caughtError(e); + } + return nullptr; + } else { + return nullptr; + } +} + +} // namespace plugin +} // namespace nvinfer1 diff --git a/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/plugin_registration.cpp b/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/plugin_registration.cpp new file mode 100644 index 0000000000000..1462cba3d656f --- /dev/null +++ b/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/plugin_registration.cpp @@ -0,0 +1,73 @@ +// Copyright 2025 TIER IV, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "autoware/tensorrt_plugins/get_indice_pairs_implicit_gemm_plugin_creator.hpp" +#include "autoware/tensorrt_plugins/implicit_gemm_plugin_creator.hpp" +#include "autoware/tensorrt_plugins/quick_cumsum_cuda_plugin_creator.hpp" + +#include + +#include +#include +#include + +class ThreadSafeLoggerFinder +{ +public: + ThreadSafeLoggerFinder() = default; + + // Set the logger finder. + void setLoggerFinder(nvinfer1::ILoggerFinder * finder) + { + std::lock_guard lk(mutex_); + if (logger_finder_ == nullptr && finder != nullptr) { + logger_finder_ = finder; + } + } + + // Get the logger. + nvinfer1::ILogger * getLogger() noexcept + { + std::lock_guard lk(mutex_); + if (logger_finder_ != nullptr) { + return logger_finder_->findLogger(); + } + return nullptr; + } + +private: + nvinfer1::ILoggerFinder * logger_finder_{nullptr}; + std::mutex mutex_; +}; + +ThreadSafeLoggerFinder glogger_finder; + +extern "C" void setLoggerFinder(nvinfer1::ILoggerFinder * finder) +{ + glogger_finder.setLoggerFinder(finder); +} + +extern "C" nvinfer1::IPluginCreatorInterface * const * getCreators(std::int32_t & num_creators) +{ + num_creators = 3; + static nvinfer1::plugin::QuickCumsumCudaPluginCreator quick_cumsum_cuda_plugin_creator{}; + static nvinfer1::plugin::GetIndicePairsImplicitGemmPluginCreator + get_indice_pairs_implicit_gemm_plugin_creator{}; + static nvinfer1::plugin::ImplicitGemmPluginCreator implicit_gemm_plugin_creator{}; + + static nvinfer1::IPluginCreatorInterface * const plugin_creator_list[] = { + &quick_cumsum_cuda_plugin_creator, &get_indice_pairs_implicit_gemm_plugin_creator, + &implicit_gemm_plugin_creator}; + return plugin_creator_list; +} diff --git a/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/plugin_utils.cpp b/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/plugin_utils.cpp new file mode 100644 index 0000000000000..cabc121e30910 --- /dev/null +++ b/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/plugin_utils.cpp @@ -0,0 +1,57 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
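setLoggerFinder and getCreators above are the C entry points TensorRT resolves when it loads a version-3 plugin shared library (the node supplies the path through its plugins_path parameter). A sketch of exercising that contract by hand; the library name here is hypothetical:

#include <NvInferRuntime.h>

#include <dlfcn.h>

#include <cstdint>
#include <iostream>

int main()
{
  // Hypothetical library name; the real path comes from plugins_path.
  void * handle = dlopen("libautoware_tensorrt_plugins.so", RTLD_LAZY);
  if (handle == nullptr) {
    std::cerr << dlerror() << std::endl;
    return 1;
  }
  using GetCreatorsFn = nvinfer1::IPluginCreatorInterface * const * (*)(std::int32_t &);
  auto get_creators = reinterpret_cast<GetCreatorsFn>(dlsym(handle, "getCreators"));
  if (get_creators == nullptr) {
    std::cerr << "getCreators not exported" << std::endl;
    return 1;
  }
  std::int32_t num_creators{0};
  nvinfer1::IPluginCreatorInterface * const * creators = get_creators(num_creators);
  std::cout << num_creators << " plugin creators available" << std::endl;  // expected: 3
  (void)creators;
  dlclose(handle);
  return 0;
}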
+ +#include "autoware/tensorrt_plugins/plugin_utils.hpp" + +#include + +#include +#include +#include + +void caughtError(std::exception const & e) +{ + getLogger()->log(nvinfer1::ILogger::Severity::kINTERNAL_ERROR, e.what()); +} + +void logDebug(char const * msg) +{ + getLogger()->log(nvinfer1::ILogger::Severity::kVERBOSE, msg); +} + +void logInfo(char const * msg) +{ + getLogger()->log(nvinfer1::ILogger::Severity::kINFO, msg); +} + +void reportAssertion(bool success, char const * msg, char const * file, std::int32_t line) +{ + if (!success) { + std::ostringstream stream; + stream << "Assertion failed: " << msg << std::endl + << file << ':' << line << std::endl + << "Aborting..." << std::endl; + getLogger()->log(nvinfer1::ILogger::Severity::kINTERNAL_ERROR, stream.str().c_str()); + std::abort(); + } +} + +void reportValidation(bool success, char const * msg, char const * file, std::int32_t line) +{ + if (!success) { + std::ostringstream stream; + stream << "Validation failed: " << msg << std::endl << file << ':' << line << std::endl; + getLogger()->log(nvinfer1::ILogger::Severity::kINTERNAL_ERROR, stream.str().c_str()); + } +} diff --git a/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/quick_cumsum_cuda_plugin.cpp b/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/quick_cumsum_cuda_plugin.cpp new file mode 100644 index 0000000000000..781b8e3fe5539 --- /dev/null +++ b/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/quick_cumsum_cuda_plugin.cpp @@ -0,0 +1,245 @@ +// Copyright 2025 TIER IV, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "autoware/tensorrt_plugins/quick_cumsum_cuda_plugin.hpp" + +#include "autoware/bev_ops/bev_pool_cuda.h" +#include "autoware/tensorrt_plugins/plugin_utils.hpp" + +#include +#include + +#include // NOTE(knzo25): temporary +#include +#include +#include +#include +#include +#include + +namespace nvinfer1 +{ +namespace plugin +{ + +QuickCumsumCudaPlugin::QuickCumsumCudaPlugin( + const std::string & name, QuickCumsumCudaParameters const & params) +: layer_name_{name}, params_{params} +{ + initFieldsToSerialize(); +} + +void QuickCumsumCudaPlugin::initFieldsToSerialize() +{ + data_to_serialize_.clear(); + data_to_serialize_.emplace_back("batch_size", ¶ms_.batch_size, PluginFieldType::kINT32, 1); + data_to_serialize_.emplace_back("dimension", ¶ms_.dimension, PluginFieldType::kINT32, 1); + data_to_serialize_.emplace_back("height", ¶ms_.height, PluginFieldType::kINT32, 1); + data_to_serialize_.emplace_back("width", ¶ms_.width, PluginFieldType::kINT32, 1); + + fc_to_serialize_.nbFields = data_to_serialize_.size(); + fc_to_serialize_.fields = data_to_serialize_.data(); +} + +IPluginCapability * QuickCumsumCudaPlugin::getCapabilityInterface( + PluginCapabilityType type) noexcept +{ + try { + if (type == PluginCapabilityType::kBUILD) { + return static_cast(this); + } + if (type == PluginCapabilityType::kRUNTIME) { + return static_cast(this); + } + PLUGIN_ASSERT(type == PluginCapabilityType::kCORE); + return static_cast(this); + } catch (std::exception const & e) { + caughtError(e); + } + return nullptr; +} + +IPluginV3 * QuickCumsumCudaPlugin::clone() noexcept +{ + try { + IPluginV3 * const plugin{new QuickCumsumCudaPlugin{layer_name_, params_}}; + return plugin; + } catch (std::exception const & e) { + caughtError(e); + } + return nullptr; +} + +char const * QuickCumsumCudaPlugin::getPluginName() const noexcept +{ + return kQUICK_CUMSUM_CUDA_PLUGIN_NAME; +} + +char const * QuickCumsumCudaPlugin::getPluginVersion() const noexcept +{ + return kQUICK_CUMSUM_CUDA_PLUGIN_VERSION; +} + +char const * QuickCumsumCudaPlugin::getPluginNamespace() const noexcept +{ + return kQUICK_CUMSUM_CUDA_PLUGIN_NAMESPACE; +} + +std::int32_t QuickCumsumCudaPlugin::getNbOutputs() const noexcept +{ + return 1; +} + +std::int32_t QuickCumsumCudaPlugin::configurePlugin( + DynamicPluginTensorDesc const * in, std::int32_t num_inputs, DynamicPluginTensorDesc const * out, + std::int32_t num_outputs) noexcept +{ + // Validate input arguments. 
+
+std::int32_t QuickCumsumCudaPlugin::configurePlugin(
+  DynamicPluginTensorDesc const * in, std::int32_t num_inputs, DynamicPluginTensorDesc const * out,
+  std::int32_t num_outputs) noexcept
+{
+  // Validate input arguments.
+  PLUGIN_ASSERT(num_inputs == 4);
+  PLUGIN_ASSERT(num_outputs == 1);
+  PLUGIN_ASSERT(in[0].desc.dims.nbDims == 2);
+  PLUGIN_ASSERT(in[1].desc.dims.nbDims == 2);
+  PLUGIN_ASSERT(in[2].desc.dims.nbDims == 1);
+  PLUGIN_ASSERT(in[3].desc.dims.nbDims == 1);
+
+  PLUGIN_ASSERT(out[0].desc.dims.nbDims == 5);
+
+  PLUGIN_ASSERT(in[0].desc.dims.d[0] == -1);
+  PLUGIN_ASSERT(in[1].desc.dims.d[0] == -1);
+  PLUGIN_ASSERT(in[1].desc.dims.d[1] == 4);
+
+  PLUGIN_ASSERT(in[2].desc.dims.d[0] == -1);
+  PLUGIN_ASSERT(in[3].desc.dims.d[0] == -1);
+
+  PLUGIN_ASSERT(out[0].desc.type == in[0].desc.type);
+
+  return 0;
+}
+
+bool QuickCumsumCudaPlugin::supportsFormatCombination(
+  std::int32_t pos, DynamicPluginTensorDesc const * in_out, std::int32_t num_inputs,
+  std::int32_t num_outputs) noexcept
+{
+  PLUGIN_ASSERT(num_inputs == 4 && num_outputs == 1 && pos < num_inputs + num_outputs);
+  bool valid;
+
+  const std::int32_t INPUT_FEATURES_INDEX = 0;
+  const std::int32_t INPUT_GEOM_FEATURES_INDEX = 1;
+  const std::int32_t INPUT_INTERVAL_LENGTH_INDEX = 2;
+  const std::int32_t INPUT_INTERVAL_START_INDEX = 3;
+  const std::int32_t OUTPUT_INDEX = 4;
+
+  switch (pos) {
+    case INPUT_FEATURES_INDEX:
+    case OUTPUT_INDEX:
+      valid = in_out[pos].desc.format == nvinfer1::TensorFormat::kLINEAR &&
+              in_out[pos].desc.type == nvinfer1::DataType::kFLOAT;
+      break;
+    case INPUT_GEOM_FEATURES_INDEX:
+    case INPUT_INTERVAL_LENGTH_INDEX:
+    case INPUT_INTERVAL_START_INDEX:
+      valid = in_out[pos].desc.format == nvinfer1::TensorFormat::kLINEAR &&
+              in_out[pos].desc.type == nvinfer1::DataType::kINT32;
+      break;
+    default:
+      valid = false;
+      break;
+  }
+
+  return valid;
+}
+
+std::int32_t QuickCumsumCudaPlugin::getOutputDataTypes(
+  DataType * output_types, std::int32_t num_outputs, DataType const * input_types,
+  std::int32_t num_inputs) const noexcept
+{
+  PLUGIN_ASSERT(num_inputs == 4);
+  PLUGIN_ASSERT(num_outputs == 1);
+  output_types[0] = input_types[0];
+  return 0;
+}
+
+std::int32_t QuickCumsumCudaPlugin::getOutputShapes(
+  DimsExprs const * inputs, std::int32_t num_inputs,
+  [[maybe_unused]] DimsExprs const * shape_inputs, [[maybe_unused]] std::int32_t num_shape_inputs,
+  DimsExprs * outputs, std::int32_t num_outputs, IExprBuilder & expr_builder) noexcept
+{
+  PLUGIN_ASSERT(num_inputs == 4);
+  PLUGIN_ASSERT(num_outputs == 1);
+  PLUGIN_ASSERT(inputs != nullptr);
+  PLUGIN_ASSERT(inputs[0].nbDims == 2);
+
+  outputs[0].nbDims = 5;
+  outputs[0].d[0] = expr_builder.constant(static_cast<std::int64_t>(params_.batch_size));
+  outputs[0].d[1] = expr_builder.constant(static_cast<std::int64_t>(params_.dimension));
+  outputs[0].d[2] = expr_builder.constant(static_cast<std::int64_t>(params_.height));
+  outputs[0].d[3] = expr_builder.constant(static_cast<std::int64_t>(params_.width));
+  outputs[0].d[4] = inputs[0].d[1];
+
+  return 0;
+}
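enqueue below zero-fills the output tensor and then delegates the pooling to bev_pool (src/bev_ops/bev_pool_cuda.cu). For readers unfamiliar with the interval trick, this self-contained sequential sketch reproduces the intended semantics; the function name is illustrative, and the geom_feats column order is an assumption taken from the original BEVFusion kernel:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Points are pre-sorted so that all points landing in the same BEV cell form
    // one contiguous interval; each interval is reduced by summation into its cell.
    void bev_pool_reference(
      int d, int h, int w, int c,
      const std::vector<float> & feats,           // [n, c] point features
      const std::vector<std::int32_t> & geom,     // [n, 4] cell coordinates per point
      const std::vector<std::int32_t> & starts,   // [n_intervals] first point of each interval
      const std::vector<std::int32_t> & lengths,  // [n_intervals] points per interval
      std::vector<float> & out)                   // [b, d, h, w, c], zero-initialized
    {
      for (std::size_t i = 0; i < starts.size(); ++i) {
        // Every point in the interval shares one cell, so its coordinates are read once.
        const std::int32_t * g = &geom[4 * static_cast<std::size_t>(starts[i])];
        // Assumed column order (x, y, z, batch), as in the original BEVFusion bev_pool.
        const std::size_t cell =
          (((static_cast<std::size_t>(g[3]) * d + g[2]) * h + g[1]) * w + g[0]) * c;
        for (std::int32_t j = 0; j < lengths[i]; ++j) {
          for (std::int32_t k = 0; k < c; ++k) {
            out[cell + k] += feats[static_cast<std::size_t>(starts[i] + j) * c + k];
          }
        }
      }
    }

The CUDA version parallelizes this reduction rather than looping, but the arithmetic is unchanged.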
+
+std::int32_t QuickCumsumCudaPlugin::enqueue(
+  PluginTensorDesc const * input_desc, [[maybe_unused]] PluginTensorDesc const * output_desc,
+  void const * const * inputs, void * const * outputs, [[maybe_unused]] void * workspace,
+  cudaStream_t stream) noexcept
+{
+  const auto output_features_num = input_desc[0].dims.d[0];
+  const auto output_features_dim = input_desc[0].dims.d[1];
+  const auto intervals_num = input_desc[2].dims.d[0];
+
+  cudaMemsetAsync(
+    outputs[0], 0,
+    params_.batch_size * params_.dimension * params_.height * params_.width * output_features_dim *
+      sizeof(float),
+    stream);
+
+  bev_pool(
+    static_cast<int>(params_.batch_size), static_cast<int>(params_.dimension),
+    static_cast<int>(params_.height), static_cast<int>(params_.width), output_features_num,
+    output_features_dim, intervals_num, static_cast<const float *>(inputs[0]),
+    static_cast<const int *>(inputs[1]), static_cast<const int *>(inputs[3]),
+    static_cast<const int *>(inputs[2]), static_cast<float *>(outputs[0]), stream);
+
+  return cudaSuccess;
+}
+
+std::int32_t QuickCumsumCudaPlugin::onShapeChange(
+  [[maybe_unused]] PluginTensorDesc const * in, [[maybe_unused]] std::int32_t num_inputs,
+  [[maybe_unused]] PluginTensorDesc const * out, [[maybe_unused]] std::int32_t num_outputs) noexcept
+{
+  return 0;
+}
+
+IPluginV3 * QuickCumsumCudaPlugin::attachToContext(
+  [[maybe_unused]] IPluginResourceContext * context) noexcept
+{
+  return clone();
+}
+
+PluginFieldCollection const * QuickCumsumCudaPlugin::getFieldsToSerialize() noexcept
+{
+  return &fc_to_serialize_;
+}
+
+std::size_t QuickCumsumCudaPlugin::getWorkspaceSize(
+  [[maybe_unused]] DynamicPluginTensorDesc const * inputs, [[maybe_unused]] std::int32_t num_inputs,
+  [[maybe_unused]] DynamicPluginTensorDesc const * outputs,
+  [[maybe_unused]] std::int32_t num_outputs) const noexcept
+{
+  return 0;
+}
+
+}  // namespace plugin
+}  // namespace nvinfer1
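That closes the plugin itself. One point worth spelling out about the IPluginV3 lifecycle: when an engine is deserialized, TensorRT does not restore the object directly; it looks up the creator (next file) by name, version, and namespace, and replays the fields captured from getFieldsToSerialize(). A hedged sketch of that round trip; the literal name "QuickCumsumCuda", version "1", and empty namespace are assumptions, as the real values live in the kQUICK_CUMSUM_CUDA_PLUGIN_* constants:

    #include <NvInferRuntime.h>

    // Sketch of the lookup and re-creation TensorRT performs internally when
    // deserializing an engine that contains this plugin.
    nvinfer1::IPluginV3 * recreate_plugin(nvinfer1::PluginFieldCollection const & stored_fields)
    {
      auto * creator = static_cast<nvinfer1::IPluginCreatorV3One *>(
        getPluginRegistry()->getCreator("QuickCumsumCuda", "1", ""));
      return creator->createPlugin(
        "quick_cumsum_cuda", &stored_fields, nvinfer1::TensorRTPhase::kRUNTIME);
    }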
diff --git a/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/quick_cumsum_cuda_plugin_creator.cpp b/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/quick_cumsum_cuda_plugin_creator.cpp
new file mode 100644
index 0000000000000..c74edc03b4a47
--- /dev/null
+++ b/perception/autoware_lidar_bevfusion/src/tensorrt_plugins/quick_cumsum_cuda_plugin_creator.cpp
@@ -0,0 +1,143 @@
+// Copyright 2025 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "autoware/tensorrt_plugins/quick_cumsum_cuda_plugin_creator.hpp"
+
+#include "autoware/tensorrt_plugins/plugin_utils.hpp"
+#include "autoware/tensorrt_plugins/quick_cumsum_cuda_plugin.hpp"
+
+#include <NvInferRuntime.h>
+
+#include <cstdint>
+#include <exception>
+#include <sstream>
+#include <string>
+#include <vector>
+
+namespace nvinfer1
+{
+namespace plugin
+{
+
+REGISTER_TENSORRT_PLUGIN(QuickCumsumCudaPluginCreator);
+
+QuickCumsumCudaPluginCreator::QuickCumsumCudaPluginCreator()
+{
+  plugin_attributes_.clear();
+  plugin_attributes_.emplace_back(
+    nvinfer1::PluginField("batch_size", nullptr, PluginFieldType::kINT32, 1));
+  plugin_attributes_.emplace_back(
+    nvinfer1::PluginField("dimension", nullptr, PluginFieldType::kINT32, 1));
+  plugin_attributes_.emplace_back(
+    nvinfer1::PluginField("height", nullptr, PluginFieldType::kINT32, 1));
+  plugin_attributes_.emplace_back(
+    nvinfer1::PluginField("width", nullptr, PluginFieldType::kINT32, 1));
+
+  fc_.nbFields = plugin_attributes_.size();
+  fc_.fields = plugin_attributes_.data();
+}
+
+nvinfer1::PluginFieldCollection const * QuickCumsumCudaPluginCreator::getFieldNames() noexcept
+{
+  // This is only used in the build phase.
+  return &fc_;
+}
+
+IPluginV3 * QuickCumsumCudaPluginCreator::createPlugin(
+  char const * name, PluginFieldCollection const * fc, TensorRTPhase phase) noexcept
+{
+  // The plugin serializes the same four named fields that the ONNX node carries
+  // (see QuickCumsumCudaPlugin::initFieldsToSerialize), so the build and runtime
+  // phases share one parsing path.
+  if (phase != TensorRTPhase::kBUILD && phase != TensorRTPhase::kRUNTIME) {
+    return nullptr;
+  }
+
+  try {
+    nvinfer1::PluginField const * fields{fc->fields};
+    std::int32_t num_fields{fc->nbFields};
+
+    PLUGIN_VALIDATE(num_fields == 4);
+
+    QuickCumsumCudaParameters parameters;
+
+    for (std::int32_t i{0}; i < num_fields; ++i) {
+      const std::string attr_name = fields[i].name;
+      const nvinfer1::PluginFieldType type = fields[i].type;
+
+      if (attr_name == "batch_size") {
+        PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kINT32);
+        parameters.batch_size = *static_cast<std::int32_t const *>(fields[i].data);
+      } else if (attr_name == "dimension") {
+        PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kINT32);
+        parameters.dimension = *static_cast<std::int32_t const *>(fields[i].data);
+      } else if (attr_name == "height") {
+        PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kINT32);
+        parameters.height = *static_cast<std::int32_t const *>(fields[i].data);
+      } else if (attr_name == "width") {
+        PLUGIN_VALIDATE(type == nvinfer1::PluginFieldType::kINT32);
+        parameters.width = *static_cast<std::int32_t const *>(fields[i].data);
+      }
+    }
+
+    // Log the attributes parsed from the ONNX node.
+    std::stringstream ss;
+    ss << name << " plugin Attributes:";
+    logDebug(ss.str().c_str());
+
+    ss.str("");
+    ss << "batch_size: " << parameters.batch_size;
+    logDebug(ss.str().c_str());
+
+    ss.str("");
+    ss << "dimension: " << parameters.dimension;
+    logDebug(ss.str().c_str());
+
+    ss.str("");
+    ss << "height: " << parameters.height;
+    logDebug(ss.str().c_str());
+
+    ss.str("");
+    ss << "width: " << parameters.width;
+    logDebug(ss.str().c_str());
+
+    QuickCumsumCudaPlugin * const plugin{new QuickCumsumCudaPlugin{std::string(name), parameters}};
+    return plugin;
+  } catch (std::exception const & e) {
+    caughtError(e);
+  }
+  return nullptr;
+}
+
+}  // namespace plugin
+}  // namespace nvinfer1
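Since getFieldNames() advertises the same four attributes that createPlugin consumes, building the plugin outside of ONNX parsing (for example in a unit test) only requires assembling a matching field collection. A minimal sketch, with arbitrary example values and an illustrative function name:

    #include <NvInferRuntime.h>

    #include <array>
    #include <cstdint>

    // Constructs the build-phase field collection by hand and asks the creator
    // for a plugin instance, mirroring what the ONNX parser does with the
    // node's attributes.
    nvinfer1::IPluginV3 * make_test_plugin(nvinfer1::IPluginCreatorV3One & creator)
    {
      static std::int32_t batch_size{1};
      static std::int32_t dimension{1};
      static std::int32_t height{360};
      static std::int32_t width{360};
      std::array<nvinfer1::PluginField, 4> fields{
        {{"batch_size", &batch_size, nvinfer1::PluginFieldType::kINT32, 1},
         {"dimension", &dimension, nvinfer1::PluginFieldType::kINT32, 1},
         {"height", &height, nvinfer1::PluginFieldType::kINT32, 1},
         {"width", &width, nvinfer1::PluginFieldType::kINT32, 1}}};
      nvinfer1::PluginFieldCollection fc{static_cast<std::int32_t>(fields.size()), fields.data()};
      return creator.createPlugin("quick_cumsum_cuda", &fc, nvinfer1::TensorRTPhase::kBUILD);
    }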
diff --git a/sensing/autoware_cuda_utils/include/autoware/cuda_utils/cuda_utils.hpp b/sensing/autoware_cuda_utils/include/autoware/cuda_utils/cuda_utils.hpp
new file mode 100644
index 0000000000000..9d0a0f3e3e015
--- /dev/null
+++ b/sensing/autoware_cuda_utils/include/autoware/cuda_utils/cuda_utils.hpp
@@ -0,0 +1,36 @@
+// Copyright 2025 Tier IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This code is licensed under CC0 1.0 Universal (Public Domain).
+// You can use this without any limitation.
+// https://creativecommons.org/publicdomain/zero/1.0/deed.en
+// borrowed from https://proc-cpuinfo.fixstars.com/2019/02/cuda_smart_pointer/
+
+#ifndef AUTOWARE__CUDA_UTILS__CUDA_UTILS_HPP_
+#define AUTOWARE__CUDA_UTILS__CUDA_UTILS_HPP_
+
+#include "autoware/cuda_utils/cuda_check_error.hpp"
+
+#include <cuda_runtime_api.h>
+
+namespace autoware::cuda_utils
+{
+template <typename T>
+void clear_async(T * ptr, std::size_t num_elem, cudaStream_t stream)
+{
+  CHECK_CUDA_ERROR(::cudaMemsetAsync(ptr, 0, sizeof(T) * num_elem, stream));
+}
+}  // namespace autoware::cuda_utils
+
+#endif  // AUTOWARE__CUDA_UTILS__CUDA_UTILS_HPP_
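A short usage sketch for clear_async, mirroring the raw cudaMemsetAsync call in QuickCumsumCudaPlugin::enqueue but with type-aware sizing and error checking (function and variable names are illustrative):

    #include "autoware/cuda_utils/cuda_utils.hpp"

    #include <cuda_runtime_api.h>

    #include <cstddef>

    // Zero a device buffer asynchronously on the given stream, e.g. before a
    // kernel accumulates partial sums into it.
    void reset_device_buffer(float * d_buffer, std::size_t num_elements, cudaStream_t stream)
    {
      autoware::cuda_utils::clear_async(d_buffer, num_elements, stream);
    }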