diff --git a/perception/tensorrt_mtr/config/tensorrt_mtr.param.yaml b/perception/tensorrt_mtr/config/tensorrt_mtr.param.yaml
index 6e969d21c638a..3d88066f6582e 100644
--- a/perception/tensorrt_mtr/config/tensorrt_mtr.param.yaml
+++ b/perception/tensorrt_mtr/config/tensorrt_mtr.param.yaml
@@ -3,13 +3,12 @@
     model_path: "$(var data_path)/mtr_static.onnx"
     precision: "fp32"
     target_labels: ["VEHICLE", "PEDESTRIAN", "CYCLIST"]
-    num_past: 10
+    num_past: 11
     num_mode: 6
     num_future: 80
-    max_num_polyline: 798
+    max_num_polyline: 768
     max_num_point: 20
     point_break_distance: 1.0
-    offset_xy: [30.0, 0.0]
     intention_point_filepath: "$(var data_path)/intention_point.csv"
     num_intention_point_cluster: 64
     polyline_label_path: "$(var data_path)/polyline_label.txt"
diff --git a/perception/tensorrt_mtr/include/tensorrt_mtr/agent.hpp b/perception/tensorrt_mtr/include/tensorrt_mtr/agent.hpp
index cc0d7af2488ec..d789d44e36b44 100644
--- a/perception/tensorrt_mtr/include/tensorrt_mtr/agent.hpp
+++ b/perception/tensorrt_mtr/include/tensorrt_mtr/agent.hpp
@@ -34,9 +34,7 @@ enum AgentLabel { VEHICLE = 0, PEDESTRIAN = 1, CYCLIST = 2 };
  */
 struct AgentState
 {
-  /**
-   * @brief Construct a new instance filling all elements by `0.0f`.
-   */
+  // Construct a new instance filling all elements by `0.0f`.
   AgentState() : data_({0.0f}) {}
 
   /**
@@ -84,33 +82,49 @@ struct AgentState
     std::copy(itr, itr + dim(), data_.begin());
   }
 
-  static size_t dim() { return AgentStateDim; }
-
-  /**
-   * @brief Construct a new instance filling all elements by `0.0f`.
-   *
-   * @return AgentState
-   */
+  // Construct a new instance filling all elements by `0.0f`.
   static AgentState empty() noexcept { return AgentState(); }
 
-  /**
-   * @brief Return the address pointer of data array.
-   *
-   * @return float*
-   */
+  // Return the agent state dimensions `D`.
+  static size_t dim() { return AgentStateDim; }
+
+  // Return the address pointer of data array.
   float * data_ptr() noexcept { return data_.data(); }
 
+  // Return the x position.
   float x() const { return x_; }
+
+  // Return the y position.
   float y() const { return y_; }
+
+  // Return the z position.
   float z() const { return z_; }
+
+  // Return the length of object size.
   float length() const { return length_; }
+
+  // Return the width of object size.
   float width() const { return width_; }
+
+  // Return the height of object size.
   float height() const { return height_; }
+
+  // Return the yaw angle `[rad]`.
   float yaw() const { return yaw_; }
+
+  // Return the x velocity.
   float vx() const { return vx_; }
+
+  // Return the y velocity.
   float vy() const { return vy_; }
+
+  // Return the x acceleration.
   float ax() const { return ax_; }
+
+  // Return the y acceleration.
   float ay() const { return ay_; }
+
+  // Return `true` if the value is `1.0`.
   bool is_valid() const { return is_valid_ == 1.0f; }
 
 private:
@@ -163,34 +177,19 @@ struct AgentHistory
   {
   }
 
-  static size_t state_dim() { return AgentStateDim; }
-
-  /**
-   * @brief Return the history length.
-   *
-   * @return size_t History length.
-   */
+  // Return the history time length `T`.
   size_t length() const { return max_time_length_; }
 
-  /**
-   * @brief Return the data size of history `T * D`.
-   *
-   * @return size_t
-   */
+  // Return the number of agent state dimensions `D`.
+  static size_t state_dim() { return AgentStateDim; }
+
+  // Return the data size of history `T * D`.
   size_t size() const { return max_time_length_ * state_dim(); }
 
-  /**
-   * @brief Return the shape of history matrix ordering in `(T, D)`.
-   *
-   * @return std::tuple<size_t, size_t>
-   */
+  // Return the shape of history matrix ordering in `(T, D)`.
   std::tuple<size_t, size_t> shape() const { return {max_time_length_, state_dim()}; }
 
-  /**
-   * @brief Return the object id.
-   *
-   * @return const std::string&
-   */
+  // Return the object id.
   const std::string & object_id() const { return object_id_; }
 
   size_t label_index() const { return label_index_; }
@@ -220,10 +219,7 @@ struct AgentHistory
     latest_time_ = current_time;
   }
 
-  /**
-   * @brief Update history with all-zeros state, but latest time is not updated.
-   *
-   */
+  // Update history with all-zeros state, but latest time is not updated.
   void update_empty() noexcept
   {
     // remove the state at the oldest timestamp
@@ -235,11 +231,7 @@ struct AgentHistory
     }
   }
 
-  /**
-   * @brief Return the address pointer of data array.
-   *
-   * @return float* The pointer of data array.
-   */
+  // Return the address pointer of data array.
   float * data_ptr() noexcept { return data_.data(); }
 
   /**
@@ -264,11 +256,7 @@ struct AgentHistory
    */
   bool is_valid_latest() const { return get_latest_state().is_valid(); }
 
-  /**
-   * @brief Get the latest agent state.
-   *
-   * @return AgentState
-   */
+  // Get the latest agent state at `T`.
   AgentState get_latest_state() const
   {
     const auto & latest_itr = (data_.begin() + state_dim() * (max_time_length_ - 1));
@@ -338,49 +326,55 @@ struct AgentData
     }
   }
 
-  static size_t state_dim() { return AgentStateDim; }
+  // Return the number of classes `C`.
   static size_t num_class() { return 3; }  // TODO(ktro2828): Do not use magic number.
+
+  // Return the number of target agents `B`.
   size_t num_target() const { return num_target_; }
+
+  // Return the number of agents `N`.
   size_t num_agent() const { return num_agent_; }
+
+  // Return the timestamp length `T`.
   size_t time_length() const { return time_length_; }
+
+  // Return the number of agent state dimensions `D`.
+  static size_t state_dim() { return AgentStateDim; }
+
+  // Return the index of ego.
   int sdc_index() const { return sdc_index_; }
+
+  // Return the vector of indices of target agents, in shape `[B]`.
   const std::vector<size_t> & target_index() const { return target_index_; }
+
+  // Return the vector of label indices of all agents, in shape `[N]`.
   const std::vector<size_t> & label_index() const { return label_index_; }
+
+  // Return the vector of label indices of target agents, in shape `[B]`.
   const std::vector<size_t> & target_label_index() const { return target_label_index_; }
+
+  // Return the vector of timestamps in shape `[T]`.
   const std::vector<float> & timestamps() const { return timestamps_; }
 
+  // Return the number of all elements `N*T*D`.
   size_t size() const { return num_agent_ * time_length_ * state_dim(); }
-  size_t input_dim() const { return num_agent_ + time_length_ + num_class() + 3; }
 
-  /**
-   * @brief Return the data shape ordering in (N, T, D).
-   *
-   * @return std::tuple<size_t, size_t, size_t>
-   */
+  // Return the number of state dimensions of MTR input `T+C+D+3`.
+  size_t input_dim() const { return time_length_ + state_dim() + num_class() + 3; }
+
+  // Return the data shape ordering in (N, T, D).
   std::tuple<size_t, size_t, size_t> shape() const
   {
     return {num_agent_, time_length_, state_dim()};
   }
 
-  /**
-   * @brief Return the address pointer of data array.
-   *
-   * @return float* The pointer of data array.
-   */
+  // Return the address pointer of data array.
   float * data_ptr() noexcept { return data_.data(); }
 
-  /**
-   * @brief Return the address pointer of data array for target agents.
-   *
-   * @return float* The pointer of data array for target agents.
-   */
+  // Return the address pointer of data array for target agents.
   float * target_data_ptr() noexcept { return target_data_.data(); }
 
-  /**
-   * @brief Return the address pointer of data array for ego vehicle.
-   *
-   * @return float* The pointer of data array for ego vehicle.
-   */
+  // Return the address pointer of data array for ego vehicle.
   float * ego_data_ptr() noexcept { return ego_data_.data(); }
 
 private:
@@ -397,6 +391,7 @@ struct AgentData
   std::vector<float> ego_data_;
 };
 
+// Get label names from label indices.
 std::vector<std::string> getLabelNames(const std::vector<size_t> & label_index)
 {
   std::vector<std::string> label_names;
diff --git a/perception/tensorrt_mtr/include/tensorrt_mtr/intention_point.hpp b/perception/tensorrt_mtr/include/tensorrt_mtr/intention_point.hpp
index 0e957bb0d4e4b..4e40a7acab22c 100644
--- a/perception/tensorrt_mtr/include/tensorrt_mtr/intention_point.hpp
+++ b/perception/tensorrt_mtr/include/tensorrt_mtr/intention_point.hpp
@@ -107,18 +107,10 @@ struct IntentionPoint
     return points;
   }
 
-  /**
-   * @brief Return the size of intension point K * D.
-   *
-   * @return size_t
-   */
+  // Return the size of intension point `K*D`.
   size_t size() const { return num_cluster_ * state_dim(); }
 
-  /**
-   * @brief Return the number of clusters contained in intention points.
-   *
-   * @return size_t
-   */
+  // Return the number of clusters contained in intention points `K`.
   size_t num_cluster() const { return num_cluster_; }
 
 private:
diff --git a/perception/tensorrt_mtr/include/tensorrt_mtr/node.hpp b/perception/tensorrt_mtr/include/tensorrt_mtr/node.hpp
index a23652ca57606..b095002e5d42a 100644
--- a/perception/tensorrt_mtr/include/tensorrt_mtr/node.hpp
+++ b/perception/tensorrt_mtr/include/tensorrt_mtr/node.hpp
@@ -88,13 +88,8 @@ class PolylineTypeMap
     }
   }
 
-  /**
-   * @brief Return the ID of the corresponding label type.
-   * If specified type is not contained in map, return -1.
-   *
-   * @param type
-   * @return int
-   */
+  // Return the ID of the corresponding label type. If specified type is not contained in map,
+  // return `-1`.
   int getTypeID(const std::string & type) const
   {
     return label_map_.count(type) == 0 ? -1 : label_map_.at(type);
@@ -109,83 +104,42 @@ class MTRNode : public rclcpp::Node
 public:
   explicit MTRNode(const rclcpp::NodeOptions & node_options);
 
-  // Object ID for ego vehicle
+  // Object ID of the ego vehicle
   const std::string EGO_ID{"EGO"};
 
 private:
-  /**
-   * @brief Main callback being invoked when the tracked objects topic is subscribed.
-   *
-   * @param object_msg
-   */
+  // Main callback being invoked when the tracked objects topic is subscribed.
   void callback(const TrackedObjects::ConstSharedPtr object_msg);
 
-  /**
-   * @brief Callback being invoked when the HD map topic is subscribed.
-   *
-   * @param map_msg
-   */
+  // Callback being invoked when the HD map topic is subscribed.
   void onMap(const HADMapBin::ConstSharedPtr map_msg);
 
-  /**
-   * @brief Callback being invoked when the Ego's odometry topic is subscribed.
-   *
-   * @param ego_msg
-   */
+  // Callback being invoked when the Ego's odometry topic is subscribed.
   void onEgo(const Odometry::ConstSharedPtr ego_msg);
 
-  /**
-   * @brief Converts lanelet2 to polylines.
-   *
-   * @return true
-   */
+  // Convert Lanelet to `PolylineData`.
   bool convertLaneletToPolyline();
 
-  /**
-   * @brief Remove ancient agent histories.
-   *
-   * @param current_time
-   * @param objects_msg
-   */
+  // Remove ancient agent histories.
   void removeAncientAgentHistory(
     const float current_time, const TrackedObjects::ConstSharedPtr objects_msg);
 
-  /**
-   * @brief Appends new states to history.
-   *
-   * @param current_time
-   * @param objects_msg
-   */
+  // Appends new states to history.
   void updateAgentHistory(
     const float current_time, const TrackedObjects::ConstSharedPtr objects_msg);
 
+  // Extract ego state stored in the buffer which has the nearest timestamp from current timestamp.
   AgentState extractNearestEgo(const float current_time) const;
 
-  /**
-   * @brief Extract target agents and return corresponding indices.
-   *
-   * NOTE: Extract targets in order of proximity, closest first.
-   *
-   * @param histories
-   * @return std::vector<size_t>
-   */
+  // Extract target agents and return corresponding indices.
+  // NOTE: Extract targets in order of proximity, closest first.
   std::vector<size_t> extractTargetAgent(const std::vector<AgentHistory> & histories);
 
-  /**
-   * @brief Return the timestamps relative from the first element.Return the timestamps relative
-   * from the first element.
-   *
-   * @return std::vector<float>
-   */
+  // Return the timestamps relative from the first element.Return the timestamps relative from the
+  // first element.
   std::vector<float> getRelativeTimestamps() const;
 
-  /**
-   * @brief Generate `PredictedObject` from `PredictedTrajectory`.
-   *
-   * @param object
-   * @param trajectory
-   * @return PredictedObject
-   */
+  // Generate `PredictedObject` from `PredictedTrajectory`.
   PredictedObject generatePredictedObject(
     const TrackedObject & object, const PredictedTrajectory & trajectory);
 
diff --git a/perception/tensorrt_mtr/include/tensorrt_mtr/polyline.hpp b/perception/tensorrt_mtr/include/tensorrt_mtr/polyline.hpp
index c7695deb0bb3d..660db35fecfa3 100644
--- a/perception/tensorrt_mtr/include/tensorrt_mtr/polyline.hpp
+++ b/perception/tensorrt_mtr/include/tensorrt_mtr/polyline.hpp
@@ -32,9 +32,7 @@ enum PolylineLabel { LANE = 0, ROAD_LINE = 1, ROAD_EDGE = 2, CROSSWALK = 3 };
 
 struct LanePoint
 {
-  /**
-   * @brief Construct a new instance filling all elements by `0.0f`.
-   */
+  // Construct a new instance filling all elements by `0.0f`.
   LanePoint() : data_({0.0f}) {}
 
   /**
@@ -55,34 +53,22 @@ struct LanePoint
   {
   }
 
+  // Construct a new instance filling all elements by `0.0f`.
+  static LanePoint empty() noexcept { return LanePoint(); }
+
+  // Return the point state dimensions `D`.
   static size_t dim() { return PointStateDim; }
 
-  /**
-   * @brief Return the x position of the point.
-   *
-   * @return float The x position.
-   */
+  // Return the x position of the point.
   float x() const { return x_; }
 
-  /**
-   * @brief Return the y position of the point.
-   *
-   * @return float The y position.
-   */
+  // Return the y position of the point.
   float y() const { return y_; }
 
-  /**
-   * @brief Return the z position of the point.
-   *
-   * @return float The z position.
-   */
+  // Return the z position of the point.
   float z() const { return z_; }
 
-  /**
-   * @brief Return the label of the point.
-   *
-   * @return float The label.
-   */
+  // Return the label of the point.
   float label() const { return label_; }
 
   /**
@@ -96,18 +82,7 @@ struct LanePoint
     return std::hypot(x_ - other.x(), y_ - other.y(), z_ - other.z());
   }
 
-  /**
-   * @brief Construct a new instance filling all elements by `0.0f`.
-   *
-   * @return LanePoint
-   */
-  static LanePoint empty() noexcept { return LanePoint(); }
-
-  /**
-   * @brief Return the address pointer of data array.
-   *
-   * @return float*
-   */
+  // Return the address pointer of data array.
   float * data_ptr() noexcept { return data_.data(); }
 
 private:
@@ -165,27 +140,28 @@ struct PolylineData
     }
   }
 
-  static size_t state_dim() { return PointStateDim; }
+  // Return the number of polylines `K`.
   size_t num_polyline() const { return num_polyline_; }
+
+  // Return the number of points contained in each polyline `P`.
   size_t num_point() const { return num_point_; }
 
+  // Return the number of point dimensions `D`.
+  static size_t state_dim() { return PointStateDim; }
+
+  // Return the number of all elements `K*P*D`.
   size_t size() const { return num_polyline_ * num_point_ * state_dim(); }
 
-  /**
-   * @brief Return the data shape ordering in (L, P, D).
-   *
-   * @return std::tuple<size_t, size_t, size_t>
-   */
+  // Return the number of state dimensions of MTR input `D+2`.
+  size_t input_dim() const { return state_dim() + 2; }
+
+  // Return the data shape ordering in `(K, P, D)`.
   std::tuple<size_t, size_t, size_t> shape() const
   {
     return {num_polyline_, num_point_, state_dim()};
   }
 
-  /**
-   * @brief Return the address pointer of data array.
-   *
-   * @return float* The pointer of data array.
-   */
+  // Return the address pointer of data array.
   float * data_ptr() noexcept { return data_.data(); }
 
 private:
diff --git a/perception/tensorrt_mtr/include/tensorrt_mtr/trajectory.hpp b/perception/tensorrt_mtr/include/tensorrt_mtr/trajectory.hpp
index 08ee1a789293f..6bade709cbebc 100644
--- a/perception/tensorrt_mtr/include/tensorrt_mtr/trajectory.hpp
+++ b/perception/tensorrt_mtr/include/tensorrt_mtr/trajectory.hpp
@@ -35,11 +35,19 @@ struct PredictedState
   {
   }
 
+  // Return the predicted state dimensions `D`.
   static size_t dim() { return PredictedStateDim; }
 
+  // Return the predicted x position.
   float x() const { return x_; }
+
+  // Return the predicted y position.
   float y() const { return y_; }
+
+  // Return the predicted x velocity.
   float vx() const { return vx_; }
+
+  // Return the predicted y velocity.
   float vy() const { return vy_; }
 
 private:
@@ -47,7 +55,7 @@ struct PredictedState
 };  // struct PredictedState
 
 /**
- * @brief A class to represent waypoints for a single mode.
+ * @brief A class to represent waypoints of a single mode.
  */
 struct PredictedMode
 {
@@ -61,15 +69,16 @@ struct PredictedMode
     }
   }
 
+  // Return the number of predicted future timestamps `T`.
+  size_t num_future() const { return num_future_; }
+
+  // Return the number of predicted state dimensions `D`.
   static size_t state_dim() { return PredictedStateDim; }
+
+  // Return the predicted score.
   float score() const { return score_; }
-  size_t num_future() const { return num_future_; }
 
-  /**
-   * @brief Get the waypoints.
-   *
-   * @return const std::vector<PredictedState>&
-   */
+  // Return the vector of waypoints.
   const std::vector<PredictedState> & get_waypoints() const { return waypoints_; }
 
 private:
@@ -83,6 +92,16 @@ struct PredictedMode
  */
 struct PredictedTrajectory
 {
+  /**
+   * @brief Construct a new instance.
+   *
+   * @note Predicted trajectories are sorted with the smallest scores.
+   *
+   * @param scores Predicted cores for each target, in shape `[B*M]`.
+   * @param trajectories Predicted trajectories for each target. `[B*M*T*D]`.
+   * @param num_mode The number of predicted modes.
+   * @param num_future The number of predicted timestamps.
+   */
   PredictedTrajectory(
     const float * scores, const float * trajectories, const size_t num_mode,
     const size_t num_future)
@@ -94,26 +113,24 @@ struct PredictedTrajectory
       std::vector<float> waypoints(start_ptr, start_ptr + num_future_ * state_dim());
       modes_.emplace_back(score, waypoints.data(), num_future_);
     }
-
     // sort by score
     sort_by_score();
   }
 
-  static size_t state_dim() { return PredictedStateDim; }
+  // Return the number of predicted modes `M`.
   size_t num_mode() const { return num_mode_; }
+
+  // Return the number of predicted future timestamps `T`.
   size_t num_future() const { return num_future_; }
 
-  /**
-   * @brief Return predicted modes. Modes are sorted in descending order based on their scores.
-   *
-   * @return const std::vector<PredictedMode>&
-   */
+  // Return the number of predicted state dimensions `D`.
+  static size_t state_dim() { return PredictedStateDim; }
+
+  // Return predicted modes. Modes are sorted in descending order based on their scores.
   const std::vector<PredictedMode> & get_modes() const { return modes_; }
 
 private:
-  /**
-   * @brief Sort modes in descending order based on their scores.
-   */
+  // Sort modes in descending order based on their scores.
   void sort_by_score()
   {
     std::sort(modes_.begin(), modes_.end(), [](const auto & mode1, const auto & mode2) {
diff --git a/perception/tensorrt_mtr/include/tensorrt_mtr/trt_mtr.hpp b/perception/tensorrt_mtr/include/tensorrt_mtr/trt_mtr.hpp
index 3a27cf6e48c56..5438adf0fb21c 100644
--- a/perception/tensorrt_mtr/include/tensorrt_mtr/trt_mtr.hpp
+++ b/perception/tensorrt_mtr/include/tensorrt_mtr/trt_mtr.hpp
@@ -42,18 +42,19 @@ struct MTRConfig
    * @brief Construct a new instance.
    *
    * @param target_labels An array of target label names.
+   * @param num_past The number of past timestamps.
    * @param num_mode The number of modes.
    * @param num_future The number of future time step length predicted by MTR.
    * @param max_num_polyline The max number of polylines which can be contained in a single input.
-   * @param offset_xy The offset value used in input pre-process.
+   * @param max_num_point The max number of points contained in each polyline.
    * @param intention_point_filepath The path to intention points file.
    * @param num_intention_point_cluster The number of clusters for intension points.
    */
   MTRConfig(
     const std::vector<std::string> & target_labels = {"VEHICLE", "PEDESTRIAN", "CYCLIST"},
-    const size_t num_past = 10, const size_t num_mode = 6, const size_t num_future = 80,
+    const size_t num_past = 11, const size_t num_mode = 6, const size_t num_future = 80,
     const size_t max_num_polyline = 768, const size_t max_num_point = 20,
-    const float point_break_distance = 1.0f, const std::array<float, 2> & offset_xy = {30.0f, 0.0f},
+    const float point_break_distance = 1.0f,
     const std::string & intention_point_filepath = "./data/waymo64.csv",
     const size_t num_intention_point_cluster = 64)
   : target_labels(target_labels),
@@ -64,7 +65,6 @@ struct MTRConfig
     max_num_polyline(max_num_polyline),
     max_num_point(max_num_point),
     point_break_distance(point_break_distance),
-    offset_xy(offset_xy),
     intention_point_filepath(intention_point_filepath),
     num_intention_point_cluster(num_intention_point_cluster)
   {
@@ -78,7 +78,6 @@ struct MTRConfig
   size_t max_num_polyline;
   size_t max_num_point;
   float point_break_distance;
-  std::array<float, 2> offset_xy;
   std::string intention_point_filepath;
   size_t num_intention_point_cluster;
 };  // struct MTRConfig
@@ -159,6 +158,15 @@ class TrtMTR
 
   IntentionPoint intention_point_;
 
+  // data size
+  // load from input data
+  size_t num_target_, num_agent_, time_length_;
+  size_t agent_input_dim_;
+  size_t num_polyline_, num_point_;
+  size_t polyline_input_dim_;
+  // load from config
+  size_t max_num_polyline_, num_mode_, num_future_;
+
   // source data
   cuda::unique_ptr<int[]> d_target_index_{nullptr};
   cuda::unique_ptr<int[]> d_label_index_{nullptr};
@@ -167,7 +175,6 @@ class TrtMTR
   cuda::unique_ptr<float[]> d_target_state_{nullptr};
   cuda::unique_ptr<float[]> d_intention_points_{nullptr};
   cuda::unique_ptr<float[]> d_polyline_{nullptr};
-  cuda::unique_ptr<int[]> d_topk_index_{nullptr};
 
   // preprocessed inputs
   cuda::unique_ptr<float[]> d_in_trajectory_{nullptr};
@@ -176,6 +183,10 @@ class TrtMTR
   cuda::unique_ptr<float[]> d_in_polyline_{nullptr};
   cuda::unique_ptr<bool[]> d_in_polyline_mask_{nullptr};
   cuda::unique_ptr<float[]> d_in_polyline_center_{nullptr};
+  // only used for topk extraction
+  cuda::unique_ptr<float[]> d_tmp_polyline_{nullptr};
+  cuda::unique_ptr<bool[]> d_tmp_polyline_mask_{nullptr};
+  cuda::unique_ptr<float[]> d_tmp_distance_{nullptr};
 
   // outputs
   cuda::unique_ptr<float[]> d_out_score_{nullptr};
diff --git a/perception/tensorrt_mtr/lib/include/postprocess/postprocess_kernel.cuh b/perception/tensorrt_mtr/lib/include/postprocess/postprocess_kernel.cuh
index 2b3014d1fb39c..949eacc6abeca 100644
--- a/perception/tensorrt_mtr/lib/include/postprocess/postprocess_kernel.cuh
+++ b/perception/tensorrt_mtr/lib/include/postprocess/postprocess_kernel.cuh
@@ -18,21 +18,20 @@
 #include <cuda_runtime.h>
 
 /**
- * @brief A kernel to transform predicted trajectory from each target coords system to world coords
- * system.
+ * @brief Transform predicted trajectory from target agent coords to world coords.
  *
  * @param B The number of target agents.
  * @param M The number of modes.
  * @param T The number of future timestamps.
- * @param inDim The number of input agent state dimensions.
- * @param targetState Source target agent states at latest timestamp, in shape [B*inDim].
- * @param outDim The number of output state dimensions.
- * @param trajectory Output predicted trajectory, in shape [B*M*T*outDim].
- * @return __global__
+ * @param AgentDim The number of target agent state dimensions, expecting (x, y, z, length, width,
+ * height, yaw, vx, vy, ax, ay).
+ * @param targetState Source target agent state, in shape [B*AgentDim].
+ * @param PredDim The number of predicted state dimension, expecting (x, y, ?, ?, ?, vx, vy).
+ * @param predTrajectory Predicted trajectory, in shape [B*M*T*PredDim].
  */
 __global__ void transformTrajectoryKernel(
-  const int B, const int M, const int T, const int inDim, const float * targetState,
-  const int outDim, float * trajectory);
+  const int B, const int M, const int T, const int AgentDim, const float * targetState,
+  const int PredDim, float * predTrajectory);
 
 /**
  * @brief Execute postprocess to predicted score and trajectory.
@@ -40,16 +39,16 @@ __global__ void transformTrajectoryKernel(
  * @param B The number of target agents.
  * @param M The number of modes.
  * @param T The number of future timestamps.
- * @param inDim The number of input agent state dimensions.
- * @param target_state Target agent states at the latest timestamp, in shape [B * inDim].
- * @param outDim The number predicted agent state dimensions
- * @param pred_scores Predicted scores, in shape [B*M].
- * @param pred_trajectory Predicted trajectories, in shape [B*M*T*D].
+ * @param AgentDim The number of target agent state dimensions, expecting (x, y, z, length, width,
+ * height, yaw, vx, vy, ax, ay).
+ * @param targetState Target agent states at the latest timestamp, in shape [B*inDim].
+ * @param PredDim The number predicted state dimensions, expecting (x, y, ?, ?, ?, vx, vy).
+ * @param predTrajectory Predicted trajectory, in shape [B*M*T*PredDim].
  * @param stream CUDA stream.
- * @return cudaError_t
+ * @return cudaError_t CUDA error type.
  */
 cudaError_t postprocessLauncher(
-  const int B, const int M, const int T, const int inDim, const float * target_state,
-  const int outDim, float * pred_score, float * pred_trajectory, cudaStream_t stream);
+  const int B, const int M, const int T, const int AgentDim, const float * targetState,
+  const int PredDim, float * predTrajectory, cudaStream_t stream);
 
 #endif  // POSTPROCESS__POSTPROCESS_KERNEL_CUH__
diff --git a/perception/tensorrt_mtr/lib/include/preprocess/polyline_preprocess_kernel.cuh b/perception/tensorrt_mtr/lib/include/preprocess/polyline_preprocess_kernel.cuh
index d655f50820282..0ffec0e01daf6 100644
--- a/perception/tensorrt_mtr/lib/include/preprocess/polyline_preprocess_kernel.cuh
+++ b/perception/tensorrt_mtr/lib/include/preprocess/polyline_preprocess_kernel.cuh
@@ -16,96 +16,125 @@
 #define PREPROCESS__POLYLINE_PREPROCESS_KERNEL_CUH_
 
 /**
- * @brief Transform to target agent coordinates system.
+ * @brief Transform polylines to target agent coordinate system and extend its feature with previous
+ * x, y`(D->D+2)`.
+ *
+ * Points which all elements are 0.0 except of typeID are filled by 0.0 and the corresponding
+ * mask value is 0.0 too.
  *
  * @param K The number of polylines.
  * @param P The number of points contained in each polyline.
- * @param PointDim The number of point state dimensions.
- * @param in_polyline Source polylines, in shape [K*P*PointDim].
+ * @param PointDim The number of point dimensions, expecting (x, y, z, dx, dy, dz, typeID).
+ * @param inPolyline Source polyline, in shape [K*P*D].
  * @param B The number of target agents.
- * @param AgentDim The number of agent state dimensions.
- * @param target_state Source target state at the latest timestamp, in shape [B*AgentDim].
- * @param out_polyline Output polylines, in shape [B*K*P*(PointDim+2)].
- * @param out_polyline_mask Output polyline mask, in shape [B*K*P].
+ * @param AgentDim The number of agent state dimensions, expecting (x, y, z, length, width, height,
+ * yaw, vx, vy, ax, ay).
+ * @param targetState Source target agent states, in shape [B*AgentDim].
+ * @param outPolyline Output polyline, in shape [B*K*P*(PointDim+2)].
+ * @param outPolylineMask Output polyline mask, in shape [B*K*P].
  */
 __global__ void transformPolylineKernel(
-  const int K, const int P, const int PointDim, const float * in_polyline, const int B,
-  const int AgentDim, const float * target_state, float * out_polyline, bool * out_polyline_mask);
+  const int K, const int P, const int PointDim, const float * inPolyline, const int B,
+  const int AgentDim, const float * targetState, float * outPolyline, bool * outPolylineMask);
 
 /**
- * @brief Set the previous xy position at the end of element.
+ * @brief Set the Previous Position Kernel object
  *
  * @param B The number of target agents.
  * @param K The number of polylines.
  * @param P The number of points contained in each polyline.
- * @param D The number of point dimensions.
- * @param mask The polyline mask, in shape [B*K*P].
- * @param polyline The container of polylines, in shape [B*K*P*D]
+ * @param D The number of point dimensions, expecting (x, y, ..., preX, preY).
+ * @param polyline Source polyline, in shape [B*K*P*D].
  */
 __global__ void setPreviousPositionKernel(
-  const int B, const int K, const int P, const int D, const bool * mask, float * polyline);
+  const int B, const int K, const int P, const int D, float * polyline);
 
 /**
- * @brief Extract TopK elements.
+ * @brief Calculate center distance from target agent to each polyline.
+ *
+ * @note Polyline must have been transformed to target agent coordinates system.
  *
- * @param K The number of K.
- * @param L The number of source polylines.
+ * @param B The number of target agents.
+ * @param K The number of polylines.
  * @param P The number of points contained in each polyline.
+ * @param D The number of point dimensions, expecting [x, y, ...].
+ * @param polyline Source polyline, in shape [B*K*P*D].
+ * @param polylineMask Source polyline mask, in shape [B*K*P].
+ * @param distance Output calculated distances, in shape [B*K].
+ */
+__global__ void calculateCenterDistanceKernel(
+  const int B, const int K, const int P, const int D, const float * polyline,
+  const bool * polylineMask, float * distance);
+
+/**
+ * @brief Extract K polylines with the smallest distances.
+ *
+ * @note Because this kernel supposes to allocate shared memory dynamically it is necessary to
+ * specify `sizeof(float) * L` in the kernel execution configuration.
+ *
+ * @param K The number of polylines to be extracted.
  * @param B The number of target agents.
- * @param offsetX X offset position.
- * @param offsetY Y offset position.
- * @param AgentDim The number of agent state dimensions.
- * @param targetState Source state of target agents, in shape [B*AgentDim].
- * @param PointDim The number of point state dimensions.
- * @param inPolyline Source polylines, in shape [L*P*PointDim].
- * @param outPolyline Output polylines, in shape [K*P*PointDim].
+ * @param L The number of original polylines.
+ * @param P The number of points contained in each polyline.
+ * @param D The number of point dimensions.
+ * @param inPolyline Source polyline, in shape [B*L*P*D].
+ * @param inPolylineMask Source polyline mask, in shape [B*L*P].
+ * @param inDistance Source distances from target agents to the centers of each polyline, in shape
+ * [B*L].
+ * @param outPolyline Output polyline, in shape [B*K*P*D].
+ * @param outPolylineMask Output polyline mask, in shape [B*K*P].
  */
-__global__ void extractTopkKernel(
-  const int K, const int L, const int P, const int B, const float offsetX, const float offsetY,
-  const int AgentDim, const float * targetState, const int PointDim, const float * inPolyline,
-  float * outPolyline);
+__global__ void extractTopKPolylineKernel(
+  const int K, const int B, const int L, const int P, const int D, const float * inPolyline,
+  const bool * inPolylineMask, const float * inDistance, float * outPolyline,
+  bool * outPolylineMask);
 
 /**
- * @brief Calculate the magnitudes of polylines.
+ * @brief Calculate center positions of each polyline with respect to target agent coordinates
+ * system.
+ *
+ * @note Polyline must have been transformed to target agent coordinates system.
  *
  * @param B The number of target agents.
  * @param K The number of polylines.
  * @param P The number of points contained in each polyline.
- * @param D The number of point dimensions.
- * @param polyline Source polylines, in shape [B*K*P*(D + 2)].
- * @param mask Source polyline masks, in shape [B*K*P].
- * @param center Output magnitudes of polylines, in shape [B*K*3].
+ * @param D The number of point dimensions, expecting (x, y, z, ...).
+ * @param polyline Source polyline, in shape [B*K*P*D].
+ * @param polylineMask Source polyline mask, in shape [B*K*P].
+ * @param center Output centers, in shape [B*K*3].
  */
 __global__ void calculatePolylineCenterKernel(
-  const int B, const int K, const int P, const int PointDim, const float * polyline,
-  const bool * mask, float * center);
+  const int B, const int K, const int P, const int D, const float * polyline,
+  const bool * polylineMask, float * center);
 
 /**
  * @brief In cases of the number of batch polylines (L) is greater than K,
  *  extracts the topK elements.
  *
- * @param L The number of source polylines.
- * @param K The number of polylines expected as the model input.
+ * @param K The number of polylines to be extracted.
+ * @param L The number of original polylines.
  * @param P The number of points contained in each polyline.
  * @param PointDim The number of point state dimensions.
- * @param AgentDim The number of agent state dimensions.
- * @param in_polyline Source polylines, in shape [L*P*PointDim].
+ * @param inPolyline Source polylines, in shape [L*P*PointDim].
  * @param B The number of target agents.
- * @param target_state Target agent state at the latest timestamp, in shape [B, AgentDim].
- * @param offset_x The x offset.
- * @param offset_y The y offset.
- * @param topk_index A container to store topK indices, in shape [K].
- * @param out_polyline Output polylines, in shape [B*K*P*(PointDim+2)].
- * @param out_polyline_mask Output polyline masks, in shape [B*K*P].
- * @param out_polyline_center Output magnitudes of each polyline with respect to target coords,
+ * @param AgentDim The number of agent state dimensions.
+ * @param targetState Target agent state at the latest timestamp, in shape [B*AgentDim].
+ * @param tmpPolyline A container to store transformed polyline temporary, in shape
+ * [B*L*P*(PointDim+2)].
+ * @param tmpPolylineMask A container to store transformed polyline mask temporary, in shape
+ * [B*L*P].
+ * @param tmpDistance A container to store distances temporary, in shape [B*L].
+ * @param outPolyline Output polylines, in shape [B*K*P*(PointDim+2)].
+ * @param outPolylineMask Output polyline masks, in shape [B*K*P].
+ * @param outPolylineCenter Output magnitudes of each polyline with respect to target coords,
  *  in shape [B*K*3].
  * @param stream CUDA stream.
  * @return cudaError_t
  */
 cudaError_t polylinePreprocessWithTopkLauncher(
-  const int L, const int K, const int P, const int PointDim, const float * in_polyline, const int B,
-  const int AgentDim, const float * target_state, const float offsetX, const float offsetY,
-  int * topk_index, float * out_polyline, bool * out_polyline_mask, float * out_polyline_center,
+  const int K, const int L, const int P, const int PointDim, const float * inPolyline, const int B,
+  const int AgentDim, const float * targetState, float * tmpPolyline, bool * tmpPolylineMask,
+  float * tmpDistance, float * outPolyline, bool * outPolylineMask, float * outPolylineCenter,
   cudaStream_t stream);
 
 /**
@@ -114,20 +143,20 @@ cudaError_t polylinePreprocessWithTopkLauncher(
  * @param K The number of polylines.
  * @param P The number of points contained in each polyline.
  * @param PointDim The number of point state dimensions.
- * @param in_polyline Source polylines, in shape [K*P*PointDim].
+ * @param inPolyline Source polylines, in shape [K*P*PointDim].
  * @param B The number of target agents.
  * @param AgentDim The number of agent state dimensions.
- * @param target_state Target agent state at the latest timestamp, in shape [B, AgentDim].
- * @param out_polyline Output polylines, in shape [B*K*P*(PointDim + 2)].
- * @param out_polyline_mask Output polyline masks, in shape [B*K*P].
- * @param out_polyline_center Output magnitudes of each polyline with respect to target coords,
+ * @param targetState Target agent state at the latest timestamp, in shape [B*AgentDim].
+ * @param outPolyline Output polylines, in shape [B*K*P*(PointDim+2)].
+ * @param outPolylineMask Output polyline masks, in shape [B*K*P].
+ * @param outPolylineCenter Output magnitudes of each polyline with respect to target coords,
  *  in shape [B*K*3].
  * @param stream CUDA stream.
  * @return cudaError_t
  */
 cudaError_t polylinePreprocessLauncher(
-  const int K, const int P, const int PointDim, const float * in_polyline, const int B,
-  const int AgentDim, const float * target_state, float * out_polyline, bool * out_polyline_mask,
-  float * out_polyline_center, cudaStream_t stream);
+  const int K, const int P, const int PointDim, const float * inPolyline, const int B,
+  const int AgentDim, const float * targetState, float * outPolyline, bool * outPolylineMask,
+  float * outPolylineCenter, cudaStream_t stream);
 
 #endif  // PREPROCESS__POLYLINE_PREPROCESS_KERNEL_CUH_
diff --git a/perception/tensorrt_mtr/lib/src/postprocess/postprocess_kernel.cu b/perception/tensorrt_mtr/lib/src/postprocess/postprocess_kernel.cu
index 2149fb8aac41f..381ef00343764 100644
--- a/perception/tensorrt_mtr/lib/src/postprocess/postprocess_kernel.cu
+++ b/perception/tensorrt_mtr/lib/src/postprocess/postprocess_kernel.cu
@@ -12,11 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "postprocess/postprocess_kernel.cuh"
-
 __global__ void transformTrajectoryKernel(
-  const int B, const int M, const int T, const int inDim, const float * targetState,
-  const int outDim, float * trajectory)
+  const int B, const int M, const int T, const int AgentDim, const float * targetState,
+  const int PredDim, float * predTrajectory)
 {
   int b = blockIdx.x * blockDim.x + threadIdx.x;
   int m = blockIdx.y * blockDim.y + threadIdx.y;
@@ -26,31 +24,31 @@ __global__ void transformTrajectoryKernel(
     return;
   }
 
-  const int pred_idx = (b * M * T + m * T + t) * outDim;
-  const float pred_x = trajectory[pred_idx];
-  const float pred_y = trajectory[pred_idx + 1];
+  const int predIdx = (b * M * T + m * T + t) * PredDim;
+  const float predX = predTrajectory[predIdx];
+  const float predY = predTrajectory[predIdx + 1];
 
-  const int target_idx = b * inDim;
-  const float target_x = targetState[target_idx];
-  const float target_y = targetState[target_idx + 1];
-  const float target_yaw = targetState[target_idx + 6];
-  const float target_cos = cos(target_yaw);
-  const float target_sin = sin(target_yaw);
+  const int targetIdx = b * AgentDim;
+  const float targetX = targetState[targetIdx];
+  const float targetY = targetState[targetIdx + 1];
+  const float targetYaw = targetState[targetIdx + 6];
+  const float targetCos = cosf(targetYaw);
+  const float targetSin = sinf(targetYaw);
 
-  trajectory[pred_idx] = target_cos * pred_x + target_sin * pred_y + target_x;
-  trajectory[pred_idx + 1] = -target_sin * pred_x + target_cos * pred_y + target_y;
+  predTrajectory[predIdx] = targetCos * predX + targetSin * predY + targetX;
+  predTrajectory[predIdx + 1] = -targetSin * predX + targetCos * predY + targetY;
 }
 
 cudaError_t postprocessLauncher(
-  const int B, const int M, const int T, const int inDim, const float * target_state,
-  const int outDim, float * pred_score, float * pred_trajectory, cudaStream_t stream)
+  const int B, const int M, const int T, const int AgentDim, const float * targetState,
+  const int PredDim, float * predTrajectory, cudaStream_t stream)
 {
   // TODO: update the number of blocks and threads to guard from `cudaErrorIllegalAccess`
   constexpr int threadsPerBlock = 256;
   dim3 blocks(B, M, T);
 
   transformTrajectoryKernel<<<blocks, threadsPerBlock, 0, stream>>>(
-    B, M, T, inDim, target_state, outDim, pred_trajectory);
+    B, M, T, AgentDim, targetState, PredDim, predTrajectory);
 
   return cudaGetLastError();
 }
diff --git a/perception/tensorrt_mtr/lib/src/preprocess/polyline_preprocess_kernel.cu b/perception/tensorrt_mtr/lib/src/preprocess/polyline_preprocess_kernel.cu
index 227f57e300cbe..f04985d394118 100644
--- a/perception/tensorrt_mtr/lib/src/preprocess/polyline_preprocess_kernel.cu
+++ b/perception/tensorrt_mtr/lib/src/preprocess/polyline_preprocess_kernel.cu
@@ -14,11 +14,13 @@
 
 #include "preprocess/polyline_preprocess_kernel.cuh"
 
+#include <float.h>
+
 #include <iostream>
 
 __global__ void transformPolylineKernel(
-  const int K, const int P, const int PointDim, const float * in_polyline, const int B,
-  const int AgentDim, const float * target_state, float * out_polyline, bool * out_polyline_mask)
+  const int K, const int P, const int PointDim, const float * inPolyline, const int B,
+  const int AgentDim, const float * targetState, float * outPolyline, bool * outPolylineMask)
 {
   int b = blockIdx.x * blockDim.x + threadIdx.x;
   int k = blockIdx.y * blockDim.y + threadIdx.y;
@@ -28,50 +30,59 @@ __global__ void transformPolylineKernel(
     return;
   }
 
-  const int src_polyline_idx = (k * P + p) * PointDim;
-  const float x = in_polyline[src_polyline_idx];
-  const float y = in_polyline[src_polyline_idx + 1];
-  const float z = in_polyline[src_polyline_idx + 2];
-  const float dx = in_polyline[src_polyline_idx + 3];
-  const float dy = in_polyline[src_polyline_idx + 4];
-  const float dz = in_polyline[src_polyline_idx + 5];
-  const float type_id = in_polyline[src_polyline_idx + 6];
-
-  const int center_idx = b * AgentDim;
-  const float center_x = target_state[center_idx];
-  const float center_y = target_state[center_idx + 1];
-  const float center_z = target_state[center_idx + 2];
-  const float center_yaw = target_state[center_idx + 6];
-  const float center_cos = cos(center_yaw);
-  const float center_sin = sin(center_yaw);
-
-  // do transform
-  const float trans_x = center_cos * (x - center_x) - center_sin * (y - center_y);
-  const float trans_y = center_sin * (x - center_x) + center_cos * (y - center_y);
-  const float trans_z = z - center_z;
-  const float trans_dx = center_cos * dx - center_sin * dy;
-  const float trans_dy = center_sin * dx + center_cos * dy;
-  const float trans_dz = dz;
-
-  const int out_idx = (b * K * P + k * P + p) * (PointDim + 2);
-  out_polyline[out_idx] = trans_x;
-  out_polyline[out_idx + 1] = trans_y;
-  out_polyline[out_idx + 2] = trans_z;
-  out_polyline[out_idx + 3] = trans_dx;
-  out_polyline[out_idx + 4] = trans_dy;
-  out_polyline[out_idx + 5] = trans_dz;
-  out_polyline[out_idx + 6] = type_id;
-
-  const int out_mask_idx = b * K * P + k * P + p;
-  bool is_valid = false;
-  for (size_t i = 0; i < 6; ++i) {
-    is_valid += out_polyline[out_idx + i] != 0.0f;
+  const int inIdx = (k * P + p) * PointDim;
+  const int outIdx = b * K * P + k * P + p;
+  bool isValid = false;
+  for (int d = 0; d < PointDim - 1; ++d) {
+    if (inPolyline[inIdx + d] != 0.0f) {
+      isValid = true;
+    }
+  }
+  outPolylineMask[outIdx] = isValid;
+
+  // initialize output polyline with 0.0
+  for (int d = 0; d < PointDim + 2; ++d) {
+    outPolyline[outIdx * (PointDim + 2) + d] = 0.0f;
+  }
+
+  // set transformed values if valid, otherwise all 0.0.
+  if (isValid) {
+    const float x = inPolyline[inIdx];
+    const float y = inPolyline[inIdx + 1];
+    const float z = inPolyline[inIdx + 2];
+    const float dx = inPolyline[inIdx + 3];
+    const float dy = inPolyline[inIdx + 4];
+    const float dz = inPolyline[inIdx + 5];
+    const float typeID = inPolyline[inIdx + 6];
+
+    const int centerIdx = b * AgentDim;
+    const float centerX = targetState[centerIdx];
+    const float centerY = targetState[centerIdx + 1];
+    const float centerZ = targetState[centerIdx + 2];
+    const float centerYaw = targetState[centerIdx + 6];
+    const float centerCos = cosf(centerYaw);
+    const float centerSin = sinf(centerYaw);
+
+    // do transform
+    const float transX = centerCos * (x - centerX) - centerSin * (y - centerY);
+    const float transY = centerSin * (x - centerX) + centerCos * (y - centerY);
+    const float transZ = z - centerZ;
+    const float transDx = centerCos * dx - centerSin * dy;
+    const float transDy = centerSin * dx + centerCos * dy;
+    const float transDz = dz;
+
+    outPolyline[outIdx * (PointDim + 2)] = transX;
+    outPolyline[outIdx * (PointDim + 2) + 1] = transY;
+    outPolyline[outIdx * (PointDim + 2) + 2] = transZ;
+    outPolyline[outIdx * (PointDim + 2) + 3] = transDx;
+    outPolyline[outIdx * (PointDim + 2) + 4] = transDy;
+    outPolyline[outIdx * (PointDim + 2) + 5] = transDz;
+    outPolyline[outIdx * (PointDim + 2) + 6] = typeID;
   }
-  out_polyline_mask[out_mask_idx] = is_valid;
 }
 
 __global__ void setPreviousPositionKernel(
-  const int B, const int K, const int P, const int D, const bool * mask, float * polyline)
+  const int B, const int K, const int P, const int D, float * polyline)
 {
   int b = blockIdx.x * blockDim.x + threadIdx.x;
   int k = blockIdx.y * blockDim.y + threadIdx.y;
@@ -81,56 +92,92 @@ __global__ void setPreviousPositionKernel(
     return;
   }
 
-  const int cur_idx = (b * K * P + k * P + p) * D;
-  const int pre_idx = k == 0 ? cur_idx : (b * K * P + (k - 1) * P + p) * D;
+  const int curIdx = (b * K * P + k * P + p) * D;
+  const int preIdx = p == 0 ? curIdx : (b * K * P + k * P + p - 1) * D;
 
-  polyline[cur_idx + D - 2] = polyline[pre_idx];
-  polyline[cur_idx + D - 1] = polyline[pre_idx + 1];
+  polyline[curIdx + D - 2] = polyline[preIdx];      // x
+  polyline[curIdx + D - 1] = polyline[preIdx + 1];  // y
+}
 
-  const int mask_idx = b * K * P + k * P + p;
-  if (!mask[mask_idx]) {
-    for (int d = 0; d < D; ++d) {
-      polyline[cur_idx + d] = 0.0f;
+__global__ void calculateCenterDistanceKernel(
+  const int B, const int K, const int P, const int D, const float * polyline,
+  const bool * polylineMask, float * distance)
+{
+  int b = blockIdx.x * blockDim.x + threadIdx.x;
+  int k = blockIdx.y * blockDim.y + threadIdx.y;
+  if (b >= B || k >= K) {
+    return;
+  }
+
+  // calculate polyline center
+  float sumX = 0.0f, sumY = 0.0f;
+  int numValid = 0;
+  for (int p = 0; p < P; ++p) {
+    int idx = b * K * P + k * P + p;
+    if (polylineMask[idx]) {
+      sumX += polyline[idx * D];
+      sumY += polyline[idx * D + 1];
+      ++numValid;
     }
   }
+  float centerX = sumX / fmaxf(1.0f, numValid);
+  float centerY = sumY / fmaxf(1.0f, numValid);
+
+  distance[b * K + k] = hypot(centerX, centerY);
 }
 
-__global__ void extractTopkKernel(
-  const int K, const int L, const int P, const int B, const float offsetX, const float offsetY,
-  const int AgentDim, const float * targetState, const int PointDim, const float * inPolyline,
-  float * outPolyline)
+__global__ void extractTopKPolylineKernel(
+  const int K, const int B, const int L, const int P, const int D, const float * inPolyline,
+  const bool * inPolylineMask, const float * inDistance, float * outPolyline,
+  bool * outPolylineMask)
 {
-  // --- pseudo code ---
-  // mask = All(polyline != 0.0, dim=2)
-  // polylineCenter = polyline[:, :, 0:2].sum(dim=1) / clampMin(mask.sum(dim=1), min=1.0)
-  // offset = rotateAlongZ((offset_x, offset_y), target_state[:, 6])
-  // targetOffsetPos = target_state[:, 0:2] + offset
-  // distances = (target_offset_pos - center)
-  // _, topkIdxs = distances.topk(k=K, descending=True)
-  // outPolyline = inPolyline[topkIdxs]
-  // -------------------
-
-  // int targetIdx = blockIdx.x;
-
-  // const float targetX = targetState[targetIdx];
-  // const float targetY = targetState[targetIdx + 1];
-  // const float targetYaw = targetState[targetIdx + 6];
-  // const float targetCos = cos(targetYaw);
-  // const float targetSin = sin(targetYaw);
-
-  // const float transTargetX = targetCos * offsetX + targetSin * offsetY + targetX;
-  // const float transTargetY = -targetSin * offsetX + targetCos * offsetY + targetY;
+  int b = blockIdx.x;                             // Batch index
+  int tid = threadIdx.x;                          // Polyline index
+  int p = blockIdx.y * blockDim.y + threadIdx.y;  // Point index
+  int d = blockIdx.z * blockDim.z + threadIdx.z;  // Dim index
+  if (b >= B || tid >= L || p >= P || d >= D) {
+    return;
+  }
+  extern __shared__ float distances[];
+
+  // Load distances into shared memory
+  if (tid < L) {
+    distances[tid] = inDistance[b * L + tid];
+  }
+  __syncthreads();
+
+  // Simple selection of the smallest K distances
+  // (this part should be replaced with a more efficient sorting/selecting algorithm)
+  for (int k = 0; k < K; k++) {
+    float minDistance = FLT_MAX;
+    int minIndex = -1;
+
+    for (int l = 0; l < L; l++) {
+      if (distances[l] < minDistance) {
+        minDistance = distances[l];
+        minIndex = l;
+      }
+    }
+    __syncthreads();
+
+    if (minIndex == -1) {
+      continue;
+    }
+
+    if (tid == k) {  // this thread will handle copying the k-th smallest polyline
+      int inIdx = b * L * P + minIndex * P + p;
+      int outIdx = b * K * P + k * P + p;
+      outPolyline[outIdx * D + d] = inPolyline[inIdx * D + d];
+      outPolylineMask[outIdx] = inPolylineMask[inIdx];
+    }
+    distances[minIndex] = FLT_MAX;  // exclude this index from future consideration
+  }
 }
 
 __global__ void calculatePolylineCenterKernel(
-  const int B, const int K, const int P, const int PointDim, const float * polyline,
-  const bool * mask, float * center)
+  const int B, const int K, const int P, const int D, const float * polyline,
+  const bool * polylineMask, float * center)
 {
-  // --- pseudo code ---
-  // sum = (polylines[:, :, :, 0:3] * mask[:, :, :, None]).sum(dim=2)
-  // center = sum / clampMIN(mask.sum(dim=2), min=1.0)
-  // -------------------
-
   int b = blockIdx.x * blockDim.x + threadIdx.x;
   int k = blockIdx.y * blockDim.y + threadIdx.y;
 
@@ -139,33 +186,33 @@ __global__ void calculatePolylineCenterKernel(
   }
 
   // initialize with 0.0
-  int center_idx = (b * K + k) * 3;
+  int centerIdx = (b * K + k) * 3;
   for (int d = 0; d < 3; ++d) {
-    center[center_idx + d] = 0.0f;
+    center[centerIdx + d] = 0.0f;
   }
 
-  float sum_xyz[3] = {0.0f, 0.0f, 0.0f};
-  int count = 0;
+  // calculate polyline center
+  float sumX = 0.0f, sumY = 0.0f, sumZ = 0.0f;
+  int numValid = 0;
   for (int p = 0; p < P; ++p) {
-    int src_idx = b * K * P + k * P + p;
-    if (mask[src_idx]) {
-      for (int d = 0; d < 3; ++d) {
-        sum_xyz[d] += polyline[src_idx * PointDim + d];
-      }
-      ++count;
+    int idx = b * K * P + k * P + p;
+    if (polylineMask[idx]) {
+      sumX += polyline[idx * D];
+      sumY += polyline[idx * D + 1];
+      sumZ += polyline[idx * D + 2];
+      ++numValid;
     }
   }
-  count = max(count, 1);
 
-  for (int d = 0; d < 3; ++d) {
-    center[center_idx + d] = sum_xyz[d] / static_cast<float>(count);
-  }
+  center[centerIdx] = sumX / fmaxf(1.0f, numValid);
+  center[centerIdx + 1] = sumY / fmaxf(1.0f, numValid);
+  center[centerIdx + 2] = sumZ / fmaxf(1.0f, numValid);
 }
 
 cudaError_t polylinePreprocessWithTopkLauncher(
-  const int L, const int K, const int P, const int PointDim, const float * in_polyline, const int B,
-  const int AgentDim, const float * target_state, const float offset_x, const float offset_y,
-  int * topk_index, float * out_polyline, bool * out_polyline_mask, float * out_polyline_center,
+  const int K, const int L, const int P, const int PointDim, const float * inPolyline, const int B,
+  const int AgentDim, const float * targetState, float * tmpPolyline, bool * tmpPolylineMask,
+  float * tmpDistance, float * outPolyline, bool * outPolylineMask, float * outPolylineCenter,
   cudaStream_t stream)
 {
   if (L < K) {
@@ -173,27 +220,54 @@ cudaError_t polylinePreprocessWithTopkLauncher(
     return cudaError_t::cudaErrorInvalidValue;
   }
 
+  const int outPointDim = PointDim + 2;
+
+  // TODO: update the number of blocks and threads to guard from `cudaErrorIllegalAccess`
+  constexpr int threadsPerBlock = 256;
+  const dim3 blocks1(B, L, P);
+  transformPolylineKernel<<<blocks1, threadsPerBlock, 0, stream>>>(
+    L, P, PointDim, inPolyline, B, AgentDim, targetState, tmpPolyline, tmpPolylineMask);
+
+  const dim3 blocks2(B, L);
+  calculateCenterDistanceKernel<<<blocks2, threadsPerBlock, 0, stream>>>(
+    B, L, P, outPointDim, tmpPolyline, tmpPolylineMask, tmpDistance);
+
+  const dim3 blocks3(B, P, outPointDim);
+  const size_t sharedMemSize = sizeof(float) * L;
+  extractTopKPolylineKernel<<<blocks3, threadsPerBlock, sharedMemSize, stream>>>(
+    K, B, L, P, outPointDim, tmpPolyline, tmpPolylineMask, tmpDistance, outPolyline,
+    outPolylineMask);
+
+  const dim3 blocks4(B, K, P);
+  setPreviousPositionKernel<<<blocks4, threadsPerBlock, 0, stream>>>(
+    B, K, P, outPointDim, outPolyline);
+
+  const dim3 blocks5(B, K);
+  calculatePolylineCenterKernel<<<blocks5, threadsPerBlock, 0, stream>>>(
+    B, K, P, outPointDim, outPolyline, outPolylineMask, outPolylineCenter);
+
   return cudaGetLastError();
 }
 
 cudaError_t polylinePreprocessLauncher(
-  const int K, const int P, const int PointDim, const float * in_polyline, const int B,
-  const int AgentDim, const float * target_state, float * out_polyline, bool * out_polyline_mask,
-  float * out_polyline_center, cudaStream_t stream)
+  const int K, const int P, const int PointDim, const float * inPolyline, const int B,
+  const int AgentDim, const float * targetState, float * outPolyline, bool * outPolylineMask,
+  float * outPolylineCenter, cudaStream_t stream)
 {
+  const int outPointDim = PointDim + 2;
+
   // TODO: update the number of blocks and threads to guard from `cudaErrorIllegalAccess`
   constexpr int threadsPerBlock = 256;
-  const dim3 block3d(B, K / threadsPerBlock, P);
-
+  const dim3 block3d(B, K, P);
   transformPolylineKernel<<<block3d, threadsPerBlock, 0, stream>>>(
-    K, P, PointDim, in_polyline, B, AgentDim, target_state, out_polyline, out_polyline_mask);
+    K, P, PointDim, inPolyline, B, AgentDim, targetState, outPolyline, outPolylineMask);
 
   setPreviousPositionKernel<<<block3d, threadsPerBlock, 0, stream>>>(
-    B, K, P, PointDim, out_polyline_mask, out_polyline);
+    B, K, P, outPointDim, outPolyline);
 
-  const dim3 block2d(B, K / threadsPerBlock);
+  const dim3 block2d(B, K);
   calculatePolylineCenterKernel<<<block2d, threadsPerBlock, 0, stream>>>(
-    B, K, P, PointDim, out_polyline, out_polyline_mask, out_polyline_center);
+    B, K, P, outPointDim, outPolyline, outPolylineMask, outPolylineCenter);
 
   return cudaGetLastError();
 }
diff --git a/perception/tensorrt_mtr/src/node.cpp b/perception/tensorrt_mtr/src/node.cpp
index 5f59108f236d8..89e5b768e8521 100644
--- a/perception/tensorrt_mtr/src/node.cpp
+++ b/perception/tensorrt_mtr/src/node.cpp
@@ -28,12 +28,8 @@ namespace trt_mtr
 {
 namespace
 {
-/**
- * @brief Get the Lanelet subtype name.
- *
- * @param lanelet
- * @return std::string
- */
+// Return the Lanelet subtype name. If input Lanelet has no attribute named `type` return empty
+// string `""`.
 std::string getLaneletSubtype(const lanelet::Lanelet & lanelet)
 {
   if (!lanelet.hasAttribute("subtype")) {
@@ -44,12 +40,8 @@ std::string getLaneletSubtype(const lanelet::Lanelet & lanelet)
   }
 }
 
-/**
- * @brief Get the LineString type name.
- *
- * @param linestring
- * @return std::string
- */
+// Return the LineString type name. If input LineString has no attribute named `type` return empty
+// string `""`.
 std::string getLineStringType(const lanelet::ConstLineString3d & linestring)
 {
   if (!linestring.hasAttribute("type")) {
@@ -60,12 +52,8 @@ std::string getLineStringType(const lanelet::ConstLineString3d & linestring)
   }
 }
 
-/**
- * @brief Get the LineString subtype name.
- *
- * @param linestring
- * @return std::string
- */
+// Return the LineString subtype name. If input LineString has no attribute named `subtype` return
+// empty string `""`.
 std::string getLineStringSubtype(const lanelet::ConstLineString3d & linestring)
 {
   if (!linestring.hasAttribute("subtype")) {
@@ -76,13 +64,7 @@ std::string getLineStringSubtype(const lanelet::ConstLineString3d & linestring)
   }
 }
 
-/**
- * @brief Check whether lanelet has an attribute named `turn_direction`.
- *
- * @param lanelet
- * @return true
- * @return false
- */
+// Check whether lanelet has an attribute named `turn_direction`.
 bool isTurnIntersection(const lanelet::Lanelet & lanelet)
 {
   if (lanelet.hasAttribute("turn_direction")) {
@@ -92,25 +74,14 @@ bool isTurnIntersection(const lanelet::Lanelet & lanelet)
   }
 }
 
-/**
- * @brief Insert source LanePoints into target LanePoints.
- *
- * @param src Source LanePoints.
- * @param dst Target LanePoints.
- */
+// Insert source `LanePoints` into target `LanePoints`.
 void insertLanePoints(const std::vector<LanePoint> & src, std::vector<LanePoint> & dst)
 {
   dst.reserve(dst.size() * 2);
   dst.insert(dst.end(), src.cbegin(), src.cend());
 }
 
-/**
- * @brief Convert TrackedObject to AgentState.
- *
- * @param object
- * @param is_valid
- * @return AgentState
- */
+// Convert `TrackedObject` to `AgentState`.
 AgentState trackedObjectToAgentState(const TrackedObject & object, const bool is_valid)
 {
   const auto & pose = object.kinematics.pose_with_covariance.pose;
@@ -135,13 +106,8 @@ AgentState trackedObjectToAgentState(const TrackedObject & object, const bool is
     valid};
 }
 
-/**
- * @brief Get the label index corresponding to AgentLabel. If the label of tracked object is not
- * defined in AgentLabel returns `-1`.
- *
- * @param object
- * @return int
- */
+// Get the label index corresponding to AgentLabel. If the label of tracked object is not * defined
+// in AgentLabel returns `-1`.
 int getLabelIndex(const TrackedObject & object)
 {
   const auto classification = object_recognition_utils::getHighestProbLabel(object.classification);
@@ -175,16 +141,13 @@ MTRNode::MTRNode(const rclcpp::NodeOptions & node_options)
     const auto max_num_point = static_cast<size_t>(declare_parameter<int>("max_num_point"));
     const auto point_break_distance =
       static_cast<float>(declare_parameter<double>("point_break_distance"));
-    auto tmp_offset_xy = declare_parameter<std::vector<double>>("offset_xy");
-    std::array<float, 2> offset_xy;
-    std::copy_n(tmp_offset_xy.begin(), 2, offset_xy.begin());
     const auto intention_point_filepath =
       declare_parameter<std::string>("intention_point_filepath");
     const auto num_intention_point_cluster =
       static_cast<size_t>(declare_parameter<int>("num_intention_point_cluster"));
     config_ptr_ = std::make_unique<MTRConfig>(
       target_labels, num_past, num_mode, num_future, max_num_polyline, max_num_point,
-      point_break_distance, offset_xy, intention_point_filepath, num_intention_point_cluster);
+      point_break_distance, intention_point_filepath, num_intention_point_cluster);
     model_ptr_ = std::make_unique<TrtMTR>(model_path, precision, *config_ptr_.get());
   }
 
@@ -501,7 +464,7 @@ std::vector<size_t> MTRNode::extractTargetAgent(const std::vector<AgentHistory>
     return item1.second < item2.second;
   });
 
-  constexpr size_t max_target_size = 15;  // TODO(ktro2828): use parameter
+  constexpr size_t max_target_size = 15;  // TODO(ktro2828): use a parameter
   std::vector<size_t> target_indices;
   target_indices.reserve(max_target_size);
   for (const auto & [idx, _] : distances) {
@@ -564,7 +527,6 @@ PredictedObject MTRNode::generatePredictedObject(
 
   return predicted_object;
 }
-
 }  // namespace trt_mtr
 
 #include <rclcpp_components/register_node_macro.hpp>
diff --git a/perception/tensorrt_mtr/src/trt_mtr.cpp b/perception/tensorrt_mtr/src/trt_mtr.cpp
index ba9615da35c82..5f4899af262eb 100644
--- a/perception/tensorrt_mtr/src/trt_mtr.cpp
+++ b/perception/tensorrt_mtr/src/trt_mtr.cpp
@@ -27,6 +27,11 @@ TrtMTR::TrtMTR(
 : config_(config),
   intention_point_(config_.intention_point_filepath, config_.num_intention_point_cluster)
 {
+  max_num_polyline_ = config_.max_num_polyline;
+  num_mode_ = config_.num_mode;
+  num_future_ = config_.num_future;
+
+  // build engine
   builder_ = std::make_unique<MTRBuilder>(
     model_path, precision, batch_config, max_workspace_size, build_config);
   builder_->setup();
@@ -53,7 +58,7 @@ bool TrtMTR::doInference(
                                 d_in_polyline_.get(),        d_in_polyline_mask_.get(),
                                 d_in_polyline_center_.get(), d_in_last_pos_.get(),
                                 d_target_index_.get(),       d_intention_points_.get(),
-                                d_out_score_.get(),          d_out_trajectory_.get()};
+                                d_out_trajectory_.get(),     d_out_score_.get()};
 
   if (!builder_->enqueueV2(buffer.data(), stream_, nullptr)) {
     std::cerr << "Fail to do inference" << std::endl;
@@ -70,110 +75,108 @@ bool TrtMTR::doInference(
 
 void TrtMTR::initCudaPtr(AgentData & agent_data, PolylineData & polyline_data)
 {
+  num_target_ = agent_data.num_target();
+  num_agent_ = agent_data.num_agent();
+  time_length_ = agent_data.time_length();
+  agent_input_dim_ = agent_data.input_dim();
+  num_polyline_ = polyline_data.num_polyline();
+  num_point_ = polyline_data.num_point();
+  polyline_input_dim_ = polyline_data.input_dim();
+
   // source data
-  d_target_index_ = cuda::make_unique<int[]>(agent_data.num_target());
-  d_label_index_ = cuda::make_unique<int[]>(agent_data.num_agent());
-  d_timestamps_ = cuda::make_unique<float[]>(agent_data.time_length());
+  d_target_index_ = cuda::make_unique<int[]>(num_target_);
+  d_label_index_ = cuda::make_unique<int[]>(num_agent_);
+  d_timestamps_ = cuda::make_unique<float[]>(time_length_);
   d_trajectory_ = cuda::make_unique<float[]>(agent_data.size());
-  d_target_state_ = cuda::make_unique<float[]>(agent_data.num_target() * agent_data.state_dim());
-  d_intention_points_ =
-    cuda::make_unique<float[]>(agent_data.num_target() * intention_point_.size());
+  d_target_state_ = cuda::make_unique<float[]>(num_target_ * agent_data.state_dim());
+  d_intention_points_ = cuda::make_unique<float[]>(num_target_ * intention_point_.size());
   d_polyline_ = cuda::make_unique<float[]>(polyline_data.size());
-  d_topk_index_ = cuda::make_unique<int[]>(config_.max_num_polyline);
 
   // preprocessed input
-  // TODO(ktro2828): define this in global
-  d_in_trajectory_ = cuda::make_unique<float[]>(
-    agent_data.num_target() * agent_data.num_agent() * agent_data.time_length() *
-    agent_data.input_dim());
-  d_in_trajectory_mask_ = cuda::make_unique<bool[]>(
-    agent_data.num_target() * agent_data.num_agent() * agent_data.time_length());
-  d_in_last_pos_ = cuda::make_unique<float[]>(agent_data.num_target() * agent_data.num_agent() * 3);
-  d_in_polyline_ = cuda::make_unique<float[]>(
-    agent_data.num_target() * config_.max_num_polyline * polyline_data.num_point() *
-    (polyline_data.state_dim() + 2));
-  d_in_polyline_mask_ = cuda::make_unique<bool[]>(
-    agent_data.num_target() * config_.max_num_polyline * polyline_data.num_point());
-  d_in_polyline_center_ =
-    cuda::make_unique<float[]>(agent_data.num_target() * config_.max_num_polyline * 3);
+  d_in_trajectory_ =
+    cuda::make_unique<float[]>(num_target_ * num_agent_ * time_length_ * agent_input_dim_);
+  d_in_trajectory_mask_ = cuda::make_unique<bool[]>(num_target_ * num_agent_ * time_length_);
+  d_in_last_pos_ = cuda::make_unique<float[]>(num_target_ * num_agent_ * 3);
+  d_in_polyline_ =
+    cuda::make_unique<float[]>(num_target_ * max_num_polyline_ * num_point_ * polyline_input_dim_);
+  d_in_polyline_mask_ = cuda::make_unique<bool[]>(num_target_ * max_num_polyline_ * num_point_);
+  d_in_polyline_center_ = cuda::make_unique<float[]>(num_target_ * max_num_polyline_ * 3);
+
+  if (max_num_polyline_ < num_polyline_) {
+    d_tmp_polyline_ =
+      cuda::make_unique<float[]>(num_target_ * num_polyline_ * num_point_ * polyline_input_dim_);
+    d_tmp_polyline_mask_ = cuda::make_unique<bool[]>(num_target_ * num_polyline_ * num_point_);
+    d_tmp_distance_ = cuda::make_unique<float[]>(num_target_ * num_polyline_);
+  }
 
   // outputs
-  d_out_score_ = cuda::make_unique<float[]>(agent_data.num_target() * config_.num_mode);
-  d_out_trajectory_ = cuda::make_unique<float[]>(
-    agent_data.num_target() * config_.num_mode * config_.num_future * PredictedStateDim);
+  d_out_score_ = cuda::make_unique<float[]>(num_target_ * num_mode_);
+  d_out_trajectory_ =
+    cuda::make_unique<float[]>(num_target_ * num_mode_ * num_future_ * PredictedStateDim);
 }
 
 bool TrtMTR::preProcess(AgentData & agent_data, PolylineData & polyline_data)
 {
   CHECK_CUDA_ERROR(cudaMemcpyAsync(
-    d_target_index_.get(), agent_data.target_index().data(), sizeof(int) * agent_data.num_target(),
+    d_target_index_.get(), agent_data.target_index().data(), sizeof(int) * num_target_,
     cudaMemcpyHostToDevice, stream_));
   CHECK_CUDA_ERROR(cudaMemcpyAsync(
-    d_label_index_.get(), agent_data.label_index().data(), sizeof(int) * agent_data.num_agent(),
+    d_label_index_.get(), agent_data.label_index().data(), sizeof(int) * num_agent_,
     cudaMemcpyHostToDevice, stream_));
   CHECK_CUDA_ERROR(cudaMemcpyAsync(
-    d_timestamps_.get(), agent_data.timestamps().data(), sizeof(float) * agent_data.time_length(),
+    d_timestamps_.get(), agent_data.timestamps().data(), sizeof(float) * time_length_,
     cudaMemcpyHostToDevice, stream_));
   CHECK_CUDA_ERROR(cudaMemcpyAsync(
-    d_trajectory_.get(), agent_data.data_ptr(),
-    sizeof(float) * agent_data.num_agent() * agent_data.time_length() * agent_data.state_dim(),
+    d_trajectory_.get(), agent_data.data_ptr(), sizeof(float) * agent_data.size(),
     cudaMemcpyHostToDevice, stream_));
   CHECK_CUDA_ERROR(cudaMemcpyAsync(
     d_target_state_.get(), agent_data.target_data_ptr(),
-    sizeof(float) * agent_data.num_target() * agent_data.state_dim(), cudaMemcpyHostToDevice,
-    stream_));
+    sizeof(float) * num_target_ * agent_data.state_dim(), cudaMemcpyHostToDevice, stream_));
 
   CHECK_CUDA_ERROR(cudaMemcpyAsync(
-    d_polyline_.get(), polyline_data.data_ptr(),
-    sizeof(float) * polyline_data.num_polyline() * polyline_data.num_point() *
-      polyline_data.state_dim(),
+    d_polyline_.get(), polyline_data.data_ptr(), sizeof(float) * polyline_data.size(),
     cudaMemcpyHostToDevice, stream_));
 
   const auto target_label_names = getLabelNames(agent_data.target_label_index());
   const auto intention_points = intention_point_.get_points(target_label_names);
   CHECK_CUDA_ERROR(cudaMemcpyAsync(
     d_intention_points_.get(), intention_points.data(),
-    sizeof(float) * agent_data.num_target() * intention_point_.size(), cudaMemcpyHostToDevice,
-    stream_));
+    sizeof(float) * num_target_ * intention_point_.size(), cudaMemcpyHostToDevice, stream_));
 
-  // Preprocess
   CHECK_CUDA_ERROR(agentPreprocessLauncher(
-    agent_data.num_target(), agent_data.num_agent(), agent_data.time_length(),
-    agent_data.state_dim(), agent_data.num_class(), agent_data.sdc_index(), d_target_index_.get(),
-    d_label_index_.get(), d_timestamps_.get(), d_trajectory_.get(), d_in_trajectory_.get(),
-    d_in_trajectory_mask_.get(), d_in_last_pos_.get(), stream_));
+    num_target_, num_agent_, time_length_, agent_data.state_dim(), agent_data.num_class(),
+    agent_data.sdc_index(), d_target_index_.get(), d_label_index_.get(), d_timestamps_.get(),
+    d_trajectory_.get(), d_in_trajectory_.get(), d_in_trajectory_mask_.get(), d_in_last_pos_.get(),
+    stream_));
 
-  // TODO(ktro2828)
-  if (config_.max_num_polyline < polyline_data.num_polyline()) {
+  if (max_num_polyline_ < num_polyline_) {
     CHECK_CUDA_ERROR(polylinePreprocessWithTopkLauncher(
-      polyline_data.num_polyline(), config_.max_num_polyline, polyline_data.num_point(),
-      polyline_data.state_dim(), d_polyline_.get(), agent_data.num_target(), agent_data.state_dim(),
-      d_target_state_.get(), config_.offset_xy[0], config_.offset_xy[1], d_topk_index_.get(),
-      d_in_polyline_.get(), d_in_polyline_mask_.get(), d_in_polyline_center_.get(), stream_));
+      max_num_polyline_, num_polyline_, num_point_, polyline_data.state_dim(), d_polyline_.get(),
+      num_target_, agent_data.state_dim(), d_target_state_.get(), d_tmp_polyline_.get(),
+      d_tmp_polyline_mask_.get(), d_tmp_distance_.get(), d_in_polyline_.get(),
+      d_in_polyline_mask_.get(), d_in_polyline_center_.get(), stream_));
   } else {
     assert(config_.max_num_polyline == polyline_data.num_polyline());
     CHECK_CUDA_ERROR(polylinePreprocessLauncher(
-      polyline_data.num_polyline(), polyline_data.num_point(), polyline_data.state_dim(),
-      d_polyline_.get(), agent_data.num_target(), agent_data.state_dim(), d_target_state_.get(),
-      d_in_polyline_.get(), d_in_polyline_mask_.get(), d_in_polyline_center_.get(), stream_));
+      num_polyline_, num_point_, polyline_data.state_dim(), d_polyline_.get(), num_target_,
+      agent_data.state_dim(), d_target_state_.get(), d_in_polyline_.get(),
+      d_in_polyline_mask_.get(), d_in_polyline_center_.get(), stream_));
   }
   return true;
 }
 
 bool TrtMTR::postProcess(AgentData & agent_data, std::vector<PredictedTrajectory> & trajectories)
 {
-  // Postprocess
   CHECK_CUDA_ERROR(postprocessLauncher(
-    agent_data.num_target(), config_.num_mode, config_.num_future, agent_data.state_dim(),
-    d_target_state_.get(), PredictedStateDim, d_out_score_.get(), d_out_trajectory_.get(),
-    stream_));
+    num_target_, num_mode_, num_future_, agent_data.state_dim(), d_target_state_.get(),
+    PredictedStateDim, d_out_trajectory_.get(), stream_));
 
-  trajectories.reserve(agent_data.num_target());
-  for (size_t b = 0; b < agent_data.num_target(); ++b) {
-    const auto score_ptr = d_out_score_.get() + b * config_.num_mode;
+  trajectories.reserve(num_target_);
+  for (size_t b = 0; b < num_target_; ++b) {
+    const auto score_ptr = d_out_score_.get() + b * num_mode_;
     const auto trajectory_ptr =
-      d_out_trajectory_.get() + b * config_.num_mode * config_.num_future * PredictedStateDim;
-    trajectories.emplace_back(score_ptr, trajectory_ptr, config_.num_mode, config_.num_future);
+      d_out_trajectory_.get() + b * num_mode_ * num_future_ * PredictedStateDim;
+    trajectories.emplace_back(score_ptr, trajectory_ptr, num_mode_, num_future_);
   }
   return true;
 }