diff --git a/benchmarks/tapa_flow/bandwidth23/Makefile b/benchmarks/tapa_flow/bandwidth23/Makefile
new file mode 100644
index 00000000..441f8a63
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth23/Makefile
@@ -0,0 +1,114 @@
+# Copyright (c) 2024 RapidStream Design Automation, Inc. and contributors. All rights reserved.
+# The contributor(s) of this file has/have agreed to the RapidStream Contributor License Agreement.
+
+ROOT_DIR := $(shell git rev-parse --show-toplevel)
+KERNEL_NAME := bandwidth23
+RS_SCRIPT := $(CURDIR)/run.py
+SRC_DIR := $(CURDIR)/design
+AB_CONFIG := $(CURDIR)/design/config/$(notdir $(RS_SCRIPT))/ab_config.json
+IMPL_CONFIG := $(CURDIR)/design/config/$(notdir $(RS_SCRIPT))/impl_config.json
+LINK_CONFIG := $(CURDIR)/design/config/$(notdir $(RS_SCRIPT))/link_config.ini
+PLATFORM := xilinx_vck5000_gen4x8_qdma_2_202220_1
+PART_NUM := xcvc1902-vsvd1760-2MP-e-S
+GRP_UTIL := $(ROOT_DIR)/common/util/get_group.py
+TEMP_DIR := $(CURDIR)/build/$(notdir $(RS_SCRIPT))
+RS_TARGET := $(TEMP_DIR)/dse/solution_0/vitis_run_hw/$(KERNEL_NAME)_$(PLATFORM).xclbin
+BUILD_LOG := $(TEMP_DIR)/build.json
+SUCCESS := "Build Successful"
+TIMING_RPT := impl_1_hw_bb_locked_timing_summary_routed.rpt
+SLACK_GETTER := $(ROOT_DIR)/common/util/get_slack.py
+RSPATH := $(CURDIR)
+RSXX := rapidstream
+RSPYTHON := rapidstream
+DEVICE_CONFIG := $(TEMP_DIR)/device.json
+DEVICE_GEN := $(CURDIR)/gen_device.py
+INCLUDE := -I $(XILINX_HLS)/include
+KERNEL_XO := $(TEMP_DIR)/$(KERNEL_NAME).xo
+KERNEL_XCLBIN := $(TEMP_DIR)/$(KERNEL_NAME).xclbin
+KERNEL_XSA := $(TEMP_DIR)/$(KERNEL_NAME).xsa
+TARGET := hw
+
+all: $(RS_TARGET)
+ cd $(RSPATH) && $(RSPYTHON) $(SLACK_GETTER) -d $(TEMP_DIR) -i $(TIMING_RPT) -o $(BUILD_LOG) -c clk_kernel_00_unbuffered_net -p 3.333
+ @echo $(SUCCESS)
+
+$(RS_TARGET):$(KERNEL_XO) $(DEVICE_CONFIG)
+ mkdir -p $(TEMP_DIR)
+ cd $(RSPATH) && $(RSXX)-tapaopt \
+ --work-dir $(TEMP_DIR) \
+ --tapa-xo-path $< \
+ --device-config $(DEVICE_CONFIG) \
+ --floorplan-config $(AB_CONFIG) \
+ --single-reg \
+ --run-impl \
+ --implementation-config $(IMPL_CONFIG) \
+ --connectivity-ini $(LINK_CONFIG)
+
+$(DEVICE_CONFIG):$(AB_CONFIG)
+ mkdir -p $(TEMP_DIR)
+ cd $(RSPATH) && $(RSPYTHON) $(RS_SCRIPT)
+
+cosim:$(KERNEL_XO) $(TEMP_DIR)/main.exe
+ cd $(TEMP_DIR) && $(TEMP_DIR)/main.exe 1024 \
+ --bitstream $< \
+ -xosim_work_dir $(TEMP_DIR)/xosim_work_dir
+
+hw: $(KERNEL_XCLBIN)
+
+$(KERNEL_XCLBIN): $(KERNEL_XSA)
+ @echo "### ***** packaging $(KERNEL_XSA) into $(KERNEL_XCLBIN) ... *****"
+ cd $(TEMP_DIR) && v++ --package -t $(TARGET) --platform $(PLATFORM) \
+ $^ \
+ --temp_dir $(TEMP_DIR) \
+ --save-temps \
+ --report_dir $(TEMP_DIR)/reports/ \
+ --package.boot_mode=ospi \
+ -o $@ 2>&1 | tee $(KERNEL_NAME)_xclbin.log
+ @echo "### ***** $(KERNEL_XCLBIN) packaging done! *****"
+
+$(KERNEL_XSA): $(KERNEL_XO)
+ cd $(TEMP_DIR) && v++ -l -t ${TARGET} \
+ --connectivity.nk $(KERNEL_NAME):1:$(KERNEL_NAME) \
+ --config $(SRC_DIR)/vck5000.cfg \
+ --save-temps \
+ --temp_dir $(TEMP_DIR) \
+ --clock.defaultFreqHz 250000000 \
+ --vivado.synth.jobs 16 \
+ $< -o $@
+
+xo: $(KERNEL_XO)
+
+$(KERNEL_XO):$(SRC_DIR)/$(KERNEL_NAME).cpp
+ mkdir -p $(TEMP_DIR)
+ cd $(TEMP_DIR) && tapa compile \
+ --top $(KERNEL_NAME) \
+	--part-num $(PART_NUM) \
+ --clock-period 3.33 \
+ -o $(KERNEL_NAME).xo \
+ -f $< \
+ 2>&1 | tee tapa.log
+
+csim:$(TEMP_DIR)/main.exe
+
+$(TEMP_DIR)/main.exe: $(SRC_DIR)/*.cpp
+ mkdir -p $(TEMP_DIR)
+ cd $(TEMP_DIR) && tapa g++ $^ $(INCLUDE) -o $(TEMP_DIR)/main.exe -O2
+ $(TEMP_DIR)/main.exe
+
+show_groups:
+ rapidstream $(GRP_UTIL) -i $(TEMP_DIR)/passes/0-imported.json \
+ -o $(TEMP_DIR)/module_types.csv
+
+
+
+clean:
+ rm -rf $(TEMP_DIR) *.log
+ rm -rf .Xil .run
+ rm -rf *.exe
+ rm -rf .ipcache
+
+cleanall:
+ rm -rf build *.log
+ rm -rf .Xil .run
+ rm -rf *.exe
+ rm -rf .ipcache
diff --git a/benchmarks/tapa_flow/bandwidth23/README.md b/benchmarks/tapa_flow/bandwidth23/README.md
new file mode 100644
index 00000000..54f2286e
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth23/README.md
@@ -0,0 +1,141 @@
+
+
+
+
+# TAPA Flow: Bandwidth23
+
+## Introduction
+
+
+In this recipe, we demonstrate how to use RapidStream to optimize TAPA projects. The basic steps include:
+
+- Compile the HLS C++ code into a Vitis-compatible .xo file using TAPA.
+- Optimize the .xo file with RapidStream to obtain an optimized .xo file.
+- Use Vitis to compile the optimized .xo file into an .xclbin file for FPGA deployment.
+
+## Tutorial
+
+### Step 1 (Done): Generate the Xilinx Object File (`.xo`)
+
+
+We utilize TAPA to generate the `.xo` file. If you have not installed TAPA, we've already compiled the C++ source to `.xo` using TAPA. The original C++ source files are located in design/src. The generated `.xo` file can be found at design/generated/data_decoding.xo. To compile C++ to `.xo` using TAPA, we use the script [design/run_tapa.sh](design/run_tapa.sh), with the detailed commands shown below. For your convenience, we have also backed up all the generated metadata by TAPA in the design/generated directory.
+
+```bash
+WORK_DIR=generated
+tapac \
+ --work-dir ${WORK_DIR} \
+ --top data_decoding \
+ --part-num xcu280-fsvh2892-2L-e \
+ --clock-period 3.33 \
+ -o ${WORK_DIR}/data_decoding.xo \
+ --connectivity config/link_config.ini \
+ src/data_decoder.cpp \
+ 2>&1 | tee tapa.log
+```
+
+### Step 2: Use Rapidstream to Optimize `.xo` Design
+
+The RapidStream flow conducts design space exploration and generates solutions by taking all TAPA-generated `.xo` files as the input.
+The RapidStream flow for TAPA requires the following key inputs:
+
+- **Platform**: The Vitis platform (e.g., `xilinx_u280_gen3x16_xdma_1_202211_1`).
+- **Device**: virtual device defined by calling RapidStream APIs based on the platform (e.g., `get_u280_vitis_device_factory`).
+- **.xo file**: The `.xo` file generated by TAPA
+- **Connectivity** (.ini): Include the configuration file for `v++` design/config/run.py/link_config.ini.
+- **top_module_name**: Top module name for the kernel.
+- **Clock**: All the clock and frequencies.
+- **Flatten Module**: Within a design, not all modules need to be optimized. The flatten module name is the target module rapidstream will optimize.
+
+The Python snippet below shows how we initiate rapidstream instance to set up the rapidstream environment.
+
+```Python
+from rapidstream import get_u280_vitis_device_factory, RapidStreamTAPA
+import os
+
+CURR_DIR = os.path.dirname(os.path.abspath(__file__))
+INI_PATH = f"{CURR_DIR}/design/config/link_config.ini"
+VITIS_PLATFORM = "xilinx_u280_gen3x16_xdma_1_202211_1"
+XO_PATH = f"{CURR_DIR}/design/generated/data_decoding.xo"
+kernel_name = "data_decoding"
+factory = get_u280_vitis_device_factory(VITIS_PLATFORM)
+rs = RapidStreamTAPA(f"{CURR_DIR}/build")
+rs.set_virtual_device(factory.generate_virtual_device())
+rs.add_xo_file(XO_PATH)
+rs.set_vitis_platform(VITIS_PLATFORM)
+rs.set_vitis_connectivity_config(INI_PATH)
+rs.set_top_module_name(kernel_name)
+rs.add_clock("ap_clk", 3.33)
+rs.add_flatten_targets([kernel_name])
+```
+
+The HBM AXI port connection is described in design/config/run.py/link_config.ini.
+
+```bash
+[connectivity]
+sp=data_decoding.input_port:HBM[0:1]
+sp=data_decoding.output_port0_32b_8b:HBM[16:17]
+sp=data_decoding.output_port1_16b_8b:HBM[18:19]
+sp=data_decoding.output_port2_16b_8b:HBM[20:21]
+sp=data_decoding.output_port3_8b:HBM[22:23]
+sp=data_decoding.output_port4_Track:HBM[24:25]
+```
+
+As a result, it is necessary to assign the kernel ports to the appropriate slots. The Python code below demonstrates this process. For comprehensive linking details, please refer to the design/config/run.py/link_config.ini file.
+
+```Python
+# Bind ports to HBM 16-31
+right_slot = "SLOT_X1Y0:SLOT_X1Y0"
+left_slot = "SLOT_X0Y0:SLOT_X0Y0"
+rs.assign_port_to_region(".*input_port.*", left_slot)
+rs.assign_port_to_region(".*output_port0_32b_8b.*", right_slot)
+rs.assign_port_to_region(".*output_port1_16b_8b.*", right_slot)
+rs.assign_port_to_region(".*output_port2_16b_8b.*", right_slot)
+rs.assign_port_to_region(".*output_port3_8b.*", right_slot)
+rs.assign_port_to_region(".*output_port4_Track.*", right_slot)
+rs.assign_port_to_region("s_axi_control_.*", left_slot)
+rs.assign_port_to_region("ap_clk", left_slot)
+rs.assign_port_to_region("ap_rst_n", left_slot)
+rs.assign_port_to_region("interrupt", left_slot)
+```
+
+For the complete detail, please refer to the [./run.py](./run.py) file. Call RapidStream by launching the command below or `make all`.
+
+```bash
+rapidstream run.py
+```
+
+If everything is successful, you should at least get one optimized `.xclbin` file.
+
+
+
+
+### Step 3: Check the Group Module Report
+
+
+RapidStream mandates a clear distinction between communication and computation within user designs.
+
+- In `Group modules`, users are tasked solely with defining inter-submodule communication. For those familiar with Vivado IP Integrator flow, crafting a Group module mirrors the process of connecting IPs in IPI. RapidStream subsequently integrates appropriate pipeline registers into these Group modules.
+
+- In `Leaf modules`, users retain the flexibility to implement diverse computational patterns, as RapidStream leaves these Leaf modules unchanged.
+
+For further details, please consult the [code style](https://docs.rapidstream-da.com/required-coding-style/) section in our Documentation.
+
+To generate a report on group types, execute the commands below or run `make show_groups`:
+
+```bash
+rapidstream ../../../common/util/get_group.py \
+ -i build/passes/0-imported.json \
+ -o build/module_types.csv
+```
+
+The module types for your design can be found in `build/module_types.csv`. Below, we list the four Group modules. In this design, `data_decoding` serves as a Group module, while the other three modules are added by RapidStream.
+
+| Module Name | Group Type |
+|:--------------------------------:|:--------------:|
+| data_decoding | grouped_module |
+|__rs_ap_ctrl_start_ready_pipeline | grouped_module |
+|__rs_ff_pipeline | grouped_module |
+|__rs_hs_pipeline | grouped_module |
diff --git a/benchmarks/tapa_flow/bandwidth23/design/bandwidth-host.cpp b/benchmarks/tapa_flow/bandwidth23/design/bandwidth-host.cpp
new file mode 100644
index 00000000..9471ab2d
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth23/design/bandwidth-host.cpp
@@ -0,0 +1,127 @@
+#include
+#include
+
+#include
+#include
+#include "bandwidth23.h"
+
+using std::clog;
+using std::endl;
+using std::vector;
+
+DEFINE_string(bitstream, "", "path to bitstream file, run csim if empty");
+
+int main(int argc, char* argv[]) {
+ gflags::ParseCommandLineFlags(&argc, &argv, /*remove_flags=*/true);
+
+ const uint64_t n = argc > 1 ? atoll(argv[1]) : 1024 * 1024;
+
+ vector rmem0(n);
+ vector rmem1(n);
+ vector rmem2(n);
+ vector rmem3(n);
+ vector rmem4(n);
+ vector rmem5(n);
+ vector rmem6(n);
+ vector rmem7(n);
+ vector rmem8(n);
+ vector rmem9(n);
+ vector rmem10(n);
+ vector rmem11(n);
+ vector rmem12(n);
+ vector rmem13(n);
+ vector rmem14(n);
+ vector rmem15(n);
+ vector rmem16(n);
+ vector rmem17(n);
+ vector rmem18(n);
+ vector rmem19(n);
+ vector rmem20(n);
+ vector rmem21(n);
+ vector rmem22(n);
+
+
+ for (uint64_t i = 0; i < n; ++i) {
+ rmem0[i] = i;
+ rmem1[i] = i;
+ rmem2[i] = i;
+ rmem3[i] = i;
+ rmem4[i] = i;
+ rmem5[i] = i;
+ rmem6[i] = i;
+ rmem7[i] = i;
+ rmem8[i] = i;
+ rmem9[i] = i;
+ rmem10[i] = i;
+ rmem11[i] = i;
+ rmem12[i] = i;
+ rmem13[i] = i;
+ rmem14[i] = i;
+ rmem15[i] = i;
+ rmem16[i] = i;
+ rmem17[i] = i;
+ rmem18[i] = i;
+ rmem19[i] = i;
+ rmem20[i] = i;
+ rmem21[i] = i;
+ rmem22[i] = i;
+ }
+ int64_t kernel_time_ns = tapa::invoke(
+ bandwidth23,
+ FLAGS_bitstream,
+ tapa::read_write_mmap(rmem0),
+ tapa::read_write_mmap(rmem1),
+ tapa::read_write_mmap(rmem2),
+ tapa::read_write_mmap(rmem3),
+ tapa::read_write_mmap(rmem4),
+ tapa::read_write_mmap(rmem5),
+ tapa::read_write_mmap(rmem6),
+ tapa::read_write_mmap(rmem7),
+ tapa::read_write_mmap(rmem8),
+ tapa::read_write_mmap(rmem9),
+ tapa::read_write_mmap(rmem10),
+ tapa::read_write_mmap(rmem11),
+ tapa::read_write_mmap(rmem12),
+ tapa::read_write_mmap(rmem13),
+ tapa::read_write_mmap(rmem14),
+ tapa::read_write_mmap(rmem15),
+ tapa::read_write_mmap(rmem16),
+ tapa::read_write_mmap(rmem17),
+ tapa::read_write_mmap(rmem18),
+ tapa::read_write_mmap(rmem19),
+ tapa::read_write_mmap(rmem20),
+ tapa::read_write_mmap(rmem21),
+ tapa::read_write_mmap(rmem22),
+ n);
+
+ clog << "kernel time: " << kernel_time_ns * 1e-9 << " s" << endl;
+
+ uint64_t num_errors = 0;
+ const uint64_t threshold = 10; // only report up to these errors
+ for (uint64_t i = 0; i < n; ++i) {
+ bit512 out512 = (i << 1);
+ if (rmem0[i] != out512) {
+ if (num_errors < threshold) {
+ clog << "error at " << i << ": expected " << rmem0[i] << ", got "
+ << out512 << endl;
+ }
+ ++num_errors;
+ }
+ if (rmem22[i] != out512) {
+ if (num_errors < threshold) {
+ clog << "error at " << i << ": expected " << rmem22[i] << ", got "
+ << out512 << endl;
+ }
+ ++num_errors;
+ }
+ }
+ if (num_errors == 0) {
+ clog << "PASS!" << endl;
+ } else {
+ if (num_errors > threshold) {
+ clog << " (+" << (num_errors - threshold) << " more errors)" << endl;
+ }
+ clog << "FAIL!" << endl;
+ }
+ return num_errors > 0 ? 1 : 0;
+}
diff --git a/benchmarks/tapa_flow/bandwidth23/design/bandwidth23.cpp b/benchmarks/tapa_flow/bandwidth23/design/bandwidth23.cpp
new file mode 100644
index 00000000..e2d60f9d
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth23/design/bandwidth23.cpp
@@ -0,0 +1,176 @@
+#include
+
+#include
+#include "bandwidth23.h"
+
+void yshift(tapa::istream& a, tapa::ostream& b, uint64_t n) {
+ for (uint64_t i = 0; i < n; ++i) {
+ bit512 tmp;
+ tmp = a.read();
+ tmp = (tmp << 1);
+ b.write(tmp);
+ }
+}
+
+void Mmap2Stream(
+ tapa::mmap mmap,
+ uint64_t n,
+ tapa::ostream& stream){
+
+ for (uint64_t i = 0; i < n; ++i) {
+ stream << mmap[i];
+ }
+}
+
+void Stream2Mmap(tapa::istream& stream, tapa::mmap mmap,
+ uint64_t n) {
+ for (uint64_t i = 0; i < n; ++i) {
+ mmap[i] = stream.read();
+ }
+}
+
+void bandwidth23(
+ tapa::mmap ch_0,
+ tapa::mmap ch_1,
+ tapa::mmap ch_2,
+ tapa::mmap ch_3,
+ tapa::mmap ch_4,
+ tapa::mmap ch_5,
+ tapa::mmap ch_6,
+ tapa::mmap ch_7,
+ tapa::mmap ch_8,
+ tapa::mmap ch_9,
+ tapa::mmap ch_10,
+ tapa::mmap ch_11,
+ tapa::mmap ch_12,
+ tapa::mmap ch_13,
+ tapa::mmap ch_14,
+ tapa::mmap ch_15,
+ tapa::mmap ch_16,
+ tapa::mmap ch_17,
+ tapa::mmap ch_18,
+ tapa::mmap ch_19,
+ tapa::mmap ch_20,
+ tapa::mmap ch_21,
+ tapa::mmap ch_22,
+ uint64_t n) {
+
+ tapa::stream qr0("qr0");
+ tapa::stream qr1("qr1");
+ tapa::stream qr2("qr2");
+ tapa::stream qr3("qr3");
+ tapa::stream qr4("qr4");
+ tapa::stream qr5("qr5");
+ tapa::stream qr6("qr6");
+ tapa::stream qr7("qr7");
+ tapa::stream qr8("qr8");
+ tapa::stream qr9("qr9");
+ tapa::stream qr10("qr10");
+ tapa::stream qr11("qr11");
+ tapa::stream qr12("qr12");
+ tapa::stream qr13("qr13");
+ tapa::stream qr14("qr14");
+ tapa::stream qr15("qr15");
+ tapa::stream qr16("qr16");
+ tapa::stream qr17("qr17");
+ tapa::stream qr18("qr18");
+ tapa::stream qr19("qr19");
+ tapa::stream qr20("qr20");
+ tapa::stream qr21("qr21");
+ tapa::stream qr22("qr22");
+
+ tapa::stream qw0("qw0");
+ tapa::stream qw1("qw1");
+ tapa::stream qw2("qw2");
+ tapa::stream qw3("qw3");
+ tapa::stream qw4("qw4");
+ tapa::stream qw5("qw5");
+ tapa::stream qw6("qw6");
+ tapa::stream qw7("qw7");
+ tapa::stream qw8("qw8");
+ tapa::stream qw9("qw9");
+ tapa::stream qw10("qw10");
+ tapa::stream qw11("qw11");
+ tapa::stream qw12("qw12");
+ tapa::stream qw13("qw13");
+ tapa::stream qw14("qw14");
+ tapa::stream qw15("qw15");
+ tapa::stream qw16("qw16");
+ tapa::stream qw17("qw17");
+ tapa::stream qw18("qw18");
+ tapa::stream qw19("qw19");
+ tapa::stream qw20("qw20");
+ tapa::stream qw21("qw21");
+ tapa::stream qw22("qw22");
+
+ tapa::task()
+ .invoke(Mmap2Stream, ch_0, n, qr0)
+ .invoke(Mmap2Stream, ch_1, n, qr1)
+ .invoke(Mmap2Stream, ch_2, n, qr2)
+ .invoke(Mmap2Stream, ch_3, n, qr3)
+ .invoke(Mmap2Stream, ch_4, n, qr4)
+ .invoke(Mmap2Stream, ch_5, n, qr5)
+ .invoke(Mmap2Stream, ch_6, n, qr6)
+ .invoke(Mmap2Stream, ch_7, n, qr7)
+ .invoke(Mmap2Stream, ch_8, n, qr8)
+ .invoke(Mmap2Stream, ch_9, n, qr9)
+ .invoke(Mmap2Stream, ch_10, n, qr10)
+ .invoke(Mmap2Stream, ch_11, n, qr11)
+ .invoke(Mmap2Stream, ch_12, n, qr12)
+ .invoke(Mmap2Stream, ch_13, n, qr13)
+ .invoke(Mmap2Stream, ch_14, n, qr14)
+ .invoke(Mmap2Stream, ch_15, n, qr15)
+ .invoke(Mmap2Stream, ch_16, n, qr16)
+ .invoke(Mmap2Stream, ch_17, n, qr17)
+ .invoke(Mmap2Stream, ch_18, n, qr18)
+ .invoke(Mmap2Stream, ch_19, n, qr19)
+ .invoke(Mmap2Stream, ch_20, n, qr20)
+ .invoke(Mmap2Stream, ch_21, n, qr21)
+ .invoke(Mmap2Stream, ch_22, n, qr22)
+ .invoke(yshift, qr0, qw0, n)
+ .invoke(yshift, qr1, qw1, n)
+ .invoke(yshift, qr2, qw2, n)
+ .invoke(yshift, qr3, qw3, n)
+ .invoke(yshift, qr4, qw4, n)
+ .invoke(yshift, qr5, qw5, n)
+ .invoke(yshift, qr6, qw6, n)
+ .invoke(yshift, qr7, qw7, n)
+ .invoke(yshift, qr8, qw8, n)
+ .invoke(yshift, qr9, qw9, n)
+ .invoke(yshift, qr10, qw10, n)
+ .invoke(yshift, qr11, qw11, n)
+ .invoke(yshift, qr12, qw12, n)
+ .invoke(yshift, qr13, qw13, n)
+ .invoke(yshift, qr14, qw14, n)
+ .invoke(yshift, qr15, qw15, n)
+ .invoke(yshift, qr16, qw16, n)
+ .invoke(yshift, qr17, qw17, n)
+ .invoke(yshift, qr18, qw18, n)
+ .invoke(yshift, qr19, qw19, n)
+ .invoke(yshift, qr20, qw20, n)
+ .invoke(yshift, qr21, qw21, n)
+ .invoke(yshift, qr22, qw22, n)
+ .invoke(Stream2Mmap, qw0, ch_0, n)
+ .invoke(Stream2Mmap, qw1, ch_1, n)
+ .invoke(Stream2Mmap, qw2, ch_2, n)
+ .invoke(Stream2Mmap, qw3, ch_3, n)
+ .invoke(Stream2Mmap, qw4, ch_4, n)
+ .invoke(Stream2Mmap, qw5, ch_5, n)
+ .invoke(Stream2Mmap, qw6, ch_6, n)
+ .invoke(Stream2Mmap, qw7, ch_7, n)
+ .invoke(Stream2Mmap, qw8, ch_8, n)
+ .invoke(Stream2Mmap, qw9, ch_9, n)
+ .invoke(Stream2Mmap, qw10, ch_10, n)
+ .invoke(Stream2Mmap, qw11, ch_11, n)
+ .invoke(Stream2Mmap, qw12, ch_12, n)
+ .invoke(Stream2Mmap, qw13, ch_13, n)
+ .invoke(Stream2Mmap, qw14, ch_14, n)
+ .invoke(Stream2Mmap, qw15, ch_15, n)
+ .invoke(Stream2Mmap, qw16, ch_16, n)
+ .invoke(Stream2Mmap, qw17, ch_17, n)
+ .invoke(Stream2Mmap, qw18, ch_18, n)
+ .invoke(Stream2Mmap, qw19, ch_19, n)
+ .invoke(Stream2Mmap, qw20, ch_20, n)
+ .invoke(Stream2Mmap, qw21, ch_21, n)
+ .invoke(Stream2Mmap, qw22, ch_22, n);
+}
diff --git a/benchmarks/tapa_flow/bandwidth23/design/bandwidth23.h b/benchmarks/tapa_flow/bandwidth23/design/bandwidth23.h
new file mode 100644
index 00000000..5686779d
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth23/design/bandwidth23.h
@@ -0,0 +1,37 @@
+
+#ifndef __VADD_BW_H__
+#define __VADD_BW_H__
+#include
+
+#include
+#include
+
+typedef ap_uint<512> bit512;
+
+void bandwidth23(
+ tapa::mmap ch_0,
+ tapa::mmap ch_1,
+ tapa::mmap ch_2,
+ tapa::mmap ch_3,
+ tapa::mmap ch_4,
+ tapa::mmap ch_5,
+ tapa::mmap ch_6,
+ tapa::mmap ch_7,
+ tapa::mmap ch_8,
+ tapa::mmap ch_9,
+ tapa::mmap ch_10,
+ tapa::mmap ch_11,
+ tapa::mmap ch_12,
+ tapa::mmap ch_13,
+ tapa::mmap ch_14,
+ tapa::mmap ch_15,
+ tapa::mmap ch_16,
+ tapa::mmap ch_17,
+ tapa::mmap ch_18,
+ tapa::mmap ch_19,
+ tapa::mmap ch_20,
+ tapa::mmap ch_21,
+ tapa::mmap ch_22,
+ uint64_t n);
+
+#endif
diff --git a/benchmarks/tapa_flow/bandwidth23/design/config/run_au55c.py/ab_config.json b/benchmarks/tapa_flow/bandwidth23/design/config/run_au55c.py/ab_config.json
new file mode 100644
index 00000000..5676d8e8
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth23/design/config/run_au55c.py/ab_config.json
@@ -0,0 +1,34 @@
+{
+ "dse_range_max": 0.8,
+ "dse_range_min": 0.7,
+ "partition_strategy": "flat",
+ "port_pre_assignments": {
+ ".*ch_0_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_10_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_11_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_12_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_13_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_14_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_15_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_16_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_17_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_18_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_19_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_1_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_20_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_21_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_22_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_2_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_3_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_4_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_5_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_6_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_7_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_8_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_9_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ "ap_clk": "SLOT_X0Y0:SLOT_X0Y0",
+ "ap_rst_n": "SLOT_X0Y0:SLOT_X0Y0",
+ "interrupt": "SLOT_X0Y0:SLOT_X0Y0",
+ "s_axi_control_.*": "SLOT_X0Y0:SLOT_X0Y0"
+ }
+}
diff --git a/benchmarks/tapa_flow/bandwidth23/design/config/run_au55c.py/impl_config.json b/benchmarks/tapa_flow/bandwidth23/design/config/run_au55c.py/impl_config.json
new file mode 100644
index 00000000..3c481977
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth23/design/config/run_au55c.py/impl_config.json
@@ -0,0 +1,7 @@
+{
+ "max_workers": 2,
+ "port_to_clock_period": {
+ "ap_clk": 3.33
+ },
+ "vitis_platform": "xilinx_u55c_gen3x16_xdma_3_202210_1"
+}
diff --git a/benchmarks/tapa_flow/bandwidth23/design/config/run_au55c.py/link_config.ini b/benchmarks/tapa_flow/bandwidth23/design/config/run_au55c.py/link_config.ini
new file mode 100644
index 00000000..c19a4a5a
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth23/design/config/run_au55c.py/link_config.ini
@@ -0,0 +1,24 @@
+[connectivity]
+sp=bandwidth23.ch_0:HBM[0:1]
+sp=bandwidth23.ch_1:HBM[0:1]
+sp=bandwidth23.ch_2:HBM[0:1]
+sp=bandwidth23.ch_3:HBM[0:1]
+sp=bandwidth23.ch_4:HBM[0:1]
+sp=bandwidth23.ch_5:HBM[0:1]
+sp=bandwidth23.ch_6:HBM[0:1]
+sp=bandwidth23.ch_7:HBM[0:1]
+sp=bandwidth23.ch_8:HBM[0:1]
+sp=bandwidth23.ch_9:HBM[0:1]
+sp=bandwidth23.ch_10:HBM[0:1]
+sp=bandwidth23.ch_11:HBM[0:1]
+sp=bandwidth23.ch_12:HBM[0:1]
+sp=bandwidth23.ch_13:HBM[0:1]
+sp=bandwidth23.ch_14:HBM[0:1]
+sp=bandwidth23.ch_15:HBM[0:1]
+sp=bandwidth23.ch_16:HBM[0:1]
+sp=bandwidth23.ch_17:HBM[0:1]
+sp=bandwidth23.ch_18:HBM[0:1]
+sp=bandwidth23.ch_19:HBM[0:1]
+sp=bandwidth23.ch_20:HBM[0:1]
+sp=bandwidth23.ch_21:HBM[0:1]
+sp=bandwidth23.ch_22:HBM[0:1]
diff --git a/benchmarks/tapa_flow/bandwidth23/design/config/run_vck5000.py/ab_config.json b/benchmarks/tapa_flow/bandwidth23/design/config/run_vck5000.py/ab_config.json
new file mode 100644
index 00000000..b9325669
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth23/design/config/run_vck5000.py/ab_config.json
@@ -0,0 +1,34 @@
+{
+ "dse_range_max": 0.8,
+ "dse_range_min": 0.7,
+ "partition_strategy": "flat",
+ "port_pre_assignments": {
+ ".*ch_0_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_10_.*": "SLOT_X1Y0:SLOT_X1Y0",
+ ".*ch_11_.*": "SLOT_X1Y0:SLOT_X1Y0",
+ ".*ch_12_.*": "SLOT_X1Y0:SLOT_X1Y0",
+ ".*ch_13_.*": "SLOT_X1Y0:SLOT_X1Y0",
+ ".*ch_14_.*": "SLOT_X1Y0:SLOT_X1Y0",
+ ".*ch_15_.*": "SLOT_X1Y0:SLOT_X1Y0",
+ ".*ch_16_.*": "SLOT_X1Y0:SLOT_X1Y0",
+ ".*ch_17_.*": "SLOT_X1Y1:SLOT_X1Y1",
+ ".*ch_18_.*": "SLOT_X1Y1:SLOT_X1Y1",
+ ".*ch_19_.*": "SLOT_X1Y1:SLOT_X1Y1",
+ ".*ch_1_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_20_.*": "SLOT_X1Y1:SLOT_X1Y1",
+ ".*ch_21_.*": "SLOT_X1Y1:SLOT_X1Y1",
+ ".*ch_22_.*": "SLOT_X1Y1:SLOT_X1Y1",
+ ".*ch_2_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_3_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_4_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_5_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_6_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_7_.*": "SLOT_X0Y1:SLOT_X0Y1",
+ ".*ch_8_.*": "SLOT_X0Y1:SLOT_X0Y1",
+ ".*ch_9_.*": "SLOT_X0Y1:SLOT_X0Y1",
+ "ap_clk": "SLOT_X0Y0:SLOT_X0Y0",
+ "ap_rst_n": "SLOT_X0Y0:SLOT_X0Y0",
+ "interrupt": "SLOT_X0Y0:SLOT_X0Y0",
+ "s_axi_control_.*": "SLOT_X0Y0:SLOT_X0Y0"
+ }
+}
diff --git a/benchmarks/tapa_flow/bandwidth23/design/config/run_vck5000.py/impl_config.json b/benchmarks/tapa_flow/bandwidth23/design/config/run_vck5000.py/impl_config.json
new file mode 100644
index 00000000..9b47f4ca
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth23/design/config/run_vck5000.py/impl_config.json
@@ -0,0 +1,7 @@
+{
+ "max_workers": 2,
+ "port_to_clock_period": {
+ "ap_clk": 3.33
+ },
+ "vitis_platform": "xilinx_vck5000_gen4x8_qdma_2_202220_1"
+}
diff --git a/benchmarks/tapa_flow/bandwidth23/design/config/run_vck5000.py/link_config.ini b/benchmarks/tapa_flow/bandwidth23/design/config/run_vck5000.py/link_config.ini
new file mode 100644
index 00000000..cf375c2d
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth23/design/config/run_vck5000.py/link_config.ini
@@ -0,0 +1,27 @@
+platform=xilinx_vck5000_gen4x8_qdma_2_202220_1
+
+[connectivity]
+
+sp = bandwidth23.m_axi_ch_0:MC_NOC0
+sp = bandwidth23.m_axi_ch_1:MC_NOC0
+sp = bandwidth23.m_axi_ch_2:MC_NOC0
+sp = bandwidth23.m_axi_ch_3:MC_NOC0
+sp = bandwidth23.m_axi_ch_4:MC_NOC0
+sp = bandwidth23.m_axi_ch_5:MC_NOC0
+sp = bandwidth23.m_axi_ch_6:MC_NOC0
+sp = bandwidth23.m_axi_ch_7:MC_NOC0
+sp = bandwidth23.m_axi_ch_8:MC_NOC0
+sp = bandwidth23.m_axi_ch_9:MC_NOC0
+sp = bandwidth23.m_axi_ch_10:MC_NOC0
+sp = bandwidth23.m_axi_ch_11:MC_NOC0
+sp = bandwidth23.m_axi_ch_12:MC_NOC0
+sp = bandwidth23.m_axi_ch_13:MC_NOC0
+sp = bandwidth23.m_axi_ch_14:MC_NOC0
+sp = bandwidth23.m_axi_ch_15:MC_NOC0
+sp = bandwidth23.m_axi_ch_16:MC_NOC0
+sp = bandwidth23.m_axi_ch_17:MC_NOC0
+sp = bandwidth23.m_axi_ch_18:MC_NOC0
+sp = bandwidth23.m_axi_ch_19:MC_NOC0
+sp = bandwidth23.m_axi_ch_20:MC_NOC0
+sp = bandwidth23.m_axi_ch_21:MC_NOC0
+sp = bandwidth23.m_axi_ch_22:MC_NOC0
diff --git a/benchmarks/tapa_flow/bandwidth23/design/run_tapa.sh b/benchmarks/tapa_flow/bandwidth23/design/run_tapa.sh
new file mode 100644
index 00000000..0071559b
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth23/design/run_tapa.sh
@@ -0,0 +1,9 @@
+WORK_DIR=work.out
+
+tapa compile \
+ --top data_decoding \
+ --part-num xcu55c-fsvh2892-2L-e \
+ --clock-period 3.33 \
+ -o ${WORK_DIR}/data_decoding.xo \
+ -f src/data_decoder.cpp \
+ 2>&1 | tee tapa.log
diff --git a/benchmarks/tapa_flow/bandwidth23/run_au55c.py b/benchmarks/tapa_flow/bandwidth23/run_au55c.py
new file mode 100644
index 00000000..8ea706e5
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth23/run_au55c.py
@@ -0,0 +1,42 @@
+__copyright__ = """
+Copyright (c) 2024 RapidStream Design Automation, Inc. and contributors. All rights reserved.
+The contributor(s) of this file has/have agreed to the RapidStream Contributor License Agreement.
+"""
+
+from rapidstream import get_u55c_vitis_device_factory
+import os
+from pathlib import Path
+
+CURR_DIR = os.path.dirname(os.path.abspath(__file__))
+CURR_FILE = os.path.basename(__file__)
+
+VITIS_PLATFORM = "xilinx_u55c_gen3x16_xdma_3_202210_1"
+XO_PATH = f"{CURR_DIR}/design/generated/data_decoding.xo"
+
+factory = get_u55c_vitis_device_factory(VITIS_PLATFORM)
+
+# Reserve resource for the HBM Memory Sub-System.
+# The HMSS is not part of the user kernel so the partition optimization process
+# is unaware of its existence. We need to manually reserve resources for it.
+# For 512-bit HBM channels, each HBM channel uses approximately the following resources:
+# AREA_PER_HBM_CHANNEL = {
+# "LUT": 5000,
+# "FF": 6500,
+# "BRAM": 0,
+# "URAM": 0,
+# "DSP": 0,
+# }
+factory.reduce_slot_area(0, 0, lut=150800)
+factory.reduce_slot_area(0, 1, lut=146960)
+factory.reduce_slot_area(0, 2, lut=146960)
+factory.reduce_slot_area(1, 0, lut=128000)
+factory.reduce_slot_area(1, 1, lut=107840)
+factory.reduce_slot_area(1, 2, lut=115120)
+
+
+# For this U280 platform, the right most DSP column on the boundary between
+# dynamic/static region is not usable. So we need to adjust the DSP count
+# to reflect the actual available DSPs.
+print("Reducing DSP of (1, 1) to make it less congested")
+factory.reduce_slot_area(1, 1, dsp=100)
+factory.generate_virtual_device(Path(f"{CURR_DIR}/build/{CURR_FILE}/device.json"))
diff --git a/benchmarks/tapa_flow/bandwidth23/run_vck5000.py b/benchmarks/tapa_flow/bandwidth23/run_vck5000.py
new file mode 100644
index 00000000..ae36f962
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth23/run_vck5000.py
@@ -0,0 +1,84 @@
+__copyright__ = """
+Copyright (c) 2024 RapidStream Design Automation, Inc. and contributors. All rights reserved.
+The contributor(s) of this file has/have agreed to the RapidStream Contributor License Agreement.
+"""
+
+import os
+from pathlib import Path
+
+from rapidstream import DeviceFactory
+
+CURR_DIR = os.path.dirname(os.path.abspath(__file__))
+CURR_FILE = os.path.basename(__file__)
+
+VITIS_PLATFORM = "xilinx_vck5000_gen4x8_qdma_2_202220_1"
+VCK5000_PART_NAME = "xcvc1902-vsvd1760-2MP-e-S"
+
+
+factory = DeviceFactory(row=2, col=2, part_num=VCK5000_PART_NAME, board_name=None)
+
+for x in range(2):
+ for y in range(2):
+ pblock = f"-add CLOCKREGION_X{x*4}Y{y*4}:CLOCKREGION_X{x*4+3}Y{y*4+3}"
+ factory.set_slot_pblock(x, y, [pblock])
+
+
+# set SLR crossing capacity
+for x in range(2):
+ factory.set_slot_capacity(x, 0, north=11520)
+ factory.set_slot_capacity(x, 1, north=11520)
+
+ factory.set_slot_capacity(x, 1, south=11520)
+ # factory.set_slot_capacity(x, 2, south=11520)
+
+# Set W/E capacity
+for y in range(2):
+ factory.set_slot_capacity(0, y, east=40320)
+ factory.set_slot_capacity(1, y, west=40320)
+# factory.set_slot_capacity(0, 2, east=41178)
+# factory.set_slot_capacity(1, 2, west=41178)
+
+
+factory.set_platform_name(VITIS_PLATFORM)
+factory.set_user_pblock_name("pblock_dynamic_region")
+
+factory.set_slot_pblock(0, 0, ["-add CLOCKREGION_X0Y1:CLOCKREGION_X4Y2"])
+factory.set_slot_pblock(1, 0, ["-add CLOCKREGION_X5Y1:CLOCKREGION_X9Y2"])
+factory.set_slot_pblock(0, 1, ["-add CLOCKREGION_X0Y3:CLOCKREGION_X4Y4"])
+factory.set_slot_pblock(1, 1, ["-add CLOCKREGION_X5Y3:CLOCKREGION_X9Y4"])
+
+
+# Vitis uses 4395 nets from SLR0 to SLR1
+# factory.set_slot_capacity(1, 0, north=11520 - 4395)
+# factory.set_slot_capacity(1, 1, north=11520 - 4395)
+
+# Vitis uses 4185 nets from SLR1 to SLR2
+# factory.set_slot_capacity(1, 1, south=11520 - 4185)
+
+
+factory.extract_slot_resources()
+
+
+# Reserve resource for the HBM Memory Sub-System.
+# The HMSS is not part of the user kernel so the partition optimization process
+# is unaware of its existence. We need to manually reserve resources for it.
+# For 512-bit HBM channels, each HBM channel uses approximately the following resources:
+# AREA_PER_HBM_CHANNEL = {
+# "LUT": 5000,
+# "FF": 6500,
+# "BRAM": 0,
+# "URAM": 0,
+# "DSP": 0,
+# }
+# factory.reduce_slot_area(0, 0, lut=150800)
+# factory.reduce_slot_area(0, 1, lut=146960)
+# factory.reduce_slot_area(1, 0, lut=128000)
+# factory.reduce_slot_area(1, 1, lut=107840)
+
+
+# For this U280 platform, the right most DSP column on the boundary between
+# dynamic/static region is not usable. So we need to adjust the DSP count
+# to reflect the actual available DSPs.
+print("Reducing DSP of (1, 1) to make it less congested")
+factory.reduce_slot_area(1, 1, dsp=100)
+factory.generate_virtual_device(Path(f"{CURR_DIR}/build/{CURR_FILE}/device.json"))
diff --git a/benchmarks/tapa_flow/bandwidth4/Makefile b/benchmarks/tapa_flow/bandwidth4/Makefile
new file mode 100644
index 00000000..3f2761f9
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth4/Makefile
@@ -0,0 +1,114 @@
+# Copyright (c) 2024 RapidStream Design Automation, Inc. and contributors. All rights reserved.
+# The contributor(s) of this file has/have agreed to the RapidStream Contributor License Agreement.
+
+ROOT_DIR := $(shell git rev-parse --show-toplevel)
+KERNEL_NAME := bandwidth4
+RS_SCRIPT := $(CURDIR)/run.py
+SRC_DIR := $(CURDIR)/design
+AB_CONFIG := $(CURDIR)/design/config/$(notdir $(RS_SCRIPT))/ab_config.json
+IMPL_CONFIG := $(CURDIR)/design/config/$(notdir $(RS_SCRIPT))/impl_config.json
+LINK_CONFIG := $(CURDIR)/design/config/$(notdir $(RS_SCRIPT))/link_config.ini
+PLATFORM := xilinx_vck5000_gen4x8_qdma_2_202220_1
+PART_NUM := xcvc1902-vsvd1760-2MP-e-S
+GRP_UTIL := $(ROOT_DIR)/common/util/get_group.py
+TEMP_DIR := $(CURDIR)/build/$(notdir $(RS_SCRIPT))
+RS_TARGET := $(TEMP_DIR)/dse/solution_0/vitis_run_hw/$(KERNEL_NAME)_$(PLATFORM).xclbin
+BUILD_LOG := $(TEMP_DIR)/build.json
+SUCCESS := "Build Successful"
+TIMING_RPT := impl_1_hw_bb_locked_timing_summary_routed.rpt
+SLACK_GETTER := $(ROOT_DIR)/common/util/get_slack.py
+RSPATH := $(CURDIR)
+RSXX := rapidstream
+RSPYTHON := rapidstream
+DEVICE_CONFIG := $(TEMP_DIR)/device.json
+DEVICE_GEN := $(CURDIR)/gen_device.py
+INCLUDE := -I $(XILINX_HLS)/include
+KERNEL_XO := $(TEMP_DIR)/$(KERNEL_NAME).xo
+KERNEL_XCLBIN := $(TEMP_DIR)/$(KERNEL_NAME).xclbin
+KERNEL_XSA := $(TEMP_DIR)/$(KERNEL_NAME).xsa
+TARGET := hw
+
+all: $(RS_TARGET)
+ cd $(RSPATH) && $(RSPYTHON) $(SLACK_GETTER) -d $(TEMP_DIR) -i $(TIMING_RPT) -o $(BUILD_LOG) -c clk_kernel_00_unbuffered_net -p 3.333
+ @echo $(SUCCESS)
+
+# --run-impl
+$(RS_TARGET):$(KERNEL_XO) $(DEVICE_CONFIG)
+ mkdir -p $(TEMP_DIR)
+ cd $(RSPATH) && $(RSXX)-tapaopt \
+ --work-dir $(TEMP_DIR) \
+ --tapa-xo-path $< \
+ --device-config $(DEVICE_CONFIG) \
+ --floorplan-config $(AB_CONFIG) \
+ --single-reg \
+ --implementation-config $(IMPL_CONFIG) \
+ --connectivity-ini $(LINK_CONFIG)
+
+$(DEVICE_CONFIG):$(AB_CONFIG)
+ mkdir -p $(TEMP_DIR)
+ cd $(RSPATH) && $(RSPYTHON) $(RS_SCRIPT)
+
+cosim:$(KERNEL_XO) $(TEMP_DIR)/main.exe
+ cd $(TEMP_DIR) && $(TEMP_DIR)/main.exe 1024 \
+ --bitstream $< \
+ -xosim_work_dir $(TEMP_DIR)/xosim_work_dir
+
+hw: $(KERNEL_XCLBIN)
+
+$(KERNEL_XCLBIN): $(KERNEL_XSA)
+ @echo "### ***** packaging $(KERNEL_XSA) into $(KERNEL_XCLBIN) ... *****"
+ cd $(TEMP_DIR) && v++ --package -t $(TARGET) --platform $(PLATFORM) \
+ $^ \
+ --temp_dir $(TEMP_DIR) \
+ --save-temps \
+ --report_dir $(TEMP_DIR)/reports/ \
+ --package.boot_mode=ospi \
+ -o $@ 2>&1 | tee $(KERNEL_NAME)_xclbin.log
+ @echo "### ***** $(KERNEL_XCLBIN) packaging done! *****"
+
+$(KERNEL_XSA): $(KERNEL_XO)
+ cd $(TEMP_DIR) && v++ -l -t ${TARGET} \
+ --connectivity.nk $(KERNEL_NAME):1:$(KERNEL_NAME) \
+ --config $(SRC_DIR)/vck5000.cfg \
+ --save-temps \
+ --temp_dir $(TEMP_DIR) \
+ --clock.defaultFreqHz 250000000 \
+ --vivado.synth.jobs 16 \
+ $< -o $@
+
+xo: $(KERNEL_XO)
+
+$(KERNEL_XO):$(SRC_DIR)/$(KERNEL_NAME).cpp
+ mkdir -p $(TEMP_DIR)
+ cd $(TEMP_DIR) && tapa compile \
+ --top $(KERNEL_NAME) \
+ --part-num xcu55c-fsvh2892-2L-e \
+ --clock-period 3.33 \
+ -o $(KERNEL_NAME).xo \
+ -f $< \
+ 2>&1 | tee tapa.log
+
+csim:$(TEMP_DIR)/main.exe
+
+$(TEMP_DIR)/main.exe: $(SRC_DIR)/*.cpp
+ mkdir -p $(TEMP_DIR)
+ cd $(TEMP_DIR) && tapa g++ $^ $(INCLUDE) -o $(TEMP_DIR)/main.exe -O2
+ $(TEMP_DIR)/main.exe
+
+show_groups:
+ rapidstream $(GRP_UTIL) -i $(TEMP_DIR)/passes/0-imported.json \
+ -o $(TEMP_DIR)/module_types.csv
+
+
+
+clean:
+ rm -rf $(TEMP_DIR) *.log
+ rm -rf .Xil .run
+ rm -rf *.exe
+ rm -rf .ipcache
+
+cleanall:
+ rm -rf build *.log
+ rm -rf .Xil .run
+ rm -rf *.exe
+ rm -rf .ipcache
diff --git a/benchmarks/tapa_flow/bandwidth4/README.md b/benchmarks/tapa_flow/bandwidth4/README.md
new file mode 100644
index 00000000..237c8651
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth4/README.md
@@ -0,0 +1,141 @@
+
+
+
+
+# TAPA Flow: Bandwidth4
+
+## Introduction
+
+
+In this recipe, we demonstrate how to use RapidStream to optimize TAPA projects. The basic steps include:
+
+- Compile the HLS C++ code into a Vitis-compatible .xo file using TAPA.
+- Optimize the .xo file with RapidStream to obtain an optimized .xo file.
+- Use Vitis to compile the optimized .xo file into an .xclbin file for FPGA deployment.
+
+## Tutorial
+
+### Step 1 (Done): Generate the Xilinx Object File (`.xo`)
+
+
+We utilize TAPA to generate the `.xo` file. If you do not have TAPA installed, you can use the `.xo` file we have already compiled from the C++ source. The original C++ source files are located in design/src. The generated `.xo` file can be found at design/generated/data_decoding.xo. To compile C++ to `.xo` using TAPA, we use the script [design/run_tapa.sh](design/run_tapa.sh), with the detailed commands shown below. For your convenience, we have also backed up all the metadata generated by TAPA in the design/generated directory.
+
+```bash
+WORK_DIR=generated
+tapac \
+ --work-dir ${WORK_DIR} \
+ --top data_decoding \
+ --part-num xcu280-fsvh2892-2L-e \
+ --clock-period 3.33 \
+ -o ${WORK_DIR}/data_decoding.xo \
+ --connectivity config/link_config.ini \
+ src/data_decoder.cpp \
+ 2>&1 | tee tapa.log
+```
+
+### Step 2: Use RapidStream to Optimize the `.xo` Design
+
+The RapidStream flow conducts design space exploration and generates solutions by taking the TAPA-generated `.xo` file as input.
+The RapidStream flow for TAPA requires the following key inputs:
+
+- **Platform**: The Vitis platform (e.g., `xilinx_u280_gen3x16_xdma_1_202211_1`).
+- **Device**: virtual device defined by calling RapidStream APIs based on the platform (e.g., `get_u280_vitis_device_factory`).
+- **.xo file**: The `.xo` file generated by TAPA
+- **Connectivity** (.ini): Include the configuration file for `v++` design/config/run.py/link_config.ini.
+- **top_module_name**: Top module name for the kernel.
+- **Clock**: All the clock and frequencies.
+- **Flatten Module**: Within a design, not all modules need to be optimized. The flatten module name is the target module rapidstream will optimize.
+
+The Python snippet below shows how we initiate rapidstream instance to set up the rapidstream environment.
+
+```Python
+from rapidstream import get_u280_vitis_device_factory, RapidStreamTAPA
+import os
+
+CURR_DIR = os.path.dirname(os.path.abspath(__file__))
+INI_PATH = f"{CURR_DIR}/design/config/link_config.ini"
+VITIS_PLATFORM = "xilinx_u280_gen3x16_xdma_1_202211_1"
+XO_PATH = f"{CURR_DIR}/design/generated/data_decoding.xo"
+kernel_name = "data_decoding"
+factory = get_u280_vitis_device_factory(VITIS_PLATFORM)
+rs = RapidStreamTAPA(f"{CURR_DIR}/build")
+rs.set_virtual_device(factory.generate_virtual_device())
+rs.add_xo_file(XO_PATH)
+rs.set_vitis_platform(VITIS_PLATFORM)
+rs.set_vitis_connectivity_config(INI_PATH)
+rs.set_top_module_name(kernel_name)
+rs.add_clock("ap_clk", 3.33)
+rs.add_flatten_targets([kernel_name])
+```
+
+The HBM AXI port connection is described in design/config/run.py/link_config.ini.
+
+```bash
+[connectivity]
+sp=data_decoding.input_port:HBM[0:1]
+sp=data_decoding.output_port0_32b_8b:HBM[16:17]
+sp=data_decoding.output_port1_16b_8b:HBM[18:19]
+sp=data_decoding.output_port2_16b_8b:HBM[20:21]
+sp=data_decoding.output_port3_8b:HBM[22:23]
+sp=data_decoding.output_port4_Track:HBM[24:25]
+```
+
+As a result, it is necessary to assign the kernel ports to the appropriate slots. The Python code below demonstrates this process. For comprehensive linking details, please refer to the design/config/run.py/link_config.ini file.
+
+ ```Python
+# Bind ports to HBM 16-31
+right_slot = "SLOT_X1Y0:SLOT_X1Y0"
+left_slot = "SLOT_X0Y0:SLOT_X0Y0"
+rs.assign_port_to_region(".*input_port.*", left_slot)
+rs.assign_port_to_region(".*output_port0_32b_8b.*", right_slot)
+rs.assign_port_to_region(".*output_port1_16b_8b.*", right_slot)
+rs.assign_port_to_region(".*output_port2_16b_8b.*", right_slot)
+rs.assign_port_to_region(".*output_port3_8b.*", right_slot)
+rs.assign_port_to_region(".*output_port4_Track.*", right_slot)
+rs.assign_port_to_region("s_axi_control_.*", left_slot)
+rs.assign_port_to_region("ap_clk", left_slot)
+rs.assign_port_to_region("ap_rst_n", left_slot)
+rs.assign_port_to_region("interrupt", left_slot)
+```
+
+For the complete details, please refer to the [./run_vck5000.py](./run_vck5000.py) file. Invoke RapidStream by launching the command below or running `make all`.
+
+```bash
+rapidstream run_vck5000.py
+```
+
+If everything is successful, you should at least get one optimized `.xclbin` file.
+
+
+
+
+### Step 3: Check the Group Module Report
+
+
+RapidStream mandates a clear distinction between communication and computation within user designs.
+
+- In `Group modules`, users are tasked solely with defining inter-submodule communication. For those familiar with Vivado IP Integrator flow, crafting a Group module mirrors the process of connecting IPs in IPI. RapidStream subsequently integrates appropriate pipeline registers into these Group modules.
+
+- In `Leaf modules`, users retain the flexibility to implement diverse computational patterns, as RapidStream leaves these Leaf modules unchanged.
+
+For further details, please consult the [code style](https://docs.rapidstream-da.com/required-coding-style/) section in our Documentation.
+
+To generate a report on group types, execute the commands below or run `make show_groups`:
+
+```bash
+rapidstream ../../../common/util/get_group.py \
+ -i build/passes/0-imported.json \
+ -o build/module_types.csv
+```
+
+The module types for your design can be found in `build/module_types.csv`. Below, we list the four Group modules. In this design, `data_decoding` serves as a Group module, while the other three modules are added by RapidStream.
+
+| Module Name | Group Type |
+|:--------------------------------:|:--------------:|
+| data_decoding | grouped_module |
+|__rs_ap_ctrl_start_ready_pipeline | grouped_module |
+|__rs_ff_pipeline | grouped_module |
+|__rs_hs_pipeline | grouped_module |
diff --git a/benchmarks/tapa_flow/bandwidth4/design/bandwidth-host.cpp b/benchmarks/tapa_flow/bandwidth4/design/bandwidth-host.cpp
new file mode 100644
index 00000000..340e299d
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth4/design/bandwidth-host.cpp
@@ -0,0 +1,70 @@
+#include
+#include
+
+#include
+#include
+#include "bandwidth4.h"
+
+using std::clog;
+using std::endl;
+using std::vector;
+
+DEFINE_string(bitstream, "", "path to bitstream file, run csim if empty");
+
+int main(int argc, char* argv[]) {
+ gflags::ParseCommandLineFlags(&argc, &argv, /*remove_flags=*/true);
+
+ const uint64_t n = argc > 1 ? atoll(argv[1]) : 1024 * 1024;
+
+ vector rmem0(n);
+ vector rmem1(n);
+ vector rmem2(n);
+ vector rmem3(n);
+
+
+ for (uint64_t i = 0; i < n; ++i) {
+ rmem0[i] = i;
+ rmem1[i] = i;
+ rmem2[i] = i;
+ rmem3[i] = i;
+ }
+ int64_t kernel_time_ns = tapa::invoke(
+ bandwidth4,
+ FLAGS_bitstream,
+ tapa::read_write_mmap(rmem0),
+ tapa::read_write_mmap(rmem1),
+ tapa::read_write_mmap(rmem2),
+ tapa::read_write_mmap(rmem3),
+ n);
+
+ clog << "kernel time: " << kernel_time_ns * 1e-9 << " s" << endl;
+
+ uint64_t num_errors = 0;
+ const uint64_t threshold = 10; // only report up to these errors
+ for (uint64_t i = 0; i < n; ++i) {
+ bit512 out512 = (i << 1);
+ if (rmem0[i] != out512) {
+ if (num_errors < threshold) {
+ clog << "error at " << i << ": expected " << rmem0[i] << ", got "
+ << out512 << endl;
+ }
+ ++num_errors;
+ }
+ if (rmem3[i] != out512) {
+ if (num_errors < threshold) {
+ clog << "error at " << i << ": expected " << rmem3[i] << ", got "
+ << out512 << endl;
+ }
+ ++num_errors;
+ }
+ }
+ if (num_errors == 0) {
+ clog << "PASS!" << endl;
+ } else {
+ if (num_errors > threshold) {
+ clog << " (+" << (num_errors - threshold) << " more errors)" << endl;
+ }
+ clog << "FAIL!" << endl;
+ }
+ return num_errors > 0 ? 1 : 0;
+}
diff --git a/benchmarks/tapa_flow/bandwidth4/design/bandwidth4.cpp b/benchmarks/tapa_flow/bandwidth4/design/bandwidth4.cpp
new file mode 100644
index 00000000..25d1ba55
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth4/design/bandwidth4.cpp
@@ -0,0 +1,62 @@
+#include
+
+#include
+#include "bandwidth4.h"
+
+void yshift(tapa::istream& a, tapa::ostream& b, uint64_t n) {
+ for (uint64_t i = 0; i < n; ++i) {
+ bit512 tmp;
+ tmp = a.read();
+ tmp = (tmp << 1);
+ b.write(tmp);
+ }
+}
+
+void Mmap2Stream(
+ tapa::mmap mmap,
+ uint64_t n,
+ tapa::ostream& stream){
+
+ for (uint64_t i = 0; i < n; ++i) {
+ stream << mmap[i];
+ }
+}
+
+void Stream2Mmap(tapa::istream& stream, tapa::mmap mmap,
+ uint64_t n) {
+ for (uint64_t i = 0; i < n; ++i) {
+ mmap[i] = stream.read();
+ }
+}
+
+void bandwidth4(
+ tapa::mmap ch_0,
+ tapa::mmap ch_1,
+ tapa::mmap ch_2,
+ tapa::mmap ch_3,
+ uint64_t n) {
+
+ tapa::stream qr0("qr0");
+ tapa::stream qr1("qr1");
+ tapa::stream qr2("qr2");
+ tapa::stream qr3("qr3");
+
+ tapa::stream qw0("qw0");
+ tapa::stream qw1("qw1");
+ tapa::stream qw2("qw2");
+ tapa::stream qw3("qw3");
+
+ tapa::task()
+ .invoke(Mmap2Stream, ch_0, n, qr0)
+ .invoke(Mmap2Stream, ch_1, n, qr1)
+ .invoke(Mmap2Stream, ch_2, n, qr2)
+ .invoke(Mmap2Stream, ch_3, n, qr3)
+ .invoke(yshift, qr0, qw0, n)
+ .invoke(yshift, qr1, qw1, n)
+ .invoke(yshift, qr2, qw2, n)
+ .invoke(yshift, qr3, qw3, n)
+ .invoke(Stream2Mmap, qw0, ch_0, n)
+ .invoke(Stream2Mmap, qw1, ch_1, n)
+ .invoke(Stream2Mmap, qw2, ch_2, n)
+ .invoke(Stream2Mmap, qw3, ch_3, n);
+}
diff --git a/benchmarks/tapa_flow/bandwidth4/design/bandwidth4.h b/benchmarks/tapa_flow/bandwidth4/design/bandwidth4.h
new file mode 100644
index 00000000..6974458f
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth4/design/bandwidth4.h
@@ -0,0 +1,18 @@
+
+#ifndef __VADD_BW_H__
+#define __VADD_BW_H__
+#include
+
+#include
+#include
+
+typedef ap_uint<512> bit512;
+
+void bandwidth4(
+ tapa::mmap ch_0,
+ tapa::mmap ch_1,
+ tapa::mmap ch_2,
+ tapa::mmap ch_3,
+ uint64_t n);
+
+#endif
diff --git a/benchmarks/tapa_flow/bandwidth4/design/config/run_vck5000.py/ab_config.json b/benchmarks/tapa_flow/bandwidth4/design/config/run_vck5000.py/ab_config.json
new file mode 100644
index 00000000..264df902
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth4/design/config/run_vck5000.py/ab_config.json
@@ -0,0 +1,15 @@
+{
+ "dse_range_max": 0.8,
+ "dse_range_min": 0.7,
+ "partition_strategy": "flat",
+ "port_pre_assignments": {
+ ".*ch_0_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_1_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_2_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_3_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ "ap_clk": "SLOT_X0Y0:SLOT_X0Y0",
+ "ap_rst_n": "SLOT_X0Y0:SLOT_X0Y0",
+ "interrupt": "SLOT_X0Y0:SLOT_X0Y0",
+ "s_axi_control_.*": "SLOT_X0Y0:SLOT_X0Y0"
+ }
+}
diff --git a/benchmarks/tapa_flow/bandwidth4/design/config/run_vck5000.py/impl_config.json b/benchmarks/tapa_flow/bandwidth4/design/config/run_vck5000.py/impl_config.json
new file mode 100644
index 00000000..9b47f4ca
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth4/design/config/run_vck5000.py/impl_config.json
@@ -0,0 +1,7 @@
+{
+ "max_workers": 2,
+ "port_to_clock_period": {
+ "ap_clk": 3.33
+ },
+ "vitis_platform": "xilinx_vck5000_gen4x8_qdma_2_202220_1"
+}
diff --git a/benchmarks/tapa_flow/bandwidth4/design/config/run_vck5000.py/link_config.ini b/benchmarks/tapa_flow/bandwidth4/design/config/run_vck5000.py/link_config.ini
new file mode 100644
index 00000000..17e6686e
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth4/design/config/run_vck5000.py/link_config.ini
@@ -0,0 +1,8 @@
+platform=xilinx_vck5000_gen4x8_qdma_2_202220_1
+
+[connectivity]
+
+sp = bandwidth4.m_axi_ch_0:MC_NOC0
+sp = bandwidth4.m_axi_ch_1:MC_NOC0
+sp = bandwidth4.m_axi_ch_2:MC_NOC0
+sp = bandwidth4.m_axi_ch_3:MC_NOC0
diff --git a/benchmarks/tapa_flow/bandwidth4/design/run_tapa.sh b/benchmarks/tapa_flow/bandwidth4/design/run_tapa.sh
new file mode 100644
index 00000000..0071559b
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth4/design/run_tapa.sh
@@ -0,0 +1,9 @@
+WORK_DIR=work.out
+
+tapa compile \
+ --top data_decoding \
+ --part-num xcu55c-fsvh2892-2L-e \
+ --clock-period 3.33 \
+ -o ${WORK_DIR}/data_decoding.xo \
+ -f src/data_decoder.cpp \
+ 2>&1 | tee tapa.log
diff --git a/benchmarks/tapa_flow/bandwidth4/run_vck5000.py b/benchmarks/tapa_flow/bandwidth4/run_vck5000.py
new file mode 100644
index 00000000..ae36f962
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth4/run_vck5000.py
@@ -0,0 +1,84 @@
+__copyright__ = """
+Copyright (c) 2024 RapidStream Design Automation, Inc. and contributors. All rights reserved.
+The contributor(s) of this file has/have agreed to the RapidStream Contributor License Agreement.
+"""
+
+import os
+from pathlib import Path
+
+from rapidstream import DeviceFactory
+
+CURR_DIR = os.path.dirname(os.path.abspath(__file__))
+CURR_FILE = os.path.basename(__file__)
+
+VITIS_PLATFORM = "xilinx_vck5000_gen4x8_qdma_2_202220_1"
+VCK5000_PART_NAME = "xcvc1902-vsvd1760-2MP-e-S"
+
+
+factory = DeviceFactory(row=2, col=2, part_num=VCK5000_PART_NAME, board_name=None)
+
+for x in range(2):
+ for y in range(2):
+ pblock = f"-add CLOCKREGION_X{x*4}Y{y*4}:CLOCKREGION_X{x*4+3}Y{y*4+3}"
+ factory.set_slot_pblock(x, y, [pblock])
+
+
+# set SLR crossing capacity
+for x in range(2):
+ factory.set_slot_capacity(x, 0, north=11520)
+ factory.set_slot_capacity(x, 1, north=11520)
+
+ factory.set_slot_capacity(x, 1, south=11520)
+ # factory.set_slot_capacity(x, 2, south=11520)
+
+# Set W/E capacity
+for y in range(2):
+ factory.set_slot_capacity(0, y, east=40320)
+ factory.set_slot_capacity(1, y, west=40320)
+# factory.set_slot_capacity(0, 2, east=41178)
+# factory.set_slot_capacity(1, 2, west=41178)
+
+
+factory.set_platform_name(VITIS_PLATFORM)
+factory.set_user_pblock_name("pblock_dynamic_region")
+
+factory.set_slot_pblock(0, 0, ["-add CLOCKREGION_X0Y1:CLOCKREGION_X4Y2"])
+factory.set_slot_pblock(1, 0, ["-add CLOCKREGION_X5Y1:CLOCKREGION_X9Y2"])
+factory.set_slot_pblock(0, 1, ["-add CLOCKREGION_X0Y3:CLOCKREGION_X4Y4"])
+factory.set_slot_pblock(1, 1, ["-add CLOCKREGION_X5Y3:CLOCKREGION_X9Y4"])
+
+
+# Vitis uses 4395 nets from SLR0 to SLR1
+# factory.set_slot_capacity(1, 0, north=11520 - 4395)
+# factory.set_slot_capacity(1, 1, north=11520 - 4395)
+
+# Vitis uses 4185 nets from SLR1 to SLR2
+# factory.set_slot_capacity(1, 1, south=11520 - 4185)
+
+
+factory.extract_slot_resources()
+
+
+# Reserve resource for the HBM Memory Sub-System.
+# The HMSS is not part of the user kernel so the partition optimization process
+# is unaware of its existence. We need to manually reserve resources for it.
+# For 512-bit HBM channels, each HBM channel uses approximately the following resources:
+# AREA_PER_HBM_CHANNEL = {
+# "LUT": 5000,
+# "FF": 6500,
+# "BRAM": 0,
+# "URAM": 0,
+# "DSP": 0,
+# }
+# factory.reduce_slot_area(0, 0, lut=150800)
+# factory.reduce_slot_area(0, 1, lut=146960)
+# factory.reduce_slot_area(1, 0, lut=128000)
+# factory.reduce_slot_area(1, 1, lut=107840)
+
+
+# For this U280 platform, the right most DSP column on the boundary between
+# dynamic/static region is not usable. So we need to adjust the DSP count
+# to reflect the actual available DSPs.
+print("Reducing DSP of (1, 1) to make it less congested")
+factory.reduce_slot_area(1, 1, dsp=100)
+factory.generate_virtual_device(Path(f"{CURR_DIR}/build/{CURR_FILE}/device.json"))
diff --git a/benchmarks/vitis_flow/bandwidth23/Makefile b/benchmarks/vitis_flow/bandwidth23/Makefile
new file mode 100644
index 00000000..e5963275
--- /dev/null
+++ b/benchmarks/vitis_flow/bandwidth23/Makefile
@@ -0,0 +1,119 @@
+# Copyright (c) 2024 RapidStream Design Automation, Inc. and contributors. All rights reserved.
+# The contributor(s) of this file has/have agreed to the RapidStream Contributor License Agreement.
+
+ROOT_DIR := $(shell git rev-parse --show-toplevel)
+GRP_UTIL := $(ROOT_DIR)/common/util/get_group.py
+PLATFORM := xilinx_vck5000_gen4x8_qdma_2_202220_1
+PART := xcvc1902-vsvd1760-2MP-e-S
+LINK_FILE := link_config_hbm.ini
+KERNEL_NAME := bandwidth23
+HLSXX := vitis_hls
+SRC_DIR := $(CURDIR)/design
+RS_SCRIPT := $(CURDIR)/run.py
+TEMP_DIR := $(CURDIR)/build/$(notdir $(RS_SCRIPT))
+HOST := $(TEMP_DIR)/app.exe
+KERNEL_XO := $(TEMP_DIR)/$(KERNEL_NAME).xo
+KERNEL_XSA := $(TEMP_DIR)/$(KERNEL_NAME).xsa
+KERNEL_XCLBIN := $(TEMP_DIR)/$(KERNEL_NAME).xclbin
+RS_XCLBIN := $(TEMP_DIR)/dse/candidate_0/vitis_run_hw/$(KERNEL_NAME)_$(PLATFORM).xclbin
+CLK_PERIOD_NS := 3
+TARGET := hw
+HLS2RTL_TCL := $(ROOT_DIR)/common/tcl/hls2rtl.tcl
+GEN_XO := 1
+
+BUILD_LOG := $(TEMP_DIR)/build.json
+SUCCESS := "Build Successful"
+TIMING_RPT := impl_1_hw_bb_locked_timing_summary_routed.rpt
+SLACK_GETTER := $(ROOT_DIR)/common/util/get_slack.py
+RSXX := rapidstream
+
+
+
+
+all: $(RS_XCLBIN)
+ $(RSXX) $(SLACK_GETTER) -d $(TEMP_DIR) -i $(TIMING_RPT) -o $(BUILD_LOG) -c clk_kernel_00_unbuffered_net -p 3.333
+ echo $(SUCCESS)
+
+$(RS_XCLBIN):$(KERNEL_XO)
+ $(RSXX) $(RS_SCRIPT)
+
+hw: $(KERNEL_XCLBIN)
+
+$(KERNEL_XCLBIN): $(KERNEL_XSA)
+ @echo "### ***** packaging $(KERNEL_XSA) into $(KERNEL_XCLBIN) ... *****"
+ cd $(TEMP_DIR) && v++ --package -t $(TARGET) --platform $(PLATFORM) \
+ $^ \
+ --temp_dir $(TEMP_DIR) \
+ --save-temps \
+ --report_dir $(TEMP_DIR)/reports/ \
+ --package.boot_mode=ospi \
+ -o $@ 2>&1 | tee $(KERNEL_NAME)_xclbin.log
+ @echo "### ***** $(KERNEL_XCLBIN) packaging done! *****"
+
+$(KERNEL_XSA): $(KERNEL_XO)
+ cd $(TEMP_DIR) && v++ -l -t ${TARGET} \
+ --connectivity.nk $(KERNEL_NAME):1:$(KERNEL_NAME) \
+ --config $(SRC_DIR)/vck5000.cfg \
+ --save-temps \
+ --temp_dir $(TEMP_DIR) \
+ --clock.defaultFreqHz 250000000 \
+ --vivado.synth.jobs 16 \
+ $< -o $@
+
+
+xo:$(KERNEL_XO)
+
+$(KERNEL_XO): $(SRC_DIR)/$(KERNEL_NAME).cpp $(SRC_DIR)/$(KERNEL_NAME).h
+ mkdir -p $(TEMP_DIR)
+ cd $(TEMP_DIR) && v++ -c -t ${TARGET} \
+ --platform $(PLATFORM) \
+ -k $(KERNEL_NAME) \
+ --temp_dir $(TEMP_DIR) \
+ --save-temps \
+ -o $@ \
+ $^
+
+sw_emu: $(HOST) $(SRC_DIR)/$(KERNEL_NAME).cpp $(SRC_DIR)/$(KERNEL_NAME).h
+ mkdir -p $(TEMP_DIR)
+ cd $(TEMP_DIR) && v++ -c -t sw_emu \
+ --platform xilinx_u50_gen3x16_xdma_5_202210_1 \
+ -k $(KERNEL_NAME) \
+ --temp_dir $(TEMP_DIR) \
+ --save-temps \
+ -o $(TEMP_DIR)/$(KERNEL_NAME)_sw_emu.xo \
+ $^
+ cd $(TEMP_DIR) && v++ -l -t sw_emu \
+ $(TEMP_DIR)/$(KERNEL_NAME)_sw_emu.xo \
+ --platform xilinx_u50_gen3x16_xdma_5_202210_1 \
+ --kernel $(KERNEL_NAME) \
+ --connectivity.nk $(KERNEL_NAME):1:$(KERNEL_NAME) \
+ -o $(TEMP_DIR)/$(KERNEL_NAME)_sw_emu.xclbin
+ cd $(TEMP_DIR) && XCL_EMULATION_MODE=sw_emu $< $(TEMP_DIR)/$(KERNEL_NAME)_sw_emu.xclbin
+
+host:$(HOST)
+
+$(HOST): $(SRC_DIR)/host.cpp
+ mkdir -p $(TEMP_DIR)
+ g++ -Wall -g -std=c++11 $(SRC_DIR)/host.cpp -o $@ \
+ -I${XILINX_XRT}/include/ \
+ -I${XILINX_HLS}/include/ \
+ -L${XILINX_XRT}/lib/ -lOpenCL -pthread -lrt -lstdc++
+
+show_groups:
+ rapidstream $(GRP_UTIL) -i $(TEMP_DIR)/passes/0-imported.json \
+ -o $(TEMP_DIR)/module_types.csv
+
+
+
+clean:
+ rm -rf $(TEMP_DIR) *.log
+ rm -rf .Xil .run
+ rm -rf *.exe
+ rm -rf .ipcache
+
+
+cleanall:
+ rm -rf build *.log
+ rm -rf .Xil .run
+ rm -rf *.exe
+ rm -rf .ipcache
diff --git a/benchmarks/vitis_flow/bandwidth23/README.md b/benchmarks/vitis_flow/bandwidth23/README.md
new file mode 100644
index 00000000..f6d4bcdd
--- /dev/null
+++ b/benchmarks/vitis_flow/bandwidth23/README.md
@@ -0,0 +1,118 @@
+
+
+
+
+# Large Language Model Benchmark
+
+## Introduction
+
+In this recipe, we illustrate how to create a Vitis objective file (`.xo`) for a Large Language Model kernel from [Chen *et al.* (TRETS)](https://dl.acm.org/doi/10.1145/3656177) using Vitis, then optimize the `.xo` file with RapidStream, and finally utilize the optimized output in the ongoing Vitis development process.
+
+
+## Tutorial
+
+### Step 1: Generate the Xilinx Object File (`.xo`)
+
+We use Vitis 2023.2 to generate the `.xo` file. Since we want to disable [free running pipeline (FRP)](https://www.xilinx.com/htmldocs/xilinx2021_2/hls-guidance/200-1553.html) feature for HLS step, we use [hls2rtl.tcl](../../../common/tcl/hls2rtl.tcl) to compile the C++ code to `.xo` file.
+
+Run the following command or run `make clean && make xo`:
+
+```bash
+source /Vitis/2023.2/settings64.sh
+make clean
+mkdir -p build
+vitis_hls ../../../common/tcl/hls2rtl.tcl \
+ -l build/vitis_hls_llm.log \
+ -tclargs \
+ xcu50-fsvh2104-2-e \
+ 4 \
+ bert_all \
+ 1 \
+ design/bert_all.cpp design/kernel.h \
+ design/bert_region_1.cpp design/bert_region_1.h \
+ design/bert_region_2.cpp design/bert_region_2.h \
+ design/bert_region_3.cpp design/bert_region_3.h
+```
+
+### Step 2 (Optional): Use Vitis --link to Generate the `.xclbin` File
+
+:warning: **Note**: This step can take hours to complete. We recommend using the RapidStream flow to optimize the `.xo` file instead of generating the `.xclbin` file if you are familiar with AMD Vitis flow.
+
+With the `.xo` file generated, you can use `v++ -link` to generate the `.xclbin` file. Run the following command or execute `make hw`:
+
+```bash
+v++ -l -t hw \
+ --platform xilinx_u50_gen3x16_xdma_5_202210_1 \
+ --kernel bert_all \
+ --connectivity.nk bert_all:1:bert_all \
+ --config design/link_config_hbm.ini \
+ --temp_dir build \
+ -o build/bert_all.xclbin \
+ build/bert_all.xo
+```
+
+### Step 3: Call RapidStream to Optimize the Design
+
+The RapidStream flow conducts design space exploration and generates optimized `.xo` files by taking the Vitis generated `.xo` as the input. The RapidStream flow for Vitis requires four key inputs:
+
+1. **Device**: Specify the Vitis platform name for `v++`.
+2. **Xilinx Object file** (.xo): Provide the file generated by `v++` or Vivado.
+3. **Connectivity** (.ini): Include the configuration file for `v++` ./design/link_config_hbm.ini.
+4. **Clock targets**: Define the desired clock frequencies.
+5. RapidStream automatically handles all other aspects of the flow.
+
+Please refer to [run_u50.py](./run_u50.py) for the complete RapidStream flow.
+To execute the flow and generate optimized `.xo` files,
+Run the following command or execute `make rs_opt`:
+
+```bash
+rapidstream ./run_u50.py
+```
+
+Unlike the example provided in [getting_started/vitis_source](../../../getting_started/vitis_source/run.py), where the `skip_impl` variable is set to `True`, in this case the DSE engine will automatically launch Vitis to link the optimized `.xo` file to the target device and generate the `.xclbin` file.
+
+```bash
+# Skip Vitis implementation.
+rs.run_dse(skip_impl=True)
+```
+
+When finished, you can locate these files using the following command:
+
+
+```bash
+find ./build/dse/ -name *.xclbin
+```
+
+If everything is successful, you should at least get one optimized `.xclbin` file.
+
+
+### Step 4: Check the Group Module Report
+
+
+RapidStream mandates a clear distinction between communication and computation within user designs.
+
+- In `Group modules`, users are tasked solely with defining inter-submodule communication. For those familiar with Vivado IP Integrator flow, crafting a Group module mirrors the process of connecting IPs in IPI. RapidStream subsequently integrates appropriate pipeline registers into these Group modules.
+
+- In `Leaf modules`, users retain the flexibility to implement diverse computational patterns, as RapidStream leaves these Leaf modules unchanged.
+
+For further details, please consult the [code style](https://docs.rapidstream-da.com/required-coding-style/) section in our Documentation.
+
+To generate a report on group types, execute the commands below or run `make show_groups`:
+
+```bash
+rapidstream ../../../common/util/get_group.py \
+ -i build/passes/0-imported.json \
+ -o build/module_types.csv
+```
+
+The module types for your design can be found in `build/module_types.csv`. Below, we list the four Group modules. In this design, `bert_all` serves as a Group module, while the other three modules are added by RapidStream.
+
+| Module Name | Group Type |
+|:--------------------------------:|:--------------:|
+| bert_all | grouped_module |
+|__rs_ap_ctrl_start_ready_pipeline | grouped_module |
+|__rs_ff_pipeline | grouped_module |
+|__rs_hs_pipeline | grouped_module |
diff --git a/benchmarks/vitis_flow/bandwidth23/design/bandwidth23.cpp b/benchmarks/vitis_flow/bandwidth23/design/bandwidth23.cpp
new file mode 100644
index 00000000..e1197a2d
--- /dev/null
+++ b/benchmarks/vitis_flow/bandwidth23/design/bandwidth23.cpp
@@ -0,0 +1,181 @@
+// Copyright 2024 RapidStream Design Automation, Inc.
+// All Rights Reserved.
+
+#include "bandwidth23.h"
+#include
+
+
+void print_512(bit512 din){
+ // Print out the data 64-bit hex per line
+ for (int i = 0; i < 8; i++) {
+ printf("%08x%08x\n", (unsigned int) din(63+i*64, 32+i*64), (unsigned int) din(31+i*64, 0+i*64));
+ }
+}
+
+void read_mem(bit512* mem, hls::stream& ch, long offset) {
+ for (int j = 0; j < 1024; j++) {
+ ch.write(mem[(offset<<10) + j]<<1);
+ }
+}
+
+
+void write_mem(hls::stream& ch, bit512* mem, long offset) {
+ for (int j = 0; j < 1024; j++) {
+ mem[(offset<<10) + j] = ch.read();
+ }
+}
+
+
+
+extern "C" {
+
+void bandwidth23(
+ bit512* ch_0,
+ bit512* ch_1,
+ bit512* ch_2,
+ bit512* ch_3,
+ bit512* ch_4,
+ bit512* ch_5,
+ bit512* ch_6,
+ bit512* ch_7,
+ bit512* ch_8,
+ bit512* ch_9,
+ bit512* ch_10,
+ bit512* ch_11,
+ bit512* ch_12,
+ bit512* ch_13,
+ bit512* ch_14,
+ bit512* ch_15,
+ bit512* ch_16,
+ bit512* ch_17,
+ bit512* ch_18,
+ bit512* ch_19,
+ bit512* ch_20,
+ bit512* ch_21,
+ bit512* ch_22,
+ long n)
+{
+#pragma HLS INTERFACE m_axi port=ch_0 bundle=ch_0
+#pragma HLS INTERFACE m_axi port=ch_1 bundle=ch_1
+#pragma HLS INTERFACE m_axi port=ch_2 bundle=ch_2
+#pragma HLS INTERFACE m_axi port=ch_3 bundle=ch_3
+#pragma HLS INTERFACE m_axi port=ch_4 bundle=ch_4
+#pragma HLS INTERFACE m_axi port=ch_5 bundle=ch_5
+#pragma HLS INTERFACE m_axi port=ch_6 bundle=ch_6
+#pragma HLS INTERFACE m_axi port=ch_7 bundle=ch_7
+#pragma HLS INTERFACE m_axi port=ch_8 bundle=ch_8
+#pragma HLS INTERFACE m_axi port=ch_9 bundle=ch_9
+#pragma HLS INTERFACE m_axi port=ch_10 bundle=ch_10
+#pragma HLS INTERFACE m_axi port=ch_11 bundle=ch_11
+#pragma HLS INTERFACE m_axi port=ch_12 bundle=ch_12
+#pragma HLS INTERFACE m_axi port=ch_13 bundle=ch_13
+#pragma HLS INTERFACE m_axi port=ch_14 bundle=ch_14
+#pragma HLS INTERFACE m_axi port=ch_15 bundle=ch_15
+#pragma HLS INTERFACE m_axi port=ch_16 bundle=ch_16
+#pragma HLS INTERFACE m_axi port=ch_17 bundle=ch_17
+#pragma HLS INTERFACE m_axi port=ch_18 bundle=ch_18
+#pragma HLS INTERFACE m_axi port=ch_19 bundle=ch_19
+#pragma HLS INTERFACE m_axi port=ch_20 bundle=ch_20
+#pragma HLS INTERFACE m_axi port=ch_21 bundle=ch_21
+#pragma HLS INTERFACE m_axi port=ch_22 bundle=ch_22
+#pragma HLS INTERFACE s_axilite port=n bundle=control
+#pragma HLS INTERFACE s_axilite port=return bundle=control
+ hls::stream stream_0;
+#pragma HLS STREAM variable=stream_0 depth=2048
+ hls::stream stream_1;
+#pragma HLS STREAM variable=stream_1 depth=2048
+ hls::stream stream_2;
+#pragma HLS STREAM variable=stream_2 depth=2048
+ hls::stream stream_3;
+#pragma HLS STREAM variable=stream_3 depth=2048
+ hls::stream stream_4;
+#pragma HLS STREAM variable=stream_4 depth=2048
+ hls::stream stream_5;
+#pragma HLS STREAM variable=stream_5 depth=2048
+ hls::stream stream_6;
+#pragma HLS STREAM variable=stream_6 depth=2048
+ hls::stream stream_7;
+#pragma HLS STREAM variable=stream_7 depth=2048
+ hls::stream stream_8;
+#pragma HLS STREAM variable=stream_8 depth=2048
+ hls::stream stream_9;
+#pragma HLS STREAM variable=stream_9 depth=2048
+ hls::stream stream_10;
+#pragma HLS STREAM variable=stream_10 depth=2048
+ hls::stream stream_11;
+#pragma HLS STREAM variable=stream_11 depth=2048
+ hls::stream stream_12;
+#pragma HLS STREAM variable=stream_12 depth=2048
+ hls::stream stream_13;
+#pragma HLS STREAM variable=stream_13 depth=2048
+ hls::stream stream_14;
+#pragma HLS STREAM variable=stream_14 depth=2048
+ hls::stream stream_15;
+#pragma HLS STREAM variable=stream_15 depth=2048
+ hls::stream stream_16;
+#pragma HLS STREAM variable=stream_16 depth=2048
+ hls::stream stream_17;
+#pragma HLS STREAM variable=stream_17 depth=2048
+ hls::stream stream_18;
+#pragma HLS STREAM variable=stream_18 depth=2048
+ hls::stream stream_19;
+#pragma HLS STREAM variable=stream_19 depth=2048
+ hls::stream stream_20;
+#pragma HLS STREAM variable=stream_20 depth=2048
+ hls::stream stream_21;
+#pragma HLS STREAM variable=stream_21 depth=2048
+ hls::stream stream_22;
+#pragma HLS STREAM variable=stream_22 depth=2048
+
+
+ for(int i=0; i<(n>>10); i++){
+ read_mem(ch_0, stream_0, i);
+ read_mem(ch_1, stream_1, i);
+ read_mem(ch_2, stream_2, i);
+ read_mem(ch_3, stream_3, i);
+ read_mem(ch_4, stream_4, i);
+ read_mem(ch_5, stream_5, i);
+ read_mem(ch_6, stream_6, i);
+ read_mem(ch_7, stream_7, i);
+ read_mem(ch_8, stream_8, i);
+ read_mem(ch_9, stream_9, i);
+ read_mem(ch_10, stream_10, i);
+ read_mem(ch_11, stream_11, i);
+ read_mem(ch_12, stream_12, i);
+ read_mem(ch_13, stream_13, i);
+ read_mem(ch_14, stream_14, i);
+ read_mem(ch_15, stream_15, i);
+ read_mem(ch_16, stream_16, i);
+ read_mem(ch_17, stream_17, i);
+ read_mem(ch_18, stream_18, i);
+ read_mem(ch_19, stream_19, i);
+ read_mem(ch_20, stream_20, i);
+ read_mem(ch_21, stream_21, i);
+ read_mem(ch_22, stream_22, i);
+
+ write_mem(stream_0, ch_0, i);
+ write_mem(stream_1, ch_1, i);
+ write_mem(stream_2, ch_2, i);
+ write_mem(stream_3, ch_3, i);
+ write_mem(stream_4, ch_4, i);
+ write_mem(stream_5, ch_5, i);
+ write_mem(stream_6, ch_6, i);
+ write_mem(stream_7, ch_7, i);
+ write_mem(stream_8, ch_8, i);
+ write_mem(stream_9, ch_9, i);
+ write_mem(stream_10, ch_10, i);
+ write_mem(stream_11, ch_11, i);
+ write_mem(stream_12, ch_12, i);
+ write_mem(stream_13, ch_13, i);
+ write_mem(stream_14, ch_14, i);
+ write_mem(stream_15, ch_15, i);
+ write_mem(stream_16, ch_16, i);
+ write_mem(stream_17, ch_17, i);
+ write_mem(stream_18, ch_18, i);
+ write_mem(stream_19, ch_19, i);
+ write_mem(stream_20, ch_20, i);
+ write_mem(stream_21, ch_21, i);
+ write_mem(stream_22, ch_22, i);
+ }
+}
+}
diff --git a/benchmarks/vitis_flow/bandwidth23/design/bandwidth23.h b/benchmarks/vitis_flow/bandwidth23/design/bandwidth23.h
new file mode 100644
index 00000000..e8fffd02
--- /dev/null
+++ b/benchmarks/vitis_flow/bandwidth23/design/bandwidth23.h
@@ -0,0 +1,43 @@
+// Copyright 2024 RapidStream Design Automation, Inc.
+// All Rights Reserved.
+
+#include "stdio.h"
+#include "stdlib.h"
+#include "math.h"
+#include <ap_int.h>
+#include <hls_stream.h>
+
+
+/* Data Type */
+typedef ap_uint<512> bit512;
+typedef ap_uint<64> bit64;
+typedef bit512 data_t ;
+/* Data Type */
+
+
+
+extern "C" { void bandwidth23(
+ bit512* ch_0,
+ bit512* ch_1,
+ bit512* ch_2,
+ bit512* ch_3,
+ bit512* ch_4,
+ bit512* ch_5,
+ bit512* ch_6,
+ bit512* ch_7,
+ bit512* ch_8,
+ bit512* ch_9,
+ bit512* ch_10,
+ bit512* ch_11,
+ bit512* ch_12,
+ bit512* ch_13,
+ bit512* ch_14,
+ bit512* ch_15,
+ bit512* ch_16,
+ bit512* ch_17,
+ bit512* ch_18,
+ bit512* ch_19,
+ bit512* ch_20,
+ bit512* ch_21,
+ bit512* ch_22,
+ long n); }
diff --git a/benchmarks/vitis_flow/bandwidth23/design/host.cpp b/benchmarks/vitis_flow/bandwidth23/design/host.cpp
new file mode 100644
index 00000000..1e5f972d
--- /dev/null
+++ b/benchmarks/vitis_flow/bandwidth23/design/host.cpp
@@ -0,0 +1,339 @@
+// Copyright 2024 RapidStream Design Automation, Inc.
+// All Rights Reserved.
+
+
+#define CL_HPP_TARGET_OPENCL_VERSION 120
+#define CL_HPP_MINIMUM_OPENCL_VERSION 120
+#define CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY 1
+
+#include <fstream>
+#include <iostream>
+#include <vector>
+#include <CL/cl2.hpp>
+#include "bandwidth23.h"
+
+void print_512(bit512 din){
+ // Print out the data 64-bit hex per line
+ for (int i = 0; i < 8; i++) {
+ printf("%08x%08x\n", (unsigned int) din(63+i*64, 32+i*64), (unsigned int) din(31+i*64, 0+i*64));
+ }
+}
+
+#define CHECK_MSG(msg, call) \
+ call; \
+ if (msg != CL_SUCCESS) { \
+ printf("%s:%d Error calling " #call ", error code is: %d\n", __FILE__, __LINE__, msg); \
+ exit(EXIT_FAILURE); \
+ }
+
+static const std::string error_message =
+ "Error: Result mismatch:\n"
+ "i = %d CPU result = %d Device result = %d\n";
+
+int main(int argc, char* argv[]) {
+ // Must specify the xclbin file as the second argument
+ if (argc != 2) {
+ std::cout << "Please run the application by: " << argv[0] << " <xclbin_file>" << std::endl;
+ return EXIT_FAILURE;
+ }
+
+ std::string xclbin_file = argv[1];
+
+ // Calculate the byte size the input data
+ long DATA_SIZE = 4096;
+
+ std::vector<cl::Device> devices;
+ cl_int err;
+ cl::Context context;
+ cl::CommandQueue q;
+ cl::Kernel bandwidth23;
+ cl::Program program;
+ std::vector<cl::Platform> platforms;
+ bool device_found = false;
+
+ // The get_xil_devices will return vector of Xilinx Devices
+ // Iterate through devices and find Xilinx Alveo Device
+ cl::Platform::get(&platforms);
+ for (size_t i = 0; (i < platforms.size()) && (device_found == false); i++) {
+ cl::Platform platform = platforms[i];
+ std::string platformName = platform.getInfo<CL_PLATFORM_NAME>();
+ if (platformName == "Xilinx") {
+ devices.clear();
+ platform.getDevices(CL_DEVICE_TYPE_ACCELERATOR, &devices);
+ if (devices.size()) {
+ device_found = true;
+ break;
+ }
+ }
+ }
+ if (device_found == false) {
+ std::cout << "Error: could not find the target Xilinx Alveo device" << std::endl;
+ return EXIT_FAILURE;
+ }
+
+ std::cout << "INFO: reading " << xclbin_file << " xclbinfile" << std::endl;
+ FILE* fp;
+ if ((fp = fopen(xclbin_file.c_str(), "r")) == nullptr) {
+ std::cout << "ERROR: cannot open" << xclbin_file.c_str() << " xclbin!" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+
+ // Load xclbin
+ std::cout << "INFO: loading: '" << xclbin_file << "'\n";
+ std::ifstream bin_file(xclbin_file, std::ifstream::binary);
+ bin_file.seekg(0, bin_file.end);
+ unsigned nb = bin_file.tellg();
+ bin_file.seekg(0, bin_file.beg);
+ char* buf = new char[nb];
+ bin_file.read(buf, nb);
+
+ // Creating Program from Binary File
+ cl::Program::Binaries bins;
+ bins.push_back({buf, nb});
+ bool valid_device = false;
+ for (unsigned int i = 0; i < devices.size(); i++) {
+ auto device = devices[i];
+ // For the device, we create a context and command queue
+ CHECK_MSG(err, context = cl::Context(device, nullptr, nullptr, nullptr, &err));
+ CHECK_MSG(err, q = cl::CommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err));
+ std::cout << "Trying to program device[" << i << "]: " << device.getInfo<CL_DEVICE_NAME>() << std::endl;
+ cl::Program program(context, {device}, bins, nullptr, &err);
+ if (err != CL_SUCCESS) {
+ std::cout << "Device[" << i << "]: failed to load xclbin file!\n";
+ } else {
+ std::cout << "Device[" << i << "]: xclbin is loaded successfully!\n";
+ CHECK_MSG(err, bandwidth23 = cl::Kernel(program, "bandwidth23", &err));
+ valid_device = true;
+ break; // we break because we found a valid device
+ }
+ }
+ if (!valid_device) {
+ std::cout << "Failed to program any device found, exit!\n";
+ exit(EXIT_FAILURE);
+ }
+
+ // These commands will allocate memory on the Device. The cl::Buffer objects can
+ // be used to reference the memory locations on the device.
+ CHECK_MSG(err, cl::Buffer buffer_ch_0(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+ CHECK_MSG(err, cl::Buffer buffer_ch_1(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+ CHECK_MSG(err, cl::Buffer buffer_ch_2(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+ CHECK_MSG(err, cl::Buffer buffer_ch_3(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+ CHECK_MSG(err, cl::Buffer buffer_ch_4(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+ CHECK_MSG(err, cl::Buffer buffer_ch_5(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+ CHECK_MSG(err, cl::Buffer buffer_ch_6(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+ CHECK_MSG(err, cl::Buffer buffer_ch_7(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+ CHECK_MSG(err, cl::Buffer buffer_ch_8(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+ CHECK_MSG(err, cl::Buffer buffer_ch_9(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+ CHECK_MSG(err, cl::Buffer buffer_ch_10(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+ CHECK_MSG(err, cl::Buffer buffer_ch_11(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+ CHECK_MSG(err, cl::Buffer buffer_ch_12(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+ CHECK_MSG(err, cl::Buffer buffer_ch_13(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+ CHECK_MSG(err, cl::Buffer buffer_ch_14(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+ CHECK_MSG(err, cl::Buffer buffer_ch_15(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+ CHECK_MSG(err, cl::Buffer buffer_ch_16(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+ CHECK_MSG(err, cl::Buffer buffer_ch_17(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+ CHECK_MSG(err, cl::Buffer buffer_ch_18(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+ CHECK_MSG(err, cl::Buffer buffer_ch_19(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+ CHECK_MSG(err, cl::Buffer buffer_ch_20(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+ CHECK_MSG(err, cl::Buffer buffer_ch_21(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+ CHECK_MSG(err, cl::Buffer buffer_ch_22(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+
+
+ // set the kernel Arguments
+ CHECK_MSG(err, err = bandwidth23.setArg(0, buffer_ch_0));
+ CHECK_MSG(err, err = bandwidth23.setArg(1, buffer_ch_1));
+ CHECK_MSG(err, err = bandwidth23.setArg(2, buffer_ch_2));
+ CHECK_MSG(err, err = bandwidth23.setArg(3, buffer_ch_3));
+ CHECK_MSG(err, err = bandwidth23.setArg(4, buffer_ch_4));
+ CHECK_MSG(err, err = bandwidth23.setArg(5, buffer_ch_5));
+ CHECK_MSG(err, err = bandwidth23.setArg(6, buffer_ch_6));
+ CHECK_MSG(err, err = bandwidth23.setArg(7, buffer_ch_7));
+ CHECK_MSG(err, err = bandwidth23.setArg(8, buffer_ch_8));
+ CHECK_MSG(err, err = bandwidth23.setArg(9, buffer_ch_9));
+ CHECK_MSG(err, err = bandwidth23.setArg(10, buffer_ch_10));
+ CHECK_MSG(err, err = bandwidth23.setArg(11, buffer_ch_11));
+ CHECK_MSG(err, err = bandwidth23.setArg(12, buffer_ch_12));
+ CHECK_MSG(err, err = bandwidth23.setArg(13, buffer_ch_13));
+ CHECK_MSG(err, err = bandwidth23.setArg(14, buffer_ch_14));
+ CHECK_MSG(err, err = bandwidth23.setArg(15, buffer_ch_15));
+ CHECK_MSG(err, err = bandwidth23.setArg(16, buffer_ch_16));
+ CHECK_MSG(err, err = bandwidth23.setArg(17, buffer_ch_17));
+ CHECK_MSG(err, err = bandwidth23.setArg(18, buffer_ch_18));
+ CHECK_MSG(err, err = bandwidth23.setArg(19, buffer_ch_19));
+ CHECK_MSG(err, err = bandwidth23.setArg(20, buffer_ch_20));
+ CHECK_MSG(err, err = bandwidth23.setArg(21, buffer_ch_21));
+ CHECK_MSG(err, err = bandwidth23.setArg(22, buffer_ch_22));
+
+
+ // We then need to map our OpenCL buffers to get the pointers
+ data_t* ch_0;
+ data_t* ch_1;
+ data_t* ch_2;
+ data_t* ch_3;
+ data_t* ch_4;
+ data_t* ch_5;
+ data_t* ch_6;
+ data_t* ch_7;
+ data_t* ch_8;
+ data_t* ch_9;
+ data_t* ch_10;
+ data_t* ch_11;
+ data_t* ch_12;
+ data_t* ch_13;
+ data_t* ch_14;
+ data_t* ch_15;
+ data_t* ch_16;
+ data_t* ch_17;
+ data_t* ch_18;
+ data_t* ch_19;
+ data_t* ch_20;
+ data_t* ch_21;
+ data_t* ch_22;
+
+ CHECK_MSG(err, ch_0 = (data_t*)q.enqueueMapBuffer(buffer_ch_0, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err));
+ CHECK_MSG(err, ch_1 = (data_t*)q.enqueueMapBuffer(buffer_ch_1, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err));
+ CHECK_MSG(err, ch_2 = (data_t*)q.enqueueMapBuffer(buffer_ch_2, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err));
+ CHECK_MSG(err, ch_3 = (data_t*)q.enqueueMapBuffer(buffer_ch_3, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err));
+ CHECK_MSG(err, ch_4 = (data_t*)q.enqueueMapBuffer(buffer_ch_4, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err));
+ CHECK_MSG(err, ch_5 = (data_t*)q.enqueueMapBuffer(buffer_ch_5, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err));
+ CHECK_MSG(err, ch_6 = (data_t*)q.enqueueMapBuffer(buffer_ch_6, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err));
+ CHECK_MSG(err, ch_7 = (data_t*)q.enqueueMapBuffer(buffer_ch_7, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err));
+ CHECK_MSG(err, ch_8 = (data_t*)q.enqueueMapBuffer(buffer_ch_8, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err));
+ CHECK_MSG(err, ch_9 = (data_t*)q.enqueueMapBuffer(buffer_ch_9, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err));
+ CHECK_MSG(err, ch_10 = (data_t*)q.enqueueMapBuffer(buffer_ch_10, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err));
+ CHECK_MSG(err, ch_11 = (data_t*)q.enqueueMapBuffer(buffer_ch_11, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err));
+ CHECK_MSG(err, ch_12 = (data_t*)q.enqueueMapBuffer(buffer_ch_12, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err));
+ CHECK_MSG(err, ch_13 = (data_t*)q.enqueueMapBuffer(buffer_ch_13, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err));
+ CHECK_MSG(err, ch_14 = (data_t*)q.enqueueMapBuffer(buffer_ch_14, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err));
+ CHECK_MSG(err, ch_15 = (data_t*)q.enqueueMapBuffer(buffer_ch_15, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err));
+ CHECK_MSG(err, ch_16 = (data_t*)q.enqueueMapBuffer(buffer_ch_16, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err));
+ CHECK_MSG(err, ch_17 = (data_t*)q.enqueueMapBuffer(buffer_ch_17, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err));
+ CHECK_MSG(err, ch_18 = (data_t*)q.enqueueMapBuffer(buffer_ch_18, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err));
+ CHECK_MSG(err, ch_19 = (data_t*)q.enqueueMapBuffer(buffer_ch_19, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err));
+ CHECK_MSG(err, ch_20 = (data_t*)q.enqueueMapBuffer(buffer_ch_20, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err));
+ CHECK_MSG(err, ch_21 = (data_t*)q.enqueueMapBuffer(buffer_ch_21, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err));
+ CHECK_MSG(err, ch_22 = (data_t*)q.enqueueMapBuffer(buffer_ch_22, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err));
+
+
+ // Initialize input data
+ for (int i = 0; i < DATA_SIZE; i++) { ch_0[i] = 0 ^ i; }
+ for (int i = 0; i < DATA_SIZE; i++) { ch_1[i] = 1 ^ i; }
+ for (int i = 0; i < DATA_SIZE; i++) { ch_2[i] = 2 ^ i; }
+ for (int i = 0; i < DATA_SIZE; i++) { ch_3[i] = 3 ^ i; }
+ for (int i = 0; i < DATA_SIZE; i++) { ch_4[i] = 4 ^ i; }
+ for (int i = 0; i < DATA_SIZE; i++) { ch_5[i] = 5 ^ i; }
+ for (int i = 0; i < DATA_SIZE; i++) { ch_6[i] = 6 ^ i; }
+ for (int i = 0; i < DATA_SIZE; i++) { ch_7[i] = 7 ^ i; }
+ for (int i = 0; i < DATA_SIZE; i++) { ch_8[i] = 8 ^ i; }
+ for (int i = 0; i < DATA_SIZE; i++) { ch_9[i] = 9 ^ i; }
+ for (int i = 0; i < DATA_SIZE; i++) { ch_10[i] = 10 ^ i; }
+ for (int i = 0; i < DATA_SIZE; i++) { ch_11[i] = 11 ^ i; }
+ for (int i = 0; i < DATA_SIZE; i++) { ch_12[i] = 12 ^ i; }
+ for (int i = 0; i < DATA_SIZE; i++) { ch_13[i] = 13 ^ i; }
+ for (int i = 0; i < DATA_SIZE; i++) { ch_14[i] = 14 ^ i; }
+ for (int i = 0; i < DATA_SIZE; i++) { ch_15[i] = 15 ^ i; }
+ for (int i = 0; i < DATA_SIZE; i++) { ch_16[i] = 16 ^ i; }
+ for (int i = 0; i < DATA_SIZE; i++) { ch_17[i] = 17 ^ i; }
+ for (int i = 0; i < DATA_SIZE; i++) { ch_18[i] = 18 ^ i; }
+ for (int i = 0; i < DATA_SIZE; i++) { ch_19[i] = 19 ^ i; }
+ for (int i = 0; i < DATA_SIZE; i++) { ch_20[i] = 20 ^ i; }
+ for (int i = 0; i < DATA_SIZE; i++) { ch_21[i] = 21 ^ i; }
+ for (int i = 0; i < DATA_SIZE; i++) { ch_22[i] = 22 ^ i; }
+
+ CHECK_MSG(err, err = bandwidth23.setArg(0, buffer_ch_0));
+ CHECK_MSG(err, err = bandwidth23.setArg(1, buffer_ch_1));
+ CHECK_MSG(err, err = bandwidth23.setArg(2, buffer_ch_2));
+ CHECK_MSG(err, err = bandwidth23.setArg(3, buffer_ch_3));
+ CHECK_MSG(err, err = bandwidth23.setArg(4, buffer_ch_4));
+ CHECK_MSG(err, err = bandwidth23.setArg(5, buffer_ch_5));
+ CHECK_MSG(err, err = bandwidth23.setArg(6, buffer_ch_6));
+ CHECK_MSG(err, err = bandwidth23.setArg(7, buffer_ch_7));
+ CHECK_MSG(err, err = bandwidth23.setArg(8, buffer_ch_8));
+ CHECK_MSG(err, err = bandwidth23.setArg(9, buffer_ch_9));
+ CHECK_MSG(err, err = bandwidth23.setArg(10, buffer_ch_10));
+ CHECK_MSG(err, err = bandwidth23.setArg(11, buffer_ch_11));
+ CHECK_MSG(err, err = bandwidth23.setArg(12, buffer_ch_12));
+ CHECK_MSG(err, err = bandwidth23.setArg(13, buffer_ch_13));
+ CHECK_MSG(err, err = bandwidth23.setArg(14, buffer_ch_14));
+ CHECK_MSG(err, err = bandwidth23.setArg(15, buffer_ch_15));
+ CHECK_MSG(err, err = bandwidth23.setArg(16, buffer_ch_16));
+ CHECK_MSG(err, err = bandwidth23.setArg(17, buffer_ch_17));
+ CHECK_MSG(err, err = bandwidth23.setArg(18, buffer_ch_18));
+ CHECK_MSG(err, err = bandwidth23.setArg(19, buffer_ch_19));
+ CHECK_MSG(err, err = bandwidth23.setArg(20, buffer_ch_20));
+ CHECK_MSG(err, err = bandwidth23.setArg(21, buffer_ch_21));
+ CHECK_MSG(err, err = bandwidth23.setArg(22, buffer_ch_22));
+ CHECK_MSG(err, err = bandwidth23.setArg(23, DATA_SIZE));
+
+
+ // Data will be migrated to device global memory
+ CHECK_MSG(err, err = q.enqueueMigrateMemObjects({buffer_ch_0, buffer_ch_1, buffer_ch_2, buffer_ch_3, buffer_ch_4, buffer_ch_5, buffer_ch_6, buffer_ch_7, buffer_ch_8, buffer_ch_9, buffer_ch_10, buffer_ch_11, buffer_ch_12, buffer_ch_13, buffer_ch_14, buffer_ch_15, buffer_ch_16, buffer_ch_17, buffer_ch_18, buffer_ch_19, buffer_ch_20, buffer_ch_21, buffer_ch_22}, 0 /* 0 means from host*/));
+
+ // Launch the bandwidth23 kernel
+ CHECK_MSG(err, err = q.enqueueTask(bandwidth23));
+
+ // Migrate the result data back to host memory
+ CHECK_MSG(err, q.enqueueMigrateMemObjects({buffer_ch_0, buffer_ch_1, buffer_ch_2, buffer_ch_3, buffer_ch_4, buffer_ch_5, buffer_ch_6, buffer_ch_7, buffer_ch_8, buffer_ch_9, buffer_ch_10, buffer_ch_11, buffer_ch_12, buffer_ch_13, buffer_ch_14, buffer_ch_15, buffer_ch_16, buffer_ch_17, buffer_ch_18, buffer_ch_19, buffer_ch_20, buffer_ch_21, buffer_ch_22}, CL_MIGRATE_MEM_OBJECT_HOST));
+
+ // Wait for all the commands to complete
+ CHECK_MSG(err, q.finish());
+
+ // Verify the result
+ int match = 0;
+ for (int i = 0; i < DATA_SIZE; i++) { if(ch_0[i] != ((0 ^ i))<<1) match++; }
+ for (int i = 0; i < DATA_SIZE; i++) { if(ch_1[i] != ((1 ^ i))<<1) match++; }
+ for (int i = 0; i < DATA_SIZE; i++) { if(ch_2[i] != ((2 ^ i))<<1) match++; }
+ for (int i = 0; i < DATA_SIZE; i++) { if(ch_3[i] != ((3 ^ i))<<1) match++; }
+ for (int i = 0; i < DATA_SIZE; i++) { if(ch_4[i] != ((4 ^ i))<<1) match++; }
+ for (int i = 0; i < DATA_SIZE; i++) { if(ch_5[i] != ((5 ^ i))<<1) match++; }
+ for (int i = 0; i < DATA_SIZE; i++) { if(ch_6[i] != ((6 ^ i))<<1) match++; }
+ for (int i = 0; i < DATA_SIZE; i++) { if(ch_7[i] != ((7 ^ i))<<1) match++; }
+ for (int i = 0; i < DATA_SIZE; i++) { if(ch_8[i] != ((8 ^ i))<<1) match++; }
+ for (int i = 0; i < DATA_SIZE; i++) { if(ch_9[i] != ((9 ^ i))<<1) match++; }
+ for (int i = 0; i < DATA_SIZE; i++) { if(ch_10[i] != ((10 ^ i))<<1) match++; }
+ for (int i = 0; i < DATA_SIZE; i++) { if(ch_11[i] != ((11 ^ i))<<1) match++; }
+ for (int i = 0; i < DATA_SIZE; i++) { if(ch_12[i] != ((12 ^ i))<<1) match++; }
+ for (int i = 0; i < DATA_SIZE; i++) { if(ch_13[i] != ((13 ^ i))<<1) match++; }
+ for (int i = 0; i < DATA_SIZE; i++) { if(ch_14[i] != ((14 ^ i))<<1) match++; }
+ for (int i = 0; i < DATA_SIZE; i++) { if(ch_15[i] != ((15 ^ i))<<1) match++; }
+ for (int i = 0; i < DATA_SIZE; i++) { if(ch_16[i] != ((16 ^ i))<<1) match++; }
+ for (int i = 0; i < DATA_SIZE; i++) { if(ch_17[i] != ((17 ^ i))<<1) match++; }
+ for (int i = 0; i < DATA_SIZE; i++) { if(ch_18[i] != ((18 ^ i))<<1) match++; }
+ for (int i = 0; i < DATA_SIZE; i++) { if(ch_19[i] != ((19 ^ i))<<1) match++; }
+ for (int i = 0; i < DATA_SIZE; i++) { if(ch_20[i] != ((20 ^ i))<<1) match++; }
+ for (int i = 0; i < DATA_SIZE; i++) { if(ch_21[i] != ((21 ^ i))<<1) match++; }
+ for (int i = 0; i < DATA_SIZE; i++) { if(ch_22[i] != ((22 ^ i))<<1) match++; }
+
+
+ CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_0, ch_0));
+ CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_1, ch_1));
+ CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_2, ch_2));
+ CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_3, ch_3));
+ CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_4, ch_4));
+ CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_5, ch_5));
+ CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_6, ch_6));
+ CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_7, ch_7));
+ CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_8, ch_8));
+ CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_9, ch_9));
+ CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_10, ch_10));
+ CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_11, ch_11));
+ CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_12, ch_12));
+ CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_13, ch_13));
+ CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_14, ch_14));
+ CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_15, ch_15));
+ CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_16, ch_16));
+ CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_17, ch_17));
+ CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_18, ch_18));
+ CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_19, ch_19));
+ CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_20, ch_20));
+ CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_21, ch_21));
+ CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_22, ch_22));
+
+ CHECK_MSG(err, err = q.finish());
+
+ if (match == 0) {
+ std::cout << "TEST PASSED!" << std::endl;
+ } else {
+ std::cout << match << " TEST FAILED!" << std::endl;
+ }
+ return (match ? EXIT_FAILURE : EXIT_SUCCESS);
+}
diff --git a/benchmarks/vitis_flow/bandwidth23/design/vck5000.cfg b/benchmarks/vitis_flow/bandwidth23/design/vck5000.cfg
new file mode 100644
index 00000000..cf375c2d
--- /dev/null
+++ b/benchmarks/vitis_flow/bandwidth23/design/vck5000.cfg
@@ -0,0 +1,27 @@
+platform=xilinx_vck5000_gen4x8_qdma_2_202220_1
+
+[connectivity]
+
+sp = bandwidth23.m_axi_ch_0:MC_NOC0
+sp = bandwidth23.m_axi_ch_1:MC_NOC0
+sp = bandwidth23.m_axi_ch_2:MC_NOC0
+sp = bandwidth23.m_axi_ch_3:MC_NOC0
+sp = bandwidth23.m_axi_ch_4:MC_NOC0
+sp = bandwidth23.m_axi_ch_5:MC_NOC0
+sp = bandwidth23.m_axi_ch_6:MC_NOC0
+sp = bandwidth23.m_axi_ch_7:MC_NOC0
+sp = bandwidth23.m_axi_ch_8:MC_NOC0
+sp = bandwidth23.m_axi_ch_9:MC_NOC0
+sp = bandwidth23.m_axi_ch_10:MC_NOC0
+sp = bandwidth23.m_axi_ch_11:MC_NOC0
+sp = bandwidth23.m_axi_ch_12:MC_NOC0
+sp = bandwidth23.m_axi_ch_13:MC_NOC0
+sp = bandwidth23.m_axi_ch_14:MC_NOC0
+sp = bandwidth23.m_axi_ch_15:MC_NOC0
+sp = bandwidth23.m_axi_ch_16:MC_NOC0
+sp = bandwidth23.m_axi_ch_17:MC_NOC0
+sp = bandwidth23.m_axi_ch_18:MC_NOC0
+sp = bandwidth23.m_axi_ch_19:MC_NOC0
+sp = bandwidth23.m_axi_ch_20:MC_NOC0
+sp = bandwidth23.m_axi_ch_21:MC_NOC0
+sp = bandwidth23.m_axi_ch_22:MC_NOC0
diff --git a/benchmarks/vitis_flow/bandwidth23/run_u50.py b/benchmarks/vitis_flow/bandwidth23/run_u50.py
new file mode 100644
index 00000000..34aece07
--- /dev/null
+++ b/benchmarks/vitis_flow/bandwidth23/run_u50.py
@@ -0,0 +1,40 @@
+"""Getting Started: bandwidth23 in the Vitis flow
+
+This script demonstrates how to optimize a bandwidth23 design in
+a Vitis object file. In this example, the object file is generated from
+Vitis HLS.
+"""
+
+__copyright__ = """
+Copyright (c) 2024 RapidStream Design Automation, Inc. and contributors. All rights reserved.
+The contributor(s) of this file has/have agreed to the RapidStream Contributor License Agreement.
+"""
+
+from rapidstream import get_u50_vitis_device_factory, RapidStreamVitis
+import os
+
+CURR_DIR = os.path.dirname(os.path.abspath(__file__))
+
+# RapidStreamVitis is used for the ".xo" files generated by `v++` or Vitis HLS.
+# Create a RapidStream project in the "build" directory:
+rs = RapidStreamVitis(f"{CURR_DIR}/build")
+
+# Use the "xilinx_u50_gen3x16_xdma_5_202210_1" platform as the device:
+u50_factory = get_u50_vitis_device_factory("xilinx_u50_gen3x16_xdma_5_202210_1")
+rs.set_virtual_device(u50_factory.generate_virtual_device())
+
+# Add the design object file (".xo") to the project:
+rs.add_xo_file(f"{CURR_DIR}/build/bandwidth23.xo")
+
+# Specify the Vitis platform and connectivity configuration:
+rs.set_vitis_platform("xilinx_u50_gen3x16_xdma_5_202210_1")
+rs.set_vitis_connectivity_config(f"{CURR_DIR}/design/link_config_hbm.ini")
+
+# Set the clock target for the design:
+rs.add_clock("ap_clk", period_ns=3)
+
+# Assign all kernel ports to the specified device slot:
+rs.assign_port_to_region(".*", "SLOT_X1Y0:SLOT_X1Y0")
+
+# Start the RapidStream optimization process:
+rs.run_dse()