diff --git a/benchmarks/tapa_flow/bandwidth23/Makefile b/benchmarks/tapa_flow/bandwidth23/Makefile
new file mode 100644
index 00000000..441f8a63
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth23/Makefile
@@ -0,0 +1,114 @@
+# Copyright (c) 2024 RapidStream Design Automation, Inc. and contributors. All rights reserved.
+# The contributor(s) of this file has/have agreed to the RapidStream Contributor License Agreement.
+
+ROOT_DIR := $(shell git rev-parse --show-toplevel)
+KERNEL_NAME := bandwidth23
+RS_SCRIPT := $(CURDIR)/run.py
+SRC_DIR := $(CURDIR)/design
+AB_CONFIG := $(CURDIR)/design/config/$(notdir $(RS_SCRIPT))/ab_config.json
+IMPL_CONFIG := $(CURDIR)/design/config/$(notdir $(RS_SCRIPT))/impl_config.json
+LINK_CONFIG := $(CURDIR)/design/config/$(notdir $(RS_SCRIPT))/link_config.ini
+PLATFORM := xilinx_vck5000_gen4x8_qdma_2_202220_1
+PART_NUM := xcvc1902-vsvd1760-2MP-e-S
+GRP_UTIL := $(ROOT_DIR)/common/util/get_group.py
+TEMP_DIR := $(CURDIR)/build/$(notdir $(RS_SCRIPT))
+RS_TARGET := $(TEMP_DIR)/dse/solution_0/vitis_run_hw/$(KERNEL_NAME)_$(PLATFORM).xclbin
+BUILD_LOG := $(TEMP_DIR)/build.json
+SUCCESS := "Build Successful"
+TIMING_RPT := impl_1_hw_bb_locked_timing_summary_routed.rpt
+SLACK_GETTER := $(ROOT_DIR)/common/util/get_slack.py
+RSPATH := $(CURDIR)
+RSXX := rapidstream
+RSPYTHON := rapidstream
+DEVICE_CONFIG := $(TEMP_DIR)/device.json
+DEVICE_GEN := $(CURDIR)/gen_device.py
+INCLUDE := -I $(XILINX_HLS)/include
+KERNEL_XO := $(TEMP_DIR)/$(KERNEL_NAME).xo
+KERNEL_XCLBIN := $(TEMP_DIR)/$(KERNEL_NAME).xclbin
+KERNEL_XSA := $(TEMP_DIR)/$(KERNEL_NAME).xsa
+TARGET := hw
+
+all: $(RS_TARGET)
+ cd $(RSPATH) && $(RSPYTHON) $(SLACK_GETTER) -d $(TEMP_DIR) -i $(TIMING_RPT) -o $(BUILD_LOG) -c clk_kernel_00_unbuffered_net -p 3.333
+ @echo $(SUCCESS)
+
+$(RS_TARGET):$(KERNEL_XO) $(DEVICE_CONFIG)
+ mkdir -p $(TEMP_DIR)
+ cd $(RSPATH) && $(RSXX)-tapaopt \
+ --work-dir $(TEMP_DIR) \
+ --tapa-xo-path $< \
+ --device-config $(DEVICE_CONFIG) \
+ --floorplan-config $(AB_CONFIG) \
+ --single-reg \
+ --run-impl \
+ --implementation-config $(IMPL_CONFIG) \
+ --connectivity-ini $(LINK_CONFIG)
+
+$(DEVICE_CONFIG):$(AB_CONFIG)
+ mkdir -p $(TEMP_DIR)
+ cd $(RSPATH) && $(RSPYTHON) $(RS_SCRIPT)
+
+cosim:$(KERNEL_XO) $(TEMP_DIR)/main.exe
+ cd $(TEMP_DIR) && $(TEMP_DIR)/main.exe 1024 \
+ --bitstream $< \
+ -xosim_work_dir $(TEMP_DIR)/xosim_work_dir
+
+hw: $(KERNEL_XCLBIN)
+
+$(KERNEL_XCLBIN): $(KERNEL_XSA)
+ @echo "### ***** packaging $(KERNEL_XSA) into $(KERNEL_XCLBIN) ... *****"
+ cd $(TEMP_DIR) && v++ --package -t $(TARGET) --platform $(PLATFORM) \
+ $^ \
+ --temp_dir $(TEMP_DIR) \
+ --save-temps \
+ --report_dir $(TEMP_DIR)/reports/ \
+ --package.boot_mode=ospi \
+ -o $@ 2>&1 | tee $(KERNEL_NAME)_xclbin.log
+ @echo "### ***** $(KERNEL_XCLBIN) packaging done! *****"
+
+$(KERNEL_XSA): $(KERNEL_XO)
+ cd $(TEMP_DIR) && v++ -l -t ${TARGET} \
+ --connectivity.nk $(KERNEL_NAME):1:$(KERNEL_NAME) \
+ --config $(SRC_DIR)/vck5000.cfg \
+ --save-temps \
+ --temp_dir $(TEMP_DIR) \
+ --clock.defaultFreqHz 250000000 \
+ --vivado.synth.jobs 16 \
+ $< -o $@
+
+xo: $(KERNEL_XO)
+
+$(KERNEL_XO):$(SRC_DIR)/$(KERNEL_NAME).cpp
+ mkdir -p $(TEMP_DIR)
+ cd $(TEMP_DIR) && tapa compile \
+ --top $(KERNEL_NAME) \
+	--part-num $(PART_NUM) \
+ --clock-period 3.33 \
+ -o $(KERNEL_NAME).xo \
+ -f $< \
+ 2>&1 | tee tapa.log
+
+csim:$(TEMP_DIR)/main.exe
+
+$(TEMP_DIR)/main.exe: $(SRC_DIR)/*.cpp
+ mkdir -p $(TEMP_DIR)
+ cd $(TEMP_DIR) && tapa g++ $^ $(INCLUDE) -o $(TEMP_DIR)/main.exe -O2
+ $(TEMP_DIR)/main.exe
+
+show_groups:
+ rapidstream $(GRP_UTIL) -i $(TEMP_DIR)/passes/0-imported.json \
+ -o $(TEMP_DIR)/module_types.csv
+
+
+
+clean:
+ rm -rf $(TEMP_DIR) *.log
+ rm -rf .Xil .run
+ rm -rf *.exe
+ rm -rf .ipcache
+
+cleanall:
+ rm -rf build *.log
+ rm -rf .Xil .run
+ rm -rf *.exe
+ rm -rf .ipcache
diff --git a/benchmarks/tapa_flow/bandwidth23/README.md b/benchmarks/tapa_flow/bandwidth23/README.md
new file mode 100644
index 00000000..54f2286e
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth23/README.md
@@ -0,0 +1,141 @@
+
+
+
+
+# TAPA Flow: Bandwidth23
+
+## Introduction
+
+
+In this recipe, we demonstrate how to use RapidStream to optimize TAPA projects. The basic steps include:
+
+- Compile the HLS C++ code into a Vitis-compatible .xo file using TAPA.
+- Optimize the .xo file with RapidStream to obtain an optimized .xo file.
+- Use Vitis to compile the optimized .xo file into an .xclbin file for FPGA deployment.
+
+## Tutorial
+
+### Step 1 (Done): Generate the Xilinx Object File (`.xo`)
+
+
+We utilize TAPA to generate the `.xo` file. If you have not installed TAPA, we've already compiled the C++ source to `.xo` using TAPA. The original C++ source files are located in design/src. The generated `.xo` file can be found at design/generated/data_decoding.xo. To compile C++ to `.xo` using TAPA, we use the script [design/run_tapa.sh](design/run_tapa.sh), with the detailed commands shown below. For your convenience, we have also backed up all the generated metadata by TAPA in the design/generated directory.
+
+```bash
+WORK_DIR=generated
+tapac \
+ --work-dir ${WORK_DIR} \
+ --top data_decoding \
+ --part-num xcu280-fsvh2892-2L-e \
+ --clock-period 3.33 \
+ -o ${WORK_DIR}/data_decoding.xo \
+ --connectivity config/link_config.ini \
+ src/data_decoder.cpp \
+ 2>&1 | tee tapa.log
+```
+
+### Step 2: Use Rapidstream to Optimize `.xo` Design
+
+The RapidStream flow conducts design space exploration and generates solutions by taking all TAPA-generated `.xo` files as the input.
+The RapidStream flow for TAPA requires the following key inputs:
+
+- **Platform**: The Vitis platform (e.g., `xilinx_u280_gen3x16_xdma_1_202211_1`).
+- **Device**: virtual device defined by calling RapidStream APIs based on the platform (e.g., `get_u280_vitis_device_factory`).
+- **.xo file**: The `.xo` file generated by TAPA
+- **Connectivity** (.ini): Include the configuration file for `v++` design/config/run.py/link_config.ini.
+- **top_module_name**: Top module name for the kernel.
+- **Clock**: All the clock and frequencies.
+- **Flatten Module**: Within a design, not all modules need to be optimized. The flatten module name is the target module rapidstream will optimize.
+
+The Python snippet below shows how we initiate rapidstream instance to set up the rapidstream environment.
+
+```Python
+from rapidstream import get_u280_vitis_device_factory, RapidStreamTAPA
+import os
+
+CURR_DIR = os.path.dirname(os.path.abspath(__file__))
+INI_PATH = f"{CURR_DIR}/design/config/link_config.ini"
+VITIS_PLATFORM = "xilinx_u280_gen3x16_xdma_1_202211_1"
+XO_PATH = f"{CURR_DIR}/design/generated/data_decoding.xo"
+kernel_name = "data_decoding"
+factory = get_u280_vitis_device_factory(VITIS_PLATFORM)
+rs = RapidStreamTAPA(f"{CURR_DIR}/build")
+rs.set_virtual_device(factory.generate_virtual_device())
+rs.add_xo_file(XO_PATH)
+rs.set_vitis_platform(VITIS_PLATFORM)
+rs.set_vitis_connectivity_config(INI_PATH)
+rs.set_top_module_name(kernel_name)
+rs.add_clock("ap_clk", 3.33)
+rs.add_flatten_targets([kernel_name])
+```
+
+The HBM AXI port connection is described in design/config/run.py/link_config.ini.
+
+```bash
+[connectivity]
+sp=data_decoding.input_port:HBM[0:1]
+sp=data_decoding.output_port0_32b_8b:HBM[16:17]
+sp=data_decoding.output_port1_16b_8b:HBM[18:19]
+sp=data_decoding.output_port2_16b_8b:HBM[20:21]
+sp=data_decoding.output_port3_8b:HBM[22:23]
+sp=data_decoding.output_port4_Track:HBM[24:25]
+```
+
+As a result, it is necessary to assign the kernel ports to the appropriate slots. The Python code below demonstrates this process. For comprehensive linking details, please refer to the design/config/run.py/link_config.ini file.
+
+```Python
+# Bind ports to HBM 16-31
+right_slot = "SLOT_X1Y0:SLOT_X1Y0"
+left_slot = "SLOT_X0Y0:SLOT_X0Y0"
+rs.assign_port_to_region(".*input_port.*", left_slot)
+rs.assign_port_to_region(".*output_port0_32b_8b.*", right_slot)
+rs.assign_port_to_region(".*output_port1_16b_8b.*", right_slot)
+rs.assign_port_to_region(".*output_port2_16b_8b.*", right_slot)
+rs.assign_port_to_region(".*output_port3_8b.*", right_slot)
+rs.assign_port_to_region(".*output_port4_Track.*", right_slot)
+rs.assign_port_to_region("s_axi_control_.*", left_slot)
+rs.assign_port_to_region("ap_clk", left_slot)
+rs.assign_port_to_region("ap_rst_n", left_slot)
+rs.assign_port_to_region("interrupt", left_slot)
+```
+
+For the complete detail, please refer to the [./run.py](./run.py) file. Call RapidStream by launching the command below or `make all`.
+
+```bash
+rapidstream run.py
+```
+
+If everything is successful, you should at least get one optimized `.xclbin` file.
+
+
+
+
+### Step 3: Check the Group Module Report
+
+
+RapidStream mandates a clear distinction between communication and computation within user designs.
+
+- In `Group modules`, users are tasked solely with defining inter-submodule communication. For those familiar with Vivado IP Integrator flow, crafting a Group module mirrors the process of connecting IPs in IPI. RapidStream subsequently integrates appropriate pipeline registers into these Group modules.
+
+- In `Leaf modules`, users retain the flexibility to implement diverse computational patterns, as RapidStream leaves these Leaf modules unchanged.
+
+For further details, please consult the [code style](https://docs.rapidstream-da.com/required-coding-style/) section in our Documentation.
+
+To generate a report on group types, execute the commands below or run `make show_groups`:
+
+```bash
+rapidstream ../../../common/util/get_group.py \
+ -i build/passes/0-imported.json \
+ -o build/module_types.csv
+```
+
+The module types for your design can be found in `build/module_types.csv`. Below, we list the four Group modules. In this design, `data_decoding` serves as a Group module, while the other three modules are added by RapidStream.
+
+| Module Name | Group Type |
+|:--------------------------------:|:--------------:|
+| data_decoding | grouped_module |
+|__rs_ap_ctrl_start_ready_pipeline | grouped_module |
+|__rs_ff_pipeline | grouped_module |
+|__rs_hs_pipeline | grouped_module |
diff --git a/benchmarks/tapa_flow/bandwidth23/design/bandwidth-host.cpp b/benchmarks/tapa_flow/bandwidth23/design/bandwidth-host.cpp
new file mode 100644
index 00000000..9471ab2d
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth23/design/bandwidth-host.cpp
@@ -0,0 +1,127 @@
+#include
+#include
+
+#include
+#include
+#include "bandwidth23.h"
+
+using std::clog;
+using std::endl;
+using std::vector;
+
+DEFINE_string(bitstream, "", "path to bitstream file, run csim if empty");
+
+int main(int argc, char* argv[]) {
+ gflags::ParseCommandLineFlags(&argc, &argv, /*remove_flags=*/true);
+
+ const uint64_t n = argc > 1 ? atoll(argv[1]) : 1024 * 1024;
+
+ vector rmem0(n);
+ vector rmem1(n);
+ vector rmem2(n);
+ vector rmem3(n);
+ vector rmem4(n);
+ vector rmem5(n);
+ vector rmem6(n);
+ vector rmem7(n);
+ vector rmem8(n);
+ vector rmem9(n);
+ vector rmem10(n);
+ vector rmem11(n);
+ vector rmem12(n);
+ vector rmem13(n);
+ vector rmem14(n);
+ vector rmem15(n);
+ vector rmem16(n);
+ vector rmem17(n);
+ vector rmem18(n);
+ vector rmem19(n);
+ vector rmem20(n);
+ vector rmem21(n);
+ vector rmem22(n);
+
+
+ for (uint64_t i = 0; i < n; ++i) {
+ rmem0[i] = i;
+ rmem1[i] = i;
+ rmem2[i] = i;
+ rmem3[i] = i;
+ rmem4[i] = i;
+ rmem5[i] = i;
+ rmem6[i] = i;
+ rmem7[i] = i;
+ rmem8[i] = i;
+ rmem9[i] = i;
+ rmem10[i] = i;
+ rmem11[i] = i;
+ rmem12[i] = i;
+ rmem13[i] = i;
+ rmem14[i] = i;
+ rmem15[i] = i;
+ rmem16[i] = i;
+ rmem17[i] = i;
+ rmem18[i] = i;
+ rmem19[i] = i;
+ rmem20[i] = i;
+ rmem21[i] = i;
+ rmem22[i] = i;
+ }
+ int64_t kernel_time_ns = tapa::invoke(
+ bandwidth23,
+ FLAGS_bitstream,
+ tapa::read_write_mmap(rmem0),
+ tapa::read_write_mmap(rmem1),
+ tapa::read_write_mmap(rmem2),
+ tapa::read_write_mmap(rmem3),
+ tapa::read_write_mmap(rmem4),
+ tapa::read_write_mmap(rmem5),
+ tapa::read_write_mmap(rmem6),
+ tapa::read_write_mmap(rmem7),
+ tapa::read_write_mmap(rmem8),
+ tapa::read_write_mmap(rmem9),
+ tapa::read_write_mmap(rmem10),
+ tapa::read_write_mmap(rmem11),
+ tapa::read_write_mmap(rmem12),
+ tapa::read_write_mmap(rmem13),
+ tapa::read_write_mmap(rmem14),
+ tapa::read_write_mmap(rmem15),
+ tapa::read_write_mmap(rmem16),
+ tapa::read_write_mmap(rmem17),
+ tapa::read_write_mmap(rmem18),
+ tapa::read_write_mmap(rmem19),
+ tapa::read_write_mmap(rmem20),
+ tapa::read_write_mmap(rmem21),
+ tapa::read_write_mmap(rmem22),
+ n);
+
+ clog << "kernel time: " << kernel_time_ns * 1e-9 << " s" << endl;
+
+ uint64_t num_errors = 0;
+ const uint64_t threshold = 10; // only report up to these errors
+ for (uint64_t i = 0; i < n; ++i) {
+ bit512 out512 = (i << 1);
+ if (rmem0[i] != out512) {
+ if (num_errors < threshold) {
+ clog << "error at " << i << ": expected " << rmem0[i] << ", got "
+ << out512 << endl;
+ }
+ ++num_errors;
+ }
+ if (rmem22[i] != out512) {
+ if (num_errors < threshold) {
+ clog << "error at " << i << ": expected " << rmem22[i] << ", got "
+ << out512 << endl;
+ }
+ ++num_errors;
+ }
+ }
+ if (num_errors == 0) {
+ clog << "PASS!" << endl;
+ } else {
+ if (num_errors > threshold) {
+ clog << " (+" << (num_errors - threshold) << " more errors)" << endl;
+ }
+ clog << "FAIL!" << endl;
+ }
+ return num_errors > 0 ? 1 : 0;
+}
diff --git a/benchmarks/tapa_flow/bandwidth23/design/bandwidth23.cpp b/benchmarks/tapa_flow/bandwidth23/design/bandwidth23.cpp
new file mode 100644
index 00000000..e2d60f9d
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth23/design/bandwidth23.cpp
@@ -0,0 +1,176 @@
+#include
+
+#include
+#include "bandwidth23.h"
+
+void yshift(tapa::istream& a, tapa::ostream& b, uint64_t n) {
+ for (uint64_t i = 0; i < n; ++i) {
+ bit512 tmp;
+ tmp = a.read();
+ tmp = (tmp << 1);
+ b.write(tmp);
+ }
+}
+
+void Mmap2Stream(
+ tapa::mmap mmap,
+ uint64_t n,
+ tapa::ostream& stream){
+
+ for (uint64_t i = 0; i < n; ++i) {
+ stream << mmap[i];
+ }
+}
+
+void Stream2Mmap(tapa::istream& stream, tapa::mmap mmap,
+ uint64_t n) {
+ for (uint64_t i = 0; i < n; ++i) {
+ mmap[i] = stream.read();
+ }
+}
+
+void bandwidth23(
+ tapa::mmap ch_0,
+ tapa::mmap ch_1,
+ tapa::mmap ch_2,
+ tapa::mmap ch_3,
+ tapa::mmap ch_4,
+ tapa::mmap ch_5,
+ tapa::mmap ch_6,
+ tapa::mmap ch_7,
+ tapa::mmap ch_8,
+ tapa::mmap ch_9,
+ tapa::mmap ch_10,
+ tapa::mmap ch_11,
+ tapa::mmap ch_12,
+ tapa::mmap ch_13,
+ tapa::mmap ch_14,
+ tapa::mmap ch_15,
+ tapa::mmap ch_16,
+ tapa::mmap ch_17,
+ tapa::mmap ch_18,
+ tapa::mmap ch_19,
+ tapa::mmap ch_20,
+ tapa::mmap ch_21,
+ tapa::mmap ch_22,
+ uint64_t n) {
+
+ tapa::stream qr0("qr0");
+ tapa::stream qr1("qr1");
+ tapa::stream qr2("qr2");
+ tapa::stream qr3("qr3");
+ tapa::stream qr4("qr4");
+ tapa::stream qr5("qr5");
+ tapa::stream qr6("qr6");
+ tapa::stream qr7("qr7");
+ tapa::stream qr8("qr8");
+ tapa::stream qr9("qr9");
+ tapa::stream qr10("qr10");
+ tapa::stream qr11("qr11");
+ tapa::stream qr12("qr12");
+ tapa::stream qr13("qr13");
+ tapa::stream qr14("qr14");
+ tapa::stream qr15("qr15");
+ tapa::stream qr16("qr16");
+ tapa::stream qr17("qr17");
+ tapa::stream qr18("qr18");
+ tapa::stream qr19("qr19");
+ tapa::stream qr20("qr20");
+ tapa::stream qr21("qr21");
+ tapa::stream qr22("qr22");
+
+ tapa::stream qw0("qw0");
+ tapa::stream qw1("qw1");
+ tapa::stream qw2("qw2");
+ tapa::stream qw3("qw3");
+ tapa::stream qw4("qw4");
+ tapa::stream qw5("qw5");
+ tapa::stream qw6("qw6");
+ tapa::stream qw7("qw7");
+ tapa::stream qw8("qw8");
+ tapa::stream qw9("qw9");
+ tapa::stream qw10("qw10");
+ tapa::stream qw11("qw11");
+ tapa::stream qw12("qw12");
+ tapa::stream qw13("qw13");
+ tapa::stream qw14("qw14");
+ tapa::stream qw15("qw15");
+ tapa::stream qw16("qw16");
+ tapa::stream qw17("qw17");
+ tapa::stream qw18("qw18");
+ tapa::stream qw19("qw19");
+ tapa::stream qw20("qw20");
+ tapa::stream qw21("qw21");
+ tapa::stream qw22("qw22");
+
+ tapa::task()
+ .invoke(Mmap2Stream, ch_0, n, qr0)
+ .invoke(Mmap2Stream, ch_1, n, qr1)
+ .invoke(Mmap2Stream, ch_2, n, qr2)
+ .invoke(Mmap2Stream, ch_3, n, qr3)
+ .invoke(Mmap2Stream, ch_4, n, qr4)
+ .invoke(Mmap2Stream, ch_5, n, qr5)
+ .invoke(Mmap2Stream, ch_6, n, qr6)
+ .invoke(Mmap2Stream, ch_7, n, qr7)
+ .invoke(Mmap2Stream, ch_8, n, qr8)
+ .invoke(Mmap2Stream, ch_9, n, qr9)
+ .invoke(Mmap2Stream, ch_10, n, qr10)
+ .invoke(Mmap2Stream, ch_11, n, qr11)
+ .invoke(Mmap2Stream, ch_12, n, qr12)
+ .invoke(Mmap2Stream, ch_13, n, qr13)
+ .invoke(Mmap2Stream, ch_14, n, qr14)
+ .invoke(Mmap2Stream, ch_15, n, qr15)
+ .invoke(Mmap2Stream, ch_16, n, qr16)
+ .invoke(Mmap2Stream, ch_17, n, qr17)
+ .invoke(Mmap2Stream, ch_18, n, qr18)
+ .invoke(Mmap2Stream, ch_19, n, qr19)
+ .invoke(Mmap2Stream, ch_20, n, qr20)
+ .invoke(Mmap2Stream, ch_21, n, qr21)
+ .invoke(Mmap2Stream, ch_22, n, qr22)
+ .invoke(yshift, qr0, qw0, n)
+ .invoke(yshift, qr1, qw1, n)
+ .invoke(yshift, qr2, qw2, n)
+ .invoke(yshift, qr3, qw3, n)
+ .invoke(yshift, qr4, qw4, n)
+ .invoke(yshift, qr5, qw5, n)
+ .invoke(yshift, qr6, qw6, n)
+ .invoke(yshift, qr7, qw7, n)
+ .invoke(yshift, qr8, qw8, n)
+ .invoke(yshift, qr9, qw9, n)
+ .invoke(yshift, qr10, qw10, n)
+ .invoke(yshift, qr11, qw11, n)
+ .invoke(yshift, qr12, qw12, n)
+ .invoke(yshift, qr13, qw13, n)
+ .invoke(yshift, qr14, qw14, n)
+ .invoke(yshift, qr15, qw15, n)
+ .invoke(yshift, qr16, qw16, n)
+ .invoke(yshift, qr17, qw17, n)
+ .invoke(yshift, qr18, qw18, n)
+ .invoke(yshift, qr19, qw19, n)
+ .invoke(yshift, qr20, qw20, n)
+ .invoke(yshift, qr21, qw21, n)
+ .invoke(yshift, qr22, qw22, n)
+ .invoke(Stream2Mmap, qw0, ch_0, n)
+ .invoke(Stream2Mmap, qw1, ch_1, n)
+ .invoke(Stream2Mmap, qw2, ch_2, n)
+ .invoke(Stream2Mmap, qw3, ch_3, n)
+ .invoke(Stream2Mmap, qw4, ch_4, n)
+ .invoke(Stream2Mmap, qw5, ch_5, n)
+ .invoke(Stream2Mmap, qw6, ch_6, n)
+ .invoke(Stream2Mmap, qw7, ch_7, n)
+ .invoke(Stream2Mmap, qw8, ch_8, n)
+ .invoke(Stream2Mmap, qw9, ch_9, n)
+ .invoke(Stream2Mmap, qw10, ch_10, n)
+ .invoke(Stream2Mmap, qw11, ch_11, n)
+ .invoke(Stream2Mmap, qw12, ch_12, n)
+ .invoke(Stream2Mmap, qw13, ch_13, n)
+ .invoke(Stream2Mmap, qw14, ch_14, n)
+ .invoke(Stream2Mmap, qw15, ch_15, n)
+ .invoke(Stream2Mmap, qw16, ch_16, n)
+ .invoke(Stream2Mmap, qw17, ch_17, n)
+ .invoke(Stream2Mmap, qw18, ch_18, n)
+ .invoke(Stream2Mmap, qw19, ch_19, n)
+ .invoke(Stream2Mmap, qw20, ch_20, n)
+ .invoke(Stream2Mmap, qw21, ch_21, n)
+ .invoke(Stream2Mmap, qw22, ch_22, n);
+}
diff --git a/benchmarks/tapa_flow/bandwidth23/design/bandwidth23.h b/benchmarks/tapa_flow/bandwidth23/design/bandwidth23.h
new file mode 100644
index 00000000..5686779d
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth23/design/bandwidth23.h
@@ -0,0 +1,37 @@
+
+#ifndef __VADD_BW_H__
+#define __VADD_BW_H__
+#include
+
+#include
+#include
+
+typedef ap_uint<512> bit512;
+
+void bandwidth23(
+ tapa::mmap ch_0,
+ tapa::mmap ch_1,
+ tapa::mmap ch_2,
+ tapa::mmap ch_3,
+ tapa::mmap ch_4,
+ tapa::mmap ch_5,
+ tapa::mmap ch_6,
+ tapa::mmap ch_7,
+ tapa::mmap ch_8,
+ tapa::mmap ch_9,
+ tapa::mmap ch_10,
+ tapa::mmap ch_11,
+ tapa::mmap ch_12,
+ tapa::mmap ch_13,
+ tapa::mmap ch_14,
+ tapa::mmap ch_15,
+ tapa::mmap ch_16,
+ tapa::mmap ch_17,
+ tapa::mmap ch_18,
+ tapa::mmap ch_19,
+ tapa::mmap ch_20,
+ tapa::mmap ch_21,
+ tapa::mmap ch_22,
+ uint64_t n);
+
+#endif
diff --git a/benchmarks/tapa_flow/bandwidth23/design/config/run_au55c.py/ab_config.json b/benchmarks/tapa_flow/bandwidth23/design/config/run_au55c.py/ab_config.json
new file mode 100644
index 00000000..5676d8e8
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth23/design/config/run_au55c.py/ab_config.json
@@ -0,0 +1,34 @@
+{
+ "dse_range_max": 0.8,
+ "dse_range_min": 0.7,
+ "partition_strategy": "flat",
+ "port_pre_assignments": {
+ ".*ch_0_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_10_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_11_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_12_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_13_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_14_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_15_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_16_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_17_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_18_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_19_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_1_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_20_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_21_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_22_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_2_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_3_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_4_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_5_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_6_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_7_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_8_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_9_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ "ap_clk": "SLOT_X0Y0:SLOT_X0Y0",
+ "ap_rst_n": "SLOT_X0Y0:SLOT_X0Y0",
+ "interrupt": "SLOT_X0Y0:SLOT_X0Y0",
+ "s_axi_control_.*": "SLOT_X0Y0:SLOT_X0Y0"
+ }
+}
diff --git a/benchmarks/tapa_flow/bandwidth23/design/config/run_au55c.py/impl_config.json b/benchmarks/tapa_flow/bandwidth23/design/config/run_au55c.py/impl_config.json
new file mode 100644
index 00000000..3c481977
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth23/design/config/run_au55c.py/impl_config.json
@@ -0,0 +1,7 @@
+{
+ "max_workers": 2,
+ "port_to_clock_period": {
+ "ap_clk": 3.33
+ },
+ "vitis_platform": "xilinx_u55c_gen3x16_xdma_3_202210_1"
+}
diff --git a/benchmarks/tapa_flow/bandwidth23/design/config/run_au55c.py/link_config.ini b/benchmarks/tapa_flow/bandwidth23/design/config/run_au55c.py/link_config.ini
new file mode 100644
index 00000000..c19a4a5a
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth23/design/config/run_au55c.py/link_config.ini
@@ -0,0 +1,24 @@
+[connectivity]
+sp=bandwidth23.ch_0:HBM[0:1]
+sp=bandwidth23.ch_1:HBM[0:1]
+sp=bandwidth23.ch_2:HBM[0:1]
+sp=bandwidth23.ch_3:HBM[0:1]
+sp=bandwidth23.ch_4:HBM[0:1]
+sp=bandwidth23.ch_5:HBM[0:1]
+sp=bandwidth23.ch_6:HBM[0:1]
+sp=bandwidth23.ch_7:HBM[0:1]
+sp=bandwidth23.ch_8:HBM[0:1]
+sp=bandwidth23.ch_9:HBM[0:1]
+sp=bandwidth23.ch_10:HBM[0:1]
+sp=bandwidth23.ch_11:HBM[0:1]
+sp=bandwidth23.ch_12:HBM[0:1]
+sp=bandwidth23.ch_13:HBM[0:1]
+sp=bandwidth23.ch_14:HBM[0:1]
+sp=bandwidth23.ch_15:HBM[0:1]
+sp=bandwidth23.ch_16:HBM[0:1]
+sp=bandwidth23.ch_17:HBM[0:1]
+sp=bandwidth23.ch_18:HBM[0:1]
+sp=bandwidth23.ch_19:HBM[0:1]
+sp=bandwidth23.ch_20:HBM[0:1]
+sp=bandwidth23.ch_21:HBM[0:1]
+sp=bandwidth23.ch_22:HBM[0:1]
diff --git a/benchmarks/tapa_flow/bandwidth23/design/config/run_vck5000.py/ab_config.json b/benchmarks/tapa_flow/bandwidth23/design/config/run_vck5000.py/ab_config.json
new file mode 100644
index 00000000..b9325669
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth23/design/config/run_vck5000.py/ab_config.json
@@ -0,0 +1,34 @@
+{
+ "dse_range_max": 0.8,
+ "dse_range_min": 0.7,
+ "partition_strategy": "flat",
+ "port_pre_assignments": {
+ ".*ch_0_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_10_.*": "SLOT_X1Y0:SLOT_X1Y0",
+ ".*ch_11_.*": "SLOT_X1Y0:SLOT_X1Y0",
+ ".*ch_12_.*": "SLOT_X1Y0:SLOT_X1Y0",
+ ".*ch_13_.*": "SLOT_X1Y0:SLOT_X1Y0",
+ ".*ch_14_.*": "SLOT_X1Y0:SLOT_X1Y0",
+ ".*ch_15_.*": "SLOT_X1Y0:SLOT_X1Y0",
+ ".*ch_16_.*": "SLOT_X1Y0:SLOT_X1Y0",
+ ".*ch_17_.*": "SLOT_X1Y1:SLOT_X1Y1",
+ ".*ch_18_.*": "SLOT_X1Y1:SLOT_X1Y1",
+ ".*ch_19_.*": "SLOT_X1Y1:SLOT_X1Y1",
+ ".*ch_1_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_20_.*": "SLOT_X1Y1:SLOT_X1Y1",
+ ".*ch_21_.*": "SLOT_X1Y1:SLOT_X1Y1",
+ ".*ch_22_.*": "SLOT_X1Y1:SLOT_X1Y1",
+ ".*ch_2_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_3_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_4_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_5_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_6_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_7_.*": "SLOT_X0Y1:SLOT_X0Y1",
+ ".*ch_8_.*": "SLOT_X0Y1:SLOT_X0Y1",
+ ".*ch_9_.*": "SLOT_X0Y1:SLOT_X0Y1",
+ "ap_clk": "SLOT_X0Y0:SLOT_X0Y0",
+ "ap_rst_n": "SLOT_X0Y0:SLOT_X0Y0",
+ "interrupt": "SLOT_X0Y0:SLOT_X0Y0",
+ "s_axi_control_.*": "SLOT_X0Y0:SLOT_X0Y0"
+ }
+}
diff --git a/benchmarks/tapa_flow/bandwidth23/design/config/run_vck5000.py/impl_config.json b/benchmarks/tapa_flow/bandwidth23/design/config/run_vck5000.py/impl_config.json
new file mode 100644
index 00000000..9b47f4ca
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth23/design/config/run_vck5000.py/impl_config.json
@@ -0,0 +1,7 @@
+{
+ "max_workers": 2,
+ "port_to_clock_period": {
+ "ap_clk": 3.33
+ },
+ "vitis_platform": "xilinx_vck5000_gen4x8_qdma_2_202220_1"
+}
diff --git a/benchmarks/tapa_flow/bandwidth23/design/config/run_vck5000.py/link_config.ini b/benchmarks/tapa_flow/bandwidth23/design/config/run_vck5000.py/link_config.ini
new file mode 100644
index 00000000..cf375c2d
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth23/design/config/run_vck5000.py/link_config.ini
@@ -0,0 +1,27 @@
+platform=xilinx_vck5000_gen4x8_qdma_2_202220_1
+
+[connectivity]
+
+sp = bandwidth23.m_axi_ch_0:MC_NOC0
+sp = bandwidth23.m_axi_ch_1:MC_NOC0
+sp = bandwidth23.m_axi_ch_2:MC_NOC0
+sp = bandwidth23.m_axi_ch_3:MC_NOC0
+sp = bandwidth23.m_axi_ch_4:MC_NOC0
+sp = bandwidth23.m_axi_ch_5:MC_NOC0
+sp = bandwidth23.m_axi_ch_6:MC_NOC0
+sp = bandwidth23.m_axi_ch_7:MC_NOC0
+sp = bandwidth23.m_axi_ch_8:MC_NOC0
+sp = bandwidth23.m_axi_ch_9:MC_NOC0
+sp = bandwidth23.m_axi_ch_10:MC_NOC0
+sp = bandwidth23.m_axi_ch_11:MC_NOC0
+sp = bandwidth23.m_axi_ch_12:MC_NOC0
+sp = bandwidth23.m_axi_ch_13:MC_NOC0
+sp = bandwidth23.m_axi_ch_14:MC_NOC0
+sp = bandwidth23.m_axi_ch_15:MC_NOC0
+sp = bandwidth23.m_axi_ch_16:MC_NOC0
+sp = bandwidth23.m_axi_ch_17:MC_NOC0
+sp = bandwidth23.m_axi_ch_18:MC_NOC0
+sp = bandwidth23.m_axi_ch_19:MC_NOC0
+sp = bandwidth23.m_axi_ch_20:MC_NOC0
+sp = bandwidth23.m_axi_ch_21:MC_NOC0
+sp = bandwidth23.m_axi_ch_22:MC_NOC0
diff --git a/benchmarks/tapa_flow/bandwidth23/design/run_tapa.sh b/benchmarks/tapa_flow/bandwidth23/design/run_tapa.sh
new file mode 100644
index 00000000..0071559b
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth23/design/run_tapa.sh
@@ -0,0 +1,9 @@
+WORK_DIR=work.out
+
+tapa compile \
+ --top data_decoding \
+ --part-num xcu55c-fsvh2892-2L-e \
+ --clock-period 3.33 \
+ -o ${WORK_DIR}/data_decoding.xo \
+ -f src/data_decoder.cpp \
+ 2>&1 | tee tapa.log
diff --git a/benchmarks/tapa_flow/bandwidth23/run_au55c.py b/benchmarks/tapa_flow/bandwidth23/run_au55c.py
new file mode 100644
index 00000000..8ea706e5
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth23/run_au55c.py
@@ -0,0 +1,42 @@
+__copyright__ = """
+Copyright (c) 2024 RapidStream Design Automation, Inc. and contributors. All rights reserved.
+The contributor(s) of this file has/have agreed to the RapidStream Contributor License Agreement.
+"""
+
+from rapidstream import get_u55c_vitis_device_factory
+import os
+from pathlib import Path
+
+CURR_DIR = os.path.dirname(os.path.abspath(__file__))
+CURR_FILE = os.path.basename(__file__)
+
+VITIS_PLATFORM = "xilinx_u55c_gen3x16_xdma_3_202210_1"
+XO_PATH = f"{CURR_DIR}/design/generated/data_decoding.xo"
+
+factory = get_u55c_vitis_device_factory(VITIS_PLATFORM)
+
+# Reserve resource for the HBM Memory Sub-System.
+# The HMSS is not part of the user kernel so the partition optimization process
+# is unaware of its existence. We need to manually reserve resources for it.
+# For 512-bit HBM channels, each HBM channel uses approximately the following resources:
+# AREA_PER_HBM_CHANNEL = {
+# "LUT": 5000,
+# "FF": 6500,
+# "BRAM": 0,
+# "URAM": 0,
+# "DSP": 0,
+# }
+factory.reduce_slot_area(0, 0, lut=150800)
+factory.reduce_slot_area(0, 1, lut=146960)
+factory.reduce_slot_area(0, 2, lut=146960)
+factory.reduce_slot_area(1, 0, lut=128000)
+factory.reduce_slot_area(1, 1, lut=107840)
+factory.reduce_slot_area(1, 2, lut=115120)
+
+
+# For this U280 platform, the right most DSP column on the boundary between
+# dynamic/static region is not usable. So we need to adjust the DSP count
+# to reflect the actual available DSPs.
+print("Reducing DSP of (1, 1) to make it less congested")
+factory.reduce_slot_area(1, 1, dsp=100)
+factory.generate_virtual_device(Path(f"{CURR_DIR}/build/{CURR_FILE}/device.json"))
diff --git a/benchmarks/tapa_flow/bandwidth23/run_vck5000.py b/benchmarks/tapa_flow/bandwidth23/run_vck5000.py
new file mode 100644
index 00000000..ae36f962
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth23/run_vck5000.py
@@ -0,0 +1,84 @@
+__copyright__ = """
+Copyright (c) 2024 RapidStream Design Automation, Inc. and contributors. All rights reserved.
+The contributor(s) of this file has/have agreed to the RapidStream Contributor License Agreement.
+"""
+
+import os
+from pathlib import Path
+
+from rapidstream import DeviceFactory
+
+CURR_DIR = os.path.dirname(os.path.abspath(__file__))
+CURR_FILE = os.path.basename(__file__)
+
+VITIS_PLATFORM = "xilinx_vck5000_gen4x8_qdma_2_202220_1"
+VCK5000_PART_NAME = "xcvc1902-vsvd1760-2MP-e-S"
+
+
+factory = DeviceFactory(row=2, col=2, part_num=VCK5000_PART_NAME, board_name=None)
+
+for x in range(2):
+ for y in range(2):
+ pblock = f"-add CLOCKREGION_X{x*4}Y{y*4}:CLOCKREGION_X{x*4+3}Y{y*4+3}"
+ factory.set_slot_pblock(x, y, [pblock])
+
+
+# set SLR crossing capacity
+for x in range(2):
+ factory.set_slot_capacity(x, 0, north=11520)
+ factory.set_slot_capacity(x, 1, north=11520)
+
+ factory.set_slot_capacity(x, 1, south=11520)
+ # factory.set_slot_capacity(x, 2, south=11520)
+
+# Set W/E capacity
+for y in range(2):
+ factory.set_slot_capacity(0, y, east=40320)
+ factory.set_slot_capacity(1, y, west=40320)
+# factory.set_slot_capacity(0, 2, east=41178)
+# factory.set_slot_capacity(1, 2, west=41178)
+
+
+factory.set_platform_name(VITIS_PLATFORM)
+factory.set_user_pblock_name("pblock_dynamic_region")
+
+factory.set_slot_pblock(0, 0, ["-add CLOCKREGION_X0Y1:CLOCKREGION_X4Y2"])
+factory.set_slot_pblock(1, 0, ["-add CLOCKREGION_X5Y1:CLOCKREGION_X9Y2"])
+factory.set_slot_pblock(0, 1, ["-add CLOCKREGION_X0Y3:CLOCKREGION_X4Y4"])
+factory.set_slot_pblock(1, 1, ["-add CLOCKREGION_X5Y3:CLOCKREGION_X9Y4"])
+
+
+# Vitis uses 4395 nets from SLR0 to SLR1
+# factory.set_slot_capacity(1, 0, north=11520 - 4395)
+# factory.set_slot_capacity(1, 1, north=11520 - 4395)
+
+# Vitis uses 4185 nets from SLR1 to SLR2
+# factory.set_slot_capacity(1, 1, south=11520 - 4185)
+
+
+factory.extract_slot_resources()
+
+
+# Reserve resource for the HBM Memory Sub-System.
+# The HMSS is not part of the user kernel so the partition optimization process
+# is unaware of its existence. We need to manually reserve resources for it.
+# For 512-bit HBM channels, each HBM channel uses approximately the following resources:
+# AREA_PER_HBM_CHANNEL = {
+# "LUT": 5000,
+# "FF": 6500,
+# "BRAM": 0,
+# "URAM": 0,
+# "DSP": 0,
+# }
+# factory.reduce_slot_area(0, 0, lut=150800)
+# factory.reduce_slot_area(0, 1, lut=146960)
+# factory.reduce_slot_area(1, 0, lut=128000)
+# factory.reduce_slot_area(1, 1, lut=107840)
+
+
+# For this U280 platform, the right most DSP column on the boundary between
+# dynamic/static region is not usable. So we need to adjust the DSP count
+# to reflect the actual available DSPs.
+print("Reducing DSP of (1, 1) to make it less congested")
+factory.reduce_slot_area(1, 1, dsp=100)
+factory.generate_virtual_device(Path(f"{CURR_DIR}/build/{CURR_FILE}/device.json"))
diff --git a/benchmarks/tapa_flow/bandwidth4/Makefile b/benchmarks/tapa_flow/bandwidth4/Makefile
new file mode 100644
index 00000000..3f2761f9
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth4/Makefile
@@ -0,0 +1,114 @@
+# Copyright (c) 2024 RapidStream Design Automation, Inc. and contributors. All rights reserved.
+# The contributor(s) of this file has/have agreed to the RapidStream Contributor License Agreement.
+
+ROOT_DIR := $(shell git rev-parse --show-toplevel)
+KERNEL_NAME := bandwidth4
+RS_SCRIPT := $(CURDIR)/run.py
+SRC_DIR := $(CURDIR)/design
+AB_CONFIG := $(CURDIR)/design/config/$(notdir $(RS_SCRIPT))/ab_config.json
+IMPL_CONFIG := $(CURDIR)/design/config/$(notdir $(RS_SCRIPT))/impl_config.json
+LINK_CONFIG := $(CURDIR)/design/config/$(notdir $(RS_SCRIPT))/link_config.ini
+PLATFORM := xilinx_vck5000_gen4x8_qdma_2_202220_1
+PART_NUM := xcvc1902-vsvd1760-2MP-e-S
+GRP_UTIL := $(ROOT_DIR)/common/util/get_group.py
+TEMP_DIR := $(CURDIR)/build/$(notdir $(RS_SCRIPT))
+RS_TARGET := $(TEMP_DIR)/dse/solution_0/vitis_run_hw/$(KERNEL_NAME)_$(PLATFORM).xclbin
+BUILD_LOG := $(TEMP_DIR)/build.json
+SUCCESS := "Build Successful"
+TIMING_RPT := impl_1_hw_bb_locked_timing_summary_routed.rpt
+SLACK_GETTER := $(ROOT_DIR)/common/util/get_slack.py
+RSPATH := $(CURDIR)
+RSXX := rapidstream
+RSPYTHON := rapidstream
+DEVICE_CONFIG := $(TEMP_DIR)/device.json
+DEVICE_GEN := $(CURDIR)/gen_device.py
+INCLUDE := -I $(XILINX_HLS)/include
+KERNEL_XO := $(TEMP_DIR)/$(KERNEL_NAME).xo
+KERNEL_XCLBIN := $(TEMP_DIR)/$(KERNEL_NAME).xclbin
+KERNEL_XSA := $(TEMP_DIR)/$(KERNEL_NAME).xsa
+TARGET := hw
+
+all: $(RS_TARGET)
+ cd $(RSPATH) && $(RSPYTHON) $(SLACK_GETTER) -d $(TEMP_DIR) -i $(TIMING_RPT) -o $(BUILD_LOG) -c clk_kernel_00_unbuffered_net -p 3.333
+ @echo $(SUCCESS)
+
+# --run-impl
+$(RS_TARGET):$(KERNEL_XO) $(DEVICE_CONFIG)
+ mkdir -p $(TEMP_DIR)
+ cd $(RSPATH) && $(RSXX)-tapaopt \
+ --work-dir $(TEMP_DIR) \
+ --tapa-xo-path $< \
+ --device-config $(DEVICE_CONFIG) \
+ --floorplan-config $(AB_CONFIG) \
+ --single-reg \
+ --implementation-config $(IMPL_CONFIG) \
+ --connectivity-ini $(LINK_CONFIG)
+
+$(DEVICE_CONFIG):$(AB_CONFIG)
+ mkdir -p $(TEMP_DIR)
+ cd $(RSPATH) && $(RSPYTHON) $(RS_SCRIPT)
+
+cosim:$(KERNEL_XO) $(TEMP_DIR)/main.exe
+ cd $(TEMP_DIR) && $(TEMP_DIR)/main.exe 1024 \
+ --bitstream $< \
+ -xosim_work_dir $(TEMP_DIR)/xosim_work_dir
+
+hw: $(KERNEL_XCLBIN)
+
+$(KERNEL_XCLBIN): $(KERNEL_XSA)
+ @echo "### ***** packaging $(KERNEL_XSA) into $(KERNEL_XCLBIN) ... *****"
+ cd $(TEMP_DIR) && v++ --package -t $(TARGET) --platform $(PLATFORM) \
+ $^ \
+ --temp_dir $(TEMP_DIR) \
+ --save-temps \
+ --report_dir $(TEMP_DIR)/reports/ \
+ --package.boot_mode=ospi \
+ -o $@ 2>&1 | tee $(KERNEL_NAME)_xclbin.log
+ @echo "### ***** $(KERNEL_XCLBIN) packaging done! *****"
+
+$(KERNEL_XSA): $(KERNEL_XO)
+ cd $(TEMP_DIR) && v++ -l -t ${TARGET} \
+ --connectivity.nk $(KERNEL_NAME):1:$(KERNEL_NAME) \
+ --config $(SRC_DIR)/vck5000.cfg \
+ --save-temps \
+ --temp_dir $(TEMP_DIR) \
+ --clock.defaultFreqHz 250000000 \
+ --vivado.synth.jobs 16 \
+ $< -o $@
+
+xo: $(KERNEL_XO)
+
+$(KERNEL_XO):$(SRC_DIR)/$(KERNEL_NAME).cpp
+ mkdir -p $(TEMP_DIR)
+ cd $(TEMP_DIR) && tapa compile \
+ --top $(KERNEL_NAME) \
+ --part-num xcu55c-fsvh2892-2L-e \
+ --clock-period 3.33 \
+ -o $(KERNEL_NAME).xo \
+ -f $< \
+ 2>&1 | tee tapa.log
+
+csim:$(TEMP_DIR)/main.exe
+
+$(TEMP_DIR)/main.exe: $(SRC_DIR)/*.cpp
+ mkdir -p $(TEMP_DIR)
+ cd $(TEMP_DIR) && tapa g++ $^ $(INCLUDE) -o $(TEMP_DIR)/main.exe -O2
+ $(TEMP_DIR)/main.exe
+
+show_groups:
+ rapidstream $(GRP_UTIL) -i $(TEMP_DIR)/passes/0-imported.json \
+ -o $(TEMP_DIR)/module_types.csv
+
+
+
+clean:
+ rm -rf $(TEMP_DIR) *.log
+ rm -rf .Xil .run
+ rm -rf *.exe
+ rm -rf .ipcache
+
+cleanall:
+ rm -rf build *.log
+ rm -rf .Xil .run
+ rm -rf *.exe
+ rm -rf .ipcache
diff --git a/benchmarks/tapa_flow/bandwidth4/README.md b/benchmarks/tapa_flow/bandwidth4/README.md
new file mode 100644
index 00000000..237c8651
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth4/README.md
@@ -0,0 +1,141 @@
+
+
+
+
+# TAPA Flow: Bandwidth4
+
+## Introduction
+
+
+In this recipe, we demonstrate how to use RapidStream to optimize TAPA projects. The basic steps include:
+
+- Compile the HLS C++ code into a Vitis-compatible .xo file using TAPA.
+- Optimize the .xo file with RapidStream to obtain an optimized .xo file.
+- Use Vitis to compile the optimized .xo file into an .xclbin file for FPGA deployment.
+
+## Tutorial
+
+### Step 1 (Done): Generate the Xilinx Object File (`.xo`)
+
+
+We utilize TAPA to generate the `.xo` file. If you do not have TAPA installed, you can use the `.xo` file we have already compiled from the C++ source. The original C++ source files are located in design/src. The generated `.xo` file can be found at design/generated/data_decoding.xo. To compile C++ to `.xo` using TAPA, we use the script [design/run_tapa.sh](design/run_tapa.sh), with the detailed commands shown below. For your convenience, we have also backed up all the metadata generated by TAPA in the design/generated directory.
+
+```bash
+WORK_DIR=generated
+tapac \
+ --work-dir ${WORK_DIR} \
+ --top data_decoding \
+ --part-num xcu280-fsvh2892-2L-e \
+ --clock-period 3.33 \
+ -o ${WORK_DIR}/data_decoding.xo \
+ --connectivity config/link_config.ini \
+ src/data_decoder.cpp \
+ 2>&1 | tee tapa.log
+```
+
+### Step 2: Use RapidStream to Optimize the `.xo` Design
+
+The RapidStream flow conducts design space exploration and generates solutions by taking the TAPA-generated `.xo` file as input.
+The RapidStream flow for TAPA requires the following key inputs:
+
+- **Platform**: The Vitis platform (e.g., `xilinx_u280_gen3x16_xdma_1_202211_1`).
+- **Device**: virtual device defined by calling RapidStream APIs based on the platform (e.g., `get_u280_vitis_device_factory`).
+- **.xo file**: The `.xo` file generated by TAPA
+- **Connectivity** (.ini): Include the configuration file for `v++` design/config/run.py/link_config.ini.
+- **top_module_name**: Top module name for the kernel.
+- **Clock**: All the clock and frequencies.
+- **Flatten Module**: Within a design, not all modules need to be optimized. The flatten module name is the target module rapidstream will optimize.
+
+The Python snippet below shows how we initiate rapidstream instance to set up the rapidstream environment.
+
+```Python
+from rapidstream import get_u280_vitis_device_factory, RapidStreamTAPA
+import os
+
+CURR_DIR = os.path.dirname(os.path.abspath(__file__))
+INI_PATH = f"{CURR_DIR}/design/config/link_config.ini"
+VITIS_PLATFORM = "xilinx_u280_gen3x16_xdma_1_202211_1"
+XO_PATH = f"{CURR_DIR}/design/generated/data_decoding.xo"
+kernel_name = "data_decoding"
+factory = get_u280_vitis_device_factory(VITIS_PLATFORM)
+rs = RapidStreamTAPA(f"{CURR_DIR}/build")
+rs.set_virtual_device(factory.generate_virtual_device())
+rs.add_xo_file(XO_PATH)
+rs.set_vitis_platform(VITIS_PLATFORM)
+rs.set_vitis_connectivity_config(INI_PATH)
+rs.set_top_module_name(kernel_name)
+rs.add_clock("ap_clk", 3.33)
+rs.add_flatten_targets([kernel_name])
+```
+
+The HBM AXI port connection is described in design/config/run.py/link_config.ini.
+
+```bash
+[connectivity]
+sp=data_decoding.input_port:HBM[0:1]
+sp=data_decoding.output_port0_32b_8b:HBM[16:17]
+sp=data_decoding.output_port1_16b_8b:HBM[18:19]
+sp=data_decoding.output_port2_16b_8b:HBM[20:21]
+sp=data_decoding.output_port3_8b:HBM[22:23]
+sp=data_decoding.output_port4_Track:HBM[24:25]
+```
+
+As a result, it is necessary to assign the kernel ports to the appropriate slots. The Python code below demonstrates this process. For comprehensive linking details, please refer to the design/config/run.py/link_config.ini file.
+
+ ```Python
+# Bind ports to HBM 16-31
+right_slot = "SLOT_X1Y0:SLOT_X1Y0"
+left_slot = "SLOT_X0Y0:SLOT_X0Y0"
+rs.assign_port_to_region(".*input_port.*", left_slot)
+rs.assign_port_to_region(".*output_port0_32b_8b.*", right_slot)
+rs.assign_port_to_region(".*output_port1_16b_8b.*", right_slot)
+rs.assign_port_to_region(".*output_port2_16b_8b.*", right_slot)
+rs.assign_port_to_region(".*output_port3_8b.*", right_slot)
+rs.assign_port_to_region(".*output_port4_Track.*", right_slot)
+rs.assign_port_to_region("s_axi_control_.*", left_slot)
+rs.assign_port_to_region("ap_clk", left_slot)
+rs.assign_port_to_region("ap_rst_n", left_slot)
+rs.assign_port_to_region("interrupt", left_slot)
+```
+
+For the complete details, please refer to the [./run_vck5000.py](./run_vck5000.py) file. Invoke RapidStream by launching the command below or running `make all`.
+
+```bash
+rapidstream run_vck5000.py
+```
+
+If everything is successful, you should at least get one optimized `.xclbin` file.
+
+
+
+
+### Step 3: Check the Group Module Report
+
+
+RapidStream mandates a clear distinction between communication and computation within user designs.
+
+- In `Group modules`, users are tasked solely with defining inter-submodule communication. For those familiar with Vivado IP Integrator flow, crafting a Group module mirrors the process of connecting IPs in IPI. RapidStream subsequently integrates appropriate pipeline registers into these Group modules.
+
+- In `Leaf modules`, users retain the flexibility to implement diverse computational patterns, as RapidStream leaves these Leaf modules unchanged.
+
+For further details, please consult the [code style](https://docs.rapidstream-da.com/required-coding-style/) section in our Documentation.
+
+To generate a report on group types, execute the commands below or run `make show_groups`:
+
+```bash
+rapidstream ../../../common/util/get_group.py \
+ -i build/passes/0-imported.json \
+ -o build/module_types.csv
+```
+
+The module types for your design can be found in `build/module_types.csv`. Below, we list the four Group modules. In this design, `data_decoding` serves as a Group module, while the other three modules are added by RapidStream.
+
+| Module Name | Group Type |
+|:--------------------------------:|:--------------:|
+| data_decoding | grouped_module |
+|__rs_ap_ctrl_start_ready_pipeline | grouped_module |
+|__rs_ff_pipeline | grouped_module |
+|__rs_hs_pipeline | grouped_module |
diff --git a/benchmarks/tapa_flow/bandwidth4/design/bandwidth-host.cpp b/benchmarks/tapa_flow/bandwidth4/design/bandwidth-host.cpp
new file mode 100644
index 00000000..340e299d
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth4/design/bandwidth-host.cpp
@@ -0,0 +1,70 @@
+#include
+#include
+
+#include
+#include
+#include "bandwidth4.h"
+
+using std::clog;
+using std::endl;
+using std::vector;
+
+DEFINE_string(bitstream, "", "path to bitstream file, run csim if empty");
+
+int main(int argc, char* argv[]) {
+ gflags::ParseCommandLineFlags(&argc, &argv, /*remove_flags=*/true);
+
+ const uint64_t n = argc > 1 ? atoll(argv[1]) : 1024 * 1024;
+
+ vector rmem0(n);
+ vector rmem1(n);
+ vector rmem2(n);
+ vector rmem3(n);
+
+
+ for (uint64_t i = 0; i < n; ++i) {
+ rmem0[i] = i;
+ rmem1[i] = i;
+ rmem2[i] = i;
+ rmem3[i] = i;
+ }
+ int64_t kernel_time_ns = tapa::invoke(
+ bandwidth4,
+ FLAGS_bitstream,
+ tapa::read_write_mmap(rmem0),
+ tapa::read_write_mmap(rmem1),
+ tapa::read_write_mmap(rmem2),
+ tapa::read_write_mmap(rmem3),
+ n);
+
+ clog << "kernel time: " << kernel_time_ns * 1e-9 << " s" << endl;
+
+ uint64_t num_errors = 0;
+ const uint64_t threshold = 10; // only report up to these errors
+ for (uint64_t i = 0; i < n; ++i) {
+ bit512 out512 = (i << 1);
+ if (rmem0[i] != out512) {
+ if (num_errors < threshold) {
+ clog << "error at " << i << ": expected " << rmem0[i] << ", got "
+ << out512 << endl;
+ }
+ ++num_errors;
+ }
+ if (rmem3[i] != out512) {
+ if (num_errors < threshold) {
+ clog << "error at " << i << ": expected " << rmem3[i] << ", got "
+ << out512 << endl;
+ }
+ ++num_errors;
+ }
+ }
+ if (num_errors == 0) {
+ clog << "PASS!" << endl;
+ } else {
+ if (num_errors > threshold) {
+ clog << " (+" << (num_errors - threshold) << " more errors)" << endl;
+ }
+ clog << "FAIL!" << endl;
+ }
+ return num_errors > 0 ? 1 : 0;
+}
diff --git a/benchmarks/tapa_flow/bandwidth4/design/bandwidth4.cpp b/benchmarks/tapa_flow/bandwidth4/design/bandwidth4.cpp
new file mode 100644
index 00000000..25d1ba55
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth4/design/bandwidth4.cpp
@@ -0,0 +1,62 @@
+#include
+
+#include
+#include "bandwidth4.h"
+
+void yshift(tapa::istream& a, tapa::ostream& b, uint64_t n) {
+ for (uint64_t i = 0; i < n; ++i) {
+ bit512 tmp;
+ tmp = a.read();
+ tmp = (tmp << 1);
+ b.write(tmp);
+ }
+}
+
+void Mmap2Stream(
+ tapa::mmap mmap,
+ uint64_t n,
+ tapa::ostream& stream){
+
+ for (uint64_t i = 0; i < n; ++i) {
+ stream << mmap[i];
+ }
+}
+
+void Stream2Mmap(tapa::istream& stream, tapa::mmap mmap,
+ uint64_t n) {
+ for (uint64_t i = 0; i < n; ++i) {
+ mmap[i] = stream.read();
+ }
+}
+
+void bandwidth4(
+ tapa::mmap ch_0,
+ tapa::mmap ch_1,
+ tapa::mmap ch_2,
+ tapa::mmap ch_3,
+ uint64_t n) {
+
+ tapa::stream qr0("qr0");
+ tapa::stream qr1("qr1");
+ tapa::stream qr2("qr2");
+ tapa::stream qr3("qr3");
+
+ tapa::stream qw0("qw0");
+ tapa::stream qw1("qw1");
+ tapa::stream qw2("qw2");
+ tapa::stream qw3("qw3");
+
+ tapa::task()
+ .invoke(Mmap2Stream, ch_0, n, qr0)
+ .invoke(Mmap2Stream, ch_1, n, qr1)
+ .invoke(Mmap2Stream, ch_2, n, qr2)
+ .invoke(Mmap2Stream, ch_3, n, qr3)
+ .invoke(yshift, qr0, qw0, n)
+ .invoke(yshift, qr1, qw1, n)
+ .invoke(yshift, qr2, qw2, n)
+ .invoke(yshift, qr3, qw3, n)
+ .invoke(Stream2Mmap, qw0, ch_0, n)
+ .invoke(Stream2Mmap, qw1, ch_1, n)
+ .invoke(Stream2Mmap, qw2, ch_2, n)
+ .invoke(Stream2Mmap, qw3, ch_3, n);
+}
diff --git a/benchmarks/tapa_flow/bandwidth4/design/bandwidth4.h b/benchmarks/tapa_flow/bandwidth4/design/bandwidth4.h
new file mode 100644
index 00000000..6974458f
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth4/design/bandwidth4.h
@@ -0,0 +1,18 @@
+
+#ifndef __VADD_BW_H__
+#define __VADD_BW_H__
+#include
+
+#include
+#include
+
+typedef ap_uint<512> bit512;
+
+void bandwidth4(
+ tapa::mmap ch_0,
+ tapa::mmap ch_1,
+ tapa::mmap ch_2,
+ tapa::mmap ch_3,
+ uint64_t n);
+
+#endif
diff --git a/benchmarks/tapa_flow/bandwidth4/design/config/run_vck5000.py/ab_config.json b/benchmarks/tapa_flow/bandwidth4/design/config/run_vck5000.py/ab_config.json
new file mode 100644
index 00000000..264df902
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth4/design/config/run_vck5000.py/ab_config.json
@@ -0,0 +1,15 @@
+{
+ "dse_range_max": 0.8,
+ "dse_range_min": 0.7,
+ "partition_strategy": "flat",
+ "port_pre_assignments": {
+ ".*ch_0_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_1_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_2_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ ".*ch_3_.*": "SLOT_X0Y0:SLOT_X0Y0",
+ "ap_clk": "SLOT_X0Y0:SLOT_X0Y0",
+ "ap_rst_n": "SLOT_X0Y0:SLOT_X0Y0",
+ "interrupt": "SLOT_X0Y0:SLOT_X0Y0",
+ "s_axi_control_.*": "SLOT_X0Y0:SLOT_X0Y0"
+ }
+}
diff --git a/benchmarks/tapa_flow/bandwidth4/design/config/run_vck5000.py/impl_config.json b/benchmarks/tapa_flow/bandwidth4/design/config/run_vck5000.py/impl_config.json
new file mode 100644
index 00000000..9b47f4ca
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth4/design/config/run_vck5000.py/impl_config.json
@@ -0,0 +1,7 @@
+{
+ "max_workers": 2,
+ "port_to_clock_period": {
+ "ap_clk": 3.33
+ },
+ "vitis_platform": "xilinx_vck5000_gen4x8_qdma_2_202220_1"
+}
diff --git a/benchmarks/tapa_flow/bandwidth4/design/config/run_vck5000.py/link_config.ini b/benchmarks/tapa_flow/bandwidth4/design/config/run_vck5000.py/link_config.ini
new file mode 100644
index 00000000..17e6686e
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth4/design/config/run_vck5000.py/link_config.ini
@@ -0,0 +1,8 @@
+platform=xilinx_vck5000_gen4x8_qdma_2_202220_1
+
+[connectivity]
+
+sp = bandwidth4.m_axi_ch_0:MC_NOC0
+sp = bandwidth4.m_axi_ch_1:MC_NOC0
+sp = bandwidth4.m_axi_ch_2:MC_NOC0
+sp = bandwidth4.m_axi_ch_3:MC_NOC0
diff --git a/benchmarks/tapa_flow/bandwidth4/design/run_tapa.sh b/benchmarks/tapa_flow/bandwidth4/design/run_tapa.sh
new file mode 100644
index 00000000..0071559b
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth4/design/run_tapa.sh
@@ -0,0 +1,9 @@
+WORK_DIR=work.out
+
+tapa compile \
+ --top data_decoding \
+ --part-num xcu55c-fsvh2892-2L-e \
+ --clock-period 3.33 \
+ -o ${WORK_DIR}/data_decoding.xo \
+ -f src/data_decoder.cpp \
+ 2>&1 | tee tapa.log
diff --git a/benchmarks/tapa_flow/bandwidth4/run_vck5000.py b/benchmarks/tapa_flow/bandwidth4/run_vck5000.py
new file mode 100644
index 00000000..ae36f962
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth4/run_vck5000.py
@@ -0,0 +1,84 @@
+__copyright__ = """
+Copyright (c) 2024 RapidStream Design Automation, Inc. and contributors. All rights reserved.
+The contributor(s) of this file has/have agreed to the RapidStream Contributor License Agreement.
+"""
+
+import os
+from pathlib import Path
+
+from rapidstream import DeviceFactory
+
+CURR_DIR = os.path.dirname(os.path.abspath(__file__))
+CURR_FILE = os.path.basename(__file__)
+
+VITIS_PLATFORM = "xilinx_vck5000_gen4x8_qdma_2_202220_1"
+VCK5000_PART_NAME = "xcvc1902-vsvd1760-2MP-e-S"
+
+
+factory = DeviceFactory(row=2, col=2, part_num=VCK5000_PART_NAME, board_name=None)
+
+for x in range(2):
+ for y in range(2):
+ pblock = f"-add CLOCKREGION_X{x*4}Y{y*4}:CLOCKREGION_X{x*4+3}Y{y*4+3}"
+ factory.set_slot_pblock(x, y, [pblock])
+
+
+# set SLR crossing capacity
+for x in range(2):
+ factory.set_slot_capacity(x, 0, north=11520)
+ factory.set_slot_capacity(x, 1, north=11520)
+
+ factory.set_slot_capacity(x, 1, south=11520)
+ # factory.set_slot_capacity(x, 2, south=11520)
+
+# Set W/E capacity
+for y in range(2):
+ factory.set_slot_capacity(0, y, east=40320)
+ factory.set_slot_capacity(1, y, west=40320)
+# factory.set_slot_capacity(0, 2, east=41178)
+# factory.set_slot_capacity(1, 2, west=41178)
+
+
+factory.set_platform_name(VITIS_PLATFORM)
+factory.set_user_pblock_name("pblock_dynamic_region")
+
+factory.set_slot_pblock(0, 0, ["-add CLOCKREGION_X0Y1:CLOCKREGION_X4Y2"])
+factory.set_slot_pblock(1, 0, ["-add CLOCKREGION_X5Y1:CLOCKREGION_X9Y2"])
+factory.set_slot_pblock(0, 1, ["-add CLOCKREGION_X0Y3:CLOCKREGION_X4Y4"])
+factory.set_slot_pblock(1, 1, ["-add CLOCKREGION_X5Y3:CLOCKREGION_X9Y4"])
+
+
+# Vitis uses 4395 nets from SLR0 to SLR1
+# factory.set_slot_capacity(1, 0, north=11520 - 4395)
+# factory.set_slot_capacity(1, 1, north=11520 - 4395)
+
+# Vitis uses 4185 nets from SLR1 to SLR2
+# factory.set_slot_capacity(1, 1, south=11520 - 4185)
+
+
+factory.extract_slot_resources()
+
+
+# Reserve resource for the HBM Memory Sub-System.
+# The HMSS is not part of the user kernel so the partition optimization process
+# is unaware of its existence. We need to manually reserve resources for it.
+# For 512-bit HBM channels, each HBM channel uses approximately the following resources:
+# AREA_PER_HBM_CHANNEL = {
+# "LUT": 5000,
+# "FF": 6500,
+# "BRAM": 0,
+# "URAM": 0,
+# "DSP": 0,
+# }
+# factory.reduce_slot_area(0, 0, lut=150800)
+# factory.reduce_slot_area(0, 1, lut=146960)
+# factory.reduce_slot_area(1, 0, lut=128000)
+# factory.reduce_slot_area(1, 1, lut=107840)
+
+
+# For this U280 platform, the right most DSP column on the boundary between
+# dynamic/static region is not usable. So we need to adjust the DSP count
+# to reflect the actual available DSPs.
+print("Reducing DSP of (1, 1) to make it less congested")
+factory.reduce_slot_area(1, 1, dsp=100)
+factory.generate_virtual_device(Path(f"{CURR_DIR}/build/{CURR_FILE}/device.json"))
diff --git a/benchmarks/vitis_flow/bandwidth23/Makefile b/benchmarks/vitis_flow/bandwidth23/Makefile
new file mode 100644
index 00000000..e5963275
--- /dev/null
+++ b/benchmarks/vitis_flow/bandwidth23/Makefile
@@ -0,0 +1,119 @@
+# Copyright (c) 2024 RapidStream Design Automation, Inc. and contributors. All rights reserved.
+# The contributor(s) of this file has/have agreed to the RapidStream Contributor License Agreement.
+
+ROOT_DIR := $(shell git rev-parse --show-toplevel)
+GRP_UTIL := $(ROOT_DIR)/common/util/get_group.py
+PLATFORM := xilinx_vck5000_gen4x8_qdma_2_202220_1
+PART := xcvc1902-vsvd1760-2MP-e-S
+LINK_FILE := link_config_hbm.ini
+KERNEL_NAME := bandwidth23
+HLSXX := vitis_hls
+SRC_DIR := $(CURDIR)/design
+RS_SCRIPT := $(CURDIR)/run.py
+TEMP_DIR := $(CURDIR)/build/$(notdir $(RS_SCRIPT))
+HOST := $(TEMP_DIR)/app.exe
+KERNEL_XO := $(TEMP_DIR)/$(KERNEL_NAME).xo
+KERNEL_XSA := $(TEMP_DIR)/$(KERNEL_NAME).xsa
+KERNEL_XCLBIN := $(TEMP_DIR)/$(KERNEL_NAME).xclbin
+RS_XCLBIN := $(TEMP_DIR)/dse/candidate_0/vitis_run_hw/$(KERNEL_NAME)_$(PLATFORM).xclbin
+CLK_PERIOD_NS := 3
+TARGET := hw
+HLS2RTL_TCL := $(ROOT_DIR)/common/tcl/hls2rtl.tcl
+GEN_XO := 1
+
+BUILD_LOG := $(TEMP_DIR)/build.json
+SUCCESS := "Build Successful"
+TIMING_RPT := impl_1_hw_bb_locked_timing_summary_routed.rpt
+SLACK_GETTER := $(ROOT_DIR)/common/util/get_slack.py
+RSXX := rapidstream
+
+
+
+
+all: $(RS_XCLBIN)
+ $(RSXX) $(SLACK_GETTER) -d $(TEMP_DIR) -i $(TIMING_RPT) -o $(BUILD_LOG) -c clk_kernel_00_unbuffered_net -p 3.333
+ echo $(SUCCESS)
+
+$(RS_XCLBIN):$(KERNEL_XO)
+ $(RSXX) $(RS_SCRIPT)
+
+hw: $(KERNEL_XCLBIN)
+
+$(KERNEL_XCLBIN): $(KERNEL_XSA)
+ @echo "### ***** packaging $(KERNEL_XSA) into $(KERNEL_XCLBIN) ... *****"
+ cd $(TEMP_DIR) && v++ --package -t $(TARGET) --platform $(PLATFORM) \
+ $^ \
+ --temp_dir $(TEMP_DIR) \
+ --save-temps \
+ --report_dir $(TEMP_DIR)/reports/ \
+ --package.boot_mode=ospi \
+ -o $@ 2>&1 | tee $(KERNEL_NAME)_xclbin.log
+ @echo "### ***** $(KERNEL_XCLBIN) packaging done! *****"
+
+$(KERNEL_XSA): $(KERNEL_XO)
+ cd $(TEMP_DIR) && v++ -l -t ${TARGET} \
+ --connectivity.nk $(KERNEL_NAME):1:$(KERNEL_NAME) \
+ --config $(SRC_DIR)/vck5000.cfg \
+ --save-temps \
+ --temp_dir $(TEMP_DIR) \
+ --clock.defaultFreqHz 250000000 \
+ --vivado.synth.jobs 16 \
+ $< -o $@
+
+
+xo:$(KERNEL_XO)
+
+$(KERNEL_XO): $(SRC_DIR)/$(KERNEL_NAME).cpp $(SRC_DIR)/$(KERNEL_NAME).h
+ mkdir -p $(TEMP_DIR)
+ cd $(TEMP_DIR) && v++ -c -t ${TARGET} \
+ --platform $(PLATFORM) \
+ -k $(KERNEL_NAME) \
+ --temp_dir $(TEMP_DIR) \
+ --save-temps \
+ -o $@ \
+ $^
+
+sw_emu: $(HOST) $(SRC_DIR)/$(KERNEL_NAME).cpp $(SRC_DIR)/$(KERNEL_NAME).h
+ mkdir -p $(TEMP_DIR)
+ cd $(TEMP_DIR) && v++ -c -t sw_emu \
+ --platform xilinx_u50_gen3x16_xdma_5_202210_1 \
+ -k $(KERNEL_NAME) \
+ --temp_dir $(TEMP_DIR) \
+ --save-temps \
+ -o $(TEMP_DIR)/$(KERNEL_NAME)_sw_emu.xo \
+ $^
+ cd $(TEMP_DIR) && v++ -l -t sw_emu \
+ $(TEMP_DIR)/$(KERNEL_NAME)_sw_emu.xo \
+ --platform xilinx_u50_gen3x16_xdma_5_202210_1 \
+ --kernel $(KERNEL_NAME) \
+ --connectivity.nk $(KERNEL_NAME):1:$(KERNEL_NAME) \
+ -o $(TEMP_DIR)/$(KERNEL_NAME)_sw_emu.xclbin
+ cd $(TEMP_DIR) && XCL_EMULATION_MODE=sw_emu $< $(TEMP_DIR)/$(KERNEL_NAME)_sw_emu.xclbin
+
+host:$(HOST)
+
+$(HOST): $(SRC_DIR)/host.cpp
+ mkdir -p $(TEMP_DIR)
+ g++ -Wall -g -std=c++11 $(SRC_DIR)/host.cpp -o $@ \
+ -I${XILINX_XRT}/include/ \
+ -I${XILINX_HLS}/include/ \
+ -L${XILINX_XRT}/lib/ -lOpenCL -pthread -lrt -lstdc++
+
+show_groups:
+ rapidstream $(GRP_UTIL) -i $(TEMP_DIR)/passes/0-imported.json \
+ -o $(TEMP_DIR)/module_types.csv
+
+
+
+clean:
+ rm -rf $(TEMP_DIR) *.log
+ rm -rf .Xil .run
+ rm -rf *.exe
+ rm -rf .ipcache
+
+
+cleanall:
+ rm -rf build *.log
+ rm -rf .Xil .run
+ rm -rf *.exe
+ rm -rf .ipcache
diff --git a/benchmarks/vitis_flow/bandwidth23/README.md b/benchmarks/vitis_flow/bandwidth23/README.md
new file mode 100644
index 00000000..f6d4bcdd
--- /dev/null
+++ b/benchmarks/vitis_flow/bandwidth23/README.md
@@ -0,0 +1,118 @@
+
+
+
+
+# Large Language Model Benchmark
+
+## Introduction
+
+In this recipe, we illustrate how to create a Vitis objective file (`.xo`) for a Large Language Model kernel from [Chen *et al.* (TRETS)](https://dl.acm.org/doi/10.1145/3656177) using Vitis, then optimize the `.xo` file with RapidStream, and finally utilize the optimized output in the ongoing Vitis development process.
+
+
+## Tutorial
+
+### Step 1: Generate the Xilinx Object File (`.xo`)
+
+We use Vitis 2023.2 to generate the `.xo` file. Since we want to disable [free running pipeline (FRP)](https://www.xilinx.com/htmldocs/xilinx2021_2/hls-guidance/200-1553.html) feature for HLS step, we use [hls2rtl.tcl](../../../common/tcl/hls2rtl.tcl) to compile the C++ code to `.xo` file.
+
+Run the following command or run `make clean && make xo`:
+
+```bash
+source /Vitis/2023.2/settings64.sh
+make clean
+mkdir -p build
+vitis_hls ../../../common/tcl/hls2rtl.tcl \
+ -l build/vitis_hls_llm.log \
+ -tclargs \
+ xcu50-fsvh2104-2-e \
+ 4 \
+ bert_all \
+ 1 \
+ design/bert_all.cpp design/kernel.h \
+ design/bert_region_1.cpp design/bert_region_1.h \
+ design/bert_region_2.cpp design/bert_region_2.h \
+ design/bert_region_3.cpp design/bert_region_3.h
+```
+
+### Step 2 (Optional): Use Vitis --link to Generate the `.xclbin` File
+
+:warning: **Note**: This step can take hours to complete. We recommend using the RapidStream flow to optimize the `.xo` file instead of generating the `.xclbin` file if you are familiar with AMD Vitis flow.
+
+With the `.xo` file generated, you can use `v++ -link` to generate the `.xclbin` file. Run the following command or execute `make hw`:
+
+```bash
+v++ -l -t hw \
+ --platform xilinx_u50_gen3x16_xdma_5_202210_1 \
+ --kernel bert_all \
+ --connectivity.nk bert_all:1:bert_all \
+ --config design/link_config_hbm.ini \
+ --temp_dir build \
+ -o build/bert_all.xclbin \
+ build/bert_all.xo
+```
+
+### Step 3: Call RapidStream to Optimize the Design
+
+The RapidStream flow conducts design space exploration and generates optimized `.xo` files by taking the Vitis generated `.xo` as the input. The RapidStream flow for Vitis requires four key inputs:
+
+1. **Device**: Specify the Vitis platform name for `v++`.
+2. **Xilinx Object file** (.xo): Provide the file generated by `v++` or Vivado.
+3. **Connectivity** (.ini): Include the configuration file for `v++` ./design/link_config_hbm.ini.
+4. **Clock targets**: Define the desired clock frequencies.
+5. RapidStream automatically handles all other aspects of the flow.
+
+Please refer to [run_u50.py](./run_u50.py) for the complete RapidStream flow.
+To execute the flow and generate optimized `.xo` files,
+Run the following command or execute `make rs_opt`:
+
+```bash
+rapidstream ./run_u50.py
+```
+
+Unlike the example provided in [getting_started/vitis_source](../../../getting_started/vitis_source/run.py), where the `skip_impl` variable is set to `True`, in this case the DSE engine will automatically launch Vitis to link the optimized `.xo` file to the target device and generate the `.xclbin` file.
+
+```bash
+# Skip Vitis implementation.
+rs.run_dse(skip_impl=True)
+```
+
+When finished, you can locate these files using the following command:
+
+
+```bash
+find ./build/dse/ -name *.xclbin
+```
+
+If everything is successful, you should at least get one optimized `.xclbin` file.
+
+
+### Step 4: Check the Group Module Report
+
+
+RapidStream mandates a clear distinction between communication and computation within user designs.
+
+- In `Group modules`, users are tasked solely with defining inter-submodule communication. For those familiar with Vivado IP Integrator flow, crafting a Group module mirrors the process of connecting IPs in IPI. RapidStream subsequently integrates appropriate pipeline registers into these Group modules.
+
+- In `Leaf modules`, users retain the flexibility to implement diverse computational patterns, as RapidStream leaves these Leaf modules unchanged.
+
+For further details, please consult the [code style](https://docs.rapidstream-da.com/required-coding-style/) section in our Documentation.
+
+To generate a report on group types, execute the commands below or run `make show_groups`:
+
+```bash
+rapidstream ../../../common/util/get_group.py \
+ -i build/passes/0-imported.json \
+ -o build/module_types.csv
+```
+
+The module types for your design can be found in `build/module_types.csv`. Below, we list the four Group modules. In this design, `bert_all` serves as a Group module, while the other three modules are added by RapidStream.
+
+| Module Name | Group Type |
+|:--------------------------------:|:--------------:|
+| bert_all | grouped_module |
+|__rs_ap_ctrl_start_ready_pipeline | grouped_module |
+|__rs_ff_pipeline | grouped_module |
+|__rs_hs_pipeline | grouped_module |
diff --git a/benchmarks/vitis_flow/bandwidth23/design/bandwidth23.cpp b/benchmarks/vitis_flow/bandwidth23/design/bandwidth23.cpp
new file mode 100644
index 00000000..e1197a2d
--- /dev/null
+++ b/benchmarks/vitis_flow/bandwidth23/design/bandwidth23.cpp
@@ -0,0 +1,181 @@
+// Copyright 2024 RapidStream Design Automation, Inc.
+// All Rights Reserved.
+
+#include "bandwidth23.h"
+#include
+
+
+void print_512(bit512 din){
+ // Print out the data 64-bit hex per line
+ for (int i = 0; i < 8; i++) {
+ printf("%08x%08x\n", (unsigned int) din(63+i*64, 32+i*64), (unsigned int) din(31+i*64, 0+i*64));
+ }
+}
+
+void read_mem(bit512* mem, hls::stream& ch, long offset) {
+ for (int j = 0; j < 1024; j++) {
+ ch.write(mem[(offset<<10) + j]<<1);
+ }
+}
+
+
+void write_mem(hls::stream& ch, bit512* mem, long offset) {
+ for (int j = 0; j < 1024; j++) {
+ mem[(offset<<10) + j] = ch.read();
+ }
+}
+
+
+
+extern "C" {
+
+void bandwidth23(
+ bit512* ch_0,
+ bit512* ch_1,
+ bit512* ch_2,
+ bit512* ch_3,
+ bit512* ch_4,
+ bit512* ch_5,
+ bit512* ch_6,
+ bit512* ch_7,
+ bit512* ch_8,
+ bit512* ch_9,
+ bit512* ch_10,
+ bit512* ch_11,
+ bit512* ch_12,
+ bit512* ch_13,
+ bit512* ch_14,
+ bit512* ch_15,
+ bit512* ch_16,
+ bit512* ch_17,
+ bit512* ch_18,
+ bit512* ch_19,
+ bit512* ch_20,
+ bit512* ch_21,
+ bit512* ch_22,
+ long n)
+{
+#pragma HLS INTERFACE m_axi port=ch_0 bundle=ch_0
+#pragma HLS INTERFACE m_axi port=ch_1 bundle=ch_1
+#pragma HLS INTERFACE m_axi port=ch_2 bundle=ch_2
+#pragma HLS INTERFACE m_axi port=ch_3 bundle=ch_3
+#pragma HLS INTERFACE m_axi port=ch_4 bundle=ch_4
+#pragma HLS INTERFACE m_axi port=ch_5 bundle=ch_5
+#pragma HLS INTERFACE m_axi port=ch_6 bundle=ch_6
+#pragma HLS INTERFACE m_axi port=ch_7 bundle=ch_7
+#pragma HLS INTERFACE m_axi port=ch_8 bundle=ch_8
+#pragma HLS INTERFACE m_axi port=ch_9 bundle=ch_9
+#pragma HLS INTERFACE m_axi port=ch_10 bundle=ch_10
+#pragma HLS INTERFACE m_axi port=ch_11 bundle=ch_11
+#pragma HLS INTERFACE m_axi port=ch_12 bundle=ch_12
+#pragma HLS INTERFACE m_axi port=ch_13 bundle=ch_13
+#pragma HLS INTERFACE m_axi port=ch_14 bundle=ch_14
+#pragma HLS INTERFACE m_axi port=ch_15 bundle=ch_15
+#pragma HLS INTERFACE m_axi port=ch_16 bundle=ch_16
+#pragma HLS INTERFACE m_axi port=ch_17 bundle=ch_17
+#pragma HLS INTERFACE m_axi port=ch_18 bundle=ch_18
+#pragma HLS INTERFACE m_axi port=ch_19 bundle=ch_19
+#pragma HLS INTERFACE m_axi port=ch_20 bundle=ch_20
+#pragma HLS INTERFACE m_axi port=ch_21 bundle=ch_21
+#pragma HLS INTERFACE m_axi port=ch_22 bundle=ch_22
+#pragma HLS INTERFACE s_axilite port=n bundle=control
+#pragma HLS INTERFACE s_axilite port=return bundle=control
+ hls::stream stream_0;
+#pragma HLS STREAM variable=stream_0 depth=2048
+ hls::stream stream_1;
+#pragma HLS STREAM variable=stream_1 depth=2048
+ hls::stream stream_2;
+#pragma HLS STREAM variable=stream_2 depth=2048
+ hls::stream stream_3;
+#pragma HLS STREAM variable=stream_3 depth=2048
+ hls::stream stream_4;
+#pragma HLS STREAM variable=stream_4 depth=2048
+ hls::stream stream_5;
+#pragma HLS STREAM variable=stream_5 depth=2048
+ hls::stream stream_6;
+#pragma HLS STREAM variable=stream_6 depth=2048
+ hls::stream stream_7;
+#pragma HLS STREAM variable=stream_7 depth=2048
+ hls::stream stream_8;
+#pragma HLS STREAM variable=stream_8 depth=2048
+ hls::stream stream_9;
+#pragma HLS STREAM variable=stream_9 depth=2048
+ hls::stream stream_10;
+#pragma HLS STREAM variable=stream_10 depth=2048
+ hls::stream stream_11;
+#pragma HLS STREAM variable=stream_11 depth=2048
+ hls::stream stream_12;
+#pragma HLS STREAM variable=stream_12 depth=2048
+ hls::stream stream_13;
+#pragma HLS STREAM variable=stream_13 depth=2048
+ hls::stream stream_14;
+#pragma HLS STREAM variable=stream_14 depth=2048
+ hls::stream stream_15;
+#pragma HLS STREAM variable=stream_15 depth=2048
+ hls::stream stream_16;
+#pragma HLS STREAM variable=stream_16 depth=2048
+ hls::stream stream_17;
+#pragma HLS STREAM variable=stream_17 depth=2048
+ hls::stream stream_18;
+#pragma HLS STREAM variable=stream_18 depth=2048
+ hls::stream stream_19;
+#pragma HLS STREAM variable=stream_19 depth=2048
+ hls::stream stream_20;
+#pragma HLS STREAM variable=stream_20 depth=2048
+ hls::stream stream_21;
+#pragma HLS STREAM variable=stream_21 depth=2048
+ hls::stream stream_22;
+#pragma HLS STREAM variable=stream_22 depth=2048
+
+
+ for(int i=0; i<(n>>10); i++){
+ read_mem(ch_0, stream_0, i);
+ read_mem(ch_1, stream_1, i);
+ read_mem(ch_2, stream_2, i);
+ read_mem(ch_3, stream_3, i);
+ read_mem(ch_4, stream_4, i);
+ read_mem(ch_5, stream_5, i);
+ read_mem(ch_6, stream_6, i);
+ read_mem(ch_7, stream_7, i);
+ read_mem(ch_8, stream_8, i);
+ read_mem(ch_9, stream_9, i);
+ read_mem(ch_10, stream_10, i);
+ read_mem(ch_11, stream_11, i);
+ read_mem(ch_12, stream_12, i);
+ read_mem(ch_13, stream_13, i);
+ read_mem(ch_14, stream_14, i);
+ read_mem(ch_15, stream_15, i);
+ read_mem(ch_16, stream_16, i);
+ read_mem(ch_17, stream_17, i);
+ read_mem(ch_18, stream_18, i);
+ read_mem(ch_19, stream_19, i);
+ read_mem(ch_20, stream_20, i);
+ read_mem(ch_21, stream_21, i);
+ read_mem(ch_22, stream_22, i);
+
+ write_mem(stream_0, ch_0, i);
+ write_mem(stream_1, ch_1, i);
+ write_mem(stream_2, ch_2, i);
+ write_mem(stream_3, ch_3, i);
+ write_mem(stream_4, ch_4, i);
+ write_mem(stream_5, ch_5, i);
+ write_mem(stream_6, ch_6, i);
+ write_mem(stream_7, ch_7, i);
+ write_mem(stream_8, ch_8, i);
+ write_mem(stream_9, ch_9, i);
+ write_mem(stream_10, ch_10, i);
+ write_mem(stream_11, ch_11, i);
+ write_mem(stream_12, ch_12, i);
+ write_mem(stream_13, ch_13, i);
+ write_mem(stream_14, ch_14, i);
+ write_mem(stream_15, ch_15, i);
+ write_mem(stream_16, ch_16, i);
+ write_mem(stream_17, ch_17, i);
+ write_mem(stream_18, ch_18, i);
+ write_mem(stream_19, ch_19, i);
+ write_mem(stream_20, ch_20, i);
+ write_mem(stream_21, ch_21, i);
+ write_mem(stream_22, ch_22, i);
+ }
+}
+}
diff --git a/benchmarks/vitis_flow/bandwidth23/design/bandwidth23.h b/benchmarks/vitis_flow/bandwidth23/design/bandwidth23.h
new file mode 100644
index 00000000..e8fffd02
--- /dev/null
+++ b/benchmarks/vitis_flow/bandwidth23/design/bandwidth23.h
@@ -0,0 +1,43 @@
+// Copyright 2024 RapidStream Design Automation, Inc.
+// All Rights Reserved.
+
+#include "stdio.h"
+#include "stdlib.h"
+#include "math.h"
+#include <ap_int.h>
+#include <hls_stream.h>
+
+
+/* Data Type */
+typedef ap_uint<512> bit512;
+typedef ap_uint<64> bit64;
+typedef bit512 data_t ;
+/* Data Type */
+
+
+
+extern "C" { void bandwidth23(
+ bit512* ch_0,
+ bit512* ch_1,
+ bit512* ch_2,
+ bit512* ch_3,
+ bit512* ch_4,
+ bit512* ch_5,
+ bit512* ch_6,
+ bit512* ch_7,
+ bit512* ch_8,
+ bit512* ch_9,
+ bit512* ch_10,
+ bit512* ch_11,
+ bit512* ch_12,
+ bit512* ch_13,
+ bit512* ch_14,
+ bit512* ch_15,
+ bit512* ch_16,
+ bit512* ch_17,
+ bit512* ch_18,
+ bit512* ch_19,
+ bit512* ch_20,
+ bit512* ch_21,
+ bit512* ch_22,
+ long n); }
diff --git a/benchmarks/vitis_flow/bandwidth23/design/host.cpp b/benchmarks/vitis_flow/bandwidth23/design/host.cpp
new file mode 100644
index 00000000..1e5f972d
--- /dev/null
+++ b/benchmarks/vitis_flow/bandwidth23/design/host.cpp
@@ -0,0 +1,339 @@
+// Copyright 2024 RapidStream Design Automation, Inc.
+// All Rights Reserved.
+
+
+#define CL_HPP_TARGET_OPENCL_VERSION 120
+#define CL_HPP_MINIMUM_OPENCL_VERSION 120
+#define CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY 1
+
+#include <fstream>
+#include <iostream>
+#include <vector>
+#include <CL/cl2.hpp>
+#include "bandwidth23.h"
+
+void print_512(bit512 din){
+ // Print out the data 64-bit hex per line
+ for (int i = 0; i < 8; i++) {
+ printf("%08x%08x\n", (unsigned int) din(63+i*64, 32+i*64), (unsigned int) din(31+i*64, 0+i*64));
+ }
+}
+
+#define CHECK_MSG(msg, call) \
+ call; \
+ if (msg != CL_SUCCESS) { \
+ printf("%s:%d Error calling " #call ", error code is: %d\n", __FILE__, __LINE__, msg); \
+ exit(EXIT_FAILURE); \
+ }
+
+static const std::string error_message =
+ "Error: Result mismatch:\n"
+ "i = %d CPU result = %d Device result = %d\n";
+
+int main(int argc, char* argv[]) {
+ // Must specify the xclbin file as the second argument
+ if (argc != 2) {
+ std::cout << "Please run the application by: " << argv[0] << " <xclbin_file>" << std::endl;
+ return EXIT_FAILURE;
+ }
+
+ std::string xclbin_file = argv[1];
+
+ // Calculate the byte size the input data
+ long DATA_SIZE = 4096;
+
+ std::vector<cl::Device> devices;
+ cl_int err;
+ cl::Context context;
+ cl::CommandQueue q;
+ cl::Kernel bandwidth23;
+ cl::Program program;
+ std::vector<cl::Platform> platforms;
+ bool device_found = false;
+
+ // The get_xil_devices will return vector of Xilinx Devices
+ // Iterate through devices and find Xilinx Alveo Device
+ cl::Platform::get(&platforms);
+ for (size_t i = 0; (i < platforms.size()) && (device_found == false); i++) {
+ cl::Platform platform = platforms[i];
+ std::string platformName = platform.getInfo<CL_PLATFORM_NAME>();
+ if (platformName == "Xilinx") {
+ devices.clear();
+ platform.getDevices(CL_DEVICE_TYPE_ACCELERATOR, &devices);
+ if (devices.size()) {
+ device_found = true;
+ break;
+ }
+ }
+ }
+ if (device_found == false) {
+ std::cout << "Error: could not find the target Xilinx Alveo device" << std::endl;
+ return EXIT_FAILURE;
+ }
+
+ std::cout << "INFO: reading " << xclbin_file << " xclbinfile" << std::endl;
+ FILE* fp;
+ if ((fp = fopen(xclbin_file.c_str(), "r")) == nullptr) {
+ std::cout << "ERROR: cannot open" << xclbin_file.c_str() << " xclbin!" << std::endl;
+ exit(EXIT_FAILURE);
+ }
+
+ // Load xclbin
+ std::cout << "INFO: loading: '" << xclbin_file << "'\n";
+ std::ifstream bin_file(xclbin_file, std::ifstream::binary);
+ bin_file.seekg(0, bin_file.end);
+ unsigned nb = bin_file.tellg();
+ bin_file.seekg(0, bin_file.beg);
+ char* buf = new char[nb];
+ bin_file.read(buf, nb);
+
+ // Creating Program from Binary File
+ cl::Program::Binaries bins;
+ bins.push_back({buf, nb});
+ bool valid_device = false;
+ for (unsigned int i = 0; i < devices.size(); i++) {
+ auto device = devices[i];
+ // For the device, we create a context and command queue
+ CHECK_MSG(err, context = cl::Context(device, nullptr, nullptr, nullptr, &err));
+ CHECK_MSG(err, q = cl::CommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err));
+ std::cout << "Trying to program device[" << i << "]: " << device.getInfo<CL_DEVICE_NAME>() << std::endl;
+ cl::Program program(context, {device}, bins, nullptr, &err);
+ if (err != CL_SUCCESS) {
+ std::cout << "Device[" << i << "]: failed to load xclbin file!\n";
+ } else {
+ std::cout << "Device[" << i << "]: xclbin is loaded successfully!\n";
+ CHECK_MSG(err, bandwidth23 = cl::Kernel(program, "bandwidth23", &err));
+ valid_device = true;
+ break; // we break because we found a valid device
+ }
+ }
+ if (!valid_device) {
+ std::cout << "Failed to program any device found, exit!\n";
+ exit(EXIT_FAILURE);
+ }
+
+ // These commands will allocate memory on the Device. The cl::Buffer objects can
+ // be used to reference the memory locations on the device.
+ CHECK_MSG(err, cl::Buffer buffer_ch_0(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+ CHECK_MSG(err, cl::Buffer buffer_ch_1(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+ CHECK_MSG(err, cl::Buffer buffer_ch_2(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+ CHECK_MSG(err, cl::Buffer buffer_ch_3(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+ CHECK_MSG(err, cl::Buffer buffer_ch_4(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+ CHECK_MSG(err, cl::Buffer buffer_ch_5(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+ CHECK_MSG(err, cl::Buffer buffer_ch_6(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+ CHECK_MSG(err, cl::Buffer buffer_ch_7(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+ CHECK_MSG(err, cl::Buffer buffer_ch_8(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+ CHECK_MSG(err, cl::Buffer buffer_ch_9(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+ CHECK_MSG(err, cl::Buffer buffer_ch_10(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+ CHECK_MSG(err, cl::Buffer buffer_ch_11(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+ CHECK_MSG(err, cl::Buffer buffer_ch_12(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+ CHECK_MSG(err, cl::Buffer buffer_ch_13(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+ CHECK_MSG(err, cl::Buffer buffer_ch_14(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+ CHECK_MSG(err, cl::Buffer buffer_ch_15(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+ CHECK_MSG(err, cl::Buffer buffer_ch_16(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+ CHECK_MSG(err, cl::Buffer buffer_ch_17(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+ CHECK_MSG(err, cl::Buffer buffer_ch_18(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+ CHECK_MSG(err, cl::Buffer buffer_ch_19(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+ CHECK_MSG(err, cl::Buffer buffer_ch_20(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+ CHECK_MSG(err, cl::Buffer buffer_ch_21(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+ CHECK_MSG(err, cl::Buffer buffer_ch_22(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+
+
+ // set the kernel Arguments
+ CHECK_MSG(err, err = bandwidth23.setArg(0, buffer_ch_0));
+ CHECK_MSG(err, err = bandwidth23.setArg(1, buffer_ch_1));
+ CHECK_MSG(err, err = bandwidth23.setArg(2, buffer_ch_2));
+ CHECK_MSG(err, err = bandwidth23.setArg(3, buffer_ch_3));
+ CHECK_MSG(err, err = bandwidth23.setArg(4, buffer_ch_4));
+ CHECK_MSG(err, err = bandwidth23.setArg(5, buffer_ch_5));
+ CHECK_MSG(err, err = bandwidth23.setArg(6, buffer_ch_6));
+ CHECK_MSG(err, err = bandwidth23.setArg(7, buffer_ch_7));
+ CHECK_MSG(err, err = bandwidth23.setArg(8, buffer_ch_8));
+ CHECK_MSG(err, err = bandwidth23.setArg(9, buffer_ch_9));
+ CHECK_MSG(err, err = bandwidth23.setArg(10, buffer_ch_10));
+ CHECK_MSG(err, err = bandwidth23.setArg(11, buffer_ch_11));
+ CHECK_MSG(err, err = bandwidth23.setArg(12, buffer_ch_12));
+ CHECK_MSG(err, err = bandwidth23.setArg(13, buffer_ch_13));
+ CHECK_MSG(err, err = bandwidth23.setArg(14, buffer_ch_14));
+ CHECK_MSG(err, err = bandwidth23.setArg(15, buffer_ch_15));
+ CHECK_MSG(err, err = bandwidth23.setArg(16, buffer_ch_16));
+ CHECK_MSG(err, err = bandwidth23.setArg(17, buffer_ch_17));
+ CHECK_MSG(err, err = bandwidth23.setArg(18, buffer_ch_18));
+ CHECK_MSG(err, err = bandwidth23.setArg(19, buffer_ch_19));
+ CHECK_MSG(err, err = bandwidth23.setArg(20, buffer_ch_20));
+ CHECK_MSG(err, err = bandwidth23.setArg(21, buffer_ch_21));
+ CHECK_MSG(err, err = bandwidth23.setArg(22, buffer_ch_22));
+
+
+ // We then need to map our OpenCL buffers to get the pointers
+ data_t* ch_0;
+ data_t* ch_1;
+ data_t* ch_2;
+ data_t* ch_3;
+ data_t* ch_4;
+ data_t* ch_5;
+ data_t* ch_6;
+ data_t* ch_7;
+ data_t* ch_8;
+ data_t* ch_9;
+ data_t* ch_10;
+ data_t* ch_11;
+ data_t* ch_12;
+ data_t* ch_13;
+ data_t* ch_14;
+ data_t* ch_15;
+ data_t* ch_16;
+ data_t* ch_17;
+ data_t* ch_18;
+ data_t* ch_19;
+ data_t* ch_20;
+ data_t* ch_21;
+ data_t* ch_22;
+
+ CHECK_MSG(err, ch_0 = (data_t*)q.enqueueMapBuffer(buffer_ch_0, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err));
+ CHECK_MSG(err, ch_1 = (data_t*)q.enqueueMapBuffer(buffer_ch_1, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err));
+ CHECK_MSG(err, ch_2 = (data_t*)q.enqueueMapBuffer(buffer_ch_2, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err));
+ CHECK_MSG(err, ch_3 = (data_t*)q.enqueueMapBuffer(buffer_ch_3, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err));
+ CHECK_MSG(err, ch_4 = (data_t*)q.enqueueMapBuffer(buffer_ch_4, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err));
+ CHECK_MSG(err, ch_5 = (data_t*)q.enqueueMapBuffer(buffer_ch_5, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err));
+ CHECK_MSG(err, ch_6 = (data_t*)q.enqueueMapBuffer(buffer_ch_6, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err));
+ CHECK_MSG(err, ch_7 = (data_t*)q.enqueueMapBuffer(buffer_ch_7, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err));
+ CHECK_MSG(err, ch_8 = (data_t*)q.enqueueMapBuffer(buffer_ch_8, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err));
+ CHECK_MSG(err, ch_9 = (data_t*)q.enqueueMapBuffer(buffer_ch_9, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err));
+ CHECK_MSG(err, ch_10 = (data_t*)q.enqueueMapBuffer(buffer_ch_10, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err));
+ CHECK_MSG(err, ch_11 = (data_t*)q.enqueueMapBuffer(buffer_ch_11, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err));
+ CHECK_MSG(err, ch_12 = (data_t*)q.enqueueMapBuffer(buffer_ch_12, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err));
+ CHECK_MSG(err, ch_13 = (data_t*)q.enqueueMapBuffer(buffer_ch_13, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err));
+ CHECK_MSG(err, ch_14 = (data_t*)q.enqueueMapBuffer(buffer_ch_14, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err));
+ CHECK_MSG(err, ch_15 = (data_t*)q.enqueueMapBuffer(buffer_ch_15, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err));
+ CHECK_MSG(err, ch_16 = (data_t*)q.enqueueMapBuffer(buffer_ch_16, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err));
+ CHECK_MSG(err, ch_17 = (data_t*)q.enqueueMapBuffer(buffer_ch_17, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err));
+ CHECK_MSG(err, ch_18 = (data_t*)q.enqueueMapBuffer(buffer_ch_18, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err));
+ CHECK_MSG(err, ch_19 = (data_t*)q.enqueueMapBuffer(buffer_ch_19, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err));
+ CHECK_MSG(err, ch_20 = (data_t*)q.enqueueMapBuffer(buffer_ch_20, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err));
+ CHECK_MSG(err, ch_21 = (data_t*)q.enqueueMapBuffer(buffer_ch_21, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err));
+ CHECK_MSG(err, ch_22 = (data_t*)q.enqueueMapBuffer(buffer_ch_22, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err));
+
+
+ // Initialize input data
+ for (int i = 0; i < DATA_SIZE; i++) { ch_0[i] = 0 ^ i; }
+ for (int i = 0; i < DATA_SIZE; i++) { ch_1[i] = 1 ^ i; }
+ for (int i = 0; i < DATA_SIZE; i++) { ch_2[i] = 2 ^ i; }
+ for (int i = 0; i < DATA_SIZE; i++) { ch_3[i] = 3 ^ i; }
+ for (int i = 0; i < DATA_SIZE; i++) { ch_4[i] = 4 ^ i; }
+ for (int i = 0; i < DATA_SIZE; i++) { ch_5[i] = 5 ^ i; }
+ for (int i = 0; i < DATA_SIZE; i++) { ch_6[i] = 6 ^ i; }
+ for (int i = 0; i < DATA_SIZE; i++) { ch_7[i] = 7 ^ i; }
+ for (int i = 0; i < DATA_SIZE; i++) { ch_8[i] = 8 ^ i; }
+ for (int i = 0; i < DATA_SIZE; i++) { ch_9[i] = 9 ^ i; }
+ for (int i = 0; i < DATA_SIZE; i++) { ch_10[i] = 10 ^ i; }
+ for (int i = 0; i < DATA_SIZE; i++) { ch_11[i] = 11 ^ i; }
+ for (int i = 0; i < DATA_SIZE; i++) { ch_12[i] = 12 ^ i; }
+ for (int i = 0; i < DATA_SIZE; i++) { ch_13[i] = 13 ^ i; }
+ for (int i = 0; i < DATA_SIZE; i++) { ch_14[i] = 14 ^ i; }
+ for (int i = 0; i < DATA_SIZE; i++) { ch_15[i] = 15 ^ i; }
+ for (int i = 0; i < DATA_SIZE; i++) { ch_16[i] = 16 ^ i; }
+ for (int i = 0; i < DATA_SIZE; i++) { ch_17[i] = 17 ^ i; }
+ for (int i = 0; i < DATA_SIZE; i++) { ch_18[i] = 18 ^ i; }
+ for (int i = 0; i < DATA_SIZE; i++) { ch_19[i] = 19 ^ i; }
+ for (int i = 0; i < DATA_SIZE; i++) { ch_20[i] = 20 ^ i; }
+ for (int i = 0; i < DATA_SIZE; i++) { ch_21[i] = 21 ^ i; }
+ for (int i = 0; i < DATA_SIZE; i++) { ch_22[i] = 22 ^ i; }
+
+ CHECK_MSG(err, err = bandwidth23.setArg(0, buffer_ch_0));
+ CHECK_MSG(err, err = bandwidth23.setArg(1, buffer_ch_1));
+ CHECK_MSG(err, err = bandwidth23.setArg(2, buffer_ch_2));
+ CHECK_MSG(err, err = bandwidth23.setArg(3, buffer_ch_3));
+ CHECK_MSG(err, err = bandwidth23.setArg(4, buffer_ch_4));
+ CHECK_MSG(err, err = bandwidth23.setArg(5, buffer_ch_5));
+ CHECK_MSG(err, err = bandwidth23.setArg(6, buffer_ch_6));
+ CHECK_MSG(err, err = bandwidth23.setArg(7, buffer_ch_7));
+ CHECK_MSG(err, err = bandwidth23.setArg(8, buffer_ch_8));
+ CHECK_MSG(err, err = bandwidth23.setArg(9, buffer_ch_9));
+ CHECK_MSG(err, err = bandwidth23.setArg(10, buffer_ch_10));
+ CHECK_MSG(err, err = bandwidth23.setArg(11, buffer_ch_11));
+ CHECK_MSG(err, err = bandwidth23.setArg(12, buffer_ch_12));
+ CHECK_MSG(err, err = bandwidth23.setArg(13, buffer_ch_13));
+ CHECK_MSG(err, err = bandwidth23.setArg(14, buffer_ch_14));
+ CHECK_MSG(err, err = bandwidth23.setArg(15, buffer_ch_15));
+ CHECK_MSG(err, err = bandwidth23.setArg(16, buffer_ch_16));
+ CHECK_MSG(err, err = bandwidth23.setArg(17, buffer_ch_17));
+ CHECK_MSG(err, err = bandwidth23.setArg(18, buffer_ch_18));
+ CHECK_MSG(err, err = bandwidth23.setArg(19, buffer_ch_19));
+ CHECK_MSG(err, err = bandwidth23.setArg(20, buffer_ch_20));
+ CHECK_MSG(err, err = bandwidth23.setArg(21, buffer_ch_21));
+ CHECK_MSG(err, err = bandwidth23.setArg(22, buffer_ch_22));
+ CHECK_MSG(err, err = bandwidth23.setArg(23, DATA_SIZE));
+
+
+ // Data will be migrated to device global memory
+ CHECK_MSG(err, err = q.enqueueMigrateMemObjects({buffer_ch_0, buffer_ch_1, buffer_ch_2, buffer_ch_3, buffer_ch_4, buffer_ch_5, buffer_ch_6, buffer_ch_7, buffer_ch_8, buffer_ch_9, buffer_ch_10, buffer_ch_11, buffer_ch_12, buffer_ch_13, buffer_ch_14, buffer_ch_15, buffer_ch_16, buffer_ch_17, buffer_ch_18, buffer_ch_19, buffer_ch_20, buffer_ch_21, buffer_ch_22}, 0 /* 0 means from host*/));
+
+ // Launch the bandwidth23 kernel
+ CHECK_MSG(err, err = q.enqueueTask(bandwidth23));
+
+ // Migrate the result data back to host memory
+ CHECK_MSG(err, q.enqueueMigrateMemObjects({buffer_ch_0, buffer_ch_1, buffer_ch_2, buffer_ch_3, buffer_ch_4, buffer_ch_5, buffer_ch_6, buffer_ch_7, buffer_ch_8, buffer_ch_9, buffer_ch_10, buffer_ch_11, buffer_ch_12, buffer_ch_13, buffer_ch_14, buffer_ch_15, buffer_ch_16, buffer_ch_17, buffer_ch_18, buffer_ch_19, buffer_ch_20, buffer_ch_21, buffer_ch_22}, CL_MIGRATE_MEM_OBJECT_HOST));
+
+ // Wait for all the commands to complete
+ CHECK_MSG(err, q.finish());
+
+ // Verify the result
+ int match = 0;
+ for (int i = 0; i < DATA_SIZE; i++) { if(ch_0[i] != ((0 ^ i))<<1) match++; }
+ for (int i = 0; i < DATA_SIZE; i++) { if(ch_1[i] != ((1 ^ i))<<1) match++; }
+ for (int i = 0; i < DATA_SIZE; i++) { if(ch_2[i] != ((2 ^ i))<<1) match++; }
+ for (int i = 0; i < DATA_SIZE; i++) { if(ch_3[i] != ((3 ^ i))<<1) match++; }
+ for (int i = 0; i < DATA_SIZE; i++) { if(ch_4[i] != ((4 ^ i))<<1) match++; }
+ for (int i = 0; i < DATA_SIZE; i++) { if(ch_5[i] != ((5 ^ i))<<1) match++; }
+ for (int i = 0; i < DATA_SIZE; i++) { if(ch_6[i] != ((6 ^ i))<<1) match++; }
+ for (int i = 0; i < DATA_SIZE; i++) { if(ch_7[i] != ((7 ^ i))<<1) match++; }
+ for (int i = 0; i < DATA_SIZE; i++) { if(ch_8[i] != ((8 ^ i))<<1) match++; }
+ for (int i = 0; i < DATA_SIZE; i++) { if(ch_9[i] != ((9 ^ i))<<1) match++; }
+ for (int i = 0; i < DATA_SIZE; i++) { if(ch_10[i] != ((10 ^ i))<<1) match++; }
+ for (int i = 0; i < DATA_SIZE; i++) { if(ch_11[i] != ((11 ^ i))<<1) match++; }
+ for (int i = 0; i < DATA_SIZE; i++) { if(ch_12[i] != ((12 ^ i))<<1) match++; }
+ for (int i = 0; i < DATA_SIZE; i++) { if(ch_13[i] != ((13 ^ i))<<1) match++; }
+ for (int i = 0; i < DATA_SIZE; i++) { if(ch_14[i] != ((14 ^ i))<<1) match++; }
+ for (int i = 0; i < DATA_SIZE; i++) { if(ch_15[i] != ((15 ^ i))<<1) match++; }
+ for (int i = 0; i < DATA_SIZE; i++) { if(ch_16[i] != ((16 ^ i))<<1) match++; }
+ for (int i = 0; i < DATA_SIZE; i++) { if(ch_17[i] != ((17 ^ i))<<1) match++; }
+ for (int i = 0; i < DATA_SIZE; i++) { if(ch_18[i] != ((18 ^ i))<<1) match++; }
+ for (int i = 0; i < DATA_SIZE; i++) { if(ch_19[i] != ((19 ^ i))<<1) match++; }
+ for (int i = 0; i < DATA_SIZE; i++) { if(ch_20[i] != ((20 ^ i))<<1) match++; }
+ for (int i = 0; i < DATA_SIZE; i++) { if(ch_21[i] != ((21 ^ i))<<1) match++; }
+ for (int i = 0; i < DATA_SIZE; i++) { if(ch_22[i] != ((22 ^ i))<<1) match++; }
+
+
+ CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_0, ch_0));
+ CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_1, ch_1));
+ CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_2, ch_2));
+ CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_3, ch_3));
+ CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_4, ch_4));
+ CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_5, ch_5));
+ CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_6, ch_6));
+ CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_7, ch_7));
+ CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_8, ch_8));
+ CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_9, ch_9));
+ CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_10, ch_10));
+ CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_11, ch_11));
+ CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_12, ch_12));
+ CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_13, ch_13));
+ CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_14, ch_14));
+ CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_15, ch_15));
+ CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_16, ch_16));
+ CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_17, ch_17));
+ CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_18, ch_18));
+ CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_19, ch_19));
+ CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_20, ch_20));
+ CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_21, ch_21));
+ CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_22, ch_22));
+
+ CHECK_MSG(err, err = q.finish());
+
+ if (match == 0) {
+ std::cout << "TEST PASSED!" << std::endl;
+ } else {
+ std::cout << match << " TEST FAILED!" << std::endl;
+ }
+ return (match ? EXIT_FAILURE : EXIT_SUCCESS);
+}
diff --git a/benchmarks/vitis_flow/bandwidth23/design/vck5000.cfg b/benchmarks/vitis_flow/bandwidth23/design/vck5000.cfg
new file mode 100644
index 00000000..cf375c2d
--- /dev/null
+++ b/benchmarks/vitis_flow/bandwidth23/design/vck5000.cfg
@@ -0,0 +1,27 @@
+platform=xilinx_vck5000_gen4x8_qdma_2_202220_1
+
+[connectivity]
+
+sp = bandwidth23.m_axi_ch_0:MC_NOC0
+sp = bandwidth23.m_axi_ch_1:MC_NOC0
+sp = bandwidth23.m_axi_ch_2:MC_NOC0
+sp = bandwidth23.m_axi_ch_3:MC_NOC0
+sp = bandwidth23.m_axi_ch_4:MC_NOC0
+sp = bandwidth23.m_axi_ch_5:MC_NOC0
+sp = bandwidth23.m_axi_ch_6:MC_NOC0
+sp = bandwidth23.m_axi_ch_7:MC_NOC0
+sp = bandwidth23.m_axi_ch_8:MC_NOC0
+sp = bandwidth23.m_axi_ch_9:MC_NOC0
+sp = bandwidth23.m_axi_ch_10:MC_NOC0
+sp = bandwidth23.m_axi_ch_11:MC_NOC0
+sp = bandwidth23.m_axi_ch_12:MC_NOC0
+sp = bandwidth23.m_axi_ch_13:MC_NOC0
+sp = bandwidth23.m_axi_ch_14:MC_NOC0
+sp = bandwidth23.m_axi_ch_15:MC_NOC0
+sp = bandwidth23.m_axi_ch_16:MC_NOC0
+sp = bandwidth23.m_axi_ch_17:MC_NOC0
+sp = bandwidth23.m_axi_ch_18:MC_NOC0
+sp = bandwidth23.m_axi_ch_19:MC_NOC0
+sp = bandwidth23.m_axi_ch_20:MC_NOC0
+sp = bandwidth23.m_axi_ch_21:MC_NOC0
+sp = bandwidth23.m_axi_ch_22:MC_NOC0
diff --git a/benchmarks/vitis_flow/bandwidth23/run_u50.py b/benchmarks/vitis_flow/bandwidth23/run_u50.py
new file mode 100644
index 00000000..34aece07
--- /dev/null
+++ b/benchmarks/vitis_flow/bandwidth23/run_u50.py
@@ -0,0 +1,40 @@
+"""Getting Started: bandwidth23 in the Vitis flow
+
+This script demonstrates how to optimize a bandwidth23 design in
+a Vitis object file. In this example, the object file is generated from
+Vitis HLS.
+"""
+
+__copyright__ = """
+Copyright (c) 2024 RapidStream Design Automation, Inc. and contributors. All rights reserved.
+The contributor(s) of this file has/have agreed to the RapidStream Contributor License Agreement.
+"""
+
+from rapidstream import get_u50_vitis_device_factory, RapidStreamVitis
+import os
+
+CURR_DIR = os.path.dirname(os.path.abspath(__file__))
+
+# RapidStreamVitis is used for the ".xo" files generated by `v++` or Vitis HLS.
+# Create a RapidStream project in the "build" directory:
+rs = RapidStreamVitis(f"{CURR_DIR}/build")
+
+# Use the "xilinx_u50_gen3x16_xdma_5_202210_1" platform as the device:
+u50_factory = get_u50_vitis_device_factory("xilinx_u50_gen3x16_xdma_5_202210_1")
+rs.set_virtual_device(u50_factory.generate_virtual_device())
+
+# Add the design object file (".xo") to the project:
+rs.add_xo_file(f"{CURR_DIR}/build/bandwidth23.xo")
+
+# Specify the Vitis platform and connectivity configuration:
+rs.set_vitis_platform("xilinx_u50_gen3x16_xdma_5_202210_1")
+rs.set_vitis_connectivity_config(f"{CURR_DIR}/design/link_config_hbm.ini")
+
+# Set the clock target for the design:
+rs.add_clock("ap_clk", period_ns=3)
+
+# Assign all kernel ports to the specified device slot:
+rs.assign_port_to_region(".*", "SLOT_X1Y0:SLOT_X1Y0")
+
+# Start the RapidStream optimization process:
+rs.run_dse()