diff --git a/benchmarks/tapa_flow/bandwidth23/Makefile b/benchmarks/tapa_flow/bandwidth23/Makefile index 441f8a63..15591519 100644 --- a/benchmarks/tapa_flow/bandwidth23/Makefile +++ b/benchmarks/tapa_flow/bandwidth23/Makefile @@ -8,11 +8,14 @@ SRC_DIR := $(CURDIR)/design AB_CONFIG := $(CURDIR)/design/config/$(notdir $(RS_SCRIPT))/ab_config.json IMPL_CONFIG := $(CURDIR)/design/config/$(notdir $(RS_SCRIPT))/impl_config.json LINK_CONFIG := $(CURDIR)/design/config/$(notdir $(RS_SCRIPT))/link_config.ini +FIX_NOC_TCL := $(CURDIR)/design/config/$(notdir $(RS_SCRIPT))/fix_noc.tcl PLATFORM := xilinx_vck5000_gen4x8_qdma_2_202220_1 PART_NUM := xcvc1902-vsvd1760-2MP-e-S GRP_UTIL := $(ROOT_DIR)/common/util/get_group.py TEMP_DIR := $(CURDIR)/build/$(notdir $(RS_SCRIPT)) -RS_TARGET := $(TEMP_DIR)/dse/solution_0/vitis_run_hw/$(KERNEL_NAME)_$(PLATFORM).xclbin +SOLUTION_DIR := $(TEMP_DIR)/dse/solution_0 +RS_XO := $(SOLUTION_DIR)/updated.xo +RS_TARGET := $(SOLUTION_DIR)/$(KERNEL_NAME)_$(PLATFORM).xsa BUILD_LOG := $(TEMP_DIR)/build.json SUCCESS := "Build Successful" TIMING_RPT := impl_1_hw_bb_locked_timing_summary_routed.rpt @@ -27,23 +30,87 @@ KERNEL_XO := $(TEMP_DIR)/$(KERNEL_NAME).xo KERNEL_XCLBIN := $(TEMP_DIR)/$(KERNEL_NAME).xclbin KERNEL_XSA := $(TEMP_DIR)/$(KERNEL_NAME).xsa TARGET := hw - -all: $(RS_TARGET) - cd $(RSPATH) && $(RSPYTHON) $(SLACK_GETTER) -d $(TEMP_DIR) -i $(TIMING_RPT) -o $(BUILD_LOG) -c clk_kernel_00_unbuffered_net -p 3.333 - @echo $(SUCCESS) - -$(RS_TARGET):$(KERNEL_XO) $(DEVICE_CONFIG) +TARGET_FREQUENCY := 300 +PLACEMENT_STRATEGY := Explore +STRATEGY := Explore +VIVADO_PRJ_DIR := $(SOLUTION_DIR)/$(KERNEL_NAME)_$(PLATFORM).temp/link/vivado/vpl/prj +RS_FIX_NOC_TCL := $(VIVADO_PRJ_DIR)/fix_noc.tcl +RS_SYN := $(VIVADO_PRJ_DIR)/prj.runs/my_rm_synth_1/ulp_inst_0.dcp + + +all:$(RS_TARGET) + @echo $< + +#$(RS_TARGET) +#cd $(RSPATH) && $(RSPYTHON) $(SLACK_GETTER) -d $(TEMP_DIR) -i $(TIMING_RPT) -o $(BUILD_LOG) -c clk_kernel_00_unbuffered_net -p 3.333 +#@echo 
$(SUCCESS) + +$(RS_TARGET):$(RS_SYN) + cd $(SOLUTION_DIR) && v++ ${DEBUG} \ + --link \ + --kernel $(KERNEL_NAME) \ + --platform $(PLATFORM) \ + --target ${TARGET} \ + --report_level 2 \ + --temp_dir "$(SOLUTION_DIR)/$(KERNEL_NAME)_$(PLATFORM).temp" \ + --optimize 3 \ + --connectivity.nk $(KERNEL_NAME):1:$(KERNEL_NAME) \ + --save-temps \ + --from_step vpl.impl \ + $(RS_XO) \ + --vivado.synth.jobs $(shell nproc) \ + --vivado.prop=run.impl_1.STEPS.PHYS_OPT_DESIGN.IS_ENABLED=1 \ + --vivado.prop=run.impl_1.STEPS.OPT_DESIGN.ARGS.DIRECTIVE=$(STRATEGY) \ + --vivado.prop=run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=$(PLACEMENT_STRATEGY) \ + --vivado.prop=run.impl_1.STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE=$(STRATEGY) \ + --vivado.prop=run.impl_1.STEPS.PLACE_DESIGN.TCL.PRE=$(SOLUTION_DIR)/xdc/floorplan.xdc \ + --config $(LINK_CONFIG) \ + --remote_ip_cache /var/tmp/remote_ip_cache \ + --output $(RS_TARGET) \ + --clock.defaultFreqHz $(TARGET_FREQUENCY)000000 + +$(RS_SYN): $(RS_FIX_NOC_TCL) + cd $(VIVADO_PRJ_DIR) && vivado -mode batch -source $(RS_FIX_NOC_TCL) -tclargs $(VIVADO_PRJ_DIR) + +$(RS_FIX_NOC_TCL): $(RS_XO) + cd $(dir $<) && v++ ${DEBUG} \ + --link \ + --kernel $(KERNEL_NAME) \ + --platform $(PLATFORM) \ + --target ${TARGET} \ + --report_level 2 \ + --temp_dir "$(dir $<)/$(KERNEL_NAME)_$(PLATFORM).temp" \ + --optimize 3 \ + --connectivity.nk $(KERNEL_NAME):1:$(KERNEL_NAME) \ + --save-temps \ + --to_step vpl.synth \ + $< \ + --vivado.synth.jobs $(shell nproc) \ + --vivado.prop=run.impl_1.STEPS.PHYS_OPT_DESIGN.IS_ENABLED=1 \ + --vivado.prop=run.impl_1.STEPS.OPT_DESIGN.ARGS.DIRECTIVE=$(STRATEGY) \ + --vivado.prop=run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=$(PLACEMENT_STRATEGY) \ + --vivado.prop=run.impl_1.STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE=$(STRATEGY) \ + --vivado.prop=run.impl_1.STEPS.PLACE_DESIGN.TCL.PRE=$(dir $<)/xdc/floorplan.xdc \ + --config $(LINK_CONFIG) \ + --remote_ip_cache /var/tmp/remote_ip_cache \ + --output $(RS_TARGET) \ + --clock.defaultFreqHz 
$(TARGET_FREQUENCY)000000 + cp $(FIX_NOC_TCL) $@ + + +# --run-impl +$(RS_XO):$(KERNEL_XO) $(DEVICE_CONFIG) mkdir -p $(TEMP_DIR) cd $(RSPATH) && $(RSXX)-tapaopt \ --work-dir $(TEMP_DIR) \ --tapa-xo-path $< \ --device-config $(DEVICE_CONFIG) \ --floorplan-config $(AB_CONFIG) \ - --single-reg \ - --run-impl \ --implementation-config $(IMPL_CONFIG) \ --connectivity-ini $(LINK_CONFIG) +device:$(DEVICE_CONFIG) + $(DEVICE_CONFIG):$(AB_CONFIG) mkdir -p $(TEMP_DIR) cd $(RSPATH) && $(RSPYTHON) $(RS_SCRIPT) @@ -69,10 +136,10 @@ $(KERNEL_XCLBIN): $(KERNEL_XSA) $(KERNEL_XSA): $(KERNEL_XO) cd $(TEMP_DIR) && v++ -l -t ${TARGET} \ --connectivity.nk $(KERNEL_NAME):1:$(KERNEL_NAME) \ - --config $(SRC_DIR)/vck5000.cfg \ + --config $(LINK_CONFIG) \ --save-temps \ --temp_dir $(TEMP_DIR) \ - --clock.defaultFreqHz 250000000 \ + --clock.defaultFreqHz $(TARGET_FREQUENCY)000000 \ --vivado.synth.jobs 16 \ $< -o $@ @@ -82,7 +149,7 @@ $(KERNEL_XO):$(SRC_DIR)/$(KERNEL_NAME).cpp mkdir -p $(TEMP_DIR) cd $(TEMP_DIR) && tapa compile \ --top $(KERNEL_NAME) \ - --part-num xcu55c-fsvh2892-2L-e \ + --part-num $(PART_NUM) \ --clock-period 3.33 \ -o $(KERNEL_NAME).xo \ -f $< \ diff --git a/benchmarks/tapa_flow/bandwidth23/README.md b/benchmarks/tapa_flow/bandwidth23/README.md index 54f2286e..bf45ced9 100644 --- a/benchmarks/tapa_flow/bandwidth23/README.md +++ b/benchmarks/tapa_flow/bandwidth23/README.md @@ -5,39 +5,74 @@ The contributor(s) of this file has/have agreed to the RapidStream Contributor L RapidStream Logo -# TAPA Flow: ORC Decoder +# TAPA Flow: Bandwidth23 ## Introduction +The AMD Versal device introduces a revolutionary hardware architecture for FPGA developers. +One standout feature is the distributed NoC AXI master/slave ports spread across the entire chip, +which is especially beneficial for kernels requiring numerous ports. 
For instance, in Alveo devices, +utilizing HBM bandwidth at the chip's base requires routing 32 AXI ports, each 512 bits wide, close +to the HBM—posing significant implementation challenges. With the Versal architecture, however, +these 32 AXI ports can be distributed across different regions of the chip, alleviating local routing congestion. -In this recipe, we demonstrate how to use RapidStream to optimize TAPA projects. The basic steps include: +Nevertheless, routing these ports across the chip introduces the challenge of connecting them +with long wires. To optimize the effectiveness of the NoC architecture, RapidStream can be employed +to automatically insert pipeline registers between distributed logic. This approach not only achieves +high bandwidth through multiple AXI ports but also mitigates the local routing congestion seen in +previous architectures like the Alveo device. -- Compile the HLS C++ code into a Vitis-compatible .xo file using TAPA. -- Optimize the .xo file with RapidStream to obtain an optimized .xo file. -- Use Vitis to compile the optimized .xo file into an .xclbin file for FPGA deployment. + +In this recipe, we demonstrate how to leverage RapidStream to optimize a TAPA project that +includes a kernel with 23 AXI ports, each 512 bits wide. The process involves the following key steps: + +- Compile the TAPA C++ code into a Vitis-compatible `.xo` file using TAPA. +- Optimize the generated `.xo` file with RapidStream to produce an enhanced `.xo` file. +- Use Vitis to compile the optimized `.xo` file into an .xclbin file for FPGA deployment. ## Tutorial -### Step 1 (Done): Generate the Xilinx Object File (`.xo`) +### Step 1 : Generate the Xilinx Object File (`.xo`) + + +We utilize [Rapidstream-TAPA](https://github.com/rapidstream-org/rapidstream-tapa) to generate the `.xo` file. +The original C++ source files are located in design/src. To compile C++ to `.xo` using TAPA, we use the commands shown below. 
+For your convenience, you can also execute `make xo` command in the terminal supported by our [Makefile](Makefile). + +We use [Rapidstream-TAPA](https://github.com/rapidstream-org/rapidstream-tapa) to generate the `.xo` file, +with the original C++ source files located in the [design](./design) directory. To compile the C++ code +into a `.xo` file using TAPA, follow the commands provided below. For convenience, +you can also execute the `make xo` command in the terminal, as supported by our Makefile. + +```bash +mkdir -p build/run_vck5000.py +cd build/run_vck5000.py && tapa compile \ +--top bandwidth23 \ +--part-num xcvc1902-vsvd1760-2MP-e-S \ +--clock-period 3.33 \ +-o bandwidth23.xo \ +-f design/bandwidth23.cpp \ +2>&1 | tee tapa.log +``` + +### Step 2: Define Virtual Device + +The VCK5000 device is equipped with 4x7 NMU512 and NSU512 ports across the chip (only NMU512 ports are shown). For our design, we focus solely on the FPGA fabric and not the AI Engine. We define four slots for the virtual device, each containing either six or eight NMU512 ports to connect internal logic to the DDR SRAM at the base. A Python-based script, [run_vck5000.py](./run_vck5000.py), is provided as a reference for defining the virtual device using the RapidStream API. + +VCK5000 Device + +You can run the `run_vck5000.py` script by invoking RapidStream as shown below, or simply type `make device` in the terminal. This will generate a `device.json` file, which outlines all the device features, including slot resources, slot locations, and more. -We utilize TAPA to generate the `.xo` file. If you have not installed TAPA, we've already compiled the C++ source to `.xo` using TAPA. The original C++ source files are located in design/src. The generated `.xo` file can be found at design/generated/data_decoding.xo. To compile C++ to `.xo` using TAPA, we use the script [design/run_tapa.sh](design/run_tapa.sh), with the detailed commands shown below. 
For your convenience, we have also backed up all the generated metadata by TAPA in the design/generated directory. ```bash -WORK_DIR=generated -tapac \ - --work-dir ${WORK_DIR} \ - --top data_decoding \ - --part-num xcu280-fsvh2892-2L-e \ - --clock-period 3.33 \ - -o ${WORK_DIR}/data_decoding.xo \ - --connectivity config/link_config.ini \ - src/data_decoder.cpp \ - 2>&1 | tee tapa.log +rapidstream run_vck5000.py ``` + ### Step 2: Use Rapidstream to Optimize `.xo` Design + The RapidStream flow conducts design space exploration and generates solutions by taking all TAPA-generated `.xo` file as the input. The RapidStream flow for TAPA requires the following key inputs: @@ -101,7 +136,7 @@ rs.assign_port_to_region("ap_rst_n", left_slot) rs.assign_port_to_region("interrupt", left_slot) ``` -For the complete detail, please refore to [./run.py](./run.py) file. Call the rapidstream by launching the command below or `make all`. +For the complete detail, please refore to [./run_vck5000.py](./run_vck5000.py) file. Call the rapidstream by launching the command below or `make all`. 
```bash rapidstream run.py diff --git a/benchmarks/tapa_flow/bandwidth23/design/config/run_vck5000.py/ab_config.json b/benchmarks/tapa_flow/bandwidth23/design/config/run_vck5000.py/ab_config.json index b9325669..3e2d0fe1 100644 --- a/benchmarks/tapa_flow/bandwidth23/design/config/run_vck5000.py/ab_config.json +++ b/benchmarks/tapa_flow/bandwidth23/design/config/run_vck5000.py/ab_config.json @@ -3,32 +3,32 @@ "dse_range_min": 0.7, "partition_strategy": "flat", "port_pre_assignments": { - ".*ch_0_.*": "SLOT_X0Y0:SLOT_X0Y0", - ".*ch_10_.*": "SLOT_X1Y0:SLOT_X1Y0", - ".*ch_11_.*": "SLOT_X1Y0:SLOT_X1Y0", - ".*ch_12_.*": "SLOT_X1Y0:SLOT_X1Y0", - ".*ch_13_.*": "SLOT_X1Y0:SLOT_X1Y0", - ".*ch_14_.*": "SLOT_X1Y0:SLOT_X1Y0", - ".*ch_15_.*": "SLOT_X1Y0:SLOT_X1Y0", - ".*ch_16_.*": "SLOT_X1Y0:SLOT_X1Y0", - ".*ch_17_.*": "SLOT_X1Y1:SLOT_X1Y1", - ".*ch_18_.*": "SLOT_X1Y1:SLOT_X1Y1", - ".*ch_19_.*": "SLOT_X1Y1:SLOT_X1Y1", - ".*ch_1_.*": "SLOT_X0Y0:SLOT_X0Y0", - ".*ch_20_.*": "SLOT_X1Y1:SLOT_X1Y1", - ".*ch_21_.*": "SLOT_X1Y1:SLOT_X1Y1", - ".*ch_22_.*": "SLOT_X1Y1:SLOT_X1Y1", - ".*ch_2_.*": "SLOT_X0Y0:SLOT_X0Y0", - ".*ch_3_.*": "SLOT_X0Y0:SLOT_X0Y0", - ".*ch_4_.*": "SLOT_X0Y0:SLOT_X0Y0", - ".*ch_5_.*": "SLOT_X0Y0:SLOT_X0Y0", - ".*ch_6_.*": "SLOT_X0Y0:SLOT_X0Y0", - ".*ch_7_.*": "SLOT_X0Y1:SLOT_X0Y1", - ".*ch_8_.*": "SLOT_X0Y1:SLOT_X0Y1", - ".*ch_9_.*": "SLOT_X0Y1:SLOT_X0Y1", - "ap_clk": "SLOT_X0Y0:SLOT_X0Y0", - "ap_rst_n": "SLOT_X0Y0:SLOT_X0Y0", - "interrupt": "SLOT_X0Y0:SLOT_X0Y0", - "s_axi_control_.*": "SLOT_X0Y0:SLOT_X0Y0" + ".*ch_0_.*": "NMU512_X0Y0", + ".*ch_10_.*": "NMU512_X1Y6", + ".*ch_11_.*": "NMU512_X2Y0", + ".*ch_12_.*": "NMU512_X2Y1", + ".*ch_13_.*": "NMU512_X2Y2", + ".*ch_14_.*": "NMU512_X2Y3", + ".*ch_15_.*": "NMU512_X3Y0", + ".*ch_16_.*": "NMU512_X3Y1", + ".*ch_17_.*": "NMU512_X3Y2", + ".*ch_18_.*": "NMU512_X2Y4", + ".*ch_19_.*": "NMU512_X2Y5", + ".*ch_1_.*": "NMU512_X0Y1", + ".*ch_20_.*": "NMU512_X2Y6", + ".*ch_21_.*": "NMU512_X3Y5", + ".*ch_22_.*": "NMU512_X3Y6", + 
".*ch_2_.*": "NMU512_X0Y2", + ".*ch_3_.*": "NMU512_X0Y3", + ".*ch_4_.*": "NMU512_X1Y0", + ".*ch_5_.*": "NMU512_X1Y1", + ".*ch_6_.*": "NMU512_X1Y2", + ".*ch_7_.*": "NMU512_X0Y4", + ".*ch_8_.*": "NMU512_X1Y4", + ".*ch_9_.*": "NMU512_X1Y5", + "ap_clk": "CLK_RST", + "ap_rst_n": "CLK_RST", + "interrupt": "S_AXI_CONTROL", + "s_axi_control_.*": "S_AXI_CONTROL" } } diff --git a/benchmarks/tapa_flow/bandwidth23/design/config/run_vck5000.py/fix_noc.tcl b/benchmarks/tapa_flow/bandwidth23/design/config/run_vck5000.py/fix_noc.tcl new file mode 100644 index 00000000..19d36733 --- /dev/null +++ b/benchmarks/tapa_flow/bandwidth23/design/config/run_vck5000.py/fix_noc.tcl @@ -0,0 +1,43 @@ + + +if {${argc} != 1} { + puts stderr "Should -tclargs
 <vivado_prj_dir>. Too few arguments. Exiting."
+	exit 1
+}
+
+set PRE_PATH   "[lindex $argv 0]"
+
+open_project ${PRE_PATH}/prj.xpr
+open_bd_design ${PRE_PATH}/prj.srcs/my_rm/bd/ulp_inst_0/ulp_inst_0.bd
+
+set_property -dict [list CONFIG.PHYSICAL_LOC {NOC_NMU512_X0Y0}] [get_bd_intf_pins /axi_noc_kernel0/S00_AXI]
+set_property -dict [list CONFIG.PHYSICAL_LOC {NOC_NMU512_X0Y1}] [get_bd_intf_pins /axi_noc_kernel0/S01_AXI]
+set_property -dict [list CONFIG.PHYSICAL_LOC {NOC_NMU512_X0Y2}] [get_bd_intf_pins /axi_noc_kernel0/S02_AXI]
+set_property -dict [list CONFIG.PHYSICAL_LOC {NOC_NMU512_X0Y3}] [get_bd_intf_pins /axi_noc_kernel0/S03_AXI]
+set_property -dict [list CONFIG.PHYSICAL_LOC {NOC_NMU512_X1Y0}] [get_bd_intf_pins /axi_noc_kernel0/S04_AXI]
+set_property -dict [list CONFIG.PHYSICAL_LOC {NOC_NMU512_X1Y1}] [get_bd_intf_pins /axi_noc_kernel0/S05_AXI]
+set_property -dict [list CONFIG.PHYSICAL_LOC {NOC_NMU512_X1Y2}] [get_bd_intf_pins /axi_noc_kernel0/S06_AXI]
+set_property -dict [list CONFIG.PHYSICAL_LOC {NOC_NMU512_X0Y4}] [get_bd_intf_pins /axi_noc_kernel0/S07_AXI]
+set_property -dict [list CONFIG.PHYSICAL_LOC {NOC_NMU512_X1Y4}] [get_bd_intf_pins /axi_noc_kernel0/S08_AXI]
+set_property -dict [list CONFIG.PHYSICAL_LOC {NOC_NMU512_X1Y5}] [get_bd_intf_pins /axi_noc_kernel0/S09_AXI]
+set_property -dict [list CONFIG.PHYSICAL_LOC {NOC_NMU512_X1Y6}] [get_bd_intf_pins /axi_noc_kernel0/S10_AXI]
+set_property -dict [list CONFIG.PHYSICAL_LOC {NOC_NMU512_X2Y0}] [get_bd_intf_pins /axi_noc_kernel0/S11_AXI]
+set_property -dict [list CONFIG.PHYSICAL_LOC {NOC_NMU512_X2Y1}] [get_bd_intf_pins /axi_noc_kernel0/S12_AXI]
+set_property -dict [list CONFIG.PHYSICAL_LOC {NOC_NMU512_X2Y2}] [get_bd_intf_pins /axi_noc_kernel0/S13_AXI]
+set_property -dict [list CONFIG.PHYSICAL_LOC {NOC_NMU512_X2Y3}] [get_bd_intf_pins /axi_noc_kernel0/S14_AXI]
+set_property -dict [list CONFIG.PHYSICAL_LOC {NOC_NMU512_X3Y0}] [get_bd_intf_pins /axi_noc_kernel0/S15_AXI]
+set_property -dict [list CONFIG.PHYSICAL_LOC {NOC_NMU512_X3Y1}] [get_bd_intf_pins /axi_noc_kernel0/S16_AXI]
+set_property -dict [list CONFIG.PHYSICAL_LOC {NOC_NMU512_X3Y2}] [get_bd_intf_pins /axi_noc_kernel0/S17_AXI]
+set_property -dict [list CONFIG.PHYSICAL_LOC {NOC_NMU512_X2Y4}] [get_bd_intf_pins /axi_noc_kernel0/S18_AXI]
+set_property -dict [list CONFIG.PHYSICAL_LOC {NOC_NMU512_X2Y5}] [get_bd_intf_pins /axi_noc_kernel0/S19_AXI]
+set_property -dict [list CONFIG.PHYSICAL_LOC {NOC_NMU512_X2Y6}] [get_bd_intf_pins /axi_noc_kernel0/S20_AXI]
+set_property -dict [list CONFIG.PHYSICAL_LOC {NOC_NMU512_X3Y5}] [get_bd_intf_pins /axi_noc_kernel0/S21_AXI]
+set_property -dict [list CONFIG.PHYSICAL_LOC {NOC_NMU512_X3Y6}] [get_bd_intf_pins /axi_noc_kernel0/S22_AXI]
+
+
+save_bd_design
+validate_bd_design
+save_bd_design
+reset_run my_rm_synth_1
+launch_runs  my_rm_synth_1 -jobs 16
+wait_on_run  my_rm_synth_1
diff --git a/benchmarks/tapa_flow/bandwidth23/run_vck5000.py b/benchmarks/tapa_flow/bandwidth23/run_vck5000.py
index ae36f962..9b7274b9 100644
--- a/benchmarks/tapa_flow/bandwidth23/run_vck5000.py
+++ b/benchmarks/tapa_flow/bandwidth23/run_vck5000.py
@@ -29,14 +29,11 @@
     factory.set_slot_capacity(x, 1, north=11520)
 
     factory.set_slot_capacity(x, 1, south=11520)
-    # factory.set_slot_capacity(x, 2, south=11520)
 
 # Set W/E capacity
 for y in range(2):
     factory.set_slot_capacity(0, y, east=40320)
     factory.set_slot_capacity(1, y, west=40320)
-# factory.set_slot_capacity(0, 2, east=41178)
-# factory.set_slot_capacity(1, 2, west=41178)
 
 
 factory.set_platform_name(VITIS_PLATFORM)
diff --git a/benchmarks/tapa_flow/bandwidth4/Makefile b/benchmarks/tapa_flow/bandwidth4/Makefile
index 3f2761f9..d883c4f4 100644
--- a/benchmarks/tapa_flow/bandwidth4/Makefile
+++ b/benchmarks/tapa_flow/bandwidth4/Makefile
@@ -8,11 +8,14 @@ SRC_DIR		     := $(CURDIR)/design
 AB_CONFIG        := $(CURDIR)/design/config/$(notdir $(RS_SCRIPT))/ab_config.json
 IMPL_CONFIG      := $(CURDIR)/design/config/$(notdir $(RS_SCRIPT))/impl_config.json
 LINK_CONFIG      := $(CURDIR)/design/config/$(notdir $(RS_SCRIPT))/link_config.ini
+FIX_NOC_TCL      := $(CURDIR)/design/config/$(notdir $(RS_SCRIPT))/fix_noc.tcl
 PLATFORM         := xilinx_vck5000_gen4x8_qdma_2_202220_1
 PART_NUM         := xcvc1902-vsvd1760-2MP-e-S
 GRP_UTIL         := $(ROOT_DIR)/common/util/get_group.py
 TEMP_DIR         := $(CURDIR)/build/$(notdir $(RS_SCRIPT))
-RS_TARGET        := $(TEMP_DIR)/dse/solution_0/vitis_run_hw/$(KERNEL_NAME)_$(PLATFORM).xclbin
+SOLUTION_DIR     := $(TEMP_DIR)/dse/solution_0
+RS_XO            := $(SOLUTION_DIR)/updated.xo
+RS_TARGET        := $(SOLUTION_DIR)/$(KERNEL_NAME)_$(PLATFORM).xsa
 BUILD_LOG        := $(TEMP_DIR)/build.json
 SUCCESS          := "Build Successful"
 TIMING_RPT       := impl_1_hw_bb_locked_timing_summary_routed.rpt
@@ -27,20 +30,82 @@ KERNEL_XO        := $(TEMP_DIR)/$(KERNEL_NAME).xo
 KERNEL_XCLBIN    := $(TEMP_DIR)/$(KERNEL_NAME).xclbin
 KERNEL_XSA       := $(TEMP_DIR)/$(KERNEL_NAME).xsa
 TARGET		     := hw
+TARGET_FREQUENCY := 300
+PLACEMENT_STRATEGY := Explore
+STRATEGY           := Explore
+VIVADO_PRJ_DIR   := $(SOLUTION_DIR)/$(KERNEL_NAME)_$(PLATFORM).temp/link/vivado/vpl/prj
+RS_FIX_NOC_TCL   := $(VIVADO_PRJ_DIR)/fix_noc.tcl
+RS_SYN           := $(VIVADO_PRJ_DIR)/prj.runs/my_rm_synth_1/ulp_inst_0.dcp
+
+
+all:$(RS_TARGET)
+	@echo $<
+
+#$(RS_TARGET)
+#cd $(RSPATH) && $(RSPYTHON)	$(SLACK_GETTER) -d $(TEMP_DIR) -i $(TIMING_RPT) -o $(BUILD_LOG)  -c clk_kernel_00_unbuffered_net -p 3.333
+#@echo $(SUCCESS)
+
+$(RS_TARGET):$(RS_SYN)
+	cd $(SOLUTION_DIR) && v++ ${DEBUG} \
+	--link \
+	--kernel $(KERNEL_NAME) \
+	--platform $(PLATFORM) \
+	--target ${TARGET} \
+	--report_level 2 \
+	--temp_dir "$(SOLUTION_DIR)/$(KERNEL_NAME)_$(PLATFORM).temp" \
+	--optimize 3 \
+	--connectivity.nk $(KERNEL_NAME):1:$(KERNEL_NAME) \
+	--save-temps \
+	--from_step vpl.impl \
+	$(RS_XO) \
+	--vivado.synth.jobs $(shell nproc) \
+	--vivado.prop=run.impl_1.STEPS.PHYS_OPT_DESIGN.IS_ENABLED=1 \
+	--vivado.prop=run.impl_1.STEPS.OPT_DESIGN.ARGS.DIRECTIVE=$(STRATEGY) \
+	--vivado.prop=run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=$(PLACEMENT_STRATEGY) \
+	--vivado.prop=run.impl_1.STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE=$(STRATEGY) \
+	--vivado.prop=run.impl_1.STEPS.PLACE_DESIGN.TCL.PRE=$(SOLUTION_DIR)/xdc/floorplan.xdc \
+	--config $(LINK_CONFIG) \
+	--remote_ip_cache /var/tmp/remote_ip_cache \
+	--output $(RS_TARGET) \
+	--clock.defaultFreqHz $(TARGET_FREQUENCY)000000
+
+$(RS_SYN): $(RS_FIX_NOC_TCL)
+	cd $(VIVADO_PRJ_DIR) && vivado -mode batch -source $(RS_FIX_NOC_TCL) -tclargs $(VIVADO_PRJ_DIR)
+
+$(RS_FIX_NOC_TCL): $(RS_XO)
+	cd $(dir $<) && v++ ${DEBUG} \
+	--link \
+	--kernel $(KERNEL_NAME) \
+	--platform $(PLATFORM) \
+	--target ${TARGET} \
+	--report_level 2 \
+	--temp_dir "$(dir $<)/$(KERNEL_NAME)_$(PLATFORM).temp" \
+	--optimize 3 \
+	--connectivity.nk $(KERNEL_NAME):1:$(KERNEL_NAME) \
+	--save-temps \
+	--to_step vpl.synth \
+	$< \
+	--vivado.synth.jobs $(shell nproc) \
+	--vivado.prop=run.impl_1.STEPS.PHYS_OPT_DESIGN.IS_ENABLED=1 \
+	--vivado.prop=run.impl_1.STEPS.OPT_DESIGN.ARGS.DIRECTIVE=$(STRATEGY) \
+	--vivado.prop=run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=$(PLACEMENT_STRATEGY) \
+	--vivado.prop=run.impl_1.STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE=$(STRATEGY) \
+	--vivado.prop=run.impl_1.STEPS.PLACE_DESIGN.TCL.PRE=$(dir $<)/xdc/floorplan.xdc \
+	--config $(LINK_CONFIG) \
+	--remote_ip_cache /var/tmp/remote_ip_cache \
+	--output $(RS_TARGET) \
+	--clock.defaultFreqHz $(TARGET_FREQUENCY)000000
+	cp $(FIX_NOC_TCL) $@
 
-all: $(RS_TARGET)
-	cd $(RSPATH) && $(RSPYTHON)	$(SLACK_GETTER) -d $(TEMP_DIR) -i $(TIMING_RPT) -o $(BUILD_LOG)  -c clk_kernel_00_unbuffered_net -p 3.333
-	@echo $(SUCCESS)
 
 #   --run-impl
-$(RS_TARGET):$(KERNEL_XO) $(DEVICE_CONFIG)
+$(RS_XO):$(KERNEL_XO) $(DEVICE_CONFIG)
 	mkdir -p $(TEMP_DIR)
 	cd $(RSPATH) && $(RSXX)-tapaopt \
     --work-dir $(TEMP_DIR) \
     --tapa-xo-path $< \
     --device-config $(DEVICE_CONFIG) \
     --floorplan-config $(AB_CONFIG) \
-	--single-reg \
     --implementation-config $(IMPL_CONFIG) \
     --connectivity-ini $(LINK_CONFIG)
 
@@ -69,10 +134,10 @@ $(KERNEL_XCLBIN): $(KERNEL_XSA)
 $(KERNEL_XSA): $(KERNEL_XO)
 	cd $(TEMP_DIR) && v++ -l -t ${TARGET} \
 	--connectivity.nk $(KERNEL_NAME):1:$(KERNEL_NAME) \
-	--config $(SRC_DIR)/vck5000.cfg \
+	--config $(LINK_CONFIG) \
 	--save-temps \
 	--temp_dir $(TEMP_DIR) \
-	--clock.defaultFreqHz 250000000 \
+	--clock.defaultFreqHz $(TARGET_FREQUENCY)000000 \
 	--vivado.synth.jobs 16 \
 	$< -o $@
 
diff --git a/benchmarks/tapa_flow/bandwidth4/design/config/run_vck5000.py/ab_config.json b/benchmarks/tapa_flow/bandwidth4/design/config/run_vck5000.py/ab_config.json
index 264df902..ef31fc94 100644
--- a/benchmarks/tapa_flow/bandwidth4/design/config/run_vck5000.py/ab_config.json
+++ b/benchmarks/tapa_flow/bandwidth4/design/config/run_vck5000.py/ab_config.json
@@ -3,13 +3,13 @@
     "dse_range_min": 0.7,
     "partition_strategy": "flat",
     "port_pre_assignments": {
-        ".*ch_0_.*": "SLOT_X0Y0:SLOT_X0Y0",
-        ".*ch_1_.*": "SLOT_X0Y0:SLOT_X0Y0",
-        ".*ch_2_.*": "SLOT_X0Y0:SLOT_X0Y0",
-        ".*ch_3_.*": "SLOT_X0Y0:SLOT_X0Y0",
-        "ap_clk": "SLOT_X0Y0:SLOT_X0Y0",
-        "ap_rst_n": "SLOT_X0Y0:SLOT_X0Y0",
-        "interrupt": "SLOT_X0Y0:SLOT_X0Y0",
-        "s_axi_control_.*": "SLOT_X0Y0:SLOT_X0Y0"
+        ".*ch_0_.*": "NMU512_X0Y0",
+        ".*ch_1_.*": "NMU512_X2Y0",
+        ".*ch_2_.*": "NMU512_X0Y4",
+        ".*ch_3_.*": "NMU512_X2Y4",
+        "ap_clk": "CLK_RST",
+        "ap_rst_n": "CLK_RST",
+        "interrupt": "S_AXI_CONTROL",
+        "s_axi_control_.*": "S_AXI_CONTROL"
     }
 }
diff --git a/benchmarks/tapa_flow/bandwidth4/design/config/run_vck5000.py/fix_noc.tcl b/benchmarks/tapa_flow/bandwidth4/design/config/run_vck5000.py/fix_noc.tcl
new file mode 100644
index 00000000..70388db5
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth4/design/config/run_vck5000.py/fix_noc.tcl
@@ -0,0 +1,23 @@
+
+
+if {${argc} != 1} {
+	puts stderr "Should pass -tclargs <vivado_prj_dir>. Too few arguments. Exiting."
+	exit 1
+}
+
+set PRE_PATH   "[lindex $argv 0]"
+
+open_project ${PRE_PATH}/prj.xpr
+open_bd_design ${PRE_PATH}/prj.srcs/my_rm/bd/ulp_inst_0/ulp_inst_0.bd
+
+set_property -dict [list CONFIG.PHYSICAL_LOC {NOC_NMU512_X0Y0}] [get_bd_intf_pins /axi_noc_kernel0/S00_AXI]
+set_property -dict [list CONFIG.PHYSICAL_LOC {NOC_NMU512_X2Y0}] [get_bd_intf_pins /axi_noc_kernel0/S01_AXI]
+set_property -dict [list CONFIG.PHYSICAL_LOC {NOC_NMU512_X0Y4}] [get_bd_intf_pins /axi_noc_kernel0/S02_AXI]
+set_property -dict [list CONFIG.PHYSICAL_LOC {NOC_NMU512_X2Y4}] [get_bd_intf_pins /axi_noc_kernel0/S03_AXI]
+
+save_bd_design
+validate_bd_design
+save_bd_design
+reset_run my_rm_synth_1
+launch_runs  my_rm_synth_1 -jobs 16
+wait_on_run  my_rm_synth_1
diff --git a/benchmarks/tapa_flow/bandwidth4/run_vck5000.py b/benchmarks/tapa_flow/bandwidth4/run_vck5000.py
index ae36f962..77d49260 100644
--- a/benchmarks/tapa_flow/bandwidth4/run_vck5000.py
+++ b/benchmarks/tapa_flow/bandwidth4/run_vck5000.py
@@ -44,8 +44,42 @@
 
 factory.set_slot_pblock(0, 0, ["-add CLOCKREGION_X0Y1:CLOCKREGION_X4Y2"])
 factory.set_slot_pblock(1, 0, ["-add CLOCKREGION_X5Y1:CLOCKREGION_X9Y2"])
-factory.set_slot_pblock(0, 1, ["-add CLOCKREGION_X0Y3:CLOCKREGION_X4Y4"])
-factory.set_slot_pblock(1, 1, ["-add CLOCKREGION_X5Y3:CLOCKREGION_X9Y4"])
+
+
+factory.set_slot_pblock(
+    0,
+    1,
+    [
+        "-add SLICE_X0Y188:SLICE_X187Y327",
+        "-add DSP58_CPLX_X0Y94:DSP58_CPLX_X2Y163",
+        "-add DSP_X0Y94:DSP_X5Y163",
+        "-add IRI_QUAD_X0Y780:IRI_QUAD_X116Y1339",
+        "-add NOC_NMU512_X0Y4:NOC_NMU512_X1Y6",
+        "-add NOC_NSU512_X0Y4:NOC_NSU512_X1Y6",
+        "-add RAMB18_X0Y96:RAMB18_X5Y165",
+        "-add RAMB36_X0Y48:RAMB36_X5Y82",
+        "-add URAM288_X0Y48:URAM288_X2Y82",
+        "-add URAM_CAS_DLY_X0Y2:URAM_CAS_DLY_X2Y2",
+    ],
+)
+
+
+factory.set_slot_pblock(
+    1,
+    1,
+    [
+        "-add SLICE_X188Y188:SLICE_X359Y327",
+        "-add DSP58_CPLX_X3Y94:DSP58_CPLX_X5Y163",
+        "-add DSP_X6Y94:DSP_X11Y163",
+        "-add IRI_QUAD_X117Y780:IRI_QUAD_X224Y1339",
+        "-add NOC_NMU512_X2Y4:NOC_NMU512_X3Y6",
+        "-add NOC_NSU512_X2Y4:NOC_NSU512_X3Y6",
+        "-add RAMB18_X6Y96:RAMB18_X11Y165",
+        "-add RAMB36_X6Y48:RAMB36_X11Y82",
+        "-add URAM288_X3Y48:URAM288_X5Y82",
+        "-add URAM_CAS_DLY_X3Y2:URAM_CAS_DLY_X5Y2",
+    ],
+)
 
 
 # Vitis uses 4395 nets from SLR0 to SLR1
@@ -55,6 +89,22 @@
 # Vitis uses 4185 nets from SLR1 to SLR2
 # factory.set_slot_capacity(1, 1, south=11520 - 4185)
 
+# set hbm tags
+factory.set_slot_tags(
+    0,
+    0,
+    [f"NMU512_X{x}Y{y}" for x in range(0, 2) for y in range(0, 4)]
+    + ["S_AXI_CONTROL", "CLK_RST"],
+)
+factory.set_slot_tags(
+    1, 0, [f"NMU512_X{x}Y{y}" for x in range(2, 4) for y in range(0, 4)]
+)
+factory.set_slot_tags(
+    0, 1, [f"NMU512_X{x}Y{y}" for x in range(0, 2) for y in range(4, 7)]
+)
+factory.set_slot_tags(
+    1, 1, [f"NMU512_X{x}Y{y}" for x in range(2, 4) for y in range(4, 7)]
+)
 
 factory.extract_slot_resources()
 
diff --git a/benchmarks/vitis_flow/bandwidth4/Makefile b/benchmarks/vitis_flow/bandwidth4/Makefile
new file mode 100644
index 00000000..21573096
--- /dev/null
+++ b/benchmarks/vitis_flow/bandwidth4/Makefile
@@ -0,0 +1,119 @@
+# Copyright (c) 2024 RapidStream Design Automation, Inc. and contributors.  All rights reserved.
+# The contributor(s) of this file has/have agreed to the RapidStream Contributor License Agreement.
+
+ROOT_DIR         := $(shell git rev-parse --show-toplevel)
+GRP_UTIL         := $(ROOT_DIR)/common/util/get_group.py
+PLATFORM         := xilinx_vck5000_gen4x8_qdma_2_202220_1
+PART             := xcvc1902-vsvd1760-2MP-e-S
+LINK_FILE        := link_config_hbm.ini
+KERNEL_NAME      := bandwidth4
+HLSXX            := vitis_hls
+SRC_DIR          := $(CURDIR)/design
+RS_SCRIPT        := $(CURDIR)/run.py
+TEMP_DIR         := $(CURDIR)/build/$(notdir $(RS_SCRIPT))
+HOST			 := $(TEMP_DIR)/app.exe
+KERNEL_XO        := $(TEMP_DIR)/$(KERNEL_NAME).xo
+KERNEL_XSA       := $(TEMP_DIR)/$(KERNEL_NAME).xsa
+KERNEL_XCLBIN    := $(TEMP_DIR)/$(KERNEL_NAME).xclbin
+RS_XCLBIN        := $(TEMP_DIR)/dse/candidate_0/vitis_run_hw/$(KERNEL_NAME)_$(PLATFORM).xclbin
+CLK_PERIOD_NS    := 3
+TARGET           := hw
+HLS2RTL_TCL	     := $(ROOT_DIR)/common/tcl/hls2rtl.tcl
+GEN_XO           := 1
+
+BUILD_LOG        := $(TEMP_DIR)/build.json
+SUCCESS          := "Build Successful"
+TIMING_RPT       := impl_1_hw_bb_locked_timing_summary_routed.rpt
+SLACK_GETTER     := $(ROOT_DIR)/common/util/get_slack.py
+RSXX             := rapidstream
+
+
+
+
+all: $(RS_XCLBIN)
+	$(RSXX) $(SLACK_GETTER) -d $(TEMP_DIR) -i $(TIMING_RPT) -o $(BUILD_LOG)   -c clk_kernel_00_unbuffered_net -p 3.333
+	echo $(SUCCESS)
+
+$(RS_XCLBIN):$(KERNEL_XO)
+	$(RSXX) $(RS_SCRIPT)
+
+hw: $(KERNEL_XCLBIN)
+
+$(KERNEL_XCLBIN): $(KERNEL_XSA)
+	@echo "### ***** packaging $(KERNEL_XSA) into $(KERNEL_XCLBIN) ... *****"
+	cd $(TEMP_DIR) && v++ --package -t $(TARGET) --platform $(PLATFORM) \
+	  $^ \
+	  --temp_dir $(TEMP_DIR) \
+	  --save-temps \
+	  --report_dir $(TEMP_DIR)/reports/ \
+	  --package.boot_mode=ospi \
+	  -o $@ 2>&1 | tee $(KERNEL_NAME)_xclbin.log
+	@echo "### ***** $(KERNEL_XCLBIN) packaging done! *****"
+
+$(KERNEL_XSA): $(KERNEL_XO)
+	cd $(TEMP_DIR) && v++ -l -t ${TARGET} \
+	--connectivity.nk $(KERNEL_NAME):1:$(KERNEL_NAME) \
+	--config $(SRC_DIR)/vck5000.cfg \
+	--save-temps \
+	--temp_dir $(TEMP_DIR) \
+	--clock.defaultFreqHz 250000000 \
+	--vivado.synth.jobs 16 \
+	$< -o $@
+
+
+xo:$(KERNEL_XO)
+
+$(KERNEL_XO): $(SRC_DIR)/$(KERNEL_NAME).cpp  $(SRC_DIR)/$(KERNEL_NAME).h
+	mkdir -p $(TEMP_DIR)
+	cd $(TEMP_DIR) && v++ -c -t ${TARGET} \
+	--platform $(PLATFORM) \
+	-k $(KERNEL_NAME) \
+	--temp_dir $(TEMP_DIR) \
+	--save-temps \
+	-o $@ \
+	$^
+
+sw_emu: $(HOST) $(SRC_DIR)/$(KERNEL_NAME).cpp  $(SRC_DIR)/$(KERNEL_NAME).h
+	mkdir -p $(TEMP_DIR)
+	cd $(TEMP_DIR) && v++ -c -t sw_emu \
+	--platform xilinx_u50_gen3x16_xdma_5_202210_1 \
+	-k $(KERNEL_NAME) \
+	--temp_dir $(TEMP_DIR) \
+	--save-temps \
+	-o $(TEMP_DIR)/$(KERNEL_NAME)_sw_emu.xo \
+	$^
+	cd $(TEMP_DIR) && v++ -l -t sw_emu \
+	$(TEMP_DIR)/$(KERNEL_NAME)_sw_emu.xo \
+	--platform xilinx_u50_gen3x16_xdma_5_202210_1 \
+	--kernel $(KERNEL_NAME) \
+	--connectivity.nk $(KERNEL_NAME):1:$(KERNEL_NAME) \
+	-o $(TEMP_DIR)/$(KERNEL_NAME)_sw_emu.xclbin
+	cd $(TEMP_DIR) && XCL_EMULATION_MODE=sw_emu $< $(TEMP_DIR)/$(KERNEL_NAME)_sw_emu.xclbin
+
+host:$(HOST)
+
+$(HOST): $(SRC_DIR)/host.cpp
+	mkdir -p $(TEMP_DIR)
+	g++ -Wall -g -std=c++11 $(SRC_DIR)/host.cpp -o $@ \
+		-I${XILINX_XRT}/include/ \
+		-I${XILINX_HLS}/include/ \
+		-L${XILINX_XRT}/lib/ -lOpenCL -pthread -lrt -lstdc++
+
+show_groups:
+	rapidstream $(GRP_UTIL) -i $(TEMP_DIR)/passes/0-imported.json \
+	-o $(TEMP_DIR)/module_types.csv
+
+
+
+clean:
+	rm -rf $(TEMP_DIR) *.log
+	rm -rf .Xil .run
+	rm -rf *.exe
+	rm -rf .ipcache
+
+
+cleanall:
+	rm -rf build *.log
+	rm -rf .Xil .run
+	rm -rf *.exe
+	rm -rf .ipcache
diff --git a/benchmarks/vitis_flow/bandwidth4/README.md b/benchmarks/vitis_flow/bandwidth4/README.md
new file mode 100644
index 00000000..f6d4bcdd
--- /dev/null
+++ b/benchmarks/vitis_flow/bandwidth4/README.md
@@ -0,0 +1,118 @@
+
+
+RapidStream Logo
+
+# Memory Bandwidth Benchmark (4 Channels)
+
+## Introduction
+
+In this recipe, we illustrate how to create a Vitis objective file (`.xo`) for a four-channel memory bandwidth kernel using Vitis, then optimize the `.xo` file with RapidStream, and finally utilize the optimized output in the ongoing Vitis development process.
+
+
+## Tutorial
+
+### Step 1: Generate the Xilinx Object File (`.xo`)
+
+We use Vitis 2023.2 to generate the `.xo` file. Since we want to disable [free running pipeline (FRP)](https://www.xilinx.com/htmldocs/xilinx2021_2/hls-guidance/200-1553.html) feature for HLS step, we use [hls2rtl.tcl](../../../common/tcl/hls2rtl.tcl) to compile the C++ code to `.xo` file.
+
+Run the following command or run `make clean && make xo`:
+
+```bash
+source /Vitis/2023.2/settings64.sh
+make clean
+mkdir -p build
+vitis_hls ../../../common/tcl/hls2rtl.tcl \
+  -l build/vitis_hls_bandwidth4.log \
+  -tclargs \
+  xcu50-fsvh2104-2-e \
+  4 \
+  bandwidth4 \
+  1 \
+  design/bandwidth4.cpp design/bandwidth4.h
+
+
+
+```
+
+### Step 2 (Optional): Use Vitis --link to Generate the `.xclbin` File
+
+:warning: **Note**: This step can take hours to complete. We recommend using the RapidStream flow to optimize the `.xo` file instead of generating the `.xclbin` file if you are familiar with AMD Vitis flow.
+
+With the `.xo` file generated, you can use `v++ -link` to generate the `.xclbin` file. Run the following command or execute `make hw`:
+
+```bash
+v++ -l -t hw \
+  --platform xilinx_u50_gen3x16_xdma_5_202210_1 \
+  --kernel bandwidth4 \
+  --connectivity.nk bandwidth4:1:bandwidth4 \
+  --config design/link_config_hbm.ini \
+  --temp_dir build \
+  -o build/bandwidth4.xclbin \
+  build/bandwidth4.xo
+```
+
+### Step 3: Call RapidStream to Optimize the Design
+
+The RapidStream flow conducts design space exploration and generates optimized `.xo` files by taking the Vitis generated `.xo` as the input. The RapidStream flow for Vitis requires four key inputs:
+
+1. **Device**: Specify the Vitis platform name for `v++`.
+2. **Xilinx Object file** (.xo): Provide the file generated by `v++` or Vivado.
+3. **Connectivity** (.ini): Include the configuration file for `v++` ./design/link_config_hbm.ini.
+4. **Clock targets**: Define the desired clock frequencies.
+5. RapidStream automatically handles all other aspects of the flow.
+
+Please refer to [run_u50.py](./run_u50.py) for the complete RapidStream flow.
+To execute the flow and generate optimized `.xo` files,
+Run the following command or execute `make rs_opt`:
+
+```bash
+rapidstream ./run_u50.py
+```
+
+Unlike in the example provided in [getting_started/vitis_source](../../../getting_started/vitis_source/run.py) where the `skip_impl` variable is set to `True`, in this case, the DSE engine will automatically launch Vitis to link the optimized `.xo` file to the target device and generate the `.xclbin` file.
+
+```python
+# By default, run_dse() also launches the Vitis implementation (skip_impl=False).
+rs.run_dse()
+```
+
+When finished, you can locate these files using the following command:
+
+
+```bash
+find ./build/dse/ -name "*.xclbin"
+```
+
+If everything is successful, you should at least get one optimized `.xclbin` file.
+
+
+### Step 4: Check the Group Module Report
+
+
+RapidStream mandates a clear distinction between communication and computation within user designs.
+
+- In `Group modules`, users are tasked solely with defining inter-submodule communication. For those familiar with Vivado IP Integrator flow, crafting a Group module mirrors the process of connecting IPs in IPI. RapidStream subsequently integrates appropriate pipeline registers into these Group modules.
+
+- In `Leaf modules`, users retain the flexibility to implement diverse computational patterns, as RapidStream leaves these Leaf modules unchanged.
+
+For further details, please consult the [code style](https://docs.rapidstream-da.com/required-coding-style/) section in our Documentation.
+
+To generate a report on group types, execute the commands below or run `make show_groups`:
+
+```bash
+rapidstream ../../../common/util/get_group.py \
+	-i build/passes/0-imported.json \
+	-o build/module_types.csv
+```
+
+The module types for your design can be found in `build/module_types.csv`. Below, we list the four Group modules. In this design, `bandwidth4` serves as a Group module, while the other three modules are added by RapidStream.
+
+| Module Name                      | Group Type     |
+|:--------------------------------:|:--------------:|
+| bandwidth4                       | grouped_module |
+|__rs_ap_ctrl_start_ready_pipeline | grouped_module |
+|__rs_ff_pipeline                  | grouped_module |
+|__rs_hs_pipeline                  | grouped_module |
diff --git a/benchmarks/vitis_flow/bandwidth4/design/bandwidth4.cpp b/benchmarks/vitis_flow/bandwidth4/design/bandwidth4.cpp
new file mode 100644
index 00000000..649ed7f5
--- /dev/null
+++ b/benchmarks/vitis_flow/bandwidth4/design/bandwidth4.cpp
@@ -0,0 +1,68 @@
+// Copyright 2024 RapidStream Design Automation, Inc.
+// All Rights Reserved.
+
+#include "bandwidth4.h"
+#include <hls_stream.h>
+
+
+void print_512(bit512 din){
+    // Print out the data 64-bit hex per line
+    for (int i = 0; i < 8; i++) {
+        printf("%08x%08x\n", (unsigned int) din(63+i*64, 32+i*64), (unsigned int) din(31+i*64, 0+i*64));
+    }
+}
+
+void read_mem(bit512* mem, hls::stream<bit512>& ch, long offset) {
+    for (int j = 0; j < 1024; j++) {
+        ch.write(mem[(offset<<10) + j]<<1);
+    }
+}
+
+
+void write_mem(hls::stream<bit512>& ch, bit512* mem, long offset) {
+    for (int j = 0; j < 1024; j++) {
+        mem[(offset<<10) + j] =  ch.read();
+    }
+}
+
+
+
+extern "C" {
+
+void bandwidth4(
+    bit512* ch_0,
+    bit512* ch_1,
+    bit512* ch_2,
+    bit512* ch_3,
+    long n)
+{
+#pragma HLS INTERFACE m_axi port=ch_0 bundle=ch_0
+#pragma HLS INTERFACE m_axi port=ch_1 bundle=ch_1
+#pragma HLS INTERFACE m_axi port=ch_2 bundle=ch_2
+#pragma HLS INTERFACE m_axi port=ch_3 bundle=ch_3
+#pragma HLS INTERFACE s_axilite port=n bundle=control
+#pragma HLS INTERFACE s_axilite port=return bundle=control
+    hls::stream<bit512> stream_0;
+#pragma HLS STREAM variable=stream_0 depth=2048
+    hls::stream<bit512> stream_1;
+#pragma HLS STREAM variable=stream_1 depth=2048
+    hls::stream<bit512> stream_2;
+#pragma HLS STREAM variable=stream_2 depth=2048
+    hls::stream<bit512> stream_3;
+#pragma HLS STREAM variable=stream_3 depth=2048
+
+
+
+    for(int i=0; i<(n>>10); i++){
+        read_mem(ch_0, stream_0, i);
+        read_mem(ch_1, stream_1, i);
+        read_mem(ch_2, stream_2, i);
+        read_mem(ch_3, stream_3, i);
+
+        write_mem(stream_0, ch_0, i);
+        write_mem(stream_1, ch_1, i);
+        write_mem(stream_2, ch_2, i);
+        write_mem(stream_3, ch_3, i);
+    }
+}
+}
diff --git a/benchmarks/vitis_flow/bandwidth4/design/bandwidth4.h b/benchmarks/vitis_flow/bandwidth4/design/bandwidth4.h
new file mode 100644
index 00000000..5efd8cbd
--- /dev/null
+++ b/benchmarks/vitis_flow/bandwidth4/design/bandwidth4.h
@@ -0,0 +1,24 @@
+// Copyright 2024 RapidStream Design Automation, Inc.
+// All Rights Reserved.
+
+#include "stdio.h"
+#include "stdlib.h"
+#include "math.h"
+#include <ap_int.h>
+#include <hls_stream.h>
+
+
+/* Data Type */
+typedef ap_uint<512> bit512;
+typedef ap_uint<64> bit64;
+typedef bit512 data_t ;
+/* Data Type */
+
+
+
+extern "C" { void bandwidth4(
+    bit512* ch_0,
+    bit512* ch_1,
+    bit512* ch_2,
+    bit512* ch_3,
+    long n); }
diff --git a/benchmarks/vitis_flow/bandwidth4/design/host.cpp b/benchmarks/vitis_flow/bandwidth4/design/host.cpp
new file mode 100644
index 00000000..637bf018
--- /dev/null
+++ b/benchmarks/vitis_flow/bandwidth4/design/host.cpp
@@ -0,0 +1,186 @@
+// Copyright 2024 RapidStream Design Automation, Inc.
+// All Rights Reserved.
+
+
+#define CL_HPP_TARGET_OPENCL_VERSION 120
+#define CL_HPP_MINIMUM_OPENCL_VERSION 120
+#define CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY 1
+
+#include <fstream>
+#include <iostream>
+#include <vector>
+#include <CL/cl2.hpp>
+#include "bandwidth4.h"
+
+void print_512(bit512 din){
+    // Print out the data 64-bit hex per line
+    for (int i = 0; i < 8; i++) {
+        printf("%08x%08x\n", (unsigned int) din(63+i*64, 32+i*64), (unsigned int) din(31+i*64, 0+i*64));
+    }
+}
+
+#define CHECK_MSG(msg, call)                                                                   \
+    call;                                                                                        \
+    if (msg != CL_SUCCESS) {                                                                   \
+        printf("%s:%d Error calling " #call ", error code is: %d\n", __FILE__, __LINE__, msg); \
+        exit(EXIT_FAILURE);                                                                      \
+    }
+
+static const std::string error_message =
+    "Error: Result mismatch:\n"
+    "i = %d CPU result = %d Device result = %d\n";
+
+int main(int argc, char* argv[]) {
+    // Must specify the xclbin file as the second argument
+    if (argc != 2) {
+        std::cout << "Please run the application by: " << argv[0] << " <xclbin file>" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::string xclbin_file = argv[1];
+
+    // Calculate the byte size the input data
+    long DATA_SIZE = 4096;
+
+    std::vector<cl::Device> devices;
+    cl_int err;
+    cl::Context context;
+    cl::CommandQueue q;
+    cl::Kernel bandwidth4;
+    cl::Program program;
+    std::vector<cl::Platform> platforms;
+    bool device_found = false;
+
+    // The get_xil_devices will return vector of Xilinx Devices
+    // Iterate through devices and find Xilinx Alveo Device
+    cl::Platform::get(&platforms);
+    for (size_t i = 0; (i < platforms.size()) && (device_found == false); i++) {
+        cl::Platform platform = platforms[i];
+        std::string platformName = platform.getInfo<CL_PLATFORM_NAME>();
+        if (platformName == "Xilinx") {
+            devices.clear();
+            platform.getDevices(CL_DEVICE_TYPE_ACCELERATOR, &devices);
+            if (devices.size()) {
+                device_found = true;
+                break;
+            }
+        }
+    }
+    if (device_found == false) {
+        std::cout << "Error: could not find the target Xilinx Alveo device" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "INFO: reading " << xclbin_file << " xclbinfile" << std::endl;
+    FILE* fp;
+    if ((fp = fopen(xclbin_file.c_str(), "r")) == nullptr) {
+        std::cout << "ERROR: cannot open" << xclbin_file.c_str() << " xclbin!" << std::endl;
+        exit(EXIT_FAILURE);
+    }
+
+    // Load xclbin
+    std::cout << "INFO: loading: '" << xclbin_file << "'\n";
+    std::ifstream bin_file(xclbin_file, std::ifstream::binary);
+    bin_file.seekg(0, bin_file.end);
+    unsigned nb = bin_file.tellg();
+    bin_file.seekg(0, bin_file.beg);
+    char* buf = new char[nb];
+    bin_file.read(buf, nb);
+
+    // Creating Program from Binary File
+    cl::Program::Binaries bins;
+    bins.push_back({buf, nb});
+    bool valid_device = false;
+    for (unsigned int i = 0; i < devices.size(); i++) {
+        auto device = devices[i];
+        // For the device, we create a context and command queue
+        CHECK_MSG(err, context = cl::Context(device, nullptr, nullptr, nullptr, &err));
+        CHECK_MSG(err, q = cl::CommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err));
+        std::cout << "Trying to program device[" << i << "]: " << device.getInfo<CL_DEVICE_NAME>() << std::endl;
+        cl::Program program(context, {device}, bins, nullptr, &err);
+        if (err != CL_SUCCESS) {
+            std::cout << "Device[" << i << "]: failed to load xclbin file!\n";
+        } else {
+            std::cout << "Device[" << i << "]: xclbin is loaded successfully!\n";
+            CHECK_MSG(err, bandwidth4 = cl::Kernel(program, "bandwidth4", &err));
+            valid_device = true;
+            break; // we break because we found a valid device
+        }
+    }
+    if (!valid_device) {
+        std::cout << "Failed to program any device found, exit!\n";
+        exit(EXIT_FAILURE);
+    }
+
+    // These commands will allocate memory on the Device. The cl::Buffer objects can
+    // be used to reference the memory locations on the device.
+    CHECK_MSG(err, cl::Buffer buffer_ch_0(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+    CHECK_MSG(err, cl::Buffer buffer_ch_1(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+    CHECK_MSG(err, cl::Buffer buffer_ch_2(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+    CHECK_MSG(err, cl::Buffer buffer_ch_3(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+
+    // set the kernel Arguments
+    CHECK_MSG(err, err = bandwidth4.setArg(0, buffer_ch_0));
+    CHECK_MSG(err, err = bandwidth4.setArg(1, buffer_ch_1));
+    CHECK_MSG(err, err = bandwidth4.setArg(2, buffer_ch_2));
+    CHECK_MSG(err, err = bandwidth4.setArg(3, buffer_ch_3));
+
+
+    // We then need to map our OpenCL buffers to get the pointers
+    data_t* ch_0;
+    data_t* ch_1;
+    data_t* ch_2;
+    data_t* ch_3;
+
+    CHECK_MSG(err, ch_0 = (data_t*)q.enqueueMapBuffer(buffer_ch_0, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err));
+    CHECK_MSG(err, ch_1 = (data_t*)q.enqueueMapBuffer(buffer_ch_1, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err));
+    CHECK_MSG(err, ch_2 = (data_t*)q.enqueueMapBuffer(buffer_ch_2, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err));
+    CHECK_MSG(err, ch_3 = (data_t*)q.enqueueMapBuffer(buffer_ch_3, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err));
+
+
+    // Initialize input data
+    for (int i = 0; i < DATA_SIZE; i++) { ch_0[i] = 0 ^ i; }
+    for (int i = 0; i < DATA_SIZE; i++) { ch_1[i] = 1 ^ i; }
+    for (int i = 0; i < DATA_SIZE; i++) { ch_2[i] = 2 ^ i; }
+    for (int i = 0; i < DATA_SIZE; i++) { ch_3[i] = 3 ^ i; }
+
+    CHECK_MSG(err, err = bandwidth4.setArg(0, buffer_ch_0));
+    CHECK_MSG(err, err = bandwidth4.setArg(1, buffer_ch_1));
+    CHECK_MSG(err, err = bandwidth4.setArg(2, buffer_ch_2));
+    CHECK_MSG(err, err = bandwidth4.setArg(3, buffer_ch_3));
+    CHECK_MSG(err, err = bandwidth4.setArg(4, DATA_SIZE));
+
+
+    // Data will be migrated to device global memory
+    CHECK_MSG(err, err = q.enqueueMigrateMemObjects({buffer_ch_0, buffer_ch_1, buffer_ch_2, buffer_ch_3}, 0 /* 0 means from host*/));
+
+    // Launch the bandwidth4 kernel
+    CHECK_MSG(err, err = q.enqueueTask(bandwidth4));
+
+    // Migrate the result data back to host memory
+    CHECK_MSG(err, q.enqueueMigrateMemObjects({buffer_ch_0, buffer_ch_1, buffer_ch_2, buffer_ch_3}, CL_MIGRATE_MEM_OBJECT_HOST));
+
+    // Wait for all the commands to complete
+    CHECK_MSG(err, q.finish());
+
+    // Verify the result
+    int match = 0;
+    for (int i = 0; i < DATA_SIZE; i++) { if(ch_0[i] != ((0 ^ i))<<1) match++; }
+    for (int i = 0; i < DATA_SIZE; i++) { if(ch_1[i] != ((1 ^ i))<<1) match++; }
+    for (int i = 0; i < DATA_SIZE; i++) { if(ch_2[i] != ((2 ^ i))<<1) match++; }
+    for (int i = 0; i < DATA_SIZE; i++) { if(ch_3[i] != ((3 ^ i))<<1) match++; }
+
+    CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_0, ch_0));
+    CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_1, ch_1));
+    CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_2, ch_2));
+    CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_3, ch_3));
+
+    CHECK_MSG(err, err = q.finish());
+
+    if (match == 0) {
+        std::cout << "TEST PASSED!" << std::endl;
+    } else {
+        std::cout << match << " TEST FAILED!" << std::endl;
+    }
+    return (match ? EXIT_FAILURE : EXIT_SUCCESS);
+}
diff --git a/benchmarks/vitis_flow/bandwidth4/design/vck5000.cfg b/benchmarks/vitis_flow/bandwidth4/design/vck5000.cfg
new file mode 100644
index 00000000..cf375c2d
--- /dev/null
+++ b/benchmarks/vitis_flow/bandwidth4/design/vck5000.cfg
@@ -0,0 +1,8 @@
+platform=xilinx_vck5000_gen4x8_qdma_2_202220_1
+
+[connectivity]
+
+sp = bandwidth4.m_axi_ch_0:MC_NOC0
+sp = bandwidth4.m_axi_ch_1:MC_NOC0
+sp = bandwidth4.m_axi_ch_2:MC_NOC0
+sp = bandwidth4.m_axi_ch_3:MC_NOC0
diff --git a/benchmarks/vitis_flow/bandwidth4/run_u50.py b/benchmarks/vitis_flow/bandwidth4/run_u50.py
new file mode 100644
index 00000000..34aece07
--- /dev/null
+++ b/benchmarks/vitis_flow/bandwidth4/run_u50.py
@@ -0,0 +1,40 @@
+"""Bandwidth4: a four-channel memory bandwidth kernel in the Vitis flow
+
+This script demonstrates how to optimize a four-channel bandwidth design in
+a Vitis object file. In this example, the object file is generated by
+Vitis HLS.
+"""
+
+__copyright__ = """
+Copyright (c) 2024 RapidStream Design Automation, Inc. and contributors.  All rights reserved.
+The contributor(s) of this file has/have agreed to the RapidStream Contributor License Agreement.
+"""
+
+from rapidstream import get_u50_vitis_device_factory, RapidStreamVitis
+import os
+
+CURR_DIR = os.path.dirname(os.path.abspath(__file__))
+
+# Use RapidStreamVitis for ".xo" files generated by `v++`.
+# Create a RapidStream project in the "run" directory:
+rs = RapidStreamVitis(f"{CURR_DIR}/build")
+
+# Use the "xilinx_u50_gen3x16_xdma_5_202210_1" platform as the device:
+u50_factory = get_u50_vitis_device_factory("xilinx_u50_gen3x16_xdma_5_202210_1")
+rs.set_virtual_device(u50_factory.generate_virtual_device())
+
+# Add the design object file (".xo") to the project:
+rs.add_xo_file(f"{CURR_DIR}/build/bandwidth4.xo")
+
+# Specify the Vitis platform and connectivity configuration:
+rs.set_vitis_platform("xilinx_u50_gen3x16_xdma_5_202210_1")
+rs.set_vitis_connectivity_config(f"{CURR_DIR}/design/link_config_hbm.ini")
+
+# Set the clock target for the design:
+rs.add_clock("ap_clk", period_ns=3)
+
+# Constrain all kernel ports to region SLOT_X1Y0:
+rs.assign_port_to_region(".*", "SLOT_X1Y0:SLOT_X1Y0")
+
+# Start the RapidStream optimization process:
+rs.run_dse()
diff --git a/common/img/vck5000_virtual_device.jpg b/common/img/vck5000_virtual_device.jpg
new file mode 100644
index 00000000..6882c37a
Binary files /dev/null and b/common/img/vck5000_virtual_device.jpg differ