Skip to content

Commit

Permalink
feat(bandwidth4): fix noc hack successful
Browse files Browse the repository at this point in the history
  • Loading branch information
vagrantxiao24 committed Dec 10, 2024
1 parent 7fd6996 commit 2692af9
Show file tree
Hide file tree
Showing 17 changed files with 940 additions and 78 deletions.
91 changes: 79 additions & 12 deletions benchmarks/tapa_flow/bandwidth23/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,14 @@ SRC_DIR := $(CURDIR)/design
AB_CONFIG := $(CURDIR)/design/config/$(notdir $(RS_SCRIPT))/ab_config.json
IMPL_CONFIG := $(CURDIR)/design/config/$(notdir $(RS_SCRIPT))/impl_config.json
LINK_CONFIG := $(CURDIR)/design/config/$(notdir $(RS_SCRIPT))/link_config.ini
FIX_NOC_TCL := $(CURDIR)/design/config/$(notdir $(RS_SCRIPT))/fix_noc.tcl
PLATFORM := xilinx_vck5000_gen4x8_qdma_2_202220_1
PART_NUM := xcvc1902-vsvd1760-2MP-e-S
GRP_UTIL := $(ROOT_DIR)/common/util/get_group.py
TEMP_DIR := $(CURDIR)/build/$(notdir $(RS_SCRIPT))
RS_TARGET := $(TEMP_DIR)/dse/solution_0/vitis_run_hw/$(KERNEL_NAME)_$(PLATFORM).xclbin
SOLUTION_DIR := $(TEMP_DIR)/dse/solution_0
RS_XO := $(SOLUTION_DIR)/updated.xo
RS_TARGET := $(SOLUTION_DIR)/$(KERNEL_NAME)_$(PLATFORM).xsa
BUILD_LOG := $(TEMP_DIR)/build.json
SUCCESS := "Build Successful"
TIMING_RPT := impl_1_hw_bb_locked_timing_summary_routed.rpt
Expand All @@ -27,23 +30,87 @@ KERNEL_XO := $(TEMP_DIR)/$(KERNEL_NAME).xo
KERNEL_XCLBIN := $(TEMP_DIR)/$(KERNEL_NAME).xclbin
KERNEL_XSA := $(TEMP_DIR)/$(KERNEL_NAME).xsa
TARGET := hw

all: $(RS_TARGET)
cd $(RSPATH) && $(RSPYTHON) $(SLACK_GETTER) -d $(TEMP_DIR) -i $(TIMING_RPT) -o $(BUILD_LOG) -c clk_kernel_00_unbuffered_net -p 3.333
@echo $(SUCCESS)

$(RS_TARGET):$(KERNEL_XO) $(DEVICE_CONFIG)
# Kernel clock target in MHz. Recipes append "000000" to turn it into the Hz
# value handed to v++ via --clock.defaultFreqHz.
TARGET_FREQUENCY := 300
# Vivado directive passed to the PLACE_DESIGN step of impl_1.
PLACEMENT_STRATEGY := Explore
# Vivado directive passed to the OPT_DESIGN and ROUTE_DESIGN steps of impl_1.
STRATEGY := Explore
# Vivado project directory inside the v++ temp dir of the solution; the
# vivado recipe changes into it and passes it to fix_noc.tcl via -tclargs.
VIVADO_PRJ_DIR := $(SOLUTION_DIR)/$(KERNEL_NAME)_$(PLATFORM).temp/link/vivado/vpl/prj
# Location inside the Vivado project where $(FIX_NOC_TCL) is copied to.
RS_FIX_NOC_TCL := $(VIVADO_PRJ_DIR)/fix_noc.tcl
# Checkpoint under the my_rm_synth_1 run directory; the link rule for
# $(RS_TARGET) depends on it.
RS_SYN := $(VIVADO_PRJ_DIR)/prj.runs/my_rm_synth_1/ulp_inst_0.dcp


# Default goal: build the final link output and print its path.
# `all` does not name a file, so it is declared phony; otherwise a stray file
# called `all` in this directory would make the goal look up to date.
.PHONY: all
all: $(RS_TARGET)
	@echo $<

#$(RS_TARGET)
#cd $(RSPATH) && $(RSPYTHON) $(SLACK_GETTER) -d $(TEMP_DIR) -i $(TIMING_RPT) -o $(BUILD_LOG) -c clk_kernel_00_unbuffered_net -p 3.333
#@echo $(SUCCESS)

# Final link stage: once the checkpoint $(RS_SYN) exists, resume the Vitis
# link from the vpl.impl step inside the solution directory, reusing the
# solution's v++ temp directory, and write the result to this rule's target.
$(RS_TARGET): $(RS_SYN)
	cd $(SOLUTION_DIR) && v++ $(DEBUG) \
		--link \
		--kernel $(KERNEL_NAME) \
		--platform $(PLATFORM) \
		--target $(TARGET) \
		--report_level 2 \
		--temp_dir "$(SOLUTION_DIR)/$(KERNEL_NAME)_$(PLATFORM).temp" \
		--optimize 3 \
		--connectivity.nk $(KERNEL_NAME):1:$(KERNEL_NAME) \
		--save-temps \
		--from_step vpl.impl \
		$(RS_XO) \
		--vivado.synth.jobs $(shell nproc) \
		--vivado.prop=run.impl_1.STEPS.PHYS_OPT_DESIGN.IS_ENABLED=1 \
		--vivado.prop=run.impl_1.STEPS.OPT_DESIGN.ARGS.DIRECTIVE=$(STRATEGY) \
		--vivado.prop=run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=$(PLACEMENT_STRATEGY) \
		--vivado.prop=run.impl_1.STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE=$(STRATEGY) \
		--vivado.prop=run.impl_1.STEPS.PLACE_DESIGN.TCL.PRE=$(SOLUTION_DIR)/xdc/floorplan.xdc \
		--config $(LINK_CONFIG) \
		--remote_ip_cache /var/tmp/remote_ip_cache \
		--output $@ \
		--clock.defaultFreqHz $(TARGET_FREQUENCY)000000

# Run the staged fix_noc.tcl ($<) with Vivado in batch mode from inside the
# Vivado project directory; the project directory is also the script's single
# -tclargs value.
$(RS_SYN): $(RS_FIX_NOC_TCL)
	cd $(VIVADO_PRJ_DIR) && vivado -mode batch -source $< -tclargs $(VIVADO_PRJ_DIR)

# Stage the NoC fix script inside the Vivado project generated by v++.
#
#   1. Run the Vitis link on the optimized .xo ($<) only up to the vpl.synth
#      step, working in the directory that holds the .xo.
#   2. Copy $(FIX_NOC_TCL) to $@ so the $(RS_SYN) rule can source it.
#
# $(FIX_NOC_TCL) is a prerequisite so that editing the script re-triggers this
# rule and a missing script is reported before the long v++ run starts. It
# must stay AFTER $(RS_XO), because the recipe relies on $< being the .xo.
# $(<D) (directory of $<, no trailing slash) replaces "$(dir $<)/" so the
# paths contain no "//" and match the ones used by the --from_step link.
$(RS_FIX_NOC_TCL): $(RS_XO) $(FIX_NOC_TCL)
	cd $(<D) && v++ ${DEBUG} \
		--link \
		--kernel $(KERNEL_NAME) \
		--platform $(PLATFORM) \
		--target ${TARGET} \
		--report_level 2 \
		--temp_dir "$(<D)/$(KERNEL_NAME)_$(PLATFORM).temp" \
		--optimize 3 \
		--connectivity.nk $(KERNEL_NAME):1:$(KERNEL_NAME) \
		--save-temps \
		--to_step vpl.synth \
		$< \
		--vivado.synth.jobs $(shell nproc) \
		--vivado.prop=run.impl_1.STEPS.PHYS_OPT_DESIGN.IS_ENABLED=1 \
		--vivado.prop=run.impl_1.STEPS.OPT_DESIGN.ARGS.DIRECTIVE=$(STRATEGY) \
		--vivado.prop=run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=$(PLACEMENT_STRATEGY) \
		--vivado.prop=run.impl_1.STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE=$(STRATEGY) \
		--vivado.prop=run.impl_1.STEPS.PLACE_DESIGN.TCL.PRE=$(<D)/xdc/floorplan.xdc \
		--config $(LINK_CONFIG) \
		--remote_ip_cache /var/tmp/remote_ip_cache \
		--output $(RS_TARGET) \
		--clock.defaultFreqHz $(TARGET_FREQUENCY)000000
	cp $(FIX_NOC_TCL) $@


# --run-impl
# Run $(RSXX)-tapaopt from $(RSPATH) on the TAPA-generated .xo ($<), using
# $(TEMP_DIR) as the work directory together with the device, floorplan,
# implementation and connectivity configuration files.
# NOTE(review): the recipe never names $@; presumably tapaopt writes
# updated.xo under $(TEMP_DIR)/dse/solution_0 -- confirm.
$(RS_XO):$(KERNEL_XO) $(DEVICE_CONFIG)
mkdir -p $(TEMP_DIR)
cd $(RSPATH) && $(RSXX)-tapaopt \
--work-dir $(TEMP_DIR) \
--tapa-xo-path $< \
--device-config $(DEVICE_CONFIG) \
--floorplan-config $(AB_CONFIG) \
--single-reg \
--run-impl \
--implementation-config $(IMPL_CONFIG) \
--connectivity-ini $(LINK_CONFIG)

device:$(DEVICE_CONFIG)

$(DEVICE_CONFIG):$(AB_CONFIG)
mkdir -p $(TEMP_DIR)
cd $(RSPATH) && $(RSPYTHON) $(RS_SCRIPT)
Expand All @@ -69,10 +136,10 @@ $(KERNEL_XCLBIN): $(KERNEL_XSA)
$(KERNEL_XSA): $(KERNEL_XO)
cd $(TEMP_DIR) && v++ -l -t ${TARGET} \
--connectivity.nk $(KERNEL_NAME):1:$(KERNEL_NAME) \
--config $(SRC_DIR)/vck5000.cfg \
--config $(LINK_CONFIG) \
--save-temps \
--temp_dir $(TEMP_DIR) \
--clock.defaultFreqHz 250000000 \
--clock.defaultFreqHz $(TARGET_FREQUENCY)000000 \
--vivado.synth.jobs 16 \
$< -o $@

Expand All @@ -82,7 +149,7 @@ $(KERNEL_XO):$(SRC_DIR)/$(KERNEL_NAME).cpp
mkdir -p $(TEMP_DIR)
cd $(TEMP_DIR) && tapa compile \
--top $(KERNEL_NAME) \
--part-num xcu55c-fsvh2892-2L-e \
--part-num $(PART_NUM) \
--clock-period 3.33 \
-o $(KERNEL_NAME).xo \
-f $< \
Expand Down
71 changes: 53 additions & 18 deletions benchmarks/tapa_flow/bandwidth23/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,39 +5,74 @@ The contributor(s) of this file has/have agreed to the RapidStream Contributor L

<img src="https://imagedelivery.net/AU8IzMTGgpVmEBfwPILIgw/1b565657-df33-41f9-f29e-0d539743e700/128" width="64px" alt="RapidStream Logo" />

# TAPA Flow: ORC Decoder
# TAPA Flow: Bandwidth23

## Introduction

The AMD Versal device introduces a revolutionary hardware architecture for FPGA developers.
One standout feature is the distributed NoC AXI master/slave ports spread across the entire chip,
which is especially beneficial for kernels requiring numerous ports. For instance, in Alveo devices,
utilizing HBM bandwidth at the chip's base requires routing 32 AXI ports, each 512 bits wide, close
to the HBM—posing significant implementation challenges. With the Versal architecture, however,
these 32 AXI ports can be distributed across different regions of the chip, alleviating local routing congestion.

In this recipe, we demonstrate how to use RapidStream to optimize TAPA projects. The basic steps include:
Nevertheless, routing these ports across the chip introduces the challenge of connecting them
with long wires. To optimize the effectiveness of the NoC architecture, RapidStream can be employed
to automatically insert pipeline registers between distributed logic. This approach not only achieves
high bandwidth through multiple AXI ports but also mitigates the local routing congestion seen in
previous architectures like the Alveo device.

- Compile the HLS C++ code into a Vitis-compatible .xo file using TAPA.
- Optimize the .xo file with RapidStream to obtain an optimized .xo file.
- Use Vitis to compile the optimized .xo file into an .xclbin file for FPGA deployment.

In this recipe, we demonstrate how to leverage RapidStream to optimize a TAPA project that
includes a kernel with 23 AXI ports, each 512 bits wide. The process involves the following key steps:

- Compile the TAPA C++ code into a Vitis-compatible `.xo` file using TAPA.
- Optimize the generated `.xo` file with RapidStream to produce an enhanced `.xo` file.
- Use Vitis to compile the optimized `.xo` file into an `.xclbin` file for FPGA deployment.

## Tutorial

### Step 1 (Done): Generate the Xilinx Object File (`.xo`)
### Step 1 : Generate the Xilinx Object File (`.xo`)


We utilize [Rapidstream-TAPA](https://github.com/rapidstream-org/rapidstream-tapa) to generate the `.xo` file.
The original C++ source files are located in design/src. To compile C++ to `.xo` using TAPA, we use the commands shown below.
For your convenience, you can also execute `make xo` command in the terminal supported by our [Makefile](Makefile).

We use [Rapidstream-TAPA](https://github.com/rapidstream-org/rapidstream-tapa) to generate the `.xo` file,
with the original C++ source files located in the [design](./design) directory. To compile the C++ code
into a `.xo` file using TAPA, follow the commands provided below. For convenience,
you can also execute the `make xo` command in the terminal, as supported by our Makefile.

```bash
mkdir -p build/run_vck5000.py
cd build/run_vck5000.py && tapa compile \
--top bandwidth23 \
--part-num xcvc1902-vsvd1760-2MP-e-S \
--clock-period 3.33 \
-o bandwidth23.xo \
-f design/bandwidth23.cpp \
2>&1 | tee tapa.log
```

### Step 2: Define Virtual Device

The VCK5000 device is equipped with 4x7 NMU512 and NSU512 ports across the chip (only the NMU512 ports are shown in the figure below). For our design, we focus solely on the FPGA fabric and not the AI Engine. We define four slots for the virtual device, each containing either six or eight NMU512 ports to connect internal logic to the DDR memory at the base. A Python-based script, [run_vck5000.py](./run_vck5000.py), is provided as a reference for defining the virtual device using the RapidStream API.


<img src="../../../common/img/vck5000_virtual_device.jpg" width="800px" alt="VCK5000 Device"/>

You can run the `run_vck5000.py` script by invoking RapidStream as shown below, or simply type `make device` in the terminal. This will generate a `device.json` file, which outlines all the device features, including slot resources, slot locations, and more.

We utilize TAPA to generate the `.xo` file. If you have not installed TAPA, we've already compiled the C++ source to `.xo` using TAPA. The original C++ source files are located in design/src. The generated `.xo` file can be found at design/generated/data_decoding.xo. To compile C++ to `.xo` using TAPA, we use the script [design/run_tapa.sh](design/run_tapa.sh), with the detailed commands shown below. For your convenience, we have also backed up all the generated metadata by TAPA in the design/generated directory.

```bash
WORK_DIR=generated
tapac \
--work-dir ${WORK_DIR} \
--top data_decoding \
--part-num xcu280-fsvh2892-2L-e \
--clock-period 3.33 \
-o ${WORK_DIR}/data_decoding.xo \
--connectivity config/link_config.ini \
src/data_decoder.cpp \
2>&1 | tee tapa.log
rapidstream run_vck5000.py
```


### Step 2: Use Rapidstream to Optimize `.xo` Design


The RapidStream flow conducts design space exploration and generates solutions, taking the TAPA-generated `.xo` file as its input.
The RapidStream flow for TAPA requires the following key inputs:

Expand Down Expand Up @@ -101,7 +136,7 @@ rs.assign_port_to_region("ap_rst_n", left_slot)
rs.assign_port_to_region("interrupt", left_slot)
```

For complete details, please refer to the [./run.py](./run.py) file. Invoke RapidStream with the command below, or run `make all`.
For complete details, please refer to the [./run_vck5000.py](./run_vck5000.py) file. Invoke RapidStream with the command below, or run `make all`.

```bash
rapidstream run.py
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,32 +3,32 @@
"dse_range_min": 0.7,
"partition_strategy": "flat",
"port_pre_assignments": {
".*ch_0_.*": "SLOT_X0Y0:SLOT_X0Y0",
".*ch_10_.*": "SLOT_X1Y0:SLOT_X1Y0",
".*ch_11_.*": "SLOT_X1Y0:SLOT_X1Y0",
".*ch_12_.*": "SLOT_X1Y0:SLOT_X1Y0",
".*ch_13_.*": "SLOT_X1Y0:SLOT_X1Y0",
".*ch_14_.*": "SLOT_X1Y0:SLOT_X1Y0",
".*ch_15_.*": "SLOT_X1Y0:SLOT_X1Y0",
".*ch_16_.*": "SLOT_X1Y0:SLOT_X1Y0",
".*ch_17_.*": "SLOT_X1Y1:SLOT_X1Y1",
".*ch_18_.*": "SLOT_X1Y1:SLOT_X1Y1",
".*ch_19_.*": "SLOT_X1Y1:SLOT_X1Y1",
".*ch_1_.*": "SLOT_X0Y0:SLOT_X0Y0",
".*ch_20_.*": "SLOT_X1Y1:SLOT_X1Y1",
".*ch_21_.*": "SLOT_X1Y1:SLOT_X1Y1",
".*ch_22_.*": "SLOT_X1Y1:SLOT_X1Y1",
".*ch_2_.*": "SLOT_X0Y0:SLOT_X0Y0",
".*ch_3_.*": "SLOT_X0Y0:SLOT_X0Y0",
".*ch_4_.*": "SLOT_X0Y0:SLOT_X0Y0",
".*ch_5_.*": "SLOT_X0Y0:SLOT_X0Y0",
".*ch_6_.*": "SLOT_X0Y0:SLOT_X0Y0",
".*ch_7_.*": "SLOT_X0Y1:SLOT_X0Y1",
".*ch_8_.*": "SLOT_X0Y1:SLOT_X0Y1",
".*ch_9_.*": "SLOT_X0Y1:SLOT_X0Y1",
"ap_clk": "SLOT_X0Y0:SLOT_X0Y0",
"ap_rst_n": "SLOT_X0Y0:SLOT_X0Y0",
"interrupt": "SLOT_X0Y0:SLOT_X0Y0",
"s_axi_control_.*": "SLOT_X0Y0:SLOT_X0Y0"
".*ch_0_.*": "NMU512_X0Y0",
".*ch_10_.*": "NMU512_X1Y6",
".*ch_11_.*": "NMU512_X2Y0",
".*ch_12_.*": "NMU512_X2Y1",
".*ch_13_.*": "NMU512_X2Y2",
".*ch_14_.*": "NMU512_X2Y3",
".*ch_15_.*": "NMU512_X3Y0",
".*ch_16_.*": "NMU512_X3Y1",
".*ch_17_.*": "NMU512_X3Y2",
".*ch_18_.*": "NMU512_X2Y4",
".*ch_19_.*": "NMU512_X2Y5",
".*ch_1_.*": "NMU512_X0Y1",
".*ch_20_.*": "NMU512_X2Y6",
".*ch_21_.*": "NMU512_X3Y5",
".*ch_22_.*": "NMU512_X3Y6",
".*ch_2_.*": "NMU512_X0Y2",
".*ch_3_.*": "NMU512_X0Y3",
".*ch_4_.*": "NMU512_X1Y0",
".*ch_5_.*": "NMU512_X1Y1",
".*ch_6_.*": "NMU512_X1Y2",
".*ch_7_.*": "NMU512_X0Y4",
".*ch_8_.*": "NMU512_X1Y4",
".*ch_9_.*": "NMU512_X1Y5",
"ap_clk": "CLK_RST",
"ap_rst_n": "CLK_RST",
"interrupt": "S_AXI_CONTROL",
"s_axi_control_.*": "S_AXI_CONTROL"
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@


# fix_noc.tcl -- pin each kernel-facing slave interface of axi_noc_kernel0 to
# a fixed NoC NMU512 site, then re-run the my_rm_synth_1 synthesis run.
#
# Usage:
#   vivado -mode batch -source fix_noc.tcl -tclargs <PRE_PATH>
#
# Arguments:
#   PRE_PATH  Directory that contains prj.xpr (the Vivado project to patch).
#
# Exit status:
#   1 when the argument count is wrong or the synthesis run does not complete.

# Exactly one argument is accepted, so report the real expectation instead of
# a "too few" message that is also printed when too many are given.
if {$argc != 1} {
    puts stderr "Usage: -tclargs <PRE_PATH>. Expected exactly 1 argument, got ${argc}. Exiting."
    exit 1
}

set PRE_PATH [lindex $argv 0]

open_project ${PRE_PATH}/prj.xpr
open_bd_design ${PRE_PATH}/prj.srcs/my_rm/bd/ulp_inst_0/ulp_inst_0.bd

# NMU512 site for every slave interface, in interface order: the entry at
# list index i is applied to /axi_noc_kernel0/S<ii>_AXI (S00_AXI .. S22_AXI).
set NMU_SITES {
    X0Y0 X0Y1 X0Y2 X0Y3
    X1Y0 X1Y1 X1Y2
    X0Y4 X1Y4 X1Y5 X1Y6
    X2Y0 X2Y1 X2Y2 X2Y3
    X3Y0 X3Y1 X3Y2
    X2Y4 X2Y5 X2Y6
    X3Y5 X3Y6
}

set idx 0
foreach site $NMU_SITES {
    # Two-digit, zero-padded index reproduces the S00_AXI .. S22_AXI pin names.
    set pin [format "/axi_noc_kernel0/S%02d_AXI" $idx]
    set_property -dict [list CONFIG.PHYSICAL_LOC "NOC_NMU512_${site}"] [get_bd_intf_pins $pin]
    incr idx
}

# Persist the edits, validate the design, and save again so that anything
# validation updated is written to disk before synthesis is relaunched.
save_bd_design
validate_bd_design
save_bd_design

reset_run my_rm_synth_1
launch_runs my_rm_synth_1 -jobs 16
wait_on_run my_rm_synth_1

# wait_on_run returns even when the run failed; without this check Vivado
# would exit 0 and the calling build would continue with a missing checkpoint.
if {[get_property PROGRESS [get_runs my_rm_synth_1]] ne "100%"} {
    puts stderr "Synthesis run my_rm_synth_1 did not complete. Exiting."
    exit 1
}
3 changes: 0 additions & 3 deletions benchmarks/tapa_flow/bandwidth23/run_vck5000.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,11 @@
factory.set_slot_capacity(x, 1, north=11520)

factory.set_slot_capacity(x, 1, south=11520)
# factory.set_slot_capacity(x, 2, south=11520)

# Set W/E capacity
for y in range(2):
factory.set_slot_capacity(0, y, east=40320)
factory.set_slot_capacity(1, y, west=40320)
# factory.set_slot_capacity(0, 2, east=41178)
# factory.set_slot_capacity(1, 2, west=41178)


factory.set_platform_name(VITIS_PLATFORM)
Expand Down
Loading

0 comments on commit 2692af9

Please sign in to comment.