Commit 7fd6996

feat(tapa): check in bandwidth app

vagrantxiao24 committed Dec 10, 2024
1 parent 7f81e0b commit 7fd6996

Showing 31 changed files with 2,258 additions and 0 deletions.
114 changes: 114 additions & 0 deletions benchmarks/tapa_flow/bandwidth23/Makefile
@@ -0,0 +1,114 @@
# Copyright (c) 2024 RapidStream Design Automation, Inc. and contributors. All rights reserved.
# The contributor(s) of this file has/have agreed to the RapidStream Contributor License Agreement.

ROOT_DIR := $(shell git rev-parse --show-toplevel)
KERNEL_NAME := bandwidth23
RS_SCRIPT := $(CURDIR)/run.py
SRC_DIR := $(CURDIR)/design
AB_CONFIG := $(CURDIR)/design/config/$(notdir $(RS_SCRIPT))/ab_config.json
IMPL_CONFIG := $(CURDIR)/design/config/$(notdir $(RS_SCRIPT))/impl_config.json
LINK_CONFIG := $(CURDIR)/design/config/$(notdir $(RS_SCRIPT))/link_config.ini
PLATFORM := xilinx_vck5000_gen4x8_qdma_2_202220_1
PART_NUM := xcvc1902-vsvd1760-2MP-e-S
GRP_UTIL := $(ROOT_DIR)/common/util/get_group.py
TEMP_DIR := $(CURDIR)/build/$(notdir $(RS_SCRIPT))
RS_TARGET := $(TEMP_DIR)/dse/solution_0/vitis_run_hw/$(KERNEL_NAME)_$(PLATFORM).xclbin
BUILD_LOG := $(TEMP_DIR)/build.json
SUCCESS := "Build Successful"
TIMING_RPT := impl_1_hw_bb_locked_timing_summary_routed.rpt
SLACK_GETTER := $(ROOT_DIR)/common/util/get_slack.py
RSPATH := $(CURDIR)
RSXX := rapidstream
RSPYTHON := rapidstream
DEVICE_CONFIG := $(TEMP_DIR)/device.json
DEVICE_GEN := $(CURDIR)/gen_device.py
INCLUDE := -I $(XILINX_HLS)/include
KERNEL_XO := $(TEMP_DIR)/$(KERNEL_NAME).xo
KERNEL_XCLBIN := $(TEMP_DIR)/$(KERNEL_NAME).xclbin
KERNEL_XSA := $(TEMP_DIR)/$(KERNEL_NAME).xsa
TARGET := hw

all: $(RS_TARGET)
	cd $(RSPATH) && $(RSPYTHON) $(SLACK_GETTER) -d $(TEMP_DIR) -i $(TIMING_RPT) -o $(BUILD_LOG) -c clk_kernel_00_unbuffered_net -p 3.333
	@echo $(SUCCESS)

$(RS_TARGET): $(KERNEL_XO) $(DEVICE_CONFIG)
	mkdir -p $(TEMP_DIR)
	cd $(RSPATH) && $(RSXX)-tapaopt \
		--work-dir $(TEMP_DIR) \
		--tapa-xo-path $< \
		--device-config $(DEVICE_CONFIG) \
		--floorplan-config $(AB_CONFIG) \
		--single-reg \
		--run-impl \
		--implementation-config $(IMPL_CONFIG) \
		--connectivity-ini $(LINK_CONFIG)

$(DEVICE_CONFIG): $(AB_CONFIG)
	mkdir -p $(TEMP_DIR)
	cd $(RSPATH) && $(RSPYTHON) $(RS_SCRIPT)

cosim: $(KERNEL_XO) $(TEMP_DIR)/main.exe
	cd $(TEMP_DIR) && $(TEMP_DIR)/main.exe 1024 \
		--bitstream $< \
		-xosim_work_dir $(TEMP_DIR)/xosim_work_dir

hw: $(KERNEL_XCLBIN)

$(KERNEL_XCLBIN): $(KERNEL_XSA)
	@echo "### ***** packaging $(KERNEL_XSA) into $(KERNEL_XCLBIN) ... *****"
	cd $(TEMP_DIR) && v++ --package -t $(TARGET) --platform $(PLATFORM) \
		$^ \
		--temp_dir $(TEMP_DIR) \
		--save-temps \
		--report_dir $(TEMP_DIR)/reports/ \
		--package.boot_mode=ospi \
		-o $@ 2>&1 | tee $(KERNEL_NAME)_xclbin.log
	@echo "### ***** $(KERNEL_XCLBIN) packaging done! *****"

$(KERNEL_XSA): $(KERNEL_XO)
	cd $(TEMP_DIR) && v++ -l -t ${TARGET} \
		--connectivity.nk $(KERNEL_NAME):1:$(KERNEL_NAME) \
		--config $(SRC_DIR)/vck5000.cfg \
		--save-temps \
		--temp_dir $(TEMP_DIR) \
		--clock.defaultFreqHz 250000000 \
		--vivado.synth.jobs 16 \
		$< -o $@

xo: $(KERNEL_XO)

$(KERNEL_XO): $(SRC_DIR)/$(KERNEL_NAME).cpp
	mkdir -p $(TEMP_DIR)
	cd $(TEMP_DIR) && tapa compile \
		--top $(KERNEL_NAME) \
		--part-num xcu55c-fsvh2892-2L-e \
		--clock-period 3.33 \
		-o $(KERNEL_NAME).xo \
		-f $< \
		2>&1 | tee tapa.log

csim: $(TEMP_DIR)/main.exe

$(TEMP_DIR)/main.exe: $(SRC_DIR)/*.cpp
	mkdir -p $(TEMP_DIR)
	cd $(TEMP_DIR) && tapa g++ $^ $(INCLUDE) -o $(TEMP_DIR)/main.exe -O2
	$(TEMP_DIR)/main.exe

show_groups:
	rapidstream $(GRP_UTIL) -i $(TEMP_DIR)/passes/0-imported.json \
		-o $(TEMP_DIR)/module_types.csv



clean:
	rm -rf $(TEMP_DIR) *.log
	rm -rf .Xil .run
	rm -rf *.exe
	rm -rf .ipcache

cleanall:
	rm -rf build *.log
	rm -rf .Xil .run
	rm -rf *.exe
	rm -rf .ipcache
141 changes: 141 additions & 0 deletions benchmarks/tapa_flow/bandwidth23/README.md
@@ -0,0 +1,141 @@
<!--
Copyright (c) 2024 RapidStream Design Automation, Inc. and contributors. All rights reserved.
The contributor(s) of this file has/have agreed to the RapidStream Contributor License Agreement.
-->

<img src="https://imagedelivery.net/AU8IzMTGgpVmEBfwPILIgw/1b565657-df33-41f9-f29e-0d539743e700/128" width="64px" alt="RapidStream Logo" />

# TAPA Flow: ORC Decoder

## Introduction


In this recipe, we demonstrate how to use RapidStream to optimize TAPA projects. The basic steps, summarized in the command sketch after this list, include:

- Compile the HLS C++ code into a Vitis-compatible .xo file using TAPA.
- Optimize the .xo file with RapidStream to obtain an optimized .xo file.
- Use Vitis to compile the optimized .xo file into an .xclbin file for FPGA deployment.
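
These steps can be driven with the Makefile that accompanies this README; the sketch below assumes you run it from the benchmark directory, and `make all` runs the RapidStream optimization together with the subsequent Vitis implementation:

```bash
# Step 1: compile the HLS C++ kernel into a Vitis object file (.xo)
make xo

# Steps 2-3: let RapidStream optimize the .xo and drive Vitis implementation;
# the resulting .xclbin lands under build/
make all

# Optional: software simulation and hardware co-simulation
make csim
make cosim
```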

## Tutorial

### Step 1 (Done): Generate the Xilinx Object File (`.xo`)


We use TAPA to generate the `.xo` file. In case you have not installed TAPA, we have already compiled the C++ source to `.xo` for you: the original C++ source files are located in `design/src`, and the generated `.xo` file can be found at `design/generated/data_decoding.xo`. To compile C++ to `.xo` with TAPA, we use the script [design/run_tapa.sh](design/run_tapa.sh); the detailed commands are shown below. For your convenience, we have also backed up all the metadata generated by TAPA in the `design/generated` directory.

```bash
WORK_DIR=generated
tapac \
--work-dir ${WORK_DIR} \
--top data_decoding \
--part-num xcu280-fsvh2892-2L-e \
--clock-period 3.33 \
-o ${WORK_DIR}/data_decoding.xo \
--connectivity config/link_config.ini \
src/data_decoder.cpp \
2>&1 | tee tapa.log
```

### Step 2: Use RapidStream to Optimize the `.xo` Design

The RapidStream flow conducts design space exploration and generates solutions by taking the TAPA-generated `.xo` file as input.
The RapidStream flow for TAPA requires the following key inputs:

- **Platform**: The Vitis platform (e.g., `xilinx_u280_gen3x16_xdma_1_202211_1`).
- **Device**: A virtual device defined by calling RapidStream APIs based on the platform (e.g., `get_u280_vitis_device_factory`).
- **`.xo` file**: The `.xo` file generated by TAPA.
- **Connectivity** (`.ini`): The configuration file for `v++` (`design/config/run.py/link_config.ini`).
- **Top module name**: The top module name of the kernel.
- **Clock**: All the clocks and their target periods.
- **Flatten module**: Within a design, not all modules need to be optimized. The flatten modules are the target modules RapidStream will optimize.

The Python snippet below shows how we create a RapidStream instance and set up the RapidStream environment.

```Python
from rapidstream import get_u280_vitis_device_factory, RapidStreamTAPA
import os

CURR_DIR = os.path.dirname(os.path.abspath(__file__))
INI_PATH = f"{CURR_DIR}/design/config/link_config.ini"
VITIS_PLATFORM = "xilinx_u280_gen3x16_xdma_1_202211_1"
XO_PATH = f"{CURR_DIR}/design/generated/data_decoding.xo"
kernel_name = "data_decoding"
factory = get_u280_vitis_device_factory(VITIS_PLATFORM)
rs = RapidStreamTAPA(f"{CURR_DIR}/build")
rs.set_virtual_device(factory.generate_virtual_device())
rs.add_xo_file(XO_PATH)
rs.set_vitis_platform(VITIS_PLATFORM)
rs.set_vitis_connectivity_config(INI_PATH)
rs.set_top_module_name(kernel_name)
rs.add_clock("ap_clk", 3.33)
rs.add_flatten_targets([kernel_name])
```

The HBM AXI port connection is described in `design/config/run.py/link_config.ini`.

```ini
[connectivity]
sp=data_decoding.input_port:HBM[0:1]
sp=data_decoding.output_port0_32b_8b:HBM[16:17]
sp=data_decoding.output_port1_16b_8b:HBM[18:19]
sp=data_decoding.output_port2_16b_8b:HBM[20:21]
sp=data_decoding.output_port3_8b:HBM[22:23]
sp=data_decoding.output_port4_Track:HBM[24:25]
```

Because the kernel ports are bound to specific HBM channels, it is necessary to assign each kernel port to the appropriate slot. The Python code below demonstrates this process. For the complete linking details, please refer to the `design/config/run.py/link_config.ini` file.

```Python
# Bind ports to HBM 16-31
right_slot = "SLOT_X1Y0:SLOT_X1Y0"
left_slot = "SLOT_X0Y0:SLOT_X0Y0"
rs.assign_port_to_region(".*input_port.*", left_slot)
rs.assign_port_to_region(".*output_port0_32b_8b.*", right_slot)
rs.assign_port_to_region(".*output_port1_16b_8b.*", right_slot)
rs.assign_port_to_region(".*output_port2_16b_8b.*", right_slot)
rs.assign_port_to_region(".*output_port3_8b.*", right_slot)
rs.assign_port_to_region(".*output_port4_Track.*", right_slot)
rs.assign_port_to_region("s_axi_control_.*", left_slot)
rs.assign_port_to_region("ap_clk", left_slot)
rs.assign_port_to_region("ap_rst_n", left_slot)
rs.assign_port_to_region("interrupt", left_slot)
```

For the complete details, please refer to the [./run.py](./run.py) file. Invoke RapidStream by launching the command below, or simply run `make all`.

```bash
rapidstream run.py
```

If everything goes well, you should get at least one optimized `.xclbin` file.
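
A quick way to confirm the output (the exact location depends on the build directory passed to `RapidStreamTAPA`; with this recipe's settings it lands under `build/`):

```bash
find build -name "*.xclbin"
```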




### Step 3: Check the Group Module Report


RapidStream mandates a clear distinction between communication and computation within user designs.

- In `Group modules`, users are tasked solely with defining inter-submodule communication. For those familiar with Vivado IP Integrator flow, crafting a Group module mirrors the process of connecting IPs in IPI. RapidStream subsequently integrates appropriate pipeline registers into these Group modules.

- In `Leaf modules`, users retain the flexibility to implement diverse computational patterns, as RapidStream leaves these Leaf modules unchanged.

For further details, please consult the [code style](https://docs.rapidstream-da.com/required-coding-style/) section in our Documentation.
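
For illustration only (this sketch is not part of the benchmark), the distinction looks roughly like this in TAPA C++: the upper-level task only declares streams and connects sub-tasks (a Group module), while the sub-tasks hold the actual computation (Leaf modules). The task and stream names here are made up.

```cpp
#include <tapa.h>

// Leaf module: arbitrary computation is allowed here; RapidStream leaves it unchanged.
void Producer(tapa::ostream<int>& out, int n) {
  for (int i = 0; i < n; ++i) out.write(i);
}

// Leaf module: reads from one stream, computes, writes to another.
void AddOne(tapa::istream<int>& in, tapa::ostream<int>& out, int n) {
  for (int i = 0; i < n; ++i) out.write(in.read() + 1);
}

// Leaf module: drains the result back to device memory.
void Consumer(tapa::istream<int>& in, tapa::mmap<int> mem, int n) {
  for (int i = 0; i < n; ++i) mem[i] = in.read();
}

// Group module: no computation, only stream declarations and sub-task invocations,
// so RapidStream is free to insert pipeline registers on these connections.
void Top(tapa::mmap<int> mem, int n) {
  tapa::stream<int> s0("s0");
  tapa::stream<int> s1("s1");
  tapa::task()
      .invoke(Producer, s0, n)
      .invoke(AddOne, s0, s1, n)
      .invoke(Consumer, s1, mem, n);
}
```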

To generate a report on the group types, execute the command below or run `make show_groups`:

```bash
rapidstream ../../../common/util/get_group.py \
-i build/passes/0-imported.json \
-o build/module_types.csv
```

The module types for your design can be found in `build/module_types.csv`. Below, we list the four Group modules. In this design, `data_decoding` serves as a Group module, while the other three modules are added by RapidStream.

| Module Name                         | Group Type       |
|:-----------------------------------:|:----------------:|
| `data_decoding`                     | `grouped_module` |
| `__rs_ap_ctrl_start_ready_pipeline` | `grouped_module` |
| `__rs_ff_pipeline`                  | `grouped_module` |
| `__rs_hs_pipeline`                  | `grouped_module` |
127 changes: 127 additions & 0 deletions benchmarks/tapa_flow/bandwidth23/design/bandwidth-host.cpp
@@ -0,0 +1,127 @@
#include <iostream>
#include <vector>

#include <gflags/gflags.h>
#include <tapa.h>
#include "bandwidth23.h"

using std::clog;
using std::endl;
using std::vector;

DEFINE_string(bitstream, "", "path to bitstream file, run csim if empty");

int main(int argc, char* argv[]) {
gflags::ParseCommandLineFlags(&argc, &argv, /*remove_flags=*/true);

const uint64_t n = argc > 1 ? atoll(argv[1]) : 1024 * 1024;

vector<bit512> rmem0(n);
vector<bit512> rmem1(n);
vector<bit512> rmem2(n);
vector<bit512> rmem3(n);
vector<bit512> rmem4(n);
vector<bit512> rmem5(n);
vector<bit512> rmem6(n);
vector<bit512> rmem7(n);
vector<bit512> rmem8(n);
vector<bit512> rmem9(n);
vector<bit512> rmem10(n);
vector<bit512> rmem11(n);
vector<bit512> rmem12(n);
vector<bit512> rmem13(n);
vector<bit512> rmem14(n);
vector<bit512> rmem15(n);
vector<bit512> rmem16(n);
vector<bit512> rmem17(n);
vector<bit512> rmem18(n);
vector<bit512> rmem19(n);
vector<bit512> rmem20(n);
vector<bit512> rmem21(n);
vector<bit512> rmem22(n);


for (uint64_t i = 0; i < n; ++i) {
rmem0[i] = i;
rmem1[i] = i;
rmem2[i] = i;
rmem3[i] = i;
rmem4[i] = i;
rmem5[i] = i;
rmem6[i] = i;
rmem7[i] = i;
rmem8[i] = i;
rmem9[i] = i;
rmem10[i] = i;
rmem11[i] = i;
rmem12[i] = i;
rmem13[i] = i;
rmem14[i] = i;
rmem15[i] = i;
rmem16[i] = i;
rmem17[i] = i;
rmem18[i] = i;
rmem19[i] = i;
rmem20[i] = i;
rmem21[i] = i;
rmem22[i] = i;
}
int64_t kernel_time_ns = tapa::invoke(
bandwidth23,
FLAGS_bitstream,
tapa::read_write_mmap<bit512>(rmem0),
tapa::read_write_mmap<bit512>(rmem1),
tapa::read_write_mmap<bit512>(rmem2),
tapa::read_write_mmap<bit512>(rmem3),
tapa::read_write_mmap<bit512>(rmem4),
tapa::read_write_mmap<bit512>(rmem5),
tapa::read_write_mmap<bit512>(rmem6),
tapa::read_write_mmap<bit512>(rmem7),
tapa::read_write_mmap<bit512>(rmem8),
tapa::read_write_mmap<bit512>(rmem9),
tapa::read_write_mmap<bit512>(rmem10),
tapa::read_write_mmap<bit512>(rmem11),
tapa::read_write_mmap<bit512>(rmem12),
tapa::read_write_mmap<bit512>(rmem13),
tapa::read_write_mmap<bit512>(rmem14),
tapa::read_write_mmap<bit512>(rmem15),
tapa::read_write_mmap<bit512>(rmem16),
tapa::read_write_mmap<bit512>(rmem17),
tapa::read_write_mmap<bit512>(rmem18),
tapa::read_write_mmap<bit512>(rmem19),
tapa::read_write_mmap<bit512>(rmem20),
tapa::read_write_mmap<bit512>(rmem21),
tapa::read_write_mmap<bit512>(rmem22),
n);

clog << "kernel time: " << kernel_time_ns * 1e-9 << " s" << endl;

uint64_t num_errors = 0;
const uint64_t threshold = 10; // only report up to these errors
for (uint64_t i = 0; i < n; ++i) {
bit512 out512 = (i << 1);
if (rmem0[i] != out512) {
if (num_errors < threshold) {
clog << "error at " << i << ": expected " << out512 << ", got "
<< rmem0[i] << endl;
}
++num_errors;
}
if (rmem22[i] != out512) {
if (num_errors < threshold) {
clog << "error at " << i << ": expected " << out512 << ", got "
<< rmem22[i] << endl;
}
++num_errors;
}
}
if (num_errors == 0) {
clog << "PASS!" << endl;
} else {
if (num_errors > threshold) {
clog << " (+" << (num_errors - threshold) << " more errors)" << endl;
}
clog << "FAIL!" << endl;
}
return num_errors > 0 ? 1 : 0;
}