diff --git a/benchmarks/tapa_flow/bandwidth23/Makefile b/benchmarks/tapa_flow/bandwidth23/Makefile
new file mode 100644
index 00000000..441f8a63
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth23/Makefile
@@ -0,0 +1,114 @@
+# Copyright (c) 2024 RapidStream Design Automation, Inc. and contributors. All rights reserved.
+# The contributor(s) of this file has/have agreed to the RapidStream Contributor License Agreement.
+
+ROOT_DIR := $(shell git rev-parse --show-toplevel)
+KERNEL_NAME := bandwidth23
+RS_SCRIPT := $(CURDIR)/run.py
+SRC_DIR := $(CURDIR)/design
+AB_CONFIG := $(CURDIR)/design/config/$(notdir $(RS_SCRIPT))/ab_config.json
+IMPL_CONFIG := $(CURDIR)/design/config/$(notdir $(RS_SCRIPT))/impl_config.json
+LINK_CONFIG := $(CURDIR)/design/config/$(notdir $(RS_SCRIPT))/link_config.ini
+PLATFORM := xilinx_vck5000_gen4x8_qdma_2_202220_1
+PART_NUM := xcvc1902-vsvd1760-2MP-e-S
+GRP_UTIL := $(ROOT_DIR)/common/util/get_group.py
+TEMP_DIR := $(CURDIR)/build/$(notdir $(RS_SCRIPT))
+RS_TARGET := $(TEMP_DIR)/dse/solution_0/vitis_run_hw/$(KERNEL_NAME)_$(PLATFORM).xclbin
+BUILD_LOG := $(TEMP_DIR)/build.json
+SUCCESS := "Build Successful"
+TIMING_RPT := impl_1_hw_bb_locked_timing_summary_routed.rpt
+SLACK_GETTER := $(ROOT_DIR)/common/util/get_slack.py
+RSPATH := $(CURDIR)
+RSXX := rapidstream
+RSPYTHON := rapidstream
+DEVICE_CONFIG := $(TEMP_DIR)/device.json
+DEVICE_GEN := $(CURDIR)/gen_device.py
+INCLUDE := -I $(XILINX_HLS)/include
+KERNEL_XO := $(TEMP_DIR)/$(KERNEL_NAME).xo
+KERNEL_XCLBIN := $(TEMP_DIR)/$(KERNEL_NAME).xclbin
+KERNEL_XSA := $(TEMP_DIR)/$(KERNEL_NAME).xsa
+TARGET := hw
+
+all: $(RS_TARGET)
+	cd $(RSPATH) && $(RSPYTHON) $(SLACK_GETTER) -d $(TEMP_DIR) -i $(TIMING_RPT) -o $(BUILD_LOG) -c clk_kernel_00_unbuffered_net -p 3.333
+	@echo $(SUCCESS)
+
+$(RS_TARGET):$(KERNEL_XO) $(DEVICE_CONFIG)
+	mkdir -p $(TEMP_DIR)
+	cd $(RSPATH) && $(RSXX)-tapaopt \
+		--work-dir $(TEMP_DIR) \
+		--tapa-xo-path $< \
+		--device-config $(DEVICE_CONFIG) \
+		--floorplan-config $(AB_CONFIG) \
+		--single-reg \
+		--run-impl \
+		--implementation-config $(IMPL_CONFIG) \
+		--connectivity-ini $(LINK_CONFIG)
+
+$(DEVICE_CONFIG):$(AB_CONFIG)
+	mkdir -p $(TEMP_DIR)
+	cd $(RSPATH) && $(RSPYTHON) $(RS_SCRIPT)
+
+cosim:$(KERNEL_XO) $(TEMP_DIR)/main.exe
+	cd $(TEMP_DIR) && $(TEMP_DIR)/main.exe 1024 \
+		--bitstream $< \
+		-xosim_work_dir $(TEMP_DIR)/xosim_work_dir
+
+hw: $(KERNEL_XCLBIN)
+
+$(KERNEL_XCLBIN): $(KERNEL_XSA)
+	@echo "### ***** packaging $(KERNEL_XSA) into $(KERNEL_XCLBIN) ... *****"
+	cd $(TEMP_DIR) && v++ --package -t $(TARGET) --platform $(PLATFORM) \
+		$^ \
+		--temp_dir $(TEMP_DIR) \
+		--save-temps \
+		--report_dir $(TEMP_DIR)/reports/ \
+		--package.boot_mode=ospi \
+		-o $@ 2>&1 | tee $(KERNEL_NAME)_xclbin.log
+	@echo "### ***** $(KERNEL_XCLBIN) packaging done! *****"
+
+$(KERNEL_XSA): $(KERNEL_XO)
+	cd $(TEMP_DIR) && v++ -l -t ${TARGET} \
+		--connectivity.nk $(KERNEL_NAME):1:$(KERNEL_NAME) \
+		--config $(SRC_DIR)/vck5000.cfg \
+		--save-temps \
+		--temp_dir $(TEMP_DIR) \
+		--clock.defaultFreqHz 250000000 \
+		--vivado.synth.jobs 16 \
+		$< -o $@
+
+xo: $(KERNEL_XO)
+
+$(KERNEL_XO):$(SRC_DIR)/$(KERNEL_NAME).cpp
+	mkdir -p $(TEMP_DIR)
+	cd $(TEMP_DIR) && tapa compile \
+		--top $(KERNEL_NAME) \
+		--part-num xcu55c-fsvh2892-2L-e \
+		--clock-period 3.33 \
+		-o $(KERNEL_NAME).xo \
+		-f $< \
+		2>&1 | tee tapa.log
+
+csim:$(TEMP_DIR)/main.exe
+
+$(TEMP_DIR)/main.exe: $(SRC_DIR)/*.cpp
+	mkdir -p $(TEMP_DIR)
+	cd $(TEMP_DIR) && tapa g++ $^ $(INCLUDE) -o $(TEMP_DIR)/main.exe -O2
+	$(TEMP_DIR)/main.exe
+
+show_groups:
+	rapidstream $(GRP_UTIL) -i $(TEMP_DIR)/passes/0-imported.json \
+		-o $(TEMP_DIR)/module_types.csv
+
+clean:
+	rm -rf $(TEMP_DIR) *.log
+	rm -rf .Xil .run
+	rm -rf *.exe
+	rm -rf .ipcache
+
+cleanall:
+	rm -rf build *.log
+	rm -rf .Xil .run
+	rm -rf *.exe
+	rm -rf .ipcache
diff --git a/benchmarks/tapa_flow/bandwidth23/README.md b/benchmarks/tapa_flow/bandwidth23/README.md
new file mode 100644
index 00000000..54f2286e
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth23/README.md
@@ -0,0 +1,141 @@
+
+RapidStream Logo
+
+# TAPA Flow: ORC Decoder
+
+## Introduction
+
+In this recipe, we demonstrate how to use RapidStream to optimize TAPA projects. The basic steps include:
+
+- Compile the HLS C++ code into a Vitis-compatible .xo file using TAPA.
+- Optimize the .xo file with RapidStream to obtain an optimized .xo file.
+- Use Vitis to compile the optimized .xo file into an .xclbin file for FPGA deployment.
+
+## Tutorial
+
+### Step 1 (Done): Generate the Xilinx Object File (`.xo`)
+
+We utilize TAPA to generate the `.xo` file. In case you have not installed TAPA, we have already compiled the C++ source to `.xo` using TAPA. The original C++ source files are located in design/src. The generated `.xo` file can be found at design/generated/data_decoding.xo. To compile C++ to `.xo` using TAPA, we use the script [design/run_tapa.sh](design/run_tapa.sh), with the detailed commands shown below. For your convenience, we have also backed up all the metadata generated by TAPA in the design/generated directory.
+
+```bash
+WORK_DIR=generated
+tapac \
+  --work-dir ${WORK_DIR} \
+  --top data_decoding \
+  --part-num xcu280-fsvh2892-2L-e \
+  --clock-period 3.33 \
+  -o ${WORK_DIR}/data_decoding.xo \
+  --connectivity config/link_config.ini \
+  src/data_decoder.cpp \
+  2>&1 | tee tapa.log
+```
+
+### Step 2: Use RapidStream to Optimize the `.xo` Design
+
+The RapidStream flow conducts design space exploration and generates solutions by taking the TAPA-generated `.xo` file as input.
+The RapidStream flow for TAPA requires the following key inputs:
+
+- **Platform**: The Vitis platform (e.g., `xilinx_u280_gen3x16_xdma_1_202211_1`).
+- **Device**: A virtual device defined by calling RapidStream APIs based on the platform (e.g., `get_u280_vitis_device_factory`).
+- **.xo file**: The `.xo` file generated by TAPA.
+- **Connectivity** (.ini): The configuration file for `v++`, design/config/run.py/link_config.ini.
+- **top_module_name**: The top module name of the kernel.
+- **Clock**: All clocks and their target frequencies.
+- **Flatten Module**: Within a design, not all modules need to be optimized. The flatten module is the target module that RapidStream will optimize.
+
+The Python snippet below shows how we instantiate RapidStream to set up the optimization environment.
+
+```Python
+from rapidstream import get_u280_vitis_device_factory, RapidStreamTAPA
+import os
+
+CURR_DIR = os.path.dirname(os.path.abspath(__file__))
+INI_PATH = f"{CURR_DIR}/design/config/link_config.ini"
+VITIS_PLATFORM = "xilinx_u280_gen3x16_xdma_1_202211_1"
+XO_PATH = f"{CURR_DIR}/design/generated/data_decoding.xo"
+kernel_name = "data_decoding"
+factory = get_u280_vitis_device_factory(VITIS_PLATFORM)
+rs = RapidStreamTAPA(f"{CURR_DIR}/build")
+rs.set_virtual_device(factory.generate_virtual_device())
+rs.add_xo_file(XO_PATH)
+rs.set_vitis_platform(VITIS_PLATFORM)
+rs.set_vitis_connectivity_config(INI_PATH)
+rs.set_top_module_name(kernel_name)
+rs.add_clock("ap_clk", 3.33)
+rs.add_flatten_targets([kernel_name])
+```
+
+The HBM AXI port connection is described in design/config/run.py/link_config.ini.
+
+```bash
+[connectivity]
+sp=data_decoding.input_port:HBM[0:1]
+sp=data_decoding.output_port0_32b_8b:HBM[16:17]
+sp=data_decoding.output_port1_16b_8b:HBM[18:19]
+sp=data_decoding.output_port2_16b_8b:HBM[20:21]
+sp=data_decoding.output_port3_8b:HBM[22:23]
+sp=data_decoding.output_port4_Track:HBM[24:25]
+```
+
+It is therefore necessary to assign the kernel ports to the appropriate slots. The Python code below demonstrates this process. For comprehensive linking details, please refer to the design/config/run.py/link_config.ini file.
+
+```Python
+# Bind ports to HBM 16-31
+right_slot = "SLOT_X1Y0:SLOT_X1Y0"
+left_slot = "SLOT_X0Y0:SLOT_X0Y0"
+rs.assign_port_to_region(".*input_port.*", left_slot)
+rs.assign_port_to_region(".*output_port0_32b_8b.*", right_slot)
+rs.assign_port_to_region(".*output_port1_16b_8b.*", right_slot)
+rs.assign_port_to_region(".*output_port2_16b_8b.*", right_slot)
+rs.assign_port_to_region(".*output_port3_8b.*", right_slot)
+rs.assign_port_to_region(".*output_port4_Track.*", right_slot)
+rs.assign_port_to_region("s_axi_control_.*", left_slot)
+rs.assign_port_to_region("ap_clk", left_slot)
+rs.assign_port_to_region("ap_rst_n", left_slot)
+rs.assign_port_to_region("interrupt", left_slot)
+```
+
+For complete details, please refer to the [./run.py](./run.py) file. Invoke RapidStream with the command below, or run `make all`:
+
+```bash
+rapidstream run.py
+```
+
+If everything is successful, you should get at least one optimized `.xclbin` file.
+
+
+### Step 3: Check the Group Module Report
+
+RapidStream mandates a clear distinction between communication and computation within user designs.
+
+- In `Group modules`, users are tasked solely with defining inter-submodule communication. For those familiar with the Vivado IP Integrator flow, crafting a Group module mirrors the process of connecting IPs in IPI. RapidStream subsequently integrates appropriate pipeline registers into these Group modules.
+
+- In `Leaf modules`, users retain the flexibility to implement diverse computational patterns, as RapidStream leaves these Leaf modules unchanged. A sketch contrasting the two roles is shown below.
+
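+A minimal C++ sketch of the two roles, adapted from this recipe's design/bandwidth23.cpp (the names `yshift1` and `shift_one_channel` are illustrative only, and the `Mmap2Stream`/`Stream2Mmap` helpers from that file are assumed to be in scope; see the source for the full 23-channel version):
+
+```cpp
+#include <tapa.h>
+#include "bandwidth23.h"  // defines bit512 as ap_uint<512>
+
+// Leaf module: performs computation (a one-bit left shift per 512-bit word).
+// RapidStream leaves its body untouched.
+void yshift1(tapa::istream<bit512>& a, tapa::ostream<bit512>& b, uint64_t n) {
+  for (uint64_t i = 0; i < n; ++i) b.write(a.read() << 1);
+}
+
+// Group module: only instantiates tasks and connects streams, so RapidStream
+// is free to insert pipeline registers on the connections between them.
+void shift_one_channel(tapa::mmap<bit512> ch, uint64_t n) {
+  tapa::stream<bit512> qr("qr");
+  tapa::stream<bit512> qw("qw");
+  tapa::task()
+      .invoke(Mmap2Stream, ch, n, qr)
+      .invoke(yshift1, qr, qw, n)
+      .invoke(Stream2Mmap, qw, ch, n);
+}
+```
+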
+For further details, please consult the [code style](https://docs.rapidstream-da.com/required-coding-style/) section in our documentation.
+
+To generate a report on group types, execute the commands below or run `make show_groups`:
+
+```bash
+rapidstream ../../../common/util/get_group.py \
+  -i build/passes/0-imported.json \
+  -o build/module_types.csv
+```
+
+The module types for your design can be found in `build/module_types.csv`. Below, we list the four Group modules. In this design, `data_decoding` serves as a Group module, while the other three modules are added by RapidStream.
+
+| Module Name                        | Group Type     |
+|:----------------------------------:|:--------------:|
+| data_decoding                      | grouped_module |
+| __rs_ap_ctrl_start_ready_pipeline  | grouped_module |
+| __rs_ff_pipeline                   | grouped_module |
+| __rs_hs_pipeline                   | grouped_module |
diff --git a/benchmarks/tapa_flow/bandwidth23/design/bandwidth-host.cpp b/benchmarks/tapa_flow/bandwidth23/design/bandwidth-host.cpp
new file mode 100644
index 00000000..9471ab2d
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth23/design/bandwidth-host.cpp
@@ -0,0 +1,127 @@
+#include <iostream>
+#include <vector>
+
+#include <gflags/gflags.h>
+#include <tapa.h>
+#include "bandwidth23.h"
+
+using std::clog;
+using std::endl;
+using std::vector;
+
+DEFINE_string(bitstream, "", "path to bitstream file, run csim if empty");
+
+int main(int argc, char* argv[]) {
+  gflags::ParseCommandLineFlags(&argc, &argv, /*remove_flags=*/true);
+
+  const uint64_t n = argc > 1 ? atoll(argv[1]) : 1024 * 1024;
+
+  vector<bit512> rmem0(n);
+  vector<bit512> rmem1(n);
+  vector<bit512> rmem2(n);
+  vector<bit512> rmem3(n);
+  vector<bit512> rmem4(n);
+  vector<bit512> rmem5(n);
+  vector<bit512> rmem6(n);
+  vector<bit512> rmem7(n);
+  vector<bit512> rmem8(n);
+  vector<bit512> rmem9(n);
+  vector<bit512> rmem10(n);
+  vector<bit512> rmem11(n);
+  vector<bit512> rmem12(n);
+  vector<bit512> rmem13(n);
+  vector<bit512> rmem14(n);
+  vector<bit512> rmem15(n);
+  vector<bit512> rmem16(n);
+  vector<bit512> rmem17(n);
+  vector<bit512> rmem18(n);
+  vector<bit512> rmem19(n);
+  vector<bit512> rmem20(n);
+  vector<bit512> rmem21(n);
+  vector<bit512> rmem22(n);
+
+  for (uint64_t i = 0; i < n; ++i) {
+    rmem0[i] = i;
+    rmem1[i] = i;
+    rmem2[i] = i;
+    rmem3[i] = i;
+    rmem4[i] = i;
+    rmem5[i] = i;
+    rmem6[i] = i;
+    rmem7[i] = i;
+    rmem8[i] = i;
+    rmem9[i] = i;
+    rmem10[i] = i;
+    rmem11[i] = i;
+    rmem12[i] = i;
+    rmem13[i] = i;
+    rmem14[i] = i;
+    rmem15[i] = i;
+    rmem16[i] = i;
+    rmem17[i] = i;
+    rmem18[i] = i;
+    rmem19[i] = i;
+    rmem20[i] = i;
+    rmem21[i] = i;
+    rmem22[i] = i;
+  }
+  int64_t kernel_time_ns = tapa::invoke(
+      bandwidth23,
+      FLAGS_bitstream,
+      tapa::read_write_mmap<bit512>(rmem0),
+      tapa::read_write_mmap<bit512>(rmem1),
+      tapa::read_write_mmap<bit512>(rmem2),
+      tapa::read_write_mmap<bit512>(rmem3),
+      tapa::read_write_mmap<bit512>(rmem4),
+      tapa::read_write_mmap<bit512>(rmem5),
+      tapa::read_write_mmap<bit512>(rmem6),
+      tapa::read_write_mmap<bit512>(rmem7),
+      tapa::read_write_mmap<bit512>(rmem8),
+      tapa::read_write_mmap<bit512>(rmem9),
+      tapa::read_write_mmap<bit512>(rmem10),
+      tapa::read_write_mmap<bit512>(rmem11),
+      tapa::read_write_mmap<bit512>(rmem12),
+      tapa::read_write_mmap<bit512>(rmem13),
+      tapa::read_write_mmap<bit512>(rmem14),
+      tapa::read_write_mmap<bit512>(rmem15),
+      tapa::read_write_mmap<bit512>(rmem16),
+      tapa::read_write_mmap<bit512>(rmem17),
+      tapa::read_write_mmap<bit512>(rmem18),
+      tapa::read_write_mmap<bit512>(rmem19),
+      tapa::read_write_mmap<bit512>(rmem20),
+      tapa::read_write_mmap<bit512>(rmem21),
+      tapa::read_write_mmap<bit512>(rmem22),
+      n);
+
+  clog << "kernel time: " << kernel_time_ns * 1e-9 << " s" << endl;
+
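+  // The kernel shifts every 512-bit word left by one bit, so a word
+  // initialized to i must read back as (i << 1); the loop below spot-checks
+  // the first (rmem0) and last (rmem22) channels.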
+  uint64_t num_errors = 0;
+  const uint64_t threshold = 10;  // only report up to this many errors
+  for (uint64_t i = 0; i < n; ++i) {
+    bit512 out512 = (i << 1);
+    if (rmem0[i] != out512) {
+      if (num_errors < threshold) {
+        clog << "error at " << i << ": expected " << rmem0[i] << ", got "
+             << out512 << endl;
+      }
+      ++num_errors;
+    }
+    if (rmem22[i] != out512) {
+      if (num_errors < threshold) {
+        clog << "error at " << i << ": expected " << rmem22[i] << ", got "
+             << out512 << endl;
+      }
+      ++num_errors;
+    }
+  }
+  if (num_errors == 0) {
+    clog << "PASS!" << endl;
+  } else {
+    if (num_errors > threshold) {
+      clog << " (+" << (num_errors - threshold) << " more errors)" << endl;
+    }
+    clog << "FAIL!" << endl;
+  }
+  return num_errors > 0 ? 1 : 0;
+}
diff --git a/benchmarks/tapa_flow/bandwidth23/design/bandwidth23.cpp b/benchmarks/tapa_flow/bandwidth23/design/bandwidth23.cpp
new file mode 100644
index 00000000..e2d60f9d
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth23/design/bandwidth23.cpp
@@ -0,0 +1,176 @@
+#include <cstdint>
+
+#include <tapa.h>
+#include "bandwidth23.h"
+
+void yshift(tapa::istream<bit512>& a, tapa::ostream<bit512>& b, uint64_t n) {
+  for (uint64_t i = 0; i < n; ++i) {
+    bit512 tmp;
+    tmp = a.read();
+    tmp = (tmp << 1);
+    b.write(tmp);
+  }
+}
+
+void Mmap2Stream(
+    tapa::mmap<bit512> mmap,
+    uint64_t n,
+    tapa::ostream<bit512>& stream) {
+
+  for (uint64_t i = 0; i < n; ++i) {
+    stream << mmap[i];
+  }
+}
+
+void Stream2Mmap(tapa::istream<bit512>& stream, tapa::mmap<bit512> mmap,
+                 uint64_t n) {
+  for (uint64_t i = 0; i < n; ++i) {
+    mmap[i] = stream.read();
+  }
+}
+
+void bandwidth23(
+    tapa::mmap<bit512> ch_0,
+    tapa::mmap<bit512> ch_1,
+    tapa::mmap<bit512> ch_2,
+    tapa::mmap<bit512> ch_3,
+    tapa::mmap<bit512> ch_4,
+    tapa::mmap<bit512> ch_5,
+    tapa::mmap<bit512> ch_6,
+    tapa::mmap<bit512> ch_7,
+    tapa::mmap<bit512> ch_8,
+    tapa::mmap<bit512> ch_9,
+    tapa::mmap<bit512> ch_10,
+    tapa::mmap<bit512> ch_11,
+    tapa::mmap<bit512> ch_12,
+    tapa::mmap<bit512> ch_13,
+    tapa::mmap<bit512> ch_14,
+    tapa::mmap<bit512> ch_15,
+    tapa::mmap<bit512> ch_16,
+    tapa::mmap<bit512> ch_17,
+    tapa::mmap<bit512> ch_18,
+    tapa::mmap<bit512> ch_19,
+    tapa::mmap<bit512> ch_20,
+    tapa::mmap<bit512> ch_21,
+    tapa::mmap<bit512> ch_22,
+    uint64_t n) {
+
+  tapa::stream<bit512> qr0("qr0");
+  tapa::stream<bit512> qr1("qr1");
+  tapa::stream<bit512> qr2("qr2");
+  tapa::stream<bit512> qr3("qr3");
+  tapa::stream<bit512> qr4("qr4");
+  tapa::stream<bit512> qr5("qr5");
+  tapa::stream<bit512> qr6("qr6");
+  tapa::stream<bit512> qr7("qr7");
+  tapa::stream<bit512> qr8("qr8");
+  tapa::stream<bit512> qr9("qr9");
+  tapa::stream<bit512> qr10("qr10");
+  tapa::stream<bit512> qr11("qr11");
+  tapa::stream<bit512> qr12("qr12");
+  tapa::stream<bit512> qr13("qr13");
+  tapa::stream<bit512> qr14("qr14");
+  tapa::stream<bit512> qr15("qr15");
+  tapa::stream<bit512> qr16("qr16");
+  tapa::stream<bit512> qr17("qr17");
+  tapa::stream<bit512> qr18("qr18");
+  tapa::stream<bit512> qr19("qr19");
+  tapa::stream<bit512> qr20("qr20");
+  tapa::stream<bit512> qr21("qr21");
+  tapa::stream<bit512> qr22("qr22");
+
+  tapa::stream<bit512> qw0("qw0");
+  tapa::stream<bit512> qw1("qw1");
+  tapa::stream<bit512> qw2("qw2");
+  tapa::stream<bit512> qw3("qw3");
+  tapa::stream<bit512> qw4("qw4");
+  tapa::stream<bit512> qw5("qw5");
+  tapa::stream<bit512> qw6("qw6");
+  tapa::stream<bit512> qw7("qw7");
+  tapa::stream<bit512> qw8("qw8");
+  tapa::stream<bit512> qw9("qw9");
+  tapa::stream<bit512> qw10("qw10");
+  tapa::stream<bit512> qw11("qw11");
+  tapa::stream<bit512> qw12("qw12");
+  tapa::stream<bit512> qw13("qw13");
+  tapa::stream<bit512> qw14("qw14");
+  tapa::stream<bit512> qw15("qw15");
+  tapa::stream<bit512> qw16("qw16");
+  tapa::stream<bit512> qw17("qw17");
+  tapa::stream<bit512> qw18("qw18");
+  tapa::stream<bit512> qw19("qw19");
+  tapa::stream<bit512> qw20("qw20");
+  tapa::stream<bit512> qw21("qw21");
+  tapa::stream<bit512> qw22("qw22");
+
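+  // Task graph: 23 independent read -> shift -> write pipelines, one per
+  // memory channel, all launched in parallel to stress memory bandwidth.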
+  tapa::task()
+      .invoke(Mmap2Stream, ch_0, n, qr0)
+      .invoke(Mmap2Stream, ch_1, n, qr1)
+      .invoke(Mmap2Stream, ch_2, n, qr2)
+      .invoke(Mmap2Stream, ch_3, n, qr3)
+      .invoke(Mmap2Stream, ch_4, n, qr4)
+      .invoke(Mmap2Stream, ch_5, n, qr5)
+      .invoke(Mmap2Stream, ch_6, n, qr6)
+      .invoke(Mmap2Stream, ch_7, n, qr7)
+      .invoke(Mmap2Stream, ch_8, n, qr8)
+      .invoke(Mmap2Stream, ch_9, n, qr9)
+      .invoke(Mmap2Stream, ch_10, n, qr10)
+      .invoke(Mmap2Stream, ch_11, n, qr11)
+      .invoke(Mmap2Stream, ch_12, n, qr12)
+      .invoke(Mmap2Stream, ch_13, n, qr13)
+      .invoke(Mmap2Stream, ch_14, n, qr14)
+      .invoke(Mmap2Stream, ch_15, n, qr15)
+      .invoke(Mmap2Stream, ch_16, n, qr16)
+      .invoke(Mmap2Stream, ch_17, n, qr17)
+      .invoke(Mmap2Stream, ch_18, n, qr18)
+      .invoke(Mmap2Stream, ch_19, n, qr19)
+      .invoke(Mmap2Stream, ch_20, n, qr20)
+      .invoke(Mmap2Stream, ch_21, n, qr21)
+      .invoke(Mmap2Stream, ch_22, n, qr22)
+      .invoke(yshift, qr0, qw0, n)
+      .invoke(yshift, qr1, qw1, n)
+      .invoke(yshift, qr2, qw2, n)
+      .invoke(yshift, qr3, qw3, n)
+      .invoke(yshift, qr4, qw4, n)
+      .invoke(yshift, qr5, qw5, n)
+      .invoke(yshift, qr6, qw6, n)
+      .invoke(yshift, qr7, qw7, n)
+      .invoke(yshift, qr8, qw8, n)
+      .invoke(yshift, qr9, qw9, n)
+      .invoke(yshift, qr10, qw10, n)
+      .invoke(yshift, qr11, qw11, n)
+      .invoke(yshift, qr12, qw12, n)
+      .invoke(yshift, qr13, qw13, n)
+      .invoke(yshift, qr14, qw14, n)
+      .invoke(yshift, qr15, qw15, n)
+      .invoke(yshift, qr16, qw16, n)
+      .invoke(yshift, qr17, qw17, n)
+      .invoke(yshift, qr18, qw18, n)
+      .invoke(yshift, qr19, qw19, n)
+      .invoke(yshift, qr20, qw20, n)
+      .invoke(yshift, qr21, qw21, n)
+      .invoke(yshift, qr22, qw22, n)
+      .invoke(Stream2Mmap, qw0, ch_0, n)
+      .invoke(Stream2Mmap, qw1, ch_1, n)
+      .invoke(Stream2Mmap, qw2, ch_2, n)
+      .invoke(Stream2Mmap, qw3, ch_3, n)
+      .invoke(Stream2Mmap, qw4, ch_4, n)
+      .invoke(Stream2Mmap, qw5, ch_5, n)
+      .invoke(Stream2Mmap, qw6, ch_6, n)
+      .invoke(Stream2Mmap, qw7, ch_7, n)
+      .invoke(Stream2Mmap, qw8, ch_8, n)
+      .invoke(Stream2Mmap, qw9, ch_9, n)
+      .invoke(Stream2Mmap, qw10, ch_10, n)
+      .invoke(Stream2Mmap, qw11, ch_11, n)
+      .invoke(Stream2Mmap, qw12, ch_12, n)
+      .invoke(Stream2Mmap, qw13, ch_13, n)
+      .invoke(Stream2Mmap, qw14, ch_14, n)
+      .invoke(Stream2Mmap, qw15, ch_15, n)
+      .invoke(Stream2Mmap, qw16, ch_16, n)
+      .invoke(Stream2Mmap, qw17, ch_17, n)
+      .invoke(Stream2Mmap, qw18, ch_18, n)
+      .invoke(Stream2Mmap, qw19, ch_19, n)
+      .invoke(Stream2Mmap, qw20, ch_20, n)
+      .invoke(Stream2Mmap, qw21, ch_21, n)
+      .invoke(Stream2Mmap, qw22, ch_22, n);
+}
diff --git a/benchmarks/tapa_flow/bandwidth23/design/bandwidth23.h b/benchmarks/tapa_flow/bandwidth23/design/bandwidth23.h
new file mode 100644
index 00000000..5686779d
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth23/design/bandwidth23.h
@@ -0,0 +1,37 @@
+
+#ifndef __VADD_BW_H__
+#define __VADD_BW_H__
+#include <cstdint>
+
+#include <ap_int.h>
+#include <tapa.h>
+
+typedef ap_uint<512> bit512;
+
+void bandwidth23(
+    tapa::mmap<bit512> ch_0,
+    tapa::mmap<bit512> ch_1,
+    tapa::mmap<bit512> ch_2,
+    tapa::mmap<bit512> ch_3,
+    tapa::mmap<bit512> ch_4,
+    tapa::mmap<bit512> ch_5,
+    tapa::mmap<bit512> ch_6,
+    tapa::mmap<bit512> ch_7,
+    tapa::mmap<bit512> ch_8,
+    tapa::mmap<bit512> ch_9,
+    tapa::mmap<bit512> ch_10,
+    tapa::mmap<bit512> ch_11,
+    tapa::mmap<bit512> ch_12,
+    tapa::mmap<bit512> ch_13,
+    tapa::mmap<bit512> ch_14,
+    tapa::mmap<bit512> ch_15,
+    tapa::mmap<bit512> ch_16,
+    tapa::mmap<bit512> ch_17,
+    tapa::mmap<bit512> ch_18,
+    tapa::mmap<bit512> ch_19,
+    tapa::mmap<bit512> ch_20,
+    tapa::mmap<bit512> ch_21,
+    tapa::mmap<bit512> ch_22,
+    uint64_t n);
+
+#endif
diff --git a/benchmarks/tapa_flow/bandwidth23/design/config/run_au55c.py/ab_config.json b/benchmarks/tapa_flow/bandwidth23/design/config/run_au55c.py/ab_config.json
new file mode 100644
index 00000000..5676d8e8
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth23/design/config/run_au55c.py/ab_config.json
@@ -0,0 +1,34 @@
+{
+    "dse_range_max": 0.8,
+    "dse_range_min": 0.7,
+    "partition_strategy": "flat",
+    "port_pre_assignments": {
+        ".*ch_0_.*": "SLOT_X0Y0:SLOT_X0Y0",
+        ".*ch_10_.*": "SLOT_X0Y0:SLOT_X0Y0",
+        ".*ch_11_.*": "SLOT_X0Y0:SLOT_X0Y0",
+        ".*ch_12_.*": "SLOT_X0Y0:SLOT_X0Y0",
+        ".*ch_13_.*": "SLOT_X0Y0:SLOT_X0Y0",
+        ".*ch_14_.*": "SLOT_X0Y0:SLOT_X0Y0",
+        ".*ch_15_.*": "SLOT_X0Y0:SLOT_X0Y0",
+        ".*ch_16_.*": "SLOT_X0Y0:SLOT_X0Y0",
+        ".*ch_17_.*": "SLOT_X0Y0:SLOT_X0Y0",
+        ".*ch_18_.*": "SLOT_X0Y0:SLOT_X0Y0",
+        ".*ch_19_.*": "SLOT_X0Y0:SLOT_X0Y0",
+        ".*ch_1_.*": "SLOT_X0Y0:SLOT_X0Y0",
+        ".*ch_20_.*": "SLOT_X0Y0:SLOT_X0Y0",
+        ".*ch_21_.*": "SLOT_X0Y0:SLOT_X0Y0",
+        ".*ch_22_.*": "SLOT_X0Y0:SLOT_X0Y0",
+        ".*ch_2_.*": "SLOT_X0Y0:SLOT_X0Y0",
+        ".*ch_3_.*": "SLOT_X0Y0:SLOT_X0Y0",
+        ".*ch_4_.*": "SLOT_X0Y0:SLOT_X0Y0",
+        ".*ch_5_.*": "SLOT_X0Y0:SLOT_X0Y0",
+        ".*ch_6_.*": "SLOT_X0Y0:SLOT_X0Y0",
+        ".*ch_7_.*": "SLOT_X0Y0:SLOT_X0Y0",
+        ".*ch_8_.*": "SLOT_X0Y0:SLOT_X0Y0",
+        ".*ch_9_.*": "SLOT_X0Y0:SLOT_X0Y0",
+        "ap_clk": "SLOT_X0Y0:SLOT_X0Y0",
+        "ap_rst_n": "SLOT_X0Y0:SLOT_X0Y0",
+        "interrupt": "SLOT_X0Y0:SLOT_X0Y0",
+        "s_axi_control_.*": "SLOT_X0Y0:SLOT_X0Y0"
+    }
+}
diff --git a/benchmarks/tapa_flow/bandwidth23/design/config/run_au55c.py/impl_config.json b/benchmarks/tapa_flow/bandwidth23/design/config/run_au55c.py/impl_config.json
new file mode 100644
index 00000000..3c481977
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth23/design/config/run_au55c.py/impl_config.json
@@ -0,0 +1,7 @@
+{
+    "max_workers": 2,
+    "port_to_clock_period": {
+        "ap_clk": 3.33
+    },
+    "vitis_platform": "xilinx_u55c_gen3x16_xdma_3_202210_1"
+}
diff --git a/benchmarks/tapa_flow/bandwidth23/design/config/run_au55c.py/link_config.ini b/benchmarks/tapa_flow/bandwidth23/design/config/run_au55c.py/link_config.ini
new file mode 100644
index 00000000..c19a4a5a
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth23/design/config/run_au55c.py/link_config.ini
@@ -0,0 +1,24 @@
+[connectivity]
+sp=bandwidth23.ch_0:HBM[0:1]
+sp=bandwidth23.ch_1:HBM[0:1]
+sp=bandwidth23.ch_2:HBM[0:1]
+sp=bandwidth23.ch_3:HBM[0:1]
+sp=bandwidth23.ch_4:HBM[0:1]
+sp=bandwidth23.ch_5:HBM[0:1]
+sp=bandwidth23.ch_6:HBM[0:1]
+sp=bandwidth23.ch_7:HBM[0:1]
+sp=bandwidth23.ch_8:HBM[0:1]
+sp=bandwidth23.ch_9:HBM[0:1]
+sp=bandwidth23.ch_10:HBM[0:1]
+sp=bandwidth23.ch_11:HBM[0:1]
+sp=bandwidth23.ch_12:HBM[0:1]
+sp=bandwidth23.ch_13:HBM[0:1]
+sp=bandwidth23.ch_14:HBM[0:1]
+sp=bandwidth23.ch_15:HBM[0:1]
+sp=bandwidth23.ch_16:HBM[0:1]
+sp=bandwidth23.ch_17:HBM[0:1]
+sp=bandwidth23.ch_18:HBM[0:1]
+sp=bandwidth23.ch_19:HBM[0:1]
+sp=bandwidth23.ch_20:HBM[0:1]
+sp=bandwidth23.ch_21:HBM[0:1]
+sp=bandwidth23.ch_22:HBM[0:1]
diff --git a/benchmarks/tapa_flow/bandwidth23/design/config/run_vck5000.py/ab_config.json b/benchmarks/tapa_flow/bandwidth23/design/config/run_vck5000.py/ab_config.json
new file mode 100644
index 00000000..b9325669
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth23/design/config/run_vck5000.py/ab_config.json
@@ -0,0 +1,34 @@
+{
+    "dse_range_max": 0.8,
+    "dse_range_min": 0.7,
+    "partition_strategy": "flat",
+    "port_pre_assignments": {
+        ".*ch_0_.*": "SLOT_X0Y0:SLOT_X0Y0",
+        ".*ch_10_.*": "SLOT_X1Y0:SLOT_X1Y0",
+        ".*ch_11_.*": "SLOT_X1Y0:SLOT_X1Y0",
+        ".*ch_12_.*": "SLOT_X1Y0:SLOT_X1Y0",
+        ".*ch_13_.*": "SLOT_X1Y0:SLOT_X1Y0",
+        ".*ch_14_.*": "SLOT_X1Y0:SLOT_X1Y0",
+        ".*ch_15_.*": "SLOT_X1Y0:SLOT_X1Y0",
+        ".*ch_16_.*": "SLOT_X1Y0:SLOT_X1Y0",
+        ".*ch_17_.*": "SLOT_X1Y1:SLOT_X1Y1",
+        ".*ch_18_.*": "SLOT_X1Y1:SLOT_X1Y1",
+        ".*ch_19_.*": "SLOT_X1Y1:SLOT_X1Y1",
+        ".*ch_1_.*": "SLOT_X0Y0:SLOT_X0Y0",
+        ".*ch_20_.*": "SLOT_X1Y1:SLOT_X1Y1",
+        ".*ch_21_.*": "SLOT_X1Y1:SLOT_X1Y1",
+        ".*ch_22_.*": "SLOT_X1Y1:SLOT_X1Y1",
+        ".*ch_2_.*": "SLOT_X0Y0:SLOT_X0Y0",
+        ".*ch_3_.*": "SLOT_X0Y0:SLOT_X0Y0",
+        ".*ch_4_.*": "SLOT_X0Y0:SLOT_X0Y0",
+        ".*ch_5_.*": "SLOT_X0Y0:SLOT_X0Y0",
+        ".*ch_6_.*": "SLOT_X0Y0:SLOT_X0Y0",
+        ".*ch_7_.*": "SLOT_X0Y1:SLOT_X0Y1",
+        ".*ch_8_.*": "SLOT_X0Y1:SLOT_X0Y1",
+        ".*ch_9_.*": "SLOT_X0Y1:SLOT_X0Y1",
+        "ap_clk": "SLOT_X0Y0:SLOT_X0Y0",
+        "ap_rst_n": "SLOT_X0Y0:SLOT_X0Y0",
+        "interrupt": "SLOT_X0Y0:SLOT_X0Y0",
+        "s_axi_control_.*": "SLOT_X0Y0:SLOT_X0Y0"
+    }
+}
diff --git a/benchmarks/tapa_flow/bandwidth23/design/config/run_vck5000.py/impl_config.json b/benchmarks/tapa_flow/bandwidth23/design/config/run_vck5000.py/impl_config.json
new file mode 100644
index 00000000..9b47f4ca
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth23/design/config/run_vck5000.py/impl_config.json
@@ -0,0 +1,7 @@
+{
+    "max_workers": 2,
+    "port_to_clock_period": {
+        "ap_clk": 3.33
+    },
+    "vitis_platform": "xilinx_vck5000_gen4x8_qdma_2_202220_1"
+}
diff --git a/benchmarks/tapa_flow/bandwidth23/design/config/run_vck5000.py/link_config.ini b/benchmarks/tapa_flow/bandwidth23/design/config/run_vck5000.py/link_config.ini
new file mode 100644
index 00000000..cf375c2d
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth23/design/config/run_vck5000.py/link_config.ini
@@ -0,0 +1,27 @@
+platform=xilinx_vck5000_gen4x8_qdma_2_202220_1
+
+[connectivity]
+
+sp = bandwidth23.m_axi_ch_0:MC_NOC0
+sp = bandwidth23.m_axi_ch_1:MC_NOC0
+sp = bandwidth23.m_axi_ch_2:MC_NOC0
+sp = bandwidth23.m_axi_ch_3:MC_NOC0
+sp = bandwidth23.m_axi_ch_4:MC_NOC0
+sp = bandwidth23.m_axi_ch_5:MC_NOC0
+sp = bandwidth23.m_axi_ch_6:MC_NOC0
+sp = bandwidth23.m_axi_ch_7:MC_NOC0
+sp = bandwidth23.m_axi_ch_8:MC_NOC0
+sp = bandwidth23.m_axi_ch_9:MC_NOC0
+sp = bandwidth23.m_axi_ch_10:MC_NOC0
+sp = bandwidth23.m_axi_ch_11:MC_NOC0
+sp = bandwidth23.m_axi_ch_12:MC_NOC0
+sp = bandwidth23.m_axi_ch_13:MC_NOC0
+sp = bandwidth23.m_axi_ch_14:MC_NOC0
+sp = bandwidth23.m_axi_ch_15:MC_NOC0
+sp = bandwidth23.m_axi_ch_16:MC_NOC0
+sp = bandwidth23.m_axi_ch_17:MC_NOC0
+sp = bandwidth23.m_axi_ch_18:MC_NOC0
+sp = bandwidth23.m_axi_ch_19:MC_NOC0
+sp = bandwidth23.m_axi_ch_20:MC_NOC0
+sp = bandwidth23.m_axi_ch_21:MC_NOC0
+sp = bandwidth23.m_axi_ch_22:MC_NOC0
diff --git a/benchmarks/tapa_flow/bandwidth23/design/run_tapa.sh b/benchmarks/tapa_flow/bandwidth23/design/run_tapa.sh
new file mode 100644
index 00000000..0071559b
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth23/design/run_tapa.sh
@@ -0,0 +1,9 @@
+WORK_DIR=work.out
+
+tapa compile \
+  --top data_decoding \
+  --part-num xcu55c-fsvh2892-2L-e \
+  --clock-period 3.33 \
+  -o ${WORK_DIR}/data_decoding.xo \
+  -f src/data_decoder.cpp \
+  2>&1 | tee tapa.log
diff --git a/benchmarks/tapa_flow/bandwidth23/run_au55c.py b/benchmarks/tapa_flow/bandwidth23/run_au55c.py
new file mode 100644
index 00000000..8ea706e5
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth23/run_au55c.py
@@ -0,0 +1,42 @@
+__copyright__ = """
+Copyright (c) 2024 RapidStream Design Automation, Inc. and contributors. All rights reserved.
+The contributor(s) of this file has/have agreed to the RapidStream Contributor License Agreement.
+"""
+
+from rapidstream import get_u55c_vitis_device_factory
+import os
+from pathlib import Path
+
+CURR_DIR = os.path.dirname(os.path.abspath(__file__))
+CURR_FILE = os.path.basename(__file__)
+
+VITIS_PLATFORM = "xilinx_u55c_gen3x16_xdma_3_202210_1"
+XO_PATH = f"{CURR_DIR}/design/generated/data_decoding.xo"
+
+factory = get_u55c_vitis_device_factory(VITIS_PLATFORM)
+
+# Reserve resource for the HBM Memory Sub-System.
+# The HMSS is not part of the user kernel so the partition optimization process
+# is unaware of its existence. We need to manually reserve resources for it.
+# For 512-bit HBM channels, each HBM channel uses approximately the following resources:
+# AREA_PER_HBM_CHANNEL = {
+#     "LUT": 5000,
+#     "FF": 6500,
+#     "BRAM": 0,
+#     "URAM": 0,
+#     "DSP": 0,
+# }
+factory.reduce_slot_area(0, 0, lut=150800)
+factory.reduce_slot_area(0, 1, lut=146960)
+factory.reduce_slot_area(0, 2, lut=146960)
+factory.reduce_slot_area(1, 0, lut=128000)
+factory.reduce_slot_area(1, 1, lut=107840)
+factory.reduce_slot_area(1, 2, lut=115120)
+
+
+# For this U55C platform, the rightmost DSP column on the boundary between
+# the dynamic/static region is not usable. So we need to adjust the DSP count
+# to reflect the actual available DSPs.
+print("Reducing DSP of (1, 1) to make it less congested")
+factory.reduce_slot_area(1, 1, dsp=100)
+factory.generate_virtual_device(Path(f"{CURR_DIR}/build/{CURR_FILE}/device.json"))
diff --git a/benchmarks/tapa_flow/bandwidth23/run_vck5000.py b/benchmarks/tapa_flow/bandwidth23/run_vck5000.py
new file mode 100644
index 00000000..ae36f962
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth23/run_vck5000.py
@@ -0,0 +1,84 @@
+__copyright__ = """
+Copyright (c) 2024 RapidStream Design Automation, Inc. and contributors. All rights reserved.
+The contributor(s) of this file has/have agreed to the RapidStream Contributor License Agreement.
+"""
+
+import os
+from pathlib import Path
+
+from rapidstream import DeviceFactory
+
+CURR_DIR = os.path.dirname(os.path.abspath(__file__))
+CURR_FILE = os.path.basename(__file__)
+
+VITIS_PLATFORM = "xilinx_vck5000_gen4x8_qdma_2_202220_1"
+VCK5000_PART_NAME = "xcvc1902-vsvd1760-2MP-e-S"
+
+
+factory = DeviceFactory(row=2, col=2, part_num=VCK5000_PART_NAME, board_name=None)
+
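+# The VCK5000 is modeled as a 2x2 grid of slots, each initially spanning a
+# 4x4 block of clock regions; the explicit set_slot_pblock() calls further
+# down override these initial pblocks with the clock regions actually
+# available to this platform's dynamic region.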
+for x in range(2):
+    for y in range(2):
+        pblock = f"-add CLOCKREGION_X{x*4}Y{y*4}:CLOCKREGION_X{x*4+3}Y{y*4+3}"
+        factory.set_slot_pblock(x, y, [pblock])
+
+
+# set SLR crossing capacity
+for x in range(2):
+    factory.set_slot_capacity(x, 0, north=11520)
+    factory.set_slot_capacity(x, 1, north=11520)
+
+    factory.set_slot_capacity(x, 1, south=11520)
+    # factory.set_slot_capacity(x, 2, south=11520)
+
+# Set W/E capacity
+for y in range(2):
+    factory.set_slot_capacity(0, y, east=40320)
+    factory.set_slot_capacity(1, y, west=40320)
+# factory.set_slot_capacity(0, 2, east=41178)
+# factory.set_slot_capacity(1, 2, west=41178)
+
+
+factory.set_platform_name(VITIS_PLATFORM)
+factory.set_user_pblock_name("pblock_dynamic_region")
+
+factory.set_slot_pblock(0, 0, ["-add CLOCKREGION_X0Y1:CLOCKREGION_X4Y2"])
+factory.set_slot_pblock(1, 0, ["-add CLOCKREGION_X5Y1:CLOCKREGION_X9Y2"])
+factory.set_slot_pblock(0, 1, ["-add CLOCKREGION_X0Y3:CLOCKREGION_X4Y4"])
+factory.set_slot_pblock(1, 1, ["-add CLOCKREGION_X5Y3:CLOCKREGION_X9Y4"])
+
+
+# Vitis uses 4395 nets from SLR0 to SLR1
+# factory.set_slot_capacity(1, 0, north=11520 - 4395)
+# factory.set_slot_capacity(1, 1, north=11520 - 4395)
+
+# Vitis uses 4185 nets from SLR1 to SLR2
+# factory.set_slot_capacity(1, 1, south=11520 - 4185)
+
+
+factory.extract_slot_resources()
+
+
+# Reserve resource for the HBM Memory Sub-System.
+# The HMSS is not part of the user kernel so the partition optimization process
+# is unaware of its existence. We need to manually reserve resources for it.
+# For 512-bit HBM channels, each HBM channel uses approximately the following resources:
+# AREA_PER_HBM_CHANNEL = {
+#     "LUT": 5000,
+#     "FF": 6500,
+#     "BRAM": 0,
+#     "URAM": 0,
+#     "DSP": 0,
+# }
+# factory.reduce_slot_area(0, 0, lut=150800)
+# factory.reduce_slot_area(0, 1, lut=146960)
+# factory.reduce_slot_area(1, 0, lut=128000)
+# factory.reduce_slot_area(1, 1, lut=107840)
+
+
+# For this U280 platform, the right most DSP column on the boundary between
+# dynamic/static region is not usable. So we need to adjust the DSP count
+# to reflect the actual available DSPs.
+print("Reducing DSP of (1, 1) to make it less congested")
+factory.reduce_slot_area(1, 1, dsp=100)
+factory.generate_virtual_device(Path(f"{CURR_DIR}/build/{CURR_FILE}/device.json"))
diff --git a/benchmarks/tapa_flow/bandwidth4/Makefile b/benchmarks/tapa_flow/bandwidth4/Makefile
new file mode 100644
index 00000000..3f2761f9
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth4/Makefile
@@ -0,0 +1,114 @@
+# Copyright (c) 2024 RapidStream Design Automation, Inc. and contributors. All rights reserved.
+# The contributor(s) of this file has/have agreed to the RapidStream Contributor License Agreement.
+
+ROOT_DIR := $(shell git rev-parse --show-toplevel)
+KERNEL_NAME := bandwidth4
+RS_SCRIPT := $(CURDIR)/run.py
+SRC_DIR := $(CURDIR)/design
+AB_CONFIG := $(CURDIR)/design/config/$(notdir $(RS_SCRIPT))/ab_config.json
+IMPL_CONFIG := $(CURDIR)/design/config/$(notdir $(RS_SCRIPT))/impl_config.json
+LINK_CONFIG := $(CURDIR)/design/config/$(notdir $(RS_SCRIPT))/link_config.ini
+PLATFORM := xilinx_vck5000_gen4x8_qdma_2_202220_1
+PART_NUM := xcvc1902-vsvd1760-2MP-e-S
+GRP_UTIL := $(ROOT_DIR)/common/util/get_group.py
+TEMP_DIR := $(CURDIR)/build/$(notdir $(RS_SCRIPT))
+RS_TARGET := $(TEMP_DIR)/dse/solution_0/vitis_run_hw/$(KERNEL_NAME)_$(PLATFORM).xclbin
+BUILD_LOG := $(TEMP_DIR)/build.json
+SUCCESS := "Build Successful"
+TIMING_RPT := impl_1_hw_bb_locked_timing_summary_routed.rpt
+SLACK_GETTER := $(ROOT_DIR)/common/util/get_slack.py
+RSPATH := $(CURDIR)
+RSXX := rapidstream
+RSPYTHON := rapidstream
+DEVICE_CONFIG := $(TEMP_DIR)/device.json
+DEVICE_GEN := $(CURDIR)/gen_device.py
+INCLUDE := -I $(XILINX_HLS)/include
+KERNEL_XO := $(TEMP_DIR)/$(KERNEL_NAME).xo
+KERNEL_XCLBIN := $(TEMP_DIR)/$(KERNEL_NAME).xclbin
+KERNEL_XSA := $(TEMP_DIR)/$(KERNEL_NAME).xsa
+TARGET := hw
+
+all: $(RS_TARGET)
+	cd $(RSPATH) && $(RSPYTHON) $(SLACK_GETTER) -d $(TEMP_DIR) -i $(TIMING_RPT) -o $(BUILD_LOG) -c clk_kernel_00_unbuffered_net -p 3.333
+	@echo $(SUCCESS)
+
+# --run-impl
+$(RS_TARGET):$(KERNEL_XO) $(DEVICE_CONFIG)
+	mkdir -p $(TEMP_DIR)
+	cd $(RSPATH) && $(RSXX)-tapaopt \
+		--work-dir $(TEMP_DIR) \
+		--tapa-xo-path $< \
+		--device-config $(DEVICE_CONFIG) \
+		--floorplan-config $(AB_CONFIG) \
+		--single-reg \
+		--implementation-config $(IMPL_CONFIG) \
+		--connectivity-ini $(LINK_CONFIG)
+
+$(DEVICE_CONFIG):$(AB_CONFIG)
+	mkdir -p $(TEMP_DIR)
+	cd $(RSPATH) && $(RSPYTHON) $(RS_SCRIPT)
+
+cosim:$(KERNEL_XO) $(TEMP_DIR)/main.exe
+	cd $(TEMP_DIR) && $(TEMP_DIR)/main.exe 1024 \
+		--bitstream $< \
+		-xosim_work_dir $(TEMP_DIR)/xosim_work_dir
+
+hw: $(KERNEL_XCLBIN)
+
+$(KERNEL_XCLBIN): $(KERNEL_XSA)
+	@echo "### ***** packaging $(KERNEL_XSA) into $(KERNEL_XCLBIN) ... *****"
+	cd $(TEMP_DIR) && v++ --package -t $(TARGET) --platform $(PLATFORM) \
+		$^ \
+		--temp_dir $(TEMP_DIR) \
+		--save-temps \
+		--report_dir $(TEMP_DIR)/reports/ \
+		--package.boot_mode=ospi \
+		-o $@ 2>&1 | tee $(KERNEL_NAME)_xclbin.log
+	@echo "### ***** $(KERNEL_XCLBIN) packaging done! *****"
+
+$(KERNEL_XSA): $(KERNEL_XO)
+	cd $(TEMP_DIR) && v++ -l -t ${TARGET} \
+		--connectivity.nk $(KERNEL_NAME):1:$(KERNEL_NAME) \
+		--config $(SRC_DIR)/vck5000.cfg \
+		--save-temps \
+		--temp_dir $(TEMP_DIR) \
+		--clock.defaultFreqHz 250000000 \
+		--vivado.synth.jobs 16 \
+		$< -o $@
+
+xo: $(KERNEL_XO)
+
+$(KERNEL_XO):$(SRC_DIR)/$(KERNEL_NAME).cpp
+	mkdir -p $(TEMP_DIR)
+	cd $(TEMP_DIR) && tapa compile \
+		--top $(KERNEL_NAME) \
+		--part-num xcu55c-fsvh2892-2L-e \
+		--clock-period 3.33 \
+		-o $(KERNEL_NAME).xo \
+		-f $< \
+		2>&1 | tee tapa.log
+
+csim:$(TEMP_DIR)/main.exe
+
+$(TEMP_DIR)/main.exe: $(SRC_DIR)/*.cpp
+	mkdir -p $(TEMP_DIR)
+	cd $(TEMP_DIR) && tapa g++ $^ $(INCLUDE) -o $(TEMP_DIR)/main.exe -O2
+	$(TEMP_DIR)/main.exe
+
+show_groups:
+	rapidstream $(GRP_UTIL) -i $(TEMP_DIR)/passes/0-imported.json \
+		-o $(TEMP_DIR)/module_types.csv
+
+clean:
+	rm -rf $(TEMP_DIR) *.log
+	rm -rf .Xil .run
+	rm -rf *.exe
+	rm -rf .ipcache
+
+cleanall:
+	rm -rf build *.log
+	rm -rf .Xil .run
+	rm -rf *.exe
+	rm -rf .ipcache
diff --git a/benchmarks/tapa_flow/bandwidth4/README.md b/benchmarks/tapa_flow/bandwidth4/README.md
new file mode 100644
index 00000000..237c8651
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth4/README.md
@@ -0,0 +1,141 @@
+
+RapidStream Logo
+
+# TAPA Flow: ORC Decoder
+
+## Introduction
+
+In this recipe, we demonstrate how to use RapidStream to optimize TAPA projects. The basic steps include:
+
+- Compile the HLS C++ code into a Vitis-compatible .xo file using TAPA.
+- Optimize the .xo file with RapidStream to obtain an optimized .xo file.
+- Use Vitis to compile the optimized .xo file into an .xclbin file for FPGA deployment.
+
+## Tutorial
+
+### Step 1 (Done): Generate the Xilinx Object File (`.xo`)
+
+We utilize TAPA to generate the `.xo` file. In case you have not installed TAPA, we have already compiled the C++ source to `.xo` using TAPA. The original C++ source files are located in design/src. The generated `.xo` file can be found at design/generated/data_decoding.xo. To compile C++ to `.xo` using TAPA, we use the script [design/run_tapa.sh](design/run_tapa.sh), with the detailed commands shown below. For your convenience, we have also backed up all the metadata generated by TAPA in the design/generated directory.
+
+```bash
+WORK_DIR=generated
+tapac \
+  --work-dir ${WORK_DIR} \
+  --top data_decoding \
+  --part-num xcu280-fsvh2892-2L-e \
+  --clock-period 3.33 \
+  -o ${WORK_DIR}/data_decoding.xo \
+  --connectivity config/link_config.ini \
+  src/data_decoder.cpp \
+  2>&1 | tee tapa.log
+```
+
+### Step 2: Use RapidStream to Optimize the `.xo` Design
+
+The RapidStream flow conducts design space exploration and generates solutions by taking the TAPA-generated `.xo` file as input.
+The RapidStream flow for TAPA requires the following key inputs:
+
+- **Platform**: The Vitis platform (e.g., `xilinx_u280_gen3x16_xdma_1_202211_1`).
+- **Device**: A virtual device defined by calling RapidStream APIs based on the platform (e.g., `get_u280_vitis_device_factory`).
+- **.xo file**: The `.xo` file generated by TAPA.
+- **Connectivity** (.ini): The configuration file for `v++`, design/config/run.py/link_config.ini.
+- **top_module_name**: The top module name of the kernel.
+- **Clock**: All clocks and their target frequencies.
+- **Flatten Module**: Within a design, not all modules need to be optimized. The flatten module is the target module that RapidStream will optimize.
+
+The Python snippet below shows how we instantiate RapidStream to set up the optimization environment.
+
+```Python
+from rapidstream import get_u280_vitis_device_factory, RapidStreamTAPA
+import os
+
+CURR_DIR = os.path.dirname(os.path.abspath(__file__))
+INI_PATH = f"{CURR_DIR}/design/config/link_config.ini"
+VITIS_PLATFORM = "xilinx_u280_gen3x16_xdma_1_202211_1"
+XO_PATH = f"{CURR_DIR}/design/generated/data_decoding.xo"
+kernel_name = "data_decoding"
+factory = get_u280_vitis_device_factory(VITIS_PLATFORM)
+rs = RapidStreamTAPA(f"{CURR_DIR}/build")
+rs.set_virtual_device(factory.generate_virtual_device())
+rs.add_xo_file(XO_PATH)
+rs.set_vitis_platform(VITIS_PLATFORM)
+rs.set_vitis_connectivity_config(INI_PATH)
+rs.set_top_module_name(kernel_name)
+rs.add_clock("ap_clk", 3.33)
+rs.add_flatten_targets([kernel_name])
+```
+
+The HBM AXI port connection is described in design/config/run.py/link_config.ini.
+
+```bash
+[connectivity]
+sp=data_decoding.input_port:HBM[0:1]
+sp=data_decoding.output_port0_32b_8b:HBM[16:17]
+sp=data_decoding.output_port1_16b_8b:HBM[18:19]
+sp=data_decoding.output_port2_16b_8b:HBM[20:21]
+sp=data_decoding.output_port3_8b:HBM[22:23]
+sp=data_decoding.output_port4_Track:HBM[24:25]
+```
+
+It is therefore necessary to assign the kernel ports to the appropriate slots. The Python code below demonstrates this process. For comprehensive linking details, please refer to the design/config/run.py/link_config.ini file.
+
+```Python
+# Bind ports to HBM 16-31
+right_slot = "SLOT_X1Y0:SLOT_X1Y0"
+left_slot = "SLOT_X0Y0:SLOT_X0Y0"
+rs.assign_port_to_region(".*input_port.*", left_slot)
+rs.assign_port_to_region(".*output_port0_32b_8b.*", right_slot)
+rs.assign_port_to_region(".*output_port1_16b_8b.*", right_slot)
+rs.assign_port_to_region(".*output_port2_16b_8b.*", right_slot)
+rs.assign_port_to_region(".*output_port3_8b.*", right_slot)
+rs.assign_port_to_region(".*output_port4_Track.*", right_slot)
+rs.assign_port_to_region("s_axi_control_.*", left_slot)
+rs.assign_port_to_region("ap_clk", left_slot)
+rs.assign_port_to_region("ap_rst_n", left_slot)
+rs.assign_port_to_region("interrupt", left_slot)
+```
+
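+Note that the snippet above is inherited from the data_decoding example; for this bandwidth4 recipe, the ports being bound are the four 512-bit channels declared in design/bandwidth4.h, reproduced below:
+
+```cpp
+// Kernel port declaration from design/bandwidth4.h: each mmap port
+// corresponds to one sp= entry in the connectivity .ini file.
+void bandwidth4(
+    tapa::mmap<bit512> ch_0,
+    tapa::mmap<bit512> ch_1,
+    tapa::mmap<bit512> ch_2,
+    tapa::mmap<bit512> ch_3,
+    uint64_t n);
+```
+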
+For complete details, please refer to the [./run_vck5000.py](./run_vck5000.py) file. Invoke RapidStream with the command below, or run `make all`:
+
+```bash
+rapidstream run_vck5000.py
+```
+
+If everything is successful, you should get at least one optimized `.xclbin` file.
+
+
+### Step 3: Check the Group Module Report
+
+RapidStream mandates a clear distinction between communication and computation within user designs.
+
+- In `Group modules`, users are tasked solely with defining inter-submodule communication. For those familiar with the Vivado IP Integrator flow, crafting a Group module mirrors the process of connecting IPs in IPI. RapidStream subsequently integrates appropriate pipeline registers into these Group modules.
+
+- In `Leaf modules`, users retain the flexibility to implement diverse computational patterns, as RapidStream leaves these Leaf modules unchanged.
+
+For further details, please consult the [code style](https://docs.rapidstream-da.com/required-coding-style/) section in our documentation.
+
+To generate a report on group types, execute the commands below or run `make show_groups`:
+
+```bash
+rapidstream ../../../common/util/get_group.py \
+  -i build/passes/0-imported.json \
+  -o build/module_types.csv
+```
+
+The module types for your design can be found in `build/module_types.csv`. Below, we list the four Group modules. In this design, `data_decoding` serves as a Group module, while the other three modules are added by RapidStream.
+
+| Module Name                        | Group Type     |
+|:----------------------------------:|:--------------:|
+| data_decoding                      | grouped_module |
+| __rs_ap_ctrl_start_ready_pipeline  | grouped_module |
+| __rs_ff_pipeline                   | grouped_module |
+| __rs_hs_pipeline                   | grouped_module |
diff --git a/benchmarks/tapa_flow/bandwidth4/design/bandwidth-host.cpp b/benchmarks/tapa_flow/bandwidth4/design/bandwidth-host.cpp
new file mode 100644
index 00000000..340e299d
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth4/design/bandwidth-host.cpp
@@ -0,0 +1,70 @@
+#include <iostream>
+#include <vector>
+
+#include <gflags/gflags.h>
+#include <tapa.h>
+#include "bandwidth4.h"
+
+using std::clog;
+using std::endl;
+using std::vector;
+
+DEFINE_string(bitstream, "", "path to bitstream file, run csim if empty");
+
+int main(int argc, char* argv[]) {
+  gflags::ParseCommandLineFlags(&argc, &argv, /*remove_flags=*/true);
+
+  const uint64_t n = argc > 1 ? atoll(argv[1]) : 1024 * 1024;
+
+  vector<bit512> rmem0(n);
+  vector<bit512> rmem1(n);
+  vector<bit512> rmem2(n);
+  vector<bit512> rmem3(n);
+
+  for (uint64_t i = 0; i < n; ++i) {
+    rmem0[i] = i;
+    rmem1[i] = i;
+    rmem2[i] = i;
+    rmem3[i] = i;
+  }
+  int64_t kernel_time_ns = tapa::invoke(
+      bandwidth4,
+      FLAGS_bitstream,
+      tapa::read_write_mmap<bit512>(rmem0),
+      tapa::read_write_mmap<bit512>(rmem1),
+      tapa::read_write_mmap<bit512>(rmem2),
+      tapa::read_write_mmap<bit512>(rmem3),
+      n);
+
+  clog << "kernel time: " << kernel_time_ns * 1e-9 << " s" << endl;
+
+  // Each word initialized to i must read back as (i << 1) after the kernel's
+  // one-bit left shift; the first and last channels are spot-checked.
+  uint64_t num_errors = 0;
+  const uint64_t threshold = 10;  // only report up to this many errors
+  for (uint64_t i = 0; i < n; ++i) {
+    bit512 out512 = (i << 1);
+    if (rmem0[i] != out512) {
+      if (num_errors < threshold) {
+        clog << "error at " << i << ": expected " << rmem0[i] << ", got "
+             << out512 << endl;
+      }
+      ++num_errors;
+    }
+    if (rmem3[i] != out512) {
+      if (num_errors < threshold) {
+        clog << "error at " << i << ": expected " << rmem3[i] << ", got "
+             << out512 << endl;
+      }
+      ++num_errors;
+    }
+  }
+  if (num_errors == 0) {
+    clog << "PASS!" << endl;
+  } else {
+    if (num_errors > threshold) {
+      clog << " (+" << (num_errors - threshold) << " more errors)" << endl;
+    }
+    clog << "FAIL!" << endl;
+  }
+  return num_errors > 0 ? 1 : 0;
+}
diff --git a/benchmarks/tapa_flow/bandwidth4/design/bandwidth4.cpp b/benchmarks/tapa_flow/bandwidth4/design/bandwidth4.cpp
new file mode 100644
index 00000000..25d1ba55
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth4/design/bandwidth4.cpp
@@ -0,0 +1,62 @@
+#include <cstdint>
+
+#include <tapa.h>
+#include "bandwidth4.h"
+
+void yshift(tapa::istream<bit512>& a, tapa::ostream<bit512>& b, uint64_t n) {
+  for (uint64_t i = 0; i < n; ++i) {
+    bit512 tmp;
+    tmp = a.read();
+    tmp = (tmp << 1);
+    b.write(tmp);
+  }
+}
+
+void Mmap2Stream(
+    tapa::mmap<bit512> mmap,
+    uint64_t n,
+    tapa::ostream<bit512>& stream) {
+
+  for (uint64_t i = 0; i < n; ++i) {
+    stream << mmap[i];
+  }
+}
+
+void Stream2Mmap(tapa::istream<bit512>& stream, tapa::mmap<bit512> mmap,
+                 uint64_t n) {
+  for (uint64_t i = 0; i < n; ++i) {
+    mmap[i] = stream.read();
+  }
+}
+
+void bandwidth4(
+    tapa::mmap<bit512> ch_0,
+    tapa::mmap<bit512> ch_1,
+    tapa::mmap<bit512> ch_2,
+    tapa::mmap<bit512> ch_3,
+    uint64_t n) {
+
+  tapa::stream<bit512> qr0("qr0");
+  tapa::stream<bit512> qr1("qr1");
+  tapa::stream<bit512> qr2("qr2");
+  tapa::stream<bit512> qr3("qr3");
+
+  tapa::stream<bit512> qw0("qw0");
+  tapa::stream<bit512> qw1("qw1");
+  tapa::stream<bit512> qw2("qw2");
+  tapa::stream<bit512> qw3("qw3");
+
+  tapa::task()
+      .invoke(Mmap2Stream, ch_0, n, qr0)
+      .invoke(Mmap2Stream, ch_1, n, qr1)
+      .invoke(Mmap2Stream, ch_2, n, qr2)
+      .invoke(Mmap2Stream, ch_3, n, qr3)
+      .invoke(yshift, qr0, qw0, n)
+      .invoke(yshift, qr1, qw1, n)
+      .invoke(yshift, qr2, qw2, n)
+      .invoke(yshift, qr3, qw3, n)
+      .invoke(Stream2Mmap, qw0, ch_0, n)
+      .invoke(Stream2Mmap, qw1, ch_1, n)
+      .invoke(Stream2Mmap, qw2, ch_2, n)
+      .invoke(Stream2Mmap, qw3, ch_3, n);
+}
diff --git a/benchmarks/tapa_flow/bandwidth4/design/bandwidth4.h b/benchmarks/tapa_flow/bandwidth4/design/bandwidth4.h
new file mode 100644
index 00000000..6974458f
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth4/design/bandwidth4.h
@@ -0,0 +1,18 @@
+
+#ifndef __VADD_BW_H__
+#define __VADD_BW_H__
+#include <cstdint>
+
+#include <ap_int.h>
+#include <tapa.h>
+
+typedef ap_uint<512> bit512;
+
+void bandwidth4(
+    tapa::mmap<bit512> ch_0,
+    tapa::mmap<bit512> ch_1,
+    tapa::mmap<bit512> ch_2,
+    tapa::mmap<bit512> ch_3,
+    uint64_t n);
+
+#endif
diff --git a/benchmarks/tapa_flow/bandwidth4/design/config/run_vck5000.py/ab_config.json b/benchmarks/tapa_flow/bandwidth4/design/config/run_vck5000.py/ab_config.json
new file mode 100644
index 00000000..264df902
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth4/design/config/run_vck5000.py/ab_config.json
@@ -0,0 +1,15 @@
+{
+    "dse_range_max": 0.8,
+    "dse_range_min": 0.7,
+    "partition_strategy": "flat",
+    "port_pre_assignments": {
+        ".*ch_0_.*": "SLOT_X0Y0:SLOT_X0Y0",
+        ".*ch_1_.*": "SLOT_X0Y0:SLOT_X0Y0",
+        ".*ch_2_.*": "SLOT_X0Y0:SLOT_X0Y0",
+        ".*ch_3_.*": "SLOT_X0Y0:SLOT_X0Y0",
+        "ap_clk": "SLOT_X0Y0:SLOT_X0Y0",
+        "ap_rst_n": "SLOT_X0Y0:SLOT_X0Y0",
+        "interrupt": "SLOT_X0Y0:SLOT_X0Y0",
+        "s_axi_control_.*": "SLOT_X0Y0:SLOT_X0Y0"
+    }
+}
diff --git a/benchmarks/tapa_flow/bandwidth4/design/config/run_vck5000.py/impl_config.json b/benchmarks/tapa_flow/bandwidth4/design/config/run_vck5000.py/impl_config.json
new file mode 100644
index 00000000..9b47f4ca
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth4/design/config/run_vck5000.py/impl_config.json
@@ -0,0 +1,7 @@
+{
+    "max_workers": 2,
+    "port_to_clock_period": {
+        "ap_clk": 3.33
+    },
+    "vitis_platform": "xilinx_vck5000_gen4x8_qdma_2_202220_1"
+}
diff --git a/benchmarks/tapa_flow/bandwidth4/design/config/run_vck5000.py/link_config.ini b/benchmarks/tapa_flow/bandwidth4/design/config/run_vck5000.py/link_config.ini
new file mode 100644
index 00000000..17e6686e
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth4/design/config/run_vck5000.py/link_config.ini
@@ -0,0 +1,8 @@
+platform=xilinx_vck5000_gen4x8_qdma_2_202220_1
+
+[connectivity]
+
+sp = bandwidth4.m_axi_ch_0:MC_NOC0
+sp = bandwidth4.m_axi_ch_1:MC_NOC0
+sp = bandwidth4.m_axi_ch_2:MC_NOC0
+sp = bandwidth4.m_axi_ch_3:MC_NOC0
diff --git a/benchmarks/tapa_flow/bandwidth4/design/run_tapa.sh b/benchmarks/tapa_flow/bandwidth4/design/run_tapa.sh
new file mode 100644
index 00000000..0071559b
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth4/design/run_tapa.sh
@@ -0,0 +1,9 @@
+WORK_DIR=work.out
+
+tapa compile \
+  --top data_decoding \
+  --part-num xcu55c-fsvh2892-2L-e \
+  --clock-period 3.33 \
+  -o ${WORK_DIR}/data_decoding.xo \
+  -f src/data_decoder.cpp \
+  2>&1 | tee tapa.log
diff --git a/benchmarks/tapa_flow/bandwidth4/run_vck5000.py b/benchmarks/tapa_flow/bandwidth4/run_vck5000.py
new file mode 100644
index 00000000..ae36f962
--- /dev/null
+++ b/benchmarks/tapa_flow/bandwidth4/run_vck5000.py
@@ -0,0 +1,84 @@
+__copyright__ = """
+Copyright (c) 2024 RapidStream Design Automation, Inc. and contributors. All rights reserved.
+The contributor(s) of this file has/have agreed to the RapidStream Contributor License Agreement.
+"""
+
+import os
+from pathlib import Path
+
+from rapidstream import DeviceFactory
+
+CURR_DIR = os.path.dirname(os.path.abspath(__file__))
+CURR_FILE = os.path.basename(__file__)
+
+VITIS_PLATFORM = "xilinx_vck5000_gen4x8_qdma_2_202220_1"
+VCK5000_PART_NAME = "xcvc1902-vsvd1760-2MP-e-S"
+
+
+factory = DeviceFactory(row=2, col=2, part_num=VCK5000_PART_NAME, board_name=None)
+
+for x in range(2):
+    for y in range(2):
+        pblock = f"-add CLOCKREGION_X{x*4}Y{y*4}:CLOCKREGION_X{x*4+3}Y{y*4+3}"
+        factory.set_slot_pblock(x, y, [pblock])
+
+
+# set SLR crossing capacity
+for x in range(2):
+    factory.set_slot_capacity(x, 0, north=11520)
+    factory.set_slot_capacity(x, 1, north=11520)
+
+    factory.set_slot_capacity(x, 1, south=11520)
+    # factory.set_slot_capacity(x, 2, south=11520)
+
+# Set W/E capacity
+for y in range(2):
+    factory.set_slot_capacity(0, y, east=40320)
+    factory.set_slot_capacity(1, y, west=40320)
+# factory.set_slot_capacity(0, 2, east=41178)
+# factory.set_slot_capacity(1, 2, west=41178)
+
+
+factory.set_platform_name(VITIS_PLATFORM)
+factory.set_user_pblock_name("pblock_dynamic_region")
+
+factory.set_slot_pblock(0, 0, ["-add CLOCKREGION_X0Y1:CLOCKREGION_X4Y2"])
+factory.set_slot_pblock(1, 0, ["-add CLOCKREGION_X5Y1:CLOCKREGION_X9Y2"])
+factory.set_slot_pblock(0, 1, ["-add CLOCKREGION_X0Y3:CLOCKREGION_X4Y4"])
+factory.set_slot_pblock(1, 1, ["-add CLOCKREGION_X5Y3:CLOCKREGION_X9Y4"])
+
+
+# Vitis uses 4395 nets from SLR0 to SLR1
+# factory.set_slot_capacity(1, 0, north=11520 - 4395)
+# factory.set_slot_capacity(1, 1, north=11520 - 4395)
+
+# Vitis uses 4185 nets from SLR1 to SLR2
+# factory.set_slot_capacity(1, 1, south=11520 - 4185)
+
+
+factory.extract_slot_resources()
+
+
+# Reserve resource for the HBM Memory Sub-System.
+# The HMSS is not part of the user kernel so the partition optimization process
+# is unaware of its existence. We need to manually reserve resources for it.
+# For 512-bit HBM channels, each HBM channel uses approximately the following resources:
+# AREA_PER_HBM_CHANNEL = {
+#     "LUT": 5000,
+#     "FF": 6500,
+#     "BRAM": 0,
+#     "URAM": 0,
+#     "DSP": 0,
+# }
+# factory.reduce_slot_area(0, 0, lut=150800)
+# factory.reduce_slot_area(0, 1, lut=146960)
+# factory.reduce_slot_area(1, 0, lut=128000)
+# factory.reduce_slot_area(1, 1, lut=107840)
+
+
+# For this U280 platform, the right most DSP column on the boundary between
+# dynamic/static region is not usable. So we need to adjust the DSP count
+# to reflect the actual available DSPs.
+print("Reducing DSP of (1, 1) to make it less congested")
+factory.reduce_slot_area(1, 1, dsp=100)
+factory.generate_virtual_device(Path(f"{CURR_DIR}/build/{CURR_FILE}/device.json"))
diff --git a/benchmarks/vitis_flow/bandwidth23/Makefile b/benchmarks/vitis_flow/bandwidth23/Makefile
new file mode 100644
index 00000000..e5963275
--- /dev/null
+++ b/benchmarks/vitis_flow/bandwidth23/Makefile
@@ -0,0 +1,119 @@
+# Copyright (c) 2024 RapidStream Design Automation, Inc. and contributors. All rights reserved.
+# The contributor(s) of this file has/have agreed to the RapidStream Contributor License Agreement.
+
+ROOT_DIR := $(shell git rev-parse --show-toplevel)
+GRP_UTIL := $(ROOT_DIR)/common/util/get_group.py
+PLATFORM := xilinx_vck5000_gen4x8_qdma_2_202220_1
+PART := xcvc1902-vsvd1760-2MP-e-S
+LINK_FILE := link_config_hbm.ini
+KERNEL_NAME := bandwidth23
+HLSXX := vitis_hls
+SRC_DIR := $(CURDIR)/design
+RS_SCRIPT := $(CURDIR)/run.py
+TEMP_DIR := $(CURDIR)/build/$(notdir $(RS_SCRIPT))
+HOST := $(TEMP_DIR)/app.exe
+KERNEL_XO := $(TEMP_DIR)/$(KERNEL_NAME).xo
+KERNEL_XSA := $(TEMP_DIR)/$(KERNEL_NAME).xsa
+KERNEL_XCLBIN := $(TEMP_DIR)/$(KERNEL_NAME).xclbin
+RS_XCLBIN := $(TEMP_DIR)/dse/candidate_0/vitis_run_hw/$(KERNEL_NAME)_$(PLATFORM).xclbin
+CLK_PERIOD_NS := 3
+TARGET := hw
+HLS2RTL_TCL := $(ROOT_DIR)/common/tcl/hls2rtl.tcl
+GEN_XO := 1
+
+BUILD_LOG := $(TEMP_DIR)/build.json
+SUCCESS := "Build Successful"
+TIMING_RPT := impl_1_hw_bb_locked_timing_summary_routed.rpt
+SLACK_GETTER := $(ROOT_DIR)/common/util/get_slack.py
+RSXX := rapidstream
+
+
+all: $(RS_XCLBIN)
+	$(RSXX) $(SLACK_GETTER) -d $(TEMP_DIR) -i $(TIMING_RPT) -o $(BUILD_LOG) -c clk_kernel_00_unbuffered_net -p 3.333
+	echo $(SUCCESS)
+
+$(RS_XCLBIN):$(KERNEL_XO)
+	$(RSXX) $(RS_SCRIPT)
+
+hw: $(KERNEL_XCLBIN)
+
+$(KERNEL_XCLBIN): $(KERNEL_XSA)
+	@echo "### ***** packaging $(KERNEL_XSA) into $(KERNEL_XCLBIN) ... *****"
+	cd $(TEMP_DIR) && v++ --package -t $(TARGET) --platform $(PLATFORM) \
+		$^ \
+		--temp_dir $(TEMP_DIR) \
+		--save-temps \
+		--report_dir $(TEMP_DIR)/reports/ \
+		--package.boot_mode=ospi \
+		-o $@ 2>&1 | tee $(KERNEL_NAME)_xclbin.log
+	@echo "### ***** $(KERNEL_XCLBIN) packaging done! *****"
+
+$(KERNEL_XSA): $(KERNEL_XO)
+	cd $(TEMP_DIR) && v++ -l -t ${TARGET} \
+		--connectivity.nk $(KERNEL_NAME):1:$(KERNEL_NAME) \
+		--config $(SRC_DIR)/vck5000.cfg \
+		--save-temps \
+		--temp_dir $(TEMP_DIR) \
+		--clock.defaultFreqHz 250000000 \
+		--vivado.synth.jobs 16 \
+		$< -o $@
+
+
+xo:$(KERNEL_XO)
+
+$(KERNEL_XO): $(SRC_DIR)/$(KERNEL_NAME).cpp $(SRC_DIR)/$(KERNEL_NAME).h
+	mkdir -p $(TEMP_DIR)
+	cd $(TEMP_DIR) && v++ -c -t ${TARGET} \
+		--platform $(PLATFORM) \
+		-k $(KERNEL_NAME) \
+		--temp_dir $(TEMP_DIR) \
+		--save-temps \
+		-o $@ \
+		$^
+
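+# Software-emulation flow: rebuild the kernel for sw_emu (note that this
+# target links against a U50 XDMA platform rather than the VCK5000 hardware
+# platform above) and run the host executable with XCL_EMULATION_MODE=sw_emu.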
+sw_emu: $(HOST) $(SRC_DIR)/$(KERNEL_NAME).cpp $(SRC_DIR)/$(KERNEL_NAME).h
+	mkdir -p $(TEMP_DIR)
+	cd $(TEMP_DIR) && v++ -c -t sw_emu \
+		--platform xilinx_u50_gen3x16_xdma_5_202210_1 \
+		-k $(KERNEL_NAME) \
+		--temp_dir $(TEMP_DIR) \
+		--save-temps \
+		-o $(TEMP_DIR)/$(KERNEL_NAME)_sw_emu.xo \
+		$^
+	cd $(TEMP_DIR) && v++ -l -t sw_emu \
+		$(TEMP_DIR)/$(KERNEL_NAME)_sw_emu.xo \
+		--platform xilinx_u50_gen3x16_xdma_5_202210_1 \
+		--kernel $(KERNEL_NAME) \
+		--connectivity.nk $(KERNEL_NAME):1:$(KERNEL_NAME) \
+		-o $(TEMP_DIR)/$(KERNEL_NAME)_sw_emu.xclbin
+	cd $(TEMP_DIR) && XCL_EMULATION_MODE=sw_emu $< $(TEMP_DIR)/$(KERNEL_NAME)_sw_emu.xclbin
+
+host:$(HOST)
+
+$(HOST): $(SRC_DIR)/host.cpp
+	mkdir -p $(TEMP_DIR)
+	g++ -Wall -g -std=c++11 $(SRC_DIR)/host.cpp -o $@ \
+		-I${XILINX_XRT}/include/ \
+		-I${XILINX_HLS}/include/ \
+		-L${XILINX_XRT}/lib/ -lOpenCL -pthread -lrt -lstdc++
+
+show_groups:
+	rapidstream $(GRP_UTIL) -i $(TEMP_DIR)/passes/0-imported.json \
+		-o $(TEMP_DIR)/module_types.csv
+
+clean:
+	rm -rf $(TEMP_DIR) *.log
+	rm -rf .Xil .run
+	rm -rf *.exe
+	rm -rf .ipcache
+
+cleanall:
+	rm -rf build *.log
+	rm -rf .Xil .run
+	rm -rf *.exe
+	rm -rf .ipcache
diff --git a/benchmarks/vitis_flow/bandwidth23/README.md b/benchmarks/vitis_flow/bandwidth23/README.md
new file mode 100644
index 00000000..f6d4bcdd
--- /dev/null
+++ b/benchmarks/vitis_flow/bandwidth23/README.md
@@ -0,0 +1,118 @@
+
+RapidStream Logo
+
+# Large Language Model Benchmark
+
+## Introduction
+
+In this recipe, we illustrate how to create a Vitis object file (`.xo`) for a Large Language Model kernel from [Chen *et al.* (TRETS)](https://dl.acm.org/doi/10.1145/3656177) using Vitis, then optimize the `.xo` file with RapidStream, and finally utilize the optimized output in the ongoing Vitis development process.
+
+
+## Tutorial
+
+### Step 1: Generate the Xilinx Object File (`.xo`)
+
+We use Vitis 2023.2 to generate the `.xo` file. Since we want to disable the [free running pipeline (FRP)](https://www.xilinx.com/htmldocs/xilinx2021_2/hls-guidance/200-1553.html) feature for the HLS step, we use [hls2rtl.tcl](../../../common/tcl/hls2rtl.tcl) to compile the C++ code to the `.xo` file.
+
+Run the following command or run `make clean && make xo`:
+
+```bash
+source /Vitis/2023.2/settings64.sh
+make clean
+mkdir -p build
+vitis_hls ../../../common/tcl/hls2rtl.tcl \
+  -l build/vitis_hls_llm.log \
+  -tclargs \
+  xcu50-fsvh2104-2-e \
+  4 \
+  bert_all \
+  1 \
+  design/bert_all.cpp design/kernel.h \
+  design/bert_region_1.cpp design/bert_region_1.h \
+  design/bert_region_2.cpp design/bert_region_2.h \
+  design/bert_region_3.cpp design/bert_region_3.h
+```
+
+### Step 2 (Optional): Use Vitis --link to Generate the `.xclbin` File
+
+:warning: **Note**: This step can take hours to complete. We recommend using the RapidStream flow to optimize the `.xo` file instead of generating the `.xclbin` file if you are familiar with the AMD Vitis flow.
+
+With the `.xo` file generated, you can use `v++ -link` to generate the `.xclbin` file. Run the following command or execute `make hw`:
+
+```bash
+v++ -l -t hw \
+  --platform xilinx_u50_gen3x16_xdma_5_202210_1 \
+  --kernel bert_all \
+  --connectivity.nk bert_all:1:bert_all \
+  --config design/link_config_hbm.ini \
+  --temp_dir build \
+  -o build/bert_all.xclbin \
+  build/bert_all.xo
+```
+
+### Step 3: Call RapidStream to Optimize the Design
+
+The RapidStream flow conducts design space exploration and generates optimized `.xo` files by taking the Vitis-generated `.xo` as input. The RapidStream flow for Vitis requires four key inputs:
+
+1. **Device**: Specify the Vitis platform name for `v++`.
+2. **Xilinx Object file** (.xo): Provide the file generated by `v++` or Vivado.
+3. **Connectivity** (.ini): The configuration file for `v++`, ./design/link_config_hbm.ini.
+4. **Clock targets**: Define the desired clock frequencies.
+
+RapidStream automatically handles all other aspects of the flow.
+
+Please refer to [run_u50.py](./run_u50.py) for the complete RapidStream flow.
+To execute the flow and generate optimized `.xo` files, run the following command or execute `make rs_opt`:
+
+```bash
+rapidstream ./run_u50.py
+```
+
+Unlike the example provided in [getting_started/vitis_source](../../../getting_started/vitis_source/run.py), where the `skip_impl` variable is set to `True` so that the flow stops after `.xo` optimization, in this case the DSE engine will automatically launch Vitis to link the optimized `.xo` file to the target device and generate the `.xclbin` file. In that example, implementation is skipped as follows:
+
+```Python
+# Skip Vitis implementation.
+rs.run_dse(skip_impl=True)
+```
+
+When finished, you can locate these files using the following command:
+
+```bash
+find ./build/dse/ -name *.xclbin
+```
+
+If everything is successful, you should get at least one optimized `.xclbin` file.
+
+
+### Step 4: Check the Group Module Report
+
+RapidStream mandates a clear distinction between communication and computation within user designs.
+
+- In `Group modules`, users are tasked solely with defining inter-submodule communication. For those familiar with the Vivado IP Integrator flow, crafting a Group module mirrors the process of connecting IPs in IPI. RapidStream subsequently integrates appropriate pipeline registers into these Group modules.
+
+- In `Leaf modules`, users retain the flexibility to implement diverse computational patterns, as RapidStream leaves these Leaf modules unchanged.
+
+For further details, please consult the [code style](https://docs.rapidstream-da.com/required-coding-style/) section in our documentation.
+
+To generate a report on group types, execute the commands below or run `make show_groups`:
+
+```bash
+rapidstream ../../../common/util/get_group.py \
+  -i build/passes/0-imported.json \
+  -o build/module_types.csv
+```
+
+The module types for your design can be found in `build/module_types.csv`. Below, we list the four Group modules. In this design, `bert_all` serves as a Group module, while the other three modules are added by RapidStream.
+
+
+| Module Name                        | Group Type     |
+|:----------------------------------:|:--------------:|
+| bert_all                           | grouped_module |
+| __rs_ap_ctrl_start_ready_pipeline  | grouped_module |
+| __rs_ff_pipeline                   | grouped_module |
+| __rs_hs_pipeline                   | grouped_module |
diff --git a/benchmarks/vitis_flow/bandwidth23/design/bandwidth23.cpp b/benchmarks/vitis_flow/bandwidth23/design/bandwidth23.cpp
new file mode 100644
index 00000000..e1197a2d
--- /dev/null
+++ b/benchmarks/vitis_flow/bandwidth23/design/bandwidth23.cpp
@@ -0,0 +1,181 @@
+// Copyright 2024 RapidStream Design Automation, Inc.
+// All Rights Reserved.
+
+#include "bandwidth23.h"
+#include <cstdio>
+
+
+void print_512(bit512 din){
+    // Print out the data 64-bit hex per line
+    for (int i = 0; i < 8; i++) {
+        printf("%08x%08x\n", (unsigned int) din(63+i*64, 32+i*64), (unsigned int) din(31+i*64, 0+i*64));
+    }
+}
+
+// Stream one 1024-beat burst from memory channel `mem` into `ch`; each word is
+// shifted left by one bit so the host can verify the round trip.
+void read_mem(bit512* mem, hls::stream<bit512>& ch, long offset) {
+    for (int j = 0; j < 1024; j++) {
+        ch.write(mem[(offset<<10) + j]<<1);
+    }
+}
+
+// Drain one 1024-beat burst from `ch` back to memory channel `mem`.
+void write_mem(hls::stream<bit512>& ch, bit512* mem, long offset) {
+    for (int j = 0; j < 1024; j++) {
+        mem[(offset<<10) + j] = ch.read();
+    }
+}
+
+
+extern "C" {
+
+void bandwidth23(
+    bit512* ch_0,
+    bit512* ch_1,
+    bit512* ch_2,
+    bit512* ch_3,
+    bit512* ch_4,
+    bit512* ch_5,
+    bit512* ch_6,
+    bit512* ch_7,
+    bit512* ch_8,
+    bit512* ch_9,
+    bit512* ch_10,
+    bit512* ch_11,
+    bit512* ch_12,
+    bit512* ch_13,
+    bit512* ch_14,
+    bit512* ch_15,
+    bit512* ch_16,
+    bit512* ch_17,
+    bit512* ch_18,
+    bit512* ch_19,
+    bit512* ch_20,
+    bit512* ch_21,
+    bit512* ch_22,
+    long n)
+{
+#pragma HLS INTERFACE m_axi port=ch_0 bundle=ch_0
+#pragma HLS INTERFACE m_axi port=ch_1 bundle=ch_1
+#pragma HLS INTERFACE m_axi port=ch_2 bundle=ch_2
+#pragma HLS INTERFACE m_axi port=ch_3 bundle=ch_3
+#pragma HLS INTERFACE m_axi port=ch_4 bundle=ch_4
+#pragma HLS INTERFACE m_axi port=ch_5 bundle=ch_5
+#pragma HLS INTERFACE m_axi port=ch_6 bundle=ch_6
+#pragma HLS INTERFACE m_axi port=ch_7 bundle=ch_7
+#pragma HLS INTERFACE m_axi port=ch_8 bundle=ch_8
+#pragma HLS INTERFACE m_axi port=ch_9 bundle=ch_9
+#pragma HLS INTERFACE m_axi port=ch_10 bundle=ch_10
+#pragma HLS INTERFACE m_axi port=ch_11 bundle=ch_11
+#pragma HLS INTERFACE m_axi port=ch_12 bundle=ch_12
+#pragma HLS INTERFACE m_axi port=ch_13 bundle=ch_13
+#pragma HLS INTERFACE m_axi port=ch_14 bundle=ch_14
+#pragma HLS INTERFACE m_axi port=ch_15 bundle=ch_15
+#pragma HLS INTERFACE m_axi port=ch_16 bundle=ch_16
+#pragma HLS INTERFACE m_axi port=ch_17 bundle=ch_17
+#pragma HLS INTERFACE m_axi port=ch_18 bundle=ch_18
+#pragma HLS INTERFACE m_axi port=ch_19 bundle=ch_19
+#pragma HLS INTERFACE m_axi port=ch_20 bundle=ch_20
+#pragma HLS INTERFACE m_axi port=ch_21 bundle=ch_21
+#pragma HLS INTERFACE m_axi port=ch_22 bundle=ch_22
+#pragma HLS INTERFACE s_axilite port=n bundle=control
+#pragma HLS INTERFACE s_axilite port=return bundle=control
+    hls::stream<bit512> stream_0;
+#pragma HLS STREAM variable=stream_0 depth=2048
+    hls::stream<bit512> stream_1;
+#pragma HLS STREAM variable=stream_1 depth=2048
+    hls::stream<bit512> stream_2;
+#pragma HLS STREAM variable=stream_2 depth=2048
+    hls::stream<bit512> stream_3;
+#pragma HLS STREAM variable=stream_3 depth=2048
+    hls::stream<bit512> stream_4;
+#pragma HLS STREAM variable=stream_4 depth=2048
+    hls::stream<bit512> stream_5;
+#pragma HLS STREAM variable=stream_5 depth=2048
+    hls::stream<bit512> stream_6;
+#pragma HLS STREAM variable=stream_6 depth=2048
+    hls::stream<bit512> stream_7;
+#pragma HLS STREAM variable=stream_7 depth=2048
+    hls::stream<bit512> stream_8;
+#pragma HLS STREAM variable=stream_8 depth=2048
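+    // Note: depth=2048 gives each FIFO room for a full 1024-beat read_mem
+    // burst with 2x headroom before the matching write_mem drains it.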
+    hls::stream<bit512> stream_9;
+#pragma HLS STREAM variable=stream_9 depth=2048
+    hls::stream<bit512> stream_10;
+#pragma HLS STREAM variable=stream_10 depth=2048
+    hls::stream<bit512> stream_11;
+#pragma HLS STREAM variable=stream_11 depth=2048
+    hls::stream<bit512> stream_12;
+#pragma HLS STREAM variable=stream_12 depth=2048
+    hls::stream<bit512> stream_13;
+#pragma HLS STREAM variable=stream_13 depth=2048
+    hls::stream<bit512> stream_14;
+#pragma HLS STREAM variable=stream_14 depth=2048
+    hls::stream<bit512> stream_15;
+#pragma HLS STREAM variable=stream_15 depth=2048
+    hls::stream<bit512> stream_16;
+#pragma HLS STREAM variable=stream_16 depth=2048
+    hls::stream<bit512> stream_17;
+#pragma HLS STREAM variable=stream_17 depth=2048
+    hls::stream<bit512> stream_18;
+#pragma HLS STREAM variable=stream_18 depth=2048
+    hls::stream<bit512> stream_19;
+#pragma HLS STREAM variable=stream_19 depth=2048
+    hls::stream<bit512> stream_20;
+#pragma HLS STREAM variable=stream_20 depth=2048
+    hls::stream<bit512> stream_21;
+#pragma HLS STREAM variable=stream_21 depth=2048
+    hls::stream<bit512> stream_22;
+#pragma HLS STREAM variable=stream_22 depth=2048
+
+
+    for(int i=0; i<(n>>10); i++){
+        read_mem(ch_0, stream_0, i);
+        read_mem(ch_1, stream_1, i);
+        read_mem(ch_2, stream_2, i);
+        read_mem(ch_3, stream_3, i);
+        read_mem(ch_4, stream_4, i);
+        read_mem(ch_5, stream_5, i);
+        read_mem(ch_6, stream_6, i);
+        read_mem(ch_7, stream_7, i);
+        read_mem(ch_8, stream_8, i);
+        read_mem(ch_9, stream_9, i);
+        read_mem(ch_10, stream_10, i);
+        read_mem(ch_11, stream_11, i);
+        read_mem(ch_12, stream_12, i);
+        read_mem(ch_13, stream_13, i);
+        read_mem(ch_14, stream_14, i);
+        read_mem(ch_15, stream_15, i);
+        read_mem(ch_16, stream_16, i);
+        read_mem(ch_17, stream_17, i);
+        read_mem(ch_18, stream_18, i);
+        read_mem(ch_19, stream_19, i);
+        read_mem(ch_20, stream_20, i);
+        read_mem(ch_21, stream_21, i);
+        read_mem(ch_22, stream_22, i);
+
+        write_mem(stream_0, ch_0, i);
+        write_mem(stream_1, ch_1, i);
+        write_mem(stream_2, ch_2, i);
+        write_mem(stream_3, ch_3, i);
+        write_mem(stream_4, ch_4, i);
+        write_mem(stream_5, ch_5, i);
+        write_mem(stream_6, ch_6, i);
+        write_mem(stream_7, ch_7, i);
+        write_mem(stream_8, ch_8, i);
+        write_mem(stream_9, ch_9, i);
+        write_mem(stream_10, ch_10, i);
+        write_mem(stream_11, ch_11, i);
+        write_mem(stream_12, ch_12, i);
+        write_mem(stream_13, ch_13, i);
+        write_mem(stream_14, ch_14, i);
+        write_mem(stream_15, ch_15, i);
+        write_mem(stream_16, ch_16, i);
+        write_mem(stream_17, ch_17, i);
+        write_mem(stream_18, ch_18, i);
+        write_mem(stream_19, ch_19, i);
+        write_mem(stream_20, ch_20, i);
+        write_mem(stream_21, ch_21, i);
+        write_mem(stream_22, ch_22, i);
+    }
+}
+}
diff --git a/benchmarks/vitis_flow/bandwidth23/design/bandwidth23.h b/benchmarks/vitis_flow/bandwidth23/design/bandwidth23.h
new file mode 100644
index 00000000..e8fffd02
--- /dev/null
+++ b/benchmarks/vitis_flow/bandwidth23/design/bandwidth23.h
@@ -0,0 +1,43 @@
+// Copyright 2024 RapidStream Design Automation, Inc.
+// All Rights Reserved.
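+//
+// Shared declarations for the bandwidth23 benchmark: the 512-bit data types
+// and the kernel prototype included by both the kernel and the host code.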
+
+#include "stdio.h"
+#include "stdlib.h"
+#include "math.h"
+#include <ap_int.h>
+#include <hls_stream.h>
+
+
+/* Data Type */
+typedef ap_uint<512> bit512;
+typedef ap_uint<64> bit64;
+typedef bit512 data_t;
+/* Data Type */
+
+
+extern "C" {
+void bandwidth23(
+    bit512* ch_0,
+    bit512* ch_1,
+    bit512* ch_2,
+    bit512* ch_3,
+    bit512* ch_4,
+    bit512* ch_5,
+    bit512* ch_6,
+    bit512* ch_7,
+    bit512* ch_8,
+    bit512* ch_9,
+    bit512* ch_10,
+    bit512* ch_11,
+    bit512* ch_12,
+    bit512* ch_13,
+    bit512* ch_14,
+    bit512* ch_15,
+    bit512* ch_16,
+    bit512* ch_17,
+    bit512* ch_18,
+    bit512* ch_19,
+    bit512* ch_20,
+    bit512* ch_21,
+    bit512* ch_22,
+    long n);
+}
diff --git a/benchmarks/vitis_flow/bandwidth23/design/host.cpp b/benchmarks/vitis_flow/bandwidth23/design/host.cpp
new file mode 100644
index 00000000..1e5f972d
--- /dev/null
+++ b/benchmarks/vitis_flow/bandwidth23/design/host.cpp
@@ -0,0 +1,339 @@
+// Copyright 2024 RapidStream Design Automation, Inc.
+// All Rights Reserved.
+
+
+#define CL_HPP_TARGET_OPENCL_VERSION 120
+#define CL_HPP_MINIMUM_OPENCL_VERSION 120
+#define CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY 1
+
+#include <CL/cl2.hpp>
+#include <fstream>
+#include <iostream>
+#include <vector>
+#include "bandwidth23.h"
+
+void print_512(bit512 din){
+    // Print out the data 64-bit hex per line
+    for (int i = 0; i < 8; i++) {
+        printf("%08x%08x\n", (unsigned int) din(63+i*64, 32+i*64), (unsigned int) din(31+i*64, 0+i*64));
+    }
+}
+
+#define CHECK_MSG(msg, call) \
+    call; \
+    if (msg != CL_SUCCESS) { \
+        printf("%s:%d Error calling " #call ", error code is: %d\n", __FILE__, __LINE__, msg); \
+        exit(EXIT_FAILURE); \
+    }
+
+static const std::string error_message =
+    "Error: Result mismatch:\n"
+    "i = %d CPU result = %d Device result = %d\n";
+
+int main(int argc, char* argv[]) {
+    // Must specify the xclbin file as the second argument
+    if (argc != 2) {
+        std::cout << "Please run the application by: " << argv[0] << " <xclbin file>" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::string xclbin_file = argv[1];
+
+    // Number of 512-bit words transferred per channel
+    long DATA_SIZE = 4096;
+
+    std::vector<cl::Device> devices;
+    cl_int err;
+    cl::Context context;
+    cl::CommandQueue q;
+    cl::Kernel bandwidth23;
+    cl::Program program;
+    std::vector<cl::Platform> platforms;
+    bool device_found = false;
+
+    // Iterate through the platforms and find a Xilinx Alveo device
+    cl::Platform::get(&platforms);
+    for (size_t i = 0; (i < platforms.size()) && (device_found == false); i++) {
+        cl::Platform platform = platforms[i];
+        std::string platformName = platform.getInfo<CL_PLATFORM_NAME>();
+        if (platformName == "Xilinx") {
+            devices.clear();
+            platform.getDevices(CL_DEVICE_TYPE_ACCELERATOR, &devices);
+            if (devices.size()) {
+                device_found = true;
+                break;
+            }
+        }
+    }
+    if (device_found == false) {
+        std::cout << "Error: could not find the target Xilinx Alveo device" << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    std::cout << "INFO: reading " << xclbin_file << " xclbin file" << std::endl;
+    FILE* fp;
+    if ((fp = fopen(xclbin_file.c_str(), "r")) == nullptr) {
+        std::cout << "ERROR: cannot open " << xclbin_file.c_str() << " xclbin!"
                  << std::endl;
+        exit(EXIT_FAILURE);
+    }
+    fclose(fp);
+
+    // Load xclbin
+    std::cout << "INFO: loading: '" << xclbin_file << "'\n";
+    std::ifstream bin_file(xclbin_file, std::ifstream::binary);
+    bin_file.seekg(0, bin_file.end);
+    unsigned nb = bin_file.tellg();
+    bin_file.seekg(0, bin_file.beg);
+    char* buf = new char[nb];
+    bin_file.read(buf, nb);
+
+    // Creating Program from Binary File
+    cl::Program::Binaries bins;
+    bins.push_back({buf, nb});
+    bool valid_device = false;
+    for (unsigned int i = 0; i < devices.size(); i++) {
+        auto device = devices[i];
+        // For the device, we create a context and command queue
+        CHECK_MSG(err, context = cl::Context(device, nullptr, nullptr, nullptr, &err));
+        CHECK_MSG(err, q = cl::CommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err));
+        std::cout << "Trying to program device[" << i << "]: " << device.getInfo<CL_DEVICE_NAME>() << std::endl;
+        cl::Program program(context, {device}, bins, nullptr, &err);
+        if (err != CL_SUCCESS) {
+            std::cout << "Device[" << i << "]: failed to load xclbin file!\n";
+        } else {
+            std::cout << "Device[" << i << "]: xclbin is loaded successfully!\n";
+            CHECK_MSG(err, bandwidth23 = cl::Kernel(program, "bandwidth23", &err));
+            valid_device = true;
+            break; // we break because we found a valid device
+        }
+    }
+    if (!valid_device) {
+        std::cout << "Failed to program any device found, exit!\n";
+        exit(EXIT_FAILURE);
+    }
+
+    // These commands will allocate memory on the Device. The cl::Buffer objects can
+    // be used to reference the memory locations on the device.
+    CHECK_MSG(err, cl::Buffer buffer_ch_0(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+    CHECK_MSG(err, cl::Buffer buffer_ch_1(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+    CHECK_MSG(err, cl::Buffer buffer_ch_2(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+    CHECK_MSG(err, cl::Buffer buffer_ch_3(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+    CHECK_MSG(err, cl::Buffer buffer_ch_4(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+    CHECK_MSG(err, cl::Buffer buffer_ch_5(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+    CHECK_MSG(err, cl::Buffer buffer_ch_6(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+    CHECK_MSG(err, cl::Buffer buffer_ch_7(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+    CHECK_MSG(err, cl::Buffer buffer_ch_8(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+    CHECK_MSG(err, cl::Buffer buffer_ch_9(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+    CHECK_MSG(err, cl::Buffer buffer_ch_10(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+    CHECK_MSG(err, cl::Buffer buffer_ch_11(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+    CHECK_MSG(err, cl::Buffer buffer_ch_12(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+    CHECK_MSG(err, cl::Buffer buffer_ch_13(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+    CHECK_MSG(err, cl::Buffer buffer_ch_14(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+    CHECK_MSG(err, cl::Buffer buffer_ch_15(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+    CHECK_MSG(err, cl::Buffer buffer_ch_16(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+    CHECK_MSG(err, cl::Buffer buffer_ch_17(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err));
+    CHECK_MSG(err, cl::Buffer buffer_ch_18(context, CL_MEM_READ_WRITE,
sizeof(data_t)*DATA_SIZE, NULL, &err)); + CHECK_MSG(err, cl::Buffer buffer_ch_19(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err)); + CHECK_MSG(err, cl::Buffer buffer_ch_20(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err)); + CHECK_MSG(err, cl::Buffer buffer_ch_21(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err)); + CHECK_MSG(err, cl::Buffer buffer_ch_22(context, CL_MEM_READ_WRITE, sizeof(data_t)*DATA_SIZE, NULL, &err)); + + + // set the kernel Arguments + CHECK_MSG(err, err = bandwidth23.setArg(0, buffer_ch_0)); + CHECK_MSG(err, err = bandwidth23.setArg(1, buffer_ch_1)); + CHECK_MSG(err, err = bandwidth23.setArg(2, buffer_ch_2)); + CHECK_MSG(err, err = bandwidth23.setArg(3, buffer_ch_3)); + CHECK_MSG(err, err = bandwidth23.setArg(4, buffer_ch_4)); + CHECK_MSG(err, err = bandwidth23.setArg(5, buffer_ch_5)); + CHECK_MSG(err, err = bandwidth23.setArg(6, buffer_ch_6)); + CHECK_MSG(err, err = bandwidth23.setArg(7, buffer_ch_7)); + CHECK_MSG(err, err = bandwidth23.setArg(8, buffer_ch_8)); + CHECK_MSG(err, err = bandwidth23.setArg(9, buffer_ch_9)); + CHECK_MSG(err, err = bandwidth23.setArg(10, buffer_ch_10)); + CHECK_MSG(err, err = bandwidth23.setArg(11, buffer_ch_11)); + CHECK_MSG(err, err = bandwidth23.setArg(12, buffer_ch_12)); + CHECK_MSG(err, err = bandwidth23.setArg(13, buffer_ch_13)); + CHECK_MSG(err, err = bandwidth23.setArg(14, buffer_ch_14)); + CHECK_MSG(err, err = bandwidth23.setArg(15, buffer_ch_15)); + CHECK_MSG(err, err = bandwidth23.setArg(16, buffer_ch_16)); + CHECK_MSG(err, err = bandwidth23.setArg(17, buffer_ch_17)); + CHECK_MSG(err, err = bandwidth23.setArg(18, buffer_ch_18)); + CHECK_MSG(err, err = bandwidth23.setArg(19, buffer_ch_19)); + CHECK_MSG(err, err = bandwidth23.setArg(20, buffer_ch_20)); + CHECK_MSG(err, err = bandwidth23.setArg(21, buffer_ch_21)); + CHECK_MSG(err, err = bandwidth23.setArg(22, buffer_ch_22)); + + + // We then need to map our OpenCL buffers to get the pointers + data_t* ch_0; + data_t* ch_1; + data_t* ch_2; + data_t* ch_3; + data_t* ch_4; + data_t* ch_5; + data_t* ch_6; + data_t* ch_7; + data_t* ch_8; + data_t* ch_9; + data_t* ch_10; + data_t* ch_11; + data_t* ch_12; + data_t* ch_13; + data_t* ch_14; + data_t* ch_15; + data_t* ch_16; + data_t* ch_17; + data_t* ch_18; + data_t* ch_19; + data_t* ch_20; + data_t* ch_21; + data_t* ch_22; + + CHECK_MSG(err, ch_0 = (data_t*)q.enqueueMapBuffer(buffer_ch_0, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err)); + CHECK_MSG(err, ch_1 = (data_t*)q.enqueueMapBuffer(buffer_ch_1, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err)); + CHECK_MSG(err, ch_2 = (data_t*)q.enqueueMapBuffer(buffer_ch_2, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err)); + CHECK_MSG(err, ch_3 = (data_t*)q.enqueueMapBuffer(buffer_ch_3, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err)); + CHECK_MSG(err, ch_4 = (data_t*)q.enqueueMapBuffer(buffer_ch_4, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err)); + CHECK_MSG(err, ch_5 = (data_t*)q.enqueueMapBuffer(buffer_ch_5, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err)); + CHECK_MSG(err, ch_6 = (data_t*)q.enqueueMapBuffer(buffer_ch_6, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err)); + CHECK_MSG(err, ch_7 = (data_t*)q.enqueueMapBuffer(buffer_ch_7, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, 
NULL, NULL, &err)); + CHECK_MSG(err, ch_8 = (data_t*)q.enqueueMapBuffer(buffer_ch_8, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err)); + CHECK_MSG(err, ch_9 = (data_t*)q.enqueueMapBuffer(buffer_ch_9, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err)); + CHECK_MSG(err, ch_10 = (data_t*)q.enqueueMapBuffer(buffer_ch_10, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err)); + CHECK_MSG(err, ch_11 = (data_t*)q.enqueueMapBuffer(buffer_ch_11, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err)); + CHECK_MSG(err, ch_12 = (data_t*)q.enqueueMapBuffer(buffer_ch_12, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err)); + CHECK_MSG(err, ch_13 = (data_t*)q.enqueueMapBuffer(buffer_ch_13, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err)); + CHECK_MSG(err, ch_14 = (data_t*)q.enqueueMapBuffer(buffer_ch_14, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err)); + CHECK_MSG(err, ch_15 = (data_t*)q.enqueueMapBuffer(buffer_ch_15, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err)); + CHECK_MSG(err, ch_16 = (data_t*)q.enqueueMapBuffer(buffer_ch_16, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err)); + CHECK_MSG(err, ch_17 = (data_t*)q.enqueueMapBuffer(buffer_ch_17, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err)); + CHECK_MSG(err, ch_18 = (data_t*)q.enqueueMapBuffer(buffer_ch_18, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err)); + CHECK_MSG(err, ch_19 = (data_t*)q.enqueueMapBuffer(buffer_ch_19, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err)); + CHECK_MSG(err, ch_20 = (data_t*)q.enqueueMapBuffer(buffer_ch_20, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err)); + CHECK_MSG(err, ch_21 = (data_t*)q.enqueueMapBuffer(buffer_ch_21, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err)); + CHECK_MSG(err, ch_22 = (data_t*)q.enqueueMapBuffer(buffer_ch_22, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, sizeof(data_t)*DATA_SIZE, NULL, NULL, &err)); + + + // Initialize input data + for (int i = 0; i < DATA_SIZE; i++) { ch_0[i] = 0 ^ i; } + for (int i = 0; i < DATA_SIZE; i++) { ch_1[i] = 1 ^ i; } + for (int i = 0; i < DATA_SIZE; i++) { ch_2[i] = 2 ^ i; } + for (int i = 0; i < DATA_SIZE; i++) { ch_3[i] = 3 ^ i; } + for (int i = 0; i < DATA_SIZE; i++) { ch_4[i] = 4 ^ i; } + for (int i = 0; i < DATA_SIZE; i++) { ch_5[i] = 5 ^ i; } + for (int i = 0; i < DATA_SIZE; i++) { ch_6[i] = 6 ^ i; } + for (int i = 0; i < DATA_SIZE; i++) { ch_7[i] = 7 ^ i; } + for (int i = 0; i < DATA_SIZE; i++) { ch_8[i] = 8 ^ i; } + for (int i = 0; i < DATA_SIZE; i++) { ch_9[i] = 9 ^ i; } + for (int i = 0; i < DATA_SIZE; i++) { ch_10[i] = 10 ^ i; } + for (int i = 0; i < DATA_SIZE; i++) { ch_11[i] = 11 ^ i; } + for (int i = 0; i < DATA_SIZE; i++) { ch_12[i] = 12 ^ i; } + for (int i = 0; i < DATA_SIZE; i++) { ch_13[i] = 13 ^ i; } + for (int i = 0; i < DATA_SIZE; i++) { ch_14[i] = 14 ^ i; } + for (int i = 0; i < DATA_SIZE; i++) { ch_15[i] = 15 ^ i; } + for (int i = 0; i < DATA_SIZE; i++) { ch_16[i] = 16 ^ i; } + for (int i = 0; i < DATA_SIZE; i++) { ch_17[i] = 17 ^ i; } + for (int i = 0; i < DATA_SIZE; i++) { ch_18[i] = 18 ^ i; } + for (int i = 0; i < DATA_SIZE; i++) { ch_19[i] = 19 ^ i; } + for (int i 
= 0; i < DATA_SIZE; i++) { ch_20[i] = 20 ^ i; }
+    for (int i = 0; i < DATA_SIZE; i++) { ch_21[i] = 21 ^ i; }
+    for (int i = 0; i < DATA_SIZE; i++) { ch_22[i] = 22 ^ i; }
+
+    CHECK_MSG(err, err = bandwidth23.setArg(0, buffer_ch_0));
+    CHECK_MSG(err, err = bandwidth23.setArg(1, buffer_ch_1));
+    CHECK_MSG(err, err = bandwidth23.setArg(2, buffer_ch_2));
+    CHECK_MSG(err, err = bandwidth23.setArg(3, buffer_ch_3));
+    CHECK_MSG(err, err = bandwidth23.setArg(4, buffer_ch_4));
+    CHECK_MSG(err, err = bandwidth23.setArg(5, buffer_ch_5));
+    CHECK_MSG(err, err = bandwidth23.setArg(6, buffer_ch_6));
+    CHECK_MSG(err, err = bandwidth23.setArg(7, buffer_ch_7));
+    CHECK_MSG(err, err = bandwidth23.setArg(8, buffer_ch_8));
+    CHECK_MSG(err, err = bandwidth23.setArg(9, buffer_ch_9));
+    CHECK_MSG(err, err = bandwidth23.setArg(10, buffer_ch_10));
+    CHECK_MSG(err, err = bandwidth23.setArg(11, buffer_ch_11));
+    CHECK_MSG(err, err = bandwidth23.setArg(12, buffer_ch_12));
+    CHECK_MSG(err, err = bandwidth23.setArg(13, buffer_ch_13));
+    CHECK_MSG(err, err = bandwidth23.setArg(14, buffer_ch_14));
+    CHECK_MSG(err, err = bandwidth23.setArg(15, buffer_ch_15));
+    CHECK_MSG(err, err = bandwidth23.setArg(16, buffer_ch_16));
+    CHECK_MSG(err, err = bandwidth23.setArg(17, buffer_ch_17));
+    CHECK_MSG(err, err = bandwidth23.setArg(18, buffer_ch_18));
+    CHECK_MSG(err, err = bandwidth23.setArg(19, buffer_ch_19));
+    CHECK_MSG(err, err = bandwidth23.setArg(20, buffer_ch_20));
+    CHECK_MSG(err, err = bandwidth23.setArg(21, buffer_ch_21));
+    CHECK_MSG(err, err = bandwidth23.setArg(22, buffer_ch_22));
+    CHECK_MSG(err, err = bandwidth23.setArg(23, DATA_SIZE));
+
+
+    // Data will be migrated to device global memory
+    CHECK_MSG(err, err = q.enqueueMigrateMemObjects({buffer_ch_0, buffer_ch_1, buffer_ch_2, buffer_ch_3, buffer_ch_4, buffer_ch_5, buffer_ch_6, buffer_ch_7, buffer_ch_8, buffer_ch_9, buffer_ch_10, buffer_ch_11, buffer_ch_12, buffer_ch_13, buffer_ch_14, buffer_ch_15, buffer_ch_16, buffer_ch_17, buffer_ch_18, buffer_ch_19, buffer_ch_20, buffer_ch_21, buffer_ch_22}, 0 /* 0 means from host */));
+
+    // Launch the bandwidth23 kernel
+    CHECK_MSG(err, err = q.enqueueTask(bandwidth23));
+
+    // Migrate the result data back to host memory
+    CHECK_MSG(err, q.enqueueMigrateMemObjects({buffer_ch_0, buffer_ch_1, buffer_ch_2, buffer_ch_3, buffer_ch_4, buffer_ch_5, buffer_ch_6, buffer_ch_7, buffer_ch_8, buffer_ch_9, buffer_ch_10, buffer_ch_11, buffer_ch_12, buffer_ch_13, buffer_ch_14, buffer_ch_15, buffer_ch_16, buffer_ch_17, buffer_ch_18, buffer_ch_19, buffer_ch_20, buffer_ch_21, buffer_ch_22}, CL_MIGRATE_MEM_OBJECT_HOST));
+
+    // Wait for all the commands to complete
+    CHECK_MSG(err, q.finish());
+
+    // Verify the result: the kernel shifts every word left by one bit
+    int match = 0;
+    for (int i = 0; i < DATA_SIZE; i++) { if(ch_0[i] != ((0 ^ i))<<1) match++; }
+    for (int i = 0; i < DATA_SIZE; i++) { if(ch_1[i] != ((1 ^ i))<<1) match++; }
+    for (int i = 0; i < DATA_SIZE; i++) { if(ch_2[i] != ((2 ^ i))<<1) match++; }
+    for (int i = 0; i < DATA_SIZE; i++) { if(ch_3[i] != ((3 ^ i))<<1) match++; }
+    for (int i = 0; i < DATA_SIZE; i++) { if(ch_4[i] != ((4 ^ i))<<1) match++; }
+    for (int i = 0; i < DATA_SIZE; i++) { if(ch_5[i] != ((5 ^ i))<<1) match++; }
+    for (int i = 0; i < DATA_SIZE; i++) { if(ch_6[i] != ((6 ^ i))<<1) match++; }
+    for (int i = 0; i < DATA_SIZE; i++) { if(ch_7[i] != ((7 ^ i))<<1) match++; }
+    for (int i = 0; i < DATA_SIZE; i++) { if(ch_8[i] != ((8 ^ i))<<1) match++; }
+    for (int i = 0; i < DATA_SIZE; i++) { if(ch_9[i] != ((9 ^ i))<<1) match++; }
+    for (int i = 0; i < DATA_SIZE; i++) {
if(ch_10[i] != ((10 ^ i))<<1) match++; }
+    for (int i = 0; i < DATA_SIZE; i++) { if(ch_11[i] != ((11 ^ i))<<1) match++; }
+    for (int i = 0; i < DATA_SIZE; i++) { if(ch_12[i] != ((12 ^ i))<<1) match++; }
+    for (int i = 0; i < DATA_SIZE; i++) { if(ch_13[i] != ((13 ^ i))<<1) match++; }
+    for (int i = 0; i < DATA_SIZE; i++) { if(ch_14[i] != ((14 ^ i))<<1) match++; }
+    for (int i = 0; i < DATA_SIZE; i++) { if(ch_15[i] != ((15 ^ i))<<1) match++; }
+    for (int i = 0; i < DATA_SIZE; i++) { if(ch_16[i] != ((16 ^ i))<<1) match++; }
+    for (int i = 0; i < DATA_SIZE; i++) { if(ch_17[i] != ((17 ^ i))<<1) match++; }
+    for (int i = 0; i < DATA_SIZE; i++) { if(ch_18[i] != ((18 ^ i))<<1) match++; }
+    for (int i = 0; i < DATA_SIZE; i++) { if(ch_19[i] != ((19 ^ i))<<1) match++; }
+    for (int i = 0; i < DATA_SIZE; i++) { if(ch_20[i] != ((20 ^ i))<<1) match++; }
+    for (int i = 0; i < DATA_SIZE; i++) { if(ch_21[i] != ((21 ^ i))<<1) match++; }
+    for (int i = 0; i < DATA_SIZE; i++) { if(ch_22[i] != ((22 ^ i))<<1) match++; }
+
+
+    CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_0, ch_0));
+    CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_1, ch_1));
+    CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_2, ch_2));
+    CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_3, ch_3));
+    CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_4, ch_4));
+    CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_5, ch_5));
+    CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_6, ch_6));
+    CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_7, ch_7));
+    CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_8, ch_8));
+    CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_9, ch_9));
+    CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_10, ch_10));
+    CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_11, ch_11));
+    CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_12, ch_12));
+    CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_13, ch_13));
+    CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_14, ch_14));
+    CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_15, ch_15));
+    CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_16, ch_16));
+    CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_17, ch_17));
+    CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_18, ch_18));
+    CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_19, ch_19));
+    CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_20, ch_20));
+    CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_21, ch_21));
+    CHECK_MSG(err, err = q.enqueueUnmapMemObject(buffer_ch_22, ch_22));
+
+    CHECK_MSG(err, err = q.finish());
+
+    if (match == 0) {
+        std::cout << "TEST PASSED!" << std::endl;
+    } else {
+        std::cout << "TEST FAILED: " << match << " mismatches!" << std::endl;
+    }
+    return (match ?
EXIT_FAILURE : EXIT_SUCCESS);
+}
diff --git a/benchmarks/vitis_flow/bandwidth23/design/vck5000.cfg b/benchmarks/vitis_flow/bandwidth23/design/vck5000.cfg
new file mode 100644
index 00000000..cf375c2d
--- /dev/null
+++ b/benchmarks/vitis_flow/bandwidth23/design/vck5000.cfg
@@ -0,0 +1,27 @@
+platform=xilinx_vck5000_gen4x8_qdma_2_202220_1
+
+[connectivity]
+
+sp = bandwidth23.m_axi_ch_0:MC_NOC0
+sp = bandwidth23.m_axi_ch_1:MC_NOC0
+sp = bandwidth23.m_axi_ch_2:MC_NOC0
+sp = bandwidth23.m_axi_ch_3:MC_NOC0
+sp = bandwidth23.m_axi_ch_4:MC_NOC0
+sp = bandwidth23.m_axi_ch_5:MC_NOC0
+sp = bandwidth23.m_axi_ch_6:MC_NOC0
+sp = bandwidth23.m_axi_ch_7:MC_NOC0
+sp = bandwidth23.m_axi_ch_8:MC_NOC0
+sp = bandwidth23.m_axi_ch_9:MC_NOC0
+sp = bandwidth23.m_axi_ch_10:MC_NOC0
+sp = bandwidth23.m_axi_ch_11:MC_NOC0
+sp = bandwidth23.m_axi_ch_12:MC_NOC0
+sp = bandwidth23.m_axi_ch_13:MC_NOC0
+sp = bandwidth23.m_axi_ch_14:MC_NOC0
+sp = bandwidth23.m_axi_ch_15:MC_NOC0
+sp = bandwidth23.m_axi_ch_16:MC_NOC0
+sp = bandwidth23.m_axi_ch_17:MC_NOC0
+sp = bandwidth23.m_axi_ch_18:MC_NOC0
+sp = bandwidth23.m_axi_ch_19:MC_NOC0
+sp = bandwidth23.m_axi_ch_20:MC_NOC0
+sp = bandwidth23.m_axi_ch_21:MC_NOC0
+sp = bandwidth23.m_axi_ch_22:MC_NOC0
diff --git a/benchmarks/vitis_flow/bandwidth23/run_u50.py b/benchmarks/vitis_flow/bandwidth23/run_u50.py
new file mode 100644
index 00000000..34aece07
--- /dev/null
+++ b/benchmarks/vitis_flow/bandwidth23/run_u50.py
@@ -0,0 +1,40 @@
+"""Optimizing the bert_all kernel in the Vitis flow.
+
+This script demonstrates how to optimize the bert_all design packaged in a
+Vitis object file. In this example, the object file is generated by Vitis HLS.
+"""
+
+__copyright__ = """
+Copyright (c) 2024 RapidStream Design Automation, Inc. and contributors. All rights reserved.
+The contributor(s) of this file has/have agreed to the RapidStream Contributor License Agreement.
+"""
+
+from rapidstream import get_u50_vitis_device_factory, RapidStreamVitis
+import os
+
+CURR_DIR = os.path.dirname(os.path.abspath(__file__))
+
+# Use RapidStreamVitis for ".xo" files generated by `v++` or Vitis HLS.
+# Create a RapidStream project in the "build" directory:
+rs = RapidStreamVitis(f"{CURR_DIR}/build")
+
+# Use the "xilinx_u50_gen3x16_xdma_5_202210_1" platform as the device:
+u50_factory = get_u50_vitis_device_factory("xilinx_u50_gen3x16_xdma_5_202210_1")
+rs.set_virtual_device(u50_factory.generate_virtual_device())
+
+# Add the design object file (".xo") to the project:
+rs.add_xo_file(f"{CURR_DIR}/build/bert_all.xo")
+
+# Specify the Vitis platform and connectivity configuration:
+rs.set_vitis_platform("xilinx_u50_gen3x16_xdma_5_202210_1")
+rs.set_vitis_connectivity_config(f"{CURR_DIR}/design/link_config_hbm.ini")
+
+# Set the clock target for the design:
+rs.add_clock("ap_clk", period_ns=3)
+
+# Constrain all kernel ports to the SLOT_X1Y0 region:
+rs.assign_port_to_region(".*", "SLOT_X1Y0:SLOT_X1Y0")
+
+# Start the RapidStream optimization process:
+rs.run_dse()
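+
+# After run_dse() completes, the optimized bitstreams can be located with:
+#   find ./build/dse/ -name "*.xclbin"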