Commit 7fd6996

feat(tapa): check in bandwidth app

vagrantxiao24 committed Dec 10, 2024
1 parent 7f81e0b commit 7fd6996

Showing 31 changed files with 2,258 additions and 0 deletions.
114 changes: 114 additions & 0 deletions benchmarks/tapa_flow/bandwidth23/Makefile
@@ -0,0 +1,114 @@
# Copyright (c) 2024 RapidStream Design Automation, Inc. and contributors. All rights reserved.
# The contributor(s) of this file has/have agreed to the RapidStream Contributor License Agreement.

ROOT_DIR := $(shell git rev-parse --show-toplevel)
KERNEL_NAME := bandwidth23
RS_SCRIPT := $(CURDIR)/run.py
SRC_DIR := $(CURDIR)/design
AB_CONFIG := $(CURDIR)/design/config/$(notdir $(RS_SCRIPT))/ab_config.json
IMPL_CONFIG := $(CURDIR)/design/config/$(notdir $(RS_SCRIPT))/impl_config.json
LINK_CONFIG := $(CURDIR)/design/config/$(notdir $(RS_SCRIPT))/link_config.ini
PLATFORM := xilinx_vck5000_gen4x8_qdma_2_202220_1
PART_NUM := xcvc1902-vsvd1760-2MP-e-S
GRP_UTIL := $(ROOT_DIR)/common/util/get_group.py
TEMP_DIR := $(CURDIR)/build/$(notdir $(RS_SCRIPT))
RS_TARGET := $(TEMP_DIR)/dse/solution_0/vitis_run_hw/$(KERNEL_NAME)_$(PLATFORM).xclbin
BUILD_LOG := $(TEMP_DIR)/build.json
SUCCESS := "Build Successful"
TIMING_RPT := impl_1_hw_bb_locked_timing_summary_routed.rpt
SLACK_GETTER := $(ROOT_DIR)/common/util/get_slack.py
RSPATH := $(CURDIR)
RSXX := rapidstream
RSPYTHON := rapidstream
DEVICE_CONFIG := $(TEMP_DIR)/device.json
DEVICE_GEN := $(CURDIR)/gen_device.py
INCLUDE := -I $(XILINX_HLS)/include
KERNEL_XO := $(TEMP_DIR)/$(KERNEL_NAME).xo
KERNEL_XCLBIN := $(TEMP_DIR)/$(KERNEL_NAME).xclbin
KERNEL_XSA := $(TEMP_DIR)/$(KERNEL_NAME).xsa
TARGET := hw

all: $(RS_TARGET)
	cd $(RSPATH) && $(RSPYTHON) $(SLACK_GETTER) -d $(TEMP_DIR) -i $(TIMING_RPT) -o $(BUILD_LOG) -c clk_kernel_00_unbuffered_net -p 3.333
	@echo $(SUCCESS)

$(RS_TARGET): $(KERNEL_XO) $(DEVICE_CONFIG)
	mkdir -p $(TEMP_DIR)
	cd $(RSPATH) && $(RSXX)-tapaopt \
		--work-dir $(TEMP_DIR) \
		--tapa-xo-path $< \
		--device-config $(DEVICE_CONFIG) \
		--floorplan-config $(AB_CONFIG) \
		--single-reg \
		--run-impl \
		--implementation-config $(IMPL_CONFIG) \
		--connectivity-ini $(LINK_CONFIG)

$(DEVICE_CONFIG): $(AB_CONFIG)
	mkdir -p $(TEMP_DIR)
	cd $(RSPATH) && $(RSPYTHON) $(RS_SCRIPT)

cosim: $(KERNEL_XO) $(TEMP_DIR)/main.exe
	cd $(TEMP_DIR) && $(TEMP_DIR)/main.exe 1024 \
		--bitstream $< \
		-xosim_work_dir $(TEMP_DIR)/xosim_work_dir

hw: $(KERNEL_XCLBIN)

$(KERNEL_XCLBIN): $(KERNEL_XSA)
	@echo "### ***** packaging $(KERNEL_XSA) into $(KERNEL_XCLBIN) ... *****"
	cd $(TEMP_DIR) && v++ --package -t $(TARGET) --platform $(PLATFORM) \
		$^ \
		--temp_dir $(TEMP_DIR) \
		--save-temps \
		--report_dir $(TEMP_DIR)/reports/ \
		--package.boot_mode=ospi \
		-o $@ 2>&1 | tee $(KERNEL_NAME)_xclbin.log
	@echo "### ***** $(KERNEL_XCLBIN) packaging done! *****"

$(KERNEL_XSA): $(KERNEL_XO)
	cd $(TEMP_DIR) && v++ -l -t ${TARGET} \
		--connectivity.nk $(KERNEL_NAME):1:$(KERNEL_NAME) \
		--config $(SRC_DIR)/vck5000.cfg \
		--save-temps \
		--temp_dir $(TEMP_DIR) \
		--clock.defaultFreqHz 250000000 \
		--vivado.synth.jobs 16 \
		$< -o $@

xo: $(KERNEL_XO)

$(KERNEL_XO): $(SRC_DIR)/$(KERNEL_NAME).cpp
	mkdir -p $(TEMP_DIR)
	cd $(TEMP_DIR) && tapa compile \
		--top $(KERNEL_NAME) \
		--part-num xcu55c-fsvh2892-2L-e \
		--clock-period 3.33 \
		-o $(KERNEL_NAME).xo \
		-f $< \
		2>&1 | tee tapa.log

csim: $(TEMP_DIR)/main.exe

$(TEMP_DIR)/main.exe: $(SRC_DIR)/*.cpp
	mkdir -p $(TEMP_DIR)
	cd $(TEMP_DIR) && tapa g++ $^ $(INCLUDE) -o $(TEMP_DIR)/main.exe -O2
	$(TEMP_DIR)/main.exe

show_groups:
	rapidstream $(GRP_UTIL) -i $(TEMP_DIR)/passes/0-imported.json \
		-o $(TEMP_DIR)/module_types.csv



clean:
	rm -rf $(TEMP_DIR) *.log
	rm -rf .Xil .run
	rm -rf *.exe
	rm -rf .ipcache

cleanall:
	rm -rf build *.log
	rm -rf .Xil .run
	rm -rf *.exe
	rm -rf .ipcache
141 changes: 141 additions & 0 deletions benchmarks/tapa_flow/bandwidth23/README.md
@@ -0,0 +1,141 @@
<!--
Copyright (c) 2024 RapidStream Design Automation, Inc. and contributors. All rights reserved.
The contributor(s) of this file has/have agreed to the RapidStream Contributor License Agreement.
-->

<img src="https://imagedelivery.net/AU8IzMTGgpVmEBfwPILIgw/1b565657-df33-41f9-f29e-0d539743e700/128" width="64px" alt="RapidStream Logo" />

# TAPA Flow: ORC Decoder

## Introduction


In this recipe, we demonstrate how to use RapidStream to optimize TAPA projects. The basic steps, summarized in the command sketch after this list, include:

- Compile the HLS C++ code into a Vitis-compatible .xo file using TAPA.
- Optimize the .xo file with RapidStream to obtain an optimized .xo file.
- Use Vitis to compile the optimized .xo file into an .xclbin file for FPGA deployment.
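
These steps can be driven with the Makefile that accompanies this README; the sketch below assumes you run it from the benchmark directory, and `make all` runs the RapidStream optimization together with the subsequent Vitis implementation:

```bash
# Step 1: compile the HLS C++ kernel into a Vitis object file (.xo)
make xo

# Steps 2-3: let RapidStream optimize the .xo and drive Vitis implementation;
# the resulting .xclbin lands under build/
make all

# Optional: software simulation and hardware co-simulation
make csim
make cosim
```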

## Tutorial

### Step 1 (Done): Generate the Xilinx Object File (`.xo`)


We use TAPA to generate the `.xo` file. In case you have not installed TAPA, we have already compiled the C++ source to `.xo` for you: the original C++ source files are located in `design/src`, and the generated `.xo` file can be found at `design/generated/data_decoding.xo`. To compile C++ to `.xo` with TAPA, we use the script [design/run_tapa.sh](design/run_tapa.sh); the detailed commands are shown below. For your convenience, we have also backed up all the metadata generated by TAPA in the `design/generated` directory.

```bash
WORK_DIR=generated
tapac \
--work-dir ${WORK_DIR} \
--top data_decoding \
--part-num xcu280-fsvh2892-2L-e \
--clock-period 3.33 \
-o ${WORK_DIR}/data_decoding.xo \
--connectivity config/link_config.ini \
src/data_decoder.cpp \
2>&1 | tee tapa.log
```

### Step 2: Use RapidStream to Optimize the `.xo` Design

The RapidStream flow conducts design space exploration and generates solutions by taking the TAPA-generated `.xo` file as input.
The RapidStream flow for TAPA requires the following key inputs:

- **Platform**: The Vitis platform (e.g., `xilinx_u280_gen3x16_xdma_1_202211_1`).
- **Device**: A virtual device defined by calling RapidStream APIs based on the platform (e.g., `get_u280_vitis_device_factory`).
- **`.xo` file**: The `.xo` file generated by TAPA.
- **Connectivity** (`.ini`): The configuration file for `v++` (`design/config/run.py/link_config.ini`).
- **Top module name**: The top module name of the kernel.
- **Clock**: All the clocks and their target periods.
- **Flatten module**: Within a design, not all modules need to be optimized. The flatten modules are the target modules RapidStream will optimize.

The Python snippet below shows how we create a RapidStream instance and set up the RapidStream environment.

```Python
from rapidstream import get_u280_vitis_device_factory, RapidStreamTAPA
import os

CURR_DIR = os.path.dirname(os.path.abspath(__file__))
INI_PATH = f"{CURR_DIR}/design/config/link_config.ini"
VITIS_PLATFORM = "xilinx_u280_gen3x16_xdma_1_202211_1"
XO_PATH = f"{CURR_DIR}/design/generated/data_decoding.xo"
kernel_name = "data_decoding"
factory = get_u280_vitis_device_factory(VITIS_PLATFORM)
rs = RapidStreamTAPA(f"{CURR_DIR}/build")
rs.set_virtual_device(factory.generate_virtual_device())
rs.add_xo_file(XO_PATH)
rs.set_vitis_platform(VITIS_PLATFORM)
rs.set_vitis_connectivity_config(INI_PATH)
rs.set_top_module_name(kernel_name)
rs.add_clock("ap_clk", 3.33)
rs.add_flatten_targets([kernel_name])
```

The HBM AXI port connection is described in `design/config/run.py/link_config.ini`.

```ini
[connectivity]
sp=data_decoding.input_port:HBM[0:1]
sp=data_decoding.output_port0_32b_8b:HBM[16:17]
sp=data_decoding.output_port1_16b_8b:HBM[18:19]
sp=data_decoding.output_port2_16b_8b:HBM[20:21]
sp=data_decoding.output_port3_8b:HBM[22:23]
sp=data_decoding.output_port4_Track:HBM[24:25]
```

Because the kernel ports are bound to specific HBM channels, it is necessary to assign each kernel port to the appropriate slot. The Python code below demonstrates this process. For the complete linking details, please refer to the `design/config/run.py/link_config.ini` file.

```Python
# Bind ports to HBM 16-31
right_slot = "SLOT_X1Y0:SLOT_X1Y0"
left_slot = "SLOT_X0Y0:SLOT_X0Y0"
rs.assign_port_to_region(".*input_port.*", left_slot)
rs.assign_port_to_region(".*output_port0_32b_8b.*", right_slot)
rs.assign_port_to_region(".*output_port1_16b_8b.*", right_slot)
rs.assign_port_to_region(".*output_port2_16b_8b.*", right_slot)
rs.assign_port_to_region(".*output_port3_8b.*", right_slot)
rs.assign_port_to_region(".*output_port4_Track.*", right_slot)
rs.assign_port_to_region("s_axi_control_.*", left_slot)
rs.assign_port_to_region("ap_clk", left_slot)
rs.assign_port_to_region("ap_rst_n", left_slot)
rs.assign_port_to_region("interrupt", left_slot)
```

For the complete details, please refer to the [./run.py](./run.py) file. Invoke RapidStream by launching the command below, or simply run `make all`.

```bash
rapidstream run.py
```

If everything goes well, you should get at least one optimized `.xclbin` file.
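
A quick way to confirm the output (the exact location depends on the build directory passed to `RapidStreamTAPA`; with this recipe's settings it lands under `build/`):

```bash
find build -name "*.xclbin"
```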




### Step 3: Check the Group Module Report


RapidStream mandates a clear distinction between communication and computation within user designs.

- In `Group modules`, users are tasked solely with defining inter-submodule communication. For those familiar with Vivado IP Integrator flow, crafting a Group module mirrors the process of connecting IPs in IPI. RapidStream subsequently integrates appropriate pipeline registers into these Group modules.

- In `Leaf modules`, users retain the flexibility to implement diverse computational patterns, as RapidStream leaves these Leaf modules unchanged.

For further details, please consult the [code style](https://docs.rapidstream-da.com/required-coding-style/) section in our Documentation.
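
For illustration only (this sketch is not part of the benchmark), the distinction looks roughly like this in TAPA C++: the upper-level task only declares streams and connects sub-tasks (a Group module), while the sub-tasks hold the actual computation (Leaf modules). The task and stream names here are made up.

```cpp
#include <tapa.h>

// Leaf module: arbitrary computation is allowed here; RapidStream leaves it unchanged.
void Producer(tapa::ostream<int>& out, int n) {
  for (int i = 0; i < n; ++i) out.write(i);
}

// Leaf module: reads from one stream, computes, writes to another.
void AddOne(tapa::istream<int>& in, tapa::ostream<int>& out, int n) {
  for (int i = 0; i < n; ++i) out.write(in.read() + 1);
}

// Leaf module: drains the result back to device memory.
void Consumer(tapa::istream<int>& in, tapa::mmap<int> mem, int n) {
  for (int i = 0; i < n; ++i) mem[i] = in.read();
}

// Group module: no computation, only stream declarations and sub-task invocations,
// so RapidStream is free to insert pipeline registers on these connections.
void Top(tapa::mmap<int> mem, int n) {
  tapa::stream<int> s0("s0");
  tapa::stream<int> s1("s1");
  tapa::task()
      .invoke(Producer, s0, n)
      .invoke(AddOne, s0, s1, n)
      .invoke(Consumer, s1, mem, n);
}
```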

To generate a report on the group types, execute the command below or run `make show_groups`:

```bash
rapidstream ../../../common/util/get_group.py \
-i build/passes/0-imported.json \
-o build/module_types.csv
```

The module types for your design can be found in `build/module_types.csv`. Below, we list the four Group modules. In this design, `data_decoding` serves as a Group module, while the other three modules are added by RapidStream.

| Module Name                         | Group Type       |
|:-----------------------------------:|:----------------:|
| `data_decoding`                     | `grouped_module` |
| `__rs_ap_ctrl_start_ready_pipeline` | `grouped_module` |
| `__rs_ff_pipeline`                  | `grouped_module` |
| `__rs_hs_pipeline`                  | `grouped_module` |
127 changes: 127 additions & 0 deletions benchmarks/tapa_flow/bandwidth23/design/bandwidth-host.cpp
@@ -0,0 +1,127 @@
#include <iostream>
#include <vector>

#include <gflags/gflags.h>
#include <tapa.h>
#include "bandwidth23.h"

using std::clog;
using std::endl;
using std::vector;

DEFINE_string(bitstream, "", "path to bitstream file, run csim if empty");

int main(int argc, char* argv[]) {
gflags::ParseCommandLineFlags(&argc, &argv, /*remove_flags=*/true);

const uint64_t n = argc > 1 ? atoll(argv[1]) : 1024 * 1024;

vector<bit512> rmem0(n);
vector<bit512> rmem1(n);
vector<bit512> rmem2(n);
vector<bit512> rmem3(n);
vector<bit512> rmem4(n);
vector<bit512> rmem5(n);
vector<bit512> rmem6(n);
vector<bit512> rmem7(n);
vector<bit512> rmem8(n);
vector<bit512> rmem9(n);
vector<bit512> rmem10(n);
vector<bit512> rmem11(n);
vector<bit512> rmem12(n);
vector<bit512> rmem13(n);
vector<bit512> rmem14(n);
vector<bit512> rmem15(n);
vector<bit512> rmem16(n);
vector<bit512> rmem17(n);
vector<bit512> rmem18(n);
vector<bit512> rmem19(n);
vector<bit512> rmem20(n);
vector<bit512> rmem21(n);
vector<bit512> rmem22(n);


for (uint64_t i = 0; i < n; ++i) {
rmem0[i] = i;
rmem1[i] = i;
rmem2[i] = i;
rmem3[i] = i;
rmem4[i] = i;
rmem5[i] = i;
rmem6[i] = i;
rmem7[i] = i;
rmem8[i] = i;
rmem9[i] = i;
rmem10[i] = i;
rmem11[i] = i;
rmem12[i] = i;
rmem13[i] = i;
rmem14[i] = i;
rmem15[i] = i;
rmem16[i] = i;
rmem17[i] = i;
rmem18[i] = i;
rmem19[i] = i;
rmem20[i] = i;
rmem21[i] = i;
rmem22[i] = i;
}
int64_t kernel_time_ns = tapa::invoke(
bandwidth23,
FLAGS_bitstream,
tapa::read_write_mmap<bit512>(rmem0),
tapa::read_write_mmap<bit512>(rmem1),
tapa::read_write_mmap<bit512>(rmem2),
tapa::read_write_mmap<bit512>(rmem3),
tapa::read_write_mmap<bit512>(rmem4),
tapa::read_write_mmap<bit512>(rmem5),
tapa::read_write_mmap<bit512>(rmem6),
tapa::read_write_mmap<bit512>(rmem7),
tapa::read_write_mmap<bit512>(rmem8),
tapa::read_write_mmap<bit512>(rmem9),
tapa::read_write_mmap<bit512>(rmem10),
tapa::read_write_mmap<bit512>(rmem11),
tapa::read_write_mmap<bit512>(rmem12),
tapa::read_write_mmap<bit512>(rmem13),
tapa::read_write_mmap<bit512>(rmem14),
tapa::read_write_mmap<bit512>(rmem15),
tapa::read_write_mmap<bit512>(rmem16),
tapa::read_write_mmap<bit512>(rmem17),
tapa::read_write_mmap<bit512>(rmem18),
tapa::read_write_mmap<bit512>(rmem19),
tapa::read_write_mmap<bit512>(rmem20),
tapa::read_write_mmap<bit512>(rmem21),
tapa::read_write_mmap<bit512>(rmem22),
n);

clog << "kernel time: " << kernel_time_ns * 1e-9 << " s" << endl;

uint64_t num_errors = 0;
const uint64_t threshold = 10; // only report up to these errors
for (uint64_t i = 0; i < n; ++i) {
bit512 out512 = (i << 1);
if (rmem0[i] != out512) {
if (num_errors < threshold) {
clog << "error at " << i << ": expected " << out512 << ", got "
<< rmem0[i] << endl;
}
++num_errors;
}
if (rmem22[i] != out512) {
if (num_errors < threshold) {
clog << "error at " << i << ": expected " << out512 << ", got "
<< rmem22[i] << endl;
}
++num_errors;
}
}
if (num_errors == 0) {
clog << "PASS!" << endl;
} else {
if (num_errors > threshold) {
clog << " (+" << (num_errors - threshold) << " more errors)" << endl;
}
clog << "FAIL!" << endl;
}
return num_errors > 0 ? 1 : 0;
}