From 9e43db559985b05792d5842ba522789cdd64fac6 Mon Sep 17 00:00:00 2001 From: Sergi Granell Date: Sat, 1 Jul 2023 12:29:11 +0900 Subject: [PATCH] runtime: Add initial Xilinx Runtime Library (XRT) support --- src/runtime/CMakeLists.txt | 2 + src/runtime/HalideRuntimeXRT.h | 41 + src/runtime/mini_xrt.h | 1354 ++++++++++++++++++++++++++++++++ src/runtime/runtime_api.cpp | 5 + src/runtime/runtime_internal.h | 1 + src/runtime/xrt.cpp | 636 +++++++++++++++ 6 files changed, 2039 insertions(+) create mode 100644 src/runtime/HalideRuntimeXRT.h create mode 100644 src/runtime/mini_xrt.h create mode 100644 src/runtime/xrt.cpp diff --git a/src/runtime/CMakeLists.txt b/src/runtime/CMakeLists.txt index 71af475c2eb4..b4825083b1c2 100644 --- a/src/runtime/CMakeLists.txt +++ b/src/runtime/CMakeLists.txt @@ -98,6 +98,7 @@ set(RUNTIME_CPP windows_yield write_debug_image x86_cpu_features + xrt ) set(RUNTIME_LL @@ -139,6 +140,7 @@ set(RUNTIME_HEADER_FILES HalideRuntimeQurt.h HalideRuntimeVulkan.h HalideRuntimeWebGPU.h + HalideRuntimeXRT.h ) # Need to create an object library for this because CMake diff --git a/src/runtime/HalideRuntimeXRT.h b/src/runtime/HalideRuntimeXRT.h new file mode 100644 index 000000000000..7680893ef306 --- /dev/null +++ b/src/runtime/HalideRuntimeXRT.h @@ -0,0 +1,41 @@ +#ifndef HALIDE_HALIDERUNTIMEXRT_H +#define HALIDE_HALIDERUNTIMEXRT_H + +// Don't include HalideRuntime.h if the contents of it were already pasted into a generated header above this one +#ifndef HALIDE_HALIDERUNTIME_H + +#include "HalideRuntime.h" + +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/** \file + * Routines specific to the Halide XRT runtime. + */ + +#define HALIDE_RUNTIME_XRT + +extern const struct halide_device_interface_t *halide_xrt_device_interface(); + +/** These are forward declared here to allow clients to override the + * Halide XRT runtime. Do not call them. 
*/ +// @{ +extern int halide_xrt_initialize_kernels(void *user_context, void **state_ptr, + const char *kernel_name); +extern int halide_xrt_run(void *user_context, + void *state_ptr, + const char *entry_name, + halide_type_t arg_types[], + void *args[], + int8_t arg_is_buffer[]); +extern void halide_xrt_finalize_kernels(void *user_context, void *state_ptr); +// @} + +#ifdef __cplusplus +} // End extern "C" +#endif + +#endif // HALIDE_HALIDERUNTIMEXRT_H diff --git a/src/runtime/mini_xrt.h b/src/runtime/mini_xrt.h new file mode 100644 index 000000000000..8770c87bf41d --- /dev/null +++ b/src/runtime/mini_xrt.h @@ -0,0 +1,1354 @@ +/* + * Copyright (C) 2019-2022, Xilinx Inc + * + * This file is dual licensed. It may be redistributed and/or modified + * under the terms of the Apache 2.0 License OR version 2 of the GNU + * General Public License. + * + * Apache License Verbiage + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * GPL license Verbiage: + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. This program is + * distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public + * License for more details. 
You should have received a copy of the + * GNU General Public License along with this program; if not, write + * to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, + * Boston, MA 02111-1307 USA + * + */ + +#ifndef MINI_XRT_H +#define MINI_XRT_H + +#ifdef __GNUC__ +#define XRT_DEPRECATED __attribute__((deprecated)) +#else +#define XRT_DEPRECATED +#endif + +#if defined(_WIN32) +#ifdef XCL_DRIVER_DLL_EXPORT +#define XCL_DRIVER_DLLESPEC __declspec(dllexport) +#else +#define XCL_DRIVER_DLLESPEC __declspec(dllimport) +#endif +#else +#define XCL_DRIVER_DLLESPEC __attribute__((visibility("default"))) +#endif + +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wpedantic" +#endif + +#define to_cfg_pkg(pkg) \ + ((struct ert_configure_cmd *)(pkg)) +#define to_start_krnl_pkg(pkg) \ + ((struct ert_start_kernel_cmd *)(pkg)) +#define to_copybo_pkg(pkg) \ + ((struct ert_start_copybo_cmd *)(pkg)) +#define to_cfg_sk_pkg(pkg) \ + ((struct ert_configure_sk_cmd *)(pkg)) +#define to_init_krnl_pkg(pkg) \ + ((struct ert_init_kernel_cmd *)(pkg)) +#define to_validate_pkg(pkg) \ + ((struct ert_validate_cmd *)(pkg)) +#define to_abort_pkg(pkg) \ + ((struct ert_abort_cmd *)(pkg)) + +#define HOST_RW_PATTERN 0xF0F0F0F0 +#define DEVICE_RW_PATTERN 0x0F0F0F0F + +typedef unsigned char xuid_t[16]; + +#define XRT_NULL_HANDLE nullptr + +/** + * typedef xrtDeviceHandle - opaque device handle + */ +typedef void *xrtDeviceHandle; + +/** + * typedef xrtBufferHandle - opaque buffer handle + */ +typedef void *xrtBufferHandle; + +/** + * typedef xrtBufferFlags - flags for BO + * + * See ``xrt_mem.h`` for available flags + */ +typedef uint64_t xrtBufferFlags; + +/** + * typedef xrtMemoryGroup - Memory bank group for buffer + */ +typedef uint32_t xrtMemoryGroup; + +/** + * typedef xrtKernelHandle - opaque kernel handle + * + * A kernel handle is obtained by opening a kernel. Clients + * pass this kernel handle to APIs that operate on a kernel. 
+ */ +typedef void *xrtKernelHandle; + +/** + * typedef xrtRunHandle - opaque handle to a specific kernel run + * + * A run handle is obtained by running a kernel. Clients + * use a run handle to check or wait for kernel completion. + */ +typedef void *xrtRunHandle; // NOLINT + +enum xclBOSyncDirection { + XCL_BO_SYNC_BO_TO_DEVICE = 0, + XCL_BO_SYNC_BO_FROM_DEVICE, + XCL_BO_SYNC_BO_GMIO_TO_AIE, + XCL_BO_SYNC_BO_AIE_TO_GMIO, +}; + +/** + * Encoding of flags passed to xcl buffer allocation APIs + */ +struct xcl_bo_flags { + union { + uint32_t flags; + struct { + uint16_t bank; // [15-0] + uint8_t slot; // [16-23] + uint8_t boflags; // [24-31] + }; + }; +}; + +/** + * XCL BO Flags bits layout + * + * bits 0 ~ 15: DDR BANK index + * bits 24 ~ 31: BO flags + */ +#define XRT_BO_FLAGS_MEMIDX_MASK (0xFFFFFFUL) +#define XCL_BO_FLAGS_NONE (0) +#define XCL_BO_FLAGS_CACHEABLE (1U << 24) +#define XCL_BO_FLAGS_KERNBUF (1U << 25) +#define XCL_BO_FLAGS_SGL (1U << 26) +#define XCL_BO_FLAGS_SVM (1U << 27) +#define XCL_BO_FLAGS_DEV_ONLY (1U << 28) +#define XCL_BO_FLAGS_HOST_ONLY (1U << 29) +#define XCL_BO_FLAGS_P2P (1U << 30) +#define XCL_BO_FLAGS_EXECBUF (1U << 31) + +/** + * XRT Native BO flags + * + * These flags are simple aliases for use with XRT native BO APIs. + */ +#define XRT_BO_FLAGS_NONE XCL_BO_FLAGS_NONE +#define XRT_BO_FLAGS_CACHEABLE XCL_BO_FLAGS_CACHEABLE +#define XRT_BO_FLAGS_DEV_ONLY XCL_BO_FLAGS_DEV_ONLY +#define XRT_BO_FLAGS_HOST_ONLY XCL_BO_FLAGS_HOST_ONLY +#define XRT_BO_FLAGS_P2P XCL_BO_FLAGS_P2P +#define XRT_BO_FLAGS_SVM XCL_BO_FLAGS_SVM + +/** + * This is the legacy usage of XCL DDR Flags. 
+ * + * byte-0 lower 4 bits for DDR Flags are one-hot encoded + */ +enum xclDDRFlags { + XCL_DEVICE_RAM_BANK0 = 0x00000000, + XCL_DEVICE_RAM_BANK1 = 0x00000002, + XCL_DEVICE_RAM_BANK2 = 0x00000004, + XCL_DEVICE_RAM_BANK3 = 0x00000008, +}; + +/** + * struct ert_packet: ERT generic packet format + * + * @state: [3-0] current state of a command + * @custom: [11-4] custom per specific commands + * @count: [22-12] number of words in payload (data) + * @opcode: [27-23] opcode identifying specific command + * @type: [31-28] type of command (currently 0) + * @data: count number of words representing packet payload + */ +struct ert_packet { + union { + struct { + uint32_t state : 4; /* [3-0] */ + uint32_t custom : 8; /* [11-4] */ + uint32_t count : 11; /* [22-12] */ + uint32_t opcode : 5; /* [27-23] */ + uint32_t type : 4; /* [31-28] */ + }; + uint32_t header; + }; + uint32_t data[1]; /* count number of words */ +}; + +/** + * struct ert_start_kernel_cmd: ERT start kernel command format + * + * @state: [3-0] current state of a command + * @stat_enabled: [4] enabled driver to record timestamp for various + * states cmd has gone through. The stat data + * is appended after cmd data. + * @extra_cu_masks: [11-10] extra CU masks in addition to mandatory mask + * @count: [22-12] number of words following header for cmd data. Not + * include stat data. + * @opcode: [27-23] 0, opcode for start_kernel + * @type: [31-27] 0, type of start_kernel + * + * @cu_mask: first mandatory CU mask + * @data: count-1 number of words representing interpreted payload + * + * The packet payload is comprised of reserved id field, a mandatory CU mask, + * and extra_cu_masks per header field, followed by a CU register map of size + * (count - (1 + extra_cu_masks)) uint32_t words. 
+ */ +struct ert_start_kernel_cmd { + union { + struct { + uint32_t state : 4; /* [3-0] */ + uint32_t stat_enabled : 1; /* [4] */ + uint32_t unused : 5; /* [9-5] */ + uint32_t extra_cu_masks : 2; /* [11-10] */ + uint32_t count : 11; /* [22-12] */ + uint32_t opcode : 5; /* [27-23] */ + uint32_t type : 4; /* [31-27] */ + }; + uint32_t header; + }; + + /* payload */ + uint32_t cu_mask; /* mandatory cu mask */ + uint32_t data[1]; /* count-1 number of words */ +}; + +/** + * struct ert_init_kernel_cmd: ERT initialize kernel command format + * this command initializes CUs by writing CU registers. CUs are + * represented by cu_mask and extra_cu_masks. + * + * @state: [3-0] current state of a command + * @update_rtp: [4] command is for runtime update of cu argument + * @extra_cu_masks: [11-10] extra CU masks in addition to mandatory mask + * @count: [22-12] number of words following header + * @opcode: [27-23] 0, opcode for init_kernel + * @type: [31-27] 0, type of init_kernel + * + * @cu_run_timeout the configured CU timeout value in Microseconds + * setting to 0 means CU should not timeout + * @cu_reset_timeout the configured CU reset timeout value in Microseconds + * when CU timeout, CU will be reset. this indicates + * CU reset should be completed within the timeout value. + * if cu_run_timeout is set to 0, this field is undefined. + * + * @cu_mask: first mandatory CU mask + * @data: count-9 number of words representing interpreted payload + * + * The packet payload is comprised of reserved id field, 8 reserved fields, + * a mandatory CU mask, and extra_cu_masks per header field, followed by a + * CU register map of size (count - (9 + extra_cu_masks)) uint32_t words. 
+ */ +struct ert_init_kernel_cmd { + union { + struct { + uint32_t state : 4; /* [3-0] */ + uint32_t update_rtp : 1; /* [4] */ + uint32_t unused : 5; /* [9-5] */ + uint32_t extra_cu_masks : 2; /* [11-10] */ + uint32_t count : 11; /* [22-12] */ + uint32_t opcode : 5; /* [27-23] */ + uint32_t type : 4; /* [31-27] */ + }; + uint32_t header; + }; + + uint32_t cu_run_timeout; /* CU timeout value in Microseconds */ + uint32_t cu_reset_timeout; /* CU reset timeout value in Microseconds */ + uint32_t reserved[6]; /* reserved for future use */ + + /* payload */ + uint32_t cu_mask; /* mandatory cu mask */ + uint32_t data[1]; /* count-9 number of words */ +}; + +#define KDMA_BLOCK_SIZE 64 /* Limited by KDMA CU */ +struct ert_start_copybo_cmd { + uint32_t state : 4; /* [3-0], must be ERT_CMD_STATE_NEW */ + uint32_t unused : 6; /* [9-4] */ + uint32_t extra_cu_masks : 2; /* [11-10], = 3 */ + uint32_t count : 11; /* [22-12], = 16, exclude 'arg' */ + uint32_t opcode : 5; /* [27-23], = ERT_START_COPYBO */ + uint32_t type : 4; /* [31-27], = ERT_DEFAULT */ + uint32_t cu_mask[4]; /* mandatory cu masks */ + uint32_t reserved[4]; /* for scheduler use */ + uint32_t src_addr_lo; /* low 32 bit of src addr */ + uint32_t src_addr_hi; /* high 32 bit of src addr */ + uint32_t src_bo_hdl; /* src bo handle, cleared by driver */ + uint32_t dst_addr_lo; /* low 32 bit of dst addr */ + uint32_t dst_addr_hi; /* high 32 bit of dst addr */ + uint32_t dst_bo_hdl; /* dst bo handle, cleared by driver */ + uint32_t size; /* size in bytes low 32 bit*/ + uint32_t size_hi; /* size in bytes high 32 bit*/ + void *arg; /* pointer to aux data for KDS */ +}; + +/** + * struct ert_configure_cmd: ERT configure command format + * + * @state: [3-0] current state of a command + * @count: [22-12] number of words in payload (5 + num_cus) + * @opcode: [27-23] 1, opcode for configure + * @type: [31-27] 0, type of configure + * + * @slot_size: command queue slot size + * @num_cus: number of compute units in program + * 
@cu_shift: shift value to convert CU idx to CU addr + * @cu_base_addr: base address to add to CU addr for actual physical address + * + * @ert:1 enable embedded HW scheduler + * @polling:1 poll for command completion + * @cu_dma:1 enable CUDMA custom module for HW scheduler + * @cu_isr:1 enable CUISR custom module for HW scheduler + * @cq_int:1 enable interrupt from host to HW scheduler + * @cdma:1 enable CDMA kernel + * @unused:25 + * @dsa52:1 reserved for internal use + * + * @data: addresses of @num_cus CUs + */ +struct ert_configure_cmd { + union { + struct { + uint32_t state : 4; /* [3-0] */ + uint32_t unused : 8; /* [11-4] */ + uint32_t count : 11; /* [22-12] */ + uint32_t opcode : 5; /* [27-23] */ + uint32_t type : 4; /* [31-27] */ + }; + uint32_t header; + }; + + /* payload */ + uint32_t slot_size; + uint32_t num_cus; + uint32_t cu_shift; + uint32_t cu_base_addr; + + /* features */ + uint32_t ert : 1; + uint32_t polling : 1; + uint32_t cu_dma : 1; + uint32_t cu_isr : 1; + uint32_t cq_int : 1; + uint32_t cdma : 1; + uint32_t dataflow : 1; + /* WORKAROUND: allow xclRegWrite/xclRegRead access shared CU */ + uint32_t rw_shared : 1; + uint32_t kds_30 : 1; + uint32_t dmsg : 1; + uint32_t echo : 1; + uint32_t intr : 1; + uint32_t unusedf : 19; + uint32_t dsa52 : 1; + + /* cu address map size is num_cus */ + uint32_t data[1]; +}; + +/* + * Note: We need to put maximum 128 soft kernel image + * in one config command (1024 DWs including header). + * So each one needs to be smaller than 8 DWs. + * + * This data struct is obsoleted. Only used in legacy ERT firmware. + * Use 'struct config_sk_image_uuid' instead on XGQ based ERT. 
+ * + * @start_cuidx: start index of compute units of each image + * @num_cus: number of compute units of each image + * @sk_name: symbol name of soft kernel of each image + */ +struct config_sk_image { + uint32_t start_cuidx; + uint32_t num_cus; + uint32_t sk_name[5]; +}; + +/* + * Note: We need to put maximum 128 soft kernel image + * in one config command (1024 DWs including header). + * So each one needs to be smaller than 8 DWs. + * + * @start_cuidx: start index of compute units of each image + * @num_cus: number of compute units of each image + * @sk_name: symbol name of soft kernel of each image + * @sk_uuid: xclbin uuid that this soft kernel image belones to + */ +struct config_sk_image_uuid { + uint32_t start_cuidx; + uint32_t num_cus; + uint32_t sk_name[5]; + unsigned char sk_uuid[16]; +}; + +/** + * struct ert_configure_sk_cmd: ERT configure soft kernel command format + * + * @state: [3-0] current state of a command + * @count: [22-12] number of words in payload + * @opcode: [27-23] 1, opcode for configure + * @type: [31-27] 0, type of configure + * + * @num_image: number of images + */ +struct ert_configure_sk_cmd { + union { + struct { + uint32_t state : 4; /* [3-0] */ + uint32_t unused : 8; /* [11-4] */ + uint32_t count : 11; /* [22-12] */ + uint32_t opcode : 5; /* [27-23] */ + uint32_t type : 4; /* [31-27] */ + }; + uint32_t header; + }; + + /* payload */ + uint32_t num_image; + struct config_sk_image image[1]; +}; + +/** + * struct ert_unconfigure_sk_cmd: ERT unconfigure soft kernel command format + * + * @state: [3-0] current state of a command + * @count: [22-12] number of words in payload + * @opcode: [27-23] 1, opcode for configure + * @type: [31-27] 0, type of configure + * + * @start_cuidx: start index of compute units + * @num_cus: number of compute units in program + */ +struct ert_unconfigure_sk_cmd { + union { + struct { + uint32_t state : 4; /* [3-0] */ + uint32_t unused : 8; /* [11-4] */ + uint32_t count : 11; /* [22-12] */ + uint32_t 
opcode : 5; /* [27-23] */ + uint32_t type : 4; /* [31-27] */ + }; + uint32_t header; + }; + + /* payload */ + uint32_t start_cuidx; + uint32_t num_cus; +}; + +/** + * struct ert_abort_cmd: ERT abort command format. + * + * @exec_bo_handle: The bo handle of execbuf command to abort + */ +struct ert_abort_cmd { + union { + struct { + uint32_t state : 4; /* [3-0] */ + uint32_t custom : 8; /* [11-4] */ + uint32_t count : 11; /* [22-12] */ + uint32_t opcode : 5; /* [27-23] */ + uint32_t type : 4; /* [31-27] */ + }; + uint32_t header; + }; + + /* payload */ + uint64_t exec_bo_handle; +}; + +/** + * struct ert_validate_cmd: ERT BIST command format. + * + */ +struct ert_validate_cmd { + union { + struct { + uint32_t state : 4; /* [3-0] */ + uint32_t custom : 8; /* [11-4] */ + uint32_t count : 11; /* [22-12] */ + uint32_t opcode : 5; /* [27-23] */ + uint32_t type : 4; /* [31-27] */ + }; + uint32_t header; + }; + uint32_t timestamp; + uint32_t cq_read_single; + uint32_t cq_write_single; + uint32_t cu_read_single; + uint32_t cu_write_single; +}; + +/** + * struct ert_validate_cmd: ERT BIST command format. 
+ * + */ +struct ert_access_valid_cmd { + union { + struct { + uint32_t state : 4; /* [3-0] */ + uint32_t custom : 8; /* [11-4] */ + uint32_t count : 11; /* [22-12] */ + uint32_t opcode : 5; /* [27-23] */ + uint32_t type : 4; /* [31-27] */ + }; + uint32_t header; + }; + uint32_t h2h_access; + uint32_t h2d_access; + uint32_t d2h_access; + uint32_t d2d_access; + uint32_t d2cu_access; + uint32_t wr_count; + uint32_t wr_test; +}; + +/** + * ERT command state + * + * @ERT_CMD_STATE_NEW: Set by host before submitting a command to + * scheduler + * @ERT_CMD_STATE_QUEUED: Internal scheduler state + * @ERT_CMD_STATE_SUBMITTED: Internal scheduler state + * @ERT_CMD_STATE_RUNNING: Internal scheduler state + * @ERT_CMD_STATE_COMPLETED: Set by scheduler when command completes + * @ERT_CMD_STATE_ERROR: Set by scheduler if command failed + * @ERT_CMD_STATE_ABORT: Set by scheduler if command abort + * @ERT_CMD_STATE_TIMEOUT: Set by scheduler if command timeout and reset + * @ERT_CMD_STATE_NORESPONSE: Set by scheduler if command timeout and fail to + * reset + */ +enum ert_cmd_state { + ERT_CMD_STATE_NEW = 1, + ERT_CMD_STATE_QUEUED = 2, + ERT_CMD_STATE_RUNNING = 3, + ERT_CMD_STATE_COMPLETED = 4, + ERT_CMD_STATE_ERROR = 5, + ERT_CMD_STATE_ABORT = 6, + ERT_CMD_STATE_SUBMITTED = 7, + ERT_CMD_STATE_TIMEOUT = 8, + ERT_CMD_STATE_NORESPONSE = 9, + ERT_CMD_STATE_SKERROR = 10, // Check for error return code from Soft Kernel + ERT_CMD_STATE_SKCRASHED = 11, // Soft kernel has crashed + ERT_CMD_STATE_MAX, // Always the last one +}; + +struct cu_cmd_state_timestamps { + uint64_t skc_timestamps[ERT_CMD_STATE_MAX]; // In nano-second +}; + +/** + * Opcode types for commands + * + * @ERT_START_CU: start a workgroup on a CU + * @ERT_START_KERNEL: currently aliased to ERT_START_CU + * @ERT_CONFIGURE: configure command scheduler + * @ERT_EXEC_WRITE: execute a specified CU after writing + * @ERT_CU_STAT: get stats about CU execution + * @ERT_START_COPYBO: start KDMA CU or P2P, may be converted to 
ERT_START_CU + * before cmd reach to scheduler, short-term hack + * @ERT_SK_CONFIG: configure soft kernel + * @ERT_SK_START: start a soft kernel + * @ERT_SK_UNCONFIG: unconfigure a soft kernel + * @ERT_START_KEY_VAL: same as ERT_START_CU but with key-value pair flavor + */ +enum ert_cmd_opcode { + ERT_START_CU = 0, + ERT_START_KERNEL = 0, + ERT_CONFIGURE = 2, + ERT_EXIT = 3, + ERT_ABORT = 4, + ERT_EXEC_WRITE = 5, + ERT_CU_STAT = 6, + ERT_START_COPYBO = 7, + ERT_SK_CONFIG = 8, + ERT_SK_START = 9, + ERT_SK_UNCONFIG = 10, + ERT_INIT_CU = 11, + ERT_START_FA = 12, + ERT_CLK_CALIB = 13, + ERT_MB_VALIDATE = 14, + ERT_START_KEY_VAL = 15, + ERT_ACCESS_TEST_C = 16, + ERT_ACCESS_TEST = 17, +}; + +/** + * Command types + * + * @ERT_DEFAULT: default command type + * @ERT_KDS_LOCAL: command processed by KDS locally + * @ERT_CTRL: control command uses reserved command queue slot + * @ERT_CU: compute unit command + */ +enum ert_cmd_type { + ERT_DEFAULT = 0, + ERT_KDS_LOCAL = 1, + ERT_CTRL = 2, + ERT_CU = 3, + ERT_SCU = 4, +}; + +/** + * Soft kernel types + * + * @SOFTKERNEL_TYPE_EXEC: executable + */ +enum softkernel_type { + SOFTKERNEL_TYPE_EXEC = 0, +}; + +/* + * Base address GPIO per spec + * | Offset | Description + * ----------------------- + * | 0x00 | ERT_MGMT_PF_base_addr (Not sure where this should be use) + * | 0x08 | ERT_USER_PF_base_addr. 
The base address of ERT peripherals + */ +#if defined(ERT_BUILD_V20) +uint32_t ert_base_addr = 0; +#define ERT_BASE_ADDR 0x01F30008 +#endif + +#if defined(ERT_BUILD_V30) +uint32_t ert_base_addr = 0; +#define ERT_BASE_ADDR 0x01F30008 +#endif + +/** + * Address constants per spec + */ +#define ERT_WORD_SIZE 4 /* 4 bytes */ +#define ERT_CQ_SIZE 0x10000 /* 64K */ +#if defined(ERT_BUILD_U50) +#define ERT_CQ_BASE_ADDR 0x340000 +#define ERT_CSR_ADDR 0x360000 +#elif defined(ERT_BUILD_V20) +#define ERT_CQ_BASE_ADDR (0x000000 + ert_base_addr) +#define ERT_CSR_ADDR (0x010000 + ert_base_addr) +#elif defined(ERT_BUILD_V30) +#define ERT_CQ_BASE_ADDR 0x1F60000 +#define ERT_CSR_ADDR (0x010000 + ert_base_addr) +#else +#define ERT_CQ_BASE_ADDR 0x190000 +#define ERT_CSR_ADDR 0x180000 +#endif + +/** + * The STATUS REGISTER is for communicating completed CQ slot indices + * MicroBlaze write, host reads. MB(W) / HOST(COR) + */ +#define ERT_STATUS_REGISTER_ADDR (ERT_CSR_ADDR) +#define ERT_STATUS_REGISTER_ADDR0 (ERT_CSR_ADDR) +#define ERT_STATUS_REGISTER_ADDR1 (ERT_CSR_ADDR + 0x4) +#define ERT_STATUS_REGISTER_ADDR2 (ERT_CSR_ADDR + 0x8) +#define ERT_STATUS_REGISTER_ADDR3 (ERT_CSR_ADDR + 0xC) + +/** + * The CU DMA REGISTER is for communicating which CQ slot is to be started + * on a specific CU. 
MB selects a free CU on which the command can
+ * run, then writes the 1<size_in_bytes(): " << (uint64_t)size
+        << ", handle->size: " << (uint64_t)handle->size << "\n";
+
+    ret = xrtBOWrite(handle->handle, host, size, 0);
+    if (ret) {
+        error(user_context) << "XRT: halide_xrt_copy_to_device: xrtBOWrite failed: "
+                            << ret << "\n";
+        return halide_error_code_generic_error;
+    }
+
+    ret = xrtBOSync(handle->handle, XCL_BO_SYNC_BO_TO_DEVICE, size, 0);
+    if (ret) {
+        error(user_context) << "XRT: halide_xrt_copy_to_device: xrtBOSync failed: "
+                            << ret << "\n";
+        return halide_error_code_generic_error;
+    }
+
+    return halide_error_code_success;
+}
+
+}  // namespace
+
+extern "C" {
+
+// Attach an XRT descriptor (XrtBufferHandle) to `buf`.
+//
+// No XRT buffer object is created here; only the host-side descriptor is
+// allocated and the requested size recorded. Returns success immediately if
+// the buffer already has a device handle, the context error if the XRT
+// context cannot be acquired, and halide_error_code_out_of_memory if the
+// descriptor cannot be allocated.
+WEAK int halide_xrt_device_malloc(void *user_context, halide_buffer_t *buf) {
+    debug(user_context)
+        << "XRT: halide_xrt_device_malloc (user_context: " << user_context
+        << ", buf: " << buf << ")\n";
+
+    if (buf->device) {
+        return halide_error_code_success;
+    }
+
+    XRTContext context(user_context);
+    if (context.error_code) {
+        return context.error_code;
+    }
+
+    // Buffers are lazily allocated on the device. This is because the memory
+    // bank to allocate from is not known until the xclbin is loaded.
+    XrtBufferHandle *handle =
+        (XrtBufferHandle *)malloc(sizeof(XrtBufferHandle));
+    if (handle == nullptr) {
+        // Previously the null pointer was handed straight to memset().
+        error(user_context)
+            << "XRT: halide_xrt_device_malloc: "
+            << "out of memory allocating buffer descriptor\n";
+        return halide_error_code_out_of_memory;
+    }
+    memset(handle, 0, sizeof(*handle));
+    handle->handle = nullptr;
+    handle->size = buf->size_in_bytes();
+
+    buf->device = (uint64_t)(uintptr_t)handle;
+    buf->device_interface = &xrt_device_interface;
+    // Balance the release_module() performed by halide_xrt_device_free();
+    // without this the module use count is decremented once per buffer
+    // without ever having been incremented.
+    buf->device_interface->impl->use_module();
+
+    debug(user_context) << "XRT: halide_xrt_device_malloc:"
+                        << " lazily allocated device buffer with size: "
+                        << (uint64_t)handle->size
+                        << ". Descriptor: " << (void *)buf->device << "\n";
+
+    return halide_error_code_success;
+}
+
+// Release the descriptor attached to `buf`, freeing the XRT buffer object
+// first if one was created. A buffer without a device handle is a no-op.
+// If the XRT context cannot be acquired the descriptor is left attached and
+// the context error is returned.
+WEAK int halide_xrt_device_free(void *user_context, halide_buffer_t *buf) {
+    if (buf->device == 0) {
+        return halide_error_code_success;
+    }
+
+    XrtBufferHandle *handle = (XrtBufferHandle *)buf->device;
+
+    debug(user_context)
+        << "XRT: halide_xrt_device_free (user_context: " << user_context
+        << ", buf: " << buf << ")"
+        << "\n";
+
+    XRTContext context(user_context);
+    if (context.error_code) {
+        return context.error_code;
+    }
+
+    if (handle->handle != nullptr) {
+        xrtBOFree(handle->handle);
+        handle->handle = nullptr;
+    }
+
+    free(handle);
+    buf->device = 0;
+    buf->device_interface->impl->release_module();
+    buf->device_interface = nullptr;
+
+    return halide_error_code_success;
+}
+
+// Device synchronisation hook. Only logs and reports a generic error.
+// NOTE(review): halide_xrt_run() waits for the run to finish before
+// returning, so reporting success here may be more appropriate - confirm
+// the intended contract before changing it.
+WEAK int halide_xrt_device_sync(void *user_context, halide_buffer_t *) {
+    debug(user_context)
+        << "XRT: halide_xrt_device_sync (user_context: " << user_context
+        << ")\n";
+
+    return halide_error_code_generic_error;
+}
+
+// Close the XRT device if the device obtained for `user_context` is the
+// runtime-global one, then release the context.
+WEAK int halide_xrt_device_release(void *user_context) {
+    debug(user_context)
+        << "XRT: halide_xrt_device_release (user_context: " << user_context
+        << ")\n";
+
+    // The XRTContext object does not allow the context storage to be modified,
+    // so we use halide_acquire_context directly.
+    int err;
+    xrtDeviceHandle device;
+    err = halide_xrt_acquire_context(user_context, &device, false);
+    if (err != halide_error_code_success) {
+        return err;
+    }
+
+    if (device) {
+        // A device that is not the global one is left open.
+        if (device == global_device) {
+            xrtDeviceClose(device);
+            global_device = nullptr;
+        }
+    }
+
+    return halide_xrt_release_context(user_context);
+}
+
+// Combined host+device allocation: forwards to the default implementation.
+WEAK int halide_xrt_device_and_host_malloc(void *user_context,
+                                           struct halide_buffer_t *buf) {
+    return halide_default_device_and_host_malloc(user_context, buf,
+                                                 &xrt_device_interface);
+}
+
+// Combined host+device free: forwards to the default implementation.
+WEAK int halide_xrt_device_and_host_free(void *user_context,
+                                         struct halide_buffer_t *buf) {
+    return halide_default_device_and_host_free(user_context, buf,
+                                               &xrt_device_interface);
+}
+
+// Buffer-to-buffer copy. Not implemented: logs and returns a generic error.
+WEAK int halide_xrt_buffer_copy(void *user_context,
+                                struct halide_buffer_t *src,
+                                const struct halide_device_interface_t *dst_device_interface,
+                                struct halide_buffer_t *dst) {
+    debug(user_context)
+        << "XRT: halide_xrt_buffer_copy (user_context: " << user_context
+        << ", src: " << src << ", dst: " << dst << ")\n";
+
+    return halide_error_code_generic_error;
+}
+
+// Request a host-to-device copy of `buf`. Nothing is transferred here; the
+// descriptor is only flagged, and halide_xrt_run() performs the transfer.
+WEAK int halide_xrt_copy_to_device(void *user_context,
+                                   halide_buffer_t *buf) {
+    debug(user_context)
+        << "XRT: halide_xrt_copy_to_device (user_context: " << user_context
+        << ", buf: " << buf << ")\n";
+
+    XrtBufferHandle *handle = (XrtBufferHandle *)buf->device;
+
+    XRTContext context(user_context);
+    if (context.error_code) {
+        return context.error_code;
+    }
+
+    // The copy to device will take place just before launching the kernel.
+    handle->copy_to_device_pending = true;
+
+    return halide_error_code_success;
+}
+
+// Copy the device contents of `buf` back into buf->host: sync the buffer
+// object from the device, then read it into host memory. Returns a generic
+// error if the buffer has no device allocation or if either XRT call fails.
+WEAK int halide_xrt_copy_to_host(void *user_context,
+                                 halide_buffer_t *buf) {
+    int ret;
+    debug(user_context)
+        << "XRT: halide_xrt_copy_to_host (user_context: " << user_context
+        << ", buf: " << buf << ")\n";
+
+    XrtBufferHandle *handle = (XrtBufferHandle *)buf->device;
+
+    XRTContext context(user_context);
+    if (context.error_code) {
+        return context.error_code;
+    }
+
+    // The descriptor is created without a buffer object (see
+    // halide_xrt_device_malloc), so there may be nothing to read back yet.
+    // Fail cleanly instead of handing a null handle to XRT.
+    if (handle == nullptr || handle->handle == nullptr) {
+        error(user_context)
+            << "XRT: halide_xrt_copy_to_host: "
+            << "buffer has no device allocation\n";
+        return halide_error_code_generic_error;
+    }
+
+    debug(user_context)
+        << "buf->size_in_bytes(): " << (uint64_t)buf->size_in_bytes()
+        << ", handle->size: " << (uint64_t)handle->size << "\n";
+
+    ret = xrtBOSync(handle->handle, XCL_BO_SYNC_BO_FROM_DEVICE, buf->size_in_bytes(), 0);
+    if (ret) {
+        error(user_context) << "XRT: halide_xrt_copy_to_host: xrtBOSync failed: "
+                            << ret << "\n";
+        return halide_error_code_generic_error;
+    }
+
+    ret = xrtBORead(handle->handle, buf->host, buf->size_in_bytes(), 0);
+    if (ret) {
+        error(user_context) << "XRT: halide_xrt_copy_to_host: xrtBORead failed: "
+                            << ret << "\n";
+        return halide_error_code_generic_error;
+    }
+
+    return halide_error_code_success;
+}
+
+// Device-side crop. Not implemented: always returns a generic error.
+WEAK int halide_xrt_device_crop(void *user_context,
+                                const struct halide_buffer_t *src,
+                                struct halide_buffer_t *dst) {
+    return halide_error_code_generic_error;
+}
+
+// Device-side slice. Not implemented: always returns a generic error.
+WEAK int halide_xrt_device_slice(void *user_context,
+                                 const struct halide_buffer_t *src,
+                                 int slice_dim,
+                                 int slice_pos,
+                                 struct halide_buffer_t *dst) {
+    return halide_error_code_generic_error;
+}
+
+// Release of a cropped buffer. Not implemented: always returns a generic error.
+WEAK int halide_xrt_device_release_crop(void *user_context,
+                                        struct halide_buffer_t *buf) {
+    return halide_error_code_generic_error;
+}
+
+// Wrapping of a native handle. Not implemented: asserts in debug builds and
+// returns halide_error_code_unimplemented.
+WEAK int halide_xrt_wrap_native(void *user_context, struct halide_buffer_t *buf, uint64_t mem) {
+    halide_debug_assert(user_context, false && "unimplemented");
+    return halide_error_code_unimplemented;
+}
+
+WEAK int halide_xrt_detach_native(void *user_context, halide_buffer_t *buf) {
+    halide_debug_assert(user_context, false && "unimplemented");
+    return
halide_error_code_unimplemented;
+}
+
+// Load "<kernel_name>.xclbin" onto the device, open its "toplevel" PL kernel
+// and hand back a heap-allocated XrtKernelState through `state_ptr`.
+//
+// Returns halide_error_code_success on success. On failure *state_ptr is not
+// written and the result is the context error, halide_error_code_out_of_memory
+// (state allocation failed) or halide_error_code_generic_error (name too long,
+// xclbin load, UUID query or kernel open failed).
+WEAK int halide_xrt_initialize_kernels(void *user_context, void **state_ptr, const char *kernel_name) {
+    int ret;
+    char xclbin_name[512];
+    xuid_t uuid;
+    xrtKernelHandle kernel_handle;
+    XrtKernelState *state;
+
+    debug(user_context)
+        << "XRT: halide_xrt_initialize_kernels (user_context: " << user_context
+        << ", state_ptr: " << state_ptr
+        << ", kernel_name: " << kernel_name << ")\n";
+
+    XRTContext context(user_context);
+    if (context.error_code) {
+        return context.error_code;
+    }
+
+    ret = snprintf(xclbin_name, sizeof(xclbin_name), "%s.xclbin", kernel_name);
+    // snprintf reports the length it wanted to write; a value that does not
+    // fit means the name was truncated and would point at the wrong file.
+    if (ret < 0 || (size_t)ret >= sizeof(xclbin_name)) {
+        error(user_context)
+            << "XRT: halide_xrt_initialize_kernels: "
+            << "error generating xclbin name\n";
+        return halide_error_code_generic_error;
+    }
+
+    ret = xrtDeviceLoadXclbinFile(context.device, xclbin_name);
+    if (ret != 0) {
+        error(user_context)
+            << "XRT: halide_xrt_initialize_kernels: "
+            << "failed to load xclbin file: " << xclbin_name
+            << ", error: " << ret << "\n";
+        return halide_error_code_generic_error;
+    }
+
+    debug(user_context)
+        << "XRT: halide_xrt_initialize_kernels: "
+        << "loaded xclbin file: " << xclbin_name << "\n";
+
+    ret = xrtDeviceGetXclbinUUID(context.device, uuid);
+    if (ret != 0) {
+        error(user_context)
+            << "XRT: halide_xrt_initialize_kernels: "
+            << "failed to get xclbin uuid, error " << ret << "\n";
+        return halide_error_code_generic_error;
+    }
+
+    kernel_handle = xrtPLKernelOpen(context.device, uuid, "toplevel");
+    if (kernel_handle == XRT_NULL_HANDLE) {
+        error(user_context)
+            << "XRT: halide_xrt_initialize_kernels: "
+            << "failed to open PL kernel\n";
+        return halide_error_code_generic_error;
+    }
+
+    state = (XrtKernelState *)malloc(sizeof(XrtKernelState));
+    if (state == nullptr) {
+        // Do not leak the kernel that was just opened.
+        xrtKernelClose(kernel_handle);
+        error(user_context)
+            << "XRT: halide_xrt_initialize_kernels: "
+            << "out of memory allocating kernel state\n";
+        return halide_error_code_out_of_memory;
+    }
+    state->handle = kernel_handle;
+
+    *state_ptr = state;
+
+    return halide_error_code_success;
+}
+
+// Close the kernel held by `state_ptr` and free the state. A null
+// `state_ptr` is ignored. If the XRT context cannot be acquired nothing is
+// closed or freed.
+WEAK void halide_xrt_finalize_kernels(void *user_context, void *state_ptr) {
+    XrtKernelState *state;
+
+    debug(user_context)
+        << "XRT: halide_xrt_finalize_kernels (user_context: "
+        << user_context << ", state_ptr: " << state_ptr << "\n";
+
+    // halide_xrt_initialize_kernels() leaves *state_ptr untouched when it fails.
+    if (state_ptr == nullptr) {
+        return;
+    }
+
+    XRTContext context(user_context);
+    if (context.error_code == halide_error_code_success) {
+        state = (XrtKernelState *)state_ptr;
+        xrtKernelClose(state->handle);
+
+        free(state);
+    }
+}
+
+// Launch the kernel held in `state_ptr` and block until it finishes.
+//
+// `args` is a null-terminated array; arg_types[i] and arg_is_buffer[i]
+// describe args[i]. Buffer arguments are halide_buffer_t pointers whose
+// XRT buffer object is created on first use, in the memory group the kernel
+// reports for that argument index. Any host-to-device copy requested through
+// halide_xrt_copy_to_device() is carried out before the argument is bound.
+// Scalar arguments are passed by value according to their byte size
+// (1, 2, 4 or 8).
+//
+// Returns halide_error_code_success when the run completes with
+// ERT_CMD_STATE_COMPLETED; otherwise the context error, the error from the
+// pending copy, or halide_error_code_generic_error.
+WEAK int halide_xrt_run(void *user_context,
+                        void *state_ptr,
+                        const char *entry_name,
+                        halide_type_t arg_types[],
+                        void *args[],
+                        int8_t arg_is_buffer[]) {
+    int ret;
+    XrtKernelState *state;
+    xrtRunHandle run_handle;
+    uint32_t num_args;
+    uint64_t t_before, t_after;
+
+    debug(user_context)
+        << "XRT: halide_xrt_run (user_context: " << user_context << ", "
+        << "entry: " << entry_name << ")\n";
+
+    XRTContext context(user_context);
+    if (context.error_code) {
+        return context.error_code;
+    }
+
+    state = (XrtKernelState *)state_ptr;
+    run_handle = xrtRunOpen(state->handle);
+    if (run_handle == XRT_NULL_HANDLE) {
+        error(user_context)
+            << "XRT: halide_xrt_run: "
+            << "failed to open run handle for kernel: " << entry_name << "\n";
+        return halide_error_code_generic_error;
+    }
+
+    num_args = 0;
+    while (args[num_args] != nullptr) {
+        static const char *const type_code_names[] = {
+            "int",
+            "uint",
+            "float",
+            "handle",
+            "bfloat",
+        };
+
+        debug(user_context)
+            << "XRT: halide_xrt_run: "
+            << "arg[" << num_args << "]: "
+            << (arg_is_buffer[num_args] ? "buffer" : "scalar") << ", "
+            << "type: " << type_code_names[arg_types[num_args].code] << "\n";
+
+        if (arg_is_buffer[num_args]) {
+            halide_buffer_t *buffer = (halide_buffer_t *)args[num_args];
+            XrtBufferHandle *buf = (XrtBufferHandle *)buffer->device;
+
+            // A buffer that never went through halide_xrt_device_malloc()
+            // has no descriptor to bind.
+            if (buf == nullptr) {
+                xrtRunClose(run_handle);
+                error(user_context)
+                    << "XRT: halide_xrt_run: "
+                    << "arg[" << num_args << "] has no device descriptor"
+                    << " for kernel: " << entry_name << "\n";
+                return halide_error_code_generic_error;
+            }
+
+            // Buffer not yet allocated. Allocate it now.
+            if (buf->handle == XRT_NULL_HANDLE) {
+                buf->handle = xrtBOAlloc(context.device, buf->size, XRT_BO_FLAGS_CACHEABLE,
+                                         (xrtMemoryGroup)xrtKernelArgGroupId(state->handle, num_args));
+                if (buf->handle == XRT_NULL_HANDLE) {
+                    error(user_context)
+                        << "XRT: halide_xrt_run: "
+                        << "failed to allocate buffer with size: " << (uint64_t)buf->size
+                        << " for kernel: " << entry_name << "\n";
+                    xrtRunClose(run_handle);
+                    return halide_error_code_generic_error;
+                }
+
+                debug(user_context) << "XRT: halide_xrt_run: "
+                                    << "allocated buffer with size: " << (uint64_t)buf->size
+                                    << " at physical address: " << (void *)xrtBOAddress(buf->handle) << "\n";
+            }
+
+            // Flush a requested host-to-device copy whether the buffer object
+            // was created just now or on an earlier launch; otherwise a copy
+            // requested after the first launch would never reach the device.
+            if (buf->copy_to_device_pending) {
+                debug(user_context) << " buffer has a copy to device pending.\n";
+                int copy_err = sync_bo_to_device(user_context, buf, buffer->host, buf->size);
+                if (copy_err != halide_error_code_success) {
+                    xrtRunClose(run_handle);
+                    return copy_err;
+                }
+                buf->copy_to_device_pending = false;
+            }
+            ret = xrtRunSetArg(run_handle, num_args, buf->handle);
+        } else {
+            switch (arg_types[num_args].bytes()) {
+            case 1:
+                ret = xrtRunSetArg(run_handle, num_args, *(uint8_t *)args[num_args]);
+                break;
+            case 2:
+                ret = xrtRunSetArg(run_handle, num_args, *(uint16_t *)args[num_args]);
+                break;
+            case 4:
+                ret = xrtRunSetArg(run_handle, num_args, *(uint32_t *)args[num_args]);
+                break;
+            case 8:
+                ret = xrtRunSetArg(run_handle, num_args, *(uint64_t *)args[num_args]);
+                break;
+            default:
+                halide_debug_assert(user_context, false);
+                // When the assert is compiled out, make sure the check below
+                // sees a definite failure instead of an indeterminate value.
+                ret = -1;
+                break;
+            }
+        }
+
+        if (ret != 0) {
+            xrtRunClose(run_handle);
+            error(user_context)
+                << "XRT: halide_xrt_run: "
+                << "failed to set arg[" << num_args << "] for kernel: " << entry_name
+                << ", error: " << ret << "\n";
+            return halide_error_code_generic_error;
+        }
+
+        num_args++;
+    }
+
+    debug(user_context) << "XRT: halide_xrt_run: starting kernel: " << entry_name << "\n";
+
+    t_before = halide_current_time_ns(user_context);
+
+    ret = xrtRunStart(run_handle);
+    if (ret != 0) {
+        xrtRunClose(run_handle);
+        error(user_context)
+            << "XRT: halide_xrt_run: "
+            << "failed to start kernel: " << entry_name
+            << ", error: " << ret << "\n";
+        return halide_error_code_generic_error;
+    }
+
+    // Blocks until the run reaches a final state; the result is compared
+    // against ERT_CMD_STATE_COMPLETED below.
+    ret = xrtRunWait(run_handle);
+
+    t_after = halide_current_time_ns(user_context);
+
+    xrtRunClose(run_handle);
+
+    if (ret != ERT_CMD_STATE_COMPLETED) {
+        error(user_context)
+            << "XRT: halide_xrt_run: "
+            << "error waiting for kernel run completion, error: " << ret << "\n";
+        return halide_error_code_generic_error;
+    }
+
+    print(user_context) << "XRT: '" << entry_name << "' execution took " << (t_after - t_before) << " ns\n";
+
+    return halide_error_code_success;
+}
+
+// Public accessor for the XRT device interface table.
+WEAK const struct halide_device_interface_t *halide_xrt_device_interface() {
+    return &xrt_device_interface;
+}
+
+namespace {
+
+// Destructor hook: releases the XRT device for the null user context.
+WEAK __attribute__((destructor)) void halide_xrt_cleanup() {
+    halide_xrt_device_release(nullptr);
+}
+
+}  // namespace
+
+}  // extern "C" linkage
+
+namespace Halide {
+namespace Runtime {
+namespace Internal {
+namespace XRT {
+
+// Backend implementation table: the XRT-specific entry points, preceded by
+// the JIT module use/release hooks.
+WEAK halide_device_interface_impl_t xrt_device_interface_impl = {
+    halide_use_jit_module,
+    halide_release_jit_module,
+    halide_xrt_device_malloc,
+    halide_xrt_device_free,
+    halide_xrt_device_sync,
+    halide_xrt_device_release,
+    halide_xrt_copy_to_host,
+    halide_xrt_copy_to_device,
+    halide_xrt_device_and_host_malloc,
+    halide_xrt_device_and_host_free,
+    halide_xrt_buffer_copy,
+    halide_xrt_device_crop,
+    halide_xrt_device_slice,
+    halide_xrt_device_release_crop,
+    halide_xrt_wrap_native,
+    halide_xrt_detach_native,
+};
+
+// Public interface table: the generic halide_device_* entry points, with the
+// XRT implementation table above as its final member.
+WEAK halide_device_interface_t xrt_device_interface = {
+    halide_device_malloc,
+    halide_device_free,
+    halide_device_sync,
+    halide_device_release,
+    halide_copy_to_host,
+    halide_copy_to_device,
+    halide_device_and_host_malloc,
+    halide_device_and_host_free,
+    halide_buffer_copy,
+    halide_device_crop,
+    halide_device_slice,
+    halide_device_release_crop,
+    halide_device_wrap_native,
+    halide_device_detach_native,
+    nullptr,
+    &xrt_device_interface_impl};
+
+}  // namespace XRT
+}  // namespace Internal
+}  // namespace Runtime
+}  // namespace Halide