
Commit

Merge pull request #1 from ComingToy/im2col_tile8
Im2col tile8
ComingToy authored Dec 24, 2023
2 parents 8016d47 + 86e0811 commit 8dbd48d
Showing 12 changed files with 1,130 additions and 15 deletions.
4 changes: 2 additions & 2 deletions source/device/cpu/CMakeLists.txt
@@ -150,6 +150,7 @@ FOREACH(_OP_NAME ${_CPU_OP_LIST})
FILE (GLOB _x86_REGISTER_FILE "${_OP_ROOT}/${_OP_NAME}/x86/*_hcl_x86.c")
FILE (GLOB _MIPS_REGISTER_FILE "${_OP_ROOT}/${_OP_NAME}/mips/*_hcl_mips.c")
FILE (GLOB _RISC_V_REGISTER_FILE "${_OP_ROOT}/${_OP_NAME}/risc-v/lp64dv/*_hcl_rv64.c")
FILE (GLOB _RISC_V_REGISTER_FILE "${_OP_ROOT}/${_OP_NAME}/risc-v/lp64dv/*_hcl_rv64_tile8.c")
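# NOTE: FILE(GLOB) overwrites _RISC_V_REGISTER_FILE, so this second glob replaces
# the plain *_hcl_rv64.c matches with the *_hcl_rv64_tile8.c ones. If both sets of
# register files are meant to be collected, they presumably need to be globbed with
# both patterns in a single FILE(GLOB) call or appended to a list instead.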

LIST (APPEND _CPU_REGISTER_SOURCE ${_CPU_REF_REGISTER_FILE})
IF (${TENGINE_TARGET_PROCESSOR} MATCHES "ARM")
@@ -279,9 +280,8 @@ IF (TENGINE_COMPILER_GCC OR TENGINE_COMPILER_CLANG)
ENDIF()

IF (${TENGINE_TARGET_PROCESSOR} MATCHES "lp64dv")
LIST (APPEND _CPU_COMPILER_OPTIONS "-march=rv64gcvxthead")
LIST (APPEND _CPU_COMPILER_OPTIONS "-march=rv64gcvxthead3")
LIST (APPEND _CPU_COMPILER_OPTIONS "-mabi=lp64d")
LIST (APPEND _CPU_COMPILER_OPTIONS "-mfp16")
LIST (APPEND _CPU_COMPILER_OPTIONS "-lc")
ENDIF()
ENDIF()
209 changes: 209 additions & 0 deletions source/device/cpu/op/conv/risc-v/lp64dv/conv_hcl_rv64_tile8.c
@@ -0,0 +1,209 @@
#include "convolution_param.h"
#include "graph/tensor.h"
#include "graph/node.h"
#include "graph/graph.h"
#include "device/cpu/cpu_node.h"
#include "device/cpu/cpu_graph.h"
#include "operator/op.h"
#include "api/c_api.h"
#include "utility/log.h"
#include "utility/sys_port.h"
#include "device/cpu/cpu_module.h"
#include <string.h>
#include <stdio.h>

extern int conv_hcl_prerun_tile8(struct node* ir_node, struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* output_tensor, struct conv_priv_info* info, struct conv_param* param);
extern int conv_hcl_run_tile8(struct node* ir_node, struct tensor* input_tensor, struct tensor* filter_tensor, struct tensor* bias_tensor, struct tensor* output_tensor, struct conv_priv_info* info, struct conv_param* param, int num_thread, int cpu_affinity);
extern int conv_hcl_get_shared_mem_size_rv64_tile8(struct tensor* input_tensor, struct tensor* output_tensor, struct conv_param* param);
extern int conv_hcl_postrun_tile8(struct node* ir_node, struct conv_priv_info* info);
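/* Tile8 kernel entry points implemented in the companion source files of this PR:
 * prerun presumably packs the filter weights and prepares buffers, run performs the
 * im2col + GEMM for one conv node, get_shared_mem_size reports how much shared
 * scratch memory the graph should reserve, and postrun releases what prerun set up. */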

static int init_node(struct node_ops* ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
{
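/* Allocate the per-node private info and, for FP32 graphs, ask the tile8 kernel
 * how much shared im2col scratch memory this node needs. */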
struct node* ir_node = exec_node->ir_node;
struct graph* ir_graph = ir_node->graph;
struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
struct tensor* kernel_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]);
struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
struct conv_param* params = ir_node->op.param_mem;
struct conv_priv_info* info = sys_malloc(sizeof(struct conv_priv_info));
if (!info)
{
return -1;
}

memset(info, 0, sizeof(*info));
exec_node->ops_priv = info;

if (exec_graph->mode == TENGINE_MODE_FP32)
{
exec_node->shared_mem_size = conv_hcl_get_shared_mem_size_rv64_tile8(input_tensor, output_tensor, params);
exec_node->shared_pack4_mem_size = 0;
}
else
{
TLOG_ERR("Tengine work node %s not support %d\n", ir_node->name, exec_graph->mode);
return -1;
}

return 0;
}

static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
{
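/* Borrow the graph-wide shared im2col buffer when it is large enough for this node,
 * choose whether the interleave/pack4 buffers are managed externally, then hand the
 * remaining preparation (e.g. weight packing) over to conv_hcl_prerun_tile8. */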
struct node* ir_node = exec_node->ir_node;
struct graph* ir_graph = ir_node->graph;

struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
struct tensor* filter_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]);
struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);

struct conv_param* param = ir_node->op.param_mem;
struct conv_priv_info* info = exec_node->ops_priv;

info->cpu_type = exec_graph->cpu_affinity;

if (exec_graph->mode == TENGINE_MODE_FP32)
{
if (exec_node->shared_mem_size < exec_graph->shared_mem_size)
{
info->external_im2col_mem = 1;
info->im2col_buffer = exec_graph->shared_mem;
info->im2col_buffer_size = exec_graph->shared_mem_size;
}

if (exec_node->shared_pack4_mem_size < exec_graph->shared_pack4_mem_size)
{
info->external_im2col_pack4_mem = 0;
info->im2col_buffer_pack4 = NULL;
info->im2col_buffer_pack4_size = 0;
}

if (param->group > 1 && param->kernel_h == 7 && param->kernel_w == 7)
{
info->external_interleave_pack4_mem = 0;
}
else
{
info->external_interleave_pack4_mem = 1;
}

if (conv_hcl_prerun_tile8(ir_node, input_tensor, filter_tensor, output_tensor, info, param) < 0)
{
TLOG_ERR("hcl conv tile8 prerun failed.\n");
return -1;
}
}
else
{
return -1;
}

return 0;
}

static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
{
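/* Gather the input/filter/output tensors (bias is optional and may be absent) and
 * dispatch the FP32 convolution to the tile8 im2col + GEMM kernel. */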
struct node* ir_node = exec_node->ir_node;
struct graph* ir_graph = ir_node->graph;
struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
struct tensor* filter_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]);
struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
struct tensor* bias_tensor = NULL;
if (ir_node->input_num > 2)
{
bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]);
}

struct conv_param* params = ir_node->op.param_mem;
struct conv_priv_info* info = exec_node->ops_priv;
int num_thread = exec_graph->num_thread;
int cpu_affinity = exec_graph->cpu_affinity;

if (exec_graph->mode == TENGINE_MODE_FP32)
{
int ret = conv_hcl_run_tile8(ir_node, input_tensor, filter_tensor, bias_tensor, output_tensor, info, params, num_thread, cpu_affinity);
if (ret < 0)
{
TLOG_ERR("conv_hcl_run_tile8 %s run failed: %d\n", ir_node->name, ret);
return ret;
}
}
else
{
TLOG_ERR("Tengine work node %s not support %d mode\n", ir_node->name, exec_graph->mode);
return -1;
}

return 0;
}

static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
{
return 0;
}

static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
{
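/* Release the buffers that conv_hcl_prerun_tile8 prepared for this node. */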
if (exec_graph->mode == TENGINE_MODE_FP32)
{
return conv_hcl_postrun_tile8(exec_node->ir_node, exec_node->ops_priv);
}
else
{
TLOG_ERR("Tengine work node %s not support %d mode\n", exec_node->ir_node->name, exec_graph->mode);
return -1;
}
}

static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph)
{
struct conv_priv_info* info = exec_node->ops_priv;
sys_free(info);
exec_node->ops_priv = NULL;

return 0;
}

static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* ir_node)
{
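/* Bid for this conv node only when the input is FP32 and the convolution is not
 * grouped; returning 0 makes the runtime fall back to another implementation
 * (e.g. the reference kernel), while OPS_SCORE_PREFER wins over lower-scoring ops. */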
struct graph* ir_graph = ir_node->graph;
struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]);
struct tensor* kernel_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]);
struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]);
struct conv_param* param = ir_node->op.param_mem;

if (input_tensor->data_type != TENGINE_DT_FP32)
{
return 0;
}

if (param->group != 1)
{
return 0;
}

return OPS_SCORE_PREFER;
}
#if 1
static struct node_ops hcl_node_ops = {
.prerun = prerun,
.run = run,
.reshape = reshape,
.postrun = postrun,
.init_node = init_node,
.release_node = release_node,
.score = score,
};

int register_conv_hcl_rv64_tile8_op()
{
TLOG_INFO("register conv_hcl_tile8 op");
return register_builtin_node_ops(OP_CONV, &hcl_node_ops);
}

int unregister_conv_hcl_rv64_tile8_op()
{
unregister_builtin_node_ops(OP_CONV, &hcl_node_ops);
return 0;
}
#endif
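The two register/unregister functions above are the hooks collected by the CMakeLists.txt glob earlier in this commit. As a minimal sketch, assuming a module-level init routine calls each per-op register function it finds (the name cpu_module_register_rv64_ops below is illustrative, not an actual Tengine symbol), the wiring would look like:

/* Hypothetical sketch of how the registration entry point is expected to be used.
 * cpu_module_register_rv64_ops() is an illustrative name, not Tengine API. */
extern int register_conv_hcl_rv64_tile8_op();

static int cpu_module_register_rv64_ops(void)
{
    /* Adds hcl_node_ops as a candidate implementation for OP_CONV; at graph
     * partition time the runtime queries each candidate's score() and binds
     * the highest bidder to every conv node. */
    return register_conv_hcl_rv64_tile8_op();
}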
