From 5c3b19106ae31e035f3926958a49b290f502181e Mon Sep 17 00:00:00 2001 From: Daniel <7994127+ddzhao91@users.noreply.github.com> Date: Fri, 30 Apr 2021 00:21:37 +0800 Subject: [PATCH] Dev/vulkan (#633) * update CMakeLists to install tengine cpp api header * update vulkan to support tengine v1.4 * Update CMakeLists.txt Co-authored-by: dongdong --- source/CMakeLists.txt | 6 + source/device/CMakeLists.txt | 17 +- source/device/vulkan/CMakeLists.txt | 171 ++ source/device/vulkan/layer/concat_vulkan.cpp | 788 +++++++ source/device/vulkan/layer/concat_vulkan.hpp | 81 + .../vulkan/layer/convolution_vulkan.cpp | 616 +++++ .../vulkan/layer/convolution_vulkan.hpp | 115 + .../layer/convolutiondepthwise_vulkan.cpp | 301 +++ .../layer/convolutiondepthwise_vulkan.hpp | 96 + source/device/vulkan/layer/crop_vulkan.cpp | 607 +++++ source/device/vulkan/layer/crop_vulkan.hpp | 95 + source/device/vulkan/layer/dropout_vulkan.cpp | 216 ++ source/device/vulkan/layer/dropout_vulkan.hpp | 78 + source/device/vulkan/layer/eltwise_vulkan.cpp | 266 +++ source/device/vulkan/layer/eltwise_vulkan.hpp | 99 + source/device/vulkan/layer/flatten_vulkan.cpp | 326 +++ source/device/vulkan/layer/flatten_vulkan.hpp | 82 + .../vulkan/layer/innerproduct_vulkan.cpp | 464 ++++ .../vulkan/layer/innerproduct_vulkan.hpp | 103 + source/device/vulkan/layer/interp_vulkan.cpp | 464 ++++ source/device/vulkan/layer/interp_vulkan.hpp | 92 + source/device/vulkan/layer/packing_vulkan.cpp | 495 ++++ source/device/vulkan/layer/packing_vulkan.hpp | 96 + source/device/vulkan/layer/padding_vulkan.cpp | 174 ++ source/device/vulkan/layer/padding_vulkan.hpp | 81 + source/device/vulkan/layer/permute_vulkan.cpp | 475 ++++ source/device/vulkan/layer/permute_vulkan.hpp | 84 + source/device/vulkan/layer/pooling_vulkan.cpp | 338 +++ source/device/vulkan/layer/pooling_vulkan.hpp | 95 + .../device/vulkan/layer/priorbox_vulkan.cpp | 351 +++ .../device/vulkan/layer/priorbox_vulkan.hpp | 96 + source/device/vulkan/layer/relu_vulkan.cpp | 214 ++ source/device/vulkan/layer/relu_vulkan.hpp | 79 + source/device/vulkan/layer/reshape_vulkan.cpp | 580 +++++ source/device/vulkan/layer/reshape_vulkan.hpp | 98 + source/device/vulkan/layer/softmax_vulkan.cpp | 486 ++++ source/device/vulkan/layer/softmax_vulkan.hpp | 90 + .../device/vulkan/layer_shader_registry.h.in | 6 + .../device/vulkan/layer_shader_spv_data.h.in | 6 + source/device/vulkan/layer_shader_type.h | 54 + .../device/vulkan/layer_shader_type_enum.h.in | 5 + source/device/vulkan/layer_type_enum.h.in | 5 + source/device/vulkan/shaders/concat.comp | 108 + .../device/vulkan/shaders/concat_pack4.comp | 108 + .../vulkan/shaders/concat_pack4to1.comp | 164 ++ .../device/vulkan/shaders/concat_pack8.comp | 109 + .../vulkan/shaders/concat_pack8to1.comp | 190 ++ .../vulkan/shaders/concat_pack8to4.comp | 154 ++ source/device/vulkan/shaders/convolution.comp | 175 ++ .../vulkan/shaders/convolution_1x1s1d1.comp | 187 ++ .../vulkan/shaders/convolution_pack1to4.comp | 183 ++ .../vulkan/shaders/convolution_pack1to8.comp | 193 ++ .../vulkan/shaders/convolution_pack4.comp | 203 ++ .../shaders/convolution_pack4_1x1s1d1.comp | 237 ++ ...olution_pack4_3x3s1d1_winograd23_gemm.comp | 139 ++ ...k4_3x3s1d1_winograd23_transform_input.comp | 202 ++ ...4_3x3s1d1_winograd23_transform_output.comp | 209 ++ .../vulkan/shaders/convolution_pack4to1.comp | 183 ++ .../vulkan/shaders/convolution_pack4to8.comp | 219 ++ .../vulkan/shaders/convolution_pack8.comp | 219 ++ .../shaders/convolution_pack8_1x1s1d1.comp | 327 +++ 
...olution_pack8_3x3s1d1_winograd23_gemm.comp | 198 ++ ...k8_3x3s1d1_winograd23_transform_input.comp | 203 ++ ...8_3x3s1d1_winograd23_transform_output.comp | 230 ++ .../vulkan/shaders/convolution_pack8to1.comp | 186 ++ .../vulkan/shaders/convolution_pack8to4.comp | 198 ++ .../vulkan/shaders/convolutiondepthwise.comp | 170 ++ .../shaders/convolutiondepthwise_group.comp | 186 ++ .../convolutiondepthwise_group_pack1to4.comp | 194 ++ .../convolutiondepthwise_group_pack1to8.comp | 204 ++ .../convolutiondepthwise_group_pack4.comp | 214 ++ .../convolutiondepthwise_group_pack4to1.comp | 194 ++ .../convolutiondepthwise_group_pack4to8.comp | 230 ++ .../convolutiondepthwise_group_pack8.comp | 230 ++ .../convolutiondepthwise_group_pack8to1.comp | 197 ++ .../convolutiondepthwise_group_pack8to4.comp | 209 ++ .../shaders/convolutiondepthwise_pack4.comp | 178 ++ .../shaders/convolutiondepthwise_pack8.comp | 191 ++ source/device/vulkan/shaders/crop.comp | 92 + .../device/vulkan/shaders/crop_pack1to4.comp | 98 + .../device/vulkan/shaders/crop_pack1to8.comp | 104 + source/device/vulkan/shaders/crop_pack4.comp | 92 + .../device/vulkan/shaders/crop_pack4to1.comp | 107 + .../device/vulkan/shaders/crop_pack4to8.comp | 182 ++ source/device/vulkan/shaders/crop_pack8.comp | 93 + .../device/vulkan/shaders/crop_pack8to1.comp | 108 + .../device/vulkan/shaders/crop_pack8to4.comp | 149 ++ .../vulkan/shaders/depthwiseconvolution.comp | 121 + source/device/vulkan/shaders/dropout.comp | 104 + .../device/vulkan/shaders/dropout_pack4.comp | 104 + .../device/vulkan/shaders/dropout_pack8.comp | 106 + source/device/vulkan/shaders/eltwise.comp | 141 ++ .../device/vulkan/shaders/eltwise_pack4.comp | 141 ++ .../device/vulkan/shaders/eltwise_pack8.comp | 160 ++ source/device/vulkan/shaders/flatten.comp | 98 + .../vulkan/shaders/flatten_pack1to4.comp | 127 + .../vulkan/shaders/flatten_pack1to8.comp | 154 ++ .../device/vulkan/shaders/flatten_pack4.comp | 175 ++ .../vulkan/shaders/flatten_pack4to8.comp | 222 ++ .../device/vulkan/shaders/flatten_pack8.comp | 222 ++ .../device/vulkan/shaders/innerproduct.comp | 140 ++ .../vulkan/shaders/innerproduct_pack1to4.comp | 148 ++ .../vulkan/shaders/innerproduct_pack1to8.comp | 160 ++ .../vulkan/shaders/innerproduct_pack4.comp | 171 ++ .../vulkan/shaders/innerproduct_pack4to1.comp | 148 ++ .../vulkan/shaders/innerproduct_pack4to8.comp | 188 ++ .../vulkan/shaders/innerproduct_pack8.comp | 188 ++ .../vulkan/shaders/innerproduct_pack8to1.comp | 151 ++ .../vulkan/shaders/innerproduct_pack8to4.comp | 167 ++ source/device/vulkan/shaders/interp.comp | 149 ++ .../device/vulkan/shaders/interp_bicubic.comp | 149 ++ .../vulkan/shaders/interp_bicubic_coeffs.comp | 107 + .../vulkan/shaders/interp_bicubic_pack4.comp | 163 ++ .../vulkan/shaders/interp_bicubic_pack8.comp | 175 ++ .../device/vulkan/shaders/interp_pack4.comp | 150 ++ .../device/vulkan/shaders/interp_pack8.comp | 238 ++ source/device/vulkan/shaders/packing.comp | 165 ++ .../vulkan/shaders/packing_fp16_to_fp32.comp | 165 ++ .../vulkan/shaders/packing_fp32_to_fp16.comp | 165 ++ .../vulkan/shaders/packing_pack1to4.comp | 195 ++ .../packing_pack1to4_fp16_to_fp32.comp | 195 ++ .../packing_pack1to4_fp32_to_fp16.comp | 195 ++ .../vulkan/shaders/packing_pack1to8.comp | 223 ++ .../packing_pack1to8_fp16_to_fp32.comp | 226 ++ .../packing_pack1to8_fp32_to_fp16.comp | 223 ++ .../device/vulkan/shaders/packing_pack4.comp | 165 ++ .../shaders/packing_pack4_fp16_to_fp32.comp | 165 ++ .../shaders/packing_pack4_fp32_to_fp16.comp | 165 ++ 
.../vulkan/shaders/packing_pack4to1.comp | 195 ++ .../packing_pack4to1_fp16_to_fp32.comp | 195 ++ .../packing_pack4to1_fp32_to_fp16.comp | 195 ++ .../vulkan/shaders/packing_pack4to8.comp | 184 ++ .../packing_pack4to8_fp16_to_fp32.comp | 184 ++ .../packing_pack4to8_fp32_to_fp16.comp | 184 ++ .../device/vulkan/shaders/packing_pack8.comp | 166 ++ .../shaders/packing_pack8_fp16_to_fp32.comp | 169 ++ .../shaders/packing_pack8_fp32_to_fp16.comp | 166 ++ .../vulkan/shaders/packing_pack8to1.comp | 223 ++ .../packing_pack8to1_fp16_to_fp32.comp | 223 ++ .../packing_pack8to1_fp32_to_fp16.comp | 223 ++ .../vulkan/shaders/packing_pack8to4.comp | 184 ++ .../packing_pack8to4_fp16_to_fp32.comp | 184 ++ .../packing_pack8to4_fp32_to_fp16.comp | 184 ++ source/device/vulkan/shaders/padding.comp | 145 ++ .../device/vulkan/shaders/padding_pack4.comp | 144 ++ .../device/vulkan/shaders/padding_pack8.comp | 144 ++ source/device/vulkan/shaders/permute.comp | 186 ++ .../vulkan/shaders/permute_pack1to4.comp | 234 ++ .../vulkan/shaders/permute_pack1to8.comp | 284 +++ .../device/vulkan/shaders/permute_pack4.comp | 281 +++ .../vulkan/shaders/permute_pack4to1.comp | 230 ++ .../vulkan/shaders/permute_pack4to8.comp | 350 +++ .../device/vulkan/shaders/permute_pack8.comp | 350 +++ .../vulkan/shaders/permute_pack8to1.comp | 280 +++ .../vulkan/shaders/permute_pack8to4.comp | 285 +++ source/device/vulkan/shaders/pooling.comp | 226 ++ .../device/vulkan/shaders/pooling_global.comp | 130 ++ .../vulkan/shaders/pooling_global_pack4.comp | 130 ++ .../vulkan/shaders/pooling_global_pack8.comp | 139 ++ .../device/vulkan/shaders/pooling_pack4.comp | 226 ++ .../device/vulkan/shaders/pooling_pack8.comp | 242 ++ source/device/vulkan/shaders/priorbox.comp | 170 ++ .../device/vulkan/shaders/priorbox_mxnet.comp | 92 + source/device/vulkan/shaders/relu.comp | 107 + source/device/vulkan/shaders/relu_pack4.comp | 107 + source/device/vulkan/shaders/relu_pack8.comp | 114 + source/device/vulkan/shaders/reshape.comp | 138 ++ .../vulkan/shaders/reshape_pack1to4.comp | 147 ++ .../vulkan/shaders/reshape_pack1to8.comp | 177 ++ .../device/vulkan/shaders/reshape_pack4.comp | 228 ++ .../vulkan/shaders/reshape_pack4to1.comp | 166 ++ .../vulkan/shaders/reshape_pack4to8.comp | 301 +++ .../device/vulkan/shaders/reshape_pack8.comp | 301 +++ .../vulkan/shaders/reshape_pack8to1.comp | 195 ++ .../vulkan/shaders/reshape_pack8to4.comp | 231 ++ .../vulkan/shaders/softmax_div_sum.comp | 166 ++ .../vulkan/shaders/softmax_div_sum_pack4.comp | 175 ++ .../vulkan/shaders/softmax_div_sum_pack8.comp | 177 ++ .../vulkan/shaders/softmax_exp_sub_max.comp | 166 ++ .../shaders/softmax_exp_sub_max_pack4.comp | 175 ++ .../shaders/softmax_exp_sub_max_pack8.comp | 177 ++ .../vulkan/shaders/softmax_reduce_max.comp | 198 ++ .../shaders/softmax_reduce_max_pack4.comp | 204 ++ .../shaders/softmax_reduce_max_pack8.comp | 217 ++ .../vulkan/shaders/softmax_reduce_sum.comp | 198 ++ .../shaders/softmax_reduce_sum_pack4.comp | 204 ++ .../shaders/softmax_reduce_sum_pack8.comp | 211 ++ source/device/vulkan/vulkan_allocator.cpp | 1474 ++++++++++++ source/device/vulkan/vulkan_allocator.hpp | 284 +++ source/device/vulkan/vulkan_command.cpp | 1782 +++++++++++++++ source/device/vulkan/vulkan_command.hpp | 168 ++ source/device/vulkan/vulkan_define.h | 34 + source/device/vulkan/vulkan_device.cc | 234 ++ source/device/vulkan/vulkan_device.hpp | 40 + source/device/vulkan/vulkan_executor.cc | 98 + source/device/vulkan/vulkan_executor.hpp | 89 + source/device/vulkan/vulkan_gpu.cpp | 2036 +++++++++++++++++ 
source/device/vulkan/vulkan_gpu.hpp | 349 +++ source/device/vulkan/vulkan_graph.cc | 545 +++++ source/device/vulkan/vulkan_graph.hpp | 139 ++ source/device/vulkan/vulkan_helper.cc | 311 +++ source/device/vulkan/vulkan_helper.hpp | 63 + source/device/vulkan/vulkan_layer.cpp | 84 + source/device/vulkan/vulkan_layer.hpp | 119 + source/device/vulkan/vulkan_limit.hpp | 160 ++ source/device/vulkan/vulkan_option.cpp | 73 + source/device/vulkan/vulkan_option.hpp | 128 ++ source/device/vulkan/vulkan_pipeline.cpp | 568 +++++ source/device/vulkan/vulkan_pipeline.hpp | 130 ++ source/device/vulkan/vulkan_platform.hpp | 92 + source/device/vulkan/vulkan_tensor.cpp | 374 +++ source/device/vulkan/vulkan_tensor.hpp | 1817 +++++++++++++++ 212 files changed, 46448 insertions(+), 1 deletion(-) create mode 100644 source/device/vulkan/CMakeLists.txt create mode 100644 source/device/vulkan/layer/concat_vulkan.cpp create mode 100644 source/device/vulkan/layer/concat_vulkan.hpp create mode 100644 source/device/vulkan/layer/convolution_vulkan.cpp create mode 100644 source/device/vulkan/layer/convolution_vulkan.hpp create mode 100644 source/device/vulkan/layer/convolutiondepthwise_vulkan.cpp create mode 100644 source/device/vulkan/layer/convolutiondepthwise_vulkan.hpp create mode 100644 source/device/vulkan/layer/crop_vulkan.cpp create mode 100644 source/device/vulkan/layer/crop_vulkan.hpp create mode 100644 source/device/vulkan/layer/dropout_vulkan.cpp create mode 100644 source/device/vulkan/layer/dropout_vulkan.hpp create mode 100644 source/device/vulkan/layer/eltwise_vulkan.cpp create mode 100644 source/device/vulkan/layer/eltwise_vulkan.hpp create mode 100644 source/device/vulkan/layer/flatten_vulkan.cpp create mode 100644 source/device/vulkan/layer/flatten_vulkan.hpp create mode 100644 source/device/vulkan/layer/innerproduct_vulkan.cpp create mode 100644 source/device/vulkan/layer/innerproduct_vulkan.hpp create mode 100644 source/device/vulkan/layer/interp_vulkan.cpp create mode 100644 source/device/vulkan/layer/interp_vulkan.hpp create mode 100644 source/device/vulkan/layer/packing_vulkan.cpp create mode 100644 source/device/vulkan/layer/packing_vulkan.hpp create mode 100644 source/device/vulkan/layer/padding_vulkan.cpp create mode 100644 source/device/vulkan/layer/padding_vulkan.hpp create mode 100644 source/device/vulkan/layer/permute_vulkan.cpp create mode 100644 source/device/vulkan/layer/permute_vulkan.hpp create mode 100644 source/device/vulkan/layer/pooling_vulkan.cpp create mode 100644 source/device/vulkan/layer/pooling_vulkan.hpp create mode 100644 source/device/vulkan/layer/priorbox_vulkan.cpp create mode 100644 source/device/vulkan/layer/priorbox_vulkan.hpp create mode 100644 source/device/vulkan/layer/relu_vulkan.cpp create mode 100644 source/device/vulkan/layer/relu_vulkan.hpp create mode 100644 source/device/vulkan/layer/reshape_vulkan.cpp create mode 100644 source/device/vulkan/layer/reshape_vulkan.hpp create mode 100644 source/device/vulkan/layer/softmax_vulkan.cpp create mode 100644 source/device/vulkan/layer/softmax_vulkan.hpp create mode 100644 source/device/vulkan/layer_shader_registry.h.in create mode 100644 source/device/vulkan/layer_shader_spv_data.h.in create mode 100644 source/device/vulkan/layer_shader_type.h create mode 100644 source/device/vulkan/layer_shader_type_enum.h.in create mode 100644 source/device/vulkan/layer_type_enum.h.in create mode 100644 source/device/vulkan/shaders/concat.comp create mode 100644 source/device/vulkan/shaders/concat_pack4.comp create mode 100644 
source/device/vulkan/shaders/concat_pack4to1.comp create mode 100644 source/device/vulkan/shaders/concat_pack8.comp create mode 100644 source/device/vulkan/shaders/concat_pack8to1.comp create mode 100644 source/device/vulkan/shaders/concat_pack8to4.comp create mode 100644 source/device/vulkan/shaders/convolution.comp create mode 100644 source/device/vulkan/shaders/convolution_1x1s1d1.comp create mode 100644 source/device/vulkan/shaders/convolution_pack1to4.comp create mode 100644 source/device/vulkan/shaders/convolution_pack1to8.comp create mode 100644 source/device/vulkan/shaders/convolution_pack4.comp create mode 100644 source/device/vulkan/shaders/convolution_pack4_1x1s1d1.comp create mode 100644 source/device/vulkan/shaders/convolution_pack4_3x3s1d1_winograd23_gemm.comp create mode 100644 source/device/vulkan/shaders/convolution_pack4_3x3s1d1_winograd23_transform_input.comp create mode 100644 source/device/vulkan/shaders/convolution_pack4_3x3s1d1_winograd23_transform_output.comp create mode 100644 source/device/vulkan/shaders/convolution_pack4to1.comp create mode 100644 source/device/vulkan/shaders/convolution_pack4to8.comp create mode 100644 source/device/vulkan/shaders/convolution_pack8.comp create mode 100644 source/device/vulkan/shaders/convolution_pack8_1x1s1d1.comp create mode 100644 source/device/vulkan/shaders/convolution_pack8_3x3s1d1_winograd23_gemm.comp create mode 100644 source/device/vulkan/shaders/convolution_pack8_3x3s1d1_winograd23_transform_input.comp create mode 100644 source/device/vulkan/shaders/convolution_pack8_3x3s1d1_winograd23_transform_output.comp create mode 100644 source/device/vulkan/shaders/convolution_pack8to1.comp create mode 100644 source/device/vulkan/shaders/convolution_pack8to4.comp create mode 100644 source/device/vulkan/shaders/convolutiondepthwise.comp create mode 100644 source/device/vulkan/shaders/convolutiondepthwise_group.comp create mode 100644 source/device/vulkan/shaders/convolutiondepthwise_group_pack1to4.comp create mode 100644 source/device/vulkan/shaders/convolutiondepthwise_group_pack1to8.comp create mode 100644 source/device/vulkan/shaders/convolutiondepthwise_group_pack4.comp create mode 100644 source/device/vulkan/shaders/convolutiondepthwise_group_pack4to1.comp create mode 100644 source/device/vulkan/shaders/convolutiondepthwise_group_pack4to8.comp create mode 100644 source/device/vulkan/shaders/convolutiondepthwise_group_pack8.comp create mode 100644 source/device/vulkan/shaders/convolutiondepthwise_group_pack8to1.comp create mode 100644 source/device/vulkan/shaders/convolutiondepthwise_group_pack8to4.comp create mode 100644 source/device/vulkan/shaders/convolutiondepthwise_pack4.comp create mode 100644 source/device/vulkan/shaders/convolutiondepthwise_pack8.comp create mode 100644 source/device/vulkan/shaders/crop.comp create mode 100644 source/device/vulkan/shaders/crop_pack1to4.comp create mode 100644 source/device/vulkan/shaders/crop_pack1to8.comp create mode 100644 source/device/vulkan/shaders/crop_pack4.comp create mode 100644 source/device/vulkan/shaders/crop_pack4to1.comp create mode 100644 source/device/vulkan/shaders/crop_pack4to8.comp create mode 100644 source/device/vulkan/shaders/crop_pack8.comp create mode 100644 source/device/vulkan/shaders/crop_pack8to1.comp create mode 100644 source/device/vulkan/shaders/crop_pack8to4.comp create mode 100644 source/device/vulkan/shaders/depthwiseconvolution.comp create mode 100644 source/device/vulkan/shaders/dropout.comp create mode 100644 
source/device/vulkan/shaders/dropout_pack4.comp create mode 100644 source/device/vulkan/shaders/dropout_pack8.comp create mode 100644 source/device/vulkan/shaders/eltwise.comp create mode 100644 source/device/vulkan/shaders/eltwise_pack4.comp create mode 100644 source/device/vulkan/shaders/eltwise_pack8.comp create mode 100644 source/device/vulkan/shaders/flatten.comp create mode 100644 source/device/vulkan/shaders/flatten_pack1to4.comp create mode 100644 source/device/vulkan/shaders/flatten_pack1to8.comp create mode 100644 source/device/vulkan/shaders/flatten_pack4.comp create mode 100644 source/device/vulkan/shaders/flatten_pack4to8.comp create mode 100644 source/device/vulkan/shaders/flatten_pack8.comp create mode 100644 source/device/vulkan/shaders/innerproduct.comp create mode 100644 source/device/vulkan/shaders/innerproduct_pack1to4.comp create mode 100644 source/device/vulkan/shaders/innerproduct_pack1to8.comp create mode 100644 source/device/vulkan/shaders/innerproduct_pack4.comp create mode 100644 source/device/vulkan/shaders/innerproduct_pack4to1.comp create mode 100644 source/device/vulkan/shaders/innerproduct_pack4to8.comp create mode 100644 source/device/vulkan/shaders/innerproduct_pack8.comp create mode 100644 source/device/vulkan/shaders/innerproduct_pack8to1.comp create mode 100644 source/device/vulkan/shaders/innerproduct_pack8to4.comp create mode 100644 source/device/vulkan/shaders/interp.comp create mode 100644 source/device/vulkan/shaders/interp_bicubic.comp create mode 100644 source/device/vulkan/shaders/interp_bicubic_coeffs.comp create mode 100644 source/device/vulkan/shaders/interp_bicubic_pack4.comp create mode 100644 source/device/vulkan/shaders/interp_bicubic_pack8.comp create mode 100644 source/device/vulkan/shaders/interp_pack4.comp create mode 100644 source/device/vulkan/shaders/interp_pack8.comp create mode 100644 source/device/vulkan/shaders/packing.comp create mode 100644 source/device/vulkan/shaders/packing_fp16_to_fp32.comp create mode 100644 source/device/vulkan/shaders/packing_fp32_to_fp16.comp create mode 100644 source/device/vulkan/shaders/packing_pack1to4.comp create mode 100644 source/device/vulkan/shaders/packing_pack1to4_fp16_to_fp32.comp create mode 100644 source/device/vulkan/shaders/packing_pack1to4_fp32_to_fp16.comp create mode 100644 source/device/vulkan/shaders/packing_pack1to8.comp create mode 100644 source/device/vulkan/shaders/packing_pack1to8_fp16_to_fp32.comp create mode 100644 source/device/vulkan/shaders/packing_pack1to8_fp32_to_fp16.comp create mode 100644 source/device/vulkan/shaders/packing_pack4.comp create mode 100644 source/device/vulkan/shaders/packing_pack4_fp16_to_fp32.comp create mode 100644 source/device/vulkan/shaders/packing_pack4_fp32_to_fp16.comp create mode 100644 source/device/vulkan/shaders/packing_pack4to1.comp create mode 100644 source/device/vulkan/shaders/packing_pack4to1_fp16_to_fp32.comp create mode 100644 source/device/vulkan/shaders/packing_pack4to1_fp32_to_fp16.comp create mode 100644 source/device/vulkan/shaders/packing_pack4to8.comp create mode 100644 source/device/vulkan/shaders/packing_pack4to8_fp16_to_fp32.comp create mode 100644 source/device/vulkan/shaders/packing_pack4to8_fp32_to_fp16.comp create mode 100644 source/device/vulkan/shaders/packing_pack8.comp create mode 100644 source/device/vulkan/shaders/packing_pack8_fp16_to_fp32.comp create mode 100644 source/device/vulkan/shaders/packing_pack8_fp32_to_fp16.comp create mode 100644 source/device/vulkan/shaders/packing_pack8to1.comp create mode 100644 
source/device/vulkan/shaders/packing_pack8to1_fp16_to_fp32.comp create mode 100644 source/device/vulkan/shaders/packing_pack8to1_fp32_to_fp16.comp create mode 100644 source/device/vulkan/shaders/packing_pack8to4.comp create mode 100644 source/device/vulkan/shaders/packing_pack8to4_fp16_to_fp32.comp create mode 100644 source/device/vulkan/shaders/packing_pack8to4_fp32_to_fp16.comp create mode 100644 source/device/vulkan/shaders/padding.comp create mode 100644 source/device/vulkan/shaders/padding_pack4.comp create mode 100644 source/device/vulkan/shaders/padding_pack8.comp create mode 100644 source/device/vulkan/shaders/permute.comp create mode 100644 source/device/vulkan/shaders/permute_pack1to4.comp create mode 100644 source/device/vulkan/shaders/permute_pack1to8.comp create mode 100644 source/device/vulkan/shaders/permute_pack4.comp create mode 100644 source/device/vulkan/shaders/permute_pack4to1.comp create mode 100644 source/device/vulkan/shaders/permute_pack4to8.comp create mode 100644 source/device/vulkan/shaders/permute_pack8.comp create mode 100644 source/device/vulkan/shaders/permute_pack8to1.comp create mode 100644 source/device/vulkan/shaders/permute_pack8to4.comp create mode 100644 source/device/vulkan/shaders/pooling.comp create mode 100644 source/device/vulkan/shaders/pooling_global.comp create mode 100644 source/device/vulkan/shaders/pooling_global_pack4.comp create mode 100644 source/device/vulkan/shaders/pooling_global_pack8.comp create mode 100644 source/device/vulkan/shaders/pooling_pack4.comp create mode 100644 source/device/vulkan/shaders/pooling_pack8.comp create mode 100644 source/device/vulkan/shaders/priorbox.comp create mode 100644 source/device/vulkan/shaders/priorbox_mxnet.comp create mode 100644 source/device/vulkan/shaders/relu.comp create mode 100644 source/device/vulkan/shaders/relu_pack4.comp create mode 100644 source/device/vulkan/shaders/relu_pack8.comp create mode 100644 source/device/vulkan/shaders/reshape.comp create mode 100644 source/device/vulkan/shaders/reshape_pack1to4.comp create mode 100644 source/device/vulkan/shaders/reshape_pack1to8.comp create mode 100644 source/device/vulkan/shaders/reshape_pack4.comp create mode 100644 source/device/vulkan/shaders/reshape_pack4to1.comp create mode 100644 source/device/vulkan/shaders/reshape_pack4to8.comp create mode 100644 source/device/vulkan/shaders/reshape_pack8.comp create mode 100644 source/device/vulkan/shaders/reshape_pack8to1.comp create mode 100644 source/device/vulkan/shaders/reshape_pack8to4.comp create mode 100644 source/device/vulkan/shaders/softmax_div_sum.comp create mode 100644 source/device/vulkan/shaders/softmax_div_sum_pack4.comp create mode 100644 source/device/vulkan/shaders/softmax_div_sum_pack8.comp create mode 100644 source/device/vulkan/shaders/softmax_exp_sub_max.comp create mode 100644 source/device/vulkan/shaders/softmax_exp_sub_max_pack4.comp create mode 100644 source/device/vulkan/shaders/softmax_exp_sub_max_pack8.comp create mode 100644 source/device/vulkan/shaders/softmax_reduce_max.comp create mode 100644 source/device/vulkan/shaders/softmax_reduce_max_pack4.comp create mode 100644 source/device/vulkan/shaders/softmax_reduce_max_pack8.comp create mode 100644 source/device/vulkan/shaders/softmax_reduce_sum.comp create mode 100644 source/device/vulkan/shaders/softmax_reduce_sum_pack4.comp create mode 100644 source/device/vulkan/shaders/softmax_reduce_sum_pack8.comp create mode 100644 source/device/vulkan/vulkan_allocator.cpp create mode 100644 
source/device/vulkan/vulkan_allocator.hpp create mode 100644 source/device/vulkan/vulkan_command.cpp create mode 100644 source/device/vulkan/vulkan_command.hpp create mode 100644 source/device/vulkan/vulkan_define.h create mode 100644 source/device/vulkan/vulkan_device.cc create mode 100644 source/device/vulkan/vulkan_device.hpp create mode 100644 source/device/vulkan/vulkan_executor.cc create mode 100644 source/device/vulkan/vulkan_executor.hpp create mode 100644 source/device/vulkan/vulkan_gpu.cpp create mode 100644 source/device/vulkan/vulkan_gpu.hpp create mode 100644 source/device/vulkan/vulkan_graph.cc create mode 100644 source/device/vulkan/vulkan_graph.hpp create mode 100644 source/device/vulkan/vulkan_helper.cc create mode 100644 source/device/vulkan/vulkan_helper.hpp create mode 100644 source/device/vulkan/vulkan_layer.cpp create mode 100644 source/device/vulkan/vulkan_layer.hpp create mode 100644 source/device/vulkan/vulkan_limit.hpp create mode 100644 source/device/vulkan/vulkan_option.cpp create mode 100644 source/device/vulkan/vulkan_option.hpp create mode 100644 source/device/vulkan/vulkan_pipeline.cpp create mode 100644 source/device/vulkan/vulkan_pipeline.hpp create mode 100644 source/device/vulkan/vulkan_platform.hpp create mode 100644 source/device/vulkan/vulkan_tensor.cpp create mode 100644 source/device/vulkan/vulkan_tensor.hpp diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index 491498ab7..09065f7f8 100644 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -385,6 +385,12 @@ IF (TENGINE_ENABLE_CUDA) ENDIF() ENDIF() +# deal with depends + +FOREACH(_var ${TENGINE_DEVICE_DEPENDS_FORWARD}) + ADD_DEPENDENCIES(${TENGINE_LITE_NAME}-static ${_var}) + ADD_DEPENDENCIES(${TENGINE_LITE_NAME} ${_var}) +ENDFOREACH() # debug macro information IF (TENGINE_DEBUG_MEM_STAT) diff --git a/source/device/CMakeLists.txt b/source/device/CMakeLists.txt index 54b3103c4..e651ac6e0 100644 --- a/source/device/CMakeLists.txt +++ b/source/device/CMakeLists.txt @@ -133,6 +133,21 @@ IF (TENGINE_ENABLE_TIM_VX) LIST (APPEND _REGISTER_DEVICE_LIST "${CMAKE_SOURCE_DIR}/source/device/tim-vx/timvx_device.cc") ENDIF() +# Khronos Vulkan +IF (TENGINE_ENABLE_VULKAN) + ADD_SUBDIRECTORY (vulkan) + + LIST (APPEND _TENGINE_DEVICE_HEADER_PATH ${TENGINE_VULKAN_HEADER_PATH}) + LIST (APPEND _TENGINE_DEVICE_LINK_PATH ${TENGINE_VULKAN_LINK_PATH}) + LIST (APPEND _TENGINE_DEVICE_COMPILER_DEFINES ${TENGINE_VULKAN_COMPILER_DEFINES}) + LIST (APPEND _TENGINE_DEVICE_COMPILER_OPTIONS ${TENGINE_VULKAN_COMPILER_OPTIONS}) + LIST (APPEND _TENGINE_DEVICE_LINKER_OPTIONS ${TENGINE_VULKAN_LINKER_OPTIONS}) + LIST (APPEND _TENGINE_DEVICE_LINK_LIBRARIES ${TENGINE_VULKAN_LINK_LIBRARIES}) + LIST (APPEND _TENGINE_DEVICE_SOURCE ${TENGINE_VULKAN_DEVICE_SOURCE}) + LIST (APPEND _TENGINE_DEVICE_DEPENDS_FORWARD ${TENGINE_VULKAN_DEPENDS_FORWARD}) + LIST (APPEND _REGISTER_DEVICE_LIST "${CMAKE_SOURCE_DIR}/source/device/vulkan/vulkan_device.cc") +ENDIF() + # set var to cache SET (TENGINE_DEVICE_HEADER_PATH ${_TENGINE_DEVICE_HEADER_PATH} CACHE INTERNAL "Tengine device level header files searching path" FORCE) @@ -142,7 +157,7 @@ SET (TENGINE_DEVICE_COMPILER_DEFINES ${_TENGINE_DEVICE_COMPILER_DEFINES} CACH SET (TENGINE_DEVICE_COMPILER_OPTIONS ${_TENGINE_DEVICE_COMPILER_OPTIONS} CACHE INTERNAL "Tengine device about compiler options" FORCE) SET (TENGINE_DEVICE_LINKER_OPTIONS ${_TENGINE_DEVICE_LINKER_OPTIONS} CACHE INTERNAL "Tengine device about linker options" FORCE) SET (TENGINE_DEVICE_LINK_LIBRARIES ${_TENGINE_DEVICE_LINK_LIBRARIES} 
CACHE INTERNAL "Tengine device about link libraries" FORCE) - +SET (TENGINE_DEVICE_DEPENDS_FORWARD ${_TENGINE_DEVICE_DEPENDS_FORWARD} CACHE INTERNAL "Tengine device about depends project" FORCE) # generate device register configuration GENERATE_REGISTER_HEADER_FILE ("register_" "unregister_" "" "${CMAKE_SOURCE_DIR}/source/device/register.h.in" "${CMAKE_BINARY_DIR}/source/device/register.h" "${_REGISTER_DEVICE_LIST}") diff --git a/source/device/vulkan/CMakeLists.txt b/source/device/vulkan/CMakeLists.txt new file mode 100644 index 000000000..9273bb39e --- /dev/null +++ b/source/device/vulkan/CMakeLists.txt @@ -0,0 +1,171 @@ +# 0. clear var +UNSET (_DEV_VULKAN_HEADER_PATH) +UNSET (_VULKAN_BASE_SOURCE) +UNSET (_VULKAN_OPS_SOURCE) +UNSET (_DEV_VULKAN_DEVICE_SOURCE) +UNSET (_DEV_VULKAN_COMPILER_DEFINES) +UNSET (_DEV_VULKAN_COMPILER_OPTIONS) +UNSET (_DEV_VULKAN_LINKER_OPTIONS) +UNSET (_DEV_VULKAN_LINK_LIBRARIES) + + + +find_program(GLSLANGVALIDATOR_EXECUTABLE NAMES glslangValidator PATHS $ENV{VULKAN_SDK}/bin NO_CMAKE_FIND_ROOT_PATH REQUIRED) +message(STATUS "Tengine: found glslangValidator: ${GLSLANGVALIDATOR_EXECUTABLE}") + +# add shader spv header generate macro +include(${CMAKE_SOURCE_DIR}/cmake/generate_shader_spv_header.cmake) + +macro(add_shader SHADER_SRC) + message(STATUS "SHADER_SRC: ${SHADER_SRC}") + generate_shader_spv_header(SHADER_SPV_HEADER SHADER_SPV_HEX_HEADERS ${SHADER_SRC}) + + + get_filename_component(SHADER_SPV_HEADER_NAME ${SHADER_SPV_HEADER} NAME) + string(APPEND layer_shader_spv_data "#include \"${SHADER_SPV_HEADER_NAME}\"\n") + + get_filename_component(SHADER_SRC_NAME_WE ${SHADER_SRC} NAME_WE) + string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_spv_data,sizeof(${SHADER_SRC_NAME_WE}_spv_data)},\n") + string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_fp16p_spv_data,sizeof(${SHADER_SRC_NAME_WE}_fp16p_spv_data)},\n") + string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_fp16pa_spv_data,sizeof(${SHADER_SRC_NAME_WE}_fp16pa_spv_data)},\n") + string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_fp16s_spv_data,sizeof(${SHADER_SRC_NAME_WE}_fp16s_spv_data)},\n") + string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_fp16sa_spv_data,sizeof(${SHADER_SRC_NAME_WE}_fp16sa_spv_data)},\n") + string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_spv_data)},\n") + string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_fp16p_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_fp16p_spv_data)},\n") + string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_fp16pa_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_fp16pa_spv_data)},\n") + string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_fp16s_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_fp16s_spv_data)},\n") + string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_fp16sa_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_fp16sa_spv_data)},\n") + + list(APPEND SHADER_SPV_HEX_FILES ${SHADER_SPV_HEADER}) + list(APPEND SHADER_SPV_HEX_FILES ${SHADER_SPV_HEX_HEADERS}) + + # generate layer_shader_type_enum file + set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE} = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") + math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") + set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_fp16p = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") + math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") + 
set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_fp16pa = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") + math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") + set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_fp16s = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") + math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") + set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_fp16sa = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") + math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") + set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") + math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") + set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image_fp16p = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") + math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") + set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image_fp16pa = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") + math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") + set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image_fp16s = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") + math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") + set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image_fp16sa = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") + math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") + +endmacro() + +macro(add_layer class) + string(TOLOWER ${class} name) + + file(GLOB_RECURSE SHADER_SRCS "shaders/${name}.comp") + file(GLOB_RECURSE SHADER_SUBSRCS "shaders/${name}_*.comp") + list(APPEND SHADER_SRCS ${SHADER_SUBSRCS}) + foreach(SHADER_SRC ${SHADER_SRCS}) + add_shader(${SHADER_SRC}) + endforeach() + + # generate layer_type_enum file + set(layer_type_enum "${layer_type_enum}${class} = ${__LAYER_TYPE_ENUM_INDEX},\n") + math(EXPR __LAYER_TYPE_ENUM_INDEX "${__LAYER_TYPE_ENUM_INDEX}+1") +endmacro() + +set(SHADER_SPV_HEX_FILES) + +set(__LAYER_TYPE_ENUM_INDEX 0) +set(__LAYER_SHADER_TYPE_ENUM_INDEX 0) + +add_layer(Convolution) +add_layer(ConvolutionDepthWise) +add_layer(Pooling) +add_layer(Padding) +add_layer(Packing) +add_layer(InnerProduct) +add_layer(Flatten) +add_layer(Relu) +add_layer(Eltwise) +add_layer(Softmax) +add_layer(Dropout) +add_layer(PriorBox) +add_layer(Permute) +add_layer(Reshape) +add_layer(Concat) +add_layer(Interp) +add_layer(Crop) + +add_custom_target(generate-spirv DEPENDS ${SHADER_SPV_HEX_FILES}) + +# create new registry file +configure_file(layer_shader_registry.h.in ${CMAKE_CURRENT_BINARY_DIR}/layer_shader_registry.h) +configure_file(layer_shader_spv_data.h.in ${CMAKE_CURRENT_BINARY_DIR}/layer_shader_spv_data.h) +configure_file(layer_type_enum.h.in ${CMAKE_CURRENT_BINARY_DIR}/layer_type_enum.h) +configure_file(layer_shader_type_enum.h.in ${CMAKE_CURRENT_BINARY_DIR}/layer_shader_type_enum.h) + +# find_package(Vulkan QUIET) +set(VULKAN_LIBRARY "/usr/lib/x86_64-linux-gnu/" CACHE INTERNAL " " FORCE) +set(VULKAN_INCLUDE_DIRS "/usr/include/vulkan/" CACHE INTERNAL " " FORCE) + +# 1. set source root path +SET(_VULKAN_ROOT ${CMAKE_SOURCE_DIR}/source/device/vulkan) +SET(_VULKAN_BUILD_ROOT ${CMAKE_CURRENT_BINARY_DIR}) + + +# 2. 
add header file path +LIST (APPEND _DEV_VULKAN_HEADER_PATH ${_VULKAN_BUILD_ROOT}) +LIST (APPEND _DEV_VULKAN_HEADER_PATH ${_VULKAN_ROOT}) +LIST (APPEND _DEV_VULKAN_HEADER_PATH ${VULKAN_INCLUDE_DIRS}) + + +# 3. add linking lib searching path +LIST (APPEND _DEV_VULKAN_LINK_PATH ${VULKAN_LIBRARY}) + + +# 4. add source files +AUX_SOURCE_DIRECTORY("${_VULKAN_ROOT}" _VULKAN_BASE_SOURCE) +AUX_SOURCE_DIRECTORY("${_VULKAN_ROOT}/layer" _VULKAN_OPS_SOURCE) +LIST (APPEND _DEV_VULKAN_DEVICE_SOURCE ${_VULKAN_BASE_SOURCE}) +LIST (APPEND _DEV_VULKAN_DEVICE_SOURCE ${_VULKAN_OPS_SOURCE}) + + +# 5. add build options for vulkan device +# 5.1 is a gcc or clang like compiler +IF (TENGINE_COMPILER_GCC OR TENGINE_COMPILER_CLANG) + IF (TENGINE_COMPILER_GCC AND (${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER_EQUAL "6.1")) + LIST (APPEND _DEV_VULKAN_COMPILER_OPTIONS -Wno-ignored-attributes) + ENDIF() +ENDIF() + + +# 5.2 is Microsoft Visual C++ +IF (TENGINE_COMPILER_MSVC) +ENDIF() + + +# 6. add link options + + +# 7. add link libs +LIST (APPEND _DEV_VULKAN_LINK_LIBRARIES "libvulkan.so") + + +# 8. set all to cmake cache +SET (TENGINE_VULKAN_HEADER_PATH ${_DEV_VULKAN_HEADER_PATH} CACHE INTERNAL "Tengine VULKAN device header files searching path" FORCE) +SET (TENGINE_VULKAN_LINK_PATH ${_DEV_VULKAN_LINK_PATH} CACHE INTERNAL "Tengine VULKAN device link libraries searching path" FORCE) +SET (TENGINE_VULKAN_DEVICE_SOURCE ${_DEV_VULKAN_DEVICE_SOURCE} CACHE INTERNAL "Tengine VULKAN device main source files" FORCE) +SET (TENGINE_VULKAN_COMPILER_DEFINES ${_DEV_VULKAN_COMPILER_DEFINES} CACHE INTERNAL "Tengine VULKAN about compiler defines" FORCE) +SET (TENGINE_VULKAN_COMPILER_OPTIONS ${_DEV_VULKAN_COMPILER_OPTIONS} CACHE INTERNAL "Tengine VULKAN about compiler options" FORCE) +SET (TENGINE_VULKAN_LINKER_OPTIONS ${_DEV_VULKAN_LINKER_OPTIONS} CACHE INTERNAL "Tengine VULKAN about linker options" FORCE) +SET (TENGINE_VULKAN_LINK_LIBRARIES ${_DEV_VULKAN_LINK_LIBRARIES} CACHE INTERNAL "Tengine VULKAN about link libraries" FORCE) +SET (TENGINE_VULKAN_DEPENDS_FORWARD generate-spirv CACHE INTERNAL "Tengine VULKAN about depends project" FORCE) + + +# 9. install device option +INSTALL (FILES ${_VULKAN_ROOT}/vulkan_define.h DESTINATION include/tengine RENAME vulkan_device.h) diff --git a/source/device/vulkan/layer/concat_vulkan.cpp b/source/device/vulkan/layer/concat_vulkan.cpp new file mode 100644 index 000000000..926e7b19a --- /dev/null +++ b/source/device/vulkan/layer/concat_vulkan.cpp @@ -0,0 +1,788 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available.
+ * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#include "concat_vulkan.hpp" +#include "../layer_shader_type.h" + +namespace TEngine { + +Concat_vulkan::Concat_vulkan() +{ + support_vulkan = true; + support_image_storage = false; + + pipeline_concat[0] = 0; + pipeline_concat[1] = 0; + pipeline_concat_pack4[0] = 0; + pipeline_concat_pack4[1] = 0; + pipeline_concat_pack4to1[0] = 0; + pipeline_concat_pack4to1[1] = 0; + pipeline_concat_pack8[0] = 0; + pipeline_concat_pack8[1] = 0; + pipeline_concat_pack8to4[0] = 0; + pipeline_concat_pack8to4[1] = 0; + pipeline_concat_pack8to1[0] = 0; + pipeline_concat_pack8to1[1] = 0; +} + +Concat_vulkan::Concat_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) +{ + support_vulkan = true; + support_image_storage = false; + + pipeline_concat[0] = 0; + pipeline_concat[1] = 0; + pipeline_concat_pack4[0] = 0; + pipeline_concat_pack4[1] = 0; + pipeline_concat_pack4to1[0] = 0; + pipeline_concat_pack4to1[1] = 0; + pipeline_concat_pack8[0] = 0; + pipeline_concat_pack8[1] = 0; + pipeline_concat_pack8to4[0] = 0; + pipeline_concat_pack8to4[1] = 0; + pipeline_concat_pack8to1[0] = 0; + pipeline_concat_pack8to1[1] = 0; + + graph = ir_graph; + node = ir_node; + + for(int i = 0; i < ir_node->input_num; i++) + { + struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[i]); + std::string name = input->name; + bottoms.push_back(name); + } + + for(int i = 0; i < ir_node->output_num; i++) + { + struct tensor *output = get_ir_graph_tensor(graph, node->output_tensors[i]); + std::string name = output->name; + tops.push_back(name); + } + + // params + struct tensor *input_tensor = get_ir_graph_tensor(graph, node->input_tensors[0]); + struct tensor *output_tensor = get_ir_graph_tensor(graph, node->output_tensors[0]); + input_c = input_tensor->dims[1]; // param->input_channel; + input_h = input_tensor->dims[2]; + input_w = input_tensor->dims[3]; + output_c = output_tensor->dims[1]; // param->output_channel; + output_h = output_tensor->dims[2]; + output_w = output_tensor->dims[3]; + + struct concat_param *param = (struct concat_param *)ir_node->op.param_mem; + axis = param->axis - 1; +} + +int Concat_vulkan::create_pipeline(const Option& _opt) +{ + Option opt = _opt; + + const Tensor& shape = Tensor(input_w, input_h, input_c, (void*)0); // bottom_shapes.empty() ? Tensor() : bottom_shapes[0]; + const Tensor& out_shape = Tensor(output_w, output_h, output_c, (void*)0); // top_shapes.empty() ? Tensor() : top_shapes[0]; + + int out_elempack = 1; + if (out_shape.dims == 1) out_elempack = opt.use_shader_pack8 && out_shape.w % 8 == 0 ? 8 : out_shape.w % 4 == 0 ? 4 : 1; + if (out_shape.dims == 2) out_elempack = opt.use_shader_pack8 && out_shape.h % 8 == 0 ? 8 : out_shape.h % 4 == 0 ? 4 : 1; + if (out_shape.dims == 3) out_elempack = opt.use_shader_pack8 && out_shape.c % 8 == 0 ? 8 : out_shape.c % 4 == 0 ? 4 : 1; + + int elempack = 1; + if (axis == 0) + { + if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1; + if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ?
4 : 1; + if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1; + + // TODO fix other input data shape to set elempack + // for (size_t b = 1; b < bottom_shapes.size(); b++) + // { + // const Tensor& shape1 = bottom_shapes[b]; + + // int elempack1 = 1; + // if (shape1.dims == 1) elempack1 = opt.use_shader_pack8 && shape1.w % 8 == 0 ? 8 : shape1.w % 4 == 0 ? 4 : 1; + // if (shape1.dims == 2) elempack1 = opt.use_shader_pack8 && shape1.h % 8 == 0 ? 8 : shape1.h % 4 == 0 ? 4 : 1; + // if (shape1.dims == 3) elempack1 = opt.use_shader_pack8 && shape1.c % 8 == 0 ? 8 : shape1.c % 4 == 0 ? 4 : 1; + + // elempack = std::min(elempack, elempack1); + // } + } + else + { + elempack = out_elempack; + } + + size_t elemsize; + if (opt.use_fp16_storage) + { + elemsize = elempack * 2u; + } + else if (opt.use_fp16_packed) + { + elemsize = elempack == 1 ? 4u : elempack * 2u; + } + else + { + elemsize = elempack * 4u; + } + + Tensor out_shape_unpacked; + if (out_shape.dims == 1) out_shape_unpacked = Tensor(out_shape.w / elempack, (void*)0, elemsize, elempack); + if (out_shape.dims == 2) out_shape_unpacked = Tensor(out_shape.w, out_shape.h / elempack, (void*)0, elemsize, elempack); + if (out_shape.dims == 3) out_shape_unpacked = Tensor(out_shape.w, out_shape.h, out_shape.c / elempack, (void*)0, elemsize, elempack); + + // if (!vkdev->shape_support_image_storage(out_shape_unpacked)) + { + support_image_storage = false; + opt.use_image_storage = false; + } + + std::vector specializations(1 + 10); + specializations[0].i = axis; + specializations[1 + 0].i = 0; // TODO handle shape_packed for concat2 + specializations[1 + 1].i = 0; + specializations[1 + 2].i = 0; + specializations[1 + 3].i = 0; + specializations[1 + 4].i = 0; + specializations[1 + 5].i = out_shape_unpacked.dims; + specializations[1 + 6].i = out_shape_unpacked.w; + specializations[1 + 7].i = out_shape_unpacked.h; + specializations[1 + 8].i = out_shape_unpacked.c; + specializations[1 + 9].i = out_shape_unpacked.cstep; + + Tensor local_size_xyz; // TODO more precise group size guessed from out_shape_unpacked + if (out_shape_unpacked.dims == 1) + { + local_size_xyz.w = 64; + local_size_xyz.h = 1; + local_size_xyz.c = 1; + } + if (out_shape_unpacked.dims == 2) + { + local_size_xyz.w = 8; + local_size_xyz.h = 8; + local_size_xyz.c = 1; + } + if (out_shape_unpacked.dims == 3) + { + local_size_xyz.w = 4; + local_size_xyz.h = 4; + local_size_xyz.c = 4; + } + + // pack1 + if (shape.dims == 0 || elempack == 1) + { + pipeline_concat[0] = new Pipeline(vkdev); + pipeline_concat[0]->set_optimal_local_size_xyz(local_size_xyz); + pipeline_concat[0]->create(LayerShaderType::concat, opt, specializations); + pipeline_concat[1] = new Pipeline(vkdev); + pipeline_concat[1]->set_optimal_local_size_xyz(local_size_xyz); + pipeline_concat[1]->create(LayerShaderType::concat, opt, specializations); + } + + // pack4 + if (shape.dims == 0 || elempack == 4) + { + pipeline_concat_pack4[0] = new Pipeline(vkdev); + pipeline_concat_pack4[0]->set_optimal_local_size_xyz(local_size_xyz); + pipeline_concat_pack4[0]->create(LayerShaderType::concat_pack4, opt, specializations); + pipeline_concat_pack4[1] = new Pipeline(vkdev); + pipeline_concat_pack4[1]->set_optimal_local_size_xyz(local_size_xyz); + pipeline_concat_pack4[1]->create(LayerShaderType::concat_pack4, opt, specializations); + } + + // pack4to1 + if ((axis == 0 && shape.dims == 0) || elempack == 1) + { + pipeline_concat_pack4to1[0] = new Pipeline(vkdev); + 
pipeline_concat_pack4to1[0]->set_optimal_local_size_xyz(local_size_xyz); + pipeline_concat_pack4to1[0]->create(LayerShaderType::concat_pack4to1, opt, specializations); + pipeline_concat_pack4to1[1] = new Pipeline(vkdev); + pipeline_concat_pack4to1[1]->set_optimal_local_size_xyz(local_size_xyz); + pipeline_concat_pack4to1[1]->create(LayerShaderType::concat_pack4to1, opt, specializations); + } + + // pack8 + if (opt.use_shader_pack8 && (shape.dims == 0 || elempack == 8)) + { + pipeline_concat_pack8[0] = new Pipeline(vkdev); + pipeline_concat_pack8[0]->set_optimal_local_size_xyz(local_size_xyz); + pipeline_concat_pack8[0]->create(LayerShaderType::concat_pack8, opt, specializations); + pipeline_concat_pack8[1] = new Pipeline(vkdev); + pipeline_concat_pack8[1]->set_optimal_local_size_xyz(local_size_xyz); + pipeline_concat_pack8[1]->create(LayerShaderType::concat_pack8, opt, specializations); + } + + // pack8to4 + if (opt.use_shader_pack8 && ((axis == 0 && shape.dims == 0) || elempack == 4)) + { + pipeline_concat_pack8to4[0] = new Pipeline(vkdev); + pipeline_concat_pack8to4[0]->set_optimal_local_size_xyz(local_size_xyz); + pipeline_concat_pack8to4[0]->create(LayerShaderType::concat_pack8to4, opt, specializations); + pipeline_concat_pack8to4[1] = new Pipeline(vkdev); + pipeline_concat_pack8to4[1]->set_optimal_local_size_xyz(local_size_xyz); + pipeline_concat_pack8to4[1]->create(LayerShaderType::concat_pack8to4, opt, specializations); + } + + // pack8to1 + if (opt.use_shader_pack8 && ((axis == 0 && shape.dims == 0) || elempack == 1)) + { + pipeline_concat_pack8to1[0] = new Pipeline(vkdev); + pipeline_concat_pack8to1[0]->set_optimal_local_size_xyz(local_size_xyz); + pipeline_concat_pack8to1[0]->create(LayerShaderType::concat_pack8to1, opt, specializations); + pipeline_concat_pack8to1[1] = new Pipeline(vkdev); + pipeline_concat_pack8to1[1]->set_optimal_local_size_xyz(local_size_xyz); + pipeline_concat_pack8to1[1]->create(LayerShaderType::concat_pack8to1, opt, specializations); + } + + return 0; +} + +int Concat_vulkan::destroy_pipeline(const Option& /*opt*/) +{ + delete pipeline_concat[0]; + delete pipeline_concat[1]; + pipeline_concat[0] = 0; + pipeline_concat[1] = 0; + + delete pipeline_concat_pack4[0]; + delete pipeline_concat_pack4[1]; + pipeline_concat_pack4[0] = 0; + pipeline_concat_pack4[1] = 0; + + delete pipeline_concat_pack4to1[0]; + delete pipeline_concat_pack4to1[1]; + pipeline_concat_pack4to1[0] = 0; + pipeline_concat_pack4to1[1] = 0; + + delete pipeline_concat_pack8[0]; + delete pipeline_concat_pack8[1]; + pipeline_concat_pack8[0] = 0; + pipeline_concat_pack8[1] = 0; + + delete pipeline_concat_pack8to4[0]; + delete pipeline_concat_pack8to4[1]; + pipeline_concat_pack8to4[0] = 0; + pipeline_concat_pack8to4[1] = 0; + + delete pipeline_concat_pack8to1[0]; + delete pipeline_concat_pack8to1[1]; + pipeline_concat_pack8to1[0] = 0; + pipeline_concat_pack8to1[1] = 0; + + return 0; +} + +int Concat_vulkan::record_pipeline(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const +{ + int dims = bottom_blobs[0].dims; + + if (dims == 1) // axis == 0 + { + // concat vector + // total length + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + int top_w = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const VkTensor& bottom_blob = bottom_blobs[b]; + elemsize = std::min(elemsize, bottom_blob.elemsize); + elempack = std::min(elempack, bottom_blob.elempack); + top_w += bottom_blob.w * bottom_blob.elempack; + } + + 
int out_elempack = opt.use_shader_pack8 && top_w % 8 == 0 ? 8 : top_w % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (opt.use_fp16_packed && !opt.use_fp16_storage) + { + if (out_elempack == 8) out_elemsize = 8 * 2u; + if (out_elempack == 4) out_elemsize = 4 * 2u; + if (out_elempack == 1) out_elemsize = 4u; + } + + VkTensor& top_blob = top_blobs[0]; + top_blob.create(top_w / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + VkTensor top_blob_unpacked = top_blob; + if (elempack < out_elempack) + { + top_blob_unpacked.create(top_w / elempack, elemsize, elempack, opt.workspace_vkallocator); + if (top_blob_unpacked.empty()) + return -100; + } + + int woffset = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const VkTensor& bottom_blob = bottom_blobs[b]; + + std::vector bindings(2); + bindings[0] = bottom_blob; + bindings[1] = top_blob_unpacked; + + std::vector constants(11); + constants[0].i = bottom_blob.dims; + constants[1].i = bottom_blob.w; + constants[2].i = bottom_blob.h; + constants[3].i = bottom_blob.c; + constants[4].i = bottom_blob.cstep; + constants[5].i = top_blob_unpacked.dims; + constants[6].i = top_blob_unpacked.w; + constants[7].i = top_blob_unpacked.h; + constants[8].i = top_blob_unpacked.c; + constants[9].i = top_blob_unpacked.cstep; + constants[10].i = woffset; + + const Pipeline* pipeline = 0; + if (bottom_blob.elempack == 1 && elempack == 1) + { + pipeline = pipeline_concat[b % 2]; + } + else if (bottom_blob.elempack == 4 && elempack == 4) + { + pipeline = pipeline_concat_pack4[b % 2]; + } + else if (bottom_blob.elempack == 4 && elempack == 1) + { + pipeline = pipeline_concat_pack4to1[b % 2]; + } + else if (bottom_blob.elempack == 8 && elempack == 8) + { + pipeline = pipeline_concat_pack8[b % 2]; + } + else if (bottom_blob.elempack == 8 && elempack == 4) + { + pipeline = pipeline_concat_pack8to4[b % 2]; + } + else if (bottom_blob.elempack == 8 && elempack == 1) + { + pipeline = pipeline_concat_pack8to1[b % 2]; + } + + cmd.record_pipeline(pipeline, bindings, constants, bottom_blob); + + woffset += bottom_blob.w * bottom_blob.elempack / elempack; + } + + // packing + if (elempack < out_elempack) + { + vkdev->convert_packing(top_blob_unpacked, top_blob, out_elempack, cmd, opt); + } + + return 0; + } + + if (dims == 2 && axis == 0) + { + // concat image + int w = bottom_blobs[0].w; + + // total height + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + int top_h = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const VkTensor& bottom_blob = bottom_blobs[b]; + elemsize = std::min(elemsize, bottom_blob.elemsize); + elempack = std::min(elempack, bottom_blob.elempack); + top_h += bottom_blob.h * bottom_blob.elempack; + } + + int out_elempack = opt.use_shader_pack8 && top_h % 8 == 0 ? 8 : top_h % 4 == 0 ? 
4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (opt.use_fp16_packed && !opt.use_fp16_storage) + { + if (out_elempack == 8) out_elemsize = 8 * 2u; + if (out_elempack == 4) out_elemsize = 4 * 2u; + if (out_elempack == 1) out_elemsize = 4u; + } + + VkTensor& top_blob = top_blobs[0]; + top_blob.create(w, top_h / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + VkTensor top_blob_unpacked = top_blob; + if (elempack < out_elempack) + { + top_blob_unpacked.create(w, top_h / elempack, elemsize, elempack, opt.workspace_vkallocator); + if (top_blob_unpacked.empty()) + return -100; + } + + int hoffset = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const VkTensor& bottom_blob = bottom_blobs[b]; + + std::vector bindings(2); + bindings[0] = bottom_blob; + bindings[1] = top_blob_unpacked; + + std::vector constants(11); + constants[0].i = bottom_blob.dims; + constants[1].i = bottom_blob.w; + constants[2].i = bottom_blob.h; + constants[3].i = bottom_blob.c; + constants[4].i = bottom_blob.cstep; + constants[5].i = top_blob_unpacked.dims; + constants[6].i = top_blob_unpacked.w; + constants[7].i = top_blob_unpacked.h; + constants[8].i = top_blob_unpacked.c; + constants[9].i = top_blob_unpacked.cstep; + constants[10].i = hoffset; + + const Pipeline* pipeline = 0; + if (bottom_blob.elempack == 1 && elempack == 1) + { + pipeline = pipeline_concat[b % 2]; + } + else if (bottom_blob.elempack == 4 && elempack == 4) + { + pipeline = pipeline_concat_pack4[b % 2]; + } + else if (bottom_blob.elempack == 4 && elempack == 1) + { + pipeline = pipeline_concat_pack4to1[b % 2]; + } + else if (bottom_blob.elempack == 8 && elempack == 8) + { + pipeline = pipeline_concat_pack8[b % 2]; + } + else if (bottom_blob.elempack == 8 && elempack == 4) + { + pipeline = pipeline_concat_pack8to4[b % 2]; + } + else if (bottom_blob.elempack == 8 && elempack == 1) + { + pipeline = pipeline_concat_pack8to1[b % 2]; + } + + cmd.record_pipeline(pipeline, bindings, constants, bottom_blob); + + hoffset += bottom_blob.h * bottom_blob.elempack / elempack; + } + + // packing + if (elempack < out_elempack) + { + vkdev->convert_packing(top_blob_unpacked, top_blob, out_elempack, cmd, opt); + } + + return 0; + } + + if (dims == 2 && axis == 1) + { + // interleave image row + int h = bottom_blobs[0].h; + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + + // total width + int top_w = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const VkTensor& bottom_blob = bottom_blobs[b]; + top_w += bottom_blob.w; + } + + VkTensor& top_blob = top_blobs[0]; + top_blob.create(top_w, h, elemsize, elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + int woffset = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const VkTensor& bottom_blob = bottom_blobs[b]; + + std::vector bindings(2); + bindings[0] = bottom_blob; + bindings[1] = top_blob; + + std::vector constants(11); + constants[0].i = bottom_blob.dims; + constants[1].i = bottom_blob.w; + constants[2].i = bottom_blob.h; + constants[3].i = bottom_blob.c; + constants[4].i = bottom_blob.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = top_blob.cstep; + constants[10].i = woffset; + + const Pipeline* pipeline = elempack == 8 ? pipeline_concat_pack8[b % 2] + : elempack == 4 ? 
pipeline_concat_pack4[b % 2] + : pipeline_concat[b % 2]; + + cmd.record_pipeline(pipeline, bindings, constants, bottom_blob); + + woffset += bottom_blob.w; + } + + return 0; + } + + if (dims == 3 && axis == 0) + { + // concat dim + int w = bottom_blobs[0].w; + int h = bottom_blobs[0].h; + + // total channels + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + int top_channels = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const VkTensor& bottom_blob = bottom_blobs[b]; + elemsize = std::min(elemsize, bottom_blob.elemsize); + elempack = std::min(elempack, bottom_blob.elempack); + top_channels += bottom_blob.c * bottom_blob.elempack; + } + + int out_elempack = opt.use_shader_pack8 && top_channels % 8 == 0 ? 8 : top_channels % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (opt.use_fp16_packed && !opt.use_fp16_storage) + { + if (out_elempack == 8) out_elemsize = 8 * 2u; + if (out_elempack == 4) out_elemsize = 4 * 2u; + if (out_elempack == 1) out_elemsize = 4u; + } + + VkTensor& top_blob = top_blobs[0]; + top_blob.create(w, h, top_channels / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + VkTensor top_blob_unpacked = top_blob; + if (elempack < out_elempack) + { + top_blob_unpacked.create(w, h, top_channels / elempack, elemsize, elempack, opt.workspace_vkallocator); + if (top_blob_unpacked.empty()) + return -100; + } + + int coffset = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const VkTensor& bottom_blob = bottom_blobs[b]; + + std::vector bindings(2); + bindings[0] = bottom_blob; + bindings[1] = top_blob_unpacked; + + std::vector constants(11); + constants[0].i = bottom_blob.dims; + constants[1].i = bottom_blob.w; + constants[2].i = bottom_blob.h; + constants[3].i = bottom_blob.c; + constants[4].i = bottom_blob.cstep; + constants[5].i = top_blob_unpacked.dims; + constants[6].i = top_blob_unpacked.w; + constants[7].i = top_blob_unpacked.h; + constants[8].i = top_blob_unpacked.c; + constants[9].i = top_blob_unpacked.cstep; + constants[10].i = coffset; + + const Pipeline* pipeline = 0; + if (bottom_blob.elempack == 1 && elempack == 1) + { + pipeline = pipeline_concat[b % 2]; + } + else if (bottom_blob.elempack == 4 && elempack == 4) + { + pipeline = pipeline_concat_pack4[b % 2]; + } + else if (bottom_blob.elempack == 4 && elempack == 1) + { + pipeline = pipeline_concat_pack4to1[b % 2]; + } + else if (bottom_blob.elempack == 8 && elempack == 8) + { + pipeline = pipeline_concat_pack8[b % 2]; + } + else if (bottom_blob.elempack == 8 && elempack == 4) + { + pipeline = pipeline_concat_pack8to4[b % 2]; + } + else if (bottom_blob.elempack == 8 && elempack == 1) + { + pipeline = pipeline_concat_pack8to1[b % 2]; + } + + cmd.record_pipeline(pipeline, bindings, constants, bottom_blob); + + coffset += bottom_blob.c * bottom_blob.elempack / elempack; + } + + // packing + if (elempack < out_elempack) + { + vkdev->convert_packing(top_blob_unpacked, top_blob, out_elempack, cmd, opt); + } + + return 0; + } + + if (dims == 3 && axis == 1) + { + // interleave dim height + int w = bottom_blobs[0].w; + int channels = bottom_blobs[0].c; + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + + // total height + int top_h = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const VkTensor& bottom_blob = bottom_blobs[b]; + top_h += bottom_blob.h; + } + + VkTensor& top_blob = top_blobs[0]; + top_blob.create(w, 
top_h, channels, elemsize, elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + int hoffset = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const VkTensor& bottom_blob = bottom_blobs[b]; + + std::vector bindings(2); + bindings[0] = bottom_blob; + bindings[1] = top_blob; + + std::vector constants(11); + constants[0].i = bottom_blob.dims; + constants[1].i = bottom_blob.w; + constants[2].i = bottom_blob.h; + constants[3].i = bottom_blob.c; + constants[4].i = bottom_blob.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = top_blob.cstep; + constants[10].i = hoffset; + + const Pipeline* pipeline = elempack == 8 ? pipeline_concat_pack8[b % 2] + : elempack == 4 ? pipeline_concat_pack4[b % 2] + : pipeline_concat[b % 2]; + + cmd.record_pipeline(pipeline, bindings, constants, bottom_blob); + + hoffset += bottom_blob.h; + } + + return 0; + } + + if (dims == 3 && axis == 2) + { + // interleave dim width + int h = bottom_blobs[0].h; + int channels = bottom_blobs[0].c; + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + + // total height + int top_w = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const VkTensor& bottom_blob = bottom_blobs[b]; + top_w += bottom_blob.w; + } + + VkTensor& top_blob = top_blobs[0]; + top_blob.create(top_w, h, channels, elemsize, elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + int woffset = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const VkTensor& bottom_blob = bottom_blobs[b]; + + std::vector bindings(2); + bindings[0] = bottom_blob; + bindings[1] = top_blob; + + std::vector constants(11); + constants[0].i = bottom_blob.dims; + constants[1].i = bottom_blob.w; + constants[2].i = bottom_blob.h; + constants[3].i = bottom_blob.c; + constants[4].i = bottom_blob.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = top_blob.cstep; + constants[10].i = woffset; + + const Pipeline* pipeline = elempack == 8 ? pipeline_concat_pack8[b % 2] + : elempack == 4 ? pipeline_concat_pack4[b % 2] + : pipeline_concat[b % 2]; + + cmd.record_pipeline(pipeline, bindings, constants, bottom_blob); + + woffset += bottom_blob.w; + } + + return 0; + } + + return 0; +} + +} // namespace TEngine \ No newline at end of file diff --git a/source/device/vulkan/layer/concat_vulkan.hpp b/source/device/vulkan/layer/concat_vulkan.hpp new file mode 100644 index 000000000..6476fc997 --- /dev/null +++ b/source/device/vulkan/layer/concat_vulkan.hpp @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#ifndef LAYER_CONCAT_HPP +#define LAYER_CONCAT_HPP + +#include "../vulkan_layer.hpp" +#include "../vulkan_command.hpp" + +#include "concat_param.h" + +namespace TEngine{ + +class Concat_vulkan : public Layer +{ +public: + Concat_vulkan(); + Concat_vulkan(ir_graph_t* graph, ir_node_t* ir_node); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + virtual int record_pipeline(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const; + +public: + Pipeline* pipeline_concat[2]; + Pipeline* pipeline_concat_pack4[2]; + Pipeline* pipeline_concat_pack4to1[2]; + Pipeline* pipeline_concat_pack8[2]; + Pipeline* pipeline_concat_pack8to4[2]; + Pipeline* pipeline_concat_pack8to1[2]; + +public: + int input_c; + int input_h; + int input_w; + int output_c; + int output_h; + int output_w; + int axis; +}; + +} // namespace TEngine + +#endif \ No newline at end of file diff --git a/source/device/vulkan/layer/convolution_vulkan.cpp b/source/device/vulkan/layer/convolution_vulkan.cpp new file mode 100644 index 000000000..5f135feba --- /dev/null +++ b/source/device/vulkan/layer/convolution_vulkan.cpp @@ -0,0 +1,616 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. 
You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#include "convolution_vulkan.hpp" +#include "../layer_shader_type.h" + +namespace TEngine { + +Convolution_vulkan::Convolution_vulkan() +{ + support_vulkan = true; + pipeline_convolution = 0; +} + +Convolution_vulkan::Convolution_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) +{ + support_vulkan = true; + padding = 0; + innerproduct = 0; + + pipeline_convolution = 0; + pipeline_convolution_pack4 = 0; + pipeline_convolution_pack8 = 0; + pipeline_convolution_pack1to4 = 0; + pipeline_convolution_pack4to1 = 0; + pipeline_convolution_pack1to8 = 0; + pipeline_convolution_pack4to8 = 0; + pipeline_convolution_pack8to1 = 0; + pipeline_convolution_pack8to4 = 0; + pipeline_convolution_1x1s1d1 = 0; + pipeline_convolution_pack4_1x1s1d1 = 0; + pipeline_convolution_pack8_1x1s1d1 = 0; + + graph = ir_graph; + node = ir_node; + + struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[0]); + std::string name = input->name; + bottoms.push_back(name); + + // Tensor* output_tensor = t_node->GetOutputTensor(0); + struct tensor *output = get_ir_graph_tensor(graph, node->output_tensors[0]); + name = output->name; + tops.push_back(name); + + // Convolution* conv_op = dynamic_cast(node->GetOp()); + // ConvParam* param = conv_op->GetParam(); + struct conv_param *param = (struct conv_param *)ir_node->op.param_mem; + + group = param->group; + input_c = input->dims[1]; // param->input_channel; + input_h = input->dims[2]; + input_w = input->dims[3]; + pad_w0 = param->pad_w0; // left padding columns + pad_w1 = param->pad_w1; // right padding columns + pad_h0 = param->pad_h0; // top padding rows + pad_h1 = param->pad_h1; // bottom padding rows + stride_w = param->stride_w; + stride_h = param->stride_h; + dilation_w = param->dilation_w; + dilation_h = param->dilation_h; + kernel_w = param->kernel_w; + kernel_h = param->kernel_h; + activation = param->activation == 0 ? 1 : -1; + output_c = output->dims[1]; // param->output_channel; + output_h = output->dims[2]; + output_w = output->dims[3]; + struct tensor *weight = get_ir_graph_tensor(graph, node->input_tensors[1]); + weight_data_size = weight->elem_num; +} + +int Convolution_vulkan::create_pipeline(const Option& _opt) +{ + Option opt = _opt; + + // const Tshape& shape = bottom_shapes.empty() ? Tshape() : bottom_shapes[0]; + // const Tshape& out_shape = top_shapes.empty() ? 
Tshape() : top_shapes[0]; + + // const int maxk = kernel_w * kernel_h; + // // int num_input = weight_data_size / maxk / num_output; + // int num_output = output_c; + // int num_input = input_c; + const Tshape& shape = Tshape(input_w, input_h, input_c); + const Tshape& out_shape = Tshape(output_w, output_h, output_c); + const int maxk = kernel_w * kernel_h; + int num_output = output_c; + int num_input = input_c; + int pad_left = pad_w0; + int pad_right = pad_w1; + int pad_top = pad_h0; + int pad_bottom = pad_h1; + + // TLOG_INFO("%d %d %d -> %d %d %d\n", shape.c, shape.h, shape.w, out_shape.c, out_shape.h, out_shape.w); + // fc + // if (kernel_w == 1 && kernel_h == 1) + // { + // innerproduct = new InnerProduct_vulkan(graph, node); + // innerproduct->vkdev = vkdev; + + // innerproduct->create_pipeline(opt); + + // if (shape.dims == 1 && shape.w == num_input) + // { + // return 0; + // } + // } + + Tshape shape_bordered = Tshape(); + + if (shape.dims != 0) + { + if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0) + { + shape_bordered = Tshape(shape.w + pad_left + pad_right, shape.h + pad_top + pad_bottom, shape.c); + } + else if ((pad_left == -233 && pad_right == -233 && pad_top == -233 && pad_bottom == -233) + || (pad_left == -234 && pad_right == -234 && pad_top == -234 && pad_bottom == -234)) + { + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + int wpad = kernel_extent_w + (shape.w - 1) / stride_w * stride_w - shape.w; + int hpad = kernel_extent_h + (shape.h - 1) / stride_h * stride_h - shape.h; + if (wpad > 0 || hpad > 0) + { + shape_bordered = Tshape(shape.w + wpad, shape.h + hpad, shape.c); + } + } + else + { + shape_bordered = shape; + } + } + + int elempack = opt.use_shader_pack8 && num_input % 8 == 0 ? 8 : num_input % 4 == 0 ? 4 : 1; + int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; + + size_t elemsize; + size_t out_elemsize; + if (opt.use_fp16_storage) + { + elemsize = elempack * 2u; + out_elemsize = out_elempack * 2u; + } + else if (opt.use_fp16_packed) + { + elemsize = elempack == 1 ? 4u : elempack * 2u; + out_elemsize = out_elempack == 1 ? 4u : out_elempack * 2u; + } + else + { + elemsize = elempack * 4u; + out_elemsize = out_elempack * 4u; + } + + // TLOG_INFO("elemsize out_elemsize:%d %d\n", elemsize, out_elemsize); + + Tshape shape_bordered_packed; + // if (shape_bordered.dims == 3) shape_bordered_packed = Mat(shape_bordered.w, shape_bordered.h, num_input / elempack, (void*)0, elemsize, elempack); + if (shape_bordered.dims == 3) shape_bordered_packed = Tshape(shape_bordered.w, shape_bordered.h, num_input / elempack); + + Tshape out_shape_packed; + // if (out_shape.dims == 3) out_shape_packed = Mat(out_shape.w, out_shape.h, num_output / out_elempack, (void*)0, out_elemsize, out_elempack); + if (out_shape.dims == 3) out_shape_packed = Tshape(out_shape.w, out_shape.h, num_output / out_elempack); + + bool is_conv1x1s1d1 = kernel_w == 1 && kernel_h == 1 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1; + // bool is_conv3x3s1d1 = kernel_w == 3 && kernel_h == 3 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1; + // bool is_conv1x1s1d1 = false; + bool is_conv3x3s1d1 = false; + + // if (is_conv3x3s1d1 && num_input >= 16 && num_output >= 16 && ((elempack == 4 && out_elempack == 4) || (elempack == 8 && out_elempack == 8))) + { + // TODO do nothing for wino fix me!!!!! 
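+        // With is_conv3x3s1d1 hard-coded to false above, this winograd23 branch is never taken;
+        // 3x3 stride-1 convolutions currently fall through to the generic pack1/pack4/pack8 pipelines created below.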
+ } + // else + { + support_image_storage = false; + opt.use_image_storage = false; + } + + { + padding = new Padding_vulkan(); + padding->vkdev = vkdev; + + padding->top = pad_h0; + padding->bottom = pad_h1; + padding->left = pad_w0; + padding->right = pad_w1; + padding->type = 0; + padding->value = 0; + + padding->input_w = input_w; + padding->input_h = input_h; + padding->input_c = input_c; + padding->output_w = input_w + pad_w0 + pad_w1; + padding->output_h = input_h + pad_h0 + pad_h1; + padding->output_c = input_c; + + padding->create_pipeline(opt); + } + + std::vector specializations(10 + 10); + specializations[0].i = kernel_w; // kernel_w; + specializations[1].i = kernel_h; // kernel_h + specializations[2].i = dilation_w; // dilation_w; + specializations[3].i = dilation_h; // dilation_h; + specializations[4].i = stride_w; // stride_w; + specializations[5].i = stride_h; // stride_h; + specializations[6].i = node->input_num>2 ? 1 : 0; // bias_term; + specializations[7].i = activation; // activation_type; + specializations[8].f = 0;//param->activation; // activation_params.w >= 1 ? activation_params[0] : 0.f; + specializations[9].f = 0;//param->activation; // activation_params.w == 2 ? activation_params[1] : 0.f; + specializations[10 + 0].i = 0;//3; // shape_bordered_packed.dims; + specializations[10 + 1].i = 0;//input_w + pad_w0 + pad_w1; // shape_bordered_packed.w; + specializations[10 + 2].i = 0;//input_h + pad_h0 + pad_h1; // shape_bordered_packed.h; + specializations[10 + 3].i = 0;//input_c; // shape_bordered_packed.c; + specializations[10 + 4].i = 0;//(input_w + pad_w0 + pad_w1) * (input_h + pad_h0 + pad_h1); // shape_bordered_packed.cstep; + specializations[10 + 5].i = 0; // out_shape_packed.dims; + specializations[10 + 6].i = 0;//output_w; // out_shape_packed.w; + specializations[10 + 7].i = 0;//output_h; // out_shape_packed.h; + specializations[10 + 8].i = 0;//output_c; // out_shape_packed.c; + specializations[10 + 9].i = 0;//output_w * output_h; // out_shape_packed.cstep; + + // TODO with local_size_xyz and shader_index options + + VkTensor local_size_xyz; + local_size_xyz.w = std::min(8, out_shape_packed.w); + local_size_xyz.h = std::min(8, out_shape_packed.h); + local_size_xyz.c = std::min(4, out_shape_packed.c); + + // TLOG_INFO("create pipeline elempack out_elempack:%d %d\n", elempack, out_elempack); + + + if (elempack == 1 && out_elempack == 1) + { + // TODO deal with conv1x1s1d1 + if (is_conv1x1s1d1) + { + pipeline_convolution_1x1s1d1 = new Pipeline(vkdev); + pipeline_convolution_1x1s1d1->set_local_size_xyz(8, 1, std::min(8, num_output)); + pipeline_convolution_1x1s1d1->create(LayerShaderType::convolution_1x1s1d1, opt, specializations); + } + else + { + // TLOG_INFO("create pipeline pack1to1\n"); + pipeline_convolution = new Pipeline(vkdev); + pipeline_convolution->set_optimal_local_size_xyz(local_size_xyz); + pipeline_convolution->create(LayerShaderType::convolution, opt, specializations); + } + } + + // pack4 + if (elempack == 4 && out_elempack == 4) + { + if (is_conv1x1s1d1) + { + pipeline_convolution_pack4_1x1s1d1 = new Pipeline(vkdev); + pipeline_convolution_pack4_1x1s1d1->set_local_size_xyz(8, 1, std::min(8, num_output / 4)); + pipeline_convolution_pack4_1x1s1d1->create(LayerShaderType::convolution_pack4_1x1s1d1, opt, specializations); + } + else if (is_conv3x3s1d1 && num_input >= 16 && num_output >= 16) + { + // winograd23 + } + else + { + pipeline_convolution_pack4 = new Pipeline(vkdev); + pipeline_convolution_pack4->set_optimal_local_size_xyz(local_size_xyz); + 
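+            // local_size_xyz was clamped above to at most 8x8x4 of the packed output shape and is
+            // passed to set_optimal_local_size_xyz() as a workload hint when picking the workgroup size.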
pipeline_convolution_pack4->create(LayerShaderType::convolution_pack4, opt, specializations); + } + } + + // pack1to4 + if (elempack == 1 && out_elempack == 4) + { + pipeline_convolution_pack1to4 = new Pipeline(vkdev); + pipeline_convolution_pack1to4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_convolution_pack1to4->create(LayerShaderType::convolution_pack1to4, opt, specializations); + } + + // pack4to1 + if (elempack == 4 && out_elempack == 1) + { + pipeline_convolution_pack4to1 = new Pipeline(vkdev); + pipeline_convolution_pack4to1->set_optimal_local_size_xyz(local_size_xyz); + pipeline_convolution_pack4to1->create(LayerShaderType::convolution_pack4to1, opt, specializations); + } + + // pack8 + if (elempack == 8 && out_elempack == 8) + { + if (is_conv1x1s1d1) + { + pipeline_convolution_pack8_1x1s1d1 = new Pipeline(vkdev); + pipeline_convolution_pack8_1x1s1d1->set_local_size_xyz(8, 1, std::min(8, num_output / 8)); + pipeline_convolution_pack8_1x1s1d1->create(LayerShaderType::convolution_pack8_1x1s1d1, opt, specializations); + } + else if (is_conv3x3s1d1 && num_input >= 16 && num_output >= 16) + { + // winograd23 + } + else + { + pipeline_convolution_pack8 = new Pipeline(vkdev); + pipeline_convolution_pack8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_convolution_pack8->create(LayerShaderType::convolution_pack8, opt, specializations); + } + } + + // pack1to8 + if (elempack == 1 && out_elempack == 8) + { + pipeline_convolution_pack1to8 = new Pipeline(vkdev); + pipeline_convolution_pack1to8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_convolution_pack1to8->create(LayerShaderType::convolution_pack1to8, opt, specializations); + } + + // pack4to8 + if (elempack == 4 && out_elempack == 8) + { + pipeline_convolution_pack4to8 = new Pipeline(vkdev); + pipeline_convolution_pack4to8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_convolution_pack4to8->create(LayerShaderType::convolution_pack4to8, opt, specializations); + } + + // pack8to4 + if (elempack == 8 && out_elempack == 4) + { + pipeline_convolution_pack8to4 = new Pipeline(vkdev); + pipeline_convolution_pack8to4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_convolution_pack8to4->create(LayerShaderType::convolution_pack8to4, opt, specializations); + } + + // pack8to1 + if (elempack == 8 && out_elempack == 1) + { + pipeline_convolution_pack8to1 = new Pipeline(vkdev); + pipeline_convolution_pack8to1->set_optimal_local_size_xyz(local_size_xyz); + pipeline_convolution_pack8to1->create(LayerShaderType::convolution_pack8to1, opt, specializations); + } + + return 0; +} + +int Convolution_vulkan::destroy_pipeline(const Option& /*opt*/) +{ + return 0; +} + +int Convolution_vulkan::upload_model(VkTransfer& cmd, const Option& opt) +{ + tensor* weight_tensor = get_ir_graph_tensor(graph, node->input_tensors[1]); + + // Tensor weight_data = Tensor(weight_tensor->elem_num, 1, 1, weight_tensor->data); + Tensor weight_data = Tensor(weight_tensor->elem_num, weight_tensor->data); + + // if (padding) + // { + // padding->upload_model(cmd, opt); + // } + + const int maxk = kernel_w * kernel_h; + int num_output = output_c; + int num_input = input_c; //weight_data_size / maxk / num_output; + + int elempack = opt.use_shader_pack8 && num_input % 8 == 0 ? 8 : num_input % 4 == 0 ? 4 : 1; + // int elempack = 1; + int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 
4 : 1; + + // TLOG_INFO("conv upload model pack:%d %d\n", elempack, out_elempack); + + Tensor weight_data_packed; + { + Tensor weight_data_r2 = weight_data.reshape(maxk, num_input, num_output); + + weight_data_packed.create(maxk, num_input/elempack, num_output/out_elempack, (size_t)4*elempack*out_elempack, elempack*out_elempack); + for (int q=0; q+(out_elempack-1)input_tensors[1]); + // cmd.record_upload(weight_tensor, weight_data_gpu, opt); + if (support_image_storage && opt.use_image_storage) + { + TLOG_INFO("not record_upload weight_data_gpu_image, fix me\n"); + // cmd.record_upload(weight_data_packed, weight_data_gpu_image, opt); + } + else + { + cmd.record_upload(weight_data_packed, weight_data_gpu, opt); + } + + // upload bias data + if(node->input_num > 2) + { + tensor* bias_tensor = get_ir_graph_tensor(graph, node->input_tensors[2]); + Tensor bias_data = Tensor(bias_tensor->elem_num, bias_tensor->data); + + // TLOG_INFO("bias data shape:%d %d %d\n", bias_data.c, bias_data.h, bias_data.w); + + Tensor bias_data_packed; + convert_packing(bias_data, bias_data_packed, out_elempack); + + if (support_image_storage && opt.use_image_storage) + { + // cmd.record_upload(bias_data_packed, bias_data_gpu_image, opt); + } + else + { + cmd.record_upload(bias_data_packed, bias_data_gpu, opt); + } + + } + + // if (innerproduct) + // { + // innerproduct->upload_model(cmd, opt); + // } + + return 0; +} + +int Convolution_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const +{ + // TLOG_INFO("in_c in_h in_w k_h k_w s p dilation group:%d %d %d %d %d %d %d %d %d\n", input_c, input_h, input_w, kernel_h, kernel_w, stride_h, pad_w0, dilation_h, group); + VkTensor bottom_blob_dim3 = bottom_blob; + if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1) + { + bottom_blob_dim3.dims = 3; + bottom_blob_dim3.c = bottom_blob_dim3.w; + bottom_blob_dim3.w = 1; + bottom_blob_dim3.cstep = 1; + } + + int w = bottom_blob_dim3.w; + int h = bottom_blob_dim3.h; + int channels = bottom_blob_dim3.c; + size_t elemsize = bottom_blob_dim3.elemsize; + int elempack = bottom_blob_dim3.elempack; + // TLOG_INFO("botom shape:%d %d %d %d %d %d %d\n", bottom_blob.dims, bottom_blob.c, bottom_blob.h, bottom_blob.w, bottom_blob.elemsize, bottom_blob.elempack, bottom_blob.cstep); + + int out_elempack = opt.use_shader_pack8 && output_c % 8 == 0 ? 8 : output_c % 4 == 0 ? 
4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + VkTensor bottom_blob_bordered = bottom_blob_dim3; + if (pad_h0 > 0 || pad_h1 > 0 || pad_w0 > 0 || pad_w1 > 0) + { + Option opt_pad = opt; + opt_pad.blob_vkallocator = opt.workspace_vkallocator; + + padding->record_pipeline(bottom_blob, bottom_blob_bordered, cmd, opt_pad); + } + + // TLOG_INFO("forward convolution, w h c elemsize, elempack:%d %d %d %d %d\n", output_w, output_h, channels, elemsize, elempack); + top_blob.create(output_w, output_h, output_c / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + + // TLOG_INFO("convolution bottom shape:%d %d %d %d %d, top shape:%d %d %d %d %d\n", bottom_blob_bordered.dims, bottom_blob_bordered.w, bottom_blob_bordered.h, bottom_blob_bordered.c, bottom_blob_bordered.cstep, top_blob.dims, top_blob.w, top_blob.h, top_blob.c, top_blob.cstep); + + std::vector bindings(4); + bindings[0] = bottom_blob_bordered; + bindings[1] = top_blob; + bindings[2] = weight_data_gpu; + bindings[3] = bias_data_gpu; + + std::vector constants(10); + constants[0].i = bottom_blob_bordered.dims; + constants[1].i = bottom_blob_bordered.w; + constants[2].i = bottom_blob_bordered.h; + constants[3].i = bottom_blob_bordered.c; + constants[4].i = bottom_blob_bordered.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = top_blob.cstep; + + // record + if (elempack == 1 && out_elempack == 1 && kernel_w == 1 && kernel_h == 1 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1) + { + VkTensor dispatcher; + dispatcher.w = (top_blob.w * top_blob.h + 3) / 4; + dispatcher.h = 1; + dispatcher.c = top_blob.c; + + cmd.record_pipeline(pipeline_convolution_1x1s1d1, bindings, constants, dispatcher); + } + else if (elempack == 4 && out_elempack == 4 && kernel_w == 1 && kernel_h == 1 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1) + { + VkTensor dispatcher; + dispatcher.w = (top_blob.w * top_blob.h + 3) / 4; + dispatcher.h = 1; + dispatcher.c = top_blob.c; + + cmd.record_pipeline(pipeline_convolution_pack4_1x1s1d1, bindings, constants, dispatcher); + } + else if (elempack == 8 && out_elempack == 8 && kernel_w == 1 && kernel_h == 1 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1) + { + VkTensor dispatcher; + dispatcher.w = (top_blob.w * top_blob.h + 3) / 4; + dispatcher.h = 1; + dispatcher.c = top_blob.c; + + cmd.record_pipeline(pipeline_convolution_pack8_1x1s1d1, bindings, constants, dispatcher); + } + else + { + const Pipeline* pipeline = 0; + if (elempack == 1 && out_elempack == 1) + { + pipeline = pipeline_convolution; + } + else if (elempack == 4 && out_elempack == 4) + { + // TLOG_INFO("pipeline is pipeline_convolution_pack4\n"); + pipeline = pipeline_convolution_pack4; + } + else if (elempack == 1 && out_elempack == 4) + { + pipeline = pipeline_convolution_pack1to4; + } + else if (elempack == 4 && out_elempack == 1) + { + pipeline = pipeline_convolution_pack4to1; + } + else if (elempack == 8 && out_elempack == 8) + { + pipeline = pipeline_convolution_pack8; + } + else if (elempack == 1 && out_elempack == 8) + { + pipeline = pipeline_convolution_pack1to8; + } + else if (elempack == 4 && out_elempack == 8) + { + pipeline = pipeline_convolution_pack4to8; + } + else if (elempack == 8 && out_elempack == 4) + { + pipeline = pipeline_convolution_pack8to4; + } + else if (elempack == 8 && out_elempack == 1) + { + pipeline = 
pipeline_convolution_pack8to1; + } + + cmd.record_pipeline(pipeline, bindings, constants, top_blob); + } + + // TLOG_INFO("top shape:%d %d %d\n", top_blob.c, top_blob.h, top_blob.w); + // cmd.record_pipeline(pipeline_convolution, bindings, constants, top_blob); + // TLOG_INFO("run record convolution\n"); + return 0; +} + +} // namespace TEngine \ No newline at end of file diff --git a/source/device/vulkan/layer/convolution_vulkan.hpp b/source/device/vulkan/layer/convolution_vulkan.hpp new file mode 100644 index 000000000..a1e7c1ad8 --- /dev/null +++ b/source/device/vulkan/layer/convolution_vulkan.hpp @@ -0,0 +1,115 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. 
You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#ifndef LAYER_CONVOLUTION_HPP +#define LAYER_CONVOLUTION_HPP + +#include "padding_vulkan.hpp" +#include "innerproduct_vulkan.hpp" +#include "../vulkan_layer.hpp" +#include "../vulkan_command.hpp" + +#include "convolution_param.h" + +namespace TEngine { + +class Convolution_vulkan : public Layer +{ +public: + Convolution_vulkan(); + // Convolution_vulkan(ir_node* node); + Convolution_vulkan(ir_graph_t* graph, ir_node_t* node); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + virtual int upload_model(VkTransfer& cmd, const Option& opt); + + // virtual int record_pipeline(VkCompute& cmd, const Option& opt) const; + virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const; + + +public: + int group; + int input_c; + int input_h; + int input_w; + int pad_w0; // left padding columns + int pad_w1; // right padding columns + int pad_h0; // top padding rows + int pad_h1; // bottom padding rows + int stride_h; + int stride_w; + int dilation_h; + int dilation_w; + int kernel_h; + int kernel_w; + int activation; + int output_c; + int output_h; + int output_w; + + int weight_data_size; + +public: + Padding_vulkan* padding; + InnerProduct_vulkan* innerproduct; + + VkTensor weight_data_gpu; + VkImageTensor weight_data_gpu_image; + VkTensor bias_data_gpu; + + Pipeline* pipeline_convolution; + Pipeline* pipeline_convolution_pack4; + Pipeline* pipeline_convolution_pack8; + Pipeline* pipeline_convolution_pack1to4; + Pipeline* pipeline_convolution_pack4to1; + Pipeline* pipeline_convolution_pack1to8; + Pipeline* pipeline_convolution_pack4to8; + Pipeline* pipeline_convolution_pack8to1; + Pipeline* pipeline_convolution_pack8to4; + + Pipeline* pipeline_convolution_1x1s1d1; + Pipeline* pipeline_convolution_pack4_1x1s1d1; + Pipeline* pipeline_convolution_pack8_1x1s1d1; +}; + +} // namespace TEngine + + +#endif diff --git a/source/device/vulkan/layer/convolutiondepthwise_vulkan.cpp b/source/device/vulkan/layer/convolutiondepthwise_vulkan.cpp new file mode 100644 index 000000000..bc950cf38 --- /dev/null +++ b/source/device/vulkan/layer/convolutiondepthwise_vulkan.cpp @@ -0,0 +1,301 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. 
+ * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#include "convolutiondepthwise_vulkan.hpp" +#include "../layer_shader_type.h" + +namespace TEngine { + + ConvolutionDepthWise_vulkan::ConvolutionDepthWise_vulkan() + { + support_vulkan = true; + pipeline_convolutiondepthwise = 0; + } + + ConvolutionDepthWise_vulkan::ConvolutionDepthWise_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) + { + support_vulkan = true; + + padding = 0; + + pipeline_convolutiondepthwise = 0; + pipeline_convolutiondepthwise_pack4 = 0; + pipeline_convolutiondepthwise_pack8 = 0; + graph = ir_graph; + node = ir_node; + + struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[0]); + std::string name = input->name; + bottoms.push_back(name); + + struct tensor *output = get_ir_graph_tensor(graph, node->output_tensors[0]); + name = output->name; + tops.push_back(name); + + struct conv_param *param = (struct conv_param *)ir_node->op.param_mem; + + group = param->group; + input_c = input->dims[1]; // param->input_channel; + input_h = input->dims[2]; + input_w = input->dims[3]; + pad_w0 = param->pad_w0; // left padding columns + pad_w1 = param->pad_w1; // right padding columns + pad_h0 = param->pad_h0; // top padding rows + pad_h1 = param->pad_h1; // bottom padding rows + stride_w = param->stride_w; + stride_h = param->stride_h; + dilation_w = param->dilation_w; + dilation_h = param->dilation_h; + kernel_w = param->kernel_w; + kernel_h = param->kernel_h; + output_c = output->dims[1]; // param->output_channel; + output_h = output->dims[2]; + output_w = output->dims[3]; + } + +int ConvolutionDepthWise_vulkan::create_pipeline(const Option& _opt) +{ + Option opt = _opt; + + { + padding = new Padding_vulkan(); + padding->vkdev = vkdev; + + padding->top = pad_h0; + padding->bottom = pad_h1; + padding->left = pad_w0; + padding->right = pad_w1; + padding->type = 0; + padding->value = 0; + + padding->input_w = input_w; + padding->input_h = input_h; + padding->input_c = input_c; + padding->output_w = input_w + pad_w0 + pad_w1; + padding->output_h = input_h + pad_h0 + pad_h1; + padding->output_c = input_c; + + padding->create_pipeline(opt); + } + + + // const int maxk = kernel_w * kernel_h; + int channels = input_c; // (weight_data_size / group) / maxk / (num_output / group) * group; + int num_output = output_c; + + int elempack = opt.use_shader_pack8 && channels % 8 == 0 ? 8 : channels % 4 == 0 ? 4 : 1; + int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; + + size_t elemsize; + size_t out_elemsize; + if (opt.use_fp16_storage) + { + elemsize = elempack * 2u; + out_elemsize = out_elempack * 2u; + } + else if (opt.use_fp16_packed) + { + elemsize = elempack == 1 ? 4u : elempack * 2u; + out_elemsize = out_elempack == 1 ? 4u : out_elempack * 2u; + } + else + { + elemsize = elempack * 4u; + out_elemsize = out_elempack * 4u; + } + + std::vector specializations(11 + 10); + specializations[0].i = kernel_w; // kernel_w; + specializations[1].i = kernel_h; // kernel_h + specializations[2].i = dilation_w; // dilation_w; + specializations[3].i = dilation_h; // dilation_h; + specializations[4].i = stride_w; // stride_w; + specializations[5].i = stride_h; // stride_h; + specializations[6].i = node->input_num >2 ? 
1 : 0; // bias_term; + specializations[7].i = group; + specializations[8].i = 1;//param->activation; // activation_type; + specializations[9].f = 0;//param->activation; // activation_params.w >= 1 ? activation_params[0] : 0.f; + specializations[10].f = 0;//param->activation; // activation_params.w == 2 ? activation_params[1] : 0.f; + specializations[11 + 0].i = 0; // 3; // shape_bordered_packed.dims; + specializations[11 + 1].i = 0; // input_w + pad_w0 + pad_w1; // shape_bordered_packed.w; + specializations[11 + 2].i = 0; // input_h + pad_h0 + pad_h1; // shape_bordered_packed.h; + specializations[11 + 3].i = 0; // input_c; // shape_bordered_packed.c; + specializations[11 + 4].i = 0; // (input_w + pad_w0 + pad_w1) * (input_h + pad_h0 + pad_h1); // shape_bordered_packed.cstep; + specializations[11 + 5].i = 0; // 3; // out_shape_packed.dims; + specializations[11 + 6].i = 0; // output_w; // out_shape_packed.w; + specializations[11 + 7].i = 0; // output_h; // out_shape_packed.h; + specializations[11 + 8].i = 0; // output_c; // out_shape_packed.c; + specializations[11 + 9].i = 0; // output_w * output_h; // out_shape_packed.cstep; + + VkTensor local_size_xyz; + local_size_xyz.w = std::min(4, output_w); + local_size_xyz.h = std::min(4, output_h); + local_size_xyz.c = std::min(4, output_c); + + // pack1 + if (elempack == 1) + { + pipeline_convolutiondepthwise = new Pipeline(vkdev); + pipeline_convolutiondepthwise->set_optimal_local_size_xyz(local_size_xyz); + pipeline_convolutiondepthwise->create(LayerShaderType::convolutiondepthwise, opt, specializations); + } + + // pack4 + if (elempack == 4) + { + pipeline_convolutiondepthwise_pack4 = new Pipeline(vkdev); + pipeline_convolutiondepthwise_pack4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_convolutiondepthwise_pack4->create(LayerShaderType::convolutiondepthwise_pack4, opt, specializations); + } + + // pack8 + if (elempack == 8) + { + pipeline_convolutiondepthwise_pack8 = new Pipeline(vkdev); + pipeline_convolutiondepthwise_pack8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_convolutiondepthwise_pack8->create(LayerShaderType::convolutiondepthwise_pack8, opt, specializations); + } + + return 0; +} + +int ConvolutionDepthWise_vulkan::destroy_pipeline(const Option& opt) +{ + if (padding) + { + padding->destroy_pipeline(opt); + delete padding; + padding = 0; + } + + delete pipeline_convolutiondepthwise; + pipeline_convolutiondepthwise = 0; + + delete pipeline_convolutiondepthwise_pack4; + pipeline_convolutiondepthwise_pack4 = 0; + + delete pipeline_convolutiondepthwise_pack8; + pipeline_convolutiondepthwise_pack8 = 0; + return 0; +} + +int ConvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd, const Option& opt) +{ + // upload kernel data + const int maxk = kernel_w * kernel_h; + int channels = input_c; // (weight_data_size / group) / maxk / (num_output / group) * group; + int num_output = output_c; + + int elempack = opt.use_shader_pack8 && channels % 8 == 0 ? 8 : channels % 4 == 0 ? 4 : 1; + int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 
4 : 1; + + + tensor* weight_tensor = get_ir_graph_tensor(graph, node->input_tensors[1]); + Tensor weight_data = Tensor(weight_tensor->elem_num, weight_tensor->data); + + Tensor weight_data_packed; + Tensor weight_data_r2 = weight_data.reshape(maxk, group); + TEngine::convert_packing(weight_data_r2, weight_data_packed, elempack); + + cmd.record_upload(weight_data_packed, weight_data_gpu, opt); + + // upload bias data + if(node->input_num > 2) + { + tensor* bias_tensor = get_ir_graph_tensor(graph, node->input_tensors[2]); + Tensor bias_data = Tensor(bias_tensor->elem_num, bias_tensor->data); + Tensor bias_data_packed; + convert_packing(bias_data, bias_data_packed, out_elempack); + cmd.record_upload(bias_data_packed, bias_data_gpu, opt); + } + return 0; +} + +int ConvolutionDepthWise_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + + VkTensor bottom_blob_bordered = bottom_blob; + if (pad_h0 > 0 || pad_h1 > 0 || pad_w0 > 0 || pad_w1 > 0) + { + // bottom_blob_bordered.w = bottom_blob_bordered.w + pad_w0 + pad_w1; + // bottom_blob_bordered.h = bottom_blob_bordered.h + pad_h0 + pad_h1; + // bottom_blob_bordered.cstep = bottom_blob_bordered.w * bottom_blob_bordered.h; + Option opt_pad = opt; + opt_pad.blob_vkallocator = opt.workspace_vkallocator; + + padding->record_pipeline(bottom_blob, bottom_blob_bordered, cmd, opt_pad); + } + + top_blob.create(output_w, output_h, output_c/elempack, elemsize, elempack, opt.blob_vkallocator); + + std::vector bindings(4); + bindings[0] = bottom_blob_bordered; + bindings[1] = top_blob; + bindings[2] = weight_data_gpu; + bindings[3] = bias_data_gpu; + + std::vector constants(10); + constants[0].i = bottom_blob_bordered.dims; + constants[1].i = bottom_blob_bordered.w; + constants[2].i = bottom_blob_bordered.h; + constants[3].i = bottom_blob_bordered.c; + constants[4].i = bottom_blob_bordered.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = top_blob.cstep; + + // printf("top shape:%d %d %d\n", top_blob.c, top_blob.h, top_blob.w); + const Pipeline* pipeline = elempack == 8 ? pipeline_convolutiondepthwise_pack8 + : elempack == 4 ? pipeline_convolutiondepthwise_pack4 + : pipeline_convolutiondepthwise; + + cmd.record_pipeline(pipeline, bindings, constants, top_blob); + + return 0; +} + +} \ No newline at end of file diff --git a/source/device/vulkan/layer/convolutiondepthwise_vulkan.hpp b/source/device/vulkan/layer/convolutiondepthwise_vulkan.hpp new file mode 100644 index 000000000..05f78f22c --- /dev/null +++ b/source/device/vulkan/layer/convolutiondepthwise_vulkan.hpp @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#ifndef LAYER_CONVOLUTIONDEPTHWISE_HPP +#define LAYER_CONVOLUTIONDEPTHWISE_HPP + +#include "padding_vulkan.hpp" +#include "../vulkan_layer.hpp" +#include "../vulkan_command.hpp" + +#include "convolution_param.h" + +namespace TEngine { + +class ConvolutionDepthWise_vulkan : public Layer +{ +public: + ConvolutionDepthWise_vulkan(); + ConvolutionDepthWise_vulkan(ir_graph_t* ir_graph, ir_node_t* node); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + virtual int upload_model(VkTransfer& cmd, const Option& opt); + + virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const; + +public: + int group; + int input_c; + int input_h; + int input_w; + int pad_w0; // left padding columns + int pad_w1; // right padding columns + int pad_h0; // top padding rows + int pad_h1; // bottom padding rows + int stride_h; + int stride_w; + int dilation_h; + int dilation_w; + int kernel_h; + int kernel_w; + int output_c; + int output_h; + int output_w; + +public: + Padding_vulkan* padding; + + VkTensor weight_data_gpu; + VkTensor bias_data_gpu; + + Pipeline* pipeline_convolutiondepthwise; + Pipeline* pipeline_convolutiondepthwise_pack4; + Pipeline* pipeline_convolutiondepthwise_pack8; +}; + +} // namespace TEngine + + +#endif diff --git a/source/device/vulkan/layer/crop_vulkan.cpp b/source/device/vulkan/layer/crop_vulkan.cpp new file mode 100644 index 000000000..26f8768e8 --- /dev/null +++ b/source/device/vulkan/layer/crop_vulkan.cpp @@ -0,0 +1,607 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#include "crop_vulkan.hpp" +#include "../layer_shader_type.h" + +namespace TEngine { + +Crop_vulkan::Crop_vulkan() +{ + support_vulkan = true; + support_image_storage = false; + + pipeline_crop = 0; + pipeline_crop_pack4 = 0; + pipeline_crop_pack1to4 = 0; + pipeline_crop_pack4to1 = 0; + pipeline_crop_pack8 = 0; + pipeline_crop_pack1to8 = 0; + pipeline_crop_pack4to8 = 0; + pipeline_crop_pack8to4 = 0; + pipeline_crop_pack8to1 = 0; +} + +Crop_vulkan::Crop_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) +{ + support_vulkan = true; + support_image_storage = false; + + pipeline_crop = 0; + pipeline_crop_pack4 = 0; + pipeline_crop_pack1to4 = 0; + pipeline_crop_pack4to1 = 0; + pipeline_crop_pack8 = 0; + pipeline_crop_pack1to8 = 0; + pipeline_crop_pack4to8 = 0; + pipeline_crop_pack8to4 = 0; + pipeline_crop_pack8to1 = 0; + + graph = ir_graph; + node = ir_node; + + for(int i = 0; i < ir_node->input_num; i++) + { + struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[i]); + std::string name = input->name; + bottoms.push_back(name); + } + + for(int i = 0; i < ir_node->output_num; i++) + { + struct tensor *output = get_ir_graph_tensor(graph, node->input_tensors[i]); + std::string name = output->name; + tops.push_back(name); + } + + // params + struct tensor *input_tensor = get_ir_graph_tensor(graph, node->input_tensors[0]); + struct tensor *output_tensor = get_ir_graph_tensor(graph, node->output_tensors[0]); + input_c = input_tensor->dims[1]; // param->input_channel; + input_h = input_tensor->dims[2]; + input_w = input_tensor->dims[3]; + output_c = output_tensor->dims[1]; // param->output_channel; + output_h = output_tensor->dims[2]; + output_w = output_tensor->dims[3]; + + struct crop_param *param = (struct crop_param *)ir_node->op.param_mem; + + int num_args = param->num_args; + int offset_c = 0; // param->offset_c; + int offset_h = 0; // param->offset_h; + int offset_w = 0; // param->offset_w; + int crop_h = param->crop_h; + int crop_w = param->crop_w; + int center_crop = param->center_crop; + int axis = param->axis; + int flag = param->flag; +} + +int Crop_vulkan::create_pipeline(const Option& _opt) +{ + Option opt = _opt; + + const Tensor& shape = Tensor(input_w, input_h, input_c, (void*)0); // bottom_shapes.empty() ? Tensor() : bottom_shapes[0]; + const Tensor& out_shape = Tensor(output_w, output_h, output_c, (void*)0); // top_shapes.empty() ? Tensor() : top_shapes[0]; + + int elempack = 1; + if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1; + if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1; + if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1; + + int out_elempack = 1; + if (out_shape.dims == 1) out_elempack = opt.use_shader_pack8 && out_shape.w % 8 == 0 ? 8 : out_shape.w % 4 == 0 ? 
4 : 1; + if (out_shape.dims == 2) out_elempack = opt.use_shader_pack8 && out_shape.h % 8 == 0 ? 8 : out_shape.h % 4 == 0 ? 4 : 1; + if (out_shape.dims == 3) out_elempack = opt.use_shader_pack8 && out_shape.c % 8 == 0 ? 8 : out_shape.c % 4 == 0 ? 4 : 1; + + int offset_elempack = 1; + + { + // TODO vec and image crop + if (offset_c == 0) + offset_elempack = elempack; + else + offset_elempack = opt.use_shader_pack8 && offset_c % 8 == 0 ? 8 : offset_c % 4 == 0 ? 4 : 1; + } + + size_t elemsize; + size_t out_elemsize; + if (opt.use_fp16_storage) + { + elemsize = elempack * 2u; + out_elemsize = out_elempack * 2u; + } + else if (opt.use_fp16_packed) + { + elemsize = elempack == 1 ? 4u : elempack * 2u; + out_elemsize = out_elempack == 1 ? 4u : out_elempack * 2u; + } + else + { + elemsize = elempack * 4u; + out_elemsize = out_elempack * 4u; + } + + Tensor shape_packed; + if (shape.dims == 1) shape_packed = Tensor(shape.w / elempack, (void*)0, elemsize, elempack); + if (shape.dims == 2) shape_packed = Tensor(shape.w, shape.h / elempack, (void*)0, elemsize, elempack); + if (shape.dims == 3) shape_packed = Tensor(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack); + + Tensor out_shape_packed; + if (out_shape.dims == 1) out_shape_packed = Tensor(out_shape.w / out_elempack, (void*)0, out_elemsize, out_elempack); + if (out_shape.dims == 2) out_shape_packed = Tensor(out_shape.w, out_shape.h / out_elempack, (void*)0, out_elemsize, out_elempack); + if (out_shape.dims == 3) out_shape_packed = Tensor(out_shape.w, out_shape.h, out_shape.c / out_elempack, (void*)0, out_elemsize, out_elempack); + + Tensor shape_unpacked = shape_packed; + if (bottoms.size() == 1 && shape.dims != 0 && elempack == out_elempack && elempack > offset_elempack) + { + size_t offset_elemsize; + if (opt.use_fp16_storage) + { + offset_elemsize = offset_elempack * 2u; + } + else if (opt.use_fp16_packed) + { + offset_elemsize = offset_elempack == 1 ? 
4u : offset_elempack * 2u; + } + else + { + offset_elemsize = offset_elempack * 4u; + } + + if (shape.dims == 1) shape_unpacked = Tensor(shape.w / offset_elempack, (void*)0, offset_elemsize, offset_elempack); + if (shape.dims == 2) shape_unpacked = Tensor(shape.w, shape.h / offset_elempack, (void*)0, offset_elemsize, offset_elempack); + if (shape.dims == 3) shape_unpacked = Tensor(shape.w, shape.h, shape.c / offset_elempack, (void*)0, offset_elemsize, offset_elempack); + } + + std::vector specializations(1 + 10); + specializations[0].i = vkdev->info.bug_implicit_fp16_arithmetic; + specializations[1 + 0].i = 0; // shape_unpacked.dims; + specializations[1 + 1].i = 0; // shape_unpacked.w; + specializations[1 + 2].i = 0; // shape_unpacked.h; + specializations[1 + 3].i = 0; // shape_unpacked.c; + specializations[1 + 4].i = 0; // shape_unpacked.cstep; + specializations[1 + 5].i = 0; // out_shape_packed.dims; + specializations[1 + 6].i = 0; // out_shape_packed.w; + specializations[1 + 7].i = 0; // out_shape_packed.h; + specializations[1 + 8].i = 0; // out_shape_packed.c; + specializations[1 + 9].i = 0; // out_shape_packed.cstep; + + Tensor local_size_xyz; + if (out_shape_packed.dims == 1) + { + local_size_xyz.w = std::min(64, out_shape_packed.w); + local_size_xyz.h = 1; + local_size_xyz.c = 1; + } + if (out_shape_packed.dims == 2) + { + local_size_xyz.w = std::min(8, out_shape_packed.w); + local_size_xyz.h = std::min(8, out_shape_packed.h); + local_size_xyz.c = 1; + } + if (out_shape_packed.dims == 3) + { + local_size_xyz.w = std::min(4, out_shape_packed.w); + local_size_xyz.h = std::min(4, out_shape_packed.h); + local_size_xyz.c = std::min(4, out_shape_packed.c); + } + + // pack1 + if (out_shape.dims == 0 || out_elempack == 1) + { + pipeline_crop = new Pipeline(vkdev); + pipeline_crop->set_optimal_local_size_xyz(local_size_xyz); + pipeline_crop->create(LayerShaderType::crop, opt, specializations); + } + + // pack4 + if (out_shape.dims == 0 || out_elempack == 4) + { + pipeline_crop_pack4 = new Pipeline(vkdev); + pipeline_crop_pack4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_crop_pack4->create(LayerShaderType::crop_pack4, opt, specializations); + } + + // pack1to4 + if (out_shape.dims == 0 || out_elempack == 4) + { + pipeline_crop_pack1to4 = new Pipeline(vkdev); + pipeline_crop_pack1to4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_crop_pack1to4->create(LayerShaderType::crop_pack1to4, opt, specializations); + } + + // pack4to1 + if (out_shape.dims == 0 || out_elempack == 1) + { + pipeline_crop_pack4to1 = new Pipeline(vkdev); + pipeline_crop_pack4to1->set_optimal_local_size_xyz(local_size_xyz); + pipeline_crop_pack4to1->create(LayerShaderType::crop_pack4to1, opt, specializations); + } + + // pack8 + if ((opt.use_shader_pack8 && out_shape.dims == 0) || (elempack == 8 && out_elempack == 8)) + { + pipeline_crop_pack8 = new Pipeline(vkdev); + pipeline_crop_pack8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_crop_pack8->create(LayerShaderType::crop_pack8, opt, specializations); + } + + // pack1to8 + if ((opt.use_shader_pack8 && out_shape.dims == 0) || out_elempack == 8) + { + pipeline_crop_pack1to8 = new Pipeline(vkdev); + pipeline_crop_pack1to8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_crop_pack1to8->create(LayerShaderType::crop_pack1to8, opt, specializations); + } + + // pack4to8 + if ((opt.use_shader_pack8 && out_shape.dims == 0) || out_elempack == 8) + { + pipeline_crop_pack4to8 = new Pipeline(vkdev); + 
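+        // Cross-packing variants (pack1to4, pack4to8, pack8to4, pack8to1, ...) cover the case where the
+        // crop offset forces the input to be read at a smaller elempack than the output; record_pipeline()
+        // selects the matching pipeline per blob from elempack / offset_elempack / out_elempack.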
pipeline_crop_pack4to8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_crop_pack4to8->create(LayerShaderType::crop_pack4to8, opt, specializations); + } + + // pack8to4 + if ((opt.use_shader_pack8 && out_shape.dims == 0) || (elempack == 8 && out_elempack == 4)) + { + pipeline_crop_pack8to4 = new Pipeline(vkdev); + pipeline_crop_pack8to4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_crop_pack8to4->create(LayerShaderType::crop_pack8to4, opt, specializations); + } + + // pack8to1 + if ((opt.use_shader_pack8 && out_shape.dims == 0) || (elempack == 8 && out_elempack == 1)) + { + pipeline_crop_pack8to1 = new Pipeline(vkdev); + pipeline_crop_pack8to1->set_optimal_local_size_xyz(local_size_xyz); + pipeline_crop_pack8to1->create(LayerShaderType::crop_pack8to1, opt, specializations); + } + + + return 0; +} + +int Crop_vulkan::destroy_pipeline(const Option& /*opt*/) +{ + delete pipeline_crop; + pipeline_crop = 0; + + delete pipeline_crop_pack4; + pipeline_crop_pack4 = 0; + + delete pipeline_crop_pack1to4; + pipeline_crop_pack1to4 = 0; + + delete pipeline_crop_pack4to1; + pipeline_crop_pack4to1 = 0; + + delete pipeline_crop_pack8; + pipeline_crop_pack8 = 0; + + delete pipeline_crop_pack1to8; + pipeline_crop_pack1to8 = 0; + + delete pipeline_crop_pack4to8; + pipeline_crop_pack4to8 = 0; + + delete pipeline_crop_pack8to4; + pipeline_crop_pack8to4 = 0; + + delete pipeline_crop_pack8to1; + pipeline_crop_pack8to1 = 0; + + return 0; +} + +int Crop_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const +{ + int dims = bottom_blob.dims; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int _woffset, _hoffset, _coffset; + int _outw, _outh, _outc; + // resolve_crop_roi(bottom_blob.shape(), _woffset, _hoffset, _coffset, _outw, _outh, _outc); + _outw = output_w; + _outh = output_h; + _outc = output_c; + _woffset = offset_w; + _hoffset = offset_h; + _coffset = offset_c; + + // TODO vec and image crop + + if (dims == 3) + { + if (_woffset == 0 && _hoffset == 0 && _coffset == 0 && _outw == bottom_blob.w && _outh == bottom_blob.h && _outc == bottom_blob.c * elempack) + { + top_blob = bottom_blob; + return 0; + } + + int offset_elempack = _coffset == 0 ? elempack : opt.use_shader_pack8 && _coffset % 8 == 0 ? 8 : _coffset % 4 == 0 ? 4 : 1; + + int out_elempack = opt.use_shader_pack8 && _outc % 8 == 0 ? 8 : _outc % 4 == 0 ? 
4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (opt.use_fp16_packed && !opt.use_fp16_storage) + { + if (out_elempack == 8) out_elemsize = 8 * 2u; + if (out_elempack == 4) out_elemsize = 4 * 2u; + if (out_elempack == 1) out_elemsize = 4u; + } + + // unpacking + VkTensor bottom_blob_unpacked = bottom_blob; + if (elempack == out_elempack && elempack > offset_elempack) + { + Option opt_pack1 = opt; + opt_pack1.blob_vkallocator = opt.workspace_vkallocator; + + vkdev->convert_packing(bottom_blob, bottom_blob_unpacked, offset_elempack, cmd, opt_pack1); + } + + top_blob.create(_outw, _outh, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + std::vector bindings(2); + bindings[0] = bottom_blob_unpacked; + bindings[1] = top_blob; + + std::vector constants(13); + constants[0].i = bottom_blob_unpacked.dims; + constants[1].i = bottom_blob_unpacked.w; + constants[2].i = bottom_blob_unpacked.h; + constants[3].i = bottom_blob_unpacked.c; + constants[4].i = bottom_blob_unpacked.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = top_blob.cstep; + constants[10].i = _woffset; + constants[11].i = _hoffset; + constants[12].i = _coffset; + + const Pipeline* pipeline = 0; + if (elempack == 1 && out_elempack == 1) + { + pipeline = pipeline_crop; + } + else if (elempack == 4 && offset_elempack == 4 && out_elempack == 4) + { + constants[12].i = _coffset / 4; + + pipeline = pipeline_crop_pack4; + } + else if (elempack == 4 && offset_elempack == 1 && out_elempack == 4) + { + pipeline = pipeline_crop_pack1to4; + } + else if (elempack == 1 && out_elempack == 4) + { + pipeline = pipeline_crop_pack1to4; + } + else if (elempack == 4 && out_elempack == 1) + { + pipeline = pipeline_crop_pack4to1; + } + else if (elempack == 8 && offset_elempack == 8 && out_elempack == 8) + { + constants[12].i = _coffset / 8; + + pipeline = pipeline_crop_pack8; + } + else if (elempack == 8 && offset_elempack == 4 && out_elempack == 8) + { + pipeline = pipeline_crop_pack4to8; + } + else if (elempack == 8 && offset_elempack == 1 && out_elempack == 8) + { + pipeline = pipeline_crop_pack1to8; + } + else if (elempack == 1 && out_elempack == 8) + { + pipeline = pipeline_crop_pack1to8; + } + else if (elempack == 4 && out_elempack == 8) + { + pipeline = pipeline_crop_pack4to8; + } + else if (elempack == 8 && out_elempack == 4) + { + pipeline = pipeline_crop_pack8to4; + } + else if (elempack == 8 && out_elempack == 1) + { + pipeline = pipeline_crop_pack8to1; + } + cmd.record_pipeline(pipeline, bindings, constants, top_blob); + } + + return 0; +} + +int Crop_vulkan::record_pipeline(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const +{ + const VkTensor& bottom_blob = bottom_blobs[0]; + const VkTensor& reference_blob = bottom_blobs[1]; + + int dims = bottom_blob.dims; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int _woffset, _hoffset, _coffset; + int _outw, _outh, _outc; + // if (woffset == -233) + // { + // resolve_crop_roi(bottom_blob.shape(), (const int*)reference_blob.mapped(), _woffset, _hoffset, _coffset, _outw, _outh, _outc); + // } + // else + // { + // resolve_crop_roi(bottom_blob.shape(), reference_blob.shape(), _woffset, _hoffset, _coffset, _outw, _outh, _outc); + // } + _outw = output_w; + _outh = output_h; + _outc = output_c; + _woffset = 0; // offset_w; + _hoffset 
= 0; // offset_h; + _coffset = 0; // offset_c; + + // TODO vec and image crop + + if (dims == 3) + { + if (_woffset == 0 && _hoffset == 0 && _coffset == 0 && _outw == bottom_blob.w && _outh == bottom_blob.h && _outc == bottom_blob.c * elempack) + { + top_blobs[0] = bottom_blob; + return 0; + } + + int offset_elempack = _coffset == 0 ? elempack : opt.use_shader_pack8 && _coffset % 8 == 0 ? 8 : _coffset % 4 == 0 ? 4 : 1; + + int out_elempack = opt.use_shader_pack8 && _outc % 8 == 0 ? 8 : _outc % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (opt.use_fp16_packed && !opt.use_fp16_storage) + { + if (out_elempack == 8) out_elemsize = 8 * 2u; + if (out_elempack == 4) out_elemsize = 4 * 2u; + if (out_elempack == 1) out_elemsize = 4u; + } + + // unpacking + VkTensor bottom_blob_unpacked = bottom_blob; + if (elempack == out_elempack && elempack > offset_elempack) + { + Option opt_pack1 = opt; + opt_pack1.blob_vkallocator = opt.workspace_vkallocator; + + vkdev->convert_packing(bottom_blob, bottom_blob_unpacked, offset_elempack, cmd, opt_pack1); + } + + VkTensor& top_blob = top_blobs[0]; + + top_blob.create(_outw, _outh, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + std::vector bindings(2); + bindings[0] = bottom_blob_unpacked; + bindings[1] = top_blob; + + std::vector constants(13); + constants[0].i = bottom_blob_unpacked.dims; + constants[1].i = bottom_blob_unpacked.w; + constants[2].i = bottom_blob_unpacked.h; + constants[3].i = bottom_blob_unpacked.c; + constants[4].i = bottom_blob_unpacked.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = top_blob.cstep; + constants[10].i = _woffset; + constants[11].i = _hoffset; + constants[12].i = _coffset; + + const Pipeline* pipeline = 0; + if (elempack == 1 && out_elempack == 1) + { + pipeline = pipeline_crop; + } + else if (elempack == 4 && offset_elempack == 4 && out_elempack == 4) + { + constants[12].i = _coffset / 4; + + pipeline = pipeline_crop_pack4; + } + else if (elempack == 4 && offset_elempack == 1 && out_elempack == 4) + { + pipeline = pipeline_crop_pack1to4; + } + else if (elempack == 1 && out_elempack == 4) + { + pipeline = pipeline_crop_pack1to4; + } + else if (elempack == 4 && out_elempack == 1) + { + pipeline = pipeline_crop_pack4to1; + } + else if (elempack == 8 && offset_elempack == 8 && out_elempack == 8) + { + constants[12].i = _coffset / 8; + + pipeline = pipeline_crop_pack8; + } + else if (elempack == 8 && offset_elempack == 4 && out_elempack == 8) + { + pipeline = pipeline_crop_pack4to8; + } + else if (elempack == 8 && offset_elempack == 1 && out_elempack == 8) + { + pipeline = pipeline_crop_pack1to8; + } + else if (elempack == 1 && out_elempack == 8) + { + pipeline = pipeline_crop_pack1to8; + } + else if (elempack == 4 && out_elempack == 8) + { + pipeline = pipeline_crop_pack4to8; + } + else if (elempack == 8 && out_elempack == 4) + { + pipeline = pipeline_crop_pack8to4; + } + else if (elempack == 8 && out_elempack == 1) + { + pipeline = pipeline_crop_pack8to1; + } + + cmd.record_pipeline(pipeline, bindings, constants, top_blob); + } + + return 0; +} + +} // namespace TEngine \ No newline at end of file diff --git a/source/device/vulkan/layer/crop_vulkan.hpp b/source/device/vulkan/layer/crop_vulkan.hpp new file mode 100644 index 000000000..1a55f3ca1 --- /dev/null +++ b/source/device/vulkan/layer/crop_vulkan.hpp @@ -0,0 +1,95 @@ 
+/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#ifndef LAYER_CROP_HPP +#define LAYER_CROP_HPP + +#include "../vulkan_layer.hpp" +#include "../vulkan_command.hpp" + +#include "crop_param.h" + +namespace TEngine{ + +class Crop_vulkan : public Layer +{ +public: + Crop_vulkan(); + Crop_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + void resolve_crop_roi(const Tensor& bottom_blob, int& _woffset, int& _hoffset, int& _coffset, int& _outw, int& _outh, int& _outc) const; + virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const; + virtual int record_pipeline(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const; + +public: + Pipeline* pipeline_crop; + Pipeline* pipeline_crop_pack4; + Pipeline* pipeline_crop_pack1to4; + Pipeline* pipeline_crop_pack4to1; + Pipeline* pipeline_crop_pack8; + Pipeline* pipeline_crop_pack1to8; + Pipeline* pipeline_crop_pack4to8; + Pipeline* pipeline_crop_pack8to4; + Pipeline* pipeline_crop_pack8to1; + +public: + int input_c; + int input_h; + int input_w; + int output_c; + int output_h; + int output_w; + + int num_args; + int offset_c; + int offset_h; + int offset_w; + int crop_h; + int crop_w; + int center_crop; + int axis; + int flag; +}; + +} // namespace TEngine + +#endif \ No newline at end of file diff --git a/source/device/vulkan/layer/dropout_vulkan.cpp b/source/device/vulkan/layer/dropout_vulkan.cpp new file mode 100644 index 000000000..a6c3e0724 --- /dev/null +++ b/source/device/vulkan/layer/dropout_vulkan.cpp @@ -0,0 +1,216 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#include "dropout_vulkan.hpp" +#include "../layer_shader_type.h" + +namespace TEngine { + +Dropout_vulkan::Dropout_vulkan() +{ + support_vulkan = true; + support_image_storage = false; + + pipeline_dropout = 0; + pipeline_dropout_pack4 = 0; + pipeline_dropout_pack8 = 0; +} + +Dropout_vulkan::Dropout_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) +{ + support_vulkan = true; + support_image_storage = false; + + pipeline_dropout = 0; + pipeline_dropout_pack4 = 0; + pipeline_dropout_pack8 = 0; + + graph = ir_graph; + node = ir_node; + + struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[0]); + std::string name = input->name; + bottoms.push_back(name); + + struct tensor *output = get_ir_graph_tensor(graph, node->output_tensors[0]); + name = output->name; + tops.push_back(name); + + // params + input_c = input->dims[1]; // param->input_channel; + input_h = input->dims[2]; + input_w = input->dims[3]; + output_c = output->dims[1]; // param->output_channel; + output_h = output->dims[2]; + output_w = output->dims[3]; + + if(input->scale != 0) + scale = input->scale; + else + scale = 1.0f; +} + +int Dropout_vulkan::create_pipeline(const Option& opt) +{ + const Tensor& shape = Tensor(output_w, output_h, output_c, (void*)0); // top_shapes.empty() ? Tensor() : top_shapes[0]; + + int elempack = 1; + if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1; + if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1; + if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1; + + size_t elemsize; + if (opt.use_fp16_storage) + { + elemsize = elempack * 2u; + } + else if (opt.use_fp16_packed) + { + elemsize = elempack == 1 ? 
4u : elempack * 2u; + } + else + { + elemsize = elempack * 4u; + } + + Tensor shape_packed; + if (shape.dims == 1) shape_packed = Tensor(shape.w / elempack, (void*)0, elemsize, elempack); + if (shape.dims == 2) shape_packed = Tensor(shape.w, shape.h / elempack, (void*)0, elemsize, elempack); + if (shape.dims == 3) shape_packed = Tensor(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack); + + std::vector specializations(1 + 5); + specializations[0].f = scale; + specializations[1 + 0].i = shape_packed.dims; + specializations[1 + 1].i = shape_packed.w; + specializations[1 + 2].i = shape_packed.h; + specializations[1 + 3].i = shape_packed.c; + specializations[1 + 4].i = shape_packed.cstep; + + Tensor local_size_xyz; + if (shape_packed.dims == 1) + { + local_size_xyz.w = std::min(64, shape_packed.w); + local_size_xyz.h = 1; + local_size_xyz.c = 1; + } + if (shape_packed.dims == 2) + { + local_size_xyz.w = std::min(8, shape_packed.w); + local_size_xyz.h = std::min(8, shape_packed.h); + local_size_xyz.c = 1; + } + if (shape_packed.dims == 3) + { + local_size_xyz.w = std::min(4, shape_packed.w); + local_size_xyz.h = std::min(4, shape_packed.h); + local_size_xyz.c = std::min(4, shape_packed.c); + } + + // pack1 + if (shape.dims == 0 || elempack == 1) + { + pipeline_dropout = new Pipeline(vkdev); + pipeline_dropout->set_optimal_local_size_xyz(local_size_xyz); + pipeline_dropout->create(LayerShaderType::dropout, opt, specializations); + } + + // pack4 + if (shape.dims == 0 || elempack == 4) + { + pipeline_dropout_pack4 = new Pipeline(vkdev); + pipeline_dropout_pack4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_dropout_pack4->create(LayerShaderType::dropout_pack4, opt, specializations); + } + + // pack8 + if ((opt.use_shader_pack8 && shape.dims == 0) || elempack == 8) + { + pipeline_dropout_pack8 = new Pipeline(vkdev); + pipeline_dropout_pack8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_dropout_pack8->create(LayerShaderType::dropout_pack8, opt, specializations); + } + + return 0; +} + +int Dropout_vulkan::destroy_pipeline(const Option& /*opt*/) +{ + delete pipeline_dropout; + pipeline_dropout = 0; + + delete pipeline_dropout_pack4; + pipeline_dropout_pack4 = 0; + + delete pipeline_dropout_pack8; + pipeline_dropout_pack8 = 0; + + return 0; +} + +int Dropout_vulkan::record_pipeline(VkTensor& bottom_top_blob, VkCompute& cmd, const Option& /*opt*/) const +{ + if (scale == 1.f) + { + return 0; + } + + int elempack = bottom_top_blob.elempack; + + std::vector bindings(1); + bindings[0] = bottom_top_blob; + + std::vector constants(5); + constants[0].i = bottom_top_blob.dims; + constants[1].i = bottom_top_blob.w; + constants[2].i = bottom_top_blob.h; + constants[3].i = bottom_top_blob.c; + constants[4].i = bottom_top_blob.cstep; + + const Pipeline* pipeline = elempack == 8 ? pipeline_dropout_pack8 + : elempack == 4 ? pipeline_dropout_pack4 + : pipeline_dropout; + + cmd.record_pipeline(pipeline, bindings, constants, bottom_top_blob); + + return 0; +} + + + +} // namespace TEngine \ No newline at end of file diff --git a/source/device/vulkan/layer/dropout_vulkan.hpp b/source/device/vulkan/layer/dropout_vulkan.hpp new file mode 100644 index 000000000..b6e943889 --- /dev/null +++ b/source/device/vulkan/layer/dropout_vulkan.hpp @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#ifndef LAYER_DROPOUT_HPP +#define LAYER_DROPOUT_HPP + +#include "../vulkan_layer.hpp" +#include "../vulkan_command.hpp" + +namespace TEngine{ + +class Dropout_vulkan : public Layer +{ +public: + Dropout_vulkan(); + Dropout_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + // virtual int upload_model(VkTransfer& cmd, const Option& opt); + + virtual int record_pipeline(VkTensor& bottom_top_blob, VkCompute& cmd, const Option& opt) const; + +public: + Pipeline* pipeline_dropout; + Pipeline* pipeline_dropout_pack4; + Pipeline* pipeline_dropout_pack8; + +public: + int input_c; + int input_h; + int input_w; + int output_c; + int output_h; + int output_w; + float scale; + +}; + +} // namespace TEngine + +#endif \ No newline at end of file diff --git a/source/device/vulkan/layer/eltwise_vulkan.cpp b/source/device/vulkan/layer/eltwise_vulkan.cpp new file mode 100644 index 000000000..9fc322bc9 --- /dev/null +++ b/source/device/vulkan/layer/eltwise_vulkan.cpp @@ -0,0 +1,266 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. 
You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#include "eltwise_vulkan.hpp" +#include "../layer_shader_type.h" + +namespace TEngine { + +Eltwise_vulkan::Eltwise_vulkan() +{ + support_vulkan = true; + support_image_storage = false; + + pipeline_eltwise[0] = 0; + pipeline_eltwise[1] = 0; + pipeline_eltwise_pack4[0] = 0; + pipeline_eltwise_pack4[1] = 0; + pipeline_eltwise_pack8[0] = 0; + pipeline_eltwise_pack8[1] = 0; +} + +Eltwise_vulkan::Eltwise_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) +{ + support_vulkan = true; + support_image_storage = true; + + pipeline_eltwise[0] = 0; + pipeline_eltwise[1] = 0; + pipeline_eltwise_pack4[0] = 0; + pipeline_eltwise_pack4[1] = 0; + pipeline_eltwise_pack8[0] = 0; + pipeline_eltwise_pack8[1] = 0; + + graph = ir_graph; + node = ir_node; + + for(int i = 0; i < ir_node->input_num; i++) + { + struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[i]); + std::string name = input->name; + bottoms.push_back(name); + } + + for(int i = 0; i < ir_node->output_num; i++) + { + struct tensor *output = get_ir_graph_tensor(graph, node->output_tensors[i]); + std::string name = output->name; + tops.push_back(name); + } + + struct eltwise_param *param = (struct eltwise_param *)ir_node->op.param_mem; + op_type = (param->type) / 2; +} + +int Eltwise_vulkan::create_pipeline(const Option& opt) +{ + const Tensor& shape = Tensor(output_w, output_h, output_c, (void*)0); // top_shapes.empty() ? Tensor() : top_shapes[0]; + + int elempack = 1; + if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1; + if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1; + if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1; + + size_t elemsize; + if (opt.use_fp16_storage) + { + elemsize = elempack * 2u; + } + else if (opt.use_fp16_packed) + { + elemsize = elempack == 1 ? 4u : elempack * 2u; + } + else + { + elemsize = elempack * 4u; + } + + Tensor shape_packed; + if (shape.dims == 1) shape_packed = Tensor(shape.w / elempack, (void*)0, elemsize, elempack); + if (shape.dims == 2) shape_packed = Tensor(shape.w, shape.h / elempack, (void*)0, elemsize, elempack); + if (shape.dims == 3) shape_packed = Tensor(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack); + + std::vector specializations(2 + 5); + specializations[0].i = op_type; + specializations[1].i = 0; // coeffs.w == 0 ? 
0 : 1; TODO fix coeffs value + specializations[2 + 0].i = 0; // shape_packed.dims; + specializations[2 + 1].i = 0; // shape_packed.w; + specializations[2 + 2].i = 0; // shape_packed.h; + specializations[2 + 3].i = 0; // shape_packed.c; + specializations[2 + 4].i = 0; // shape_packed.cstep; + + Tensor local_size_xyz; + if (shape_packed.dims == 1) + { + local_size_xyz.w = std::min(64, shape_packed.w); + local_size_xyz.h = 1; + local_size_xyz.c = 1; + } + if (shape_packed.dims == 2) + { + local_size_xyz.w = std::min(8, shape_packed.w); + local_size_xyz.h = std::min(8, shape_packed.h); + local_size_xyz.c = 1; + } + if (shape_packed.dims == 3) + { + local_size_xyz.w = std::min(4, shape_packed.w); + local_size_xyz.h = std::min(4, shape_packed.h); + local_size_xyz.c = std::min(4, shape_packed.c); + } + + // pack1 + if (shape.dims == 0 || elempack == 1) + { + pipeline_eltwise[0] = new Pipeline(vkdev); + pipeline_eltwise[0]->set_optimal_local_size_xyz(local_size_xyz); + pipeline_eltwise[0]->create(LayerShaderType::eltwise, opt, specializations); + pipeline_eltwise[1] = new Pipeline(vkdev); + pipeline_eltwise[1]->set_optimal_local_size_xyz(local_size_xyz); + pipeline_eltwise[1]->create(LayerShaderType::eltwise, opt, specializations); + } + + // pack4 + if (shape.dims == 0 || elempack == 4) + { + pipeline_eltwise_pack4[0] = new Pipeline(vkdev); + pipeline_eltwise_pack4[0]->set_optimal_local_size_xyz(local_size_xyz); + pipeline_eltwise_pack4[0]->create(LayerShaderType::eltwise_pack4, opt, specializations); + pipeline_eltwise_pack4[1] = new Pipeline(vkdev); + pipeline_eltwise_pack4[1]->set_optimal_local_size_xyz(local_size_xyz); + pipeline_eltwise_pack4[1]->create(LayerShaderType::eltwise_pack4, opt, specializations); + } + + // pack8 + if ((opt.use_shader_pack8 && shape.dims == 0) || elempack == 8) + { + pipeline_eltwise_pack8[0] = new Pipeline(vkdev); + pipeline_eltwise_pack8[0]->set_optimal_local_size_xyz(local_size_xyz); + pipeline_eltwise_pack8[0]->create(LayerShaderType::eltwise_pack8, opt, specializations); + pipeline_eltwise_pack8[1] = new Pipeline(vkdev); + pipeline_eltwise_pack8[1]->set_optimal_local_size_xyz(local_size_xyz); + pipeline_eltwise_pack8[1]->create(LayerShaderType::eltwise_pack8, opt, specializations); + } + + return 0; +} + +int Eltwise_vulkan::destroy_pipeline(const Option& /*opt*/) +{ + delete pipeline_eltwise[0]; + delete pipeline_eltwise[1]; + pipeline_eltwise[0] = 0; + pipeline_eltwise[1] = 0; + + delete pipeline_eltwise_pack4[0]; + delete pipeline_eltwise_pack4[1]; + pipeline_eltwise_pack4[0] = 0; + pipeline_eltwise_pack4[1] = 0; + + delete pipeline_eltwise_pack8[0]; + delete pipeline_eltwise_pack8[1]; + pipeline_eltwise_pack8[0] = 0; + pipeline_eltwise_pack8[1] = 0; + + return 0; +} + +int Eltwise_vulkan::record_pipeline(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const +{ + const VkTensor& bottom_blob = bottom_blobs[0]; + const VkTensor& bottom_blob1 = bottom_blobs[1]; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + VkTensor& top_blob = top_blobs[0]; + top_blob.create(w, h, channels, elemsize, elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + std::vector bindings(3); + bindings[0] = bottom_blob; + bindings[1] = bottom_blob1; + bindings[2] = top_blob; + + std::vector constants(5 + 2); + constants[0].i = top_blob.dims; + constants[1].i = top_blob.w; + constants[2].i = top_blob.h; + 
constants[3].i = top_blob.c; + constants[4].i = top_blob.cstep; + constants[5].f = 1.0f; // coeffs.w == 0 ? 1.f : coeffs[0]; TODO fix coeffs value + constants[6].f = 1.0f; // coeffs.w == 0 ? 1.f : coeffs[1]; + + const Pipeline* pipeline = elempack == 8 ? pipeline_eltwise_pack8[1] + : elempack == 4 ? pipeline_eltwise_pack4[1] + : pipeline_eltwise[1]; + + cmd.record_pipeline(pipeline, bindings, constants, top_blob); + + for (size_t b = 2; b < bottom_blobs.size(); b++) + { + std::vector bindings(3); + bindings[0] = top_blob; + bindings[1] = bottom_blobs[b]; + bindings[2] = top_blob; // TODO use separated pipeline ? + + std::vector constants(5 + 2); + constants[0].i = top_blob.dims; + constants[1].i = top_blob.w; + constants[2].i = top_blob.h; + constants[3].i = top_blob.c; + constants[4].i = top_blob.cstep; + constants[5].f = 1.f; + constants[6].f = 1.0f; // coeffs.w == 0 ? 1 : coeffs[b]; TODO fixcoeffs value + + const Pipeline* pipeline = elempack == 8 ? pipeline_eltwise_pack8[b % 2] + : elempack == 4 ? pipeline_eltwise_pack4[b % 2] + : pipeline_eltwise[b % 2]; + + cmd.record_pipeline(pipeline, bindings, constants, top_blob); + } + + return 0; +} + +} // namespace TEngine \ No newline at end of file diff --git a/source/device/vulkan/layer/eltwise_vulkan.hpp b/source/device/vulkan/layer/eltwise_vulkan.hpp new file mode 100644 index 000000000..5830b076d --- /dev/null +++ b/source/device/vulkan/layer/eltwise_vulkan.hpp @@ -0,0 +1,99 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. 
You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#ifndef LAYER_ELTWISE_HPP +#define LAYER_ELTWISE_HPP + +#include "../vulkan_layer.hpp" +#include "../vulkan_command.hpp" + +#include "eltwise_param.h" + +namespace TEngine{ + +class Eltwise_vulkan : public Layer +{ +public: + Eltwise_vulkan(); + Eltwise_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + virtual int record_pipeline(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const; + +public: + Pipeline* pipeline_eltwise[2]; + Pipeline* pipeline_eltwise_pack4[2]; + Pipeline* pipeline_eltwise_pack8[2]; + +public: + enum EltType + { + ELT_PROD, + ELT_PROD_SCALAR, + ELT_SUM, + ELT_SUM_SCALAR, + ELT_SUB, + ELT_SUB_SCALAR, + ELT_MAX, + ELT_RSQRT, + ELT_MIN_SCALAR, + ELT_LAST, + ELT_DIV, + ELT_LOG, + ELT_EXP, + ELT_SQRT, + ELT_FLOOR, + ELT_SQUARE, + ELT_POW + }; + int op_type; // Operation_PROD = 0, Operation_SUM = 1, Operation_MAX = 2 + + int input_c; + int input_h; + int input_w; + int output_c; + int output_h; + int output_w; +}; + +} // namespace TEngine + +#endif \ No newline at end of file diff --git a/source/device/vulkan/layer/flatten_vulkan.cpp b/source/device/vulkan/layer/flatten_vulkan.cpp new file mode 100644 index 000000000..589b7d5d4 --- /dev/null +++ b/source/device/vulkan/layer/flatten_vulkan.cpp @@ -0,0 +1,326 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. 
You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#include "flatten_vulkan.hpp" +#include "../layer_shader_type.h" + +namespace TEngine { + +Flatten_vulkan::Flatten_vulkan() +{ + support_vulkan = true; + support_image_storage = false; + + pipeline_flatten = 0; + pipeline_flatten_pack4 = 0; + pipeline_flatten_pack1to4 = 0; + pipeline_flatten_pack8 = 0; + pipeline_flatten_pack1to8 = 0; + pipeline_flatten_pack4to8 = 0; +} + +Flatten_vulkan::Flatten_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) +{ + support_vulkan = true; + support_image_storage = true; + + pipeline_flatten = 0; + pipeline_flatten_pack4 = 0; + pipeline_flatten_pack1to4 = 0; + pipeline_flatten_pack8 = 0; + pipeline_flatten_pack1to8 = 0; + pipeline_flatten_pack4to8 = 0; + + graph = ir_graph; + node = ir_node; + + struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[0]); + std::string name = input->name; + bottoms.push_back(name); + + struct tensor *output = get_ir_graph_tensor(graph, node->output_tensors[0]); + name = output->name; + tops.push_back(name); + + // params + input_c = input->dims[1]; // param->input_channel; + input_h = input->dims[2]; + input_w = input->dims[3]; + output_c = output->dims[1]; // param->output_channel; + output_h = output->dims[2]; + output_w = output->dims[3]; + output_size = output->dims[3]*output->dims[2]*output->dims[1]; +} + +int Flatten_vulkan::create_pipeline(const Option& _opt) +{ + Option opt = _opt; + const Tensor& shape = Tensor(input_w, input_h, input_c, (void*)0); // bottom_shapes.empty() ? Mat() : bottom_shapes[0]; + // const Tensor& out_shape = Tensor(output_w, output_h, output_c, (void*)0); // top_shapes.empty() ? Mat() : top_shapes[0]; + const Tensor& out_shape = Tensor(output_size, (void*)0); // top_shapes.empty() ? Mat() : top_shapes[0]; + + + int elempack = 1; + if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1; + if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1; + if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1; + + int out_elempack = 1; + if (out_shape.dims == 1) out_elempack = opt.use_shader_pack8 && out_shape.w % 8 == 0 ? 8 : out_shape.w % 4 == 0 ? 4 : 1; + + size_t elemsize; + size_t out_elemsize; + if (opt.use_fp16_storage) + { + elemsize = elempack * 2u; + out_elemsize = out_elempack * 2u; + } + else if (opt.use_fp16_packed) + { + elemsize = elempack == 1 ? 4u : elempack * 2u; + out_elemsize = out_elempack == 1 ? 
4u : out_elempack * 2u; + } + else + { + elemsize = elempack * 4u; + out_elemsize = out_elempack * 4u; + } + + Tensor shape_packed; + if (shape.dims == 1) shape_packed = Tensor(shape.w / elempack, (void*)0, elemsize, elempack); + if (shape.dims == 2) shape_packed = Tensor(shape.w, shape.h / elempack, (void*)0, elemsize, elempack); + if (shape.dims == 3) shape_packed = Tensor(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack); + + Tensor out_shape_packed; + if (out_shape.dims == 1) out_shape_packed = Tensor(out_shape.w / out_elempack, (void*)0, out_elemsize, out_elempack); + + // if (!vkdev->shape_support_image_storage(shape_packed) || !vkdev->shape_support_image_storage(out_shape_packed)) + { + support_image_storage = false; + opt.use_image_storage = false; + } + + std::vector specializations(0 + 10); + specializations[0 + 0].i = 0; // shape_packed.dims; + specializations[0 + 1].i = 0; // shape_packed.w; + specializations[0 + 2].i = 0; // shape_packed.h; + specializations[0 + 3].i = 0; // shape_packed.c; + specializations[0 + 4].i = 0; // shape_packed.cstep; + specializations[0 + 5].i = 0; // out_shape_packed.dims; + specializations[0 + 6].i = 0; // out_shape_packed.w; + specializations[0 + 7].i = 0; // out_shape_packed.h; + specializations[0 + 8].i = 0; // out_shape_packed.c; + specializations[0 + 9].i = 0; // out_shape_packed.cstep; + + Tensor local_size_xyz(64, 1, 1, (void*)0); + if (out_shape_packed.dims != 0) + { + local_size_xyz.w = std::min(64, out_shape_packed.w); + local_size_xyz.h = 1; + local_size_xyz.c = 1; + } + + // pack1 + if (shape.dims == 0 || (elempack == 1 && out_elempack == 1)) + { + pipeline_flatten = new Pipeline(vkdev); + pipeline_flatten->set_optimal_local_size_xyz(local_size_xyz); + pipeline_flatten->create(LayerShaderType::flatten, opt, specializations); + } + + // pack4 + if (shape.dims == 0 || (elempack == 4 && out_elempack == 4)) + { + pipeline_flatten_pack4 = new Pipeline(vkdev); + pipeline_flatten_pack4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_flatten_pack4->create(LayerShaderType::flatten_pack4, opt, specializations); + } + + // pack1to4 + if (shape.dims == 0 || (elempack == 1 && out_elempack == 4)) + { + pipeline_flatten_pack1to4 = new Pipeline(vkdev); + pipeline_flatten_pack1to4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_flatten_pack1to4->create(LayerShaderType::flatten_pack1to4, opt, specializations); + } + + // pack8 + if ((opt.use_shader_pack8 && shape.dims == 0) || (elempack == 8 && out_elempack == 8)) + { + pipeline_flatten_pack8 = new Pipeline(vkdev); + pipeline_flatten_pack8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_flatten_pack8->create(LayerShaderType::flatten_pack8, opt, specializations); + } + + // pack1to8 + if ((opt.use_shader_pack8 && shape.dims == 0) || (elempack == 1 && out_elempack == 8)) + { + pipeline_flatten_pack1to8 = new Pipeline(vkdev); + pipeline_flatten_pack1to8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_flatten_pack1to8->create(LayerShaderType::flatten_pack1to8, opt, specializations); + } + + // pack4to8 + if ((opt.use_shader_pack8 && shape.dims == 0) || (elempack == 4 && out_elempack == 8)) + { + pipeline_flatten_pack4to8 = new Pipeline(vkdev); + pipeline_flatten_pack4to8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_flatten_pack4to8->create(LayerShaderType::flatten_pack4to8, opt, specializations); + } + + return 0; +} + + + +int Flatten_vulkan::destroy_pipeline(const Option& /*opt*/) +{ + delete pipeline_flatten; + pipeline_flatten = 0; + + delete 
pipeline_flatten_pack4; + pipeline_flatten_pack4 = 0; + + delete pipeline_flatten_pack1to4; + pipeline_flatten_pack1to4 = 0; + + delete pipeline_flatten_pack8; + pipeline_flatten_pack8 = 0; + + delete pipeline_flatten_pack1to8; + pipeline_flatten_pack1to8 = 0; + + delete pipeline_flatten_pack4to8; + pipeline_flatten_pack4to8 = 0; + + return 0; +} + +int Flatten_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const +{ + int dims = bottom_blob.dims; + + if (dims == 1) + { + top_blob = bottom_blob; + return 0; + } + + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int total = w * h * channels * elempack; + + int out_elempack = opt.use_shader_pack8 && total % 8 == 0 ? 8 : total % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (opt.use_fp16_packed && !opt.use_fp16_storage) + { + if (out_elempack == 8) out_elemsize = 8 * 2u; + if (out_elempack == 4) out_elemsize = 4 * 2u; + if (out_elempack == 1) out_elemsize = 4u; + } + + if (dims == 2 && elempack == 1 && !(opt.use_fp16_packed && !opt.use_fp16_storage && out_elempack != 1)) + { + top_blob = bottom_blob; + top_blob.dims = 1; + top_blob.w = total / out_elempack; + top_blob.h = 1; + top_blob.cstep = top_blob.w; + top_blob.elemsize = out_elemsize; + top_blob.elempack = out_elempack; + return 0; + } + + top_blob.create(total / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + std::vector bindings(2); + bindings[0] = bottom_blob; + bindings[1] = top_blob; + + std::vector constants(10); + constants[0].i = bottom_blob.dims; + constants[1].i = bottom_blob.w; + constants[2].i = bottom_blob.h; + constants[3].i = bottom_blob.c; + constants[4].i = bottom_blob.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = top_blob.cstep; + + const Pipeline* pipeline = 0; + if (elempack == 1 && out_elempack == 1) + { + pipeline = pipeline_flatten; + } + else if (elempack == 4 && out_elempack == 4) + { + pipeline = pipeline_flatten_pack4; + } + else if (elempack == 1 && out_elempack == 4) + { + pipeline = pipeline_flatten_pack1to4; + } + else if (elempack == 8 /*&& out_elempack == 8*/) + { + pipeline = pipeline_flatten_pack8; + } + else if (elempack == 1 && out_elempack == 8) + { + pipeline = pipeline_flatten_pack1to8; + } + else if (elempack == 4 && out_elempack == 8) + { + pipeline = pipeline_flatten_pack4to8; + } + + cmd.record_pipeline(pipeline, bindings, constants, top_blob); + + return 0; +} + +} // namespace TEngine \ No newline at end of file diff --git a/source/device/vulkan/layer/flatten_vulkan.hpp b/source/device/vulkan/layer/flatten_vulkan.hpp new file mode 100644 index 000000000..91de06f9f --- /dev/null +++ b/source/device/vulkan/layer/flatten_vulkan.hpp @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#ifndef LAYER_FLATTEN_HPP +#define LAYER_FLATTEN_HPP + +#include "../vulkan_layer.hpp" +#include "../vulkan_command.hpp" + +#include "flatten_param.h" + +namespace TEngine{ + +class Flatten_vulkan : public Layer +{ +public: + Flatten_vulkan(); + Flatten_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const; + +public: + Pipeline* pipeline_flatten; + Pipeline* pipeline_flatten_pack4; + Pipeline* pipeline_flatten_pack1to4; + Pipeline* pipeline_flatten_pack8; + Pipeline* pipeline_flatten_pack1to8; + Pipeline* pipeline_flatten_pack4to8; + +public: + int input_c; + int input_h; + int input_w; + int output_c; + int output_h; + int output_w; + int output_size; + +}; + +} // namespace TEngine + +#endif \ No newline at end of file diff --git a/source/device/vulkan/layer/innerproduct_vulkan.cpp b/source/device/vulkan/layer/innerproduct_vulkan.cpp new file mode 100644 index 000000000..c4ba14e99 --- /dev/null +++ b/source/device/vulkan/layer/innerproduct_vulkan.cpp @@ -0,0 +1,464 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. 
You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#include "innerproduct_vulkan.hpp" +#include "../layer_shader_type.h" + +namespace TEngine { + +InnerProduct_vulkan::InnerProduct_vulkan() +{ + support_vulkan = true; + support_image_storage = true; + + flatten = 0; + + pipeline_innerproduct = 0; + pipeline_innerproduct_pack4 = 0; + pipeline_innerproduct_pack1to4 = 0; + pipeline_innerproduct_pack4to1 = 0; + pipeline_innerproduct_pack8 = 0; + pipeline_innerproduct_pack1to8 = 0; + pipeline_innerproduct_pack4to8 = 0; + pipeline_innerproduct_pack8to4 = 0; + pipeline_innerproduct_pack8to1 = 0; +} + +InnerProduct_vulkan::InnerProduct_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) +{ + support_vulkan = true; + support_image_storage = false; + + flatten = 0; + + pipeline_innerproduct = 0; + pipeline_innerproduct_pack4 = 0; + pipeline_innerproduct_pack1to4 = 0; + pipeline_innerproduct_pack4to1 = 0; + pipeline_innerproduct_pack8 = 0; + pipeline_innerproduct_pack1to8 = 0; + pipeline_innerproduct_pack4to8 = 0; + pipeline_innerproduct_pack8to4 = 0; + pipeline_innerproduct_pack8to1 = 0; + + graph = ir_graph; + node = ir_node; + + struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[0]); + std::string name = input->name; + bottoms.push_back(name); + + struct tensor *output = get_ir_graph_tensor(graph, node->output_tensors[0]); + name = output->name; + tops.push_back(name); + + struct fc_param *param = (struct fc_param *)ir_node->op.param_mem; + + num_output = param->num_output; + input_c = input->dims[1]; // param->input_channel; + input_h = input->dims[2]; + input_w = input->dims[3]; + output_c = output->dims[1]; // param->output_channel; + output_h = output->dims[2]; + output_w = output->dims[3]; + + struct tensor *weight = get_ir_graph_tensor(graph, node->input_tensors[1]); + weight_data_size = weight->elem_num; + + activation_type = -1; + +} + +int InnerProduct_vulkan::create_pipeline(const Option& _opt) +{ + Option opt = _opt; + const Tensor& shape = Tensor(input_w, input_h, input_c, (void*)0); // bottom_shapes.empty() ? Tensor() : bottom_shapes[0]; + const Tensor& out_shape = Tensor(output_w, output_h, output_c, (void*)0); // top_shapes.empty() ? Tensor() : top_shapes[0]; + + Tensor shape_flatten; + if (shape.dims != 0) + { + shape_flatten = Tensor(shape.w * shape.h * shape.c, (void*)0); + } + + int num_input = weight_data_size / num_output; + + int elempack = opt.use_shader_pack8 && num_input % 8 == 0 ? 8 : num_input % 4 == 0 ? 4 : 1; + int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; + + size_t elemsize; + size_t out_elemsize; + if (opt.use_fp16_storage) + { + elemsize = elempack * 2u; + out_elemsize = out_elempack * 2u; + } + else if (opt.use_fp16_packed) + { + elemsize = elempack == 1 ? 4u : elempack * 2u; + out_elemsize = out_elempack == 1 ? 
4u : out_elempack * 2u; + } + else + { + elemsize = elempack * 4u; + out_elemsize = out_elempack * 4u; + } + + Tensor shape_flatten_packed; + if (shape_flatten.dims == 1) shape_flatten_packed = Tensor(shape_flatten.w / elempack, (void*)0, elemsize, elempack); + + Tensor out_shape_packed; + if (out_shape.dims == 1) out_shape_packed = Tensor(out_shape.w / out_elempack, (void*)0, out_elemsize, out_elempack); + + { + support_image_storage = false; + opt.use_image_storage = false; + } + + { + flatten = new Flatten_vulkan(); + flatten->vkdev = vkdev; + + flatten->input_w = shape.w; + flatten->input_h = shape.h; + flatten->input_c = shape.c; + flatten->output_w = shape_flatten.w; + flatten->output_h = shape_flatten.h; + flatten->output_c = shape_flatten.c; + flatten->output_size = shape_flatten.w*shape_flatten.h*shape_flatten.c; + + flatten->create_pipeline(opt); + } + + + std::vector specializations(4 + 10); + specializations[0].i = bias_term; + specializations[1].i = activation_type; + specializations[2].f = 0.f; // activation_params.w >= 1 ? activation_params[0] : 0.f; + specializations[3].f = 0.f; // activation_params.w == 2 ? activation_params[1] : 0.f; + specializations[4 + 0].i = 0; // shape_flatten_packed.dims; + specializations[4 + 1].i = 0; // shape_flatten_packed.w; + specializations[4 + 2].i = 0; // shape_flatten_packed.h; + specializations[4 + 3].i = 0; // shape_flatten_packed.c; + specializations[4 + 4].i = 0; // shape_flatten_packed.cstep; + specializations[4 + 5].i = 0; // out_shape_packed.dims; + specializations[4 + 6].i = 0; // out_shape_packed.w; + specializations[4 + 7].i = 0; // out_shape_packed.h; + specializations[4 + 8].i = 0; // out_shape_packed.c; + specializations[4 + 9].i = 0; // out_shape_packed.cstep; + + Tensor local_size_xyz(std::min(64, num_output / out_elempack), 1, 1, (void*)0); + if (out_shape_packed.dims != 0) + { + local_size_xyz.w = std::min(64, out_shape_packed.w); + local_size_xyz.h = 1; + local_size_xyz.c = 1; + } + + // pack1 + if (elempack == 1 && out_elempack == 1) + { + pipeline_innerproduct = new Pipeline(vkdev); + pipeline_innerproduct->set_optimal_local_size_xyz(local_size_xyz); + pipeline_innerproduct->create(LayerShaderType::innerproduct, opt, specializations); + } + + // pack4 + if (elempack == 4 && out_elempack == 4) + { + pipeline_innerproduct_pack4 = new Pipeline(vkdev); + pipeline_innerproduct_pack4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_innerproduct_pack4->create(LayerShaderType::innerproduct_pack4, opt, specializations); + } + + // pack1to4 + if (elempack == 1 && out_elempack == 4) + { + pipeline_innerproduct_pack1to4 = new Pipeline(vkdev); + pipeline_innerproduct_pack1to4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_innerproduct_pack1to4->create(LayerShaderType::innerproduct_pack1to4, opt, specializations); + } + + // pack4to1 + if (elempack == 4 && out_elempack == 1) + { + pipeline_innerproduct_pack4to1 = new Pipeline(vkdev); + pipeline_innerproduct_pack4to1->set_optimal_local_size_xyz(local_size_xyz); + pipeline_innerproduct_pack4to1->create(LayerShaderType::innerproduct_pack4to1, opt, specializations); + } + + // pack8 + if (elempack == 8 && out_elempack == 8) + { + pipeline_innerproduct_pack8 = new Pipeline(vkdev); + pipeline_innerproduct_pack8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_innerproduct_pack8->create(LayerShaderType::innerproduct_pack8, opt, specializations); + } + + // pack1to8 + if (elempack == 1 && out_elempack == 8) + { + pipeline_innerproduct_pack1to8 = new Pipeline(vkdev); + 
pipeline_innerproduct_pack1to8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_innerproduct_pack1to8->create(LayerShaderType::innerproduct_pack1to8, opt, specializations); + } + + // pack4to8 + if (elempack == 4 && out_elempack == 8) + { + pipeline_innerproduct_pack4to8 = new Pipeline(vkdev); + pipeline_innerproduct_pack4to8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_innerproduct_pack4to8->create(LayerShaderType::innerproduct_pack4to8, opt, specializations); + } + + // pack8to4 + if (elempack == 8 && out_elempack == 4) + { + pipeline_innerproduct_pack8to4 = new Pipeline(vkdev); + pipeline_innerproduct_pack8to4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_innerproduct_pack8to4->create(LayerShaderType::innerproduct_pack8to4, opt, specializations); + } + + // pack8to1 + if (elempack == 8 && out_elempack == 1) + { + pipeline_innerproduct_pack8to1 = new Pipeline(vkdev); + pipeline_innerproduct_pack8to1->set_optimal_local_size_xyz(local_size_xyz); + pipeline_innerproduct_pack8to1->create(LayerShaderType::innerproduct_pack8to1, opt, specializations); + } + + return 0; +} + +int InnerProduct_vulkan::destroy_pipeline(const Option& opt) +{ + if (flatten) + { + flatten->destroy_pipeline(opt); + delete flatten; + flatten = 0; + } + + delete pipeline_innerproduct; + pipeline_innerproduct = 0; + + delete pipeline_innerproduct_pack4; + pipeline_innerproduct_pack4 = 0; + + delete pipeline_innerproduct_pack1to4; + pipeline_innerproduct_pack1to4 = 0; + + delete pipeline_innerproduct_pack4to1; + pipeline_innerproduct_pack4to1 = 0; + + delete pipeline_innerproduct_pack8; + pipeline_innerproduct_pack8 = 0; + + delete pipeline_innerproduct_pack1to8; + pipeline_innerproduct_pack1to8 = 0; + + delete pipeline_innerproduct_pack4to8; + pipeline_innerproduct_pack4to8 = 0; + + delete pipeline_innerproduct_pack8to4; + pipeline_innerproduct_pack8to4 = 0; + + delete pipeline_innerproduct_pack8to1; + pipeline_innerproduct_pack8to1 = 0; + + return 0; +} + +int InnerProduct_vulkan::upload_model(VkTransfer& cmd, const Option& opt) +{ + int num_input = weight_data_size / num_output; + + int elempack = opt.use_shader_pack8 && num_input % 8 == 0 ? 8 : num_input % 4 == 0 ? 4 : 1; + int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 
4 : 1; + + // src = inch-outch + // dst = pa-pb-inch/pa-outch/pb + tensor* weight_tensor = get_ir_graph_tensor(graph, node->input_tensors[1]); + Tensor weight_data = Tensor(weight_tensor->elem_num, weight_tensor->data); + Tensor weight_data_packed; + { + Tensor weight_data_r2 = weight_data.reshape(num_input, num_output); + + weight_data_packed.create(num_input / elempack, num_output / out_elempack, (size_t)4 * elempack * out_elempack, elempack * out_elempack); + + for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) + { + float* g00 = weight_data_packed.row(q / out_elempack); + + for (int p = 0; p + (elempack - 1) < num_input; p += elempack) + { + for (int i = 0; i < out_elempack; i++) + { + const float* k0 = weight_data_r2.row(q + i); + k0 += p; + + for (int j = 0; j < elempack; j++) + { + g00[0] = k0[j]; + + g00++; + } + } + } + } + } + + if (support_image_storage && opt.use_image_storage) + { + // cmd.record_upload(weight_data_packed, weight_data_gpu_image, opt); + } + else + { + cmd.record_upload(weight_data_packed, weight_data_gpu, opt); + } + + if (bias_term) + { + tensor* bias_tensor = get_ir_graph_tensor(graph, node->input_tensors[2]); + Tensor bias_data = Tensor(bias_tensor->elem_num, bias_tensor->data); + Tensor bias_data_packed; + convert_packing(bias_data, bias_data_packed, out_elempack); + + if (support_image_storage && opt.use_image_storage) + { + // cmd.record_upload(bias_data_packed, bias_data_gpu_image, opt); + } + else + { + cmd.record_upload(bias_data_packed, bias_data_gpu, opt); + } + } + return 0; +} + +int InnerProduct_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const +{ + // flatten + VkTensor bottom_blob_flattened = bottom_blob; + { + Option opt_flatten = opt; + opt_flatten.blob_vkallocator = opt.workspace_vkallocator; + + flatten->record_pipeline(bottom_blob, bottom_blob_flattened, cmd, opt_flatten); + } + + size_t elemsize = bottom_blob_flattened.elemsize; + int elempack = bottom_blob_flattened.elempack; + + int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 
4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (opt.use_fp16_packed && !opt.use_fp16_storage) + { + if (out_elempack == 8) out_elemsize = 8 * 2u; + if (out_elempack == 4) out_elemsize = 4 * 2u; + if (out_elempack == 1) out_elemsize = 4u; + } + + top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + std::vector bindings(4); + bindings[0] = bottom_blob_flattened; + bindings[1] = top_blob; + bindings[2] = weight_data_gpu; + bindings[3] = bias_data_gpu; + + std::vector constants(10); + constants[0].i = bottom_blob_flattened.dims; + constants[1].i = bottom_blob_flattened.w; + constants[2].i = bottom_blob_flattened.h; + constants[3].i = bottom_blob_flattened.c; + constants[4].i = bottom_blob_flattened.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = top_blob.cstep; + + const Pipeline* pipeline = 0; + if (elempack == 1 && out_elempack == 1) + { + pipeline = pipeline_innerproduct; + } + else if (elempack == 4 && out_elempack == 4) + { + pipeline = pipeline_innerproduct_pack4; + } + else if (elempack == 1 && out_elempack == 4) + { + pipeline = pipeline_innerproduct_pack1to4; + } + else if (elempack == 4 && out_elempack == 1) + { + pipeline = pipeline_innerproduct_pack4to1; + } + else if (elempack == 8 && out_elempack == 8) + { + pipeline = pipeline_innerproduct_pack8; + } + else if (elempack == 1 && out_elempack == 8) + { + pipeline = pipeline_innerproduct_pack1to8; + } + else if (elempack == 4 && out_elempack == 8) + { + pipeline = pipeline_innerproduct_pack4to8; + } + else if (elempack == 8 && out_elempack == 4) + { + pipeline = pipeline_innerproduct_pack8to4; + } + else if (elempack == 8 && out_elempack == 1) + { + pipeline = pipeline_innerproduct_pack8to1; + } + + cmd.record_pipeline(pipeline, bindings, constants, top_blob); + + return 0; +} + +} // namespace TEngine \ No newline at end of file diff --git a/source/device/vulkan/layer/innerproduct_vulkan.hpp b/source/device/vulkan/layer/innerproduct_vulkan.hpp new file mode 100644 index 000000000..c682bcb46 --- /dev/null +++ b/source/device/vulkan/layer/innerproduct_vulkan.hpp @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. 
You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#ifndef LAYER_INNERPRODUCT_HPP +#define LAYER_INNERPRODUCT_HPP + +#include "../vulkan_layer.hpp" +#include "../vulkan_command.hpp" +#include "padding_vulkan.hpp" +#include "flatten_vulkan.hpp" + +#include "fc_param.h" + +namespace TEngine { + +class InnerProduct_vulkan : public Layer +{ +public: + InnerProduct_vulkan(); + InnerProduct_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + virtual int upload_model(VkTransfer& cmd, const Option& opt); + + virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const; + +public: + Flatten_vulkan* flatten; + + VkTensor weight_data_gpu; + VkTensor bias_data_gpu; + + Pipeline* pipeline_innerproduct; + Pipeline* pipeline_innerproduct_pack4; + Pipeline* pipeline_innerproduct_pack1to4; + Pipeline* pipeline_innerproduct_pack4to1; + Pipeline* pipeline_innerproduct_pack8; + Pipeline* pipeline_innerproduct_pack1to8; + Pipeline* pipeline_innerproduct_pack4to8; + Pipeline* pipeline_innerproduct_pack8to4; + Pipeline* pipeline_innerproduct_pack8to1; + +public: + // param + int num_output; + int bias_term; + + int weight_data_size; + + int int8_scale_term; + + // 0=none 1=relu 2=leakyrelu 3=clip 4=sigmoid + int activation_type; + Tensor activation_params; + + int input_c; + int input_h; + int input_w; + int output_c; + int output_h; + int output_w; +}; + +} // namespace TEngine + +#endif // LAYER_INNERPRODUCT_VULKAN_H \ No newline at end of file diff --git a/source/device/vulkan/layer/interp_vulkan.cpp b/source/device/vulkan/layer/interp_vulkan.cpp new file mode 100644 index 000000000..586846b72 --- /dev/null +++ b/source/device/vulkan/layer/interp_vulkan.cpp @@ -0,0 +1,464 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. 
You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#include "interp_vulkan.hpp" +#include "../layer_shader_type.h" + +namespace TEngine { + +Interp_vulkan::Interp_vulkan() +{ + support_vulkan = true; + support_image_storage = false; + + pipeline_interp = 0; + pipeline_interp_pack4 = 0; + pipeline_interp_pack8 = 0; + + pipeline_interp_bicubic_coeffs_x = 0; + pipeline_interp_bicubic_coeffs_y = 0; + pipeline_interp_bicubic = 0; + pipeline_interp_bicubic_pack4 = 0; + pipeline_interp_bicubic_pack8 = 0; +} + +Interp_vulkan::Interp_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) +{ + support_vulkan = true; + support_image_storage = false; + + pipeline_interp = 0; + pipeline_interp_pack4 = 0; + pipeline_interp_pack8 = 0; + + pipeline_interp_bicubic_coeffs_x = 0; + pipeline_interp_bicubic_coeffs_y = 0; + pipeline_interp_bicubic = 0; + pipeline_interp_bicubic_pack4 = 0; + pipeline_interp_bicubic_pack8 = 0; + + graph = ir_graph; + node = ir_node; + + struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[0]); + std::string name = input->name; + bottoms.push_back(name); + + struct tensor *output = get_ir_graph_tensor(graph, node->output_tensors[0]); + name = output->name; + tops.push_back(name); + + // params + input_c = input->dims[1]; // param->input_channel; + input_h = input->dims[2]; + input_w = input->dims[3]; + output_c = output->dims[1]; // param->output_channel; + output_h = output->dims[2]; + output_w = output->dims[3]; + + struct interp_param *param = (struct interp_param *)ir_node->op.param_mem; + + if (param->height_scale != 0 && param->width_scale != 0) + { + output_height = input_h * param->height_scale; + output_width = input_w * param->width_scale; + } + else + { + height_scale = (float )output->dims[2] / (float )input_h; + width_scale = (float )output->dims[2] / (float )input_w; + } + resize_type = 2;//param->resize_type; +} + +int Interp_vulkan::create_pipeline(const Option& _opt) +{ + Option opt = _opt; + const Tensor& shape = Tensor(input_w, input_h, input_c, (void*)0); // bottom_shapes.empty() ? Mat() : bottom_shapes[0]; + const Tensor& out_shape = Tensor(output_w, output_h, output_c, (void*)0); // top_shapes.empty() ? Mat() : top_shapes[0]; + + int elempack = 1; + if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1; + if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1; + if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1; + + int out_elempack = 1; + if (out_shape.dims == 1) out_elempack = opt.use_shader_pack8 && out_shape.w % 8 == 0 ? 8 : out_shape.w % 4 == 0 ? 4 : 1; + if (out_shape.dims == 2) out_elempack = opt.use_shader_pack8 && out_shape.h % 8 == 0 ? 8 : out_shape.h % 4 == 0 ? 4 : 1; + if (out_shape.dims == 3) out_elempack = opt.use_shader_pack8 && out_shape.c % 8 == 0 ? 8 : out_shape.c % 4 == 0 ? 4 : 1; + + size_t elemsize; + size_t out_elemsize; + if (opt.use_fp16_storage) + { + elemsize = elempack * 2u; + out_elemsize = out_elempack * 2u; + } + else if (opt.use_fp16_packed) + { + elemsize = elempack == 1 ? 4u : elempack * 2u; + out_elemsize = out_elempack == 1 ? 
4u : out_elempack * 2u; + } + else + { + elemsize = elempack * 4u; + out_elemsize = out_elempack * 4u; + } + + Tensor shape_packed; + if (shape.dims == 1) shape_packed = Tensor(shape.w / elempack, (void*)0, elemsize, elempack); + if (shape.dims == 2) shape_packed = Tensor(shape.w, shape.h / elempack, (void*)0, elemsize, elempack); + if (shape.dims == 3) shape_packed = Tensor(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack); + + Tensor out_shape_packed; + if (out_shape.dims == 1) out_shape_packed = Tensor(out_shape.w / out_elempack, (void*)0, out_elemsize, out_elempack); + if (out_shape.dims == 2) out_shape_packed = Tensor(out_shape.w, out_shape.h / out_elempack, (void*)0, out_elemsize, out_elempack); + if (out_shape.dims == 3) out_shape_packed = Tensor(out_shape.w, out_shape.h, out_shape.c / out_elempack, (void*)0, out_elemsize, out_elempack); + + // check blob shape + // if (!vkdev->shape_support_image_storage(shape_packed) || !vkdev->shape_support_image_storage(out_shape_packed)) + { + support_image_storage = false; + opt.use_image_storage = false; + } + + if (resize_type == 1 || resize_type == 2) + { + std::vector specializations(1 + 10); + specializations[0].i = resize_type; + specializations[1 + 0].i = 0; // shape_packed.dims; + specializations[1 + 1].i = 0; // shape_packed.w; + specializations[1 + 2].i = 0; // shape_packed.h; + specializations[1 + 3].i = 0; // shape_packed.c; + specializations[1 + 4].i = 0; // shape_packed.cstep; + specializations[1 + 5].i = 0; // out_shape_packed.dims; + specializations[1 + 6].i = 0; // out_shape_packed.w; + specializations[1 + 7].i = 0; // out_shape_packed.h; + specializations[1 + 8].i = 0; // out_shape_packed.c; + specializations[1 + 9].i = 0; // out_shape_packed.cstep; + + Tensor local_size_xyz; + if (out_shape_packed.dims == 2) + { + local_size_xyz.w = std::min(8, out_shape_packed.w); + local_size_xyz.h = std::min(8, out_shape_packed.h); + local_size_xyz.c = 1; + } + if (out_shape_packed.dims == 3) + { + local_size_xyz.w = std::min(4, out_shape_packed.w); + local_size_xyz.h = std::min(4, out_shape_packed.h); + local_size_xyz.c = std::min(4, out_shape_packed.c); + } + + // pack1 + if (shape.dims == 0 || elempack == 1) + { + pipeline_interp = new Pipeline(vkdev); + pipeline_interp->set_optimal_local_size_xyz(local_size_xyz); + pipeline_interp->create(LayerShaderType::interp, opt, specializations); + } + + // pack4 + if (shape.dims == 0 || elempack == 4) + { + pipeline_interp_pack4 = new Pipeline(vkdev); + pipeline_interp_pack4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_interp_pack4->create(LayerShaderType::interp_pack4, opt, specializations); + } + + // pack8 + if ((opt.use_shader_pack8 && shape.dims == 0) || elempack == 8) + { + pipeline_interp_pack8 = new Pipeline(vkdev); + pipeline_interp_pack8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_interp_pack8->create(LayerShaderType::interp_pack8, opt, specializations); + } + } + + if (resize_type == 3) + { + { + std::vector specializations(0 + 2); + specializations[0 + 0].i = shape_packed.w; + specializations[0 + 1].i = out_shape_packed.w; + + Tensor local_size_xyz(64, 1, 1, (void*)0); + if (out_shape_packed.dims != 0) + { + local_size_xyz.w = std::min(64, out_shape_packed.w); + local_size_xyz.h = 1; + local_size_xyz.c = 1; + } + + pipeline_interp_bicubic_coeffs_x = new Pipeline(vkdev); + pipeline_interp_bicubic_coeffs_x->set_optimal_local_size_xyz(local_size_xyz); + pipeline_interp_bicubic_coeffs_x->create(LayerShaderType::interp_bicubic_coeffs, opt, 
specializations); + } + { + std::vector specializations(0 + 2); + specializations[0 + 0].i = shape_packed.h; + specializations[0 + 1].i = out_shape_packed.h; + + Tensor local_size_xyz(64, 1, 1, (void*)0); + if (out_shape_packed.dims != 0) + { + local_size_xyz.w = std::min(64, out_shape_packed.h); + local_size_xyz.h = 1; + local_size_xyz.c = 1; + } + + pipeline_interp_bicubic_coeffs_y = new Pipeline(vkdev); + pipeline_interp_bicubic_coeffs_y->set_optimal_local_size_xyz(local_size_xyz); + pipeline_interp_bicubic_coeffs_y->create(LayerShaderType::interp_bicubic_coeffs, opt, specializations); + } + + std::vector specializations(0 + 10); + specializations[0 + 0].i = 0; // shape_packed.dims; + specializations[0 + 1].i = 0; // shape_packed.w; + specializations[0 + 2].i = 0; // shape_packed.h; + specializations[0 + 3].i = 0; // shape_packed.c; + specializations[0 + 4].i = 0; // shape_packed.cstep; + specializations[0 + 5].i = 0; // out_shape_packed.dims; + specializations[0 + 6].i = 0; // out_shape_packed.w; + specializations[0 + 7].i = 0; // out_shape_packed.h; + specializations[0 + 8].i = 0; // out_shape_packed.c; + specializations[0 + 9].i = 0; // out_shape_packed.cstep; + + Tensor local_size_xyz; + if (out_shape_packed.dims == 2) + { + local_size_xyz.w = std::min(8, out_shape_packed.w); + local_size_xyz.h = std::min(8, out_shape_packed.h); + local_size_xyz.c = 1; + } + if (out_shape_packed.dims == 3) + { + local_size_xyz.w = std::min(4, out_shape_packed.w); + local_size_xyz.h = std::min(4, out_shape_packed.h); + local_size_xyz.c = std::min(4, out_shape_packed.c); + } + + // pack1 + if (shape.dims == 0 || elempack == 1) + { + pipeline_interp_bicubic = new Pipeline(vkdev); + pipeline_interp_bicubic->set_optimal_local_size_xyz(local_size_xyz); + pipeline_interp_bicubic->create(LayerShaderType::interp_bicubic, opt, specializations); + } + + // pack4 + if (shape.dims == 0 || elempack == 4) + { + pipeline_interp_bicubic_pack4 = new Pipeline(vkdev); + pipeline_interp_bicubic_pack4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_interp_bicubic_pack4->create(LayerShaderType::interp_bicubic_pack4, opt, specializations); + } + + // pack8 + if ((opt.use_shader_pack8 && shape.dims == 0) || elempack == 8) + { + pipeline_interp_bicubic_pack8 = new Pipeline(vkdev); + pipeline_interp_bicubic_pack8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_interp_bicubic_pack8->create(LayerShaderType::interp_bicubic_pack8, opt, specializations); + } + } + + return 0; +} + +int Interp_vulkan::destroy_pipeline(const Option& /*opt*/) +{ + delete pipeline_interp; + pipeline_interp = 0; + + delete pipeline_interp_pack4; + pipeline_interp_pack4 = 0; + + delete pipeline_interp_pack8; + pipeline_interp_pack8 = 0; + + delete pipeline_interp_bicubic_coeffs_x; + pipeline_interp_bicubic_coeffs_x = 0; + + delete pipeline_interp_bicubic_coeffs_y; + pipeline_interp_bicubic_coeffs_y = 0; + + delete pipeline_interp_bicubic; + pipeline_interp_bicubic = 0; + + delete pipeline_interp_bicubic_pack4; + pipeline_interp_bicubic_pack4 = 0; + + delete pipeline_interp_bicubic_pack8; + pipeline_interp_bicubic_pack8 = 0; + + return 0; +} + +int Interp_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = output_width; + int outh = output_height; + if (outw == 0 || outh == 0) + { + outw = w * 
width_scale; + outh = h * height_scale; + } + + if (outh == h && outw == w) + { + top_blob = bottom_blob; + return 0; + } + + top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + if (resize_type == 1 || resize_type == 2) // nearest or bilinear + { + std::vector bindings(2); + bindings[0] = bottom_blob; + bindings[1] = top_blob; + + std::vector constants(12); + constants[0].i = bottom_blob.dims; + constants[1].i = bottom_blob.w; + constants[2].i = bottom_blob.h; + constants[3].i = bottom_blob.c; + constants[4].i = bottom_blob.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = top_blob.cstep; + constants[10].f = w / (float)outw; + constants[11].f = h / (float)outh; + + const Pipeline* pipeline = elempack == 8 ? pipeline_interp_pack8 + : elempack == 4 ? pipeline_interp_pack4 + : pipeline_interp; + + cmd.record_pipeline(pipeline, bindings, constants, top_blob); + } + + else if (resize_type == 3) // bicubic + { + VkTensor alpha(outw, (size_t)(elemsize / elempack * 4), 4, opt.workspace_vkallocator); + if (alpha.empty()) + return -100; + + VkTensor xofs(outw, (size_t)4u, 1, opt.workspace_vkallocator); + if (xofs.empty()) + return -100; + + { + std::vector bindings(2); + bindings[0] = alpha; + bindings[1] = xofs; + + std::vector constants(3); + constants[0].i = bottom_blob.w; + constants[1].i = outw; + constants[2].f = (float)bottom_blob.w / outw; + + // record + cmd.record_pipeline(pipeline_interp_bicubic_coeffs_x, bindings, constants, alpha); + } + + VkTensor beta(outh, (size_t)(elemsize / elempack * 4), 4, opt.workspace_vkallocator); + if (beta.empty()) + return -100; + + VkTensor yofs(outh, (size_t)4u, 1, opt.workspace_vkallocator); + if (yofs.empty()) + return -100; + + { + std::vector bindings(2); + bindings[0] = beta; + bindings[1] = yofs; + + std::vector constants(3); + constants[0].i = bottom_blob.h; + constants[1].i = outh; + constants[2].f = (float)bottom_blob.h / outh; + + // record + cmd.record_pipeline(pipeline_interp_bicubic_coeffs_y, bindings, constants, beta); + } + + std::vector bindings(6); + bindings[0] = bottom_blob; + bindings[1] = top_blob; + bindings[2] = alpha; + bindings[3] = xofs; + bindings[4] = beta; + bindings[5] = yofs; + + std::vector constants(10); + constants[0].i = bottom_blob.dims; + constants[1].i = bottom_blob.w; + constants[2].i = bottom_blob.h; + constants[3].i = bottom_blob.c; + constants[4].i = bottom_blob.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = top_blob.cstep; + + const Pipeline* pipeline = elempack == 8 ? pipeline_interp_bicubic_pack8 + : elempack == 4 ? pipeline_interp_bicubic_pack4 + : pipeline_interp_bicubic; + + cmd.record_pipeline(pipeline, bindings, constants, top_blob); + } + + return 0; +} + +} // TEngine \ No newline at end of file diff --git a/source/device/vulkan/layer/interp_vulkan.hpp b/source/device/vulkan/layer/interp_vulkan.hpp new file mode 100644 index 000000000..ef3886f45 --- /dev/null +++ b/source/device/vulkan/layer/interp_vulkan.hpp @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#ifndef LAYER_INTERP_HPP +#define LAYER_INTERP_HPP + +#include "../vulkan_layer.hpp" +#include "../vulkan_command.hpp" + +#include "interp_param.h" + +namespace TEngine{ + +class Interp_vulkan : public Layer +{ +public: + Interp_vulkan(); + Interp_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + // virtual int upload_model(VkTransfer& cmd, const Option& opt); + + virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const; + +public: + Pipeline* pipeline_interp; + Pipeline* pipeline_interp_pack4; + Pipeline* pipeline_interp_pack8; + + Pipeline* pipeline_interp_bicubic_coeffs_x; + Pipeline* pipeline_interp_bicubic_coeffs_y; + Pipeline* pipeline_interp_bicubic; + Pipeline* pipeline_interp_bicubic_pack4; + Pipeline* pipeline_interp_bicubic_pack8; + +public: + int input_c; + int input_h; + int input_w; + int output_c; + int output_h; + int output_w; + + int resize_type; //1=nearest 2=bilinear 3=bicubic + int output_height; + int output_width; + float height_scale; + float width_scale; + + +}; + +} // namespace TEngine + +#endif \ No newline at end of file diff --git a/source/device/vulkan/layer/packing_vulkan.cpp b/source/device/vulkan/layer/packing_vulkan.cpp new file mode 100644 index 000000000..86a6c9538 --- /dev/null +++ b/source/device/vulkan/layer/packing_vulkan.cpp @@ -0,0 +1,495 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#include "packing_vulkan.hpp" +#include "../layer_shader_type.h" + +namespace TEngine { + +Packing_vulkan::Packing_vulkan() +{ + support_vulkan = true; + // support_image_storage = true; + + pipeline_packing = 0; + pipeline_packing_pack4 = 0; + pipeline_packing_pack8 = 0; + pipeline_packing_pack1to4 = 0; + pipeline_packing_pack4to1 = 0; + pipeline_packing_pack1to8 = 0; + pipeline_packing_pack4to8 = 0; + pipeline_packing_pack8to4 = 0; + pipeline_packing_pack8to1 = 0; +} + +int Packing_vulkan::create_pipeline(const Option& _opt) +{ + + + Option opt = _opt; + // const Mat& shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0]; + // const Mat& out_shape = top_shapes.empty() ? Mat() : top_shapes[0]; + + size_t out_elemsize; + if (opt.use_fp16_storage) + { + out_elemsize = out_elempack * 2u; + } + else if (opt.use_fp16_packed) + { + out_elemsize = out_elempack == 1 ? 4u : out_elempack * 2u; + } + else + { + out_elemsize = out_elempack * 4u; + } + + // type casting override + if (cast_type_to == 1) + { + out_elemsize = out_elempack * 4u; + } + + // Mat out_shape_packed; + // if (out_shape.dims == 1) out_shape_packed = Mat(out_shape.w / out_elempack, (void*)0, out_elemsize, out_elempack); + // if (out_shape.dims == 2) out_shape_packed = Mat(out_shape.w, out_shape.h / out_elempack, (void*)0, out_elemsize, out_elempack); + // if (out_shape.dims == 3) out_shape_packed = Mat(out_shape.w, out_shape.h, out_shape.c / out_elempack, (void*)0, out_elemsize, out_elempack); + + + // check blob shape + // if (!vkdev->shape_support_image_storage(out_shape_packed)) + { + // support_image_storage = false; + opt.use_image_storage = false; + } + + std::vector specializations(2 + 10); + specializations[0].i = storage_type_from; + specializations[1].i = storage_type_to; + specializations[2 + 0].i = 0;// FIXME shape elempack may be dynamic + specializations[2 + 1].i = 0; + specializations[2 + 2].i = 0; + specializations[2 + 3].i = 0; + specializations[2 + 4].i = 0; + specializations[2 + 5].i = 0; //out_shape_packed_dims; + specializations[2 + 6].i = 0; //out_shape_packed_w; + specializations[2 + 7].i = 0; //out_shape_packed_h; + specializations[2 + 8].i = 0; //out_shape_packed_c; + specializations[2 + 9].i = 0; //out_shape_packed_cstep; + + + // printf("out shape dims:%d ---------------------------------\n", out_shape_packed_dims); + + VkTensor local_size_xyz;// TODO more precise group size guessed from out_shape_packed + if (out_shape_packed_dims == 1) + { + local_size_xyz.w = 64; + local_size_xyz.h = 1; + local_size_xyz.c = 1; + } + if (out_shape_packed_dims == 2) + { + local_size_xyz.w = 8; + local_size_xyz.h = 8; + local_size_xyz.c = 1; + } + if (out_shape_packed_dims == 3) + { + local_size_xyz.w = 4; + local_size_xyz.h = 4; + local_size_xyz.c = 4; + } + + if (out_elempack == 8) + { + pipeline_packing_pack8 = new Pipeline(vkdev); + pipeline_packing_pack8->set_optimal_local_size_xyz(local_size_xyz); + 
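+ // the pack1to8 / pack4to8 pipelines below reuse the same optimal local work-group size;
+ // the shader variant (plain, fp32_to_fp16 or fp16_to_fp32) is selected from cast_type_from / cast_type_to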
+ pipeline_packing_pack1to8 = new Pipeline(vkdev); + pipeline_packing_pack1to8->set_optimal_local_size_xyz(local_size_xyz); + + pipeline_packing_pack4to8 = new Pipeline(vkdev); + pipeline_packing_pack4to8->set_optimal_local_size_xyz(local_size_xyz); + + if (cast_type_from == cast_type_to) + { + pipeline_packing_pack8->create(LayerShaderType::packing_pack8, opt, specializations); + pipeline_packing_pack1to8->create(LayerShaderType::packing_pack1to8, opt, specializations); + pipeline_packing_pack4to8->create(LayerShaderType::packing_pack4to8, opt, specializations); + } + else if (cast_type_from == 1) + { + pipeline_packing_pack8->create(LayerShaderType::packing_pack8_fp32_to_fp16, opt, specializations); + pipeline_packing_pack1to8->create(LayerShaderType::packing_pack1to8_fp32_to_fp16, opt, specializations); + pipeline_packing_pack4to8->create(LayerShaderType::packing_pack4to8_fp32_to_fp16, opt, specializations); + } + else if (cast_type_to == 1) + { + pipeline_packing_pack8->create(LayerShaderType::packing_pack8_fp16_to_fp32, opt, specializations); + pipeline_packing_pack1to8->create(LayerShaderType::packing_pack1to8_fp16_to_fp32, opt, specializations); + pipeline_packing_pack4to8->create(LayerShaderType::packing_pack4to8_fp16_to_fp32, opt, specializations); + } + } + + if (out_elempack == 4) + { + pipeline_packing_pack4 = new Pipeline(vkdev); + pipeline_packing_pack4->set_optimal_local_size_xyz(local_size_xyz); + + pipeline_packing_pack1to4 = new Pipeline(vkdev); + pipeline_packing_pack1to4->set_optimal_local_size_xyz(local_size_xyz); + + pipeline_packing_pack8to4 = new Pipeline(vkdev); + pipeline_packing_pack8to4->set_optimal_local_size_xyz(local_size_xyz); + + if (cast_type_from == cast_type_to) + { + pipeline_packing_pack4->create(LayerShaderType::packing_pack4, opt, specializations); + pipeline_packing_pack1to4->create(LayerShaderType::packing_pack1to4, opt, specializations); + pipeline_packing_pack8to4->create(LayerShaderType::packing_pack8to4, opt, specializations); + } + else if (cast_type_from == 1) + { + pipeline_packing_pack4->create(LayerShaderType::packing_pack4_fp32_to_fp16, opt, specializations); + pipeline_packing_pack1to4->create(LayerShaderType::packing_pack1to4_fp32_to_fp16, opt, specializations); + pipeline_packing_pack8to4->create(LayerShaderType::packing_pack8to4_fp32_to_fp16, opt, specializations); + } + else if (cast_type_to == 1) + { + pipeline_packing_pack4->create(LayerShaderType::packing_pack4_fp16_to_fp32, opt, specializations); + pipeline_packing_pack1to4->create(LayerShaderType::packing_pack1to4_fp16_to_fp32, opt, specializations); + pipeline_packing_pack8to4->create(LayerShaderType::packing_pack8to4_fp16_to_fp32, opt, specializations); + } + } + + if (out_elempack == 1) + { + pipeline_packing = new Pipeline(vkdev); + pipeline_packing->set_optimal_local_size_xyz(local_size_xyz); + + pipeline_packing_pack4to1 = new Pipeline(vkdev); + pipeline_packing_pack4to1->set_optimal_local_size_xyz(local_size_xyz); + + pipeline_packing_pack8to1 = new Pipeline(vkdev); + pipeline_packing_pack8to1->set_optimal_local_size_xyz(local_size_xyz); + + if (cast_type_from == cast_type_to) + { + pipeline_packing->create(LayerShaderType::packing, opt, specializations); + pipeline_packing_pack4to1->create(LayerShaderType::packing_pack4to1, opt, specializations); + pipeline_packing_pack8to1->create(LayerShaderType::packing_pack8to1, opt, specializations); + } + else if (cast_type_from == 1) + { + pipeline_packing->create(LayerShaderType::packing_fp32_to_fp16, opt, specializations); + 
pipeline_packing_pack4to1->create(LayerShaderType::packing_pack4to1_fp32_to_fp16, opt, specializations); + pipeline_packing_pack8to1->create(LayerShaderType::packing_pack8to1_fp32_to_fp16, opt, specializations); + } + else if (cast_type_to == 1) + { + pipeline_packing->create(LayerShaderType::packing_fp16_to_fp32, opt, specializations); + pipeline_packing_pack4to1->create(LayerShaderType::packing_pack4to1_fp16_to_fp32, opt, specializations); + pipeline_packing_pack8to1->create(LayerShaderType::packing_pack8to1_fp16_to_fp32, opt, specializations); + } + } + + return 0; +} + +int Packing_vulkan::destroy_pipeline(const Option& /*opt*/) +{ + delete pipeline_packing; + pipeline_packing = 0; + + delete pipeline_packing_pack4; + pipeline_packing_pack4 = 0; + + delete pipeline_packing_pack8; + pipeline_packing_pack8 = 0; + + delete pipeline_packing_pack1to4; + pipeline_packing_pack1to4 = 0; + + delete pipeline_packing_pack4to1; + pipeline_packing_pack4to1 = 0; + + delete pipeline_packing_pack1to8; + pipeline_packing_pack1to8 = 0; + + delete pipeline_packing_pack4to8; + pipeline_packing_pack4to8 = 0; + + delete pipeline_packing_pack8to4; + pipeline_packing_pack8to4 = 0; + + delete pipeline_packing_pack8to1; + pipeline_packing_pack8to1 = 0; + + return 0; +} + +int Packing_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const +{ + int elempack = bottom_blob.elempack; + // printf("Packing_vulkan b2b %d %d %d %d %d %d\n", elempack, out_elempack, cast_type_from, cast_type_to, storage_type_from, storage_type_to); + + if (elempack == out_elempack && cast_type_from == cast_type_to && bottom_blob.allocator == opt.blob_vkallocator) + { + top_blob = bottom_blob; + return 0; + } + + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int dims = bottom_blob.dims; + size_t elemsize = bottom_blob.elemsize; + + if (!use_padding) + { + // identity if use_padding not allowed + if (dims == 1 && w * elempack % out_elempack != 0) + { + top_blob = bottom_blob; + return 0; + } + if (dims == 2 && h * elempack % out_elempack != 0) + { + top_blob = bottom_blob; + return 0; + } + if (dims == 3 && channels * elempack % out_elempack != 0) + { + top_blob = bottom_blob; + return 0; + } + } + + size_t out_elemsize; + if (cast_type_to == 0) + { + if (opt.use_fp16_storage) + { + out_elemsize = out_elempack * 2u; + } + else if (opt.use_fp16_packed) + { + if (out_elempack == 8) out_elemsize = 8 * 2u; + if (out_elempack == 4) out_elemsize = 4 * 2u; + if (out_elempack == 1) out_elemsize = 4u; + } + else + { + out_elemsize = out_elempack * 4u; + } + } + else if (cast_type_to == 1) + { + out_elemsize = out_elempack * 4u; + } + else if (cast_type_to == 2) + { + if (out_elempack == 8) out_elemsize = 8 * 2u; + if (out_elempack == 4) out_elemsize = 4 * 2u; + if (out_elempack == 1) out_elemsize = 4u; + } + else // if (cast_type_to == 3) + { + out_elemsize = out_elempack * 2u; + } + + if (dims == 1) + { + if (opt.use_fp16_storage && out_elempack == 1 && cast_type_from == cast_type_to && bottom_blob.allocator == opt.blob_vkallocator) + { + top_blob = bottom_blob; + top_blob.w = w * elempack; + top_blob.cstep = w * elempack; + top_blob.elemsize = elemsize / elempack; + top_blob.elempack = out_elempack; + return 0; + } + + int outw = (w * elempack + out_elempack - 1) / out_elempack; + + top_blob.create(outw, out_elemsize, out_elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + // int outw = (w * elempack + out_elempack - 1) / 
out_elempack; + + // if (opt.use_fp16_packed && !opt.use_fp16_storage) + // { + // if (out_elempack == 8) out_elemsize = 8*2u; + // if (out_elempack == 4) out_elemsize = 4*2u; + // if (out_elempack == 1) out_elemsize = 4u; + // } + + // // type casting override + // if (cast_type_to == 1) + // { + // out_elemsize = out_elempack * 4u; + // } + + // top_blob.create(outw, out_elemsize, out_elempack, opt.blob_vkallocator); + // if (top_blob.empty()) + // return -100; + } + + if (dims == 2) + { + int outh = (h * elempack + out_elempack - 1) / out_elempack; + + top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + // int outh = (h * elempack + out_elempack - 1) / out_elempack; + // size_t out_elemsize = elemsize / elempack * out_elempack; + // if (opt.use_fp16_packed && !opt.use_fp16_storage) + // { + // if (out_elempack == 8) out_elemsize = 8*2u; + // if (out_elempack == 4) out_elemsize = 4*2u; + // if (out_elempack == 1) out_elemsize = 4u; + // } + + // // type casting override + // if (cast_type_to == 1) + // { + // out_elemsize = out_elempack * 4u; + // } + + // top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_vkallocator); + // if (top_blob.empty()) + // return -100; + } + + if (dims == 3) + { + int outc = (channels * elempack + out_elempack - 1) / out_elempack; + + top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + // int outc = (channels * elempack + out_elempack - 1) / out_elempack; + // size_t out_elemsize = elemsize / elempack * out_elempack; + // if (opt.use_fp16_packed && !opt.use_fp16_storage) + // { + // if (out_elempack == 8) out_elemsize = 8*2u; + // if (out_elempack == 4) out_elemsize = 4*2u; + // if (out_elempack == 1) out_elemsize = 4u; + // } + + // // type casting override + // if (cast_type_to == 1) + // { + // out_elemsize = out_elempack * 4u; + // } + + // top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_vkallocator); + // if (top_blob.empty()) + // return -100; + } + + std::vector buffer_bindings(2); + buffer_bindings[0] = bottom_blob; + buffer_bindings[1] = top_blob; + + std::vector image_bindings(2); + if (!opt.use_image_storage) + { + image_bindings.clear(); + } + + std::vector constants(10); + constants[0].i = bottom_blob.dims; + constants[1].i = bottom_blob.w; + constants[2].i = bottom_blob.h; + constants[3].i = bottom_blob.c; + constants[4].i = bottom_blob.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = top_blob.cstep; + + // printf("record packing pipeline:%d %d %d %d %d %d %d %d %d\n", top_blob.dims, top_blob.c, top_blob.h, top_blob.w, top_blob.cstep, top_blob.elempack, top_blob.elemsize, elempack, out_elempack); + + if (elempack == 1 && out_elempack == 1) + { + cmd.record_pipeline(pipeline_packing, buffer_bindings, image_bindings, constants, top_blob); + } + if (elempack == 4 && out_elempack == 4) + { + cmd.record_pipeline(pipeline_packing_pack4, buffer_bindings, image_bindings, constants, top_blob); + } + if (elempack == 1 && out_elempack == 4) + { + cmd.record_pipeline(pipeline_packing_pack1to4, buffer_bindings, image_bindings, constants, top_blob); + } + if (elempack == 4 && out_elempack == 1) + { + cmd.record_pipeline(pipeline_packing_pack4to1, buffer_bindings, image_bindings, constants, bottom_blob); + } + if (elempack == 8 && out_elempack == 8) + { + cmd.record_pipeline(pipeline_packing_pack8, 
buffer_bindings, image_bindings, constants, top_blob); + } + if (elempack == 1 && out_elempack == 8) + { + cmd.record_pipeline(pipeline_packing_pack1to8, buffer_bindings, image_bindings, constants, top_blob); + } + if (elempack == 4 && out_elempack == 8) + { + cmd.record_pipeline(pipeline_packing_pack4to8, buffer_bindings, image_bindings, constants, top_blob); + } + if (elempack == 8 && out_elempack == 4) + { + cmd.record_pipeline(pipeline_packing_pack8to4, buffer_bindings, image_bindings, constants, bottom_blob); + } + if (elempack == 8 && out_elempack == 1) + { + cmd.record_pipeline(pipeline_packing_pack8to1, buffer_bindings, image_bindings, constants, bottom_blob); + } + + + // printf("run packing vulkan record pipeline\n"); + return 0; +} + +} // namespace TEngine \ No newline at end of file diff --git a/source/device/vulkan/layer/packing_vulkan.hpp b/source/device/vulkan/layer/packing_vulkan.hpp new file mode 100644 index 000000000..10b748020 --- /dev/null +++ b/source/device/vulkan/layer/packing_vulkan.hpp @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. 
You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#ifndef LAYER_PACKING_HPP +#define LAYER_PACKING_HPP + +#include "../vulkan_layer.hpp" +#include "../vulkan_command.hpp" + +namespace TEngine { + +class Packing_vulkan : public Layer +{ +public: + Packing_vulkan(); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const; + +public: + int out_shape_packed_dims; + int out_shape_packed_w; + int out_shape_packed_h; + int out_shape_packed_c; + int out_shape_packed_cstep; + + int out_elempack; + int use_padding; + + // element type + // 0 = auto + // 1 = fp32 + // 2 = fp16p + // 3 = fp16s + int cast_type_from; + int cast_type_to; + + // storage type + // 0 = buffer + // 1 = image + int storage_type_from; + int storage_type_to; + + Pipeline* pipeline_packing; + Pipeline* pipeline_packing_pack4; + Pipeline* pipeline_packing_pack8; + Pipeline* pipeline_packing_pack1to4; + Pipeline* pipeline_packing_pack4to1; + Pipeline* pipeline_packing_pack1to8; + Pipeline* pipeline_packing_pack4to8; + Pipeline* pipeline_packing_pack8to4; + Pipeline* pipeline_packing_pack8to1; +}; + +} // namespace TEngine + + +#endif diff --git a/source/device/vulkan/layer/padding_vulkan.cpp b/source/device/vulkan/layer/padding_vulkan.cpp new file mode 100644 index 000000000..756fb05c9 --- /dev/null +++ b/source/device/vulkan/layer/padding_vulkan.cpp @@ -0,0 +1,174 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#include "padding_vulkan.hpp" +#include "../layer_shader_type.h" + +namespace TEngine { + +Padding_vulkan::Padding_vulkan() +{ + support_vulkan = true; + pipeline_padding = 0; + pipeline_padding_pack4 = 0; + pipeline_padding_pack8 = 0; +} + + + +int Padding_vulkan::create_pipeline(const Option& opt) +{ + int elempack = 1; + elempack = opt.use_shader_pack8 && input_c % 8 == 0 ? 8 : input_c % 4 == 0 ? 
4 : 1; + int out_elempack; + out_elempack = opt.use_shader_pack8 && output_c % 8 == 0 ? 8 : output_c % 4 == 0 ? 4 : 1; + + // printf("create padding pipeline elempack:%d %d \n", elempack, out_elempack); + + + std::vector specializations(3 + 10); + specializations[0].i = type; + specializations[1].f = value; + specializations[2].i = 0; // per_channel_pad_data_size ? 1 : 0; + specializations[3 + 0].i = 3; // shape_packed.dims; + specializations[3 + 1].i = input_w; // shape_packed.w; + specializations[3 + 2].i = input_h; // shape_packed.h; + specializations[3 + 3].i = input_c; // shape_packed.c; + specializations[3 + 4].i = input_w * input_h; // shape_packed.cstep; + specializations[3 + 5].i = 3; // out_shape_packed.dims; + specializations[3 + 6].i = output_w; // out_shape_packed.w; + specializations[3 + 7].i = output_h; // out_shape_packed.h; + specializations[3 + 8].i = output_c; // out_shape_packed.c; + specializations[3 + 9].i = output_w * output_h; // out_shape_packed.cstep; + + VkTensor local_size_xyz; + // if (out_shape_packed.dims != 0) + { + local_size_xyz.w = std::min(4, output_w); + local_size_xyz.h = std::min(4, output_h); + local_size_xyz.c = std::min(4, output_c); + } + + // pack1 + // if (shape.dims == 0 || elempack == 1) + if(elempack == 1) + { + pipeline_padding = new Pipeline(vkdev); + pipeline_padding->set_optimal_local_size_xyz(local_size_xyz); + pipeline_padding->create(LayerShaderType::padding, opt, specializations); + } + + // pack4 + // if (shape.dims == 0 || elempack == 4) + if(elempack == 4) + { + pipeline_padding_pack4 = new Pipeline(vkdev); + pipeline_padding_pack4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_padding_pack4->create(LayerShaderType::padding_pack4, opt, specializations); + } + + // pack8 + // if ((opt.use_shader_pack8 && shape.dims == 0) || elempack == 8) + if (opt.use_shader_pack8 || elempack == 8) + { + pipeline_padding_pack8 = new Pipeline(vkdev); + pipeline_padding_pack8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_padding_pack8->create(LayerShaderType::padding_pack8, opt, specializations); + } + + return 0; +} + +int Padding_vulkan::destroy_pipeline(const Option& /*opt*/) +{ + return 0; +} + + +int Padding_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const +{ + if (top == 0 && bottom == 0 && left == 0 && right == 0) + { + top_blob = bottom_blob; + return 0; + } + + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = w + left + right; + int outh = h + top + bottom; + + // printf("create padding top_blob vktensor, w, h, c:%d %d %d\n", outw, outh, channels); + top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + std::vector bindings(2); + bindings[0] = bottom_blob; + bindings[1] = top_blob; + + std::vector constants(12); + constants[0].i = bottom_blob.dims; + constants[1].i = bottom_blob.w; + constants[2].i = bottom_blob.h; + constants[3].i = bottom_blob.c; + constants[4].i = bottom_blob.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = top_blob.cstep; + constants[10].i = left; + constants[11].i = top; + + // printf("padding shape:%d %d %d %d %d %d %d %d %d\n", top_blob.c, top_blob.h, top_blob.w, top_blob.cstep, bottom_blob.c, bottom_blob.h, bottom_blob.w, 
bottom_blob.cstep, elempack); + const Pipeline* pipeline = elempack == 8 ? pipeline_padding_pack8 + : elempack == 4 ? pipeline_padding_pack4 + : pipeline_padding; + + cmd.record_pipeline(pipeline, bindings, constants, top_blob); + + return 0; +} + +} // namespace TEngine \ No newline at end of file diff --git a/source/device/vulkan/layer/padding_vulkan.hpp b/source/device/vulkan/layer/padding_vulkan.hpp new file mode 100644 index 000000000..f6aabe066 --- /dev/null +++ b/source/device/vulkan/layer/padding_vulkan.hpp @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#ifndef LAYER_PADDING_HPP +#define LAYER_PADDING_HPP + +#include "../vulkan_layer.hpp" +#include "../vulkan_command.hpp" + +namespace TEngine { + +class Padding_vulkan : public Layer +{ +public: + Padding_vulkan(); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const; + +public: + int top; + int bottom; + int left; + int right; + int type;// 0=CONSTANT 1=REPLICATE 2=REFLECT + float value; + int input_w; + int input_h; + int input_c; + int output_w; + int output_h; + int output_c; + +public: + Pipeline* pipeline_padding; + Pipeline* pipeline_padding_pack4; + Pipeline* pipeline_padding_pack8; +}; + +} // namespace TEngine + + +#endif diff --git a/source/device/vulkan/layer/permute_vulkan.cpp b/source/device/vulkan/layer/permute_vulkan.cpp new file mode 100644 index 000000000..461b3cc25 --- /dev/null +++ b/source/device/vulkan/layer/permute_vulkan.cpp @@ -0,0 +1,475 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#include "permute_vulkan.hpp" +#include "../layer_shader_type.h" + +namespace TEngine { + +Permute_vulkan::Permute_vulkan() +{ + support_vulkan = true; + support_image_storage = true; + + pipeline_permute = 0; + pipeline_permute_pack4 = 0; + pipeline_permute_pack1to4 = 0; + pipeline_permute_pack4to1 = 0; + pipeline_permute_pack8 = 0; + pipeline_permute_pack1to8 = 0; + pipeline_permute_pack4to8 = 0; + pipeline_permute_pack8to4 = 0; + pipeline_permute_pack8to1 = 0; +} + +Permute_vulkan::Permute_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) +{ + support_vulkan = true; + support_image_storage = true; + + pipeline_permute = 0; + pipeline_permute_pack4 = 0; + pipeline_permute_pack1to4 = 0; + pipeline_permute_pack4to1 = 0; + pipeline_permute_pack8 = 0; + pipeline_permute_pack1to8 = 0; + pipeline_permute_pack4to8 = 0; + pipeline_permute_pack8to4 = 0; + pipeline_permute_pack8to1 = 0; + + graph = ir_graph; + node = ir_node; + + struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[0]); + std::string name = input->name; + bottoms.push_back(name); + + struct tensor *output = get_ir_graph_tensor(graph, node->output_tensors[0]); + name = output->name; + tops.push_back(name); + + // params + input_c = input->dims[1]; // param->input_channel; + input_h = input->dims[2]; + input_w = input->dims[3]; + output_c = output->dims[1]; // param->output_channel; + output_h = output->dims[2]; + output_w = output->dims[3]; + + // TODO fix order_type value + struct permute_param *param = (struct permute_param *)ir_node->op.param_mem; + if ((param->order0 == 0) && (param->order1 == 2) && (param->order2 == 3) && (param->order3 == 1)) + { + order_type = 3; + } + else if ((param->order0 == 1) && (param->order1 == 0) && (param->order2 == 2) && input->dim_num == 3) + { + order_type = 1; + } + else + { + order_type = 0; + } + +} + +int Permute_vulkan::create_pipeline(const Option& _opt) +{ + Option opt = _opt; + const Tensor& shape = Tensor(input_w, input_h, input_c, (void*)0); // bottom_shapes.empty() ? Tensor() : bottom_shapes[0]; + const Tensor& out_shape = Tensor(output_w, output_h, output_c, (void*)0); // top_shapes.empty() ? Tensor() : top_shapes[0]; + + int elempack = 1; + if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1; + if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1; + if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 
8 : shape.c % 4 == 0 ? 4 : 1; + + int out_elempack = 1; + if (out_shape.dims == 1) out_elempack = opt.use_shader_pack8 && out_shape.w % 8 == 0 ? 8 : out_shape.w % 4 == 0 ? 4 : 1; + if (out_shape.dims == 2) out_elempack = opt.use_shader_pack8 && out_shape.h % 8 == 0 ? 8 : out_shape.h % 4 == 0 ? 4 : 1; + if (out_shape.dims == 3) out_elempack = opt.use_shader_pack8 && out_shape.c % 8 == 0 ? 8 : out_shape.c % 4 == 0 ? 4 : 1; + + size_t elemsize; + size_t out_elemsize; + if (opt.use_fp16_storage) + { + elemsize = elempack * 2u; + out_elemsize = out_elempack * 2u; + } + else if (opt.use_fp16_packed) + { + elemsize = elempack == 1 ? 4u : elempack * 2u; + out_elemsize = out_elempack == 1 ? 4u : out_elempack * 2u; + } + else + { + elemsize = elempack * 4u; + out_elemsize = out_elempack * 4u; + } + + Tensor shape_packed; + if (shape.dims == 1) shape_packed = Tensor(shape.w / elempack, (void*)0, elemsize, elempack); + if (shape.dims == 2) shape_packed = Tensor(shape.w, shape.h / elempack, (void*)0, elemsize, elempack); + if (shape.dims == 3) shape_packed = Tensor(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack); + + Tensor out_shape_packed; + if (out_shape.dims == 1) out_shape_packed = Tensor(out_shape.w / out_elempack, (void*)0, out_elemsize, out_elempack); + if (out_shape.dims == 2) out_shape_packed = Tensor(out_shape.w, out_shape.h / out_elempack, (void*)0, out_elemsize, out_elempack); + if (out_shape.dims == 3) out_shape_packed = Tensor(out_shape.w, out_shape.h, out_shape.c / out_elempack, (void*)0, out_elemsize, out_elempack); + + // check blob shape + // if (!vkdev->shape_support_image_storage(shape_packed) || !vkdev->shape_support_image_storage(out_shape_packed)) + { + support_image_storage = false; + opt.use_image_storage = false; + } + + std::vector specializations(1 + 10); + specializations[0].i = order_type; + specializations[1 + 0].i = 0; // shape_packed.dims; + specializations[1 + 1].i = 0; // shape_packed.w; + specializations[1 + 2].i = 0; // shape_packed.h; + specializations[1 + 3].i = 0; // shape_packed.c; + specializations[1 + 4].i = 0; // shape_packed.cstep; + specializations[1 + 5].i = 0; // out_shape_packed.dims; + specializations[1 + 6].i = 0; // out_shape_packed.w; + specializations[1 + 7].i = 0; // out_shape_packed.h; + specializations[1 + 8].i = 0; // out_shape_packed.c; + specializations[1 + 9].i = 0; // out_shape_packed.cstep; + + Tensor local_size_xyz_bottom; // pack4to1 and pack8to1 + if (shape_packed.dims == 2) + { + local_size_xyz_bottom.w = std::min(8, shape_packed.w); + local_size_xyz_bottom.h = std::min(8, shape_packed.h); + local_size_xyz_bottom.c = 1; + } + if (shape_packed.dims == 3) + { + local_size_xyz_bottom.w = std::min(4, shape_packed.w); + local_size_xyz_bottom.h = std::min(4, shape_packed.h); + local_size_xyz_bottom.c = std::min(4, shape_packed.c); + } + + Tensor local_size_xyz; + if (out_shape_packed.dims == 2) + { + local_size_xyz.w = std::min(8, out_shape_packed.w); + local_size_xyz.h = std::min(8, out_shape_packed.h); + local_size_xyz.c = 1; + } + if (out_shape_packed.dims == 3) + { + local_size_xyz.w = std::min(4, out_shape_packed.w); + local_size_xyz.h = std::min(4, out_shape_packed.h); + local_size_xyz.c = std::min(4, out_shape_packed.c); + } + + // pack1 + if (shape.dims == 0 || (elempack == 1 && out_elempack == 1)) + { + pipeline_permute = new Pipeline(vkdev); + pipeline_permute->set_optimal_local_size_xyz(local_size_xyz); + pipeline_permute->create(LayerShaderType::permute, opt, specializations); + } + + // pack4 + if (shape.dims 
== 0 || (elempack == 4 && out_elempack == 4)) + { + pipeline_permute_pack4 = new Pipeline(vkdev); + pipeline_permute_pack4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_permute_pack4->create(LayerShaderType::permute_pack4, opt, specializations); + } + + // pack1to4 + if (shape.dims == 0 || (elempack == 1 && out_elempack == 4)) + { + pipeline_permute_pack1to4 = new Pipeline(vkdev); + pipeline_permute_pack1to4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_permute_pack1to4->create(LayerShaderType::permute_pack1to4, opt, specializations); + } + + // pack4to1 + if (shape.dims == 0 || (elempack == 4 && out_elempack == 1)) + { + pipeline_permute_pack4to1 = new Pipeline(vkdev); + pipeline_permute_pack4to1->set_optimal_local_size_xyz(local_size_xyz_bottom); + pipeline_permute_pack4to1->create(LayerShaderType::permute_pack4to1, opt, specializations); + } + + // pack8 + if ((opt.use_shader_pack8 && shape.dims == 0) || (elempack == 8 && out_elempack == 8)) + { + pipeline_permute_pack8 = new Pipeline(vkdev); + pipeline_permute_pack8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_permute_pack8->create(LayerShaderType::permute_pack8, opt, specializations); + } + + // pack1to8 + if ((opt.use_shader_pack8 && shape.dims == 0) || (elempack == 1 && out_elempack == 8)) + { + pipeline_permute_pack1to8 = new Pipeline(vkdev); + pipeline_permute_pack1to8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_permute_pack1to8->create(LayerShaderType::permute_pack1to8, opt, specializations); + } + + // pack4to8 + if ((opt.use_shader_pack8 && shape.dims == 0) || (elempack == 4 && out_elempack == 8)) + { + pipeline_permute_pack4to8 = new Pipeline(vkdev); + pipeline_permute_pack4to8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_permute_pack4to8->create(LayerShaderType::permute_pack4to8, opt, specializations); + } + + // pack8to4 + if ((opt.use_shader_pack8 && shape.dims == 0) || (elempack == 8 && out_elempack == 4)) + { + pipeline_permute_pack8to4 = new Pipeline(vkdev); + pipeline_permute_pack8to4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_permute_pack8to4->create(LayerShaderType::permute_pack8to4, opt, specializations); + } + + // pack8to1 + if ((opt.use_shader_pack8 && shape.dims == 0) || (elempack == 8 && out_elempack == 1)) + { + pipeline_permute_pack8to1 = new Pipeline(vkdev); + pipeline_permute_pack8to1->set_optimal_local_size_xyz(local_size_xyz_bottom); + pipeline_permute_pack8to1->create(LayerShaderType::permute_pack8to1, opt, specializations); + } + + return 0; +} + +int Permute_vulkan::destroy_pipeline(const Option& /*opt*/) +{ + delete pipeline_permute; + pipeline_permute = 0; + + delete pipeline_permute_pack4; + pipeline_permute_pack4 = 0; + + delete pipeline_permute_pack1to4; + pipeline_permute_pack1to4 = 0; + + delete pipeline_permute_pack4to1; + pipeline_permute_pack4to1 = 0; + + delete pipeline_permute_pack8; + pipeline_permute_pack8 = 0; + + delete pipeline_permute_pack1to8; + pipeline_permute_pack1to8 = 0; + + delete pipeline_permute_pack4to8; + pipeline_permute_pack4to8 = 0; + + delete pipeline_permute_pack8to4; + pipeline_permute_pack8to4 = 0; + + delete pipeline_permute_pack8to1; + pipeline_permute_pack8to1 = 0; + + return 0; +} + +int Permute_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int dims = 
bottom_blob.dims; + + if (dims == 1 || order_type == 0) + { + top_blob = bottom_blob; + return 0; + } + + int out_elempack; + size_t out_elemsize; + + if (dims == 2) + { + // order_type + // 0 = w h + // 1 = h w + + int outw; + int outh; + + // if (order_type == 1) + { + outw = h * elempack; + outh = w; + } + + out_elempack = opt.use_shader_pack8 && outh % 8 == 0 ? 8 : outh % 4 == 0 ? 4 : 1; + out_elemsize = elemsize / elempack * out_elempack; + + if (opt.use_fp16_packed && !opt.use_fp16_storage) + { + if (out_elempack == 8) out_elemsize = 8 * 2u; + if (out_elempack == 4) out_elemsize = 4 * 2u; + if (out_elempack == 1) out_elemsize = 4u; + } + + top_blob.create(outw, outh / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + } + else // if (dims == 3) + { + // order_type + // 0 = w h c + // 1 = h w c + // 2 = w c h + // 3 = c w h + // 4 = h c w + // 5 = c h w + + int outw; + int outh; + int outc; + + if (order_type == 1) + { + outw = h; + outh = w; + outc = channels * elempack; + } + else if (order_type == 2) + { + outw = w; + outh = channels * elempack; + outc = h; + } + else if (order_type == 3) + { + outw = channels * elempack; + outh = w; + outc = h; + } + else if (order_type == 4) + { + outw = h; + outh = channels * elempack; + outc = w; + } + else // if (order_type == 5) + { + outw = channels * elempack; + outh = h; + outc = w; + } + + out_elempack = opt.use_shader_pack8 && outc % 8 == 0 ? 8 : outc % 4 == 0 ? 4 : 1; + out_elemsize = elemsize / elempack * out_elempack; + + if (opt.use_fp16_packed && !opt.use_fp16_storage) + { + if (out_elempack == 8) out_elemsize = 8 * 2u; + if (out_elempack == 4) out_elemsize = 4 * 2u; + if (out_elempack == 1) out_elemsize = 4u; + } + + top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + } + + std::vector bindings(2); + bindings[0] = bottom_blob; + bindings[1] = top_blob; + + std::vector constants(10); + constants[0].i = bottom_blob.dims; + constants[1].i = bottom_blob.w; + constants[2].i = bottom_blob.h; + constants[3].i = bottom_blob.c; + constants[4].i = bottom_blob.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = top_blob.cstep; + + if (elempack == 1 && out_elempack == 1) + { + cmd.record_pipeline(pipeline_permute, bindings, constants, top_blob); + } + else if (elempack == 4 && out_elempack == 4) + { + cmd.record_pipeline(pipeline_permute_pack4, bindings, constants, top_blob); + } + else if (elempack == 1 && out_elempack == 4) + { + cmd.record_pipeline(pipeline_permute_pack1to4, bindings, constants, top_blob); + } + else if (elempack == 4 && out_elempack == 1) + { + cmd.record_pipeline(pipeline_permute_pack4to1, bindings, constants, bottom_blob); + } + else if (elempack == 8 && out_elempack == 8) + { + cmd.record_pipeline(pipeline_permute_pack8, bindings, constants, top_blob); + } + else if (elempack == 1 && out_elempack == 8) + { + cmd.record_pipeline(pipeline_permute_pack1to8, bindings, constants, top_blob); + } + else if (elempack == 4 && out_elempack == 8) + { + cmd.record_pipeline(pipeline_permute_pack4to8, bindings, constants, top_blob); + } + else if (elempack == 8 && out_elempack == 4) + { + cmd.record_pipeline(pipeline_permute_pack8to4, bindings, constants, top_blob); + } + else if (elempack == 8 && out_elempack == 1) + { + cmd.record_pipeline(pipeline_permute_pack8to1, bindings, constants, 
bottom_blob); + } + + return 0; +} + +} // namespace TEngine \ No newline at end of file diff --git a/source/device/vulkan/layer/permute_vulkan.hpp b/source/device/vulkan/layer/permute_vulkan.hpp new file mode 100644 index 000000000..5ea17c635 --- /dev/null +++ b/source/device/vulkan/layer/permute_vulkan.hpp @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#ifndef LAYER_PERMUTE_HPP +#define LAYER_PERMUTE_HPP + +#include "../vulkan_layer.hpp" +#include "../vulkan_command.hpp" + +#include "permute_param.h" + +namespace TEngine{ + +class Permute_vulkan : public Layer +{ +public: + Permute_vulkan(); + Permute_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const; + +public: + Pipeline* pipeline_permute; + Pipeline* pipeline_permute_pack4; + Pipeline* pipeline_permute_pack1to4; + Pipeline* pipeline_permute_pack4to1; + Pipeline* pipeline_permute_pack8; + Pipeline* pipeline_permute_pack1to8; + Pipeline* pipeline_permute_pack4to8; + Pipeline* pipeline_permute_pack8to4; + Pipeline* pipeline_permute_pack8to1; + +public: + int input_c; + int input_h; + int input_w; + int output_c; + int output_h; + int output_w; + int order_type; +}; + +} // namespace TEngine + +#endif \ No newline at end of file diff --git a/source/device/vulkan/layer/pooling_vulkan.cpp b/source/device/vulkan/layer/pooling_vulkan.cpp new file mode 100644 index 000000000..eb50b1704 --- /dev/null +++ b/source/device/vulkan/layer/pooling_vulkan.cpp @@ -0,0 +1,338 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#include "pooling_vulkan.hpp" +#include "../layer_shader_type.h" + +namespace TEngine { + +Pooling_vulkan::Pooling_vulkan() +{ + support_vulkan = true; + pipeline_pooling = 0; + pipeline_pooling_pack4 = 0; + pipeline_pooling_pack8 = 0; + pipeline_pooling_global = 0; + pipeline_pooling_global_pack4 = 0; + pipeline_pooling_global_pack8 = 0; + +} + +Pooling_vulkan::Pooling_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) +{ + support_vulkan = true; + pipeline_pooling = 0; + pipeline_pooling_pack4 = 0; + pipeline_pooling_pack8 = 0; + pipeline_pooling_global = 0; + pipeline_pooling_global_pack4 = 0; + pipeline_pooling_global_pack8 = 0; + + graph = ir_graph; + node = ir_node; + + struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[0]); + std::string name = input->name; + bottoms.push_back(name); + + // Tensor* output_tensor = t_node->GetOutputTensor(0); + struct tensor *output = get_ir_graph_tensor(graph, node->output_tensors[0]); + name = output->name; + tops.push_back(name); + + struct pool_param *param_ = (struct pool_param *)ir_node->op.param_mem; + + pooling_type = param_->pool_method; // 0:max 1:avg + kernel_h = param_->kernel_h; + kernel_w = param_->kernel_w; + stride_h = param_->stride_h; + stride_w = param_->stride_w; + global = param_->global; + caffe_flavor = param_->caffe_flavor; + pad_h0 = param_->pad_h0; + pad_w0 = param_->pad_w0; + pad_h1 = param_->pad_h1; + pad_w1 = param_->pad_w1; + input_c = input->dims[1]; + input_h = input->dims[2]; + input_w = input->dims[3]; + output_c = output->dims[1]; + output_h = output->dims[2]; + output_w = output->dims[3]; + // printf("create pooling layer with param:%d %d %d %d %d %d %d %d %d %d\n", kernel_h, kernel_w, stride_h, stride_w, global, pad_h0, pad_h1, pad_w0, pad_w1, param_->alg); +} + + +int Pooling_vulkan::create_pipeline(const Option& opt) +{ + int elempack = opt.use_shader_pack8 && input_c % 8 == 0 ? 8 : input_c % 4 == 0 ? 4 : 1; + int out_elempack = opt.use_shader_pack8 && output_c % 8 == 0 ? 8 : output_c % 4 == 0 ? 4 : 1; + + size_t elemsize; + size_t out_elemsize; + if (opt.use_fp16_storage) + { + elemsize = elempack * 2u; + out_elemsize = out_elempack * 2u; + } + else if (opt.use_fp16_packed) + { + elemsize = elempack == 1 ? 4u : elempack * 2u; + out_elemsize = out_elempack == 1 ? 
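+ // fp16_packed keeps unpacked (pack1) data at fp32, so only the packed layouts
+ // drop to 2 bytes per lane; e.g. out_elempack == 4 gives 4 * 2u = 8 bytes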
4u : out_elempack * 2u; + } + else + { + elemsize = elempack * 4u; + out_elemsize = out_elempack * 4u; + } + + { + padding = new Padding_vulkan(); + padding->vkdev = vkdev; + + padding->top = pad_h0; + padding->bottom = pad_h1; + padding->left = pad_w0; + padding->right = pad_w1; + padding->type = 0; + padding->value = 0; + + padding->input_w = input_w; + padding->input_h = input_h; + padding->input_c = input_c; + padding->output_w = input_w + pad_w0 + pad_w1; + padding->output_h = input_h + pad_h0 + pad_h1; + padding->output_c = input_c; + + padding->create_pipeline(opt); + } + + if(global) + { + std::vector specializations(1 + 10); + specializations[0].i = pooling_type; + specializations[1 + 0].i = 3; + specializations[1 + 1].i = input_w + pad_w0 + pad_w1; + specializations[1 + 2].i = input_h + pad_h0 + pad_h1; + specializations[1 + 3].i = input_c; + specializations[1 + 4].i = (input_w + pad_w0 + pad_w1) * (input_h + pad_h0 + pad_h1); + specializations[1 + 5].i = 3; + specializations[1 + 6].i = output_c; + specializations[1 + 7].i = output_h; + specializations[1 + 8].i = output_w; + specializations[1 + 9].i = output_h * output_w; + + VkTensor local_size_xyz; + // if (out_shape_packed.dims != 0) + { + local_size_xyz.w = std::min(4, output_w); + local_size_xyz.h = std::min(4, output_h); + local_size_xyz.c = std::min(4, output_c); + } + + // pack1 + if (elempack == 1) + { + pipeline_pooling_global = new Pipeline(vkdev); + pipeline_pooling_global->set_optimal_local_size_xyz(local_size_xyz); + pipeline_pooling_global->create(LayerShaderType::pooling_global, opt, specializations); + } + + // pack4 + if (elempack == 4) + { + pipeline_pooling_global_pack4 = new Pipeline(vkdev); + pipeline_pooling_global_pack4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_pooling_global_pack4->create(LayerShaderType::pooling_global_pack4, opt, specializations); + } + + // pack8 + if (opt.use_shader_pack8 || elempack == 8) + { + pipeline_pooling_global_pack8 = new Pipeline(vkdev); + pipeline_pooling_global_pack8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_pooling_global_pack8->create(LayerShaderType::pooling_global_pack8, opt, specializations); + } + } + else + { + std::vector specializations(12 + 10); + specializations[0].i = pooling_type; + specializations[1].i = kernel_w; + specializations[2].i = kernel_h; + specializations[3].i = stride_w; + specializations[4].i = stride_h; + specializations[5].i = pad_w0; + specializations[6].i = pad_w1; + specializations[7].i = pad_h0; + specializations[8].i = pad_h1; + specializations[9].i = global; + specializations[10].i = 0; // pad_mode; + specializations[11].i = 0; // avgpool_count_include_pad; + specializations[12 + 0].i = 0; // 3; // shape_bordered_packed.dims; + specializations[12 + 1].i = 0; // input_w; // shape_bordered_packed.w; + specializations[12 + 2].i = 0; // input_h; // shape_bordered_packed.h; + specializations[12 + 3].i = 0; // input_c; // shape_bordered_packed.c; + specializations[12 + 4].i = 0; // input_w * input_h; // shape_bordered_packed.cstep; + specializations[12 + 5].i = 0; // 3; // out_shape_packed.dims; + specializations[12 + 6].i = 0; // output_w; // out_shape_packed.w; + specializations[12 + 7].i = 0; // output_h; // out_shape_packed.h; + specializations[12 + 8].i = 0; // output_c; // out_shape_packed.c; + specializations[12 + 9].i = 0; // output_h * output_c; // out_shape_packed.cstep; + + VkTensor local_size_xyz; + local_size_xyz.w = std::min(4, output_w); + local_size_xyz.h = std::min(4, output_h); + local_size_xyz.c = 
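+ // cap the local workgroup at 4 invocations per axis, clamped to the output extent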
std::min(4, output_c); + + // pack1 + if (elempack == 1) + { + pipeline_pooling = new Pipeline(vkdev); + pipeline_pooling->set_optimal_local_size_xyz(local_size_xyz); + pipeline_pooling->create(LayerShaderType::pooling, opt, specializations); + } + + // pack4 + if (elempack == 4) + { + pipeline_pooling_pack4 = new Pipeline(vkdev); + pipeline_pooling_pack4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_pooling_pack4->create(LayerShaderType::pooling_pack4, opt, specializations); + } + + // pack8 + if (opt.use_shader_pack8 || elempack == 8) + { + pipeline_pooling_pack8 = new Pipeline(vkdev); + pipeline_pooling_pack8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_pooling_pack8->create(LayerShaderType::pooling_pack8, opt, specializations); + } + } + + return 0; +} + +int Pooling_vulkan::destroy_pipeline(const Option& /*opt*/) +{ + return 0; +} + +int Pooling_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + if(global) + { + // printf("input shape: %d %d %d, out shape: %d %d %d\n", input_c, input_h, input_w, output_c, output_h, output_w); + top_blob.create(output_c/elempack, elemsize, elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + // printf("top shape:%d %d %d\n", top_blob.c, top_blob.h, top_blob.w); + std::vector bindings(2); + bindings[0] = bottom_blob; + bindings[1] = top_blob; + + std::vector constants(10); + constants[0].i = bottom_blob.dims; + constants[1].i = bottom_blob.w; + constants[2].i = bottom_blob.h; + constants[3].i = bottom_blob.c; + constants[4].i = bottom_blob.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = top_blob.cstep; + + const Pipeline* pipeline = elempack == 8 ? pipeline_pooling_global_pack8 + : elempack == 4 ? pipeline_pooling_global_pack4 + : pipeline_pooling_global; + + cmd.record_pipeline(pipeline, bindings, constants, top_blob); + + return 0; + } + + VkTensor bottom_blob_bordered = bottom_blob; + if (pad_h0 > 0 || pad_h1 > 0 || pad_w0 > 0 || pad_w1 > 0) + { + bottom_blob_bordered.w = bottom_blob_bordered.w + pad_w0 + pad_w1; + bottom_blob_bordered.h = bottom_blob_bordered.h + pad_h0 + pad_h1; + bottom_blob_bordered.cstep = bottom_blob_bordered.w * bottom_blob_bordered.h; + Option opt_pad = opt; + opt_pad.blob_vkallocator = opt.workspace_vkallocator; + + padding->record_pipeline(bottom_blob, bottom_blob_bordered, cmd, opt_pad); + } + + top_blob.create(output_w, output_h, output_c/elempack, elemsize, elempack, opt.blob_vkallocator); + + + std::vector bindings(2); + bindings[0] = bottom_blob_bordered; + bindings[1] = top_blob; + + std::vector constants(12); + constants[0].i = bottom_blob_bordered.dims; + constants[1].i = bottom_blob_bordered.w; + constants[2].i = bottom_blob_bordered.h; + constants[3].i = bottom_blob_bordered.c; + constants[4].i = bottom_blob_bordered.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = top_blob.cstep; + constants[10].i = 0; + constants[11].i = 0; + + const Pipeline* pipeline = elempack == 8 ? pipeline_pooling_pack8 + : elempack == 4 ? 
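+ // select the shader variant that matches the input packing (8, 4 or 1 lanes)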
pipeline_pooling_pack4 + : pipeline_pooling; + + cmd.record_pipeline(pipeline, bindings, constants, top_blob); + return 0; +} + +} // namespace TEngine diff --git a/source/device/vulkan/layer/pooling_vulkan.hpp b/source/device/vulkan/layer/pooling_vulkan.hpp new file mode 100644 index 000000000..e4a823e9e --- /dev/null +++ b/source/device/vulkan/layer/pooling_vulkan.hpp @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#ifndef LAYER_POOLING_HPP +#define LAYER_POOLING_HPP + +#include "../vulkan_layer.hpp" +#include "../vulkan_command.hpp" +#include "padding_vulkan.hpp" + +#include "pooling_param.h" + +namespace TEngine { + +class Pooling_vulkan : public Layer +{ +public: + Pooling_vulkan(); + Pooling_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const; + +public: + int pooling_type; // // 0:max 1:avg + int kernel_h; // = param_->kernel_h; + int kernel_w; // = param_->kernel_w; + int stride_h; // = param_->stride_h; + int stride_w; // = param_->stride_w; + int global; // = param_->global; + int caffe_flavor; // = param_->caffe_flavor; + int pad_h0; // = param_->pad_h0; + int pad_w0; // = param_->pad_w0; + int pad_h1; // = param_->pad_h1; + int pad_w1; // = param_->pad_w1; + int input_c; + int input_h; + int input_w; + int output_c; + int output_h; + int output_w; + +public: + Padding_vulkan* padding; + + Pipeline* pipeline_pooling; + Pipeline* pipeline_pooling_pack4; + Pipeline* pipeline_pooling_pack8; + Pipeline* pipeline_pooling_global; + Pipeline* pipeline_pooling_global_pack4; + Pipeline* pipeline_pooling_global_pack8; +}; + +} // namespace TEngine + + +#endif diff --git a/source/device/vulkan/layer/priorbox_vulkan.cpp b/source/device/vulkan/layer/priorbox_vulkan.cpp new file mode 100644 index 000000000..de81aec7a --- /dev/null +++ b/source/device/vulkan/layer/priorbox_vulkan.cpp @@ -0,0 +1,351 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license 
agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#include "priorbox_vulkan.hpp" +#include "../layer_shader_type.h" + +namespace TEngine { + +PriorBox_vulkan::PriorBox_vulkan() +{ + support_vulkan = true; + + pipeline_priorbox = 0; + pipeline_priorbox_mxnet = 0; +} + +PriorBox_vulkan::PriorBox_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) +{ + support_vulkan = true; + + pipeline_priorbox = 0; + pipeline_priorbox_mxnet = 0; + + graph = ir_graph; + node = ir_node; + + for(int i = 0; i < ir_node->input_num; i++) + { + struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[i]); + std::string name = input->name; + bottoms.push_back(name); + } + + for(int i = 0; i < ir_node->output_num; i++) + { + struct tensor *output = get_ir_graph_tensor(graph, node->input_tensors[i]); + std::string name = output->name; + tops.push_back(name); + } + + // params + struct tensor *featmap_tensor = get_ir_graph_tensor(graph, node->input_tensors[0]); + struct tensor *data_tensor = get_ir_graph_tensor(graph, node->input_tensors[1]); + struct tensor *output_tensor = get_ir_graph_tensor(graph, node->output_tensors[0]); + input_c = data_tensor->dims[1]; // param->input_channel; + input_h = data_tensor->dims[2]; + input_w = data_tensor->dims[3]; + output_c = output_tensor->dims[1]; // param->output_channel; + output_h = output_tensor->dims[2]; + output_w = output_tensor->dims[3]; + + const int data_height = data_tensor->dims[2]; + const int data_width = data_tensor->dims[3]; + const int feat_height = featmap_tensor->dims[2]; + const int feat_width = featmap_tensor->dims[3]; + + struct priorbox_param *param = (struct priorbox_param *)ir_node->op.param_mem; + + variances[0] = (param->variance)[0]; + variances[1] = (param->variance)[1]; + variances[2] = (param->variance)[2]; + variances[3] = (param->variance)[3]; + flip = param->flip; + clip = param->clip; + + if (param->image_h == 0 || param->image_w == 0) + { + image_width = data_width; + image_height = data_height; + } + else + { + image_width = param->image_w; + image_height = param->image_h; + } + + if (param->step_h == 0 || param->step_w == 0) + { + step_width = ( float )(image_width) / feat_width; + step_height = ( float )(image_height) / feat_height; + } + else + { + step_width = 
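+ // a non-zero step in the param overrides the image / feature-map ratio above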
param->step_w; + step_height = param->step_h; + } + int num_priors = param->num_priors; + + offset = param->offset; + step_mmdetection = 0; // TODO fix step_mmdetection value + center_mmdetection = 0; // TODO fix center_mmdetection value + + min_sizes = Tensor(param->min_size_num, param->min_size); + max_sizes = Tensor(param->max_size_num, param->max_size); + aspect_ratios = Tensor(param->aspect_ratio_size, param->aspect_ratio); + TLOG_INFO("size min max aspect:%d %d %d\n", param->min_size_num, param->max_size_num, param->aspect_ratio_size); +} + +int PriorBox_vulkan::create_pipeline(const Option& opt) +{ + const Tensor& shape = Tensor(input_w, input_h, input_c, (void*)0); // bottom_shapes.empty() ? Tensor() : bottom_shapes[0]; + + int elempack = 1; + if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1; + if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1; + if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1; + + size_t elemsize; + if (opt.use_fp16_storage) + { + elemsize = elempack * 2u; + } + else if (opt.use_fp16_packed) + { + elemsize = elempack == 1 ? 4u : elempack * 2u; + } + else + { + elemsize = elempack * 4u; + } + + Tensor shape_packed; + if (shape.dims == 1) shape_packed = Tensor(shape.w / elempack, (void*)0, elemsize, elempack); + if (shape.dims == 2) shape_packed = Tensor(shape.w, shape.h / elempack, (void*)0, elemsize, elempack); + if (shape.dims == 3) shape_packed = Tensor(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack); + + // caffe style + { + int num_min_size = min_sizes.w; + int num_max_size = max_sizes.w; + int num_aspect_ratio = aspect_ratios.w; + + int num_prior = num_min_size * num_aspect_ratio + num_min_size + num_max_size; + if (flip) + num_prior += num_min_size * num_aspect_ratio; + + std::vector specializations(11 + 2); + specializations[0].i = flip; + specializations[1].i = clip; + specializations[2].f = offset; + specializations[3].f = variances[0]; + specializations[4].f = variances[1]; + specializations[5].f = variances[2]; + specializations[6].f = variances[3]; + specializations[7].i = num_min_size; + specializations[8].i = num_max_size; + specializations[9].i = num_aspect_ratio; + specializations[10].i = num_prior; + specializations[11 + 0].i = 0;//shape_packed.w; + specializations[11 + 1].i = 0;//shape_packed.h; + + pipeline_priorbox = new Pipeline(vkdev); + pipeline_priorbox->set_optimal_local_size_xyz(); + pipeline_priorbox->create(LayerShaderType::priorbox, opt, specializations); + } + + // mxnet style + { + int num_sizes = min_sizes.w; + int num_ratios = aspect_ratios.w; + + int num_prior = num_sizes - 1 + num_ratios; + + std::vector specializations(5 + 2); + specializations[0].i = clip; + specializations[1].f = offset; + specializations[2].i = num_sizes; + specializations[3].i = num_ratios; + specializations[4].i = num_prior; + specializations[5 + 0].i = shape_packed.w; + specializations[5 + 1].i = shape_packed.h; + + pipeline_priorbox_mxnet = new Pipeline(vkdev); + pipeline_priorbox_mxnet->set_optimal_local_size_xyz(); + pipeline_priorbox_mxnet->create(LayerShaderType::priorbox_mxnet, opt, specializations); + } + + return 0; +} + +int PriorBox_vulkan::destroy_pipeline(const Option& /*opt*/) +{ + delete pipeline_priorbox; + pipeline_priorbox = 0; + + delete pipeline_priorbox_mxnet; + pipeline_priorbox_mxnet = 0; + + return 0; +} + +int PriorBox_vulkan::upload_model(VkTransfer& 
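+// stages min_sizes / max_sizes / aspect_ratios into their *_gpu VkTensor copies
+// so the priorbox shaders can read them when the pipelines are recorded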
cmd, const Option& opt) +{ + cmd.record_upload(min_sizes, min_sizes_gpu, opt); + + if (max_sizes.w > 0) + cmd.record_upload(max_sizes, max_sizes_gpu, opt); + + cmd.record_upload(aspect_ratios, aspect_ratios_gpu, opt); + + return 0; +} + +int PriorBox_vulkan::record_pipeline(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const +{ + int w = bottom_blobs[0].w; + int h = bottom_blobs[0].h; + + if (bottom_blobs.size() == 1 && image_width == -233 && image_height == -233 && max_sizes.empty()) + { + // mxnet style _contrib_MultiBoxPrior + float step_w = step_width; + float step_h = step_height; + if (step_w == -233) + step_w = 1.f / (float)w; + if (step_h == -233) + step_h = 1.f / (float)h; + + int num_sizes = min_sizes.w; + int num_ratios = aspect_ratios.w; + + int num_prior = num_sizes - 1 + num_ratios; + + int elempack = 4; + + size_t elemsize = elempack * 4u; + if (opt.use_fp16_packed || opt.use_fp16_storage) + { + elemsize = elempack * 2u; + } + + VkTensor& top_blob = top_blobs[0]; + top_blob.create(4 * w * h * num_prior / elempack, elemsize, elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + std::vector bindings(3); + bindings[0] = top_blob; + bindings[1] = min_sizes_gpu; + bindings[2] = aspect_ratios_gpu; + + std::vector constants(4); + constants[0].i = w; + constants[1].i = h; + constants[2].f = step_w; + constants[3].f = step_h; + + VkTensor dispatcher; + dispatcher.w = num_sizes; + dispatcher.h = w; + dispatcher.c = h; + + cmd.record_pipeline(pipeline_priorbox_mxnet, bindings, constants, dispatcher); + + return 0; + } + + int image_w = image_width; + int image_h = image_height; + if (image_w == -233) + image_w = bottom_blobs[1].w; + if (image_h == -233) + image_h = bottom_blobs[1].h; + + float step_w = step_width; + float step_h = step_height; + if (step_w == -233) + step_w = (float)image_w / w; + if (step_h == -233) + step_h = (float)image_h / h; + + int num_min_size = min_sizes.w; + int num_max_size = max_sizes.w; + int num_aspect_ratio = aspect_ratios.w; + + int num_prior = num_min_size * num_aspect_ratio + num_min_size + num_max_size; + if (flip) + num_prior += num_min_size * num_aspect_ratio; + + size_t elemsize = 4u; + if (opt.use_fp16_storage) + { + elemsize = 2u; + } + + VkTensor& top_blob = top_blobs[0]; + top_blob.create(4 * w * h * num_prior, 2, elemsize, 1, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + std::vector bindings(4); + bindings[0] = top_blob; + bindings[1] = min_sizes_gpu; + bindings[2] = num_max_size > 0 ? max_sizes_gpu : min_sizes_gpu; + bindings[3] = aspect_ratios_gpu; + + std::vector constants(6); + constants[0].i = w; + constants[1].i = h; + constants[2].f = image_w; + constants[3].f = image_h; + constants[4].f = step_w; + constants[5].f = step_h; + + VkTensor dispatcher; + dispatcher.w = num_min_size; + dispatcher.h = w; + dispatcher.c = h; + + cmd.record_pipeline(pipeline_priorbox, bindings, constants, dispatcher); + + return 0; +} + +} // namespace TEngine \ No newline at end of file diff --git a/source/device/vulkan/layer/priorbox_vulkan.hpp b/source/device/vulkan/layer/priorbox_vulkan.hpp new file mode 100644 index 000000000..69b8f8bb7 --- /dev/null +++ b/source/device/vulkan/layer/priorbox_vulkan.hpp @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#ifndef LAYER_PRIORBOX_HPP +#define LAYER_PRIORBOX_HPP + +#include "../vulkan_layer.hpp" +#include "../vulkan_command.hpp" + +#include "priorbox_param.h" + +namespace TEngine{ + +class PriorBox_vulkan : public Layer +{ +public: + PriorBox_vulkan(); + PriorBox_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + virtual int upload_model(VkTransfer& cmd, const Option& opt); + + virtual int record_pipeline(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const; + +public: + Pipeline* pipeline_priorbox; + Pipeline* pipeline_priorbox_mxnet; + +public: + int input_c; + int input_h; + int input_w; + int output_c; + int output_h; + int output_w; + + float variances[4]; + int flip; + int clip; + int image_width; + int image_height; + float step_width; + float step_height; + float offset; + int num_priors; + bool step_mmdetection; + bool center_mmdetection; + + Tensor min_sizes; + Tensor max_sizes; + Tensor aspect_ratios; + VkTensor min_sizes_gpu; + VkTensor max_sizes_gpu; + VkTensor aspect_ratios_gpu; +}; + +} // namespace TEngine + +#endif \ No newline at end of file diff --git a/source/device/vulkan/layer/relu_vulkan.cpp b/source/device/vulkan/layer/relu_vulkan.cpp new file mode 100644 index 000000000..f541806cf --- /dev/null +++ b/source/device/vulkan/layer/relu_vulkan.cpp @@ -0,0 +1,214 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#include "relu_vulkan.hpp" +#include "../layer_shader_type.h" + +namespace TEngine { + +ReLU_vulkan::ReLU_vulkan() +{ + support_vulkan = true; + support_image_storage = true; + + pipeline_relu = 0; + pipeline_relu_pack4 = 0; + pipeline_relu_pack8 = 0; +} + +ReLU_vulkan::ReLU_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) +{ + support_vulkan = true; + support_image_storage = false; + + pipeline_relu = 0; + pipeline_relu_pack4 = 0; + pipeline_relu_pack8 = 0; + + graph = ir_graph; + node = ir_node; + + struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[0]); + std::string name = input->name; + bottoms.push_back(name); + + struct tensor *output = get_ir_graph_tensor(graph, node->output_tensors[0]); + name = output->name; + tops.push_back(name); + + // params + input_c = input->dims[1]; // param->input_channel; + input_h = input->dims[2]; + input_w = input->dims[3]; + output_c = output->dims[1]; // param->output_channel; + output_h = output->dims[2]; + output_w = output->dims[3]; + + struct relu_param *param = (struct relu_param *)ir_node->op.param_mem; + negative_slope = param->negative_slope; +} + +int ReLU_vulkan::create_pipeline(const Option& opt) +{ + const Tensor& shape = Tensor(output_w, output_h, output_c, (void*)0); // top_shapes.empty() ? Mat() : top_shapes[0]; + + int elempack = 1; + if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1; + if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1; + if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1; + + size_t elemsize; + if (opt.use_fp16_storage) + { + elemsize = elempack * 2u; + } + else if (opt.use_fp16_packed) + { + elemsize = elempack == 1 ? 
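+ // e.g. a 64-channel output with use_shader_pack8 gets elempack 8 and, under
+ // fp16_packed, 8 * 2u = 16 bytes per packed element, while pack1 stays at 4u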
4u : elempack * 2u; + } + else + { + elemsize = elempack * 4u; + } + + Tensor shape_packed; + if (shape.dims == 1) shape_packed = Tensor(shape.w / elempack, (void*)0, elemsize, elempack); + if (shape.dims == 2) shape_packed = Tensor(shape.w, shape.h / elempack, (void*)0, elemsize, elempack); + if (shape.dims == 3) shape_packed = Tensor(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack); + + std::vector specializations(1 + 5); + specializations[0].f = negative_slope; // slope; + specializations[1 + 0].i = 0; // shape_packed.dims; + specializations[1 + 1].i = 0; // shape_packed.w; + specializations[1 + 2].i = 0; // shape_packed.h; + specializations[1 + 3].i = 0; // shape_packed.c; + specializations[1 + 4].i = 0; // shape_packed.cstep; + + Tensor local_size_xyz; + if (shape_packed.dims == 1) + { + local_size_xyz.w = std::min(64, shape_packed.w); + local_size_xyz.h = 1; + local_size_xyz.c = 1; + } + if (shape_packed.dims == 2) + { + local_size_xyz.w = std::min(8, shape_packed.w); + local_size_xyz.h = std::min(8, shape_packed.h); + local_size_xyz.c = 1; + } + if (shape_packed.dims == 3) + { + local_size_xyz.w = std::min(4, shape_packed.w); + local_size_xyz.h = std::min(4, shape_packed.h); + local_size_xyz.c = std::min(4, shape_packed.c); + } + + // pack1 + if (shape.dims == 0 || elempack == 1) + { + pipeline_relu = new Pipeline(vkdev); + pipeline_relu->set_optimal_local_size_xyz(local_size_xyz); + pipeline_relu->create(LayerShaderType::relu, opt, specializations); + } + + // pack4 + if (shape.dims == 0 || elempack == 4) + { + pipeline_relu_pack4 = new Pipeline(vkdev); + pipeline_relu_pack4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_relu_pack4->create(LayerShaderType::relu_pack4, opt, specializations); + } + + // pack8 + if ((opt.use_shader_pack8 && shape.dims == 0) || elempack == 8) + { + pipeline_relu_pack8 = new Pipeline(vkdev); + pipeline_relu_pack8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_relu_pack8->create(LayerShaderType::relu_pack8, opt, specializations); + } + + return 0; +} + + +int ReLU_vulkan::destroy_pipeline(const Option& /*opt*/) +{ + delete pipeline_relu; + pipeline_relu = 0; + + delete pipeline_relu_pack4; + pipeline_relu_pack4 = 0; + + delete pipeline_relu_pack8; + pipeline_relu_pack8 = 0; + + return 0; +} + +int ReLU_vulkan::record_pipeline(VkTensor& bottom_top_blob, VkCompute& cmd, const Option& opt) const +{ + int elempack = bottom_top_blob.elempack; + + std::vector bindings(1); + bindings[0] = bottom_top_blob; + + std::vector constants(5); + constants[0].i = bottom_top_blob.dims; + constants[1].i = bottom_top_blob.w; + constants[2].i = bottom_top_blob.h; + constants[3].i = bottom_top_blob.c; + constants[4].i = bottom_top_blob.cstep; + + const Pipeline* pipeline = elempack == 8 ? pipeline_relu_pack8 + : elempack == 4 ? pipeline_relu_pack4 + : pipeline_relu; + + cmd.record_pipeline(pipeline, bindings, constants, bottom_top_blob); + + return 0; +} + +int ReLU_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const +{ + printf("run record_pipeline relu!\n"); + return 0; +} + +} \ No newline at end of file diff --git a/source/device/vulkan/layer/relu_vulkan.hpp b/source/device/vulkan/layer/relu_vulkan.hpp new file mode 100644 index 000000000..c928a756f --- /dev/null +++ b/source/device/vulkan/layer/relu_vulkan.hpp @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#ifndef LAYER_RELU_HPP +#define LAYER_RELU_HPP + +#include "../vulkan_layer.hpp" +#include "../vulkan_command.hpp" + +#include "relu_param.h" + +namespace TEngine{ + +class ReLU_vulkan : public Layer +{ +public: + ReLU_vulkan(); + ReLU_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const; + virtual int record_pipeline(VkTensor& bottom_top_blob, VkCompute& cmd, const Option& opt) const; + +public: + Pipeline* pipeline_relu; + Pipeline* pipeline_relu_pack4; + Pipeline* pipeline_relu_pack8; + +public: + int input_c; + int input_h; + int input_w; + int output_c; + int output_h; + int output_w; + float negative_slope; +}; + +} // namespace TEngine + +#endif \ No newline at end of file diff --git a/source/device/vulkan/layer/reshape_vulkan.cpp b/source/device/vulkan/layer/reshape_vulkan.cpp new file mode 100644 index 000000000..7e36dca8f --- /dev/null +++ b/source/device/vulkan/layer/reshape_vulkan.cpp @@ -0,0 +1,580 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. 
+ * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#include "reshape_vulkan.hpp" +#include "../layer_shader_type.h" + +namespace TEngine { + +Reshape_vulkan::Reshape_vulkan() +{ + support_vulkan = true; + support_image_storage = true; + + permute_hwc = 0; + permute_hc = 0; + permute_hw = 0; + permute_chw = 0; + + pipeline_reshape = 0; + pipeline_reshape_pack4 = 0; + pipeline_reshape_pack1to4 = 0; + pipeline_reshape_pack4to1 = 0; + pipeline_reshape_pack8 = 0; + pipeline_reshape_pack1to8 = 0; + pipeline_reshape_pack4to8 = 0; + pipeline_reshape_pack8to4 = 0; + pipeline_reshape_pack8to1 = 0; +} + +Reshape_vulkan::Reshape_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) +{ + support_vulkan = true; + support_image_storage = true; + + permute_hwc = 0; + permute_hc = 0; + permute_hw = 0; + permute_chw = 0; + + pipeline_reshape = 0; + pipeline_reshape_pack4 = 0; + pipeline_reshape_pack1to4 = 0; + pipeline_reshape_pack4to1 = 0; + pipeline_reshape_pack8 = 0; + pipeline_reshape_pack1to8 = 0; + pipeline_reshape_pack4to8 = 0; + pipeline_reshape_pack8to4 = 0; + pipeline_reshape_pack8to1 = 0; + + graph = ir_graph; + node = ir_node; + + struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[0]); + std::string name = input->name; + bottoms.push_back(name); + + struct tensor *output = get_ir_graph_tensor(graph, node->output_tensors[0]); + name = output->name; + tops.push_back(name); + + // params + input_c = input->dims[1]; // param->input_channel; + input_h = input->dims[2]; + input_w = input->dims[3]; + + struct reshape_param *param = (struct reshape_param *)ir_node->op.param_mem; + + ndim = param->dim_size; + permute = param->reverse; + // TODO fix + // c = param->re_shape[0]; + // w = param->re_shape[1]; + // h = param->re_shape[2]; + if(param->dim_size == 4) + { + ndim = 3; + output_c = output->dims[1]; // param->output_channel; + output_h = output->dims[2]; + output_w = output->dims[3]; + + c = output->dims[1]; // param->output_channel; + h = output->dims[2]; + w = output->dims[3]; + } + else + { + ndim = param->dim_size; + + output_c = output->dims[0]; // param->output_channel; + output_h = output->dims[1]; + output_w = output->dims[2]; + + c = output_c; // param->output_channel; + h = output_h; + w = output_w; + } + + + +} + +int Reshape_vulkan::create_pipeline(const Option& _opt) +{ + Option opt = _opt; + const Tensor& shape = Tensor(input_w, input_h, input_c, (void*)0); // bottom_shapes.empty() ? Tensor() : bottom_shapes[0]; + const Tensor& out_shape = Tensor(output_w, output_h, output_c, (void*)0); // top_shapes.empty() ? 
Tensor() : top_shapes[0]; + + bool need_permute = permute == 1; + if (shape.dims == 2 && ndim == 2 && shape.h == out_shape.h) + need_permute = false; + if (shape.dims == 3 && ndim == 3 && shape.c == out_shape.c) + need_permute = false; + + Tensor shape_permuted = shape; + Tensor out_shape_permuted = out_shape; + if (need_permute) + { + if (shape.dims == 1) shape_permuted = Tensor(shape.w, (void*)0); + if (shape.dims == 2) shape_permuted = Tensor(shape.h, shape.w, (void*)0); + if (shape.dims == 3) shape_permuted = Tensor(shape.c, shape.w, shape.h, (void*)0); + + if (out_shape.dims == 1) out_shape_permuted = Tensor(out_shape.w, (void*)0); + if (out_shape.dims == 2) out_shape_permuted = Tensor(out_shape.h, out_shape.w, (void*)0); + if (out_shape.dims == 3) out_shape_permuted = Tensor(out_shape.c, out_shape.w, out_shape.h, (void*)0); + } + + int elempack = 1; + if (shape_permuted.dims == 1) elempack = opt.use_shader_pack8 && shape_permuted.w % 8 == 0 ? 8 : shape_permuted.w % 4 == 0 ? 4 : 1; + if (shape_permuted.dims == 2) elempack = opt.use_shader_pack8 && shape_permuted.h % 8 == 0 ? 8 : shape_permuted.h % 4 == 0 ? 4 : 1; + if (shape_permuted.dims == 3) elempack = opt.use_shader_pack8 && shape_permuted.c % 8 == 0 ? 8 : shape_permuted.c % 4 == 0 ? 4 : 1; + + int out_elempack = 1; + if (out_shape_permuted.dims == 1) out_elempack = opt.use_shader_pack8 && out_shape_permuted.w % 8 == 0 ? 8 : out_shape_permuted.w % 4 == 0 ? 4 : 1; + if (out_shape_permuted.dims == 2) out_elempack = opt.use_shader_pack8 && out_shape_permuted.h % 8 == 0 ? 8 : out_shape_permuted.h % 4 == 0 ? 4 : 1; + if (out_shape_permuted.dims == 3) out_elempack = opt.use_shader_pack8 && out_shape_permuted.c % 8 == 0 ? 8 : out_shape_permuted.c % 4 == 0 ? 4 : 1; + + size_t elemsize; + size_t out_elemsize; + if (opt.use_fp16_storage) + { + elemsize = elempack * 2u; + out_elemsize = out_elempack * 2u; + } + else if (opt.use_fp16_packed) + { + elemsize = elempack == 1 ? 4u : elempack * 2u; + out_elemsize = out_elempack == 1 ? 
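+ // elempack / out_elempack were derived from the (possibly permuted) shapes above,
+ // so these byte sizes already reflect the permute handling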
4u : out_elempack * 2u; + } + else + { + elemsize = elempack * 4u; + out_elemsize = out_elempack * 4u; + } + + Tensor shape_packed; + if (shape_permuted.dims == 1) shape_packed = Tensor(shape_permuted.w / elempack, (void*)0, elemsize, elempack); + if (shape_permuted.dims == 2) shape_packed = Tensor(shape_permuted.w, shape_permuted.h / elempack, (void*)0, elemsize, elempack); + if (shape_permuted.dims == 3) shape_packed = Tensor(shape_permuted.w, shape_permuted.h, shape_permuted.c / elempack, (void*)0, elemsize, elempack); + + Tensor out_shape_packed; + if (out_shape_permuted.dims == 1) out_shape_packed = Tensor(out_shape_permuted.w / out_elempack, (void*)0, out_elemsize, out_elempack); + if (out_shape_permuted.dims == 2) out_shape_packed = Tensor(out_shape_permuted.w, out_shape_permuted.h / out_elempack, (void*)0, out_elemsize, out_elempack); + if (out_shape_permuted.dims == 3) out_shape_packed = Tensor(out_shape_permuted.w, out_shape_permuted.h, out_shape_permuted.c / out_elempack, (void*)0, out_elemsize, out_elempack); + + // check blob shape + // if (!vkdev->shape_support_image_storage(shape_packed) || !vkdev->shape_support_image_storage(out_shape_packed)) + { + support_image_storage = false; + opt.use_image_storage = false; + } + + std::vector specializations(1 + 10); + specializations[0].i = ndim; + specializations[1 + 0].i = 0; // shape_packed.dims; + specializations[1 + 1].i = 0; // shape_packed.w; + specializations[1 + 2].i = 0; // shape_packed.h; + specializations[1 + 3].i = 0; // shape_packed.c; + specializations[1 + 4].i = 0; // shape_packed.cstep; + specializations[1 + 5].i = 0; // out_shape_packed.dims; + specializations[1 + 6].i = 0; // out_shape_packed.w; + specializations[1 + 7].i = 0; // out_shape_packed.h; + specializations[1 + 8].i = 0; // out_shape_packed.c; + specializations[1 + 9].i = 0; // out_shape_packed.cstep; + + Tensor local_size_xyz_bottom; // pack4to1 and pack8to1 + if (shape_packed.dims == 1) + { + local_size_xyz_bottom.w = std::min(64, shape_packed.w); + local_size_xyz_bottom.h = 1; + local_size_xyz_bottom.c = 1; + } + if (shape_packed.dims == 2) + { + local_size_xyz_bottom.w = std::min(8, shape_packed.w); + local_size_xyz_bottom.h = std::min(8, shape_packed.h); + local_size_xyz_bottom.c = 1; + } + if (shape_packed.dims == 3) + { + local_size_xyz_bottom.w = std::min(4, shape_packed.w); + local_size_xyz_bottom.h = std::min(4, shape_packed.h); + local_size_xyz_bottom.c = std::min(4, shape_packed.c); + } + + Tensor local_size_xyz; + if (out_shape_packed.dims == 1) + { + local_size_xyz.w = std::min(64, out_shape_packed.w); + local_size_xyz.h = 1; + local_size_xyz.c = 1; + } + if (out_shape_packed.dims == 2) + { + local_size_xyz.w = std::min(8, out_shape_packed.w); + local_size_xyz.h = std::min(8, out_shape_packed.h); + local_size_xyz.c = 1; + } + if (out_shape_packed.dims == 3) + { + local_size_xyz.w = std::min(4, out_shape_packed.w); + local_size_xyz.h = std::min(4, out_shape_packed.h); + local_size_xyz.c = std::min(4, out_shape_packed.c); + } + + // pack1 + if (shape_permuted.dims == 0 || (elempack == 1 && out_elempack == 1)) + { + pipeline_reshape = new Pipeline(vkdev); + pipeline_reshape->set_optimal_local_size_xyz(local_size_xyz); + pipeline_reshape->create(LayerShaderType::reshape, opt, specializations); + } + + // pack4 + if (shape_permuted.dims == 0 || (elempack == 4 && out_elempack == 4)) + { + pipeline_reshape_pack4 = new Pipeline(vkdev); + pipeline_reshape_pack4->set_optimal_local_size_xyz(local_size_xyz); + 
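+ // every variant is created against the zeroed shape specializations; the real
+ // extents are supplied through the constants vector in record_pipeline()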
pipeline_reshape_pack4->create(LayerShaderType::reshape_pack4, opt, specializations); + } + + // pack1to4 + if (shape_permuted.dims == 0 || (elempack == 1 && out_elempack == 4)) + { + pipeline_reshape_pack1to4 = new Pipeline(vkdev); + pipeline_reshape_pack1to4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_reshape_pack1to4->create(LayerShaderType::reshape_pack1to4, opt, specializations); + } + + // pack4to1 + if (shape_permuted.dims == 0 || (elempack == 4 && out_elempack == 1)) + { + pipeline_reshape_pack4to1 = new Pipeline(vkdev); + pipeline_reshape_pack4to1->set_optimal_local_size_xyz(local_size_xyz_bottom); + pipeline_reshape_pack4to1->create(LayerShaderType::reshape_pack4to1, opt, specializations); + } + + // pack8 + if ((opt.use_shader_pack8 && shape_permuted.dims == 0) || (elempack == 8 && out_elempack == 8)) + { + pipeline_reshape_pack8 = new Pipeline(vkdev); + pipeline_reshape_pack8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_reshape_pack8->create(LayerShaderType::reshape_pack8, opt, specializations); + } + + // pack1to8 + if ((opt.use_shader_pack8 && shape_permuted.dims == 0) || (elempack == 1 && out_elempack == 8)) + { + pipeline_reshape_pack1to8 = new Pipeline(vkdev); + pipeline_reshape_pack1to8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_reshape_pack1to8->create(LayerShaderType::reshape_pack1to8, opt, specializations); + } + + // pack4to8 + if ((opt.use_shader_pack8 && shape_permuted.dims == 0) || (elempack == 4 && out_elempack == 8)) + { + pipeline_reshape_pack4to8 = new Pipeline(vkdev); + pipeline_reshape_pack4to8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_reshape_pack4to8->create(LayerShaderType::reshape_pack4to8, opt, specializations); + } + + // pack8to4 + if ((opt.use_shader_pack8 && shape_permuted.dims == 0) || (elempack == 8 && out_elempack == 4)) + { + pipeline_reshape_pack8to4 = new Pipeline(vkdev); + pipeline_reshape_pack8to4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_reshape_pack8to4->create(LayerShaderType::reshape_pack8to4, opt, specializations); + } + + // pack8to1 + if ((opt.use_shader_pack8 && shape_permuted.dims == 0) || (elempack == 8 && out_elempack == 1)) + { + pipeline_reshape_pack8to1 = new Pipeline(vkdev); + pipeline_reshape_pack8to1->set_optimal_local_size_xyz(local_size_xyz_bottom); + pipeline_reshape_pack8to1->create(LayerShaderType::reshape_pack8to1, opt, specializations); + } + + return 0; +} + +int Reshape_vulkan::destroy_pipeline(const Option& opt) +{ + if (permute_hwc) + { + permute_hwc->destroy_pipeline(opt); + delete permute_hwc; + permute_hwc = 0; + } + + if (permute_hc) + { + permute_hc->destroy_pipeline(opt); + delete permute_hc; + permute_hc = 0; + } + + if (permute_hw) + { + permute_hw->destroy_pipeline(opt); + delete permute_hw; + permute_hw = 0; + } + + if (permute_chw) + { + permute_chw->destroy_pipeline(opt); + delete permute_chw; + permute_chw = 0; + } + + delete pipeline_reshape; + pipeline_reshape = 0; + + delete pipeline_reshape_pack4; + pipeline_reshape_pack4 = 0; + + delete pipeline_reshape_pack1to4; + pipeline_reshape_pack1to4 = 0; + + delete pipeline_reshape_pack4to1; + pipeline_reshape_pack4to1 = 0; + + delete pipeline_reshape_pack8; + pipeline_reshape_pack8 = 0; + + delete pipeline_reshape_pack1to8; + pipeline_reshape_pack1to8 = 0; + + delete pipeline_reshape_pack4to8; + pipeline_reshape_pack4to8 = 0; + + delete pipeline_reshape_pack8to4; + pipeline_reshape_pack8to4 = 0; + + delete pipeline_reshape_pack8to1; + pipeline_reshape_pack8to1 = 0; + + return 0; +} + +int 
Reshape_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const +{ + int dims = bottom_blob.dims; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + int out_elempack; + + int total = bottom_blob.w * bottom_blob.h * bottom_blob.c * elempack; + + // resolve out shape + int outw = w; + int outh = h; + int outc = c; + + if (ndim == 1) + { + if (outw == 0) + outw = dims == 1 ? bottom_blob.w * elempack : bottom_blob.w; + + if (outw == -1) + outw = total; + + out_elempack = opt.use_shader_pack8 && outw % 8 == 0 ? 8 : outw % 4 == 0 ? 4 : 1; + + if (dims == 1 && bottom_blob.w == outw && elempack == out_elempack) + { + top_blob = bottom_blob; + return 0; + } + } + if (ndim == 2) + { + if (outw == 0) + outw = dims == 1 ? bottom_blob.w * elempack : bottom_blob.w; + if (outh == 0) + outh = dims == 2 ? bottom_blob.h * elempack : bottom_blob.h; + + if (outw == -1) + outw = total / outh; + if (outh == -1) + outh = total / outw; + + out_elempack = opt.use_shader_pack8 && outh % 8 == 0 ? 8 : outh % 4 == 0 ? 4 : 1; + + if (dims == 2 && bottom_blob.h == outh && elempack == out_elempack) + { + top_blob = bottom_blob; + return 0; + } + } + + if (ndim == 3) + { + if (outw == 0) + outw = dims == 1 ? bottom_blob.w * elempack : bottom_blob.w; + if (outh == 0) + outh = dims == 2 ? bottom_blob.h * elempack : bottom_blob.h; + if (outc == 0) + outc = dims == 3 ? bottom_blob.c * elempack : bottom_blob.c; + + if (outw == -1) + outw = total / outc / outh; + if (outh == -1) + outh = total / outc / outw; + if (outc == -1) + outc = total / outh / outw; + + out_elempack = opt.use_shader_pack8 && outc % 8 == 0 ? 8 : outc % 4 == 0 ? 4 : 1; + + if (dims == 3 && bottom_blob.c == outc && elempack == out_elempack) + { + top_blob = bottom_blob; + top_blob.w = outw; + top_blob.h = outh; + return 0; + } + } + + bool need_permute = permute == 1; + if (dims == 2 && ndim == 2 && bottom_blob.h * elempack == outh) + need_permute = false; + if (dims == 3 && ndim == 3 && bottom_blob.c * elempack == outc) + need_permute = false; + + if (ndim == 1) + { + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (opt.use_fp16_packed && !opt.use_fp16_storage) + { + if (out_elempack == 8) out_elemsize = 8 * 2u; + if (out_elempack == 4) out_elemsize = 4 * 2u; + if (out_elempack == 1) out_elemsize = 4u; + } + + top_blob.create(outw / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + } + if (ndim == 2) + { + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (opt.use_fp16_packed && !opt.use_fp16_storage) + { + if (out_elempack == 8) out_elemsize = 8 * 2u; + if (out_elempack == 4) out_elemsize = 4 * 2u; + if (out_elempack == 1) out_elemsize = 4u; + } + + top_blob.create(outw, outh / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + } + if (ndim == 3) + { + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (opt.use_fp16_packed && !opt.use_fp16_storage) + { + if (out_elempack == 8) out_elemsize = 8 * 2u; + if (out_elempack == 4) out_elemsize = 4 * 2u; + if (out_elempack == 1) out_elemsize = 4u; + } + + top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + } + + if (top_blob.empty()) + return -100; + + std::vector bindings(2); + bindings[0] = bottom_blob; + bindings[1] = top_blob; + + std::vector constants(10); + constants[0].i = bottom_blob.dims; + constants[1].i = bottom_blob.w; + constants[2].i = bottom_blob.h; + constants[3].i = bottom_blob.c; + 
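+ // constants 0-4 describe the bottom blob, 5-9 the freshly created top blob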
constants[4].i = bottom_blob.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = top_blob.cstep; + + if (elempack == 1 && out_elempack == 1) + { + cmd.record_pipeline(pipeline_reshape, bindings, constants, top_blob); + } + else if (elempack == 4 && out_elempack == 4) + { + cmd.record_pipeline(pipeline_reshape_pack4, bindings, constants, top_blob); + } + else if (elempack == 1 && out_elempack == 4) + { + cmd.record_pipeline(pipeline_reshape_pack1to4, bindings, constants, top_blob); + } + else if (elempack == 4 && out_elempack == 1) + { + cmd.record_pipeline(pipeline_reshape_pack4to1, bindings, constants, bottom_blob); + } + else if (elempack == 8 && out_elempack == 8) + { + cmd.record_pipeline(pipeline_reshape_pack8, bindings, constants, top_blob); + } + else if (elempack == 1 && out_elempack == 8) + { + cmd.record_pipeline(pipeline_reshape_pack1to8, bindings, constants, top_blob); + } + else if (elempack == 4 && out_elempack == 8) + { + cmd.record_pipeline(pipeline_reshape_pack4to8, bindings, constants, top_blob); + } + else if (elempack == 8 && out_elempack == 4) + { + cmd.record_pipeline(pipeline_reshape_pack8to4, bindings, constants, top_blob); + } + else if (elempack == 8 && out_elempack == 1) + { + cmd.record_pipeline(pipeline_reshape_pack8to1, bindings, constants, bottom_blob); + } + + return 0; +} + + +} // namespace TEngine \ No newline at end of file diff --git a/source/device/vulkan/layer/reshape_vulkan.hpp b/source/device/vulkan/layer/reshape_vulkan.hpp new file mode 100644 index 000000000..33bc2be41 --- /dev/null +++ b/source/device/vulkan/layer/reshape_vulkan.hpp @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. 
You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#ifndef LAYER_RESHAPE_HPP +#define LAYER_RESHAPE_HPP + +#include "../vulkan_layer.hpp" +#include "../vulkan_command.hpp" + +#include "reshape_param.h" + +namespace TEngine{ + +class Reshape_vulkan : public Layer +{ +public: + Reshape_vulkan(); + Reshape_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const; + +public: + TEngine::Layer* permute_hwc; + TEngine::Layer* permute_hc; + TEngine::Layer* permute_hw; + TEngine::Layer* permute_chw; + + Pipeline* pipeline_reshape; + Pipeline* pipeline_reshape_pack4; + Pipeline* pipeline_reshape_pack1to4; + Pipeline* pipeline_reshape_pack4to1; + Pipeline* pipeline_reshape_pack8; + Pipeline* pipeline_reshape_pack1to8; + Pipeline* pipeline_reshape_pack4to8; + Pipeline* pipeline_reshape_pack8to4; + Pipeline* pipeline_reshape_pack8to1; + +public: + int input_c; + int input_h; + int input_w; + int output_c; + int output_h; + int output_w; + + int w; + int h; + int c; + + // flag permute chw->hwc or hw->wh before and after reshape + int permute; + + int ndim; + +}; + +} // namespace TEngine + +#endif \ No newline at end of file diff --git a/source/device/vulkan/layer/softmax_vulkan.cpp b/source/device/vulkan/layer/softmax_vulkan.cpp new file mode 100644 index 000000000..970e03295 --- /dev/null +++ b/source/device/vulkan/layer/softmax_vulkan.cpp @@ -0,0 +1,486 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. 
You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#include "softmax_vulkan.hpp" +#include "../layer_shader_type.h" + +namespace TEngine { + +Softmax_vulkan::Softmax_vulkan() +{ + support_vulkan = true; + support_image_storage = true; + + pipeline_softmax_reduce_max = 0; + pipeline_softmax_exp_sub_max = 0; + pipeline_softmax_reduce_sum = 0; + pipeline_softmax_div_sum = 0; + + pipeline_softmax_reduce_max_pack4 = 0; + pipeline_softmax_exp_sub_max_pack4 = 0; + pipeline_softmax_reduce_sum_pack4 = 0; + pipeline_softmax_div_sum_pack4 = 0; + + pipeline_softmax_reduce_max_pack8 = 0; + pipeline_softmax_exp_sub_max_pack8 = 0; + pipeline_softmax_reduce_sum_pack8 = 0; + pipeline_softmax_div_sum_pack8 = 0; +} + +Softmax_vulkan::Softmax_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) +{ + support_vulkan = true; + support_image_storage = true; + + pipeline_softmax_reduce_max = 0; + pipeline_softmax_exp_sub_max = 0; + pipeline_softmax_reduce_sum = 0; + pipeline_softmax_div_sum = 0; + + pipeline_softmax_reduce_max_pack4 = 0; + pipeline_softmax_exp_sub_max_pack4 = 0; + pipeline_softmax_reduce_sum_pack4 = 0; + pipeline_softmax_div_sum_pack4 = 0; + + pipeline_softmax_reduce_max_pack8 = 0; + pipeline_softmax_exp_sub_max_pack8 = 0; + pipeline_softmax_reduce_sum_pack8 = 0; + pipeline_softmax_div_sum_pack8 = 0; + + graph = ir_graph; + node = ir_node; + + struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[0]); + std::string name = input->name; + bottoms.push_back(name); + + struct tensor *output = get_ir_graph_tensor(graph, node->output_tensors[0]); + name = output->name; + tops.push_back(name); + + // params + input_c = input->dims[1]; // param->input_channel; + input_h = input->dims[2]; + input_w = input->dims[3]; + output_c = output->dims[1]; // param->output_channel; + output_h = output->dims[2]; + output_w = output->dims[3]; + + struct softmax_param *param = (struct softmax_param *)ir_node->op.param_mem; + axis = param->axis-1; +} + +int Softmax_vulkan::create_pipeline(const Option& opt) +{ + const Tensor& shape = Tensor(output_w, output_h, output_c, (void*)0); // top_shapes.empty() ? Tensor() : top_shapes[0]; + + int elempack = 1; + if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1; + if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1; + if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1; + + size_t elemsize; + if (opt.use_fp16_storage) + { + elemsize = elempack * 2u; + } + else if (opt.use_fp16_packed) + { + elemsize = elempack == 1 ? 
4u : elempack * 2u; + } + else + { + elemsize = elempack * 4u; + } + + Tensor shape_packed; + if (shape.dims == 1) shape_packed = Tensor(shape.w / elempack, (void*)0, elemsize, elempack); + if (shape.dims == 2) shape_packed = Tensor(shape.w, shape.h / elempack, (void*)0, elemsize, elempack); + if (shape.dims == 3) shape_packed = Tensor(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack); + + Tensor workspace_shape_packed; + if (shape.dims == 1) // axis == 0 + { + workspace_shape_packed = Tensor(1, (void*)0, elemsize, elempack); + } + else if (shape.dims == 2 && axis == 0) + { + workspace_shape_packed = Tensor(shape.w, (void*)0, elemsize, elempack); + } + else if (shape.dims == 2 && axis == 1) + { + workspace_shape_packed = Tensor(shape.h / elempack, (void*)0, elemsize, elempack); + } + else if (shape.dims == 3 && axis == 0) + { + workspace_shape_packed = Tensor(shape.w, shape.h, (void*)0, elemsize, elempack); + } + else if (shape.dims == 3 && axis == 1) + { + workspace_shape_packed = Tensor(shape.w, shape.c / elempack, (void*)0, elemsize, elempack); + } + else if (shape.dims == 3 && axis == 2) + { + workspace_shape_packed = Tensor(shape.h, shape.c / elempack, (void*)0, elemsize, elempack); + } + + std::vector specializations(1 + 10); + specializations[0].i = axis; + specializations[1 + 0].i = 0; // shape_packed.dims; + specializations[1 + 1].i = 0; // shape_packed.w; + specializations[1 + 2].i = 0; // shape_packed.h; + specializations[1 + 3].i = 0; // shape_packed.c; + specializations[1 + 4].i = 0; // shape_packed.cstep; + specializations[1 + 5].i = 0; // workspace_shape_packed.dims; + specializations[1 + 6].i = 0; // workspace_shape_packed.w; + specializations[1 + 7].i = 0; // workspace_shape_packed.h; + specializations[1 + 8].i = 0; // workspace_shape_packed.c; + specializations[1 + 9].i = 0; // workspace_shape_packed.cstep; + + { + Tensor local_size_xyz; + if (workspace_shape_packed.dims == 1) + { + local_size_xyz.w = std::min(64, workspace_shape_packed.w); + local_size_xyz.h = 1; + local_size_xyz.c = 1; + } + if (workspace_shape_packed.dims == 2) + { + local_size_xyz.w = std::min(8, workspace_shape_packed.w); + local_size_xyz.h = std::min(8, workspace_shape_packed.h); + local_size_xyz.c = 1; + } + if (workspace_shape_packed.dims != 0) + { + local_size_xyz.w = std::min(4, workspace_shape_packed.w); + local_size_xyz.h = std::min(4, workspace_shape_packed.h); + local_size_xyz.c = std::min(4, workspace_shape_packed.c); + } + + // pack1 + { + pipeline_softmax_reduce_max = new Pipeline(vkdev); + pipeline_softmax_reduce_sum = new Pipeline(vkdev); + + pipeline_softmax_reduce_max->set_optimal_local_size_xyz(local_size_xyz); + pipeline_softmax_reduce_sum->set_optimal_local_size_xyz(local_size_xyz); + + pipeline_softmax_reduce_max->create(LayerShaderType::softmax_reduce_max, opt, specializations); + pipeline_softmax_reduce_sum->create(LayerShaderType::softmax_reduce_sum, opt, specializations); + } + + // pack4 + { + pipeline_softmax_reduce_max_pack4 = new Pipeline(vkdev); + pipeline_softmax_reduce_sum_pack4 = new Pipeline(vkdev); + + pipeline_softmax_reduce_max_pack4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_softmax_reduce_sum_pack4->set_optimal_local_size_xyz(local_size_xyz); + + pipeline_softmax_reduce_max_pack4->create(LayerShaderType::softmax_reduce_max_pack4, opt, specializations); + pipeline_softmax_reduce_sum_pack4->create(LayerShaderType::softmax_reduce_sum_pack4, opt, specializations); + } + + // pack8 + if (opt.use_shader_pack8) + { + 
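// pack8 variants of the two reduce passes; the matching exp_sub_max / div_sum pack8 pipelines are created in the second block below +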
pipeline_softmax_reduce_max_pack8 = new Pipeline(vkdev); + pipeline_softmax_reduce_sum_pack8 = new Pipeline(vkdev); + + pipeline_softmax_reduce_max_pack8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_softmax_reduce_sum_pack8->set_optimal_local_size_xyz(local_size_xyz); + + pipeline_softmax_reduce_max_pack8->create(LayerShaderType::softmax_reduce_max_pack8, opt, specializations); + pipeline_softmax_reduce_sum_pack8->create(LayerShaderType::softmax_reduce_sum_pack8, opt, specializations); + } + } + + { + Tensor local_size_xyz; + if (shape_packed.dims == 1) + { + local_size_xyz.w = std::min(64, shape_packed.w); + local_size_xyz.h = 1; + local_size_xyz.c = 1; + } + if (shape_packed.dims == 2) + { + local_size_xyz.w = std::min(8, shape_packed.w); + local_size_xyz.h = std::min(8, shape_packed.h); + local_size_xyz.c = 1; + } + if (shape_packed.dims == 3) + { + local_size_xyz.w = std::min(4, shape_packed.w); + local_size_xyz.h = std::min(4, shape_packed.h); + local_size_xyz.c = std::min(4, shape_packed.c); + } + + // pack1 + { + pipeline_softmax_exp_sub_max = new Pipeline(vkdev); + pipeline_softmax_div_sum = new Pipeline(vkdev); + + pipeline_softmax_exp_sub_max->set_optimal_local_size_xyz(local_size_xyz); + pipeline_softmax_div_sum->set_optimal_local_size_xyz(local_size_xyz); + + pipeline_softmax_exp_sub_max->create(LayerShaderType::softmax_exp_sub_max, opt, specializations); + pipeline_softmax_div_sum->create(LayerShaderType::softmax_div_sum, opt, specializations); + } + + // pack4 + { + pipeline_softmax_exp_sub_max_pack4 = new Pipeline(vkdev); + pipeline_softmax_div_sum_pack4 = new Pipeline(vkdev); + + pipeline_softmax_exp_sub_max_pack4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_softmax_div_sum_pack4->set_optimal_local_size_xyz(local_size_xyz); + + pipeline_softmax_exp_sub_max_pack4->create(LayerShaderType::softmax_exp_sub_max_pack4, opt, specializations); + pipeline_softmax_div_sum_pack4->create(LayerShaderType::softmax_div_sum_pack4, opt, specializations); + } + + // pack8 + if (opt.use_shader_pack8) + { + pipeline_softmax_exp_sub_max_pack8 = new Pipeline(vkdev); + pipeline_softmax_div_sum_pack8 = new Pipeline(vkdev); + + pipeline_softmax_exp_sub_max_pack8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_softmax_div_sum_pack8->set_optimal_local_size_xyz(local_size_xyz); + + pipeline_softmax_exp_sub_max_pack8->create(LayerShaderType::softmax_exp_sub_max_pack8, opt, specializations); + pipeline_softmax_div_sum_pack8->create(LayerShaderType::softmax_div_sum_pack8, opt, specializations); + } + } + + return 0; +} + + +int Softmax_vulkan::destroy_pipeline(const Option& /*opt*/) +{ + delete pipeline_softmax_reduce_max; + pipeline_softmax_reduce_max = 0; + + delete pipeline_softmax_exp_sub_max; + pipeline_softmax_exp_sub_max = 0; + + delete pipeline_softmax_reduce_sum; + pipeline_softmax_reduce_sum = 0; + + delete pipeline_softmax_div_sum; + pipeline_softmax_div_sum = 0; + + delete pipeline_softmax_reduce_max_pack4; + pipeline_softmax_reduce_max_pack4 = 0; + + delete pipeline_softmax_exp_sub_max_pack4; + pipeline_softmax_exp_sub_max_pack4 = 0; + + delete pipeline_softmax_reduce_sum_pack4; + pipeline_softmax_reduce_sum_pack4 = 0; + + delete pipeline_softmax_div_sum_pack4; + pipeline_softmax_div_sum_pack4 = 0; + + delete pipeline_softmax_reduce_max_pack8; + pipeline_softmax_reduce_max_pack8 = 0; + + delete pipeline_softmax_exp_sub_max_pack8; + pipeline_softmax_exp_sub_max_pack8 = 0; + + delete pipeline_softmax_reduce_sum_pack8; + pipeline_softmax_reduce_sum_pack8 = 0; + + 
delete pipeline_softmax_div_sum_pack8; + pipeline_softmax_div_sum_pack8 = 0; + + return 0; +} + +int Softmax_vulkan::record_pipeline(VkTensor& bottom_top_blob, VkCompute& cmd, const Option& opt) const +{ + int dims = bottom_top_blob.dims; + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int channels = bottom_top_blob.c; + size_t elemsize = bottom_top_blob.elemsize; + int elempack = bottom_top_blob.elempack; + + VkTensor max_workspace; + VkTensor sum_workspace; + + if (dims == 1) // axis == 0 + { + max_workspace.create(1, elemsize, elempack, opt.workspace_vkallocator); + sum_workspace.create(1, elemsize, elempack, opt.workspace_vkallocator); + } + else if (dims == 2 && axis == 0) + { + max_workspace.create(w, elemsize, elempack, opt.workspace_vkallocator); + sum_workspace.create(w, elemsize, elempack, opt.workspace_vkallocator); + } + else if (dims == 2 && axis == 1) + { + max_workspace.create(h, elemsize, elempack, opt.workspace_vkallocator); + sum_workspace.create(h, elemsize, elempack, opt.workspace_vkallocator); + } + else if (dims == 3 && axis == 0) + { + max_workspace.create(w, h, elemsize, elempack, opt.workspace_vkallocator); + sum_workspace.create(w, h, elemsize, elempack, opt.workspace_vkallocator); + } + else if (dims == 3 && axis == 1) + { + max_workspace.create(w, channels, elemsize, elempack, opt.workspace_vkallocator); + sum_workspace.create(w, channels, elemsize, elempack, opt.workspace_vkallocator); + } + else if (dims == 3 && axis == 2) + { + max_workspace.create(h, channels, elemsize, elempack, opt.workspace_vkallocator); + sum_workspace.create(h, channels, elemsize, elempack, opt.workspace_vkallocator); + } + + // reduce max + { + std::vector bindings(2); + bindings[0] = bottom_top_blob; + bindings[1] = max_workspace; + + std::vector constants(10); + constants[0].i = bottom_top_blob.dims; + constants[1].i = bottom_top_blob.w; + constants[2].i = bottom_top_blob.h; + constants[3].i = bottom_top_blob.c; + constants[4].i = bottom_top_blob.cstep; + constants[5].i = max_workspace.dims; + constants[6].i = max_workspace.w; + constants[7].i = max_workspace.h; + constants[8].i = max_workspace.c; + constants[9].i = max_workspace.cstep; + + const Pipeline* pipeline = elempack == 8 ? pipeline_softmax_reduce_max_pack8 + : elempack == 4 ? pipeline_softmax_reduce_max_pack4 + : pipeline_softmax_reduce_max; + + cmd.record_pipeline(pipeline, bindings, constants, max_workspace); + } + + // exp( v - max ) + { + std::vector bindings(2); + bindings[0] = bottom_top_blob; + bindings[1] = max_workspace; + + std::vector constants(10); + constants[0].i = bottom_top_blob.dims; + constants[1].i = bottom_top_blob.w; + constants[2].i = bottom_top_blob.h; + constants[3].i = bottom_top_blob.c; + constants[4].i = bottom_top_blob.cstep; + constants[5].i = max_workspace.dims; + constants[6].i = max_workspace.w; + constants[7].i = max_workspace.h; + constants[8].i = max_workspace.c; + constants[9].i = max_workspace.cstep; + + const Pipeline* pipeline = elempack == 8 ? pipeline_softmax_exp_sub_max_pack8 + : elempack == 4 ? 
pipeline_softmax_exp_sub_max_pack4 + : pipeline_softmax_exp_sub_max; + + cmd.record_pipeline(pipeline, bindings, constants, bottom_top_blob); + } + + // reduce sum + { + std::vector bindings(2); + bindings[0] = bottom_top_blob; + bindings[1] = sum_workspace; + + std::vector constants(10); + constants[0].i = bottom_top_blob.dims; + constants[1].i = bottom_top_blob.w; + constants[2].i = bottom_top_blob.h; + constants[3].i = bottom_top_blob.c; + constants[4].i = bottom_top_blob.cstep; + constants[5].i = sum_workspace.dims; + constants[6].i = sum_workspace.w; + constants[7].i = sum_workspace.h; + constants[8].i = sum_workspace.c; + constants[9].i = sum_workspace.cstep; + + const Pipeline* pipeline = elempack == 8 ? pipeline_softmax_reduce_sum_pack8 + : elempack == 4 ? pipeline_softmax_reduce_sum_pack4 + : pipeline_softmax_reduce_sum; + + cmd.record_pipeline(pipeline, bindings, constants, sum_workspace); + } + + // div sum + { + std::vector bindings(2); + bindings[0] = bottom_top_blob; + bindings[1] = sum_workspace; + + std::vector constants(10); + constants[0].i = bottom_top_blob.dims; + constants[1].i = bottom_top_blob.w; + constants[2].i = bottom_top_blob.h; + constants[3].i = bottom_top_blob.c; + constants[4].i = bottom_top_blob.cstep; + constants[5].i = sum_workspace.dims; + constants[6].i = sum_workspace.w; + constants[7].i = sum_workspace.h; + constants[8].i = sum_workspace.c; + constants[9].i = sum_workspace.cstep; + + const Pipeline* pipeline = elempack == 8 ? pipeline_softmax_div_sum_pack8 + : elempack == 4 ? pipeline_softmax_div_sum_pack4 + : pipeline_softmax_div_sum; + + cmd.record_pipeline(pipeline, bindings, constants, bottom_top_blob); + } + + return 0; +} + + +} // namespace TEngine diff --git a/source/device/vulkan/layer/softmax_vulkan.hpp b/source/device/vulkan/layer/softmax_vulkan.hpp new file mode 100644 index 000000000..108ea5d62 --- /dev/null +++ b/source/device/vulkan/layer/softmax_vulkan.hpp @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. 
You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#ifndef LAYER_SOFTMAX_HPP +#define LAYER_SOFTMAX_HPP + +#include "../vulkan_layer.hpp" +#include "../vulkan_command.hpp" + +#include "softmax_param.h" + +namespace TEngine{ + +class Softmax_vulkan : public Layer +{ +public: + Softmax_vulkan(); + Softmax_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + virtual int record_pipeline(VkTensor& bottom_top_blob, VkCompute& cmd, const Option& opt) const; + +public: + Pipeline* pipeline_softmax_reduce_max; + Pipeline* pipeline_softmax_exp_sub_max; + Pipeline* pipeline_softmax_reduce_sum; + Pipeline* pipeline_softmax_div_sum; + + Pipeline* pipeline_softmax_reduce_max_pack4; + Pipeline* pipeline_softmax_exp_sub_max_pack4; + Pipeline* pipeline_softmax_reduce_sum_pack4; + Pipeline* pipeline_softmax_div_sum_pack4; + + Pipeline* pipeline_softmax_reduce_max_pack8; + Pipeline* pipeline_softmax_exp_sub_max_pack8; + Pipeline* pipeline_softmax_reduce_sum_pack8; + Pipeline* pipeline_softmax_div_sum_pack8; + +public: + int axis; + int input_c; + int input_h; + int input_w; + int output_c; + int output_h; + int output_w; + +}; + +} // namespace TEngine + +#endif \ No newline at end of file diff --git a/source/device/vulkan/layer_shader_registry.h.in b/source/device/vulkan/layer_shader_registry.h.in new file mode 100644 index 000000000..9a88eb460 --- /dev/null +++ b/source/device/vulkan/layer_shader_registry.h.in @@ -0,0 +1,6 @@ +// Layer Shader Registry header +// +// This file is auto-generated by cmake, don't edit it. + +@layer_shader_registry@ + diff --git a/source/device/vulkan/layer_shader_spv_data.h.in b/source/device/vulkan/layer_shader_spv_data.h.in new file mode 100644 index 000000000..ab1b7b8aa --- /dev/null +++ b/source/device/vulkan/layer_shader_spv_data.h.in @@ -0,0 +1,6 @@ +// Layer Shader Spv Data header +// +// This file is auto-generated by cmake, don't edit it. + +@layer_shader_spv_data@ + diff --git a/source/device/vulkan/layer_shader_type.h b/source/device/vulkan/layer_shader_type.h new file mode 100644 index 000000000..e9c713062 --- /dev/null +++ b/source/device/vulkan/layer_shader_type.h @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. 
+ * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#ifndef LAYER_SHADER_TYPE_H +#define LAYER_SHADER_TYPE_H + +namespace TEngine { + +namespace LayerShaderType { +enum LayerShaderType +{ +#include "layer_shader_type_enum.h" +}; +} // namespace LayerType + +} // namespace TEngine + +#endif // LAYER_SHADER_TYPE_H \ No newline at end of file diff --git a/source/device/vulkan/layer_shader_type_enum.h.in b/source/device/vulkan/layer_shader_type_enum.h.in new file mode 100644 index 000000000..1d3db77e9 --- /dev/null +++ b/source/device/vulkan/layer_shader_type_enum.h.in @@ -0,0 +1,5 @@ +// Layer Shader Enum header +// +// This file is auto-generated by cmake, don't edit it. + +@layer_shader_type_enum@ \ No newline at end of file diff --git a/source/device/vulkan/layer_type_enum.h.in b/source/device/vulkan/layer_type_enum.h.in new file mode 100644 index 000000000..88fa1a51b --- /dev/null +++ b/source/device/vulkan/layer_type_enum.h.in @@ -0,0 +1,5 @@ +// Layer Type Enum header +// +// This file is auto-generated by cmake, don't edit it. + +@layer_type_enum@ \ No newline at end of file diff --git a/source/device/vulkan/shaders/concat.comp b/source/device/vulkan/shaders/concat.comp new file mode 100644 index 000000000..5c904b42e --- /dev/null +++ b/source/device/vulkan/shaders/concat.comp @@ -0,0 +1,108 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
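+// concat: copy this input blob into the concatenated output, shifted by p.offset along the
+// concat axis; each invocation handles one element.
+// Buffer-path index sketch (assuming the psc()/buffer_cp1 helpers behave as named):
+//   gi       = gz * cstep + gy * w + gx                      flat index into the input
+//   gxyz[dims - 1 - axis] += p.offset                        shift along the concat axis
+//   v_offset = gxyz.z * outcstep + gxyz.y * outw + gxyz.x    flat index into the output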
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int axis = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; + + int offset; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_cp1(top_blob_1d, gx + p.offset, bottom_blob_1d, gx); + } + else if (psc(dims) == 2) + { + if (axis == 0) image2d_cp1(top_blob_2d, ivec2(gx, gy + p.offset), bottom_blob_2d, ivec2(gx, gy)); + if (axis == 1) image2d_cp1(top_blob_2d, ivec2(gx + p.offset, gy), bottom_blob_2d, ivec2(gx, gy)); + } + else // if (psc(dims) == 3) + { + if (axis == 0) image3d_cp1(top_blob_3d, ivec3(gx, gy, gz + p.offset), bottom_blob_3d, ivec3(gx, gy, gz)); + if (axis == 1) image3d_cp1(top_blob_3d, ivec3(gx, gy + p.offset, gz), bottom_blob_3d, ivec3(gx, gy, gz)); + if (axis == 2) image3d_cp1(top_blob_3d, ivec3(gx + p.offset, gy, gz), bottom_blob_3d, ivec3(gx, gy, gz)); + } +#else + const int gi = gz * psc(cstep) + gy * psc(w) + gx; + + ivec3 gxyz = ivec3(gx, gy, gz); + + gxyz[psc(dims) - 1 - axis] += p.offset; + + int v_offset = gxyz.z * psc(outcstep) + gxyz.y * psc(outw) + gxyz.x; + + buffer_cp1(top_blob_data, v_offset, bottom_blob_data, gi); +#endif +} diff --git a/source/device/vulkan/shaders/concat_pack4.comp b/source/device/vulkan/shaders/concat_pack4.comp new file mode 100644 index 000000000..e904aec55 --- /dev/null +++ b/source/device/vulkan/shaders/concat_pack4.comp @@ -0,0 +1,108 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. 
+// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int axis = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; + + int offset; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_cp4(top_blob_1d, gx + p.offset, bottom_blob_1d, gx); + } + else if (psc(dims) == 2) + { + if (axis == 0) image2d_cp4(top_blob_2d, ivec2(gx, gy + p.offset), bottom_blob_2d, ivec2(gx, gy)); + if (axis == 1) image2d_cp4(top_blob_2d, ivec2(gx + p.offset, gy), bottom_blob_2d, ivec2(gx, gy)); + } + else // if (psc(dims) == 3) + { + if (axis == 0) image3d_cp4(top_blob_3d, ivec3(gx, gy, gz + p.offset), bottom_blob_3d, ivec3(gx, gy, gz)); + if (axis == 1) image3d_cp4(top_blob_3d, ivec3(gx, gy + p.offset, gz), bottom_blob_3d, ivec3(gx, gy, gz)); + if (axis == 2) image3d_cp4(top_blob_3d, ivec3(gx + p.offset, gy, gz), bottom_blob_3d, ivec3(gx, gy, gz)); + } +#else + const int gi = gz * psc(cstep) + gy * psc(w) + gx; + + ivec3 gxyz = ivec3(gx, gy, gz); + + gxyz[psc(dims) - 1 - axis] += p.offset; + + int 
v_offset = gxyz.z * psc(outcstep) + gxyz.y * psc(outw) + gxyz.x; + + buffer_cp4(top_blob_data, v_offset, bottom_blob_data, gi); +#endif +} diff --git a/source/device/vulkan/shaders/concat_pack4to1.comp b/source/device/vulkan/shaders/concat_pack4to1.comp new file mode 100644 index 000000000..bf69cebab --- /dev/null +++ b/source/device/vulkan/shaders/concat_pack4to1.comp @@ -0,0 +1,164 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int axis = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; + + int offset; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + if (psc(dims) == 1) + { + afpvec4 v = image1d_ld4(bottom_blob_1d, gx); + + int gx4 = gx * 4 + p.offset; + + image1d_st1(top_blob_1d, gx4 + 0, v.r); + image1d_st1(top_blob_1d, gx4 + 1, v.g); + image1d_st1(top_blob_1d, gx4 + 2, v.b); + image1d_st1(top_blob_1d, gx4 + 3, v.a); + } + else 
if (psc(dims) == 2) + { + afpvec4 v = image2d_ld4(bottom_blob_2d, ivec2(gx, gy)); + + if (axis == 0) + { + int gy4 = gy * 4 + p.offset; + + image2d_st1(top_blob_2d, ivec2(gx, gy4 + 0), v.r); + image2d_st1(top_blob_2d, ivec2(gx, gy4 + 1), v.g); + image2d_st1(top_blob_2d, ivec2(gx, gy4 + 2), v.b); + image2d_st1(top_blob_2d, ivec2(gx, gy4 + 3), v.a); + } + if (axis == 1) + { + int gx4 = gx * 4 + p.offset; + + image2d_st1(top_blob_2d, ivec2(gx4 + 0, gy), v.r); + image2d_st1(top_blob_2d, ivec2(gx4 + 1, gy), v.g); + image2d_st1(top_blob_2d, ivec2(gx4 + 2, gy), v.b); + image2d_st1(top_blob_2d, ivec2(gx4 + 3, gy), v.a); + } + } + else // if (psc(dims) == 3) + { + afpvec4 v = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, gz)); + + if (axis == 0) + { + int gz4 = gz * 4 + p.offset; + + image3d_st1(top_blob_3d, ivec3(gx, gy, gz4 + 0), v.r); + image3d_st1(top_blob_3d, ivec3(gx, gy, gz4 + 1), v.g); + image3d_st1(top_blob_3d, ivec3(gx, gy, gz4 + 2), v.b); + image3d_st1(top_blob_3d, ivec3(gx, gy, gz4 + 3), v.a); + } + if (axis == 1) + { + int gy4 = gy * 4 + p.offset; + + image3d_st1(top_blob_3d, ivec3(gx, gy4 + 0, gz), v.r); + image3d_st1(top_blob_3d, ivec3(gx, gy4 + 1, gz), v.g); + image3d_st1(top_blob_3d, ivec3(gx, gy4 + 2, gz), v.b); + image3d_st1(top_blob_3d, ivec3(gx, gy4 + 3, gz), v.a); + } + if (axis == 2) + { + int gx4 = gx * 4 + p.offset; + + image3d_st1(top_blob_3d, ivec3(gx4 + 0, gy, gz), v.r); + image3d_st1(top_blob_3d, ivec3(gx4 + 1, gy, gz), v.g); + image3d_st1(top_blob_3d, ivec3(gx4 + 2, gy, gz), v.b); + image3d_st1(top_blob_3d, ivec3(gx4 + 3, gy, gz), v.a); + } + } +#else + const int gi = gz * psc(cstep) + gy * psc(w) + gx; + + ivec3 gxyz = ivec3(gx, gy, gz); + + gxyz[psc(dims) - 1] *= 4; + gxyz[psc(dims) - 1 - axis] += p.offset; + + int v_offset_0 = gxyz.z * psc(outcstep) + gxyz.y * psc(outw) + gxyz.x; + + ivec3 gxyz4 = ivec3(1, psc(outw), psc(outcstep)); + + ivec4 v_offset = v_offset_0 + ivec4(0, 1, 2, 3) * gxyz4[psc(dims) - 1 - axis]; + + buffer_cp4to1(top_blob_data, v_offset, bottom_blob_data, gi); +#endif +} diff --git a/source/device/vulkan/shaders/concat_pack8.comp b/source/device/vulkan/shaders/concat_pack8.comp new file mode 100644 index 000000000..6353705a5 --- /dev/null +++ b/source/device/vulkan/shaders/concat_pack8.comp @@ -0,0 +1,109 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
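+// concat_pack8: same indexing scheme as the pack1 concat shader above, but every element
+// is a packed group of 8 values (sfpvec8), so one buffer_cp8 call copies all 8 at once.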
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int axis = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; + + int offset; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_cp8(top_blob_1d, gx + p.offset, bottom_blob_1d, gx); + } + else if (psc(dims) == 2) + { + if (axis == 0) image2d_cp8(top_blob_2d, ivec2(gx, gy + p.offset), bottom_blob_2d, ivec2(gx, gy)); + if (axis == 1) image2d_cp8(top_blob_2d, ivec2(gx + p.offset, gy), bottom_blob_2d, ivec2(gx, gy)); + } + else // if (psc(dims) == 3) + { + if (axis == 0) image3d_cp8(top_blob_3d, ivec3(gx, gy, gz + p.offset), bottom_blob_3d, ivec3(gx, gy, gz)); + if (axis == 1) image3d_cp8(top_blob_3d, ivec3(gx, gy + p.offset, gz), bottom_blob_3d, ivec3(gx, gy, gz)); + if (axis == 2) image3d_cp8(top_blob_3d, ivec3(gx + p.offset, gy, gz), bottom_blob_3d, ivec3(gx, gy, gz)); + } +#else + const int gi = gz * psc(cstep) + gy * psc(w) + gx; + + ivec3 gxyz = ivec3(gx, gy, gz); + + gxyz[psc(dims) - 1 - axis] += p.offset; + + int v_offset = gxyz.z * psc(outcstep) + gxyz.y * psc(outw) + gxyz.x; + + buffer_cp8(top_blob_data, v_offset, bottom_blob_data, gi); +#endif +} diff --git a/source/device/vulkan/shaders/concat_pack8to1.comp b/source/device/vulkan/shaders/concat_pack8to1.comp new file mode 100644 index 000000000..ffeedd8c9 --- /dev/null +++ b/source/device/vulkan/shaders/concat_pack8to1.comp @@ -0,0 +1,190 @@ +// Tencent is pleased to support the open source community by making ncnn available. 
+// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int axis = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; + + int offset; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + if (psc(dims) == 1) + { + afpvec8 v = image1d_ld8(bottom_blob_1d, gx); + + int gx8 = gx * 8 + p.offset; + + image1d_st1(top_blob_1d, gx8 + 0, v[0].r); + image1d_st1(top_blob_1d, gx8 + 1, v[0].g); + image1d_st1(top_blob_1d, gx8 + 2, v[0].b); + image1d_st1(top_blob_1d, gx8 + 3, v[0].a); + image1d_st1(top_blob_1d, gx8 + 4, v[1].r); + image1d_st1(top_blob_1d, gx8 + 5, v[1].g); + image1d_st1(top_blob_1d, gx8 + 6, v[1].b); + image1d_st1(top_blob_1d, gx8 + 7, v[1].a); + } + else if (psc(dims) == 2) + { + afpvec8 v = image2d_ld8(bottom_blob_2d, ivec2(gx, gy)); + + if (axis == 0) + { + int gy8 = gy * 8 + p.offset; + + image2d_st1(top_blob_2d, ivec2(gx, gy8 + 0), v[0].r); + image2d_st1(top_blob_2d, ivec2(gx, gy8 + 
1), v[0].g); + image2d_st1(top_blob_2d, ivec2(gx, gy8 + 2), v[0].b); + image2d_st1(top_blob_2d, ivec2(gx, gy8 + 3), v[0].a); + image2d_st1(top_blob_2d, ivec2(gx, gy8 + 4), v[1].r); + image2d_st1(top_blob_2d, ivec2(gx, gy8 + 5), v[1].g); + image2d_st1(top_blob_2d, ivec2(gx, gy8 + 6), v[1].b); + image2d_st1(top_blob_2d, ivec2(gx, gy8 + 7), v[1].a); + } + if (axis == 1) + { + int gx8 = gx * 8 + p.offset; + + image2d_st1(top_blob_2d, ivec2(gx8 + 0, gy), v[0].r); + image2d_st1(top_blob_2d, ivec2(gx8 + 1, gy), v[0].g); + image2d_st1(top_blob_2d, ivec2(gx8 + 2, gy), v[0].b); + image2d_st1(top_blob_2d, ivec2(gx8 + 3, gy), v[0].a); + image2d_st1(top_blob_2d, ivec2(gx8 + 4, gy), v[1].r); + image2d_st1(top_blob_2d, ivec2(gx8 + 5, gy), v[1].g); + image2d_st1(top_blob_2d, ivec2(gx8 + 6, gy), v[1].b); + image2d_st1(top_blob_2d, ivec2(gx8 + 7, gy), v[1].a); + } + } + else // if (psc(dims) == 3) + { + afpvec8 v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); + + if (axis == 0) + { + int gz8 = gz * 8 + p.offset; + + image3d_st1(top_blob_3d, ivec3(gx, gy, gz8 + 0), v[0].r); + image3d_st1(top_blob_3d, ivec3(gx, gy, gz8 + 1), v[0].g); + image3d_st1(top_blob_3d, ivec3(gx, gy, gz8 + 2), v[0].b); + image3d_st1(top_blob_3d, ivec3(gx, gy, gz8 + 3), v[0].a); + image3d_st1(top_blob_3d, ivec3(gx, gy, gz8 + 4), v[1].r); + image3d_st1(top_blob_3d, ivec3(gx, gy, gz8 + 5), v[1].g); + image3d_st1(top_blob_3d, ivec3(gx, gy, gz8 + 6), v[1].b); + image3d_st1(top_blob_3d, ivec3(gx, gy, gz8 + 7), v[1].a); + } + if (axis == 1) + { + int gy8 = gy * 8 + p.offset; + + image3d_st1(top_blob_3d, ivec3(gx, gy8 + 0, gz), v[0].r); + image3d_st1(top_blob_3d, ivec3(gx, gy8 + 1, gz), v[0].g); + image3d_st1(top_blob_3d, ivec3(gx, gy8 + 2, gz), v[0].b); + image3d_st1(top_blob_3d, ivec3(gx, gy8 + 3, gz), v[0].a); + image3d_st1(top_blob_3d, ivec3(gx, gy8 + 4, gz), v[1].r); + image3d_st1(top_blob_3d, ivec3(gx, gy8 + 5, gz), v[1].g); + image3d_st1(top_blob_3d, ivec3(gx, gy8 + 6, gz), v[1].b); + image3d_st1(top_blob_3d, ivec3(gx, gy8 + 7, gz), v[1].a); + } + if (axis == 2) + { + int gx8 = gx * 8 + p.offset; + + image3d_st1(top_blob_3d, ivec3(gx8 + 0, gy, gz), v[0].r); + image3d_st1(top_blob_3d, ivec3(gx8 + 1, gy, gz), v[0].g); + image3d_st1(top_blob_3d, ivec3(gx8 + 2, gy, gz), v[0].b); + image3d_st1(top_blob_3d, ivec3(gx8 + 3, gy, gz), v[0].a); + image3d_st1(top_blob_3d, ivec3(gx8 + 4, gy, gz), v[1].r); + image3d_st1(top_blob_3d, ivec3(gx8 + 5, gy, gz), v[1].g); + image3d_st1(top_blob_3d, ivec3(gx8 + 6, gy, gz), v[1].b); + image3d_st1(top_blob_3d, ivec3(gx8 + 7, gy, gz), v[1].a); + } + } +#else + const int gi = gz * psc(cstep) + gy * psc(w) + gx; + + ivec3 gxyz = ivec3(gx, gy, gz); + + gxyz[psc(dims) - 1] *= 8; + gxyz[psc(dims) - 1 - axis] += p.offset; + + int v_offset_0 = gxyz.z * psc(outcstep) + gxyz.y * psc(outw) + gxyz.x; + + ivec3 gxyz4 = ivec3(1, psc(outw), psc(outcstep)); + + ivec4 v_offset = v_offset_0 + ivec4(0, 1, 2, 3) * gxyz4[psc(dims) - 1 - axis]; + ivec4 vv_offset = v_offset + 4 * gxyz4[psc(dims) - 1 - axis]; + + buffer_cp8to1(top_blob_data, v_offset, vv_offset, bottom_blob_data, gi); +#endif +} diff --git a/source/device/vulkan/shaders/concat_pack8to4.comp b/source/device/vulkan/shaders/concat_pack8to4.comp new file mode 100644 index 000000000..6890e0f14 --- /dev/null +++ b/source/device/vulkan/shaders/concat_pack8to4.comp @@ -0,0 +1,154 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. 
+// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int axis = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; + + int offset; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + if (psc(dims) == 1) + { + afpvec8 v = image1d_ld8(bottom_blob_1d, gx); + + int gx2 = gx * 2 + p.offset; + + image1d_st4(top_blob_1d, gx2 + 0, v[0]); + image1d_st4(top_blob_1d, gx2 + 1, v[1]); + + } + else if (psc(dims) == 2) + { + afpvec8 v = image2d_ld8(bottom_blob_2d, ivec2(gx, gy)); + + if (axis == 0) + { + int gy2 = gy * 2 + p.offset; + + image2d_st4(top_blob_2d, ivec2(gx, gy2 + 0), v[0]); + image2d_st4(top_blob_2d, ivec2(gx, gy2 + 1), v[1]); + } + if (axis == 1) + { + int gx2 = gx * 2 + p.offset; + + image2d_st4(top_blob_2d, ivec2(gx2 + 0, gy), v[0]); + image2d_st4(top_blob_2d, ivec2(gx2 + 1, gy), v[1]); + } + } + else // if (psc(dims) == 3) + { + afpvec8 v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); + + if (axis == 0) + { + int gz2 = gz * 2 + p.offset; + + 
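// each pack8 input value is written as two consecutive pack4 values in the output +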
image3d_st4(top_blob_3d, ivec3(gx, gy, gz2 + 0), v[0]); + image3d_st4(top_blob_3d, ivec3(gx, gy, gz2 + 1), v[1]); + } + if (axis == 1) + { + int gy2 = gy * 2 + p.offset; + + image3d_st4(top_blob_3d, ivec3(gx, gy2 + 0, gz), v[0]); + image3d_st4(top_blob_3d, ivec3(gx, gy2 + 1, gz), v[1]); + } + if (axis == 2) + { + int gx2 = gx * 2 + p.offset; + + image3d_st4(top_blob_3d, ivec3(gx2 + 0, gy, gz), v[0]); + image3d_st4(top_blob_3d, ivec3(gx2 + 1, gy, gz), v[1]); + } + } +#else + const int gi = gz * psc(cstep) + gy * psc(w) + gx; + + ivec3 gxyz = ivec3(gx, gy, gz); + + gxyz[psc(dims) - 1] *= 2; + gxyz[psc(dims) - 1 - axis] += p.offset; + + int v_offset_0 = gxyz.z * psc(outcstep) + gxyz.y * psc(outw) + gxyz.x; + + ivec3 gxyz4 = ivec3(1, psc(outw), psc(outcstep)); + + ivec2 v_offset = v_offset_0 + ivec2(0, 1) * gxyz4[psc(dims) - 1 - axis]; + + buffer_cp8to4(top_blob_data, v_offset, bottom_blob_data, gi); +#endif +} diff --git a/source/device/vulkan/shaders/convolution.comp b/source/device/vulkan/shaders/convolution.comp new file mode 100644 index 000000000..1d1070950 --- /dev/null +++ b/source/device/vulkan/shaders/convolution.comp @@ -0,0 +1,175 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
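+// convolution (naive direct form): one invocation computes one output element by walking
+// all input channels and the kernel_w x kernel_h window, adds the optional bias, then
+// applies the activation selected by activation_type (1 ReLU, 2 leaky ReLU, 3 clip,
+// 4 sigmoid, 5 mish).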
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int activation_type = 0; +layout (constant_id = 8) const float activation_param_0 = 0; +layout (constant_id = 9) const float activation_param_1 = 0; + +#define shape_constant_id_offset 10 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfp weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afp sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld1(bias_blob, gz); +#else + sum = buffer_ld1(bias_data, gz); +#endif + } + else + { + sum = afp(0.f); + } + +#if NCNN_image_shader + for (int z = 0; z < psc(c); z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + sum += image3d_ld1(weight_blob, ivec3(wx, z, gz)) * image3d_ld1(bottom_blob, ivec3(sx, sy, z)); + + sx += dilation_w; + wx += 1; + } + + sy += dilation_h; + } + } +#else + int w_offset = gz * psc(c) * kernel_w * kernel_h; + + for (int z = 0; z < psc(c); z++) + { + int v_offset = z * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + sum += buffer_ld1(weight_data, w_offset + x) * buffer_ld1(bottom_blob_data, v_offset + x * dilation_w); + } + + v_offset += dilation_h * 
psc(w); + w_offset += kernel_w; + } + } +#endif + + if (activation_type == 1) + { + sum = max(sum, afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum = sum < afp(0.f) ? sum * slope : sum; + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum = clamp(sum, const_min, const_max); + } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } + if (activation_type == 5) + { + sum = sum * tanh(log(exp(sum) + afp(1.f))); + } + +#if NCNN_image_shader + image3d_st1(top_blob, ivec3(gx, gy, gz), sum); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st1(top_blob_data, gi, sum); +#endif +} diff --git a/source/device/vulkan/shaders/convolution_1x1s1d1.comp b/source/device/vulkan/shaders/convolution_1x1s1d1.comp new file mode 100644 index 000000000..947f21fbe --- /dev/null +++ b/source/device/vulkan/shaders/convolution_1x1s1d1.comp @@ -0,0 +1,187 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int activation_type = 0; +layout (constant_id = 8) const float activation_param_0 = 0; +layout (constant_id = 9) const float activation_param_1 = 0; + +#define shape_constant_id_offset 10 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +layout (binding = 2) 
uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +#if NCNN_fp16_packed +layout (binding = 0) readonly buffer bottom_blob { vec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { vec4 top_blob_data[]; }; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif +layout (binding = 2) readonly buffer weight_blob { sfp weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ +#if NCNN_image_shader + int gx = int(gl_GlobalInvocationID.x) * 4; + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc)) + return; +#else + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx * 4 >= psc(outcstep) || gy >= 1 || gz >= psc(outc)) + return; +#endif + + afpvec4 sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = afpvec4(image1d_ld1(bias_blob, gz)); +#else + sum = afpvec4(buffer_ld1(bias_data, gz)); +#endif + } + else + { + sum = afpvec4(0.f); + } + +#if NCNN_image_shader + ivec4 gx4 = gx + ivec4(0, 1, 2, 3); + + ivec4 sy4 = gx4 / psc(w); + ivec4 sx4 = gx4 % psc(w); + + for (int z = 0; z < psc(c); z++) + { + afp k = image3d_ld1(weight_blob, ivec3(0, z, gz)); + + sum.r += k * image3d_ld1(bottom_blob, ivec3(sx4.r, sy4.r, z)); + sum.g += k * image3d_ld1(bottom_blob, ivec3(sx4.g, sy4.g, z)); + sum.b += k * image3d_ld1(bottom_blob, ivec3(sx4.b, sy4.b, z)); + sum.a += k * image3d_ld1(bottom_blob, ivec3(sx4.a, sy4.a, z)); + } +#else + int w_offset = gz * psc(c); + int v_offset = gx; + + for (int z = 0; z < psc(c); z++) + { +#if NCNN_fp16_packed + sum += afp(weight_data[w_offset]) * afpvec4(bottom_blob_data[v_offset]); +#else + sum += buffer_ld1(weight_data, w_offset) * buffer_ld4(bottom_blob_data, v_offset); +#endif + + w_offset += 1; + v_offset += psc(cstep) / 4; + } +#endif + + if (activation_type == 1) + { + sum = max(sum, afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum = mix(sum, sum * afp(slope), lessThan(sum, afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum = clamp(sum, const_min, const_max); + } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } + if (activation_type == 5) + { + sum = sum * tanh(log(exp(sum) + afp(1.f))); + } + +#if NCNN_image_shader + image3d_st1(top_blob, ivec3(sx4.r, sy4.r, gz), sum.r); + image3d_st1(top_blob, ivec3(sx4.g, sy4.g, gz), sum.g); + image3d_st1(top_blob, ivec3(sx4.b, sy4.b, gz), sum.b); + image3d_st1(top_blob, ivec3(sx4.a, sy4.a, gz), sum.a); +#else + const int gi = gz * psc(outcstep) + gx; + +#if NCNN_fp16_packed + top_blob_data[gi] = sum; +#else + buffer_st4(top_blob_data, gi, sum); +#endif +#endif +} diff --git a/source/device/vulkan/shaders/convolution_pack1to4.comp b/source/device/vulkan/shaders/convolution_pack1to4.comp new file mode 100644 index 000000000..711f44aa9 --- /dev/null +++ b/source/device/vulkan/shaders/convolution_pack1to4.comp @@ -0,0 +1,183 @@ +// Tencent is pleased to support the 
open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int activation_type = 0; +layout (constant_id = 8) const float activation_param_0 = 0; +layout (constant_id = 9) const float activation_param_1 = 0; + +#define shape_constant_id_offset 10 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec4 sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld4(bias_blob, gz); +#else + sum = buffer_ld4(bias_data, gz); +#endif + } + else + { + sum = afpvec4(0.f); + } + +#if NCNN_image_shader + for (int z = 0; z < psc(c); z++) + { + int sy = 
gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + afp v = image3d_ld1(bottom_blob, ivec3(sx, sy, z)); + + afpvec4 k = image3d_ld4(weight_blob, ivec3(wx, z, gz)); + + sum += v * k; + + sx += dilation_w; + wx += 1; + } + + sy += dilation_h; + } + } +#else + int w_offset = gz * psc(c) * kernel_w * kernel_h; + + for (int z = 0; z < psc(c); z++) + { + int v_offset = z * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + afp v = buffer_ld1(bottom_blob_data, v_offset + x * dilation_w); + + afpvec4 k = buffer_ld4(weight_data, w_offset + x); + + sum += v * k; + } + + v_offset += dilation_h * psc(w); + w_offset += kernel_w; + } + } +#endif + + if (activation_type == 1) + { + sum = max(sum, afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum = mix(sum, sum * afp(slope), lessThan(sum, afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum = clamp(sum, const_min, const_max); + } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } + if (activation_type == 5) + { + sum = sum * tanh(log(exp(sum) + afp(1.f))); + } + +#if NCNN_image_shader + image3d_st4(top_blob, ivec3(gx, gy, gz), sum); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st4(top_blob_data, gi, sum); +#endif +} diff --git a/source/device/vulkan/shaders/convolution_pack1to8.comp b/source/device/vulkan/shaders/convolution_pack1to8.comp new file mode 100644 index 000000000..d9849b8fa --- /dev/null +++ b/source/device/vulkan/shaders/convolution_pack1to8.comp @@ -0,0 +1,193 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
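+// pack1-to-8 direct convolution: the input stores one channel per element (sfp) while the
+// output is packed eight channels per element (sfpvec8), so every kernel tap loads an
+// 8-wide weight vector and the scalar input value scales both halves of the accumulator.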
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int activation_type = 0; +layout (constant_id = 8) const float activation_param_0 = 0; +layout (constant_id = 9) const float activation_param_1 = 0; + +#define shape_constant_id_offset 10 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec8 sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld8(bias_blob, gz); +#else + sum = buffer_ld8(bias_data, gz); +#endif + } + else + { + sum = afpvec8(afpvec4(0.f), afpvec4(0.f)); + } + +#if NCNN_image_shader + for (int z = 0; z < psc(c); z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + afp v = image3d_ld1(bottom_blob, ivec3(sx, sy, z)); + + afpvec8 k = image3d_ld8(weight_blob, ivec3(wx, z, gz)); + + // sum += v * k; + sum[0] += v * k[0]; + sum[1] += v * k[1]; + + sx += dilation_w; + wx += 1; + } + + sy += dilation_h; + } + } +#else + int w_offset = gz * psc(c) * kernel_w * kernel_h; + + for (int z = 0; z < psc(c); z++) + { + int v_offset = z * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for 
(int x = 0; x < kernel_w; x++) + { + afp v = buffer_ld1(bottom_blob_data, v_offset + x * dilation_w); + + afpvec8 k = buffer_ld8(weight_data, w_offset + x); + + // sum += v * k; + sum[0] += v * k[0]; + sum[1] += v * k[1]; + } + + v_offset += dilation_h * psc(w); + w_offset += kernel_w; + } + } +#endif + + if (activation_type == 1) + { + sum[0] = max(sum[0], afp(0.f)); + sum[1] = max(sum[1], afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum[0] = mix(sum[0], sum[0] * afp(slope), lessThan(sum[0], afpvec4(0.f))); + sum[1] = mix(sum[1], sum[1] * afp(slope), lessThan(sum[1], afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum[0] = clamp(sum[0], const_min, const_max); + sum[1] = clamp(sum[1], const_min, const_max); + } + if (activation_type == 4) + { + sum[0] = afp(1.f) / (afp(1.f) + exp(-sum[0])); + sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1])); + } + if (activation_type == 5) + { + sum[0] = sum[0] * tanh(log(exp(sum[0]) + afp(1.f))); + sum[1] = sum[1] * tanh(log(exp(sum[1]) + afp(1.f))); + } + +#if NCNN_image_shader + image3d_st8(top_blob, ivec3(gx, gy, gz), sum); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st8(top_blob_data, gi, sum); +#endif +} diff --git a/source/device/vulkan/shaders/convolution_pack4.comp b/source/device/vulkan/shaders/convolution_pack4.comp new file mode 100644 index 000000000..5a714f86d --- /dev/null +++ b/source/device/vulkan/shaders/convolution_pack4.comp @@ -0,0 +1,203 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
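+// pack4 direct convolution: input and output both store four channels per element, so each
+// kernel tap is a 4x4 weight matrix and the inner loop is a vec4-by-mat4 multiply-accumulate.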
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int activation_type = 0; +layout (constant_id = 8) const float activation_param_0 = 0; +layout (constant_id = 9) const float activation_param_1 = 0; + +#define shape_constant_id_offset 10 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic) +// GL_EXT_shader_16bit_storage does not define f16mat4 type :( +layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; +#else +layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; }; +#endif +layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec4 sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld4(bias_blob, gz); +#else + sum = buffer_ld4(bias_data, gz); +#endif + } + else + { + sum = afpvec4(0.f); + } + +#if NCNN_image_shader + for (int z = 0; z < psc(c); z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v = image3d_ld4(bottom_blob, ivec3(sx, sy, z)); + + afpmat4 k = afpmat4( + image3d_ld4(weight_blob, ivec3(wx + 0, z, gz)), + image3d_ld4(weight_blob, ivec3(wx + 1, z, gz)), + image3d_ld4(weight_blob, ivec3(wx + 2, z, gz)), + image3d_ld4(weight_blob, ivec3(wx + 3, z, gz)) + ); + + sum += v * k; + + sx 
+= dilation_w; + wx += 4; + } + + sy += dilation_h; + } + } +#else + int w_offset = gz * psc(c) * kernel_w * kernel_h; + + for (int z = 0; z < psc(c); z++) + { + int v_offset = z * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v = buffer_ld4(bottom_blob_data, v_offset + x * dilation_w); + +#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic) + // GL_EXT_shader_16bit_storage does not define f16mat4 type :( + afpmat4 k = afpmat4( + buffer_ld4(weight_data, (w_offset + x) * 4 + 0), + buffer_ld4(weight_data, (w_offset + x) * 4 + 1), + buffer_ld4(weight_data, (w_offset + x) * 4 + 2), + buffer_ld4(weight_data, (w_offset + x) * 4 + 3) + ); +#else + afpmat4 k = sfp2afpmat4(weight_data[w_offset + x]); +#endif + + sum += v * k; + } + + v_offset += dilation_h * psc(w); + w_offset += kernel_w; + } + } +#endif + + if (activation_type == 1) + { + sum = max(sum, afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum = mix(sum, sum * afp(slope), lessThan(sum, afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum = clamp(sum, const_min, const_max); + } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } + if (activation_type == 5) + { + sum = sum * tanh(log(exp(sum) + afp(1.f))); + } + +#if NCNN_image_shader + image3d_st4(top_blob, ivec3(gx, gy, gz), sum); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st4(top_blob_data, gi, sum); +#endif +} diff --git a/source/device/vulkan/shaders/convolution_pack4_1x1s1d1.comp b/source/device/vulkan/shaders/convolution_pack4_1x1s1d1.comp new file mode 100644 index 000000000..a7efaefd7 --- /dev/null +++ b/source/device/vulkan/shaders/convolution_pack4_1x1s1d1.comp @@ -0,0 +1,237 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
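+// Specialized pack4 convolution for 1x1 kernels with stride 1 and dilation 1: the spatial
+// plane is treated as a flat array and each invocation computes four consecutive output
+// positions (sum0..sum3) that share one 4x4 weight matrix per input channel.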
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int activation_type = 0; +layout (constant_id = 8) const float activation_param_0 = 0; +layout (constant_id = 9) const float activation_param_1 = 0; + +#define shape_constant_id_offset 10 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic) +// GL_EXT_shader_16bit_storage does not define f16mat4 type :( +layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; +#else +layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; }; +#endif +layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ +#if NCNN_image_shader + int gx = int(gl_GlobalInvocationID.x) * 4; + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc)) + return; +#else + int gx = int(gl_GlobalInvocationID.x) * 4; + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc)) + return; +#endif + + afpvec4 sum0; + afpvec4 sum1; + afpvec4 sum2; + afpvec4 sum3; + + if (bias_term == 1) + { +#if NCNN_image_shader + afpvec4 b = image1d_ld4(bias_blob, gz); +#else + afpvec4 b = buffer_ld4(bias_data, gz); +#endif + sum0 = b; + sum1 = b; + sum2 = b; + sum3 = b; + } + else + { + sum0 = afpvec4(0.f); + sum1 = afpvec4(0.f); + sum2 = afpvec4(0.f); + sum3 = afpvec4(0.f); + } + +#if NCNN_image_shader + ivec4 gx4 = gx + ivec4(0, 1, 2, 3); + + ivec4 sy4 = gx4 / psc(w); + ivec4 sx4 = 
gx4 % psc(w); + + for (int z = 0; z < psc(c); z++) + { + afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(sx4.r, sy4.r, z)); + afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(sx4.g, sy4.g, z)); + afpvec4 v2 = image3d_ld4(bottom_blob, ivec3(sx4.b, sy4.b, z)); + afpvec4 v3 = image3d_ld4(bottom_blob, ivec3(sx4.a, sy4.a, z)); + + afpmat4 k = afpmat4( + image3d_ld4(weight_blob, ivec3(0, z, gz)), + image3d_ld4(weight_blob, ivec3(1, z, gz)), + image3d_ld4(weight_blob, ivec3(2, z, gz)), + image3d_ld4(weight_blob, ivec3(3, z, gz)) + ); + + sum0 += v0 * k; + sum1 += v1 * k; + sum2 += v2 * k; + sum3 += v3 * k; + } +#else + int w_offset = gz * psc(c); + int v_offset = gx; + + for (int z = 0; z < psc(c); z++) + { + afpvec4 v0 = buffer_ld4(bottom_blob_data, v_offset + 0); + afpvec4 v1 = buffer_ld4(bottom_blob_data, v_offset + 1); + afpvec4 v2 = buffer_ld4(bottom_blob_data, v_offset + 2); + afpvec4 v3 = buffer_ld4(bottom_blob_data, v_offset + 3); + +#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic) + // GL_EXT_shader_16bit_storage does not define f16mat4 type :( + afpmat4 k = afpmat4( + buffer_ld4(weight_data, w_offset * 4 + 0), + buffer_ld4(weight_data, w_offset * 4 + 1), + buffer_ld4(weight_data, w_offset * 4 + 2), + buffer_ld4(weight_data, w_offset * 4 + 3) + ); +#else + afpmat4 k = sfp2afpmat4(weight_data[w_offset]); +#endif + + sum0 += v0 * k; + sum1 += v1 * k; + sum2 += v2 * k; + sum3 += v3 * k; + + w_offset += 1; + v_offset += psc(cstep); + } +#endif + + if (activation_type == 1) + { + sum0 = max(sum0, afp(0.f)); + sum1 = max(sum1, afp(0.f)); + sum2 = max(sum2, afp(0.f)); + sum3 = max(sum3, afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum0 = mix(sum0, sum0 * afp(slope), lessThan(sum0, afpvec4(0.f))); + sum1 = mix(sum1, sum1 * afp(slope), lessThan(sum1, afpvec4(0.f))); + sum2 = mix(sum2, sum2 * afp(slope), lessThan(sum2, afpvec4(0.f))); + sum3 = mix(sum3, sum3 * afp(slope), lessThan(sum3, afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum0 = clamp(sum0, const_min, const_max); + sum1 = clamp(sum1, const_min, const_max); + sum2 = clamp(sum2, const_min, const_max); + sum3 = clamp(sum3, const_min, const_max); + } + if (activation_type == 4) + { + sum0 = afp(1.f) / (afp(1.f) + exp(-sum0)); + sum1 = afp(1.f) / (afp(1.f) + exp(-sum1)); + sum2 = afp(1.f) / (afp(1.f) + exp(-sum2)); + sum3 = afp(1.f) / (afp(1.f) + exp(-sum3)); + } + if (activation_type == 5) + { + sum0 = sum0 * tanh(log(exp(sum0) + afp(1.f))); + sum1 = sum1 * tanh(log(exp(sum1) + afp(1.f))); + sum2 = sum2 * tanh(log(exp(sum2) + afp(1.f))); + sum3 = sum3 * tanh(log(exp(sum3) + afp(1.f))); + } + +#if NCNN_image_shader + image3d_st4(top_blob, ivec3(sx4.r, sy4.r, gz), sum0); + image3d_st4(top_blob, ivec3(sx4.g, sy4.g, gz), sum1); + image3d_st4(top_blob, ivec3(sx4.b, sy4.b, gz), sum2); + image3d_st4(top_blob, ivec3(sx4.a, sy4.a, gz), sum3); +#else + int gi = gz * psc(outcstep) + gx; + + buffer_st4(top_blob_data, gi + 0, sum0); + if (gx + 1 < psc(outcstep)) buffer_st4(top_blob_data, gi + 1, sum1); + if (gx + 2 < psc(outcstep)) buffer_st4(top_blob_data, gi + 2, sum2); + if (gx + 3 < psc(outcstep)) buffer_st4(top_blob_data, gi + 3, sum3); +#endif +} diff --git a/source/device/vulkan/shaders/convolution_pack4_3x3s1d1_winograd23_gemm.comp b/source/device/vulkan/shaders/convolution_pack4_3x3s1d1_winograd23_gemm.comp new file mode 100644 index 000000000..40211c64f --- /dev/null +++ 
b/source/device/vulkan/shaders/convolution_pack4_3x3s1d1_winograd23_gemm.comp @@ -0,0 +1,139 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#define shape_constant_id_offset 0 +layout (constant_id = shape_constant_id_offset + 0) const int c = 0; +layout (constant_id = shape_constant_id_offset + 1) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 2) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 3) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 4) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_tm_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_tm_blob; +layout (binding = 2) uniform unfp sampler3D weight_tm_blob; +#else +layout (binding = 0) readonly buffer bottom_tm_blob { sfpvec4 bottom_tm_blob_data[]; }; +layout (binding = 1) writeonly buffer top_tm_blob { sfpvec4 top_tm_blob_data[]; }; +#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic) +// GL_EXT_shader_16bit_storage does not define f16mat4 type :( +layout (binding = 2) readonly buffer weight_tm_blob { sfpvec4 weight_tm_data[]; }; +#else +layout (binding = 2) readonly buffer weight_tm_blob { sfpmat4 weight_tm_data[]; }; +#endif +#endif + +layout (push_constant) uniform parameter +{ + int c; + int cstep; + + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y) * 4; + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= 16 || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec4 sum0 = afpvec4(0.f); + afpvec4 sum1 = afpvec4(0.f); + afpvec4 sum2 = afpvec4(0.f); + afpvec4 sum3 = afpvec4(0.f); + +#if NCNN_image_shader + int wx = gx * 4; + + for (int z = 0; z < psc(c); z++) + { + afpvec4 v0 = image3d_ld4(bottom_tm_blob, ivec3(gx, gy + 0, z)); + afpvec4 v1 = image3d_ld4(bottom_tm_blob, ivec3(gx, gy + 1, z)); + afpvec4 v2 = image3d_ld4(bottom_tm_blob, ivec3(gx, gy + 2, z)); + afpvec4 v3 = image3d_ld4(bottom_tm_blob, ivec3(gx, gy + 3, z)); + + afpmat4 k = afpmat4( + image3d_ld4(weight_tm_blob, ivec3(wx + 0, z, gz)), + image3d_ld4(weight_tm_blob, ivec3(wx + 1, z, gz)), + image3d_ld4(weight_tm_blob, ivec3(wx + 2, z, gz)), + image3d_ld4(weight_tm_blob, ivec3(wx + 3, z, gz)) + ); + + sum0 += v0 * k; + sum1 += v1 * k; + sum2 += v2 * k; + sum3 += v3 * k; + } +#else + int v_offset = gy * 16 + gx; + int w_offset = gz * psc(c) * 16 + gx; + + for (int z = 0; z < psc(c); z++) + { + afpvec4 v0 = buffer_ld4(bottom_tm_blob_data, v_offset + 0); + afpvec4 v1 = buffer_ld4(bottom_tm_blob_data, v_offset + 16); + afpvec4 v2 = 
buffer_ld4(bottom_tm_blob_data, v_offset + 32); + afpvec4 v3 = buffer_ld4(bottom_tm_blob_data, v_offset + 48); + +#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic) + // GL_EXT_shader_16bit_storage does not define f16mat4 type :( + afpmat4 k = afpmat4( + buffer_ld4(weight_tm_data, w_offset * 4 + 0), + buffer_ld4(weight_tm_data, w_offset * 4 + 1), + buffer_ld4(weight_tm_data, w_offset * 4 + 2), + buffer_ld4(weight_tm_data, w_offset * 4 + 3) + ); +#else + afpmat4 k = sfpmat4(weight_tm_data[w_offset]); +#endif + + sum0 += v0 * k; + sum1 += v1 * k; + sum2 += v2 * k; + sum3 += v3 * k; + + v_offset += psc(cstep); + w_offset += 16; + } +#endif + +#if NCNN_image_shader + image3d_st4(top_tm_blob, ivec3(gx, gy + 0, gz), sum0); + image3d_st4(top_tm_blob, ivec3(gx, gy + 1, gz), sum1); + image3d_st4(top_tm_blob, ivec3(gx, gy + 2, gz), sum2); + image3d_st4(top_tm_blob, ivec3(gx, gy + 3, gz), sum3); +#else + int gi = gz * psc(outcstep) + gy * 16 + gx; + + buffer_st4(top_tm_blob_data, gi + 0, sum0); + if (gy + 1 < psc(outh)) buffer_st4(top_tm_blob_data, gi + 16, sum1); + if (gy + 2 < psc(outh)) buffer_st4(top_tm_blob_data, gi + 32, sum2); + if (gy + 3 < psc(outh)) buffer_st4(top_tm_blob_data, gi + 48, sum3); +#endif +} diff --git a/source/device/vulkan/shaders/convolution_pack4_3x3s1d1_winograd23_transform_input.comp b/source/device/vulkan/shaders/convolution_pack4_3x3s1d1_winograd23_transform_input.comp new file mode 100644 index 000000000..8734d01de --- /dev/null +++ b/source/device/vulkan/shaders/convolution_pack4_3x3s1d1_winograd23_transform_input.comp @@ -0,0 +1,202 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
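+// Winograd F(2x2, 3x3) input transform: each invocation loads a 4x4 tile of the pack4 input
+// with a step of 2 and applies the transform B^T * d * B given by the itm matrix noted below,
+// storing the 16 transformed vec4 values of that tile for the winograd23 gemm pass.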
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#define shape_constant_id_offset 0 +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; +layout (constant_id = shape_constant_id_offset + 2) const int c = 0; +layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 4) const int outcstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int block_x = 0; +layout (constant_id = shape_constant_id_offset + 6) const int block_y = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D bottom_tm_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer bottom_tm_blob { sfpvec4 bottom_tm_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int w; + int h; + int c; + int cstep; + + int outcstep; + + int block_x; + int block_y; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= p.block_x || gy >= p.block_y || gz >= psc(c)) + return; + + // load 4x4 +#if NCNN_image_shader + int sx = gx * 2; + int sy = gy * 2; + + afpvec4 v00 = image3d_ld4(bottom_blob, ivec3(sx + 0, sy + 0, gz)); + afpvec4 v01 = image3d_ld4(bottom_blob, ivec3(sx + 1, sy + 0, gz)); + afpvec4 v02 = image3d_ld4(bottom_blob, ivec3(sx + 2, sy + 0, gz)); + afpvec4 v03 = image3d_ld4(bottom_blob, ivec3(sx + 3, sy + 0, gz)); + + afpvec4 v10 = image3d_ld4(bottom_blob, ivec3(sx + 0, sy + 1, gz)); + afpvec4 v11 = image3d_ld4(bottom_blob, ivec3(sx + 1, sy + 1, gz)); + afpvec4 v12 = image3d_ld4(bottom_blob, ivec3(sx + 2, sy + 1, gz)); + afpvec4 v13 = image3d_ld4(bottom_blob, ivec3(sx + 3, sy + 1, gz)); + + afpvec4 v20 = image3d_ld4(bottom_blob, ivec3(sx + 0, sy + 2, gz)); + afpvec4 v21 = image3d_ld4(bottom_blob, ivec3(sx + 1, sy + 2, gz)); + afpvec4 v22 = image3d_ld4(bottom_blob, ivec3(sx + 2, sy + 2, gz)); + afpvec4 v23 = image3d_ld4(bottom_blob, ivec3(sx + 3, sy + 2, gz)); + + afpvec4 v30 = image3d_ld4(bottom_blob, ivec3(sx + 0, sy + 3, gz)); + afpvec4 v31 = image3d_ld4(bottom_blob, ivec3(sx + 1, sy + 3, gz)); + afpvec4 v32 = image3d_ld4(bottom_blob, ivec3(sx + 2, sy + 3, gz)); + afpvec4 v33 = image3d_ld4(bottom_blob, ivec3(sx + 3, sy + 3, gz)); +#else + int v_offset_0 = gz * psc(cstep) + gy * 2 * psc(w) + gx * 2; + ivec4 v_offset = v_offset_0 + ivec4(0, 1, 2, 3) * psc(w); + + afpvec4 v00 = buffer_ld4(bottom_blob_data, v_offset.r + 0); + afpvec4 v01 = buffer_ld4(bottom_blob_data, v_offset.r + 1); + afpvec4 v02 = buffer_ld4(bottom_blob_data, v_offset.r + 2); + afpvec4 v03 = buffer_ld4(bottom_blob_data, v_offset.r + 3); + + afpvec4 v10 = buffer_ld4(bottom_blob_data, v_offset.g + 0); + afpvec4 v11 = buffer_ld4(bottom_blob_data, v_offset.g + 1); + afpvec4 v12 = buffer_ld4(bottom_blob_data, v_offset.g + 2); + afpvec4 v13 = buffer_ld4(bottom_blob_data, v_offset.g + 3); + + afpvec4 v20 = buffer_ld4(bottom_blob_data, v_offset.b + 0); + afpvec4 v21 = buffer_ld4(bottom_blob_data, v_offset.b + 1); + afpvec4 v22 = buffer_ld4(bottom_blob_data, v_offset.b + 2); + afpvec4 v23 = buffer_ld4(bottom_blob_data, v_offset.b + 3); + + afpvec4 v30 = 
buffer_ld4(bottom_blob_data, v_offset.a + 0); + afpvec4 v31 = buffer_ld4(bottom_blob_data, v_offset.a + 1); + afpvec4 v32 = buffer_ld4(bottom_blob_data, v_offset.a + 2); + afpvec4 v33 = buffer_ld4(bottom_blob_data, v_offset.a + 3); +#endif + + // const float itm[4][4] = { + // {1.0f, 0.0f, -1.0f, 0.0f}, + // {0.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, -1.0f, 1.0f, 0.0f}, + // {0.0f, -1.0f, 0.0f, 1.0f} + // }; + + // implicit transpose + afpvec4 m00 = v00 - v02; + afpvec4 m01 = v10 - v12; + afpvec4 m02 = v20 - v22; + afpvec4 m03 = v30 - v32; + + afpvec4 m10 = v02 + v01; + afpvec4 m11 = v12 + v11; + afpvec4 m12 = v22 + v21; + afpvec4 m13 = v32 + v31; + + afpvec4 m20 = v02 - v01; + afpvec4 m21 = v12 - v11; + afpvec4 m22 = v22 - v21; + afpvec4 m23 = v32 - v31; + + afpvec4 m30 = v03 - v01; + afpvec4 m31 = v13 - v11; + afpvec4 m32 = v23 - v21; + afpvec4 m33 = v33 - v31; + + v00 = m00 - m02; + v10 = m10 - m12; + v20 = m20 - m22; + v30 = m30 - m32; + + v01 = m02 + m01; + v11 = m12 + m11; + v21 = m22 + m21; + v31 = m32 + m31; + + v02 = m02 - m01; + v12 = m12 - m11; + v22 = m22 - m21; + v32 = m32 - m31; + + v03 = m03 - m01; + v13 = m13 - m11; + v23 = m23 - m21; + v33 = m33 - m31; + + // store 16 +#if NCNN_image_shader + int y = gy * p.block_x + gx; + + image3d_st4(bottom_tm_blob, ivec3(0, y, gz), v00); + image3d_st4(bottom_tm_blob, ivec3(1, y, gz), v01); + image3d_st4(bottom_tm_blob, ivec3(2, y, gz), v02); + image3d_st4(bottom_tm_blob, ivec3(3, y, gz), v03); + image3d_st4(bottom_tm_blob, ivec3(4, y, gz), v10); + image3d_st4(bottom_tm_blob, ivec3(5, y, gz), v11); + image3d_st4(bottom_tm_blob, ivec3(6, y, gz), v12); + image3d_st4(bottom_tm_blob, ivec3(7, y, gz), v13); + image3d_st4(bottom_tm_blob, ivec3(8, y, gz), v20); + image3d_st4(bottom_tm_blob, ivec3(9, y, gz), v21); + image3d_st4(bottom_tm_blob, ivec3(10, y, gz), v22); + image3d_st4(bottom_tm_blob, ivec3(11, y, gz), v23); + image3d_st4(bottom_tm_blob, ivec3(12, y, gz), v30); + image3d_st4(bottom_tm_blob, ivec3(13, y, gz), v31); + image3d_st4(bottom_tm_blob, ivec3(14, y, gz), v32); + image3d_st4(bottom_tm_blob, ivec3(15, y, gz), v33); +#else + int v_tm_offset = gz * psc(outcstep) + (gy * p.block_x + gx) * 16; + + buffer_st4(bottom_tm_blob_data, v_tm_offset + 0, v00); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 1, v01); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 2, v02); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 3, v03); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 4, v10); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 5, v11); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 6, v12); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 7, v13); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 8, v20); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 9, v21); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 10, v22); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 11, v23); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 12, v30); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 13, v31); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 14, v32); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 15, v33); +#endif +} diff --git a/source/device/vulkan/shaders/convolution_pack4_3x3s1d1_winograd23_transform_output.comp b/source/device/vulkan/shaders/convolution_pack4_3x3s1d1_winograd23_transform_output.comp new file mode 100644 index 000000000..c693e74a4 --- /dev/null +++ b/source/device/vulkan/shaders/convolution_pack4_3x3s1d1_winograd23_transform_output.comp @@ -0,0 +1,209 @@ +// Tencent is pleased to support the open source community by making 
ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int bias_term = 0; +layout (constant_id = 1) const int activation_type = 0; +layout (constant_id = 2) const float activation_param_0 = 0; +layout (constant_id = 3) const float activation_param_1 = 0; + +#define shape_constant_id_offset 4 +layout (constant_id = shape_constant_id_offset + 0) const int c = 0; +layout (constant_id = shape_constant_id_offset + 1) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 2) const int block_x = 0; +layout (constant_id = shape_constant_id_offset + 3) const int block_y = 0; + +layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D top_tm_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer top_tm_blob { sfpvec4 top_tm_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +layout (binding = 2) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int c; + int cstep; + + int block_x; + int block_y; + + int outw; + int outh; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= p.block_x || gy >= p.block_y || gz >= psc(c)) + return; + + // load 16 +#if NCNN_image_shader + int sy = gy * p.block_x + gx; + + afpvec4 v00 = image3d_ld4(top_tm_blob, ivec3(0, sy, gz)); + afpvec4 v01 = image3d_ld4(top_tm_blob, ivec3(1, sy, gz)); + afpvec4 v02 = image3d_ld4(top_tm_blob, ivec3(2, sy, gz)); + afpvec4 v03 = image3d_ld4(top_tm_blob, ivec3(3, sy, gz)); + afpvec4 v10 = image3d_ld4(top_tm_blob, ivec3(4, sy, gz)); + afpvec4 v11 = image3d_ld4(top_tm_blob, ivec3(5, sy, gz)); + afpvec4 v12 = image3d_ld4(top_tm_blob, ivec3(6, sy, gz)); + afpvec4 v13 = image3d_ld4(top_tm_blob, ivec3(7, sy, gz)); + afpvec4 v20 = image3d_ld4(top_tm_blob, ivec3(8, sy, gz)); + afpvec4 v21 = image3d_ld4(top_tm_blob, ivec3(9, sy, gz)); + afpvec4 v22 = image3d_ld4(top_tm_blob, ivec3(10, sy, gz)); + afpvec4 v23 = image3d_ld4(top_tm_blob, ivec3(11, sy, gz)); + afpvec4 v30 = image3d_ld4(top_tm_blob, ivec3(12, sy, gz)); + afpvec4 v31 = image3d_ld4(top_tm_blob, ivec3(13, sy, gz)); + afpvec4 v32 = image3d_ld4(top_tm_blob, ivec3(14, sy, gz)); + afpvec4 v33 = image3d_ld4(top_tm_blob, ivec3(15, sy, gz)); +#else + int v_tm_offset = gz * psc(cstep) + 
(gy * p.block_x + gx) * 16; + + afpvec4 v00 = buffer_ld4(top_tm_blob_data, v_tm_offset + 0); + afpvec4 v01 = buffer_ld4(top_tm_blob_data, v_tm_offset + 1); + afpvec4 v02 = buffer_ld4(top_tm_blob_data, v_tm_offset + 2); + afpvec4 v03 = buffer_ld4(top_tm_blob_data, v_tm_offset + 3); + afpvec4 v10 = buffer_ld4(top_tm_blob_data, v_tm_offset + 4); + afpvec4 v11 = buffer_ld4(top_tm_blob_data, v_tm_offset + 5); + afpvec4 v12 = buffer_ld4(top_tm_blob_data, v_tm_offset + 6); + afpvec4 v13 = buffer_ld4(top_tm_blob_data, v_tm_offset + 7); + afpvec4 v20 = buffer_ld4(top_tm_blob_data, v_tm_offset + 8); + afpvec4 v21 = buffer_ld4(top_tm_blob_data, v_tm_offset + 9); + afpvec4 v22 = buffer_ld4(top_tm_blob_data, v_tm_offset + 10); + afpvec4 v23 = buffer_ld4(top_tm_blob_data, v_tm_offset + 11); + afpvec4 v30 = buffer_ld4(top_tm_blob_data, v_tm_offset + 12); + afpvec4 v31 = buffer_ld4(top_tm_blob_data, v_tm_offset + 13); + afpvec4 v32 = buffer_ld4(top_tm_blob_data, v_tm_offset + 14); + afpvec4 v33 = buffer_ld4(top_tm_blob_data, v_tm_offset + 15); +#endif + + // const float itm[2][4] = { + // {1.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 1.0f} + // }; + + // implicit transpose + afpvec4 m00 = v00 + v01 + v02; + afpvec4 m01 = v10 + v11 + v12; + afpvec4 m02 = v20 + v21 + v22; + afpvec4 m03 = v30 + v31 + v32; + + afpvec4 m10 = v01 - v02 + v03; + afpvec4 m11 = v11 - v12 + v13; + afpvec4 m12 = v21 - v22 + v23; + afpvec4 m13 = v31 - v32 + v33; + + if (bias_term == 1) + { +#if NCNN_image_shader + const afpvec4 bias_value = image1d_ld4(bias_blob, gz); +#else + const afpvec4 bias_value = buffer_ld4(bias_data, gz); +#endif + + v00 = bias_value + m00 + m01 + m02; + v10 = bias_value + m10 + m11 + m12; + + v01 = bias_value + m01 - m02 + m03; + v11 = bias_value + m11 - m12 + m13; + } + else + { + v00 = m00 + m01 + m02; + v10 = m10 + m11 + m12; + + v01 = m01 - m02 + m03; + v11 = m11 - m12 + m13; + } + + if (activation_type == 1) + { + v00 = max(v00, afp(0.f)); + v10 = max(v10, afp(0.f)); + v01 = max(v01, afp(0.f)); + v11 = max(v11, afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + v00 = mix(v00, v00 * afp(slope), lessThan(v00, afpvec4(0.f))); + v10 = mix(v10, v10 * afp(slope), lessThan(v10, afpvec4(0.f))); + v01 = mix(v01, v01 * afp(slope), lessThan(v01, afpvec4(0.f))); + v11 = mix(v11, v11 * afp(slope), lessThan(v11, afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + v00 = clamp(v00, const_min, const_max); + v10 = clamp(v10, const_min, const_max); + v01 = clamp(v01, const_min, const_max); + v11 = clamp(v11, const_min, const_max); + } + if (activation_type == 4) + { + v00 = afp(1.f) / (afp(1.f) + exp(-v00)); + v10 = afp(1.f) / (afp(1.f) + exp(-v10)); + v01 = afp(1.f) / (afp(1.f) + exp(-v01)); + v11 = afp(1.f) / (afp(1.f) + exp(-v11)); + } + if (activation_type == 5) + { + v00 = v00 * tanh(log(exp(v00) + afp(1.f))); + v01 = v01 * tanh(log(exp(v01) + afp(1.f))); + v10 = v10 * tanh(log(exp(v10) + afp(1.f))); + v11 = v11 * tanh(log(exp(v11) + afp(1.f))); + } + + // store 2x2 +#if NCNN_image_shader + int x = gx * 2; + int y = gy * 2; + + image3d_st4(top_blob, ivec3(x, y, gz), v00); + image3d_st4(top_blob, ivec3(x + 1, y, gz), v01); + image3d_st4(top_blob, ivec3(x, y + 1, gz), v10); + image3d_st4(top_blob, ivec3(x + 1, y + 1, gz), v11); +#else + int v_offset_0 = gz * psc(outcstep) + gy * 2 * psc(outw) + gx * 2; + int v_offset_1 = v_offset_0 + psc(outw); + + buffer_st4(top_blob_data, 
v_offset_0 + 0, v00); + buffer_st4(top_blob_data, v_offset_0 + 1, v01); + buffer_st4(top_blob_data, v_offset_1 + 0, v10); + buffer_st4(top_blob_data, v_offset_1 + 1, v11); +#endif +} diff --git a/source/device/vulkan/shaders/convolution_pack4to1.comp b/source/device/vulkan/shaders/convolution_pack4to1.comp new file mode 100644 index 000000000..b318f7562 --- /dev/null +++ b/source/device/vulkan/shaders/convolution_pack4to1.comp @@ -0,0 +1,183 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int activation_type = 0; +layout (constant_id = 8) const float activation_param_0 = 0; +layout (constant_id = 9) const float activation_param_1 = 0; + +#define shape_constant_id_offset 10 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; 
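+
+// pack4-to-1: the input is pack4 (sfpvec4) and the output is unpacked (sfp); each kernel tap
+// contributes a single scalar via dot(v, k), reducing four packed channels to one output channel.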
+ +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afp sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld1(bias_blob, gz); +#else + sum = buffer_ld1(bias_data, gz); +#endif + } + else + { + sum = afp(0.f); + } + +#if NCNN_image_shader + for (int z = 0; z < psc(c); z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v = image3d_ld4(bottom_blob, ivec3(sx, sy, z)); + + afpvec4 k = image3d_ld4(weight_blob, ivec3(wx, z, gz)); + + sum += dot(v, k); + + sx += dilation_w; + wx += 1; + } + + sy += dilation_h; + } + } +#else + int w_offset = gz * psc(c) * kernel_w * kernel_h; + + for (int z = 0; z < psc(c); z++) + { + int v_offset = z * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v = buffer_ld4(bottom_blob_data, v_offset + x * dilation_w); + + afpvec4 k = buffer_ld4(weight_data, w_offset + x); + + sum += dot(v, k); + } + + v_offset += dilation_h * psc(w); + w_offset += kernel_w; + } + } +#endif + + if (activation_type == 1) + { + sum = max(sum, afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum = sum < afp(0.f) ? sum * slope : sum; + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum = clamp(sum, const_min, const_max); + } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } + if (activation_type == 5) + { + sum = sum * tanh(log(exp(sum) + afp(1.f))); + } + +#if NCNN_image_shader + image3d_st1(top_blob, ivec3(gx, gy, gz), sum); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st1(top_blob_data, gi, sum); +#endif +} diff --git a/source/device/vulkan/shaders/convolution_pack4to8.comp b/source/device/vulkan/shaders/convolution_pack4to8.comp new file mode 100644 index 000000000..aed8ad6a9 --- /dev/null +++ b/source/device/vulkan/shaders/convolution_pack4to8.comp @@ -0,0 +1,219 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
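+// pack4-to-8: the input is pack4 (sfpvec4) and the output is pack8 (sfpvec8); every kernel tap
+// loads eight vec4 weight columns (k0..k7) and accumulates eight dot(v, k_i) products, one per
+// packed output channel.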
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int activation_type = 0; +layout (constant_id = 8) const float activation_param_0 = 0; +layout (constant_id = 9) const float activation_param_1 = 0; + +#define shape_constant_id_offset 10 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec8 sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld8(bias_blob, gz); +#else + sum = buffer_ld8(bias_data, gz); +#endif + } + else + { + sum = afpvec8(afpvec4(0.f), afpvec4(0.f)); + } + +#if NCNN_image_shader + for (int z = 0; z < psc(c); z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v = image3d_ld4(bottom_blob, ivec3(sx, sy, z)); + + afpvec4 k0 = image3d_ld4(weight_blob, ivec3(wx + 0, z, gz)); + afpvec4 k1 = image3d_ld4(weight_blob, ivec3(wx + 1, z, gz)); + afpvec4 k2 = image3d_ld4(weight_blob, ivec3(wx + 2, z, gz)); + afpvec4 k3 = image3d_ld4(weight_blob, ivec3(wx + 3, z, gz)); + afpvec4 k4 = image3d_ld4(weight_blob, ivec3(wx + 4, z, gz)); + afpvec4 k5 = image3d_ld4(weight_blob, ivec3(wx + 5, z, gz)); + afpvec4 k6 = 
image3d_ld4(weight_blob, ivec3(wx + 6, z, gz)); + afpvec4 k7 = image3d_ld4(weight_blob, ivec3(wx + 7, z, gz)); + + // sum += v * k; + sum[0].r += dot(v, k0); + sum[0].g += dot(v, k1); + sum[0].b += dot(v, k2); + sum[0].a += dot(v, k3); + sum[1].r += dot(v, k4); + sum[1].g += dot(v, k5); + sum[1].b += dot(v, k6); + sum[1].a += dot(v, k7); + + sx += dilation_w; + wx += 8; + } + + sy += dilation_h; + } + } +#else + int w_offset = gz * psc(c) * kernel_w * kernel_h; + + for (int z = 0; z < psc(c); z++) + { + int v_offset = z * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v = buffer_ld4(bottom_blob_data, v_offset + x * dilation_w); + + afpvec4 k0 = buffer_ld4(weight_data, (w_offset + x) * 8 + 0); + afpvec4 k1 = buffer_ld4(weight_data, (w_offset + x) * 8 + 1); + afpvec4 k2 = buffer_ld4(weight_data, (w_offset + x) * 8 + 2); + afpvec4 k3 = buffer_ld4(weight_data, (w_offset + x) * 8 + 3); + afpvec4 k4 = buffer_ld4(weight_data, (w_offset + x) * 8 + 4); + afpvec4 k5 = buffer_ld4(weight_data, (w_offset + x) * 8 + 5); + afpvec4 k6 = buffer_ld4(weight_data, (w_offset + x) * 8 + 6); + afpvec4 k7 = buffer_ld4(weight_data, (w_offset + x) * 8 + 7); + + // sum += v * k; + sum[0].r += dot(v, k0); + sum[0].g += dot(v, k1); + sum[0].b += dot(v, k2); + sum[0].a += dot(v, k3); + sum[1].r += dot(v, k4); + sum[1].g += dot(v, k5); + sum[1].b += dot(v, k6); + sum[1].a += dot(v, k7); + } + + v_offset += dilation_h * psc(w); + w_offset += kernel_w; + } + } +#endif + + if (activation_type == 1) + { + sum[0] = max(sum[0], afp(0.f)); + sum[1] = max(sum[1], afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum[0] = mix(sum[0], sum[0] * afp(slope), lessThan(sum[0], afpvec4(0.f))); + sum[1] = mix(sum[1], sum[1] * afp(slope), lessThan(sum[1], afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum[0] = clamp(sum[0], const_min, const_max); + sum[1] = clamp(sum[1], const_min, const_max); + } + if (activation_type == 4) + { + sum[0] = afp(1.f) / (afp(1.f) + exp(-sum[0])); + sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1])); + } + if (activation_type == 5) + { + sum[0] = sum[0] * tanh(log(exp(sum[0]) + afp(1.f))); + sum[1] = sum[1] * tanh(log(exp(sum[1]) + afp(1.f))); + } + +#if NCNN_image_shader + image3d_st8(top_blob, ivec3(gx, gy, gz), sum); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st8(top_blob_data, gi, sum); +#endif +} diff --git a/source/device/vulkan/shaders/convolution_pack8.comp b/source/device/vulkan/shaders/convolution_pack8.comp new file mode 100644 index 000000000..7c1d5cbc2 --- /dev/null +++ b/source/device/vulkan/shaders/convolution_pack8.comp @@ -0,0 +1,219 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int activation_type = 0; +layout (constant_id = 8) const float activation_param_0 = 0; +layout (constant_id = 9) const float activation_param_1 = 0; + +#define shape_constant_id_offset 10 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec8 sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld8(bias_blob, gz); +#else + sum = buffer_ld8(bias_data, gz); +#endif + } + else + { + sum = afpvec8(afpvec4(0.f), afpvec4(0.f)); + } + +#if NCNN_image_shader + for (int z = 0; z < psc(c); z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx, sy, z)); + + afpvec8 k0 = image3d_ld8(weight_blob, ivec3(wx + 0, z, gz)); + afpvec8 k1 = image3d_ld8(weight_blob, ivec3(wx + 1, z, gz)); + afpvec8 k2 = image3d_ld8(weight_blob, ivec3(wx + 2, z, gz)); + afpvec8 k3 = image3d_ld8(weight_blob, ivec3(wx + 3, z, gz)); + afpvec8 k4 = image3d_ld8(weight_blob, ivec3(wx + 
4, z, gz)); + afpvec8 k5 = image3d_ld8(weight_blob, ivec3(wx + 5, z, gz)); + afpvec8 k6 = image3d_ld8(weight_blob, ivec3(wx + 6, z, gz)); + afpvec8 k7 = image3d_ld8(weight_blob, ivec3(wx + 7, z, gz)); + + // sum += v * k; + sum[0].r += dot(v[0], k0[0]) + dot(v[1], k0[1]); + sum[0].g += dot(v[0], k1[0]) + dot(v[1], k1[1]); + sum[0].b += dot(v[0], k2[0]) + dot(v[1], k2[1]); + sum[0].a += dot(v[0], k3[0]) + dot(v[1], k3[1]); + sum[1].r += dot(v[0], k4[0]) + dot(v[1], k4[1]); + sum[1].g += dot(v[0], k5[0]) + dot(v[1], k5[1]); + sum[1].b += dot(v[0], k6[0]) + dot(v[1], k6[1]); + sum[1].a += dot(v[0], k7[0]) + dot(v[1], k7[1]); + + sx += dilation_w; + wx += 8; + } + + sy += dilation_h; + } + } +#else + int w_offset = gz * psc(c) * kernel_w * kernel_h; + + for (int z = 0; z < psc(c); z++) + { + int v_offset = z * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v = buffer_ld8(bottom_blob_data, v_offset + x * dilation_w); + + afpvec8 k0 = buffer_ld8(weight_data, (w_offset + x) * 8 + 0); + afpvec8 k1 = buffer_ld8(weight_data, (w_offset + x) * 8 + 1); + afpvec8 k2 = buffer_ld8(weight_data, (w_offset + x) * 8 + 2); + afpvec8 k3 = buffer_ld8(weight_data, (w_offset + x) * 8 + 3); + afpvec8 k4 = buffer_ld8(weight_data, (w_offset + x) * 8 + 4); + afpvec8 k5 = buffer_ld8(weight_data, (w_offset + x) * 8 + 5); + afpvec8 k6 = buffer_ld8(weight_data, (w_offset + x) * 8 + 6); + afpvec8 k7 = buffer_ld8(weight_data, (w_offset + x) * 8 + 7); + + // sum += v * k + sum[0].r += dot(v[0], k0[0]) + dot(v[1], k0[1]); + sum[0].g += dot(v[0], k1[0]) + dot(v[1], k1[1]); + sum[0].b += dot(v[0], k2[0]) + dot(v[1], k2[1]); + sum[0].a += dot(v[0], k3[0]) + dot(v[1], k3[1]); + sum[1].r += dot(v[0], k4[0]) + dot(v[1], k4[1]); + sum[1].g += dot(v[0], k5[0]) + dot(v[1], k5[1]); + sum[1].b += dot(v[0], k6[0]) + dot(v[1], k6[1]); + sum[1].a += dot(v[0], k7[0]) + dot(v[1], k7[1]); + } + + v_offset += dilation_h * psc(w); + w_offset += kernel_w; + } + } +#endif + + if (activation_type == 1) + { + sum[0] = max(sum[0], afp(0.f)); + sum[1] = max(sum[1], afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum[0] = mix(sum[0], sum[0] * afp(slope), lessThan(sum[0], afpvec4(0.f))); + sum[1] = mix(sum[1], sum[1] * afp(slope), lessThan(sum[1], afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum[0] = clamp(sum[0], const_min, const_max); + sum[1] = clamp(sum[1], const_min, const_max); + } + if (activation_type == 4) + { + sum[0] = afp(1.f) / (afp(1.f) + exp(-sum[0])); + sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1])); + } + if (activation_type == 5) + { + sum[0] = sum[0] * tanh(log(exp(sum[0]) + afp(1.f))); + sum[1] = sum[1] * tanh(log(exp(sum[1]) + afp(1.f))); + } + +#if NCNN_image_shader + image3d_st8(top_blob, ivec3(gx, gy, gz), sum); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st8(top_blob_data, gi, sum); +#endif +} diff --git a/source/device/vulkan/shaders/convolution_pack8_1x1s1d1.comp b/source/device/vulkan/shaders/convolution_pack8_1x1s1d1.comp new file mode 100644 index 000000000..48c548efb --- /dev/null +++ b/source/device/vulkan/shaders/convolution_pack8_1x1s1d1.comp @@ -0,0 +1,327 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. 
+// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int activation_type = 0; +layout (constant_id = 8) const float activation_param_0 = 0; +layout (constant_id = 9) const float activation_param_1 = 0; + +#define shape_constant_id_offset 10 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ +#if NCNN_image_shader + int gx = int(gl_GlobalInvocationID.x) * 4; + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc)) + return; +#else + int gx = int(gl_GlobalInvocationID.x) * 4; + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc)) + return; +#endif + + afpvec8 sum0; + afpvec8 sum1; + afpvec8 sum2; + afpvec8 sum3; + + if (bias_term == 1) + { +#if 
NCNN_image_shader + afpvec8 b = image1d_ld8(bias_blob, gz); +#else + afpvec8 b = buffer_ld8(bias_data, gz); +#endif + sum0 = b; + sum1 = b; + sum2 = b; + sum3 = b; + } + else + { + sum0 = afpvec8(afpvec4(0.f), afpvec4(0.f)); + sum1 = afpvec8(afpvec4(0.f), afpvec4(0.f)); + sum2 = afpvec8(afpvec4(0.f), afpvec4(0.f)); + sum3 = afpvec8(afpvec4(0.f), afpvec4(0.f)); + } + +#if NCNN_image_shader + ivec4 gx4 = gx + ivec4(0, 1, 2, 3); + + ivec4 sy4 = gx4 / psc(w); + ivec4 sx4 = gx4 % psc(w); + + for (int z = 0; z < psc(c); z++) + { + afpvec8 v0 = image3d_ld8(bottom_blob, ivec3(sx4.r, sy4.r, z)); + afpvec8 v1 = image3d_ld8(bottom_blob, ivec3(sx4.g, sy4.g, z)); + afpvec8 v2 = image3d_ld8(bottom_blob, ivec3(sx4.b, sy4.b, z)); + afpvec8 v3 = image3d_ld8(bottom_blob, ivec3(sx4.a, sy4.a, z)); + + afpvec8 k0 = image3d_ld8(weight_blob, ivec3(0, z, gz)); + afpvec8 k1 = image3d_ld8(weight_blob, ivec3(1, z, gz)); + afpvec8 k2 = image3d_ld8(weight_blob, ivec3(2, z, gz)); + afpvec8 k3 = image3d_ld8(weight_blob, ivec3(3, z, gz)); + afpvec8 k4 = image3d_ld8(weight_blob, ivec3(4, z, gz)); + afpvec8 k5 = image3d_ld8(weight_blob, ivec3(5, z, gz)); + afpvec8 k6 = image3d_ld8(weight_blob, ivec3(6, z, gz)); + afpvec8 k7 = image3d_ld8(weight_blob, ivec3(7, z, gz)); + + // sum += v * k + sum0[0].r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]); + sum0[0].g += dot(v0[0], k1[0]) + dot(v0[1], k1[1]); + sum0[0].b += dot(v0[0], k2[0]) + dot(v0[1], k2[1]); + sum0[0].a += dot(v0[0], k3[0]) + dot(v0[1], k3[1]); + sum0[1].r += dot(v0[0], k4[0]) + dot(v0[1], k4[1]); + sum0[1].g += dot(v0[0], k5[0]) + dot(v0[1], k5[1]); + sum0[1].b += dot(v0[0], k6[0]) + dot(v0[1], k6[1]); + sum0[1].a += dot(v0[0], k7[0]) + dot(v0[1], k7[1]); + + sum1[0].r += dot(v1[0], k0[0]) + dot(v1[1], k0[1]); + sum1[0].g += dot(v1[0], k1[0]) + dot(v1[1], k1[1]); + sum1[0].b += dot(v1[0], k2[0]) + dot(v1[1], k2[1]); + sum1[0].a += dot(v1[0], k3[0]) + dot(v1[1], k3[1]); + sum1[1].r += dot(v1[0], k4[0]) + dot(v1[1], k4[1]); + sum1[1].g += dot(v1[0], k5[0]) + dot(v1[1], k5[1]); + sum1[1].b += dot(v1[0], k6[0]) + dot(v1[1], k6[1]); + sum1[1].a += dot(v1[0], k7[0]) + dot(v1[1], k7[1]); + + sum2[0].r += dot(v2[0], k0[0]) + dot(v2[1], k0[1]); + sum2[0].g += dot(v2[0], k1[0]) + dot(v2[1], k1[1]); + sum2[0].b += dot(v2[0], k2[0]) + dot(v2[1], k2[1]); + sum2[0].a += dot(v2[0], k3[0]) + dot(v2[1], k3[1]); + sum2[1].r += dot(v2[0], k4[0]) + dot(v2[1], k4[1]); + sum2[1].g += dot(v2[0], k5[0]) + dot(v2[1], k5[1]); + sum2[1].b += dot(v2[0], k6[0]) + dot(v2[1], k6[1]); + sum2[1].a += dot(v2[0], k7[0]) + dot(v2[1], k7[1]); + + sum3[0].r += dot(v3[0], k0[0]) + dot(v3[1], k0[1]); + sum3[0].g += dot(v3[0], k1[0]) + dot(v3[1], k1[1]); + sum3[0].b += dot(v3[0], k2[0]) + dot(v3[1], k2[1]); + sum3[0].a += dot(v3[0], k3[0]) + dot(v3[1], k3[1]); + sum3[1].r += dot(v3[0], k4[0]) + dot(v3[1], k4[1]); + sum3[1].g += dot(v3[0], k5[0]) + dot(v3[1], k5[1]); + sum3[1].b += dot(v3[0], k6[0]) + dot(v3[1], k6[1]); + sum3[1].a += dot(v3[0], k7[0]) + dot(v3[1], k7[1]); + } +#else + int w_offset = gz * psc(c) * 8; + int v_offset = gx; + + for (int z = 0; z < psc(c); z++) + { + afpvec8 v0 = buffer_ld8(bottom_blob_data, v_offset + 0); + afpvec8 v1 = buffer_ld8(bottom_blob_data, v_offset + 1); + afpvec8 v2 = buffer_ld8(bottom_blob_data, v_offset + 2); + afpvec8 v3 = buffer_ld8(bottom_blob_data, v_offset + 3); + + afpvec8 k0 = buffer_ld8(weight_data, w_offset + 0); + afpvec8 k1 = buffer_ld8(weight_data, w_offset + 1); + afpvec8 k2 = buffer_ld8(weight_data, w_offset + 2); + afpvec8 k3 = buffer_ld8(weight_data, 
w_offset + 3); + afpvec8 k4 = buffer_ld8(weight_data, w_offset + 4); + afpvec8 k5 = buffer_ld8(weight_data, w_offset + 5); + afpvec8 k6 = buffer_ld8(weight_data, w_offset + 6); + afpvec8 k7 = buffer_ld8(weight_data, w_offset + 7); + + // sum += v * k + sum0[0].r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]); + sum0[0].g += dot(v0[0], k1[0]) + dot(v0[1], k1[1]); + sum0[0].b += dot(v0[0], k2[0]) + dot(v0[1], k2[1]); + sum0[0].a += dot(v0[0], k3[0]) + dot(v0[1], k3[1]); + sum0[1].r += dot(v0[0], k4[0]) + dot(v0[1], k4[1]); + sum0[1].g += dot(v0[0], k5[0]) + dot(v0[1], k5[1]); + sum0[1].b += dot(v0[0], k6[0]) + dot(v0[1], k6[1]); + sum0[1].a += dot(v0[0], k7[0]) + dot(v0[1], k7[1]); + + sum1[0].r += dot(v1[0], k0[0]) + dot(v1[1], k0[1]); + sum1[0].g += dot(v1[0], k1[0]) + dot(v1[1], k1[1]); + sum1[0].b += dot(v1[0], k2[0]) + dot(v1[1], k2[1]); + sum1[0].a += dot(v1[0], k3[0]) + dot(v1[1], k3[1]); + sum1[1].r += dot(v1[0], k4[0]) + dot(v1[1], k4[1]); + sum1[1].g += dot(v1[0], k5[0]) + dot(v1[1], k5[1]); + sum1[1].b += dot(v1[0], k6[0]) + dot(v1[1], k6[1]); + sum1[1].a += dot(v1[0], k7[0]) + dot(v1[1], k7[1]); + + sum2[0].r += dot(v2[0], k0[0]) + dot(v2[1], k0[1]); + sum2[0].g += dot(v2[0], k1[0]) + dot(v2[1], k1[1]); + sum2[0].b += dot(v2[0], k2[0]) + dot(v2[1], k2[1]); + sum2[0].a += dot(v2[0], k3[0]) + dot(v2[1], k3[1]); + sum2[1].r += dot(v2[0], k4[0]) + dot(v2[1], k4[1]); + sum2[1].g += dot(v2[0], k5[0]) + dot(v2[1], k5[1]); + sum2[1].b += dot(v2[0], k6[0]) + dot(v2[1], k6[1]); + sum2[1].a += dot(v2[0], k7[0]) + dot(v2[1], k7[1]); + + sum3[0].r += dot(v3[0], k0[0]) + dot(v3[1], k0[1]); + sum3[0].g += dot(v3[0], k1[0]) + dot(v3[1], k1[1]); + sum3[0].b += dot(v3[0], k2[0]) + dot(v3[1], k2[1]); + sum3[0].a += dot(v3[0], k3[0]) + dot(v3[1], k3[1]); + sum3[1].r += dot(v3[0], k4[0]) + dot(v3[1], k4[1]); + sum3[1].g += dot(v3[0], k5[0]) + dot(v3[1], k5[1]); + sum3[1].b += dot(v3[0], k6[0]) + dot(v3[1], k6[1]); + sum3[1].a += dot(v3[0], k7[0]) + dot(v3[1], k7[1]); + + w_offset += 8; + v_offset += psc(cstep); + } +#endif + + if (activation_type == 1) + { + sum0[0] = max(sum0[0], afp(0.f)); + sum0[1] = max(sum0[1], afp(0.f)); + sum1[0] = max(sum1[0], afp(0.f)); + sum1[1] = max(sum1[1], afp(0.f)); + sum2[0] = max(sum2[0], afp(0.f)); + sum2[1] = max(sum2[1], afp(0.f)); + sum3[0] = max(sum3[0], afp(0.f)); + sum3[1] = max(sum3[1], afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum0[0] = mix(sum0[0], sum0[0] * afp(slope), lessThan(sum0[0], afpvec4(0.f))); + sum0[1] = mix(sum0[1], sum0[1] * afp(slope), lessThan(sum0[1], afpvec4(0.f))); + sum1[0] = mix(sum1[0], sum1[0] * afp(slope), lessThan(sum1[0], afpvec4(0.f))); + sum1[1] = mix(sum1[1], sum1[1] * afp(slope), lessThan(sum1[1], afpvec4(0.f))); + sum2[0] = mix(sum2[0], sum2[0] * afp(slope), lessThan(sum2[0], afpvec4(0.f))); + sum2[1] = mix(sum2[1], sum2[1] * afp(slope), lessThan(sum2[1], afpvec4(0.f))); + sum3[0] = mix(sum3[0], sum3[0] * afp(slope), lessThan(sum3[0], afpvec4(0.f))); + sum3[1] = mix(sum3[1], sum3[1] * afp(slope), lessThan(sum3[1], afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum0[0] = clamp(sum0[0], const_min, const_max); + sum0[1] = clamp(sum0[1], const_min, const_max); + sum1[0] = clamp(sum1[0], const_min, const_max); + sum1[1] = clamp(sum1[1], const_min, const_max); + sum2[0] = clamp(sum2[0], const_min, const_max); + sum2[1] = clamp(sum2[1], const_min, const_max); + sum3[0] = clamp(sum3[0], 
const_min, const_max); + sum3[1] = clamp(sum3[1], const_min, const_max); + } + if (activation_type == 4) + { + sum0[0] = afp(1.f) / (afp(1.f) + exp(-sum0[0])); + sum0[1] = afp(1.f) / (afp(1.f) + exp(-sum0[1])); + sum1[0] = afp(1.f) / (afp(1.f) + exp(-sum1[0])); + sum1[1] = afp(1.f) / (afp(1.f) + exp(-sum1[1])); + sum2[0] = afp(1.f) / (afp(1.f) + exp(-sum2[0])); + sum2[1] = afp(1.f) / (afp(1.f) + exp(-sum2[1])); + sum3[0] = afp(1.f) / (afp(1.f) + exp(-sum3[0])); + sum3[1] = afp(1.f) / (afp(1.f) + exp(-sum3[1])); + } + if (activation_type == 5) + { + sum0[0] = sum0[0] * tanh(log(exp(sum0[0]) + afp(1.f))); + sum0[1] = sum0[1] * tanh(log(exp(sum0[1]) + afp(1.f))); + sum1[0] = sum1[0] * tanh(log(exp(sum1[0]) + afp(1.f))); + sum1[1] = sum1[1] * tanh(log(exp(sum1[1]) + afp(1.f))); + sum2[0] = sum2[0] * tanh(log(exp(sum2[0]) + afp(1.f))); + sum2[1] = sum2[1] * tanh(log(exp(sum2[1]) + afp(1.f))); + sum3[0] = sum3[0] * tanh(log(exp(sum3[0]) + afp(1.f))); + sum3[1] = sum3[1] * tanh(log(exp(sum3[1]) + afp(1.f))); + } + +#if NCNN_image_shader + image3d_st8(top_blob, ivec3(sx4.r, sy4.r, gz), sum0); + image3d_st8(top_blob, ivec3(sx4.g, sy4.g, gz), sum1); + image3d_st8(top_blob, ivec3(sx4.b, sy4.b, gz), sum2); + image3d_st8(top_blob, ivec3(sx4.a, sy4.a, gz), sum3); +#else + int gi = gz * psc(outcstep) + gx; + + // afp tmp = afp(1.0f); + // sum0 = afpvec8(afpvec4(tmp), afpvec4(tmp)); + // sum1 = afpvec8(afpvec4(tmp), afpvec4(tmp)); + // sum2 = afpvec8(afpvec4(tmp), afpvec4(tmp)); + // sum3 = afpvec8(afpvec4(tmp), afpvec4(tmp)); + // w_offset = 0; + // sum0 = buffer_ld8(weight_data, w_offset + 0); + // sum1 = buffer_ld8(weight_data, w_offset + 1); + // sum2 = buffer_ld8(weight_data, w_offset + 2); + // sum3 = buffer_ld8(weight_data, w_offset + 3); + + buffer_st8(top_blob_data, gi + 0, sum0); + if (gx + 1 < psc(outcstep)) buffer_st8(top_blob_data, gi + 1, sum1); + if (gx + 2 < psc(outcstep)) buffer_st8(top_blob_data, gi + 2, sum2); + if (gx + 3 < psc(outcstep)) buffer_st8(top_blob_data, gi + 3, sum3); +#endif +} diff --git a/source/device/vulkan/shaders/convolution_pack8_3x3s1d1_winograd23_gemm.comp b/source/device/vulkan/shaders/convolution_pack8_3x3s1d1_winograd23_gemm.comp new file mode 100644 index 000000000..e5f619fd3 --- /dev/null +++ b/source/device/vulkan/shaders/convolution_pack8_3x3s1d1_winograd23_gemm.comp @@ -0,0 +1,198 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
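+
+// Winograd F(2x2, 3x3) GEMM stage: the input transform emits 16 values per 4x4 tile, so gx
+// indexes the 16 tile positions, gy walks the tiles four at a time and gz the packed output
+// channels. For each input channel, one packed input value per tile is multiplied against
+// eight packed weight values (an 8-wide dot product per output lane) and accumulated into
+// the transformed output tiles consumed by the output-transform shader.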
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#define shape_constant_id_offset 0 +layout (constant_id = shape_constant_id_offset + 0) const int c = 0; +layout (constant_id = shape_constant_id_offset + 1) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 2) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 3) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 4) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_tm_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_tm_blob; +layout (binding = 2) uniform unfp sampler3D weight_tm_blob; +#else +layout (binding = 0) readonly buffer bottom_tm_blob { sfpvec8 bottom_tm_blob_data[]; }; +layout (binding = 1) writeonly buffer top_tm_blob { sfpvec8 top_tm_blob_data[]; }; +layout (binding = 2) readonly buffer weight_tm_blob { sfpvec8 weight_tm_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int c; + int cstep; + + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y) * 4; + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= 16 || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec8 sum0 = afpvec8(afpvec4(0.f), afpvec4(0.f)); + afpvec8 sum1 = afpvec8(afpvec4(0.f), afpvec4(0.f)); + afpvec8 sum2 = afpvec8(afpvec4(0.f), afpvec4(0.f)); + afpvec8 sum3 = afpvec8(afpvec4(0.f), afpvec4(0.f)); + +#if NCNN_image_shader + int wx = gx * 8; + + for (int z = 0; z < psc(c); z++) + { + afpvec8 v0 = image3d_ld8(bottom_tm_blob, ivec3(gx, gy + 0, z)); + afpvec8 v1 = image3d_ld8(bottom_tm_blob, ivec3(gx, gy + 1, z)); + afpvec8 v2 = image3d_ld8(bottom_tm_blob, ivec3(gx, gy + 2, z)); + afpvec8 v3 = image3d_ld8(bottom_tm_blob, ivec3(gx, gy + 3, z)); + + afpvec8 k0 = image3d_ld8(weight_tm_blob, ivec3(wx + 0, z, gz)); + afpvec8 k1 = image3d_ld8(weight_tm_blob, ivec3(wx + 1, z, gz)); + afpvec8 k2 = image3d_ld8(weight_tm_blob, ivec3(wx + 2, z, gz)); + afpvec8 k3 = image3d_ld8(weight_tm_blob, ivec3(wx + 3, z, gz)); + afpvec8 k4 = image3d_ld8(weight_tm_blob, ivec3(wx + 4, z, gz)); + afpvec8 k5 = image3d_ld8(weight_tm_blob, ivec3(wx + 5, z, gz)); + afpvec8 k6 = image3d_ld8(weight_tm_blob, ivec3(wx + 6, z, gz)); + afpvec8 k7 = image3d_ld8(weight_tm_blob, ivec3(wx + 7, z, gz)); + + // sum += v * k + sum0[0].r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]); + sum0[0].g += dot(v0[0], k1[0]) + dot(v0[1], k1[1]); + sum0[0].b += dot(v0[0], k2[0]) + dot(v0[1], k2[1]); + sum0[0].a += dot(v0[0], k3[0]) + dot(v0[1], k3[1]); + sum0[1].r += dot(v0[0], k4[0]) + dot(v0[1], k4[1]); + sum0[1].g += dot(v0[0], k5[0]) + dot(v0[1], k5[1]); + sum0[1].b += dot(v0[0], k6[0]) + dot(v0[1], k6[1]); + sum0[1].a += dot(v0[0], k7[0]) + dot(v0[1], k7[1]); + + sum1[0].r += dot(v1[0], k0[0]) + dot(v1[1], k0[1]); + sum1[0].g += dot(v1[0], k1[0]) + dot(v1[1], k1[1]); + sum1[0].b += dot(v1[0], k2[0]) + dot(v1[1], k2[1]); + sum1[0].a += dot(v1[0], k3[0]) + dot(v1[1], k3[1]); + sum1[1].r += dot(v1[0], k4[0]) + dot(v1[1], k4[1]); + sum1[1].g += dot(v1[0], k5[0]) + dot(v1[1], k5[1]); + sum1[1].b += dot(v1[0], k6[0]) + dot(v1[1], k6[1]); + sum1[1].a += dot(v1[0], k7[0]) + dot(v1[1], k7[1]); + + sum2[0].r += dot(v2[0], k0[0]) + dot(v2[1], k0[1]); + sum2[0].g += dot(v2[0], k1[0]) + 
dot(v2[1], k1[1]); + sum2[0].b += dot(v2[0], k2[0]) + dot(v2[1], k2[1]); + sum2[0].a += dot(v2[0], k3[0]) + dot(v2[1], k3[1]); + sum2[1].r += dot(v2[0], k4[0]) + dot(v2[1], k4[1]); + sum2[1].g += dot(v2[0], k5[0]) + dot(v2[1], k5[1]); + sum2[1].b += dot(v2[0], k6[0]) + dot(v2[1], k6[1]); + sum2[1].a += dot(v2[0], k7[0]) + dot(v2[1], k7[1]); + + sum3[0].r += dot(v3[0], k0[0]) + dot(v3[1], k0[1]); + sum3[0].g += dot(v3[0], k1[0]) + dot(v3[1], k1[1]); + sum3[0].b += dot(v3[0], k2[0]) + dot(v3[1], k2[1]); + sum3[0].a += dot(v3[0], k3[0]) + dot(v3[1], k3[1]); + sum3[1].r += dot(v3[0], k4[0]) + dot(v3[1], k4[1]); + sum3[1].g += dot(v3[0], k5[0]) + dot(v3[1], k5[1]); + sum3[1].b += dot(v3[0], k6[0]) + dot(v3[1], k6[1]); + sum3[1].a += dot(v3[0], k7[0]) + dot(v3[1], k7[1]); + } +#else + int v_offset = gy * 16 + gx; + int w_offset = (gz * psc(c) * 16 + gx) * 8; + + for (int z = 0; z < psc(c); z++) + { + afpvec8 v0 = buffer_ld8(bottom_tm_blob_data, v_offset + 0); + afpvec8 v1 = buffer_ld8(bottom_tm_blob_data, v_offset + 16); + afpvec8 v2 = buffer_ld8(bottom_tm_blob_data, v_offset + 32); + afpvec8 v3 = buffer_ld8(bottom_tm_blob_data, v_offset + 48); + + afpvec8 k0 = buffer_ld8(weight_tm_data, w_offset + 0); + afpvec8 k1 = buffer_ld8(weight_tm_data, w_offset + 1); + afpvec8 k2 = buffer_ld8(weight_tm_data, w_offset + 2); + afpvec8 k3 = buffer_ld8(weight_tm_data, w_offset + 3); + afpvec8 k4 = buffer_ld8(weight_tm_data, w_offset + 4); + afpvec8 k5 = buffer_ld8(weight_tm_data, w_offset + 5); + afpvec8 k6 = buffer_ld8(weight_tm_data, w_offset + 6); + afpvec8 k7 = buffer_ld8(weight_tm_data, w_offset + 7); + + // sum += v * k + sum0[0].r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]); + sum0[0].g += dot(v0[0], k1[0]) + dot(v0[1], k1[1]); + sum0[0].b += dot(v0[0], k2[0]) + dot(v0[1], k2[1]); + sum0[0].a += dot(v0[0], k3[0]) + dot(v0[1], k3[1]); + sum0[1].r += dot(v0[0], k4[0]) + dot(v0[1], k4[1]); + sum0[1].g += dot(v0[0], k5[0]) + dot(v0[1], k5[1]); + sum0[1].b += dot(v0[0], k6[0]) + dot(v0[1], k6[1]); + sum0[1].a += dot(v0[0], k7[0]) + dot(v0[1], k7[1]); + + sum1[0].r += dot(v1[0], k0[0]) + dot(v1[1], k0[1]); + sum1[0].g += dot(v1[0], k1[0]) + dot(v1[1], k1[1]); + sum1[0].b += dot(v1[0], k2[0]) + dot(v1[1], k2[1]); + sum1[0].a += dot(v1[0], k3[0]) + dot(v1[1], k3[1]); + sum1[1].r += dot(v1[0], k4[0]) + dot(v1[1], k4[1]); + sum1[1].g += dot(v1[0], k5[0]) + dot(v1[1], k5[1]); + sum1[1].b += dot(v1[0], k6[0]) + dot(v1[1], k6[1]); + sum1[1].a += dot(v1[0], k7[0]) + dot(v1[1], k7[1]); + + sum2[0].r += dot(v2[0], k0[0]) + dot(v2[1], k0[1]); + sum2[0].g += dot(v2[0], k1[0]) + dot(v2[1], k1[1]); + sum2[0].b += dot(v2[0], k2[0]) + dot(v2[1], k2[1]); + sum2[0].a += dot(v2[0], k3[0]) + dot(v2[1], k3[1]); + sum2[1].r += dot(v2[0], k4[0]) + dot(v2[1], k4[1]); + sum2[1].g += dot(v2[0], k5[0]) + dot(v2[1], k5[1]); + sum2[1].b += dot(v2[0], k6[0]) + dot(v2[1], k6[1]); + sum2[1].a += dot(v2[0], k7[0]) + dot(v2[1], k7[1]); + + sum3[0].r += dot(v3[0], k0[0]) + dot(v3[1], k0[1]); + sum3[0].g += dot(v3[0], k1[0]) + dot(v3[1], k1[1]); + sum3[0].b += dot(v3[0], k2[0]) + dot(v3[1], k2[1]); + sum3[0].a += dot(v3[0], k3[0]) + dot(v3[1], k3[1]); + sum3[1].r += dot(v3[0], k4[0]) + dot(v3[1], k4[1]); + sum3[1].g += dot(v3[0], k5[0]) + dot(v3[1], k5[1]); + sum3[1].b += dot(v3[0], k6[0]) + dot(v3[1], k6[1]); + sum3[1].a += dot(v3[0], k7[0]) + dot(v3[1], k7[1]); + + v_offset += psc(cstep); + w_offset += 16 * 8; + } +#endif + +#if NCNN_image_shader + image3d_st8(top_tm_blob, ivec3(gx, gy + 0, gz), sum0); + image3d_st8(top_tm_blob, ivec3(gx, gy + 
1, gz), sum1); + image3d_st8(top_tm_blob, ivec3(gx, gy + 2, gz), sum2); + image3d_st8(top_tm_blob, ivec3(gx, gy + 3, gz), sum3); +#else + int gi = gz * psc(outcstep) + gy * 16 + gx; + + buffer_st8(top_tm_blob_data, gi + 0, sum0); + if (gy + 1 < psc(outh)) buffer_st8(top_tm_blob_data, gi + 16, sum1); + if (gy + 2 < psc(outh)) buffer_st8(top_tm_blob_data, gi + 32, sum2); + if (gy + 3 < psc(outh)) buffer_st8(top_tm_blob_data, gi + 48, sum3); +#endif +} diff --git a/source/device/vulkan/shaders/convolution_pack8_3x3s1d1_winograd23_transform_input.comp b/source/device/vulkan/shaders/convolution_pack8_3x3s1d1_winograd23_transform_input.comp new file mode 100644 index 000000000..23b89c572 --- /dev/null +++ b/source/device/vulkan/shaders/convolution_pack8_3x3s1d1_winograd23_transform_input.comp @@ -0,0 +1,203 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#define shape_constant_id_offset 0 +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; +layout (constant_id = shape_constant_id_offset + 2) const int c = 0; +layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 4) const int outcstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int block_x = 0; +layout (constant_id = shape_constant_id_offset + 6) const int block_y = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D bottom_tm_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer bottom_tm_blob { sfpvec8 bottom_tm_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int w; + int h; + int c; + int cstep; + + int outcstep; + + int block_x; + int block_y; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= p.block_x || gy >= p.block_y || gz >= psc(c)) + return; + + // load 4x4 +#if NCNN_image_shader + int sx = gx * 2; + int sy = gy * 2; + + afpvec8 v00 = image3d_ld8(bottom_blob, ivec3(sx + 0, sy + 0, gz)); + afpvec8 v01 = image3d_ld8(bottom_blob, ivec3(sx + 1, sy + 0, gz)); + afpvec8 v02 = image3d_ld8(bottom_blob, ivec3(sx + 2, sy + 0, gz)); + afpvec8 v03 = image3d_ld8(bottom_blob, ivec3(sx + 3, sy + 0, gz)); + + afpvec8 v10 = image3d_ld8(bottom_blob, ivec3(sx + 0, sy + 1, gz)); + afpvec8 v11 = image3d_ld8(bottom_blob, ivec3(sx + 1, sy + 1, gz)); + afpvec8 
v12 = image3d_ld8(bottom_blob, ivec3(sx + 2, sy + 1, gz)); + afpvec8 v13 = image3d_ld8(bottom_blob, ivec3(sx + 3, sy + 1, gz)); + + afpvec8 v20 = image3d_ld8(bottom_blob, ivec3(sx + 0, sy + 2, gz)); + afpvec8 v21 = image3d_ld8(bottom_blob, ivec3(sx + 1, sy + 2, gz)); + afpvec8 v22 = image3d_ld8(bottom_blob, ivec3(sx + 2, sy + 2, gz)); + afpvec8 v23 = image3d_ld8(bottom_blob, ivec3(sx + 3, sy + 2, gz)); + + afpvec8 v30 = image3d_ld8(bottom_blob, ivec3(sx + 0, sy + 3, gz)); + afpvec8 v31 = image3d_ld8(bottom_blob, ivec3(sx + 1, sy + 3, gz)); + afpvec8 v32 = image3d_ld8(bottom_blob, ivec3(sx + 2, sy + 3, gz)); + afpvec8 v33 = image3d_ld8(bottom_blob, ivec3(sx + 3, sy + 3, gz)); +#else + int v_offset_0 = gz * psc(cstep) + gy * 2 * psc(w) + gx * 2; + ivec4 v_offset = v_offset_0 + ivec4(0, 1, 2, 3) * psc(w); + + afpvec8 v00 = buffer_ld8(bottom_blob_data, v_offset.r + 0); + afpvec8 v01 = buffer_ld8(bottom_blob_data, v_offset.r + 1); + afpvec8 v02 = buffer_ld8(bottom_blob_data, v_offset.r + 2); + afpvec8 v03 = buffer_ld8(bottom_blob_data, v_offset.r + 3); + + afpvec8 v10 = buffer_ld8(bottom_blob_data, v_offset.g + 0); + afpvec8 v11 = buffer_ld8(bottom_blob_data, v_offset.g + 1); + afpvec8 v12 = buffer_ld8(bottom_blob_data, v_offset.g + 2); + afpvec8 v13 = buffer_ld8(bottom_blob_data, v_offset.g + 3); + + afpvec8 v20 = buffer_ld8(bottom_blob_data, v_offset.b + 0); + afpvec8 v21 = buffer_ld8(bottom_blob_data, v_offset.b + 1); + afpvec8 v22 = buffer_ld8(bottom_blob_data, v_offset.b + 2); + afpvec8 v23 = buffer_ld8(bottom_blob_data, v_offset.b + 3); + + afpvec8 v30 = buffer_ld8(bottom_blob_data, v_offset.a + 0); + afpvec8 v31 = buffer_ld8(bottom_blob_data, v_offset.a + 1); + afpvec8 v32 = buffer_ld8(bottom_blob_data, v_offset.a + 2); + afpvec8 v33 = buffer_ld8(bottom_blob_data, v_offset.a + 3); +#endif + + // const float itm[4][4] = { + // {1.0f, 0.0f, -1.0f, 0.0f}, + // {0.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, -1.0f, 1.0f, 0.0f}, + // {0.0f, -1.0f, 0.0f, 1.0f} + // }; + + // implicit transpose + afpvec8 m00 = v00 - v02; + afpvec8 m01 = v10 - v12; + afpvec8 m02 = v20 - v22; + afpvec8 m03 = v30 - v32; + + afpvec8 m10 = v02 + v01; + afpvec8 m11 = v12 + v11; + afpvec8 m12 = v22 + v21; + afpvec8 m13 = v32 + v31; + + afpvec8 m20 = v02 - v01; + afpvec8 m21 = v12 - v11; + afpvec8 m22 = v22 - v21; + afpvec8 m23 = v32 - v31; + + afpvec8 m30 = v03 - v01; + afpvec8 m31 = v13 - v11; + afpvec8 m32 = v23 - v21; + afpvec8 m33 = v33 - v31; + + v00 = m00 - m02; + v10 = m10 - m12; + v20 = m20 - m22; + v30 = m30 - m32; + + v01 = m02 + m01; + v11 = m12 + m11; + v21 = m22 + m21; + v31 = m32 + m31; + + v02 = m02 - m01; + v12 = m12 - m11; + v22 = m22 - m21; + v32 = m32 - m31; + + v03 = m03 - m01; + v13 = m13 - m11; + v23 = m23 - m21; + v33 = m33 - m31; + + // store 16 +#if NCNN_image_shader + int y = gy * p.block_x + gx; + + image3d_st8(bottom_tm_blob, ivec3(0, y, gz), v00); + image3d_st8(bottom_tm_blob, ivec3(1, y, gz), v01); + image3d_st8(bottom_tm_blob, ivec3(2, y, gz), v02); + image3d_st8(bottom_tm_blob, ivec3(3, y, gz), v03); + image3d_st8(bottom_tm_blob, ivec3(4, y, gz), v10); + image3d_st8(bottom_tm_blob, ivec3(5, y, gz), v11); + image3d_st8(bottom_tm_blob, ivec3(6, y, gz), v12); + image3d_st8(bottom_tm_blob, ivec3(7, y, gz), v13); + image3d_st8(bottom_tm_blob, ivec3(8, y, gz), v20); + image3d_st8(bottom_tm_blob, ivec3(9, y, gz), v21); + image3d_st8(bottom_tm_blob, ivec3(10, y, gz), v22); + image3d_st8(bottom_tm_blob, ivec3(11, y, gz), v23); + image3d_st8(bottom_tm_blob, ivec3(12, y, gz), v30); + 
image3d_st8(bottom_tm_blob, ivec3(13, y, gz), v31); + image3d_st8(bottom_tm_blob, ivec3(14, y, gz), v32); + image3d_st8(bottom_tm_blob, ivec3(15, y, gz), v33); +#else + int v_tm_offset = gz * psc(outcstep) + (gy * p.block_x + gx) * 16; + + buffer_st8(bottom_tm_blob_data, v_tm_offset + 0, v00); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 1, v01); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 2, v02); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 3, v03); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 4, v10); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 5, v11); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 6, v12); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 7, v13); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 8, v20); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 9, v21); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 10, v22); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 11, v23); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 12, v30); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 13, v31); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 14, v32); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 15, v33); +#endif +} diff --git a/source/device/vulkan/shaders/convolution_pack8_3x3s1d1_winograd23_transform_output.comp b/source/device/vulkan/shaders/convolution_pack8_3x3s1d1_winograd23_transform_output.comp new file mode 100644 index 000000000..f15f48b8e --- /dev/null +++ b/source/device/vulkan/shaders/convolution_pack8_3x3s1d1_winograd23_transform_output.comp @@ -0,0 +1,230 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
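+
+// Winograd F(2x2, 3x3) output transform: every 4x4 tile produced by the GEMM stage is
+// reduced to a 2x2 output block by applying the 2x4 matrix
+//   A^T = [[1, 1,  1, 0],
+//          [0, 1, -1, 1]]
+// along both tile dimensions (the itm matrix commented in main below). Bias and activation
+// are fused here before the 2x2 block is written to the output blob.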
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int bias_term = 0; +layout (constant_id = 1) const int activation_type = 0; +layout (constant_id = 2) const float activation_param_0 = 0; +layout (constant_id = 3) const float activation_param_1 = 0; + +#define shape_constant_id_offset 4 +layout (constant_id = shape_constant_id_offset + 0) const int c = 0; +layout (constant_id = shape_constant_id_offset + 1) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 2) const int block_x = 0; +layout (constant_id = shape_constant_id_offset + 3) const int block_y = 0; + +layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D top_tm_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer top_tm_blob { sfpvec8 top_tm_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +layout (binding = 2) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int c; + int cstep; + + int block_x; + int block_y; + + int outw; + int outh; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= p.block_x || gy >= p.block_y || gz >= psc(c)) + return; + + // load 16 +#if NCNN_image_shader + int sy = gy * p.block_x + gx; + + afpvec8 v00 = image3d_ld8(top_tm_blob, ivec3(0, sy, gz)); + afpvec8 v01 = image3d_ld8(top_tm_blob, ivec3(1, sy, gz)); + afpvec8 v02 = image3d_ld8(top_tm_blob, ivec3(2, sy, gz)); + afpvec8 v03 = image3d_ld8(top_tm_blob, ivec3(3, sy, gz)); + afpvec8 v10 = image3d_ld8(top_tm_blob, ivec3(4, sy, gz)); + afpvec8 v11 = image3d_ld8(top_tm_blob, ivec3(5, sy, gz)); + afpvec8 v12 = image3d_ld8(top_tm_blob, ivec3(6, sy, gz)); + afpvec8 v13 = image3d_ld8(top_tm_blob, ivec3(7, sy, gz)); + afpvec8 v20 = image3d_ld8(top_tm_blob, ivec3(8, sy, gz)); + afpvec8 v21 = image3d_ld8(top_tm_blob, ivec3(9, sy, gz)); + afpvec8 v22 = image3d_ld8(top_tm_blob, ivec3(10, sy, gz)); + afpvec8 v23 = image3d_ld8(top_tm_blob, ivec3(11, sy, gz)); + afpvec8 v30 = image3d_ld8(top_tm_blob, ivec3(12, sy, gz)); + afpvec8 v31 = image3d_ld8(top_tm_blob, ivec3(13, sy, gz)); + afpvec8 v32 = image3d_ld8(top_tm_blob, ivec3(14, sy, gz)); + afpvec8 v33 = image3d_ld8(top_tm_blob, ivec3(15, sy, gz)); +#else + int v_tm_offset = gz * psc(cstep) + (gy * p.block_x + gx) * 16; + + afpvec8 v00 = buffer_ld8(top_tm_blob_data, v_tm_offset + 0); + afpvec8 v01 = buffer_ld8(top_tm_blob_data, v_tm_offset + 1); + afpvec8 v02 = buffer_ld8(top_tm_blob_data, v_tm_offset + 2); + afpvec8 v03 = buffer_ld8(top_tm_blob_data, v_tm_offset + 3); + afpvec8 v10 = buffer_ld8(top_tm_blob_data, v_tm_offset + 4); + afpvec8 v11 = buffer_ld8(top_tm_blob_data, v_tm_offset + 5); + afpvec8 v12 = buffer_ld8(top_tm_blob_data, v_tm_offset + 6); + afpvec8 v13 = buffer_ld8(top_tm_blob_data, v_tm_offset + 7); + afpvec8 v20 = buffer_ld8(top_tm_blob_data, v_tm_offset + 8); + 
afpvec8 v21 = buffer_ld8(top_tm_blob_data, v_tm_offset + 9); + afpvec8 v22 = buffer_ld8(top_tm_blob_data, v_tm_offset + 10); + afpvec8 v23 = buffer_ld8(top_tm_blob_data, v_tm_offset + 11); + afpvec8 v30 = buffer_ld8(top_tm_blob_data, v_tm_offset + 12); + afpvec8 v31 = buffer_ld8(top_tm_blob_data, v_tm_offset + 13); + afpvec8 v32 = buffer_ld8(top_tm_blob_data, v_tm_offset + 14); + afpvec8 v33 = buffer_ld8(top_tm_blob_data, v_tm_offset + 15); +#endif + + // const float itm[2][4] = { + // {1.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 1.0f} + // }; + + // implicit transpose + afpvec8 m00 = v00 + v01 + v02; + afpvec8 m01 = v10 + v11 + v12; + afpvec8 m02 = v20 + v21 + v22; + afpvec8 m03 = v30 + v31 + v32; + + afpvec8 m10 = v01 - v02 + v03; + afpvec8 m11 = v11 - v12 + v13; + afpvec8 m12 = v21 - v22 + v23; + afpvec8 m13 = v31 - v32 + v33; + + if (bias_term == 1) + { +#if NCNN_image_shader + const afpvec8 bias_value = image1d_ld8(bias_blob, gz); +#else + const afpvec8 bias_value = buffer_ld8(bias_data, gz); +#endif + + v00 = bias_value + m00 + m01 + m02; + v10 = bias_value + m10 + m11 + m12; + + v01 = bias_value + m01 - m02 + m03; + v11 = bias_value + m11 - m12 + m13; + } + else + { + v00 = m00 + m01 + m02; + v10 = m10 + m11 + m12; + + v01 = m01 - m02 + m03; + v11 = m11 - m12 + m13; + } + + if (activation_type == 1) + { + v00[0] = max(v00[0], afp(0.f)); + v00[1] = max(v00[1], afp(0.f)); + v10[0] = max(v10[0], afp(0.f)); + v10[1] = max(v10[1], afp(0.f)); + v01[0] = max(v01[0], afp(0.f)); + v01[1] = max(v01[1], afp(0.f)); + v11[0] = max(v11[0], afp(0.f)); + v11[1] = max(v11[1], afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + v00[0] = mix(v00[0], v00[0] * afp(slope), lessThan(v00[0], afpvec4(0.f))); + v00[1] = mix(v00[1], v00[1] * afp(slope), lessThan(v00[1], afpvec4(0.f))); + v10[0] = mix(v10[0], v10[0] * afp(slope), lessThan(v10[0], afpvec4(0.f))); + v10[1] = mix(v10[1], v10[1] * afp(slope), lessThan(v10[1], afpvec4(0.f))); + v01[0] = mix(v01[0], v01[0] * afp(slope), lessThan(v01[0], afpvec4(0.f))); + v01[1] = mix(v01[1], v01[1] * afp(slope), lessThan(v01[1], afpvec4(0.f))); + v11[0] = mix(v11[0], v11[0] * afp(slope), lessThan(v11[0], afpvec4(0.f))); + v11[1] = mix(v11[1], v11[1] * afp(slope), lessThan(v11[1], afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + v00[0] = clamp(v00[0], const_min, const_max); + v00[1] = clamp(v00[1], const_min, const_max); + v10[0] = clamp(v10[0], const_min, const_max); + v10[1] = clamp(v10[1], const_min, const_max); + v01[0] = clamp(v01[0], const_min, const_max); + v01[1] = clamp(v01[1], const_min, const_max); + v11[0] = clamp(v11[0], const_min, const_max); + v11[1] = clamp(v11[1], const_min, const_max); + } + if (activation_type == 4) + { + v00[0] = afp(1.f) / (afp(1.f) + exp(-v00[0])); + v00[1] = afp(1.f) / (afp(1.f) + exp(-v00[1])); + v10[0] = afp(1.f) / (afp(1.f) + exp(-v10[0])); + v10[1] = afp(1.f) / (afp(1.f) + exp(-v10[1])); + v01[0] = afp(1.f) / (afp(1.f) + exp(-v01[0])); + v01[1] = afp(1.f) / (afp(1.f) + exp(-v01[1])); + v11[0] = afp(1.f) / (afp(1.f) + exp(-v11[0])); + v11[1] = afp(1.f) / (afp(1.f) + exp(-v11[1])); + } + if (activation_type == 5) + { + v00[0] = v00[0] * tanh(log(exp(v00[0]) + afp(1.f))); + v00[1] = v00[1] * tanh(log(exp(v00[1]) + afp(1.f))); + v10[0] = v10[0] * tanh(log(exp(v10[0]) + afp(1.f))); + v10[1] = v10[1] * tanh(log(exp(v10[1]) + afp(1.f))); + v01[0] = v01[0] * tanh(log(exp(v01[0]) + 
afp(1.f))); + v01[1] = v01[1] * tanh(log(exp(v01[1]) + afp(1.f))); + v11[0] = v11[0] * tanh(log(exp(v11[0]) + afp(1.f))); + v11[1] = v11[1] * tanh(log(exp(v11[1]) + afp(1.f))); + } + + // store 2x2 +#if NCNN_image_shader + int x = gx * 2; + int y = gy * 2; + + image3d_st8(top_blob, ivec3(x, y, gz), v00); + image3d_st8(top_blob, ivec3(x + 1, y, gz), v01); + image3d_st8(top_blob, ivec3(x, y + 1, gz), v10); + image3d_st8(top_blob, ivec3(x + 1, y + 1, gz), v11); +#else + int v_offset_0 = gz * psc(outcstep) + gy * 2 * psc(outw) + gx * 2; + int v_offset_1 = v_offset_0 + psc(outw); + + buffer_st8(top_blob_data, v_offset_0 + 0, v00); + buffer_st8(top_blob_data, v_offset_0 + 1, v01); + buffer_st8(top_blob_data, v_offset_1 + 0, v10); + buffer_st8(top_blob_data, v_offset_1 + 1, v11); +#endif +} diff --git a/source/device/vulkan/shaders/convolution_pack8to1.comp b/source/device/vulkan/shaders/convolution_pack8to1.comp new file mode 100644 index 000000000..8d5afd5d5 --- /dev/null +++ b/source/device/vulkan/shaders/convolution_pack8to1.comp @@ -0,0 +1,186 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
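+
+// convolution_pack8to1: input elements pack 8 channels (indexed as two 4-lane halves v[0]
+// and v[1]), the output stores a single channel per element (sfp). Each kernel tap adds
+// dot(v[0], k[0]) + dot(v[1], k[1]), i.e. a full 8-wide dot product per output scalar.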
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int activation_type = 0; +layout (constant_id = 8) const float activation_param_0 = 0; +layout (constant_id = 9) const float activation_param_1 = 0; + +#define shape_constant_id_offset 10 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afp sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld1(bias_blob, gz); +#else + sum = buffer_ld1(bias_data, gz); +#endif + } + else + { + sum = afp(0.f); + } + +#if NCNN_image_shader + for (int z = 0; z < psc(c); z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx, sy, z)); + + afpvec8 k = image3d_ld8(weight_blob, ivec3(wx, z, gz)); + + // sum += dot(v, k); + sum += dot(v[0], k[0]) + dot(v[1], k[1]); + + sx += dilation_w; + wx += 1; + } + + sy += dilation_h; + } + } +#else + int w_offset = gz * psc(c) * kernel_w * kernel_h; + + for (int z = 0; z < psc(c); z++) + { + int v_offset = z * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; 
x++) + { + afpvec8 v = buffer_ld8(bottom_blob_data, v_offset + x * dilation_w); + + afpvec8 k = buffer_ld8(weight_data, w_offset + x); + + // sum += dot(v, k); + sum += dot(v[0], k[0]) + dot(v[1], k[1]); + } + + v_offset += dilation_h * psc(w); + w_offset += kernel_w; + } + } +#endif + + if (activation_type == 1) + { + sum = max(sum, afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum = sum < afp(0.f) ? sum * slope : sum; + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum = clamp(sum, const_min, const_max); + } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } + if (activation_type == 5) + { + sum = sum * tanh(log(exp(sum) + afp(1.f))); + } + +#if NCNN_image_shader + image3d_st1(top_blob, ivec3(gx, gy, gz), sum); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st1(top_blob_data, gi, sum); +#endif +} diff --git a/source/device/vulkan/shaders/convolution_pack8to4.comp b/source/device/vulkan/shaders/convolution_pack8to4.comp new file mode 100644 index 000000000..a60bbffe8 --- /dev/null +++ b/source/device/vulkan/shaders/convolution_pack8to4.comp @@ -0,0 +1,198 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
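+
+// convolution_pack8to4: input elements pack 8 channels, output elements pack 4. Each kernel
+// tap loads one packed input value and four packed weight values (k0..k3); each output lane
+// accumulates an 8-wide dot product split across the two 4-lane halves.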
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int activation_type = 0; +layout (constant_id = 8) const float activation_param_0 = 0; +layout (constant_id = 9) const float activation_param_1 = 0; + +#define shape_constant_id_offset 10 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec4 sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld4(bias_blob, gz); +#else + sum = buffer_ld4(bias_data, gz); +#endif + } + else + { + sum = afpvec4(0.f); + } + +#if NCNN_image_shader + for (int z = 0; z < psc(c); z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx, sy, z)); + + afpvec8 k0 = image3d_ld8(weight_blob, ivec3(wx + 0, z, gz)); + afpvec8 k1 = image3d_ld8(weight_blob, ivec3(wx + 1, z, gz)); + afpvec8 k2 = image3d_ld8(weight_blob, ivec3(wx + 2, z, gz)); + afpvec8 k3 = image3d_ld8(weight_blob, ivec3(wx + 3, z, gz)); + + // sum += v * k + sum.r += dot(v[0], k0[0]) + dot(v[1], k0[1]); + sum.g += dot(v[0], k1[0]) + dot(v[1], k1[1]); + sum.b += dot(v[0], k2[0]) + dot(v[1], k2[1]); + sum.a += 
dot(v[0], k3[0]) + dot(v[1], k3[1]); + + sx += dilation_w; + wx += 4; + } + + sy += dilation_h; + } + } +#else + int w_offset = gz * psc(c) * kernel_w * kernel_h; + + for (int z = 0; z < psc(c); z++) + { + int v_offset = z * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v = buffer_ld8(bottom_blob_data, v_offset + x * dilation_w); + + afpvec8 k0 = buffer_ld8(weight_data, (w_offset + x) * 4 + 0); + afpvec8 k1 = buffer_ld8(weight_data, (w_offset + x) * 4 + 1); + afpvec8 k2 = buffer_ld8(weight_data, (w_offset + x) * 4 + 2); + afpvec8 k3 = buffer_ld8(weight_data, (w_offset + x) * 4 + 3); + + // sum += v * k + sum.r += dot(v[0], k0[0]) + dot(v[1], k0[1]); + sum.g += dot(v[0], k1[0]) + dot(v[1], k1[1]); + sum.b += dot(v[0], k2[0]) + dot(v[1], k2[1]); + sum.a += dot(v[0], k3[0]) + dot(v[1], k3[1]); + } + + v_offset += dilation_h * psc(w); + w_offset += kernel_w; + } + } +#endif + + if (activation_type == 1) + { + sum = max(sum, afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum = mix(sum, sum * afp(slope), lessThan(sum, afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum = clamp(sum, const_min, const_max); + } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } + if (activation_type == 5) + { + sum = sum * tanh(log(exp(sum) + afp(1.f))); + } + +#if NCNN_image_shader + image3d_st4(top_blob, ivec3(gx, gy, gz), sum); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st4(top_blob_data, gi, sum); +#endif +} diff --git a/source/device/vulkan/shaders/convolutiondepthwise.comp b/source/device/vulkan/shaders/convolutiondepthwise.comp new file mode 100644 index 000000000..b4316deb3 --- /dev/null +++ b/source/device/vulkan/shaders/convolutiondepthwise.comp @@ -0,0 +1,170 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
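+
+// depthwise convolution, pack1: output channel gz is convolved only with input
+// channel gz using its own kernel_w x kernel_h filter; one invocation per output element.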
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int group = 1; +layout (constant_id = 8) const int activation_type = 0; +layout (constant_id = 9) const float activation_param_0 = 0; +layout (constant_id = 10) const float activation_param_1 = 0; + +#define shape_constant_id_offset 11 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler2D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfp weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afp sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld1(bias_blob, gz); +#else + sum = buffer_ld1(bias_data, gz); +#endif + } + else + { + sum = afp(0.f); + } + + // depth-wise convolution +#if NCNN_image_shader + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + sum += image2d_ld1(weight_blob, ivec2(wx, gz)) * image3d_ld1(bottom_blob, ivec3(sx, sy, gz)); + + sx += dilation_w; + wx += 1; + } + + sy += dilation_h; + } +#else + int w_offset = gz * kernel_w * kernel_h; + int v_offset = gz * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + sum += buffer_ld1(weight_data, w_offset + x) * buffer_ld1(bottom_blob_data, v_offset + x * dilation_w); + } + + v_offset += dilation_h * psc(w); + w_offset 
+= kernel_w; + } +#endif + + if (activation_type == 1) + { + sum = max(sum, afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum = sum < afp(0.f) ? sum * slope : sum; + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum = clamp(sum, const_min, const_max); + } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } + if (activation_type == 5) + { + sum = sum * tanh(log(exp(sum) + afp(1.f))); + } + +#if NCNN_image_shader + image3d_st1(top_blob, ivec3(gx, gy, gz), sum); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st1(top_blob_data, gi, sum); +#endif +} diff --git a/source/device/vulkan/shaders/convolutiondepthwise_group.comp b/source/device/vulkan/shaders/convolutiondepthwise_group.comp new file mode 100644 index 000000000..32069bf5a --- /dev/null +++ b/source/device/vulkan/shaders/convolutiondepthwise_group.comp @@ -0,0 +1,186 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int group = 1; +layout (constant_id = 8) const int activation_type = 0; +layout (constant_id = 9) const float activation_param_0 = 0; +layout (constant_id = 10) const float activation_param_1 = 0; + +#define shape_constant_id_offset 11 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform 
unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfp weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afp sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld1(bias_blob, gz); +#else + sum = buffer_ld1(bias_data, gz); +#endif + } + else + { + sum = afp(0.f); + } + + // group convolution + const int channels_g = psc(c) / group; + const int num_output_g = psc(outc) / group; + + // group id + const int gg = gz / num_output_g; + +#if NCNN_image_shader + int sz = gg * channels_g; + + for (int z = 0; z < channels_g; z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + sum += image3d_ld1(weight_blob, ivec3(wx, z, gz)) * image3d_ld1(bottom_blob, ivec3(sx, sy, sz)); + + sx += dilation_w; + wx += 1; + } + + sy += dilation_h; + } + + sz += 1; + } +#else + int w_offset = gz * channels_g * kernel_w * kernel_h; + int v_offset_0 = gg * channels_g * psc(cstep); + + for (int z = 0; z < channels_g; z++) + { + int v_offset = v_offset_0 + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + sum += buffer_ld1(weight_data, w_offset + x) * buffer_ld1(bottom_blob_data, v_offset + x * dilation_w); + } + + v_offset += dilation_h * psc(w); + w_offset += kernel_w; + } + + v_offset_0 += psc(cstep); + } +#endif + + if (activation_type == 1) + { + sum = max(sum, afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum = sum < afp(0.f) ? sum * slope : sum; + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum = clamp(sum, const_min, const_max); + } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } + if (activation_type == 5) + { + sum = sum * tanh(log(exp(sum) + afp(1.f))); + } + +#if NCNN_image_shader + image3d_st1(top_blob, ivec3(gx, gy, gz), sum); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st1(top_blob_data, gi, sum); +#endif +} diff --git a/source/device/vulkan/shaders/convolutiondepthwise_group_pack1to4.comp b/source/device/vulkan/shaders/convolutiondepthwise_group_pack1to4.comp new file mode 100644 index 000000000..a3e9eb2e5 --- /dev/null +++ b/source/device/vulkan/shaders/convolutiondepthwise_group_pack1to4.comp @@ -0,0 +1,194 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int group = 1; +layout (constant_id = 8) const int activation_type = 0; +layout (constant_id = 9) const float activation_param_0 = 0; +layout (constant_id = 10) const float activation_param_1 = 0; + +#define shape_constant_id_offset 11 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec4 sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld4(bias_blob, gz); +#else + sum = buffer_ld4(bias_data, gz); +#endif + } + else + { + sum = afpvec4(0.f); + } + + // group convolution + const int channels_g = psc(c) / group; + const int num_output_g = psc(outc) / group; + + // group id + const int gg = gz / num_output_g; + +#if NCNN_image_shader + int sz = gg * channels_g; + + for (int z = 0; z < channels_g; z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < 
kernel_w; x++) + { + afp v = image3d_ld1(bottom_blob, ivec3(sx, sy, sz)); + + afpvec4 k = image3d_ld4(weight_blob, ivec3(wx, z, gz)); + + sum += v * k; + + sx += dilation_w; + wx += 1; + } + + sy += dilation_h; + } + + sz += 1; + } +#else + int w_offset = gz * channels_g * kernel_w * kernel_h; + int v_offset_0 = gg * channels_g * psc(cstep); + + for (int z = 0; z < channels_g; z++) + { + int v_offset = v_offset_0 + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + afp v = buffer_ld1(bottom_blob_data, v_offset + x * dilation_w); + + afpvec4 k = buffer_ld4(weight_data, w_offset + x); + + sum += v * k; + } + + v_offset += dilation_h * psc(w); + w_offset += kernel_w; + } + + v_offset_0 += psc(cstep); + } +#endif + + if (activation_type == 1) + { + sum = max(sum, afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum = mix(sum, sum * afp(slope), lessThan(sum, afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum = clamp(sum, const_min, const_max); + } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } + if (activation_type == 5) + { + sum = sum * tanh(log(exp(sum) + afp(1.f))); + } + +#if NCNN_image_shader + image3d_st4(top_blob, ivec3(gx, gy, gz), sum); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st4(top_blob_data, gi, sum); +#endif +} diff --git a/source/device/vulkan/shaders/convolutiondepthwise_group_pack1to8.comp b/source/device/vulkan/shaders/convolutiondepthwise_group_pack1to8.comp new file mode 100644 index 000000000..b32a6aa87 --- /dev/null +++ b/source/device/vulkan/shaders/convolutiondepthwise_group_pack1to8.comp @@ -0,0 +1,204 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
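+
+// grouped convolution, pack1 input -> pack8 output: channels_g and num_output_g are
+// the per-group input and output channel counts (in the blob's packing units),
+// gg = gz / num_output_g selects the group, and each scalar input sample is
+// multiplied into both vec4 halves of the vec8 accumulator.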
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int group = 1; +layout (constant_id = 8) const int activation_type = 0; +layout (constant_id = 9) const float activation_param_0 = 0; +layout (constant_id = 10) const float activation_param_1 = 0; + +#define shape_constant_id_offset 11 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec8 sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld8(bias_blob, gz); +#else + sum = buffer_ld8(bias_data, gz); +#endif + } + else + { + sum = afpvec8(afpvec4(0.f), afpvec4(0.f)); + } + + // group convolution + const int channels_g = psc(c) / group; + const int num_output_g = psc(outc) / group; + + // group id + const int gg = gz / num_output_g; + +#if NCNN_image_shader + int sz = gg * channels_g; + + for (int z = 0; z < channels_g; z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + afp v = image3d_ld1(bottom_blob, ivec3(sx, sy, sz)); + + afpvec8 k = image3d_ld8(weight_blob, ivec3(wx, z, gz)); + + // sum += v * k; + sum[0] += v * k[0]; + sum[1] += v * k[1]; + + sx += dilation_w; + wx += 1; + } + + sy += dilation_h; + } + + sz += 1; + } +#else + int w_offset = gz * channels_g * kernel_w * kernel_h; + int 
v_offset_0 = gg * channels_g * psc(cstep); + + for (int z = 0; z < channels_g; z++) + { + int v_offset = v_offset_0 + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + afp v = buffer_ld1(bottom_blob_data, v_offset + x * dilation_w); + + afpvec8 k = buffer_ld8(weight_data, w_offset + x); + + // sum += v * k; + sum[0] += v * k[0]; + sum[1] += v * k[1]; + } + + v_offset += dilation_h * psc(w); + w_offset += kernel_w; + } + + v_offset_0 += psc(cstep); + } +#endif + + if (activation_type == 1) + { + sum[0] = max(sum[0], afp(0.f)); + sum[1] = max(sum[1], afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum[0] = mix(sum[0], sum[0] * afp(slope), lessThan(sum[0], afpvec4(0.f))); + sum[1] = mix(sum[1], sum[1] * afp(slope), lessThan(sum[1], afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum[0] = clamp(sum[0], const_min, const_max); + sum[1] = clamp(sum[1], const_min, const_max); + } + if (activation_type == 4) + { + sum[0] = afp(1.f) / (afp(1.f) + exp(-sum[0])); + sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1])); + } + if (activation_type == 5) + { + sum[0] = sum[0] * tanh(log(exp(sum[0]) + afp(1.f))); + sum[1] = sum[1] * tanh(log(exp(sum[1]) + afp(1.f))); + } + +#if NCNN_image_shader + image3d_st8(top_blob, ivec3(gx, gy, gz), sum); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st8(top_blob_data, gi, sum); +#endif +} diff --git a/source/device/vulkan/shaders/convolutiondepthwise_group_pack4.comp b/source/device/vulkan/shaders/convolutiondepthwise_group_pack4.comp new file mode 100644 index 000000000..2c9661fa6 --- /dev/null +++ b/source/device/vulkan/shaders/convolutiondepthwise_group_pack4.comp @@ -0,0 +1,214 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
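+
+// grouped convolution, pack4 input -> pack4 output: the weights form a 4x4 matrix per
+// kernel tap, stored as four sfpvec4 columns when f16mat4 is unavailable and as
+// sfpmat4 otherwise, and applied as sum += v * k.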
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int group = 1; +layout (constant_id = 8) const int activation_type = 0; +layout (constant_id = 9) const float activation_param_0 = 0; +layout (constant_id = 10) const float activation_param_1 = 0; + +#define shape_constant_id_offset 11 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic) +// GL_EXT_shader_16bit_storage does not define f16mat4 type :( +layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; +#else +layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; }; +#endif +layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec4 sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld4(bias_blob, gz); +#else + sum = buffer_ld4(bias_data, gz); +#endif + } + else + { + sum = afpvec4(0.f); + } + + // group convolution + const int channels_g = psc(c) / group; + const int num_output_g = psc(outc) / group; + + // group id + const int gg = gz / num_output_g; + +#if NCNN_image_shader + int sz = gg * channels_g; + + for (int z = 0; z < channels_g; z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v = image3d_ld4(bottom_blob, ivec3(sx, sy, sz)); + + afpmat4 k = afpmat4( + image3d_ld4(weight_blob, ivec3(wx + 0, z, gz)), + image3d_ld4(weight_blob, ivec3(wx + 1, 
z, gz)), + image3d_ld4(weight_blob, ivec3(wx + 2, z, gz)), + image3d_ld4(weight_blob, ivec3(wx + 3, z, gz)) + ); + + sum += v * k; + + sx += dilation_w; + wx += 4; + } + + sy += dilation_h; + } + + sz += 1; + } +#else + int w_offset = gz * channels_g * kernel_w * kernel_h; + int v_offset_0 = gg * channels_g * psc(cstep); + + for (int z = 0; z < channels_g; z++) + { + int v_offset = v_offset_0 + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v = buffer_ld4(bottom_blob_data, v_offset + x * dilation_w); + +#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic) + // GL_EXT_shader_16bit_storage does not define f16mat4 type :( + afpmat4 k = afpmat4( + buffer_ld4(weight_data, (w_offset + x) * 4 + 0), + buffer_ld4(weight_data, (w_offset + x) * 4 + 1), + buffer_ld4(weight_data, (w_offset + x) * 4 + 2), + buffer_ld4(weight_data, (w_offset + x) * 4 + 3) + ); +#else + afpmat4 k = afpmat4(weight_data[w_offset + x]); +#endif + + sum += v * k; + } + + v_offset += dilation_h * psc(w); + w_offset += kernel_w; + } + + v_offset_0 += psc(cstep); + } +#endif + + if (activation_type == 1) + { + sum = max(sum, afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum = mix(sum, sum * afp(slope), lessThan(sum, afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum = clamp(sum, const_min, const_max); + } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } + if (activation_type == 5) + { + sum = sum * tanh(log(exp(sum) + afp(1.f))); + } + +#if NCNN_image_shader + image3d_st4(top_blob, ivec3(gx, gy, gz), sum); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st4(top_blob_data, gi, sum); +#endif +} diff --git a/source/device/vulkan/shaders/convolutiondepthwise_group_pack4to1.comp b/source/device/vulkan/shaders/convolutiondepthwise_group_pack4to1.comp new file mode 100644 index 000000000..7871cccb1 --- /dev/null +++ b/source/device/vulkan/shaders/convolutiondepthwise_group_pack4to1.comp @@ -0,0 +1,194 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
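+
+// grouped convolution, pack4 input -> pack1 output: each output scalar accumulates
+// dot(v, k) over the kernel window and the group's input channels.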
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int group = 1; +layout (constant_id = 8) const int activation_type = 0; +layout (constant_id = 9) const float activation_param_0 = 0; +layout (constant_id = 10) const float activation_param_1 = 0; + +#define shape_constant_id_offset 11 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afp sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld1(bias_blob, gz); +#else + sum = buffer_ld1(bias_data, gz); +#endif + } + else + { + sum = afp(0.f); + } + + // group convolution + const int channels_g = psc(c) / group; + const int num_output_g = psc(outc) / group; + + // group id + const int gg = gz / num_output_g; + +#if NCNN_image_shader + int sz = gg * channels_g; + + for (int z = 0; z < channels_g; z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v = image3d_ld4(bottom_blob, ivec3(sx, sy, sz)); + + afpvec4 k = image3d_ld4(weight_blob, ivec3(wx, z, gz)); + + sum += dot(v, k); + + sx += dilation_w; + wx += 1; + } + + sy += dilation_h; + } + + sz += 1; + } +#else + int w_offset = gz * channels_g * kernel_w * kernel_h; + int v_offset_0 = gg * channels_g * psc(cstep); + + for (int z = 0; z < channels_g; z++) + { + int v_offset = v_offset_0 + gy * 
stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v = buffer_ld4(bottom_blob_data, v_offset + x * dilation_w); + + afpvec4 k = buffer_ld4(weight_data, w_offset + x); + + sum += dot(v, k); + } + + v_offset += dilation_h * psc(w); + w_offset += kernel_w; + } + + v_offset_0 += psc(cstep); + } +#endif + + if (activation_type == 1) + { + sum = max(sum, afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum = sum < afp(0.f) ? sum * slope : sum; + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum = clamp(sum, const_min, const_max); + } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } + if (activation_type == 5) + { + sum = sum * tanh(log(exp(sum) + afp(1.f))); + } + +#if NCNN_image_shader + image3d_st1(top_blob, ivec3(gx, gy, gz), sum); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st1(top_blob_data, gi, sum); +#endif +} diff --git a/source/device/vulkan/shaders/convolutiondepthwise_group_pack4to8.comp b/source/device/vulkan/shaders/convolutiondepthwise_group_pack4to8.comp new file mode 100644 index 000000000..f369a244d --- /dev/null +++ b/source/device/vulkan/shaders/convolutiondepthwise_group_pack4to8.comp @@ -0,0 +1,230 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
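+
+// grouped convolution, pack4 input -> pack8 output: eight sfpvec4 weight vectors per
+// kernel tap, one per output lane; lane n accumulates dot(v, kn).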
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int group = 1; +layout (constant_id = 8) const int activation_type = 0; +layout (constant_id = 9) const float activation_param_0 = 0; +layout (constant_id = 10) const float activation_param_1 = 0; + +#define shape_constant_id_offset 11 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec8 sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld8(bias_blob, gz); +#else + sum = buffer_ld8(bias_data, gz); +#endif + } + else + { + sum = afpvec8(afpvec4(0.f), afpvec4(0.f)); + } + + // group convolution + const int channels_g = psc(c) / group; + const int num_output_g = psc(outc) / group; + + // group id + const int gg = gz / num_output_g; + +#if NCNN_image_shader + int sz = gg * channels_g; + + for (int z = 0; z < channels_g; z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v = image3d_ld4(bottom_blob, ivec3(sx, sy, sz)); + + afpvec4 k0 = image3d_ld4(weight_blob, ivec3(wx + 0, z, gz)); + afpvec4 k1 = image3d_ld4(weight_blob, ivec3(wx + 1, z, gz)); + afpvec4 k2 = image3d_ld4(weight_blob, ivec3(wx + 2, z, gz)); + afpvec4 k3 = image3d_ld4(weight_blob, ivec3(wx + 3, z, gz)); + afpvec4 k4 = 
image3d_ld4(weight_blob, ivec3(wx + 4, z, gz)); + afpvec4 k5 = image3d_ld4(weight_blob, ivec3(wx + 5, z, gz)); + afpvec4 k6 = image3d_ld4(weight_blob, ivec3(wx + 6, z, gz)); + afpvec4 k7 = image3d_ld4(weight_blob, ivec3(wx + 7, z, gz)); + + // sum += v * k; + sum[0].r += dot(v, k0); + sum[0].g += dot(v, k1); + sum[0].b += dot(v, k2); + sum[0].a += dot(v, k3); + sum[1].r += dot(v, k4); + sum[1].g += dot(v, k5); + sum[1].b += dot(v, k6); + sum[1].a += dot(v, k7); + + sx += dilation_w; + wx += 8; + } + + sy += dilation_h; + } + + sz += 1; + } +#else + int w_offset = gz * channels_g * kernel_w * kernel_h; + int v_offset_0 = gg * channels_g * psc(cstep); + + for (int z = 0; z < channels_g; z++) + { + int v_offset = v_offset_0 + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v = buffer_ld4(bottom_blob_data, v_offset + x * dilation_w); + + afpvec4 k0 = buffer_ld4(weight_data, (w_offset + x) * 8 + 0); + afpvec4 k1 = buffer_ld4(weight_data, (w_offset + x) * 8 + 1); + afpvec4 k2 = buffer_ld4(weight_data, (w_offset + x) * 8 + 2); + afpvec4 k3 = buffer_ld4(weight_data, (w_offset + x) * 8 + 3); + afpvec4 k4 = buffer_ld4(weight_data, (w_offset + x) * 8 + 4); + afpvec4 k5 = buffer_ld4(weight_data, (w_offset + x) * 8 + 5); + afpvec4 k6 = buffer_ld4(weight_data, (w_offset + x) * 8 + 6); + afpvec4 k7 = buffer_ld4(weight_data, (w_offset + x) * 8 + 7); + + // sum += v * k; + sum[0].r += dot(v, k0); + sum[0].g += dot(v, k1); + sum[0].b += dot(v, k2); + sum[0].a += dot(v, k3); + sum[1].r += dot(v, k4); + sum[1].g += dot(v, k5); + sum[1].b += dot(v, k6); + sum[1].a += dot(v, k7); + } + + v_offset += dilation_h * psc(w); + w_offset += kernel_w; + } + + v_offset_0 += psc(cstep); + } +#endif + + if (activation_type == 1) + { + sum[0] = max(sum[0], afp(0.f)); + sum[1] = max(sum[1], afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum[0] = mix(sum[0], sum[0] * afp(slope), lessThan(sum[0], afpvec4(0.f))); + sum[1] = mix(sum[1], sum[1] * afp(slope), lessThan(sum[1], afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum[0] = clamp(sum[0], const_min, const_max); + sum[1] = clamp(sum[1], const_min, const_max); + } + if (activation_type == 4) + { + sum[0] = afp(1.f) / (afp(1.f) + exp(-sum[0])); + sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1])); + } + if (activation_type == 5) + { + sum[0] = sum[0] * tanh(log(exp(sum[0]) + afp(1.f))); + sum[1] = sum[1] * tanh(log(exp(sum[1]) + afp(1.f))); + } + +#if NCNN_image_shader + image3d_st8(top_blob, ivec3(gx, gy, gz), sum); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st8(top_blob_data, gi, sum); +#endif +} diff --git a/source/device/vulkan/shaders/convolutiondepthwise_group_pack8.comp b/source/device/vulkan/shaders/convolutiondepthwise_group_pack8.comp new file mode 100644 index 000000000..abd16aed8 --- /dev/null +++ b/source/device/vulkan/shaders/convolutiondepthwise_group_pack8.comp @@ -0,0 +1,230 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int group = 1; +layout (constant_id = 8) const int activation_type = 0; +layout (constant_id = 9) const float activation_param_0 = 0; +layout (constant_id = 10) const float activation_param_1 = 0; + +#define shape_constant_id_offset 11 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec8 sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld8(bias_blob, gz); +#else + sum = buffer_ld8(bias_data, gz); +#endif + } + else + { + sum = afpvec8(afpvec4(0.f), afpvec4(0.f)); + } + + // group convolution + const int channels_g = psc(c) / group; + const int num_output_g = psc(outc) / group; + + // group id + const int gg = gz / num_output_g; + +#if NCNN_image_shader + int sz = gg * channels_g; + + for (int z = 0; z < channels_g; z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < 
kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx, sy, sz)); + + afpvec8 k0 = image3d_ld8(weight_blob, ivec3(wx + 0, z, gz)); + afpvec8 k1 = image3d_ld8(weight_blob, ivec3(wx + 1, z, gz)); + afpvec8 k2 = image3d_ld8(weight_blob, ivec3(wx + 2, z, gz)); + afpvec8 k3 = image3d_ld8(weight_blob, ivec3(wx + 3, z, gz)); + afpvec8 k4 = image3d_ld8(weight_blob, ivec3(wx + 4, z, gz)); + afpvec8 k5 = image3d_ld8(weight_blob, ivec3(wx + 5, z, gz)); + afpvec8 k6 = image3d_ld8(weight_blob, ivec3(wx + 6, z, gz)); + afpvec8 k7 = image3d_ld8(weight_blob, ivec3(wx + 7, z, gz)); + + // sum += v * k + sum[0].r += dot(v[0], k0[0]) + dot(v[1], k0[1]); + sum[0].g += dot(v[0], k1[0]) + dot(v[1], k1[1]); + sum[0].b += dot(v[0], k2[0]) + dot(v[1], k2[1]); + sum[0].a += dot(v[0], k3[0]) + dot(v[1], k3[1]); + sum[1].r += dot(v[0], k4[0]) + dot(v[1], k4[1]); + sum[1].g += dot(v[0], k5[0]) + dot(v[1], k5[1]); + sum[1].b += dot(v[0], k6[0]) + dot(v[1], k6[1]); + sum[1].a += dot(v[0], k7[0]) + dot(v[1], k7[1]); + + sx += dilation_w; + wx += 8; + } + + sy += dilation_h; + } + + sz += 1; + } +#else + int w_offset = gz * channels_g * kernel_w * kernel_h; + int v_offset_0 = gg * channels_g * psc(cstep); + + for (int z = 0; z < channels_g; z++) + { + int v_offset = v_offset_0 + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v = buffer_ld8(bottom_blob_data, v_offset + x * dilation_w); + + afpvec8 k0 = buffer_ld8(weight_data, (w_offset + x) * 8 + 0); + afpvec8 k1 = buffer_ld8(weight_data, (w_offset + x) * 8 + 1); + afpvec8 k2 = buffer_ld8(weight_data, (w_offset + x) * 8 + 2); + afpvec8 k3 = buffer_ld8(weight_data, (w_offset + x) * 8 + 3); + afpvec8 k4 = buffer_ld8(weight_data, (w_offset + x) * 8 + 4); + afpvec8 k5 = buffer_ld8(weight_data, (w_offset + x) * 8 + 5); + afpvec8 k6 = buffer_ld8(weight_data, (w_offset + x) * 8 + 6); + afpvec8 k7 = buffer_ld8(weight_data, (w_offset + x) * 8 + 7); + + // sum += v * k + sum[0].r += dot(v[0], k0[0]) + dot(v[1], k0[1]); + sum[0].g += dot(v[0], k1[0]) + dot(v[1], k1[1]); + sum[0].b += dot(v[0], k2[0]) + dot(v[1], k2[1]); + sum[0].a += dot(v[0], k3[0]) + dot(v[1], k3[1]); + sum[1].r += dot(v[0], k4[0]) + dot(v[1], k4[1]); + sum[1].g += dot(v[0], k5[0]) + dot(v[1], k5[1]); + sum[1].b += dot(v[0], k6[0]) + dot(v[1], k6[1]); + sum[1].a += dot(v[0], k7[0]) + dot(v[1], k7[1]); + } + + v_offset += dilation_h * psc(w); + w_offset += kernel_w; + } + + v_offset_0 += psc(cstep); + } +#endif + + if (activation_type == 1) + { + sum[0] = max(sum[0], afp(0.f)); + sum[1] = max(sum[1], afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum[0] = mix(sum[0], sum[0] * afp(slope), lessThan(sum[0], afpvec4(0.f))); + sum[1] = mix(sum[1], sum[1] * afp(slope), lessThan(sum[1], afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum[0] = clamp(sum[0], const_min, const_max); + sum[1] = clamp(sum[1], const_min, const_max); + } + if (activation_type == 4) + { + sum[0] = afp(1.f) / (afp(1.f) + exp(-sum[0])); + sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1])); + } + if (activation_type == 5) + { + sum[0] = sum[0] * tanh(log(exp(sum[0]) + afp(1.f))); + sum[1] = sum[1] * tanh(log(exp(sum[1]) + afp(1.f))); + } + +#if NCNN_image_shader + image3d_st8(top_blob, ivec3(gx, gy, gz), sum); +#else + const int gi = gz * 
psc(outcstep) + gy * psc(outw) + gx; + + buffer_st8(top_blob_data, gi, sum); +#endif +} diff --git a/source/device/vulkan/shaders/convolutiondepthwise_group_pack8to1.comp b/source/device/vulkan/shaders/convolutiondepthwise_group_pack8to1.comp new file mode 100644 index 000000000..c77771154 --- /dev/null +++ b/source/device/vulkan/shaders/convolutiondepthwise_group_pack8to1.comp @@ -0,0 +1,197 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int group = 1; +layout (constant_id = 8) const int activation_type = 0; +layout (constant_id = 9) const float activation_param_0 = 0; +layout (constant_id = 10) const float activation_param_1 = 0; + +#define shape_constant_id_offset 11 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + 
int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afp sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld1(bias_blob, gz); +#else + sum = buffer_ld1(bias_data, gz); +#endif + } + else + { + sum = afp(0.f); + } + + // group convolution + const int channels_g = psc(c) / group; + const int num_output_g = psc(outc) / group; + + // group id + const int gg = gz / num_output_g; + +#if NCNN_image_shader + int sz = gg * channels_g; + + for (int z = 0; z < channels_g; z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx, sy, sz)); + + afpvec8 k = image3d_ld8(weight_blob, ivec3(wx, z, gz)); + + // sum += dot(v, k); + sum += dot(v[0], k[0]) + dot(v[1], k[1]); + + sx += dilation_w; + wx += 1; + } + + sy += dilation_h; + } + + sz += 1; + } +#else + int w_offset = gz * channels_g * kernel_w * kernel_h; + int v_offset_0 = gg * channels_g * psc(cstep); + + for (int z = 0; z < channels_g; z++) + { + int v_offset = v_offset_0 + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v = buffer_ld8(bottom_blob_data, v_offset + x * dilation_w); + + afpvec8 k = buffer_ld8(weight_data, w_offset + x); + + // sum += dot(v, k); + sum += dot(v[0], k[0]) + dot(v[1], k[1]); + } + + v_offset += dilation_h * psc(w); + w_offset += kernel_w; + } + + v_offset_0 += psc(cstep); + } +#endif + + if (activation_type == 1) + { + sum = max(sum, afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum = sum < afp(0.f) ? sum * slope : sum; + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum = clamp(sum, const_min, const_max); + } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } + if (activation_type == 5) + { + sum = sum * tanh(log(exp(sum) + afp(1.f))); + } + +#if NCNN_image_shader + image3d_st1(top_blob, ivec3(gx, gy, gz), sum); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st1(top_blob_data, gi, sum); +#endif +} diff --git a/source/device/vulkan/shaders/convolutiondepthwise_group_pack8to4.comp b/source/device/vulkan/shaders/convolutiondepthwise_group_pack8to4.comp new file mode 100644 index 000000000..9c9f43a89 --- /dev/null +++ b/source/device/vulkan/shaders/convolutiondepthwise_group_pack8to4.comp @@ -0,0 +1,209 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
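+// Overview of the shader below: grouped convolution reading pack8 input channels and
+// writing pack4 output channels. Each invocation owns one output texel (gx, gy, gz);
+// for every input channel of its group it loads an 8-wide input vector v and four
+// 8-wide kernel vectors k0..k3, and the output lanes sum.r/g/b/a accumulate
+// dot(v, k0) .. dot(v, k3), expressed as dot(v[0], k[0]) + dot(v[1], k[1]) because
+// afpvec8 is handled as two vec4 halves. In the buffer path the four kernel vectors
+// for one tap sit at consecutive indices (w_offset + x) * 4 + {0, 1, 2, 3}.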
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int group = 1; +layout (constant_id = 8) const int activation_type = 0; +layout (constant_id = 9) const float activation_param_0 = 0; +layout (constant_id = 10) const float activation_param_1 = 0; + +#define shape_constant_id_offset 11 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec4 sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld4(bias_blob, gz); +#else + sum = buffer_ld4(bias_data, gz); +#endif + } + else + { + sum = afpvec4(0.f); + } + + // group convolution + const int channels_g = psc(c) / group; + const int num_output_g = psc(outc) / group; + + // group id + const int gg = gz / num_output_g; + +#if NCNN_image_shader + int sz = gg * channels_g; + + for (int z = 0; z < channels_g; z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx, sy, sz)); + + afpvec8 k0 = image3d_ld8(weight_blob, ivec3(wx + 0, z, gz)); + afpvec8 k1 = image3d_ld8(weight_blob, ivec3(wx + 1, z, gz)); + afpvec8 k2 = image3d_ld8(weight_blob, ivec3(wx + 2, z, gz)); + afpvec8 k3 = image3d_ld8(weight_blob, ivec3(wx + 3, z, gz)); + + // sum += v * k + sum.r += dot(v[0], 
k0[0]) + dot(v[1], k0[1]); + sum.g += dot(v[0], k1[0]) + dot(v[1], k1[1]); + sum.b += dot(v[0], k2[0]) + dot(v[1], k2[1]); + sum.a += dot(v[0], k3[0]) + dot(v[1], k3[1]); + + sx += dilation_w; + wx += 4; + } + + sy += dilation_h; + } + + sz += 1; + } +#else + int w_offset = gz * channels_g * kernel_w * kernel_h; + int v_offset_0 = gg * channels_g * psc(cstep); + + for (int z = 0; z < channels_g; z++) + { + int v_offset = v_offset_0 + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v = buffer_ld8(bottom_blob_data, v_offset + x * dilation_w); + + afpvec8 k0 = buffer_ld8(weight_data, (w_offset + x) * 4 + 0); + afpvec8 k1 = buffer_ld8(weight_data, (w_offset + x) * 4 + 1); + afpvec8 k2 = buffer_ld8(weight_data, (w_offset + x) * 4 + 2); + afpvec8 k3 = buffer_ld8(weight_data, (w_offset + x) * 4 + 3); + + // sum += v * k + sum.r += dot(v[0], k0[0]) + dot(v[1], k0[1]); + sum.g += dot(v[0], k1[0]) + dot(v[1], k1[1]); + sum.b += dot(v[0], k2[0]) + dot(v[1], k2[1]); + sum.a += dot(v[0], k3[0]) + dot(v[1], k3[1]); + } + + v_offset += dilation_h * psc(w); + w_offset += kernel_w; + } + + v_offset_0 += psc(cstep); + } +#endif + + if (activation_type == 1) + { + sum = max(sum, afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum = mix(sum, sum * afp(slope), lessThan(sum, afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum = clamp(sum, const_min, const_max); + } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } + if (activation_type == 5) + { + sum = sum * tanh(log(exp(sum) + afp(1.f))); + } + +#if NCNN_image_shader + image3d_st4(top_blob, ivec3(gx, gy, gz), sum); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st4(top_blob_data, gi, sum); +#endif +} diff --git a/source/device/vulkan/shaders/convolutiondepthwise_pack4.comp b/source/device/vulkan/shaders/convolutiondepthwise_pack4.comp new file mode 100644 index 000000000..0bd4929bf --- /dev/null +++ b/source/device/vulkan/shaders/convolutiondepthwise_pack4.comp @@ -0,0 +1,178 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
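+// Overview of the shader below: depth-wise convolution on pack4 data. Each output
+// channel group gz reads only its own input channel group, so the kernel window is
+// walked once and the element-wise product v * k is accumulated into a single
+// afpvec4 sum; there is no reduction across channels as in the grouped variants.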
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int group = 1; +layout (constant_id = 8) const int activation_type = 0; +layout (constant_id = 9) const float activation_param_0 = 0; +layout (constant_id = 10) const float activation_param_1 = 0; + +#define shape_constant_id_offset 11 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler2D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec4 sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld4(bias_blob, gz); +#else + sum = buffer_ld4(bias_data, gz); +#endif + } + else + { + sum = afpvec4(0.f); + } + + // depth-wise convolution +#if NCNN_image_shader + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v = image3d_ld4(bottom_blob, ivec3(sx, sy, gz)); + + afpvec4 k = image2d_ld4(weight_blob, ivec2(wx, gz)); + + sum += v * k; + + sx += dilation_w; + wx += 1; + } + + sy += dilation_h; + } +#else + int w_offset = gz * kernel_w * kernel_h; + int v_offset = gz * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v = buffer_ld4(bottom_blob_data, v_offset + x * dilation_w); + + afpvec4 k = 
buffer_ld4(weight_data, w_offset + x); + + sum += v * k; + } + + v_offset += dilation_h * psc(w); + w_offset += kernel_w; + } +#endif + + if (activation_type == 1) + { + sum = max(sum, afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum = mix(sum, sum * afp(slope), lessThan(sum, afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum = clamp(sum, const_min, const_max); + } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } + if (activation_type == 5) + { + sum = sum * tanh(log(exp(sum) + afp(1.f))); + } + +#if NCNN_image_shader + image3d_st4(top_blob, ivec3(gx, gy, gz), sum); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st4(top_blob_data, gi, sum); +#endif +} diff --git a/source/device/vulkan/shaders/convolutiondepthwise_pack8.comp b/source/device/vulkan/shaders/convolutiondepthwise_pack8.comp new file mode 100644 index 000000000..d19c97053 --- /dev/null +++ b/source/device/vulkan/shaders/convolutiondepthwise_pack8.comp @@ -0,0 +1,191 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
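+// Overview of the shader below: depth-wise convolution on pack8 data, structurally
+// the same as the pack4 variant. The 8-wide vectors are processed as two vec4
+// halves, so the multiply-accumulate and every activation branch are applied to
+// sum[0] and sum[1] separately.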
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int group = 1; +layout (constant_id = 8) const int activation_type = 0; +layout (constant_id = 9) const float activation_param_0 = 0; +layout (constant_id = 10) const float activation_param_1 = 0; + +#define shape_constant_id_offset 11 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler2D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec8 sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld8(bias_blob, gz); +#else + sum = buffer_ld8(bias_data, gz); +#endif + } + else + { + sum = afpvec8(afpvec4(0.f), afpvec4(0.f)); + } + + // depth-wise convolution +#if NCNN_image_shader + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx, sy, gz)); + + afpvec8 k = image2d_ld8(weight_blob, ivec2(wx, gz)); + + // sum += v * k; + sum[0] += v[0] * k[0]; + sum[1] += v[1] * k[1]; + + sx += dilation_w; + wx += 1; + } + + sy += dilation_h; + } +#else + int w_offset = gz * kernel_w * kernel_h; + int v_offset = gz * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x 
= 0; x < kernel_w; x++) + { + afpvec8 v = buffer_ld8(bottom_blob_data, v_offset + x * dilation_w); + + afpvec8 k = buffer_ld8(weight_data, w_offset + x); + + // sum += v * k; + sum[0] += v[0] * k[0]; + sum[1] += v[1] * k[1]; + } + + v_offset += dilation_h * psc(w); + w_offset += kernel_w; + } +#endif + + if (activation_type == 1) + { + sum[0] = max(sum[0], afp(0.f)); + sum[1] = max(sum[1], afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum[0] = mix(sum[0], sum[0] * afp(slope), lessThan(sum[0], afpvec4(0.f))); + sum[1] = mix(sum[1], sum[1] * afp(slope), lessThan(sum[1], afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum[0] = clamp(sum[0], const_min, const_max); + sum[1] = clamp(sum[1], const_min, const_max); + } + if (activation_type == 4) + { + sum[0] = afp(1.f) / (afp(1.f) + exp(-sum[0])); + sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1])); + } + if (activation_type == 5) + { + sum[0] = sum[0] * tanh(log(exp(sum[0]) + afp(1.f))); + sum[1] = sum[1] * tanh(log(exp(sum[1]) + afp(1.f))); + } + +#if NCNN_image_shader + image3d_st8(top_blob, ivec3(gx, gy, gz), sum); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + // sum = afpvec8(afpvec4(gi), afpvec4(gi)); + // sum = buffer_ld8(bias_data, gz); + + buffer_st8(top_blob_data, gi, sum); +#endif +} diff --git a/source/device/vulkan/shaders/crop.comp b/source/device/vulkan/shaders/crop.comp new file mode 100644 index 000000000..234983eb0 --- /dev/null +++ b/source/device/vulkan/shaders/crop.comp @@ -0,0 +1,92 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
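+// Overview of the shader below: crop is an offset copy. For an output element
+// (gx, gy, gz) the source element is (gx + woffset, gy + hoffset, gz + coffset);
+// in the buffer path this is
+//     v_offset = (gz + coffset) * cstep + (gy + hoffset) * w + (gx + woffset)
+//     gi       = gz * outcstep + gy * outw + gx
+// and a single sfp value is copied per invocation.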
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int bugihfa = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; + + int woffset; + int hoffset; + int coffset; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + int x = gx + p.woffset; + int y = gy + p.hoffset; + int z = gz + p.coffset; + +#if NCNN_image_shader + image3d_cp1(top_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(x, y, z)); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + int v_offset = z * psc(cstep) + y * psc(w) + x; + + buffer_cp1(top_blob_data, gi, bottom_blob_data, v_offset); +#endif +} diff --git a/source/device/vulkan/shaders/crop_pack1to4.comp b/source/device/vulkan/shaders/crop_pack1to4.comp new file mode 100644 index 000000000..27056a11c --- /dev/null +++ b/source/device/vulkan/shaders/crop_pack1to4.comp @@ -0,0 +1,98 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
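+// Overview of the shader below: crop that also repacks. Four consecutive pack1 input
+// channels starting at z = gz * 4 + coffset are gathered into one pack4 output
+// element; the buffer path forms the four source indices at once as an ivec4 by
+// adding ivec4(0, 1, 2, 3) * cstep.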
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int bugihfa = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; + + int woffset; + int hoffset; + int coffset; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + int x = gx + p.woffset; + int y = gy + p.hoffset; + int z = gz * 4 + p.coffset; + +#if NCNN_image_shader + afpvec4 v; + v.r = image3d_ld1(bottom_blob, ivec3(x, y, z + 0)); + v.g = image3d_ld1(bottom_blob, ivec3(x, y, z + 1)); + v.b = image3d_ld1(bottom_blob, ivec3(x, y, z + 2)); + v.a = image3d_ld1(bottom_blob, ivec3(x, y, z + 3)); + + image3d_st4(top_blob, ivec3(gx, gy, gz), v); +#else + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + ivec4 v_offset = z * psc(cstep) + y * psc(w) + x + ivec4(0, 1, 2, 3) * psc(cstep); + + buffer_cp1to4(top_blob_data, gi, bottom_blob_data, v_offset); +#endif +} diff --git a/source/device/vulkan/shaders/crop_pack1to8.comp b/source/device/vulkan/shaders/crop_pack1to8.comp new file mode 100644 index 000000000..5116a6995 --- /dev/null +++ b/source/device/vulkan/shaders/crop_pack1to8.comp @@ -0,0 +1,104 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
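+// Overview of the shader below: same idea as the pack1-to-pack4 crop, but eight
+// consecutive pack1 channels starting at z = gz * 8 + coffset are gathered into one
+// pack8 output element; the buffer path uses two ivec4 index groups, v_offset and
+// vv_offset = v_offset + 4 * cstep, one per vec4 half.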
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int bugihfa = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; + + int woffset; + int hoffset; + int coffset; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + int x = gx + p.woffset; + int y = gy + p.hoffset; + int z = gz * 8 + p.coffset; + +#if NCNN_image_shader + afpvec8 v; + v[0].r = image3d_ld1(bottom_blob, ivec3(x, y, z + 0)); + v[0].g = image3d_ld1(bottom_blob, ivec3(x, y, z + 1)); + v[0].b = image3d_ld1(bottom_blob, ivec3(x, y, z + 2)); + v[0].a = image3d_ld1(bottom_blob, ivec3(x, y, z + 3)); + v[1].r = image3d_ld1(bottom_blob, ivec3(x, y, z + 4)); + v[1].g = image3d_ld1(bottom_blob, ivec3(x, y, z + 5)); + v[1].b = image3d_ld1(bottom_blob, ivec3(x, y, z + 6)); + v[1].a = image3d_ld1(bottom_blob, ivec3(x, y, z + 7)); + + image3d_st8(top_blob, ivec3(gx, gy, gz), v); +#else + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + ivec4 v_offset = z * psc(cstep) + y * psc(w) + x + ivec4(0, 1, 2, 3) * psc(cstep); + ivec4 vv_offset = v_offset + 4 * psc(cstep); + + buffer_cp1to8(top_blob_data, gi, bottom_blob_data, v_offset, vv_offset); +#endif +} diff --git a/source/device/vulkan/shaders/crop_pack4.comp b/source/device/vulkan/shaders/crop_pack4.comp new file mode 100644 index 000000000..d9262c217 --- /dev/null +++ b/source/device/vulkan/shaders/crop_pack4.comp @@ -0,0 +1,92 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int bugihfa = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; + + int woffset; + int hoffset; + int coffset; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + int x = gx + p.woffset; + int y = gy + p.hoffset; + int z = gz + p.coffset; + +#if NCNN_image_shader + image3d_cp4(top_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(x, y, z)); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + int v_offset = z * psc(cstep) + y * psc(w) + x; + + buffer_cp4(top_blob_data, gi, bottom_blob_data, v_offset); +#endif +} diff --git a/source/device/vulkan/shaders/crop_pack4to1.comp b/source/device/vulkan/shaders/crop_pack4to1.comp new file mode 100644 index 000000000..69bf5069c --- /dev/null +++ b/source/device/vulkan/shaders/crop_pack4to1.comp @@ -0,0 +1,107 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int bugihfa = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +#else +#if NCNN_fp16_packed +layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; }; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +#endif +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; + + int woffset; + int hoffset; + int coffset; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + int x = gx + p.woffset; + int y = gy + p.hoffset; + int z = gz + p.coffset; + +#if NCNN_image_shader + afpvec4 v = image3d_ld4(bottom_blob, ivec3(x, y, z / 4)); + + image3d_st1(top_blob, ivec3(gx, gy, gz), v[z % 4]); +#else + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + +#if NCNN_fp16_packed + int v_offset = ((z / 4) * psc(cstep) + y * psc(w) + x) * 2 + (z % 4) / 2; + int lane2 = z % 2; + + afpvec2 v = buffer_ld2(bottom_blob_data, v_offset); + + buffer_st1(top_blob_data, gi, v[lane2]); +#else + int v_offset = ((z / 4) * psc(cstep) + y * psc(w) + x) * 4 + z % 4; + + buffer_cp1(top_blob_data, gi, bottom_blob_data, v_offset); +#endif +#endif +} diff --git a/source/device/vulkan/shaders/crop_pack4to8.comp b/source/device/vulkan/shaders/crop_pack4to8.comp new file mode 100644 index 000000000..6b46cdf26 --- /dev/null +++ b/source/device/vulkan/shaders/crop_pack4to8.comp @@ -0,0 +1,182 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int bugihfa = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +#else +#if NCNN_fp16_packed +layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; }; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +#endif +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; + + int woffset; + int hoffset; + int coffset; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + int x = gx + p.woffset; + int y = gy + p.hoffset; + ivec4 z4 = gz * 8 + p.coffset + ivec4(0, 1, 2, 3); + ivec4 zz4 = z4 + 4; + +#if NCNN_image_shader + afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(x, y, z4.r / 4)); + afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(x, y, z4.g / 4)); + afpvec4 v2 = image3d_ld4(bottom_blob, ivec3(x, y, z4.b / 4)); + afpvec4 v3 = image3d_ld4(bottom_blob, ivec3(x, y, z4.a / 4)); + afpvec4 v4 = image3d_ld4(bottom_blob, ivec3(x, y, zz4.r / 4)); + afpvec4 v5 = image3d_ld4(bottom_blob, ivec3(x, y, zz4.g / 4)); + afpvec4 v6 = image3d_ld4(bottom_blob, ivec3(x, y, zz4.b / 4)); + afpvec4 v7 = image3d_ld4(bottom_blob, ivec3(x, y, zz4.a / 4)); + + afpvec8 v; +#if NCNN_fp16_arithmetic + if (bugihfa == 1) + { + ivec4 z4m4 = z4 % 4; + ivec4 zz4m4 = zz4 % 4; + + if (z4m4.r == 0) v[0].r = v0.r; + if (z4m4.r == 1) v[0].r = v0.g; + if (z4m4.r == 2) v[0].r = v0.b; + if (z4m4.r == 3) v[0].r = v0.a; + if (z4m4.g == 0) v[0].g = v1.r; + if (z4m4.g == 1) v[0].g = v1.g; + if (z4m4.g == 2) v[0].g = 
v1.b; + if (z4m4.g == 3) v[0].g = v1.a; + if (z4m4.b == 0) v[0].b = v2.r; + if (z4m4.b == 1) v[0].b = v2.g; + if (z4m4.b == 2) v[0].b = v2.b; + if (z4m4.b == 3) v[0].b = v2.a; + if (z4m4.a == 0) v[0].a = v3.r; + if (z4m4.a == 1) v[0].a = v3.g; + if (z4m4.a == 2) v[0].a = v3.b; + if (z4m4.a == 3) v[0].a = v3.a; + if (zz4m4.r == 0) v[1].r = v4.r; + if (zz4m4.r == 1) v[1].r = v4.g; + if (zz4m4.r == 2) v[1].r = v4.b; + if (zz4m4.r == 3) v[1].r = v4.a; + if (zz4m4.g == 0) v[1].g = v5.r; + if (zz4m4.g == 1) v[1].g = v5.g; + if (zz4m4.g == 2) v[1].g = v5.b; + if (zz4m4.g == 3) v[1].g = v5.a; + if (zz4m4.b == 0) v[1].b = v6.r; + if (zz4m4.b == 1) v[1].b = v6.g; + if (zz4m4.b == 2) v[1].b = v6.b; + if (zz4m4.b == 3) v[1].b = v6.a; + if (zz4m4.a == 0) v[1].a = v7.r; + if (zz4m4.a == 1) v[1].a = v7.g; + if (zz4m4.a == 2) v[1].a = v7.b; + if (zz4m4.a == 3) v[1].a = v7.a; + } + else +#endif + { + v[0].r = v0[z4.r % 4]; + v[0].g = v1[z4.g % 4]; + v[0].b = v2[z4.b % 4]; + v[0].a = v3[z4.a % 4]; + v[1].r = v4[zz4.r % 4]; + v[1].g = v5[zz4.g % 4]; + v[1].b = v6[zz4.b % 4]; + v[1].a = v7[zz4.a % 4]; + } + + image3d_st8(top_blob, ivec3(gx, gy, gz), v); +#else +#if NCNN_fp16_packed + ivec4 v_offset = ((z4 / 4) * psc(cstep) + y * psc(w) + x) * 2 + (z4 % 4) / 2; + ivec4 lane2 = z4 % 2; + ivec4 vv_offset = ((zz4 / 4) * psc(cstep) + y * psc(w) + x) * 2 + (zz4 % 4) / 2; + ivec4 lane4 = zz4 % 2; + + afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r); + afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g); + afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b); + afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a); + + afpvec2 vvr = buffer_ld2(bottom_blob_data, vv_offset.r); + afpvec2 vvg = buffer_ld2(bottom_blob_data, vv_offset.g); + afpvec2 vvb = buffer_ld2(bottom_blob_data, vv_offset.b); + afpvec2 vva = buffer_ld2(bottom_blob_data, vv_offset.a); + + afpvec8 v = afpvec8(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a], vvr[lane4.r], vvg[lane4.g], vvb[lane4.b], vva[lane4.a]); + + buffer_st8(top_blob_data, gi, v); +#else + ivec4 v_offset = ((z4 / 4) * psc(cstep) + y * psc(w) + x) * 4 + z4 % 4; + ivec4 vv_offset = ((zz4 / 4) * psc(cstep) + y * psc(w) + x) * 4 + zz4 % 4; + + buffer_cp1to8(top_blob_data, gi, bottom_blob_data, v_offset, vv_offset); +#endif +#endif +} diff --git a/source/device/vulkan/shaders/crop_pack8.comp b/source/device/vulkan/shaders/crop_pack8.comp new file mode 100644 index 000000000..3465c79fe --- /dev/null +++ b/source/device/vulkan/shaders/crop_pack8.comp @@ -0,0 +1,93 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
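+// Overview of the shader below: pack8-to-pack8 crop, a straight copy of one sfpvec8
+// element per invocation with the source index shifted by (woffset, hoffset, coffset).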
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int bugihfa = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; + + int woffset; + int hoffset; + int coffset; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + int x = gx + p.woffset; + int y = gy + p.hoffset; + int z = gz + p.coffset; + +#if NCNN_image_shader + image3d_cp8(top_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(x, y, z)); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + int v_offset = z * psc(cstep) + y * psc(w) + x; + + buffer_cp8(top_blob_data, gi, bottom_blob_data, v_offset); +#endif +} diff --git a/source/device/vulkan/shaders/crop_pack8to1.comp b/source/device/vulkan/shaders/crop_pack8to1.comp new file mode 100644 index 000000000..885f9260b --- /dev/null +++ b/source/device/vulkan/shaders/crop_pack8to1.comp @@ -0,0 +1,108 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
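+// Overview of the shader below: crop that unpacks pack8 input to pack1 output. Output
+// channel gz maps to source channel z = gz + coffset; the shader reads the containing
+// 8-wide element (channel z / 8) and extracts lane z % 8. In the fp16-packed buffer
+// path that element is addressed as four sfpvec2 pairs, hence the
+// * 4 + (z % 8) / 2 indexing followed by a z % 2 lane select.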
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int bugihfa = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +#else +#if NCNN_fp16_packed +layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; }; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +#endif +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; + + int woffset; + int hoffset; + int coffset; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + int x = gx + p.woffset; + int y = gy + p.hoffset; + int z = gz + p.coffset; + +#if NCNN_image_shader + afpvec8 v = image3d_ld8(bottom_blob, ivec3(x, y, z / 8)); + + image3d_st1(top_blob, ivec3(gx, gy, gz), v[(z % 8) / 4][z % 4]); +#else + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + +#if NCNN_fp16_packed + int v_offset = ((z / 8) * psc(cstep) + y * psc(w) + x) * 4 + (z % 8) / 2; + int lane2 = z % 2; + + afpvec2 v = buffer_ld2(bottom_blob_data, v_offset); + + buffer_st1(top_blob_data, gi, v[lane2]); +#else + int v_offset = ((z / 8) * psc(cstep) + y * psc(w) + x) * 8 + z % 8; + + buffer_cp1(top_blob_data, gi, bottom_blob_data, v_offset); +#endif +#endif +} diff --git a/source/device/vulkan/shaders/crop_pack8to4.comp b/source/device/vulkan/shaders/crop_pack8to4.comp new file mode 100644 index 000000000..e102ce724 --- /dev/null +++ b/source/device/vulkan/shaders/crop_pack8to4.comp @@ -0,0 +1,149 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+
+#if NCNN_fp16_storage
+#extension GL_EXT_shader_16bit_storage: require
+struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
+#endif
+#if NCNN_fp16_arithmetic
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#endif
+
+layout (constant_id = 0) const int bugihfa = 0;
+
+#define shape_constant_id_offset 1
+layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
+layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
+layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
+layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
+layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;
+
+layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
+layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
+layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
+layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
+layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
+
+layout (local_size_x_id = 233) in;
+layout (local_size_y_id = 234) in;
+layout (local_size_z_id = 235) in;
+
+#if NCNN_image_shader
+layout (binding = 0) uniform unfp sampler3D bottom_blob;
+layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob;
+#else
+#if NCNN_fp16_packed
+layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; };
+#else
+layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
+#endif
+layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
+#endif
+
+layout (push_constant) uniform parameter
+{
+    int dims;
+    int w;
+    int h;
+    int c;
+    int cstep;
+
+    int outdims;
+    int outw;
+    int outh;
+    int outc;
+    int outcstep;
+
+    int woffset;
+    int hoffset;
+    int coffset;
+} p;
+
+void main()
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int gy = int(gl_GlobalInvocationID.y);
+    int gz = int(gl_GlobalInvocationID.z);
+
+    if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc))
+        return;
+
+    int gi = gz * psc(outcstep) + gy * psc(outw) + gx;
+
+    int x = gx + p.woffset;
+    int y = gy + p.hoffset;
+    ivec4 z4 = gz * 4 + p.coffset + ivec4(0, 1, 2, 3);
+
+#if NCNN_image_shader
+    afpvec8 v0 = image3d_ld8(bottom_blob, ivec3(x, y, z4.r / 8));
+    afpvec8 v1 = image3d_ld8(bottom_blob, ivec3(x, y, z4.g / 8));
+    afpvec8 v2 = image3d_ld8(bottom_blob, ivec3(x, y, z4.b / 8));
+    afpvec8 v3 = image3d_ld8(bottom_blob, ivec3(x, y, z4.a / 8));
+
+    afpvec4 v;
+#if NCNN_fp16_arithmetic
+    if (bugihfa == 1)
+    {
+        ivec4 z4lane2 = (z4 % 8) / 4;
+        ivec4 z4m4 = z4 % 4;
+
+        // each output lane must read from the vector loaded for its own channel
+        // (v0..v3), matching the non-bugihfa branch below
+        if (z4m4.r == 0) v.r = v0[z4lane2.r].r;
+        if (z4m4.r == 1) v.r = v0[z4lane2.r].g;
+        if (z4m4.r == 2) v.r = v0[z4lane2.r].b;
+        if (z4m4.r == 3) v.r = v0[z4lane2.r].a;
+        if (z4m4.g == 0) v.g = v1[z4lane2.g].r;
+        if (z4m4.g == 1) v.g = v1[z4lane2.g].g;
+        if (z4m4.g == 2) v.g = v1[z4lane2.g].b;
+        if (z4m4.g == 3) v.g = v1[z4lane2.g].a;
+        if (z4m4.b == 0) v.b = v2[z4lane2.b].r;
+        if (z4m4.b == 1) v.b = v2[z4lane2.b].g;
+        if (z4m4.b == 2) v.b = v2[z4lane2.b].b;
+        if (z4m4.b == 3) v.b = v2[z4lane2.b].a;
+        if (z4m4.a == 0) v.a = v3[z4lane2.a].r;
+        if (z4m4.a == 1) v.a = v3[z4lane2.a].g;
+        if (z4m4.a == 2) v.a = v3[z4lane2.a].b;
+        if (z4m4.a == 3) v.a = v3[z4lane2.a].a;
+    }
+    else
+#endif
+    {
+        v.r = v0[(z4.r % 8) / 4][z4.r % 4];
+        v.g = v1[(z4.g % 8) / 4][z4.g % 4];
+        v.b = v2[(z4.b % 8) / 4][z4.b % 4];
+        v.a = v3[(z4.a % 8) / 4][z4.a % 4];
+    }
+
+    image3d_st4(top_blob, ivec3(gx, gy, gz), v);
+#else
+#if NCNN_fp16_packed
+    ivec4 v_offset = ((z4 / 8) * psc(cstep) + y * psc(w) + x) * 4 + (z4 % 8) / 2;
+    ivec4 lane2 = z4 % 2;
+
+    afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r);
+    afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g);
+    afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b);
+    afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a);
+
+    afpvec4 v = afpvec4(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a]);
+
+    buffer_st4(top_blob_data, gi, v);
+#else
+    ivec4 v_offset = ((z4 / 8) * psc(cstep) + y * psc(w) + x) * 8 + z4 % 8;
+
+    buffer_cp1to4(top_blob_data, gi, bottom_blob_data, v_offset);
+#endif
+#endif
+}
diff --git a/source/device/vulkan/shaders/depthwiseconvolution.comp b/source/device/vulkan/shaders/depthwiseconvolution.comp
new file mode 100644
index 000000000..bbbabf1c9
--- /dev/null
+++ b/source/device/vulkan/shaders/depthwiseconvolution.comp
@@ -0,0 +1,121 @@
+#version 450
+
+#if NCNN_fp16_storage
+#extension GL_EXT_shader_16bit_storage: require
+#endif
+#if NCNN_fp16_arithmetic
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#endif
+
+layout (constant_id = 0) const int kernel_w = 1;
+layout (constant_id = 1) const int kernel_h = 1;
+layout (constant_id = 2) const int dilation_w = 1;
+layout (constant_id = 3) const int dilation_h = 1;
+layout (constant_id = 4) const int stride_w = 1;
+layout (constant_id = 5) const int stride_h = 1;
+layout (constant_id = 6) const int bias_term = 0;
+layout (constant_id = 7) const int group = 1;
+layout (constant_id = 8) const int activation_type = 0;
+layout (constant_id = 9) const float activation_param_0 = 0;
+layout (constant_id = 10) const float activation_param_1 = 0;
+
+#define shape_constant_id_offset 11
+layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
+layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
+layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
+layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
+layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;
+
+layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
+layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
+layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
+layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
+layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
+
+layout (local_size_x_id = 233) in;
+layout (local_size_y_id = 234) in;
+layout (local_size_z_id = 235) in;
+
+layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
+layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; };
+layout (binding = 2) readonly buffer weight_blob { sfp weight_data[]; };
+layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; };
+
+layout (push_constant) uniform parameter
+{
+    int dims;
+    int w;
+    int h;
+    int c;
+    int cstep;
+
+    int outdims;
+    int outw;
+    int outh;
+    int outc;
+    int outcstep;
+} p;
+
+void main()
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int gy = int(gl_GlobalInvocationID.y);
+    int gz =
int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afp sum; + + if (bias_term == 1) + { + sum = buffer_ld1(bias_data, gz); + } + else + { + sum = afp(0.f); + } + + // depth-wise convolution + int w_offset = gz * kernel_w * kernel_h; + int v_offset = gz * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + sum += buffer_ld1(weight_data, w_offset + x) * buffer_ld1(bottom_blob_data, v_offset + x * dilation_w); + } + + v_offset += dilation_h * psc(w); + w_offset += kernel_w; + } + + if (activation_type == 1) + { + sum = max(sum, afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum = sum < afp(0.f) ? sum * slope : sum; + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum = clamp(sum, const_min, const_max); + } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } + if (activation_type == 5) + { + sum = sum * tanh(log(exp(sum) + afp(1.f))); + } + + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + // sum = gi;//bottom_blob_data[gi]; + buffer_st1(top_blob_data, gi, sum); +} \ No newline at end of file diff --git a/source/device/vulkan/shaders/dropout.comp b/source/device/vulkan/shaders/dropout.comp new file mode 100644 index 000000000..53bf43a38 --- /dev/null +++ b/source/device/vulkan/shaders/dropout.comp @@ -0,0 +1,104 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
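For reference, the depthwise convolution loop in depthwiseconvolution.comp above computes one output element per invocation: each channel gz has its own kernel_w x kernel_h filter, the window walks an already-padded input with the given stride and dilation, and the optional bias plus the activation_type switch are applied to the accumulated sum before the store. A minimal plain-float sketch of that indexing, with the shape passed as push constants instead of the sfp/afp storage macros and specialization constants, and with bias and activation omitted (illustrative only, not part of the patch):

#version 450
layout (local_size_x = 8, local_size_y = 8) in;
layout (binding = 0) readonly buffer bottom_blob { float bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { float top_blob_data[]; };
layout (binding = 2) readonly buffer weight_blob { float weight_data[]; };
layout (push_constant) uniform parameter
{
    int w; int cstep;                 // padded input width and per-channel stride
    int outw; int outh; int outc; int outcstep;
    int kernel_w; int kernel_h;
    int dilation_w; int dilation_h;
    int stride_w; int stride_h;
} p;
void main()
{
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);
    if (gx >= p.outw || gy >= p.outh || gz >= p.outc) return;

    float sum = 0.0;
    // depthwise: output channel gz reads only input channel gz and its own filter
    int w_offset = gz * p.kernel_w * p.kernel_h;
    int v_offset = gz * p.cstep + gy * p.stride_h * p.w + gx * p.stride_w;
    for (int y = 0; y < p.kernel_h; y++)
    {
        for (int x = 0; x < p.kernel_w; x++)
            sum += weight_data[w_offset + x] * bottom_blob_data[v_offset + x * p.dilation_w];
        v_offset += p.dilation_h * p.w;   // move the window one (dilated) kernel row down
        w_offset += p.kernel_w;
    }
    top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sum;
}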
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const float scale = 1; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) buffer bottom_top_blob { sfp bottom_top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + afp v; + if (psc(dims) == 1) + { + v = image1d_ld1(bottom_blob_1d, gx); + } + else if (psc(dims) == 2) + { + v = image2d_ld1(bottom_blob_2d, ivec2(gx, gy)); + } + else // if (psc(dims) == 3) + { + v = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, gz)); + } +#else + const int gi = gz * psc(cstep) + gy * psc(w) + gx; + + afp v = buffer_ld1(bottom_top_blob_data, gi); +#endif + + v *= afp(scale); + +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_st1(top_blob_1d, gx, v); + } + else if (psc(dims) == 2) + { + image2d_st1(top_blob_2d, ivec2(gx, gy), v); + } + else // if (psc(dims) == 3) + { + image3d_st1(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else + buffer_st1(bottom_top_blob_data, gi, v); +#endif +} diff --git a/source/device/vulkan/shaders/dropout_pack4.comp b/source/device/vulkan/shaders/dropout_pack4.comp new file mode 100644 index 000000000..c71d37966 --- /dev/null +++ b/source/device/vulkan/shaders/dropout_pack4.comp @@ -0,0 +1,104 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const float scale = 1; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) buffer bottom_top_blob { sfpvec4 bottom_top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + afpvec4 v; + if (psc(dims) == 1) + { + v = image1d_ld4(bottom_blob_1d, gx); + } + else if (psc(dims) == 2) + { + v = image2d_ld4(bottom_blob_2d, ivec2(gx, gy)); + } + else // if (psc(dims) == 3) + { + v = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, gz)); + } +#else + const int gi = gz * psc(cstep) + gy * psc(w) + gx; + + afpvec4 v = buffer_ld4(bottom_top_blob_data, gi); +#endif + + v *= afp(scale); + +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_st4(top_blob_1d, gx, v); + } + else if (psc(dims) == 2) + { + image2d_st4(top_blob_2d, ivec2(gx, gy), v); + } + else // if (psc(dims) == 3) + { + image3d_st4(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else + buffer_st4(bottom_top_blob_data, gi, v); +#endif +} diff --git a/source/device/vulkan/shaders/dropout_pack8.comp b/source/device/vulkan/shaders/dropout_pack8.comp new file mode 100644 index 000000000..acecba62d --- /dev/null +++ b/source/device/vulkan/shaders/dropout_pack8.comp @@ -0,0 +1,106 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
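The _pack8 shaders that follow (starting with dropout_pack8.comp) lean on the sfpvec8 struct declared when fp16 storage is enabled: one packed element is two f16vec4 halves, and the arithmetic-side afpvec8 is indexed as v[0] and v[1], so every element-wise operation is simply applied to both halves. A rough plain-float equivalent of that layout and of the dropout scale, using an explicit struct in place of the macro types (illustrative only, not part of the patch):

#version 450
layout (local_size_x = 64) in;
// one pack8 element = two 4-wide halves, mirroring sfpvec8 { abcd; efgh; } / afpvec8 v[0], v[1]
struct vec8 { vec4 lo; vec4 hi; };
layout (binding = 0) buffer bottom_top_blob { vec8 bottom_top_blob_data[]; };
layout (push_constant) uniform parameter { int total; float scale; } p;
void main()
{
    int gx = int(gl_GlobalInvocationID.x);
    if (gx >= p.total) return;
    vec8 v = bottom_top_blob_data[gx];
    v.lo *= p.scale;   // corresponds to v[0] in dropout_pack8.comp
    v.hi *= p.scale;   // corresponds to v[1]
    bottom_top_blob_data[gx] = v;
}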
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const float scale = 1; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) buffer bottom_top_blob { sfpvec8 bottom_top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + afpvec8 v; + if (psc(dims) == 1) + { + v = image1d_ld8(bottom_blob_1d, gx); + } + else if (psc(dims) == 2) + { + v = image2d_ld8(bottom_blob_2d, ivec2(gx, gy)); + } + else // if (psc(dims) == 3) + { + v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); + } +#else + const int gi = gz * psc(cstep) + gy * psc(w) + gx; + + afpvec8 v = buffer_ld8(bottom_top_blob_data, gi); +#endif + + v[0] = v[0] * afp(scale); + v[1] = v[1] * afp(scale); + +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_st8(top_blob_1d, gx, v); + } + else if (psc(dims) == 2) + { + image2d_st8(top_blob_2d, ivec2(gx, gy), v); + } + else // if (psc(dims) == 3) + { + image3d_st8(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else + buffer_st8(bottom_top_blob_data, gi, v); +#endif +} diff --git a/source/device/vulkan/shaders/eltwise.comp b/source/device/vulkan/shaders/eltwise.comp new file mode 100644 index 000000000..addb1bfb0 --- /dev/null +++ b/source/device/vulkan/shaders/eltwise.comp @@ -0,0 +1,141 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
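eltwise.comp and its pack4/pack8 variants below all encode the same operator table: op_type 0 is the element-wise product, 1 is the sum, 2 is the max, and the coeff_term branch only changes the sum case, which becomes v1 * coeff0 + v2 * coeff1. A condensed plain-float sketch of that selection logic, with the shape flattened to a single index and the storage macros dropped (illustrative only, not part of the patch):

#version 450
layout (local_size_x = 64) in;
layout (constant_id = 0) const int op_type = 0;
layout (constant_id = 1) const int coeff_term = 0;
layout (binding = 0) readonly buffer bottom_blob1 { float a_data[]; };
layout (binding = 1) readonly buffer bottom_blob2 { float b_data[]; };
layout (binding = 2) writeonly buffer top_blob { float top_data[]; };
layout (push_constant) uniform parameter { int total; float coeff0; float coeff1; } p;
void main()
{
    int gx = int(gl_GlobalInvocationID.x);
    if (gx >= p.total) return;
    float v1 = a_data[gx];
    float v2 = b_data[gx];
    float res = 0.0;
    if (op_type == 0) res = v1 * v2;                                                    // product
    if (op_type == 1) res = coeff_term == 0 ? v1 + v2 : v1 * p.coeff0 + v2 * p.coeff1;  // sum
    if (op_type == 2) res = max(v1, v2);                                                // max
    top_data[gx] = res;
}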
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int op_type = 0; +layout (constant_id = 1) const int coeff_term = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob1_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob1_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob1_3d; +layout (binding = 1) uniform unfp sampler1D bottom_blob2_1d; +layout (binding = 1) uniform unfp sampler2D bottom_blob2_2d; +layout (binding = 1) uniform unfp sampler3D bottom_blob2_3d; +layout (binding = 2, imfmtc1) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 2, imfmtc1) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 2, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) readonly buffer bottom_blob1 { sfp bottom_blob1_data[]; }; +layout (binding = 1) readonly buffer bottom_blob2 { sfp bottom_blob2_data[]; }; +layout (binding = 2) writeonly buffer top_blob { sfp top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + float coeff0; + float coeff1; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + afp v1; + afp v2; + if (psc(dims) == 1) + { + v1 = image1d_ld1(bottom_blob1_1d, gx); + v2 = image1d_ld1(bottom_blob2_1d, gx); + } + else if (psc(dims) == 2) + { + v1 = image2d_ld1(bottom_blob1_2d, ivec2(gx, gy)); + v2 = image2d_ld1(bottom_blob2_2d, ivec2(gx, gy)); + } + else // if (psc(dims) == 3) + { + v1 = image3d_ld1(bottom_blob1_3d, ivec3(gx, gy, gz)); + v2 = image3d_ld1(bottom_blob2_3d, ivec3(gx, gy, gz)); + } +#else + const int gi = gz * psc(cstep) + gy * psc(w) + gx; + + afp v1 = buffer_ld1(bottom_blob1_data, gi); + afp v2 = buffer_ld1(bottom_blob2_data, gi); +#endif + + afp res; + + if (coeff_term == 0) + { + if (op_type == 0) + res = v1 * v2; + + if (op_type == 1) + res = v1 + v2; + + if (op_type == 2) + res = max(v1, v2); + } + else + { + if (op_type == 0) + res = v1 * v2; + + if (op_type == 1) + res = v1 * afp(p.coeff0) + v2 * afp(p.coeff1); + + if (op_type == 2) + res = max(v1, v2); + } + +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_st1(top_blob_1d, gx, res); + } + else if (psc(dims) == 2) + { + image2d_st1(top_blob_2d, ivec2(gx, gy), res); + } + else // if (psc(dims) == 3) + { + image3d_st1(top_blob_3d, ivec3(gx, gy, gz), res); + } +#else + buffer_st1(top_blob_data, gi, res); +#endif +} diff --git a/source/device/vulkan/shaders/eltwise_pack4.comp b/source/device/vulkan/shaders/eltwise_pack4.comp new file mode 100644 index 000000000..c93d1000b --- /dev/null +++ b/source/device/vulkan/shaders/eltwise_pack4.comp @@ -0,0 +1,141 @@ +// Tencent is pleased 
to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int op_type = 0; +layout (constant_id = 1) const int coeff_term = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob1_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob1_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob1_3d; +layout (binding = 1) uniform unfp sampler1D bottom_blob2_1d; +layout (binding = 1) uniform unfp sampler2D bottom_blob2_2d; +layout (binding = 1) uniform unfp sampler3D bottom_blob2_3d; +layout (binding = 2, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 2, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 2, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) readonly buffer bottom_blob1 { sfpvec4 bottom_blob1_data[]; }; +layout (binding = 1) readonly buffer bottom_blob2 { sfpvec4 bottom_blob2_data[]; }; +layout (binding = 2) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + float coeff0; + float coeff1; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + afpvec4 v1; + afpvec4 v2; + if (psc(dims) == 1) + { + v1 = image1d_ld4(bottom_blob1_1d, gx); + v2 = image1d_ld4(bottom_blob2_1d, gx); + } + else if (psc(dims) == 2) + { + v1 = image2d_ld4(bottom_blob1_2d, ivec2(gx, gy)); + v2 = image2d_ld4(bottom_blob2_2d, ivec2(gx, gy)); + } + else // if (psc(dims) == 3) + { + v1 = image3d_ld4(bottom_blob1_3d, ivec3(gx, gy, gz)); + v2 = image3d_ld4(bottom_blob2_3d, ivec3(gx, gy, gz)); + } +#else + const int gi = gz * psc(cstep) + gy * psc(w) + gx; + + afpvec4 v1 = buffer_ld4(bottom_blob1_data, gi); + afpvec4 v2 = buffer_ld4(bottom_blob2_data, gi); +#endif + + afpvec4 res; + + if (coeff_term == 0) + { + if (op_type == 0) + res = v1 * v2; + + if (op_type == 1) + res = v1 + v2; + + if (op_type == 2) + res = max(v1, v2); + } + else + { + if 
(op_type == 0) + res = v1 * v2; + + if (op_type == 1) + res = v1 * afp(p.coeff0) + v2 * afp(p.coeff1); + + if (op_type == 2) + res = max(v1, v2); + } + +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_st4(top_blob_1d, gx, res); + } + else if (psc(dims) == 2) + { + image2d_st4(top_blob_2d, ivec2(gx, gy), res); + } + else // if (psc(dims) == 3) + { + image3d_st4(top_blob_3d, ivec3(gx, gy, gz), res); + } +#else + buffer_st4(top_blob_data, gi, res); +#endif +} diff --git a/source/device/vulkan/shaders/eltwise_pack8.comp b/source/device/vulkan/shaders/eltwise_pack8.comp new file mode 100644 index 000000000..5f767b82f --- /dev/null +++ b/source/device/vulkan/shaders/eltwise_pack8.comp @@ -0,0 +1,160 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int op_type = 0; +layout (constant_id = 1) const int coeff_term = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob1_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob1_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob1_3d; +layout (binding = 1) uniform unfp sampler1D bottom_blob2_1d; +layout (binding = 1) uniform unfp sampler2D bottom_blob2_2d; +layout (binding = 1) uniform unfp sampler3D bottom_blob2_3d; +layout (binding = 2, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 2, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 2, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) readonly buffer bottom_blob1 { sfpvec8 bottom_blob1_data[]; }; +layout (binding = 1) readonly buffer bottom_blob2 { sfpvec8 bottom_blob2_data[]; }; +layout (binding = 2) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + float coeff0; + float coeff1; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + 
afpvec8 v1; + afpvec8 v2; + if (psc(dims) == 1) + { + v1 = image1d_ld8(bottom_blob1_1d, gx); + v2 = image1d_ld8(bottom_blob2_1d, gx); + } + else if (psc(dims) == 2) + { + v1 = image2d_ld8(bottom_blob1_2d, ivec2(gx, gy)); + v2 = image2d_ld8(bottom_blob2_2d, ivec2(gx, gy)); + } + else // if (psc(dims) == 3) + { + v1 = image3d_ld8(bottom_blob1_3d, ivec3(gx, gy, gz)); + v2 = image3d_ld8(bottom_blob2_3d, ivec3(gx, gy, gz)); + } +#else + const int gi = gz * psc(cstep) + gy * psc(w) + gx; + + afpvec8 v1 = buffer_ld8(bottom_blob1_data, gi); + afpvec8 v2 = buffer_ld8(bottom_blob2_data, gi); +#endif + + afpvec8 res; + + if (coeff_term == 0) + { + if (op_type == 0) + { + res[0] = v1[0] * v2[0]; + res[1] = v1[1] * v2[1]; + } + + if (op_type == 1) + { + res[0] = v1[0] + v2[0]; + res[1] = v1[1] + v2[1]; + } + + if (op_type == 2) + { + res[0] = max(v1[0], v2[0]); + res[1] = max(v1[1], v2[1]); + } + } + else + { + if (op_type == 0) + { + res[0] = v1[0] * v2[0]; + res[1] = v1[1] * v2[1]; + } + + if (op_type == 1) + { + res[0] = v1[0] * afp(p.coeff0) + v2[0] * afp(p.coeff1); + res[1] = v1[1] * afp(p.coeff0) + v2[1] * afp(p.coeff1); + } + + if (op_type == 2) + { + res[0] = max(v1[0], v2[0]); + res[1] = max(v1[1], v2[1]); + } + } + +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_st8(top_blob_1d, gx, res); + } + else if (psc(dims) == 2) + { + image2d_st8(top_blob_2d, ivec2(gx, gy), res); + } + else // if (psc(dims) == 3) + { + image3d_st8(top_blob_3d, ivec3(gx, gy, gz), res); + } +#else + buffer_st8(top_blob_data, gi, res); +#endif +} diff --git a/source/device/vulkan/shaders/flatten.comp b/source/device/vulkan/shaders/flatten.comp new file mode 100644 index 000000000..8cc137789 --- /dev/null +++ b/source/device/vulkan/shaders/flatten.comp @@ -0,0 +1,98 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
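flatten.comp below maps a 2-D or 3-D blob onto a 1-D output of length w*h*c: each output index gx is decomposed back into (x, y, z) with size = w*h, and the source offset is built from cstep, the per-channel stride (which may exceed w*h because of alignment padding), rather than from w*h itself. A plain-float sketch of the 3-D buffer path, without the image-shader branch or the storage macros (illustrative only, not part of the patch):

#version 450
layout (local_size_x = 64) in;
layout (binding = 0) readonly buffer bottom_blob { float bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { float top_blob_data[]; };
layout (push_constant) uniform parameter { int w; int h; int c; int cstep; } p;
void main()
{
    int gx = int(gl_GlobalInvocationID.x);
    if (gx >= p.w * p.h * p.c) return;
    int size = p.w * p.h;
    int z = gx / size;          // channel
    int y = gx % size / p.w;    // row
    int x = gx % size % p.w;    // column
    // cstep is the padded per-channel stride, so it cannot simply be replaced by w*h
    top_blob_data[gx] = bottom_blob_data[z * p.cstep + y * p.w + x];
}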
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#define shape_constant_id_offset 0 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= 1 || gz >= 1) + return; + + int size = psc(w) * psc(h); + + int z = gx / size; + int y = gx % size / psc(w); + int x = gx % size % psc(w); + +#if NCNN_image_shader + afp v; + + if (psc(dims) == 2) + { + v = image2d_ld1(bottom_blob_2d, ivec2(x, y)); + } + else // if (psc(dims) == 3) + { + v = image3d_ld1(bottom_blob_3d, ivec3(x, y, z)); + } + + image1d_st1(top_blob, gx, v); +#else + int v_offset = z * psc(cstep) + y * psc(w) + x; + + buffer_cp1(top_blob_data, gx, bottom_blob_data, v_offset); +#endif +} diff --git a/source/device/vulkan/shaders/flatten_pack1to4.comp b/source/device/vulkan/shaders/flatten_pack1to4.comp new file mode 100644 index 000000000..b0ff244e5 --- /dev/null +++ b/source/device/vulkan/shaders/flatten_pack1to4.comp @@ -0,0 +1,127 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#define shape_constant_id_offset 0 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= 1 || gz >= 1) + return; + + ivec4 i4 = gx * 4 + ivec4(0, 1, 2, 3); + +#if NCNN_image_shader + afpvec4 v; + + if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + + v.r = image2d_ld1(bottom_blob_2d, ivec2(x4.r, y4.r)); + v.g = image2d_ld1(bottom_blob_2d, ivec2(x4.g, y4.g)); + v.b = image2d_ld1(bottom_blob_2d, ivec2(x4.b, y4.b)); + v.a = image2d_ld1(bottom_blob_2d, ivec2(x4.a, y4.a)); + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + + v.r = image3d_ld1(bottom_blob_3d, ivec3(x4.r, y4.r, z4.r)); + v.g = image3d_ld1(bottom_blob_3d, ivec3(x4.g, y4.g, z4.g)); + v.b = image3d_ld1(bottom_blob_3d, ivec3(x4.b, y4.b, z4.b)); + v.a = image3d_ld1(bottom_blob_3d, ivec3(x4.a, y4.a, z4.a)); + } + + image1d_st4(top_blob, gx, v); +#else + ivec4 v_offset; + + if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + + v_offset = y4 * psc(w) + x4; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + + v_offset = z4 * psc(cstep) + y4 * psc(w) + x4; + } + + buffer_cp1to4(top_blob_data, gx, bottom_blob_data, v_offset); +#endif +} diff --git a/source/device/vulkan/shaders/flatten_pack1to8.comp b/source/device/vulkan/shaders/flatten_pack1to8.comp new file mode 100644 index 000000000..38f3f89d3 --- /dev/null +++ b/source/device/vulkan/shaders/flatten_pack1to8.comp @@ -0,0 +1,154 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. 
+// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#define shape_constant_id_offset 0 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= 1 || gz >= 1) + return; + + ivec4 i4 = gx * 8 + ivec4(0, 1, 2, 3); + ivec4 ii4 = i4 + 4; + +#if NCNN_image_shader + afpvec8 v; + + if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + + ivec4 yy4 = ii4 / psc(w); + ivec4 xx4 = ii4 % psc(w); + + v[0].r = image2d_ld1(bottom_blob_2d, ivec2(x4.r, y4.r)); + v[0].g = image2d_ld1(bottom_blob_2d, ivec2(x4.g, y4.g)); + v[0].b = image2d_ld1(bottom_blob_2d, ivec2(x4.b, y4.b)); + v[0].a = image2d_ld1(bottom_blob_2d, ivec2(x4.a, y4.a)); + v[1].r = image2d_ld1(bottom_blob_2d, ivec2(xx4.r, yy4.r)); + v[1].g = image2d_ld1(bottom_blob_2d, ivec2(xx4.g, yy4.g)); + v[1].b = image2d_ld1(bottom_blob_2d, ivec2(xx4.b, yy4.b)); + v[1].a = image2d_ld1(bottom_blob_2d, ivec2(xx4.a, yy4.a)); + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + + ivec4 zz4 = ii4 / size; + ivec4 yy4 = ii4 % size / psc(w); + ivec4 xx4 = ii4 % size % psc(w); + + v[0].r = image3d_ld1(bottom_blob_3d, ivec3(x4.r, y4.r, z4.r)); + v[0].g = image3d_ld1(bottom_blob_3d, ivec3(x4.g, y4.g, 
z4.g)); + v[0].b = image3d_ld1(bottom_blob_3d, ivec3(x4.b, y4.b, z4.b)); + v[0].a = image3d_ld1(bottom_blob_3d, ivec3(x4.a, y4.a, z4.a)); + v[1].r = image3d_ld1(bottom_blob_3d, ivec3(xx4.r, yy4.r, zz4.r)); + v[1].g = image3d_ld1(bottom_blob_3d, ivec3(xx4.g, yy4.g, zz4.g)); + v[1].b = image3d_ld1(bottom_blob_3d, ivec3(xx4.b, yy4.b, zz4.b)); + v[1].a = image3d_ld1(bottom_blob_3d, ivec3(xx4.a, yy4.a, zz4.a)); + } + + image1d_st8(top_blob, gx, v); +#else + ivec4 v_offset; + ivec4 vv_offset; + + if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + + ivec4 yy4 = ii4 / psc(w); + ivec4 xx4 = ii4 % psc(w); + + v_offset = y4 * psc(w) + x4; + vv_offset = yy4 * psc(w) + xx4; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + + ivec4 zz4 = ii4 / size; + ivec4 yy4 = ii4 % size / psc(w); + ivec4 xx4 = ii4 % size % psc(w); + + v_offset = z4 * psc(cstep) + y4 * psc(w) + x4; + vv_offset = zz4 * psc(cstep) + yy4 * psc(w) + xx4; + } + + buffer_cp1to8(top_blob_data, gx, bottom_blob_data, v_offset, vv_offset); +#endif +} diff --git a/source/device/vulkan/shaders/flatten_pack4.comp b/source/device/vulkan/shaders/flatten_pack4.comp new file mode 100644 index 000000000..a6827efd4 --- /dev/null +++ b/source/device/vulkan/shaders/flatten_pack4.comp @@ -0,0 +1,175 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
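flatten_pack4.comp below performs the same flattening but reads from a channel-packed (pack4) source, so each output vec4 gathers four consecutive flattened indices, and a logical channel z has to be located at packed channel z / 4, lane z % 4; on the plain (non-fp16_packed) buffer path the scalar offset is therefore ((z / 4) * cstep + y * w + x) * 4 + z % 4. A compact sketch of that gather for the 3-D case, with the unrolled .r/.g/.b/.a accesses folded into a loop (plain floats, illustrative only, not part of the patch):

#version 450
layout (local_size_x = 64) in;
// pack4 source stored lane-major: 4 floats per packed element
layout (binding = 0) readonly buffer bottom_blob { float bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { vec4 top_blob_data[]; };
layout (push_constant) uniform parameter { int w; int h; int cstep; int outw; } p;
void main()
{
    int gx = int(gl_GlobalInvocationID.x);
    if (gx >= p.outw) return;
    int size = p.w * p.h;
    vec4 v;
    for (int k = 0; k < 4; k++)
    {
        int i = gx * 4 + k;        // flattened logical index
        int z = i / size;          // logical channel
        int y = i % size / p.w;
        int x = i % size % p.w;
        // logical channel z lives in packed channel z / 4, lane z % 4
        v[k] = bottom_blob_data[((z / 4) * p.cstep + y * p.w + x) * 4 + z % 4];
    }
    top_blob_data[gx] = v;
}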
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#define shape_constant_id_offset 0 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob; +#else +#if NCNN_fp16_packed +layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; }; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +#endif +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= 1 || gz >= 1) + return; + + ivec4 i4 = gx * 4 + ivec4(0, 1, 2, 3); + +#if NCNN_image_shader + afpvec4 v; + + if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + + afpvec4 v0 = image2d_ld4(bottom_blob_2d, ivec2(x4.r, y4.r / 4)); + afpvec4 v1 = image2d_ld4(bottom_blob_2d, ivec2(x4.g, y4.g / 4)); + afpvec4 v2 = image2d_ld4(bottom_blob_2d, ivec2(x4.b, y4.b / 4)); + afpvec4 v3 = image2d_ld4(bottom_blob_2d, ivec2(x4.a, y4.a / 4)); + + v.r = v0[y4.r % 4]; + v.g = v1[y4.g % 4]; + v.b = v2[y4.b % 4]; + v.a = v3[y4.a % 4]; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + + afpvec4 v0 = image3d_ld4(bottom_blob_3d, ivec3(x4.r, y4.r, z4.r / 4)); + afpvec4 v1 = image3d_ld4(bottom_blob_3d, ivec3(x4.g, y4.g, z4.g / 4)); + afpvec4 v2 = image3d_ld4(bottom_blob_3d, ivec3(x4.b, y4.b, z4.b / 4)); + afpvec4 v3 = image3d_ld4(bottom_blob_3d, ivec3(x4.a, y4.a, z4.a / 4)); + + v.r = v0[z4.r % 4]; + v.g = v1[z4.g % 4]; + v.b = v2[z4.b % 4]; + v.a = v3[z4.a % 4]; + } + + image1d_st4(top_blob, gx, v); +#else +#if NCNN_fp16_packed + ivec4 v_offset; + ivec4 lane2; + + if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + + v_offset = ((y4 / 4) * psc(w) + x4) * 2 + (y4 % 4) / 2; + lane2 = y4 % 2; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + + v_offset = ((z4 / 4) * psc(cstep) + y4 * psc(w) + x4) * 2 + (z4 % 4) / 2; + lane2 = z4 % 2; + } + + 
afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r); + afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g); + afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b); + afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a); + + afpvec4 v = afpvec4(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a]); + + buffer_st4(top_blob_data, gx, v); +#else + ivec4 v_offset; + + if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + + v_offset = ((y4 / 4) * psc(w) + x4) * 4 + y4 % 4; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + + v_offset = ((z4 / 4) * psc(cstep) + y4 * psc(w) + x4) * 4 + z4 % 4; + } + + buffer_cp1to4(top_blob_data, gx, bottom_blob_data, v_offset); +#endif +#endif +} diff --git a/source/device/vulkan/shaders/flatten_pack4to8.comp b/source/device/vulkan/shaders/flatten_pack4to8.comp new file mode 100644 index 000000000..8dfaf3b15 --- /dev/null +++ b/source/device/vulkan/shaders/flatten_pack4to8.comp @@ -0,0 +1,222 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
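When NCNN_fp16_packed is enabled, the same pack4 source is declared as an array of sfpvec2, so the flatten_pack4 path above and the flatten_pack4to8 shader below first compute which 2-wide slot holds the value, ((z / 4) * cstep + y * w + x) * 2 + (z % 4) / 2, and then pick lane z % 2 inside that slot. A small sketch of the addressing with vec2 standing in for the packed half pairs (illustrative only, not part of the patch):

#version 450
layout (local_size_x = 64) in;
// fp16_packed pack4 storage sketch: each packed element occupies two 2-wide slots
layout (binding = 0) readonly buffer bottom_blob { vec2 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { float top_blob_data[]; };
layout (push_constant) uniform parameter { int w; int h; int cstep; int total; } p;
void main()
{
    int i = int(gl_GlobalInvocationID.x);
    if (i >= p.total) return;
    int size = p.w * p.h;
    int z = i / size;
    int y = i % size / p.w;
    int x = i % size % p.w;
    int slot = ((z / 4) * p.cstep + y * p.w + x) * 2 + (z % 4) / 2;  // which vec2 pair
    int lane = z % 2;                                                // which half of the pair
    top_blob_data[i] = bottom_blob_data[slot][lane];
}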
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#define shape_constant_id_offset 0 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob; +#else +#if NCNN_fp16_packed +layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; }; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +#endif +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= 1 || gz >= 1) + return; + + ivec4 i4 = gx * 8 + ivec4(0, 1, 2, 3); + ivec4 ii4 = i4 + 4; + +#if NCNN_image_shader + afpvec8 v; + + if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + ivec4 yy4 = ii4 / psc(w); + ivec4 xx4 = ii4 % psc(w); + + afpvec4 v0 = image2d_ld4(bottom_blob_2d, ivec2(x4.r, y4.r / 4)); + afpvec4 v1 = image2d_ld4(bottom_blob_2d, ivec2(x4.g, y4.g / 4)); + afpvec4 v2 = image2d_ld4(bottom_blob_2d, ivec2(x4.b, y4.b / 4)); + afpvec4 v3 = image2d_ld4(bottom_blob_2d, ivec2(x4.a, y4.a / 4)); + afpvec4 v4 = image2d_ld4(bottom_blob_2d, ivec2(xx4.r, yy4.r / 4)); + afpvec4 v5 = image2d_ld4(bottom_blob_2d, ivec2(xx4.g, yy4.g / 4)); + afpvec4 v6 = image2d_ld4(bottom_blob_2d, ivec2(xx4.b, yy4.b / 4)); + afpvec4 v7 = image2d_ld4(bottom_blob_2d, ivec2(xx4.a, yy4.a / 4)); + + v[0].r = v0[y4.r % 4]; + v[0].g = v1[y4.g % 4]; + v[0].b = v2[y4.b % 4]; + v[0].a = v3[y4.a % 4]; + v[1].r = v4[yy4.r % 4]; + v[1].g = v5[yy4.g % 4]; + v[1].b = v6[yy4.b % 4]; + v[1].a = v7[yy4.a % 4]; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + ivec4 zz4 = ii4 / size; + ivec4 yy4 = ii4 % size / psc(w); + ivec4 xx4 = ii4 % size % psc(w); + + afpvec4 v0 = image3d_ld4(bottom_blob_3d, ivec3(x4.r, y4.r, z4.r / 4)); + afpvec4 v1 = image3d_ld4(bottom_blob_3d, ivec3(x4.g, y4.g, z4.g / 4)); + afpvec4 v2 = image3d_ld4(bottom_blob_3d, ivec3(x4.b, y4.b, z4.b / 4)); + afpvec4 v3 = image3d_ld4(bottom_blob_3d, ivec3(x4.a, y4.a, z4.a / 4)); + 
afpvec4 v4 = image3d_ld4(bottom_blob_3d, ivec3(xx4.r, yy4.r, zz4.r / 4)); + afpvec4 v5 = image3d_ld4(bottom_blob_3d, ivec3(xx4.g, yy4.g, zz4.g / 4)); + afpvec4 v6 = image3d_ld4(bottom_blob_3d, ivec3(xx4.b, yy4.b, zz4.b / 4)); + afpvec4 v7 = image3d_ld4(bottom_blob_3d, ivec3(xx4.a, yy4.a, zz4.a / 4)); + + v[0].r = v0[z4.r % 4]; + v[0].g = v1[z4.g % 4]; + v[0].b = v2[z4.b % 4]; + v[0].a = v3[z4.a % 4]; + v[1].r = v4[zz4.r % 4]; + v[1].g = v5[zz4.g % 4]; + v[1].b = v6[zz4.b % 4]; + v[1].a = v7[zz4.a % 4]; + } + + image1d_st8(top_blob, gx, v); +#else +#if NCNN_fp16_packed + ivec4 v_offset; + ivec4 lane4; + ivec4 vv_offset; + ivec4 lane8; + + if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + ivec4 yy4 = ii4 / psc(w); + ivec4 xx4 = ii4 % psc(w); + + v_offset = ((y4 / 4) * psc(w) + x4) * 2 + (y4 % 4) / 2; + lane4 = y4 % 2; + vv_offset = ((yy4 / 4) * psc(w) + xx4) * 2 + (yy4 % 4) / 2; + lane8 = yy4 % 2; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + ivec4 zz4 = ii4 / size; + ivec4 yy4 = ii4 % size / psc(w); + ivec4 xx4 = ii4 % size % psc(w); + + v_offset = ((z4 / 4) * psc(cstep) + y4 * psc(w) + x4) * 2 + (z4 % 4) / 2; + lane4 = z4 % 2; + vv_offset = ((zz4 / 4) * psc(cstep) + yy4 * psc(w) + xx4) * 2 + (zz4 % 4) / 2; + lane8 = zz4 % 2; + } + + afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r); + afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g); + afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b); + afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a); + + afpvec2 vvr = buffer_ld2(bottom_blob_data, vv_offset.r); + afpvec2 vvg = buffer_ld2(bottom_blob_data, vv_offset.g); + afpvec2 vvb = buffer_ld2(bottom_blob_data, vv_offset.b); + afpvec2 vva = buffer_ld2(bottom_blob_data, vv_offset.a); + + afpvec8 v = afpvec8(vr[lane4.r], vg[lane4.g], vb[lane4.b], va[lane4.a], vvr[lane8.r], vvg[lane8.g], vvb[lane8.b], vva[lane8.a]); + + buffer_st8(top_blob_data, gx, v); +#else + ivec4 v_offset; + ivec4 vv_offset; + + if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + ivec4 yy4 = ii4 / psc(w); + ivec4 xx4 = ii4 % psc(w); + + v_offset = ((y4 / 4) * psc(w) + x4) * 4 + y4 % 4; + vv_offset = ((yy4 / 4) * psc(w) + xx4) * 4 + yy4 % 4; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + ivec4 zz4 = ii4 / size; + ivec4 yy4 = ii4 % size / psc(w); + ivec4 xx4 = ii4 % size % psc(w); + + v_offset = ((z4 / 4) * psc(cstep) + y4 * psc(w) + x4) * 4 + z4 % 4; + vv_offset = ((zz4 / 4) * psc(cstep) + yy4 * psc(w) + xx4) * 4 + zz4 % 4; + } + + buffer_cp1to8(top_blob_data, gx, bottom_blob_data, v_offset, vv_offset); +#endif +#endif +} diff --git a/source/device/vulkan/shaders/flatten_pack8.comp b/source/device/vulkan/shaders/flatten_pack8.comp new file mode 100644 index 000000000..01a06f451 --- /dev/null +++ b/source/device/vulkan/shaders/flatten_pack8.comp @@ -0,0 +1,222 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#define shape_constant_id_offset 0 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob; +#else +#if NCNN_fp16_packed +layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; }; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +#endif +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= 1 || gz >= 1) + return; + + ivec4 i4 = gx * 8 + ivec4(0, 1, 2, 3); + ivec4 ii4 = i4 + 4; + +#if NCNN_image_shader + afpvec8 v; + + if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + ivec4 yy4 = ii4 / psc(w); + ivec4 xx4 = ii4 % psc(w); + + afpvec8 v0 = image2d_ld8(bottom_blob_2d, ivec2(x4.r, y4.r / 8)); + afpvec8 v1 = image2d_ld8(bottom_blob_2d, ivec2(x4.g, y4.g / 8)); + afpvec8 v2 = image2d_ld8(bottom_blob_2d, ivec2(x4.b, y4.b / 8)); + afpvec8 v3 = image2d_ld8(bottom_blob_2d, ivec2(x4.a, y4.a / 8)); + afpvec8 v4 = image2d_ld8(bottom_blob_2d, ivec2(xx4.r, yy4.r / 8)); + afpvec8 v5 = image2d_ld8(bottom_blob_2d, ivec2(xx4.g, yy4.g / 8)); + afpvec8 v6 = image2d_ld8(bottom_blob_2d, ivec2(xx4.b, yy4.b / 8)); + afpvec8 v7 = image2d_ld8(bottom_blob_2d, ivec2(xx4.a, yy4.a / 8)); + + v[0].r = v0[(y4.r % 8) / 4][y4.r % 4]; + v[0].g = v1[(y4.g % 8) / 4][y4.g % 4]; + v[0].b = v2[(y4.b % 8) / 4][y4.b % 4]; + v[0].a = v3[(y4.a % 8) / 4][y4.a % 4]; + v[1].r = v4[(yy4.r % 8) / 4][yy4.r % 4]; + v[1].g = v5[(yy4.g % 8) / 4][yy4.g % 4]; + v[1].b = v6[(yy4.b % 8) / 4][yy4.b % 4]; + v[1].a = v7[(yy4.a % 8) / 4][yy4.a % 4]; 
+ } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + ivec4 zz4 = ii4 / size; + ivec4 yy4 = ii4 % size / psc(w); + ivec4 xx4 = ii4 % size % psc(w); + + afpvec8 v0 = image3d_ld8(bottom_blob_3d, ivec3(x4.r, y4.r, z4.r / 8)); + afpvec8 v1 = image3d_ld8(bottom_blob_3d, ivec3(x4.g, y4.g, z4.g / 8)); + afpvec8 v2 = image3d_ld8(bottom_blob_3d, ivec3(x4.b, y4.b, z4.b / 8)); + afpvec8 v3 = image3d_ld8(bottom_blob_3d, ivec3(x4.a, y4.a, z4.a / 8)); + afpvec8 v4 = image3d_ld8(bottom_blob_3d, ivec3(xx4.r, yy4.r, zz4.r / 8)); + afpvec8 v5 = image3d_ld8(bottom_blob_3d, ivec3(xx4.g, yy4.g, zz4.g / 8)); + afpvec8 v6 = image3d_ld8(bottom_blob_3d, ivec3(xx4.b, yy4.b, zz4.b / 8)); + afpvec8 v7 = image3d_ld8(bottom_blob_3d, ivec3(xx4.a, yy4.a, zz4.a / 8)); + + v[0].r = v0[(z4.r % 8) / 4][z4.r % 4]; + v[0].g = v1[(z4.g % 8) / 4][z4.g % 4]; + v[0].b = v2[(z4.b % 8) / 4][z4.b % 4]; + v[0].a = v3[(z4.a % 8) / 4][z4.a % 4]; + v[1].r = v4[(zz4.r % 8) / 4][zz4.r % 4]; + v[1].g = v5[(zz4.g % 8) / 4][zz4.g % 4]; + v[1].b = v6[(zz4.b % 8) / 4][zz4.b % 4]; + v[1].a = v7[(zz4.a % 8) / 4][zz4.a % 4]; + } + + image1d_st8(top_blob, gx, v); +#else +#if NCNN_fp16_packed + ivec4 v_offset; + ivec4 lane4; + ivec4 vv_offset; + ivec4 lane8; + + if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + ivec4 yy4 = ii4 / psc(w); + ivec4 xx4 = ii4 % psc(w); + + v_offset = ((y4 / 8) * psc(w) + x4) * 4 + (y4 % 8) / 2; + lane4 = y4 % 2; + vv_offset = ((yy4 / 8) * psc(w) + xx4) * 4 + (yy4 % 8) / 2; + lane8 = yy4 % 2; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + ivec4 zz4 = ii4 / size; + ivec4 yy4 = ii4 % size / psc(w); + ivec4 xx4 = ii4 % size % psc(w); + + v_offset = ((z4 / 8) * psc(cstep) + y4 * psc(w) + x4) * 4 + (z4 % 8) / 2; + lane4 = z4 % 2; + vv_offset = ((zz4 / 8) * psc(cstep) + yy4 * psc(w) + xx4) * 4 + (zz4 % 8) / 2; + lane8 = zz4 % 2; + } + + afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r); + afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g); + afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b); + afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a); + + afpvec2 vvr = buffer_ld2(bottom_blob_data, vv_offset.r); + afpvec2 vvg = buffer_ld2(bottom_blob_data, vv_offset.g); + afpvec2 vvb = buffer_ld2(bottom_blob_data, vv_offset.b); + afpvec2 vva = buffer_ld2(bottom_blob_data, vv_offset.a); + + afpvec8 v = afpvec8(vr[lane4.r], vg[lane4.g], vb[lane4.b], va[lane4.a], vvr[lane8.r], vvg[lane8.g], vvb[lane8.b], vva[lane8.a]); + + buffer_st8(top_blob_data, gx, v); +#else + ivec4 v_offset; + ivec4 vv_offset; + + if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + ivec4 yy4 = ii4 / psc(w); + ivec4 xx4 = ii4 % psc(w); + + v_offset = ((y4 / 8) * psc(w) + x4) * 8 + y4 % 8; + vv_offset = ((yy4 / 8) * psc(w) + xx4) * 8 + yy4 % 8; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + ivec4 zz4 = ii4 / size; + ivec4 yy4 = ii4 % size / psc(w); + ivec4 xx4 = ii4 % size % psc(w); + + v_offset = ((z4 / 8) * psc(cstep) + y4 * psc(w) + x4) * 8 + z4 % 8; + vv_offset = ((zz4 / 8) * psc(cstep) + yy4 * psc(w) + xx4) * 8 + zz4 % 8; + } + + buffer_cp1to8(top_blob_data, gx, bottom_blob_data, v_offset, vv_offset); +#endif +#endif +} diff --git 
a/source/device/vulkan/shaders/innerproduct.comp b/source/device/vulkan/shaders/innerproduct.comp new file mode 100644 index 000000000..baa8c4b9e --- /dev/null +++ b/source/device/vulkan/shaders/innerproduct.comp @@ -0,0 +1,140 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int bias_term = 0; +layout (constant_id = 1) const int activation_type = 0; +layout (constant_id = 2) const float activation_param_0 = 0; +layout (constant_id = 3) const float activation_param_1 = 0; + +#define shape_constant_id_offset 4 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob; +layout (binding = 2) uniform unfp sampler2D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfp weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= 1 || gz >= 1) + return; + + afp sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld1(bias_blob, gx); +#else + sum = buffer_ld1(bias_data, gx); +#endif + } + else + { + sum = afp(0.f); + } + +#if NCNN_image_shader + for (int i = 0; i < psc(w); i++) + { + sum += image2d_ld1(weight_blob, ivec2(i, gx)) * image1d_ld1(bottom_blob, i); + } 
+#else + int w_offset = gx * psc(w); + + for (int i = 0; i < psc(w); i++) + { + sum += buffer_ld1(weight_data, w_offset + i) * buffer_ld1(bottom_blob_data, i); + } +#endif + + if (activation_type == 1) + { + sum = max(sum, afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum = sum < afp(0.f) ? sum * slope : sum; + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum = clamp(sum, const_min, const_max); + } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } + if (activation_type == 5) + { + sum = sum * tanh(log(exp(sum) + afp(1.f))); + } + +#if NCNN_image_shader + image1d_st1(top_blob, gx, sum); +#else + buffer_st1(top_blob_data, gx, sum); +#endif +} diff --git a/source/device/vulkan/shaders/innerproduct_pack1to4.comp b/source/device/vulkan/shaders/innerproduct_pack1to4.comp new file mode 100644 index 000000000..d2f96e4ec --- /dev/null +++ b/source/device/vulkan/shaders/innerproduct_pack1to4.comp @@ -0,0 +1,148 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
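The plain innerproduct.comp above is the scalar (pack1) fully connected kernel: one invocation per output channel gx, accumulating weight[gx * w + i] * input[i] over all w input elements, with an optional bias and a final activation. The innerproduct_pack1to4.comp shader whose body follows produces four output channels per invocation from the same scalar-packed input. As an editor's illustration only (function and variable names are not part of the patch), its buffer path is equivalent to this CPU sketch:

#include <array>
#include <vector>

// CPU sketch of innerproduct_pack1to4: output group gx covers 4 channels and
// weights[gx * w + i] holds the 4 weights that input element i feeds into them.
std::array<float, 4> innerproduct_pack1to4(int gx, int w,
                                           const std::vector<float>& input,                  // pack1 input, length w
                                           const std::vector<std::array<float, 4>>& weights, // gx * w + i addressing
                                           const std::array<float, 4>& bias)                 // used when bias_term == 1
{
    std::array<float, 4> sum = bias;
    for (int i = 0; i < w; i++)
    {
        const float v = input[i];                             // buffer_ld1(bottom_blob_data, i)
        const std::array<float, 4>& k = weights[gx * w + i];  // buffer_ld4(weight_data, w_offset + i)
        for (int j = 0; j < 4; j++)
            sum[j] += v * k[j];                               // sum += v * k
    }
    return sum;                                               // activation is applied afterwards
}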
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int bias_term = 0; +layout (constant_id = 1) const int activation_type = 0; +layout (constant_id = 2) const float activation_param_0 = 0; +layout (constant_id = 3) const float activation_param_1 = 0; + +#define shape_constant_id_offset 4 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob; +layout (binding = 2) uniform unfp sampler2D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= 1 || gz >= 1) + return; + + afpvec4 sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld4(bias_blob, gx); +#else + sum = buffer_ld4(bias_data, gx); +#endif + } + else + { + sum = afpvec4(0.f); + } + +#if NCNN_image_shader + for (int i = 0; i < psc(w); i++) + { + afp v = image1d_ld1(bottom_blob, i); + + afpvec4 k = image2d_ld4(weight_blob, ivec2(i, gx)); + + sum += v * k; + } +#else + int w_offset = gx * psc(w); + + for (int i = 0; i < psc(w); i++) + { + afp v = buffer_ld1(bottom_blob_data, i); + + afpvec4 k = buffer_ld4(weight_data, w_offset + i); + + sum += v * k; + } +#endif + + if (activation_type == 1) + { + sum = max(sum, afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum = mix(sum, sum * afp(slope), lessThan(sum, afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum = clamp(sum, const_min, const_max); + } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } + if (activation_type == 5) + { + sum = sum * tanh(log(exp(sum) + afp(1.f))); + } + +#if NCNN_image_shader + image1d_st4(top_blob, gx, sum); +#else + buffer_st4(top_blob_data, gx, sum); +#endif +} diff --git 
a/source/device/vulkan/shaders/innerproduct_pack1to8.comp b/source/device/vulkan/shaders/innerproduct_pack1to8.comp new file mode 100644 index 000000000..5bb3ffd84 --- /dev/null +++ b/source/device/vulkan/shaders/innerproduct_pack1to8.comp @@ -0,0 +1,160 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int bias_term = 0; +layout (constant_id = 1) const int activation_type = 0; +layout (constant_id = 2) const float activation_param_0 = 0; +layout (constant_id = 3) const float activation_param_1 = 0; + +#define shape_constant_id_offset 4 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob; +layout (binding = 2) uniform unfp sampler2D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= 1 || gz >= 1) + return; + + afpvec8 sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld8(bias_blob, gx); +#else + sum = buffer_ld8(bias_data, gx); +#endif + } + else + { + sum = afpvec8(afpvec4(0.f), afpvec4(0.f)); + } + +#if NCNN_image_shader + 
int wx = 0; + + for (int i = 0; i < psc(w); i++) + { + afp v = image1d_ld1(bottom_blob, i); + + afpvec8 k = image2d_ld8(weight_blob, ivec2(i, gx)); + + // sum += v * k; + sum[0] += v * k[0]; + sum[1] += v * k[1]; + } +#else + int w_offset = gx * psc(w); + + for (int i = 0; i < psc(w); i++) + { + afp v = buffer_ld1(bottom_blob_data, i); + + afpvec8 k = buffer_ld8(weight_data, w_offset + i); + + // sum += v * k; + sum[0] += v * k[0]; + sum[1] += v * k[1]; + } +#endif + + if (activation_type == 1) + { + sum[0] = max(sum[0], afp(0.f)); + sum[1] = max(sum[1], afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum[0] = mix(sum[0], sum[0] * afp(slope), lessThan(sum[0], afpvec4(0.f))); + sum[1] = mix(sum[1], sum[1] * afp(slope), lessThan(sum[1], afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum[0] = clamp(sum[0], const_min, const_max); + sum[1] = clamp(sum[1], const_min, const_max); + } + if (activation_type == 4) + { + sum[0] = afp(1.f) / (afp(1.f) + exp(-sum[0])); + sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1])); + } + if (activation_type == 5) + { + sum[0] = sum[0] * tanh(log(exp(sum[0]) + afp(1.f))); + sum[1] = sum[1] * tanh(log(exp(sum[1]) + afp(1.f))); + } + +#if NCNN_image_shader + image1d_st8(top_blob, gx, sum); +#else + buffer_st8(top_blob_data, gx, sum); +#endif +} diff --git a/source/device/vulkan/shaders/innerproduct_pack4.comp b/source/device/vulkan/shaders/innerproduct_pack4.comp new file mode 100644 index 000000000..b8d4d7554 --- /dev/null +++ b/source/device/vulkan/shaders/innerproduct_pack4.comp @@ -0,0 +1,171 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
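innerproduct_pack4.comp below keeps both input and output in groups of four channels, so each input element carries a 4x4 weight block. The shader builds that block as an afpmat4 from four consecutive vec4 loads and uses GLSL's row-vector product, where sum += v * k expands to sum[j] += dot(v, k[j]) with k[j] being the j-th loaded column. A hedged CPU equivalent of the fp16-packed buffer path (names are illustrative, not from the patch):

#include <array>
#include <vector>

using vec4 = std::array<float, 4>;

static float dot4(const vec4& a, const vec4& b)
{
    return a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3];
}

// CPU sketch of innerproduct_pack4: weights[(gx * w + i) * 4 + j] is the j-th
// column of the 4x4 block for input element i; each column feeds one output lane.
vec4 innerproduct_pack4(int gx, int w,
                        const std::vector<vec4>& input,
                        const std::vector<vec4>& weights,
                        const vec4& bias)
{
    vec4 sum = bias;
    for (int i = 0; i < w; i++)
        for (int j = 0; j < 4; j++)
            sum[j] += dot4(input[i], weights[(gx * w + i) * 4 + j]); // sum += v * k
    return sum;
}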
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int bias_term = 0; +layout (constant_id = 1) const int activation_type = 0; +layout (constant_id = 2) const float activation_param_0 = 0; +layout (constant_id = 3) const float activation_param_1 = 0; + +#define shape_constant_id_offset 4 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob; +layout (binding = 2) uniform unfp sampler2D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic) +// GL_EXT_shader_16bit_storage does not define f16mat4 type :( +layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; +#else +layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; }; +#endif +layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= 1 || gz >= 1) + return; + + afpvec4 sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld4(bias_blob, gx); +#else + sum = buffer_ld4(bias_data, gx); +#endif + } + else + { + sum = afpvec4(0.f); + } + +#if NCNN_image_shader + int wx = 0; + + for (int i = 0; i < psc(w); i++) + { + afpvec4 v = image1d_ld4(bottom_blob, i); + afpmat4 k = afpmat4( + image2d_ld4(weight_blob, ivec2(wx + 0, gx)), + image2d_ld4(weight_blob, ivec2(wx + 1, gx)), + image2d_ld4(weight_blob, ivec2(wx + 2, gx)), + image2d_ld4(weight_blob, ivec2(wx + 3, gx)) + ); + + sum += v * k; + + wx += 4; + } +#else + int w_offset = gx * psc(w); + + for (int i = 0; i < psc(w); i++) + { + afpvec4 v = buffer_ld4(bottom_blob_data, i); + +#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic) + // GL_EXT_shader_16bit_storage does not define f16mat4 type :( + afpmat4 k = afpmat4( + buffer_ld4(weight_data, (w_offset + i) * 4 + 0), + buffer_ld4(weight_data, (w_offset + i) * 4 + 1), + buffer_ld4(weight_data, (w_offset + i) * 4 + 2), + buffer_ld4(weight_data, (w_offset + 
i) * 4 + 3) + ); +#else + afpmat4 k = afpmat4(weight_data[w_offset + i]); +#endif + + sum += v * k; + } +#endif + + if (activation_type == 1) + { + sum = max(sum, afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum = mix(sum, sum * afp(slope), lessThan(sum, afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum = clamp(sum, const_min, const_max); + } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } + if (activation_type == 5) + { + sum = sum * tanh(log(exp(sum) + afp(1.f))); + } + +#if NCNN_image_shader + image1d_st4(top_blob, gx, sum); +#else + buffer_st4(top_blob_data, gx, sum); +#endif +} diff --git a/source/device/vulkan/shaders/innerproduct_pack4to1.comp b/source/device/vulkan/shaders/innerproduct_pack4to1.comp new file mode 100644 index 000000000..9faf8100f --- /dev/null +++ b/source/device/vulkan/shaders/innerproduct_pack4to1.comp @@ -0,0 +1,148 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int bias_term = 0; +layout (constant_id = 1) const int activation_type = 0; +layout (constant_id = 2) const float activation_param_0 = 0; +layout (constant_id = 3) const float activation_param_1 = 0; + +#define shape_constant_id_offset 4 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob; +layout (binding = 2) uniform unfp sampler2D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +layout (binding = 2) readonly buffer 
weight_blob { sfpvec4 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= 1 || gz >= 1) + return; + + afp sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld1(bias_blob, gx); +#else + sum = buffer_ld1(bias_data, gx); +#endif + } + else + { + sum = afp(0.f); + } + +#if NCNN_image_shader + for (int i = 0; i < psc(w); i++) + { + afpvec4 v = image1d_ld4(bottom_blob, i); + + afpvec4 k = image2d_ld4(weight_blob, ivec2(i, gx)); + + sum += dot(v, k); + } +#else + int w_offset = gx * psc(w); + + for (int i = 0; i < psc(w); i++) + { + afpvec4 v = buffer_ld4(bottom_blob_data, i); + + afpvec4 k = buffer_ld4(weight_data, w_offset + i); + + sum += dot(v, k); + } +#endif + + if (activation_type == 1) + { + sum = max(sum, afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum = sum < afp(0.f) ? sum * slope : sum; + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum = clamp(sum, const_min, const_max); + } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } + if (activation_type == 5) + { + sum = sum * tanh(log(exp(sum) + afp(1.f))); + } + +#if NCNN_image_shader + image1d_st1(top_blob, gx, sum); +#else + buffer_st1(top_blob_data, gx, sum); +#endif +} diff --git a/source/device/vulkan/shaders/innerproduct_pack4to8.comp b/source/device/vulkan/shaders/innerproduct_pack4to8.comp new file mode 100644 index 000000000..a8ee4a309 --- /dev/null +++ b/source/device/vulkan/shaders/innerproduct_pack4to8.comp @@ -0,0 +1,188 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
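innerproduct_pack4to8.comp, whose body follows, widens the output pack: for every pack4 input element it loads eight consecutive vec4 weight rows at (w_offset + i) * 8 + 0..7, and each dot product fills one of the eight output lanes. The addressing rule is shared by all the variants in this patch; a small editor's summary of it (not code from the patch):

// Flat index of the j-th packed weight row for input element i of output group gx.
// rows_per_input is 1 for pack1to4, pack1to8, pack4to1 and pack8to1 (j is always 0),
// 4 for pack4 (fp16-packed path) and pack8to4, and 8 for pack4to8 and pack8.
int weight_index(int gx, int w, int i, int rows_per_input, int j)
{
    const int w_offset = gx * w;                 // as computed in every shader
    return (w_offset + i) * rows_per_input + j;
}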
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int bias_term = 0; +layout (constant_id = 1) const int activation_type = 0; +layout (constant_id = 2) const float activation_param_0 = 0; +layout (constant_id = 3) const float activation_param_1 = 0; + +#define shape_constant_id_offset 4 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob; +layout (binding = 2) uniform unfp sampler2D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= 1 || gz >= 1) + return; + + afpvec8 sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld8(bias_blob, gx); +#else + sum = buffer_ld8(bias_data, gx); +#endif + } + else + { + sum = afpvec8(afpvec4(0.f), afpvec4(0.f)); + } + +#if NCNN_image_shader + int wx = 0; + + for (int i = 0; i < psc(w); i++) + { + afpvec4 v = image1d_ld4(bottom_blob, i); + + afpvec4 k0 = image2d_ld4(weight_blob, ivec2(wx + 0, gx)); + afpvec4 k1 = image2d_ld4(weight_blob, ivec2(wx + 1, gx)); + afpvec4 k2 = image2d_ld4(weight_blob, ivec2(wx + 2, gx)); + afpvec4 k3 = image2d_ld4(weight_blob, ivec2(wx + 3, gx)); + afpvec4 k4 = image2d_ld4(weight_blob, ivec2(wx + 4, gx)); + afpvec4 k5 = image2d_ld4(weight_blob, ivec2(wx + 5, gx)); + afpvec4 k6 = image2d_ld4(weight_blob, ivec2(wx + 6, gx)); + afpvec4 k7 = image2d_ld4(weight_blob, ivec2(wx + 7, gx)); + + // sum += v * k; + sum[0].r += dot(v, k0); + sum[0].g += dot(v, k1); + sum[0].b += dot(v, k2); + sum[0].a += dot(v, k3); + sum[1].r += dot(v, k4); + sum[1].g += dot(v, k5); + sum[1].b += dot(v, k6); + sum[1].a += dot(v, k7); + + wx += 8; + } +#else + int w_offset = gx * psc(w); + + for (int i = 0; i < psc(w); i++) + { + afpvec4 v = buffer_ld4(bottom_blob_data, i); + + afpvec4 k0 = 
buffer_ld4(weight_data, (w_offset + i) * 8 + 0); + afpvec4 k1 = buffer_ld4(weight_data, (w_offset + i) * 8 + 1); + afpvec4 k2 = buffer_ld4(weight_data, (w_offset + i) * 8 + 2); + afpvec4 k3 = buffer_ld4(weight_data, (w_offset + i) * 8 + 3); + afpvec4 k4 = buffer_ld4(weight_data, (w_offset + i) * 8 + 4); + afpvec4 k5 = buffer_ld4(weight_data, (w_offset + i) * 8 + 5); + afpvec4 k6 = buffer_ld4(weight_data, (w_offset + i) * 8 + 6); + afpvec4 k7 = buffer_ld4(weight_data, (w_offset + i) * 8 + 7); + + // sum += v * k; + sum[0].r += dot(v, k0); + sum[0].g += dot(v, k1); + sum[0].b += dot(v, k2); + sum[0].a += dot(v, k3); + sum[1].r += dot(v, k4); + sum[1].g += dot(v, k5); + sum[1].b += dot(v, k6); + sum[1].a += dot(v, k7); + } +#endif + + if (activation_type == 1) + { + sum[0] = max(sum[0], afp(0.f)); + sum[1] = max(sum[1], afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum[0] = mix(sum[0], sum[0] * afp(slope), lessThan(sum[0], afpvec4(0.f))); + sum[1] = mix(sum[1], sum[1] * afp(slope), lessThan(sum[1], afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum[0] = clamp(sum[0], const_min, const_max); + sum[1] = clamp(sum[1], const_min, const_max); + } + if (activation_type == 4) + { + sum[0] = afp(1.f) / (afp(1.f) + exp(-sum[0])); + sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1])); + } + if (activation_type == 5) + { + sum[0] = sum[0] * tanh(log(exp(sum[0]) + afp(1.f))); + sum[1] = sum[1] * tanh(log(exp(sum[1]) + afp(1.f))); + } + +#if NCNN_image_shader + image1d_st8(top_blob, gx, sum); +#else + buffer_st8(top_blob_data, gx, sum); +#endif +} diff --git a/source/device/vulkan/shaders/innerproduct_pack8.comp b/source/device/vulkan/shaders/innerproduct_pack8.comp new file mode 100644 index 000000000..50f7f4139 --- /dev/null +++ b/source/device/vulkan/shaders/innerproduct_pack8.comp @@ -0,0 +1,188 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
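innerproduct_pack8.comp below works on eight-channel packs. With fp16 storage a pack8 value is the two-half struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }, so an 8-wide dot product is written as dot(v[0], k[0]) + dot(v[1], k[1]). A CPU sketch of one output group under that layout (names are the editor's, not the patch's):

#include <array>
#include <vector>

using vec4 = std::array<float, 4>;
using vec8 = std::array<vec4, 2>; // two float4 halves, mirroring sfpvec8

static float dot4(const vec4& a, const vec4& b)
{
    return a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3];
}

static float dot8(const vec8& v, const vec8& k)
{
    return dot4(v[0], k[0]) + dot4(v[1], k[1]); // dot(v[0], k[0]) + dot(v[1], k[1])
}

// CPU sketch of innerproduct_pack8: eight pack8 weight rows per input element,
// one dot product per output lane of the group.
vec8 innerproduct_pack8(int gx, int w,
                        const std::vector<vec8>& input,
                        const std::vector<vec8>& weights,
                        const vec8& bias)
{
    vec8 sum = bias;
    for (int i = 0; i < w; i++)
        for (int j = 0; j < 8; j++)
            sum[j / 4][j % 4] += dot8(input[i], weights[(gx * w + i) * 8 + j]);
    return sum;
}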
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int bias_term = 0; +layout (constant_id = 1) const int activation_type = 0; +layout (constant_id = 2) const float activation_param_0 = 0; +layout (constant_id = 3) const float activation_param_1 = 0; + +#define shape_constant_id_offset 4 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob; +layout (binding = 2) uniform unfp sampler2D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= 1 || gz >= 1) + return; + + afpvec8 sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld8(bias_blob, gx); +#else + sum = buffer_ld8(bias_data, gx); +#endif + } + else + { + sum = afpvec8(afpvec4(0.f), afpvec4(0.f)); + } + +#if NCNN_image_shader + int wx = 0; + + for (int i = 0; i < psc(w); i++) + { + afpvec8 v = image1d_ld8(bottom_blob, i); + + afpvec8 k0 = image2d_ld8(weight_blob, ivec2(wx + 0, gx)); + afpvec8 k1 = image2d_ld8(weight_blob, ivec2(wx + 1, gx)); + afpvec8 k2 = image2d_ld8(weight_blob, ivec2(wx + 2, gx)); + afpvec8 k3 = image2d_ld8(weight_blob, ivec2(wx + 3, gx)); + afpvec8 k4 = image2d_ld8(weight_blob, ivec2(wx + 4, gx)); + afpvec8 k5 = image2d_ld8(weight_blob, ivec2(wx + 5, gx)); + afpvec8 k6 = image2d_ld8(weight_blob, ivec2(wx + 6, gx)); + afpvec8 k7 = image2d_ld8(weight_blob, ivec2(wx + 7, gx)); + + // sum += v * k + sum[0].r += dot(v[0], k0[0]) + dot(v[1], k0[1]); + sum[0].g += dot(v[0], k1[0]) + dot(v[1], k1[1]); + sum[0].b += dot(v[0], k2[0]) + dot(v[1], k2[1]); + sum[0].a += dot(v[0], k3[0]) + dot(v[1], k3[1]); + sum[1].r += dot(v[0], k4[0]) + dot(v[1], k4[1]); + sum[1].g += dot(v[0], k5[0]) + dot(v[1], k5[1]); + sum[1].b += dot(v[0], k6[0]) + dot(v[1], k6[1]); + sum[1].a += 
dot(v[0], k7[0]) + dot(v[1], k7[1]); + + wx += 8; + } +#else + int w_offset = gx * psc(w); + + for (int i = 0; i < psc(w); i++) + { + afpvec8 v = buffer_ld8(bottom_blob_data, i); + + afpvec8 k0 = buffer_ld8(weight_data, (w_offset + i) * 8 + 0); + afpvec8 k1 = buffer_ld8(weight_data, (w_offset + i) * 8 + 1); + afpvec8 k2 = buffer_ld8(weight_data, (w_offset + i) * 8 + 2); + afpvec8 k3 = buffer_ld8(weight_data, (w_offset + i) * 8 + 3); + afpvec8 k4 = buffer_ld8(weight_data, (w_offset + i) * 8 + 4); + afpvec8 k5 = buffer_ld8(weight_data, (w_offset + i) * 8 + 5); + afpvec8 k6 = buffer_ld8(weight_data, (w_offset + i) * 8 + 6); + afpvec8 k7 = buffer_ld8(weight_data, (w_offset + i) * 8 + 7); + + // sum += v * k + sum[0].r += dot(v[0], k0[0]) + dot(v[1], k0[1]); + sum[0].g += dot(v[0], k1[0]) + dot(v[1], k1[1]); + sum[0].b += dot(v[0], k2[0]) + dot(v[1], k2[1]); + sum[0].a += dot(v[0], k3[0]) + dot(v[1], k3[1]); + sum[1].r += dot(v[0], k4[0]) + dot(v[1], k4[1]); + sum[1].g += dot(v[0], k5[0]) + dot(v[1], k5[1]); + sum[1].b += dot(v[0], k6[0]) + dot(v[1], k6[1]); + sum[1].a += dot(v[0], k7[0]) + dot(v[1], k7[1]); + } +#endif + + if (activation_type == 1) + { + sum[0] = max(sum[0], afp(0.f)); + sum[1] = max(sum[1], afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum[0] = mix(sum[0], sum[0] * afp(slope), lessThan(sum[0], afpvec4(0.f))); + sum[1] = mix(sum[1], sum[1] * afp(slope), lessThan(sum[1], afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum[0] = clamp(sum[0], const_min, const_max); + sum[1] = clamp(sum[1], const_min, const_max); + } + if (activation_type == 4) + { + sum[0] = afp(1.f) / (afp(1.f) + exp(-sum[0])); + sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1])); + } + if (activation_type == 5) + { + sum[0] = sum[0] * tanh(log(exp(sum[0]) + afp(1.f))); + sum[1] = sum[1] * tanh(log(exp(sum[1]) + afp(1.f))); + } + +#if NCNN_image_shader + image1d_st8(top_blob, gx, sum); +#else + buffer_st8(top_blob_data, gx, sum); +#endif +} diff --git a/source/device/vulkan/shaders/innerproduct_pack8to1.comp b/source/device/vulkan/shaders/innerproduct_pack8to1.comp new file mode 100644 index 000000000..6fa3b1adc --- /dev/null +++ b/source/device/vulkan/shaders/innerproduct_pack8to1.comp @@ -0,0 +1,151 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
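Every inner-product kernel in this patch, including innerproduct_pack8to1.comp below, finishes with the same activation_type ladder: 1 is ReLU, 2 is leaky ReLU with slope activation_param_0, 3 clamps to [activation_param_0, activation_param_1], 4 is the logistic sigmoid, and 5, written as x * tanh(log(exp(x) + 1)), corresponds to Mish. A scalar CPU sketch of that ladder (an illustrative helper, not part of the patch):

#include <algorithm>
#include <cmath>

// CPU sketch of the activation switch applied to each accumulator lane.
float apply_activation(float x, int activation_type, float param0, float param1)
{
    if (activation_type == 1) x = std::max(x, 0.f);                           // ReLU
    if (activation_type == 2) x = x < 0.f ? x * param0 : x;                   // leaky ReLU
    if (activation_type == 3) x = std::min(std::max(x, param0), param1);      // clip
    if (activation_type == 4) x = 1.f / (1.f + std::exp(-x));                 // sigmoid
    if (activation_type == 5) x = x * std::tanh(std::log(std::exp(x) + 1.f)); // Mish
    return x;
}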
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int bias_term = 0; +layout (constant_id = 1) const int activation_type = 0; +layout (constant_id = 2) const float activation_param_0 = 0; +layout (constant_id = 3) const float activation_param_1 = 0; + +#define shape_constant_id_offset 4 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob; +layout (binding = 2) uniform unfp sampler2D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= 1 || gz >= 1) + return; + + afp sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld1(bias_blob, gx); +#else + sum = buffer_ld1(bias_data, gx); +#endif + } + else + { + sum = afp(0.f); + } + +#if NCNN_image_shader + for (int i = 0; i < psc(w); i++) + { + afpvec8 v = image1d_ld8(bottom_blob, i); + + afpvec8 k = image2d_ld8(weight_blob, ivec2(i, gx)); + + // sum += dot(v, k); + sum += dot(v[0], k[0]) + dot(v[1], k[1]); + } +#else + int w_offset = gx * psc(w); + + for (int i = 0; i < psc(w); i++) + { + afpvec8 v = buffer_ld8(bottom_blob_data, i); + + afpvec8 k = buffer_ld8(weight_data, w_offset + i); + + // sum += dot(v, k); + sum += dot(v[0], k[0]) + dot(v[1], k[1]); + } +#endif + + if (activation_type == 1) + { + sum = max(sum, afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum = sum < afp(0.f) ? 
sum * slope : sum; + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum = clamp(sum, const_min, const_max); + } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } + if (activation_type == 5) + { + sum = sum * tanh(log(exp(sum) + afp(1.f))); + } + +#if NCNN_image_shader + image1d_st1(top_blob, gx, sum); +#else + buffer_st1(top_blob_data, gx, sum); +#endif +} diff --git a/source/device/vulkan/shaders/innerproduct_pack8to4.comp b/source/device/vulkan/shaders/innerproduct_pack8to4.comp new file mode 100644 index 000000000..0fb99082b --- /dev/null +++ b/source/device/vulkan/shaders/innerproduct_pack8to4.comp @@ -0,0 +1,167 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int bias_term = 0; +layout (constant_id = 1) const int activation_type = 0; +layout (constant_id = 2) const float activation_param_0 = 0; +layout (constant_id = 3) const float activation_param_1 = 0; + +#define shape_constant_id_offset 4 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob; +layout (binding = 2) uniform unfp sampler2D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; 
+ int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= 1 || gz >= 1) + return; + + afpvec4 sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld4(bias_blob, gx); +#else + sum = buffer_ld4(bias_data, gx); +#endif + } + else + { + sum = afpvec4(0.f); + } + +#if NCNN_image_shader + int wx = 0; + + for (int i = 0; i < psc(w); i++) + { + afpvec8 v = image1d_ld8(bottom_blob, i); + + afpvec8 k0 = image2d_ld8(weight_blob, ivec2(wx + 0, gx)); + afpvec8 k1 = image2d_ld8(weight_blob, ivec2(wx + 1, gx)); + afpvec8 k2 = image2d_ld8(weight_blob, ivec2(wx + 2, gx)); + afpvec8 k3 = image2d_ld8(weight_blob, ivec2(wx + 3, gx)); + + // sum += v * k + sum.r += dot(v[0], k0[0]) + dot(v[1], k0[1]); + sum.g += dot(v[0], k1[0]) + dot(v[1], k1[1]); + sum.b += dot(v[0], k2[0]) + dot(v[1], k2[1]); + sum.a += dot(v[0], k3[0]) + dot(v[1], k3[1]); + + wx += 4; + } +#else + int w_offset = gx * psc(w); + + for (int i = 0; i < psc(w); i++) + { + afpvec8 v = buffer_ld8(bottom_blob_data, i); + + afpvec8 k0 = buffer_ld8(weight_data, (w_offset + i) * 4 + 0); + afpvec8 k1 = buffer_ld8(weight_data, (w_offset + i) * 4 + 1); + afpvec8 k2 = buffer_ld8(weight_data, (w_offset + i) * 4 + 2); + afpvec8 k3 = buffer_ld8(weight_data, (w_offset + i) * 4 + 3); + + // sum += v * k + sum.r += dot(v[0], k0[0]) + dot(v[1], k0[1]); + sum.g += dot(v[0], k1[0]) + dot(v[1], k1[1]); + sum.b += dot(v[0], k2[0]) + dot(v[1], k2[1]); + sum.a += dot(v[0], k3[0]) + dot(v[1], k3[1]); + } +#endif + + if (activation_type == 1) + { + sum = max(sum, afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum = mix(sum, sum * afp(slope), lessThan(sum, afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum = clamp(sum, const_min, const_max); + } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } + if (activation_type == 5) + { + sum = sum * tanh(log(exp(sum) + afp(1.f))); + } + +#if NCNN_image_shader + image1d_st4(top_blob, gx, sum); +#else + buffer_st4(top_blob_data, gx, sum); +#endif +} diff --git a/source/device/vulkan/shaders/interp.comp b/source/device/vulkan/shaders/interp.comp new file mode 100644 index 000000000..f0f24fa33 --- /dev/null +++ b/source/device/vulkan/shaders/interp.comp @@ -0,0 +1,149 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
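interp.comp, whose body follows, handles resize_type 1 (nearest) and 2 (bilinear). Nearest truncates gx * scale_x and clamps to the last source column; bilinear uses the half-pixel mapping (gx + 0.5) * scale_x - 0.5, clamps the top-left sample to [0, w - 2] and snaps the fractional weight to 0 or 1 at the borders. A CPU sketch of that coordinate math (helper names are the editor's, not the patch's):

#include <algorithm>
#include <cmath>

// Nearest-neighbour source column for output column gx (scale = src_w / out_w).
int nearest_sx(int gx, float scale, int src_w)
{
    return std::min((int)std::floor(gx * scale), src_w - 1);
}

// Bilinear source column and fractional weight for output column gx;
// the same math is applied to rows with scale_y and h.
void bilinear_sx(int gx, float scale, int src_w, int& sx, float& fx)
{
    fx = (gx + 0.5f) * scale - 0.5f;
    sx = (int)std::floor(fx);
    fx -= sx;
    if (sx < 0)         { sx = 0;         fx = 0.f; } // underflow: stick to the left border
    if (sx > src_w - 2) { sx = src_w - 2; fx = 1.f; } // overflow: stick to the right border
}

// The sampled value is then lerp(lerp(a0, a1, fx), lerp(b0, b1, fx), fy),
// with a0/a1 on row sy and b0/b1 on row sy + 1, matching the shader's two-step mix.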
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int resize_type = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; + + float scale_x; + float scale_y; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + if (resize_type == 1) // nearest + { + afpvec2 gxy = afpvec2(gx, gy); + ivec2 sxy_max = ivec2(psc(w) - 1, psc(h) - 1); + ivec2 sxy = min(ivec2(floor(gxy * afpvec2(p.scale_x, p.scale_y))), sxy_max); + + int sx = sxy.r; + int sy = sxy.g; + +#if NCNN_image_shader + image3d_cp1(top_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(sx, sy, gz)); +#else + int v_offset = gz * psc(cstep) + sy * psc(w) + sx; + + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_cp1(top_blob_data, gi, bottom_blob_data, v_offset); +#endif + } + if (resize_type == 2) // bilinear + { + afpvec2 gxy = afpvec2(gx, gy); + afpvec2 fxy = (gxy + afp(0.5f)) * afpvec2(p.scale_x, p.scale_y) - afp(0.5f); + + ivec2 sxy = ivec2(floor(fxy)); + + fxy -= afpvec2(sxy); + + ivec2 sxy_max = ivec2(psc(w) - 2, psc(h) - 2); + + bvec2 underflow = lessThan(sxy, ivec2(0)); + bvec2 overflow = greaterThan(sxy, sxy_max); + + sxy = clamp(sxy, ivec2(0), sxy_max); + + fxy = mix(fxy, afpvec2(0.f), underflow); + fxy = mix(fxy, afpvec2(1.f), overflow); + + int sx = sxy.r; + int sy = sxy.g; + +#if NCNN_image_shader + afp a0 = image3d_ld1(bottom_blob, ivec3(sx, sy, gz)); + afp a1 = image3d_ld1(bottom_blob, ivec3(sx + 1, sy, gz)); + afp b0 = image3d_ld1(bottom_blob, ivec3(sx, sy + 1, gz)); + afp b1 = image3d_ld1(bottom_blob, ivec3(sx + 1, sy + 1, gz)); +#else + int v_offset_0 = gz * psc(cstep) + sy * psc(w) + sx; + int v_offset_1 = gz * psc(cstep) + (sy + 1) * psc(w) + sx; + + afp a0 = buffer_ld1(bottom_blob_data, v_offset_0); + afp a1 = buffer_ld1(bottom_blob_data, v_offset_0 + 1); + afp b0 = buffer_ld1(bottom_blob_data, v_offset_1); + afp b1 = buffer_ld1(bottom_blob_data, v_offset_1 + 1); 
+#endif + + afp fx = fxy.r; + afp fy = fxy.g; + + afpvec2 ab = afpvec2(a0, b0) * (afp(1.f) - fx) + afpvec2(a1, b1) * fx; + + afp res = ab.r * (afp(1.f) - fy) + ab.g * fy; + +#if NCNN_image_shader + image3d_st1(top_blob, ivec3(gx, gy, gz), res); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st1(top_blob_data, gi, res); +#endif + } +} diff --git a/source/device/vulkan/shaders/interp_bicubic.comp b/source/device/vulkan/shaders/interp_bicubic.comp new file mode 100644 index 000000000..2f4e26886 --- /dev/null +++ b/source/device/vulkan/shaders/interp_bicubic.comp @@ -0,0 +1,149 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#define shape_constant_id_offset 0 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#endif +layout (binding = 2) readonly buffer alpha_blob { sfpvec4 alpha_blob_data[]; }; +layout (binding = 3) readonly buffer xofs_blob { int xofs_blob_data[]; }; +layout (binding = 4) readonly buffer beta_blob { sfpvec4 beta_blob_data[]; }; +layout (binding = 5) readonly buffer yofs_blob { int yofs_blob_data[]; }; + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + int sx = xofs_blob_data[gx]; + int sy = yofs_blob_data[gy]; + +#if NCNN_image_shader + afp a0 = 
image3d_ld1(bottom_blob, ivec3(sx - 1, sy - 1, gz)); + afp a1 = image3d_ld1(bottom_blob, ivec3(sx + 0, sy - 1, gz)); + afp a2 = image3d_ld1(bottom_blob, ivec3(sx + 1, sy - 1, gz)); + afp a3 = image3d_ld1(bottom_blob, ivec3(sx + 2, sy - 1, gz)); + + afp b0 = image3d_ld1(bottom_blob, ivec3(sx - 1, sy + 0, gz)); + afp b1 = image3d_ld1(bottom_blob, ivec3(sx + 0, sy + 0, gz)); + afp b2 = image3d_ld1(bottom_blob, ivec3(sx + 1, sy + 0, gz)); + afp b3 = image3d_ld1(bottom_blob, ivec3(sx + 2, sy + 0, gz)); + + afp c0 = image3d_ld1(bottom_blob, ivec3(sx - 1, sy + 1, gz)); + afp c1 = image3d_ld1(bottom_blob, ivec3(sx + 0, sy + 1, gz)); + afp c2 = image3d_ld1(bottom_blob, ivec3(sx + 1, sy + 1, gz)); + afp c3 = image3d_ld1(bottom_blob, ivec3(sx + 2, sy + 1, gz)); + + afp d0 = image3d_ld1(bottom_blob, ivec3(sx - 1, sy + 2, gz)); + afp d1 = image3d_ld1(bottom_blob, ivec3(sx + 0, sy + 2, gz)); + afp d2 = image3d_ld1(bottom_blob, ivec3(sx + 1, sy + 2, gz)); + afp d3 = image3d_ld1(bottom_blob, ivec3(sx + 2, sy + 2, gz)); +#else + int v_offset_0 = gz * psc(cstep) + (sy - 1) * psc(w) + sx; + int v_offset_1 = gz * psc(cstep) + (sy + 0) * psc(w) + sx; + int v_offset_2 = gz * psc(cstep) + (sy + 1) * psc(w) + sx; + int v_offset_3 = gz * psc(cstep) + (sy + 2) * psc(w) + sx; + + afp a0 = buffer_ld1(bottom_blob_data, v_offset_0 - 1); + afp a1 = buffer_ld1(bottom_blob_data, v_offset_0 + 0); + afp a2 = buffer_ld1(bottom_blob_data, v_offset_0 + 1); + afp a3 = buffer_ld1(bottom_blob_data, v_offset_0 + 2); + + afp b0 = buffer_ld1(bottom_blob_data, v_offset_1 - 1); + afp b1 = buffer_ld1(bottom_blob_data, v_offset_1 + 0); + afp b2 = buffer_ld1(bottom_blob_data, v_offset_1 + 1); + afp b3 = buffer_ld1(bottom_blob_data, v_offset_1 + 2); + + afp c0 = buffer_ld1(bottom_blob_data, v_offset_2 - 1); + afp c1 = buffer_ld1(bottom_blob_data, v_offset_2 + 0); + afp c2 = buffer_ld1(bottom_blob_data, v_offset_2 + 1); + afp c3 = buffer_ld1(bottom_blob_data, v_offset_2 + 2); + + afp d0 = buffer_ld1(bottom_blob_data, v_offset_3 - 1); + afp d1 = buffer_ld1(bottom_blob_data, v_offset_3 + 0); + afp d2 = buffer_ld1(bottom_blob_data, v_offset_3 + 1); + afp d3 = buffer_ld1(bottom_blob_data, v_offset_3 + 2); +#endif + + afpmat4 abcd0123 = afpmat4( + a0, a1, a2, a3, + b0, b1, b2, b3, + c0, c1, c2, c3, + d0, d1, d2, d3 + ); + + afpvec4 alpha = buffer_ld4(alpha_blob_data, gx); + + afpvec4 abcd = alpha * abcd0123; + + afpvec4 beta = buffer_ld4(beta_blob_data, gy); + + afp v = dot(abcd, beta); + +#if NCNN_image_shader + image3d_st1(top_blob, ivec3(gx, gy, gz), v); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st1(top_blob_data, gi, v); +#endif +} diff --git a/source/device/vulkan/shaders/interp_bicubic_coeffs.comp b/source/device/vulkan/shaders/interp_bicubic_coeffs.comp new file mode 100644 index 000000000..1de3ce938 --- /dev/null +++ b/source/device/vulkan/shaders/interp_bicubic_coeffs.comp @@ -0,0 +1,107 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#define shape_constant_id_offset 0 +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int outw = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) writeonly buffer alpha_blob { sfpvec4 alpha_blob_data[]; }; +layout (binding = 1) writeonly buffer xofs_blob { int xofs_blob_data[]; }; + +layout (push_constant) uniform parameter +{ + int w; + int outw; + float scale; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= 1 || gz >= 1) + return; + + afp fx = (afp(gx) + afp(0.5f)) * afp(p.scale) - afp(0.5f); + int sx = int(floor(fx)); + fx -= afp(sx); + + // interpolate_cubic(fx, coeffs); + afpvec4 coeffs; + { + const afp A = afp(-0.75f); + + afp fx0 = fx + afp(1.f); + afp fx1 = fx; + afp fx2 = afp(1.f) - fx; + // afp fx3 = afp(2.f) - fx; + + coeffs.r = A * fx0*fx0*fx0 - afp(5.f)*A * fx0*fx0 + afp(8.f)*A * fx0 - afp(4.f)*A; + coeffs.g = (A+afp(2.f)) * fx1*fx1*fx1 - (A+afp(3.f)) * fx1*fx1 + afp(1.f); + coeffs.b = (A+afp(2.f)) * fx2*fx2*fx2 - (A+afp(3.f)) * fx2*fx2 + afp(1.f); + coeffs.a = afp(1.f) - coeffs.r - coeffs.g - coeffs.b; + } + + if (sx <= -1) + { + sx = 1; + coeffs.r = afp(1.f) - coeffs.a; + coeffs.g = coeffs.a; + coeffs.b = afp(0.f); + coeffs.a = afp(0.f); + } + if (sx == 0) + { + sx = 1; + coeffs.r = coeffs.r + coeffs.g; + coeffs.g = coeffs.b; + coeffs.b = coeffs.a; + coeffs.a = afp(0.f); + } + if (sx == psc(w) - 2) + { + sx = psc(w) - 3; + coeffs.a = coeffs.b + coeffs.a; + coeffs.b = coeffs.g; + coeffs.g = coeffs.r; + coeffs.r = afp(0.f); + } + if (sx >= psc(w) - 1) + { + sx = psc(w) - 3; + coeffs.a = afp(1.f) - coeffs.r; + coeffs.b = coeffs.r; + coeffs.g = afp(0.f); + coeffs.r = afp(0.f); + } + + buffer_st4(alpha_blob_data, gx, coeffs); + + xofs_blob_data[gx] = sx; +} diff --git a/source/device/vulkan/shaders/interp_bicubic_pack4.comp b/source/device/vulkan/shaders/interp_bicubic_pack4.comp new file mode 100644 index 000000000..e89d6b141 --- /dev/null +++ b/source/device/vulkan/shaders/interp_bicubic_pack4.comp @@ -0,0 +1,163 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
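+
+// Bicubic resize for pack4 data. For each output element the 4x4 input
+// neighborhood around (xofs[gx], yofs[gy]) is loaded, each row of four
+// pack4 values is blended with the horizontal weights in alpha_blob, and
+// the four row results are blended with the vertical weights in beta_blob.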
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#define shape_constant_id_offset 0 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif +layout (binding = 2) readonly buffer alpha_blob { sfpvec4 alpha_blob_data[]; }; +layout (binding = 3) readonly buffer xofs_blob { int xofs_blob_data[]; }; +layout (binding = 4) readonly buffer beta_blob { sfpvec4 beta_blob_data[]; }; +layout (binding = 5) readonly buffer yofs_blob { int yofs_blob_data[]; }; + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + int sx = xofs_blob_data[gx]; + int sy = yofs_blob_data[gy]; + +#if NCNN_image_shader + afpvec4 a0 = image3d_ld4(bottom_blob, ivec3(sx - 1, sy - 1, gz)); + afpvec4 a1 = image3d_ld4(bottom_blob, ivec3(sx + 0, sy - 1, gz)); + afpvec4 a2 = image3d_ld4(bottom_blob, ivec3(sx + 1, sy - 1, gz)); + afpvec4 a3 = image3d_ld4(bottom_blob, ivec3(sx + 2, sy - 1, gz)); + + afpmat4 a0123 = afpmat4(a0, a1, a2, a3); + + afpvec4 b0 = image3d_ld4(bottom_blob, ivec3(sx - 1, sy + 0, gz)); + afpvec4 b1 = image3d_ld4(bottom_blob, ivec3(sx + 0, sy + 0, gz)); + afpvec4 b2 = image3d_ld4(bottom_blob, ivec3(sx + 1, sy + 0, gz)); + afpvec4 b3 = image3d_ld4(bottom_blob, ivec3(sx + 2, sy + 0, gz)); + + afpmat4 b0123 = afpmat4(b0, b1, b2, b3); + + afpvec4 c0 = image3d_ld4(bottom_blob, ivec3(sx - 1, sy + 1, gz)); + afpvec4 c1 = image3d_ld4(bottom_blob, ivec3(sx + 0, sy + 1, gz)); + afpvec4 c2 = image3d_ld4(bottom_blob, ivec3(sx + 1, sy + 1, gz)); + afpvec4 c3 = image3d_ld4(bottom_blob, ivec3(sx + 2, sy + 1, gz)); + + afpmat4 c0123 = afpmat4(c0, c1, c2, c3); + + afpvec4 d0 = image3d_ld4(bottom_blob, ivec3(sx - 1, sy + 2, gz)); + afpvec4 d1 = image3d_ld4(bottom_blob, ivec3(sx + 0, sy + 2, gz)); + afpvec4 d2 = image3d_ld4(bottom_blob, ivec3(sx + 1, sy + 2, gz)); + afpvec4 d3 = image3d_ld4(bottom_blob, ivec3(sx + 2, sy + 2, gz)); + + afpmat4 d0123 = afpmat4(d0, d1, d2, d3); +#else + int v_offset_0 = gz * psc(cstep) + (sy - 1) * psc(w) + sx; + int 
v_offset_1 = gz * psc(cstep) + (sy + 0) * psc(w) + sx; + int v_offset_2 = gz * psc(cstep) + (sy + 1) * psc(w) + sx; + int v_offset_3 = gz * psc(cstep) + (sy + 2) * psc(w) + sx; + + afpvec4 a0 = buffer_ld4(bottom_blob_data, v_offset_0 - 1); + afpvec4 a1 = buffer_ld4(bottom_blob_data, v_offset_0 + 0); + afpvec4 a2 = buffer_ld4(bottom_blob_data, v_offset_0 + 1); + afpvec4 a3 = buffer_ld4(bottom_blob_data, v_offset_0 + 2); + + afpmat4 a0123 = afpmat4(a0, a1, a2, a3); + + afpvec4 b0 = buffer_ld4(bottom_blob_data, v_offset_1 - 1); + afpvec4 b1 = buffer_ld4(bottom_blob_data, v_offset_1 + 0); + afpvec4 b2 = buffer_ld4(bottom_blob_data, v_offset_1 + 1); + afpvec4 b3 = buffer_ld4(bottom_blob_data, v_offset_1 + 2); + + afpmat4 b0123 = afpmat4(b0, b1, b2, b3); + + afpvec4 c0 = buffer_ld4(bottom_blob_data, v_offset_2 - 1); + afpvec4 c1 = buffer_ld4(bottom_blob_data, v_offset_2 + 0); + afpvec4 c2 = buffer_ld4(bottom_blob_data, v_offset_2 + 1); + afpvec4 c3 = buffer_ld4(bottom_blob_data, v_offset_2 + 2); + + afpmat4 c0123 = afpmat4(c0, c1, c2, c3); + + afpvec4 d0 = buffer_ld4(bottom_blob_data, v_offset_3 - 1); + afpvec4 d1 = buffer_ld4(bottom_blob_data, v_offset_3 + 0); + afpvec4 d2 = buffer_ld4(bottom_blob_data, v_offset_3 + 1); + afpvec4 d3 = buffer_ld4(bottom_blob_data, v_offset_3 + 2); + + afpmat4 d0123 = afpmat4(d0, d1, d2, d3); +#endif + + afpvec4 alpha = buffer_ld4(alpha_blob_data, gx); + + afpvec4 a = a0123 * alpha; + afpvec4 b = b0123 * alpha; + afpvec4 c = c0123 * alpha; + afpvec4 d = d0123 * alpha; + + afpmat4 abcd = afpmat4(a, b, c, d); + + afpvec4 beta = buffer_ld4(beta_blob_data, gy); + + afpvec4 v = abcd * beta; + +#if NCNN_image_shader + image3d_st4(top_blob, ivec3(gx, gy, gz), v); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st4(top_blob_data, gi, v); +#endif +} diff --git a/source/device/vulkan/shaders/interp_bicubic_pack8.comp b/source/device/vulkan/shaders/interp_bicubic_pack8.comp new file mode 100644 index 000000000..f51bd3bee --- /dev/null +++ b/source/device/vulkan/shaders/interp_bicubic_pack8.comp @@ -0,0 +1,175 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
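+
+// Bicubic resize for pack8 data. Same scheme as the pack4 shader, but the
+// two vec4 halves of every pack8 element are weighted separately with the
+// alpha_blob (horizontal) and beta_blob (vertical) coefficients.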
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#define shape_constant_id_offset 0 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#endif +layout (binding = 2) readonly buffer alpha_blob { sfpvec4 alpha_blob_data[]; }; +layout (binding = 3) readonly buffer xofs_blob { int xofs_blob_data[]; }; +layout (binding = 4) readonly buffer beta_blob { sfpvec4 beta_blob_data[]; }; +layout (binding = 5) readonly buffer yofs_blob { int yofs_blob_data[]; }; + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + int sx = xofs_blob_data[gx]; + int sy = yofs_blob_data[gy]; + + afpvec4 alpha = buffer_ld4(alpha_blob_data, gx); + +#if NCNN_image_shader + afpvec8 a0 = image3d_ld8(bottom_blob, ivec3(sx - 1, sy - 1, gz)); + afpvec8 a1 = image3d_ld8(bottom_blob, ivec3(sx + 0, sy - 1, gz)); + afpvec8 a2 = image3d_ld8(bottom_blob, ivec3(sx + 1, sy - 1, gz)); + afpvec8 a3 = image3d_ld8(bottom_blob, ivec3(sx + 2, sy - 1, gz)); + + afpvec8 a; + a[0] = a0[0] * alpha.r + a1[0] * alpha.g + a2[0] * alpha.b + a3[0] * alpha.a; + a[1] = a0[1] * alpha.r + a1[1] * alpha.g + a2[1] * alpha.b + a3[1] * alpha.a; + + afpvec8 b0 = image3d_ld8(bottom_blob, ivec3(sx - 1, sy + 0, gz)); + afpvec8 b1 = image3d_ld8(bottom_blob, ivec3(sx + 0, sy + 0, gz)); + afpvec8 b2 = image3d_ld8(bottom_blob, ivec3(sx + 1, sy + 0, gz)); + afpvec8 b3 = image3d_ld8(bottom_blob, ivec3(sx + 2, sy + 0, gz)); + + afpvec8 b; + b[0] = b0[0] * alpha.r + b1[0] * alpha.g + b2[0] * alpha.b + b3[0] * alpha.a; + b[1] = b0[1] * alpha.r + b1[1] * alpha.g + b2[1] * alpha.b + b3[1] * alpha.a; + + afpvec8 c0 = image3d_ld8(bottom_blob, ivec3(sx - 1, sy + 1, gz)); + afpvec8 c1 = image3d_ld8(bottom_blob, ivec3(sx + 0, sy + 1, gz)); + afpvec8 c2 = image3d_ld8(bottom_blob, ivec3(sx + 1, sy + 1, gz)); + afpvec8 c3 = image3d_ld8(bottom_blob, ivec3(sx + 2, sy + 1, gz)); + + afpvec8 c; + c[0] = c0[0] * alpha.r + c1[0] * alpha.g + c2[0] * alpha.b + c3[0] 
* alpha.a; + c[1] = c0[1] * alpha.r + c1[1] * alpha.g + c2[1] * alpha.b + c3[1] * alpha.a; + + afpvec8 d0 = image3d_ld8(bottom_blob, ivec3(sx - 1, sy + 2, gz)); + afpvec8 d1 = image3d_ld8(bottom_blob, ivec3(sx + 0, sy + 2, gz)); + afpvec8 d2 = image3d_ld8(bottom_blob, ivec3(sx + 1, sy + 2, gz)); + afpvec8 d3 = image3d_ld8(bottom_blob, ivec3(sx + 2, sy + 2, gz)); + + afpvec8 d; + d[0] = d0[0] * alpha.r + d1[0] * alpha.g + d2[0] * alpha.b + d3[0] * alpha.a; + d[1] = d0[1] * alpha.r + d1[1] * alpha.g + d2[1] * alpha.b + d3[1] * alpha.a; +#else + int v_offset_0 = gz * psc(cstep) + (sy - 1) * psc(w) + sx; + int v_offset_1 = gz * psc(cstep) + (sy + 0) * psc(w) + sx; + int v_offset_2 = gz * psc(cstep) + (sy + 1) * psc(w) + sx; + int v_offset_3 = gz * psc(cstep) + (sy + 2) * psc(w) + sx; + + afpvec8 a0 = buffer_ld8(bottom_blob_data, v_offset_0 - 1); + afpvec8 a1 = buffer_ld8(bottom_blob_data, v_offset_0 + 0); + afpvec8 a2 = buffer_ld8(bottom_blob_data, v_offset_0 + 1); + afpvec8 a3 = buffer_ld8(bottom_blob_data, v_offset_0 + 2); + + afpvec8 a; + a[0] = a0[0] * alpha.r + a1[0] * alpha.g + a2[0] * alpha.b + a3[0] * alpha.a; + a[1] = a0[1] * alpha.r + a1[1] * alpha.g + a2[1] * alpha.b + a3[1] * alpha.a; + + afpvec8 b0 = buffer_ld8(bottom_blob_data, v_offset_1 - 1); + afpvec8 b1 = buffer_ld8(bottom_blob_data, v_offset_1 + 0); + afpvec8 b2 = buffer_ld8(bottom_blob_data, v_offset_1 + 1); + afpvec8 b3 = buffer_ld8(bottom_blob_data, v_offset_1 + 2); + + afpvec8 b; + b[0] = b0[0] * alpha.r + b1[0] * alpha.g + b2[0] * alpha.b + b3[0] * alpha.a; + b[1] = b0[1] * alpha.r + b1[1] * alpha.g + b2[1] * alpha.b + b3[1] * alpha.a; + + afpvec8 c0 = buffer_ld8(bottom_blob_data, v_offset_2 - 1); + afpvec8 c1 = buffer_ld8(bottom_blob_data, v_offset_2 + 0); + afpvec8 c2 = buffer_ld8(bottom_blob_data, v_offset_2 + 1); + afpvec8 c3 = buffer_ld8(bottom_blob_data, v_offset_2 + 2); + + afpvec8 c; + c[0] = c0[0] * alpha.r + c1[0] * alpha.g + c2[0] * alpha.b + c3[0] * alpha.a; + c[1] = c0[1] * alpha.r + c1[1] * alpha.g + c2[1] * alpha.b + c3[1] * alpha.a; + + afpvec8 d0 = buffer_ld8(bottom_blob_data, v_offset_3 - 1); + afpvec8 d1 = buffer_ld8(bottom_blob_data, v_offset_3 + 0); + afpvec8 d2 = buffer_ld8(bottom_blob_data, v_offset_3 + 1); + afpvec8 d3 = buffer_ld8(bottom_blob_data, v_offset_3 + 2); + + afpvec8 d; + d[0] = d0[0] * alpha.r + d1[0] * alpha.g + d2[0] * alpha.b + d3[0] * alpha.a; + d[1] = d0[1] * alpha.r + d1[1] * alpha.g + d2[1] * alpha.b + d3[1] * alpha.a; +#endif + + afpvec4 beta = buffer_ld4(beta_blob_data, gy); + + afpvec8 v; + v[0] = a[0] * beta.r + b[0] * beta.g + c[0] * beta.b + d[0] * beta.a; + v[1] = a[1] * beta.r + b[1] * beta.g + c[1] * beta.b + d[1] * beta.a; + +#if NCNN_image_shader + image3d_st8(top_blob, ivec3(gx, gy, gz), v); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st8(top_blob_data, gi, v); +#endif +} diff --git a/source/device/vulkan/shaders/interp_pack4.comp b/source/device/vulkan/shaders/interp_pack4.comp new file mode 100644 index 000000000..47d652e5f --- /dev/null +++ b/source/device/vulkan/shaders/interp_pack4.comp @@ -0,0 +1,150 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int resize_type = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; + + float scale_x; + float scale_y; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + if (resize_type == 1) // nearest + { + afpvec2 gxy = afpvec2(gx, gy); + ivec2 sxy_max = ivec2(psc(w) - 1, psc(h) - 1); + ivec2 sxy = min(ivec2(floor(gxy * afpvec2(p.scale_x, p.scale_y))), sxy_max); + + int sx = sxy.r; + int sy = sxy.g; + +#if NCNN_image_shader + image3d_cp4(top_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(sx, sy, gz)); +#else + int v_offset = gz * psc(cstep) + sy * psc(w) + sx; + + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_cp4(top_blob_data, gi, bottom_blob_data, v_offset); +#endif + } + if (resize_type == 2) // bilinear + { + afpvec2 gxy = afpvec2(gx, gy); + afpvec2 fxy = (gxy + afp(0.5f)) * afpvec2(p.scale_x, p.scale_y) - afp(0.5f); + + ivec2 sxy = ivec2(floor(fxy)); + + fxy -= afpvec2(sxy); + + ivec2 sxy_max = ivec2(psc(w) - 2, psc(h) - 2); + + bvec2 underflow = lessThan(sxy, ivec2(0)); + bvec2 overflow = greaterThan(sxy, sxy_max); + + sxy = clamp(sxy, ivec2(0), sxy_max); + + fxy = mix(fxy, afpvec2(0.f), underflow); + fxy = mix(fxy, afpvec2(1.f), overflow); + + int sx = sxy.r; + int sy = sxy.g; + +#if NCNN_image_shader + afpvec4 a0 = image3d_ld4(bottom_blob, ivec3(sx, sy, gz)); + afpvec4 a1 = image3d_ld4(bottom_blob, ivec3(sx + 1, sy, gz)); + afpvec4 b0 = 
image3d_ld4(bottom_blob, ivec3(sx, sy + 1, gz)); + afpvec4 b1 = image3d_ld4(bottom_blob, ivec3(sx + 1, sy + 1, gz)); +#else + int v_offset_0 = gz * psc(cstep) + sy * psc(w) + sx; + int v_offset_1 = gz * psc(cstep) + (sy + 1) * psc(w) + sx; + + afpvec4 a0 = buffer_ld4(bottom_blob_data, v_offset_0); + afpvec4 a1 = buffer_ld4(bottom_blob_data, v_offset_0 + 1); + afpvec4 b0 = buffer_ld4(bottom_blob_data, v_offset_1); + afpvec4 b1 = buffer_ld4(bottom_blob_data, v_offset_1 + 1); +#endif + + afp fx = fxy.r; + afp fy = fxy.g; + + afpvec4 a = a0 * (afp(1.f) - fx) + a1 * fx; + afpvec4 b = b0 * (afp(1.f) - fx) + b1 * fx; + + afpvec4 res = a * (afp(1.f) - fy) + b * fy; + +#if NCNN_image_shader + image3d_st4(top_blob, ivec3(gx, gy, gz), res); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st4(top_blob_data, gi, res); +#endif + } +} diff --git a/source/device/vulkan/shaders/interp_pack8.comp b/source/device/vulkan/shaders/interp_pack8.comp new file mode 100644 index 000000000..e62c831e8 --- /dev/null +++ b/source/device/vulkan/shaders/interp_pack8.comp @@ -0,0 +1,238 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
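+
+// Resize for pack8 data. resize_type 1 copies the nearest source element;
+// the other branches blend the surrounding 2x2 neighborhood with bilinear
+// weights, clamping the sample coordinates at the blob borders.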
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int resize_type = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; + + float scale_x; + float scale_y; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + if (resize_type == 1) // nearest + { + afpvec2 gxy = afpvec2(gx, gy); + ivec2 sxy_max = ivec2(psc(w) - 1, psc(h) - 1); + ivec2 sxy = min(ivec2(floor(gxy * afpvec2(p.scale_x, p.scale_y))), sxy_max); + + int sx = sxy.r; + int sy = sxy.g; + +#if NCNN_image_shader + image3d_cp8(top_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(sx, sy, gz)); +#else + int v_offset = gz * psc(cstep) + sy * psc(w) + sx; + + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_cp8(top_blob_data, gi, bottom_blob_data, v_offset); +#endif + } + if (resize_type == 5) // bilinear + { + afpvec2 gxy = afpvec2(gx, gy); + afpvec2 fxy = (gxy + afp(0.5f)) * afpvec2(p.scale_x, p.scale_y) - afp(0.5f); + + ivec2 sxy = ivec2(floor(fxy)); + + fxy -= afpvec2(sxy); + + ivec2 sxy_max = ivec2(psc(w) - 2, psc(h) - 2); + + bvec2 underflow = lessThan(sxy, ivec2(0)); + bvec2 overflow = greaterThan(sxy, sxy_max); + + sxy = clamp(sxy, ivec2(0), sxy_max); + + fxy = mix(fxy, afpvec2(0.f), underflow); + fxy = mix(fxy, afpvec2(1.f), overflow); + + int sx = sxy.r; + int sy = sxy.g; + +#if NCNN_image_shader + afpvec8 a0 = image3d_ld8(bottom_blob, ivec3(sx, sy, gz)); + afpvec8 a1 = image3d_ld8(bottom_blob, ivec3(sx + 1, sy, gz)); + afpvec8 b0 = image3d_ld8(bottom_blob, ivec3(sx, sy + 1, gz)); + afpvec8 b1 = image3d_ld8(bottom_blob, ivec3(sx + 1, sy + 1, gz)); +#else + int v_offset_0 = gz * psc(cstep) + sy * psc(w) + sx; + int v_offset_1 = gz * psc(cstep) + (sy + 1) * psc(w) + sx; + + afpvec8 a0 = buffer_ld8(bottom_blob_data, v_offset_0); + afpvec8 a1 = buffer_ld8(bottom_blob_data, v_offset_0 + 1); + afpvec8 b0 = 
buffer_ld8(bottom_blob_data, v_offset_1); + afpvec8 b1 = buffer_ld8(bottom_blob_data, v_offset_1 + 1); +#endif + + afp fx = fxy.r; + afp fy = fxy.g; + + afpvec8 a; + afpvec8 b; + a[0] = a0[0] * (afp(1.f) - fx) + a1[0] * fx; + a[1] = a0[1] * (afp(1.f) - fx) + a1[1] * fx; + b[0] = b0[0] * (afp(1.f) - fx) + b1[0] * fx; + b[1] = b0[1] * (afp(1.f) - fx) + b1[1] * fx; + + afpvec8 res; + res[0] = a[0] * (afp(1.f) - fy) + b[0] * fy; + res[1] = a[1] * (afp(1.f) - fy) + b[1] * fy; + +#if NCNN_image_shader + image3d_st8(top_blob, ivec3(gx, gy, gz), res); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st8(top_blob_data, gi, res); +#endif + } + else + { + afpvec2 gxy = afpvec2(gx, gy); + ivec2 sxy_max = ivec2(psc(w) - 1, psc(h) - 1); + // ivec2 in_xy = min(ivec2(floor(gxy * afpvec2(p.scale_x, p.scale_y))), sxy_max); + ivec2 in_xy = min(ivec2(floor(gxy / afpvec2(2.0f, 2.0f))), sxy_max); + + afpvec2 ff_sxy_max = afpvec2(psc(w) - 1, psc(h) - 1); + afpvec2 ffin_xy = afpvec2(gxy * afpvec2(p.scale_x, p.scale_y)); + + afp ff_in_x = ffin_xy.r; + afp ff_in_y = ffin_xy.g; + + int in_x = in_xy.r; + int in_y = in_xy.g; + + int in_y1 = min(in_y, psc(h) - 1); + int in_y2 = min(in_y1 + 1, psc(h) - 1); + + float dy1 = abs(in_y - in_y1); + float dy2 = abs(in_y - in_y2); + + afp ff_dy1 = abs(ff_in_y - afp(in_y1)); + afp ff_dy2 = abs(ff_in_y - afp(in_y2)); + + if (in_y1 == in_y2) + { + dy1 = 0.5f; + dy2 = 0.5f; + } + + if (ff_dy1 == ff_dy2) + { + dy1 = 0.5f; + dy2 = 0.5f; + } + + int in_x1 = min(in_x, psc(w) - 1); + int in_x2 = min(in_x1 + 1, psc(w) - 1); + + float dx1 = abs(in_x - in_x1); + float dx2 = abs(in_x - in_x2); + + afp ff_dx1 = abs(ff_in_x - afp(in_x1)); + afp ff_dx2 = abs(ff_in_x - afp(in_x2)); + if (in_x1 == in_x2) + { + dx1 = 0.5f; + dx2 = 0.5f; + } + if (ff_dx1 == ff_dx2) + { + dx1 = 0.5f; + dx2 = 0.5f; + } + + +#if NCNN_image_shader +#else + int v_offset_0 = gz * psc(cstep) + in_y1 * psc(w) + in_x1; + int v_offset_1 = gz * psc(cstep) + in_y1 * psc(w) + in_x2; + int v_offset_2 = gz * psc(cstep) + in_y2 * psc(w) + in_x1; + int v_offset_3 = gz * psc(cstep) + in_y2 * psc(w) + in_x2; + + afpvec8 a0 = buffer_ld8(bottom_blob_data, v_offset_0); + afpvec8 a1 = buffer_ld8(bottom_blob_data, v_offset_1); + afpvec8 b0 = buffer_ld8(bottom_blob_data, v_offset_2); + afpvec8 b1 = buffer_ld8(bottom_blob_data, v_offset_3); + + afpvec8 res; + res[0] = afp(dx2 * dy2) * a0[0] + afp(dx1 * dy2) * a1[0] + afp(dx2 * dy1) * b0[0] + afp(dx1 * dy1) * b1[0]; + res[1] = afp(dx2 * dy2) * a0[1] + afp(dx1 * dy2) * a1[1] + afp(dx2 * dy1) * b0[1] + afp(dx1 * dy1) * b1[1]; + + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + // res = afpvec8(afpvec4(ff_dy1), afpvec4(ff_dy1)); + + buffer_st8(top_blob_data, gi, res); +#endif + + } +} diff --git a/source/device/vulkan/shaders/packing.comp b/source/device/vulkan/shaders/packing.comp new file mode 100644 index 000000000..f018ab5fe --- /dev/null +++ b/source/device/vulkan/shaders/packing.comp @@ -0,0 +1,165 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 2) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 2) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 3, imfmtc1) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 3, imfmtc1) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 3, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afp v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + int gi = gx; + + v = buffer_ld1(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image1d_ld1(bottom_blob_1d, gx); + } +#endif + + if (storage_type_to == 0) + { + int gi = gx; + + buffer_st1(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image1d_st1(top_blob_1d, gx, v); + } +#endif + } + else if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + int gi = gy * psc(w) + gx; + + v = buffer_ld1(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image2d_ld1(bottom_blob_2d, ivec2(gx, gy)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gy * psc(outw) + gx; + + buffer_st1(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image2d_st1(top_blob_2d, ivec2(gx, gy), v); + } +#endif + } + else // if (psc(dims) == 3) + 
{ + if (storage_type_from == 0) + { + int gi = gz * psc(cstep) + gy * psc(w) + gx; + + v = buffer_ld1(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, gz)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st1(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image3d_st1(top_blob_3d, ivec3(gx, gy, gz), v); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_fp16_to_fp32.comp b/source/device/vulkan/shaders/packing_fp16_to_fp32.comp new file mode 100644 index 000000000..7a2b2bf9d --- /dev/null +++ b/source/device/vulkan/shaders/packing_fp16_to_fp32.comp @@ -0,0 +1,165 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob_fp32 { float top_blob_fp32_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 2) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 2) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 3, r32f) writeonly uniform highp image1D top_blob_1d_fp32; +layout (binding = 3, r32f) writeonly uniform highp image2D top_blob_2d_fp32; +layout (binding = 3, r32f) writeonly uniform highp image3D top_blob_3d_fp32; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = 
int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afp v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + int gi = gx; + + v = buffer_ld1(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image1d_ld1(bottom_blob_1d, gx); + } +#endif + + if (storage_type_to == 0) + { + int gi = gx; + + top_blob_fp32_data[gi] = float(v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + imageStore(top_blob_1d_fp32, gx, vec4(v)); + } +#endif + } + else if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + int gi = gy * psc(w) + gx; + + v = buffer_ld1(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image2d_ld1(bottom_blob_2d, ivec2(gx, gy)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gy * psc(outw) + gx; + + top_blob_fp32_data[gi] = float(v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + imageStore(top_blob_2d_fp32, ivec2(gx, gy), vec4(v)); + } +#endif + } + else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + int gi = gz * psc(cstep) + gy * psc(w) + gx; + + v = buffer_ld1(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, gz)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + top_blob_fp32_data[gi] = float(v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + imageStore(top_blob_3d_fp32, ivec3(gx, gy, gz), vec4(v)); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_fp32_to_fp16.comp b/source/device/vulkan/shaders/packing_fp32_to_fp16.comp new file mode 100644 index 000000000..213f62cd2 --- /dev/null +++ b/source/device/vulkan/shaders/packing_fp32_to_fp16.comp @@ -0,0 +1,165 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
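+
+// Storage conversion: reads raw float32 input and stores each element in
+// the sfp storage type (fp16 when fp16 storage is enabled), covering 1-D,
+// 2-D and 3-D blobs with either buffer or image storage.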
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob_fp32 { float bottom_blob_fp32_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform highp sampler1D bottom_blob_1d_fp32; +layout (binding = 2) uniform highp sampler2D bottom_blob_2d_fp32; +layout (binding = 2) uniform highp sampler3D bottom_blob_3d_fp32; +layout (binding = 3, imfmtc1) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 3, imfmtc1) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 3, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afp v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + int gi = gx; + + v = afp(bottom_blob_fp32_data[gi]); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = afp(texelFetch(bottom_blob_1d_fp32, gx, 0).r); + } +#endif + + if (storage_type_to == 0) + { + int gi = gx; + + buffer_st1(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image1d_st1(top_blob_1d, gx, v); + } +#endif + } + else if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + int gi = gy * psc(w) + gx; + + v = afp(bottom_blob_fp32_data[gi]); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = afp(texelFetch(bottom_blob_2d_fp32, ivec2(gx, gy), 0).r); + } +#endif + + if (storage_type_to == 0) + { + int gi = gy * psc(outw) + gx; + + buffer_st1(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image2d_st1(top_blob_2d, ivec2(gx, gy), v); + } +#endif + } + else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + int gi = gz * psc(cstep) + gy * psc(w) + gx; + + v = afp(bottom_blob_fp32_data[gi]); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = afp(texelFetch(bottom_blob_3d_fp32, ivec3(gx, gy, gz), 0).r); + } +#endif + + if (storage_type_to == 0) + { + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + 
buffer_st1(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image3d_st1(top_blob_3d, ivec3(gx, gy, gz), v); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_pack1to4.comp b/source/device/vulkan/shaders/packing_pack1to4.comp new file mode 100644 index 000000000..ba270d696 --- /dev/null +++ b/source/device/vulkan/shaders/packing_pack1to4.comp @@ -0,0 +1,195 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 2) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 2) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec4 v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + ivec4 x4 = ivec4(gx * 4) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = x4; + + v.r = buffer_ld1(bottom_blob_data, v_offset.r); + v.g = 
buffer_ld1(bottom_blob_data, v_offset.g); + v.b = buffer_ld1(bottom_blob_data, v_offset.b); + v.a = buffer_ld1(bottom_blob_data, v_offset.a); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int x4 = gx * 4; + + v.r = image1d_ld1(bottom_blob_1d, x4 + 0); + v.g = image1d_ld1(bottom_blob_1d, x4 + 1); + v.b = image1d_ld1(bottom_blob_1d, x4 + 2); + v.a = image1d_ld1(bottom_blob_1d, x4 + 3); + } +#endif + + if (storage_type_to == 0) + { + int gi = gx; + + buffer_st4(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image1d_st4(top_blob_1d, gx, v); + } +#endif + } + else if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + ivec4 y4 = ivec4(gy * 4) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = y4 * psc(w) + gx; + + v.r = buffer_ld1(bottom_blob_data, v_offset.r); + v.g = buffer_ld1(bottom_blob_data, v_offset.g); + v.b = buffer_ld1(bottom_blob_data, v_offset.b); + v.a = buffer_ld1(bottom_blob_data, v_offset.a); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int y4 = gy * 4; + + v.r = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 0)); + v.g = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 1)); + v.b = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 2)); + v.a = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 3)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gy * psc(outw) + gx; + + buffer_st4(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image2d_st4(top_blob_2d, ivec2(gx, gy), v); + } +#endif + } + else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + ivec4 z4 = ivec4(gz * 4) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = z4 * psc(cstep) + ivec4(gy * psc(w) + gx); + + v.r = buffer_ld1(bottom_blob_data, v_offset.r); + v.g = buffer_ld1(bottom_blob_data, v_offset.g); + v.b = buffer_ld1(bottom_blob_data, v_offset.b); + v.a = buffer_ld1(bottom_blob_data, v_offset.a); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int z4 = gz * 4; + + v.r = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 0)); + v.g = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 1)); + v.b = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 2)); + v.a = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 3)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st4(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image3d_st4(top_blob_3d, ivec3(gx, gy, gz), v); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_pack1to4_fp16_to_fp32.comp b/source/device/vulkan/shaders/packing_pack1to4_fp16_to_fp32.comp new file mode 100644 index 000000000..f75546fbb --- /dev/null +++ b/source/device/vulkan/shaders/packing_pack1to4_fp16_to_fp32.comp @@ -0,0 +1,195 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
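+
+// Repack pack1 -> pack4 with conversion to raw float32 output: four
+// consecutive elements along the packed axis (x, row or channel depending
+// on dims) are gathered into one vec4 and written as plain float data.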
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob_fp32 { vec4 top_blob_fp32_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 2) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 2) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 3, rgba32f) writeonly uniform highp image1D top_blob_1d_fp32; +layout (binding = 3, rgba32f) writeonly uniform highp image2D top_blob_2d_fp32; +layout (binding = 3, rgba32f) writeonly uniform highp image3D top_blob_3d_fp32; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec4 v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + ivec4 x4 = ivec4(gx * 4) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = x4; + + v.r = buffer_ld1(bottom_blob_data, v_offset.r); + v.g = buffer_ld1(bottom_blob_data, v_offset.g); + v.b = buffer_ld1(bottom_blob_data, v_offset.b); + v.a = buffer_ld1(bottom_blob_data, v_offset.a); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int x4 = gx * 4; + + v.r = image1d_ld1(bottom_blob_1d, x4 + 0); + v.g = image1d_ld1(bottom_blob_1d, x4 + 1); + v.b = image1d_ld1(bottom_blob_1d, x4 + 2); + v.a = image1d_ld1(bottom_blob_1d, x4 + 3); + } +#endif + + if (storage_type_to == 0) + { + int gi = gx; + + top_blob_fp32_data[gi] = vec4(v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + imageStore(top_blob_1d_fp32, gx, v); + } +#endif + } + else if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + ivec4 y4 = ivec4(gy * 4) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = y4 * psc(w) + gx; + + v.r = buffer_ld1(bottom_blob_data, v_offset.r); + v.g = buffer_ld1(bottom_blob_data, v_offset.g); + v.b = buffer_ld1(bottom_blob_data, v_offset.b); + v.a = buffer_ld1(bottom_blob_data, v_offset.a); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int y4 = gy * 4; + + v.r = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 0)); + v.g = 
image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 1)); + v.b = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 2)); + v.a = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 3)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gy * psc(outw) + gx; + + top_blob_fp32_data[gi] = vec4(v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + imageStore(top_blob_2d_fp32, ivec2(gx, gy), v); + } +#endif + } + else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + ivec4 z4 = ivec4(gz * 4) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = z4 * psc(cstep) + ivec4(gy * psc(w) + gx); + + v.r = buffer_ld1(bottom_blob_data, v_offset.r); + v.g = buffer_ld1(bottom_blob_data, v_offset.g); + v.b = buffer_ld1(bottom_blob_data, v_offset.b); + v.a = buffer_ld1(bottom_blob_data, v_offset.a); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int z4 = gz * 4; + + v.r = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 0)); + v.g = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 1)); + v.b = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 2)); + v.a = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 3)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + top_blob_fp32_data[gi] = vec4(v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + imageStore(top_blob_3d_fp32, ivec3(gx, gy, gz), v); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_pack1to4_fp32_to_fp16.comp b/source/device/vulkan/shaders/packing_pack1to4_fp32_to_fp16.comp new file mode 100644 index 000000000..48569c9f1 --- /dev/null +++ b/source/device/vulkan/shaders/packing_pack1to4_fp32_to_fp16.comp @@ -0,0 +1,195 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
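+
+// Repack pack1 -> pack4 with conversion from raw float32 input: four
+// consecutive float elements are gathered and stored as one sfpvec4.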
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob_fp32 { float bottom_blob_fp32_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform highp sampler1D bottom_blob_1d_fp32; +layout (binding = 2) uniform highp sampler2D bottom_blob_2d_fp32; +layout (binding = 2) uniform highp sampler3D bottom_blob_3d_fp32; +layout (binding = 3, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec4 v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + ivec4 x4 = ivec4(gx * 4) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = x4; + + v.r = afp(bottom_blob_fp32_data[v_offset.r]); + v.g = afp(bottom_blob_fp32_data[v_offset.g]); + v.b = afp(bottom_blob_fp32_data[v_offset.b]); + v.a = afp(bottom_blob_fp32_data[v_offset.a]); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int x4 = gx * 4; + + v.r = afp(texelFetch(bottom_blob_1d_fp32, x4 + 0, 0).r); + v.g = afp(texelFetch(bottom_blob_1d_fp32, x4 + 1, 0).r); + v.b = afp(texelFetch(bottom_blob_1d_fp32, x4 + 2, 0).r); + v.a = afp(texelFetch(bottom_blob_1d_fp32, x4 + 3, 0).r); + } +#endif + + if (storage_type_to == 0) + { + int gi = gx; + + buffer_st4(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image1d_st4(top_blob_1d, gx, v); + } +#endif + } + else if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + ivec4 y4 = ivec4(gy * 4) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = y4 * psc(w) + gx; + + v.r = afp(bottom_blob_fp32_data[v_offset.r]); + v.g = afp(bottom_blob_fp32_data[v_offset.g]); + v.b = afp(bottom_blob_fp32_data[v_offset.b]); + v.a = afp(bottom_blob_fp32_data[v_offset.a]); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int y4 = gy * 4; + + v.r = 
afp(texelFetch(bottom_blob_2d_fp32, ivec2(gx, y4 + 0), 0).r); + v.g = afp(texelFetch(bottom_blob_2d_fp32, ivec2(gx, y4 + 1), 0).r); + v.b = afp(texelFetch(bottom_blob_2d_fp32, ivec2(gx, y4 + 2), 0).r); + v.a = afp(texelFetch(bottom_blob_2d_fp32, ivec2(gx, y4 + 3), 0).r); + } +#endif + + if (storage_type_to == 0) + { + int gi = gy * psc(outw) + gx; + + buffer_st4(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image2d_st4(top_blob_2d, ivec2(gx, gy), v); + } +#endif + } + else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + ivec4 z4 = ivec4(gz * 4) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = z4 * psc(cstep) + ivec4(gy * psc(w) + gx); + + v.r = afp(bottom_blob_fp32_data[v_offset.r]); + v.g = afp(bottom_blob_fp32_data[v_offset.g]); + v.b = afp(bottom_blob_fp32_data[v_offset.b]); + v.a = afp(bottom_blob_fp32_data[v_offset.a]); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int z4 = gz * 4; + + v.r = afp(texelFetch(bottom_blob_3d_fp32, ivec3(gx, gy, z4 + 0), 0).r); + v.g = afp(texelFetch(bottom_blob_3d_fp32, ivec3(gx, gy, z4 + 1), 0).r); + v.b = afp(texelFetch(bottom_blob_3d_fp32, ivec3(gx, gy, z4 + 2), 0).r); + v.a = afp(texelFetch(bottom_blob_3d_fp32, ivec3(gx, gy, z4 + 3), 0).r); + } +#endif + + if (storage_type_to == 0) + { + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st4(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image3d_st4(top_blob_3d, ivec3(gx, gy, gz), v); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_pack1to8.comp b/source/device/vulkan/shaders/packing_pack1to8.comp new file mode 100644 index 000000000..a97fbe923 --- /dev/null +++ b/source/device/vulkan/shaders/packing_pack1to8.comp @@ -0,0 +1,223 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 2) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 2) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec8 v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + ivec4 x4 = ivec4(gx * 8) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = x4; + ivec4 vv_offset = x4 + 4; + + v[0].r = buffer_ld1(bottom_blob_data, v_offset.r); + v[0].g = buffer_ld1(bottom_blob_data, v_offset.g); + v[0].b = buffer_ld1(bottom_blob_data, v_offset.b); + v[0].a = buffer_ld1(bottom_blob_data, v_offset.a); + v[1].r = buffer_ld1(bottom_blob_data, vv_offset.r); + v[1].g = buffer_ld1(bottom_blob_data, vv_offset.g); + v[1].b = buffer_ld1(bottom_blob_data, vv_offset.b); + v[1].a = buffer_ld1(bottom_blob_data, vv_offset.a); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int x4 = gx * 8; + + v[0].r = image1d_ld1(bottom_blob_1d, x4 + 0); + v[0].g = image1d_ld1(bottom_blob_1d, x4 + 1); + v[0].b = image1d_ld1(bottom_blob_1d, x4 + 2); + v[0].a = image1d_ld1(bottom_blob_1d, x4 + 3); + v[1].r = image1d_ld1(bottom_blob_1d, x4 + 4); + v[1].g = image1d_ld1(bottom_blob_1d, x4 + 5); + v[1].b = image1d_ld1(bottom_blob_1d, x4 + 6); + v[1].a = image1d_ld1(bottom_blob_1d, x4 + 7); + } +#endif + + if (storage_type_to == 0) + { + int gi = gx; + + buffer_st8(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image1d_st8(top_blob_1d, gx, v); + } +#endif + } + else if (psc(dims) == 2) + { + if 
(storage_type_from == 0) + { + ivec4 y4 = ivec4(gy * 8) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = y4 * psc(w) + gx; + ivec4 vv_offset = (y4 + 4) * psc(w) + gx; + + v[0].r = buffer_ld1(bottom_blob_data, v_offset.r); + v[0].g = buffer_ld1(bottom_blob_data, v_offset.g); + v[0].b = buffer_ld1(bottom_blob_data, v_offset.b); + v[0].a = buffer_ld1(bottom_blob_data, v_offset.a); + v[1].r = buffer_ld1(bottom_blob_data, vv_offset.r); + v[1].g = buffer_ld1(bottom_blob_data, vv_offset.g); + v[1].b = buffer_ld1(bottom_blob_data, vv_offset.b); + v[1].a = buffer_ld1(bottom_blob_data, vv_offset.a); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int y4 = gy * 8; + + v[0].r = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 0)); + v[0].g = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 1)); + v[0].b = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 2)); + v[0].a = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 3)); + v[1].r = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 4)); + v[1].g = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 5)); + v[1].b = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 6)); + v[1].a = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 7)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gy * psc(outw) + gx; + + buffer_st8(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image2d_st8(top_blob_2d, ivec2(gx, gy), v); + } +#endif + } + else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + ivec4 z4 = ivec4(gz * 8) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = z4 * psc(cstep) + ivec4(gy * psc(w) + gx); + ivec4 vv_offset = (z4 + 4) * psc(cstep) + ivec4(gy * psc(w) + gx); + + v[0].r = buffer_ld1(bottom_blob_data, v_offset.r); + v[0].g = buffer_ld1(bottom_blob_data, v_offset.g); + v[0].b = buffer_ld1(bottom_blob_data, v_offset.b); + v[0].a = buffer_ld1(bottom_blob_data, v_offset.a); + v[1].r = buffer_ld1(bottom_blob_data, vv_offset.r); + v[1].g = buffer_ld1(bottom_blob_data, vv_offset.g); + v[1].b = buffer_ld1(bottom_blob_data, vv_offset.b); + v[1].a = buffer_ld1(bottom_blob_data, vv_offset.a); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int z4 = gz * 8; + + v[0].r = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 0)); + v[0].g = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 1)); + v[0].b = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 2)); + v[0].a = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 3)); + v[1].r = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 4)); + v[1].g = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 5)); + v[1].b = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 6)); + v[1].a = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 7)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st8(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image3d_st8(top_blob_3d, ivec3(gx, gy, gz), v); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_pack1to8_fp16_to_fp32.comp b/source/device/vulkan/shaders/packing_pack1to8_fp16_to_fp32.comp new file mode 100644 index 000000000..62a980788 --- /dev/null +++ b/source/device/vulkan/shaders/packing_pack1to8_fp16_to_fp32.comp @@ -0,0 +1,226 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob_fp32 { mat2x4 top_blob_fp32_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 2) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 2) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 3, rgba32f) writeonly uniform highp image1D top_blob_1d_fp32; +layout (binding = 3, rgba32f) writeonly uniform highp image2D top_blob_2d_fp32; +layout (binding = 3, rgba32f) writeonly uniform highp image3D top_blob_3d_fp32; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec8 v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + ivec4 x4 = ivec4(gx * 8) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = x4; + ivec4 vv_offset = x4 + 4; + + v[0].r = buffer_ld1(bottom_blob_data, v_offset.r); + v[0].g = buffer_ld1(bottom_blob_data, v_offset.g); + v[0].b = buffer_ld1(bottom_blob_data, v_offset.b); + v[0].a = buffer_ld1(bottom_blob_data, v_offset.a); + v[1].r = buffer_ld1(bottom_blob_data, vv_offset.r); + v[1].g = buffer_ld1(bottom_blob_data, vv_offset.g); + v[1].b = buffer_ld1(bottom_blob_data, vv_offset.b); + v[1].a = buffer_ld1(bottom_blob_data, vv_offset.a); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int x4 = gx * 8; + + v[0].r = image1d_ld1(bottom_blob_1d, x4 + 0); + v[0].g = image1d_ld1(bottom_blob_1d, x4 + 1); + v[0].b = image1d_ld1(bottom_blob_1d, x4 + 2); + v[0].a = image1d_ld1(bottom_blob_1d, x4 
+ 3); + v[1].r = image1d_ld1(bottom_blob_1d, x4 + 4); + v[1].g = image1d_ld1(bottom_blob_1d, x4 + 5); + v[1].b = image1d_ld1(bottom_blob_1d, x4 + 6); + v[1].a = image1d_ld1(bottom_blob_1d, x4 + 7); + } +#endif + + if (storage_type_to == 0) + { + int gi = gx; + + top_blob_fp32_data[gi] = mat2x4(vec4(v[0]), vec4(v[1])); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + imageStore(top_blob_1d_fp32, gx * 2, v[0]); + imageStore(top_blob_1d_fp32, gx * 2 + 1, v[1]); + } +#endif + } + else if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + ivec4 y4 = ivec4(gy * 8) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = y4 * psc(w) + gx; + ivec4 vv_offset = (y4 + 4) * psc(w) + gx; + + v[0].r = buffer_ld1(bottom_blob_data, v_offset.r); + v[0].g = buffer_ld1(bottom_blob_data, v_offset.g); + v[0].b = buffer_ld1(bottom_blob_data, v_offset.b); + v[0].a = buffer_ld1(bottom_blob_data, v_offset.a); + v[1].r = buffer_ld1(bottom_blob_data, vv_offset.r); + v[1].g = buffer_ld1(bottom_blob_data, vv_offset.g); + v[1].b = buffer_ld1(bottom_blob_data, vv_offset.b); + v[1].a = buffer_ld1(bottom_blob_data, vv_offset.a); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int y4 = gy * 8; + + v[0].r = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 0)); + v[0].g = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 1)); + v[0].b = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 2)); + v[0].a = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 3)); + v[1].r = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 4)); + v[1].g = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 5)); + v[1].b = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 6)); + v[1].a = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 7)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gy * psc(outw) + gx; + + top_blob_fp32_data[gi] = mat2x4(vec4(v[0]), vec4(v[1])); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + imageStore(top_blob_2d_fp32, ivec2(gx * 2, gy), v[0]); + imageStore(top_blob_2d_fp32, ivec2(gx * 2 + 1, gy), v[1]); + } +#endif + } + else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + ivec4 z4 = ivec4(gz * 8) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = z4 * psc(cstep) + ivec4(gy * psc(w) + gx); + ivec4 vv_offset = (z4 + 4) * psc(cstep) + ivec4(gy * psc(w) + gx); + + v[0].r = buffer_ld1(bottom_blob_data, v_offset.r); + v[0].g = buffer_ld1(bottom_blob_data, v_offset.g); + v[0].b = buffer_ld1(bottom_blob_data, v_offset.b); + v[0].a = buffer_ld1(bottom_blob_data, v_offset.a); + v[1].r = buffer_ld1(bottom_blob_data, vv_offset.r); + v[1].g = buffer_ld1(bottom_blob_data, vv_offset.g); + v[1].b = buffer_ld1(bottom_blob_data, vv_offset.b); + v[1].a = buffer_ld1(bottom_blob_data, vv_offset.a); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int z4 = gz * 8; + + v[0].r = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 0)); + v[0].g = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 1)); + v[0].b = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 2)); + v[0].a = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 3)); + v[1].r = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 4)); + v[1].g = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 5)); + v[1].b = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 6)); + v[1].a = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 7)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + top_blob_fp32_data[gi] = mat2x4(vec4(v[0]), vec4(v[1])); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + imageStore(top_blob_3d_fp32, ivec3(gx * 2, gy, gz), v[0]); + 
imageStore(top_blob_3d_fp32, ivec3(gx * 2 + 1, gy, gz), v[1]); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_pack1to8_fp32_to_fp16.comp b/source/device/vulkan/shaders/packing_pack1to8_fp32_to_fp16.comp new file mode 100644 index 000000000..6b3a405e7 --- /dev/null +++ b/source/device/vulkan/shaders/packing_pack1to8_fp32_to_fp16.comp @@ -0,0 +1,223 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob_fp32 { float bottom_blob_fp32_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform highp sampler1D bottom_blob_1d_fp32; +layout (binding = 2) uniform highp sampler2D bottom_blob_2d_fp32; +layout (binding = 2) uniform highp sampler3D bottom_blob_3d_fp32; +layout (binding = 3, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec8 v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + ivec4 x4 = ivec4(gx * 8) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = x4; + ivec4 vv_offset = x4 + 4; + + v[0].r = 
afp(bottom_blob_fp32_data[v_offset.r]); + v[0].g = afp(bottom_blob_fp32_data[v_offset.g]); + v[0].b = afp(bottom_blob_fp32_data[v_offset.b]); + v[0].a = afp(bottom_blob_fp32_data[v_offset.a]); + v[1].r = afp(bottom_blob_fp32_data[vv_offset.r]); + v[1].g = afp(bottom_blob_fp32_data[vv_offset.g]); + v[1].b = afp(bottom_blob_fp32_data[vv_offset.b]); + v[1].a = afp(bottom_blob_fp32_data[vv_offset.a]); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int x4 = gx * 8; + + v[0].r = afp(texelFetch(bottom_blob_1d_fp32, x4 + 0, 0).r); + v[0].g = afp(texelFetch(bottom_blob_1d_fp32, x4 + 1, 0).r); + v[0].b = afp(texelFetch(bottom_blob_1d_fp32, x4 + 2, 0).r); + v[0].a = afp(texelFetch(bottom_blob_1d_fp32, x4 + 3, 0).r); + v[1].r = afp(texelFetch(bottom_blob_1d_fp32, x4 + 4, 0).r); + v[1].g = afp(texelFetch(bottom_blob_1d_fp32, x4 + 5, 0).r); + v[1].b = afp(texelFetch(bottom_blob_1d_fp32, x4 + 6, 0).r); + v[1].a = afp(texelFetch(bottom_blob_1d_fp32, x4 + 7, 0).r); + } +#endif + + if (storage_type_to == 0) + { + int gi = gx; + + buffer_st8(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image1d_st8(top_blob_1d, gx, v); + } +#endif + } + else if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + ivec4 y4 = ivec4(gy * 8) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = y4 * psc(w) + gx; + ivec4 vv_offset = (y4 + 4) * psc(w) + gx; + + v[0].r = afp(bottom_blob_fp32_data[v_offset.r]); + v[0].g = afp(bottom_blob_fp32_data[v_offset.g]); + v[0].b = afp(bottom_blob_fp32_data[v_offset.b]); + v[0].a = afp(bottom_blob_fp32_data[v_offset.a]); + v[1].r = afp(bottom_blob_fp32_data[vv_offset.r]); + v[1].g = afp(bottom_blob_fp32_data[vv_offset.g]); + v[1].b = afp(bottom_blob_fp32_data[vv_offset.b]); + v[1].a = afp(bottom_blob_fp32_data[vv_offset.a]); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int y4 = gy * 8; + + v[0].r = afp(texelFetch(bottom_blob_2d_fp32, ivec2(gx, y4 + 0), 0).r); + v[0].g = afp(texelFetch(bottom_blob_2d_fp32, ivec2(gx, y4 + 1), 0).r); + v[0].b = afp(texelFetch(bottom_blob_2d_fp32, ivec2(gx, y4 + 2), 0).r); + v[0].a = afp(texelFetch(bottom_blob_2d_fp32, ivec2(gx, y4 + 3), 0).r); + v[1].r = afp(texelFetch(bottom_blob_2d_fp32, ivec2(gx, y4 + 4), 0).r); + v[1].g = afp(texelFetch(bottom_blob_2d_fp32, ivec2(gx, y4 + 5), 0).r); + v[1].b = afp(texelFetch(bottom_blob_2d_fp32, ivec2(gx, y4 + 6), 0).r); + v[1].a = afp(texelFetch(bottom_blob_2d_fp32, ivec2(gx, y4 + 7), 0).r); + } +#endif + + if (storage_type_to == 0) + { + int gi = gy * psc(outw) + gx; + + buffer_st8(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image2d_st8(top_blob_2d, ivec2(gx, gy), v); + } +#endif + } + else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + ivec4 z4 = ivec4(gz * 8) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = z4 * psc(cstep) + ivec4(gy * psc(w) + gx); + ivec4 vv_offset = (z4 + 4) * psc(cstep) + ivec4(gy * psc(w) + gx); + + v[0].r = afp(bottom_blob_fp32_data[v_offset.r]); + v[0].g = afp(bottom_blob_fp32_data[v_offset.g]); + v[0].b = afp(bottom_blob_fp32_data[v_offset.b]); + v[0].a = afp(bottom_blob_fp32_data[v_offset.a]); + v[1].r = afp(bottom_blob_fp32_data[vv_offset.r]); + v[1].g = afp(bottom_blob_fp32_data[vv_offset.g]); + v[1].b = afp(bottom_blob_fp32_data[vv_offset.b]); + v[1].a = afp(bottom_blob_fp32_data[vv_offset.a]); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int z4 = gz * 8; + + v[0].r = afp(texelFetch(bottom_blob_3d_fp32, ivec3(gx, gy, z4 + 0), 0).r); + v[0].g = 
afp(texelFetch(bottom_blob_3d_fp32, ivec3(gx, gy, z4 + 1), 0).r); + v[0].b = afp(texelFetch(bottom_blob_3d_fp32, ivec3(gx, gy, z4 + 2), 0).r); + v[0].a = afp(texelFetch(bottom_blob_3d_fp32, ivec3(gx, gy, z4 + 3), 0).r); + v[1].r = afp(texelFetch(bottom_blob_3d_fp32, ivec3(gx, gy, z4 + 4), 0).r); + v[1].g = afp(texelFetch(bottom_blob_3d_fp32, ivec3(gx, gy, z4 + 5), 0).r); + v[1].b = afp(texelFetch(bottom_blob_3d_fp32, ivec3(gx, gy, z4 + 6), 0).r); + v[1].a = afp(texelFetch(bottom_blob_3d_fp32, ivec3(gx, gy, z4 + 7), 0).r); + } +#endif + + if (storage_type_to == 0) + { + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st8(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image3d_st8(top_blob_3d, ivec3(gx, gy, gz), v); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_pack4.comp b/source/device/vulkan/shaders/packing_pack4.comp new file mode 100644 index 000000000..c0c64e5a2 --- /dev/null +++ b/source/device/vulkan/shaders/packing_pack4.comp @@ -0,0 +1,165 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 2) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 2) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image3D top_blob_3d; 
+#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec4 v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + int gi = gx; + + v = buffer_ld4(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image1d_ld4(bottom_blob_1d, gx); + } +#endif + + if (storage_type_to == 0) + { + int gi = gx; + + buffer_st4(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image1d_st4(top_blob_1d, gx, v); + } +#endif + } + else if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + int gi = gy * psc(w) + gx; + + v = buffer_ld4(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image2d_ld4(bottom_blob_2d, ivec2(gx, gy)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gy * psc(outw) + gx; + + buffer_st4(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image2d_st4(top_blob_2d, ivec2(gx, gy), v); + } +#endif + } + else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + int gi = gz * psc(cstep) + gy * psc(w) + gx; + + v = buffer_ld4(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, gz)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st4(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image3d_st4(top_blob_3d, ivec3(gx, gy, gz), v); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_pack4_fp16_to_fp32.comp b/source/device/vulkan/shaders/packing_pack4_fp16_to_fp32.comp new file mode 100644 index 000000000..b05b9eda0 --- /dev/null +++ b/source/device/vulkan/shaders/packing_pack4_fp16_to_fp32.comp @@ -0,0 +1,165 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob_fp32 { vec4 top_blob_fp32_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 2) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 2) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 3, rgba32f) writeonly uniform highp image1D top_blob_1d_fp32; +layout (binding = 3, rgba32f) writeonly uniform highp image2D top_blob_2d_fp32; +layout (binding = 3, rgba32f) writeonly uniform highp image3D top_blob_3d_fp32; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec4 v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + int gi = gx; + + v = buffer_ld4(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image1d_ld4(bottom_blob_1d, gx); + } +#endif + + if (storage_type_to == 0) + { + int gi = gx; + + top_blob_fp32_data[gi] = vec4(v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + imageStore(top_blob_1d_fp32, gx, v); + } +#endif + } + else if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + int gi = gy * psc(w) + gx; + + v = buffer_ld4(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image2d_ld4(bottom_blob_2d, ivec2(gx, gy)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gy * psc(outw) + gx; + + top_blob_fp32_data[gi] = vec4(v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + imageStore(top_blob_2d_fp32, ivec2(gx, gy), v); + } +#endif + } + else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + int gi = gz * psc(cstep) + gy * psc(w) + gx; + + v = buffer_ld4(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, gz)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + top_blob_fp32_data[gi] = 
vec4(v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + imageStore(top_blob_3d_fp32, ivec3(gx, gy, gz), v); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_pack4_fp32_to_fp16.comp b/source/device/vulkan/shaders/packing_pack4_fp32_to_fp16.comp new file mode 100644 index 000000000..fcd96950f --- /dev/null +++ b/source/device/vulkan/shaders/packing_pack4_fp32_to_fp16.comp @@ -0,0 +1,165 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob_fp32 { vec4 bottom_blob_fp32_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform highp sampler1D bottom_blob_1d_fp32; +layout (binding = 2) uniform highp sampler2D bottom_blob_2d_fp32; +layout (binding = 2) uniform highp sampler3D bottom_blob_3d_fp32; +layout (binding = 3, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec4 v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + int gi = gx; + + v = afpvec4(bottom_blob_fp32_data[gi]); + } +#if NCNN_image_shader + if (storage_type_from == 1) 
+ { + v = afpvec4(texelFetch(bottom_blob_1d_fp32, gx, 0)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gx; + + buffer_st4(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image1d_st4(top_blob_1d, gx, v); + } +#endif + } + else if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + int gi = gy * psc(w) + gx; + + v = afpvec4(bottom_blob_fp32_data[gi]); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = afpvec4(texelFetch(bottom_blob_2d_fp32, ivec2(gx, gy), 0)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gy * psc(outw) + gx; + + buffer_st4(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image2d_st4(top_blob_2d, ivec2(gx, gy), v); + } +#endif + } + else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + int gi = gz * psc(cstep) + gy * psc(w) + gx; + + v = afpvec4(bottom_blob_fp32_data[gi]); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = afpvec4(texelFetch(bottom_blob_3d_fp32, ivec3(gx, gy, gz), 0)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st4(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image3d_st4(top_blob_3d, ivec3(gx, gy, gz), v); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_pack4to1.comp b/source/device/vulkan/shaders/packing_pack4to1.comp new file mode 100644 index 000000000..ef070eaab --- /dev/null +++ b/source/device/vulkan/shaders/packing_pack4to1.comp @@ -0,0 +1,195 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 2) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 2) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 3, imfmtc1) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 3, imfmtc1) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 3, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + + afpvec4 v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + int gi = gx; + + v = buffer_ld4(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image1d_ld4(bottom_blob_1d, gx); + } +#endif + + if (storage_type_to == 0) + { + ivec4 x4 = ivec4(gx * 4) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = x4; + + buffer_st1(top_blob_data, v_offset.r, v.r); + buffer_st1(top_blob_data, v_offset.g, v.g); + buffer_st1(top_blob_data, v_offset.b, v.b); + buffer_st1(top_blob_data, v_offset.a, v.a); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int x4 = gx * 4; + + image1d_st1(top_blob_1d, x4 + 0, v.r); + image1d_st1(top_blob_1d, x4 + 1, v.g); + image1d_st1(top_blob_1d, x4 + 2, v.b); + image1d_st1(top_blob_1d, x4 + 3, v.a); + } +#endif + } + else if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + int gi = gy * psc(w) + gx; + + v = buffer_ld4(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image2d_ld4(bottom_blob_2d, ivec2(gx, gy)); + } +#endif + + if (storage_type_to == 0) + { + ivec4 y4 = ivec4(gy * 4) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = y4 * psc(outw) + gx; + + buffer_st1(top_blob_data, v_offset.r, v.r); + buffer_st1(top_blob_data, v_offset.g, v.g); + buffer_st1(top_blob_data, v_offset.b, v.b); + buffer_st1(top_blob_data, v_offset.a, v.a); + } +#if 
NCNN_image_shader + if (storage_type_to == 1) + { + int y4 = gy * 4; + + image2d_st1(top_blob_2d, ivec2(gx, y4 + 0), v.r); + image2d_st1(top_blob_2d, ivec2(gx, y4 + 1), v.g); + image2d_st1(top_blob_2d, ivec2(gx, y4 + 2), v.b); + image2d_st1(top_blob_2d, ivec2(gx, y4 + 3), v.a); + } +#endif + } + else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + int gi = gz * psc(cstep) + gy * psc(w) + gx; + + v = buffer_ld4(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, gz)); + } +#endif + + if (storage_type_to == 0) + { + ivec4 z4 = ivec4(gz * 4) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = z4 * psc(outcstep) + ivec4(gy * psc(outw) + gx); + + buffer_st1(top_blob_data, v_offset.r, v.r); + buffer_st1(top_blob_data, v_offset.g, v.g); + buffer_st1(top_blob_data, v_offset.b, v.b); + buffer_st1(top_blob_data, v_offset.a, v.a); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int z4 = gz * 4; + + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 0), v.r); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 1), v.g); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 2), v.b); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 3), v.a); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_pack4to1_fp16_to_fp32.comp b/source/device/vulkan/shaders/packing_pack4to1_fp16_to_fp32.comp new file mode 100644 index 000000000..7fd911969 --- /dev/null +++ b/source/device/vulkan/shaders/packing_pack4to1_fp16_to_fp32.comp @@ -0,0 +1,195 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob_fp32 { float top_blob_fp32_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 2) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 2) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 3, r32f) writeonly uniform highp image1D top_blob_1d_fp32; +layout (binding = 3, r32f) writeonly uniform highp image2D top_blob_2d_fp32; +layout (binding = 3, r32f) writeonly uniform highp image3D top_blob_3d_fp32; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + + afpvec4 v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + int gi = gx; + + v = buffer_ld4(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image1d_ld4(bottom_blob_1d, gx); + } +#endif + + if (storage_type_to == 0) + { + ivec4 x4 = ivec4(gx * 4) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = x4; + + top_blob_fp32_data[v_offset.r] = float(v.r); + top_blob_fp32_data[v_offset.g] = float(v.g); + top_blob_fp32_data[v_offset.b] = float(v.b); + top_blob_fp32_data[v_offset.a] = float(v.a); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int x4 = gx * 4; + + image1d_st1(top_blob_1d_fp32, x4 + 0, v.r); + image1d_st1(top_blob_1d_fp32, x4 + 1, v.g); + image1d_st1(top_blob_1d_fp32, x4 + 2, v.b); + image1d_st1(top_blob_1d_fp32, x4 + 3, v.a); + } +#endif + } + else if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + int gi = gy * psc(w) + gx; + + v = buffer_ld4(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image2d_ld4(bottom_blob_2d, ivec2(gx, gy)); + } +#endif + + if (storage_type_to == 0) + { + ivec4 y4 = ivec4(gy * 4) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = y4 * psc(outw) + gx; + + top_blob_fp32_data[v_offset.r] = float(v.r); + top_blob_fp32_data[v_offset.g] = float(v.g); + top_blob_fp32_data[v_offset.b] = float(v.b); + 
top_blob_fp32_data[v_offset.a] = float(v.a); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int y4 = gy * 4; + + image2d_st1(top_blob_2d_fp32, ivec2(gx, y4 + 0), v.r); + image2d_st1(top_blob_2d_fp32, ivec2(gx, y4 + 1), v.g); + image2d_st1(top_blob_2d_fp32, ivec2(gx, y4 + 2), v.b); + image2d_st1(top_blob_2d_fp32, ivec2(gx, y4 + 3), v.a); + } +#endif + } + else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + int gi = gz * psc(cstep) + gy * psc(w) + gx; + + v = buffer_ld4(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, gz)); + } +#endif + + if (storage_type_to == 0) + { + ivec4 z4 = ivec4(gz * 4) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = z4 * psc(outcstep) + ivec4(gy * psc(outw) + gx); + + top_blob_fp32_data[v_offset.r] = float(v.r); + top_blob_fp32_data[v_offset.g] = float(v.g); + top_blob_fp32_data[v_offset.b] = float(v.b); + top_blob_fp32_data[v_offset.a] = float(v.a); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int z4 = gz * 4; + + image3d_st1(top_blob_3d_fp32, ivec3(gx, gy, z4 + 0), v.r); + image3d_st1(top_blob_3d_fp32, ivec3(gx, gy, z4 + 1), v.g); + image3d_st1(top_blob_3d_fp32, ivec3(gx, gy, z4 + 2), v.b); + image3d_st1(top_blob_3d_fp32, ivec3(gx, gy, z4 + 3), v.a); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_pack4to1_fp32_to_fp16.comp b/source/device/vulkan/shaders/packing_pack4to1_fp32_to_fp16.comp new file mode 100644 index 000000000..6a0d0346d --- /dev/null +++ b/source/device/vulkan/shaders/packing_pack4to1_fp32_to_fp16.comp @@ -0,0 +1,195 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob_fp32 { vec4 bottom_blob_fp32_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform highp sampler1D bottom_blob_1d_fp32; +layout (binding = 2) uniform highp sampler2D bottom_blob_2d_fp32; +layout (binding = 2) uniform highp sampler3D bottom_blob_3d_fp32; +layout (binding = 3, imfmtc1) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 3, imfmtc1) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 3, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + + afpvec4 v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + int gi = gx; + + v = afpvec4(bottom_blob_fp32_data[gi]); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = afpvec4(texelFetch(bottom_blob_1d_fp32, gx, 0)); + } +#endif + + if (storage_type_to == 0) + { + ivec4 x4 = ivec4(gx * 4) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = x4; + + buffer_st1(top_blob_data, v_offset.r, v.r); + buffer_st1(top_blob_data, v_offset.g, v.g); + buffer_st1(top_blob_data, v_offset.b, v.b); + buffer_st1(top_blob_data, v_offset.a, v.a); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int x4 = gx * 4; + + image1d_st1(top_blob_1d, x4 + 0, v.r); + image1d_st1(top_blob_1d, x4 + 1, v.g); + image1d_st1(top_blob_1d, x4 + 2, v.b); + image1d_st1(top_blob_1d, x4 + 3, v.a); + } +#endif + } + else if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + int gi = gy * psc(w) + gx; + + v = afpvec4(bottom_blob_fp32_data[gi]); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = afpvec4(texelFetch(bottom_blob_2d_fp32, ivec2(gx, gy), 0)); + } +#endif + + if (storage_type_to == 0) + { + ivec4 y4 = ivec4(gy * 4) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = y4 * psc(outw) + gx; + + buffer_st1(top_blob_data, v_offset.r, v.r); + buffer_st1(top_blob_data, v_offset.g, v.g); + buffer_st1(top_blob_data, v_offset.b, 
v.b); + buffer_st1(top_blob_data, v_offset.a, v.a); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int y4 = gy * 4; + + image2d_st1(top_blob_2d, ivec2(gx, y4 + 0), v.r); + image2d_st1(top_blob_2d, ivec2(gx, y4 + 1), v.g); + image2d_st1(top_blob_2d, ivec2(gx, y4 + 2), v.b); + image2d_st1(top_blob_2d, ivec2(gx, y4 + 3), v.a); + } +#endif + } + else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + int gi = gz * psc(cstep) + gy * psc(w) + gx; + + v = afpvec4(bottom_blob_fp32_data[gi]); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = afpvec4(texelFetch(bottom_blob_3d_fp32, ivec3(gx, gy, gz), 0)); + } +#endif + + if (storage_type_to == 0) + { + ivec4 z4 = ivec4(gz * 4) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = z4 * psc(outcstep) + ivec4(gy * psc(outw) + gx); + + buffer_st1(top_blob_data, v_offset.r, v.r); + buffer_st1(top_blob_data, v_offset.g, v.g); + buffer_st1(top_blob_data, v_offset.b, v.b); + buffer_st1(top_blob_data, v_offset.a, v.a); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int z4 = gz * 4; + + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 0), v.r); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 1), v.g); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 2), v.b); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 3), v.a); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_pack4to8.comp b/source/device/vulkan/shaders/packing_pack4to8.comp new file mode 100644 index 000000000..4dd23773e --- /dev/null +++ b/source/device/vulkan/shaders/packing_pack4to8.comp @@ -0,0 +1,184 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 2) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 2) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec8 v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + ivec2 x2 = ivec2(gx * 2) + ivec2(0, 1); + + ivec2 v_offset = x2; + + v[0] = buffer_ld4(bottom_blob_data, v_offset.r); + v[1] = buffer_ld4(bottom_blob_data, v_offset.g); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int x2 = gx * 2; + + v[0] = image1d_ld4(bottom_blob_1d, x2 + 0); + v[1] = image1d_ld4(bottom_blob_1d, x2 + 1); + } +#endif + + if (storage_type_to == 0) + { + int gi = gx; + + buffer_st8(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image1d_st8(top_blob_1d, gx, v); + } +#endif + } + else if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + ivec2 y2 = ivec2(gy * 2) + ivec2(0, 1); + + ivec2 v_offset = y2 * psc(w) + gx; + + v[0] = buffer_ld4(bottom_blob_data, v_offset.r); + v[1] = buffer_ld4(bottom_blob_data, v_offset.g); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int y2 = gy * 2; + + v[0] = image2d_ld4(bottom_blob_2d, ivec2(gx, y2 + 0)); + v[1] = image2d_ld4(bottom_blob_2d, ivec2(gx, y2 + 1)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gy * psc(outw) + gx; + + buffer_st8(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image2d_st8(top_blob_2d, ivec2(gx, gy), v); + } +#endif + } + 
else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + ivec2 z2 = ivec2(gz * 2) + ivec2(0, 1); + + ivec2 v_offset = z2 * psc(cstep) + ivec2(gy * psc(w) + gx); + + v[0] = buffer_ld4(bottom_blob_data, v_offset.r); + v[1] = buffer_ld4(bottom_blob_data, v_offset.g); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int z2 = gz * 2; + + v[0] = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, z2 + 0)); + v[1] = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, z2 + 1)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st8(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image3d_st8(top_blob_3d, ivec3(gx, gy, gz), v); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_pack4to8_fp16_to_fp32.comp b/source/device/vulkan/shaders/packing_pack4to8_fp16_to_fp32.comp new file mode 100644 index 000000000..defc14089 --- /dev/null +++ b/source/device/vulkan/shaders/packing_pack4to8_fp16_to_fp32.comp @@ -0,0 +1,184 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob_fp32 { mat2x4 top_blob_fp32_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 2) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 2) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 3, rgba32f) writeonly uniform highp image1D top_blob_1d_fp32; +layout (binding = 3, rgba32f) writeonly uniform highp image2D top_blob_2d_fp32; 
+layout (binding = 3, rgba32f) writeonly uniform highp image3D top_blob_3d_fp32; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec8 v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + ivec2 x2 = ivec2(gx * 2) + ivec2(0, 1); + + ivec2 v_offset = x2; + + v[0] = buffer_ld4(bottom_blob_data, v_offset.r); + v[1] = buffer_ld4(bottom_blob_data, v_offset.g); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int x2 = gx * 2; + + v[0] = image1d_ld4(bottom_blob_1d, x2 + 0); + v[1] = image1d_ld4(bottom_blob_1d, x2 + 1); + } +#endif + + if (storage_type_to == 0) + { + int gi = gx; + + top_blob_fp32_data[gi] = mat2x4(vec4(v[0]), vec4(v[1])); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image1d_st8(top_blob_1d_fp32, gx, v); + } +#endif + } + else if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + ivec2 y2 = ivec2(gy * 2) + ivec2(0, 1); + + ivec2 v_offset = y2 * psc(w) + gx; + + v[0] = buffer_ld4(bottom_blob_data, v_offset.r); + v[1] = buffer_ld4(bottom_blob_data, v_offset.g); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int y2 = gy * 2; + + v[0] = image2d_ld4(bottom_blob_2d, ivec2(gx, y2 + 0)); + v[1] = image2d_ld4(bottom_blob_2d, ivec2(gx, y2 + 1)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gy * psc(outw) + gx; + + top_blob_fp32_data[gi] = mat2x4(vec4(v[0]), vec4(v[1])); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image2d_st8(top_blob_2d_fp32, ivec2(gx, gy), v); + } +#endif + } + else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + ivec2 z2 = ivec2(gz * 2) + ivec2(0, 1); + + ivec2 v_offset = z2 * psc(cstep) + ivec2(gy * psc(w) + gx); + + v[0] = buffer_ld4(bottom_blob_data, v_offset.r); + v[1] = buffer_ld4(bottom_blob_data, v_offset.g); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int z2 = gz * 2; + + v[0] = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, z2 + 0)); + v[1] = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, z2 + 1)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + top_blob_fp32_data[gi] = mat2x4(vec4(v[0]), vec4(v[1])); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image3d_st8(top_blob_3d_fp32, ivec3(gx, gy, gz), v); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_pack4to8_fp32_to_fp16.comp b/source/device/vulkan/shaders/packing_pack4to8_fp32_to_fp16.comp new file mode 100644 index 000000000..48ac54ca9 --- /dev/null +++ b/source/device/vulkan/shaders/packing_pack4to8_fp32_to_fp16.comp @@ -0,0 +1,184 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob_fp32 { vec4 bottom_blob_fp32_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform highp sampler1D bottom_blob_1d_fp32; +layout (binding = 2) uniform highp sampler2D bottom_blob_2d_fp32; +layout (binding = 2) uniform highp sampler3D bottom_blob_3d_fp32; +layout (binding = 3, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec8 v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + ivec2 x2 = ivec2(gx * 2) + ivec2(0, 1); + + ivec2 v_offset = x2; + + v[0] = afpvec4(bottom_blob_fp32_data[v_offset.r]); + v[1] = afpvec4(bottom_blob_fp32_data[v_offset.g]); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int x2 = gx * 2; + + v[0] = image1d_ld4(bottom_blob_1d_fp32, x2 + 0); + v[1] = image1d_ld4(bottom_blob_1d_fp32, x2 + 1); + } +#endif + + if (storage_type_to == 0) + { + int gi = gx; + + buffer_st8(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image1d_st8(top_blob_1d, gx, v); + } +#endif + } + else if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + ivec2 y2 = ivec2(gy * 2) + ivec2(0, 1); + + ivec2 v_offset = y2 * psc(w) + gx; + + v[0] = afpvec4(bottom_blob_fp32_data[v_offset.r]); + v[1] = afpvec4(bottom_blob_fp32_data[v_offset.g]); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int y2 = gy * 2; + + v[0] = image2d_ld4(bottom_blob_2d_fp32, ivec2(gx, y2 + 0)); + v[1] = image2d_ld4(bottom_blob_2d_fp32, ivec2(gx, y2 + 1)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gy * psc(outw) + gx; + + 
buffer_st8(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image2d_st8(top_blob_2d, ivec2(gx, gy), v); + } +#endif + } + else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + ivec2 z2 = ivec2(gz * 2) + ivec2(0, 1); + + ivec2 v_offset = z2 * psc(cstep) + ivec2(gy * psc(w) + gx); + + v[0] = afpvec4(bottom_blob_fp32_data[v_offset.r]); + v[1] = afpvec4(bottom_blob_fp32_data[v_offset.g]); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int z2 = gz * 2; + + v[0] = image3d_ld4(bottom_blob_3d_fp32, ivec3(gx, gy, z2 + 0)); + v[1] = image3d_ld4(bottom_blob_3d_fp32, ivec3(gx, gy, z2 + 1)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st8(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image3d_st8(top_blob_3d, ivec3(gx, gy, gz), v); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_pack8.comp b/source/device/vulkan/shaders/packing_pack8.comp new file mode 100644 index 000000000..5b53e5b55 --- /dev/null +++ b/source/device/vulkan/shaders/packing_pack8.comp @@ -0,0 +1,166 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
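+
+// packing_pack8: copy a pack8 blob without changing its packing. Each
+// invocation moves one sfpvec8 element, reading from and writing to either
+// buffer or image storage depending on storage_type_from / storage_type_to.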
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 2) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 2) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec8 v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + int gi = gx; + + v = buffer_ld8(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image1d_ld8(bottom_blob_1d, gx); + } +#endif + + if (storage_type_to == 0) + { + int gi = gx; + + buffer_st8(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image1d_st8(top_blob_1d, gx, v); + } +#endif + } + else if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + int gi = gy * psc(w) + gx; + + v = buffer_ld8(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image2d_ld8(bottom_blob_2d, ivec2(gx, gy)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gy * psc(outw) + gx; + + buffer_st8(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image2d_st8(top_blob_2d, ivec2(gx, gy), v); + } +#endif + } + else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + int gi = gz * psc(cstep) + gy * psc(w) + gx; + + v = buffer_ld8(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + 
buffer_st8(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image3d_st8(top_blob_3d, ivec3(gx, gy, gz), v); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_pack8_fp16_to_fp32.comp b/source/device/vulkan/shaders/packing_pack8_fp16_to_fp32.comp new file mode 100644 index 000000000..9576e59a6 --- /dev/null +++ b/source/device/vulkan/shaders/packing_pack8_fp16_to_fp32.comp @@ -0,0 +1,169 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob_fp32 { mat2x4 top_blob_fp32_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 2) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 2) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 3, rgba32f) writeonly uniform highp image1D top_blob_1d_fp32; +layout (binding = 3, rgba32f) writeonly uniform highp image2D top_blob_2d_fp32; +layout (binding = 3, rgba32f) writeonly uniform highp image3D top_blob_3d_fp32; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec8 v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + int gi = gx; + + v = 
buffer_ld8(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image1d_ld8(bottom_blob_1d, gx); + } +#endif + + if (storage_type_to == 0) + { + int gi = gx; + + top_blob_fp32_data[gi] = mat2x4(v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + imageStore(top_blob_1d_fp32, gx * 2, v[0]); + imageStore(top_blob_1d_fp32, gx * 2 + 1, v[1]); + } +#endif + } + else if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + int gi = gy * psc(w) + gx; + + v = buffer_ld8(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image2d_ld8(bottom_blob_2d, ivec2(gx, gy)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gy * psc(outw) + gx; + + top_blob_fp32_data[gi] = mat2x4(v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + imageStore(top_blob_2d_fp32, ivec2(gx * 2, gy), v[0]); + imageStore(top_blob_2d_fp32, ivec2(gx * 2 + 1, gy), v[1]); + } +#endif + } + else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + int gi = gz * psc(cstep) + gy * psc(w) + gx; + + v = buffer_ld8(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + top_blob_fp32_data[gi] = mat2x4(v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + imageStore(top_blob_3d_fp32, ivec3(gx * 2, gy, gz), v[0]); + imageStore(top_blob_3d_fp32, ivec3(gx * 2 + 1, gy, gz), v[1]); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_pack8_fp32_to_fp16.comp b/source/device/vulkan/shaders/packing_pack8_fp32_to_fp16.comp new file mode 100644 index 000000000..b78422346 --- /dev/null +++ b/source/device/vulkan/shaders/packing_pack8_fp32_to_fp16.comp @@ -0,0 +1,166 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
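+
+// packing_pack8_fp32_to_fp16: narrow a pack8 blob from fp32 storage (one
+// mat2x4 per element) to sfpvec8 storage (fp16 when fp16 storage is enabled).
+// The packing layout is unchanged; only the element precision is converted.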
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob_fp32 { mat2x4 bottom_blob_fp32_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform highp sampler1D bottom_blob_1d_fp32; +layout (binding = 2) uniform highp sampler2D bottom_blob_2d_fp32; +layout (binding = 2) uniform highp sampler3D bottom_blob_3d_fp32; +layout (binding = 3, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec8 v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + int gi = gx; + + v = afpvec8(bottom_blob_fp32_data[gi]); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = afpvec8(texelFetch(bottom_blob_1d_fp32, gx * 2, 0), texelFetch(bottom_blob_1d_fp32, gx * 2 + 1, 0)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gx; + + buffer_st8(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image1d_st8(top_blob_1d, gx, v); + } +#endif + } + else if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + int gi = gy * psc(w) + gx; + + v = afpvec8(bottom_blob_fp32_data[gi]); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = afpvec8(texelFetch(bottom_blob_2d_fp32, ivec2(gx * 2, gy), 0), texelFetch(bottom_blob_2d_fp32, ivec2(gx * 2 + 1, gy), 0)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gy * psc(outw) + gx; + + buffer_st8(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image2d_st8(top_blob_2d, ivec2(gx, gy), v); + } +#endif + } + else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + int gi = gz * psc(cstep) + gy * psc(w) + gx; + + v = afpvec8(bottom_blob_fp32_data[gi]); + } +#if NCNN_image_shader + if (storage_type_from == 
1) + { + v = afpvec8(texelFetch(bottom_blob_3d_fp32, ivec3(gx * 2, gy, gz), 0), texelFetch(bottom_blob_3d_fp32, ivec3(gx * 2 + 1, gy, gz), 0)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st8(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image3d_st8(top_blob_3d, ivec3(gx, gy, gz), v); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_pack8to1.comp b/source/device/vulkan/shaders/packing_pack8to1.comp new file mode 100644 index 000000000..6eed4ce56 --- /dev/null +++ b/source/device/vulkan/shaders/packing_pack8to1.comp @@ -0,0 +1,223 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 2) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 2) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 3, imfmtc1) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 3, imfmtc1) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 3, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if 
(gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + + afpvec8 v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + int gi = gx; + + v = buffer_ld8(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image1d_ld8(bottom_blob_1d, gx); + } +#endif + + if (storage_type_to == 0) + { + ivec4 x4 = ivec4(gx * 8) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = x4; + ivec4 vv_offset = x4 + 4; + + buffer_st1(top_blob_data, v_offset.r, v[0].r); + buffer_st1(top_blob_data, v_offset.g, v[0].g); + buffer_st1(top_blob_data, v_offset.b, v[0].b); + buffer_st1(top_blob_data, v_offset.a, v[0].a); + buffer_st1(top_blob_data, vv_offset.r, v[1].r); + buffer_st1(top_blob_data, vv_offset.g, v[1].g); + buffer_st1(top_blob_data, vv_offset.b, v[1].b); + buffer_st1(top_blob_data, vv_offset.a, v[1].a); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int x4 = gx * 8; + + image1d_st1(top_blob_1d, x4 + 0, v[0].r); + image1d_st1(top_blob_1d, x4 + 1, v[0].g); + image1d_st1(top_blob_1d, x4 + 2, v[0].b); + image1d_st1(top_blob_1d, x4 + 3, v[0].a); + image1d_st1(top_blob_1d, x4 + 4, v[1].r); + image1d_st1(top_blob_1d, x4 + 5, v[1].g); + image1d_st1(top_blob_1d, x4 + 6, v[1].b); + image1d_st1(top_blob_1d, x4 + 7, v[1].a); + } +#endif + } + else if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + int gi = gy * psc(w) + gx; + + v = buffer_ld8(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image2d_ld8(bottom_blob_2d, ivec2(gx, gy)); + } +#endif + + if (storage_type_to == 0) + { + ivec4 y4 = ivec4(gy * 8) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = y4 * psc(outw) + gx; + ivec4 vv_offset = (y4 + 4) * psc(outw) + gx; + + buffer_st1(top_blob_data, v_offset.r, v[0].r); + buffer_st1(top_blob_data, v_offset.g, v[0].g); + buffer_st1(top_blob_data, v_offset.b, v[0].b); + buffer_st1(top_blob_data, v_offset.a, v[0].a); + buffer_st1(top_blob_data, vv_offset.r, v[1].r); + buffer_st1(top_blob_data, vv_offset.g, v[1].g); + buffer_st1(top_blob_data, vv_offset.b, v[1].b); + buffer_st1(top_blob_data, vv_offset.a, v[1].a); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int y4 = gy * 8; + + image2d_st1(top_blob_2d, ivec2(gx, y4 + 0), v[0].r); + image2d_st1(top_blob_2d, ivec2(gx, y4 + 1), v[0].g); + image2d_st1(top_blob_2d, ivec2(gx, y4 + 2), v[0].b); + image2d_st1(top_blob_2d, ivec2(gx, y4 + 3), v[0].a); + image2d_st1(top_blob_2d, ivec2(gx, y4 + 4), v[1].r); + image2d_st1(top_blob_2d, ivec2(gx, y4 + 5), v[1].g); + image2d_st1(top_blob_2d, ivec2(gx, y4 + 6), v[1].b); + image2d_st1(top_blob_2d, ivec2(gx, y4 + 7), v[1].a); + } +#endif + } + else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + int gi = gz * psc(cstep) + gy * psc(w) + gx; + + v = buffer_ld8(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); + } +#endif + + if (storage_type_to == 0) + { + ivec4 z4 = ivec4(gz * 8) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = z4 * psc(outcstep) + ivec4(gy * psc(outw) + gx); + ivec4 vv_offset = (z4 + 4) * psc(outcstep) + ivec4(gy * psc(outw) + gx); + + buffer_st1(top_blob_data, v_offset.r, v[0].r); + buffer_st1(top_blob_data, v_offset.g, v[0].g); + buffer_st1(top_blob_data, v_offset.b, v[0].b); + buffer_st1(top_blob_data, v_offset.a, v[0].a); + buffer_st1(top_blob_data, vv_offset.r, v[1].r); + buffer_st1(top_blob_data, vv_offset.g, v[1].g); + buffer_st1(top_blob_data, vv_offset.b, v[1].b); + buffer_st1(top_blob_data, vv_offset.a, 
v[1].a); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int z4 = gz * 8; + + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 0), v[0].r); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 1), v[0].g); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 2), v[0].b); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 3), v[0].a); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 4), v[1].r); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 5), v[1].g); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 6), v[1].b); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 7), v[1].a); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_pack8to1_fp16_to_fp32.comp b/source/device/vulkan/shaders/packing_pack8to1_fp16_to_fp32.comp new file mode 100644 index 000000000..f670c5443 --- /dev/null +++ b/source/device/vulkan/shaders/packing_pack8to1_fp16_to_fp32.comp @@ -0,0 +1,223 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob_fp32 { float top_blob_fp32_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 2) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 2) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 3, r32f) writeonly uniform highp image1D top_blob_1d_fp32; +layout (binding = 3, r32f) writeonly uniform highp image2D top_blob_2d_fp32; +layout (binding = 3, r32f) writeonly uniform highp image3D top_blob_3d_fp32; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int 
cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + + afpvec8 v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + int gi = gx; + + v = buffer_ld8(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image1d_ld8(bottom_blob_1d, gx); + } +#endif + + if (storage_type_to == 0) + { + ivec4 x4 = ivec4(gx * 8) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = x4; + ivec4 vv_offset = x4 + 4; + + top_blob_fp32_data[v_offset.r] = float(v[0].r); + top_blob_fp32_data[v_offset.g] = float(v[0].g); + top_blob_fp32_data[v_offset.b] = float(v[0].b); + top_blob_fp32_data[v_offset.a] = float(v[0].a); + top_blob_fp32_data[vv_offset.r] = float(v[1].r); + top_blob_fp32_data[vv_offset.g] = float(v[1].g); + top_blob_fp32_data[vv_offset.b] = float(v[1].b); + top_blob_fp32_data[vv_offset.a] = float(v[1].a); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int x4 = gx * 8; + + image1d_st1(top_blob_1d_fp32, x4 + 0, v[0].r); + image1d_st1(top_blob_1d_fp32, x4 + 1, v[0].g); + image1d_st1(top_blob_1d_fp32, x4 + 2, v[0].b); + image1d_st1(top_blob_1d_fp32, x4 + 3, v[0].a); + image1d_st1(top_blob_1d_fp32, x4 + 4, v[1].r); + image1d_st1(top_blob_1d_fp32, x4 + 5, v[1].g); + image1d_st1(top_blob_1d_fp32, x4 + 6, v[1].b); + image1d_st1(top_blob_1d_fp32, x4 + 7, v[1].a); + } +#endif + } + else if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + int gi = gy * psc(w) + gx; + + v = buffer_ld8(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image2d_ld8(bottom_blob_2d, ivec2(gx, gy)); + } +#endif + + if (storage_type_to == 0) + { + ivec4 y4 = ivec4(gy * 8) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = y4 * psc(outw) + gx; + ivec4 vv_offset = (y4 + 4) * psc(outw) + gx; + + top_blob_fp32_data[v_offset.r] = float(v[0].r); + top_blob_fp32_data[v_offset.g] = float(v[0].g); + top_blob_fp32_data[v_offset.b] = float(v[0].b); + top_blob_fp32_data[v_offset.a] = float(v[0].a); + top_blob_fp32_data[vv_offset.r] = float(v[1].r); + top_blob_fp32_data[vv_offset.g] = float(v[1].g); + top_blob_fp32_data[vv_offset.b] = float(v[1].b); + top_blob_fp32_data[vv_offset.a] = float(v[1].a); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int y4 = gy * 8; + + image2d_st1(top_blob_2d_fp32, ivec2(gx, y4 + 0), v[0].r); + image2d_st1(top_blob_2d_fp32, ivec2(gx, y4 + 1), v[0].g); + image2d_st1(top_blob_2d_fp32, ivec2(gx, y4 + 2), v[0].b); + image2d_st1(top_blob_2d_fp32, ivec2(gx, y4 + 3), v[0].a); + image2d_st1(top_blob_2d_fp32, ivec2(gx, y4 + 4), v[1].r); + image2d_st1(top_blob_2d_fp32, ivec2(gx, y4 + 5), v[1].g); + image2d_st1(top_blob_2d_fp32, ivec2(gx, y4 + 6), v[1].b); + image2d_st1(top_blob_2d_fp32, ivec2(gx, y4 + 7), v[1].a); + } +#endif + } + else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + int gi = gz * psc(cstep) + gy * psc(w) + gx; + + v = buffer_ld8(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); + } +#endif + + if (storage_type_to == 0) + { + ivec4 z4 = ivec4(gz * 8) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = z4 * psc(outcstep) + ivec4(gy * psc(outw) + gx); + ivec4 vv_offset = (z4 + 4) * psc(outcstep) + ivec4(gy * psc(outw) + gx); + + top_blob_fp32_data[v_offset.r] = float(v[0].r); + 
top_blob_fp32_data[v_offset.g] = float(v[0].g); + top_blob_fp32_data[v_offset.b] = float(v[0].b); + top_blob_fp32_data[v_offset.a] = float(v[0].a); + top_blob_fp32_data[vv_offset.r] = float(v[1].r); + top_blob_fp32_data[vv_offset.g] = float(v[1].g); + top_blob_fp32_data[vv_offset.b] = float(v[1].b); + top_blob_fp32_data[vv_offset.a] = float(v[1].a); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int z4 = gz * 8; + + image3d_st1(top_blob_3d_fp32, ivec3(gx, gy, z4 + 0), v[0].r); + image3d_st1(top_blob_3d_fp32, ivec3(gx, gy, z4 + 1), v[0].g); + image3d_st1(top_blob_3d_fp32, ivec3(gx, gy, z4 + 2), v[0].b); + image3d_st1(top_blob_3d_fp32, ivec3(gx, gy, z4 + 3), v[0].a); + image3d_st1(top_blob_3d_fp32, ivec3(gx, gy, z4 + 4), v[1].r); + image3d_st1(top_blob_3d_fp32, ivec3(gx, gy, z4 + 5), v[1].g); + image3d_st1(top_blob_3d_fp32, ivec3(gx, gy, z4 + 6), v[1].b); + image3d_st1(top_blob_3d_fp32, ivec3(gx, gy, z4 + 7), v[1].a); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_pack8to1_fp32_to_fp16.comp b/source/device/vulkan/shaders/packing_pack8to1_fp32_to_fp16.comp new file mode 100644 index 000000000..8c162f0f3 --- /dev/null +++ b/source/device/vulkan/shaders/packing_pack8to1_fp32_to_fp16.comp @@ -0,0 +1,223 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
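+
+// packing_pack8to1_fp32_to_fp16: unpack each fp32 pack8 element (mat2x4) into
+// eight consecutive scalar elements. For the buffer path, v_offset / vv_offset
+// hold the eight destination indices along the unpacked axis; the image path
+// writes the eight lanes to eight separate texels.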
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob_fp32 { mat2x4 bottom_blob_fp32_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform highp sampler1D bottom_blob_1d_fp32; +layout (binding = 2) uniform highp sampler2D bottom_blob_2d_fp32; +layout (binding = 2) uniform highp sampler3D bottom_blob_3d_fp32; +layout (binding = 3, imfmtc1) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 3, imfmtc1) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 3, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + + afpvec8 v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + int gi = gx; + + v = afpvec8(bottom_blob_fp32_data[gi]); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = afpvec8(texelFetch(bottom_blob_1d_fp32, gx * 2, 0), texelFetch(bottom_blob_1d_fp32, gx * 2 + 1, 0)); + } +#endif + + if (storage_type_to == 0) + { + ivec4 x4 = ivec4(gx * 8) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = x4; + ivec4 vv_offset = x4 + 4; + + buffer_st1(top_blob_data, v_offset.r, v[0].r); + buffer_st1(top_blob_data, v_offset.g, v[0].g); + buffer_st1(top_blob_data, v_offset.b, v[0].b); + buffer_st1(top_blob_data, v_offset.a, v[0].a); + buffer_st1(top_blob_data, vv_offset.r, v[1].r); + buffer_st1(top_blob_data, vv_offset.g, v[1].g); + buffer_st1(top_blob_data, vv_offset.b, v[1].b); + buffer_st1(top_blob_data, vv_offset.a, v[1].a); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int x4 = gx * 8; + + image1d_st1(top_blob_1d, x4 + 0, v[0].r); + image1d_st1(top_blob_1d, x4 + 1, v[0].g); + image1d_st1(top_blob_1d, x4 + 2, v[0].b); + image1d_st1(top_blob_1d, x4 + 3, v[0].a); + image1d_st1(top_blob_1d, x4 + 4, v[1].r); + image1d_st1(top_blob_1d, x4 + 5, v[1].g); + image1d_st1(top_blob_1d, x4 + 6, v[1].b); + image1d_st1(top_blob_1d, x4 + 7, v[1].a); + } +#endif + } + else 
if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + int gi = gy * psc(w) + gx; + + v = afpvec8(bottom_blob_fp32_data[gi]); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = afpvec8(texelFetch(bottom_blob_2d_fp32, ivec2(gx * 2, gy), 0), texelFetch(bottom_blob_2d_fp32, ivec2(gx * 2 + 1, gy), 0)); + } +#endif + + if (storage_type_to == 0) + { + ivec4 y4 = ivec4(gy * 8) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = y4 * psc(outw) + gx; + ivec4 vv_offset = (y4 + 4) * psc(outw) + gx; + + buffer_st1(top_blob_data, v_offset.r, v[0].r); + buffer_st1(top_blob_data, v_offset.g, v[0].g); + buffer_st1(top_blob_data, v_offset.b, v[0].b); + buffer_st1(top_blob_data, v_offset.a, v[0].a); + buffer_st1(top_blob_data, vv_offset.r, v[1].r); + buffer_st1(top_blob_data, vv_offset.g, v[1].g); + buffer_st1(top_blob_data, vv_offset.b, v[1].b); + buffer_st1(top_blob_data, vv_offset.a, v[1].a); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int y4 = gy * 8; + + image2d_st1(top_blob_2d, ivec2(gx, y4 + 0), v[0].r); + image2d_st1(top_blob_2d, ivec2(gx, y4 + 1), v[0].g); + image2d_st1(top_blob_2d, ivec2(gx, y4 + 2), v[0].b); + image2d_st1(top_blob_2d, ivec2(gx, y4 + 3), v[0].a); + image2d_st1(top_blob_2d, ivec2(gx, y4 + 4), v[1].r); + image2d_st1(top_blob_2d, ivec2(gx, y4 + 5), v[1].g); + image2d_st1(top_blob_2d, ivec2(gx, y4 + 6), v[1].b); + image2d_st1(top_blob_2d, ivec2(gx, y4 + 7), v[1].a); + } +#endif + } + else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + int gi = gz * psc(cstep) + gy * psc(w) + gx; + + v = afpvec8(bottom_blob_fp32_data[gi]); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = afpvec8(texelFetch(bottom_blob_3d_fp32, ivec3(gx * 2, gy, gz), 0), texelFetch(bottom_blob_3d_fp32, ivec3(gx * 2 + 1, gy, gz), 0)); + } +#endif + + if (storage_type_to == 0) + { + ivec4 z4 = ivec4(gz * 8) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = z4 * psc(outcstep) + ivec4(gy * psc(outw) + gx); + ivec4 vv_offset = (z4 + 4) * psc(outcstep) + ivec4(gy * psc(outw) + gx); + + buffer_st1(top_blob_data, v_offset.r, v[0].r); + buffer_st1(top_blob_data, v_offset.g, v[0].g); + buffer_st1(top_blob_data, v_offset.b, v[0].b); + buffer_st1(top_blob_data, v_offset.a, v[0].a); + buffer_st1(top_blob_data, vv_offset.r, v[1].r); + buffer_st1(top_blob_data, vv_offset.g, v[1].g); + buffer_st1(top_blob_data, vv_offset.b, v[1].b); + buffer_st1(top_blob_data, vv_offset.a, v[1].a); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int z4 = gz * 8; + + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 0), v[0].r); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 1), v[0].g); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 2), v[0].b); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 3), v[0].a); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 4), v[1].r); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 5), v[1].g); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 6), v[1].b); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 7), v[1].a); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_pack8to4.comp b/source/device/vulkan/shaders/packing_pack8to4.comp new file mode 100644 index 000000000..4a61fb77e --- /dev/null +++ b/source/device/vulkan/shaders/packing_pack8to4.comp @@ -0,0 +1,184 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. 
+// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 2) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 2) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + + afpvec8 v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + int gi = gx; + + v = buffer_ld8(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image1d_ld8(bottom_blob_1d, gx); + } +#endif + + if (storage_type_to == 0) + { + ivec2 x2 = ivec2(gx * 2) + ivec2(0, 1); + + ivec2 v_offset = x2; + + buffer_st4(top_blob_data, v_offset.r, v[0]); + buffer_st4(top_blob_data, v_offset.g, v[1]); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int x2 = gx * 2; + + image1d_st4(top_blob_1d, x2 + 0, v[0]); + image1d_st4(top_blob_1d, x2 + 1, v[1]); + } +#endif + } + else if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + int gi = gy * psc(w) + gx; + + v = buffer_ld8(bottom_blob_data, gi); + } +#if 
NCNN_image_shader + if (storage_type_from == 1) + { + v = image2d_ld8(bottom_blob_2d, ivec2(gx, gy)); + } +#endif + + if (storage_type_to == 0) + { + ivec2 y2 = ivec2(gy * 2) + ivec2(0, 1); + + ivec2 v_offset = y2 * psc(outw) + gx; + + buffer_st4(top_blob_data, v_offset.r, v[0]); + buffer_st4(top_blob_data, v_offset.g, v[1]); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int y2 = gy * 2; + + image2d_st4(top_blob_2d, ivec2(gx, y2 + 0), v[0]); + image2d_st4(top_blob_2d, ivec2(gx, y2 + 1), v[1]); + } +#endif + } + else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + int gi = gz * psc(cstep) + gy * psc(w) + gx; + + v = buffer_ld8(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); + } +#endif + + if (storage_type_to == 0) + { + ivec2 z2 = ivec2(gz * 2) + ivec2(0, 1); + + ivec2 v_offset = z2 * psc(outcstep) + ivec2(gy * psc(outw) + gx); + + buffer_st4(top_blob_data, v_offset.r, v[0]); + buffer_st4(top_blob_data, v_offset.g, v[1]); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int z2 = gz * 2; + + image3d_st4(top_blob_3d, ivec3(gx, gy, z2 + 0), v[0]); + image3d_st4(top_blob_3d, ivec3(gx, gy, z2 + 1), v[1]); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_pack8to4_fp16_to_fp32.comp b/source/device/vulkan/shaders/packing_pack8to4_fp16_to_fp32.comp new file mode 100644 index 000000000..564356caa --- /dev/null +++ b/source/device/vulkan/shaders/packing_pack8to4_fp16_to_fp32.comp @@ -0,0 +1,184 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
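+
+// packing_pack8to4_fp16_to_fp32: split each fp16 pack8 element into two fp32
+// pack4 elements. Each invocation loads one sfpvec8 and stores two vec4 values
+// at adjacent pack4 positions along the packed axis, widening to fp32.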
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob_fp32 { vec4 top_blob_fp32_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 2) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 2) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 3, rgba32f) writeonly uniform highp image1D top_blob_1d_fp32; +layout (binding = 3, rgba32f) writeonly uniform highp image2D top_blob_2d_fp32; +layout (binding = 3, rgba32f) writeonly uniform highp image3D top_blob_3d_fp32; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + + afpvec8 v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + int gi = gx; + + v = buffer_ld8(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image1d_ld8(bottom_blob_1d, gx); + } +#endif + + if (storage_type_to == 0) + { + ivec2 x2 = ivec2(gx * 2) + ivec2(0, 1); + + ivec2 v_offset = x2; + + top_blob_fp32_data[v_offset.r] = vec4(v[0]); + top_blob_fp32_data[v_offset.g] = vec4(v[1]); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int x2 = gx * 2; + + imageStore(top_blob_1d_fp32, x2 + 0, v[0]); + imageStore(top_blob_1d_fp32, x2 + 1, v[1]); + } +#endif + } + else if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + int gi = gy * psc(w) + gx; + + v = buffer_ld8(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image2d_ld8(bottom_blob_2d, ivec2(gx, gy)); + } +#endif + + if (storage_type_to == 0) + { + ivec2 y2 = ivec2(gy * 2) + ivec2(0, 1); + + ivec2 v_offset = y2 * psc(outw) + gx; + + top_blob_fp32_data[v_offset.r] = vec4(v[0]); + top_blob_fp32_data[v_offset.g] = vec4(v[1]); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int y2 = gy * 2; + + imageStore(top_blob_2d_fp32, ivec2(gx, y2 + 0), v[0]); + imageStore(top_blob_2d_fp32, ivec2(gx, y2 + 1), v[1]); + } 
+#endif + } + else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + int gi = gz * psc(cstep) + gy * psc(w) + gx; + + v = buffer_ld8(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); + } +#endif + + if (storage_type_to == 0) + { + ivec2 z2 = ivec2(gz * 2) + ivec2(0, 1); + + ivec2 v_offset = z2 * psc(outcstep) + ivec2(gy * psc(outw) + gx); + + top_blob_fp32_data[v_offset.r] = vec4(v[0]); + top_blob_fp32_data[v_offset.g] = vec4(v[1]); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int z2 = gz * 2; + + imageStore(top_blob_3d_fp32, ivec3(gx, gy, z2 + 0), v[0]); + imageStore(top_blob_3d_fp32, ivec3(gx, gy, z2 + 1), v[1]); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_pack8to4_fp32_to_fp16.comp b/source/device/vulkan/shaders/packing_pack8to4_fp32_to_fp16.comp new file mode 100644 index 000000000..762977406 --- /dev/null +++ b/source/device/vulkan/shaders/packing_pack8to4_fp32_to_fp16.comp @@ -0,0 +1,184 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob_fp32 { mat2x4 bottom_blob_fp32_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform highp sampler1D bottom_blob_1d_fp32; +layout (binding = 2) uniform highp sampler2D bottom_blob_2d_fp32; +layout (binding = 2) uniform highp sampler3D bottom_blob_3d_fp32; +layout (binding = 3, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image2D 
top_blob_2d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + + afpvec8 v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + int gi = gx; + + v = afpvec8(bottom_blob_fp32_data[gi]); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = afpvec8(texelFetch(bottom_blob_1d_fp32, gx * 2, 0), texelFetch(bottom_blob_1d_fp32, gx * 2 + 1, 0)); + } +#endif + + if (storage_type_to == 0) + { + ivec2 x2 = ivec2(gx * 2) + ivec2(0, 1); + + ivec2 v_offset = x2; + + buffer_st4(top_blob_data, v_offset.r, v[0]); + buffer_st4(top_blob_data, v_offset.g, v[1]); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int x2 = gx * 2; + + image1d_st4(top_blob_1d, x2 + 0, v[0]); + image1d_st4(top_blob_1d, x2 + 1, v[1]); + } +#endif + } + else if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + int gi = gy * psc(w) + gx; + + v = afpvec8(bottom_blob_fp32_data[gi]); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = afpvec8(texelFetch(bottom_blob_2d_fp32, ivec2(gx * 2, gy), 0), texelFetch(bottom_blob_2d_fp32, ivec2(gx * 2 + 1, gy), 0)); + } +#endif + + if (storage_type_to == 0) + { + ivec2 y2 = ivec2(gy * 2) + ivec2(0, 1); + + ivec2 v_offset = y2 * psc(outw) + gx; + + buffer_st4(top_blob_data, v_offset.r, v[0]); + buffer_st4(top_blob_data, v_offset.g, v[1]); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int y2 = gy * 2; + + image2d_st4(top_blob_2d, ivec2(gx, y2 + 0), v[0]); + image2d_st4(top_blob_2d, ivec2(gx, y2 + 1), v[1]); + } +#endif + } + else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + int gi = gz * psc(cstep) + gy * psc(w) + gx; + + v = afpvec8(bottom_blob_fp32_data[gi]); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = afpvec8(texelFetch(bottom_blob_3d_fp32, ivec3(gx * 2, gy, gz), 0), texelFetch(bottom_blob_3d_fp32, ivec3(gx * 2 + 1, gy, gz), 0)); + } +#endif + + if (storage_type_to == 0) + { + ivec2 z2 = ivec2(gz * 2) + ivec2(0, 1); + + ivec2 v_offset = z2 * psc(outcstep) + ivec2(gy * psc(outw) + gx); + + buffer_st4(top_blob_data, v_offset.r, v[0]); + buffer_st4(top_blob_data, v_offset.g, v[1]); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int z2 = gz * 2; + + image3d_st4(top_blob_3d, ivec3(gx, gy, z2 + 0), v[0]); + image3d_st4(top_blob_3d, ivec3(gx, gy, z2 + 1), v[1]); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/padding.comp b/source/device/vulkan/shaders/padding.comp new file mode 100644 index 000000000..9a5dd1c8b --- /dev/null +++ b/source/device/vulkan/shaders/padding.comp @@ -0,0 +1,145 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int type = 1; +layout (constant_id = 1) const float value = 0; +layout (constant_id = 2) const int per_channel_pad = 0; + +#define shape_constant_id_offset 3 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +//layout (binding = 2) uniform unfp sampler1D per_channel_pad_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +//layout (binding = 2) readonly buffer per_channel_pad_blob { sfp per_channel_pad_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; + + int left; + int top; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + int x = gx - p.left; + int y = gy - p.top; + + if (type == 0) + { + if (x >= 0 && x < psc(w) && y >= 0 && y < psc(h)) + { +#if NCNN_image_shader + image3d_cp1(top_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(x, y, gz)); +#else + int v_offset = gz * psc(cstep) + y * psc(w) + x; + buffer_cp1(top_blob_data, gi, bottom_blob_data, v_offset); +#endif + } +/* + else if (per_channel_pad == 1) + { +#if NCNN_image_shader + afp v = image1d_ld1(per_channel_pad_blob, gz); + image3d_st1(top_blob, ivec3(gx, gy, gz), v); +#else + buffer_cp1(top_blob_data, gi, per_channel_pad_blob_data, gz); +#endif + } +*/ + else + { + afp v = afp(value); +#if NCNN_image_shader + image3d_st1(top_blob, ivec3(gx, gy, gz), v); +#else + buffer_st1(top_blob_data, gi, v); +#endif + } + } + if (type == 1) + { + x = clamp(x, 0, psc(w) - 1); + y = clamp(y, 0, psc(h) - 1); + +#if NCNN_image_shader + image3d_cp1(top_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(x, 
y, gz)); +#else + int v_offset = gz * psc(cstep) + y * psc(w) + x; + buffer_cp1(top_blob_data, gi, bottom_blob_data, v_offset); +#endif + } + if (type == 2) + { + x = abs(x); + y = abs(y); + x = (psc(w) - 1) - abs(x - (psc(w) - 1)); + y = (psc(h) - 1) - abs(y - (psc(h) - 1)); + +#if NCNN_image_shader + image3d_cp1(top_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(x, y, gz)); +#else + int v_offset = gz * psc(cstep) + y * psc(w) + x; + buffer_cp1(top_blob_data, gi, bottom_blob_data, v_offset); +#endif + } +} diff --git a/source/device/vulkan/shaders/padding_pack4.comp b/source/device/vulkan/shaders/padding_pack4.comp new file mode 100644 index 000000000..9f8cf99af --- /dev/null +++ b/source/device/vulkan/shaders/padding_pack4.comp @@ -0,0 +1,144 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int type = 1; +layout (constant_id = 1) const float value = 0; +layout (constant_id = 2) const int per_channel_pad = 0; + +#define shape_constant_id_offset 3 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +// layout (binding = 2) uniform unfp sampler1D per_channel_pad_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +// layout (binding = 2) readonly buffer per_channel_pad_blob { sfpvec4 per_channel_pad_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; + + int left; + int top; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = 
int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + int x = gx - p.left; + int y = gy - p.top; + + if (type == 0) + { + if (x >= 0 && x < psc(w) && y >= 0 && y < psc(h)) + { +#if NCNN_image_shader + image3d_cp4(top_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(x, y, gz)); +#else + int v_offset = gz * psc(cstep) + y * psc(w) + x; + buffer_cp4(top_blob_data, gi, bottom_blob_data, v_offset); +#endif + } +// else if (per_channel_pad == 1) +// { +// #if NCNN_image_shader +// afpvec4 v = image1d_ld4(per_channel_pad_blob, gz); +// image3d_st4(top_blob, ivec3(gx, gy, gz), v); +// #else +// buffer_cp4(top_blob_data, gi, per_channel_pad_blob_data, gz); +// #endif +// } + else + { + afpvec4 v = afpvec4(value); +#if NCNN_image_shader + image3d_st4(top_blob, ivec3(gx, gy, gz), v); +#else + buffer_st4(top_blob_data, gi, v); +#endif + } + + } + if (type == 1) + { + x = clamp(x, 0, psc(w) - 1); + y = clamp(y, 0, psc(h) - 1); + +#if NCNN_image_shader + image3d_cp4(top_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(x, y, gz)); +#else + int v_offset = gz * psc(cstep) + y * psc(w) + x; + buffer_cp4(top_blob_data, gi, bottom_blob_data, v_offset); +#endif + } + if (type == 2) + { + x = abs(x); + y = abs(y); + x = (psc(w) - 1) - abs(x - (psc(w) - 1)); + y = (psc(h) - 1) - abs(y - (psc(h) - 1)); + +#if NCNN_image_shader + image3d_cp4(top_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(x, y, gz)); +#else + int v_offset = gz * psc(cstep) + y * psc(w) + x; + buffer_cp4(top_blob_data, gi, bottom_blob_data, v_offset); +#endif + } +} diff --git a/source/device/vulkan/shaders/padding_pack8.comp b/source/device/vulkan/shaders/padding_pack8.comp new file mode 100644 index 000000000..b1d84887f --- /dev/null +++ b/source/device/vulkan/shaders/padding_pack8.comp @@ -0,0 +1,144 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
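+
+// Pads a pack8 blob; p.left / p.top give the offset of the source region inside the output.
+// type 0 copies the source where it overlaps and fills the border with the constant `value`
+// (the per_channel_pad path is commented out in this port), type 1 replicates the edge
+// (clamp), type 2 reflects across the edge.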
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int type = 1; +layout (constant_id = 1) const float value = 0; +layout (constant_id = 2) const int per_channel_pad = 0; + +#define shape_constant_id_offset 3 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +// layout (binding = 2) uniform unfp sampler1D per_channel_pad_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +// layout (binding = 2) readonly buffer per_channel_pad_blob { sfpvec8 per_channel_pad_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; + + int left; + int top; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + int x = gx - p.left; + int y = gy - p.top; + + if (type == 0) + { + if (x >= 0 && x < psc(w) && y >= 0 && y < psc(h)) + { +#if NCNN_image_shader + image3d_cp8(top_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(x, y, gz)); +#else + int v_offset = gz * psc(cstep) + y * psc(w) + x; + buffer_cp8(top_blob_data, gi, bottom_blob_data, v_offset); +#endif + } +// else if (per_channel_pad == 1) +// { +// #if NCNN_image_shader +// afpvec8 v = image1d_ld8(per_channel_pad_blob, gz); +// image3d_st8(top_blob, ivec3(gx, gy, gz), v); +// #else +// buffer_cp8(top_blob_data, gi, per_channel_pad_blob_data, gz); +// #endif +// } + else + { + afpvec8 v = afpvec8(afpvec4(value), afpvec4(value)); +#if NCNN_image_shader + image3d_st8(top_blob, ivec3(gx, gy, gz), v); +#else + buffer_st8(top_blob_data, gi, v); +#endif + } + } + if (type == 1) + { + x = clamp(x, 0, psc(w) - 1); + y = clamp(y, 0, psc(h) - 1); + +#if NCNN_image_shader + image3d_cp8(top_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(x, y, gz)); +#else + int v_offset = gz * psc(cstep) + y * psc(w) + x; + buffer_cp8(top_blob_data, gi, bottom_blob_data, v_offset); +#endif + } + if (type == 2) + { + x = abs(x); + y = abs(y); + x = (psc(w) - 1) - abs(x - (psc(w) - 1)); + y = (psc(h) - 1) - abs(y - (psc(h) - 1)); + +#if NCNN_image_shader + 
image3d_cp8(top_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(x, y, gz)); +#else + int v_offset = gz * psc(cstep) + y * psc(w) + x; + buffer_cp8(top_blob_data, gi, bottom_blob_data, v_offset); +#endif + } +} diff --git a/source/device/vulkan/shaders/permute.comp b/source/device/vulkan/shaders/permute.comp new file mode 100644 index 000000000..613734a68 --- /dev/null +++ b/source/device/vulkan/shaders/permute.comp @@ -0,0 +1,186 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int order_type = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + +#if NCNN_image_shader + if (psc(dims) == 2) + { + // order_type + // 0 = w h + // 1 = h w + + if (order_type == 0) + { + image2d_cp1(top_blob_2d, ivec2(gx, gy), bottom_blob_2d, ivec2(gx, gy)); + } + if (order_type == 1) + { + image2d_cp1(top_blob_2d, ivec2(gx, gy), bottom_blob_2d, ivec2(gy, gx)); + } + } + else // if (psc(dims) == 3) + { + // order_type + // 0 = w h c + // 1 = h w c + // 2 = w c h + // 3 
= c w h + // 4 = h c w + // 5 = c h w + + if (order_type == 0) + { + image3d_cp1(top_blob_3d, ivec3(gx, gy, gz), bottom_blob_3d, ivec3(gx, gy, gz)); + } + if (order_type == 1) + { + image3d_cp1(top_blob_3d, ivec3(gx, gy, gz), bottom_blob_3d, ivec3(gy, gx, gz)); + } + if (order_type == 2) + { + image3d_cp1(top_blob_3d, ivec3(gx, gy, gz), bottom_blob_3d, ivec3(gx, gz, gy)); + } + if (order_type == 3) + { + image3d_cp1(top_blob_3d, ivec3(gx, gy, gz), bottom_blob_3d, ivec3(gy, gz, gx)); + } + if (order_type == 4) + { + image3d_cp1(top_blob_3d, ivec3(gx, gy, gz), bottom_blob_3d, ivec3(gz, gx, gy)); + } + if (order_type == 5) + { + image3d_cp1(top_blob_3d, ivec3(gx, gy, gz), bottom_blob_3d, ivec3(gz, gy, gx)); + } + } +#else + int v_offset; + + if (psc(dims) == 2) + { + // order_type + // 0 = w h + // 1 = h w + + if (order_type == 0) + { + v_offset = gy * psc(w) + gx; + } + if (order_type == 1) + { + v_offset = gx * psc(w) + gy; + } + } + else // if (psc(dims) == 3) + { + // order_type + // 0 = w h c + // 1 = h w c + // 2 = w c h + // 3 = c w h + // 4 = h c w + // 5 = c h w + + if (order_type == 0) + { + v_offset = gz * psc(cstep) + gy * psc(w) + gx; + } + if (order_type == 1) + { + v_offset = gz * psc(cstep) + gx * psc(w) + gy; + } + if (order_type == 2) + { + v_offset = gy * psc(cstep) + gz * psc(w) + gx; + } + if (order_type == 3) + { + v_offset = gx * psc(cstep) + gz * psc(w) + gy; + } + if (order_type == 4) + { + v_offset = gy * psc(cstep) + gx * psc(w) + gz; + } + if (order_type == 5) + { + v_offset = gx * psc(cstep) + gy * psc(w) + gz; + } + } + + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_cp1(top_blob_data, gi, bottom_blob_data, v_offset); +#endif +} diff --git a/source/device/vulkan/shaders/permute_pack1to4.comp b/source/device/vulkan/shaders/permute_pack1to4.comp new file mode 100644 index 000000000..d1ad932ff --- /dev/null +++ b/source/device/vulkan/shaders/permute_pack1to4.comp @@ -0,0 +1,234 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
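+
+// Permutes a pack1 blob into a pack4 blob: order_type (0..5, see the table below) selects the
+// axis order, and each invocation gathers four scalars from the permuted source positions,
+// one per output lane, into a single pack4 element.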
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int order_type = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + +#if NCNN_image_shader + if (psc(dims) == 2) + { + // order_type + // 0 = w h + // 1 = h w + + afpvec4 v; + + if (order_type == 0) + { + ivec4 y4 = gy * 4 + ivec4(0, 1, 2, 3); + + v.r = image2d_ld1(bottom_blob_2d, ivec2(gx, y4.r)); + v.g = image2d_ld1(bottom_blob_2d, ivec2(gx, y4.g)); + v.b = image2d_ld1(bottom_blob_2d, ivec2(gx, y4.b)); + v.a = image2d_ld1(bottom_blob_2d, ivec2(gx, y4.a)); + } + if (order_type == 1) + { + ivec4 x4 = gy * 4 + ivec4(0, 1, 2, 3); + + v.r = image2d_ld1(bottom_blob_2d, ivec2(x4.r, gx)); + v.g = image2d_ld1(bottom_blob_2d, ivec2(x4.g, gx)); + v.b = image2d_ld1(bottom_blob_2d, ivec2(x4.b, gx)); + v.a = image2d_ld1(bottom_blob_2d, ivec2(x4.a, gx)); + } + + image2d_st4(top_blob_2d, ivec2(gx, gy), v); + } + else // if (psc(dims) == 3) + { + // order_type + // 0 = w h c + // 1 = h w c + // 2 = w c h + // 3 = c w h + // 4 = h c w + // 5 = c h w + + afpvec4 v; + + if (order_type == 0) + { + ivec4 z4 = gz * 4 + ivec4(0, 1, 2, 3); + + v.r = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4.r)); + v.g = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4.g)); + v.b = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4.b)); + v.a = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4.a)); + } + if (order_type == 1) + { + ivec4 z4 = gz * 4 + ivec4(0, 1, 2, 3); + + v.r = image3d_ld1(bottom_blob_3d, ivec3(gy, gx, z4.r)); + v.g = image3d_ld1(bottom_blob_3d, ivec3(gy, gx, z4.g)); + v.b = image3d_ld1(bottom_blob_3d, ivec3(gy, gx, z4.b)); + v.a = image3d_ld1(bottom_blob_3d, ivec3(gy, gx, z4.a)); + } + if (order_type == 2) + { + ivec4 y4 = gz * 4 + 
ivec4(0, 1, 2, 3); + + v.r = image3d_ld1(bottom_blob_3d, ivec3(gx, y4.r, gy)); + v.g = image3d_ld1(bottom_blob_3d, ivec3(gx, y4.g, gy)); + v.b = image3d_ld1(bottom_blob_3d, ivec3(gx, y4.b, gy)); + v.a = image3d_ld1(bottom_blob_3d, ivec3(gx, y4.a, gy)); + } + if (order_type == 3) + { + ivec4 y4 = gz * 4 + ivec4(0, 1, 2, 3); + + v.r = image3d_ld1(bottom_blob_3d, ivec3(gy, y4.r, gx)); + v.g = image3d_ld1(bottom_blob_3d, ivec3(gy, y4.g, gx)); + v.b = image3d_ld1(bottom_blob_3d, ivec3(gy, y4.b, gx)); + v.a = image3d_ld1(bottom_blob_3d, ivec3(gy, y4.a, gx)); + } + if (order_type == 4) + { + ivec4 x4 = gz * 4 + ivec4(0, 1, 2, 3); + + v.r = image3d_ld1(bottom_blob_3d, ivec3(x4.r, gx, gy)); + v.g = image3d_ld1(bottom_blob_3d, ivec3(x4.g, gx, gy)); + v.b = image3d_ld1(bottom_blob_3d, ivec3(x4.b, gx, gy)); + v.a = image3d_ld1(bottom_blob_3d, ivec3(x4.a, gx, gy)); + } + if (order_type == 5) + { + ivec4 x4 = gz * 4 + ivec4(0, 1, 2, 3); + + v.r = image3d_ld1(bottom_blob_3d, ivec3(x4.r, gy, gx)); + v.g = image3d_ld1(bottom_blob_3d, ivec3(x4.g, gy, gx)); + v.b = image3d_ld1(bottom_blob_3d, ivec3(x4.b, gy, gx)); + v.a = image3d_ld1(bottom_blob_3d, ivec3(x4.a, gy, gx)); + } + + image3d_st4(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else + ivec4 v_offset; + + if (psc(dims) == 2) + { + // order_type + // 0 = w h + // 1 = h w + + if (order_type == 0) + { + v_offset = (gy * 4 + ivec4(0, 1, 2, 3)) * psc(w) + gx; + } + if (order_type == 1) + { + v_offset = gx * psc(w) + (gy * 4 + ivec4(0, 1, 2, 3)); + } + } + else // if (psc(dims) == 3) + { + // order_type + // 0 = w h c + // 1 = h w c + // 2 = w c h + // 3 = c w h + // 4 = h c w + // 5 = c h w + + if (order_type == 0) + { + v_offset = (gz * 4 + ivec4(0, 1, 2, 3)) * psc(cstep) + gy * psc(w) + gx; + } + if (order_type == 1) + { + v_offset = (gz * 4 + ivec4(0, 1, 2, 3)) * psc(cstep) + gx * psc(w) + gy; + } + if (order_type == 2) + { + v_offset = gy * psc(cstep) + (gz * 4 + ivec4(0, 1, 2, 3)) * psc(w) + gx; + } + if (order_type == 3) + { + v_offset = gx * psc(cstep) + (gz * 4 + ivec4(0, 1, 2, 3)) * psc(w) + gy; + } + if (order_type == 4) + { + v_offset = gy * psc(cstep) + gx * psc(w) + (gz * 4 + ivec4(0, 1, 2, 3)); + } + if (order_type == 5) + { + v_offset = gx * psc(cstep) + gy * psc(w) + (gz * 4 + ivec4(0, 1, 2, 3)); + } + } + + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_cp1to4(top_blob_data, gi, bottom_blob_data, v_offset); +#endif +} diff --git a/source/device/vulkan/shaders/permute_pack1to8.comp b/source/device/vulkan/shaders/permute_pack1to8.comp new file mode 100644 index 000000000..816a94268 --- /dev/null +++ b/source/device/vulkan/shaders/permute_pack1to8.comp @@ -0,0 +1,284 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
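+
+// Permutes a pack1 blob into a pack8 blob: same axis reordering as the pack1-to-4 shader,
+// but each invocation gathers eight scalars (two groups of four offsets) from the permuted
+// source positions into a single pack8 element.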
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int order_type = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + +#if NCNN_image_shader + if (psc(dims) == 2) + { + // order_type + // 0 = w h + // 1 = h w + + afpvec8 v; + + if (order_type == 0) + { + ivec4 y4 = gy * 8 + ivec4(0, 1, 2, 3); + ivec4 yy4 = y4 + 4; + + v[0].r = image2d_ld1(bottom_blob_2d, ivec2(gx, y4.r)); + v[0].g = image2d_ld1(bottom_blob_2d, ivec2(gx, y4.g)); + v[0].b = image2d_ld1(bottom_blob_2d, ivec2(gx, y4.b)); + v[0].a = image2d_ld1(bottom_blob_2d, ivec2(gx, y4.a)); + v[1].r = image2d_ld1(bottom_blob_2d, ivec2(gx, yy4.r)); + v[1].g = image2d_ld1(bottom_blob_2d, ivec2(gx, yy4.g)); + v[1].b = image2d_ld1(bottom_blob_2d, ivec2(gx, yy4.b)); + v[1].a = image2d_ld1(bottom_blob_2d, ivec2(gx, yy4.a)); + } + if (order_type == 1) + { + ivec4 x4 = gy * 8 + ivec4(0, 1, 2, 3); + ivec4 xx4 = x4 + 4; + + v[0].r = image2d_ld1(bottom_blob_2d, ivec2(x4.r, gx)); + v[0].g = image2d_ld1(bottom_blob_2d, ivec2(x4.g, gx)); + v[0].b = image2d_ld1(bottom_blob_2d, ivec2(x4.b, gx)); + v[0].a = image2d_ld1(bottom_blob_2d, ivec2(x4.a, gx)); + v[1].r = image2d_ld1(bottom_blob_2d, ivec2(xx4.r, gx)); + v[1].g = image2d_ld1(bottom_blob_2d, ivec2(xx4.g, gx)); + v[1].b = image2d_ld1(bottom_blob_2d, ivec2(xx4.b, gx)); + v[1].a = image2d_ld1(bottom_blob_2d, ivec2(xx4.a, gx)); + } + + image2d_st8(top_blob_2d, ivec2(gx, gy), v); + } + else // if (psc(dims) == 3) + { + // order_type + // 0 = w h c + // 1 = h w c + // 2 = w c h + // 3 = c w h + // 4 = h c w + // 5 = c h w + + afpvec8 v; + + if (order_type == 0) + { + ivec4 z4 = gz * 8 + ivec4(0, 1, 2, 3); + ivec4 zz4 = 
z4 + 4; + + v[0].r = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4.r)); + v[0].g = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4.g)); + v[0].b = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4.b)); + v[0].a = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4.a)); + v[1].r = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, zz4.r)); + v[1].g = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, zz4.g)); + v[1].b = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, zz4.b)); + v[1].a = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, zz4.a)); + } + if (order_type == 1) + { + ivec4 z4 = gz * 8 + ivec4(0, 1, 2, 3); + ivec4 zz4 = z4 + 4; + + v[0].r = image3d_ld1(bottom_blob_3d, ivec3(gy, gx, z4.r)); + v[0].g = image3d_ld1(bottom_blob_3d, ivec3(gy, gx, z4.g)); + v[0].b = image3d_ld1(bottom_blob_3d, ivec3(gy, gx, z4.b)); + v[0].a = image3d_ld1(bottom_blob_3d, ivec3(gy, gx, z4.a)); + v[1].r = image3d_ld1(bottom_blob_3d, ivec3(gy, gx, zz4.r)); + v[1].g = image3d_ld1(bottom_blob_3d, ivec3(gy, gx, zz4.g)); + v[1].b = image3d_ld1(bottom_blob_3d, ivec3(gy, gx, zz4.b)); + v[1].a = image3d_ld1(bottom_blob_3d, ivec3(gy, gx, zz4.a)); + } + if (order_type == 2) + { + ivec4 y4 = gz * 8 + ivec4(0, 1, 2, 3); + ivec4 yy4 = y4 + 4; + + v[0].r = image3d_ld1(bottom_blob_3d, ivec3(gx, y4.r, gy)); + v[0].g = image3d_ld1(bottom_blob_3d, ivec3(gx, y4.g, gy)); + v[0].b = image3d_ld1(bottom_blob_3d, ivec3(gx, y4.b, gy)); + v[0].a = image3d_ld1(bottom_blob_3d, ivec3(gx, y4.a, gy)); + v[1].r = image3d_ld1(bottom_blob_3d, ivec3(gx, yy4.r, gy)); + v[1].g = image3d_ld1(bottom_blob_3d, ivec3(gx, yy4.g, gy)); + v[1].b = image3d_ld1(bottom_blob_3d, ivec3(gx, yy4.b, gy)); + v[1].a = image3d_ld1(bottom_blob_3d, ivec3(gx, yy4.a, gy)); + } + if (order_type == 3) + { + ivec4 y4 = gz * 8 + ivec4(0, 1, 2, 3); + ivec4 yy4 = y4 + 4; + + v[0].r = image3d_ld1(bottom_blob_3d, ivec3(gy, y4.r, gx)); + v[0].g = image3d_ld1(bottom_blob_3d, ivec3(gy, y4.g, gx)); + v[0].b = image3d_ld1(bottom_blob_3d, ivec3(gy, y4.b, gx)); + v[0].a = image3d_ld1(bottom_blob_3d, ivec3(gy, y4.a, gx)); + v[1].r = image3d_ld1(bottom_blob_3d, ivec3(gy, yy4.r, gx)); + v[1].g = image3d_ld1(bottom_blob_3d, ivec3(gy, yy4.g, gx)); + v[1].b = image3d_ld1(bottom_blob_3d, ivec3(gy, yy4.b, gx)); + v[1].a = image3d_ld1(bottom_blob_3d, ivec3(gy, yy4.a, gx)); + } + if (order_type == 4) + { + ivec4 x4 = gz * 8 + ivec4(0, 1, 2, 3); + ivec4 xx4 = x4 + 4; + + v[0].r = image3d_ld1(bottom_blob_3d, ivec3(x4.r, gx, gy)); + v[0].g = image3d_ld1(bottom_blob_3d, ivec3(x4.g, gx, gy)); + v[0].b = image3d_ld1(bottom_blob_3d, ivec3(x4.b, gx, gy)); + v[0].a = image3d_ld1(bottom_blob_3d, ivec3(x4.a, gx, gy)); + v[1].r = image3d_ld1(bottom_blob_3d, ivec3(xx4.r, gx, gy)); + v[1].g = image3d_ld1(bottom_blob_3d, ivec3(xx4.g, gx, gy)); + v[1].b = image3d_ld1(bottom_blob_3d, ivec3(xx4.b, gx, gy)); + v[1].a = image3d_ld1(bottom_blob_3d, ivec3(xx4.a, gx, gy)); + } + if (order_type == 5) + { + ivec4 x4 = gz * 8 + ivec4(0, 1, 2, 3); + ivec4 xx4 = x4 + 4; + + v[0].r = image3d_ld1(bottom_blob_3d, ivec3(x4.r, gy, gx)); + v[0].g = image3d_ld1(bottom_blob_3d, ivec3(x4.g, gy, gx)); + v[0].b = image3d_ld1(bottom_blob_3d, ivec3(x4.b, gy, gx)); + v[0].a = image3d_ld1(bottom_blob_3d, ivec3(x4.a, gy, gx)); + v[1].r = image3d_ld1(bottom_blob_3d, ivec3(xx4.r, gy, gx)); + v[1].g = image3d_ld1(bottom_blob_3d, ivec3(xx4.g, gy, gx)); + v[1].b = image3d_ld1(bottom_blob_3d, ivec3(xx4.b, gy, gx)); + v[1].a = image3d_ld1(bottom_blob_3d, ivec3(xx4.a, gy, gx)); + } + + image3d_st8(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else + ivec4 v_offset; + ivec4 vv_offset; + + if 
(psc(dims) == 2) + { + // order_type + // 0 = w h + // 1 = h w + + if (order_type == 0) + { + v_offset = (gy * 8 + ivec4(0, 1, 2, 3)) * psc(w) + gx; + vv_offset = v_offset + 4 * psc(w); + } + if (order_type == 1) + { + v_offset = gx * psc(w) + (gy * 8 + ivec4(0, 1, 2, 3)); + vv_offset = v_offset + 4; + } + } + else // if (psc(dims) == 3) + { + // order_type + // 0 = w h c + // 1 = h w c + // 2 = w c h + // 3 = c w h + // 4 = h c w + // 5 = c h w + + if (order_type == 0) + { + v_offset = (gz * 8 + ivec4(0, 1, 2, 3)) * psc(cstep) + gy * psc(w) + gx; + vv_offset = v_offset + 4 * psc(cstep); + } + if (order_type == 1) + { + v_offset = (gz * 8 + ivec4(0, 1, 2, 3)) * psc(cstep) + gx * psc(w) + gy; + vv_offset = v_offset + 4 * psc(cstep); + } + if (order_type == 2) + { + v_offset = gy * psc(cstep) + (gz * 8 + ivec4(0, 1, 2, 3)) * psc(w) + gx; + vv_offset = v_offset + 4 * psc(w); + } + if (order_type == 3) + { + v_offset = gx * psc(cstep) + (gz * 8 + ivec4(0, 1, 2, 3)) * psc(w) + gy; + vv_offset = v_offset + 4 * psc(w); + } + if (order_type == 4) + { + v_offset = gy * psc(cstep) + gx * psc(w) + (gz * 8 + ivec4(0, 1, 2, 3)); + vv_offset = v_offset + 4; + } + if (order_type == 5) + { + v_offset = gx * psc(cstep) + gy * psc(w) + (gz * 8 + ivec4(0, 1, 2, 3)); + vv_offset = v_offset + 4; + } + } + + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_cp1to8(top_blob_data, gi, bottom_blob_data, v_offset, vv_offset); +#endif +} diff --git a/source/device/vulkan/shaders/permute_pack4.comp b/source/device/vulkan/shaders/permute_pack4.comp new file mode 100644 index 000000000..3e1ff6ef8 --- /dev/null +++ b/source/device/vulkan/shaders/permute_pack4.comp @@ -0,0 +1,281 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
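+
+// Permutes a pack4 blob while keeping the pack4 layout: for each output element it computes
+// a flat source index per lane (i4), converts each index back into a pack4 element offset and
+// lane (or a pack2 offset and lane when NCNN_fp16_packed is enabled), and gathers the four
+// values into the output element.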
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int order_type = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else +#if NCNN_fp16_packed +layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; }; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +#endif +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + +#if NCNN_image_shader + if (psc(dims) == 2) + { + // order_type + // 0 = w h + // 1 = h w + + ivec4 i4; + + if (order_type == 0) + { + i4 = ivec4((gy * 4) * psc(w) + gx) + ivec4(0, 1, 2, 3) * psc(w); + } + if (order_type == 1) + { + i4 = ivec4(gx * psc(w) + gy * 4) + ivec4(0, 1, 2, 3); + } + + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + + afpvec4 vr = image2d_ld4(bottom_blob_2d, ivec2(x4.r, y4.r / 4)); + afpvec4 vg = image2d_ld4(bottom_blob_2d, ivec2(x4.g, y4.g / 4)); + afpvec4 vb = image2d_ld4(bottom_blob_2d, ivec2(x4.b, y4.b / 4)); + afpvec4 va = image2d_ld4(bottom_blob_2d, ivec2(x4.a, y4.a / 4)); + + ivec4 lane4 = y4 % 4; + + afpvec4 v = afpvec4(vr[lane4.r], vg[lane4.g], vb[lane4.b], va[lane4.a]); + + image2d_st4(top_blob_2d, ivec2(gx, gy), v); + } + else // if (psc(dims) == 3) + { + // order_type + // 0 = w h c + // 1 = h w c + // 2 = w c h + // 3 = c w h + // 4 = h c w + // 5 = c h w + + int size = psc(w) * psc(h); + + ivec4 i4; + + if (order_type == 0) + { + i4 = ivec4((gz * 4) * size + gy * psc(w) + gx) + ivec4(0, 1, 2, 3) * size; + } + if (order_type == 1) + { + i4 = ivec4((gz * 4) * size + gx * psc(w) + gy) + ivec4(0, 1, 2, 3) * size; + } + if (order_type == 2) + { + i4 = ivec4(gy * size + (gz * 4) * psc(w) + gx) + ivec4(0, 1, 2, 3) * psc(w); + } + if (order_type == 3) + { + i4 = ivec4(gx * size+ (gz * 4) * psc(w) + gy) + ivec4(0, 1, 2, 3) * psc(w); + } + if (order_type == 4) + { + i4 = ivec4(gy * 
size + gx * psc(w) + gz * 4) + ivec4(0, 1, 2, 3); + } + if (order_type == 5) + { + i4 = ivec4(gx * size + gy * psc(w) + gz * 4) + ivec4(0, 1, 2, 3); + } + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + + afpvec4 vr = image3d_ld4(bottom_blob_3d, ivec3(x4.r, y4.r, z4.r / 4)); + afpvec4 vg = image3d_ld4(bottom_blob_3d, ivec3(x4.g, y4.g, z4.g / 4)); + afpvec4 vb = image3d_ld4(bottom_blob_3d, ivec3(x4.b, y4.b, z4.b / 4)); + afpvec4 va = image3d_ld4(bottom_blob_3d, ivec3(x4.a, y4.a, z4.a / 4)); + + ivec4 lane4 = z4 % 4; + + afpvec4 v = afpvec4(vr[lane4.r], vg[lane4.g], vb[lane4.b], va[lane4.a]); + + image3d_st4(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else + ivec4 i4; + + if (psc(dims) == 2) + { + // order_type + // 0 = w h + // 1 = h w + + if (order_type == 0) + { + i4 = ivec4((gy * 4) * psc(w) + gx) + ivec4(0, 1, 2, 3) * psc(w); + } + if (order_type == 1) + { + i4 = ivec4(gx * psc(w) + gy * 4) + ivec4(0, 1, 2, 3); + } + } + else // if (psc(dims) == 3) + { + // order_type + // 0 = w h c + // 1 = h w c + // 2 = w c h + // 3 = c w h + // 4 = h c w + // 5 = c h w + + if (order_type == 0) + { + i4 = ivec4((gz * 4) * psc(cstep) + gy * psc(w) + gx) + ivec4(0, 1, 2, 3) * psc(cstep); + } + if (order_type == 1) + { + i4 = ivec4((gz * 4) * psc(cstep) + gx * psc(w) + gy) + ivec4(0, 1, 2, 3) * psc(cstep); + } + if (order_type == 2) + { + i4 = ivec4(gy * psc(cstep) + (gz * 4) * psc(w) + gx) + ivec4(0, 1, 2, 3) * psc(w); + } + if (order_type == 3) + { + i4 = ivec4(gx * psc(cstep) + (gz * 4) * psc(w) + gy) + ivec4(0, 1, 2, 3) * psc(w); + } + if (order_type == 4) + { + i4 = ivec4(gy * psc(cstep) + gx * psc(w) + gz * 4) + ivec4(0, 1, 2, 3); + } + if (order_type == 5) + { + i4 = ivec4(gx * psc(cstep) + gy * psc(w) + gz * 4) + ivec4(0, 1, 2, 3); + } + } + +#if NCNN_fp16_packed + ivec4 v_offset; + ivec4 lane2; + + if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + + v_offset = ((y4 / 4) * psc(w) + x4) * 2 + (y4 % 4) / 2; + lane2 = y4 % 2; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + + v_offset = ((z4 / 4) * psc(cstep) + y4 * psc(w) + x4) * 2 + (z4 % 4) / 2; + lane2 = z4 % 2; + } + + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r); + afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g); + afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b); + afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a); + + afpvec4 v = afpvec4(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a]); + + buffer_st4(top_blob_data, gi, v); +#else + ivec4 v_offset; + + if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + + v_offset = ((y4 / 4) * psc(w) + x4) * 4 + y4 % 4; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + + v_offset = ((z4 / 4) * psc(cstep) + y4 * psc(w) + x4) * 4 + z4 % 4; + } + + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_cp1to4(top_blob_data, gi, bottom_blob_data, v_offset); +#endif +#endif +} diff --git a/source/device/vulkan/shaders/permute_pack4to1.comp b/source/device/vulkan/shaders/permute_pack4to1.comp new file mode 100644 index 000000000..5d33904d0 --- /dev/null +++ b/source/device/vulkan/shaders/permute_pack4to1.comp @@ -0,0 +1,230 @@ +// Tencent is pleased to support the open source community by making ncnn 
available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int order_type = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + if (psc(dims) == 2) + { + // order_type + // 0 = w h + // 1 = h w + + afpvec4 v = image2d_ld4(bottom_blob_2d, ivec2(gx, gy)); + + if (order_type == 0) + { + ivec4 y4 = gy * 4 + ivec4(0, 1, 2, 3); + + image2d_st1(top_blob_2d, ivec2(gx, y4.r), v.r); + image2d_st1(top_blob_2d, ivec2(gx, y4.g), v.g); + image2d_st1(top_blob_2d, ivec2(gx, y4.b), v.b); + image2d_st1(top_blob_2d, ivec2(gx, y4.a), v.a); + } + if (order_type == 1) + { + ivec4 x4 = gy * 4 + ivec4(0, 1, 2, 3); + + image2d_st1(top_blob_2d, ivec2(x4.r, gx), v.r); + image2d_st1(top_blob_2d, ivec2(x4.g, gx), v.g); + image2d_st1(top_blob_2d, ivec2(x4.b, gx), v.b); + image2d_st1(top_blob_2d, ivec2(x4.a, gx), v.a); + } + } + else // if (psc(dims) == 3) + { + // order_type + // 0 = w h c + // 1 = h w c + // 2 = w c h + // 3 = c w h + // 4 = h c w + // 5 = c h w + + afpvec4 v = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, gz)); + + if 
(order_type == 0) + { + ivec4 z4 = gz * 4 + ivec4(0, 1, 2, 3); + + image3d_st1(top_blob_3d, ivec3(gx, gy, z4.r), v.r); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4.g), v.g); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4.b), v.b); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4.a), v.a); + } + if (order_type == 1) + { + ivec4 z4 = gz * 4 + ivec4(0, 1, 2, 3); + + image3d_st1(top_blob_3d, ivec3(gy, gx, z4.r), v.r); + image3d_st1(top_blob_3d, ivec3(gy, gx, z4.g), v.g); + image3d_st1(top_blob_3d, ivec3(gy, gx, z4.b), v.b); + image3d_st1(top_blob_3d, ivec3(gy, gx, z4.a), v.a); + } + if (order_type == 2) + { + ivec4 y4 = gz * 4 + ivec4(0, 1, 2, 3); + + image3d_st1(top_blob_3d, ivec3(gx, y4.r, gy), v.r); + image3d_st1(top_blob_3d, ivec3(gx, y4.g, gy), v.g); + image3d_st1(top_blob_3d, ivec3(gx, y4.b, gy), v.b); + image3d_st1(top_blob_3d, ivec3(gx, y4.a, gy), v.a); + } + if (order_type == 3) + { + ivec4 x4 = gz * 4 + ivec4(0, 1, 2, 3); + + image3d_st1(top_blob_3d, ivec3(x4.r, gx, gy), v.r); + image3d_st1(top_blob_3d, ivec3(x4.g, gx, gy), v.g); + image3d_st1(top_blob_3d, ivec3(x4.b, gx, gy), v.b); + image3d_st1(top_blob_3d, ivec3(x4.a, gx, gy), v.a); + } + if (order_type == 4) + { + ivec4 y4 = gz * 4 + ivec4(0, 1, 2, 3); + + image3d_st1(top_blob_3d, ivec3(gy, y4.r, gx), v.r); + image3d_st1(top_blob_3d, ivec3(gy, y4.g, gx), v.g); + image3d_st1(top_blob_3d, ivec3(gy, y4.b, gx), v.b); + image3d_st1(top_blob_3d, ivec3(gy, y4.a, gx), v.a); + } + if (order_type == 5) + { + ivec4 x4 = gz * 4 + ivec4(0, 1, 2, 3); + + image3d_st1(top_blob_3d, ivec3(x4.r, gy, gx), v.r); + image3d_st1(top_blob_3d, ivec3(x4.g, gy, gx), v.g); + image3d_st1(top_blob_3d, ivec3(x4.b, gy, gx), v.b); + image3d_st1(top_blob_3d, ivec3(x4.a, gy, gx), v.a); + } + } +#else + ivec4 v_offset; + + if (psc(dims) == 2) + { + // order_type + // 0 = w h + // 1 = h w + + if (order_type == 0) + { + v_offset = ivec4((gy * 4) * psc(outw) + gx) + ivec4(0, 1, 2, 3) * psc(outw); + } + if (order_type == 1) + { + v_offset = ivec4(gx * psc(outw) + gy * 4) + ivec4(0, 1, 2, 3); + } + } + else // if (psc(dims) == 3) + { + // order_type + // 0 = w h c + // 1 = h w c + // 2 = w c h + // 3 = c w h + // 4 = h c w + // 5 = c h w + + if (order_type == 0) + { + v_offset = ivec4((gz * 4) * psc(outcstep) + gy * psc(outw) + gx) + ivec4(0, 1, 2, 3) * psc(outcstep); + } + if (order_type == 1) + { + v_offset = ivec4((gz * 4) * psc(outcstep) + gx * psc(outw) + gy) + ivec4(0, 1, 2, 3) * psc(outcstep); + } + if (order_type == 2) + { + v_offset = ivec4(gy * psc(outcstep) + (gz * 4) * psc(outw) + gx) + ivec4(0, 1, 2, 3) * psc(outw); + } + if (order_type == 3) + { + v_offset = ivec4(gy * psc(outcstep) + gx * psc(outw) + gz * 4) + ivec4(0, 1, 2, 3); + } + if (order_type == 4) + { + v_offset = ivec4(gx * psc(outcstep) + (gz * 4) * psc(outw) + gy) + ivec4(0, 1, 2, 3) * psc(outw); + } + if (order_type == 5) + { + v_offset = ivec4(gx * psc(outcstep) + gy * psc(outw) + gz * 4) + ivec4(0, 1, 2, 3); + } + } + + int gi = gz * psc(cstep) + gy * psc(w) + gx; + + buffer_cp4to1(top_blob_data, v_offset, bottom_blob_data, gi); +#endif +} diff --git a/source/device/vulkan/shaders/permute_pack4to8.comp b/source/device/vulkan/shaders/permute_pack4to8.comp new file mode 100644 index 000000000..f35abd828 --- /dev/null +++ b/source/device/vulkan/shaders/permute_pack4to8.comp @@ -0,0 +1,350 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. 
+// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int order_type = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else +#if NCNN_fp16_packed +layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; }; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +#endif +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + +#if NCNN_image_shader + if (psc(dims) == 2) + { + // order_type + // 0 = w h + // 1 = h w + + ivec4 i4; + ivec4 ii4; + + if (order_type == 0) + { + i4 = ivec4((gy * 8) * psc(w) + gx) + ivec4(0, 1, 2, 3) * psc(w); + ii4 = i4 + 4 * psc(w); + } + if (order_type == 1) + { + i4 = ivec4(gx * psc(w) + gy * 8) + ivec4(0, 1, 2, 3); + ii4 = i4 + 4; + } + + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + ivec4 yy4 = ii4 / psc(w); + ivec4 xx4 = ii4 % psc(w); + + afpvec4 v0 = image2d_ld4(bottom_blob_2d, ivec2(x4.r, y4.r / 4)); + afpvec4 v1 = image2d_ld4(bottom_blob_2d, ivec2(x4.g, y4.g / 4)); + afpvec4 v2 = image2d_ld4(bottom_blob_2d, ivec2(x4.b, y4.b / 4)); + afpvec4 v3 = image2d_ld4(bottom_blob_2d, ivec2(x4.a, y4.a / 4)); + afpvec4 v4 = image2d_ld4(bottom_blob_2d, ivec2(xx4.r, yy4.r / 4)); + afpvec4 v5 = image2d_ld4(bottom_blob_2d, 
ivec2(xx4.g, yy4.g / 4)); + afpvec4 v6 = image2d_ld4(bottom_blob_2d, ivec2(xx4.b, yy4.b / 4)); + afpvec4 v7 = image2d_ld4(bottom_blob_2d, ivec2(xx4.a, yy4.a / 4)); + + afpvec8 v; + v[0].r = v0[y4.r % 4]; + v[0].g = v1[y4.g % 4]; + v[0].b = v2[y4.b % 4]; + v[0].a = v3[y4.a % 4]; + v[1].r = v4[yy4.r % 4]; + v[1].g = v5[yy4.g % 4]; + v[1].b = v6[yy4.b % 4]; + v[1].a = v7[yy4.a % 4]; + + image2d_st8(top_blob_2d, ivec2(gx, gy), v); + } + else // if (psc(dims) == 3) + { + // order_type + // 0 = w h c + // 1 = h w c + // 2 = w c h + // 3 = c w h + // 4 = h c w + // 5 = c h w + + int size = psc(w) * psc(h); + + ivec4 i4; + ivec4 ii4; + + if (order_type == 0) + { + i4 = ivec4((gz * 8) * size + gy * psc(w) + gx) + ivec4(0, 1, 2, 3) * size; + ii4 = i4 + 4 * size; + } + if (order_type == 1) + { + i4 = ivec4((gz * 8) * size + gx * psc(w) + gy) + ivec4(0, 1, 2, 3) * size; + ii4 = i4 + 4 * size; + } + if (order_type == 2) + { + i4 = ivec4(gy * size + (gz * 8) * psc(w) + gx) + ivec4(0, 1, 2, 3) * psc(w); + ii4 = i4 + 4 * psc(w); + } + if (order_type == 3) + { + i4 = ivec4(gx * size+ (gz * 8) * psc(w) + gy) + ivec4(0, 1, 2, 3) * psc(w); + ii4 = i4 + 4 * psc(w); + } + if (order_type == 4) + { + i4 = ivec4(gy * size + gx * psc(w) + gz * 8) + ivec4(0, 1, 2, 3); + ii4 = i4 + 4; + } + if (order_type == 5) + { + i4 = ivec4(gx * size + gy * psc(w) + gz * 8) + ivec4(0, 1, 2, 3); + ii4 = i4 + 4; + } + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + ivec4 zz4 = ii4 / size; + ivec4 yy4 = ii4 % size / psc(w); + ivec4 xx4 = ii4 % size % psc(w); + + afpvec4 v0 = image3d_ld4(bottom_blob_3d, ivec3(x4.r, y4.r, z4.r / 4)); + afpvec4 v1 = image3d_ld4(bottom_blob_3d, ivec3(x4.g, y4.g, z4.g / 4)); + afpvec4 v2 = image3d_ld4(bottom_blob_3d, ivec3(x4.b, y4.b, z4.b / 4)); + afpvec4 v3 = image3d_ld4(bottom_blob_3d, ivec3(x4.a, y4.a, z4.a / 4)); + afpvec4 v4 = image3d_ld4(bottom_blob_3d, ivec3(xx4.r, yy4.r, zz4.r / 4)); + afpvec4 v5 = image3d_ld4(bottom_blob_3d, ivec3(xx4.g, yy4.g, zz4.g / 4)); + afpvec4 v6 = image3d_ld4(bottom_blob_3d, ivec3(xx4.b, yy4.b, zz4.b / 4)); + afpvec4 v7 = image3d_ld4(bottom_blob_3d, ivec3(xx4.a, yy4.a, zz4.a / 4)); + + afpvec8 v; + v[0].r = v0[z4.r % 4]; + v[0].g = v1[z4.g % 4]; + v[0].b = v2[z4.b % 4]; + v[0].a = v3[z4.a % 4]; + v[1].r = v4[zz4.r % 4]; + v[1].g = v5[zz4.g % 4]; + v[1].b = v6[zz4.b % 4]; + v[1].a = v7[zz4.a % 4]; + + image3d_st8(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else + ivec4 i4; + ivec4 ii4; + + if (psc(dims) == 2) + { + // order_type + // 0 = w h + // 1 = h w + + if (order_type == 0) + { + i4 = (gy * 8 + ivec4(0, 1, 2, 3)) * psc(w) + gx; + ii4 = i4 + 4 * psc(w); + } + if (order_type == 1) + { + i4 = gx * psc(w) + (gy * 8 + ivec4(0, 1, 2, 3)); + ii4 = i4 + 4; + } + } + else // if (psc(dims) == 3) + { + // order_type + // 0 = w h c + // 1 = h w c + // 2 = w c h + // 3 = c w h + // 4 = h c w + // 5 = c h w + + if (order_type == 0) + { + i4 = (gz * 8 + ivec4(0, 1, 2, 3)) * psc(cstep) + gy * psc(w) + gx; + ii4 = i4 + 4 * psc(cstep); + } + if (order_type == 1) + { + i4 = (gz * 8 + ivec4(0, 1, 2, 3)) * psc(cstep) + gx * psc(w) + gy; + ii4 = i4 + 4 * psc(cstep); + } + if (order_type == 2) + { + i4 = gy * psc(cstep) + (gz * 8 + ivec4(0, 1, 2, 3)) * psc(w) + gx; + ii4 = i4 + 4 * psc(w); + } + if (order_type == 3) + { + i4 = gx * psc(cstep) + (gz * 8 + ivec4(0, 1, 2, 3)) * psc(w) + gy; + ii4 = i4 + 4 * psc(w); + } + if (order_type == 4) + { + i4 = gy * psc(cstep) + gx * psc(w) + (gz * 8 + ivec4(0, 1, 2, 3)); + ii4 = i4 + 4; + } + if (order_type 
== 5) + { + i4 = gx * psc(cstep) + gy * psc(w) + (gz * 8 + ivec4(0, 1, 2, 3)); + ii4 = i4 + 4; + } + } + +#if NCNN_fp16_packed + ivec4 v_offset; + ivec4 vv_offset; + ivec4 lane2; + ivec4 lane4; + + if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + ivec4 yy4 = ii4 / psc(w); + ivec4 xx4 = ii4 % psc(w); + + v_offset = ((y4 / 4) * psc(w) + x4) * 2 + (y4 % 4) / 2; + lane2 = y4 % 2; + vv_offset = ((yy4 / 4) * psc(w) + xx4) * 2 + (yy4 % 4) / 2; + lane4 = yy4 % 2; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + ivec4 zz4 = ii4 / size; + ivec4 yy4 = ii4 % size / psc(w); + ivec4 xx4 = ii4 % size % psc(w); + + v_offset = ((z4 / 4) * psc(cstep) + y4 * psc(w) + x4) * 2 + (z4 % 4) / 2; + lane2 = z4 % 2; + vv_offset = ((zz4 / 4) * psc(cstep) + yy4 * psc(w) + xx4) * 2 + (zz4 % 4) / 2; + lane4 = zz4 % 2; + } + + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r); + afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g); + afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b); + afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a); + + afpvec2 vvr = buffer_ld2(bottom_blob_data, vv_offset.r); + afpvec2 vvg = buffer_ld2(bottom_blob_data, vv_offset.g); + afpvec2 vvb = buffer_ld2(bottom_blob_data, vv_offset.b); + afpvec2 vva = buffer_ld2(bottom_blob_data, vv_offset.a); + + afpvec8 v = afpvec8(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a], vvr[lane4.r], vvg[lane4.g], vvb[lane4.b], vva[lane4.a]); + + buffer_st8(top_blob_data, gi, v); +#else + ivec4 v_offset; + ivec4 vv_offset; + + if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + ivec4 yy4 = ii4 / psc(w); + ivec4 xx4 = ii4 % psc(w); + + v_offset = ((y4 / 4) * psc(w) + x4) * 4 + y4 % 4; + vv_offset = ((yy4 / 4) * psc(w) + xx4) * 4 + yy4 % 4; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + ivec4 zz4 = ii4 / size; + ivec4 yy4 = ii4 % size / psc(w); + ivec4 xx4 = ii4 % size % psc(w); + + v_offset = ((z4 / 4) * psc(cstep) + y4 * psc(w) + x4) * 4 + z4 % 4; + vv_offset = ((zz4 / 4) * psc(cstep) + yy4 * psc(w) + xx4) * 4 + zz4 % 4; + } + + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_cp1to8(top_blob_data, gi, bottom_blob_data, v_offset, vv_offset); +#endif +#endif +} diff --git a/source/device/vulkan/shaders/permute_pack8.comp b/source/device/vulkan/shaders/permute_pack8.comp new file mode 100644 index 000000000..5fa215538 --- /dev/null +++ b/source/device/vulkan/shaders/permute_pack8.comp @@ -0,0 +1,350 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
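+// permute_pack8: reorders a pack-8 blob according to order_type (0..5, see the
+// per-branch comments below). Each invocation computes eight flattened source
+// indices as two ivec4 groups (i4, and ii4 = i4 offset by 4 along the permuted
+// axis), gathers the matching scalars from the packed source layout, and stores
+// one pack-8 output element at (gx, gy, gz).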
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int order_type = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else +#if NCNN_fp16_packed +layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; }; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +#endif +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + +#if NCNN_image_shader + if (psc(dims) == 2) + { + // order_type + // 0 = w h + // 1 = h w + + ivec4 i4; + ivec4 ii4; + + if (order_type == 0) + { + i4 = ivec4((gy * 8) * psc(w) + gx) + ivec4(0, 1, 2, 3) * psc(w); + ii4 = i4 + 4 * psc(w); + } + if (order_type == 1) + { + i4 = ivec4(gx * psc(w) + gy * 8) + ivec4(0, 1, 2, 3); + ii4 = i4 + 4; + } + + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + ivec4 yy4 = ii4 / psc(w); + ivec4 xx4 = ii4 % psc(w); + + afpvec8 v0 = image2d_ld8(bottom_blob_2d, ivec2(x4.r, y4.r / 8)); + afpvec8 v1 = image2d_ld8(bottom_blob_2d, ivec2(x4.g, y4.g / 8)); + afpvec8 v2 = image2d_ld8(bottom_blob_2d, ivec2(x4.b, y4.b / 8)); + afpvec8 v3 = image2d_ld8(bottom_blob_2d, ivec2(x4.a, y4.a / 8)); + afpvec8 v4 = image2d_ld8(bottom_blob_2d, ivec2(xx4.r, yy4.r / 8)); + afpvec8 v5 = image2d_ld8(bottom_blob_2d, ivec2(xx4.g, yy4.g / 8)); + afpvec8 v6 = image2d_ld8(bottom_blob_2d, ivec2(xx4.b, yy4.b / 8)); + afpvec8 v7 = image2d_ld8(bottom_blob_2d, ivec2(xx4.a, yy4.a / 8)); + + afpvec8 v; + v[0].r = v0[(y4.r % 8) / 4][y4.r % 4]; + v[0].g = v1[(y4.g % 8) / 4][y4.g % 4]; + v[0].b = v2[(y4.b % 8) / 4][y4.b % 4]; + v[0].a = v3[(y4.a % 8) / 4][y4.a % 4]; + v[1].r = v4[(yy4.r % 8) / 4][yy4.r % 4]; + v[1].g = v5[(yy4.g % 8) / 4][yy4.g % 4]; + v[1].b = v6[(yy4.b % 8) / 4][yy4.b % 4]; + v[1].a = v7[(yy4.a % 8) / 4][yy4.a % 4]; + + image2d_st8(top_blob_2d, ivec2(gx, 
gy), v); + } + else // if (psc(dims) == 3) + { + // order_type + // 0 = w h c + // 1 = h w c + // 2 = w c h + // 3 = c w h + // 4 = h c w + // 5 = c h w + + int size = psc(w) * psc(h); + + ivec4 i4; + ivec4 ii4; + + if (order_type == 0) + { + i4 = ivec4((gz * 8) * size + gy * psc(w) + gx) + ivec4(0, 1, 2, 3) * size; + ii4 = i4 + 4 * size; + } + if (order_type == 1) + { + i4 = ivec4((gz * 8) * size + gx * psc(w) + gy) + ivec4(0, 1, 2, 3) * size; + ii4 = i4 + 4 * size; + } + if (order_type == 2) + { + i4 = ivec4(gy * size + (gz * 8) * psc(w) + gx) + ivec4(0, 1, 2, 3) * psc(w); + ii4 = i4 + 4 * psc(w); + } + if (order_type == 3) + { + i4 = ivec4(gx * size+ (gz * 8) * psc(w) + gy) + ivec4(0, 1, 2, 3) * psc(w); + ii4 = i4 + 4 * psc(w); + } + if (order_type == 4) + { + i4 = ivec4(gy * size + gx * psc(w) + gz * 8) + ivec4(0, 1, 2, 3); + ii4 = i4 + 4; + } + if (order_type == 5) + { + i4 = ivec4(gx * size + gy * psc(w) + gz * 8) + ivec4(0, 1, 2, 3); + ii4 = i4 + 4; + } + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + ivec4 zz4 = ii4 / size; + ivec4 yy4 = ii4 % size / psc(w); + ivec4 xx4 = ii4 % size % psc(w); + + afpvec8 v0 = image3d_ld8(bottom_blob_3d, ivec3(x4.r, y4.r, z4.r / 8)); + afpvec8 v1 = image3d_ld8(bottom_blob_3d, ivec3(x4.g, y4.g, z4.g / 8)); + afpvec8 v2 = image3d_ld8(bottom_blob_3d, ivec3(x4.b, y4.b, z4.b / 8)); + afpvec8 v3 = image3d_ld8(bottom_blob_3d, ivec3(x4.a, y4.a, z4.a / 8)); + afpvec8 v4 = image3d_ld8(bottom_blob_3d, ivec3(xx4.r, yy4.r, zz4.r / 8)); + afpvec8 v5 = image3d_ld8(bottom_blob_3d, ivec3(xx4.g, yy4.g, zz4.g / 8)); + afpvec8 v6 = image3d_ld8(bottom_blob_3d, ivec3(xx4.b, yy4.b, zz4.b / 8)); + afpvec8 v7 = image3d_ld8(bottom_blob_3d, ivec3(xx4.a, yy4.a, zz4.a / 8)); + + afpvec8 v; + v[0].r = v0[(z4.r % 8) / 4][z4.r % 4]; + v[0].g = v1[(z4.g % 8) / 4][z4.g % 4]; + v[0].b = v2[(z4.b % 8) / 4][z4.b % 4]; + v[0].a = v3[(z4.a % 8) / 4][z4.a % 4]; + v[1].r = v4[(zz4.r % 8) / 4][zz4.r % 4]; + v[1].g = v5[(zz4.g % 8) / 4][zz4.g % 4]; + v[1].b = v6[(zz4.b % 8) / 4][zz4.b % 4]; + v[1].a = v7[(zz4.a % 8) / 4][zz4.a % 4]; + + image3d_st8(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else + ivec4 i4; + ivec4 ii4; + + if (psc(dims) == 2) + { + // order_type + // 0 = w h + // 1 = h w + + if (order_type == 0) + { + i4 = (gy * 8 + ivec4(0, 1, 2, 3)) * psc(w) + gx; + ii4 = i4 + 4 * psc(w); + } + if (order_type == 1) + { + i4 = gx * psc(w) + (gy * 8 + ivec4(0, 1, 2, 3)); + ii4 = i4 + 4; + } + } + else // if (psc(dims) == 3) + { + // order_type + // 0 = w h c + // 1 = h w c + // 2 = w c h + // 3 = c w h + // 4 = h c w + // 5 = c h w + + if (order_type == 0) + { + i4 = (gz * 8 + ivec4(0, 1, 2, 3)) * psc(cstep) + gy * psc(w) + gx; + ii4 = i4 + 4 * psc(cstep); + } + if (order_type == 1) + { + i4 = (gz * 8 + ivec4(0, 1, 2, 3)) * psc(cstep) + gx * psc(w) + gy; + ii4 = i4 + 4 * psc(cstep); + } + if (order_type == 2) + { + i4 = gy * psc(cstep) + (gz * 8 + ivec4(0, 1, 2, 3)) * psc(w) + gx; + ii4 = i4 + 4 * psc(w); + } + if (order_type == 3) + { + i4 = gx * psc(cstep) + (gz * 8 + ivec4(0, 1, 2, 3)) * psc(w) + gy; + ii4 = i4 + 4 * psc(w); + } + if (order_type == 4) + { + i4 = gy * psc(cstep) + gx * psc(w) + (gz * 8 + ivec4(0, 1, 2, 3)); + ii4 = i4 + 4; + } + if (order_type == 5) + { + i4 = gx * psc(cstep) + gy * psc(w) + (gz * 8 + ivec4(0, 1, 2, 3)); + ii4 = i4 + 4; + } + } + +#if NCNN_fp16_packed + ivec4 v_offset; + ivec4 vv_offset; + ivec4 lane2; + ivec4 lane4; + + if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + ivec4 yy4 = 
ii4 / psc(w); + ivec4 xx4 = ii4 % psc(w); + + v_offset = ((y4 / 8) * psc(w) + x4) * 4 + (y4 % 8) / 2; + lane2 = y4 % 2; + vv_offset = ((yy4 / 8) * psc(w) + xx4) * 4 + (yy4 % 8) / 2; + lane4 = yy4 % 2; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + ivec4 zz4 = ii4 / size; + ivec4 yy4 = ii4 % size / psc(w); + ivec4 xx4 = ii4 % size % psc(w); + + v_offset = ((z4 / 8) * psc(cstep) + y4 * psc(w) + x4) * 4 + (z4 % 8) / 2; + lane2 = z4 % 2; + vv_offset = ((zz4 / 8) * psc(cstep) + yy4 * psc(w) + xx4) * 4 + (zz4 % 8) / 2; + lane4 = zz4 % 2; + } + + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r); + afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g); + afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b); + afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a); + + afpvec2 vvr = buffer_ld2(bottom_blob_data, vv_offset.r); + afpvec2 vvg = buffer_ld2(bottom_blob_data, vv_offset.g); + afpvec2 vvb = buffer_ld2(bottom_blob_data, vv_offset.b); + afpvec2 vva = buffer_ld2(bottom_blob_data, vv_offset.a); + + afpvec8 v = afpvec8(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a], vvr[lane4.r], vvg[lane4.g], vvb[lane4.b], vva[lane4.a]); + + buffer_st8(top_blob_data, gi, v); +#else + ivec4 v_offset; + ivec4 vv_offset; + + if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + ivec4 yy4 = ii4 / psc(w); + ivec4 xx4 = ii4 % psc(w); + + v_offset = ((y4 / 8) * psc(w) + x4) * 8 + y4 % 8; + vv_offset = ((yy4 / 8) * psc(w) + xx4) * 8 + yy4 % 8; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + ivec4 zz4 = ii4 / size; + ivec4 yy4 = ii4 % size / psc(w); + ivec4 xx4 = ii4 % size % psc(w); + + v_offset = ((z4 / 8) * psc(cstep) + y4 * psc(w) + x4) * 8 + z4 % 8; + vv_offset = ((zz4 / 8) * psc(cstep) + yy4 * psc(w) + xx4) * 8 + zz4 % 8; + } + + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_cp1to8(top_blob_data, gi, bottom_blob_data, v_offset, vv_offset); +#endif +#endif +} diff --git a/source/device/vulkan/shaders/permute_pack8to1.comp b/source/device/vulkan/shaders/permute_pack8to1.comp new file mode 100644 index 000000000..3d152f68b --- /dev/null +++ b/source/device/vulkan/shaders/permute_pack8to1.comp @@ -0,0 +1,280 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
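+// permute_pack8to1: unpacks one pack-8 element per invocation and scatters its
+// eight scalars to their permuted destinations; the image path issues eight
+// st1 stores, while the buffer path computes two ivec4 offset groups
+// (v_offset and vv_offset) and writes them with buffer_cp8to1.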
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int order_type = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + if (psc(dims) == 2) + { + // order_type + // 0 = w h + // 1 = h w + + afpvec8 v = image2d_ld8(bottom_blob_2d, ivec2(gx, gy)); + + if (order_type == 0) + { + ivec4 y4 = gy * 8 + ivec4(0, 1, 2, 3); + ivec4 yy4 = y4 + 4; + + image2d_st1(top_blob_2d, ivec2(gx, y4.r), v[0].r); + image2d_st1(top_blob_2d, ivec2(gx, y4.g), v[0].g); + image2d_st1(top_blob_2d, ivec2(gx, y4.b), v[0].b); + image2d_st1(top_blob_2d, ivec2(gx, y4.a), v[0].a); + image2d_st1(top_blob_2d, ivec2(gx, yy4.r), v[1].r); + image2d_st1(top_blob_2d, ivec2(gx, yy4.g), v[1].g); + image2d_st1(top_blob_2d, ivec2(gx, yy4.b), v[1].b); + image2d_st1(top_blob_2d, ivec2(gx, yy4.a), v[1].a); + } + if (order_type == 1) + { + ivec4 x4 = gy * 8 + ivec4(0, 1, 2, 3); + ivec4 xx4 = x4 + 4; + + image2d_st1(top_blob_2d, ivec2(x4.r, gx), v[0].r); + image2d_st1(top_blob_2d, ivec2(x4.g, gx), v[0].g); + image2d_st1(top_blob_2d, ivec2(x4.b, gx), v[0].b); + image2d_st1(top_blob_2d, ivec2(x4.a, gx), v[0].a); + image2d_st1(top_blob_2d, ivec2(xx4.r, gx), v[1].r); + image2d_st1(top_blob_2d, ivec2(xx4.g, gx), v[1].g); + image2d_st1(top_blob_2d, ivec2(xx4.b, gx), v[1].b); + image2d_st1(top_blob_2d, ivec2(xx4.a, gx), v[1].a); + } + } + else // if (psc(dims) == 3) + { + // order_type + // 0 = w h c + // 1 = h w c + // 2 = w c h + // 3 = c w h + // 4 = h c w + // 5 = c h w + + afpvec8 v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); + + if (order_type == 0) + { + ivec4 z4 = gz * 8 + ivec4(0, 1, 2, 3); + ivec4 zz4 = z4 + 4; + + 
image3d_st1(top_blob_3d, ivec3(gx, gy, z4.r), v[0].r); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4.g), v[0].g); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4.b), v[0].b); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4.a), v[0].a); + image3d_st1(top_blob_3d, ivec3(gx, gy, zz4.r), v[1].r); + image3d_st1(top_blob_3d, ivec3(gx, gy, zz4.g), v[1].g); + image3d_st1(top_blob_3d, ivec3(gx, gy, zz4.b), v[1].b); + image3d_st1(top_blob_3d, ivec3(gx, gy, zz4.a), v[1].a); + } + if (order_type == 1) + { + ivec4 z4 = gz * 8 + ivec4(0, 1, 2, 3); + ivec4 zz4 = z4 + 4; + + image3d_st1(top_blob_3d, ivec3(gy, gx, z4.r), v[0].r); + image3d_st1(top_blob_3d, ivec3(gy, gx, z4.g), v[0].g); + image3d_st1(top_blob_3d, ivec3(gy, gx, z4.b), v[0].b); + image3d_st1(top_blob_3d, ivec3(gy, gx, z4.a), v[0].a); + image3d_st1(top_blob_3d, ivec3(gy, gx, zz4.r), v[1].r); + image3d_st1(top_blob_3d, ivec3(gy, gx, zz4.g), v[1].g); + image3d_st1(top_blob_3d, ivec3(gy, gx, zz4.b), v[1].b); + image3d_st1(top_blob_3d, ivec3(gy, gx, zz4.a), v[1].a); + } + if (order_type == 2) + { + ivec4 y4 = gz * 8 + ivec4(0, 1, 2, 3); + ivec4 yy4 = y4 + 4; + + image3d_st1(top_blob_3d, ivec3(gx, y4.r, gy), v[0].r); + image3d_st1(top_blob_3d, ivec3(gx, y4.g, gy), v[0].g); + image3d_st1(top_blob_3d, ivec3(gx, y4.b, gy), v[0].b); + image3d_st1(top_blob_3d, ivec3(gx, y4.a, gy), v[0].a); + image3d_st1(top_blob_3d, ivec3(gx, yy4.r, gy), v[1].r); + image3d_st1(top_blob_3d, ivec3(gx, yy4.g, gy), v[1].g); + image3d_st1(top_blob_3d, ivec3(gx, yy4.b, gy), v[1].b); + image3d_st1(top_blob_3d, ivec3(gx, yy4.a, gy), v[1].a); + } + if (order_type == 3) + { + ivec4 x4 = gz * 8 + ivec4(0, 1, 2, 3); + ivec4 xx4 = x4 + 4; + + image3d_st1(top_blob_3d, ivec3(x4.r, gx, gy), v[0].r); + image3d_st1(top_blob_3d, ivec3(x4.g, gx, gy), v[0].g); + image3d_st1(top_blob_3d, ivec3(x4.b, gx, gy), v[0].b); + image3d_st1(top_blob_3d, ivec3(x4.a, gx, gy), v[0].a); + image3d_st1(top_blob_3d, ivec3(xx4.r, gx, gy), v[1].r); + image3d_st1(top_blob_3d, ivec3(xx4.g, gx, gy), v[1].g); + image3d_st1(top_blob_3d, ivec3(xx4.b, gx, gy), v[1].b); + image3d_st1(top_blob_3d, ivec3(xx4.a, gx, gy), v[1].a); + } + if (order_type == 4) + { + ivec4 y4 = gz * 8 + ivec4(0, 1, 2, 3); + ivec4 yy4 = y4 + 4; + + image3d_st1(top_blob_3d, ivec3(gy, y4.r, gx), v[0].r); + image3d_st1(top_blob_3d, ivec3(gy, y4.g, gx), v[0].g); + image3d_st1(top_blob_3d, ivec3(gy, y4.b, gx), v[0].b); + image3d_st1(top_blob_3d, ivec3(gy, y4.a, gx), v[0].a); + image3d_st1(top_blob_3d, ivec3(gy, yy4.r, gx), v[1].r); + image3d_st1(top_blob_3d, ivec3(gy, yy4.g, gx), v[1].g); + image3d_st1(top_blob_3d, ivec3(gy, yy4.b, gx), v[1].b); + image3d_st1(top_blob_3d, ivec3(gy, yy4.a, gx), v[1].a); + } + if (order_type == 5) + { + ivec4 x4 = gz * 8 + ivec4(0, 1, 2, 3); + ivec4 xx4 = x4 + 4; + + image3d_st1(top_blob_3d, ivec3(x4.r, gy, gx), v[0].r); + image3d_st1(top_blob_3d, ivec3(x4.g, gy, gx), v[0].g); + image3d_st1(top_blob_3d, ivec3(x4.b, gy, gx), v[0].b); + image3d_st1(top_blob_3d, ivec3(x4.a, gy, gx), v[0].a); + image3d_st1(top_blob_3d, ivec3(xx4.r, gy, gx), v[1].r); + image3d_st1(top_blob_3d, ivec3(xx4.g, gy, gx), v[1].g); + image3d_st1(top_blob_3d, ivec3(xx4.b, gy, gx), v[1].b); + image3d_st1(top_blob_3d, ivec3(xx4.a, gy, gx), v[1].a); + } + } +#else + ivec4 v_offset; + ivec4 vv_offset; + + if (psc(dims) == 2) + { + // order_type + // 0 = w h + // 1 = h w + + if (order_type == 0) + { + v_offset = ivec4((gy * 8) * psc(outw) + gx) + ivec4(0, 1, 2, 3) * psc(outw); + vv_offset = v_offset + 4 * psc(outw); + } + if (order_type == 1) + { + v_offset = 
ivec4(gx * psc(outw) + gy * 8) + ivec4(0, 1, 2, 3); + vv_offset = v_offset + 4; + } + } + else // if (psc(dims) == 3) + { + // order_type + // 0 = w h c + // 1 = h w c + // 2 = w c h + // 3 = c w h + // 4 = h c w + // 5 = c h w + + if (order_type == 0) + { + v_offset = ivec4((gz * 8) * psc(outcstep) + gy * psc(outw) + gx) + ivec4(0, 1, 2, 3) * psc(outcstep); + vv_offset = v_offset + 4 * psc(outcstep); + } + if (order_type == 1) + { + v_offset = ivec4((gz * 8) * psc(outcstep) + gx * psc(outw) + gy) + ivec4(0, 1, 2, 3) * psc(outcstep); + vv_offset = v_offset + 4 * psc(outcstep); + } + if (order_type == 2) + { + v_offset = ivec4(gy * psc(outcstep) + (gz * 8) * psc(outw) + gx) + ivec4(0, 1, 2, 3) * psc(outw); + vv_offset = v_offset + 4 * psc(outw); + } + if (order_type == 3) + { + v_offset = ivec4(gy * psc(outcstep) + gx * psc(outw) + gz * 8) + ivec4(0, 1, 2, 3); + vv_offset = v_offset + 4; + } + if (order_type == 4) + { + v_offset = ivec4(gx * psc(outcstep) + (gz * 8) * psc(outw) + gy) + ivec4(0, 1, 2, 3) * psc(outw); + vv_offset = v_offset + 4 * psc(outw); + } + if (order_type == 5) + { + v_offset = ivec4(gx * psc(outcstep) + gy * psc(outw) + gz * 8) + ivec4(0, 1, 2, 3); + vv_offset = v_offset + 4; + } + } + + int gi = gz * psc(cstep) + gy * psc(w) + gx; + + buffer_cp8to1(top_blob_data, v_offset, vv_offset, bottom_blob_data, gi); +#endif +} diff --git a/source/device/vulkan/shaders/permute_pack8to4.comp b/source/device/vulkan/shaders/permute_pack8to4.comp new file mode 100644 index 000000000..86e01e6fd --- /dev/null +++ b/source/device/vulkan/shaders/permute_pack8to4.comp @@ -0,0 +1,285 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
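+// permute_pack8to4: for each pack-4 output element, computes the four flattened
+// source indices (i4), reads the pack-8 elements that contain them, selects the
+// lane via (index % 8) / 4 and index % 4 (or half-pair lanes in the
+// fp16_packed buffer path), and stores a pack-4 result at (gx, gy, gz).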
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int order_type = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else +#if NCNN_fp16_packed +layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; }; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +#endif +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + +#if NCNN_image_shader + if (psc(dims) == 2) + { + // order_type + // 0 = w h + // 1 = h w + + ivec4 i4; + + if (order_type == 0) + { + i4 = ivec4((gy * 4) * psc(w) + gx) + ivec4(0, 1, 2, 3) * psc(w); + } + if (order_type == 1) + { + i4 = ivec4(gx * psc(w) + gy * 4) + ivec4(0, 1, 2, 3); + } + + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + + afpvec8 v0 = image2d_ld8(bottom_blob_2d, ivec2(x4.r, y4.r / 8)); + afpvec8 v1 = image2d_ld8(bottom_blob_2d, ivec2(x4.g, y4.g / 8)); + afpvec8 v2 = image2d_ld8(bottom_blob_2d, ivec2(x4.b, y4.b / 8)); + afpvec8 v3 = image2d_ld8(bottom_blob_2d, ivec2(x4.a, y4.a / 8)); + + afpvec4 v; + v.r = v0[(y4.r % 8) / 4][y4.r % 4]; + v.g = v1[(y4.g % 8) / 4][y4.g % 4]; + v.b = v2[(y4.b % 8) / 4][y4.b % 4]; + v.a = v3[(y4.a % 8) / 4][y4.a % 4]; + + image2d_st4(top_blob_2d, ivec2(gx, gy), v); + } + else // if (psc(dims) == 3) + { + // order_type + // 0 = w h c + // 1 = h w c + // 2 = w c h + // 3 = c w h + // 4 = h c w + // 5 = c h w + + int size = psc(w) * psc(h); + + ivec4 i4; + + if (order_type == 0) + { + i4 = ivec4((gz * 4) * size + gy * psc(w) + gx) + ivec4(0, 1, 2, 3) * size; + } + if (order_type == 1) + { + i4 = ivec4((gz * 4) * size + gx * psc(w) + gy) + ivec4(0, 1, 2, 3) * size; + } + if (order_type == 2) + { + i4 = ivec4(gy * size + (gz * 4) * psc(w) + gx) + ivec4(0, 1, 2, 3) * psc(w); + } + if (order_type == 3) + { + i4 = ivec4(gx * size+ (gz * 4) * psc(w) + gy) + ivec4(0, 1, 2, 
3) * psc(w); + } + if (order_type == 4) + { + i4 = ivec4(gy * size + gx * psc(w) + gz * 4) + ivec4(0, 1, 2, 3); + } + if (order_type == 5) + { + i4 = ivec4(gx * size + gy * psc(w) + gz * 4) + ivec4(0, 1, 2, 3); + } + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + + afpvec8 v0 = image3d_ld8(bottom_blob_3d, ivec3(x4.r, y4.r, z4.r / 8)); + afpvec8 v1 = image3d_ld8(bottom_blob_3d, ivec3(x4.g, y4.g, z4.g / 8)); + afpvec8 v2 = image3d_ld8(bottom_blob_3d, ivec3(x4.b, y4.b, z4.b / 8)); + afpvec8 v3 = image3d_ld8(bottom_blob_3d, ivec3(x4.a, y4.a, z4.a / 8)); + + afpvec4 v; + v.r = v0[(z4.r % 8) / 4][z4.r % 4]; + v.g = v1[(z4.g % 8) / 4][z4.g % 4]; + v.b = v2[(z4.b % 8) / 4][z4.b % 4]; + v.a = v3[(z4.a % 8) / 4][z4.a % 4]; + + image3d_st4(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else + ivec4 i4; + + if (psc(dims) == 2) + { + // order_type + // 0 = w h + // 1 = h w + + if (order_type == 0) + { + i4 = ivec4((gy * 4) * psc(w) + gx) + ivec4(0, 1, 2, 3) * psc(w); + } + if (order_type == 1) + { + i4 = ivec4(gx * psc(w) + gy * 4) + ivec4(0, 1, 2, 3); + } + } + else // if (psc(dims) == 3) + { + // order_type + // 0 = w h c + // 1 = h w c + // 2 = w c h + // 3 = c w h + // 4 = h c w + // 5 = c h w + + if (order_type == 0) + { + i4 = ivec4((gz * 4) * psc(cstep) + gy * psc(w) + gx) + ivec4(0, 1, 2, 3) * psc(cstep); + } + if (order_type == 1) + { + i4 = ivec4((gz * 4) * psc(cstep) + gx * psc(w) + gy) + ivec4(0, 1, 2, 3) * psc(cstep); + } + if (order_type == 2) + { + i4 = ivec4(gy * psc(cstep) + (gz * 4) * psc(w) + gx) + ivec4(0, 1, 2, 3) * psc(w); + } + if (order_type == 3) + { + i4 = ivec4(gx * psc(cstep) + (gz * 4) * psc(w) + gy) + ivec4(0, 1, 2, 3) * psc(w); + } + if (order_type == 4) + { + i4 = ivec4(gy * psc(cstep) + gx * psc(w) + gz * 4) + ivec4(0, 1, 2, 3); + } + if (order_type == 5) + { + i4 = ivec4(gx * psc(cstep) + gy * psc(w) + gz * 4) + ivec4(0, 1, 2, 3); + } + } + +#if NCNN_fp16_packed + ivec4 v_offset; + ivec4 lane2; + + if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + + v_offset = ((y4 / 8) * psc(w) + x4) * 4 + (y4 % 8) / 2; + lane2 = y4 % 2; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + + v_offset = ((z4 / 8) * psc(cstep) + y4 * psc(w) + x4) * 4 + (z4 % 8) / 2; + lane2 = z4 % 2; + } + + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r); + afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g); + afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b); + afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a); + + afpvec4 v = afpvec4(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a]); + + buffer_st4(top_blob_data, gi, v); +#else + ivec4 v_offset; + + if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + + v_offset = ((y4 / 8) * psc(w) + x4) * 8 + y4 % 8; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + + v_offset = ((z4 / 8) * psc(cstep) + y4 * psc(w) + x4) * 8 + z4 % 8; + } + + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_cp1to4(top_blob_data, gi, bottom_blob_data, v_offset); +#endif +#endif +} diff --git a/source/device/vulkan/shaders/pooling.comp b/source/device/vulkan/shaders/pooling.comp new file mode 100644 index 000000000..5a647430f --- /dev/null +++ b/source/device/vulkan/shaders/pooling.comp @@ 
-0,0 +1,226 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#define FLT_MAX 3.402823466e+38 + +layout (constant_id = 0) const int pooling_type = 0; +layout (constant_id = 1) const int kernel_w = 1; +layout (constant_id = 2) const int kernel_h = 1; +layout (constant_id = 3) const int stride_w = 1; +layout (constant_id = 4) const int stride_h = 1; +layout (constant_id = 5) const int pad_left = 0; +layout (constant_id = 6) const int pad_right = 0; +layout (constant_id = 7) const int pad_top = 0; +layout (constant_id = 8) const int pad_bottom = 0; +layout (constant_id = 9) const int global_pooling = 0; +layout (constant_id = 10) const int pad_mode = 0; +layout (constant_id = 11) const int avgpool_count_include_pad = 0; + +#define shape_constant_id_offset 12 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; + + int wtailpad; + int htailpad; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afp res; + + if (pooling_type == 0) + { + res = afp(-FLT_MAX); + +#if NCNN_image_shader + int sx = gx * stride_w; + int sy = gy * stride_h; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + afp v = image3d_ld1(bottom_blob, ivec3(sx + x, sy + y, gz)); + res = max(res, v); + } + } 
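+ // buffer path below mirrors this max-pooling window walk using flat offsets:
+ // v_offset starts at the window's top-left element and advances by one input
+ // row (psc(w)) per kernel row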
+#else + int v_offset = gz * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + afp v = buffer_ld1(bottom_blob_data, v_offset + x); + res = max(res, v); + } + + v_offset += psc(w); + } +#endif + } + if (pooling_type == 1 && avgpool_count_include_pad == 0) + { + res = afp(0.f); + int area = 0; + + int sx = gx * stride_w; + int sy = gy * stride_h; + +#if NCNN_image_shader + for (int y = 0; y < kernel_h; y++) + { + if (sy + y < pad_top) + continue; + + if (sy + y >= psc(h) - pad_bottom - p.htailpad) + break; + + for (int x = 0; x < kernel_w; x++) + { + if (sx + x < pad_left) + continue; + + if (sx + x >= psc(w) - pad_right - p.wtailpad) + break; + + res += image3d_ld1(bottom_blob, ivec3(sx + x, sy + y, gz)); + area += 1; + } + } +#else + int v_offset = gz * psc(cstep) + sy * psc(w) + sx; + + for (int y = 0; y < kernel_h; y++) + { + if (sy + y < pad_top) + { + v_offset += psc(w); + continue; + } + + if (sy + y >= psc(h) - pad_bottom - p.htailpad) + break; + + for (int x = 0; x < kernel_w; x++) + { + if (sx + x < pad_left) + { + continue; + } + + if (sx + x >= psc(w) - pad_right - p.wtailpad) + break; + + res += buffer_ld1(bottom_blob_data, v_offset + x); + area += 1; + } + + v_offset += psc(w); + } +#endif + + res /= afp(area); + } + if (pooling_type == 1 && avgpool_count_include_pad == 1) + { + res = afp(0.f); + +#if NCNN_image_shader + int sx = gx * stride_w; + int sy = gy * stride_h; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + res += image3d_ld1(bottom_blob, ivec3(sx + x, sy + y, gz)); + } + } +#else + int v_offset = gz * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + res += buffer_ld1(bottom_blob_data, v_offset + x); + } + + v_offset += psc(w); + } +#endif + + res /= afp(kernel_w * kernel_h); + } + +#if NCNN_image_shader + image3d_st1(top_blob, ivec3(gx, gy, gz), res); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st1(top_blob_data, gi, res); +#endif +} diff --git a/source/device/vulkan/shaders/pooling_global.comp b/source/device/vulkan/shaders/pooling_global.comp new file mode 100644 index 000000000..8947a3d7a --- /dev/null +++ b/source/device/vulkan/shaders/pooling_global.comp @@ -0,0 +1,130 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
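+// pooling_global: one invocation per channel (gx); reduces the whole w*h plane
+// of that channel to a single value using max (pooling_type == 0) or mean
+// (pooling_type == 1) and stores it at index gx of the 1-D output blob.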
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#define FLT_MAX 3.402823466e+38 + +layout (constant_id = 0) const int pooling_type = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= 1 || gz >= 1) + return; + + int size = psc(w) * psc(h); + int v_offset = gx * psc(cstep); + + afp res; + + if (pooling_type == 0) + { + res = afp(-FLT_MAX); + +#if NCNN_image_shader + for (int y = 0; y < psc(h); y++) + { + for (int x = 0; x < psc(w); x++) + { + afp v = image3d_ld1(bottom_blob, ivec3(x, y, gx)); + res = max(res, v); + } + } +#else + for (int i = 0; i < size; i++) + { + afp v = buffer_ld1(bottom_blob_data, v_offset + i); + res = max(res, v); + } +#endif + } + if (pooling_type == 1) + { + res = afp(0.f); + +#if NCNN_image_shader + for (int y = 0; y < psc(h); y++) + { + for (int x = 0; x < psc(w); x++) + { + res += image3d_ld1(bottom_blob, ivec3(x, y, gx)); + } + } +#else + for (int i = 0; i < size; i++) + { + res += buffer_ld1(bottom_blob_data, v_offset + i); + } +#endif + + res /= afp(size); + } + +#if NCNN_image_shader + image1d_st1(top_blob, gx, res); +#else + buffer_st1(top_blob_data, gx, res); +#endif +} diff --git a/source/device/vulkan/shaders/pooling_global_pack4.comp b/source/device/vulkan/shaders/pooling_global_pack4.comp new file mode 100644 index 000000000..a8634cce8 --- /dev/null +++ b/source/device/vulkan/shaders/pooling_global_pack4.comp @@ -0,0 +1,130 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#define FLT_MAX 3.402823466e+38 + +layout (constant_id = 0) const int pooling_type = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= 1 || gz >= 1) + return; + + int size = psc(w) * psc(h); + int v_offset = gx * psc(cstep); + + afpvec4 res; + + if (pooling_type == 0) + { + res = afpvec4(-FLT_MAX); + +#if NCNN_image_shader + for (int y = 0; y < psc(h); y++) + { + for (int x = 0; x < psc(w); x++) + { + afpvec4 v = image3d_ld4(bottom_blob, ivec3(x, y, gx)); + res = max(res, v); + } + } +#else + for (int i = 0; i < size; i++) + { + afpvec4 v = buffer_ld4(bottom_blob_data, v_offset + i); + res = max(res, v); + } +#endif + } + if (pooling_type == 1) + { + res = afpvec4(0.f); + +#if NCNN_image_shader + for (int y = 0; y < psc(h); y++) + { + for (int x = 0; x < psc(w); x++) + { + res += image3d_ld4(bottom_blob, ivec3(x, y, gx)); + } + } +#else + for (int i = 0; i < size; i++) + { + res += buffer_ld4(bottom_blob_data, v_offset + i); + } +#endif + + res /= afp(size); + } + +#if NCNN_image_shader + image1d_st4(top_blob, gx, res); +#else + buffer_st4(top_blob_data, gx, res); +#endif +} diff --git a/source/device/vulkan/shaders/pooling_global_pack8.comp b/source/device/vulkan/shaders/pooling_global_pack8.comp new file mode 100644 index 000000000..3b9f43069 --- /dev/null +++ b/source/device/vulkan/shaders/pooling_global_pack8.comp @@ -0,0 +1,139 @@ +// Tencent is pleased to support the open source community by 
making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#define FLT_MAX 3.402823466e+38 + +layout (constant_id = 0) const int pooling_type = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= 1 || gz >= 1) + return; + + int size = psc(w) * psc(h); + int v_offset = gx * psc(cstep); + + afpvec8 res; + + if (pooling_type == 0) + { + res = afpvec8(afpvec4(-FLT_MAX), afpvec4(-FLT_MAX)); + +#if NCNN_image_shader + for (int y = 0; y < psc(h); y++) + { + for (int x = 0; x < psc(w); x++) + { + afpvec8 v = image3d_ld8(bottom_blob, ivec3(x, y, gx)); + res[0] = max(res[0], v[0]); + res[1] = max(res[1], v[1]); + } + } +#else + for (int i = 0; i < size; i++) + { + afpvec8 v = buffer_ld8(bottom_blob_data, v_offset + i); + res[0] = max(res[0], v[0]); + res[1] = max(res[1], v[1]); + } +#endif + } + if (pooling_type == 1) + { + res = afpvec8(afpvec4(0.f), afpvec4(0.f)); + +#if NCNN_image_shader + for (int y = 0; y < psc(h); y++) + { + for (int x = 0; x < psc(w); x++) + { + afpvec8 v = image3d_ld8(bottom_blob, ivec3(x, y, gx)); + res[0] += v[0]; + res[1] += v[1]; + } + } +#else + for (int i = 0; i < size; i++) + { + afpvec8 v = buffer_ld8(bottom_blob_data, v_offset + i); + res[0] += 
v[0]; + res[1] += v[1]; + } +#endif + + afp area = afp(size); + res[0] /= area; + res[1] /= area; + } + +#if NCNN_image_shader + image1d_st8(top_blob, gx, res); +#else + buffer_st8(top_blob_data, gx, res); +#endif +} diff --git a/source/device/vulkan/shaders/pooling_pack4.comp b/source/device/vulkan/shaders/pooling_pack4.comp new file mode 100644 index 000000000..4b574ac4d --- /dev/null +++ b/source/device/vulkan/shaders/pooling_pack4.comp @@ -0,0 +1,226 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#define FLT_MAX 3.402823466e+38 + +layout (constant_id = 0) const int pooling_type = 0; +layout (constant_id = 1) const int kernel_w = 1; +layout (constant_id = 2) const int kernel_h = 1; +layout (constant_id = 3) const int stride_w = 1; +layout (constant_id = 4) const int stride_h = 1; +layout (constant_id = 5) const int pad_left = 0; +layout (constant_id = 6) const int pad_right = 0; +layout (constant_id = 7) const int pad_top = 0; +layout (constant_id = 8) const int pad_bottom = 0; +layout (constant_id = 9) const int global_pooling = 0; +layout (constant_id = 10) const int pad_mode = 0; +layout (constant_id = 11) const int avgpool_count_include_pad = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#define shape_constant_id_offset 12 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; + + int wtailpad; + int htailpad; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = 
int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec4 res; + + if (pooling_type == 0) + { + res = afpvec4(-FLT_MAX); + +#if NCNN_image_shader + int sx = gx * stride_w; + int sy = gy * stride_h; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v = image3d_ld4(bottom_blob, ivec3(sx + x, sy + y, gz)); + res = max(res, v); + } + } +#else + int v_offset = gz * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v = buffer_ld4(bottom_blob_data, v_offset + x); + res = max(res, v); + } + + v_offset += psc(w); + } +#endif + } + else if (pooling_type == 1 && avgpool_count_include_pad == 0) + { + res = afpvec4(0.f); + int area = 0; + + int sx = gx * stride_w; + int sy = gy * stride_h; + +#if NCNN_image_shader + for (int y = 0; y < kernel_h; y++) + { + if (sy + y < pad_top) + continue; + + if (sy + y >= psc(h) - pad_bottom - p.htailpad) + break; + + for (int x = 0; x < kernel_w; x++) + { + if (sx + x < pad_left) + continue; + + if (sx + x >= psc(w) - pad_right - p.wtailpad) + break; + + res += image3d_ld4(bottom_blob, ivec3(sx + x, sy + y, gz)); + area += 1; + } + } +#else + int v_offset = gz * psc(cstep) + sy * psc(w) + sx; + + for (int y = 0; y < kernel_h; y++) + { + if (sy + y < pad_top) + { + v_offset += psc(w); + continue; + } + + if (sy + y >= psc(h) - pad_bottom - p.htailpad) + break; + + for (int x = 0; x < kernel_w; x++) + { + if (sx + x < pad_left) + { + continue; + } + + if (sx + x >= psc(w) - pad_right - p.wtailpad) + break; + + res += buffer_ld4(bottom_blob_data, v_offset + x); + area += 1; + } + + v_offset += psc(w); + } +#endif + + res /= afp(area); + } + else if (pooling_type == 1 && avgpool_count_include_pad == 1) + { + res = afpvec4(0.f); + +#if NCNN_image_shader + int sx = gx * stride_w; + int sy = gy * stride_h; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + res += image3d_ld4(bottom_blob, ivec3(sx + x, sy + y, gz)); + } + } +#else + int v_offset = gz * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + res += buffer_ld4(bottom_blob_data, v_offset + x); + } + + v_offset += psc(w); + } +#endif + + res /= afp(kernel_w * kernel_h); + } + +#if NCNN_image_shader + image3d_st4(top_blob, ivec3(gx, gy, gz), res); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st4(top_blob_data, gi, res); +#endif +} diff --git a/source/device/vulkan/shaders/pooling_pack8.comp b/source/device/vulkan/shaders/pooling_pack8.comp new file mode 100644 index 000000000..4ff7ac902 --- /dev/null +++ b/source/device/vulkan/shaders/pooling_pack8.comp @@ -0,0 +1,242 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#define FLT_MAX 3.402823466e+38 + +layout (constant_id = 0) const int pooling_type = 0; +layout (constant_id = 1) const int kernel_w = 1; +layout (constant_id = 2) const int kernel_h = 1; +layout (constant_id = 3) const int stride_w = 1; +layout (constant_id = 4) const int stride_h = 1; +layout (constant_id = 5) const int pad_left = 0; +layout (constant_id = 6) const int pad_right = 0; +layout (constant_id = 7) const int pad_top = 0; +layout (constant_id = 8) const int pad_bottom = 0; +layout (constant_id = 9) const int global_pooling = 0; +layout (constant_id = 10) const int pad_mode = 0; +layout (constant_id = 11) const int avgpool_count_include_pad = 0; + +#define shape_constant_id_offset 12 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; + + int wtailpad; + int htailpad; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec8 res; + + if (pooling_type == 0) + { + res = afpvec8(afpvec4(-FLT_MAX), afpvec4(-FLT_MAX)); + +#if NCNN_image_shader + int sx = gx * stride_w; + int sy = gy * stride_h; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx + x, sy + y, gz)); + res[0] = max(res[0], v[0]); + res[1] = max(res[1], v[1]); + } + } +#else + int v_offset = gz * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v = buffer_ld8(bottom_blob_data, v_offset + x); + res[0] = max(res[0], v[0]); + res[1] = max(res[1], v[1]); + } + + v_offset += psc(w); + } +#endif + } + else if (pooling_type == 1 && avgpool_count_include_pad == 0) + { + res = afpvec8(afpvec4(0.f), afpvec4(0.f)); + int area = 0; + + int sx = gx * stride_w; + int sy = gy * 
stride_h; + +#if NCNN_image_shader + for (int y = 0; y < kernel_h; y++) + { + if (sy + y < pad_top) + continue; + + if (sy + y >= psc(h) - pad_bottom - p.htailpad) + break; + + for (int x = 0; x < kernel_w; x++) + { + if (sx + x < pad_left) + continue; + + if (sx + x >= psc(w) - pad_right - p.wtailpad) + break; + + afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx + x, sy + y, gz)); + res[0] += v[0]; + res[1] += v[1]; + area += 1; + } + } +#else + int v_offset = gz * psc(cstep) + sy * psc(w) + sx; + + for (int y = 0; y < kernel_h; y++) + { + if (sy + y < pad_top) + { + v_offset += psc(w); + continue; + } + + if (sy + y >= psc(h) - pad_bottom - p.htailpad) + break; + + for (int x = 0; x < kernel_w; x++) + { + if (sx + x < pad_left) + { + continue; + } + + if (sx + x >= psc(w) - pad_right - p.wtailpad) + break; + + afpvec8 v = buffer_ld8(bottom_blob_data, v_offset + x); + res[0] += v[0]; + res[1] += v[1]; + area += 1; + } + + v_offset += psc(w); + } +#endif + + res[0] /= afp(area); + res[1] /= afp(area); + } + else if (pooling_type == 1 && avgpool_count_include_pad == 1) + { + res = afpvec8(afpvec4(0.f), afpvec4(0.f)); + +#if NCNN_image_shader + int sx = gx * stride_w; + int sy = gy * stride_h; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx + x, sy + y, gz)); + res[0] += v[0]; + res[1] += v[1]; + } + } +#else + int v_offset = gz * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v = buffer_ld8(bottom_blob_data, v_offset + x); + res[0] += v[0]; + res[1] += v[1]; + } + + v_offset += psc(w); + } +#endif + + afp area = afp(kernel_w * kernel_h); + res[0] /= area; + res[1] /= area; + } + +#if NCNN_image_shader + image3d_st8(top_blob, ivec3(gx, gy, gz), res); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + // res = afpvec8(afpvec4(1.0f), afpvec4(1.0f)); + + buffer_st8(top_blob_data, gi, res); +#endif +} diff --git a/source/device/vulkan/shaders/priorbox.comp b/source/device/vulkan/shaders/priorbox.comp new file mode 100644 index 000000000..1503b8866 --- /dev/null +++ b/source/device/vulkan/shaders/priorbox.comp @@ -0,0 +1,170 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
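For orientation, a minimal host-side sketch in plain C of the per-cell box geometry that the priorbox.comp shader below evaluates: a centre is placed at (index + offset) * step, half-extents come from min_size, sqrt(min_size * max_size) and the aspect ratios, and each box is normalised by the image size and optionally clipped. Every numeric value here is an illustrative placeholder, not a value taken from this patch.

#include <math.h>
#include <stdio.h>

/* editorial sketch, not part of the patch: one SSD-style prior box */
static void make_prior(float cx, float cy, float bw, float bh,
                       float img_w, float img_h, int clip, float box[4])
{
    box[0] = (cx - bw * 0.5f) / img_w;   /* x_min */
    box[1] = (cy - bh * 0.5f) / img_h;   /* y_min */
    box[2] = (cx + bw * 0.5f) / img_w;   /* x_max */
    box[3] = (cy + bh * 0.5f) / img_h;   /* y_max */
    if (clip)
        for (int i = 0; i < 4; i++)
            box[i] = fminf(fmaxf(box[i], 0.f), 1.f);
}

int main(void)
{
    /* one feature-map cell at (3, 2), step 16, offset 0.5 -- made-up values */
    const float cx = (3 + 0.5f) * 16.f;
    const float cy = (2 + 0.5f) * 16.f;
    const float min_size = 32.f, max_size = 64.f, ar = 2.f;
    float b[4];

    make_prior(cx, cy, min_size, min_size, 300.f, 300.f, 1, b);          /* min-size box */
    make_prior(cx, cy, sqrtf(min_size * max_size),
               sqrtf(min_size * max_size), 300.f, 300.f, 1, b);          /* max-size box */
    make_prior(cx, cy, min_size * sqrtf(ar), min_size / sqrtf(ar),
               300.f, 300.f, 1, b);                                      /* aspect-ratio box */
    printf("aspect-ratio box: %.3f %.3f %.3f %.3f\n", b[0], b[1], b[2], b[3]);
    return 0;
}

The shader additionally writes the four variance values next to each box, in the second half of the output blob.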
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int flip = 0; +layout (constant_id = 1) const int clip = 0; +layout (constant_id = 2) const float offset = 0; +layout (constant_id = 3) const float variances_0 = 0; +layout (constant_id = 4) const float variances_1 = 0; +layout (constant_id = 5) const float variances_2 = 0; +layout (constant_id = 6) const float variances_3 = 0; +layout (constant_id = 7) const int num_min_size = 0; +layout (constant_id = 8) const int num_max_size = 0; +layout (constant_id = 9) const int num_aspect_ratio = 0; +layout (constant_id = 10) const int num_prior = 0; + +#define shape_constant_id_offset 11 +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_fp16_packed +layout (binding = 0) writeonly buffer top_blob { vec4 top_blob_data[]; }; +#else +layout (binding = 0) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif +layout (binding = 1) readonly buffer min_sizes { sfp min_sizes_data[]; }; +layout (binding = 2) readonly buffer max_sizes { sfp max_sizes_data[]; }; +layout (binding = 3) readonly buffer aspect_ratios { sfp aspect_ratios_data[]; }; + +layout (push_constant) uniform parameter +{ + int w; + int h; + + float image_w; + float image_h; + float step_w; + float step_h; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= num_min_size || gy >= psc(w) || gz >= psc(h)) + return; + + // anchor and variance + int v_offset = (gz * psc(w) + gy) * num_prior + gx; + int var_offset = psc(w) * psc(h) * num_prior + v_offset; + + afp center_x = (afp(gy) + afp(offset)) * afp(p.step_w); + afp center_y = (afp(gz) + afp(offset)) * afp(p.step_h); + afpvec4 center = afpvec4(center_x, center_y, center_x, center_y); + + afpvec4 image_norm = afp(1.f) / afpvec4(p.image_w, p.image_h, p.image_w, p.image_h); + + afpvec4 box; + + afp box_w; + afp box_h; + + afp min_size = buffer_ld1(min_sizes_data, gx); + + afpvec4 variances = afpvec4(variances_0, variances_1, variances_2, variances_3); + + // min size box + box_w = box_h = min_size; + + box = (center + afpvec4(-box_w, -box_h, box_w, box_h) * afp(0.5f)) * image_norm; + box = clip == 1 ? clamp(box, afp(0.f), afp(1.f)) : box; + +#if NCNN_fp16_packed + top_blob_data[v_offset] = vec4(box); + top_blob_data[var_offset] = vec4(variances); +#else + buffer_st4(top_blob_data, v_offset, box); + buffer_st4(top_blob_data, var_offset, variances); +#endif + + v_offset += 1; + var_offset += 1; + + if (num_max_size > 0) + { + afp max_size = buffer_ld1(max_sizes_data, gx); + + // max size box + box_w = box_h = sqrt(min_size * max_size); + + box = (center + afpvec4(-box_w, -box_h, box_w, box_h) * afp(0.5f)) * image_norm; + box = clip == 1 ? 
clamp(box, afp(0.f), afp(1.f)) : box; + +#if NCNN_fp16_packed + top_blob_data[v_offset] = vec4(box); + top_blob_data[var_offset] = vec4(variances); +#else + buffer_st4(top_blob_data, v_offset, box); + buffer_st4(top_blob_data, var_offset, variances); +#endif + + v_offset += 1; + var_offset += 1; + } + + // all aspect_ratios + for (int pi = 0; pi < num_aspect_ratio; pi++) + { + afp ar = buffer_ld1(aspect_ratios_data, pi); + + box_w = min_size * sqrt(ar); + box_h = min_size / sqrt(ar); + + box = (center + afpvec4(-box_w, -box_h, box_w, box_h) * afp(0.5f)) * image_norm; + box = clip == 1 ? clamp(box, afp(0.f), afp(1.f)) : box; + +#if NCNN_fp16_packed + top_blob_data[v_offset] = vec4(box); + top_blob_data[var_offset] = vec4(variances); +#else + buffer_st4(top_blob_data, v_offset, box); + buffer_st4(top_blob_data, var_offset, variances); +#endif + + v_offset += 1; + var_offset += 1; + + if (flip == 1) + { + box = (center + afpvec4(-box_h, -box_w, box_h, box_w) * afp(0.5f)) * image_norm; + box = clip == 1 ? clamp(box, afp(0.f), afp(1.f)) : box; + +#if NCNN_fp16_packed + top_blob_data[v_offset] = vec4(box); + top_blob_data[var_offset] = vec4(variances); +#else + buffer_st4(top_blob_data, v_offset, box); + buffer_st4(top_blob_data, var_offset, variances); +#endif + + v_offset += 1; + var_offset += 1; + } + } +} diff --git a/source/device/vulkan/shaders/priorbox_mxnet.comp b/source/device/vulkan/shaders/priorbox_mxnet.comp new file mode 100644 index 000000000..bec66fde9 --- /dev/null +++ b/source/device/vulkan/shaders/priorbox_mxnet.comp @@ -0,0 +1,92 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
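priorbox_mxnet.comp below follows the mxnet _contrib_MultiBoxPrior convention instead: sizes are already fractions of the image, the ratio-1 box gets an h/w aspect correction on its half-width, and only the gx == 0 invocation appends the extra ratio-only boxes. A tiny C sketch of the half-extent arithmetic, with made-up numbers:

#include <math.h>
#include <stdio.h>

/* editorial sketch, not part of the patch: half-extents used by the
 * mxnet-style prior box kernel; all numbers are made up */
int main(void)
{
    const float w = 19.f, h = 19.f;   /* feature-map width and height */
    const float size = 0.2f;          /* fractional box size          */
    const float ratio = 2.f;

    float cw = size * 0.5f * h / w;   /* ratio == 1 half-width  */
    float ch = size * 0.5f;           /* ratio == 1 half-height */

    float cwr = cw * sqrtf(ratio);    /* extra ratio box, emitted by gx == 0 only */
    float chr = ch / sqrtf(ratio);

    printf("cw=%.4f ch=%.4f  cwr=%.4f chr=%.4f\n", cw, ch, cwr, chr);
    return 0;
}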
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int clip = 0; +layout (constant_id = 1) const float offset = 0; +layout (constant_id = 2) const int num_sizes = 0; +layout (constant_id = 3) const int num_ratios = 0; +layout (constant_id = 4) const int num_prior = 0; + +#define shape_constant_id_offset 5 +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +layout (binding = 1) readonly buffer min_sizes { sfp min_sizes_data[]; }; +layout (binding = 2) readonly buffer aspect_ratios { sfp aspect_ratios_data[]; }; + +layout (push_constant) uniform parameter +{ + int w; + int h; + + float step_w; + float step_h; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= num_sizes || gy >= psc(w) || gz >= psc(h)) + return; + + // mxnet style _contrib_MultiBoxPrior + int v_offset = (gz * psc(w) + gy) * num_prior + gx; + + afp center_x = (afp(gy) + afp(offset)) * afp(p.step_w); + afp center_y = (afp(gz) + afp(offset)) * afp(p.step_h); + afpvec4 center = afpvec4(center_x, center_y, center_x, center_y); + + // ratio = 1, various sizes + afp size = buffer_ld1(min_sizes_data, gx); + afp cw = size * afp(0.5f) * afp(psc(h)) / afp(psc(w)); + afp ch = size * afp(0.5f); + + afpvec4 box = center + afpvec4(-cw, -ch, cw, ch); + box = clip == 1 ? clamp(box, afp(0.f), afp(1.f)) : box; + + buffer_st4(top_blob_data, v_offset, box); + + if (gx == 0) + { + // various ratios, size = min_size = size[0] + for (int pi = 1; pi < num_ratios; pi++) + { + afp v = buffer_ld1(aspect_ratios_data, pi); + afp cwr = cw * sqrt(v); + afp chr = ch / sqrt(v); + + afpvec4 box = center + afpvec4(-cwr, -chr, cwr, chr); + box = clip == 1 ? clamp(box, afp(0.f), afp(1.f)) : box; + + buffer_st4(top_blob_data, v_offset + num_sizes - 1 + pi, box); + } + } +} diff --git a/source/device/vulkan/shaders/relu.comp b/source/device/vulkan/shaders/relu.comp new file mode 100644 index 000000000..cb08948d3 --- /dev/null +++ b/source/device/vulkan/shaders/relu.comp @@ -0,0 +1,107 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
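relu.comp below handles both plain and leaky ReLU in one kernel, selected by the slope specialization constant. A scalar C sketch of the same rule, for orientation:

#include <stdio.h>

/* editorial sketch, not part of the patch: the element-wise rule of relu.comp */
static float relu_apply(float v, float slope)
{
    if (slope == 0.f)
        return v > 0.f ? v : 0.f;     /* plain ReLU */
    return v < 0.f ? v * slope : v;   /* leaky ReLU */
}

int main(void)
{
    printf("%.2f %.2f\n", relu_apply(-2.f, 0.f), relu_apply(-2.f, 0.1f)); /* 0.00 -0.20 */
    return 0;
}

The pack4 and pack8 variants that follow apply the same rule per lane, using mix() with a lessThan() mask instead of the scalar ternary.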
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const float slope = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) buffer bottom_top_blob { sfp bottom_top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + afp v; + if (psc(dims) == 1) + { + v = image1d_ld1(bottom_blob_1d, gx); + } + else if (psc(dims) == 2) + { + v = image2d_ld1(bottom_blob_2d, ivec2(gx, gy)); + } + else // if (psc(dims) == 3) + { + v = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, gz)); + } +#else + const int gi = gz * psc(cstep) + gy * psc(w) + gx; + + afp v = buffer_ld1(bottom_top_blob_data, gi); +#endif + + if (slope == 0) + v = max(v, afp(0.f)); + else + v = v < afp(0.f) ? v * afp(slope) : v; + +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_st1(top_blob_1d, gx, v); + } + else if (psc(dims) == 2) + { + image2d_st1(top_blob_2d, ivec2(gx, gy), v); + } + else // if (psc(dims) == 3) + { + image3d_st1(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else + buffer_st1(bottom_top_blob_data, gi, v); +#endif +} diff --git a/source/device/vulkan/shaders/relu_pack4.comp b/source/device/vulkan/shaders/relu_pack4.comp new file mode 100644 index 000000000..cc02824cb --- /dev/null +++ b/source/device/vulkan/shaders/relu_pack4.comp @@ -0,0 +1,107 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const float slope = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) buffer bottom_top_blob { sfpvec4 bottom_top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + afpvec4 v; + if (psc(dims) == 1) + { + v = image1d_ld4(bottom_blob_1d, gx); + } + else if (psc(dims) == 2) + { + v = image2d_ld4(bottom_blob_2d, ivec2(gx, gy)); + } + else // if (psc(dims) == 3) + { + v = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, gz)); + } +#else + const int gi = gz * psc(cstep) + gy * psc(w) + gx; + + afpvec4 v = buffer_ld4(bottom_top_blob_data, gi); +#endif + + if (slope == 0) + v = max(v, afp(0.f)); + else + v = mix(v, v * afp(slope), lessThan(v, afpvec4(0.f))); + +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_st4(top_blob_1d, gx, v); + } + else if (psc(dims) == 2) + { + image2d_st4(top_blob_2d, ivec2(gx, gy), v); + } + else // if (psc(dims) == 3) + { + image3d_st4(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else + buffer_st4(bottom_top_blob_data, gi, v); +#endif +} diff --git a/source/device/vulkan/shaders/relu_pack8.comp b/source/device/vulkan/shaders/relu_pack8.comp new file mode 100644 index 000000000..25862cde3 --- /dev/null +++ b/source/device/vulkan/shaders/relu_pack8.comp @@ -0,0 +1,114 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const float slope = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) buffer bottom_top_blob { sfpvec8 bottom_top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + afpvec8 v; + if (psc(dims) == 1) + { + v = image1d_ld8(bottom_blob_1d, gx); + } + else if (psc(dims) == 2) + { + v = image2d_ld8(bottom_blob_2d, ivec2(gx, gy)); + } + else // if (psc(dims) == 3) + { + v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); + } +#else + const int gi = gz * psc(cstep) + gy * psc(w) + gx; + + afpvec8 v = buffer_ld8(bottom_top_blob_data, gi); +#endif + + if (slope == 0) + { + v[0] = max(v[0], afp(0.f)); + v[1] = max(v[1], afp(0.f)); + } + else + { + v[0] = mix(v[0], v[0] * afp(slope), lessThan(v[0], afpvec4(0.f))); + v[1] = mix(v[1], v[1] * afp(slope), lessThan(v[1], afpvec4(0.f))); + } + +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_st8(top_blob_1d, gx, v); + } + else if (psc(dims) == 2) + { + image2d_st8(top_blob_2d, ivec2(gx, gy), v); + } + else // if (psc(dims) == 3) + { + image3d_st8(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else + buffer_st8(bottom_top_blob_data, gi, v); +#endif +} diff --git a/source/device/vulkan/shaders/reshape.comp b/source/device/vulkan/shaders/reshape.comp new file mode 100644 index 000000000..3b2109789 --- /dev/null +++ b/source/device/vulkan/shaders/reshape.comp @@ -0,0 +1,138 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int ndim = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + int i; + + if (ndim == 1) i = gx; + if (ndim == 2) i = gy * psc(outw) + gx; + if (ndim == 3) i = gz * psc(outh) * psc(outw) + gy * psc(outw) + gx; + + int size = psc(w) * psc(h); + + int z = i / size; + int y = i % size / psc(w); + int x = i % size % psc(w); + +#if NCNN_image_shader + afp v; + + if (psc(dims) == 1) + { + v = image1d_ld1(bottom_blob_1d, x); + } + else if (psc(dims) == 2) + { + v = image2d_ld1(bottom_blob_2d, ivec2(x, y)); + } + else // if (psc(dims) == 3) + { + v = image3d_ld1(bottom_blob_3d, ivec3(x, y, z)); + } + + if (ndim == 1) + { + image1d_st1(top_blob_1d, gx, v); + } + if (ndim == 2) + { + image2d_st1(top_blob_2d, ivec2(gx, gy), v); + } + if (ndim == 3) + { + image3d_st1(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else + int v_offset = z * psc(cstep) + y * psc(w) + x; + + int gi; + if (ndim == 1) + { + gi = gx; + } + if (ndim == 2) + { + gi = gy * psc(outw) + gx; + } + if (ndim == 3) + { + gi = gz * psc(outcstep) + gy * psc(outw) + gx; + } + + buffer_cp1(top_blob_data, gi, bottom_blob_data, v_offset); +#endif +} diff --git a/source/device/vulkan/shaders/reshape_pack1to4.comp b/source/device/vulkan/shaders/reshape_pack1to4.comp new file mode 100644 index 000000000..9a33d7908 --- /dev/null +++ b/source/device/vulkan/shaders/reshape_pack1to4.comp @@ -0,0 +1,147 @@ +// Tencent is pleased to 
support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int ndim = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + ivec4 i4; + + if (ndim == 1) i4 = gx * 4 + ivec4(0, 1, 2, 3); + if (ndim == 2) i4 = (gy * 4) * psc(outw) + gx + ivec4(0, 1, 2, 3) * psc(outw); + if (ndim == 3) i4 = (gz * 4) * psc(outh) * psc(outw) + gy * psc(outw) + gx + ivec4(0, 1, 2, 3) * psc(outh) * psc(outw); + + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + +#if NCNN_image_shader + afpvec4 v; + + if (psc(dims) == 1) + { + v.r = image1d_ld1(bottom_blob_1d, x4.r); + v.g = image1d_ld1(bottom_blob_1d, x4.g); + v.b = image1d_ld1(bottom_blob_1d, x4.b); + v.a = image1d_ld1(bottom_blob_1d, x4.a); + } + else if (psc(dims) == 2) + { + v.r = image2d_ld1(bottom_blob_2d, ivec2(x4.r, y4.r)); + v.g = 
image2d_ld1(bottom_blob_2d, ivec2(x4.g, y4.g)); + v.b = image2d_ld1(bottom_blob_2d, ivec2(x4.b, y4.b)); + v.a = image2d_ld1(bottom_blob_2d, ivec2(x4.a, y4.a)); + } + else // if (psc(dims) == 3) + { + v.r = image3d_ld1(bottom_blob_3d, ivec3(x4.r, y4.r, z4.r)); + v.g = image3d_ld1(bottom_blob_3d, ivec3(x4.g, y4.g, z4.g)); + v.b = image3d_ld1(bottom_blob_3d, ivec3(x4.b, y4.b, z4.b)); + v.a = image3d_ld1(bottom_blob_3d, ivec3(x4.a, y4.a, z4.a)); + } + + if (ndim == 1) + { + image1d_st4(top_blob_1d, gx, v); + } + if (ndim == 2) + { + image2d_st4(top_blob_2d, ivec2(gx, gy), v); + } + if (ndim == 3) + { + image3d_st4(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else + ivec4 v_offset = z4 * psc(cstep) + y4 * psc(w) + x4; + + int gi; + if (ndim == 1) + { + gi = gx; + } + if (ndim == 2) + { + gi = gy * psc(outw) + gx; + } + if (ndim == 3) + { + gi = gz * psc(outcstep) + gy * psc(outw) + gx; + } + + buffer_cp1to4(top_blob_data, gi, bottom_blob_data, v_offset); +#endif +} diff --git a/source/device/vulkan/shaders/reshape_pack1to8.comp b/source/device/vulkan/shaders/reshape_pack1to8.comp new file mode 100644 index 000000000..93e096a51 --- /dev/null +++ b/source/device/vulkan/shaders/reshape_pack1to8.comp @@ -0,0 +1,177 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
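reshape_pack1to8.comp below gathers eight scalars from an unpacked source into one pack-8 output element. The index arithmetic is easiest to follow on the CPU; this C sketch reproduces the ndim == 3 case with illustrative shapes (cstep may carry row padding, so it is kept separate from w * h):

#include <stdio.h>

/* editorial sketch, not part of the patch: gather indices built by
 * reshape_pack1to8.comp for one output element, ndim == 3 case */
int main(void)
{
    const int w = 16, h = 8, cstep = 136;   /* unpacked source, padded cstep */
    const int outw = 4, outh = 8;           /* reshaped destination plane    */

    const int gx = 1, gy = 2, gz = 0;       /* one pack-8 output element     */

    for (int k = 0; k < 8; k++)
    {
        int i = (gz * 8 + k) * outh * outw + gy * outw + gx;  /* flat element index */

        int z = i / (w * h);                 /* source coordinate */
        int y = i % (w * h) / w;
        int x = i % (w * h) % w;

        printf("lane %d: i=%d -> src offset %d\n", k, i, z * cstep + y * w + x);
    }
    return 0;
}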
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int ndim = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + ivec4 i4; + ivec4 ii4; + + if (ndim == 1) + { + i4 = gx * 8 + ivec4(0, 1, 2, 3); + ii4 = i4 + 4; + } + if (ndim == 2) + { + i4 = (gy * 8) * psc(outw) + gx + ivec4(0, 1, 2, 3) * psc(outw); + ii4 = i4 + 4 * psc(outw); + } + if (ndim == 3) + { + i4 = (gz * 8) * psc(outh) * psc(outw) + gy * psc(outw) + gx + ivec4(0, 1, 2, 3) * psc(outh) * psc(outw); + ii4 = i4 + 4 * psc(outh) * psc(outw); + } + + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + ivec4 zz4 = ii4 / size; + ivec4 yy4 = ii4 % size / psc(w); + ivec4 xx4 = ii4 % size % psc(w); + +#if NCNN_image_shader + afpvec8 v; + + if (psc(dims) == 1) + { + v[0].r = image1d_ld1(bottom_blob_1d, x4.r); + v[0].g = image1d_ld1(bottom_blob_1d, x4.g); + v[0].b = image1d_ld1(bottom_blob_1d, x4.b); + v[0].a = image1d_ld1(bottom_blob_1d, x4.a); + v[1].r = image1d_ld1(bottom_blob_1d, xx4.r); + v[1].g = image1d_ld1(bottom_blob_1d, xx4.g); + v[1].b = image1d_ld1(bottom_blob_1d, xx4.b); + v[1].a = image1d_ld1(bottom_blob_1d, xx4.a); + } + else if (psc(dims) == 2) + { + v[0].r = image2d_ld1(bottom_blob_2d, ivec2(x4.r, y4.r)); + v[0].g = image2d_ld1(bottom_blob_2d, ivec2(x4.g, y4.g)); + v[0].b = image2d_ld1(bottom_blob_2d, ivec2(x4.b, y4.b)); + v[0].a = image2d_ld1(bottom_blob_2d, ivec2(x4.a, y4.a)); + v[1].r = image2d_ld1(bottom_blob_2d, ivec2(xx4.r, 
yy4.r)); + v[1].g = image2d_ld1(bottom_blob_2d, ivec2(xx4.g, yy4.g)); + v[1].b = image2d_ld1(bottom_blob_2d, ivec2(xx4.b, yy4.b)); + v[1].a = image2d_ld1(bottom_blob_2d, ivec2(xx4.a, yy4.a)); + } + else // if (psc(dims) == 3) + { + v[0].r = image3d_ld1(bottom_blob_3d, ivec3(x4.r, y4.r, z4.r)); + v[0].g = image3d_ld1(bottom_blob_3d, ivec3(x4.g, y4.g, z4.g)); + v[0].b = image3d_ld1(bottom_blob_3d, ivec3(x4.b, y4.b, z4.b)); + v[0].a = image3d_ld1(bottom_blob_3d, ivec3(x4.a, y4.a, z4.a)); + v[1].r = image3d_ld1(bottom_blob_3d, ivec3(xx4.r, yy4.r, zz4.r)); + v[1].g = image3d_ld1(bottom_blob_3d, ivec3(xx4.g, yy4.g, zz4.g)); + v[1].b = image3d_ld1(bottom_blob_3d, ivec3(xx4.b, yy4.b, zz4.b)); + v[1].a = image3d_ld1(bottom_blob_3d, ivec3(xx4.a, yy4.a, zz4.a)); + } + + if (ndim == 1) + { + image1d_st8(top_blob_1d, gx, v); + } + if (ndim == 2) + { + image2d_st8(top_blob_2d, ivec2(gx, gy), v); + } + if (ndim == 3) + { + image3d_st8(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else + ivec4 v_offset = z4 * psc(cstep) + y4 * psc(w) + x4; + ivec4 vv_offset = zz4 * psc(cstep) + yy4 * psc(w) + xx4; + + int gi; + if (ndim == 1) + { + gi = gx; + } + if (ndim == 2) + { + gi = gy * psc(outw) + gx; + } + if (ndim == 3) + { + gi = gz * psc(outcstep) + gy * psc(outw) + gx; + } + + buffer_cp1to8(top_blob_data, gi, bottom_blob_data, v_offset, vv_offset); +#endif +} diff --git a/source/device/vulkan/shaders/reshape_pack4.comp b/source/device/vulkan/shaders/reshape_pack4.comp new file mode 100644 index 000000000..6f85d9779 --- /dev/null +++ b/source/device/vulkan/shaders/reshape_pack4.comp @@ -0,0 +1,228 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
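reshape_pack4.comp below reshapes between two pack-4 layouts, so each output lane may come from a different input pack-4 slot; the kernel therefore loads four pack-4 values and picks one lane from each. Its plain buffer path relies on the addressing shown in this C sketch, where channel z sits in block z / 4, lane z % 4 (shapes are illustrative):

#include <stdio.h>

/* editorial sketch, not part of the patch: pack-4 scalar addressing assumed
 * by the buffer path of reshape_pack4.comp */
static int pack4_scalar_offset(int x, int y, int z, int w, int cstep)
{
    return ((z / 4) * cstep + y * w + x) * 4 + z % 4;
}

int main(void)
{
    const int w = 8, h = 4, cstep = w * h;
    printf("%d\n", pack4_scalar_offset(3, 1, 6, w, cstep)); /* block 1, lane 2 -> 174 */
    return 0;
}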
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int ndim = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else +#if NCNN_fp16_packed +layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; }; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +#endif +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + ivec4 i4; + + if (ndim == 1) i4 = gx * 4 + ivec4(0, 1, 2, 3); + if (ndim == 2) i4 = (gy * 4) * psc(outw) + gx + ivec4(0, 1, 2, 3) * psc(outw); + if (ndim == 3) i4 = (gz * 4) * psc(outh) * psc(outw) + gy * psc(outw) + gx + ivec4(0, 1, 2, 3) * psc(outh) * psc(outw); + +#if NCNN_image_shader + afpvec4 vr; + afpvec4 vg; + afpvec4 vb; + afpvec4 va; + + ivec4 lane4; + + if (psc(dims) == 1) + { + ivec4 x4 = i4; + + vr = image1d_ld4(bottom_blob_1d, x4.r / 4); + vg = image1d_ld4(bottom_blob_1d, x4.g / 4); + vb = image1d_ld4(bottom_blob_1d, x4.b / 4); + va = image1d_ld4(bottom_blob_1d, x4.a / 4); + + lane4 = x4 % 4; + } + else if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + + vr = image2d_ld4(bottom_blob_2d, ivec2(x4.r, y4.r / 4)); + vg = image2d_ld4(bottom_blob_2d, ivec2(x4.g, y4.g / 4)); + vb = image2d_ld4(bottom_blob_2d, ivec2(x4.b, y4.b / 4)); + va = image2d_ld4(bottom_blob_2d, ivec2(x4.a, y4.a / 4)); + + lane4 = y4 % 4; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + + vr = image3d_ld4(bottom_blob_3d, ivec3(x4.r, y4.r, z4.r / 4)); + vg = image3d_ld4(bottom_blob_3d, ivec3(x4.g, y4.g, z4.g / 4)); + vb = image3d_ld4(bottom_blob_3d, ivec3(x4.b, y4.b, z4.b 
/ 4)); + va = image3d_ld4(bottom_blob_3d, ivec3(x4.a, y4.a, z4.a / 4)); + + lane4 = z4 % 4; + } + + afpvec4 v = afpvec4(vr[lane4.r], vg[lane4.g], vb[lane4.b], va[lane4.a]); + + if (ndim == 1) + { + image1d_st4(top_blob_1d, gx, v); + } + if (ndim == 2) + { + image2d_st4(top_blob_2d, ivec2(gx, gy), v); + } + if (ndim == 3) + { + image3d_st4(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else +#if NCNN_fp16_packed + ivec4 v_offset; + ivec4 lane2; + + if (psc(dims) == 1) + { + v_offset = i4 / 2; + lane2 = i4 % 2; + } + else if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + + v_offset = ((y4 / 4) * psc(w) + x4) * 2 + (y4 % 4) / 2; + lane2 = y4 % 2; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + + v_offset = ((z4 / 4) * psc(cstep) + y4 * psc(w) + x4) * 2 + (z4 % 4) / 2; + lane2 = z4 % 2; + } + + int gi; + + if (ndim == 1) gi = gx; + if (ndim == 2) gi = gy * psc(outw) + gx; + if (ndim == 3) gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r); + afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g); + afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b); + afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a); + + afpvec4 v = afpvec4(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a]); + + buffer_st4(top_blob_data, gi, v); +#else + ivec4 v_offset; + + if (psc(dims) == 1) + { + v_offset = i4; + } + else if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + + v_offset = ((y4 / 4) * psc(w) + x4) * 4 + y4 % 4; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + + v_offset = ((z4 / 4) * psc(cstep) + y4 * psc(w) + x4) * 4 + z4 % 4; + } + + int gi; + + if (ndim == 1) gi = gx; + if (ndim == 2) gi = gy * psc(outw) + gx; + if (ndim == 3) gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_cp1to4(top_blob_data, gi, bottom_blob_data, v_offset); +#endif +#endif +} diff --git a/source/device/vulkan/shaders/reshape_pack4to1.comp b/source/device/vulkan/shaders/reshape_pack4to1.comp new file mode 100644 index 000000000..abf9331e7 --- /dev/null +++ b/source/device/vulkan/shaders/reshape_pack4to1.comp @@ -0,0 +1,166 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
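reshape_pack4to1.comp below moves data in the opposite direction from the pack1to4 kernel: it reads one pack-4 element, whose four lanes are consecutive along the source's packed axis (channels when dims == 3), and scatters them to four flat positions of the unpacked output. A C sketch of those destination indices for the 3-D case, with made-up shapes:

#include <stdio.h>

/* editorial sketch, not part of the patch: destination indices of one pack-4
 * element scattered by reshape_pack4to1.comp, 3-D case */
int main(void)
{
    const int w = 8, h = 4;               /* packed source plane          */
    const int gx = 2, gy = 1, gz = 3;     /* one pack-4 source element    */

    int z0 = gz * 4;                      /* first unpacked channel       */
    int stride = h * w;                   /* distance between the 4 lanes */
    int i0 = z0 * h * w + gy * w + gx;

    for (int k = 0; k < 4; k++)
        printf("lane %d -> flat index %d\n", k, i0 + k * stride);
    return 0;
}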
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int ndim = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + + ivec3 gxyz = ivec3(gx, gy, gz); + + gxyz[psc(dims) - 1] *= 4; + + int i4_0 = gxyz.z * psc(h) * psc(w) + gxyz.y * psc(w) + gxyz.x; + + ivec3 gxyz4 = ivec3(1, psc(w), psc(h) * psc(w)); + + ivec4 i4 = i4_0 + ivec4(0, 1, 2, 3) * gxyz4[psc(dims) - 1]; + +#if NCNN_image_shader + afpvec4 v; + + if (psc(dims) == 1) + { + v = image1d_ld4(bottom_blob_1d, gx); + } + else if (psc(dims) == 2) + { + v = image2d_ld4(bottom_blob_2d, ivec2(gx, gy)); + } + else // if (psc(dims) == 3) + { + v = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, gz)); + } + + if (ndim == 1) + { + ivec4 x4 = i4; + + image1d_st1(top_blob_1d, x4.r, v.r); + image1d_st1(top_blob_1d, x4.g, v.g); + image1d_st1(top_blob_1d, x4.b, v.b); + image1d_st1(top_blob_1d, x4.a, v.a); + } + if (ndim == 2) + { + ivec4 y4 = i4 / psc(outw); + ivec4 x4 = i4 % psc(outw); + + image2d_st1(top_blob_2d, ivec2(x4.r, y4.r), v.r); + image2d_st1(top_blob_2d, ivec2(x4.g, y4.g), v.g); + image2d_st1(top_blob_2d, ivec2(x4.b, y4.b), v.b); + image2d_st1(top_blob_2d, ivec2(x4.a, y4.a), v.a); + } + if (ndim == 3) + { + int size = psc(outw) * psc(outh); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(outw); + ivec4 x4 = i4 % size % psc(outw); + + image3d_st1(top_blob_3d, ivec3(x4.r, y4.r, z4.r), v.r); + image3d_st1(top_blob_3d, ivec3(x4.g, y4.g, z4.g), v.g); + image3d_st1(top_blob_3d, ivec3(x4.b, y4.b, z4.b), v.b); + image3d_st1(top_blob_3d, ivec3(x4.a, y4.a, z4.a), v.a); + } +#else + ivec4 
v_offset; + + if (ndim == 1) + { + v_offset = i4; + } + if (ndim == 2) + { + ivec4 y4 = i4 / psc(outw); + ivec4 x4 = i4 % psc(outw); + + v_offset = y4 * psc(outw) + x4; + } + if (ndim == 3) + { + int size = psc(outw) * psc(outh); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(outw); + ivec4 x4 = i4 % size % psc(outw); + + v_offset = z4 * psc(outcstep) + y4 * psc(outw) + x4; + } + + int gi = gz * psc(cstep) + gy * psc(w) + gx; + + buffer_cp4to1(top_blob_data, v_offset, bottom_blob_data, gi); +#endif +} diff --git a/source/device/vulkan/shaders/reshape_pack4to8.comp b/source/device/vulkan/shaders/reshape_pack4to8.comp new file mode 100644 index 000000000..c3950a1aa --- /dev/null +++ b/source/device/vulkan/shaders/reshape_pack4to8.comp @@ -0,0 +1,301 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int ndim = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else +#if NCNN_fp16_packed +layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; }; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +#endif +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int 
outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + ivec4 i4; + ivec4 ii4; + + if (ndim == 1) + { + i4 = gx * 8 + ivec4(0, 1, 2, 3); + ii4 = i4 + 4; + } + if (ndim == 2) + { + i4 = (gy * 8) * psc(outw) + gx + ivec4(0, 1, 2, 3) * psc(outw); + ii4 = i4 + 4 * psc(outw); + } + if (ndim == 3) + { + i4 = (gz * 8) * psc(outh) * psc(outw) + gy * psc(outw) + gx + ivec4(0, 1, 2, 3) * psc(outh) * psc(outw); + ii4 = i4 + 4 * psc(outh) * psc(outw); + } + +#if NCNN_image_shader + afpvec8 v; + + if (psc(dims) == 1) + { + ivec4 x4 = i4; + ivec4 xx4 = ii4; + + afpvec4 v0 = image1d_ld4(bottom_blob_1d, x4.r / 4); + afpvec4 v1 = image1d_ld4(bottom_blob_1d, x4.g / 4); + afpvec4 v2 = image1d_ld4(bottom_blob_1d, x4.b / 4); + afpvec4 v3 = image1d_ld4(bottom_blob_1d, x4.a / 4); + afpvec4 v4 = image1d_ld4(bottom_blob_1d, xx4.r / 4); + afpvec4 v5 = image1d_ld4(bottom_blob_1d, xx4.g / 4); + afpvec4 v6 = image1d_ld4(bottom_blob_1d, xx4.b / 4); + afpvec4 v7 = image1d_ld4(bottom_blob_1d, xx4.a / 4); + + v[0].r = v0[x4.r % 4]; + v[0].g = v1[x4.g % 4]; + v[0].b = v2[x4.b % 4]; + v[0].a = v3[x4.a % 4]; + v[1].r = v4[xx4.r % 4]; + v[1].g = v5[xx4.g % 4]; + v[1].b = v6[xx4.b % 4]; + v[1].a = v7[xx4.a % 4]; + } + else if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + ivec4 yy4 = ii4 / psc(w); + ivec4 xx4 = ii4 % psc(w); + + afpvec4 v0 = image2d_ld4(bottom_blob_2d, ivec2(x4.r, y4.r / 4)); + afpvec4 v1 = image2d_ld4(bottom_blob_2d, ivec2(x4.g, y4.g / 4)); + afpvec4 v2 = image2d_ld4(bottom_blob_2d, ivec2(x4.b, y4.b / 4)); + afpvec4 v3 = image2d_ld4(bottom_blob_2d, ivec2(x4.a, y4.a / 4)); + afpvec4 v4 = image2d_ld4(bottom_blob_2d, ivec2(xx4.r, yy4.r / 4)); + afpvec4 v5 = image2d_ld4(bottom_blob_2d, ivec2(xx4.g, yy4.g / 4)); + afpvec4 v6 = image2d_ld4(bottom_blob_2d, ivec2(xx4.b, yy4.b / 4)); + afpvec4 v7 = image2d_ld4(bottom_blob_2d, ivec2(xx4.a, yy4.a / 4)); + + v[0].r = v0[y4.r % 4]; + v[0].g = v1[y4.g % 4]; + v[0].b = v2[y4.b % 4]; + v[0].a = v3[y4.a % 4]; + v[1].r = v4[yy4.r % 4]; + v[1].g = v5[yy4.g % 4]; + v[1].b = v6[yy4.b % 4]; + v[1].a = v7[yy4.a % 4]; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + ivec4 zz4 = ii4 / size; + ivec4 yy4 = ii4 % size / psc(w); + ivec4 xx4 = ii4 % size % psc(w); + + afpvec4 v0 = image3d_ld4(bottom_blob_3d, ivec3(x4.r, y4.r, z4.r / 4)); + afpvec4 v1 = image3d_ld4(bottom_blob_3d, ivec3(x4.g, y4.g, z4.g / 4)); + afpvec4 v2 = image3d_ld4(bottom_blob_3d, ivec3(x4.b, y4.b, z4.b / 4)); + afpvec4 v3 = image3d_ld4(bottom_blob_3d, ivec3(x4.a, y4.a, z4.a / 4)); + afpvec4 v4 = image3d_ld4(bottom_blob_3d, ivec3(xx4.r, yy4.r, zz4.r / 4)); + afpvec4 v5 = image3d_ld4(bottom_blob_3d, ivec3(xx4.g, yy4.g, zz4.g / 4)); + afpvec4 v6 = image3d_ld4(bottom_blob_3d, ivec3(xx4.b, yy4.b, zz4.b / 4)); + afpvec4 v7 = image3d_ld4(bottom_blob_3d, ivec3(xx4.a, yy4.a, zz4.a / 4)); + + v[0].r = v0[z4.r % 4]; + v[0].g = v1[z4.g % 4]; + v[0].b = v2[z4.b % 4]; + v[0].a = v3[z4.a % 4]; + v[1].r = v4[zz4.r % 4]; + v[1].g = v5[zz4.g % 4]; + v[1].b = v6[zz4.b % 4]; + v[1].a = v7[zz4.a % 4]; + } + + if (ndim == 1) + { + image1d_st8(top_blob_1d, gx, v); + } + if (ndim == 2) + { + image2d_st8(top_blob_2d, ivec2(gx, gy), v); + } + if (ndim == 3) + { + image3d_st8(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else +#if 
NCNN_fp16_packed + ivec4 v_offset; + ivec4 vv_offset; + ivec4 lane2; + ivec4 lane4; + + if (psc(dims) == 1) + { + v_offset = i4 / 2; + lane2 = i4 % 2; + vv_offset = ii4 / 2; + lane4 = ii4 % 2; + } + else if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + ivec4 yy4 = ii4 / psc(w); + ivec4 xx4 = ii4 % psc(w); + + v_offset = ((y4 / 4) * psc(w) + x4) * 2 + (y4 % 4) / 2; + lane2 = y4 % 2; + vv_offset = ((yy4 / 4) * psc(w) + xx4) * 2 + (yy4 % 4) / 2; + lane4 = yy4 % 2; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + ivec4 zz4 = ii4 / size; + ivec4 yy4 = ii4 % size / psc(w); + ivec4 xx4 = ii4 % size % psc(w); + + v_offset = ((z4 / 4) * psc(cstep) + y4 * psc(w) + x4) * 2 + (z4 % 4) / 2; + lane2 = z4 % 2; + vv_offset = ((zz4 / 4) * psc(cstep) + yy4 * psc(w) + xx4) * 2 + (zz4 % 4) / 2; + lane4 = zz4 % 2; + } + + int gi; + + if (ndim == 1) gi = gx; + if (ndim == 2) gi = gy * psc(outw) + gx; + if (ndim == 3) gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r); + afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g); + afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b); + afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a); + + afpvec2 vvr = buffer_ld2(bottom_blob_data, vv_offset.r); + afpvec2 vvg = buffer_ld2(bottom_blob_data, vv_offset.g); + afpvec2 vvb = buffer_ld2(bottom_blob_data, vv_offset.b); + afpvec2 vva = buffer_ld2(bottom_blob_data, vv_offset.a); + + afpvec8 v = afpvec8(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a], vvr[lane4.r], vvg[lane4.g], vvb[lane4.b], vva[lane4.a]); + + buffer_st8(top_blob_data, gi, v); +#else + ivec4 v_offset; + ivec4 vv_offset; + + if (psc(dims) == 1) + { + v_offset = i4; + vv_offset = ii4; + } + else if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + ivec4 yy4 = ii4 / psc(w); + ivec4 xx4 = ii4 % psc(w); + + v_offset = ((y4 / 4) * psc(w) + x4) * 4 + y4 % 4; + vv_offset = ((yy4 / 4) * psc(w) + xx4) * 4 + yy4 % 4; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + ivec4 zz4 = ii4 / size; + ivec4 yy4 = ii4 % size / psc(w); + ivec4 xx4 = ii4 % size % psc(w); + + v_offset = ((z4 / 4) * psc(cstep) + y4 * psc(w) + x4) * 4 + z4 % 4; + vv_offset = ((zz4 / 4) * psc(cstep) + yy4 * psc(w) + xx4) * 4 + zz4 % 4; + } + + int gi; + + if (ndim == 1) gi = gx; + if (ndim == 2) gi = gy * psc(outw) + gx; + if (ndim == 3) gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_cp1to8(top_blob_data, gi, bottom_blob_data, v_offset, vv_offset); +#endif +#endif +} diff --git a/source/device/vulkan/shaders/reshape_pack8.comp b/source/device/vulkan/shaders/reshape_pack8.comp new file mode 100644 index 000000000..23ee23acb --- /dev/null +++ b/source/device/vulkan/shaders/reshape_pack8.comp @@ -0,0 +1,301 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int ndim = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else +#if NCNN_fp16_packed +layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; }; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +#endif +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + ivec4 i4; + ivec4 ii4; + + if (ndim == 1) + { + i4 = gx * 8 + ivec4(0, 1, 2, 3); + ii4 = i4 + 4; + } + if (ndim == 2) + { + i4 = (gy * 8) * psc(outw) + gx + ivec4(0, 1, 2, 3) * psc(outw); + ii4 = i4 + 4 * psc(outw); + } + if (ndim == 3) + { + i4 = (gz * 8) * psc(outh) * psc(outw) + gy * psc(outw) + gx + ivec4(0, 1, 2, 3) * psc(outh) * psc(outw); + ii4 = i4 + 4 * psc(outh) * psc(outw); + } + +#if NCNN_image_shader + afpvec8 v; + + if (psc(dims) == 1) + { + ivec4 x4 = i4; + ivec4 xx4 = ii4; + + afpvec8 v0 = image1d_ld8(bottom_blob_1d, x4.r / 8); + afpvec8 v1 = image1d_ld8(bottom_blob_1d, x4.g / 8); + afpvec8 v2 = image1d_ld8(bottom_blob_1d, x4.b / 8); + afpvec8 v3 = image1d_ld8(bottom_blob_1d, x4.a / 8); + afpvec8 v4 = image1d_ld8(bottom_blob_1d, xx4.r / 8); + afpvec8 v5 = image1d_ld8(bottom_blob_1d, xx4.g / 8); + afpvec8 v6 = 
image1d_ld8(bottom_blob_1d, xx4.b / 8); + afpvec8 v7 = image1d_ld8(bottom_blob_1d, xx4.a / 8); + + v[0].r = v0[(x4.r % 8) / 4][x4.r % 4]; + v[0].g = v1[(x4.g % 8) / 4][x4.g % 4]; + v[0].b = v2[(x4.b % 8) / 4][x4.b % 4]; + v[0].a = v3[(x4.a % 8) / 4][x4.a % 4]; + v[1].r = v4[(xx4.r % 8) / 4][xx4.r % 4]; + v[1].g = v5[(xx4.g % 8) / 4][xx4.g % 4]; + v[1].b = v6[(xx4.b % 8) / 4][xx4.b % 4]; + v[1].a = v7[(xx4.a % 8) / 4][xx4.a % 4]; + } + else if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + ivec4 yy4 = ii4 / psc(w); + ivec4 xx4 = ii4 % psc(w); + + afpvec8 v0 = image2d_ld8(bottom_blob_2d, ivec2(x4.r, y4.r / 8)); + afpvec8 v1 = image2d_ld8(bottom_blob_2d, ivec2(x4.g, y4.g / 8)); + afpvec8 v2 = image2d_ld8(bottom_blob_2d, ivec2(x4.b, y4.b / 8)); + afpvec8 v3 = image2d_ld8(bottom_blob_2d, ivec2(x4.a, y4.a / 8)); + afpvec8 v4 = image2d_ld8(bottom_blob_2d, ivec2(xx4.r, yy4.r / 8)); + afpvec8 v5 = image2d_ld8(bottom_blob_2d, ivec2(xx4.g, yy4.g / 8)); + afpvec8 v6 = image2d_ld8(bottom_blob_2d, ivec2(xx4.b, yy4.b / 8)); + afpvec8 v7 = image2d_ld8(bottom_blob_2d, ivec2(xx4.a, yy4.a / 8)); + + v[0].r = v0[(y4.r % 8) / 4][y4.r % 4]; + v[0].g = v1[(y4.g % 8) / 4][y4.g % 4]; + v[0].b = v2[(y4.b % 8) / 4][y4.b % 4]; + v[0].a = v3[(y4.a % 8) / 4][y4.a % 4]; + v[1].r = v4[(yy4.r % 8) / 4][yy4.r % 4]; + v[1].g = v5[(yy4.g % 8) / 4][yy4.g % 4]; + v[1].b = v6[(yy4.b % 8) / 4][yy4.b % 4]; + v[1].a = v7[(yy4.a % 8) / 4][yy4.a % 4]; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + ivec4 zz4 = ii4 / size; + ivec4 yy4 = ii4 % size / psc(w); + ivec4 xx4 = ii4 % size % psc(w); + + afpvec8 v0 = image3d_ld8(bottom_blob_3d, ivec3(x4.r, y4.r, z4.r / 8)); + afpvec8 v1 = image3d_ld8(bottom_blob_3d, ivec3(x4.g, y4.g, z4.g / 8)); + afpvec8 v2 = image3d_ld8(bottom_blob_3d, ivec3(x4.b, y4.b, z4.b / 8)); + afpvec8 v3 = image3d_ld8(bottom_blob_3d, ivec3(x4.a, y4.a, z4.a / 8)); + afpvec8 v4 = image3d_ld8(bottom_blob_3d, ivec3(xx4.r, yy4.r, zz4.r / 8)); + afpvec8 v5 = image3d_ld8(bottom_blob_3d, ivec3(xx4.g, yy4.g, zz4.g / 8)); + afpvec8 v6 = image3d_ld8(bottom_blob_3d, ivec3(xx4.b, yy4.b, zz4.b / 8)); + afpvec8 v7 = image3d_ld8(bottom_blob_3d, ivec3(xx4.a, yy4.a, zz4.a / 8)); + + v[0].r = v0[(z4.r % 8) / 4][z4.r % 4]; + v[0].g = v1[(z4.g % 8) / 4][z4.g % 4]; + v[0].b = v2[(z4.b % 8) / 4][z4.b % 4]; + v[0].a = v3[(z4.a % 8) / 4][z4.a % 4]; + v[1].r = v4[(zz4.r % 8) / 4][zz4.r % 4]; + v[1].g = v5[(zz4.g % 8) / 4][zz4.g % 4]; + v[1].b = v6[(zz4.b % 8) / 4][zz4.b % 4]; + v[1].a = v7[(zz4.a % 8) / 4][zz4.a % 4]; + } + + if (ndim == 1) + { + image1d_st8(top_blob_1d, gx, v); + } + if (ndim == 2) + { + image2d_st8(top_blob_2d, ivec2(gx, gy), v); + } + if (ndim == 3) + { + image3d_st8(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else +#if NCNN_fp16_packed + ivec4 v_offset; + ivec4 vv_offset; + ivec4 lane2; + ivec4 lane4; + + if (psc(dims) == 1) + { + v_offset = i4 / 2; + lane2 = i4 % 2; + vv_offset = ii4 / 2; + lane4 = ii4 % 2; + } + else if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + ivec4 yy4 = ii4 / psc(w); + ivec4 xx4 = ii4 % psc(w); + + v_offset = ((y4 / 8) * psc(w) + x4) * 4 + (y4 % 8) / 2; + lane2 = y4 % 2; + vv_offset = ((yy4 / 8) * psc(w) + xx4) * 4 + (yy4 % 8) / 2; + lane4 = yy4 % 2; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + ivec4 zz4 = ii4 / 
size; + ivec4 yy4 = ii4 % size / psc(w); + ivec4 xx4 = ii4 % size % psc(w); + + v_offset = ((z4 / 8) * psc(cstep) + y4 * psc(w) + x4) * 4 + (z4 % 8) / 2; + lane2 = z4 % 2; + vv_offset = ((zz4 / 8) * psc(cstep) + yy4 * psc(w) + xx4) * 4 + (zz4 % 8) / 2; + lane4 = zz4 % 2; + } + + int gi; + + if (ndim == 1) gi = gx; + if (ndim == 2) gi = gy * psc(outw) + gx; + if (ndim == 3) gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r); + afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g); + afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b); + afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a); + + afpvec2 vvr = buffer_ld2(bottom_blob_data, vv_offset.r); + afpvec2 vvg = buffer_ld2(bottom_blob_data, vv_offset.g); + afpvec2 vvb = buffer_ld2(bottom_blob_data, vv_offset.b); + afpvec2 vva = buffer_ld2(bottom_blob_data, vv_offset.a); + + afpvec8 v = afpvec8(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a], vvr[lane4.r], vvg[lane4.g], vvb[lane4.b], vva[lane4.a]); + + buffer_st8(top_blob_data, gi, v); +#else + ivec4 v_offset; + ivec4 vv_offset; + + if (psc(dims) == 1) + { + v_offset = i4; + vv_offset = ii4; + } + else if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + ivec4 yy4 = ii4 / psc(w); + ivec4 xx4 = ii4 % psc(w); + + v_offset = ((y4 / 8) * psc(w) + x4) * 8 + y4 % 8; + vv_offset = ((yy4 / 8) * psc(w) + xx4) * 8 + yy4 % 8; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + ivec4 zz4 = ii4 / size; + ivec4 yy4 = ii4 % size / psc(w); + ivec4 xx4 = ii4 % size % psc(w); + + v_offset = ((z4 / 8) * psc(cstep) + y4 * psc(w) + x4) * 8 + z4 % 8; + vv_offset = ((zz4 / 8) * psc(cstep) + yy4 * psc(w) + xx4) * 8 + zz4 % 8; + } + + int gi; + + if (ndim == 1) gi = gx; + if (ndim == 2) gi = gy * psc(outw) + gx; + if (ndim == 3) gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_cp1to8(top_blob_data, gi, bottom_blob_data, v_offset, vv_offset); +#endif +#endif +} diff --git a/source/device/vulkan/shaders/reshape_pack8to1.comp b/source/device/vulkan/shaders/reshape_pack8to1.comp new file mode 100644 index 000000000..05cd2f869 --- /dev/null +++ b/source/device/vulkan/shaders/reshape_pack8to1.comp @@ -0,0 +1,195 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int ndim = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + + ivec3 gxyz = ivec3(gx, gy, gz); + + gxyz[psc(dims) - 1] *= 8; + + int i4_0 = gxyz.z * psc(h) * psc(w) + gxyz.y * psc(w) + gxyz.x; + + ivec3 gxyz4 = ivec3(1, psc(w), psc(h) * psc(w)); + + ivec4 i4 = i4_0 + ivec4(0, 1, 2, 3) * gxyz4[psc(dims) - 1]; + ivec4 ii4 = i4 + 4 * gxyz4[psc(dims) - 1]; + +#if NCNN_image_shader + afpvec8 v; + + if (psc(dims) == 1) + { + v = image1d_ld8(bottom_blob_1d, gx); + } + else if (psc(dims) == 2) + { + v = image2d_ld8(bottom_blob_2d, ivec2(gx, gy)); + } + else // if (psc(dims) == 3) + { + v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); + } + + if (ndim == 1) + { + ivec4 x4 = i4; + ivec4 xx4 = ii4; + + image1d_st1(top_blob_1d, x4.r, v[0].r); + image1d_st1(top_blob_1d, x4.g, v[0].g); + image1d_st1(top_blob_1d, x4.b, v[0].b); + image1d_st1(top_blob_1d, x4.a, v[0].a); + image1d_st1(top_blob_1d, xx4.r, v[1].r); + image1d_st1(top_blob_1d, xx4.g, v[1].g); + image1d_st1(top_blob_1d, xx4.b, v[1].b); + image1d_st1(top_blob_1d, xx4.a, v[1].a); + } + if (ndim == 2) + { + ivec4 y4 = i4 / psc(outw); + ivec4 x4 = i4 % psc(outw); + ivec4 yy4 = ii4 / psc(outw); + ivec4 xx4 = ii4 % psc(outw); + + image2d_st1(top_blob_2d, ivec2(x4.r, y4.r), v[0].r); + image2d_st1(top_blob_2d, ivec2(x4.g, y4.g), v[0].g); + image2d_st1(top_blob_2d, ivec2(x4.b, y4.b), v[0].b); + image2d_st1(top_blob_2d, ivec2(x4.a, y4.a), v[0].a); + image2d_st1(top_blob_2d, ivec2(xx4.r, 
yy4.r), v[1].r); + image2d_st1(top_blob_2d, ivec2(xx4.g, yy4.g), v[1].g); + image2d_st1(top_blob_2d, ivec2(xx4.b, yy4.b), v[1].b); + image2d_st1(top_blob_2d, ivec2(xx4.a, yy4.a), v[1].a); + } + if (ndim == 3) + { + int size = psc(outw) * psc(outh); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(outw); + ivec4 x4 = i4 % size % psc(outw); + ivec4 zz4 = ii4 / size; + ivec4 yy4 = ii4 % size / psc(outw); + ivec4 xx4 = ii4 % size % psc(outw); + + image3d_st1(top_blob_3d, ivec3(x4.r, y4.r, z4.r), v[0].r); + image3d_st1(top_blob_3d, ivec3(x4.g, y4.g, z4.g), v[0].g); + image3d_st1(top_blob_3d, ivec3(x4.b, y4.b, z4.b), v[0].b); + image3d_st1(top_blob_3d, ivec3(x4.a, y4.a, z4.a), v[0].a); + image3d_st1(top_blob_3d, ivec3(xx4.r, yy4.r, zz4.r), v[1].r); + image3d_st1(top_blob_3d, ivec3(xx4.g, yy4.g, zz4.g), v[1].g); + image3d_st1(top_blob_3d, ivec3(xx4.b, yy4.b, zz4.b), v[1].b); + image3d_st1(top_blob_3d, ivec3(xx4.a, yy4.a, zz4.a), v[1].a); + } +#else + ivec4 v_offset; + ivec4 vv_offset; + + if (ndim == 1) + { + v_offset = i4; + vv_offset = ii4; + } + if (ndim == 2) + { + ivec4 y4 = i4 / psc(outw); + ivec4 x4 = i4 % psc(outw); + ivec4 yy4 = ii4 / psc(outw); + ivec4 xx4 = ii4 % psc(outw); + + v_offset = y4 * psc(outw) + x4; + vv_offset = yy4 * psc(outw) + xx4; + } + if (ndim == 3) + { + int size = psc(outw) * psc(outh); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(outw); + ivec4 x4 = i4 % size % psc(outw); + ivec4 zz4 = ii4 / size; + ivec4 yy4 = ii4 % size / psc(outw); + ivec4 xx4 = ii4 % size % psc(outw); + + v_offset = z4 * psc(outcstep) + y4 * psc(outw) + x4; + vv_offset = zz4 * psc(outcstep) + yy4 * psc(outw) + xx4; + } + + int gi = gz * psc(cstep) + gy * psc(w) + gx; + + buffer_cp8to1(top_blob_data, v_offset, vv_offset, bottom_blob_data, gi); +#endif +} diff --git a/source/device/vulkan/shaders/reshape_pack8to4.comp b/source/device/vulkan/shaders/reshape_pack8to4.comp new file mode 100644 index 000000000..558b07170 --- /dev/null +++ b/source/device/vulkan/shaders/reshape_pack8to4.comp @@ -0,0 +1,231 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int ndim = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else +#if NCNN_fp16_packed +layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; }; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +#endif +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + ivec4 i4; + + if (ndim == 1) i4 = gx * 4 + ivec4(0, 1, 2, 3); + if (ndim == 2) i4 = (gy * 4) * psc(outw) + gx + ivec4(0, 1, 2, 3) * psc(outw); + if (ndim == 3) i4 = (gz * 4) * psc(outh) * psc(outw) + gy * psc(outw) + gx + ivec4(0, 1, 2, 3) * psc(outh) * psc(outw); + +#if NCNN_image_shader + afpvec4 v; + + if (psc(dims) == 1) + { + ivec4 x4 = i4; + + afpvec8 v0 = image1d_ld8(bottom_blob_1d, x4.r / 8); + afpvec8 v1 = image1d_ld8(bottom_blob_1d, x4.g / 8); + afpvec8 v2 = image1d_ld8(bottom_blob_1d, x4.b / 8); + afpvec8 v3 = image1d_ld8(bottom_blob_1d, x4.a / 8); + + v.r = v0[(x4.r % 8) / 4][x4.r % 4]; + v.g = v1[(x4.g % 8) / 4][x4.g % 4]; + v.b = v2[(x4.b % 8) / 4][x4.b % 4]; + v.a = v3[(x4.a % 8) / 4][x4.a % 4]; + } + else if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + + afpvec8 v0 = image2d_ld8(bottom_blob_2d, ivec2(x4.r, y4.r / 8)); + afpvec8 v1 = image2d_ld8(bottom_blob_2d, ivec2(x4.g, y4.g / 8)); + afpvec8 v2 = image2d_ld8(bottom_blob_2d, ivec2(x4.b, y4.b / 8)); + afpvec8 v3 = image2d_ld8(bottom_blob_2d, ivec2(x4.a, y4.a / 8)); + + v.r = v0[(y4.r % 8) / 4][y4.r % 4]; + v.g = v1[(y4.g % 8) / 4][y4.g % 4]; + v.b = v2[(y4.b % 8) / 4][y4.b % 4]; + v.a = v3[(y4.a % 8) / 4][y4.a % 4]; + } + else // if (psc(dims) 
== 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + + afpvec8 v0 = image3d_ld8(bottom_blob_3d, ivec3(x4.r, y4.r, z4.r / 8)); + afpvec8 v1 = image3d_ld8(bottom_blob_3d, ivec3(x4.g, y4.g, z4.g / 8)); + afpvec8 v2 = image3d_ld8(bottom_blob_3d, ivec3(x4.b, y4.b, z4.b / 8)); + afpvec8 v3 = image3d_ld8(bottom_blob_3d, ivec3(x4.a, y4.a, z4.a / 8)); + + v.r = v0[(z4.r % 8) / 4][z4.r % 4]; + v.g = v1[(z4.g % 8) / 4][z4.g % 4]; + v.b = v2[(z4.b % 8) / 4][z4.b % 4]; + v.a = v3[(z4.a % 8) / 4][z4.a % 4]; + } + + if (ndim == 1) + { + image1d_st4(top_blob_1d, gx, v); + } + if (ndim == 2) + { + image2d_st4(top_blob_2d, ivec2(gx, gy), v); + } + if (ndim == 3) + { + image3d_st4(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else +#if NCNN_fp16_packed + ivec4 v_offset; + ivec4 lane2; + + if (psc(dims) == 1) + { + v_offset = i4 / 2; + lane2 = i4 % 2; + } + else if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + + v_offset = ((y4 / 8) * psc(w) + x4) * 4 + (y4 % 8) / 2; + lane2 = y4 % 2; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + + v_offset = ((z4 / 8) * psc(cstep) + y4 * psc(w) + x4) * 4 + (z4 % 8) / 2; + lane2 = z4 % 2; + } + + int gi; + + if (ndim == 1) gi = gx; + if (ndim == 2) gi = gy * psc(outw) + gx; + if (ndim == 3) gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r); + afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g); + afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b); + afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a); + + afpvec4 v = afpvec4(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a]); + + buffer_st4(top_blob_data, gi, v); +#else + ivec4 v_offset; + + if (psc(dims) == 1) + { + v_offset = i4; + } + else if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + + v_offset = ((y4 / 8) * psc(w) + x4) * 8 + y4 % 8; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + + v_offset = ((z4 / 8) * psc(cstep) + y4 * psc(w) + x4) * 8 + z4 % 8; + } + + int gi; + + if (ndim == 1) gi = gx; + if (ndim == 2) gi = gy * psc(outw) + gx; + if (ndim == 3) gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_cp1to4(top_blob_data, gi, bottom_blob_data, v_offset); +#endif +#endif +} diff --git a/source/device/vulkan/shaders/softmax_div_sum.comp b/source/device/vulkan/shaders/softmax_div_sum.comp new file mode 100644 index 000000000..5db4bd4a3 --- /dev/null +++ b/source/device/vulkan/shaders/softmax_div_sum.comp @@ -0,0 +1,166 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int axis = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +layout (binding = 2) uniform unfp sampler1D sum_workspace_1d; +layout (binding = 2) uniform unfp sampler2D sum_workspace_2d; +#else +layout (binding = 0) buffer bottom_top_blob { sfp bottom_top_blob_data[]; }; +layout (binding = 1) readonly buffer sum_workspace { sfp sum_workspace_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + afp v; + afp sum; + + if (psc(dims) == 1) // axis == 0 + { + v = image1d_ld1(bottom_blob_1d, gx); + sum = image1d_ld1(sum_workspace_1d, 0); + } + else if (psc(dims) == 2 && axis == 0) + { + v = image2d_ld1(bottom_blob_2d, ivec2(gx, gy)); + sum = image1d_ld1(sum_workspace_1d, gx); + } + else if (psc(dims) == 2 && axis == 1) + { + v = image2d_ld1(bottom_blob_2d, ivec2(gx, gy)); + sum = image1d_ld1(sum_workspace_1d, gy); + } + else if (psc(dims) == 3 && axis == 0) + { + v = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, gz)); + sum = image2d_ld1(sum_workspace_2d, ivec2(gx, gy)); + } + else if (psc(dims) == 3 && axis == 1) + { + v = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, gz)); + sum = image2d_ld1(sum_workspace_2d, ivec2(gx, gz)); + } + else if (psc(dims) == 3 && axis == 2) + { + v = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, gz)); + sum = image2d_ld1(sum_workspace_2d, ivec2(gy, gz)); + } +#else + const int gi = gz * psc(cstep) + gy * psc(w) + gx; + + afp v = buffer_ld1(bottom_top_blob_data, gi); + + afp sum; + + if (psc(dims) == 1) // axis == 0 + { + sum = buffer_ld1(sum_workspace_data, 0); + } + else if (psc(dims) == 2 && axis == 0) + { + sum = buffer_ld1(sum_workspace_data, gx); + } + else if (psc(dims) == 2 && axis == 1) + { + sum = buffer_ld1(sum_workspace_data, gy); 
+ } + else if (psc(dims) == 3 && axis == 0) + { + sum = buffer_ld1(sum_workspace_data, gy * psc(w) + gx); + } + else if (psc(dims) == 3 && axis == 1) + { + sum = buffer_ld1(sum_workspace_data, gz * psc(w) + gx); + } + else if (psc(dims) == 3 && axis == 2) + { + sum = buffer_ld1(sum_workspace_data, gz * psc(h) + gy); + } +#endif + + v /= sum; + +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_st1(top_blob_1d, gx, v); + } + else if (psc(dims) == 2) + { + image2d_st1(top_blob_2d, ivec2(gx, gy), v); + } + else // if (psc(dims) == 3) + { + image3d_st1(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else + buffer_st1(bottom_top_blob_data, gi, v); +#endif +} diff --git a/source/device/vulkan/shaders/softmax_div_sum_pack4.comp b/source/device/vulkan/shaders/softmax_div_sum_pack4.comp new file mode 100644 index 000000000..27b28bc9e --- /dev/null +++ b/source/device/vulkan/shaders/softmax_div_sum_pack4.comp @@ -0,0 +1,175 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int axis = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +layout (binding = 2) uniform unfp sampler1D sum_workspace_1d; +layout (binding = 2) uniform unfp sampler2D sum_workspace_2d; +#else +layout (binding = 0) buffer bottom_top_blob { sfpvec4 bottom_top_blob_data[]; }; +layout (binding = 1) readonly buffer sum_workspace { sfpvec4 sum_workspace_data[]; }; +#endif + +layout (push_constant) 
uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + afpvec4 v; + afpvec4 sum; + + if (psc(dims) == 1) // axis == 0 + { + v = image1d_ld4(bottom_blob_1d, gx); + sum = image1d_ld4(sum_workspace_1d, 0); + } + else if (psc(dims) == 2 && axis == 0) + { + v = image2d_ld4(bottom_blob_2d, ivec2(gx, gy)); + sum = image1d_ld4(sum_workspace_1d, gx); + } + else if (psc(dims) == 2 && axis == 1) + { + v = image2d_ld4(bottom_blob_2d, ivec2(gx, gy)); + sum = image1d_ld4(sum_workspace_1d, gy); + } + else if (psc(dims) == 3 && axis == 0) + { + v = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, gz)); + sum = image2d_ld4(sum_workspace_2d, ivec2(gx, gy)); + } + else if (psc(dims) == 3 && axis == 1) + { + v = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, gz)); + sum = image2d_ld4(sum_workspace_2d, ivec2(gx, gz)); + } + else if (psc(dims) == 3 && axis == 2) + { + v = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, gz)); + sum = image2d_ld4(sum_workspace_2d, ivec2(gy, gz)); + } +#else + const int gi = gz * psc(cstep) + gy * psc(w) + gx; + + afpvec4 v = buffer_ld4(bottom_top_blob_data, gi); + + afpvec4 sum; + + if (psc(dims) == 1) // axis == 0 + { + sum = buffer_ld4(sum_workspace_data, 0); + } + else if (psc(dims) == 2 && axis == 0) + { + sum = buffer_ld4(sum_workspace_data, gx); + } + else if (psc(dims) == 2 && axis == 1) + { + sum = buffer_ld4(sum_workspace_data, gy); + } + else if (psc(dims) == 3 && axis == 0) + { + sum = buffer_ld4(sum_workspace_data, gy * psc(w) + gx); + } + else if (psc(dims) == 3 && axis == 1) + { + sum = buffer_ld4(sum_workspace_data, gz * psc(w) + gx); + } + else if (psc(dims) == 3 && axis == 2) + { + sum = buffer_ld4(sum_workspace_data, gz * psc(h) + gy); + } + +#if NCNN_fp16_packed || NCNN_fp16_storage + // NOTE reduce max may produce (X, undef, X, undef) on nvidia fp16p/fp16s + // TODO only enable this workaround for some nvidia driver + if (axis == 0) + { + sum = afpvec4(sum.r); + } +#endif +#endif + + v /= sum; + +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_st4(top_blob_1d, gx, v); + } + else if (psc(dims) == 2) + { + image2d_st4(top_blob_2d, ivec2(gx, gy), v); + } + else // if (psc(dims) == 3) + { + image3d_st4(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else + buffer_st4(bottom_top_blob_data, gi, v); +#endif +} diff --git a/source/device/vulkan/shaders/softmax_div_sum_pack8.comp b/source/device/vulkan/shaders/softmax_div_sum_pack8.comp new file mode 100644 index 000000000..a329d3f93 --- /dev/null +++ b/source/device/vulkan/shaders/softmax_div_sum_pack8.comp @@ -0,0 +1,177 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int axis = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +layout (binding = 2) uniform unfp sampler1D sum_workspace_1d; +layout (binding = 2) uniform unfp sampler2D sum_workspace_2d; +#else +layout (binding = 0) buffer bottom_top_blob { sfpvec8 bottom_top_blob_data[]; }; +layout (binding = 1) readonly buffer sum_workspace { sfpvec8 sum_workspace_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + afpvec8 v; + afpvec8 sum; + + if (psc(dims) == 1) // axis == 0 + { + v = image1d_ld8(bottom_blob_1d, gx); + sum = image1d_ld8(sum_workspace_1d, 0); + } + else if (psc(dims) == 2 && axis == 0) + { + v = image2d_ld8(bottom_blob_2d, ivec2(gx, gy)); + sum = image1d_ld8(sum_workspace_1d, gx); + } + else if (psc(dims) == 2 && axis == 1) + { + v = image2d_ld8(bottom_blob_2d, ivec2(gx, gy)); + sum = image1d_ld8(sum_workspace_1d, gy); + } + else if (psc(dims) == 3 && axis == 0) + { + v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); + sum = image2d_ld8(sum_workspace_2d, ivec2(gx, gy)); + } + else if (psc(dims) == 3 && axis == 1) + { + v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); + sum = image2d_ld8(sum_workspace_2d, ivec2(gx, gz)); + } + else if (psc(dims) == 3 && axis == 2) + { + v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); + sum = image2d_ld8(sum_workspace_2d, ivec2(gy, gz)); + } +#else + const int gi = gz * psc(cstep) + gy * psc(w) + gx; + + afpvec8 v = buffer_ld8(bottom_top_blob_data, gi); + + afpvec8 sum; + + if (psc(dims) == 1) // axis == 0 + { + sum = buffer_ld8(sum_workspace_data, 0); + } + 
else if (psc(dims) == 2 && axis == 0) + { + sum = buffer_ld8(sum_workspace_data, gx); + } + else if (psc(dims) == 2 && axis == 1) + { + sum = buffer_ld8(sum_workspace_data, gy); + } + else if (psc(dims) == 3 && axis == 0) + { + sum = buffer_ld8(sum_workspace_data, gy * psc(w) + gx); + } + else if (psc(dims) == 3 && axis == 1) + { + sum = buffer_ld8(sum_workspace_data, gz * psc(w) + gx); + } + else if (psc(dims) == 3 && axis == 2) + { + sum = buffer_ld8(sum_workspace_data, gz * psc(h) + gy); + } + +#if NCNN_fp16_packed || NCNN_fp16_storage + // NOTE reduce max may produce (X, undef, X, undef) on nvidia fp16p/fp16s + // TODO only enable this workaround for some nvidia driver + if (axis == 0) + { + sum = afpvec8(afpvec4(sum[0].r), afpvec4(sum[0].r)); + } +#endif +#endif + + v[0] /= sum[0]; + v[1] /= sum[1]; + +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_st8(top_blob_1d, gx, v); + } + else if (psc(dims) == 2) + { + image2d_st8(top_blob_2d, ivec2(gx, gy), v); + } + else // if (psc(dims) == 3) + { + image3d_st8(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else + buffer_st8(bottom_top_blob_data, gi, v); +#endif +} diff --git a/source/device/vulkan/shaders/softmax_exp_sub_max.comp b/source/device/vulkan/shaders/softmax_exp_sub_max.comp new file mode 100644 index 000000000..210284df4 --- /dev/null +++ b/source/device/vulkan/shaders/softmax_exp_sub_max.comp @@ -0,0 +1,166 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int axis = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +layout (binding = 2) uniform unfp sampler1D max_workspace_1d; +layout (binding = 2) uniform unfp sampler2D max_workspace_2d; +#else +layout (binding = 0) buffer bottom_top_blob { sfp bottom_top_blob_data[]; }; +layout (binding = 1) readonly buffer max_workspace { sfp max_workspace_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + afp v; + afp max_value; + + if (psc(dims) == 1) // axis == 0 + { + v = image1d_ld1(bottom_blob_1d, gx); + max_value = image1d_ld1(max_workspace_1d, 0); + } + else if (psc(dims) == 2 && axis == 0) + { + v = image2d_ld1(bottom_blob_2d, ivec2(gx, gy)); + max_value = image1d_ld1(max_workspace_1d, gx); + } + else if (psc(dims) == 2 && axis == 1) + { + v = image2d_ld1(bottom_blob_2d, ivec2(gx, gy)); + max_value = image1d_ld1(max_workspace_1d, gy); + } + else if (psc(dims) == 3 && axis == 0) + { + v = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, gz)); + max_value = image2d_ld1(max_workspace_2d, ivec2(gx, gy)); + } + else if (psc(dims) == 3 && axis == 1) + { + v = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, gz)); + max_value = image2d_ld1(max_workspace_2d, ivec2(gx, gz)); + } + else if (psc(dims) == 3 && axis == 2) + { + v = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, gz)); + max_value = image2d_ld1(max_workspace_2d, ivec2(gy, gz)); + } +#else + const int gi = gz * psc(cstep) + gy * psc(w) + gx; + + afp v = buffer_ld1(bottom_top_blob_data, gi); + + afp max_value; + + if (psc(dims) == 1) // axis == 0 + { + max_value = buffer_ld1(max_workspace_data, 0); + } + else if (psc(dims) == 2 && axis == 0) + { + max_value = buffer_ld1(max_workspace_data, gx); + } + else if (psc(dims) == 2 && 
axis == 1) + { + max_value = buffer_ld1(max_workspace_data, gy); + } + else if (psc(dims) == 3 && axis == 0) + { + max_value = buffer_ld1(max_workspace_data, gy * psc(w) + gx); + } + else if (psc(dims) == 3 && axis == 1) + { + max_value = buffer_ld1(max_workspace_data, gz * psc(w) + gx); + } + else if (psc(dims) == 3 && axis == 2) + { + max_value = buffer_ld1(max_workspace_data, gz * psc(h) + gy); + } +#endif + + v = exp(v - max_value); + +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_st1(top_blob_1d, gx, v); + } + else if (psc(dims) == 2) + { + image2d_st1(top_blob_2d, ivec2(gx, gy), v); + } + else // if (psc(dims) == 3) + { + image3d_st1(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else + buffer_st1(bottom_top_blob_data, gi, v); +#endif +} diff --git a/source/device/vulkan/shaders/softmax_exp_sub_max_pack4.comp b/source/device/vulkan/shaders/softmax_exp_sub_max_pack4.comp new file mode 100644 index 000000000..2aba5894f --- /dev/null +++ b/source/device/vulkan/shaders/softmax_exp_sub_max_pack4.comp @@ -0,0 +1,175 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int axis = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +layout (binding = 2) uniform unfp sampler1D max_workspace_1d; +layout (binding = 2) uniform unfp sampler2D max_workspace_2d; +#else +layout (binding = 0) buffer bottom_top_blob { sfpvec4 bottom_top_blob_data[]; }; +layout 
(binding = 1) readonly buffer max_workspace { sfpvec4 max_workspace_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + afpvec4 v; + afpvec4 max_value; + + if (psc(dims) == 1) // axis == 0 + { + v = image1d_ld4(bottom_blob_1d, gx); + max_value = image1d_ld4(max_workspace_1d, 0); + } + else if (psc(dims) == 2 && axis == 0) + { + v = image2d_ld4(bottom_blob_2d, ivec2(gx, gy)); + max_value = image1d_ld4(max_workspace_1d, gx); + } + else if (psc(dims) == 2 && axis == 1) + { + v = image2d_ld4(bottom_blob_2d, ivec2(gx, gy)); + max_value = image1d_ld4(max_workspace_1d, gy); + } + else if (psc(dims) == 3 && axis == 0) + { + v = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, gz)); + max_value = image2d_ld4(max_workspace_2d, ivec2(gx, gy)); + } + else if (psc(dims) == 3 && axis == 1) + { + v = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, gz)); + max_value = image2d_ld4(max_workspace_2d, ivec2(gx, gz)); + } + else if (psc(dims) == 3 && axis == 2) + { + v = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, gz)); + max_value = image2d_ld4(max_workspace_2d, ivec2(gy, gz)); + } +#else + const int gi = gz * psc(cstep) + gy * psc(w) + gx; + + afpvec4 v = buffer_ld4(bottom_top_blob_data, gi); + + afpvec4 max_value; + + if (psc(dims) == 1) // axis == 0 + { + max_value = buffer_ld4(max_workspace_data, 0); + } + else if (psc(dims) == 2 && axis == 0) + { + max_value = buffer_ld4(max_workspace_data, gx); + } + else if (psc(dims) == 2 && axis == 1) + { + max_value = buffer_ld4(max_workspace_data, gy); + } + else if (psc(dims) == 3 && axis == 0) + { + max_value = buffer_ld4(max_workspace_data, gy * psc(w) + gx); + } + else if (psc(dims) == 3 && axis == 1) + { + max_value = buffer_ld4(max_workspace_data, gz * psc(w) + gx); + } + else if (psc(dims) == 3 && axis == 2) + { + max_value = buffer_ld4(max_workspace_data, gz * psc(h) + gy); + } + +#if NCNN_fp16_packed || NCNN_fp16_storage + // NOTE reduce max may produce (X, undef, X, undef) on nvidia fp16p/fp16s + // TODO only enable this workaround for some nvidia driver + if (axis == 0) + { + max_value = afpvec4(max_value.r); + } +#endif +#endif + + v = exp(v - max_value); + +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_st4(top_blob_1d, gx, v); + } + else if (psc(dims) == 2) + { + image2d_st4(top_blob_2d, ivec2(gx, gy), v); + } + else // if (psc(dims) == 3) + { + image3d_st4(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else + buffer_st4(bottom_top_blob_data, gi, v); +#endif +} diff --git a/source/device/vulkan/shaders/softmax_exp_sub_max_pack8.comp b/source/device/vulkan/shaders/softmax_exp_sub_max_pack8.comp new file mode 100644 index 000000000..374c5d927 --- /dev/null +++ b/source/device/vulkan/shaders/softmax_exp_sub_max_pack8.comp @@ -0,0 +1,177 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int axis = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +layout (binding = 2) uniform unfp sampler1D max_workspace_1d; +layout (binding = 2) uniform unfp sampler2D max_workspace_2d; +#else +layout (binding = 0) buffer bottom_top_blob { sfpvec8 bottom_top_blob_data[]; }; +layout (binding = 1) readonly buffer max_workspace { sfpvec8 max_workspace_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + afpvec8 v; + afpvec8 max_value; + + if (psc(dims) == 1) // axis == 0 + { + v = image1d_ld8(bottom_blob_1d, gx); + max_value = image1d_ld8(max_workspace_1d, 0); + } + else if (psc(dims) == 2 && axis == 0) + { + v = image2d_ld8(bottom_blob_2d, ivec2(gx, gy)); + max_value = image1d_ld8(max_workspace_1d, gx); + } + else if (psc(dims) == 2 && axis == 1) + { + v = image2d_ld8(bottom_blob_2d, ivec2(gx, gy)); + max_value = image1d_ld8(max_workspace_1d, gy); + } + else if (psc(dims) == 3 && axis == 0) + { + v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); + max_value = image2d_ld8(max_workspace_2d, ivec2(gx, gy)); + } + else if (psc(dims) == 3 && axis == 1) + { + v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); + max_value = image2d_ld8(max_workspace_2d, ivec2(gx, gz)); + } + else if (psc(dims) == 3 
&& axis == 2) + { + v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); + max_value = image2d_ld8(max_workspace_2d, ivec2(gy, gz)); + } +#else + const int gi = gz * psc(cstep) + gy * psc(w) + gx; + + afpvec8 v = buffer_ld8(bottom_top_blob_data, gi); + + afpvec8 max_value; + + if (psc(dims) == 1) // axis == 0 + { + max_value = buffer_ld8(max_workspace_data, 0); + } + else if (psc(dims) == 2 && axis == 0) + { + max_value = buffer_ld8(max_workspace_data, gx); + } + else if (psc(dims) == 2 && axis == 1) + { + max_value = buffer_ld8(max_workspace_data, gy); + } + else if (psc(dims) == 3 && axis == 0) + { + max_value = buffer_ld8(max_workspace_data, gy * psc(w) + gx); + } + else if (psc(dims) == 3 && axis == 1) + { + max_value = buffer_ld8(max_workspace_data, gz * psc(w) + gx); + } + else if (psc(dims) == 3 && axis == 2) + { + max_value = buffer_ld8(max_workspace_data, gz * psc(h) + gy); + } + +#if NCNN_fp16_packed || NCNN_fp16_storage + // NOTE reduce max may produce (X, undef, X, undef) on nvidia fp16p/fp16s + // TODO only enable this workaround for some nvidia driver + if (axis == 0) + { + max_value = afpvec8(afpvec4(max_value[0].r), afpvec4(max_value[0].r)); + } +#endif +#endif + + v[0] = exp(v[0] - max_value[0]); + v[1] = exp(v[1] - max_value[1]); + +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_st8(top_blob_1d, gx, v); + } + else if (psc(dims) == 2) + { + image2d_st8(top_blob_2d, ivec2(gx, gy), v); + } + else // if (psc(dims) == 3) + { + image3d_st8(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else + buffer_st8(bottom_top_blob_data, gi, v); +#endif +} diff --git a/source/device/vulkan/shaders/softmax_reduce_max.comp b/source/device/vulkan/shaders/softmax_reduce_max.comp new file mode 100644 index 000000000..42271ccb5 --- /dev/null +++ b/source/device/vulkan/shaders/softmax_reduce_max.comp @@ -0,0 +1,198 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
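The exp-sub-max shaders above read a per-axis maximum out of max_workspace; the softmax_reduce_max.comp pass that follows is what fills that workspace. For reference, a minimal CPU sketch of the full decomposition these shaders implement, written for a single row of length w (the function name and the final normalization step are illustrative; the normalization pass itself is not part of this hunk):

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // Reference softmax over one row, in the same pass order as the shaders:
    // 1) reduce_max  2) exp(x - max)  3) reduce_sum  4) divide by the sum.
    static void softmax_row_reference(std::vector<float>& x)
    {
        float max_value = *std::max_element(x.begin(), x.end()); // reduce_max
        float sum_value = 0.f;
        for (float& v : x)
        {
            v = std::exp(v - max_value); // exp_sub_max, in place
            sum_value += v;              // reduce_sum
        }
        for (float& v : x)
            v /= sum_value;              // final normalization
    }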
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int axis = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_top_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_top_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_top_blob_3d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image1D max_workspace_1d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image2D max_workspace_2d; +#else +layout (binding = 0) readonly buffer bottom_top_blob { sfp bottom_top_blob_data[]; }; +layout (binding = 1) writeonly buffer max_workspace { sfp max_workspace_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afp max_value = afp(-99999999.f); + + if (psc(dims) == 1) // axis == 0 + { + for (int i = 0; i < psc(w); i++) + { +#if NCNN_image_shader + afp v = image1d_ld1(bottom_top_blob_1d, i); +#else + afp v = buffer_ld1(bottom_top_blob_data, i); +#endif + max_value = max(max_value, v); + } +#if NCNN_image_shader + image1d_st1(max_workspace_1d, 0, max_value); +#else + buffer_st1(max_workspace_data, 0, max_value); +#endif + return; + } + + if (psc(dims) == 2 && axis == 0) + { + for (int i = 0; i < psc(h); i++) + { +#if NCNN_image_shader + afp v = image2d_ld1(bottom_top_blob_2d, ivec2(gx, i)); +#else + int v_offset = i * psc(w) + gx; + afp v = buffer_ld1(bottom_top_blob_data, v_offset); +#endif + max_value = max(max_value, v); + } +#if NCNN_image_shader + image1d_st1(max_workspace_1d, gx, max_value); +#else + buffer_st1(max_workspace_data, gx, max_value); +#endif + return; + } + + if (psc(dims) == 2 && axis == 1) + { + for (int i = 0; i < psc(w); i++) + { +#if NCNN_image_shader + afp v = image2d_ld1(bottom_top_blob_2d, ivec2(i, gx)); +#else + int v_offset = gx * psc(w) + i; + afp v = buffer_ld1(bottom_top_blob_data, v_offset); +#endif + max_value = max(max_value, v); + } +#if NCNN_image_shader + image1d_st1(max_workspace_1d, gx, max_value); +#else + buffer_st1(max_workspace_data, gx, max_value); +#endif + return; + } + + if (psc(dims) == 3 && axis == 0) + { + for (int i = 0; i < psc(c); i++) + { +#if NCNN_image_shader + afp v = 
image3d_ld1(bottom_top_blob_3d, ivec3(gx, gy, i)); +#else + int v_offset = i * psc(cstep) + gy * psc(w) + gx; + afp v = buffer_ld1(bottom_top_blob_data, v_offset); +#endif + max_value = max(max_value, v); + } +#if NCNN_image_shader + image2d_st1(max_workspace_2d, ivec2(gx, gy), max_value); +#else + buffer_st1(max_workspace_data, gy * psc(w) + gx, max_value); +#endif + return; + } + + if (psc(dims) == 3 && axis == 1) + { + for (int i = 0; i < psc(h); i++) + { +#if NCNN_image_shader + afp v = image3d_ld1(bottom_top_blob_3d, ivec3(gx, i, gy)); +#else + int v_offset = gy * psc(cstep) + i * psc(w) + gx; + afp v = buffer_ld1(bottom_top_blob_data, v_offset); +#endif + max_value = max(max_value, v); + } +#if NCNN_image_shader + image2d_st1(max_workspace_2d, ivec2(gx, gy), max_value); +#else + buffer_st1(max_workspace_data, gy * psc(w) + gx, max_value); +#endif + return; + } + + if (psc(dims) == 3 && axis == 2) + { + for (int i = 0; i < psc(w); i++) + { +#if NCNN_image_shader + afp v = image3d_ld1(bottom_top_blob_3d, ivec3(i, gx, gy)); +#else + int v_offset = gy * psc(cstep) + gx * psc(w) + i; + afp v = buffer_ld1(bottom_top_blob_data, v_offset); +#endif + max_value = max(max_value, v); + } +#if NCNN_image_shader + image2d_st1(max_workspace_2d, ivec2(gx, gy), max_value); +#else + buffer_st1(max_workspace_data, gy * psc(h) + gx, max_value); +#endif + return; + } +} diff --git a/source/device/vulkan/shaders/softmax_reduce_max_pack4.comp b/source/device/vulkan/shaders/softmax_reduce_max_pack4.comp new file mode 100644 index 000000000..6de110db9 --- /dev/null +++ b/source/device/vulkan/shaders/softmax_reduce_max_pack4.comp @@ -0,0 +1,204 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
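The scalar reduce-max pass above walks the reduced axis with plain offset arithmetic over the flat layout gi = z * cstep + y * w + x used throughout these shaders. Before the pack4 variant below, a CPU sketch of the same index math for its dims == 3 branches (names are illustrative):

    #include <algorithm>

    // Max over one axis of a w*h*c blob stored flat as gi = z*cstep + y*w + x,
    // mirroring the v_offset arithmetic in softmax_reduce_max.comp (dims == 3).
    static float reduce_max_3d(const float* data, int w, int h, int c, int cstep,
                               int axis, int gx, int gy)
    {
        float max_value = -99999999.f;
        if (axis == 0)      // over channels, one result per (x = gx, y = gy)
            for (int i = 0; i < c; i++) max_value = std::max(max_value, data[i * cstep + gy * w + gx]);
        else if (axis == 1) // over rows, one result per (x = gx, z = gy)
            for (int i = 0; i < h; i++) max_value = std::max(max_value, data[gy * cstep + i * w + gx]);
        else                // axis == 2: over columns, one result per (y = gx, z = gy)
            for (int i = 0; i < w; i++) max_value = std::max(max_value, data[gy * cstep + gx * w + i]);
        return max_value;
    }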
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int axis = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_top_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_top_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_top_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D max_workspace_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D max_workspace_2d; +#else +layout (binding = 0) readonly buffer bottom_top_blob { sfpvec4 bottom_top_blob_data[]; }; +layout (binding = 1) writeonly buffer max_workspace { sfpvec4 max_workspace_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec4 max_value = afpvec4(-99999999.f); + + if (psc(dims) == 1) // axis == 0 + { + for (int i = 0; i < psc(w); i++) + { +#if NCNN_image_shader + afpvec4 v = image1d_ld4(bottom_top_blob_1d, i); +#else + afpvec4 v = buffer_ld4(bottom_top_blob_data, i); +#endif + max_value = max(max_value, v); + } + afpvec2 max2 = max(max_value.rg, max_value.ba); + max_value = afpvec4(max(max2.r, max2.g)); +#if NCNN_image_shader + image1d_st4(max_workspace_1d, 0, max_value); +#else + buffer_st4(max_workspace_data, 0, max_value); +#endif + return; + } + + if (psc(dims) == 2 && axis == 0) + { + for (int i = 0; i < psc(h); i++) + { +#if NCNN_image_shader + afpvec4 v = image2d_ld4(bottom_top_blob_2d, ivec2(gx, i)); +#else + int v_offset = i * psc(w) + gx; + afpvec4 v = buffer_ld4(bottom_top_blob_data, v_offset); +#endif + max_value = max(max_value, v); + } + afpvec2 max2 = max(max_value.rg, max_value.ba); + max_value = afpvec4(max(max2.r, max2.g)); +#if NCNN_image_shader + image1d_st4(max_workspace_1d, gx, max_value); +#else + buffer_st4(max_workspace_data, gx, max_value); +#endif + return; + } + + if (psc(dims) == 2 && axis == 1) + { + for (int i = 0; i < psc(w); i++) + { +#if NCNN_image_shader + afpvec4 v = image2d_ld4(bottom_top_blob_2d, ivec2(i, gx)); +#else + int v_offset = gx * psc(w) + i; + afpvec4 v = buffer_ld4(bottom_top_blob_data, v_offset); +#endif + max_value = max(max_value, v); + } +#if NCNN_image_shader + image1d_st4(max_workspace_1d, gx, 
max_value); +#else + buffer_st4(max_workspace_data, gx, max_value); +#endif + return; + } + + if (psc(dims) == 3 && axis == 0) + { + for (int i = 0; i < psc(c); i++) + { +#if NCNN_image_shader + afpvec4 v = image3d_ld4(bottom_top_blob_3d, ivec3(gx, gy, i)); +#else + int v_offset = i * psc(cstep) + gy * psc(w) + gx; + afpvec4 v = buffer_ld4(bottom_top_blob_data, v_offset); +#endif + max_value = max(max_value, v); + } + afpvec2 max2 = max(max_value.rg, max_value.ba); + max_value = afpvec4(max(max2.r, max2.g)); +#if NCNN_image_shader + image2d_st4(max_workspace_2d, ivec2(gx, gy), max_value); +#else + buffer_st4(max_workspace_data, gy * psc(w) + gx, max_value); +#endif + return; + } + + if (psc(dims) == 3 && axis == 1) + { + for (int i = 0; i < psc(h); i++) + { +#if NCNN_image_shader + afpvec4 v = image3d_ld4(bottom_top_blob_3d, ivec3(gx, i, gy)); +#else + int v_offset = gy * psc(cstep) + i * psc(w) + gx; + afpvec4 v = buffer_ld4(bottom_top_blob_data, v_offset); +#endif + max_value = max(max_value, v); + } +#if NCNN_image_shader + image2d_st4(max_workspace_2d, ivec2(gx, gy), max_value); +#else + buffer_st4(max_workspace_data, gy * psc(w) + gx, max_value); +#endif + return; + } + + if (psc(dims) == 3 && axis == 2) + { + for (int i = 0; i < psc(w); i++) + { +#if NCNN_image_shader + afpvec4 v = image3d_ld4(bottom_top_blob_3d, ivec3(i, gx, gy)); +#else + int v_offset = gy * psc(cstep) + gx * psc(w) + i; + afpvec4 v = buffer_ld4(bottom_top_blob_data, v_offset); +#endif + max_value = max(max_value, v); + } +#if NCNN_image_shader + image2d_st4(max_workspace_2d, ivec2(gx, gy), max_value); +#else + buffer_st4(max_workspace_data, gy * psc(h) + gx, max_value); +#endif + return; + } +} diff --git a/source/device/vulkan/shaders/softmax_reduce_max_pack8.comp b/source/device/vulkan/shaders/softmax_reduce_max_pack8.comp new file mode 100644 index 000000000..66073dad9 --- /dev/null +++ b/source/device/vulkan/shaders/softmax_reduce_max_pack8.comp @@ -0,0 +1,217 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
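In the pack4 reduce-max above, every element carries four channels, so whenever the reduction runs along the packed axis (the axis == 0 branches) the shader finishes with a cross-lane fold, max(max_value.rg, max_value.ba) followed by a scalar max, and broadcasts the result to all four lanes. The pack8 variant below stores each element as two vec4 halves (the sfpvec8 struct) and applies the same fold to both halves. A CPU sketch of that fold, with an illustrative stand-in type:

    #include <algorithm>

    struct vec8 { float v[8]; }; // stand-in for the shader's two-vec4 sfpvec8

    // Horizontal max over the 8 packed lanes, broadcast back to every lane,
    // like the max4 -> max2 -> max1 fold in softmax_reduce_max_pack8.comp.
    static vec8 fold_max_lanes(vec8 x)
    {
        float m = x.v[0];
        for (int i = 1; i < 8; i++)
            m = std::max(m, x.v[i]);
        vec8 out;
        for (int i = 0; i < 8; i++)
            out.v[i] = m;
        return out;
    }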
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int axis = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_top_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_top_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_top_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D max_workspace_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D max_workspace_2d; +#else +layout (binding = 0) readonly buffer bottom_top_blob { sfpvec8 bottom_top_blob_data[]; }; +layout (binding = 1) writeonly buffer max_workspace { sfpvec8 max_workspace_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec8 max_value = afpvec8(afpvec4(-99999999.f), afpvec4(-99999999.f)); + + if (psc(dims) == 1) // axis == 0 + { + for (int i = 0; i < psc(w); i++) + { +#if NCNN_image_shader + afpvec8 v = image1d_ld8(bottom_top_blob_1d, i); +#else + afpvec8 v = buffer_ld8(bottom_top_blob_data, i); +#endif + max_value[0] = max(max_value[0], v[0]); + max_value[1] = max(max_value[1], v[1]); + } + afpvec4 max4 = max(max_value[0], max_value[1]); + afpvec2 max2 = max(max4.rg, max4.ba); + afp max1 = max(max2.r, max2.g); + max_value = afpvec8(afpvec4(max1), afpvec4(max1)); +#if NCNN_image_shader + image1d_st8(max_workspace_1d, 0, max_value); +#else + buffer_st8(max_workspace_data, 0, max_value); +#endif + return; + } + + if (psc(dims) == 2 && axis == 0) + { + for (int i = 0; i < psc(h); i++) + { +#if NCNN_image_shader + afpvec8 v = image2d_ld8(bottom_top_blob_2d, ivec2(gx, i)); +#else + int v_offset = i * psc(w) + gx; + afpvec8 v = buffer_ld8(bottom_top_blob_data, v_offset); +#endif + max_value[0] = max(max_value[0], v[0]); + max_value[1] = max(max_value[1], v[1]); + } + afpvec4 max4 = max(max_value[0], max_value[1]); + afpvec2 max2 = max(max4.rg, max4.ba); + afp max1 = max(max2.r, max2.g); + max_value = afpvec8(afpvec4(max1), afpvec4(max1)); +#if NCNN_image_shader + image1d_st8(max_workspace_1d, gx, max_value); +#else + buffer_st8(max_workspace_data, gx, max_value); +#endif + return; + } + + if (psc(dims) == 2 
&& axis == 1) + { + for (int i = 0; i < psc(w); i++) + { +#if NCNN_image_shader + afpvec8 v = image2d_ld8(bottom_top_blob_2d, ivec2(i, gx)); +#else + int v_offset = gx * psc(w) + i; + afpvec8 v = buffer_ld8(bottom_top_blob_data, v_offset); +#endif + max_value[0] = max(max_value[0], v[0]); + max_value[1] = max(max_value[1], v[1]); + } +#if NCNN_image_shader + image1d_st8(max_workspace_1d, gx, max_value); +#else + buffer_st8(max_workspace_data, gx, max_value); +#endif + return; + } + + if (psc(dims) == 3 && axis == 0) + { + for (int i = 0; i < psc(c); i++) + { +#if NCNN_image_shader + afpvec8 v = image3d_ld8(bottom_top_blob_3d, ivec3(gx, gy, i)); +#else + int v_offset = i * psc(cstep) + gy * psc(w) + gx; + afpvec8 v = buffer_ld8(bottom_top_blob_data, v_offset); +#endif + max_value[0] = max(max_value[0], v[0]); + max_value[1] = max(max_value[1], v[1]); + } + afpvec4 max4 = max(max_value[0], max_value[1]); + afpvec2 max2 = max(max4.rg, max4.ba); + afp max1 = max(max2.r, max2.g); + max_value = afpvec8(afpvec4(max1), afpvec4(max1)); +#if NCNN_image_shader + image2d_st8(max_workspace_2d, ivec2(gx, gy), max_value); +#else + buffer_st8(max_workspace_data, gy * psc(w) + gx, max_value); +#endif + return; + } + + if (psc(dims) == 3 && axis == 1) + { + for (int i = 0; i < psc(h); i++) + { +#if NCNN_image_shader + afpvec8 v = image3d_ld8(bottom_top_blob_3d, ivec3(gx, i, gy)); +#else + int v_offset = gy * psc(cstep) + i * psc(w) + gx; + afpvec8 v = buffer_ld8(bottom_top_blob_data, v_offset); +#endif + max_value[0] = max(max_value[0], v[0]); + max_value[1] = max(max_value[1], v[1]); + } +#if NCNN_image_shader + image2d_st8(max_workspace_2d, ivec2(gx, gy), max_value); +#else + buffer_st8(max_workspace_data, gy * psc(w) + gx, max_value); +#endif + return; + } + + if (psc(dims) == 3 && axis == 2) + { + for (int i = 0; i < psc(w); i++) + { +#if NCNN_image_shader + afpvec8 v = image3d_ld8(bottom_top_blob_3d, ivec3(i, gx, gy)); +#else + int v_offset = gy * psc(cstep) + gx * psc(w) + i; + afpvec8 v = buffer_ld8(bottom_top_blob_data, v_offset); +#endif + max_value[0] = max(max_value[0], v[0]); + max_value[1] = max(max_value[1], v[1]); + } +#if NCNN_image_shader + image2d_st8(max_workspace_2d, ivec2(gx, gy), max_value); +#else + buffer_st8(max_workspace_data, gy * psc(h) + gx, max_value); +#endif + return; + } +} diff --git a/source/device/vulkan/shaders/softmax_reduce_sum.comp b/source/device/vulkan/shaders/softmax_reduce_sum.comp new file mode 100644 index 000000000..b38d16454 --- /dev/null +++ b/source/device/vulkan/shaders/softmax_reduce_sum.comp @@ -0,0 +1,198 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
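The reduce-sum pass below mirrors reduce-max with addition in place of max, and its workspace, like max_workspace, has the input shape with the reduced axis dropped; that is what the outw/outh/outc constants and the store indices encode. A small sketch of that shape mapping for dims == 3, assuming the same blob layout (names illustrative):

    // Workspace extents for the reduce passes: the input shape with the
    // reduced axis removed (dims == 3 shown; dims == 1 and 2 reduce to a
    // single value or a single row respectively).
    static void reduce_workspace_shape(int w, int h, int c, int axis,
                                       int& outw, int& outh)
    {
        if (axis == 0)      { outw = w; outh = h; } // reduce over channels
        else if (axis == 1) { outw = w; outh = c; } // reduce over rows
        else                { outw = h; outh = c; } // reduce over columns
    }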
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int axis = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_top_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_top_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_top_blob_3d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image1D sum_workspace_1d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image2D sum_workspace_2d; +#else +layout (binding = 0) readonly buffer bottom_top_blob { sfp bottom_top_blob_data[]; }; +layout (binding = 1) writeonly buffer sum_workspace { sfp sum_workspace_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afp sum_value = afp(0.f); + + if (psc(dims) == 1) // axis == 0 + { + for (int i = 0; i < psc(w); i++) + { +#if NCNN_image_shader + afp v = image1d_ld1(bottom_top_blob_1d, i); +#else + afp v = buffer_ld1(bottom_top_blob_data, i); +#endif + sum_value += v; + } +#if NCNN_image_shader + image1d_st1(sum_workspace_1d, 0, sum_value); +#else + buffer_st1(sum_workspace_data, 0, sum_value); +#endif + return; + } + + if (psc(dims) == 2 && axis == 0) + { + for (int i = 0; i < psc(h); i++) + { +#if NCNN_image_shader + afp v = image2d_ld1(bottom_top_blob_2d, ivec2(gx, i)); +#else + int v_offset = i * psc(w) + gx; + afp v = buffer_ld1(bottom_top_blob_data, v_offset); +#endif + sum_value += v; + } +#if NCNN_image_shader + image1d_st1(sum_workspace_1d, gx, sum_value); +#else + buffer_st1(sum_workspace_data, gx, sum_value); +#endif + return; + } + + if (psc(dims) == 2 && axis == 1) + { + for (int i = 0; i < psc(w); i++) + { +#if NCNN_image_shader + afp v = image2d_ld1(bottom_top_blob_2d, ivec2(i, gx)); +#else + int v_offset = gx * psc(w) + i; + afp v = buffer_ld1(bottom_top_blob_data, v_offset); +#endif + sum_value += v; + } +#if NCNN_image_shader + image1d_st1(sum_workspace_1d, gx, sum_value); +#else + buffer_st1(sum_workspace_data, gx, sum_value); +#endif + return; + } + + if (psc(dims) == 3 && axis == 0) + { + for (int i = 0; i < psc(c); i++) + { +#if NCNN_image_shader + afp v = image3d_ld1(bottom_top_blob_3d, ivec3(gx, gy, i)); +#else + int v_offset = i * 
psc(cstep) + gy * psc(w) + gx; + afp v = buffer_ld1(bottom_top_blob_data, v_offset); +#endif + sum_value += v; + } +#if NCNN_image_shader + image2d_st1(sum_workspace_2d, ivec2(gx, gy), sum_value); +#else + buffer_st1(sum_workspace_data, gy * psc(w) + gx, sum_value); +#endif + return; + } + + if (psc(dims) == 3 && axis == 1) + { + for (int i = 0; i < psc(h); i++) + { +#if NCNN_image_shader + afp v = image3d_ld1(bottom_top_blob_3d, ivec3(gx, i, gy)); +#else + int v_offset = gy * psc(cstep) + i * psc(w) + gx; + afp v = buffer_ld1(bottom_top_blob_data, v_offset); +#endif + sum_value += v; + } +#if NCNN_image_shader + image2d_st1(sum_workspace_2d, ivec2(gx, gy), sum_value); +#else + buffer_st1(sum_workspace_data, gy * psc(w) + gx, sum_value); +#endif + return; + } + + if (psc(dims) == 3 && axis == 2) + { + for (int i = 0; i < psc(w); i++) + { +#if NCNN_image_shader + afp v = image3d_ld1(bottom_top_blob_3d, ivec3(i, gx, gy)); +#else + int v_offset = gy * psc(cstep) + gx * psc(w) + i; + afp v = buffer_ld1(bottom_top_blob_data, v_offset); +#endif + sum_value += v; + } +#if NCNN_image_shader + image2d_st1(sum_workspace_2d, ivec2(gx, gy), sum_value); +#else + buffer_st1(sum_workspace_data, gy * psc(h) + gx, sum_value); +#endif + return; + } +} diff --git a/source/device/vulkan/shaders/softmax_reduce_sum_pack4.comp b/source/device/vulkan/shaders/softmax_reduce_sum_pack4.comp new file mode 100644 index 000000000..40b035ac3 --- /dev/null +++ b/source/device/vulkan/shaders/softmax_reduce_sum_pack4.comp @@ -0,0 +1,204 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
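All of these kernels bound-check and index through psc(...). In the ncnn shader convention this code is ported from, psc(x) is expected to resolve to the specialization constant x when the blob shape was baked in at pipeline creation, falling back to the push-constant field p.x otherwise; the macro itself lives in the generated shader preamble rather than in this diff, so the definition below is the assumed form, not part of the patch:

    // Assumed ncnn-style helper: use the baked specialization constant when it
    // is non-zero, otherwise read the value from the push-constant block p.
    #define psc(x) (x == 0 ? p.x : x)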
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int axis = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_top_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_top_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_top_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D sum_workspace_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D sum_workspace_2d; +#else +layout (binding = 0) readonly buffer bottom_top_blob { sfpvec4 bottom_top_blob_data[]; }; +layout (binding = 1) writeonly buffer sum_workspace { sfpvec4 sum_workspace_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec4 sum_value = afpvec4(0.f); + + if (psc(dims) == 1) // axis == 0 + { + for (int i = 0; i < psc(w); i++) + { +#if NCNN_image_shader + afpvec4 v = image1d_ld4(bottom_top_blob_1d, i); +#else + afpvec4 v = buffer_ld4(bottom_top_blob_data, i); +#endif + sum_value += v; + } + afpvec2 sum2 = sum_value.rg + sum_value.ba; + sum_value = afpvec4(sum2.r + sum2.g); +#if NCNN_image_shader + image1d_st4(sum_workspace_1d, 0, sum_value); +#else + buffer_st4(sum_workspace_data, 0, sum_value); +#endif + return; + } + + if (psc(dims) == 2 && axis == 0) + { + for (int i = 0; i < psc(h); i++) + { +#if NCNN_image_shader + afpvec4 v = image2d_ld4(bottom_top_blob_2d, ivec2(gx, i)); +#else + int v_offset = i * psc(w) + gx; + afpvec4 v = buffer_ld4(bottom_top_blob_data, v_offset); +#endif + sum_value += v; + } + afpvec2 sum2 = sum_value.rg + sum_value.ba; + sum_value = afpvec4(sum2.r + sum2.g); +#if NCNN_image_shader + image1d_st4(sum_workspace_1d, gx, sum_value); +#else + buffer_st4(sum_workspace_data, gx, sum_value); +#endif + return; + } + + if (psc(dims) == 2 && axis == 1) + { + for (int i = 0; i < psc(w); i++) + { +#if NCNN_image_shader + afpvec4 v = image2d_ld4(bottom_top_blob_2d, ivec2(i, gx)); +#else + int v_offset = gx * psc(w) + i; + afpvec4 v = buffer_ld4(bottom_top_blob_data, v_offset); +#endif + sum_value += v; + } +#if NCNN_image_shader + image1d_st4(sum_workspace_1d, gx, sum_value); +#else + buffer_st4(sum_workspace_data, gx, sum_value); 
+#endif + return; + } + + if (psc(dims) == 3 && axis == 0) + { + for (int i = 0; i < psc(c); i++) + { +#if NCNN_image_shader + afpvec4 v = image3d_ld4(bottom_top_blob_3d, ivec3(gx, gy, i)); +#else + int v_offset = i * psc(cstep) + gy * psc(w) + gx; + afpvec4 v = buffer_ld4(bottom_top_blob_data, v_offset); +#endif + sum_value += v; + } + afpvec2 sum2 = sum_value.rg + sum_value.ba; + sum_value = afpvec4(sum2.r + sum2.g); +#if NCNN_image_shader + image2d_st4(sum_workspace_2d, ivec2(gx, gy), sum_value); +#else + buffer_st4(sum_workspace_data, gy * psc(w) + gx, sum_value); +#endif + return; + } + + if (psc(dims) == 3 && axis == 1) + { + for (int i = 0; i < psc(h); i++) + { +#if NCNN_image_shader + afpvec4 v = image3d_ld4(bottom_top_blob_3d, ivec3(gx, i, gy)); +#else + int v_offset = gy * psc(cstep) + i * psc(w) + gx; + afpvec4 v = buffer_ld4(bottom_top_blob_data, v_offset); +#endif + sum_value += v; + } +#if NCNN_image_shader + image2d_st4(sum_workspace_2d, ivec2(gx, gy), sum_value); +#else + buffer_st4(sum_workspace_data, gy * psc(w) + gx, sum_value); +#endif + return; + } + + if (psc(dims) == 3 && axis == 2) + { + for (int i = 0; i < psc(w); i++) + { +#if NCNN_image_shader + afpvec4 v = image3d_ld4(bottom_top_blob_3d, ivec3(i, gx, gy)); +#else + int v_offset = gy * psc(cstep) + gx * psc(w) + i; + afpvec4 v = buffer_ld4(bottom_top_blob_data, v_offset); +#endif + sum_value += v; + } +#if NCNN_image_shader + image2d_st4(sum_workspace_2d, ivec2(gx, gy), sum_value); +#else + buffer_st4(sum_workspace_data, gy * psc(h) + gx, sum_value); +#endif + return; + } +} diff --git a/source/device/vulkan/shaders/softmax_reduce_sum_pack8.comp b/source/device/vulkan/shaders/softmax_reduce_sum_pack8.comp new file mode 100644 index 000000000..a4a88024b --- /dev/null +++ b/source/device/vulkan/shaders/softmax_reduce_sum_pack8.comp @@ -0,0 +1,211 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
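The reduce-sum pack8 shader below completes the set of softmax reduction kernels. The reason the maximum is subtracted before exponentiation at all is numerical: exp() overflows 32-bit float for large logits, while exp(x - max) never exceeds 1. A small self-contained check (the values are chosen purely for illustration):

    #include <cmath>
    #include <cstdio>

    // Both columns are the same softmax mathematically; only the
    // max-subtracted form stays finite in 32-bit float.
    int main()
    {
        float x[2] = { 1000.f, 1001.f };
        float m = std::fmax(x[0], x[1]);
        float naive[2], stable[2];
        for (int i = 0; i < 2; i++)
        {
            naive[i] = std::exp(x[i]);      // overflows to inf
            stable[i] = std::exp(x[i] - m); // stays in (0, 1]
        }
        float naive_sum = naive[0] + naive[1];
        float stable_sum = stable[0] + stable[1];
        for (int i = 0; i < 2; i++)
            std::printf("naive %f   stable %f\n", naive[i] / naive_sum, stable[i] / stable_sum);
        return 0;
    }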
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int axis = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_top_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_top_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_top_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D sum_workspace_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D sum_workspace_2d; +#else +layout (binding = 0) readonly buffer bottom_top_blob { sfpvec8 bottom_top_blob_data[]; }; +layout (binding = 1) writeonly buffer sum_workspace { sfpvec8 sum_workspace_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec8 sum_value = afpvec8(afpvec4(0.f), afpvec4(0.f)); + + if (psc(dims) == 1) // axis == 0 + { + for (int i = 0; i < psc(w); i++) + { +#if NCNN_image_shader + afpvec8 v = image1d_ld8(bottom_top_blob_1d, i); +#else + afpvec8 v = buffer_ld8(bottom_top_blob_data, i); +#endif + sum_value += v; + } + afpvec4 sum4 = sum_value[0] + sum_value[1]; + afpvec2 sum2 = sum4.rg + sum4.ba; + afp sum1 = sum2.r + sum2.g; + sum_value = afpvec8(afpvec4(sum1), afpvec4(sum1)); +#if NCNN_image_shader + image1d_st8(sum_workspace_1d, 0, sum_value); +#else + buffer_st8(sum_workspace_data, 0, sum_value); +#endif + return; + } + + if (psc(dims) == 2 && axis == 0) + { + for (int i = 0; i < psc(h); i++) + { +#if NCNN_image_shader + afpvec8 v = image2d_ld8(bottom_top_blob_2d, ivec2(gx, i)); +#else + int v_offset = i * psc(w) + gx; + afpvec8 v = buffer_ld8(bottom_top_blob_data, v_offset); +#endif + sum_value += v; + } + afpvec4 sum4 = sum_value[0] + sum_value[1]; + afpvec2 sum2 = sum4.rg + sum4.ba; + afp sum1 = sum2.r + sum2.g; + sum_value = afpvec8(afpvec4(sum1), afpvec4(sum1)); +#if NCNN_image_shader + image1d_st8(sum_workspace_1d, gx, sum_value); +#else + buffer_st8(sum_workspace_data, gx, sum_value); +#endif + return; + } + + if (psc(dims) == 2 && axis == 1) + { + for (int i = 0; i < psc(w); i++) + { +#if NCNN_image_shader + afpvec8 v = image2d_ld8(bottom_top_blob_2d, ivec2(i, gx)); +#else + int v_offset = gx * 
psc(w) + i; + afpvec8 v = buffer_ld8(bottom_top_blob_data, v_offset); +#endif + sum_value += v; + } +#if NCNN_image_shader + image1d_st8(sum_workspace_1d, gx, sum_value); +#else + buffer_st8(sum_workspace_data, gx, sum_value); +#endif + return; + } + + if (psc(dims) == 3 && axis == 0) + { + for (int i = 0; i < psc(c); i++) + { +#if NCNN_image_shader + afpvec8 v = image3d_ld8(bottom_top_blob_3d, ivec3(gx, gy, i)); +#else + int v_offset = i * psc(cstep) + gy * psc(w) + gx; + afpvec8 v = buffer_ld8(bottom_top_blob_data, v_offset); +#endif + sum_value += v; + } + afpvec4 sum4 = sum_value[0] + sum_value[1]; + afpvec2 sum2 = sum4.rg + sum4.ba; + afp sum1 = sum2.r + sum2.g; + sum_value = afpvec8(afpvec4(sum1), afpvec4(sum1)); +#if NCNN_image_shader + image2d_st8(sum_workspace_2d, ivec2(gx, gy), sum_value); +#else + buffer_st8(sum_workspace_data, gy * psc(w) + gx, sum_value); +#endif + return; + } + + if (psc(dims) == 3 && axis == 1) + { + for (int i = 0; i < psc(h); i++) + { +#if NCNN_image_shader + afpvec8 v = image3d_ld8(bottom_top_blob_3d, ivec3(gx, i, gy)); +#else + int v_offset = gy * psc(cstep) + i * psc(w) + gx; + afpvec8 v = buffer_ld8(bottom_top_blob_data, v_offset); +#endif + sum_value += v; + } +#if NCNN_image_shader + image2d_st8(sum_workspace_2d, ivec2(gx, gy), sum_value); +#else + buffer_st8(sum_workspace_data, gy * psc(w) + gx, sum_value); +#endif + return; + } + + if (psc(dims) == 3 && axis == 2) + { + for (int i = 0; i < psc(w); i++) + { +#if NCNN_image_shader + afpvec8 v = image3d_ld8(bottom_top_blob_3d, ivec3(i, gx, gy)); +#else + int v_offset = gy * psc(cstep) + gx * psc(w) + i; + afpvec8 v = buffer_ld8(bottom_top_blob_data, v_offset); +#endif + sum_value += v; + } +#if NCNN_image_shader + image2d_st8(sum_workspace_2d, ivec2(gx, gy), sum_value); +#else + buffer_st8(sum_workspace_data, gy * psc(h) + gx, sum_value); +#endif + return; + } +} diff --git a/source/device/vulkan/vulkan_allocator.cpp b/source/device/vulkan/vulkan_allocator.cpp new file mode 100644 index 000000000..c5483ca4f --- /dev/null +++ b/source/device/vulkan/vulkan_allocator.cpp @@ -0,0 +1,1474 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. 
You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#include +#include "vulkan_allocator.hpp" +#include "vulkan_gpu.hpp" +#include "vulkan_pipeline.hpp" + +#include + +namespace TEngine { + +Allocator::~Allocator() +{ + +} + +VkAllocator::VkAllocator(const GPUDevice* _vkdev) : vkdev(_vkdev) +{ + buffer_memory_type_index = (uint32_t)-1; + image_memory_type_index = (uint32_t)-1; + mappable = false; + coherent = false; +} + +static inline size_t round_up(size_t n, size_t multiple) +{ + return (n + multiple - 1) / multiple * multiple; +} + +static inline size_t round_down(size_t n, size_t multiple) +{ + return n / multiple * multiple; +} + +static inline size_t least_common_multiple(size_t a, size_t b) +{ + if (a == b) + return a; + + if (a > b) + return least_common_multiple(b, a); + + size_t lcm = b; + while (lcm % a != 0) + { + lcm += b; + } + + return lcm; +} + +int VkAllocator::flush(VkBufferMemory* ptr) +{ + if (coherent) + return 0; + + VkMappedMemoryRange mappedMemoryRange; + mappedMemoryRange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE; + mappedMemoryRange.pNext = 0; + mappedMemoryRange.memory = ptr->memory; + mappedMemoryRange.offset = round_down(ptr->offset, vkdev->info.non_coherent_atom_size); + mappedMemoryRange.size = round_up(ptr->offset + ptr->capacity, vkdev->info.non_coherent_atom_size) - mappedMemoryRange.offset; + + VkResult ret = vkFlushMappedMemoryRanges(vkdev->vkdevice(), 1, &mappedMemoryRange); + if (ret != VK_SUCCESS) + { + printf("vkFlushMappedMemoryRanges failed %d", ret); + return -1; + } + + return 0; +} + +int VkAllocator::invalidate(VkBufferMemory* ptr) +{ + if (coherent) + return 0; + + VkMappedMemoryRange mappedMemoryRange; + mappedMemoryRange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE; + mappedMemoryRange.pNext = 0; + mappedMemoryRange.memory = ptr->memory; + mappedMemoryRange.offset = round_down(ptr->offset, vkdev->info.non_coherent_atom_size); + mappedMemoryRange.size = round_up(ptr->offset + ptr->capacity, vkdev->info.non_coherent_atom_size) - mappedMemoryRange.offset; + + VkResult ret = vkInvalidateMappedMemoryRanges(vkdev->vkdevice(), 1, &mappedMemoryRange); + if (ret != VK_SUCCESS) + { + printf("vkInvalidateMappedMemoryRanges failed %d", ret); + return -1; + } + return 0; +} + +VkBuffer VkAllocator::create_buffer(size_t size, VkBufferUsageFlags usage) +{ + VkBufferCreateInfo bufferCreateInfo; + bufferCreateInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; + bufferCreateInfo.pNext = 0; + bufferCreateInfo.flags = 0; + bufferCreateInfo.size = size; + bufferCreateInfo.usage = usage; + bufferCreateInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; + bufferCreateInfo.queueFamilyIndexCount = 0; + bufferCreateInfo.pQueueFamilyIndices = 0; + + VkBuffer buffer = 0; + VkResult ret = vkCreateBuffer(vkdev->vkdevice(), &bufferCreateInfo, 0, &buffer); + if (ret != VK_SUCCESS) + { + printf("vkCreateBuffer failed %d", ret); + return 0; + } + + return buffer; +} + +VkDeviceMemory VkAllocator::allocate_memory(size_t size, uint32_t memory_type_index) +{ + VkMemoryAllocateInfo memoryAllocateInfo; + memoryAllocateInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; + memoryAllocateInfo.pNext = 0; + memoryAllocateInfo.allocationSize = size; + memoryAllocateInfo.memoryTypeIndex = memory_type_index; + + VkDeviceMemory memory = 0; + VkResult ret = vkAllocateMemory(vkdev->vkdevice(), &memoryAllocateInfo, 0, &memory); + if (ret != VK_SUCCESS) + { + 
printf("vkAllocateMemory failed %d", ret); + return 0; + } + return memory; +} + +VkDeviceMemory VkAllocator::allocate_dedicated_memory(size_t size, uint32_t memory_type_index, VkImage image, VkBuffer buffer) +{ + VkMemoryAllocateInfo memoryAllocateInfo; + memoryAllocateInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; + memoryAllocateInfo.pNext = 0; + memoryAllocateInfo.allocationSize = size; + memoryAllocateInfo.memoryTypeIndex = memory_type_index; + + VkMemoryDedicatedAllocateInfoKHR memoryDedicatedAllocateInfo; + memoryDedicatedAllocateInfo.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO_KHR; + memoryDedicatedAllocateInfo.pNext = 0; + memoryDedicatedAllocateInfo.image = image; + memoryDedicatedAllocateInfo.buffer = buffer; + memoryAllocateInfo.pNext = &memoryDedicatedAllocateInfo; + + VkDeviceMemory memory = 0; + VkResult ret = vkAllocateMemory(vkdev->vkdevice(), &memoryAllocateInfo, 0, &memory); + if (ret != VK_SUCCESS) + { + printf("vkAllocateMemory failed %d", ret); + return 0; + } + + return memory; +} + +VkImage VkAllocator::create_image(VkImageType type, int width, int height, int depth, VkFormat format, VkImageTiling tiling, VkImageUsageFlags usage) +{ + VkImageCreateInfo imageCreateInfo; + imageCreateInfo.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, + imageCreateInfo.pNext = 0; + imageCreateInfo.flags = 0; + imageCreateInfo.imageType = type; + imageCreateInfo.format = format; + imageCreateInfo.extent.width = width; + imageCreateInfo.extent.height = height; + imageCreateInfo.extent.depth = depth; + imageCreateInfo.mipLevels = 1; + imageCreateInfo.arrayLayers = 1; + imageCreateInfo.samples = VK_SAMPLE_COUNT_1_BIT; + imageCreateInfo.tiling = tiling; + imageCreateInfo.usage = usage; + imageCreateInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; + imageCreateInfo.queueFamilyIndexCount = 0; + imageCreateInfo.pQueueFamilyIndices = 0; + imageCreateInfo.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; + + VkImage image; + VkResult ret = vkCreateImage(vkdev->vkdevice(), &imageCreateInfo, 0, &image); + if (ret != VK_SUCCESS) + { + printf("vkCreateImage failed %d %d %d %d %d %d %d %d", ret, type, width, height, depth, format, tiling, usage); + return 0; + } + + return image; +} + +VkImageView VkAllocator::create_imageview(VkImageViewType type, VkImage image, VkFormat format) +{ + VkImageViewCreateInfo imageViewCreateInfo; + imageViewCreateInfo.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; + imageViewCreateInfo.pNext = 0; + imageViewCreateInfo.flags = 0; + imageViewCreateInfo.image = image; + imageViewCreateInfo.viewType = type; + imageViewCreateInfo.format = format; + imageViewCreateInfo.components.r = VK_COMPONENT_SWIZZLE_IDENTITY; + imageViewCreateInfo.components.g = VK_COMPONENT_SWIZZLE_IDENTITY; + imageViewCreateInfo.components.b = VK_COMPONENT_SWIZZLE_IDENTITY; + imageViewCreateInfo.components.a = VK_COMPONENT_SWIZZLE_IDENTITY; + imageViewCreateInfo.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + imageViewCreateInfo.subresourceRange.baseMipLevel = 0; + imageViewCreateInfo.subresourceRange.levelCount = 1; + imageViewCreateInfo.subresourceRange.baseArrayLayer = 0; + imageViewCreateInfo.subresourceRange.layerCount = 1; + + VkImageView imageview; + VkResult ret = vkCreateImageView(vkdev->vkdevice(), &imageViewCreateInfo, 0, &imageview); + if (ret != VK_SUCCESS) + { + printf("vkCreateImageView failed %d", ret); + return 0; + } + + return imageview; +} + +VkBlobAllocator::VkBlobAllocator(const GPUDevice* _vkdev) : VkAllocator(_vkdev) +{ + buffer_offset_alignment = 
vkdev->info.buffer_offset_alignment; + bind_memory_offset_alignment = vkdev->info.buffer_image_granularity; + + if (vkdev->info.type == 1) + { + // on integrated gpu, there may be device local only memory too, eg. AMD APU + // assuming larger alignment always keeps us safe :) + + // least common multiple for memory_map_alignment and buffer_offset_alignment and non_coherent_atom_size + buffer_offset_alignment = least_common_multiple(buffer_offset_alignment, vkdev->info.memory_map_alignment); + buffer_offset_alignment = least_common_multiple(buffer_offset_alignment, vkdev->info.non_coherent_atom_size); + } + + block_size = alignSize(16 * 1024 * 1024, buffer_offset_alignment);// 16M +} + +VkBlobAllocator::~VkBlobAllocator() +{ + clear(); +} + +// TODO +void VkBlobAllocator::clear() +{ +// TLOG_INFO("VkBlobAllocator %lu", buffer_blocks.size()); + + for (size_t i=0; i<buffer_blocks.size(); i++) + { + VkBufferMemory* ptr = buffer_blocks[i]; + +// std::list< std::pair<size_t, size_t> >::iterator it = buffer_budgets[i].begin(); +// while (it != buffer_budgets[i].end()) +// { +// TLOG_INFO("VkBlobAllocator budget %p %lu %lu", ptr->buffer, it->first, it->second); +// it++; +// } + + if (mappable) + vkUnmapMemory(vkdev->vkdevice(), ptr->memory); + + vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0); + vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0); + + delete ptr; + } + buffer_blocks.clear(); + + buffer_budgets.clear(); + + for (size_t i=0; i<image_memory_blocks.size(); i++) + { + VkDeviceMemory memory = image_memory_blocks[i]; + +// std::list< std::pair<size_t, size_t> >::iterator it = image_memory_budgets[i].begin(); +// while (it != image_memory_budgets[i].end()) +// { +// TLOG_INFO("VkBlobAllocator budget %p %lu %lu", memory, it->first, it->second); +// it++; +// } + + vkFreeMemory(vkdev->vkdevice(), memory, 0); + } + image_memory_blocks.clear(); + + image_memory_budgets.clear(); +} + +VkBufferMemory* VkBlobAllocator::fastMalloc(size_t size) +{ + size_t aligned_size = alignSize(size, buffer_offset_alignment); + + const int buffer_block_count = buffer_blocks.size(); + + // find first spare space in buffer_blocks + for (int i=0; i<buffer_block_count; i++) + { + std::list< std::pair<size_t, size_t> >::iterator it = buffer_budgets[i].begin(); + while (it != buffer_budgets[i].end()) + { + size_t budget_size = it->second; + if (budget_size < aligned_size) + { + it++; + continue; + } + + // return sub buffer + VkBufferMemory* ptr = new VkBufferMemory; + + ptr->buffer = buffer_blocks[i]->buffer; + ptr->offset = it->first; + ptr->memory = buffer_blocks[i]->memory; + ptr->capacity = aligned_size; + ptr->mapped_ptr = buffer_blocks[i]->mapped_ptr; + ptr->access_flags = 0; + ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + + // adjust buffer_budgets + if (budget_size == aligned_size) + { + buffer_budgets[i].erase(it); + } + else + { + it->first += aligned_size; + it->second -= aligned_size; + } + + // printf("VkBlobAllocator M %p +%lu %lu\n", ptr->buffer, ptr->offset, ptr->capacity); + + return ptr; + } + } + + size_t new_block_size = std::max(block_size, aligned_size); + + // create new block + VkBufferMemory* block = new VkBufferMemory; + + block->buffer = create_buffer(new_block_size, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT); + block->offset = 0; + + // TODO respect VK_KHR_dedicated_allocation ?
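+ // Note: when the first-fit scan above finds no spare budget, a fresh block is created below: + // at least block_size (16M) bytes of device-local memory (host-visible preferred on integrated GPUs), + // bound at offset 0 and persistently mapped when the chosen memory type is mappable; the caller + // gets the first aligned_size bytes and the unused tail is recorded as a new budget entry.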
+ + VkMemoryRequirements memoryRequirements; + vkGetBufferMemoryRequirements(vkdev->vkdevice(), block->buffer, &memoryRequirements); + + // setup memory type and alignment + if (buffer_memory_type_index == (uint32_t)-1) + { + if (vkdev->info.type == 1) + { + // integrated gpu, prefer unified memory + buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0); + } + else + { + // discrete gpu, device local + buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); + } + + mappable = vkdev->is_mappable(buffer_memory_type_index); + coherent = vkdev->is_coherent(buffer_memory_type_index); + } + + block->memory = allocate_memory(memoryRequirements.size, buffer_memory_type_index); + + // ignore memoryRequirements.alignment as we always bind at zero offset + vkBindBufferMemory(vkdev->vkdevice(), block->buffer, block->memory, 0); + + block->mapped_ptr = 0; + if (mappable) + { + vkMapMemory(vkdev->vkdevice(), block->memory, 0, new_block_size, 0, &block->mapped_ptr); + } + + buffer_blocks.push_back(block); + + // return sub buffer + VkBufferMemory* ptr = new VkBufferMemory; + + ptr->buffer = block->buffer; + ptr->offset = 0; + ptr->memory = block->memory; + ptr->capacity = aligned_size; + ptr->mapped_ptr = block->mapped_ptr; + ptr->access_flags = 0; + ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + + // adjust buffer_budgets + std::list< std::pair > budget; + if (new_block_size > aligned_size) + { + budget.push_back(std::make_pair(aligned_size, new_block_size - aligned_size)); + } + buffer_budgets.push_back(budget); + + // TLOG_INFO("VkBlobAllocator M %p +%lu %lu", ptr->buffer, ptr->offset, ptr->capacity); + + return ptr; + +} + +VkImageMemory* VkBlobAllocator::fastMalloc(int dims, int w, int h, int c, size_t elemsize, int elempack) +{ + if (elempack != 1 && elempack != 4 && elempack != 8) + { + printf("elempack must be 1 4 8"); + return 0; + } + + // resolve format + VkFormat format = VK_FORMAT_UNDEFINED; + + if (elemsize / elempack == 4) + { + // fp32 + if (elempack == 1) format = VK_FORMAT_R32_SFLOAT; + if (elempack == 4) format = VK_FORMAT_R32G32B32A32_SFLOAT; + if (elempack == 8) format = VK_FORMAT_R32G32B32A32_SFLOAT; + } + if (elemsize / elempack == 2) + { + // fp16 + if (elempack == 1) format = VK_FORMAT_R16_SFLOAT; + if (elempack == 4) format = VK_FORMAT_R16G16B16A16_SFLOAT; + if (elempack == 8) format = VK_FORMAT_R16G16B16A16_SFLOAT; + } + + // resolve image width height depth + int width = w; + int height = h; + int depth = c; + + // large elempack spills on image w + if (elempack == 8) width *= 2; + + VkImageType image_type; + VkImageViewType imageview_type; + if (dims == 1) + { + image_type = VK_IMAGE_TYPE_1D; + imageview_type = VK_IMAGE_VIEW_TYPE_1D; + + if (width > (int)vkdev->info.max_image_dimension_1d) + { + printf("image dimension too large %d > %d", width, (int)vkdev->info.max_image_dimension_1d); + return 0; + } + } + else if (dims == 2) + { + image_type = VK_IMAGE_TYPE_2D; + imageview_type = VK_IMAGE_VIEW_TYPE_2D; + + if (width > (int)vkdev->info.max_image_dimension_2d || height > (int)vkdev->info.max_image_dimension_2d) + { + printf("image dimension too large %d %d > %d", width, height, (int)vkdev->info.max_image_dimension_2d); + return 0; + } + } + else // if (dims == 3) + { + image_type = VK_IMAGE_TYPE_3D; + imageview_type = VK_IMAGE_VIEW_TYPE_3D; + + if 
(width > (int)vkdev->info.max_image_dimension_3d || height > (int)vkdev->info.max_image_dimension_3d || depth > (int)vkdev->info.max_image_dimension_3d) + { + printf("image dimension too large %d %d %d > %d", width, height, depth, (int)vkdev->info.max_image_dimension_3d); + return 0; + } + } + + VkImageMemory* ptr = new VkImageMemory; + + ptr->image = create_image(image_type, width, height, depth, format, VK_IMAGE_TILING_OPTIMAL, VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT); + + ptr->image_type = image_type; + ptr->imageview_type = imageview_type; + ptr->width = width; + ptr->height = height; + ptr->depth = depth; + ptr->format = format; + + // TODO respect VK_KHR_dedicated_allocation ? + VkMemoryRequirements memoryRequirements; + vkGetImageMemoryRequirements(vkdev->vkdevice(), ptr->image, &memoryRequirements); + + const size_t size = memoryRequirements.size; + const size_t alignment = std::max((size_t)memoryRequirements.alignment, bind_memory_offset_alignment); + + size_t aligned_size = alignSize(size, alignment); + + const int image_memory_block_count = image_memory_blocks.size(); + + // find first spare space in image_memory_blocks + for (int i=0; i >::iterator it = image_memory_budgets[i].begin(); + while (it != image_memory_budgets[i].end()) + { + // we cannot use it->first directly for base offset alignment + size_t bind_base_offset = it->first; + size_t bind_offset = alignSize(bind_base_offset, alignment); + size_t budget_size = it->second; + if (budget_size < aligned_size + (bind_offset - bind_base_offset)) + { + it++; + continue; + } + // bind at memory offset + ptr->memory = image_memory_blocks[i]; + ptr->bind_offset = bind_offset; + ptr->bind_capacity = aligned_size; + + vkBindImageMemory(vkdev->vkdevice(), ptr->image, ptr->memory, ptr->bind_offset); + + // do not allow host access to optimal tiling image + ptr->mapped_ptr = 0; + + ptr->imageview = create_imageview(imageview_type, ptr->image, format); + + ptr->access_flags = 0; + ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED; + ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + ptr->command_refcount = 0; + + if (bind_base_offset != bind_offset) + { + // NOTE there is small offset inside bind_base_offset and bind_offset + // adjust ptr->bind_offset and ptr->bind_capacity after vkBindImageMemory + // so that memory management could be easier + aligned_size += (bind_offset - bind_base_offset); + + ptr->bind_offset = bind_base_offset; + ptr->bind_capacity = aligned_size; + } + + // adjust image_memory_budgets + if (budget_size == aligned_size) + { + image_memory_budgets[i].erase(it); + } + else + { + it->first += aligned_size; + it->second -= aligned_size; + } + +// TLOG_INFO("VkBlobAllocator M %p +%lu %lu", ptr->memory, ptr->bind_offset, ptr->bind_capacity); + + return ptr; + } + } + + // setup memory type and alignment + if (image_memory_type_index == (uint32_t)-1) + { + if (vkdev->info.type == 1) + { + // integrated gpu, prefer unified memory + image_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0); + } + else + { + // discrete gpu, device local + image_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); + } + mappable = vkdev->is_mappable(image_memory_type_index); + coherent = vkdev->is_coherent(image_memory_type_index); + } + + // 
create new block + size_t new_block_size = std::max(block_size, aligned_size); + + // bind at memory offset + ptr->memory = allocate_memory(new_block_size, image_memory_type_index); + ptr->bind_offset = 0; + ptr->bind_capacity = aligned_size; + + // ignore memoryRequirements2.memoryRequirements.alignment as we always bind at zero offset + vkBindImageMemory(vkdev->vkdevice(), ptr->image, ptr->memory, ptr->bind_offset); + + // do not allow host access to optimal tiling image + ptr->mapped_ptr = 0; + + ptr->imageview = create_imageview(imageview_type, ptr->image, format); + + ptr->access_flags = 0; + ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED; + ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + ptr->command_refcount = 0; + + // adjust image_memory_budgets + image_memory_blocks.push_back(ptr->memory); + + std::list< std::pair > budget; + if (new_block_size > aligned_size) + { + budget.push_back(std::make_pair(aligned_size, new_block_size - aligned_size)); + } + image_memory_budgets.push_back(budget); + +// TLOG_INFO("VkBlobAllocator M %p +%lu %lu", ptr->memory, ptr->bind_offset, ptr->bind_capacity); + + return ptr; +} + + +void VkBlobAllocator::fastFree(VkBufferMemory* ptr) +{ +// TLOG_INFO("VkBlobAllocator F %p +%lu %lu", ptr->buffer, ptr->offset, ptr->capacity); + + const int buffer_block_count = buffer_blocks.size(); + + int block_index = -1; + for (int i=0; ibuffer == ptr->buffer && buffer_blocks[i]->memory == ptr->memory) + { + block_index = i; + break; + } + } + + if (block_index == -1) + { + printf("FATAL ERROR! unlocked VkBlobAllocator get wild %p", ptr->buffer); + + delete ptr; + + return; + } + + // merge + std::list< std::pair >::iterator it_merge_left = buffer_budgets[block_index].end(); + std::list< std::pair >::iterator it_merge_right = buffer_budgets[block_index].end(); + std::list< std::pair >::iterator it = buffer_budgets[block_index].begin(); + for ( ; it != buffer_budgets[block_index].end(); it++) + { + if (it->first + it->second == ptr->offset) + { + it_merge_left = it; + } + else if (ptr->offset + ptr->capacity == it->first) + { + it_merge_right = it; + } + } + + if (it_merge_left != buffer_budgets[block_index].end() && it_merge_right != buffer_budgets[block_index].end()) + { + it_merge_left->second = it_merge_right->first + it_merge_right->second - it_merge_left->first; + buffer_budgets[block_index].erase(it_merge_right); + } + else if (it_merge_left != buffer_budgets[block_index].end()) + { + it_merge_left->second = ptr->offset + ptr->capacity - it_merge_left->first; + } + else if (it_merge_right != buffer_budgets[block_index].end()) + { + it_merge_right->second = it_merge_right->first + it_merge_right->second - ptr->offset; + it_merge_right->first = ptr->offset; + } + else + { + if (ptr->offset == 0) + { + // chain leading block + buffer_budgets[block_index].push_front(std::make_pair(ptr->offset, ptr->capacity)); + } + else + { + buffer_budgets[block_index].push_back(std::make_pair(ptr->offset, ptr->capacity)); + } + } + + delete ptr; +} + +void VkBlobAllocator::fastFree(VkImageMemory* ptr) +{ +// TLOG_INFO("VkBlobAllocator F %p +%lu %lu", ptr->memory, ptr->bind_offset, ptr->bind_capacity); + + const int image_memory_block_count = image_memory_blocks.size(); + + int block_index = -1; + for (int i=0; imemory) + { + block_index = i; + break; + } + } + + if (block_index == -1) + { + printf("FATAL ERROR! 
unlocked VkBlobAllocator get wild %p\n", ptr->memory); + + if (!ptr->command_refcount) + { + vkDestroyImageView(vkdev->vkdevice(), ptr->imageview, 0); + vkDestroyImage(vkdev->vkdevice(), ptr->image, 0); + + delete ptr; + } + + return; + } + + // merge + std::list< std::pair >::iterator it_merge_left = image_memory_budgets[block_index].end(); + std::list< std::pair >::iterator it_merge_right = image_memory_budgets[block_index].end(); + std::list< std::pair >::iterator it = image_memory_budgets[block_index].begin(); + for ( ; it != image_memory_budgets[block_index].end(); it++) + { + if (it->first + it->second == ptr->bind_offset) + { + it_merge_left = it; + } + else if (ptr->bind_offset + ptr->bind_capacity == it->first) + { + it_merge_right = it; + } + } + + if (it_merge_left != image_memory_budgets[block_index].end() && it_merge_right != image_memory_budgets[block_index].end()) + { + it_merge_left->second = it_merge_right->first + it_merge_right->second - it_merge_left->first; + image_memory_budgets[block_index].erase(it_merge_right); + } + else if (it_merge_left != image_memory_budgets[block_index].end()) + { + it_merge_left->second = ptr->bind_offset + ptr->bind_capacity - it_merge_left->first; + } + else if (it_merge_right != image_memory_budgets[block_index].end()) + { + it_merge_right->second = it_merge_right->first + it_merge_right->second - ptr->bind_offset; + it_merge_right->first = ptr->bind_offset; + } + else + { + if (ptr->bind_offset == 0) + { + // chain leading block + image_memory_budgets[block_index].push_front(std::make_pair(ptr->bind_offset, ptr->bind_capacity)); + } + else + { + image_memory_budgets[block_index].push_back(std::make_pair(ptr->bind_offset, ptr->bind_capacity)); + } + } + + if (!ptr->command_refcount) + { + vkDestroyImageView(vkdev->vkdevice(), ptr->imageview, 0); + vkDestroyImage(vkdev->vkdevice(), ptr->image, 0); + + delete ptr; + } +} + +VkWeightAllocator::VkWeightAllocator(const GPUDevice* _vkdev) : VkAllocator(_vkdev) +{ + buffer_offset_alignment = vkdev->info.buffer_offset_alignment; + bind_memory_offset_alignment = vkdev->info.buffer_image_granularity; + + if (vkdev->info.type == 1) + { + // on integrated gpu, there may be device local only memory too, eg. 
AMD APU + // assuming larger alignment always keeps us safe :) + + // least common multiple for memory_map_alignment and buffer_offset_alignment and non_coherent_atom_size + buffer_offset_alignment = least_common_multiple(buffer_offset_alignment, vkdev->info.memory_map_alignment); + buffer_offset_alignment = least_common_multiple(buffer_offset_alignment, vkdev->info.non_coherent_atom_size); + } + + block_size = alignSize(8 * 1024 * 1024, buffer_offset_alignment);// 8M +} + +VkWeightAllocator::~VkWeightAllocator() +{ + //clear(); + printf("run VkWeightAllocator descontruction function\n"); +} + + +void VkWeightAllocator::clear() +{ + printf("run VkWeightAllocator clear function\n"); +} + +VkBufferMemory* VkWeightAllocator::fastMalloc(size_t size) +{ + // printf("VkWeightAllocator fastMalloc %lu\n", size); + + size_t aligned_size = alignSize(size, buffer_offset_alignment); + + const int buffer_block_count = buffer_blocks.size(); + + // find first spare space in buffer_blocks + for (int i=0; i= aligned_size) + { + size_t block_offset = block_size - free_size; + // return sub buffer + VkBufferMemory* ptr = new VkBufferMemory; + + ptr->buffer = buffer_blocks[i]->buffer; + ptr->offset = block_offset; + ptr->memory = buffer_blocks[i]->memory; + ptr->capacity = aligned_size; + ptr->mapped_ptr = buffer_blocks[i]->mapped_ptr; + ptr->access_flags = 0; + ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + + buffer_block_free_spaces[i] -= aligned_size; + + return ptr; + } + } + size_t new_block_size = std::max(block_size, aligned_size); + + // create new block + VkBufferMemory* block = new VkBufferMemory; + + block->buffer = create_buffer(new_block_size, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT); + block->offset = 0; + + if (vkdev->info.support_VK_KHR_get_memory_requirements2 && vkdev->info.support_VK_KHR_dedicated_allocation) + { + VkBufferMemoryRequirementsInfo2KHR bufferMemoryRequirementsInfo2; + bufferMemoryRequirementsInfo2.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_REQUIREMENTS_INFO_2_KHR; + bufferMemoryRequirementsInfo2.pNext = 0; + bufferMemoryRequirementsInfo2.buffer = block->buffer; + + VkMemoryRequirements2KHR memoryRequirements2; + memoryRequirements2.sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2_KHR; + memoryRequirements2.pNext = 0; + + VkMemoryDedicatedRequirementsKHR memoryDedicatedRequirements; + memoryDedicatedRequirements.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS_KHR; + memoryDedicatedRequirements.pNext = 0; + memoryRequirements2.pNext = &memoryDedicatedRequirements; + + vkdev->vkGetBufferMemoryRequirements2KHR(vkdev->vkdevice(), &bufferMemoryRequirementsInfo2, &memoryRequirements2); + + bool dedicatedAllocation = memoryDedicatedRequirements.requiresDedicatedAllocation || memoryDedicatedRequirements.prefersDedicatedAllocation; + + if (dedicatedAllocation) + { + // setup memory type and alignment + if (buffer_memory_type_index == (uint32_t)-1) + { + if (vkdev->info.type == 1) + { + // integrated gpu, prefer unified memory + buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0); + } + else + { + // discrete gpu, device local + buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); + } + + mappable = vkdev->is_mappable(buffer_memory_type_index); + coherent = 
vkdev->is_coherent(buffer_memory_type_index); + } + + block->memory = allocate_dedicated_memory(memoryRequirements2.memoryRequirements.size, buffer_memory_type_index, 0, block->buffer); + // ignore memoryRequirements2.memoryRequirements.alignment as we always bind at zero offset + vkBindBufferMemory(vkdev->vkdevice(), block->buffer, block->memory, 0); + + block->mapped_ptr = 0; + if (mappable) + { + vkMapMemory(vkdev->vkdevice(), block->memory, 0, new_block_size, 0, &block->mapped_ptr); + } + + dedicated_buffer_blocks.push_back(block); + + // return sub buffer + VkBufferMemory* ptr = new VkBufferMemory; + + ptr->buffer = block->buffer; + ptr->offset = 0; + ptr->memory = block->memory; + ptr->capacity = new_block_size; + ptr->mapped_ptr = block->mapped_ptr; + ptr->access_flags = 0; + ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + + return ptr; + } + } + + VkMemoryRequirements memoryRequirements; + vkGetBufferMemoryRequirements(vkdev->vkdevice(), block->buffer, &memoryRequirements); + + // setup memory type and alignment + if (buffer_memory_type_index == (uint32_t)-1) + { + if (vkdev->info.type == 1) + { + // integrated gpu, prefer unified memory + buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0); + } + else + { + // discrete gpu, device local + buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); + } + + mappable = vkdev->is_mappable(buffer_memory_type_index); + coherent = vkdev->is_coherent(buffer_memory_type_index); + } + + block->memory = allocate_memory(memoryRequirements.size, buffer_memory_type_index); + + // ignore memoryRequirements.alignment as we always bind at zero offset + vkBindBufferMemory(vkdev->vkdevice(), block->buffer, block->memory, 0); + +// printf("VkWeightAllocator M %p", block->buffer); + block->mapped_ptr = 0; + if (mappable) + { + vkMapMemory(vkdev->vkdevice(), block->memory, 0, new_block_size, 0, &block->mapped_ptr); + } + + buffer_blocks.push_back(block); + + buffer_block_free_spaces.push_back(new_block_size - aligned_size); + + // return sub buffer + VkBufferMemory* ptr = new VkBufferMemory; + + ptr->buffer = block->buffer; + ptr->offset = 0; + ptr->memory = block->memory; + ptr->capacity = aligned_size; + ptr->mapped_ptr = block->mapped_ptr; + ptr->access_flags = 0; + ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + + return ptr; +} + +VkImageMemory* VkWeightAllocator::fastMalloc(int dims, int w, int h, int c, size_t elemsize, int elempack) +{ + if (elempack != 1 && elempack != 4 && elempack != 8 && elempack != 16 && elempack != 32 && elempack != 64) + { + printf("elempack must be 1 4 8 16 32 64\n"); + return 0; + } + + // resolve format + VkFormat format = VK_FORMAT_UNDEFINED; + + if (elemsize / elempack == 4) + { + // fp32 + if (elempack == 1) format = VK_FORMAT_R32_SFLOAT; + if (elempack == 4) format = VK_FORMAT_R32G32B32A32_SFLOAT; + if (elempack == 8) format = VK_FORMAT_R32G32B32A32_SFLOAT; + if (elempack == 16) format = VK_FORMAT_R32G32B32A32_SFLOAT; + if (elempack == 32) format = VK_FORMAT_R32G32B32A32_SFLOAT; + if (elempack == 64) format = VK_FORMAT_R32G32B32A32_SFLOAT; + } + if (elemsize / elempack == 2) + { + // fp16 + if (elempack == 1) format = VK_FORMAT_R16_SFLOAT; + if (elempack == 4) format = VK_FORMAT_R16G16B16A16_SFLOAT; + if (elempack == 8) format = VK_FORMAT_R16G16B16A16_SFLOAT; + if (elempack == 16) 
format = VK_FORMAT_R16G16B16A16_SFLOAT; + if (elempack == 32) format = VK_FORMAT_R16G16B16A16_SFLOAT; + if (elempack == 64) format = VK_FORMAT_R16G16B16A16_SFLOAT; + } + + // resolve image width height depth + int width = w; + int height = h; + int depth = c; + + // large elempack spills on image w + if (elempack == 8) width *= 2; + if (elempack == 16) width *= 4; + if (elempack == 32) width *= 8; + if (elempack == 64) width *= 16; + + VkImageType image_type; + VkImageViewType imageview_type; + if (dims == 1) + { + image_type = VK_IMAGE_TYPE_1D; + imageview_type = VK_IMAGE_VIEW_TYPE_1D; + + if (width > (int)vkdev->info.max_image_dimension_1d) + { + printf("image dimension too large %d > %d\n", width, (int)vkdev->info.max_image_dimension_1d); + return 0; + } + } + else if (dims == 2) + { + image_type = VK_IMAGE_TYPE_2D; + imageview_type = VK_IMAGE_VIEW_TYPE_2D; + + if (width > (int)vkdev->info.max_image_dimension_2d || height > (int)vkdev->info.max_image_dimension_2d) + { + printf("image dimension too large %d %d > %d \n", width, height, (int)vkdev->info.max_image_dimension_2d); + return 0; + } + } + else // if (dims == 3) + { + image_type = VK_IMAGE_TYPE_3D; + imageview_type = VK_IMAGE_VIEW_TYPE_3D; + + if (width > (int)vkdev->info.max_image_dimension_3d || height > (int)vkdev->info.max_image_dimension_3d || depth > (int)vkdev->info.max_image_dimension_3d) + { + printf("image dimension too large %d %d %d > %d \n", width, height, depth, (int)vkdev->info.max_image_dimension_3d); + return 0; + } + } + + VkImageMemory* ptr = new VkImageMemory; + + ptr->image = create_image(image_type, width, height, depth, format, VK_IMAGE_TILING_OPTIMAL, VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT); + + ptr->image_type = image_type; + ptr->imageview_type = imageview_type; + ptr->width = width; + ptr->height = height; + ptr->depth = depth; + ptr->format = format; + + if (vkdev->info.support_VK_KHR_get_memory_requirements2 && vkdev->info.support_VK_KHR_dedicated_allocation) + { + VkImageMemoryRequirementsInfo2KHR imageMemoryRequirementsInfo2; + imageMemoryRequirementsInfo2.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2_KHR; + imageMemoryRequirementsInfo2.pNext = 0; + imageMemoryRequirementsInfo2.image = ptr->image; + + VkMemoryRequirements2KHR memoryRequirements2; + memoryRequirements2.sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2_KHR; + memoryRequirements2.pNext = 0; + + VkMemoryDedicatedRequirementsKHR memoryDedicatedRequirements; + memoryDedicatedRequirements.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS_KHR; + memoryDedicatedRequirements.pNext = 0; + memoryRequirements2.pNext = &memoryDedicatedRequirements; + + vkdev->vkGetImageMemoryRequirements2KHR(vkdev->vkdevice(), &imageMemoryRequirementsInfo2, &memoryRequirements2); + + bool dedicatedAllocation = memoryDedicatedRequirements.requiresDedicatedAllocation || memoryDedicatedRequirements.prefersDedicatedAllocation; + + if (dedicatedAllocation) + { + // setup memory type and alignment + if (image_memory_type_index == (uint32_t)-1) + { + if (vkdev->info.type == 1) + { + // integrated gpu, prefer unified memory + image_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0); + } + else + { + // discrete gpu, device local + image_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, 
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); + } + + mappable = vkdev->is_mappable(image_memory_type_index); + coherent = vkdev->is_coherent(image_memory_type_index); + } + + // bind memory + ptr->memory = allocate_dedicated_memory(memoryRequirements2.memoryRequirements.size, image_memory_type_index, ptr->image, 0); + ptr->bind_offset = 0; + ptr->bind_capacity = memoryRequirements2.memoryRequirements.size; + + // ignore memoryRequirements2.memoryRequirements.alignment as we always bind at zero offset + vkBindImageMemory(vkdev->vkdevice(), ptr->image, ptr->memory, ptr->bind_offset); + + // do not allow host access to optimal tiling image + ptr->mapped_ptr = 0; + + ptr->imageview = create_imageview(imageview_type, ptr->image, format); + + ptr->access_flags = 0; + ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED; + ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + ptr->command_refcount = 0; + + dedicated_image_memory_blocks.push_back(ptr->memory); + + return ptr; + } + } + + VkMemoryRequirements memoryRequirements; + vkGetImageMemoryRequirements(vkdev->vkdevice(), ptr->image, &memoryRequirements); + + const size_t size = memoryRequirements.size; + const size_t alignment = std::max((size_t)memoryRequirements.alignment, bind_memory_offset_alignment); + + size_t aligned_size = alignSize(size, alignment); + + const int image_memory_block_count = image_memory_blocks.size(); + + // find first spare space in buffer_blocks + for (int i=0; i= aligned_size + (bind_offset - bind_base_offset)) + { + // bind at memory offset + ptr->memory = image_memory_blocks[i]; + ptr->bind_offset = bind_offset; + ptr->bind_capacity = aligned_size; + + vkBindImageMemory(vkdev->vkdevice(), ptr->image, ptr->memory, ptr->bind_offset); + + // do not allow host access to optimal tiling image + ptr->mapped_ptr = 0; + + ptr->imageview = create_imageview(imageview_type, ptr->image, format); + + ptr->access_flags = 0; + ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED; + ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + ptr->command_refcount = 0; + + if (bind_base_offset != bind_offset) + { + // NOTE there is small offset inside bind_base_offset and bind_offset + // adjust ptr->bind_offset and ptr->bind_capacity after vkBindImageMemory + // so that memory management could be easier + aligned_size += (bind_offset - bind_base_offset); + + ptr->bind_offset = bind_base_offset; + ptr->bind_capacity = aligned_size; + } + + image_memory_block_free_spaces[i] -= aligned_size; + + return ptr; + } + } + + // setup memory type and alignment + if (image_memory_type_index == (uint32_t)-1) + { + if (vkdev->info.type == 1) + { + // integrated gpu, prefer unified memory + image_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0); + } + else + { + // discrete gpu, device local + image_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); + } + + mappable = vkdev->is_mappable(image_memory_type_index); + coherent = vkdev->is_coherent(image_memory_type_index); + } + + // create new block + size_t new_block_size = std::max(block_size, aligned_size); + + // bind at memory offset + ptr->memory = allocate_memory(new_block_size, image_memory_type_index); + ptr->bind_offset = 0; + ptr->bind_capacity = aligned_size; + + // ignore memoryRequirements2.memoryRequirements.alignment as we always bind at zero offset + vkBindImageMemory(vkdev->vkdevice(), 
ptr->image, ptr->memory, ptr->bind_offset); + + // do not allow host access to optimal tiling image + ptr->mapped_ptr = 0; + + ptr->imageview = create_imageview(imageview_type, ptr->image, format); + + ptr->access_flags = 0; + ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED; + ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + ptr->command_refcount = 0; + + image_memory_blocks.push_back(ptr->memory); + image_memory_block_free_spaces.push_back(new_block_size - aligned_size); + + return ptr; +} + + +void VkWeightAllocator::fastFree(VkBufferMemory* ptr) +{ +// TLOG_INFO("VkWeightAllocator F %p", ptr->buffer); + + delete ptr; +} + +void VkWeightAllocator::fastFree(VkImageMemory* ptr) +{ +// TLOG_INFO("VkWeightAllocator F %p", ptr->memory); + + if (!ptr->command_refcount) + { + vkDestroyImageView(vkdev->vkdevice(), ptr->imageview, 0); + vkDestroyImage(vkdev->vkdevice(), ptr->image, 0); + + delete ptr; + } +} + +VkStagingAllocator::VkStagingAllocator(const GPUDevice* _vkdev) : VkAllocator(_vkdev) +{ + mappable = true; + coherent = true; + + size_compare_ratio = 192;// 0.75f * 256 +} + +VkStagingAllocator::~VkStagingAllocator() +{ + clear(); +} + +void VkStagingAllocator::clear() +{ +// TLOG_INFO("VkStagingAllocator %lu", buffer_budgets.size()); + + for (std::list::iterator it = buffer_budgets.begin(); it != buffer_budgets.end(); it++) + { + VkBufferMemory* ptr = *it; + +// TLOG_INFO("VkStagingAllocator F %p", ptr->buffer); + + vkUnmapMemory(vkdev->vkdevice(), ptr->memory); + vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0); + vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0); + + delete ptr; + } + buffer_budgets.clear(); +} + +VkBufferMemory* VkStagingAllocator::fastMalloc(size_t size) +{ + // printf("VkStagingAllocator fastMalloc %lu\n", size); + // find free budget + std::list::iterator it = buffer_budgets.begin(); + for (; it != buffer_budgets.end(); it++) + { + VkBufferMemory* ptr = *it; + + size_t capacity = ptr->capacity; + + // size_compare_ratio ~ 100% + if (capacity >= size && ((capacity * size_compare_ratio) >> 8) <= size) + { + buffer_budgets.erase(it); + +// TLOG_INFO("VkStagingAllocator M %p %lu reused %lu", ptr->buffer, size, capacity); + + return ptr; + } + } + + VkBufferMemory* ptr = new VkBufferMemory; + + ptr->buffer = create_buffer(size, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT); + ptr->offset = 0; + + VkMemoryRequirements memoryRequirements; + vkGetBufferMemoryRequirements(vkdev->vkdevice(), ptr->buffer, &memoryRequirements); + + // setup memory type + if (buffer_memory_type_index == (uint32_t)-1) + { + buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_MEMORY_PROPERTY_HOST_CACHED_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT); + } + + ptr->memory = allocate_memory(memoryRequirements.size, buffer_memory_type_index); + + // ignore memoryRequirements.alignment as we always bind at zero offset + vkBindBufferMemory(vkdev->vkdevice(), ptr->buffer, ptr->memory, 0); + + ptr->capacity = size; + + vkMapMemory(vkdev->vkdevice(), ptr->memory, 0, size, 0, &ptr->mapped_ptr); + + ptr->access_flags = 0; + ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + +// TLOG_INFO("VkStagingAllocator M %p %lu", ptr->buffer, size); + + return ptr; +} + +VkImageMemory* VkStagingAllocator::fastMalloc(int dims, int w, int h, int c, size_t elemsize, int elempack) +{ + // staging image is mainly used for storing small piece of 
dynamic parameters + // we allocate host memory as a fake image, it's simple and good + + const size_t size = w * h * c * elemsize; + + VkImageType image_type; + VkImageViewType imageview_type; + if (dims == 1) + { + image_type = VK_IMAGE_TYPE_1D; + imageview_type = VK_IMAGE_VIEW_TYPE_1D; + } + else if (dims == 2) + { + image_type = VK_IMAGE_TYPE_2D; + imageview_type = VK_IMAGE_VIEW_TYPE_2D; + } + else // if (dims == 3) + { + image_type = VK_IMAGE_TYPE_3D; + imageview_type = VK_IMAGE_VIEW_TYPE_3D; + } + + VkImageMemory* ptr = new VkImageMemory; + + ptr->image = 0; + ptr->image_type = image_type; + ptr->imageview_type = imageview_type; + ptr->width = w; + ptr->height = h; + ptr->depth = c; + ptr->format = VK_FORMAT_UNDEFINED; + ptr->memory = 0; + ptr->bind_offset = 0; + ptr->bind_capacity = size; + + ptr->mapped_ptr = malloc(size); + + ptr->imageview = 0; + + ptr->access_flags = 0; + ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED; + ptr->stage_flags = VK_PIPELINE_STAGE_HOST_BIT; + ptr->command_refcount = 0; + +// TLOG_INFO("VkStagingAllocator M %p %d %d %d %d %d", ptr->image, dims, width, height, depth, format); + + return ptr; +} + +void VkStagingAllocator::fastFree(VkBufferMemory* ptr) +{ +// TLOG_INFO("VkStagingAllocator F %p", ptr->buffer); + + // return to buffer_budgets + buffer_budgets.push_back(ptr); +} + +void VkStagingAllocator::fastFree(VkImageMemory* ptr) +{ +// TLOG_INFO("VkStagingAllocator F %p", ptr->image); + + free(ptr->mapped_ptr); + + delete ptr; +} + +VkWeightStagingAllocator::VkWeightStagingAllocator(const GPUDevice* _vkdev) : VkAllocator(_vkdev) +{ + mappable = true; + coherent = true; +} + +VkWeightStagingAllocator::~VkWeightStagingAllocator() +{ +} + +VkBufferMemory* VkWeightStagingAllocator::fastMalloc(size_t size) +{ + printf("VkWeightStagingAllocator fastMalloc %lu\n", size); + VkBufferMemory* ptr = new VkBufferMemory; + + ptr->buffer = create_buffer(size, VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT); + ptr->offset = 0; + + VkMemoryRequirements memoryRequirements; + vkGetBufferMemoryRequirements(vkdev->vkdevice(), ptr->buffer, &memoryRequirements); + + // setup memory type + if (buffer_memory_type_index == (uint32_t)-1) + { + buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_MEMORY_PROPERTY_HOST_CACHED_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT); + } + + ptr->memory = allocate_memory(memoryRequirements.size, buffer_memory_type_index); + + // ignore memoryRequirements.alignment as we always bind at zero offset + vkBindBufferMemory(vkdev->vkdevice(), ptr->buffer, ptr->memory, 0); + + ptr->capacity = size; + + vkMapMemory(vkdev->vkdevice(), ptr->memory, 0, size, 0, &ptr->mapped_ptr); + + ptr->access_flags = 0; + ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + +// printf("VkWeightStagingAllocator M %p %lu", ptr->buffer, size); + + return ptr; +} + +void VkWeightStagingAllocator::fastFree(VkBufferMemory* ptr) +{ +// TLOG_INFO("VkWeightStagingAllocator F %p", ptr->buffer); + + vkUnmapMemory(vkdev->vkdevice(), ptr->memory); + vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0); + vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0); + + delete ptr; +} + +} // namespace TEngine diff --git a/source/device/vulkan/vulkan_allocator.hpp b/source/device/vulkan/vulkan_allocator.hpp new file mode 100644 index 000000000..4a8f7e1c3 --- /dev/null +++ b/source/device/vulkan/vulkan_allocator.hpp @@ -0,0 +1,284 @@ +#ifndef VULKAN_ALLOCATOR_HPP 
+#define VULKAN_ALLOCATOR_HPP + +#include +#include +#include +#include +#include +#include +#include "vulkan_platform.hpp" + +namespace TEngine { + +#define MALLOC_ALIGN 16 + +template static inline _Tp* alignPtr(_Tp* ptr, int n=(int)sizeof(_Tp)) +{ + return (_Tp*)(((size_t)ptr + n-1) & -n); +} + +static inline size_t alignSize(size_t sz, int n) +{ + return (sz + n-1) & -n; +} + +static inline void* fastMalloc(size_t size) +{ + unsigned char* udata = (unsigned char*)malloc(size + sizeof(void*) + MALLOC_ALIGN); + if (!udata) + return 0; + unsigned char** adata = alignPtr((unsigned char**)udata + 1, MALLOC_ALIGN); + adata[-1] = udata; + return adata; +} + +static inline void fastFree(void* ptr) +{ + if (ptr) + { + unsigned char* udata = ((unsigned char**)ptr)[-1]; + free(udata); + } +} + +static inline int TENGINE_XADD(int* addr, int delta) { int tmp = *addr; *addr += delta; return tmp; } + + +class Allocator +{ +public: + virtual ~Allocator(); + virtual void* fastMalloc(size_t size) = 0; + virtual void fastFree(void* ptr) = 0; +}; + +// class PoolAllocator : public Allocator +// { +// public: +// PoolAllocator(); +// ~PoolAllocator(); + +// // ratio range 0 ~ 1 +// // default cr = 0.75 +// void set_size_compare_ratio(float scr); + +// // release all budgets immediately +// void clear(); + +// virtual void* fastMalloc(size_t size); +// virtual void fastFree(void* ptr); + +// private: +// Mutex budgets_lock; +// Mutex payouts_lock; +// unsigned int size_compare_ratio;// 0~256 +// std::list< std::pair > budgets; +// std::list< std::pair > payouts; +// }; + +// class UnlockedPoolAllocator : public Allocator +// { +// public: +// UnlockedPoolAllocator(); +// ~UnlockedPoolAllocator(); + +// // ratio range 0 ~ 1 +// // default cr = 0.75 +// void set_size_compare_ratio(float scr); + +// // release all budgets immediately +// void clear(); + +// virtual void* fastMalloc(size_t size); +// virtual void fastFree(void* ptr); + +// private: +// unsigned int size_compare_ratio;// 0~256 +// std::list< std::pair > budgets; +// std::list< std::pair > payouts; +// }; + +class GPUDevice; + +class VkBufferMemory +{ +public: + VkBuffer buffer; + + // the base offset assigned by allocator + size_t offset; + size_t capacity; + + VkDeviceMemory memory; + void* mapped_ptr; + + // buffer state, modified by command functions internally + mutable VkAccessFlags access_flags; + mutable VkPipelineStageFlags stage_flags; + + // initialize and modified by mat + int refcount; +}; + +class VkImageMemory +{ +public: + VkImage image; + VkImageView imageview; + + // underlying info assigned by allocator + VkImageType image_type; + VkImageViewType imageview_type; + int width; + int height; + int depth; + VkFormat format; + + VkDeviceMemory memory; + void* mapped_ptr; + + // the base offset assigned by allocator + size_t bind_offset; + size_t bind_capacity; + + // image state, modified by command functions internally + mutable VkAccessFlags access_flags; + mutable VkImageLayout image_layout; + mutable VkPipelineStageFlags stage_flags; + + // in-execution state, modified by command functions internally + mutable int command_refcount; + + // initialize and modified by mat + int refcount; +}; + +class VkAllocator +{ +public: + VkAllocator(const GPUDevice* _vkdev); + virtual ~VkAllocator() { clear(); } + virtual void clear() {} + + virtual VkBufferMemory* fastMalloc(size_t size) = 0; + virtual void fastFree(VkBufferMemory* ptr) = 0; + virtual int flush(VkBufferMemory* ptr); + virtual int invalidate(VkBufferMemory* ptr); + + virtual 
VkImageMemory* fastMalloc(int dims, int w, int h, int c, size_t elemsize, int elempack) = 0; + virtual void fastFree(VkImageMemory* ptr) = 0; + +public: + const GPUDevice* vkdev; + uint32_t buffer_memory_type_index; + uint32_t image_memory_type_index; + bool mappable; + bool coherent; + +protected: + VkBuffer create_buffer(size_t size, VkBufferUsageFlags usage); + VkDeviceMemory allocate_memory(size_t size, uint32_t memory_type_index); + VkDeviceMemory allocate_dedicated_memory(size_t size, uint32_t memory_type_index, VkImage image, VkBuffer buffer); + + VkImage create_image(VkImageType type, int width, int height, int depth, VkFormat format, VkImageTiling tiling, VkImageUsageFlags usage); + VkImageView create_imageview(VkImageViewType type, VkImage image, VkFormat format); +}; + +class VkBlobAllocator : public VkAllocator +{ +public: + VkBlobAllocator(const GPUDevice* vkdev); + virtual ~VkBlobAllocator(); + +public: + // release all budgets immediately + virtual void clear(); + + virtual VkBufferMemory* fastMalloc(size_t size); + virtual void fastFree(VkBufferMemory* ptr); + + virtual VkImageMemory* fastMalloc(int dims, int w, int h, int c, size_t elemsize, int elempack);//{ return 0; } + virtual void fastFree(VkImageMemory* ptr); + +protected: + size_t block_size; + size_t buffer_offset_alignment; + size_t bind_memory_offset_alignment; + std::vector< std::list< std::pair > > buffer_budgets; + std::vector buffer_blocks; + std::vector< std::list< std::pair > > image_memory_budgets; + std::vector image_memory_blocks; +}; + +class VkWeightAllocator : public VkAllocator +{ +public: + VkWeightAllocator(const GPUDevice* vkdev); + virtual ~VkWeightAllocator(); + +public: + // release all blocks immediately + virtual void clear(); + +public: + virtual VkBufferMemory* fastMalloc(size_t size); + virtual void fastFree(VkBufferMemory* ptr); + virtual VkImageMemory* fastMalloc(int dims, int w, int h, int c, size_t elemsize, int elempack);//{ return 0; } + virtual void fastFree(VkImageMemory* ptr); + +protected: + size_t block_size; + size_t buffer_offset_alignment; + size_t bind_memory_offset_alignment; + std::vector buffer_block_free_spaces; + std::vector buffer_blocks; + std::vector dedicated_buffer_blocks; + std::vector image_memory_block_free_spaces; + std::vector image_memory_blocks; + std::vector dedicated_image_memory_blocks; +}; + + +class VkStagingAllocator : public VkAllocator +{ +public: + VkStagingAllocator(const GPUDevice* vkdev); + virtual ~VkStagingAllocator(); + +public: + // ratio range 0 ~ 1 + // default cr = 0.75 + void set_size_compare_ratio(float scr); + + // release all budgets immediately + virtual void clear(); + + virtual VkBufferMemory* fastMalloc(size_t size); + virtual void fastFree(VkBufferMemory* ptr); + virtual VkImageMemory* fastMalloc(int dims, int w, int h, int c, size_t elemsize, int elempack);//{ return 0; } + virtual void fastFree(VkImageMemory* ptr); + +protected: + unsigned int size_compare_ratio;// 0~256 + std::list buffer_budgets; +}; + + +class VkWeightStagingAllocator : public VkAllocator +{ +public: + VkWeightStagingAllocator(const GPUDevice* vkdev); + virtual ~VkWeightStagingAllocator(); + +public: + virtual VkBufferMemory* fastMalloc(size_t size); + virtual void fastFree(VkBufferMemory* ptr); + virtual VkImageMemory* fastMalloc(int /*dims*/, int /*w*/, int /*h*/, int /*c*/, size_t /*elemsize*/, int /*elempack*/) { return 0; } + virtual void fastFree(VkImageMemory* /*ptr*/) {} + +protected: +}; + +} +#endif diff --git 
a/source/device/vulkan/vulkan_command.cpp b/source/device/vulkan/vulkan_command.cpp new file mode 100644 index 000000000..b5545fe6b --- /dev/null +++ b/source/device/vulkan/vulkan_command.cpp @@ -0,0 +1,1782 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2020, OPEN AI LAB + * Author: ddzhao@openailab.com + */ + +#include "vulkan_command.hpp" + +#include +#include "vulkan_option.hpp" +#include "vulkan_pipeline.hpp" +#include "vulkan_tensor.hpp" + +namespace TEngine { + +VkCompute::VkCompute(const GPUDevice* _vkdev) : vkdev(_vkdev) +{ + compute_command_pool = 0; + compute_command_buffer = 0; + compute_command_fence = 0; + + init(); +} + + +VkCompute::~VkCompute() +{ + for (size_t i=0; icommand_refcount, -1); + if (ptr->refcount == 0 && old_command_refcount == 1) + { + // no userspace reference and we are the last command reference + vkDestroyImageView(vkdev->vkdevice(), ptr->imageview, 0); + vkDestroyImage(vkdev->vkdevice(), ptr->image, 0); + + delete ptr; + } + else + { + // reference exists in user code or other command + } + } + image_blocks_to_destroy.clear(); + + if (!vkdev->info.support_VK_KHR_push_descriptor) + { + for (size_t i=0; ivkdevice(), descriptor_pools[i], 1, &descriptorsets[i]); + vkDestroyDescriptorPool(vkdev->vkdevice(), descriptor_pools[i], 0); + } + } + + vkDestroyFence(vkdev->vkdevice(), compute_command_fence, 0); + + vkFreeCommandBuffers(vkdev->vkdevice(), compute_command_pool, 1, &compute_command_buffer); + vkDestroyCommandPool(vkdev->vkdevice(), compute_command_pool, 0); +} + +void VkCompute::record_upload(tensor* src, VkTensor& dst, const Option& opt) +{ + Tensor src_tensor = Tensor(src); + record_upload(src_tensor, dst, opt); +// // const ir_tensor* src_fp16; +// // if (src.elemsize == src.elempack * 4u) +// if(src->elem_size == opt.elempack * 4u) +// { +// // cpu cast to fp16 (discrete gpu) +// if (vkdev->info.type == 0 && (opt.use_fp16_storage || (opt.use_fp16_packed && opt.elempack % 4 == 0))) +// { +// // ncnn::cast_float32_to_float16(src, src_fp16, opt); +// printf("need to add cast_float32_to_float16 here, fix me!\n"); +// } +// else +// { +// // src_fp16 = src; +// } +// } +// else +// { +// // src_fp16 = src; +// } + +// // upload +// VkTensor dst_staging; +// if (opt.blob_vkallocator->mappable) +// { +// // dst_staging.create_like(src_fp16, opt.blob_vkallocator); +// dst_staging.create_like(src, opt.blob_vkallocator); +// } +// else +// { +// // dst_staging.create_like(src_fp16, opt.staging_vkallocator); +// dst_staging.create_like(src, opt.staging_vkallocator); +// } +// if (dst_staging.empty()) +// return; + +// // stash staging +// upload_staging_buffers.push_back(dst_staging); + +// // TLOG_INFO("upload_staging_buffer %p -> %p +%d ~%d", src_fp16.data, 
dst_staging.buffer(), dst_staging.buffer_offset(), dst_staging.buffer_capacity()); + +// // memcpy src to device +// // memcpy(dst_staging.mapped_ptr(), src_fp16->data, src_fp16->elem_size * src_fp16->elem_num); +// memcpy(dst_staging.mapped_ptr(), src->data, src->elem_size * src->elem_num); +// dst_staging.allocator->flush(dst_staging.data); + +// // mark device host-write @ null +// dst_staging.data->access_flags = VK_ACCESS_HOST_WRITE_BIT; +// dst_staging.data->stage_flags = VK_PIPELINE_STAGE_HOST_BIT; + +// // TODO +// // not use pack for now------------------------ +// // // resolve dst_elempack +// int dims = src->dim_num; +// int elemcount = 0; +// // src dims[0-3] n c h w +// // if (dims == 1) elemcount = opt.elempack * src_fp16.w; +// // if (dims == 2) elemcount = opt.elempack * src_fp16.h; +// // if (dims == 3) elemcount = opt.elempack * src_fp16.c; +// if(dims == 4) +// elemcount = opt.elempack * src->dims[1]; +// else +// elemcount = opt.elempack * src->dims[0]; + +// int dst_elempack = 1; +// if (opt.use_shader_pack8) +// dst_elempack = elemcount % 8 == 0 ? 8 : elemcount % 4 == 0 ? 4 : 1; +// else +// dst_elempack = elemcount % 4 == 0 ? 4 : 1; + +// vkdev->convert_packing(dst_staging, dst, dst_elempack, *this, opt); +} + +void VkCompute::record_upload(const Tensor& src, VkTensor& dst, const Option& opt) +{ + // TLOG_INFO("record_upload buffer"); + + Tensor src_fp16; + if (src.elemsize == src.elempack * 4u) + { + // cpu cast to fp16 (discrete gpu) + if (vkdev->info.type == 0 && (opt.use_fp16_storage || (opt.use_fp16_packed && src.elempack % 4 == 0))) + { + // printf("do nothing for VkCompute record_upload cast_float32_to_float16, fix me\n"); + TEngine::cast_float32_to_float16(src, src_fp16, opt); + } + else + { + src_fp16 = src; + } + } + else + { + src_fp16 = src; + } + + // upload + VkTensor dst_staging; + if (opt.blob_vkallocator->mappable) + { + dst_staging.create_like(src_fp16, opt.blob_vkallocator); + } + else + { + dst_staging.create_like(src_fp16, opt.staging_vkallocator); + } + if (dst_staging.empty()) + return; + + // stash staging + upload_staging_buffers.push_back(dst_staging); + +// TLOG_INFO("upload_staging_buffer %p -> %p +%d ~%d", src_fp16.data, dst_staging.buffer(), dst_staging.buffer_offset(), dst_staging.buffer_capacity()); + + // memcpy src to device + memcpy(dst_staging.mapped_ptr(), src_fp16.data, src_fp16.total() * src_fp16.elemsize); + dst_staging.allocator->flush(dst_staging.data); + + // mark device host-write @ null + dst_staging.data->access_flags = VK_ACCESS_HOST_WRITE_BIT; + dst_staging.data->stage_flags = VK_PIPELINE_STAGE_HOST_BIT; + + // resolve dst_elempack + int dims = src_fp16.dims; + int elemcount = 0; + if (dims == 1) elemcount = src_fp16.elempack * src_fp16.w; + if (dims == 2) elemcount = src_fp16.elempack * src_fp16.h; + if (dims == 3) elemcount = src_fp16.elempack * src_fp16.c; + + int dst_elempack = 1; + if (opt.use_shader_pack8) + dst_elempack = elemcount % 8 == 0 ? 8 : elemcount % 4 == 0 ? 4 : 1; + else + dst_elempack = elemcount % 4 == 0 ? 
4 : 1; + + // gpu cast to fp16 on the fly (integrated gpu) + vkdev->convert_packing(dst_staging, dst, dst_elempack, *this, opt); +} + +void VkCompute::record_download(const VkTensor& src, tensor* dst, const Option& opt) +{ + Tensor dst_tensor; + record_download(src, dst_tensor, opt); + dst->data = dst_tensor.data; + + // Tensor feat; + // if (opt.use_packing_layout) + // { + // Tensor bottom_blob_unpacked; + // convert_packing(dst_tensor, bottom_blob_unpacked, 1, opt); + // feat = bottom_blob_unpacked; + // } + + // if (opt.use_bf16_storage) + // { + // if (feat.elemsize / feat.elempack == 2u) + // { + // Tensor feat_fp32; + // cast_bfloat16_to_float32(feat, feat_fp32, opt); + // feat = feat_fp32; + // } + // } + + // dst->data = feat.data; +} + +void VkCompute::record_download(const VkTensor& src, Tensor& dst, const Option& opt) +{ + int dims = src.dims; + int elemcount = 0; + if (dims == 1) elemcount = src.elempack * src.w; + if (dims == 2) elemcount = src.elempack * src.h; + if (dims == 3) elemcount = src.elempack * src.c; + + int dst_elempack = 1; + if (opt.use_packing_layout) + dst_elempack = elemcount % 4 == 0 ? 4 : 1; + else + dst_elempack = 1; + + // gpu cast to fp32 on the fly (integrated gpu) + Option opt_staging = opt; + if (vkdev->info.type != 0) + { + opt_staging.use_fp16_packed = false; + opt_staging.use_fp16_storage = false; + } + + VkTensor dst_staging; + if (opt_staging.blob_vkallocator->mappable) + { + vkdev->convert_packing(src, dst_staging, dst_elempack, *this, opt); + } + else + { + opt_staging.blob_vkallocator = opt.staging_vkallocator; + vkdev->convert_packing(src, dst_staging, dst_elempack, *this, opt_staging); + } + + // barrier device any @ compute to host-read @ compute + if (dst_staging.data->access_flags & VK_ACCESS_HOST_WRITE_BIT || dst_staging.data->stage_flags != VK_PIPELINE_STAGE_HOST_BIT) + { + VkBufferMemoryBarrier* barriers = new VkBufferMemoryBarrier[1]; + barriers[0].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + barriers[0].pNext = 0; + barriers[0].srcAccessMask = dst_staging.data->access_flags; + barriers[0].dstAccessMask = VK_ACCESS_HOST_READ_BIT; + barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].buffer = dst_staging.buffer(); + barriers[0].offset = dst_staging.buffer_offset(); + barriers[0].size = dst_staging.buffer_capacity(); + + VkPipelineStageFlags src_stage = dst_staging.data->stage_flags; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_HOST_BIT; + + if (vkdev->info.support_VK_KHR_push_descriptor) + { + vkCmdPipelineBarrier(compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, barriers, 0, 0); + delete[] barriers; + } + else + { + record r; + r.type = record::TYPE_buffer_barrers; + r.command_buffer = compute_command_buffer; + r.buffer_barrers.src_stage = src_stage; + r.buffer_barrers.dst_stage = dst_stage; + r.buffer_barrers.barrier_count = 1; + r.buffer_barrers.barriers = barriers; + delayed_records.push_back(r); + } + + // mark device host-read @ any + dst_staging.data->access_flags = VK_ACCESS_HOST_READ_BIT; + dst_staging.data->stage_flags = VK_PIPELINE_STAGE_HOST_BIT; + } + + // create dst + Tensor dst_fp16; + dst_fp16.create_like(dst_staging, opt.blob_allocator); + if (dst_fp16.empty()) + return; + + // download + download_post_buffers.push_back(dst_staging); + download_post_tensors_fp16.push_back(dst_fp16); + + // post memcpy device to dst + { + record r; + r.type = record::TYPE_post_download; + r.command_buffer = 0; + 
r.post_download.download_post_buffer_mat_offset = download_post_buffers.size() - 1; + r.post_download.download_post_mat_fp16_offset = download_post_tensors_fp16.size() - 1; + delayed_records.push_back(r); + } + + // cast to fp32 (discrete gpu) + if (dst_fp16.elemsize == dst_fp16.elempack * 2u) + { + if (vkdev->info.type == 0 && (opt.use_fp16_storage || (opt.use_fp16_packed && dst_fp16.elempack % 4 == 0))) + { + int dims = dst_fp16.dims; + if (dims == 1) + dst.create(dst_fp16.w, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator); + if (dims == 2) + dst.create(dst_fp16.w, dst_fp16.h, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator); + if (dims == 3) + dst.create(dst_fp16.w, dst_fp16.h, dst_fp16.c, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator); + + download_post_tensors_fp16.push_back(dst_fp16); + download_post_tensors.push_back(dst); + + record r; + r.type = record::TYPE_post_cast_float16_to_float32; + r.command_buffer = 0; + r.post_cast_float16_to_float32.download_post_mat_fp16_offset = download_post_tensors_fp16.size() - 1; + r.post_cast_float16_to_float32.download_post_mat_offset = download_post_tensors.size() - 1; + delayed_records.push_back(r); + } + else + { + dst = dst_fp16; + } + } + else + { + dst = dst_fp16; + } +} + +int VkCompute::submit_and_wait() +{ + // printf("VkCompute submit_and_wait\n"); + if (!vkdev->info.support_VK_KHR_push_descriptor) + { + // printf("start to run begin command buffer\n"); + begin_command_buffer(); + const size_t record_count = delayed_records.size(); + // printf("delayed_records count:%d\n", record_count); + + // handle delayed records + for (size_t i=0; iacquire_queue(vkdev->info.compute_queue_family_index); + if (compute_queue == 0) + { + printf("out of compute queue\n"); + return -1; + } + + // submit compute + { + VkSubmitInfo submitInfo; + submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + submitInfo.pNext = 0; + submitInfo.waitSemaphoreCount = 0; + submitInfo.pWaitSemaphores = 0; + submitInfo.pWaitDstStageMask = 0; + submitInfo.commandBufferCount = 1; + submitInfo.pCommandBuffers = &compute_command_buffer; + submitInfo.signalSemaphoreCount = 0; + submitInfo.pSignalSemaphores = 0; + + VkResult ret = vkQueueSubmit(compute_queue, 1, &submitInfo, compute_command_fence); + if (ret != VK_SUCCESS) + { + printf("vkQueueSubmit failed %d", ret); + vkdev->reclaim_queue(vkdev->info.compute_queue_family_index, compute_queue); + return -1; + } + } + + vkdev->reclaim_queue(vkdev->info.compute_queue_family_index, compute_queue); + + // wait + { + VkResult ret = vkWaitForFences(vkdev->vkdevice(), 1, &compute_command_fence, VK_TRUE, UINT64_MAX); + if (ret != VK_SUCCESS) + { + printf("vkWaitForFences failed %d", ret); + return -1; + } + } + + // handle delayed post records + for (size_t i=0; i %p", src.buffer(), src.buffer_offset(), src.buffer_capacity(), dst.data); + + src.allocator->invalidate(src.data); + // memcpy(dst.data, src.mapped_ptr(), dst.elem_size * dst.elem_num); + memcpy(dst.data, src.mapped_ptr(), dst.total() * dst.elemsize); + break; + } + case record::TYPE_post_cast_float16_to_float32: + { + // TODO + printf("submit delayed_records TYPE_post_cast_float16_to_float32, Do nothing, fix me\n"); + break; + } + default: + break; + } + } + + delayed_records.clear(); + + return 0; +} + + +int VkCompute::init() +{ + // compute_command_pool + { + VkCommandPoolCreateInfo commandPoolCreateInfo; + commandPoolCreateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; + 
commandPoolCreateInfo.pNext = 0; + commandPoolCreateInfo.flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT; + commandPoolCreateInfo.queueFamilyIndex = vkdev->info.compute_queue_family_index; + VkResult ret = vkCreateCommandPool(vkdev->vkdevice(), &commandPoolCreateInfo, 0, &compute_command_pool); + if (ret != VK_SUCCESS) + { + printf("vkCreateCommandPool failed %d", ret); + return -1; + } + } + // compute_command_buffer + { + VkCommandBufferAllocateInfo commandBufferAllocateInfo; + commandBufferAllocateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; + commandBufferAllocateInfo.pNext = 0; + commandBufferAllocateInfo.commandPool = compute_command_pool; + commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; + commandBufferAllocateInfo.commandBufferCount = 1; + + VkResult ret = vkAllocateCommandBuffers(vkdev->vkdevice(), &commandBufferAllocateInfo, &compute_command_buffer); + if (ret != VK_SUCCESS) + { + printf("vkAllocateCommandBuffers failed %d", ret); + return -1; + } + } + + // compute_command_fence + { + VkFenceCreateInfo fenceCreateInfo; + fenceCreateInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO; + fenceCreateInfo.pNext = 0; + fenceCreateInfo.flags = 0; + + VkResult ret = vkCreateFence(vkdev->vkdevice(), &fenceCreateInfo, 0, &compute_command_fence); + if (ret != VK_SUCCESS) + { + printf("vkCreateFence failed %d", ret); + return -1; + } + } + + if (vkdev->info.support_VK_KHR_push_descriptor) + { + begin_command_buffer(); + } + + return 0; +} + +int VkCompute::begin_command_buffer() +{ + VkCommandBufferBeginInfo commandBufferBeginInfo; + commandBufferBeginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; + commandBufferBeginInfo.pNext = 0; + commandBufferBeginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; + commandBufferBeginInfo.pInheritanceInfo = 0; + + VkResult ret = vkBeginCommandBuffer(compute_command_buffer, &commandBufferBeginInfo); + if (ret != VK_SUCCESS) + { + printf("vkBeginCommandBuffer failed %d", ret); + return -1; + } + return 0; +} + +int VkCompute::end_command_buffer() +{ + VkResult ret = vkEndCommandBuffer(compute_command_buffer); + if (ret != VK_SUCCESS) + { + printf("vkEndCommandBuffer failed %d", ret); + return -1; + } + + return 0; +} + +void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vector& bindings, const std::vector& constants, const VkTensor& dispatcher) +{ + record_pipeline(pipeline, bindings, std::vector(), constants, dispatcher); +} + +void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vector& bindings, const std::vector& constants, const VkImageTensor& dispatcher) +{ + record_pipeline(pipeline, std::vector(), bindings, constants, dispatcher); +} + +void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vector& buffer_bindings, const std::vector& image_bindings, const std::vector& constants, const VkTensor& dispatcher) +{ + // Mat dispatcher_mat(dispatcher.w, dispatcher.h, dispatcher.c, (void*)0); + + record_pipeline(pipeline, buffer_bindings, image_bindings, constants, dispatcher.w, dispatcher.h, dispatcher.c); +} + +void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vector& buffer_bindings, const std::vector& image_bindings, const std::vector& constants, const VkImageTensor& dispatcher) +{ + // VkTensor dispatcher_VkTensor(dispatcher.w, dispatcher.h, dispatcher.c, (void*)0); + + record_pipeline(pipeline, buffer_bindings, image_bindings, constants, dispatcher.w, dispatcher.h, dispatcher.c); +} + +void VkCompute::record_pipeline(const 
Pipeline* pipeline, const std::vector& buffer_bindings, const std::vector& image_bindings, const std::vector& constants, int dispatcher_w, int dispatcher_h, int dispatcher_c) +{ + const int buffer_binding_count = (int)buffer_bindings.size(); + const int image_binding_count = (int)image_bindings.size(); + const int constant_count = (int)constants.size(); + + const int binding_count = buffer_binding_count + image_binding_count; + + if (binding_count != pipeline->shader_info.binding_count) + { + printf("binding_count not match, expect %d but got %d + %d", pipeline->shader_info.binding_count, buffer_binding_count, image_binding_count); + } + + if (constant_count != pipeline->shader_info.push_constant_count) + { + printf("push_constant_count not match, expect %d but got %d", pipeline->shader_info.push_constant_count, constant_count); + } + + int buffer_index = 0; + int image_index = 0; + for (int i=0; ishader_info.binding_types[i]; + + if (binding_type == 1) + { + const VkTensor& binding = buffer_bindings[buffer_index].empty() ? vkdev->get_dummy_buffer() : buffer_bindings[buffer_index]; + buffer_index++; + +// TLOG_INFO("binding #%d buffer = %d %d %d %d @ %lu %d = %p +%ld ~%ld", i, binding.dims, binding.w, binding.h, binding.c, binding.elemsize, binding.elempack, binding.buffer(), binding.buffer_offset(), binding.buffer_capacity()); + + if (binding.data->access_flags & VK_ACCESS_SHADER_WRITE_BIT || binding.data->stage_flags != VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT) + { + // barrier device any @ compute/null to shader-readwrite @ compute + VkBufferMemoryBarrier* barriers = new VkBufferMemoryBarrier[1]; + barriers[0].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + barriers[0].pNext = 0; + barriers[0].srcAccessMask = binding.data->access_flags; + barriers[0].dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT; + barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].buffer = binding.buffer(); + barriers[0].offset = binding.buffer_offset(); + barriers[0].size = binding.buffer_capacity(); + + VkPipelineStageFlags src_stage = binding.data->stage_flags; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + + if (vkdev->info.support_VK_KHR_push_descriptor) + { + vkCmdPipelineBarrier(compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, barriers, 0, 0); + delete[] barriers; + } + else + { + record r; + r.type = record::TYPE_buffer_barrers; + r.command_buffer = compute_command_buffer; + r.buffer_barrers.src_stage = src_stage; + r.buffer_barrers.dst_stage = dst_stage; + r.buffer_barrers.barrier_count = 1; + r.buffer_barrers.barriers = barriers; + delayed_records.push_back(r); + } + + // mark device shader-readwrite @ compute + binding.data->access_flags = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT; + binding.data->stage_flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + } + } + else if (binding_type == 2) + { + const VkImageTensor& binding = image_bindings[image_index].empty() ? 
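+            // Storage-image bindings follow the same hazard-tracking idea as the buffer
+            // case above, but with a VkImageMemoryBarrier that also transitions the image
+            // to VK_IMAGE_LAYOUT_GENERAL (the layout storage-image access requires); the
+            // tensor's cached access/layout/stage flags are then updated so an already
+            // shader-ready image does not get a redundant barrier next time.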
vkdev->get_dummy_image() : image_bindings[image_index]; + image_index++; + +// TLOG_INFO("binding #%d image = %d %d %d %d @ %lu %d = %p +%ld ~%ld %p", i, binding.dims, binding.w, binding.h, binding.c, binding.elemsize, binding.elempack, binding.image(), binding.data->bind_offset, binding.data->bind_capacity, binding.imageview()); + + if (binding.data->access_flags & VK_ACCESS_SHADER_WRITE_BIT || binding.data->image_layout != VK_IMAGE_LAYOUT_GENERAL || binding.data->stage_flags != VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT) + { + // image layout transform any @ any to shader-write @ compute + VkImageMemoryBarrier* barriers = new VkImageMemoryBarrier[1]; + barriers[0].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; + barriers[0].pNext = 0; + barriers[0].srcAccessMask = binding.data->access_flags; + barriers[0].dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT; + barriers[0].oldLayout = binding.data->image_layout; + barriers[0].newLayout = VK_IMAGE_LAYOUT_GENERAL; + barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].image = binding.image(); + barriers[0].subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + barriers[0].subresourceRange.baseMipLevel = 0; + barriers[0].subresourceRange.levelCount = 1; + barriers[0].subresourceRange.baseArrayLayer = 0; + barriers[0].subresourceRange.layerCount = 1; + + VkPipelineStageFlags src_stage = binding.data->stage_flags; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + + if (vkdev->info.support_VK_KHR_push_descriptor) + { + vkCmdPipelineBarrier(compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, barriers); + delete[] barriers; + } + else + { + record r; + r.type = record::TYPE_image_barrers; + r.command_buffer = compute_command_buffer; + r.image_barrers.src_stage = src_stage; + r.image_barrers.dst_stage = dst_stage; + r.image_barrers.barrier_count = 1; + r.image_barrers.barriers = barriers; + delayed_records.push_back(r); + } + + // mark image shader-write @ compute + binding.data->access_flags = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT; + binding.data->image_layout = VK_IMAGE_LAYOUT_GENERAL; + binding.data->stage_flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + } + + // image and imageview can not be destroyed until command execution ends + TENGINE_XADD(&binding.data->command_refcount, 1); + image_blocks_to_destroy.push_back(binding.data); + } + else // if (binding_type == 3) + { + const VkImageTensor& binding = image_bindings[image_index].empty() ? 
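+            // Combined-image-sampler bindings are transitioned to
+            // VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL instead, unless the very same image
+            // is also bound as a storage image in this dispatch (the loop below is meant to
+            // detect that case and leave the image in GENERAL). command_refcount is bumped
+            // so the image and its view outlive command execution.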
vkdev->get_dummy_image() : image_bindings[image_index]; + image_index++; + +// TLOG_INFO("binding #%d sampler = %d %d %d %d @ %lu %d = %p +%ld ~%ld %p", i, binding.dims, binding.w, binding.h, binding.c, binding.elemsize, binding.elempack, binding.image(), binding.data->bind_offset, binding.data->bind_capacity, binding.imageview()); + + // if the same image used for both storage image and combined image sampler + // only apply image layout transition to general + for (int j=0; jshader_info.binding_types[j] == 2 && binding.data == image_bindings[j].data) + { + // the same image is used as storage image, skip it + continue; + } + } + + if (binding.data->access_flags & VK_ACCESS_SHADER_WRITE_BIT || binding.data->image_layout != VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL || binding.data->stage_flags != VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT) + { + // image layout transform any @ any to shader-readonly-optimal @ compute + VkImageMemoryBarrier* barriers = new VkImageMemoryBarrier[1]; + barriers[0].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; + barriers[0].pNext = 0; + barriers[0].srcAccessMask = binding.data->access_flags; + barriers[0].dstAccessMask = VK_ACCESS_SHADER_READ_BIT; + barriers[0].oldLayout = binding.data->image_layout; + barriers[0].newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].image = binding.image(); + barriers[0].subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + barriers[0].subresourceRange.baseMipLevel = 0; + barriers[0].subresourceRange.levelCount = 1; + barriers[0].subresourceRange.baseArrayLayer = 0; + barriers[0].subresourceRange.layerCount = 1; + + VkPipelineStageFlags src_stage = binding.data->stage_flags; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + + if (vkdev->info.support_VK_KHR_push_descriptor) + { + vkCmdPipelineBarrier(compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, barriers); + delete[] barriers; + } + else + { + record r; + r.type = record::TYPE_image_barrers; + r.command_buffer = compute_command_buffer; + r.image_barrers.src_stage = src_stage; + r.image_barrers.dst_stage = dst_stage; + r.image_barrers.barrier_count = 1; + r.image_barrers.barriers = barriers; + delayed_records.push_back(r); + } + + // mark image shader-readonly-optimal @ compute + binding.data->access_flags = VK_ACCESS_SHADER_READ_BIT; + binding.data->image_layout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + binding.data->stage_flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + } + + // image and imageview can not be destroyed until command execution ends + TENGINE_XADD(&binding.data->command_refcount, 1); + image_blocks_to_destroy.push_back(binding.data); + } + } + // record bind pipeline + { + if (vkdev->info.support_VK_KHR_push_descriptor) + { + vkCmdBindPipeline(compute_command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline->pipeline); + } + else + { + record r; + r.type = record::TYPE_bind_pipeline; + r.command_buffer = compute_command_buffer; + r.bind_pipeline.bind_point = VK_PIPELINE_BIND_POINT_COMPUTE; + r.bind_pipeline.pipeline = pipeline->pipeline; + delayed_records.push_back(r); + } + } + + // record update bindings + if (binding_count > 0) + { + std::vector descriptorInfos; + { + descriptorInfos.resize(sizeof(VkDescriptorBufferInfo) * buffer_binding_count + sizeof(VkDescriptorImageInfo) * image_binding_count); + + unsigned char* p_descriptorInfos = descriptorInfos.data(); + int 
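+            // p_descriptorInfos walks a raw byte blob that mirrors the descriptor update
+            // template: one VkDescriptorBufferInfo per storage-buffer binding and one
+            // VkDescriptorImageInfo per image/sampler binding, laid out in binding order.
+            // Illustrative layout for a 3-binding shader (buffer, image, sampler):
+            //   [ VkDescriptorBufferInfo | VkDescriptorImageInfo | VkDescriptorImageInfo ]
+            // This flat format is what vkCmdPushDescriptorSetWithTemplateKHR and
+            // vkUpdateDescriptorSetWithTemplateKHR consume below.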
descriptorBufferInfo_index = 0; + int descriptorImageInfo_index = 0; + for (int i=0; ishader_info.binding_types[i]; + + if (binding_type == 1) + { + const VkTensor& binding = buffer_bindings[descriptorBufferInfo_index].empty() ? vkdev->get_dummy_buffer() : buffer_bindings[descriptorBufferInfo_index]; + descriptorBufferInfo_index++; + + VkDescriptorBufferInfo descriptorBufferInfo; + descriptorBufferInfo.buffer = binding.buffer(); + descriptorBufferInfo.offset = binding.buffer_offset(); + descriptorBufferInfo.range = binding.total() * binding.elemsize; + + memcpy(p_descriptorInfos, &descriptorBufferInfo, sizeof(VkDescriptorBufferInfo)); + p_descriptorInfos += sizeof(VkDescriptorBufferInfo); + } + else //if (binding_type == 2 || binding_type == 3) + { + const VkImageTensor& binding = image_bindings[descriptorImageInfo_index].empty() ? vkdev->get_dummy_image() : image_bindings[descriptorImageInfo_index]; + descriptorImageInfo_index++; + + // we always use immutable nearest sampler set in descroptor layout during pipeline creation + VkDescriptorImageInfo descriptorImageInfo; + descriptorImageInfo.sampler = 0; + descriptorImageInfo.imageView = binding.imageview(); + descriptorImageInfo.imageLayout = binding.data->image_layout; + + memcpy(p_descriptorInfos, &descriptorImageInfo, sizeof(VkDescriptorImageInfo)); + p_descriptorInfos += sizeof(VkDescriptorImageInfo); + } + } + } + + if (vkdev->info.support_VK_KHR_push_descriptor) + { + vkdev->vkCmdPushDescriptorSetWithTemplateKHR(compute_command_buffer, pipeline->descriptor_update_template, pipeline->pipeline_layout, 0, descriptorInfos.data()); + } + else + { + // create new descriptor_pool and descriptorset + VkDescriptorPool descriptor_pool; + { + int image_binding_count = 0; + int sampler_binding_count = 0; + for (int i=0; ishader_info.binding_types[i]; + + if (binding_type == 2) + image_binding_count++; + else // if (binding_type == 3) + sampler_binding_count++; + } + + VkDescriptorPoolSize poolSizes[3]; + poolSizes[0].type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + poolSizes[0].descriptorCount = buffer_binding_count; + poolSizes[1].type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; + poolSizes[1].descriptorCount = image_binding_count; + poolSizes[2].type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; + poolSizes[2].descriptorCount = sampler_binding_count; + + VkDescriptorPoolCreateInfo descriptorPoolCreateInfo; + descriptorPoolCreateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; + descriptorPoolCreateInfo.pNext = 0; + descriptorPoolCreateInfo.flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT; + descriptorPoolCreateInfo.maxSets = 1; + descriptorPoolCreateInfo.poolSizeCount = 3; + descriptorPoolCreateInfo.pPoolSizes = poolSizes; + + VkResult ret = vkCreateDescriptorPool(vkdev->vkdevice(), &descriptorPoolCreateInfo, 0, &descriptor_pool); + if (ret != VK_SUCCESS) + { + printf("vkCreateDescriptorPool failed %d", ret); + return; + } + } + descriptor_pools.push_back(descriptor_pool); + + VkDescriptorSet descriptorset; + { + VkDescriptorSetAllocateInfo descriptorSetAllocateInfo; + descriptorSetAllocateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; + descriptorSetAllocateInfo.pNext = 0; + descriptorSetAllocateInfo.descriptorPool = descriptor_pool; + descriptorSetAllocateInfo.descriptorSetCount = 1; + descriptorSetAllocateInfo.pSetLayouts = &pipeline->descriptorset_layout; + + VkResult ret = vkAllocateDescriptorSets(vkdev->vkdevice(), &descriptorSetAllocateInfo, &descriptorset); + if (ret != VK_SUCCESS) + { + 
printf("vkAllocateDescriptorSets failed %d", ret); + return; + } + } + descriptorsets.push_back(descriptorset); + + if (vkdev->info.support_VK_KHR_descriptor_update_template) + { + vkdev->vkUpdateDescriptorSetWithTemplateKHR(vkdev->vkdevice(), descriptorset, pipeline->descriptor_update_template, descriptorInfos.data()); + } + else + { + std::vector writeDescriptorSets(binding_count); + { + const unsigned char* p_descriptorInfos = descriptorInfos.data(); + for (int i=0; ishader_info.binding_types[i]; + + writeDescriptorSets[i].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + writeDescriptorSets[i].pNext = 0; + writeDescriptorSets[i].dstSet = descriptorset; + writeDescriptorSets[i].dstBinding = i; + writeDescriptorSets[i].dstArrayElement = 0; + writeDescriptorSets[i].descriptorCount = 1; + writeDescriptorSets[i].pTexelBufferView = 0; + + if (binding_type == 1) + { + writeDescriptorSets[i].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + writeDescriptorSets[i].pImageInfo = 0; + writeDescriptorSets[i].pBufferInfo = (const VkDescriptorBufferInfo*)p_descriptorInfos; + + p_descriptorInfos += sizeof(VkDescriptorBufferInfo); + } + else if (binding_type == 2) + { + writeDescriptorSets[i].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; + writeDescriptorSets[i].pImageInfo = (const VkDescriptorImageInfo*)p_descriptorInfos; + writeDescriptorSets[i].pBufferInfo = 0; + + p_descriptorInfos += sizeof(VkDescriptorImageInfo); + } + else // if (binding_type == 3) + { + writeDescriptorSets[i].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; + writeDescriptorSets[i].pImageInfo = (const VkDescriptorImageInfo*)p_descriptorInfos; + writeDescriptorSets[i].pBufferInfo = 0; + + p_descriptorInfos += sizeof(VkDescriptorImageInfo); + } + } + } + + vkUpdateDescriptorSets(vkdev->vkdevice(), binding_count, writeDescriptorSets.data(), 0, 0); + } + + record r; + r.type = record::TYPE_bind_descriptorsets; + r.command_buffer = compute_command_buffer; + r.bind_descriptorsets.bind_point = VK_PIPELINE_BIND_POINT_COMPUTE; + r.bind_descriptorsets.pipeline_layout = pipeline->pipeline_layout; + r.bind_descriptorsets.descriptorset_count = 1; + r.bind_descriptorsets.descriptorset_offset = descriptorsets.size() - 1; + delayed_records.push_back(r); + } + } + + // record push constants + if (constant_count > 0) + { + if (vkdev->info.support_VK_KHR_push_descriptor) + { + vkCmdPushConstants(compute_command_buffer, pipeline->pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, constant_count * sizeof(vk_constant_type), constants.data()); + } + else + { + uint32_t size = constant_count * sizeof(vk_constant_type); + unsigned char* constant_values = new unsigned char[size]; + memcpy(constant_values, constants.data(), size); + + record r; + r.type = record::TYPE_push_constants; + r.command_buffer = compute_command_buffer; + r.push_constants.pipeline_layout = pipeline->pipeline_layout; + r.push_constants.stage_flags = VK_SHADER_STAGE_COMPUTE_BIT; + r.push_constants.size = size; + r.push_constants.values = constant_values; + delayed_records.push_back(r); + } + } + + // record dispatch + { + uint32_t group_count_x = (dispatcher_w + pipeline->local_size_x - 1) / pipeline->local_size_x; + uint32_t group_count_y = (dispatcher_h + pipeline->local_size_y - 1) / pipeline->local_size_y; + uint32_t group_count_z = (dispatcher_c + pipeline->local_size_z - 1) / pipeline->local_size_z; + + if (vkdev->info.support_VK_KHR_push_descriptor) + { + vkCmdDispatch(compute_command_buffer, group_count_x, group_count_y, group_count_z); + } + else + { + 
record r; + r.type = record::TYPE_dispatch; + r.command_buffer = compute_command_buffer; + r.dispatch.group_count_x = group_count_x; + r.dispatch.group_count_y = group_count_y; + r.dispatch.group_count_z = group_count_z; + delayed_records.push_back(r); + } + } +} + +VkTransfer::VkTransfer(const GPUDevice* _vkdev) : vkdev(_vkdev) +{ + compute_command_pool = 0; + transfer_command_pool = 0; + + upload_command_buffer = 0; + compute_command_buffer = 0; + + upload_compute_semaphore = 0; + + upload_command_fence = 0; + compute_command_fence = 0; + + init(); +} + +VkTransfer::~VkTransfer() +{ + vkDestroyFence(vkdev->vkdevice(), compute_command_fence, 0); + + vkFreeCommandBuffers(vkdev->vkdevice(), compute_command_pool, 1, &compute_command_buffer); + vkDestroyCommandPool(vkdev->vkdevice(), compute_command_pool, 0); + + if (!vkdev->info.unified_compute_transfer_queue) + { + vkDestroyFence(vkdev->vkdevice(), upload_command_fence, 0); + + vkDestroySemaphore(vkdev->vkdevice(), upload_compute_semaphore, 0); + + vkFreeCommandBuffers(vkdev->vkdevice(), transfer_command_pool, 1, &upload_command_buffer); + vkDestroyCommandPool(vkdev->vkdevice(), transfer_command_pool, 0); + } +} + +int VkTransfer::init() +{ + // compute_command_pool + { + VkCommandPoolCreateInfo commandPoolCreateInfo; + commandPoolCreateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; + commandPoolCreateInfo.pNext = 0; + commandPoolCreateInfo.flags = 0; + commandPoolCreateInfo.queueFamilyIndex = vkdev->info.compute_queue_family_index; + + VkResult ret = vkCreateCommandPool(vkdev->vkdevice(), &commandPoolCreateInfo, 0, &compute_command_pool); + if (ret != VK_SUCCESS) + { + printf("vkCreateCommandPool failed %d", ret); + return -1; + } + } + + // compute_command_buffer + { + VkCommandBufferAllocateInfo commandBufferAllocateInfo; + commandBufferAllocateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; + commandBufferAllocateInfo.pNext = 0; + commandBufferAllocateInfo.commandPool = compute_command_pool; + commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; + commandBufferAllocateInfo.commandBufferCount = 1; + + VkResult ret = vkAllocateCommandBuffers(vkdev->vkdevice(), &commandBufferAllocateInfo, &compute_command_buffer); + if (ret != VK_SUCCESS) + { + printf("vkAllocateCommandBuffers failed %d", ret); + return -1; + } + } + + // compute_command_fence + { + VkFenceCreateInfo fenceCreateInfo; + fenceCreateInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO; + fenceCreateInfo.pNext = 0; + fenceCreateInfo.flags = 0; + + VkResult ret = vkCreateFence(vkdev->vkdevice(), &fenceCreateInfo, 0, &compute_command_fence); + if (ret != VK_SUCCESS) + { + printf("vkCreateFence failed %d", ret); + return -1; + } + } + + if (!vkdev->info.unified_compute_transfer_queue) + { + // transfer_command_pool + { + VkCommandPoolCreateInfo commandPoolCreateInfo; + commandPoolCreateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; + commandPoolCreateInfo.pNext = 0; + commandPoolCreateInfo.flags = 0; + commandPoolCreateInfo.queueFamilyIndex = vkdev->info.transfer_queue_family_index; + + VkResult ret = vkCreateCommandPool(vkdev->vkdevice(), &commandPoolCreateInfo, 0, &transfer_command_pool); + if (ret != VK_SUCCESS) + { + printf("vkCreateCommandPool failed %d", ret); + return -1; + } + } + + // upload_command_buffer + { + VkCommandBufferAllocateInfo commandBufferAllocateInfo; + commandBufferAllocateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; + commandBufferAllocateInfo.pNext = 0; + commandBufferAllocateInfo.commandPool = 
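+        // On devices without a unified compute+transfer queue, VkTransfer keeps a second
+        // pool/command buffer on the dedicated transfer queue family: uploads are recorded
+        // into upload_command_buffer and chained to the compute queue in submit_and_wait()
+        // through upload_compute_semaphore plus a queue-family ownership release/acquire
+        // pair (see record_upload below).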
transfer_command_pool; + commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; + commandBufferAllocateInfo.commandBufferCount = 1; + + VkResult ret = vkAllocateCommandBuffers(vkdev->vkdevice(), &commandBufferAllocateInfo, &upload_command_buffer); + if (ret != VK_SUCCESS) + { + printf("vkAllocateCommandBuffers failed %d", ret); + return -1; + } + } + + // upload_compute_semaphore + { + VkSemaphoreCreateInfo semaphoreCreateInfo; + semaphoreCreateInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO; + semaphoreCreateInfo.pNext = 0; + semaphoreCreateInfo.flags = 0; + + VkResult ret = vkCreateSemaphore(vkdev->vkdevice(), &semaphoreCreateInfo, 0, &upload_compute_semaphore); + + if (ret != VK_SUCCESS) + { + printf("vkCreateSemaphore failed %d", ret); + return -1; + } + } + + // upload_command_fence + { + VkFenceCreateInfo fenceCreateInfo; + fenceCreateInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO; + fenceCreateInfo.pNext = 0; + fenceCreateInfo.flags = 0; + + VkResult ret = vkCreateFence(vkdev->vkdevice(), &fenceCreateInfo, 0, &upload_command_fence); + + if (ret != VK_SUCCESS) + { + printf("vkCreateFence failed %d", ret); + return -1; + } + } + } + + begin_command_buffer(); + + return 0; +} + +int VkTransfer::begin_command_buffer() +{ + { + VkCommandBufferBeginInfo commandBufferBeginInfo; + commandBufferBeginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; + commandBufferBeginInfo.pNext = 0; + commandBufferBeginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; + commandBufferBeginInfo.pInheritanceInfo = 0; + + VkResult ret = vkBeginCommandBuffer(compute_command_buffer, &commandBufferBeginInfo); + if (ret != VK_SUCCESS) + { + printf("vkBeginCommandBuffer failed %d", ret); + return -1; + } + } + + if (!vkdev->info.unified_compute_transfer_queue) + { + { + VkCommandBufferBeginInfo commandBufferBeginInfo; + commandBufferBeginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; + commandBufferBeginInfo.pNext = 0; + commandBufferBeginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; + commandBufferBeginInfo.pInheritanceInfo = 0; + + VkResult ret = vkBeginCommandBuffer(upload_command_buffer, &commandBufferBeginInfo); + if (ret != VK_SUCCESS) + { + printf("vkBeginCommandBuffer failed %d", ret); + return -1; + } + } + } + return 0; +} + + +int VkTransfer::end_command_buffer() +{ + { + VkResult ret = vkEndCommandBuffer(compute_command_buffer); + if (ret != VK_SUCCESS) + { + printf("vkEndCommandBuffer failed %d", ret); + return -1; + } + } + + if (!vkdev->info.unified_compute_transfer_queue) + { + { + VkResult ret = vkEndCommandBuffer(upload_command_buffer); + if (ret != VK_SUCCESS) + { + printf("vkEndCommandBuffer failed %d", ret); + return -1; + } + } + } + return 0; +} + +int VkTransfer::submit_and_wait() +{ + // end command buffer + { + end_command_buffer(); + } + + VkQueue compute_queue = vkdev->acquire_queue(vkdev->info.compute_queue_family_index); + if (compute_queue == 0) + { + printf("out of compute queue"); + return -1; + } + + if (vkdev->info.unified_compute_transfer_queue) + { + // submit compute + { + VkSubmitInfo submitInfo; + submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + submitInfo.pNext = 0; + submitInfo.waitSemaphoreCount = 0; + submitInfo.pWaitSemaphores = 0; + submitInfo.pWaitDstStageMask = 0; + submitInfo.commandBufferCount = 1; + submitInfo.pCommandBuffers = &compute_command_buffer; + submitInfo.signalSemaphoreCount = 0; + submitInfo.pSignalSemaphores = 0; + + VkResult ret = vkQueueSubmit(compute_queue, 1, &submitInfo, 
compute_command_fence); + if (ret != VK_SUCCESS) + { + printf("vkQueueSubmit failed %d", ret); + vkdev->reclaim_queue(vkdev->info.compute_queue_family_index, compute_queue); + return -1; + } + } + } + else + { + VkQueue transfer_queue = vkdev->acquire_queue(vkdev->info.transfer_queue_family_index); + if (transfer_queue == 0) + { + printf("out of transfer queue"); + vkdev->reclaim_queue(vkdev->info.compute_queue_family_index, compute_queue); + return -1; + } + + // submit upload compute + { + VkSubmitInfo submitInfo; + submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + submitInfo.pNext = 0; + submitInfo.waitSemaphoreCount = 0; + submitInfo.pWaitSemaphores = 0; + submitInfo.pWaitDstStageMask = 0; + submitInfo.commandBufferCount = 1; + submitInfo.pCommandBuffers = &upload_command_buffer; + submitInfo.signalSemaphoreCount = 1; + submitInfo.pSignalSemaphores = &upload_compute_semaphore; + + VkResult ret = vkQueueSubmit(transfer_queue, 1, &submitInfo, upload_command_fence); + if (ret != VK_SUCCESS) + { + printf("vkQueueSubmit failed %d", ret); + vkdev->reclaim_queue(vkdev->info.transfer_queue_family_index, transfer_queue); + vkdev->reclaim_queue(vkdev->info.compute_queue_family_index, compute_queue); + return -1; + } + } + + { + VkPipelineStageFlags wait_dst_stage = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT;// FIXME + VkSubmitInfo submitInfo; + submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + submitInfo.pNext = 0; + submitInfo.waitSemaphoreCount = 1; + submitInfo.pWaitSemaphores = &upload_compute_semaphore; + submitInfo.pWaitDstStageMask = &wait_dst_stage; + submitInfo.commandBufferCount = 1; + submitInfo.pCommandBuffers = &compute_command_buffer; + submitInfo.signalSemaphoreCount = 0; + submitInfo.pSignalSemaphores = 0; + + VkResult ret = vkQueueSubmit(compute_queue, 1, &submitInfo, compute_command_fence); + + if (ret != VK_SUCCESS) + { + printf("vkQueueSubmit failed %d", ret); + vkdev->reclaim_queue(vkdev->info.transfer_queue_family_index, transfer_queue); + vkdev->reclaim_queue(vkdev->info.compute_queue_family_index, compute_queue); + return -1; + } + } + + vkdev->reclaim_queue(vkdev->info.transfer_queue_family_index, transfer_queue); + } + vkdev->reclaim_queue(vkdev->info.compute_queue_family_index, compute_queue); + + // wait + if (vkdev->info.unified_compute_transfer_queue) + { + VkResult ret = vkWaitForFences(vkdev->vkdevice(), 1, &compute_command_fence, VK_TRUE, UINT64_MAX); + if (ret != VK_SUCCESS) + { + printf("vkWaitForFences failed %d", ret); + return -1; + } + } + else + { + VkFence fences[2] = { upload_command_fence, compute_command_fence }; + + VkResult ret = vkWaitForFences(vkdev->vkdevice(), 2, fences, VK_TRUE, UINT64_MAX); + if (ret != VK_SUCCESS) + { + printf("vkWaitForFences failed %d", ret); + return -1; + } + } + return 0; +} + +void VkTransfer::record_upload(const Tensor& src, VkTensor& dst, const Option& opt) +{ +// TLOG_INFO("record_upload src = %d | %d %d %d @ %d", src.dims, src.w, src.h, src.c, src.elempack); + + // NOTE keep the hack here ? 
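+    // The "hack": when fp16 storage (or fp16 packing on elempack % 4 == 0 tensors) is
+    // enabled, the fp32 source is converted on the host with
+    // TEngine::cast_float32_to_float16 and record_upload() re-enters with the fp16 copy,
+    // roughly halving upload traffic and device memory.
+    // A hypothetical call site (names illustrative, not part of this patch):
+    //
+    //   VkTransfer cmd(vkdev);
+    //   VkTensor d_weight;
+    //   cmd.record_upload(weight, d_weight, opt);   // may re-enter with an fp16 copy
+    //   cmd.submit_and_wait();                      // blocks until the copy has landed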
+ if (src.elemsize == src.elempack * 4u) + { + if (opt.use_fp16_storage || (opt.use_fp16_packed && src.elempack % 4 == 0)) + { + // printf("VkTransfer record_upload, cast fp32 to fp16, need to be done, fix me\n"); + Tensor src_fp16; + TEngine::cast_float32_to_float16(src, src_fp16); + record_upload(src_fp16, dst, opt); + + return; + } + } + + Tensor src_flattened = src.reshape(src.w * src.h * src.c); + + // create dst + dst.create_like(src_flattened, opt.blob_vkallocator); + + if (dst.empty()) + { + return; + } + + if (dst.allocator->mappable) + { + // memcpy src_flattened to device + memcpy(dst.mapped_ptr(), src_flattened.data, src_flattened.total() * src_flattened.elemsize); + dst.allocator->flush(dst.data); + + // barrier device host-write @ null to shader-read @ compute + { + VkBufferMemoryBarrier barrier; + barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + barrier.pNext = 0; + barrier.srcAccessMask = VK_ACCESS_HOST_WRITE_BIT; + barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; + barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.buffer = dst.buffer(); + barrier.offset = dst.buffer_offset(); + barrier.size = dst.buffer_capacity(); + + VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_HOST_BIT; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + + vkCmdPipelineBarrier(compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, &barrier, 0, 0); + } + + // mark device shader-readwrite @ compute + dst.data->access_flags = VK_ACCESS_SHADER_READ_BIT; + dst.data->stage_flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + + return; + } + + // create staging + VkTensor dst_staging; + dst_staging.create_like(src_flattened, opt.staging_vkallocator); + + // memcpy src_flattened to staging + memcpy(dst_staging.mapped_ptr(), src_flattened.data, src_flattened.total() * src_flattened.elemsize); + dst_staging.allocator->flush(dst_staging.data); + + VkCommandBuffer command_buffer; + if (vkdev->info.unified_compute_transfer_queue) + { + command_buffer = compute_command_buffer; + } + else + { + command_buffer = upload_command_buffer; + } + + // barrier staging host-write @ null to transfer-read @ queue + { + VkBufferMemoryBarrier barrier; + barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + barrier.pNext = 0; + barrier.srcAccessMask = VK_ACCESS_HOST_WRITE_BIT; + barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT; + barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.buffer = dst_staging.buffer(); + barrier.offset = dst_staging.buffer_offset(); + barrier.size = dst_staging.buffer_capacity(); + + VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_HOST_BIT; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; + + vkCmdPipelineBarrier(command_buffer, src_stage, dst_stage, 0, 0, 0, 1, &barrier, 0, 0); + } + + // record staging to device + { + VkBufferCopy region; + region.srcOffset = dst_staging.buffer_offset(); + region.dstOffset = dst.buffer_offset(); + region.size = std::min(dst_staging.buffer_capacity(), dst.buffer_capacity()); + + vkCmdCopyBuffer(command_buffer, dst_staging.buffer(), dst.buffer(), 1, ®ion); + } + + if (vkdev->info.unified_compute_transfer_queue) + { + // barrier device transfer-write @ compute to shader-read @ compute + { + VkBufferMemoryBarrier barrier; + barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + barrier.pNext = 0; + barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + 
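+        // Same-queue case: a single TRANSFER_WRITE -> SHADER_READ barrier makes the staged
+        // copy visible to later compute dispatches. The else branch below handles the
+        // split-queue case instead, releasing the buffer from the transfer queue family and
+        // acquiring it on the compute queue family (both halves of a Vulkan queue-family
+        // ownership transfer are required, one recorded in each command buffer).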
barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; + barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.buffer = dst.buffer(); + barrier.offset = dst.buffer_offset(); + barrier.size = dst.buffer_capacity(); + + VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + + vkCmdPipelineBarrier(command_buffer, src_stage, dst_stage, 0, 0, 0, 1, &barrier, 0, 0); + } + } + else + { + // queue ownership transfer transfer-write @ transfer to shader-read @ compute + + // release + { + VkBufferMemoryBarrier barrier; + barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + barrier.pNext = 0; + barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + barrier.dstAccessMask = 0; + barrier.srcQueueFamilyIndex = vkdev->info.transfer_queue_family_index; + barrier.dstQueueFamilyIndex = vkdev->info.compute_queue_family_index; + barrier.buffer = dst.buffer(); + barrier.offset = dst.buffer_offset(); + barrier.size = dst.buffer_capacity(); + + VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT; + + vkCmdPipelineBarrier(upload_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, &barrier, 0, 0); + } + + // acquire + { + VkBufferMemoryBarrier barrier; + barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + barrier.pNext = 0; + barrier.srcAccessMask = 0; + barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; + barrier.srcQueueFamilyIndex = vkdev->info.transfer_queue_family_index; + barrier.dstQueueFamilyIndex = vkdev->info.compute_queue_family_index; + barrier.buffer = dst.buffer(); + barrier.offset = dst.buffer_offset(); + barrier.size = dst.buffer_capacity(); + + VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + + vkCmdPipelineBarrier(compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, &barrier, 0, 0); + } + } + + // mark device shader-readwrite @ compute + dst.data->access_flags = VK_ACCESS_SHADER_READ_BIT; + dst.data->stage_flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + + // stash staging + upload_staging_buffers.push_back(dst_staging); +} + +void VkTransfer::record_upload(const tensor* src, VkTensor& dst, const Option& opt) +{ +// TLOG_INFO("record_upload src = %d | %d %d %d @ %d", src.dims, src.w, src.h, src.c, src.elempack); + + // NOTE keep the hack here ? 
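+    // Unlike the Tensor overload above, this raw `tensor*` path uploads the data as a flat
+    // elem_num x elem_size buffer with no reshape/packing, and its fp32 -> fp16 cast is
+    // still a stub: it only prints "fix me" and returns early without uploading, so fp16
+    // storage presumably takes effect through the Tensor overload only for now.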
+ // printf("elem size: %d, elempack:%d\n", src.elemsize, src.elempack); + if (src->elem_size == opt.elempack * 4u) + { + if (opt.use_fp16_storage || (opt.use_fp16_packed && opt.elempack % 4 == 0)) + { + printf("VkTransfer record_upload, cast fp32 to fp16, need to be done, fix me\n"); + // Mat src_fp16; + // cast_float32_to_float16(src, src_fp16); + + // record_upload(src_fp16, dst, opt); + + return; + } + } + + // Mat src_flattened = src.reshape(src.w * src.h * src.c); + + // create dst + // dst.create_like(src_flattened, opt.blob_vkallocator); + // int elemnum = src->elem_num; // src->GetTotalSize()/sizeof(float); + dst.create(src->elem_num, src->elem_size, opt.blob_vkallocator); + + if (dst.empty()) + { + return; + } + + if (dst.allocator->mappable) + { + // memcpy src_flattened to device + memcpy(dst.mapped_ptr(), src->data, src->elem_num * src->elem_size); + dst.allocator->flush(dst.data); + + // barrier device host-write @ null to shader-read @ compute + { + VkBufferMemoryBarrier barrier; + barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + barrier.pNext = 0; + barrier.srcAccessMask = VK_ACCESS_HOST_WRITE_BIT; + barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; + barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.buffer = dst.buffer(); + barrier.offset = dst.buffer_offset(); + barrier.size = dst.buffer_capacity(); + + VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_HOST_BIT; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + + vkCmdPipelineBarrier(compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, &barrier, 0, 0); + } + + // mark device shader-readwrite @ compute + dst.data->access_flags = VK_ACCESS_SHADER_READ_BIT; + dst.data->stage_flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + + return; + } + + printf("run create staging\n"); + // create staging + VkTensor dst_staging; + dst_staging.create(src->elem_num, src->elem_size, opt.staging_vkallocator); + // dst_staging.create_like(src_flattened, opt.staging_vkallocator); + + // memcpy src_flattened to staging + memcpy(dst_staging.mapped_ptr(), src->data, src->elem_num * src->elem_size); + dst_staging.allocator->flush(dst_staging.data); + + VkCommandBuffer command_buffer; + if (vkdev->info.unified_compute_transfer_queue) + { + command_buffer = compute_command_buffer; + } + else + { + command_buffer = upload_command_buffer; + } + + // barrier staging host-write @ null to transfer-read @ queue + { + VkBufferMemoryBarrier barrier; + barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + barrier.pNext = 0; + barrier.srcAccessMask = VK_ACCESS_HOST_WRITE_BIT; + barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT; + barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.buffer = dst_staging.buffer(); + barrier.offset = dst_staging.buffer_offset(); + barrier.size = dst_staging.buffer_capacity(); + + VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_HOST_BIT; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; + + vkCmdPipelineBarrier(command_buffer, src_stage, dst_stage, 0, 0, 0, 1, &barrier, 0, 0); + } + + // record staging to device + { + VkBufferCopy region; + region.srcOffset = dst_staging.buffer_offset(); + region.dstOffset = dst.buffer_offset(); + region.size = std::min(dst_staging.buffer_capacity(), dst.buffer_capacity()); + + vkCmdCopyBuffer(command_buffer, dst_staging.buffer(), dst.buffer(), 1, ®ion); + } + + if 
(vkdev->info.unified_compute_transfer_queue) + { + // barrier device transfer-write @ compute to shader-read @ compute + { + VkBufferMemoryBarrier barrier; + barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + barrier.pNext = 0; + barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; + barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.buffer = dst.buffer(); + barrier.offset = dst.buffer_offset(); + barrier.size = dst.buffer_capacity(); + + VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + + vkCmdPipelineBarrier(command_buffer, src_stage, dst_stage, 0, 0, 0, 1, &barrier, 0, 0); + } + } + else + { + // queue ownership transfer transfer-write @ transfer to shader-read @ compute + + // release + { + VkBufferMemoryBarrier barrier; + barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + barrier.pNext = 0; + barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + barrier.dstAccessMask = 0; + barrier.srcQueueFamilyIndex = vkdev->info.transfer_queue_family_index; + barrier.dstQueueFamilyIndex = vkdev->info.compute_queue_family_index; + barrier.buffer = dst.buffer(); + barrier.offset = dst.buffer_offset(); + barrier.size = dst.buffer_capacity(); + + VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT; + + vkCmdPipelineBarrier(upload_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, &barrier, 0, 0); + } + + // acquire + { + VkBufferMemoryBarrier barrier; + barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + barrier.pNext = 0; + barrier.srcAccessMask = 0; + barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; + barrier.srcQueueFamilyIndex = vkdev->info.transfer_queue_family_index; + barrier.dstQueueFamilyIndex = vkdev->info.compute_queue_family_index; + barrier.buffer = dst.buffer(); + barrier.offset = dst.buffer_offset(); + barrier.size = dst.buffer_capacity(); + + VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + + vkCmdPipelineBarrier(compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, &barrier, 0, 0); + } + } + + // mark device shader-readwrite @ compute + dst.data->access_flags = VK_ACCESS_SHADER_READ_BIT; + dst.data->stage_flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + + // stash staging + upload_staging_buffers.push_back(dst_staging); +} + +} // namespace TEngine diff --git a/source/device/vulkan/vulkan_command.hpp b/source/device/vulkan/vulkan_command.hpp new file mode 100644 index 000000000..1f5e82e06 --- /dev/null +++ b/source/device/vulkan/vulkan_command.hpp @@ -0,0 +1,168 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2020, OPEN AI LAB + * Author: ddzhao@openailab.com + */ + +#ifndef VULKAN_COMMAND_HPP +#define VULKAN_COMMAND_HPP + +#include +#include +#include "vulkan_allocator.hpp" +#include "vulkan_tensor.hpp" +#include "vulkan_pipeline.hpp" +#include "vulkan_option.hpp" +#include "vulkan_platform.hpp" +// #include "tengine_log.h" + +namespace TEngine { + +class Pipeline; +class VkCompute +{ +public: + VkCompute(const GPUDevice* vkdev); + virtual ~VkCompute(); + +public: + void record_upload(tensor* src, VkTensor& dst, const Option& opt); + void record_upload(const Tensor& src, VkTensor& dst, const Option& opt); + + void record_download(const VkTensor& src, tensor* dst, const Option& opt); + void record_download(const VkTensor& src, Tensor& dst, const Option& opt); + + void record_pipeline(const Pipeline* pipeline, const std::vector& bindings, const std::vector& constants, const VkTensor& dispatcher); + void record_pipeline(const Pipeline* pipeline, const std::vector& bindings, const std::vector& constants, const VkImageTensor& dispatcher); + void record_pipeline(const Pipeline* pipeline, const std::vector& buffer_bindings, const std::vector& image_bindings, const std::vector& constants, const VkTensor& dispatcher); + void record_pipeline(const Pipeline* pipeline, const std::vector& buffer_bindings, const std::vector& image_bindings, const std::vector& constants, const VkImageTensor& dispatcher); + void record_pipeline(const Pipeline* pipeline, const std::vector& buffer_bindings, const std::vector& image_bindings, const std::vector& constants, int dispatcher_w, int dispatcher_h, int dispatcher_c); + + int submit_and_wait(); + + int reset(); + +protected: + int init(); + int begin_command_buffer(); + int end_command_buffer(); + +protected: + const GPUDevice* vkdev; + + VkCommandPool compute_command_pool; + VkCommandBuffer compute_command_buffer; + VkFence compute_command_fence; + + std::vector upload_staging_buffers; + std::vector download_post_buffers; + std::vector download_post_tensors_fp16; + std::vector download_post_tensors; + std::vector image_blocks_to_destroy; + + // the good-old path for device without VK_KHR_push_descriptor + std::vector descriptor_pools; + std::vector descriptorsets; + + struct record + { + enum + { + TYPE_copy_buffer, + TYPE_copy_image, + TYPE_copy_buffer_to_image, + TYPE_copy_image_to_buffer, + TYPE_bind_pipeline, + TYPE_bind_descriptorsets, + TYPE_push_constants, + TYPE_dispatch, + TYPE_memory_barrers, + TYPE_buffer_barrers, + TYPE_image_barrers, + TYPE_post_download, + TYPE_post_cast_float16_to_float32, + }; + + int type; + VkCommandBuffer command_buffer; + + union + { + struct { VkBuffer src; VkBuffer dst; uint32_t region_count; const VkBufferCopy* regions; } copy_buffer; + struct { VkImage src; VkImageLayout src_layout; VkImage dst; VkImageLayout dst_layout; uint32_t region_count; const VkImageCopy* regions; } copy_image; + struct { VkBuffer src; VkImage dst; VkImageLayout layout; uint32_t region_count; const VkBufferImageCopy* regions; } copy_buffer_to_image; + struct { VkImage src; VkImageLayout layout; VkBuffer dst; uint32_t region_count; const VkBufferImageCopy* regions; } copy_image_to_buffer; + + struct { VkPipelineBindPoint bind_point; VkPipeline pipeline; } bind_pipeline; + struct { VkPipelineBindPoint bind_point; VkPipelineLayout pipeline_layout; uint32_t descriptorset_count; uint32_t descriptorset_offset; } 
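+        // Each `record` is a small tagged union: `type` selects which union member is
+        // valid, and every member simply captures the arguments of the Vulkan command it
+        // defers (vkCmdCopyBuffer, vkCmdBindPipeline, vkCmdBindDescriptorSets,
+        // vkCmdPushConstants, vkCmdDispatch, pipeline barriers) or of the two host-side
+        // post-download steps, so submit_and_wait() can replay them verbatim.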
bind_descriptorsets; + struct { VkPipelineLayout pipeline_layout; VkShaderStageFlags stage_flags; uint32_t size; const void* values; } push_constants; + + struct { uint32_t group_count_x; uint32_t group_count_y; uint32_t group_count_z; } dispatch; + + struct { VkPipelineStageFlags src_stage; VkPipelineStageFlags dst_stage; uint32_t barrier_count; const VkMemoryBarrier* barriers; } memory_barrers; + struct { VkPipelineStageFlags src_stage; VkPipelineStageFlags dst_stage; uint32_t barrier_count; const VkBufferMemoryBarrier* barriers; } buffer_barrers; + struct { VkPipelineStageFlags src_stage; VkPipelineStageFlags dst_stage; uint32_t barrier_count; const VkImageMemoryBarrier* barriers; } image_barrers; + + struct { uint32_t download_post_buffer_mat_offset; uint32_t download_post_mat_fp16_offset; } post_download; + struct { uint32_t download_post_mat_fp16_offset; uint32_t download_post_mat_offset; } post_cast_float16_to_float32; + }; + }; + + std::vector delayed_records; +}; + + +class VkTransfer +{ +public: + VkTransfer(const GPUDevice* vkdev); + ~VkTransfer(); +public: + void record_upload(const tensor* src, VkTensor& dst, const Option& opt); + void record_upload(const Tensor& src, VkTensor& dst, const Option& opt); + + int submit_and_wait(); + +protected: + int init(); + int begin_command_buffer(); + int end_command_buffer(); + +protected: + const GPUDevice* vkdev; + + VkCommandPool compute_command_pool; + VkCommandPool transfer_command_pool; + + VkCommandBuffer upload_command_buffer; + VkCommandBuffer compute_command_buffer; + + VkSemaphore upload_compute_semaphore; + + VkFence upload_command_fence; + VkFence compute_command_fence; + + std::vector upload_staging_buffers; +}; + +} // namespace TEngine + +#endif diff --git a/source/device/vulkan/vulkan_define.h b/source/device/vulkan/vulkan_define.h new file mode 100644 index 000000000..e0c68277a --- /dev/null +++ b/source/device/vulkan/vulkan_define.h @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2021, OPEN AI LAB + * Author: hhchen@openailab.com + */ + +#pragma once + +#define VULKAN_DEV_NAME "VK" + + +typedef struct vulkan_option +{ + char* dev_name; + int precision; //!< precision of calculation +} vulkan_opt_t; diff --git a/source/device/vulkan/vulkan_device.cc b/source/device/vulkan/vulkan_device.cc new file mode 100644 index 000000000..57067405b --- /dev/null +++ b/source/device/vulkan/vulkan_device.cc @@ -0,0 +1,234 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2021, OPEN AI LAB + * Author: hhchen@openailab.com + */ + +#include "vulkan_device.hpp" + +#include "vulkan_limit.hpp" +#include "vulkan_graph.hpp" + +extern "C" +{ +#include "api/c_api.h" +#include "device/device.h" +#include "graph/tensor.h" +#include "graph/node.h" +#include "graph/graph.h" +#include "graph/subgraph.h" +#include "executer/executer.h" +#include "optimizer/split.h" +#include "module/module.h" +#include "utility/vector.h" +#include "utility/log.h" +} + +#include + + +int vulkan_describe(struct device* device, struct vector* allowed_ops, struct vector* blocked_ops, struct vector* precision) +{ + (void)device; + + for (int op_type : vulkan_supported_ops) + { + push_vector_data(allowed_ops, &op_type); + } + + for (int i = 0, j = 0; i < OP_BUILTIN_LAST; i++) + { + int op_type = vulkan_supported_ops[j]; + if (op_type != i) + { + push_vector_data(blocked_ops, &i); + } + else + { + if (j < sizeof(vulkan_supported_ops) / sizeof(vulkan_supported_ops[0])) + j++; + } + } + + int precision_var = TENGINE_DT_UINT8; + push_vector_data(precision, &precision_var); + precision_var = TENGINE_DT_FP16; + push_vector_data(precision, &precision_var); + precision_var = TENGINE_DT_FP32; + push_vector_data(precision, &precision_var); + + return 0; +} + + +int vulkan_evaluation(struct device* device, struct subgraph* sub_graph, struct vector* evolution_tensors, struct vector* evolution_nodes) +{ + // nothing to do with vulkan + (void)device; + (void)sub_graph; + (void)evolution_tensors; + (void)evolution_nodes; + + return 0; +} + + +int vulkan_allocate(struct device* device, struct subgraph* sub_graph) +{ + if (nullptr == device) + { + return -1; + } + + /* set the correct input wait count: INPUT tensor is always ready */ + sub_graph->input_wait_count = 0; + + for (int i = 0; i < sub_graph->input_num; i++) + { + struct tensor* tensor = get_ir_graph_tensor(sub_graph->graph, sub_graph->input_tensor_list[i]); + + if (tensor->tensor_type == TENSOR_TYPE_VAR) + sub_graph->input_wait_count++; + } + + return 0; +} + + +int vulkan_release(struct device* device, struct subgraph* sub_graph) +{ + (void)sub_graph; + + return 0; +} + +int vulkan_split_graph(struct graph* ir_graph) +{ + struct device* cur_dev = ir_graph->attribute->context->device; + + if (0 != strcmp(VULKAN_DEV_NAME, cur_dev->name)) + { + return -1; + } + + struct vector* allowed_ops = create_vector(sizeof(int), nullptr); + struct vector* blocked_ops = create_vector(sizeof(int), nullptr); + struct vector* precision = create_vector(sizeof(int), nullptr); + + cur_dev->allocator->describe(cur_dev, allowed_ops, blocked_ops, precision); + + split_graph_node_to_sub_graph(ir_graph, allowed_ops, blocked_ops, precision); + + release_vector(allowed_ops); + release_vector(blocked_ops); + release_vector(precision); + + // + generate_sub_graph_io(ir_graph); + add_sub_graph_to_ir_graph(ir_graph); + + // add node sub graph id + for (int i = 0; i < 
(uint16_t)get_vector_num(ir_graph->subgraph_list); i++) + { + struct subgraph* sub_graph = *(struct subgraph**)get_vector_data(ir_graph->subgraph_list, i); + sub_graph->index = i; + + for (uint16_t j = 0; j < sub_graph->node_num; j++) + { + uint16_t node_id = sub_graph->node_list[j]; + struct node* ir_node = get_ir_graph_node(ir_graph, node_id); + ir_node->subgraph_idx = sub_graph->index; + } + } + + return 0; +} + + +extern "C" +{ +static struct interface vulkan_interface = { + .init = vulkan_dev_init, + .pre_run = vulkan_dev_prerun, + .run = vulkan_dev_run, + .post_run = vulkan_dev_postrun, + .async_run = nullptr, + .async_wait = nullptr, + .release_graph = nullptr, + .release_device = vulkan_dev_release, +}; + + +static struct allocator vulkan_allocator = { + .describe = vulkan_describe, + .evaluation = vulkan_evaluation, + .allocate = vulkan_allocate, + .release = vulkan_release, +}; + + +static struct optimizer vulkan_optimizer = { + .split_graph = vulkan_split_graph, + .optimize_graph = nullptr, +}; + + + +static struct vulkan_device vulkan_dev = { + .base = { + .name = VULKAN_DEV_NAME, + .interface = &vulkan_interface, + .allocator = &vulkan_allocator, + .optimizer = &vulkan_optimizer, + .scheduler = nullptr, + .privacy = nullptr, + }, +}; + + +int register_vulkan_device(void) +{ + int ret = register_device(&vulkan_dev.base); + if (0 != ret) + { + TLOG_INFO("Tengine plugin %s register failed.\n", vulkan_dev.base.name); + return -1; + } + + TLOG_INFO("Tengine plugin device %s is registered.\n", vulkan_dev.base.name); + return 0; +} + + +int unregister_vulkan_device(void) +{ + int ret = unregister_device(&vulkan_dev.base); + if (0 != ret) + { + TLOG_INFO("Tengine plugin %s unregister failed.\n", vulkan_dev.base.name); + return ret; + } + + TLOG_INFO("Tengine plugin device %s is unregistered.\n", vulkan_dev.base.name); + + return 0; +} +} diff --git a/source/device/vulkan/vulkan_device.hpp b/source/device/vulkan/vulkan_device.hpp new file mode 100644 index 000000000..9560261fe --- /dev/null +++ b/source/device/vulkan/vulkan_device.hpp @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2021, OPEN AI LAB + * Author: hhchen@openailab.com + */ + +#pragma once + +#include "vulkan_define.h" + +extern "C" +{ +#include "api/c_api.h" +#include "device/device.h" + +struct vulkan_device +{ + struct device base; +}; + +DLLEXPORT int register_vulkan_device(void); +} \ No newline at end of file diff --git a/source/device/vulkan/vulkan_executor.cc b/source/device/vulkan/vulkan_executor.cc new file mode 100644 index 000000000..ca030e894 --- /dev/null +++ b/source/device/vulkan/vulkan_executor.cc @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2021, OPEN AI LAB + * Author: hhchen@openailab.com + */ + +#include "vulkan_executor.hpp" +#include "vulkan_helper.hpp" +#include "vulkan_gpu.hpp" +#include "vulkan_graph.hpp" + +extern "C" +{ +#include "operator/op.h" +#include "convolution_param.h" +} + +using namespace TEngine; + +bool VULKANEngine::init() +{ + return true; +} + + + +int VULKANEngine::VULKANEnginePreRun(struct subgraph* subgraph) +{ + // TLOG_INFO("==== vulkan prerun start ====\n"); + create_gpu_instance(); + // struct device *vk_dev = (struct device *)dev; + struct graph *orig_graph = subgraph->graph; + // struct vk_dev_priv *priv = (struct vk_dev_priv *)orig_graph->dev_priv; + + // /* todo: set the tensor shape ? */ + + // /* create exec_graph */ + VulkanGraph* vk_exec_graph = new VulkanGraph(subgraph); + + if (vk_exec_graph == nullptr) + { + // set_tengine_errno(EIO); + TLOG_ERR("vulkan exec graph is NULL\n"); + return -1; + } + + vk_exec_graph->upload_model(); + vk_exec_graph->create_pipeline(); + + subgraph->device_graph = vk_exec_graph; + + int node_num = subgraph->node_num; + TLOG_INFO("==== vulkan prerun done ====\n"); + return 0; + +}; + +int VULKANEngine::VULKANEngineRun(struct subgraph* subgraph) +{ + // struct vk_device *vk_dev = (struct vk_device *)dev; + struct graph *orig_graph = subgraph->graph; + // struct vk_dev_priv *priv = (struct vk_dev_priv *)orig_graph->dev_priv; + + VulkanGraph *vk_exec_graph = (VulkanGraph *)subgraph->device_graph; + if (vk_exec_graph == nullptr) + { + // set_tengine_errno(EIO); + TLOG_ERR("vulkan exec graph is NULL\n"); + return -1; + } + + vk_exec_graph->record_graph_pipeline(); + return 0; +} + +void VULKANEngine::VULKANEnginePostRun() +{ + destroy_gpu_instance(); + return; +}; \ No newline at end of file diff --git a/source/device/vulkan/vulkan_executor.hpp b/source/device/vulkan/vulkan_executor.hpp new file mode 100644 index 000000000..28ae46efb --- /dev/null +++ b/source/device/vulkan/vulkan_executor.hpp @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2021, Open AI Lab + * Author: lswang@openailab.com + */ + + +extern "C" +{ +#include "api/c_api.h" +#include "device/device.h" +#include "graph/tensor.h" +#include "graph/node.h" +#include "graph/graph.h" +#include "graph/subgraph.h" +#include "executer/executer.h" +#include "optimizer/split.h" +#include "module/module.h" +#include "utility/vector.h" +#include "utility/log.h" +} + +#include +#include +#include +#include +#include +#include + +// #include + +#include + +// typedef std::map dict_uint2clmem; + +struct VULKANqueue +{ + std::string name; + int dims; + // cl_kernel queue_kernel; + // cl_event enentPoint; + size_t *queue_global_work_size; + size_t *queue_local_work_size; +}; + +class VULKANEngine +{ +public: +// VULKANEngine(); +// ~VULKANEngine() = default; + + int VULKANEnginePreRun(struct subgraph* subgraph); + int VULKANEngineRun(struct subgraph* subgraph); + void VULKANEnginePostRun(); + +private: + bool init(); + +private: + +public: + // dict_uint2clmem vulkan_tensor_map; + std::vector queue_list; + +public: + int bin_num; + +}; + + + diff --git a/source/device/vulkan/vulkan_gpu.cpp b/source/device/vulkan/vulkan_gpu.cpp new file mode 100644 index 000000000..dac4e9486 --- /dev/null +++ b/source/device/vulkan/vulkan_gpu.cpp @@ -0,0 +1,2036 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. 
You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#include "vulkan_gpu.hpp" +#include "vulkan_option.hpp" +#include +#include +#include + +#include +#include +#include + +#include "layer/packing_vulkan.hpp" + +#if __ANDROID__ +#define ENABLE_VALIDATION_LAYER 0 +#else +#define ENABLE_VALIDATION_LAYER 0 +#endif + +namespace TEngine { + +// global +static VkInstance g_instance = 0; +static int g_gpu_count = 0; +static int g_default_gpu_index = -1; + +// experience value +#define MAX_GPU_COUNT 8 +static GpuInfo g_gpu_infos[MAX_GPU_COUNT]; + +// TODO +// default +// static Mutex g_default_vkdev_lock; +static GPUDevice* g_default_vkdev[MAX_GPU_COUNT] = {0}; + +// precompiled spirv +struct layer_shader_registry_entry +{ + const uint32_t* spv_data; + size_t spv_data_size; +}; + +#include "layer_shader_spv_data.h" + +static const layer_shader_registry_entry layer_shader_registry[] = +{ +#include "layer_shader_registry.h" +}; + +static ShaderInfo layer_shader_infos[sizeof(layer_shader_registry) / sizeof(layer_shader_registry_entry)]; + +static const int layer_shader_registry_entry_count = sizeof(layer_shader_registry) / sizeof(layer_shader_registry_entry); + +int support_VK_KHR_external_memory_capabilities = 0; +int support_VK_KHR_get_physical_device_properties2 = 0; +int support_VK_KHR_get_surface_capabilities2 = 0; +int support_VK_KHR_surface = 0; +int support_VK_EXT_debug_utils = 0; + +#if __ANDROID_API__ >= 26 +int support_VK_KHR_android_surface = 0; +#endif // __ANDROID_API__ >= 26 + +// VK_KHR_external_memory_capabilities +PFN_vkGetPhysicalDeviceExternalBufferPropertiesKHR vkGetPhysicalDeviceExternalBufferPropertiesKHR = 0; + +// VK_KHR_get_physical_device_properties2 +PFN_vkGetPhysicalDeviceFeatures2KHR vkGetPhysicalDeviceFeatures2KHR = 0; +PFN_vkGetPhysicalDeviceProperties2KHR vkGetPhysicalDeviceProperties2KHR = 0; +PFN_vkGetPhysicalDeviceFormatProperties2KHR vkGetPhysicalDeviceFormatProperties2KHR = 0; +PFN_vkGetPhysicalDeviceImageFormatProperties2KHR vkGetPhysicalDeviceImageFormatProperties2KHR = 0; +PFN_vkGetPhysicalDeviceQueueFamilyProperties2KHR vkGetPhysicalDeviceQueueFamilyProperties2KHR = 0; +PFN_vkGetPhysicalDeviceMemoryProperties2KHR vkGetPhysicalDeviceMemoryProperties2KHR = 0; +PFN_vkGetPhysicalDeviceSparseImageFormatProperties2KHR vkGetPhysicalDeviceSparseImageFormatProperties2KHR = 0; + +// VK_KHR_get_surface_capabilities2 +PFN_vkGetPhysicalDeviceSurfaceCapabilities2KHR vkGetPhysicalDeviceSurfaceCapabilities2KHR = 0; +PFN_vkGetPhysicalDeviceSurfaceFormats2KHR vkGetPhysicalDeviceSurfaceFormats2KHR = 0; + +// VK_KHR_surface +PFN_vkDestroySurfaceKHR vkDestroySurfaceKHR = 0; +PFN_vkGetPhysicalDeviceSurfaceSupportKHR vkGetPhysicalDeviceSurfaceSupportKHR = 0; +PFN_vkGetPhysicalDeviceSurfaceCapabilitiesKHR vkGetPhysicalDeviceSurfaceCapabilitiesKHR = 0; +PFN_vkGetPhysicalDeviceSurfaceFormatsKHR vkGetPhysicalDeviceSurfaceFormatsKHR = 0; +PFN_vkGetPhysicalDeviceSurfacePresentModesKHR vkGetPhysicalDeviceSurfacePresentModesKHR = 0; + +#if __ANDROID_API__ >= 26 +// VK_KHR_android_surface +PFN_vkCreateAndroidSurfaceKHR vkCreateAndroidSurfaceKHR = 0; +#endif // __ANDROID_API__ >= 26 + +// compile with old vulkan sdk +#if VK_HEADER_VERSION < 80 +#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR (VkStructureType)1000177000 +typedef struct VkPhysicalDevice8BitStorageFeaturesKHR { + VkStructureType sType; + void* pNext; + VkBool32 
storageBuffer8BitAccess; + VkBool32 uniformAndStorageBuffer8BitAccess; + VkBool32 storagePushConstant8; +} VkPhysicalDevice8BitStorageFeaturesKHR; +#endif // VK_HEADER_VERSION < 80 +#if VK_HEADER_VERSION < 95 +#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR (VkStructureType)1000082000 +typedef struct VkPhysicalDeviceFloat16Int8FeaturesKHR { + VkStructureType sType; + void* pNext; + VkBool32 shaderFloat16; + VkBool32 shaderInt8; +} VkPhysicalDeviceFloat16Int8FeaturesKHR; +#endif // VK_HEADER_VERSION < 95 + +static int init_instance_extension() +{ + if (support_VK_KHR_external_memory_capabilities) + { + vkGetPhysicalDeviceExternalBufferPropertiesKHR = (PFN_vkGetPhysicalDeviceExternalBufferPropertiesKHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceExternalBufferPropertiesKHR"); + } + + if (support_VK_KHR_get_physical_device_properties2) + { + vkGetPhysicalDeviceFeatures2KHR = (PFN_vkGetPhysicalDeviceFeatures2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceFeatures2KHR"); + vkGetPhysicalDeviceProperties2KHR = (PFN_vkGetPhysicalDeviceProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceProperties2KHR"); + vkGetPhysicalDeviceFormatProperties2KHR = (PFN_vkGetPhysicalDeviceFormatProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceFormatProperties2KHR"); + vkGetPhysicalDeviceImageFormatProperties2KHR = (PFN_vkGetPhysicalDeviceImageFormatProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceImageFormatProperties2KHR"); + vkGetPhysicalDeviceQueueFamilyProperties2KHR = (PFN_vkGetPhysicalDeviceQueueFamilyProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceQueueFamilyProperties2KHR"); + vkGetPhysicalDeviceMemoryProperties2KHR = (PFN_vkGetPhysicalDeviceMemoryProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceMemoryProperties2KHR"); + vkGetPhysicalDeviceSparseImageFormatProperties2KHR = (PFN_vkGetPhysicalDeviceSparseImageFormatProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSparseImageFormatProperties2KHR"); + } + + if (support_VK_KHR_get_surface_capabilities2) + { + vkGetPhysicalDeviceSurfaceCapabilities2KHR = (PFN_vkGetPhysicalDeviceSurfaceCapabilities2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfaceCapabilities2KHR"); + vkGetPhysicalDeviceSurfaceFormats2KHR = (PFN_vkGetPhysicalDeviceSurfaceFormats2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfaceFormats2KHR"); + } + + if (support_VK_KHR_surface) + { + vkDestroySurfaceKHR = (PFN_vkDestroySurfaceKHR)vkGetInstanceProcAddr(g_instance, "vkDestroySurfaceKHR"); + vkGetPhysicalDeviceSurfaceSupportKHR = (PFN_vkGetPhysicalDeviceSurfaceSupportKHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfaceSupportKHR"); + vkGetPhysicalDeviceSurfaceCapabilitiesKHR = (PFN_vkGetPhysicalDeviceSurfaceCapabilitiesKHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfaceCapabilitiesKHR"); + vkGetPhysicalDeviceSurfaceFormatsKHR = (PFN_vkGetPhysicalDeviceSurfaceFormatsKHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfaceFormatsKHR"); + vkGetPhysicalDeviceSurfacePresentModesKHR = (PFN_vkGetPhysicalDeviceSurfacePresentModesKHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfacePresentModesKHR"); + } + +#if __ANDROID_API__ >= 26 + if (support_VK_KHR_android_surface) + { + vkCreateAndroidSurfaceKHR = (PFN_vkCreateAndroidSurfaceKHR)vkGetInstanceProcAddr(g_instance, "vkCreateAndroidSurfaceKHR"); + } +#endif // __ANDROID_API__ >= 26 + + return 0; +} + +#if 
ENABLE_VALIDATION_LAYER +static VkDebugUtilsMessengerEXT callback; + +static VKAPI_ATTR VkBool32 VKAPI_CALL debugCallback( + VkDebugUtilsMessageSeverityFlagBitsEXT /*messageSeverity*/, + VkDebugUtilsMessageTypeFlagsEXT /*messageType*/, + const VkDebugUtilsMessengerCallbackDataEXT* pCallbackData, + void* /*pUserData*/) +{ + TLOG_INFO("validation layer: %s\n", pCallbackData->pMessage); + + return VK_FALSE; +} + +VkResult CreateDebugUtilsMessengerEXT(VkInstance instance, const VkDebugUtilsMessengerCreateInfoEXT* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkDebugUtilsMessengerEXT* pCallback) +{ + PFN_vkCreateDebugUtilsMessengerEXT func = (PFN_vkCreateDebugUtilsMessengerEXT)vkGetInstanceProcAddr(instance, "vkCreateDebugUtilsMessengerEXT"); + if (func) + return func(instance, pCreateInfo, pAllocator, pCallback); + + return VK_ERROR_EXTENSION_NOT_PRESENT; +} + +void DestroyDebugUtilsMessengerEXT(VkInstance instance, VkDebugUtilsMessengerEXT callback, const VkAllocationCallbacks* pAllocator) +{ + PFN_vkDestroyDebugUtilsMessengerEXT func = (PFN_vkDestroyDebugUtilsMessengerEXT)vkGetInstanceProcAddr(instance, "vkDestroyDebugUtilsMessengerEXT"); + if (func) + func(instance, callback, pAllocator); +} +#endif // ENABLE_VALIDATION_LAYER + +static uint32_t find_device_compute_queue(const std::vector& queueFamilyProperties) +{ + // first try, compute only queue + for (uint32_t i=0; i& queueFamilyProperties) +{ + // first try, graphics only queue + for (uint32_t i=0; i& queueFamilyProperties) +{ + // first try, transfer only queue + for (uint32_t i=0; i 0) + return 0; + + TLOG_INFO("no vulkan device\n"); + return -1; +} + +int create_gpu_instance() +{ + VkResult ret; + + std::vector enabledLayers; + +#if ENABLE_VALIDATION_LAYER + uint32_t instanceLayerPropertyCount; + ret = vkEnumerateInstanceLayerProperties(&instanceLayerPropertyCount, NULL); + if (ret != VK_SUCCESS) + { + TLOG_INFO("vkEnumerateInstanceLayerProperties failed %d\n", ret); + return -1; + } + + std::vector instanceLayerProperties(instanceLayerPropertyCount); + ret = vkEnumerateInstanceLayerProperties(&instanceLayerPropertyCount, instanceLayerProperties.data()); + if (ret != VK_SUCCESS) + { + TLOG_INFO("vkEnumerateInstanceLayerProperties failed %d\n", ret); + return -1; + } + + for (uint32_t i=0; i enabledExtensions; + + uint32_t instanceExtensionPropertyCount; + ret = vkEnumerateInstanceExtensionProperties(NULL, &instanceExtensionPropertyCount, NULL); + if (ret != VK_SUCCESS) + { + TLOG_INFO("vkEnumerateInstanceExtensionProperties failed %d\n", ret); + return -1; + } + + std::vector instanceExtensionProperties(instanceExtensionPropertyCount); + ret = vkEnumerateInstanceExtensionProperties(NULL, &instanceExtensionPropertyCount, instanceExtensionProperties.data()); + if (ret != VK_SUCCESS) + { + TLOG_INFO("vkEnumerateInstanceExtensionProperties failed %d\n", ret); + return -1; + } + + support_VK_KHR_get_physical_device_properties2 = 0; + support_VK_KHR_get_surface_capabilities2 = 0; + support_VK_KHR_surface = 0; + support_VK_EXT_debug_utils = 0; +#if __ANDROID_API__ >= 26 + support_VK_KHR_android_surface = 0; +#endif // __ANDROID_API__ >= 26 + for (uint32_t j=0; j= 26 + else if (strcmp(exp.extensionName, "VK_KHR_android_surface") == 0) + support_VK_KHR_android_surface = exp.specVersion; +#endif // __ANDROID_API__ >= 26 + } + + if (support_VK_KHR_external_memory_capabilities) + enabledExtensions.push_back("VK_KHR_external_memory_capabilities"); + if (support_VK_KHR_get_physical_device_properties2) + 
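+        // only extensions reported by vkEnumerateInstanceExtensionProperties above are
+        // requested; the collected names feed VkInstanceCreateInfo::ppEnabledExtensionNames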
enabledExtensions.push_back("VK_KHR_get_physical_device_properties2"); + if (support_VK_KHR_get_surface_capabilities2) + enabledExtensions.push_back("VK_KHR_get_surface_capabilities2"); + if (support_VK_KHR_surface) + enabledExtensions.push_back("VK_KHR_surface"); +#if ENABLE_VALIDATION_LAYER + if (support_VK_EXT_debug_utils) + enabledExtensions.push_back("VK_EXT_debug_utils"); +#endif // ENABLE_VALIDATION_LAYER +#if __ANDROID_API__ >= 26 + if (support_VK_KHR_android_surface) + enabledExtensions.push_back("VK_KHR_android_surface"); +#endif // __ANDROID_API__ >= 26 + + VkApplicationInfo applicationInfo; + applicationInfo.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO; + applicationInfo.pNext = 0; + applicationInfo.pApplicationName = "tengine"; + applicationInfo.applicationVersion = 0; + applicationInfo.pEngineName = "tengine"; + applicationInfo.engineVersion = 20200530; + applicationInfo.apiVersion = VK_MAKE_VERSION(1, 0, 0); + + VkInstanceCreateInfo instanceCreateInfo; + instanceCreateInfo.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO; + instanceCreateInfo.pNext = 0; + instanceCreateInfo.flags = 0; + instanceCreateInfo.pApplicationInfo = &applicationInfo; + instanceCreateInfo.enabledLayerCount = enabledLayers.size(); + instanceCreateInfo.ppEnabledLayerNames = enabledLayers.data(); + instanceCreateInfo.enabledExtensionCount = enabledExtensions.size(); + instanceCreateInfo.ppEnabledExtensionNames = enabledExtensions.data(); + + ret = vkCreateInstance(&instanceCreateInfo, 0, &g_instance); + if (ret != VK_SUCCESS) + { + TLOG_INFO("vkCreateInstance failed %d\n", ret); + return -1; + } + +#if ENABLE_VALIDATION_LAYER + if (support_VK_EXT_debug_utils) + { + VkDebugUtilsMessengerCreateInfoEXT createInfo = {}; + createInfo.sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT; + createInfo.messageSeverity = VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT; + createInfo.messageType = VK_DEBUG_UTILS_MESSAGE_TYPE_GENERAL_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT; + createInfo.pfnUserCallback = debugCallback; + createInfo.pUserData = 0; + ret = CreateDebugUtilsMessengerEXT(g_instance, &createInfo, NULL, &callback); + if (ret != VK_SUCCESS) + { + TLOG_INFO("CreateDebugUtilsMessengerEXT failed %d\n", ret); + return -1; + } + } +#endif // ENABLE_VALIDATION_LAYER + + init_instance_extension(); + + uint32_t physicalDeviceCount = 0; + ret = vkEnumeratePhysicalDevices(g_instance, &physicalDeviceCount, 0); + if (ret != VK_SUCCESS) + { + TLOG_INFO("01vkEnumeratePhysicalDevices failed %d\n", ret); + return -1; + } + + if (physicalDeviceCount > MAX_GPU_COUNT) + physicalDeviceCount = MAX_GPU_COUNT; + + std::vector physicalDevices(physicalDeviceCount); + + ret = vkEnumeratePhysicalDevices(g_instance, &physicalDeviceCount, physicalDevices.data()); + if (ret != VK_SUCCESS) + { + TLOG_INFO("vkEnumeratePhysicalDevices failed %d\n", ret); + return -1; + } + + // find proper device and queue + int gpu_info_index = 0; + for (uint32_t i=0; i queueFamilyProperties(queueFamilyPropertiesCount); + vkGetPhysicalDeviceQueueFamilyProperties(physicalDevice, &queueFamilyPropertiesCount, queueFamilyProperties.data()); + + gpu_info.compute_queue_family_index = find_device_compute_queue(queueFamilyProperties); + gpu_info.graphics_queue_family_index = find_device_graphics_queue(queueFamilyProperties); + gpu_info.transfer_queue_family_index = 
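+        // the three selected queue families may coincide on some GPUs; GPUDevice later
+        // creates one VkDeviceQueueCreateInfo per distinct family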
find_device_transfer_queue(queueFamilyProperties); + + gpu_info.compute_queue_count = queueFamilyProperties[gpu_info.compute_queue_family_index].queueCount; + gpu_info.graphics_queue_count = queueFamilyProperties[gpu_info.graphics_queue_family_index].queueCount; + gpu_info.transfer_queue_count = queueFamilyProperties[gpu_info.transfer_queue_family_index].queueCount; + + // cache memory properties + vkGetPhysicalDeviceMemoryProperties(physicalDevice, &gpu_info.physicalDeviceMemoryProperties); + + // get device extension + uint32_t deviceExtensionPropertyCount = 0; + ret = vkEnumerateDeviceExtensionProperties(physicalDevice, NULL, &deviceExtensionPropertyCount, NULL); + if (ret != VK_SUCCESS) + { + TLOG_INFO("vkEnumerateDeviceExtensionProperties failed %d\n", ret); + return -1; + } + + std::vector deviceExtensionProperties(deviceExtensionPropertyCount); + ret = vkEnumerateDeviceExtensionProperties(physicalDevice, NULL, &deviceExtensionPropertyCount, deviceExtensionProperties.data()); + if (ret != VK_SUCCESS) + { + TLOG_INFO("vkEnumerateDeviceExtensionProperties failed %d\n", ret); + return -1; + } + + // extension capability + gpu_info.support_VK_KHR_8bit_storage = 0; + gpu_info.support_VK_KHR_16bit_storage = 0; + gpu_info.support_VK_KHR_bind_memory2 = 0; + gpu_info.support_VK_KHR_dedicated_allocation = 0; + gpu_info.support_VK_KHR_descriptor_update_template = 0; + gpu_info.support_VK_KHR_external_memory = 0; + gpu_info.support_VK_KHR_get_memory_requirements2 = 0; + gpu_info.support_VK_KHR_maintenance1 = 0; + gpu_info.support_VK_KHR_push_descriptor = 0; + gpu_info.support_VK_KHR_sampler_ycbcr_conversion = 0; + gpu_info.support_VK_KHR_shader_float16_int8 = 0; + gpu_info.support_VK_KHR_shader_float_controls = 0; + gpu_info.support_VK_KHR_storage_buffer_storage_class = 0; + gpu_info.support_VK_KHR_swapchain = 0; + gpu_info.support_VK_EXT_queue_family_foreign = 0; +#if __ANDROID_API__ >= 26 + gpu_info.support_VK_ANDROID_external_memory_android_hardware_buffer = 0; +#endif // __ANDROID_API__ >= 26 + for (uint32_t j=0; j= 26 + else if (strcmp(exp.extensionName, "VK_ANDROID_external_memory_android_hardware_buffer") == 0) + gpu_info.support_VK_ANDROID_external_memory_android_hardware_buffer = exp.specVersion; +#endif // __ANDROID_API__ >= 26 + } + + // check features + gpu_info.support_fp16_packed = true; + gpu_info.support_fp16_storage = false; + gpu_info.support_fp16_arithmetic = false; + gpu_info.support_int8_storage = false; + gpu_info.support_int8_arithmetic = false; + gpu_info.support_ycbcr_conversion = false; + if (support_VK_KHR_get_physical_device_properties2) + { + void* queryExtensionFeatures = 0; + + // query int8 storage + VkPhysicalDevice8BitStorageFeaturesKHR query8BitStorageFeatures; + query8BitStorageFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR; + query8BitStorageFeatures.pNext = 0; + if (gpu_info.support_VK_KHR_8bit_storage) + { + query8BitStorageFeatures.pNext = queryExtensionFeatures; + queryExtensionFeatures = &query8BitStorageFeatures; + } + + // query fp16/int16 storage + VkPhysicalDevice16BitStorageFeaturesKHR query16BitStorageFeatures; + query16BitStorageFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES_KHR; + query16BitStorageFeatures.pNext = 0; + if (gpu_info.support_VK_KHR_16bit_storage) + { + query16BitStorageFeatures.pNext = queryExtensionFeatures; + queryExtensionFeatures = &query16BitStorageFeatures; + } + + // query fp16/int8 arithmetic + VkPhysicalDeviceFloat16Int8FeaturesKHR queryFloat16Int8Features; + 
queryFloat16Int8Features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR; + queryFloat16Int8Features.pNext = 0; + if (gpu_info.support_VK_KHR_shader_float16_int8) + { + queryFloat16Int8Features.pNext = queryExtensionFeatures; + queryExtensionFeatures = &queryFloat16Int8Features; + } + + // query ycbcr_conversion + VkPhysicalDeviceSamplerYcbcrConversionFeaturesKHR querySamplerYcbcrConversionFeatures; + querySamplerYcbcrConversionFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLER_YCBCR_CONVERSION_FEATURES_KHR; + querySamplerYcbcrConversionFeatures.pNext = 0; + if (gpu_info.support_VK_KHR_sampler_ycbcr_conversion) + { + querySamplerYcbcrConversionFeatures.pNext = queryExtensionFeatures; + queryExtensionFeatures = &querySamplerYcbcrConversionFeatures; + } + + VkPhysicalDeviceFeatures2KHR queryFeatures; + queryFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2_KHR, + queryFeatures.pNext = queryExtensionFeatures; + + vkGetPhysicalDeviceFeatures2KHR(physicalDevice, &queryFeatures); + + if (gpu_info.support_VK_KHR_8bit_storage) + { + gpu_info.support_int8_storage = query8BitStorageFeatures.storageBuffer8BitAccess && query8BitStorageFeatures.uniformAndStorageBuffer8BitAccess; + } + if (gpu_info.support_VK_KHR_16bit_storage) + { + gpu_info.support_fp16_storage = query16BitStorageFeatures.storageBuffer16BitAccess && query16BitStorageFeatures.uniformAndStorageBuffer16BitAccess; + } + if (gpu_info.support_VK_KHR_shader_float16_int8) + { + gpu_info.support_fp16_arithmetic = queryFloat16Int8Features.shaderFloat16; + gpu_info.support_int8_arithmetic = queryFloat16Int8Features.shaderInt8; + } + if (gpu_info.support_VK_KHR_sampler_ycbcr_conversion) + { + gpu_info.support_ycbcr_conversion = querySamplerYcbcrConversionFeatures.samplerYcbcrConversion; + } + } + else + { +// // TODO +// VkPhysicalDeviceFeatures features; +// vkGetPhysicalDeviceFeatures(physicalDevice, &features); + } + + if (physicalDeviceProperties.vendorID == 0x13b5) + { + // the 16bit_storage implementation of arm mali driver is buggy :[ + gpu_info.support_fp16_storage = false; + } + + if (physicalDeviceProperties.vendorID == 0x10002 && physicalDeviceProperties.deviceID == 0x70006214 && physicalDeviceProperties.apiVersion == VK_MAKE_VERSION(1, 1, 82)) + { + // the 16bit_storage implementation of vivante gc1700 driver is buggy :[ + gpu_info.support_fp16_storage = false; + } + + if (gpu_info.bug_implicit_fp16_arithmetic) + { + // force capability on as long as the driver accept spirv with fp16 arithmetic :D + gpu_info.support_fp16_arithmetic = true; + } + + TLOG_INFO("[%u %s] queueC=%u[%u] queueG=%u[%u] queueT=%u[%u]\n", i, physicalDeviceProperties.deviceName, + gpu_info.compute_queue_family_index, gpu_info.compute_queue_count, + gpu_info.graphics_queue_family_index, gpu_info.graphics_queue_count, + gpu_info.transfer_queue_family_index, gpu_info.transfer_queue_count); + + TLOG_INFO("[%u %s] buglssc=%d bugihfa=%d\n", i, physicalDeviceProperties.deviceName, + gpu_info.bug_local_size_spec_const, gpu_info.bug_implicit_fp16_arithmetic); + + TLOG_INFO("[%u %s] fp16p=%d fp16s=%d fp16a=%d int8s=%d int8a=%d\n", i, physicalDeviceProperties.deviceName, + gpu_info.support_fp16_packed, gpu_info.support_fp16_storage, gpu_info.support_fp16_arithmetic, + gpu_info.support_int8_storage, gpu_info.support_int8_arithmetic); + + gpu_info_index++; + } + + g_gpu_count = gpu_info_index; + + // the default gpu device + g_default_gpu_index = find_default_vulkan_device_index(); + + // resolve shader info + // TLOG_INFO("run create 
gpu instance resolve shader info, layer_shader_registry_entry_count:%d\n", layer_shader_registry_entry_count); + for (int i=0; i enabledExtensions; + if (info.support_VK_KHR_8bit_storage) + enabledExtensions.push_back("VK_KHR_8bit_storage"); + if (info.support_VK_KHR_16bit_storage) + enabledExtensions.push_back("VK_KHR_16bit_storage"); + if (info.support_VK_KHR_bind_memory2) + enabledExtensions.push_back("VK_KHR_bind_memory2"); + if (info.support_VK_KHR_dedicated_allocation) + enabledExtensions.push_back("VK_KHR_dedicated_allocation"); + if (info.support_VK_KHR_descriptor_update_template) + enabledExtensions.push_back("VK_KHR_descriptor_update_template"); + if (info.support_VK_KHR_external_memory) + enabledExtensions.push_back("VK_KHR_external_memory"); + if (info.support_VK_KHR_get_memory_requirements2) + enabledExtensions.push_back("VK_KHR_get_memory_requirements2"); + if (info.support_VK_KHR_maintenance1) + enabledExtensions.push_back("VK_KHR_maintenance1"); + if (info.support_VK_KHR_push_descriptor) + enabledExtensions.push_back("VK_KHR_push_descriptor"); + if (info.support_VK_KHR_sampler_ycbcr_conversion) + enabledExtensions.push_back("VK_KHR_sampler_ycbcr_conversion"); + if (info.support_VK_KHR_shader_float16_int8) + enabledExtensions.push_back("VK_KHR_shader_float16_int8"); + if (info.support_VK_KHR_shader_float_controls) + enabledExtensions.push_back("VK_KHR_shader_float_controls"); + if (info.support_VK_KHR_storage_buffer_storage_class) + enabledExtensions.push_back("VK_KHR_storage_buffer_storage_class"); + if (info.support_VK_KHR_swapchain) + enabledExtensions.push_back("VK_KHR_swapchain"); + if (info.support_VK_EXT_queue_family_foreign) + enabledExtensions.push_back("VK_EXT_queue_family_foreign"); +#if __ANDROID_API__ >= 26 + if (info.support_VK_ANDROID_external_memory_android_hardware_buffer) + enabledExtensions.push_back("VK_ANDROID_external_memory_android_hardware_buffer"); +#endif // __ANDROID_API__ >= 26 + + void* enabledExtensionFeatures = 0; + + // enable int8 storage + VkPhysicalDevice8BitStorageFeaturesKHR enabled8BitStorageFeatures; + enabled8BitStorageFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR; + enabled8BitStorageFeatures.pNext = 0; + enabled8BitStorageFeatures.storageBuffer8BitAccess = info.support_int8_storage; + enabled8BitStorageFeatures.uniformAndStorageBuffer8BitAccess = info.support_int8_storage; + enabled8BitStorageFeatures.storagePushConstant8 = VK_FALSE; + if (support_VK_KHR_get_physical_device_properties2 && info.support_VK_KHR_8bit_storage) + { + enabled8BitStorageFeatures.pNext = enabledExtensionFeatures; + enabledExtensionFeatures = &enabled8BitStorageFeatures; + } + + // enable fp16/int16 storage + VkPhysicalDevice16BitStorageFeaturesKHR enabled16BitStorageFeatures; + enabled16BitStorageFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES_KHR; + enabled16BitStorageFeatures.pNext = 0; + enabled16BitStorageFeatures.storageBuffer16BitAccess = info.support_fp16_storage; + enabled16BitStorageFeatures.uniformAndStorageBuffer16BitAccess = info.support_fp16_storage; + enabled16BitStorageFeatures.storagePushConstant16 = VK_FALSE; + enabled16BitStorageFeatures.storageInputOutput16 = VK_FALSE; + if (support_VK_KHR_get_physical_device_properties2 && info.support_VK_KHR_16bit_storage) + { + enabled16BitStorageFeatures.pNext = enabledExtensionFeatures; + enabledExtensionFeatures = &enabled16BitStorageFeatures; + } + + // enable fp16/int8 arithmetic + VkPhysicalDeviceFloat16Int8FeaturesKHR 
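+    // shaderFloat16/shaderInt8 are requested only if the capability query in
+    // create_gpu_instance() reported them; the struct is chained into VkDeviceCreateInfo::pNext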
enabledFloat16Int8Features; + enabledFloat16Int8Features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR; + enabledFloat16Int8Features.pNext = 0; + enabledFloat16Int8Features.shaderFloat16 = info.support_fp16_arithmetic; + enabledFloat16Int8Features.shaderInt8 = info.support_int8_arithmetic; + if (support_VK_KHR_get_physical_device_properties2 && info.support_VK_KHR_shader_float16_int8) + { + enabledFloat16Int8Features.pNext = enabledExtensionFeatures; + enabledExtensionFeatures = &enabledFloat16Int8Features; + } + + // enable ycbcr conversion + VkPhysicalDeviceSamplerYcbcrConversionFeaturesKHR querySamplerYcbcrConversionFeatures; + querySamplerYcbcrConversionFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLER_YCBCR_CONVERSION_FEATURES_KHR; + querySamplerYcbcrConversionFeatures.pNext = 0; + querySamplerYcbcrConversionFeatures.samplerYcbcrConversion = info.support_ycbcr_conversion; + if (support_VK_KHR_get_physical_device_properties2 && info.support_ycbcr_conversion) + { + querySamplerYcbcrConversionFeatures.pNext = enabledExtensionFeatures; + enabledExtensionFeatures = &querySamplerYcbcrConversionFeatures; + } + std::vector compute_queue_priorities(info.compute_queue_count, 1.f);// 0.f ~ 1.f + std::vector graphics_queue_priorities(info.graphics_queue_count, 1.f);// 0.f ~ 1.f + std::vector transfer_queue_priorities(info.transfer_queue_count, 1.f);// 0.f ~ 1.f + + VkDeviceQueueCreateInfo deviceQueueCreateInfos[3]; + VkDeviceQueueCreateInfo deviceComputeQueueCreateInfo; + deviceComputeQueueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO; + deviceComputeQueueCreateInfo.pNext = 0; + deviceComputeQueueCreateInfo.flags = 0; + deviceComputeQueueCreateInfo.queueFamilyIndex = info.compute_queue_family_index; + deviceComputeQueueCreateInfo.queueCount = info.compute_queue_count; + deviceComputeQueueCreateInfo.pQueuePriorities = compute_queue_priorities.data(); + + VkDeviceQueueCreateInfo deviceGraphicsQueueCreateInfo; + deviceGraphicsQueueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO; + deviceGraphicsQueueCreateInfo.pNext = 0; + deviceGraphicsQueueCreateInfo.flags = 0; + deviceGraphicsQueueCreateInfo.queueFamilyIndex = info.graphics_queue_family_index; + deviceGraphicsQueueCreateInfo.queueCount = info.graphics_queue_count; + deviceGraphicsQueueCreateInfo.pQueuePriorities = graphics_queue_priorities.data(); + + VkDeviceQueueCreateInfo deviceTransferQueueCreateInfo; + deviceTransferQueueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO; + deviceTransferQueueCreateInfo.pNext = 0; + deviceTransferQueueCreateInfo.flags = 0; + deviceTransferQueueCreateInfo.queueFamilyIndex = info.transfer_queue_family_index; + deviceTransferQueueCreateInfo.queueCount = info.transfer_queue_count; + deviceTransferQueueCreateInfo.pQueuePriorities = transfer_queue_priorities.data(); + + VkDeviceCreateInfo deviceCreateInfo; + deviceCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO; + deviceCreateInfo.pNext = enabledExtensionFeatures; + deviceCreateInfo.flags = 0; + if (info.compute_queue_family_index == info.graphics_queue_family_index && info.compute_queue_family_index == info.transfer_queue_family_index) + { + deviceQueueCreateInfos[0] = deviceComputeQueueCreateInfo; + deviceCreateInfo.queueCreateInfoCount = 1; + } + else if (info.compute_queue_family_index == info.graphics_queue_family_index && info.compute_queue_family_index != info.transfer_queue_family_index) + { + deviceQueueCreateInfos[0] = deviceComputeQueueCreateInfo; + deviceQueueCreateInfos[1] 
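+        // compute and graphics share a queue family in this branch, so only the transfer
+        // queue needs a second VkDeviceQueueCreateInfo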
= deviceTransferQueueCreateInfo; + deviceCreateInfo.queueCreateInfoCount = 2; + } + else if (info.compute_queue_family_index != info.graphics_queue_family_index && info.graphics_queue_family_index == info.transfer_queue_family_index) + { + deviceQueueCreateInfos[0] = deviceComputeQueueCreateInfo; + deviceQueueCreateInfos[1] = deviceGraphicsQueueCreateInfo; + deviceCreateInfo.queueCreateInfoCount = 2; + } + else // if (info.compute_queue_family_index != info.graphics_queue_family_index && info.graphics_queue_family_index != info.transfer_queue_family_index) + { + deviceQueueCreateInfos[0] = deviceComputeQueueCreateInfo; + deviceQueueCreateInfos[1] = deviceGraphicsQueueCreateInfo; + deviceQueueCreateInfos[2] = deviceTransferQueueCreateInfo; + deviceCreateInfo.queueCreateInfoCount = 3; + } + deviceCreateInfo.pQueueCreateInfos = deviceQueueCreateInfos; + deviceCreateInfo.enabledLayerCount = 0; + deviceCreateInfo.ppEnabledLayerNames = 0; + deviceCreateInfo.enabledExtensionCount = enabledExtensions.size(); + deviceCreateInfo.ppEnabledExtensionNames = enabledExtensions.data(); + deviceCreateInfo.pEnabledFeatures = 0;// VkPhysicalDeviceFeatures pointer + + VkResult ret = vkCreateDevice(info.physical_device, &deviceCreateInfo, 0, &device); + if (ret != VK_SUCCESS) + { + TLOG_INFO("vkCreateDevice failed %d\n", ret); + } + + init_device_extension(); + + create_shader_module(); + + compute_queues.resize(info.compute_queue_count); + blob_allocators.resize(info.compute_queue_count); + staging_allocators.resize(info.compute_queue_count); + for (uint32_t i = 0; i < info.compute_queue_count; i++) + { + vkGetDeviceQueue(device, info.compute_queue_family_index, i, &compute_queues[i]); + + blob_allocators[i] = new VkBlobAllocator(this); + staging_allocators[i] = new VkStagingAllocator(this); + } + if (info.compute_queue_family_index != info.graphics_queue_family_index) + { + graphics_queues.resize(info.graphics_queue_count); + for (uint32_t i = 0; i < info.graphics_queue_count; i++) + { + vkGetDeviceQueue(device, info.graphics_queue_family_index, i, &graphics_queues[i]); + } + } + if (info.compute_queue_family_index != info.transfer_queue_family_index && info.graphics_queue_family_index != info.transfer_queue_family_index) + { + transfer_queues.resize(info.transfer_queue_count); + for (uint32_t i = 0; i < info.transfer_queue_count; i++) + { + vkGetDeviceQueue(device, info.transfer_queue_family_index, i, &transfer_queues[i]); + } + } + + create_dummy_buffer_image(); + + create_utility_operator(); +} + +GPUDevice::~GPUDevice() +{ + destroy_utility_operator(); + + destroy_dummy_buffer_image(); + + for (uint32_t i = 0; i < info.compute_queue_count; i++) + { + delete blob_allocators[i]; + delete staging_allocators[i]; + } + blob_allocators.clear(); + staging_allocators.clear(); + + destroy_shader_module(); + + vkDestroyDevice(device, 0); +} + +VkShaderModule GPUDevice::get_shader_module(int shader_type_index) const +{ + if (shader_type_index < 0 || shader_type_index >= layer_shader_registry_entry_count) + { + TLOG_INFO("no such shader module %d\n", shader_type_index); + return 0; + } + + return shader_modules[shader_type_index]; +} + +VkShaderModule GPUDevice::create_shader_module(int shader_type_index, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z) const +{ + if (shader_type_index < 0 || shader_type_index >= layer_shader_registry_entry_count) + { + TLOG_INFO("no such shader module %d\n", shader_type_index); + return 0; + } + + const uint32_t* spv_data = 
layer_shader_registry[shader_type_index].spv_data; + size_t spv_data_size = layer_shader_registry[shader_type_index].spv_data_size; + + return compile_shader_module(spv_data, spv_data_size, local_size_x, local_size_y, local_size_z); +} + +VkShaderModule GPUDevice::compile_shader_module(const uint32_t* spv_data, size_t spv_data_size) const +{ + VkShaderModuleCreateInfo shaderModuleCreateInfo; + shaderModuleCreateInfo.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; + shaderModuleCreateInfo.pNext = 0; + shaderModuleCreateInfo.flags = 0; + shaderModuleCreateInfo.codeSize = spv_data_size; + shaderModuleCreateInfo.pCode = spv_data; + + VkShaderModule shader_module; + VkResult ret = vkCreateShaderModule(device, &shaderModuleCreateInfo, 0, &shader_module); + if (ret != VK_SUCCESS) + { + TLOG_INFO("vkCreateShaderModule failed %d\n", ret); + return 0; + } + + return shader_module; +} + +// TODO +static void inject_local_size_xyz(const uint32_t* code, size_t size, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z, uint32_t* dstcode, size_t* dstsize) +{ + uint32_t local_size_x_id = -1; + uint32_t local_size_y_id = -1; + uint32_t local_size_z_id = -1; + uint32_t gl_WorkGroupSize_id = -1; + + const uint32_t* p = code; + uint32_t* dp = dstcode; + + // skip magic version generator bound schema + memcpy(dp, p, 5 * sizeof(uint32_t)); + p += 5; + dp += 5; + + // foreach op + while ((const unsigned char*)p < (const unsigned char*)code + size) + { + uint32_t opcode = p[0]; + + uint16_t wordcount = opcode >> 16; + uint16_t op = opcode & 0xffff; + if (op == 16) // OpExecutionMode + { + uint32_t mode = p[2]; + if (mode == 17) // LocalSize + { + memcpy(dp, p, wordcount * sizeof(uint32_t)); + + // set local_size_xyz + dp[3] = local_size_x; + dp[4] = local_size_y; + dp[5] = local_size_z; + + p += wordcount; + dp += wordcount; + continue; + } + } + else if (op == 50) // OpSpecConstant + { + uint32_t id = p[2]; + if (id == local_size_x_id || id == local_size_y_id || id == local_size_z_id) + { + p += wordcount; + continue; + } + } + else if (op == 51) // OpSpecConstantComposite + { + uint32_t id = p[2]; + if (id == gl_WorkGroupSize_id) + { + if (wordcount == 6 && (p[3] == local_size_x_id || p[4] == local_size_y_id || p[5] == local_size_z_id)) + { + p += wordcount; + continue; + } + } + } + else if (op == 71) // OpDecorate + { + uint32_t id = p[1]; + uint32_t decoration = p[2]; + if (decoration == 1) // SpecId + { + uint32_t specid = p[3]; + if (specid == 233) local_size_x_id = id; + if (specid == 234) local_size_y_id = id; + if (specid == 235) local_size_z_id = id; + if (specid == 233 || specid == 234 || specid == 235) + { + p += wordcount; + continue; + } + } + else if (decoration == 11) // BuiltIn + { + uint32_t builtin = p[3]; + if (builtin == 25) // WorkgroupSize + { + gl_WorkGroupSize_id = id; + p += wordcount; + continue; + } + } + } + + memcpy(dp, p, wordcount * sizeof(uint32_t)); + p += wordcount; + dp += wordcount; + } + *dstsize = (unsigned char*)dp - (unsigned char*)dstcode; +} + +VkShaderModule GPUDevice::compile_shader_module(const uint32_t* spv_data, size_t spv_data_size, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z) const +{ + uint32_t* spv_data_modified = (uint32_t*)malloc(spv_data_size); + size_t spv_data_size_modified = spv_data_size; + inject_local_size_xyz(spv_data, spv_data_size, local_size_x, local_size_y, local_size_z, spv_data_modified, &spv_data_size_modified); + + VkShaderModule shader_module = compile_shader_module(spv_data_modified, 
spv_data_size_modified); + + free(spv_data_modified); + + return shader_module; +} + + + + +uint32_t GPUDevice::find_memory_index(uint32_t memory_type_bits, VkFlags required, VkFlags preferred, VkFlags preferred_not) const +{ + // first try, find required and with preferred and without preferred_not + for (uint32_t i=0; i& queues = queue_family_index == info.compute_queue_family_index ? compute_queues + : queue_family_index == info.graphics_queue_family_index ? graphics_queues : transfer_queues; + for (int i=0; i<(int)queues.size(); i++) + { + VkQueue queue = queues[i]; + if (queue) + { + queues[i] = 0; + return queue; + } + } + + // out of hardware queue + return 0; +} + +// TODO +void GPUDevice::reclaim_queue(uint32_t queue_family_index, VkQueue queue) const +{ + if (queue_family_index != info.compute_queue_family_index + && queue_family_index != info.graphics_queue_family_index + && queue_family_index != info.transfer_queue_family_index) + { + TLOG_INFO("invalid queue_family_index %u", queue_family_index); + return; + } + + // TODO + MutexLockGuard lock(queue_lock); + + std::vector& queues = queue_family_index == info.compute_queue_family_index ? compute_queues + : queue_family_index == info.graphics_queue_family_index ? graphics_queues : transfer_queues; + for (int i=0; i<(int)queues.size(); i++) + { + if (!queues[i]) + { + queues[i] = queue; + return; + } + } + + TLOG_INFO("FATAL ERROR! reclaim_queue get wild queue %u %p", queue_family_index, queue); +} + +VkAllocator* GPUDevice::acquire_blob_allocator() const +{ + MutexLockGuard lock(blob_allocator_lock); + + for (int i=0; i<(int)blob_allocators.size(); i++) + { + VkAllocator* allocator = blob_allocators[i]; + if (allocator) + { + blob_allocators[i] = 0; + return allocator; + } + } + + // out of blob allocator + return 0; +} + +void GPUDevice::reclaim_blob_allocator(VkAllocator* allocator) const +{ + MutexLockGuard lock(blob_allocator_lock); + + for (int i=0; i<(int)blob_allocators.size(); i++) + { + if (!blob_allocators[i]) + { + blob_allocators[i] = allocator; + return; + } + } + + TLOG_INFO("FATAL ERROR! reclaim_blob_allocator get wild allocator %p", allocator); +} + + +VkAllocator* GPUDevice::acquire_staging_allocator() const +{ + MutexLockGuard lock(staging_allocator_lock); + + for (int i=0; i<(int)staging_allocators.size(); i++) + { + VkAllocator* allocator = staging_allocators[i]; + if (allocator) + { + staging_allocators[i] = 0; + return allocator; + } + } + + // out of staging allocator + return 0; +} + + +void GPUDevice::reclaim_staging_allocator(VkAllocator* allocator) const +{ + MutexLockGuard lock(staging_allocator_lock); + + for (int i=0; i<(int)staging_allocators.size(); i++) + { + if (!staging_allocators[i]) + { + staging_allocators[i] = allocator; + return; + } + } + + TLOG_INFO("FATAL ERROR! 
reclaim_staging_allocator get wild allocator %p", allocator); +} + +int GPUDevice::create_shader_module() +{ + if (info.bug_local_size_spec_const) + { + // do not cache shader module + return 0; + } + + shader_modules.resize(layer_shader_registry_entry_count, VK_NULL_HANDLE); + for (int i=0; i= 26 + if (info.support_VK_ANDROID_external_memory_android_hardware_buffer) + { + vkGetAndroidHardwareBufferPropertiesANDROID = (PFN_vkGetAndroidHardwareBufferPropertiesANDROID)vkGetDeviceProcAddr(device, "vkGetAndroidHardwareBufferPropertiesANDROID"); + vkGetMemoryAndroidHardwareBufferANDROID = (PFN_vkGetMemoryAndroidHardwareBufferANDROID)vkGetDeviceProcAddr(device, "vkGetMemoryAndroidHardwareBufferANDROID"); + } +#endif // __ANDROID_API__ >= 26 + + return 0; +} + +GPUDevice* get_gpu_device(int device_index) +{ + if (device_index < 0 || device_index >= g_gpu_count) + return 0; + + // MutexLockGuard lock(g_default_vkdev_lock); + + if (!g_default_vkdev[device_index]) + g_default_vkdev[device_index] = new GPUDevice(device_index); + + return g_default_vkdev[device_index]; +} + +const ShaderInfo& get_shader_info(int shader_type_index) +{ + if (shader_type_index < 0 || shader_type_index >= layer_shader_registry_entry_count) + { + TLOG_INFO("no such shader module %d\n", shader_type_index); + return layer_shader_infos[0]; + } + + return layer_shader_infos[shader_type_index]; +} + +// TODO +int resolve_shader_info(const uint32_t* spv_data, size_t spv_data_size, ShaderInfo& shader_info) +{ + shader_info.specialization_count = 0; + shader_info.binding_count = 0; + shader_info.push_constant_count = 0; + + uint32_t parameter_id = -233; + + int specialization_count = 0; + int binding_count = 0; + int push_constant_count = 0; + + // id -> binding_type + std::vector id_types; + + // binding_id -> binding_type + std::vector binding_types; + + const uint32_t* p = spv_data; + + int bound = p[3]; + + id_types.resize(bound); + + // skip magic version generator bound schema + p += 5; + + // foreach op + while ((const unsigned char*)p < (const unsigned char*)spv_data + spv_data_size) + { + uint32_t opcode = p[0]; + + uint16_t wordcount = opcode >> 16; + uint16_t op = opcode & 0xffff; + + if (op == 5) // OpName + { + uint32_t id = p[1]; + const char* name = (const char*)&p[2]; + if (strcmp(name, "parameter") == 0) + { + parameter_id = id; + } + } + else if (op == 6) // OpMemberName + { + uint32_t id = p[1]; + if (id == parameter_id) + { + push_constant_count++; + } + } + else if (op == 25) // OpTypeImage + { + uint32_t id = p[1]; + id_types[id] = 2; + } + else if (op == 27) // OpTypeSampledImage + { + uint32_t id = p[1]; + id_types[id] = 3; + } + else if (op == 32) // OpTypePointer + { + uint32_t id = p[1]; + uint32_t storage_class = p[2]; + uint32_t type = p[3]; + if (storage_class == 0) // UniformConstant + { + id_types[id] = id_types[type]; + } + if (storage_class == 2) // Uniform + { + id_types[id] = id_types[type]; + } + } + else if (op == 59) // OpVariable + { + uint32_t id = p[1]; + uint32_t var_id = p[2]; + uint32_t storage_class = p[3]; + if (storage_class == 0) // UniformConstant + { + id_types[var_id] = id_types[id]; + } + if (storage_class == 2) // Uniform + { + id_types[var_id] = id_types[id]; + } + } + else if (op == 71) // OpDecorate + { + uint32_t id = p[1]; + uint32_t decoration = p[2]; + uint32_t binding_id = p[3]; + if (decoration == 1) // SpecId + { + specialization_count++; + } + if (decoration == 3) // BufferBlock + { + id_types[id] = 1; + } + else if (decoration == 33) // Binding + { + binding_count = 
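+            // Binding decorations may appear in any order, so grow the table to the
+            // largest binding id seen so far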
std::max(binding_count, (int)binding_id + 1); + + binding_types.resize(binding_count); + binding_types[binding_id] = id; + } + } + + p += wordcount; + } + + if (binding_count > 16) + { + TLOG_INFO("too many binding %d", binding_count); + return -1; + } + + shader_info.specialization_count = specialization_count; + shader_info.binding_count = binding_count; + shader_info.push_constant_count = push_constant_count; + + // resolve binding_types + for (int i=0; iaccess_flags; + barriers[0].dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT; + barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].buffer = buffer.buffer(); + barriers[0].offset = buffer.buffer_offset(); + barriers[0].size = buffer.buffer_capacity(); + + VkPipelineStageFlags src_stage = buffer.data->stage_flags; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + + if (vkdev->info.support_VK_KHR_push_descriptor) + { + vkCmdPipelineBarrier(compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, barriers, 0, 0); + delete[] barriers; + } + else + { + record r; + r.type = record::TYPE_buffer_barrers; + r.command_buffer = compute_command_buffer; + r.buffer_barrers.src_stage = src_stage; + r.buffer_barrers.dst_stage = dst_stage; + r.buffer_barrers.barrier_count = 1; + r.buffer_barrers.barriers = barriers; + delayed_records.push_back(r); + } + + // mark device shader-readwrite @ compute + buffer.data->access_flags = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT; + buffer.data->stage_flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + } + + void record_dummy(const VkImageTensor& image) + { +// TLOG_INFO("xxx barrier image %p +%d ~%d %p", image.image(), image.data->bind_offset, image.data->bind_capacity, image.imageview()); + + // image layout transform any @ any to shader-write @ compute + VkImageMemoryBarrier* barriers = new VkImageMemoryBarrier[1]; + barriers[0].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; + barriers[0].pNext = 0; + barriers[0].srcAccessMask = image.data->access_flags; + barriers[0].dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT; + barriers[0].oldLayout = image.data->image_layout; + barriers[0].newLayout = VK_IMAGE_LAYOUT_GENERAL; + barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].image = image.image(); + barriers[0].subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + barriers[0].subresourceRange.baseMipLevel = 0; + barriers[0].subresourceRange.levelCount = 1; + barriers[0].subresourceRange.baseArrayLayer = 0; + barriers[0].subresourceRange.layerCount = 1; + + VkPipelineStageFlags src_stage = image.data->stage_flags; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + + if (vkdev->info.support_VK_KHR_push_descriptor) + { + vkCmdPipelineBarrier(compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, barriers); + delete[] barriers; + } + else + { + record r; + r.type = record::TYPE_image_barrers; + r.command_buffer = compute_command_buffer; + r.image_barrers.src_stage = src_stage; + r.image_barrers.dst_stage = dst_stage; + r.image_barrers.barrier_count = 1; + r.image_barrers.barriers = barriers; + delayed_records.push_back(r); + } + + // mark image shader-write @ compute + image.data->access_flags = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT; + image.data->image_layout = VK_IMAGE_LAYOUT_GENERAL; + image.data->stage_flags = 
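+        // after this barrier the dummy image stays in VK_IMAGE_LAYOUT_GENERAL, marked
+        // shader read/write at the compute stage for later dispatches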
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + } + +}; + +int GPUDevice::create_dummy_buffer_image() +{ + dummy_allocator = new VkDummyAllocator(this); + + dummy_buffer.create(1, 4u, dummy_allocator); + dummy_image.create(1, 4u, dummy_allocator); + + VkDummyCompute cmd(this); + + cmd.record_dummy(dummy_buffer); + cmd.record_dummy(dummy_image); + + cmd.submit_and_wait(); + + return 0; +} + +void GPUDevice::destroy_dummy_buffer_image() +{ + dummy_buffer.release(); + dummy_image.release(); + + delete dummy_allocator; +} + +int GPUDevice::create_utility_operator() +{ + TLOG_INFO("run create utility operator\n"); + memset(uop_packing, 0, sizeof(uop_packing)); + + Option opt; + + // from buffer | image + // to buffer | image + for (int i0=0; i0<2; i0++) + { + for (int i1=0; i1<2; i1++) + { + opt.use_image_storage = (i0 == 1 || i1 == 1); +// #if __APPLE__ +// if (opt.use_image_storage) +// continue; +// #endif + + // from fp32-b/i | fp16p-b/i | fp16s-b/i + // to fp32-b/i | fp16p-b/i | fp16s-b/i + for (int j0=0; j0<3; j0++) + { + for (int j1=0; j1<3; j1++) + { + opt.use_fp16_packed = (j0 == 1 || j1 == 1); + opt.use_fp16_storage = (j0 == 2 || j1 == 2); + + if (!info.support_fp16_packed && opt.use_fp16_packed) + continue; + + if (!info.support_fp16_storage && opt.use_fp16_storage) + continue; + + // from pack1 | pack4 | pack8 + for (int k=0; k<3; k++) + { + // enable pack8 for pack8to1/pack8to4 + opt.use_shader_pack8 = true; + + { // create packing layer + TEngine::Packing_vulkan* uop = new Packing_vulkan(); + uop->vkdev = this; + + uop->out_elempack = k == 0 ? 1 : k == 1 ? 4 : 8; + uop->cast_type_from = j0 + 1; + uop->cast_type_to = j1 + 1; + uop->storage_type_from = i0; + uop->storage_type_to = i1; + // TLOG_INFO("out_elempack:%d %d %d %d %d\n", uop->out_elempack, uop->cast_type_from, uop->cast_type_to, uop->storage_type_from, uop->storage_type_to); + + uop->create_pipeline(opt); + + uop_packing[i0][i1][j0][j1][k] = uop; + } + } + } + } + } + } + + return 0; +} + +void GPUDevice::destroy_utility_operator() +{ + Option opt; + + // from buffer | image + // to buffer | image + for (int i0=0; i0<2; i0++) + { + for (int i1=0; i1<2; i1++) + { + opt.use_image_storage = (i0 == 1 || i1 == 1); +#if __APPLE__ + if (opt.use_image_storage) + continue; +#endif + + // from fp32-b/i | fp16p-b/i | fp16s-b/i + // to fp32-b/i | fp16p-b/i | fp16s-b/i + for (int j0=0; j0<3; j0++) + { + for (int j1=0; j1<3; j1++) + { + opt.use_fp16_packed = (j0 == 1 || j1 == 1); + opt.use_fp16_storage = (j0 == 2 || j1 == 2); + + if (!info.support_fp16_packed && opt.use_fp16_packed) + continue; + + if (!info.support_fp16_storage && opt.use_fp16_storage) + continue; + + // from pack1 | pack4 | pack8 + for (int k=0; k<3; k++) + { + opt.use_shader_pack8 = (k == 2 || k == 2); + + TEngine::Layer* uop = uop_packing[i0][i1][j0][j1][k]; + + uop->destroy_pipeline(opt); + + delete uop; + + uop_packing[i0][i1][j0][j1][k] = 0; + } + } + } + } + } +} + +void GPUDevice::convert_packing(const VkTensor& src, VkTensor& dst, int dst_elempack, VkCompute& cmd, const Option& _opt) const +{ + // buffer2buffer uop is created with use_image_storage disabled + Option opt = _opt; + opt.use_image_storage = false; + + int cast_type_from_index = src.elemsize == src.elempack * 4u ? 0 : opt.use_fp16_storage ? 2 : 1; + int cast_type_to_index = opt.use_fp16_storage ? 2 : opt.use_fp16_packed && dst_elempack % 4 == 0 ? 1 : 0; + int packing_type_to_index = dst_elempack == 1 ? 0 : dst_elempack == 4 ? 
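+    // packing_type_to_index: 0 = pack1, 1 = pack4, 2 = pack8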
1 : 2; + + // TLOG_INFO("convert_packing b2b %d %d %d\n", cast_type_from_index, cast_type_to_index, packing_type_to_index); + + const TEngine::Packing_vulkan* uop = uop_packing[0][0][cast_type_from_index][cast_type_to_index][packing_type_to_index]; + + uop->record_pipeline(src, dst, cmd, opt); +} + +} // namespace TEngine diff --git a/source/device/vulkan/vulkan_gpu.hpp b/source/device/vulkan/vulkan_gpu.hpp new file mode 100644 index 000000000..b0a6466a1 --- /dev/null +++ b/source/device/vulkan/vulkan_gpu.hpp @@ -0,0 +1,349 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. 
You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#ifndef VULKAN_GPU_HPP +#define VULKAN_GPU_HPP + +#include + +#include "vulkan_platform.hpp" +#include +#include "vulkan_tensor.hpp" + +// #include "tengine_log.h" + +namespace TEngine { + +// instance +int create_gpu_instance(); +void destroy_gpu_instance(); + +// instance extension capability +extern int support_VK_KHR_external_memory_capabilities; +extern int support_VK_KHR_get_physical_device_properties2; +extern int support_VK_KHR_get_surface_capabilities2; +extern int support_VK_KHR_surface; +extern int support_VK_EXT_debug_utils; +#if __ANDROID_API__ >= 26 +extern int support_VK_KHR_android_surface; +#endif // __ANDROID_API__ >= 26 + +// VK_KHR_external_memory_capabilities +extern PFN_vkGetPhysicalDeviceExternalBufferPropertiesKHR vkGetPhysicalDeviceExternalBufferPropertiesKHR; + +// VK_KHR_get_physical_device_properties2 +extern PFN_vkGetPhysicalDeviceFeatures2KHR vkGetPhysicalDeviceFeatures2KHR; +extern PFN_vkGetPhysicalDeviceProperties2KHR vkGetPhysicalDeviceProperties2KHR; +extern PFN_vkGetPhysicalDeviceFormatProperties2KHR vkGetPhysicalDeviceFormatProperties2KHR; +extern PFN_vkGetPhysicalDeviceImageFormatProperties2KHR vkGetPhysicalDeviceImageFormatProperties2KHR; +extern PFN_vkGetPhysicalDeviceQueueFamilyProperties2KHR vkGetPhysicalDeviceQueueFamilyProperties2KHR; +extern PFN_vkGetPhysicalDeviceMemoryProperties2KHR vkGetPhysicalDeviceMemoryProperties2KHR; +extern PFN_vkGetPhysicalDeviceSparseImageFormatProperties2KHR vkGetPhysicalDeviceSparseImageFormatProperties2KHR; + +// VK_KHR_get_surface_capabilities2 +extern PFN_vkGetPhysicalDeviceSurfaceCapabilities2KHR vkGetPhysicalDeviceSurfaceCapabilities2KHR; +extern PFN_vkGetPhysicalDeviceSurfaceFormats2KHR vkGetPhysicalDeviceSurfaceFormats2KHR; + +// VK_KHR_surface +extern PFN_vkDestroySurfaceKHR vkDestroySurfaceKHR; +extern PFN_vkGetPhysicalDeviceSurfaceSupportKHR vkGetPhysicalDeviceSurfaceSupportKHR; +extern PFN_vkGetPhysicalDeviceSurfaceCapabilitiesKHR vkGetPhysicalDeviceSurfaceCapabilitiesKHR; +extern PFN_vkGetPhysicalDeviceSurfaceFormatsKHR vkGetPhysicalDeviceSurfaceFormatsKHR; +extern PFN_vkGetPhysicalDeviceSurfacePresentModesKHR vkGetPhysicalDeviceSurfacePresentModesKHR; + +#if __ANDROID_API__ >= 26 +// VK_KHR_android_surface +extern PFN_vkCreateAndroidSurfaceKHR vkCreateAndroidSurfaceKHR; +#endif // __ANDROID_API__ >= 26 + +// get info +int get_gpu_count(); +int get_default_gpu_index(); + +class GpuInfo +{ +public: + // vulkan physical device + VkPhysicalDevice physical_device; + + // memory properties + VkPhysicalDeviceMemoryProperties physicalDeviceMemoryProperties; + + // info + uint32_t api_version; + uint32_t driver_version; + uint32_t vendor_id; + uint32_t device_id; + uint8_t pipeline_cache_uuid[VK_UUID_SIZE]; + + // 0 = discrete gpu + // 1 = integrated gpu + // 2 = virtual gpu + // 3 = cpu + int type; + + // hardware limit + uint32_t max_shared_memory_size; + uint32_t max_workgroup_count[3]; + uint32_t max_workgroup_invocations; + uint32_t max_workgroup_size[3]; + size_t memory_map_alignment; + size_t buffer_offset_alignment; + size_t non_coherent_atom_size; + size_t buffer_image_granularity; + uint32_t max_image_dimension_1d; + uint32_t max_image_dimension_2d; + uint32_t max_image_dimension_3d; + float timestamp_period; + + // runtime + uint32_t compute_queue_family_index; + uint32_t graphics_queue_family_index; + uint32_t 
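+    // queue family indices selected by create_gpu_instance(); they may all refer to the
+    // same family on devices that expose only one queue family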
transfer_queue_family_index; + + uint32_t compute_queue_count; + uint32_t graphics_queue_count; + uint32_t transfer_queue_count; + + // property + bool unified_compute_transfer_queue; + + // bug is not feature + bool bug_local_size_spec_const; + + // but sometimes bug is a feature + bool bug_implicit_fp16_arithmetic; + + // fp16 and int8 feature + bool support_fp16_packed; + bool support_fp16_storage; + bool support_fp16_arithmetic; + bool support_int8_storage; + bool support_int8_arithmetic; + + // ycbcr conversion feature + bool support_ycbcr_conversion; + + // extension capability + int support_VK_KHR_8bit_storage; + int support_VK_KHR_16bit_storage; + int support_VK_KHR_bind_memory2; + int support_VK_KHR_dedicated_allocation; + int support_VK_KHR_descriptor_update_template; + int support_VK_KHR_external_memory; + int support_VK_KHR_get_memory_requirements2; + int support_VK_KHR_maintenance1; + int support_VK_KHR_push_descriptor; + int support_VK_KHR_sampler_ycbcr_conversion; + int support_VK_KHR_shader_float16_int8; + int support_VK_KHR_shader_float_controls; + int support_VK_KHR_storage_buffer_storage_class; + int support_VK_KHR_swapchain; + int support_VK_EXT_queue_family_foreign; +#if __ANDROID_API__ >= 26 + int support_VK_ANDROID_external_memory_android_hardware_buffer; +#endif // __ANDROID_API__ >= 26 +}; + +const GpuInfo& get_gpu_info(int device_index = get_default_gpu_index()); + +class VkAllocator; +class VkCompute; +class Layer; +class Packing_vulkan; +class Option; +class GPUDevice +{ +public: + GPUDevice(int device_index = get_default_gpu_index()); + ~GPUDevice(); + + const GpuInfo& info; + + VkDevice vkdevice() const { return device; } + + VkShaderModule get_shader_module(int shader_type_index) const; + + // with fixed workgroup size + VkShaderModule create_shader_module(int shader_type_index, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z) const; + + VkShaderModule compile_shader_module(const uint32_t* spv_data, size_t spv_data_size) const; + + // with fixed workgroup size + VkShaderModule compile_shader_module(const uint32_t* spv_data, size_t spv_data_size, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z) const; + + uint32_t find_memory_index(uint32_t memory_type_bits, VkFlags required, VkFlags preferred, VkFlags preferred_not) const; + bool is_mappable(uint32_t memory_type_index) const; + bool is_coherent(uint32_t memory_type_index) const; + + VkQueue acquire_queue(uint32_t queue_family_index) const; + void reclaim_queue(uint32_t queue_family_index, VkQueue queue) const; + + // allocator on this device + VkAllocator* acquire_blob_allocator() const; + void reclaim_blob_allocator(VkAllocator* allocator) const; + + VkAllocator* acquire_staging_allocator() const; + void reclaim_staging_allocator(VkAllocator* allocator) const; + + // dummy buffer image + VkTensor get_dummy_buffer() const; + VkImageTensor get_dummy_image() const; + + // utility operator + void convert_packing(const VkTensor& src, VkTensor& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const; + // void convert_packing(const VkImageMat& src, VkImageMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const; + // void convert_packing(const VkMat& src, VkImageMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const; + // void convert_packing(const VkImageMat& src, VkMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const; + + // VK_KHR_bind_memory2 + PFN_vkBindBufferMemory2KHR vkBindBufferMemory2KHR; + PFN_vkBindImageMemory2KHR 
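+    // the extension entry points below are resolved with vkGetDeviceProcAddr in
+    // init_device_extension() when the matching extension is reported by the device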
vkBindImageMemory2KHR; + + // VK_KHR_descriptor_update_template + PFN_vkCreateDescriptorUpdateTemplateKHR vkCreateDescriptorUpdateTemplateKHR; + PFN_vkDestroyDescriptorUpdateTemplateKHR vkDestroyDescriptorUpdateTemplateKHR; + PFN_vkUpdateDescriptorSetWithTemplateKHR vkUpdateDescriptorSetWithTemplateKHR; + + // VK_KHR_get_memory_requirements2 + PFN_vkGetImageMemoryRequirements2KHR vkGetImageMemoryRequirements2KHR; + PFN_vkGetBufferMemoryRequirements2KHR vkGetBufferMemoryRequirements2KHR; + PFN_vkGetImageSparseMemoryRequirements2KHR vkGetImageSparseMemoryRequirements2KHR; + + // VK_KHR_maintenance1 + PFN_vkTrimCommandPoolKHR vkTrimCommandPoolKHR; + + // VK_KHR_push_descriptor + PFN_vkCmdPushDescriptorSetWithTemplateKHR vkCmdPushDescriptorSetWithTemplateKHR; + PFN_vkCmdPushDescriptorSetKHR vkCmdPushDescriptorSetKHR; + + // VK_KHR_sampler_ycbcr_conversion + PFN_vkCreateSamplerYcbcrConversionKHR vkCreateSamplerYcbcrConversionKHR; + PFN_vkDestroySamplerYcbcrConversionKHR vkDestroySamplerYcbcrConversionKHR; + + // VK_KHR_swapchain + PFN_vkCreateSwapchainKHR vkCreateSwapchainKHR; + PFN_vkDestroySwapchainKHR vkDestroySwapchainKHR; + PFN_vkGetSwapchainImagesKHR vkGetSwapchainImagesKHR; + PFN_vkAcquireNextImageKHR vkAcquireNextImageKHR; + PFN_vkQueuePresentKHR vkQueuePresentKHR; + +#if __ANDROID_API__ >= 26 + // VK_ANDROID_external_memory_android_hardware_buffer + PFN_vkGetAndroidHardwareBufferPropertiesANDROID vkGetAndroidHardwareBufferPropertiesANDROID; + PFN_vkGetMemoryAndroidHardwareBufferANDROID vkGetMemoryAndroidHardwareBufferANDROID; +#endif // __ANDROID_API__ >= 26 + +protected: + // shader management + int create_shader_module(); + void destroy_shader_module(); + + // device extension + int init_device_extension(); + + // dummy buffer and image + int create_dummy_buffer_image(); + void destroy_dummy_buffer_image(); + + // utility operator + int create_utility_operator(); + void destroy_utility_operator(); + +private: + VkDevice device; + std::vector shader_modules; + + // hardware queue + mutable std::vector compute_queues; + mutable std::vector graphics_queues; + mutable std::vector transfer_queues; + + mutable Mutex queue_lock; + + // default blob allocator for each queue + mutable std::vector blob_allocators; + + mutable Mutex blob_allocator_lock; + + // default staging allocator for each queue + mutable std::vector staging_allocators; + + mutable Mutex staging_allocator_lock; + + // dummy buffer and image + VkAllocator* dummy_allocator; + VkTensor dummy_buffer; + VkImageTensor dummy_image; + + // utility operator + // from buffer | image + // to buffer | image + // from fp32-b/i | fp16p-b/i | fp16s-b/i + // to fp32-b/i | fp16p-b/i | fp16s-b/i + // to pack1 | pack4 | pack8 + TEngine::Packing_vulkan* uop_packing[2][2][3][3][3]; +}; + +GPUDevice* get_gpu_device(int device_index = get_default_gpu_index()); + +// info from spirv +class ShaderInfo +{ +public: + int specialization_count; + int binding_count; + int push_constant_count; + + // 0 = null + // 1 = storage buffer + // 2 = storage image + // 3 = combined image sampler + int binding_types[16];// 16 is large enough(maybe) +}; + +const ShaderInfo& get_shader_info(int shader_type_index); +int resolve_shader_info(const uint32_t* spv_data, size_t spv_data_size, ShaderInfo& shader_info); + +union vk_specialization_type { int i; float f; uint32_t u32; }; +union vk_constant_type { int i; float f; }; + +} + +#endif // VULKAN_GPU_HPP diff --git a/source/device/vulkan/vulkan_graph.cc b/source/device/vulkan/vulkan_graph.cc new file mode 100644 
index 000000000..222477f80 --- /dev/null +++ b/source/device/vulkan/vulkan_graph.cc @@ -0,0 +1,545 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2021, OPEN AI LAB + * Author: hhchen@openailab.com + */ + +#include "vulkan_graph.hpp" +#include "vulkan_executor.hpp" + +#include +#include "vulkan_graph.hpp" +#include "vulkan_pipeline.hpp" +#include "vulkan_gpu.hpp" +#include "vulkan_command.hpp" +#include "vulkan_allocator.hpp" +#include "vulkan_tensor.hpp" +#include "vulkan_layer.hpp" + +#include "layer/convolution_vulkan.hpp" +#include "layer/pooling_vulkan.hpp" +#include "layer/convolutiondepthwise_vulkan.hpp" +#include "layer/innerproduct_vulkan.hpp" +#include "layer/flatten_vulkan.hpp" +#include "layer/softmax_vulkan.hpp" +#include "layer/relu_vulkan.hpp" +#include "layer/dropout_vulkan.hpp" +#include "layer/eltwise_vulkan.hpp" +#include "layer/priorbox_vulkan.hpp" +#include "layer/permute_vulkan.hpp" +#include "layer/concat_vulkan.hpp" +#include "layer/reshape_vulkan.hpp" +#include "layer/interp_vulkan.hpp" +#include "layer/crop_vulkan.hpp" + +#include + +extern "C" +{ +#include "graph/tensor.h" +#include "graph/node.h" +#include "graph/graph.h" +#include "graph/subgraph.h" +} + + +int vulkan_dev_init(struct device* dev) +{ + (void)dev; + return 0; +} + + +int vulkan_dev_prerun(struct device* dev, struct subgraph* subgraph, void* options) +{ + subgraph->device_graph = new VULKANEngine; + auto engine = (VULKANEngine*)subgraph->device_graph; + + return engine->VULKANEnginePreRun(subgraph); +} + + +int vulkan_dev_run(struct device* dev, struct subgraph* subgraph) +{ + auto engine = (VULKANEngine*)subgraph->device_graph; + return engine->VULKANEngineRun(subgraph); +} + + +int vulkan_dev_postrun(struct device* dev, struct subgraph* subgraph) +{ + auto engine = (VULKANEngine*)subgraph->device_graph; + engine->VULKANEnginePostRun(); + // delete engine; + + return 0; +} + + +int vulkan_dev_release(struct device* dev) +{ + (void)dev; + return 0; +} + + + +namespace TEngine { + +static double get_cur_time(void) +{ + struct timeval tv; + + gettimeofday(&tv, NULL); + + return tv.tv_sec * 1000.0 + (tv.tv_usec / 1000.0); +} + + +VulkanGraph::VulkanGraph(struct subgraph* graph) +{ + vkdev = get_gpu_device(); + weight_vkallocator = 0; + weight_staging_vkallocator = 0; + + // set graph options + if (!vkdev->info.support_fp16_packed || !vkdev->info.support_fp16_storage) + opt.use_fp16_packed = false; + if (!vkdev->info.support_fp16_storage) + { + opt.use_fp16_storage = false; + opt.use_shader_pack8 = false; + } + + if (!vkdev->info.support_fp16_arithmetic) + opt.use_fp16_arithmetic = false; + + TLOG_INFO("use_fp16_packed %d\n", opt.use_fp16_packed); + TLOG_INFO("use_fp16_storage %d\n", opt.use_fp16_storage); + 
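// options that require fp16 storage/arithmetic or pack8 shaders are switched off above
+ // when the device does not report the matching capability, and the effective values
+ // are logged here before the per-node vulkan layers are created. +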
TLOG_INFO("use_shader_pack8 %d\n", opt.use_shader_pack8); + TLOG_INFO("use_fp16_arithmetic %d\n", opt.use_fp16_arithmetic); + + struct subgraph *subgraph = (struct subgraph *)graph; + struct graph *ir_graph = subgraph->graph; + int node_num = subgraph->node_num; + + sgraph = graph; + for(int i = 0; i < node_num; i++) + { + struct node *ir_node = get_ir_graph_node(ir_graph, subgraph->node_list[i]); + + if (ir_node->op.type == OP_CONST || ir_node->op.type == OP_INPUT) + continue; + else if (ir_node->op.type == OP_CLIP) + ir_node->op.type = OP_RELU6; + + if(ir_node->op.type == OP_CONV) + { + struct conv_param *conv_param = (struct conv_param *)ir_node->op.param_mem; + + if (conv_param->group == conv_param->output_channel && conv_param->group != 1 && ir_graph->graph_layout == TENGINE_LAYOUT_NCHW) // DW + { + Layer* layer = new ConvolutionDepthWise_vulkan(ir_graph, ir_node); + layer->vkdev = vkdev; + layer->name = "ConvolutionDepthWise"; + layers.push_back(layer); + } + else + { + Layer* layer = new Convolution_vulkan(ir_graph, ir_node); + layer->vkdev = vkdev; + layer->name = "Convolution"; + layers.push_back(layer); + } + } + + if(ir_node->op.type == OP_POOL) + { + Layer* layer = new Pooling_vulkan(ir_graph, ir_node); + layer->vkdev = vkdev; + layer->name = "Pooling"; + layers.push_back(layer); + } + + if(ir_node->op.type == OP_FC) + { + Layer* layer = new InnerProduct_vulkan(ir_graph, ir_node); + layer->vkdev = vkdev; + layer->name = "InnerProduct"; + layers.push_back(layer); + } + + if(ir_node->op.type == OP_FLATTEN) + { + Layer* layer = new Flatten_vulkan(ir_graph, ir_node); + layer->vkdev = vkdev; + layer->name = "Flatten"; + layers.push_back(layer); + } + + if(ir_node->op.type == OP_SOFTMAX) + { + Layer* layer = new Softmax_vulkan(ir_graph, ir_node); + layer->vkdev = vkdev; + layer->name = "Softmax"; + layers.push_back(layer); + } + + if(ir_node->op.type == OP_RELU) + { + Layer* layer = new ReLU_vulkan(ir_graph, ir_node); + layer->vkdev = vkdev; + layer->name = "ReLU"; + layers.push_back(layer); + } + + if(ir_node->op.type == OP_DROPOUT) + { + Layer* layer = new Dropout_vulkan(ir_graph, ir_node); + layer->vkdev = vkdev; + layer->name = "Dropout"; + layers.push_back(layer); + } + + if(ir_node->op.type == OP_ELTWISE) + { + Layer* layer = new Eltwise_vulkan(ir_graph, ir_node); + layer->vkdev = vkdev; + layer->name = "Eltwise"; + layers.push_back(layer); + } + + if(ir_node->op.type == OP_PRIORBOX) + { + Layer* layer = new PriorBox_vulkan(ir_graph, ir_node); + layer->vkdev = vkdev; + layer->name = "PriorBox"; + layers.push_back(layer); + } + + if(ir_node->op.type == OP_PERMUTE) + { + Layer* layer = new Permute_vulkan(ir_graph, ir_node); + layer->vkdev = vkdev; + layer->name = "Permute"; + layers.push_back(layer); + } + + if(ir_node->op.type == OP_CONCAT) + { + Layer* layer = new Concat_vulkan(ir_graph, ir_node); + layer->vkdev = vkdev; + layer->name = "Concat"; + layers.push_back(layer); + } + + if(ir_node->op.type == OP_RESHAPE) + { + Layer* layer = new Reshape_vulkan(ir_graph, ir_node); + layer->vkdev = vkdev; + layer->name = "Reshape"; + layers.push_back(layer); + } + + if(ir_node->op.type == OP_INTERP || ir_node->op.type == OP_UPSAMPLE) + { + Layer* layer = new Interp_vulkan(ir_graph, ir_node); + layer->vkdev = vkdev; + layer->name = "Interp"; + layers.push_back(layer); + } + + if(ir_node->op.type == OP_CROP) + { + Layer* layer = new Crop_vulkan(ir_graph, ir_node); + layer->vkdev = vkdev; + layer->name = "Crop"; + layers.push_back(layer); + } + + struct tensor *input = 
get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); + std::string name = input->name; + tensor_map_[name] = input; + tensor_map[name] = Tensor(input); + + VkTensor vktensor; + vktensor_map_[name] = vktensor; + + struct tensor *output = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); + name = output->name; + tensor_map_[name] = output; + tensor_map[name] = Tensor(output); + } +} + +VulkanGraph::~VulkanGraph() +{ + for(auto& ptr: mem_buf_vector_) + std::free(ptr); +} + +int VulkanGraph::upload_model() +{ + +// printf("run upload_model\n"); + TEngine::VkTransfer cmd(vkdev); + if (!weight_vkallocator) + { + weight_vkallocator = new VkWeightAllocator(vkdev); + } + if (!weight_staging_vkallocator) + { + weight_staging_vkallocator = new VkWeightStagingAllocator(vkdev); + } + + Option opt_upload = opt; + opt_upload.blob_vkallocator = weight_vkallocator; + opt_upload.workspace_vkallocator = weight_vkallocator; + opt_upload.staging_vkallocator = weight_staging_vkallocator; + + int layer_size = layers.size(); + for(int i = 0; i < layer_size; i++) + { + layers[i]->upload_model(cmd, opt_upload); + } + + cmd.submit_and_wait(); +// printf("run upload_model done\n"); + return 0; +} + +int VulkanGraph::create_pipeline() +{ + // printf("start to run create pipeline\n"); + for (size_t i=0; iname.c_str()); + int cret = layer->create_pipeline(opt1); + if (cret != 0) + { + printf("layer create_pipeline %d failed", (int)i); + return -1; + } + } +// printf("run create_pipeline done\n"); + return 0; +} + +int VulkanGraph::record_graph_pipeline() +{ + // printf("start to run record pipeline, layer size:%d\n", layers.size()); + + TEngine::VkCompute cmd(vkdev); + + if (!opt.blob_vkallocator) + { + local_blob_vkallocator = vkdev->acquire_blob_allocator(); + opt.blob_vkallocator = local_blob_vkallocator; + } + if (!opt.workspace_vkallocator) + { + opt.workspace_vkallocator = opt.blob_vkallocator; + } + if (!opt.staging_vkallocator) + { + local_staging_vkallocator = vkdev->acquire_staging_allocator(); + opt.staging_vkallocator = local_staging_vkallocator; + } + std::string name; + + Tensor input; + Tensor output; + + // printf("tensor_map size:%d ---------------------\n", tensor_map.size()); + + for (size_t i=0; iname.c_str()); + + std::string in_name = layer->bottoms[0]; + std::string out_name = layer->tops[0]; + name = out_name; + + // upload Tensor data to VkTensor + if((i==0) && vktensor_map_[in_name].dims == 0) + { + cmd.record_upload(tensor_map_[in_name], vktensor_map_[in_name], opt); + // cmd.record_download(vktensor_map_[in_name], tensor_map[in_name], opt); + } + + int cret; + if(layer->name == "ReLU" || layer->name == "Dropout" || layer->name == "Softmax") // inplace + { + VkTensor bottom_tensor = vktensor_map_[in_name]; + cret = layer->record_pipeline(bottom_tensor, cmd, opt); + vktensor_map_[out_name] = bottom_tensor; + } + else if(layer->name == "Eltwise" || layer->name == "Concat" || layer->name == "PriorBox" || layer->name == "Crop") // multi-in, one-out + { + std::vector bottom_blobs; + for(int i = 0; i < layer->bottoms.size(); i++) + { + bottom_blobs.push_back(vktensor_map_[layer->bottoms[i]]); + } + + VkTensor top_tensor; + std::vector top_blobs; + top_blobs.push_back(top_tensor); + cret = layer->record_pipeline(bottom_blobs, top_blobs, cmd, opt); + vktensor_map_[out_name] = top_blobs[0]; + } + else // original one-in one-out + { + VkTensor bottom_tensor = vktensor_map_[in_name]; + VkTensor top_tensor; + cret = layer->record_pipeline(bottom_tensor, top_tensor, cmd, opt); + 
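// cache the newly produced top blob under the IR tensor name so that downstream
+ // layers can resolve their inputs from vktensor_map_ by name. +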
vktensor_map_[out_name] = top_tensor; + } + + // download all nodes data + { + // Tensor tmp_tensor; + // cmd.record_download(vktensor_map_[out_name], tmp_tensor, opt); + // tensor_map[out_name] = tmp_tensor; + } + + if (cret != 0) + { + printf("layer record_pipeline %d failed", (int)i); + return -1; + } + } + + cmd.record_download(vktensor_map_[name], output, opt); + + // // download output + // int byte_size=tensor_map_[name]->elem_size * tensor_map_[name]->elem_num; + // void* mem=std::malloc(byte_size); + // tensor_map_[name]->data = mem; + // cmd.record_download(vktensor_map_[name], tensor_map_[name], opt); + +// double total_time, min_time, max_time; +// min_time = 999999999; +// max_time = 0; +// total_time = 0; +// double start_time = get_cur_time(); + + cmd.submit_and_wait(); + +// double end_time = get_cur_time(); +// double cur_time = end_time - start_time; +// total_time += cur_time; +// if (cur_time > max_time) +// max_time = cur_time; +// if (cur_time < min_time) +// min_time = cur_time; +// printf("vulkan Repeat [1] min %.3f ms, max %.3f ms, avg %.3f ms\n", min_time, max_time, total_time / 1); + + Tensor tmp_fp32; + if(output.elemsize == output.elempack * 2) + { + TEngine::cast_float16_to_float32(output, tmp_fp32, opt); + } + else + { + tmp_fp32 = output; + } + + Tensor blob_unpacked; + if (opt.use_packing_layout) + { + convert_packing(tmp_fp32, blob_unpacked, 1, opt); + } + else + { + blob_unpacked = tmp_fp32; + } + + tensor_map_[name]->data = blob_unpacked.data; + + +// #define DEBUG_OUTPUT +#ifdef DEBUG_OUTPUT + printf("run save tensor data\n"); + for (size_t j=0; jtops[0]; + // std::string in_name = layer->bottoms[0]; + printf("%s\n", in_name.c_str()); + + std::string fname = std::to_string(j)+".data"; + FILE* fp = fopen(fname.c_str(), "w"); + + // float * data = (float*)get_tensor_buffer(tensor_map_[name]); + // float* data = (float*)vktensor_map_[in_name].mapped_ptr(); + // float* data = (float*)tensor_map_[in_name]->data; + // float* data = (float*)tensor_map[in_name].data; + Tensor tmp_fp16 = tensor_map[in_name]; + Tensor tmp_fp32; + if(tmp_fp16.elemsize == tmp_fp16.elempack * 2) + TEngine::cast_float16_to_float32(tmp_fp16, tmp_fp32, opt); + else + tmp_fp32 = tmp_fp16; + + Tensor blob_unpacked; + if (opt.use_packing_layout) + convert_packing(tmp_fp32, blob_unpacked, 1, opt); + else + blob_unpacked = tmp_fp32; + + int byte_size=tensor_map_[in_name]->elem_size * tensor_map_[name]->elem_num; + void* mem=std::malloc(byte_size); + memcpy(mem, blob_unpacked.data, byte_size); + tensor_map_[in_name]->data = mem; + // tensor_map_[in_name]->data = blob_unpacked.data; + + // float* data = (float*)tmp_fp32.data; + float* data = (float*)blob_unpacked.data; + printf("tensor shape:%d %d %d %d\n", tensor_map_[in_name]->dims[0], tensor_map_[in_name]->dims[1], tensor_map_[in_name]->dims[2], tensor_map_[in_name]->dims[3]); + byte_size=tensor_map_[in_name]->elem_size * tensor_map_[in_name]->elem_num; + for(int i = 0; i < byte_size/sizeof(float); i++) + { + if(i % 16 == 0) + { + fprintf(fp, "\n%d:", i); + } + fprintf(fp, " %.6f", data[i]); + } + fprintf(fp, "\n"); + + fclose(fp); + } +#endif + + return 0; +} + +int VulkanGraph::destory_pipeline() +{ + return 0; +} + +} diff --git a/source/device/vulkan/vulkan_graph.hpp b/source/device/vulkan/vulkan_graph.hpp new file mode 100644 index 000000000..8218f271c --- /dev/null +++ b/source/device/vulkan/vulkan_graph.hpp @@ -0,0 +1,139 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license 
agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2021, OPEN AI LAB + * Author: hhchen@openailab.com + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "vulkan_gpu.hpp" +#include "vulkan_pipeline.hpp" +#include "vulkan_command.hpp" +#include "vulkan_option.hpp" +#include "vulkan_layer.hpp" + +extern "C" +{ +// #include "device/device.h" +// #include "graph/subgraph.h" + +#include "api/c_api.h" +#include "device/device.h" +#include "graph/tensor.h" +#include "graph/node.h" +#include "graph/graph.h" +#include "graph/subgraph.h" +#include "executer/executer.h" +#include "optimizer/split.h" +#include "module/module.h" +#include "utility/vector.h" +#include "utility/log.h" + + +#include "convolution_param.h" + +namespace TEngine { + +class VulkanDevice; + +class VulkanGraph { + +friend VulkanDevice; + +public: + const std::string& GetName(void) const {return name_;} + + VulkanGraph(const std::string& name); + VulkanGraph(struct subgraph* graph); + ~VulkanGraph(); + + int record_convolution(VkCompute& cmd, ir_node_t* node); + + int UploadConvolutionWeight(VkTransfer& cmd, const Option& opt, ir_node_t* node); + + bool CreateConvolutionPipeline(ir_node_t* node); + + bool CreatePoolingPipeline(ir_node_t* node); + + std::unordered_map tensor_map_; // tengine lite cpu tensor list + std::unordered_map tensor_map; // vulkan cpu tensor list + std::unordered_map vktensor_map_; // vulkan gpu tensor list + + bool OpSupported(const std::string& name); + + Option opt; + Pipeline* pipeline_convolution; + + int record_graph_pipeline(); + + int upload_model(); + + int create_pipeline(); + + int destory_pipeline(); + +protected: + subgraph* sgraph; + std::vector layers; + + const GPUDevice* vkdev; + + VkAllocator* weight_vkallocator; + VkAllocator* weight_staging_vkallocator; + +private: + + VkAllocator* local_blob_vkallocator; + VkAllocator* local_staging_vkallocator; + + std::string name_; + + std::vector gpu_mem_vector_; + std::vector mem_buf_vector_; + + std::map iotensor_map_; +}; + +} //namespace TEngine + + +int vulkan_dev_init(struct device* dev); +int vulkan_dev_prerun(struct device* dev, struct subgraph* subgraph, void* options); +int vulkan_dev_run(struct device* dev, struct subgraph* subgraph); +int vulkan_dev_postrun(struct device* dev, struct subgraph* subgraph); +int vulkan_dev_release(struct device* dev); +} + + +/* + + + + +*/ \ No newline at end of file diff --git a/source/device/vulkan/vulkan_helper.cc b/source/device/vulkan/vulkan_helper.cc new file mode 100644 index 000000000..4668f8bfe --- /dev/null +++ b/source/device/vulkan/vulkan_helper.cc @@ -0,0 +1,311 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2021, OPEN AI LAB + * Author: hhchen@openailab.com + */ + +#include "vulkan_helper.hpp" + +// bool CHECK_ENQUEUE_BUFFER_STATUS(cl_int status) +// { +// if (status != CL_SUCCESS) +// { +// TLOG_INFO("Log: clEnqueue****Buffer status %d\n",status); +// if (status == CL_INVALID_COMMAND_QUEUE ) +// TLOG_INFO("Log: CL_INVALID_COMMAND_QUEUE \n"); +// else if (status == CL_INVALID_CONTEXT ) +// TLOG_INFO("Log: CL_INVALID_CONTEXT \n"); +// else if (status == CL_INVALID_MEM_OBJECT ) +// TLOG_INFO("Log: CL_INVALID_MEM_OBJECT \n"); +// else if (status == CL_INVALID_VALUE ) +// TLOG_INFO("Log: CL_INVALID_VALUE \n"); +// else if (status == CL_INVALID_EVENT_WAIT_LIST ) +// TLOG_INFO("Log: CL_INVALID_EVENT_WAIT_LIST \n"); +// else if (status == CL_MISALIGNED_SUB_BUFFER_OFFSET ) +// TLOG_INFO("Log: CL_MISALIGNED_SUB_BUFFER_OFFSET \n"); +// else if (status == CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST ) +// TLOG_INFO("Log: CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST \n"); +// else if (status == CL_MEM_OBJECT_ALLOCATION_FAILURE ) +// TLOG_INFO("Log: CL_MEM_OBJECT_ALLOCATION_FAILURE \n"); +// else if (status == CL_INVALID_OPERATION ) +// TLOG_INFO("Log: CL_INVALID_OPERATION \n"); +// else if (status == CL_OUT_OF_RESOURCES ) +// TLOG_INFO("Log: CL_OUT_OF_RESOURCES \n"); +// else if (status == CL_OUT_OF_HOST_MEMORY ) +// TLOG_INFO("Log: CL_OUT_OF_HOST_MEMORY \n"); +// return false; +// } +// // else +// // TLOG_INFO("Log: clEnqueue****Buffer SUCCESS\n"); +// return true; +// } + +// bool CHECK_ENQUEUE_KERNEL_STATUS(cl_int status) +// { +// if (status != CL_SUCCESS) +// { +// TLOG_INFO("Log: clEnqueueNDRangeKernel status %d\n",status); +// if (status == CL_INVALID_PROGRAM_EXECUTABLE ) +// TLOG_INFO("Log: CL_INVALID_PROGRAM_EXECUTABLE \n"); +// else if (status == CL_INVALID_COMMAND_QUEUE ) +// TLOG_INFO("Log: CL_INVALID_COMMAND_QUEUE \n"); +// else if (status == CL_INVALID_KERNEL ) +// TLOG_INFO("Log: CL_INVALID_KERNEL \n"); +// else if (status == CL_INVALID_CONTEXT ) +// TLOG_INFO("Log: CL_INVALID_CONTEXT \n"); +// else if (status == CL_INVALID_KERNEL_ARGS ) +// TLOG_INFO("Log: CL_INVALID_KERNEL_ARGS \n"); +// else if (status == CL_INVALID_WORK_DIMENSION ) +// TLOG_INFO("Log: CL_INVALID_WORK_DIMENSION \n"); +// else if (status == CL_INVALID_GLOBAL_WORK_SIZE ) +// TLOG_INFO("Log: CL_INVALID_GLOBAL_WORK_SIZE \n"); +// else if (status == CL_INVALID_GLOBAL_OFFSET ) +// TLOG_INFO("Log: CL_INVALID_GLOBAL_OFFSET \n"); +// else if (status == CL_INVALID_WORK_GROUP_SIZE ) +// TLOG_INFO("Log: CL_INVALID_WORK_GROUP_SIZE \n"); +// else if (status == CL_INVALID_WORK_ITEM_SIZE ) +// TLOG_INFO("Log: CL_INVALID_WORK_ITEM_SIZE \n"); +// else if (status == CL_MISALIGNED_SUB_BUFFER_OFFSET ) +// TLOG_INFO("Log: CL_MISALIGNED_SUB_BUFFER_OFFSET \n"); +// else if (status == CL_INVALID_IMAGE_SIZE ) +// TLOG_INFO("Log: 
CL_INVALID_IMAGE_SIZE \n"); +// else if (status == CL_OUT_OF_RESOURCES ) +// TLOG_INFO("Log: CL_OUT_OF_RESOURCES \n"); +// else if (status == CL_MEM_OBJECT_ALLOCATION_FAILURE ) +// TLOG_INFO("Log: CL_MEM_OBJECT_ALLOCATION_FAILURE \n"); +// else if (status == CL_INVALID_EVENT_WAIT_LIST ) +// TLOG_INFO("Log: CL_INVALID_EVENT_WAIT_LIST \n"); +// else if (status == CL_OUT_OF_RESOURCES ) +// TLOG_INFO("Log: CL_OUT_OF_RESOURCES \n"); +// else if (status == CL_OUT_OF_HOST_MEMORY ) +// TLOG_INFO("Log: CL_OUT_OF_HOST_MEMORY \n"); +// return false; +// } +// // else +// // TLOG_INFO("Log: clEnqueueNDRangeKernel SUCCESS\n"); +// return true; +// } + +// bool CHECK_SET_KERNEL_STATUS(cl_int status) +// { +// if (status != CL_SUCCESS) +// { +// TLOG_INFO("Log: clSetKernelArg status %d\n",status); +// if (status == CL_INVALID_KERNEL ) +// TLOG_INFO("Log: CL_INVALID_KERNEL \n"); +// else if (status == CL_INVALID_ARG_INDEX ) +// TLOG_INFO("Log: CL_INVALID_ARG_INDEX \n"); +// else if (status == CL_INVALID_ARG_VALUE ) +// TLOG_INFO("Log: CL_INVALID_ARG_VALUE \n"); +// else if (status == CL_INVALID_MEM_OBJECT ) +// TLOG_INFO("Log: CL_INVALID_MEM_OBJECT \n"); +// else if (status == CL_INVALID_SAMPLER ) +// TLOG_INFO("Log: CL_INVALID_SAMPLER \n"); +// else if (status == CL_INVALID_ARG_SIZE ) +// TLOG_INFO("Log: CL_INVALID_ARG_SIZE \n"); +// else if (status == CL_INVALID_ARG_VALUE ) +// TLOG_INFO("Log: CL_INVALID_ARG_VALUE \n"); +// else if (status == CL_OUT_OF_RESOURCES ) +// TLOG_INFO("Log: CL_OUT_OF_RESOURCES \n"); +// else if (status == CL_OUT_OF_HOST_MEMORY ) +// TLOG_INFO("Log: CL_OUT_OF_HOST_MEMORY \n"); +// return false; +// } +// // else +// // { +// // TLOG_INFO("Log: clSetKernelArg SUCCESS \n"); +// // } +// return true; +// } + +/** convert the kernel file into a string */ +int convertToString(const char *filename, std::string& s) +{ + size_t size; + char* str; + std::fstream f(filename, (std::fstream::in | std::fstream::binary)); + + if(f.is_open()) + { + size_t fileSize; + f.seekg(0, std::fstream::end); + size = fileSize = (size_t)f.tellg(); + f.seekg(0, std::fstream::beg); + str = new char[size+1]; + if(!str) + { + f.close(); + return 0; + } + + f.read(str, fileSize); + f.close(); + str[size] = '\0'; + s = str; + delete[] str; + return 0; + } + std::cout<<"Error: failed to open file\n"< 0) + // { + // cl_platform_id* platforms = + // (cl_platform_id* )malloc(numPlatforms* sizeof(cl_platform_id)); + // status = clGetPlatformIDs(numPlatforms, platforms, NULL); + // platform = platforms[0]; + // free(platforms); + // } + // else + // return -1; + + // return 0; +// } + +/**Step 2:Query the platform and choose the first GPU device if has one.*/ +// cl_device_id *getCl_device_id(cl_platform_id &platform) +// { +// cl_uint numDevices = 0; +// cl_device_id *devices=NULL; +// cl_int status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); +// if (numDevices > 0) //GPU available. 
+// { +// devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id)); +// status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL); +// } +// return devices; +// } + +void get_device_message() +{ + // /* Host/device data structures */ + // cl_platform_id *platforms; + // cl_device_id *devices; + // cl_uint num_platforms; + // cl_uint num_devices, addr_data; + // cl_int i, err; + + // /* Extension data */ + // char name_data[48000], ext_data[409600]; + + // err = clGetPlatformIDs(5, NULL, &num_platforms); + // if(err < 0) { + // perror("Couldn't find any platforms."); + // exit(1); + // } + + // /* 选取所有的platforms*/ + // platforms = (cl_platform_id*) + // malloc(sizeof(cl_platform_id) * num_platforms); + // err = clGetPlatformIDs(num_platforms, platforms, NULL); + // if(err < 0) { + // perror("Couldn't find any platforms"); + // exit(1); + // } + + // //循环查看所有platforms的devices信息,一般intel和AMD的都可以有两个devices:CPU和显卡 + // //如果是nvidia的就一般只有一个显卡device了。 + // printf("\nnum_platforms %d\n", num_platforms); + // for (int j = 0; j < (int)num_platforms; j++) + // { + // printf("\nplatform %d\n", j+1); + // /* 步骤和platforms的一样 */ + // err = clGetDeviceIDs(platforms[j], CL_DEVICE_TYPE_ALL, 1, NULL, &num_devices); + // if(err < 0) { + // perror("Couldn't find any devices!!!"); + // exit(1); + // } + + // /* Access connected devices */ + // devices = (cl_device_id*) + // malloc(sizeof(cl_device_id) * num_devices); + // clGetDeviceIDs(platforms[j], CL_DEVICE_TYPE_ALL, + // num_devices, devices, NULL); + + // /*循环显示platform的所有device(CPU和显卡)信息。*/ + // for(i=0; i<(int)num_devices; i++) { + + // err = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, + // sizeof(name_data), name_data, NULL); + // if(err < 0) { + // perror("Couldn't read extension data"); + // exit(1); + // } + // clGetDeviceInfo(devices[i], CL_DEVICE_ADDRESS_BITS, + // sizeof(ext_data), &addr_data, NULL); + + // clGetDeviceInfo(devices[i], CL_DEVICE_EXTENSIONS, + // sizeof(ext_data), ext_data, NULL); + + // printf("NAME: %s\nADDRESS_WIDTH: %u\nEXTENSIONS: %s\n\n", + // name_data, addr_data, ext_data); + // } + // } + + // free(platforms); + // free(devices); + // printf("\n"); +} + +void dump_sub_graph(struct subgraph* sub_graph) +{ + // TLOG_INFO("Sub graph[%d]: {%8s } has %d nodes, %d input tensors, %d output tensors.\n", sub_graph->index, sub_graph->device->name, sub_graph->node_num, sub_graph->input_num, sub_graph->output_num); + // TLOG_INFO("\tSub nodes: [ "); + + // for (int j = 0; j < sub_graph->node_num - 1; j++) + // { + // int node_id = sub_graph->node_list[j]; + // TLOG_INFO("%d, ", node_id); + // } + // TLOG_INFO("%d ].\n", sub_graph->node_list[sub_graph->node_num - 1]); + + // TLOG_INFO("\tSub input tensors: [ "); + // for (int j = 0; j < sub_graph->input_num - 1; j++) + // { + // int tensor_id = sub_graph->input_tensor_list[j]; + // TLOG_INFO("%d, ", tensor_id); + // } + // TLOG_INFO("%d ].\n", sub_graph->input_tensor_list[sub_graph->input_num - 1]); + + // TLOG_INFO("\tSub output tensors: [ "); + // for (int j = 0; j < sub_graph->output_num - 1; j++) + // { + // int tensor_id = sub_graph->output_tensor_list[j]; + // TLOG_INFO("%d, ", tensor_id); + // } + // TLOG_INFO("%d ].\n", sub_graph->output_tensor_list[sub_graph->output_num - 1]); +} + diff --git a/source/device/vulkan/vulkan_helper.hpp b/source/device/vulkan/vulkan_helper.hpp new file mode 100644 index 000000000..3955be7bb --- /dev/null +++ b/source/device/vulkan/vulkan_helper.hpp @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation 
(ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2021, OPEN AI LAB + * Author: hhchen@openailab.com + */ + +#pragma once + +// #include +#include +#include +#include +#include +#include +#include + +extern "C" +{ +#include "api/c_api.h" +#include "graph/tensor.h" +#include "graph/node.h" +#include "graph/graph.h" +#include "graph/subgraph.h" +#include "device/device.h" +#include "utility/sys_port.h" +#include "utility/log.h" +} + +// bool CHECK_SET_KERNEL_STATUS(cl_int status); +// bool CHECK_ENQUEUE_KERNEL_STATUS(cl_int status); +// bool CHECK_ENQUEUE_BUFFER_STATUS(cl_int status); + +/** convert the kernel file into a string */ +int convertToString(const char *filename, std::string& s); + +/**Getting platforms and choose an available one.*/ +// int getPlatform(cl_platform_id &platform); + +/**Step 2:Query the platform and choose the first GPU device if has one.*/ +// cl_device_id *getCl_device_id(cl_platform_id &platform); + +void get_device_message(); + +void dump_sub_graph(struct subgraph* sub_graph); + diff --git a/source/device/vulkan/vulkan_layer.cpp b/source/device/vulkan/vulkan_layer.cpp new file mode 100644 index 000000000..a4c7e4dab --- /dev/null +++ b/source/device/vulkan/vulkan_layer.cpp @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. 
You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#include "vulkan_layer.hpp" + +namespace TEngine { + +Layer::Layer() +{ + support_vulkan = false; +} + +Layer::~Layer() +{ +} + +int Layer::create_pipeline(const Option& /*opt*/) +{ + return 0; +} + +int Layer::destroy_pipeline(const Option& /*opt*/) +{ + return 0; +} + +int Layer::upload_model(VkTransfer& cmd, const Option& opt) +{ + return 0; +} + +int Layer::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const +{ + return 0; +} + +int Layer::record_pipeline(VkTensor& bottom_top_blob, VkCompute& cmd, const Option& opt) const +{ + return 0; +} + +int Layer::record_pipeline(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const +{ + printf("run layer record_pipeline VkTensors\n"); + return 0; +} + +} // TEngine \ No newline at end of file diff --git a/source/device/vulkan/vulkan_layer.hpp b/source/device/vulkan/vulkan_layer.hpp new file mode 100644 index 000000000..526ca148b --- /dev/null +++ b/source/device/vulkan/vulkan_layer.hpp @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. 
You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#ifndef VULKAN_LAYER_HPP +#define VULKAN_LAYER_HPP + +#include +#include "vulkan_command.hpp" +#include "vulkan_pipeline.hpp" + +extern "C" +{ +#include "api/c_api.h" +#include "device/device.h" +#include "graph/tensor.h" +#include "graph/node.h" +#include "graph/graph.h" +#include "graph/subgraph.h" +#include "executer/executer.h" +#include "optimizer/split.h" +#include "module/module.h" +#include "utility/vector.h" +#include "utility/log.h" +} + +namespace TEngine { + +class Layer +{ +public: + // empty + Layer(); + // virtual destructor + virtual ~Layer(); + + // layer implementation specific setup + // return 0 if success + virtual int create_pipeline(const Option& opt); + + // layer implementation specific clean + // return 0 if success + virtual int destroy_pipeline(const Option& opt); + + // upload weight blob from host to device + virtual int upload_model(VkTransfer& cmd, const Option& opt); + + // virtual int record_pipeline(VkCompute& cmd, const Option& opt) const; + virtual int record_pipeline(VkTensor& bottom_top_blob, VkCompute& cmd, const Option& opt) const; + virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const; + + virtual int record_pipeline(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const; + +public: + // support vulkan compute + bool support_vulkan; + + // accept input blob with packed storage + bool support_packing; + + // accept bf16 + bool support_bf16_storage; + + // shader image storage + bool support_image_storage; + +public: + const GPUDevice* vkdev; + std::vector bottoms; + std::vector tops; + +public: + // layer name + std::string name; + // Node* node; + ir_graph_t* graph; + ir_node_t* node; +}; + +Layer* create_layer(std::string type); + +} // TEngine + +#endif // VULKAN_LAYER_HPP diff --git a/source/device/vulkan/vulkan_limit.hpp b/source/device/vulkan/vulkan_limit.hpp new file mode 100644 index 000000000..741786fae --- /dev/null +++ b/source/device/vulkan/vulkan_limit.hpp @@ -0,0 +1,160 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2021, OPEN AI LAB + * Author: hhchen@openailab.com + */ + + +#pragma once + +extern "C" +{ +#include "operator/op.h" +} + + +const int vulkan_supported_ops[] = { + + OP_CLIP, + OP_CONCAT, + OP_CONST, + OP_CONV, + OP_DROPOUT, + OP_ELTWISE, + OP_FC, + OP_FLATTEN, + OP_INPUT, +//// OP_PERMUTE, + OP_POOL, + OP_RELU, + OP_RESHAPE, + OP_SLICE, +//// OP_SOFTMAX + + +// OP_BIAS, + +//// OP_ABSVAL, +//// OP_ADD_N, +//// OP_ARGMAX, +//// OP_ARGMIN, +//// OP_BATCHNORM, +//// OP_BATCHTOSPACEND, +//// OP_BIAS, +//// OP_BROADMUL, +// +//// OP_CAST, +//// OP_CEIL, +//// OP_CLIP, +//// OP_COMPARISON, +//// OP_CONCAT, +// OP_CONST, +// OP_CONV, +//// OP_CROP, +//// OP_DECONV, +//// OP_DEPTHTOSPACE, +//// OP_DETECTION_OUTPUT, +//// OP_DETECTION_POSTPROCESS, +// +//// OP_DROPOUT, +//// OP_ELTWISE, +//// OP_ELU, +//// OP_EMBEDDING, +//// OP_EXPANDDIMS, +//// OP_FC, +//// OP_FLATTEN, +//// OP_GATHER, +//// OP_GEMM, +//// OP_GRU, +//// OP_HARDSIGMOID, +//// OP_HARDSWISH, +// OP_INPUT, +//// OP_INSTANCENORM, +//// OP_INTERP, +//// OP_LOGICAL, +//// OP_LOGISTIC, +//// OP_LRN, +//// OP_LSTM, +//// OP_MATMUL, +//// OP_MAXIMUM, +//// OP_MEAN, +//// OP_MINIMUM, +//// OP_MVN, +//// OP_NOOP, +//// OP_NORMALIZE, +// +//// OP_PAD, +//// OP_PERMUTE, +// OP_POOL, +//// OP_PRELU, +//// OP_PRIORBOX, +//// OP_PSROIPOOLING, +//// OP_REDUCEL2, +//// OP_REDUCTION, +//// OP_REGION, +// OP_RELU, +// +//// OP_RELU6, +//// OP_REORG, +//// OP_RESHAPE, +//// OP_RESIZE, +//// OP_REVERSE, +//// OP_RNN, +//// OP_ROIALIGN, +//// OP_ROIPOOLING, +//// OP_ROUND, +//// OP_RPN, +//// OP_SCALE, +//// OP_SELU, +//// OP_SHUFFLECHANNEL, +//// OP_SIGMOID, +// +//// OP_SLICE, +//// OP_SOFTMAX, +//// OP_SPACETOBATCHND, +//// OP_SPACETODEPTH, +//// OP_SPARSETODENSE, +//// OP_SPLIT, +//// OP_SQUAREDDIFFERENCE, +//// OP_SQUEEZE, +//// OP_STRIDED_SLICE, +//// OP_SWAP_AXIS, +//// OP_TANH, +//// OP_THRESHOLD, +//// OP_TOPKV2, +//// OP_TRANSPOSE, +//// OP_UNARY, +//// OP_UNSQUEEZE, +//// OP_UPSAMPLE, +//// OP_ZEROSLIKE, +//// OP_MISH, +//// OP_LOGSOFTMAX, +//// OP_RELU1, +//// OP_L2NORMALIZATION, +//// OP_L2POOL, +//// OP_TILE, +//// OP_SHAPE, +//// OP_SCATTER, +//// OP_WHERE, +//// OP_BUILTIN_LAST + + +}; diff --git a/source/device/vulkan/vulkan_option.cpp b/source/device/vulkan/vulkan_option.cpp new file mode 100644 index 000000000..d57440411 --- /dev/null +++ b/source/device/vulkan/vulkan_option.cpp @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. 
+ * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#include "vulkan_option.hpp" + +namespace TEngine { + +Option::Option() +{ + lightmode = true; + num_threads = 1; + elempack = 1; + blob_allocator = 0; + workspace_allocator = 0; + + blob_vkallocator = 0; + workspace_vkallocator = 0; + staging_vkallocator = 0; + + use_winograd_convolution = true; + use_sgemm_convolution = true; + use_int8_inference = true; + use_vulkan_compute = true; + + use_fp16_packed = true; + use_fp16_storage = true; + use_fp16_arithmetic = false; + use_int8_storage = false; + use_int8_arithmetic = false; + + use_packing_layout = true; + use_shader_pack8 = false; + use_image_storage = false; + use_bf16_storage = false; +} + +} // namespace TEngine diff --git a/source/device/vulkan/vulkan_option.hpp b/source/device/vulkan/vulkan_option.hpp new file mode 100644 index 000000000..ee026e1a2 --- /dev/null +++ b/source/device/vulkan/vulkan_option.hpp @@ -0,0 +1,128 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. 
You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#ifndef VULKAN_OPTION_HPP +#define VULKAN_OPTION_HPP + +namespace TEngine { + +class VkAllocator; + +class Allocator; +class Option +{ +public: + // default option + Option(); + +public: + // light mode + // intermediate blob will be recycled when enabled + // enabled by default + bool lightmode; + + // thread count + // default value is the one returned by get_cpu_count() + int num_threads; + + // Pack Layout 1/4/8 + int elempack; + + // blob memory allocator + Allocator* blob_allocator; + + // workspace memory allocator + Allocator* workspace_allocator; + + // blob memory allocator + VkAllocator* blob_vkallocator; + + // workspace memory allocator + VkAllocator* workspace_vkallocator; + + // staging memory allocator + VkAllocator* staging_vkallocator; + + // enable winograd convolution optimization + // improve convolution 3x3 stride1 performace, may consume more memory + // changes should be applied before loading network structure and weight + // enabled by default + bool use_winograd_convolution; + + // enable sgemm convolution optimization + // improve convolution 1x1 stride1 performace, may consume more memory + // changes should be applied before loading network structure and weight + // enabled by default + bool use_sgemm_convolution; + + // enable quantized int8 inference + // use low-precision int8 path for quantized model + // changes should be applied before loading network structure and weight + // enabled by default + bool use_int8_inference; + + // enable vulkan compute + bool use_vulkan_compute; + + // enable options for gpu inference + bool use_fp16_packed; + bool use_fp16_storage; + bool use_fp16_arithmetic; + bool use_int8_storage; + bool use_int8_arithmetic; + + // enable simd-friendly packed memory layout + // improve all operator performace on all arm devices, will consume more memory + // changes should be applied before loading network structure and weight + // enabled by default + bool use_packing_layout; + + bool use_shader_pack8; + + // turn on for adreno + bool use_image_storage; + + // enable bf16 data type for storage + // improve most operator performace on all arm devices, may consume more memory + bool use_bf16_storage; +}; + +} // namespace TEngine + +#endif // VULKAN_OPTION_HPP diff --git a/source/device/vulkan/vulkan_pipeline.cpp b/source/device/vulkan/vulkan_pipeline.cpp new file mode 100644 index 000000000..6935c76b5 --- /dev/null +++ b/source/device/vulkan/vulkan_pipeline.cpp @@ -0,0 +1,568 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#include "vulkan_pipeline.hpp" +#include "vulkan_gpu.hpp" + +#include "stdio.h" +#include +#include + +namespace TEngine { + +Pipeline::Pipeline(const GPUDevice* _vkdev) : vkdev(_vkdev) +{ + local_shader_module = 0; + + descriptorset_layout = 0; + pipeline_layout = 0; + pipeline = 0; + descriptor_update_template = 0; + + local_size_x = 1; + local_size_y = 1; + local_size_z = 1; +} + +Pipeline::~Pipeline() +{ + destroy(); +} + +int Pipeline::create(const uint32_t* spv_data, size_t spv_data_size, const std::vector& specializations) +{ + ShaderInfo si; + int ret = resolve_shader_info(spv_data, spv_data_size, si); + if (ret != 0) + { + printf("resolve_shader_info failed %d", ret); + return -1; + } + + // -3 for local_size_xyz + int specialization_count_expected = si.specialization_count - 3; + if ((int)specializations.size() != specialization_count_expected) + { + printf("pipeline specialization count mismatch, expect %d but got %d", specialization_count_expected, (int)specializations.size()); + return -1; + } + + if (vkdev->info.bug_local_size_spec_const) + { + local_shader_module = vkdev->compile_shader_module(spv_data, spv_data_size, local_size_x, local_size_y, local_size_z); + } + else + { + local_shader_module = vkdev->compile_shader_module(spv_data, spv_data_size); + } + +// TLOG_INFO("local_shader_module %p created", local_shader_module); + + return create(local_shader_module, si, specializations); +} + +int Pipeline::create(int shader_type_index, const Option& opt, const std::vector& specializations) +{ + // printf("run pipeline create, shader_type_index:%d, specialization size:%d\n", shader_type_index, specializations.size()); + // ncnn_add_shader cmake macro + // 0 = fp32 + // 1 = fp16p + // 2 = fp16pa + // 3 = fp16s + // 4 = fp16sa + // 5 = image + // 6 = image_fp16p + // 7 = image_fp16pa + // 8 = image_fp16s + // 9 = image_fp16sa + + if (opt.use_image_storage && vkdev->info.support_fp16_storage && opt.use_fp16_storage && vkdev->info.support_fp16_arithmetic && opt.use_fp16_arithmetic) + { + shader_type_index += 9; + } + else if (opt.use_image_storage && vkdev->info.support_fp16_packed && opt.use_fp16_packed && vkdev->info.support_fp16_arithmetic && opt.use_fp16_arithmetic) + { + shader_type_index += 7; + } + else if (opt.use_image_storage && vkdev->info.support_fp16_storage && opt.use_fp16_storage) + { + shader_type_index += 8; + } + else if (opt.use_image_storage && vkdev->info.support_fp16_packed && opt.use_fp16_packed) + { + shader_type_index += 6; + } + else if (opt.use_image_storage) + { + shader_type_index += 5; + } + else if (vkdev->info.support_fp16_storage && opt.use_fp16_storage && vkdev->info.support_fp16_arithmetic && opt.use_fp16_arithmetic) + { + shader_type_index += 4; + } + else if (vkdev->info.support_fp16_packed && opt.use_fp16_packed && vkdev->info.support_fp16_arithmetic && opt.use_fp16_arithmetic) + { + shader_type_index += 2; + } + else if 
(vkdev->info.support_fp16_storage && opt.use_fp16_storage) + { + shader_type_index += 3; + } + else if (vkdev->info.support_fp16_packed && opt.use_fp16_packed) + { + shader_type_index += 1; + } + + const ShaderInfo& si = get_shader_info(shader_type_index); + + // -3 for local_size_xyz + int specialization_count_expected = si.specialization_count - 3; + // int specialization_count_expected = si.specialization_count; + if ((int)specializations.size() != specialization_count_expected) + { + printf("pipeline %d specialization count mismatch, expect %d but got %d\n", shader_type_index, specialization_count_expected, (int)specializations.size()); + return -1; + } + + if (vkdev->info.bug_local_size_spec_const) + { + local_shader_module = vkdev->create_shader_module(shader_type_index, local_size_x, local_size_y, local_size_z); + + return create(local_shader_module, si, specializations); + } + + VkShaderModule shader_module = vkdev->get_shader_module(shader_type_index); + + return create(shader_module, si, specializations); +} + +int Pipeline::create(VkShaderModule shader_module, const ShaderInfo& _shader_info, const std::vector& specializations) +{ + shader_info = _shader_info; + + create_descriptorset_layout(); + + create_pipeline_layout(); + + create_pipeline(shader_module, specializations); + + if (vkdev->info.support_VK_KHR_descriptor_update_template) + { + create_descriptor_update_template(); + } + + return 0; +} + +void Pipeline::destroy() +{ + if (vkdev->info.support_VK_KHR_descriptor_update_template) + { + if (descriptor_update_template) + { + vkdev->vkDestroyDescriptorUpdateTemplateKHR(vkdev->vkdevice(), descriptor_update_template, 0); + descriptor_update_template = 0; + } + } + + if (pipeline) + { + vkDestroyPipeline(vkdev->vkdevice(), pipeline, 0); + pipeline = 0; + } + + if (pipeline_layout) + { + vkDestroyPipelineLayout(vkdev->vkdevice(), pipeline_layout, 0); + pipeline_layout = 0; + } + + if (descriptorset_layout) + { + vkDestroyDescriptorSetLayout(vkdev->vkdevice(), descriptorset_layout, 0); + descriptorset_layout = 0; + } + + if (local_shader_module) + { + vkDestroyShaderModule(vkdev->vkdevice(), local_shader_module, 0); + local_shader_module = 0; + } +} + +void Pipeline::set_optimal_local_size_xyz(int w, int h, int c) +{ + set_optimal_local_size_xyz(Tensor(w, h, c, (void*)0)); +} + +void Pipeline::set_optimal_local_size_xyz(const VkTensor& local_size_xyz) +{ + int w = local_size_xyz.w; + int h = local_size_xyz.h; + int c = local_size_xyz.c; + + if (w == 0 && h == 0 && c == 0) + { + // fallback to the common and safe 4x4x4 + w = 4; + h = 4; + c = 4; + } + + w = std::min(w, (int)vkdev->info.max_workgroup_size[0]); + h = std::min(h, (int)vkdev->info.max_workgroup_size[1]); + c = std::min(c, (int)vkdev->info.max_workgroup_size[2]); + + if (w * h * c <= (int)vkdev->info.max_workgroup_invocations) + { + return set_local_size_xyz(w, h, c); + } + + int max_local_size_xy = (int)vkdev->info.max_workgroup_invocations / c; + + int wh_max = std::max(1, (int)sqrt(max_local_size_xy)); + while (w * h >= wh_max) + { + w = std::max(1, w / 2); + h = std::max(1, h / 2); + } + + set_local_size_xyz(w, h, c); +} + +void Pipeline::set_optimal_local_size_xyz(const Tensor& local_size_xyz) +{ + int w = local_size_xyz.w; + int h = local_size_xyz.h; + int c = local_size_xyz.c; + + if (w == 0 && h == 0 && c == 0) + { + // fallback to the common and safe 4x4x4 + w = 4; + h = 4; + c = 4; + } + + w = std::min(w, (int)vkdev->info.max_workgroup_size[0]); + h = std::min(h, (int)vkdev->info.max_workgroup_size[1]); + 
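// c is clamped to the per-dimension device limit next; if w*h*c still exceeds
+ // max_workgroup_invocations, w and h are halved below until the product fits. +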
c = std::min(c, (int)vkdev->info.max_workgroup_size[2]); + + if (w * h * c <= (int)vkdev->info.max_workgroup_invocations) + { + return set_local_size_xyz(w, h, c); + } + + int max_local_size_xy = (int)vkdev->info.max_workgroup_invocations / c; + + int wh_max = std::max(1, (int)sqrt(max_local_size_xy)); + while (w * h >= wh_max) + { + w = std::max(1, w / 2); + h = std::max(1, h / 2); + } + + set_local_size_xyz(w, h, c); +} + +void Pipeline::set_local_size_xyz(int w, int h, int c) +{ + local_size_x = w; + local_size_y = h; + local_size_z = c; + +// TLOG_INFO("local size = %d %d %d", local_size_x, local_size_y, local_size_z); +} + +int Pipeline::create_descriptorset_layout() +{ + const int binding_count = shader_info.binding_count; + + if (binding_count == 0) + { + descriptorset_layout = 0; + return 0; + } + + std::vector descriptorSetLayoutBindings(binding_count); + for (int i=0; iimmutable_texelfetch_sampler();// we always use texelfetch + // } + } + + VkDescriptorSetLayoutCreateInfo descriptorSetLayoutCreateInfo; + descriptorSetLayoutCreateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; + descriptorSetLayoutCreateInfo.pNext = 0; + descriptorSetLayoutCreateInfo.flags = 0; + descriptorSetLayoutCreateInfo.bindingCount = binding_count; + descriptorSetLayoutCreateInfo.pBindings = descriptorSetLayoutBindings.data(); + + if (vkdev->info.support_VK_KHR_push_descriptor) + { + descriptorSetLayoutCreateInfo.flags |= VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR; + } + + VkResult ret = vkCreateDescriptorSetLayout(vkdev->vkdevice(), &descriptorSetLayoutCreateInfo, 0, &descriptorset_layout); + if (ret != VK_SUCCESS) + { + printf("vkCreateDescriptorSetLayout failed %d", ret); + return -1; + } + + return 0; +} + +int Pipeline::create_pipeline_layout() +{ + const int push_constant_count = shader_info.push_constant_count; + + VkPushConstantRange pushConstantRange; + pushConstantRange.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + pushConstantRange.offset = 0; + pushConstantRange.size = sizeof(vk_constant_type) * push_constant_count; + + VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo; + pipelineLayoutCreateInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; + pipelineLayoutCreateInfo.pNext = 0; + pipelineLayoutCreateInfo.flags = 0; + + if (descriptorset_layout) + { + pipelineLayoutCreateInfo.setLayoutCount = 1; + pipelineLayoutCreateInfo.pSetLayouts = &descriptorset_layout; + } + else + { + pipelineLayoutCreateInfo.setLayoutCount = 0; + pipelineLayoutCreateInfo.pSetLayouts = 0; + } + + if (push_constant_count > 0) + { + pipelineLayoutCreateInfo.pushConstantRangeCount = 1; + pipelineLayoutCreateInfo.pPushConstantRanges = &pushConstantRange; + } + else + { + pipelineLayoutCreateInfo.pushConstantRangeCount = 0; + pipelineLayoutCreateInfo.pPushConstantRanges = 0; + } + + VkResult ret = vkCreatePipelineLayout(vkdev->vkdevice(), &pipelineLayoutCreateInfo, 0, &pipeline_layout); + if (ret != VK_SUCCESS) + { + printf("vkCreatePipelineLayout failed %d", ret); + return -1; + } + + return 0; +} + + +int Pipeline::create_pipeline(VkShaderModule shader_module, const std::vector& specializations) +{ + const int specialization_count = specializations.size(); + + // +3 for local_size_xyz + std::vector specializationMapEntries; + specializationMapEntries.resize(specialization_count + 3); + + for (int i=0; i specialization_data = specializations; + + // append local_size_xyz specialization + if (!vkdev->info.bug_local_size_spec_const) + { + VkSpecializationMapEntry* local_size_xyz_entries 
= specializationMapEntries.data() + specialization_count; + + local_size_xyz_entries[0].constantID = 233; + local_size_xyz_entries[0].offset = (specialization_count+0) * sizeof(vk_specialization_type); + local_size_xyz_entries[0].size = sizeof(vk_specialization_type); + + local_size_xyz_entries[1].constantID = 234; + local_size_xyz_entries[1].offset = (specialization_count+1) * sizeof(vk_specialization_type); + local_size_xyz_entries[1].size = sizeof(vk_specialization_type); + + local_size_xyz_entries[2].constantID = 235; + local_size_xyz_entries[2].offset = (specialization_count+2) * sizeof(vk_specialization_type); + local_size_xyz_entries[2].size = sizeof(vk_specialization_type); + + specialization_data.resize(specialization_count + 3); + specialization_data[ specialization_count+0 ].u32 = local_size_x; + specialization_data[ specialization_count+1 ].u32 = local_size_y; + specialization_data[ specialization_count+2 ].u32 = local_size_z; + } + + VkSpecializationInfo specializationInfo; + specializationInfo.mapEntryCount = specializationMapEntries.size(); + specializationInfo.pMapEntries = specializationMapEntries.data(); + specializationInfo.dataSize = specialization_data.size() * sizeof(vk_specialization_type); + specializationInfo.pData = specialization_data.data(); + + VkPipelineShaderStageCreateInfo pipelineShaderStageCreateInfo; + pipelineShaderStageCreateInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; + pipelineShaderStageCreateInfo.pNext = 0; + pipelineShaderStageCreateInfo.flags = 0; + pipelineShaderStageCreateInfo.stage = VK_SHADER_STAGE_COMPUTE_BIT; + pipelineShaderStageCreateInfo.module = shader_module; + pipelineShaderStageCreateInfo.pName = "main"; + pipelineShaderStageCreateInfo.pSpecializationInfo = &specializationInfo; + + VkComputePipelineCreateInfo computePipelineCreateInfo; + computePipelineCreateInfo.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO; + computePipelineCreateInfo.pNext = 0; + computePipelineCreateInfo.flags = 0; + computePipelineCreateInfo.stage = pipelineShaderStageCreateInfo; + computePipelineCreateInfo.layout = pipeline_layout; + computePipelineCreateInfo.basePipelineHandle = 0; + computePipelineCreateInfo.basePipelineIndex = 0; + + VkResult ret = vkCreateComputePipelines(vkdev->vkdevice(), 0, 1, &computePipelineCreateInfo, 0, &pipeline); + if (ret != VK_SUCCESS) + { + printf("vkCreateComputePipelines failed %d", ret); + return -1; + } + + return 0; +} + +int Pipeline::create_descriptor_update_template() +{ + const int binding_count = shader_info.binding_count; + + if (binding_count == 0) + { + descriptor_update_template = 0; + return 0; + } + + std::vector descriptorUpdateTemplateEntries(binding_count); + size_t offset = 0; + for (int i=0; iinfo.support_VK_KHR_push_descriptor) + { + descriptorUpdateTemplateCreateInfo.templateType = VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_PUSH_DESCRIPTORS_KHR; + } + else + { + descriptorUpdateTemplateCreateInfo.templateType = VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_DESCRIPTOR_SET_KHR; + } + // descriptorSetLayout should be ignored if VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_PUSH_DESCRIPTORS_KHR + // FIXME HACK WARNING TODO NOTE but crash on radv if set NULL :( + descriptorUpdateTemplateCreateInfo.descriptorSetLayout = descriptorset_layout; + descriptorUpdateTemplateCreateInfo.pipelineBindPoint = VK_PIPELINE_BIND_POINT_COMPUTE; + descriptorUpdateTemplateCreateInfo.pipelineLayout = pipeline_layout; + descriptorUpdateTemplateCreateInfo.set = 0; + + VkResult ret = 
vkdev->vkCreateDescriptorUpdateTemplateKHR(vkdev->vkdevice(), &descriptorUpdateTemplateCreateInfo, 0, &descriptor_update_template); + if (ret != VK_SUCCESS) + { + printf("vkCreateDescriptorUpdateTemplateKHR failed %d", ret); + return -1; + } + + return 0; +} + + +} // namespace TEngine diff --git a/source/device/vulkan/vulkan_pipeline.hpp b/source/device/vulkan/vulkan_pipeline.hpp new file mode 100644 index 000000000..9980d2e43 --- /dev/null +++ b/source/device/vulkan/vulkan_pipeline.hpp @@ -0,0 +1,130 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#ifndef VULKAN_PIPELINE_HPP +#define VULKAN_PIPELINE_HPP + +#include +#include "vulkan_gpu.hpp" +#include "vulkan_tensor.hpp" +#include "vulkan_platform.hpp" +#include "vulkan_option.hpp" + +namespace TEngine { + +class Option; +class Pipeline +{ +public: + Pipeline(const GPUDevice* vkdev); + virtual ~Pipeline(); + +public: + void set_optimal_local_size_xyz(int w = 4, int h = 4, int c = 4); + + void set_optimal_local_size_xyz(const VkTensor& local_size_xyz); + void set_optimal_local_size_xyz(const Tensor& local_size_xyz); + void set_local_size_xyz(int w, int h, int c); + + int create(const uint32_t* spv_data, size_t spv_data_size, const std::vector& specializations); + + int create(int shader_type_index, const Option& opt, const std::vector& specializations); + + int create(VkShaderModule shader_module, const ShaderInfo& si, const std::vector& specializations); + + void destroy(); + +protected: + int create_descriptorset_layout(); + int create_pipeline_layout(); + int create_pipeline(VkShaderModule shader_module, const std::vector& specializations); + int create_descriptor_update_template(); + +public: + const GPUDevice* vkdev; + + // local shader module + VkShaderModule local_shader_module; + + VkDescriptorSetLayout descriptorset_layout; + VkPipelineLayout pipeline_layout; + + // op forward TODO use pipeline cache ? 
+ VkPipeline pipeline; + + VkDescriptorUpdateTemplateKHR descriptor_update_template; + + ShaderInfo shader_info; + + uint32_t local_size_x; + uint32_t local_size_y; + uint32_t local_size_z; +}; + +#if __ANDROID_API__ >= 26 +class VkCompute; +class ImportAndroidHardwareBufferPipeline : private Pipeline +{ +public: + ImportAndroidHardwareBufferPipeline(const GPUDevice* vkdev); + ~ImportAndroidHardwareBufferPipeline(); + + int create(VkAndroidHardwareBufferImageAllocator* ahb_im_allocator, int type_to, int rotate_from, const Option& opt); + int create(VkAndroidHardwareBufferImageAllocator* ahb_im_allocator, int type_to, int rotate_from, int target_width, int target_height, const Option& opt); + void destroy(); + + friend class VkCompute; + +protected: + int create_sampler(VkAndroidHardwareBufferImageAllocator* ahb_im_allocator); + int create_descriptorset_layout(); + int create_descriptor_update_template(); + +public: + int type_to; + int rotate_from; + bool need_resize; + + VkSampler sampler; +}; +#endif // __ANDROID_API__ >= 26 + +} // namespace TEngine + +#endif // VULKAN_PIPELINE_HPP diff --git a/source/device/vulkan/vulkan_platform.hpp b/source/device/vulkan/vulkan_platform.hpp new file mode 100644 index 000000000..cc03681a7 --- /dev/null +++ b/source/device/vulkan/vulkan_platform.hpp @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. 
You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#ifndef VULKAN_PLATFORM_HPP +#define VULKAN_PLATFORM_HPP + +#include + +namespace TEngine { + +class Mutex +{ +public: + Mutex() { pthread_mutex_init(&mutex, 0); } + ~Mutex() { pthread_mutex_destroy(&mutex); } + void lock() { pthread_mutex_lock(&mutex); } + void unlock() { pthread_mutex_unlock(&mutex); } +private: + friend class ConditionVariable; + pthread_mutex_t mutex; +}; + +class MutexLockGuard +{ +public: + MutexLockGuard(Mutex& _mutex) : mutex(_mutex) { mutex.lock(); } + ~MutexLockGuard() { mutex.unlock(); } +private: + Mutex& mutex; +}; + +class ConditionVariable +{ +public: + ConditionVariable() { pthread_cond_init(&cond, 0); } + ~ConditionVariable() { pthread_cond_destroy(&cond); } + void wait(Mutex& mutex) { pthread_cond_wait(&cond, &mutex.mutex); } + void broadcast() { pthread_cond_broadcast(&cond); } + void signal() { pthread_cond_signal(&cond); } +private: + pthread_cond_t cond; +}; + +class Thread +{ +public: + Thread(void* (*start)(void*), void* args = 0) { pthread_create(&t, 0, start, args); } + ~Thread() {} + void join() { pthread_join(t, 0); } +private: + pthread_t t; +}; + +} // namespace TEngine + +#endif \ No newline at end of file diff --git a/source/device/vulkan/vulkan_tensor.cpp b/source/device/vulkan/vulkan_tensor.cpp new file mode 100644 index 000000000..38f588502 --- /dev/null +++ b/source/device/vulkan/vulkan_tensor.cpp @@ -0,0 +1,374 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. 
You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#include "vulkan_tensor.hpp" + +namespace TEngine { + +void convert_packing(tensor* src, Tensor& dst, int elempack, const Option& opt) +{ + const Tensor _src = Tensor(src); + // printf("convert packing ir_tensor to Tensor : %d %d %d %d %d\n", _src.c, _src.h, _src.w, _src.elempack, _src.elemsize); +} + +void convert_packing(const Tensor& src, Tensor& dst, int _elempack, const Option& opt) +{ + int elempack = src.elempack; + int out_elempack = _elempack; + + if (elempack == out_elempack) + { + dst = src; + return; + } + + int w = src.w; + int h = src.h; + int channels = src.c; + int dims = src.dims; + size_t elemsize = src.elemsize; + + if (dims == 1) + { + if (out_elempack == 1) + { + dst = src; + dst.w = w * elempack; + dst.cstep = w * elempack; + dst.elemsize = elemsize / elempack; + dst.elempack = out_elempack; + return; + } + + int outw = (w * elempack + out_elempack - 1) / out_elempack; + size_t out_elemsize = elemsize / elempack * out_elempack; + + dst.create(outw, out_elemsize, out_elempack, opt.blob_allocator); + if (dst.empty()) + return; + + memcpy(dst.data, src.data, w * elemsize); + + return; + } + + if (dims == 2) + { + int outh = (h * elempack + out_elempack - 1) / out_elempack; + size_t out_elemsize = elemsize / elempack * out_elempack; + size_t lane_size = out_elemsize / out_elempack; + + dst.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator); + if (dst.empty()) + return; + + #pragma omp parallel for + for (int i = 0; i < outh; i++) + { + unsigned char* outptr = (unsigned char*)dst + i * w * out_elemsize; + + for (int j = 0; j < w; j++) + { + unsigned char* out_elem_ptr = outptr + j * out_elemsize; + + for (int k = 0; k < out_elempack; k++) + { + int srcy = (i * out_elempack + k) / elempack; + if (srcy >= h) + break; + + int srck = (i * out_elempack + k) % elempack; + + const unsigned char* ptr = (const unsigned char*)src + srcy * w * elemsize; + const unsigned char* elem_ptr = ptr + j * elemsize; + memcpy(out_elem_ptr + k * lane_size, elem_ptr + srck * lane_size, lane_size); + } + } + } + + return; + } + + if (dims == 3) + { + int outc = (channels * elempack + out_elempack - 1) / out_elempack; + size_t out_elemsize = elemsize / elempack * out_elempack; + size_t lane_size = out_elemsize / out_elempack; + + dst.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator); + if (dst.empty()) + return; + + #pragma omp parallel for + for (int q = 0; q < outc; q++) + { + Tensor out = dst.channel(q); + + for (int i = 0; i < h; i++) + { + unsigned char* outptr = (unsigned char*)out + i * w * out_elemsize; + + for (int j = 0; j < w; j++) + { + unsigned char* out_elem_ptr = outptr + j * out_elemsize; + + for (int k = 0; k < out_elempack; k++) + { + int srcq = (q * out_elempack + k) / elempack; + if (srcq >= channels) + break; + + int srck = (q * out_elempack + k) % elempack; + + const Tensor m = src.channel(srcq); + const unsigned char* ptr = (const unsigned char*)m + i * w * elemsize; + const unsigned char* elem_ptr = ptr + j * elemsize; + memcpy(out_elem_ptr + k * lane_size, elem_ptr + srck * lane_size, lane_size); + } + } + } + } + + return; + } +} + +unsigned short float32_to_float16(float value) +{ + // 1 : 8 : 23 + union + { + unsigned int u; + float f; + } tmp; + + tmp.f = value; + + // 1 : 8 : 23 + unsigned short sign = (tmp.u & 0x80000000) >> 31; + unsigned short exponent = 
(tmp.u & 0x7F800000) >> 23; + unsigned int significand = tmp.u & 0x7FFFFF; + + // TLOG_INFO("%d %d %d", sign, exponent, significand); + + // 1 : 5 : 10 + unsigned short fp16; + if (exponent == 0) + { + // zero or denormal, always underflow + fp16 = (sign << 15) | (0x00 << 10) | 0x00; + } + else if (exponent == 0xFF) + { + // infinity or NaN + fp16 = (sign << 15) | (0x1F << 10) | (significand ? 0x200 : 0x00); + } + else + { + // normalized + short newexp = exponent + (-127 + 15); + if (newexp >= 31) + { + // overflow, return infinity + fp16 = (sign << 15) | (0x1F << 10) | 0x00; + } + else if (newexp <= 0) + { + // underflow + if (newexp >= -10) + { + // denormal half-precision + unsigned short sig = (significand | 0x800000) >> (14 - newexp); + fp16 = (sign << 15) | (0x00 << 10) | sig; + } + else + { + // underflow + fp16 = (sign << 15) | (0x00 << 10) | 0x00; + } + } + else + { + fp16 = (sign << 15) | (newexp << 10) | (significand >> 13); + } + } + + return fp16; +} + +float float16_to_float32(unsigned short value) +{ + // 1 : 5 : 10 + unsigned short sign = (value & 0x8000) >> 15; + unsigned short exponent = (value & 0x7c00) >> 10; + unsigned short significand = value & 0x03FF; + + // TLOG_INFO("%d %d %d", sign, exponent, significand); + + // 1 : 8 : 23 + union + { + unsigned int u; + float f; + } tmp; + if (exponent == 0) + { + if (significand == 0) + { + // zero + tmp.u = (sign << 31); + } + else + { + // denormal + exponent = 0; + // find non-zero bit + while ((significand & 0x200) == 0) + { + significand <<= 1; + exponent++; + } + significand <<= 1; + significand &= 0x3FF; + tmp.u = (sign << 31) | ((-exponent + (-15 + 127)) << 23) | (significand << 13); + } + } + else if (exponent == 0x1F) + { + // infinity or NaN + tmp.u = (sign << 31) | (0xFF << 23) | (significand << 13); + } + else + { + // normalized + tmp.u = (sign << 31) | ((exponent + (-15 + 127)) << 23) | (significand << 13); + } + + return tmp.f; +} + +void cast_float32_to_float16(const Tensor& src, Tensor& dst, const Option& opt) +{ + // printf("function cast_float32_to_float16 not done, fix me\n!!!!!"); + + int w = src.w; + int h = src.h; + int channels = src.c; + int dims = src.dims; + size_t elemsize = src.elemsize; + int elempack = src.elempack; + + size_t out_elemsize = 2 * elempack; + + if (dims == 1) + { + dst.create(w, out_elemsize, elempack, opt.blob_allocator); + } + else if (dims == 2) + { + dst.create(w, h, out_elemsize, elempack, opt.blob_allocator); + } + else if (dims == 3) + { + dst.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator); + } + if (dst.empty()) + return ; + + int size = w * h * elempack; + + #pragma omp parallel for + for (int q = 0; q < channels; q++) + { + const float* ptr = src.channel(q); + unsigned short* outptr = dst.channel(q); + + for (int i = 0; i < size; i++) + { + outptr[i] = float32_to_float16(ptr[i]); + } + } + +} + +void cast_float16_to_float32(const Tensor& src, Tensor& dst, const Option& opt) +{ + // printf("function cast_float16_to_float32 not done, fix me\n!!!!!"); + + int w = src.w; + int h = src.h; + int channels = src.c; + int dims = src.dims; + size_t elemsize = src.elemsize; + int elempack = src.elempack; + + size_t out_elemsize = 4 * elempack; + + if (dims == 1) + { + dst.create(w, out_elemsize, elempack, opt.blob_allocator); + } + else if (dims == 2) + { + dst.create(w, h, out_elemsize, elempack, opt.blob_allocator); + } + else if (dims == 3) + { + dst.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator); + } + if (dst.empty()) + return ; + + int 
size = w * h * elempack; + + #pragma omp parallel for + for (int q = 0; q < channels; q++) + { + const unsigned short* ptr = src.channel(q); + float* outptr = dst.channel(q); + + for (int i = 0; i < size; i++) + { + outptr[i] = float16_to_float32(ptr[i]); + } + } + +} + +} // namespace TEngine diff --git a/source/device/vulkan/vulkan_tensor.hpp b/source/device/vulkan/vulkan_tensor.hpp new file mode 100644 index 000000000..a0ef5a9bd --- /dev/null +++ b/source/device/vulkan/vulkan_tensor.hpp @@ -0,0 +1,1817 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#ifndef VULKAN_TENSOR_HPP +#define VULKAN_TENSOR_HPP + +#include +#include +// #include "tengine_ir.h" + +extern "C" +{ +#include "graph/tensor.h" +#include "graph/node.h" +#include "graph/graph.h" +#include "graph/subgraph.h" +} + +#include +#include "vulkan_allocator.hpp" +#include "vulkan_option.hpp" + +namespace TEngine { + +class VkTensor; +class VkImageTensor; + +class Tshape +{ +public: + Tshape() + { + w = 0; + h = 0; + c = 0; + dims = 0; + } + Tshape(int _w, int _h, int _c) + { + w = _w; + h = _h; + c = _c; + dims = 3; + } + + int dims; + int w; + int h; + int c; + + size_t cstep; +}; + +class Tensor +{ +public: + // empty + Tensor(); + // vec + Tensor(int w, size_t elemsize = 4u, Allocator* allocator = 0); + // image + Tensor(int w, int h, size_t elemsize = 4u, Allocator* allocator = 0); + // dim + Tensor(int w, int h, int c, size_t elemsize = 4u, Allocator* allocator = 0); + // packed vec + Tensor(int w, size_t elemsize, int elempack, Allocator* allocator = 0); + // packed image + Tensor(int w, int h, size_t elemsize, int elempack, Allocator* allocator = 0); + // packed dim + Tensor(int w, int h, int c, size_t elemsize, int elempack, Allocator* allocator = 0); + // copy + Tensor(const Tensor& m); + // copy from ir_tensor + Tensor(struct tensor* m); + // external vec + Tensor(int w, void* data, size_t elemsize = 4u, Allocator* allocator = 0); + // external image + Tensor(int w, int h, void* data, size_t elemsize = 4u, Allocator* allocator = 0); + // external dim + Tensor(int w, int h, int c, void* data, size_t elemsize = 4u, Allocator* allocator = 0); + // external 
packed vec + Tensor(int w, void* data, size_t elemsize, int elempack, Allocator* allocator = 0); + // external packed image + Tensor(int w, int h, void* data, size_t elemsize, int elempack, Allocator* allocator = 0); + // external packed dim + Tensor(int w, int h, int c, void* data, size_t elemsize, int elempack, Allocator* allocator = 0); + // release + ~Tensor(); + // assign + Tensor& operator=(const Tensor& m); + + // reshape vec + Tensor reshape(int w, Allocator* allocator = 0) const; + // reshape image + Tensor reshape(int w, int h, Allocator* allocator = 0) const; + // reshape dim + Tensor reshape(int w, int h, int c, Allocator* allocator = 0) const; + // allocate vec + void create(int w, size_t elemsize = 4u, Allocator* allocator = 0); + // allocate image + void create(int w, int h, size_t elemsize = 4u, Allocator* allocator = 0); + // allocate dim + void create(int w, int h, int c, size_t elemsize = 4u, Allocator* allocator = 0); + // allocate packed vec + void create(int w, size_t elemsize, int elempack, Allocator* allocator = 0); + // allocate packed image + void create(int w, int h, size_t elemsize, int elempack, Allocator* allocator = 0); + // allocate packed dim + void create(int w, int h, int c, size_t elemsize, int elempack, Allocator* allocator = 0); + // allocate like + void create_like(const tensor* m, Allocator* allocator = 0); + // allocate like + void create_like(const Tensor& m, Allocator* allocator = 0); + // allocate like + void create_like(const VkTensor& m, Allocator* allocator = 0); + // allocate like + void create_like(const VkImageTensor& im, Allocator* allocator = 0); + // refcount++ + void addref(); + // refcount-- + void release(); + + bool empty() const; + size_t total() const; + + // shape only + Tensor shape() const; + + // data reference + Tensor channel(int c); + const Tensor channel(int c) const; + float* row(int y); + const float* row(int y) const; + + // access raw data + template operator T*(); + template operator const T*() const; + + // pointer to the data + void* data; + + // pointer to the reference counter + // when points to user-allocated data, the pointer is NULL + int* refcount; + + // element size in bytes + // 4 = float32/int32 + // 2 = float16 + // 1 = int8/uint8 + // 0 = empty + size_t elemsize; + + // packed count inside element + // c/1-h-w-1 h/1-w-1 w/1-1 scalar + // c/4-h-w-4 h/4-w-4 w/4-4 sse/neon + // c/8-h-w-8 h/8-w-8 w/8-8 avx/fp16 + int elempack; + + // the allocator + Allocator* allocator; + + // the dimension rank + int dims; + + int w; + int h; + int c; + + size_t cstep; +}; + + + +class VkTensor +{ +public: + // empty + VkTensor(); + // vec + VkTensor(int w, size_t elemsize, VkAllocator* allocator); + // image + VkTensor(int w, int h, size_t elemsize, VkAllocator* allocator); + // dim + VkTensor(int w, int h, int c, size_t elemsize, VkAllocator* allocator); + // packed vec + VkTensor(int w, size_t elemsize, int elempack, VkAllocator* allocator); + // packed image + VkTensor(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator); + // packed dim + VkTensor(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator); + // copy + VkTensor(const VkTensor& m); + // external vec + VkTensor(int w, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator); + // external image + VkTensor(int w, int h, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator); + // external dim + VkTensor(int w, int h, int c, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator); + // external packed 
vec + VkTensor(int w, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator); + // external packed image + VkTensor(int w, int h, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator); + // external packed dim + VkTensor(int w, int h, int c, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator); + // release + ~VkTensor(); + // assign + VkTensor& operator=(const VkTensor& m); + // reshape vec + VkTensor reshape(int w, Allocator* allocator = 0) const; + // reshape image + VkTensor reshape(int w, int h, Allocator* allocator = 0) const; + // reshape dim + VkTensor reshape(int w, int h, int c, Allocator* allocator = 0) const; + // allocate vec + void create(int w, size_t elemsize, VkAllocator* allocator); + // allocate image + void create(int w, int h, size_t elemsize, VkAllocator* allocator); + // allocate dim + void create(int w, int h, int c, size_t elemsize, VkAllocator* allocator); + // allocate packed vec + void create(int w, size_t elemsize, int elempack, VkAllocator* allocator); + // allocate packed image + void create(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator); + // allocate packed dim + void create(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator); + // allocate like + void create_like(const Tensor& m, VkAllocator* allocator); + void create_like(const tensor* m, VkAllocator* allocator); + // allocate like + void create_like(const VkTensor& m, VkAllocator* allocator); + + // allocate vec + void create(struct tensor* tensor, VkAllocator* allocator); + + // staging buffer + void prepare_staging_buffer(); + void discard_staging_buffer(); + + // copy + // void upload(const Tensor& m); + // void download(Tensor& m) const; + + // mapped + void* mapped_ptr() const; + + // refcount++ + void addref(); + // refcount-- + void release(); + + bool empty() const; + size_t total() const; + + // shape only + // Mat shape() const; + + // low-level reference + VkBuffer buffer() const; + size_t buffer_offset() const; + size_t buffer_capacity() const; + + // device buffer + VkBufferMemory* data; + + // staging buffer + VkBufferMemory* staging_data; + + // pointer to the reference counter + // when points to user-allocated data, the pointer is NULL + int* refcount; + int* staging_refcount; + + // element size in bytes + // 4 = float32/int32 + // 2 = float16 + // 1 = int8/uint8 + // 0 = empty + size_t elemsize; + + // packed count inside element + // c/1-h-w-1 h/1-w-1 w/1-1 scalar + // c/4-h-w-4 h/4-w-4 w/4-4 sse/neon + // c/8-h-w-8 h/8-w-8 w/8-8 avx/fp16 + int elempack; + + // the allocator + VkAllocator* allocator; + VkAllocator* staging_allocator; + + // the dimension rank + int dims; + + int w; + int h; + int c; + + size_t cstep; +}; + +class VkImageTensor +{ +public: + // empty + VkImageTensor(); + // vec + VkImageTensor(int w, size_t elemsize, VkAllocator* allocator); + // image + VkImageTensor(int w, int h, size_t elemsize, VkAllocator* allocator); + // dim + VkImageTensor(int w, int h, int c, size_t elemsize, VkAllocator* allocator); + // packed vec + VkImageTensor(int w, size_t elemsize, int elempack, VkAllocator* allocator); + // packed image + VkImageTensor(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator); + // packed dim + VkImageTensor(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator); + // copy + VkImageTensor(const VkImageTensor& m); + // external vec + VkImageTensor(int w, VkImageMemory* data, size_t elemsize, VkAllocator* 
allocator); + // external image + VkImageTensor(int w, int h, VkImageMemory* data, size_t elemsize, VkAllocator* allocator); + // external dim + VkImageTensor(int w, int h, int c, VkImageMemory* data, size_t elemsize, VkAllocator* allocator); + // external packed vec + VkImageTensor(int w, VkImageMemory* data, size_t elemsize, int elempack, VkAllocator* allocator); + // external packed image + VkImageTensor(int w, int h, VkImageMemory* data, size_t elemsize, int elempack, VkAllocator* allocator); + // external packed dim + VkImageTensor(int w, int h, int c, VkImageMemory* data, size_t elemsize, int elempack, VkAllocator* allocator); + // release + ~VkImageTensor(); + // assign + VkImageTensor& operator=(const VkImageTensor& m); + // allocate vec + void create(int w, size_t elemsize, VkAllocator* allocator); + // allocate image + void create(int w, int h, size_t elemsize, VkAllocator* allocator); + // allocate dim + void create(int w, int h, int c, size_t elemsize, VkAllocator* allocator); + // allocate packed vec + void create(int w, size_t elemsize, int elempack, VkAllocator* allocator); + // allocate packed image + void create(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator); + // allocate packed dim + void create(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator); + // allocate like + void create_like(const tensor* m, VkAllocator* allocator); + // allocate like + void create_like(const VkTensor& m, VkAllocator* allocator); + // allocate like + void create_like(const VkImageTensor& im, VkAllocator* allocator); + + + // mapped + ///Mat mapped() const; + void* mapped_ptr() const; + + // refcount++ + void addref(); + // refcount-- + void release(); + + bool empty() const; + size_t total() const; + + // shape only + ///Mat shape() const; + + // low-level reference + VkImage image() const; + VkImageView imageview() const; + +#if __ANDROID_API__ >= 26 + // convenient construct from android hardware buffer + static VkImageMat from_android_hardware_buffer(VkAndroidHardwareBufferImageAllocator* allocator); +#endif // __ANDROID_API__ >= 26 + + // device image + VkImageMemory* data; + + // pointer to the reference counter + // when points to user-allocated data, the pointer is NULL + + int* refcount; + + // element size in bytes + // 4 = float32/int32 + // 2 = float16 + // 1 = int8/uint8 + // 0 = empty + size_t elemsize; + + // packed count inside element + // c/1-h-w-1 h/1-w-1 w/1-1 scalar + // c/4-h-w-4 h/4-w-4 w/4-4 sse/neon + // c/8-h-w-8 h/8-w-8 w/8-8 avx/fp16 + int elempack; + + // the allocator + VkAllocator* allocator; + + // the dimension rank + int dims; + + int w; + int h; + int c; +}; + +inline VkTensor::VkTensor() + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0) +{ +} + +inline VkTensor::VkTensor(int _w, size_t _elemsize, VkAllocator* _allocator) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0) +{ + create(_w, _elemsize, _allocator); +} + +inline VkTensor::VkTensor(int _w, int _h, size_t _elemsize, VkAllocator* _allocator) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0) +{ + create(_w, _h, _elemsize, _allocator); +} + +inline VkTensor::VkTensor(int _w, int _h, int _c, size_t _elemsize, VkAllocator* _allocator) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0) +{ + create(_w, _h, _c, _elemsize, _allocator); +} + +inline 
VkTensor::VkTensor(int _w, size_t _elemsize, int _elempack, VkAllocator* _allocator) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0) +{ + create(_w, _elemsize, _elempack, _allocator); +} + +inline VkTensor::VkTensor(int _w, int _h, size_t _elemsize, int _elempack, VkAllocator* _allocator) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0) +{ + create(_w, _h, _elemsize, _elempack, _allocator); +} + +inline VkTensor::VkTensor(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0) +{ + create(_w, _h, _c, _elemsize, _elempack, _allocator); +} + +inline VkTensor::VkTensor(const VkTensor& m) + : data(m.data), refcount(m.refcount), elemsize(m.elemsize), elempack(m.elempack), allocator(m.allocator), dims(m.dims), w(m.w), h(m.h), c(m.c) +{ + if (refcount) + TENGINE_XADD(refcount, 1); + + cstep = m.cstep; +} + +inline VkTensor::VkTensor(int _w, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator) + : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(1), w(_w), h(1), c(1) +{ + cstep = w; +} + +inline VkTensor::VkTensor(int _w, int _h, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator) + : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(2), w(_w), h(_h), c(1) +{ + cstep = w * h; +} + +inline VkTensor::VkTensor(int _w, int _h, int _c, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator) + : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(3), w(_w), h(_h), c(_c) +{ + cstep = alignSize(w * h * elemsize, 16) / elemsize; +} + +inline VkTensor::VkTensor(int _w, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator) + : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(1), w(_w), h(1), c(1) +{ + cstep = w; +} + +inline VkTensor::VkTensor(int _w, int _h, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator) + : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(2), w(_w), h(_h), c(1) +{ + cstep = w * h; +} + +inline VkTensor::VkTensor(int _w, int _h, int _c, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator) + : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(3), w(_w), h(_h), c(_c) +{ + cstep = alignSize(w * h * elemsize, 16) / elemsize; +} + +inline VkTensor::~VkTensor() +{ + release(); +} + +inline VkTensor& VkTensor::operator=(const VkTensor& m) +{ + if (this == &m) + return *this; + + if (m.refcount) + TENGINE_XADD(m.refcount, 1); + + release(); + + data = m.data; + refcount = m.refcount; + elemsize = m.elemsize; + elempack = m.elempack; + allocator = m.allocator; + + dims = m.dims; + w = m.w; + h = m.h; + c = m.c; + + cstep = m.cstep; + + return *this; +} + +inline void VkTensor::create(int _w, size_t _elemsize, VkAllocator* _allocator) +{ + if (dims == 1 && w == _w && elemsize == _elemsize && elempack == 1 && allocator == _allocator) + return; + + release(); + + elemsize = _elemsize; + elempack = 1; + allocator = _allocator; + + dims = 1; + w = _w; + h = 1; + c = 1; + + cstep = w; + + if (total() > 0) + { + size_t totalsize = alignSize(total() * elemsize, 4); + + data = 
allocator->fastMalloc(totalsize); + + refcount = (int*)((unsigned char*)data + offsetof(VkBufferMemory, refcount)); + *refcount = 1; + } +} + +inline void VkTensor::create(int _w, int _h, size_t _elemsize, VkAllocator* _allocator) +{ + if (dims == 2 && w == _w && h == _h && elemsize == _elemsize && elempack == 1 && allocator == _allocator) + return; + + release(); + + elemsize = _elemsize; + elempack = 1; + allocator = _allocator; + + dims = 2; + w = _w; + h = _h; + c = 1; + + cstep = w * h; + + if (total() > 0) + { + size_t totalsize = alignSize(total() * elemsize, 4); + + data = allocator->fastMalloc(totalsize); + + refcount = (int*)((unsigned char*)data + offsetof(VkBufferMemory, refcount)); + *refcount = 1; + } +} + +inline void VkTensor::create(int _w, int _h, int _c, size_t _elemsize, VkAllocator* _allocator) +{ + if (dims == 3 && w == _w && h == _h && c == _c && elemsize == _elemsize && elempack == 1 && allocator == _allocator) + return; + + release(); + + elemsize = _elemsize; + elempack = 1; + allocator = _allocator; + + dims = 3; + w = _w; + h = _h; + c = _c; + + cstep = alignSize(w * h * elemsize, 16) / elemsize; + + if (total() > 0) + { + size_t totalsize = alignSize(total() * elemsize, 4); + + data = allocator->fastMalloc(totalsize); + + refcount = (int*)((unsigned char*)data + offsetof(VkBufferMemory, refcount)); + *refcount = 1; + } +} + +inline void VkTensor::create(int _w, size_t _elemsize, int _elempack, VkAllocator* _allocator) +{ + if (dims == 1 && w == _w && elemsize == _elemsize && elempack == _elempack && allocator == _allocator) + return; + + release(); + + elemsize = _elemsize; + elempack = _elempack; + allocator = _allocator; + + dims = 1; + w = _w; + h = 1; + c = 1; + + cstep = w; + + if (total() > 0) + { + size_t totalsize = alignSize(total() * elemsize, 4); + + data = allocator->fastMalloc(totalsize); + + refcount = (int*)((unsigned char*)data + offsetof(VkBufferMemory, refcount)); + *refcount = 1; + } +} + +inline void VkTensor::create(int _w, int _h, size_t _elemsize, int _elempack, VkAllocator* _allocator) +{ + if (dims == 2 && w == _w && h == _h && elemsize == _elemsize && elempack == _elempack && allocator == _allocator) + return; + + release(); + + elemsize = _elemsize; + elempack = _elempack; + allocator = _allocator; + + dims = 2; + w = _w; + h = _h; + c = 1; + + cstep = w * h; + + if (total() > 0) + { + size_t totalsize = alignSize(total() * elemsize, 4); + + data = allocator->fastMalloc(totalsize); + + refcount = (int*)((unsigned char*)data + offsetof(VkBufferMemory, refcount)); + *refcount = 1; + } +} + +inline void VkTensor::create(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator) +{ + if (dims == 3 && w == _w && h == _h && c == _c && elemsize == _elemsize && elempack == _elempack && allocator == _allocator) + return; + + release(); + + elemsize = _elemsize; + elempack = _elempack; + allocator = _allocator; + + dims = 3; + w = _w; + h = _h; + c = _c; + + // cstep = alignSize(w * h * elemsize, 16) / elemsize; + cstep = w * h; + + if (total() > 0) + { + size_t totalsize = alignSize(total() * elemsize, 4); + + data = allocator->fastMalloc(totalsize); + + refcount = (int*)((unsigned char*)data + offsetof(VkBufferMemory, refcount)); + *refcount = 1; + } +} + +inline void VkTensor::create_like(const tensor* m, VkAllocator* _allocator) +{ + int _c = m->dims[1]; + int _h = m->dims[2]; + int _w = m->dims[3]; + size_t _elemsize = m->data_type == 0 ? 
4 : 1; + int _elempack = 1; + + if (_c == 0 && _h == 0 && _w != 0) + create(_w, _elemsize, _elempack, _allocator); + if (_c == 0 && _h != 0 && _w != 0) + create(_w, _h, _elemsize, _elempack, _allocator); + if (_c != 0 && _h != 0 && _w != 0) + create(_w, _h, _c, _elemsize, _elempack, _allocator); +} + +inline void VkTensor::create_like(const Tensor& m, VkAllocator* _allocator) +{ + int _dims = m.dims; + if (_dims == 1) + create(m.w, m.elemsize, m.elempack, _allocator); + if (_dims == 2) + create(m.w, m.h, m.elemsize, m.elempack, _allocator); + if (_dims == 3) + create(m.w, m.h, m.c, m.elemsize, m.elempack, _allocator); +} + +inline void VkTensor::create_like(const VkTensor& m, VkAllocator* _allocator) +{ + int _dims = m.dims; + if (_dims == 1) + create(m.w, m.elemsize, m.elempack, _allocator); + if (_dims == 2) + create(m.w, m.h, m.elemsize, m.elempack, _allocator); + if (_dims == 3) + create(m.w, m.h, m.c, m.elemsize, m.elempack, _allocator); +} + +inline void* VkTensor::mapped_ptr() const +{ + if (!allocator->mappable) + return 0; + + return (unsigned char*)data->mapped_ptr + data->offset; +} + +inline void VkTensor::addref() +{ + if (refcount) + TENGINE_XADD(refcount, 1); +} + +inline void VkTensor::release() +{ + if (refcount && TENGINE_XADD(refcount, -1) == 1) + { + if (allocator && data) + { + allocator->fastFree(data); + } + } + + data = 0; + + elemsize = 0; + elempack = 0; + + dims = 0; + w = 0; + h = 0; + c = 0; + + cstep = 0; + + refcount = 0; +} + +inline bool VkTensor::empty() const +{ + return data == 0 || total() == 0; +} + +inline size_t VkTensor::total() const +{ + return cstep * c; +} + +// TODO +// inline Mat VkTensor::shape() const +// { +// if (dims == 1) +// return Mat(w * elempack, (void*)0); +// if (dims == 2) +// return Mat(w, h * elempack, (void*)0); +// if (dims == 3) +// return Mat(w, h, c * elempack, (void*)0); + +// return Mat(); +// } + +inline VkBuffer VkTensor::buffer() const +{ + return data->buffer; +} + +inline size_t VkTensor::buffer_offset() const +{ + return data->offset; +} + +inline size_t VkTensor::buffer_capacity() const +{ + return data->capacity; +} + +// VkImageTensor +inline VkImageTensor::VkImageTensor() + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0) +{ +} + +inline VkImageTensor::VkImageTensor(int _w, size_t _elemsize, VkAllocator* _allocator) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0) +{ + create(_w, _elemsize, _allocator); +} + +inline VkImageTensor::VkImageTensor(int _w, int _h, size_t _elemsize, VkAllocator* _allocator) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0) +{ + create(_w, _h, _elemsize, _allocator); +} + +inline VkImageTensor::VkImageTensor(int _w, int _h, int _c, size_t _elemsize, VkAllocator* _allocator) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0) +{ + create(_w, _h, _c, _elemsize, _allocator); +} + +inline VkImageTensor::VkImageTensor(int _w, size_t _elemsize, int _elempack, VkAllocator* _allocator) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0) +{ + create(_w, _elemsize, _elempack, _allocator); +} + +inline VkImageTensor::VkImageTensor(int _w, int _h, size_t _elemsize, int _elempack, VkAllocator* _allocator) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0) +{ + create(_w, _h, _elemsize, _elempack, _allocator); +} + +inline VkImageTensor::VkImageTensor(int 
_w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0) +{ + create(_w, _h, _c, _elemsize, _elempack, _allocator); +} + +inline VkImageTensor::VkImageTensor(const VkImageTensor& m) + : data(m.data), refcount(m.refcount), elemsize(m.elemsize), elempack(m.elempack), allocator(m.allocator), dims(m.dims), w(m.w), h(m.h), c(m.c) +{ + if (refcount) + TENGINE_XADD(refcount, 1); +} + +inline VkImageTensor::VkImageTensor(int _w, VkImageMemory* _data, size_t _elemsize, VkAllocator* _allocator) + : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(1), w(_w), h(1), c(1) +{ +} + +inline VkImageTensor::VkImageTensor(int _w, int _h, VkImageMemory* _data, size_t _elemsize, VkAllocator* _allocator) + : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(2), w(_w), h(_h), c(1) +{ +} + +inline VkImageTensor::VkImageTensor(int _w, int _h, int _c, VkImageMemory* _data, size_t _elemsize, VkAllocator* _allocator) + : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(3), w(_w), h(_h), c(_c) +{ +} + +inline VkImageTensor::VkImageTensor(int _w, VkImageMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator) + : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(1), w(_w), h(1), c(1) +{ +} + +inline VkImageTensor::VkImageTensor(int _w, int _h, VkImageMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator) + : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(2), w(_w), h(_h), c(1) +{ +} + +inline VkImageTensor::VkImageTensor(int _w, int _h, int _c, VkImageMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator) + : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(3), w(_w), h(_h), c(_c) +{ +} + +inline VkImageTensor::~VkImageTensor() +{ + release(); +} + +inline VkImageTensor& VkImageTensor::operator=(const VkImageTensor& m) +{ + if (this == &m) + return *this; + + if (m.refcount) + TENGINE_XADD(m.refcount, 1); + + release(); + + data = m.data; + refcount = m.refcount; + elemsize = m.elemsize; + elempack = m.elempack; + allocator = m.allocator; + + dims = m.dims; + w = m.w; + h = m.h; + c = m.c; + + return *this; +} + +inline void VkImageTensor::create(int _w, size_t _elemsize, VkAllocator* _allocator) +{ + if (dims == 1 && w == _w && elemsize == _elemsize && elempack == 1 && allocator == _allocator) + return; + + release(); + + elemsize = _elemsize; + elempack = 1; + allocator = _allocator; + + dims = 1; + w = _w; + h = 1; + c = 1; + + if (total() > 0) + { + data = allocator->fastMalloc(dims, w, h, c, elemsize, elempack); + if (!data) + return; + + refcount = (int*)((unsigned char*)data + offsetof(VkImageMemory, refcount)); + *refcount = 1; + } +} + +inline void VkImageTensor::create(int _w, int _h, size_t _elemsize, VkAllocator* _allocator) +{ + if (dims == 2 && w == _w && h == _h && elemsize == _elemsize && elempack == 1 && allocator == _allocator) + return; + + release(); + + elemsize = _elemsize; + elempack = 1; + allocator = _allocator; + + dims = 2; + w = _w; + h = _h; + c = 1; + + if (total() > 0) + { + data = allocator->fastMalloc(dims, w, h, c, elemsize, elempack); + if (!data) + return; + + refcount = (int*)((unsigned char*)data + offsetof(VkImageMemory, refcount)); + *refcount = 1; + } +} + +inline 
void VkImageTensor::create(int _w, int _h, int _c, size_t _elemsize, VkAllocator* _allocator) +{ + if (dims == 3 && w == _w && h == _h && c == _c && elemsize == _elemsize && elempack == 1 && allocator == _allocator) + return; + + release(); + + elemsize = _elemsize; + elempack = 1; + allocator = _allocator; + + dims = 3; + w = _w; + h = _h; + c = _c; + + if (total() > 0) + { + data = allocator->fastMalloc(dims, w, h, c, elemsize, elempack); + if (!data) + return; + + refcount = (int*)((unsigned char*)data + offsetof(VkImageMemory, refcount)); + *refcount = 1; + } +} + +inline void VkImageTensor::create(int _w, size_t _elemsize, int _elempack, VkAllocator* _allocator) +{ + if (dims == 1 && w == _w && elemsize == _elemsize && elempack == _elempack && allocator == _allocator) + return; + + release(); + + elemsize = _elemsize; + elempack = _elempack; + allocator = _allocator; + + dims = 1; + w = _w; + h = 1; + c = 1; + + if (total() > 0) + { + data = allocator->fastMalloc(dims, w, h, c, elemsize, elempack); + if (!data) + return; + + refcount = (int*)((unsigned char*)data + offsetof(VkImageMemory, refcount)); + *refcount = 1; + } +} + +inline void VkImageTensor::create(int _w, int _h, size_t _elemsize, int _elempack, VkAllocator* _allocator) +{ + if (dims == 2 && w == _w && h == _h && elemsize == _elemsize && elempack == _elempack && allocator == _allocator) + return; + + release(); + + elemsize = _elemsize; + elempack = _elempack; + allocator = _allocator; + + dims = 2; + w = _w; + h = _h; + c = 1; + + if (total() > 0) + { + data = allocator->fastMalloc(dims, w, h, c, elemsize, elempack); + if (!data) + return; + + refcount = (int*)((unsigned char*)data + offsetof(VkImageMemory, refcount)); + *refcount = 1; + } +} + +inline void VkImageTensor::create(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator) +{ + if (dims == 3 && w == _w && h == _h && c == _c && elemsize == _elemsize && elempack == _elempack && allocator == _allocator) + return; + + release(); + + elemsize = _elemsize; + elempack = _elempack; + allocator = _allocator; + + dims = 3; + w = _w; + h = _h; + c = _c; + + if (total() > 0) + { + data = allocator->fastMalloc(dims, w, h, c, elemsize, elempack); + if (!data) + return; + + refcount = (int*)((unsigned char*)data + offsetof(VkImageMemory, refcount)); + *refcount = 1; + } +} + +inline void VkImageTensor::create_like(const tensor* m, VkAllocator* _allocator) +{ + int _c = m->dims[1]; + int _h = m->dims[2]; + int _w = m->dims[3]; + size_t _elemsize = m->data_type == 0 ? 
4 : 1; + int _elempack = 1; + int _dims = m->dim_num; + + if (_dims == 1) + create(_w, _elemsize, _elempack, _allocator); + if (_dims == 2) + create(_w, _h, _elemsize, _elempack, _allocator); + if (_dims == 3) + create(_w, _h, _c, _elemsize, _elempack, _allocator); +} + + +inline void VkImageTensor::create_like(const VkTensor& m, VkAllocator* _allocator) +{ + int _dims = m.dims; + if (_dims == 1) + create(m.w, m.elemsize, m.elempack, _allocator); + if (_dims == 2) + create(m.w, m.h, m.elemsize, m.elempack, _allocator); + if (_dims == 3) + create(m.w, m.h, m.c, m.elemsize, m.elempack, _allocator); +} + +inline void VkImageTensor::create_like(const VkImageTensor& im, VkAllocator* _allocator) +{ + int _dims = im.dims; + if (_dims == 1) + create(im.w, im.elemsize, im.elempack, _allocator); + if (_dims == 2) + create(im.w, im.h, im.elemsize, im.elempack, _allocator); + if (_dims == 3) + create(im.w, im.h, im.c, im.elemsize, im.elempack, _allocator); +} + +// inline Mat VkImageMat::mapped() const +// { +// if (!allocator->mappable || !data->mapped_ptr) +// return Mat(); + +// if (dims == 1) +// return Mat(w, mapped_ptr(), elemsize, elempack, 0); + +// if (dims == 2) +// return Mat(w, h, mapped_ptr(), elemsize, elempack, 0); + +// if (dims == 3) +// return Mat(w, h, c, mapped_ptr(), elemsize, elempack, 0); + +// return Mat(); +// } + +inline void* VkImageTensor::mapped_ptr() const +{ + if (!allocator->mappable || !data->mapped_ptr) + return 0; + + return (unsigned char*)data->mapped_ptr + data->bind_offset; +} + +inline void VkImageTensor::addref() +{ + if (refcount) + TENGINE_XADD(refcount, 1); +} + +inline void VkImageTensor::release() +{ + if (refcount && TENGINE_XADD(refcount, -1) == 1) + { + if (allocator && data) + { + allocator->fastFree(data); + } + } + + data = 0; + + elemsize = 0; + elempack = 0; + + dims = 0; + w = 0; + h = 0; + c = 0; + + refcount = 0; +} + +inline bool VkImageTensor::empty() const +{ + return data == 0 || total() == 0; +} + +inline size_t VkImageTensor::total() const +{ + return w * h * c; +} + +// inline Mat VkImageTensor::shape() const +// { +// if (dims == 1) +// return Mat(w * elempack, (void*)0); +// if (dims == 2) +// return Mat(w, h * elempack, (void*)0); +// if (dims == 3) +// return Mat(w, h, c * elempack, (void*)0); + +// return Mat(); +// } + +inline VkImage VkImageTensor::image() const +{ + return data->image; +} + +inline VkImageView VkImageTensor::imageview() const +{ + return data->imageview; +} + + +///////////////////////////////////////////////////////////////////////////////////////////////////////////// +//Tensor defination + +inline Tensor::Tensor() + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0) +{ +} + +inline Tensor::Tensor(int _w, size_t _elemsize, Allocator* _allocator) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0) +{ + create(_w, _elemsize, _allocator); +} + +inline Tensor::Tensor(int _w, int _h, size_t _elemsize, Allocator* _allocator) : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0){ + create(_w, _h, _elemsize, _allocator);} +inline Tensor::Tensor(int _w, int _h, int _c, size_t _elemsize, Allocator* _allocator) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0) +{ + create(_w, _h, _c, _elemsize, _allocator); +} + +inline Tensor::Tensor(int _w, size_t _elemsize, int _elempack, Allocator* _allocator) + : data(0), refcount(0), 
elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0) +{ + create(_w, _elemsize, _elempack, _allocator); +} + +inline Tensor::Tensor(int _w, int _h, size_t _elemsize, int _elempack, Allocator* _allocator) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0) +{ + create(_w, _h, _elemsize, _elempack, _allocator); +} + +inline Tensor::Tensor(int _w, int _h, int _c, size_t _elemsize, int _elempack, Allocator* _allocator) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0) +{ + create(_w, _h, _c, _elemsize, _elempack, _allocator); +} + +inline Tensor::Tensor(const Tensor& m) + : data(m.data), refcount(m.refcount), elemsize(m.elemsize), elempack(m.elempack), allocator(m.allocator), dims(m.dims), w(m.w), h(m.h), c(m.c), cstep(m.cstep) +{ + if (refcount) + TENGINE_XADD(refcount, 1); +} + +inline Tensor::Tensor(struct tensor* m) + : data(m->data), refcount(0), elemsize(0), elempack(1), allocator(0), dims(0), w(0), h(0), c(0) +{ + if(m->layout == 0) + { + c = m->dims[1]; + h = m->dims[2]; + w = m->dims[3]; + elemsize = m->elem_size; + elempack = 1; + dims = 3; + cstep = w * h; + } + else + { + c = m->dims[3]; + h = m->dims[2]; + w = m->dims[1]; + elemsize = m->elem_size; + elempack = 1; + dims = 3; + cstep = w * h; + } +} +inline Tensor::Tensor(int _w, void* _data, size_t _elemsize, Allocator* _allocator) + : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(1), w(_w), h(1), c(1) +{ + cstep = w; +} + +inline Tensor::Tensor(int _w, int _h, void* _data, size_t _elemsize, Allocator* _allocator) + : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(2), w(_w), h(_h), c(1) +{ + cstep = w * h; +} + +inline Tensor::Tensor(int _w, int _h, int _c, void* _data, size_t _elemsize, Allocator* _allocator) + : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(3), w(_w), h(_h), c(_c) +{ + cstep = alignSize(w * h * elemsize, 16) / elemsize; +} + +inline Tensor::Tensor(int _w, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator) + : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(1), w(_w), h(1), c(1) +{ + cstep = w; +} + +inline Tensor::Tensor(int _w, int _h, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator) + : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(2), w(_w), h(_h), c(1) +{ + cstep = w * h; +} + +inline Tensor::Tensor(int _w, int _h, int _c, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator) + : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(3), w(_w), h(_h), c(_c) +{ + cstep = alignSize(w * h * elemsize, 16) / elemsize; +} + +inline Tensor::~Tensor() +{ + release(); +} + +inline Tensor& Tensor::operator=(const Tensor& m) +{ + if (this == &m) + return *this; + + if (m.refcount) + TENGINE_XADD(m.refcount, 1); + + release(); + + data = m.data; + refcount = m.refcount; + elemsize = m.elemsize; + elempack = m.elempack; + allocator = m.allocator; + + dims = m.dims; + w = m.w; + h = m.h; + c = m.c; + + cstep = m.cstep; + + return *this; +} + +inline Tensor Tensor::reshape(int _w, Allocator* _allocator) const +{ + if (w * h * c != _w) + return Tensor(); + + if (dims == 3 && cstep != (size_t)w * h) + { + Tensor m; + m.create(_w, elemsize, elempack, _allocator); + + // flatten + for (int 
+        for (int i = 0; i < c; i++)
+        {
+            const void* ptr = (unsigned char*)data + i * cstep * elemsize;
+            void* mptr = (unsigned char*)m.data + i * w * h * elemsize;
+            memcpy(mptr, ptr, w * h * elemsize);
+        }
+
+        return m;
+    }
+
+    Tensor m = *this;
+
+    m.dims = 1;
+    m.w = _w;
+    m.h = 1;
+    m.c = 1;
+
+    m.cstep = _w;
+
+    return m;
+}
+
+inline void Tensor::create(int _w, size_t _elemsize, Allocator* _allocator)
+{
+    if (dims == 1 && w == _w && elemsize == _elemsize && elempack == 1 && allocator == _allocator)
+        return;
+
+    release();
+
+    elemsize = _elemsize;
+    elempack = 1;
+    allocator = _allocator;
+
+    dims = 1;
+    w = _w;
+    h = 1;
+    c = 1;
+
+    cstep = w;
+
+    if (total() > 0)
+    {
+        size_t totalsize = alignSize(total() * elemsize, 4);
+        if (allocator)
+            data = allocator->fastMalloc(totalsize + (int)sizeof(*refcount));
+        else
+            data = fastMalloc(totalsize + (int)sizeof(*refcount));
+        refcount = (int*)(((unsigned char*)data) + totalsize);
+        *refcount = 1;
+    }
+}
+
+inline void Tensor::create(int _w, int _h, size_t _elemsize, Allocator* _allocator)
+{
+    if (dims == 2 && w == _w && h == _h && elemsize == _elemsize && elempack == 1 && allocator == _allocator)
+        return;
+
+    release();
+
+    elemsize = _elemsize;
+    elempack = 1;
+    allocator = _allocator;
+
+    dims = 2;
+    w = _w;
+    h = _h;
+    c = 1;
+
+    cstep = w * h;
+
+    if (total() > 0)
+    {
+        size_t totalsize = alignSize(total() * elemsize, 4);
+        if (allocator)
+            data = allocator->fastMalloc(totalsize + (int)sizeof(*refcount));
+        else
+            data = fastMalloc(totalsize + (int)sizeof(*refcount));
+        refcount = (int*)(((unsigned char*)data) + totalsize);
+        *refcount = 1;
+    }
+}
+
+inline void Tensor::create(int _w, int _h, int _c, size_t _elemsize, Allocator* _allocator)
+{
+    if (dims == 3 && w == _w && h == _h && c == _c && elemsize == _elemsize && elempack == 1 && allocator == _allocator)
+        return;
+
+    release();
+
+    elemsize = _elemsize;
+    elempack = 1;
+    allocator = _allocator;
+
+    dims = 3;
+    w = _w;
+    h = _h;
+    c = _c;
+
+    cstep = alignSize(w * h * elemsize, 16) / elemsize;
+
+    if (total() > 0)
+    {
+        size_t totalsize = alignSize(total() * elemsize, 4);
+        if (allocator)
+            data = allocator->fastMalloc(totalsize + (int)sizeof(*refcount));
+        else
+            data = fastMalloc(totalsize + (int)sizeof(*refcount));
+        refcount = (int*)(((unsigned char*)data) + totalsize);
+        *refcount = 1;
+    }
+}
+
+inline void Tensor::create(int _w, size_t _elemsize, int _elempack, Allocator* _allocator)
+{
+    if (dims == 1 && w == _w && elemsize == _elemsize && elempack == _elempack && allocator == _allocator)
+        return;
+
+    release();
+
+    elemsize = _elemsize;
+    elempack = _elempack;
+    allocator = _allocator;
+
+    dims = 1;
+    w = _w;
+    h = 1;
+    c = 1;
+
+    cstep = w;
+
+    if (total() > 0)
+    {
+        size_t totalsize = alignSize(total() * elemsize, 4);
+        if (allocator)
+            data = allocator->fastMalloc(totalsize + (int)sizeof(*refcount));
+        else
+            data = fastMalloc(totalsize + (int)sizeof(*refcount));
+        refcount = (int*)(((unsigned char*)data) + totalsize);
+        *refcount = 1;
+    }
+}
+
+inline void Tensor::create(int _w, int _h, size_t _elemsize, int _elempack, Allocator* _allocator)
+{
+    if (dims == 2 && w == _w && h == _h && elemsize == _elemsize && elempack == _elempack && allocator == _allocator)
+        return;
+
+    release();
+
+    elemsize = _elemsize;
+    elempack = _elempack;
+    allocator = _allocator;
+
+    dims = 2;
+    w = _w;
+    h = _h;
+    c = 1;
+
+    cstep = w * h;
+
+    if (total() > 0)
+    {
+        size_t totalsize = alignSize(total() * elemsize, 4);
+        if (allocator)
+            data = allocator->fastMalloc(totalsize + (int)sizeof(*refcount));
+        else
+            data = fastMalloc(totalsize + (int)sizeof(*refcount));
+        refcount = (int*)(((unsigned char*)data) + totalsize);
+        *refcount = 1;
+    }
+}
+
+inline void Tensor::create(int _w, int _h, int _c, size_t _elemsize, int _elempack, Allocator* _allocator)
+{
+    if (dims == 3 && w == _w && h == _h && c == _c && elemsize == _elemsize && elempack == _elempack && allocator == _allocator)
+        return;
+
+    release();
+
+    elemsize = _elemsize;
+    elempack = _elempack;
+    allocator = _allocator;
+
+    dims = 3;
+    w = _w;
+    h = _h;
+    c = _c;
+
+    cstep = w * h; //alignSize(w * h * elemsize, 16) / elemsize;
+
+    if (total() > 0)
+    {
+        size_t totalsize = alignSize(total() * elemsize, 4);
+        if (allocator)
+            data = allocator->fastMalloc(totalsize + (int)sizeof(*refcount));
+        else
+            data = fastMalloc(totalsize + (int)sizeof(*refcount));
+        refcount = (int*)(((unsigned char*)data) + totalsize);
+        *refcount = 1;
+    }
+}
+
+// inline void Tensor::create_like(const tensor* m, Allocator* _allocator)
+// {
+//     int _dims = m.dims;
+//     if (_dims == 1)
+//         create(m.w, m.elemsize, m.elempack, _allocator);
+//     if (_dims == 2)
+//         create(m.w, m.h, m.elemsize, m.elempack, _allocator);
+//     if (_dims == 3)
+//         create(m.w, m.h, m.c, m.elemsize, m.elempack, _allocator);
+// }
+
+inline void Tensor::create_like(const Tensor& m, Allocator* _allocator)
+{
+    int _dims = m.dims;
+    if (_dims == 1)
+        create(m.w, m.elemsize, m.elempack, _allocator);
+    if (_dims == 2)
+        create(m.w, m.h, m.elemsize, m.elempack, _allocator);
+    if (_dims == 3)
+        create(m.w, m.h, m.c, m.elemsize, m.elempack, _allocator);
+}
+
+inline void Tensor::create_like(const VkTensor& m, Allocator* _allocator)
+{
+    int _dims = m.dims;
+    if (_dims == 1)
+        create(m.w, m.elemsize, m.elempack, _allocator);
+    if (_dims == 2)
+        create(m.w, m.h, m.elemsize, m.elempack, _allocator);
+    if (_dims == 3)
+        create(m.w, m.h, m.c, m.elemsize, m.elempack, _allocator);
+}
+
+inline void Tensor::create_like(const VkImageTensor& im, Allocator* _allocator)
+{
+    int _dims = im.dims;
+    if (_dims == 1)
+        create(im.w, im.elemsize, im.elempack, _allocator);
+    if (_dims == 2)
+        create(im.w, im.h, im.elemsize, im.elempack, _allocator);
+    if (_dims == 3)
+        create(im.w, im.h, im.c, im.elemsize, im.elempack, _allocator);
+}
+
+inline void Tensor::addref()
+{
+    if (refcount)
+        TENGINE_XADD(refcount, 1);
+}
+
+inline void Tensor::release()
+{
+    if (refcount && TENGINE_XADD(refcount, -1) == 1)
+    {
+        if (allocator)
+            allocator->fastFree(data);
+        else
+            fastFree(data);
+    }
+
+    data = 0;
+
+    elemsize = 0;
+    elempack = 0;
+
+    dims = 0;
+    w = 0;
+    h = 0;
+    c = 0;
+
+    cstep = 0;
+
+    refcount = 0;
+}
+
+inline bool Tensor::empty() const
+{
+    return data == 0 || total() == 0;
+}
+
+inline size_t Tensor::total() const
+{
+    return cstep * c;
+}
+
+inline Tensor Tensor::shape() const
+{
+    if (dims == 1)
+        return Tensor(w * elempack, (void*)0);
+    if (dims == 2)
+        return Tensor(w, h * elempack, (void*)0);
+    if (dims == 3)
+        return Tensor(w, h, c * elempack, (void*)0);
+
+    return Tensor();
+}
+
+inline Tensor Tensor::channel(int _c)
+{
+    return Tensor(w, h, (unsigned char*)data + cstep * _c * elemsize, elemsize, elempack, allocator);
+}
+
+inline const Tensor Tensor::channel(int _c) const
+{
+    return Tensor(w, h, (unsigned char*)data + cstep * _c * elemsize, elemsize, elempack, allocator);
+}
+
+inline float* Tensor::row(int y)
+{
+    return (float*)((unsigned char*)data + w * y * elemsize);
+}
+
+inline const float* Tensor::row(int y) const
+{
+    return (const float*)((unsigned char*)data + w * y * elemsize);
+}
+
+template<typename T>
+inline Tensor::operator T*()
+{
+    return (T*)data;
+}
+
+template<typename T>
+inline Tensor::operator const T*() const
+{
+    return (const T*)data;
+}
+
+void convert_packing(const Tensor& src, Tensor& dst, int elempack, const Option& opt = Option());
+void convert_packing(tensor* src, Tensor& dst, int elempack, const Option& opt = Option());
+void cast_float32_to_float16(const Tensor& src, Tensor& dst, const Option& opt = Option());
+void cast_float16_to_float32(const Tensor& src, Tensor& dst, const Option& opt = Option());
+
+
+} // namespace TEngine
+
+
+#endif // VULKAN_TENSOR_HPP
+
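A minimal host-side usage sketch of the Tensor API defined above, for reviewers; this note and snippet are not part of the patch. The header name is assumed from the include guard, and it presumes the header's own dependencies (Vulkan headers, allocator and option definitions) are on the include path.

    // Illustrative sketch only: create a float tensor, fill one channel through the
    // channel()/row() accessors, and let the intrusive refcount manage the buffer.
    // Passing a null Allocator makes create() fall back to fastMalloc()/fastFree(),
    // as in the code above.
    #include "vulkan_tensor.hpp" // assumed file name, matching VULKAN_TENSOR_HPP

    using namespace TEngine;

    int main()
    {
        Tensor t(4, 4, 3, sizeof(float), (Allocator*)0); // w=4, h=4, c=3, elemsize=4

        Tensor ch = t.channel(1);        // non-owning view into channel 1
        for (int y = 0; y < t.h; y++)
        {
            float* p = ch.row(y);        // row y = w consecutive floats
            for (int x = 0; x < t.w; x++)
                p[x] = 1.f;
        }

        Tensor t2 = t;                   // shallow copy; refcount becomes 2
        t2.release();                    // buffer survives, t still owns it

        return 0;                        // ~Tensor() releases the last reference
    }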