From 5c3b19106ae31e035f3926958a49b290f502181e Mon Sep 17 00:00:00 2001 From: Daniel <7994127+ddzhao91@users.noreply.github.com> Date: Fri, 30 Apr 2021 00:21:37 +0800 Subject: [PATCH] Dev/vulkan (#633) * update CMakeLists to install tengine cpp api header * update vulkan to support tengine v1.4 * Update CMakeLists.txt Co-authored-by: dongdong --- source/CMakeLists.txt | 6 + source/device/CMakeLists.txt | 17 +- source/device/vulkan/CMakeLists.txt | 171 ++ source/device/vulkan/layer/concat_vulkan.cpp | 788 +++++++ source/device/vulkan/layer/concat_vulkan.hpp | 81 + .../vulkan/layer/convolution_vulkan.cpp | 616 +++++ .../vulkan/layer/convolution_vulkan.hpp | 115 + .../layer/convolutiondepthwise_vulkan.cpp | 301 +++ .../layer/convolutiondepthwise_vulkan.hpp | 96 + source/device/vulkan/layer/crop_vulkan.cpp | 607 +++++ source/device/vulkan/layer/crop_vulkan.hpp | 95 + source/device/vulkan/layer/dropout_vulkan.cpp | 216 ++ source/device/vulkan/layer/dropout_vulkan.hpp | 78 + source/device/vulkan/layer/eltwise_vulkan.cpp | 266 +++ source/device/vulkan/layer/eltwise_vulkan.hpp | 99 + source/device/vulkan/layer/flatten_vulkan.cpp | 326 +++ source/device/vulkan/layer/flatten_vulkan.hpp | 82 + .../vulkan/layer/innerproduct_vulkan.cpp | 464 ++++ .../vulkan/layer/innerproduct_vulkan.hpp | 103 + source/device/vulkan/layer/interp_vulkan.cpp | 464 ++++ source/device/vulkan/layer/interp_vulkan.hpp | 92 + source/device/vulkan/layer/packing_vulkan.cpp | 495 ++++ source/device/vulkan/layer/packing_vulkan.hpp | 96 + source/device/vulkan/layer/padding_vulkan.cpp | 174 ++ source/device/vulkan/layer/padding_vulkan.hpp | 81 + source/device/vulkan/layer/permute_vulkan.cpp | 475 ++++ source/device/vulkan/layer/permute_vulkan.hpp | 84 + source/device/vulkan/layer/pooling_vulkan.cpp | 338 +++ source/device/vulkan/layer/pooling_vulkan.hpp | 95 + .../device/vulkan/layer/priorbox_vulkan.cpp | 351 +++ .../device/vulkan/layer/priorbox_vulkan.hpp | 96 + source/device/vulkan/layer/relu_vulkan.cpp | 214 ++ source/device/vulkan/layer/relu_vulkan.hpp | 79 + source/device/vulkan/layer/reshape_vulkan.cpp | 580 +++++ source/device/vulkan/layer/reshape_vulkan.hpp | 98 + source/device/vulkan/layer/softmax_vulkan.cpp | 486 ++++ source/device/vulkan/layer/softmax_vulkan.hpp | 90 + .../device/vulkan/layer_shader_registry.h.in | 6 + .../device/vulkan/layer_shader_spv_data.h.in | 6 + source/device/vulkan/layer_shader_type.h | 54 + .../device/vulkan/layer_shader_type_enum.h.in | 5 + source/device/vulkan/layer_type_enum.h.in | 5 + source/device/vulkan/shaders/concat.comp | 108 + .../device/vulkan/shaders/concat_pack4.comp | 108 + .../vulkan/shaders/concat_pack4to1.comp | 164 ++ .../device/vulkan/shaders/concat_pack8.comp | 109 + .../vulkan/shaders/concat_pack8to1.comp | 190 ++ .../vulkan/shaders/concat_pack8to4.comp | 154 ++ source/device/vulkan/shaders/convolution.comp | 175 ++ .../vulkan/shaders/convolution_1x1s1d1.comp | 187 ++ .../vulkan/shaders/convolution_pack1to4.comp | 183 ++ .../vulkan/shaders/convolution_pack1to8.comp | 193 ++ .../vulkan/shaders/convolution_pack4.comp | 203 ++ .../shaders/convolution_pack4_1x1s1d1.comp | 237 ++ ...olution_pack4_3x3s1d1_winograd23_gemm.comp | 139 ++ ...k4_3x3s1d1_winograd23_transform_input.comp | 202 ++ ...4_3x3s1d1_winograd23_transform_output.comp | 209 ++ .../vulkan/shaders/convolution_pack4to1.comp | 183 ++ .../vulkan/shaders/convolution_pack4to8.comp | 219 ++ .../vulkan/shaders/convolution_pack8.comp | 219 ++ .../shaders/convolution_pack8_1x1s1d1.comp | 327 +++ 
...olution_pack8_3x3s1d1_winograd23_gemm.comp | 198 ++ ...k8_3x3s1d1_winograd23_transform_input.comp | 203 ++ ...8_3x3s1d1_winograd23_transform_output.comp | 230 ++ .../vulkan/shaders/convolution_pack8to1.comp | 186 ++ .../vulkan/shaders/convolution_pack8to4.comp | 198 ++ .../vulkan/shaders/convolutiondepthwise.comp | 170 ++ .../shaders/convolutiondepthwise_group.comp | 186 ++ .../convolutiondepthwise_group_pack1to4.comp | 194 ++ .../convolutiondepthwise_group_pack1to8.comp | 204 ++ .../convolutiondepthwise_group_pack4.comp | 214 ++ .../convolutiondepthwise_group_pack4to1.comp | 194 ++ .../convolutiondepthwise_group_pack4to8.comp | 230 ++ .../convolutiondepthwise_group_pack8.comp | 230 ++ .../convolutiondepthwise_group_pack8to1.comp | 197 ++ .../convolutiondepthwise_group_pack8to4.comp | 209 ++ .../shaders/convolutiondepthwise_pack4.comp | 178 ++ .../shaders/convolutiondepthwise_pack8.comp | 191 ++ source/device/vulkan/shaders/crop.comp | 92 + .../device/vulkan/shaders/crop_pack1to4.comp | 98 + .../device/vulkan/shaders/crop_pack1to8.comp | 104 + source/device/vulkan/shaders/crop_pack4.comp | 92 + .../device/vulkan/shaders/crop_pack4to1.comp | 107 + .../device/vulkan/shaders/crop_pack4to8.comp | 182 ++ source/device/vulkan/shaders/crop_pack8.comp | 93 + .../device/vulkan/shaders/crop_pack8to1.comp | 108 + .../device/vulkan/shaders/crop_pack8to4.comp | 149 ++ .../vulkan/shaders/depthwiseconvolution.comp | 121 + source/device/vulkan/shaders/dropout.comp | 104 + .../device/vulkan/shaders/dropout_pack4.comp | 104 + .../device/vulkan/shaders/dropout_pack8.comp | 106 + source/device/vulkan/shaders/eltwise.comp | 141 ++ .../device/vulkan/shaders/eltwise_pack4.comp | 141 ++ .../device/vulkan/shaders/eltwise_pack8.comp | 160 ++ source/device/vulkan/shaders/flatten.comp | 98 + .../vulkan/shaders/flatten_pack1to4.comp | 127 + .../vulkan/shaders/flatten_pack1to8.comp | 154 ++ .../device/vulkan/shaders/flatten_pack4.comp | 175 ++ .../vulkan/shaders/flatten_pack4to8.comp | 222 ++ .../device/vulkan/shaders/flatten_pack8.comp | 222 ++ .../device/vulkan/shaders/innerproduct.comp | 140 ++ .../vulkan/shaders/innerproduct_pack1to4.comp | 148 ++ .../vulkan/shaders/innerproduct_pack1to8.comp | 160 ++ .../vulkan/shaders/innerproduct_pack4.comp | 171 ++ .../vulkan/shaders/innerproduct_pack4to1.comp | 148 ++ .../vulkan/shaders/innerproduct_pack4to8.comp | 188 ++ .../vulkan/shaders/innerproduct_pack8.comp | 188 ++ .../vulkan/shaders/innerproduct_pack8to1.comp | 151 ++ .../vulkan/shaders/innerproduct_pack8to4.comp | 167 ++ source/device/vulkan/shaders/interp.comp | 149 ++ .../device/vulkan/shaders/interp_bicubic.comp | 149 ++ .../vulkan/shaders/interp_bicubic_coeffs.comp | 107 + .../vulkan/shaders/interp_bicubic_pack4.comp | 163 ++ .../vulkan/shaders/interp_bicubic_pack8.comp | 175 ++ .../device/vulkan/shaders/interp_pack4.comp | 150 ++ .../device/vulkan/shaders/interp_pack8.comp | 238 ++ source/device/vulkan/shaders/packing.comp | 165 ++ .../vulkan/shaders/packing_fp16_to_fp32.comp | 165 ++ .../vulkan/shaders/packing_fp32_to_fp16.comp | 165 ++ .../vulkan/shaders/packing_pack1to4.comp | 195 ++ .../packing_pack1to4_fp16_to_fp32.comp | 195 ++ .../packing_pack1to4_fp32_to_fp16.comp | 195 ++ .../vulkan/shaders/packing_pack1to8.comp | 223 ++ .../packing_pack1to8_fp16_to_fp32.comp | 226 ++ .../packing_pack1to8_fp32_to_fp16.comp | 223 ++ .../device/vulkan/shaders/packing_pack4.comp | 165 ++ .../shaders/packing_pack4_fp16_to_fp32.comp | 165 ++ .../shaders/packing_pack4_fp32_to_fp16.comp | 165 ++ 
.../vulkan/shaders/packing_pack4to1.comp | 195 ++ .../packing_pack4to1_fp16_to_fp32.comp | 195 ++ .../packing_pack4to1_fp32_to_fp16.comp | 195 ++ .../vulkan/shaders/packing_pack4to8.comp | 184 ++ .../packing_pack4to8_fp16_to_fp32.comp | 184 ++ .../packing_pack4to8_fp32_to_fp16.comp | 184 ++ .../device/vulkan/shaders/packing_pack8.comp | 166 ++ .../shaders/packing_pack8_fp16_to_fp32.comp | 169 ++ .../shaders/packing_pack8_fp32_to_fp16.comp | 166 ++ .../vulkan/shaders/packing_pack8to1.comp | 223 ++ .../packing_pack8to1_fp16_to_fp32.comp | 223 ++ .../packing_pack8to1_fp32_to_fp16.comp | 223 ++ .../vulkan/shaders/packing_pack8to4.comp | 184 ++ .../packing_pack8to4_fp16_to_fp32.comp | 184 ++ .../packing_pack8to4_fp32_to_fp16.comp | 184 ++ source/device/vulkan/shaders/padding.comp | 145 ++ .../device/vulkan/shaders/padding_pack4.comp | 144 ++ .../device/vulkan/shaders/padding_pack8.comp | 144 ++ source/device/vulkan/shaders/permute.comp | 186 ++ .../vulkan/shaders/permute_pack1to4.comp | 234 ++ .../vulkan/shaders/permute_pack1to8.comp | 284 +++ .../device/vulkan/shaders/permute_pack4.comp | 281 +++ .../vulkan/shaders/permute_pack4to1.comp | 230 ++ .../vulkan/shaders/permute_pack4to8.comp | 350 +++ .../device/vulkan/shaders/permute_pack8.comp | 350 +++ .../vulkan/shaders/permute_pack8to1.comp | 280 +++ .../vulkan/shaders/permute_pack8to4.comp | 285 +++ source/device/vulkan/shaders/pooling.comp | 226 ++ .../device/vulkan/shaders/pooling_global.comp | 130 ++ .../vulkan/shaders/pooling_global_pack4.comp | 130 ++ .../vulkan/shaders/pooling_global_pack8.comp | 139 ++ .../device/vulkan/shaders/pooling_pack4.comp | 226 ++ .../device/vulkan/shaders/pooling_pack8.comp | 242 ++ source/device/vulkan/shaders/priorbox.comp | 170 ++ .../device/vulkan/shaders/priorbox_mxnet.comp | 92 + source/device/vulkan/shaders/relu.comp | 107 + source/device/vulkan/shaders/relu_pack4.comp | 107 + source/device/vulkan/shaders/relu_pack8.comp | 114 + source/device/vulkan/shaders/reshape.comp | 138 ++ .../vulkan/shaders/reshape_pack1to4.comp | 147 ++ .../vulkan/shaders/reshape_pack1to8.comp | 177 ++ .../device/vulkan/shaders/reshape_pack4.comp | 228 ++ .../vulkan/shaders/reshape_pack4to1.comp | 166 ++ .../vulkan/shaders/reshape_pack4to8.comp | 301 +++ .../device/vulkan/shaders/reshape_pack8.comp | 301 +++ .../vulkan/shaders/reshape_pack8to1.comp | 195 ++ .../vulkan/shaders/reshape_pack8to4.comp | 231 ++ .../vulkan/shaders/softmax_div_sum.comp | 166 ++ .../vulkan/shaders/softmax_div_sum_pack4.comp | 175 ++ .../vulkan/shaders/softmax_div_sum_pack8.comp | 177 ++ .../vulkan/shaders/softmax_exp_sub_max.comp | 166 ++ .../shaders/softmax_exp_sub_max_pack4.comp | 175 ++ .../shaders/softmax_exp_sub_max_pack8.comp | 177 ++ .../vulkan/shaders/softmax_reduce_max.comp | 198 ++ .../shaders/softmax_reduce_max_pack4.comp | 204 ++ .../shaders/softmax_reduce_max_pack8.comp | 217 ++ .../vulkan/shaders/softmax_reduce_sum.comp | 198 ++ .../shaders/softmax_reduce_sum_pack4.comp | 204 ++ .../shaders/softmax_reduce_sum_pack8.comp | 211 ++ source/device/vulkan/vulkan_allocator.cpp | 1474 ++++++++++++ source/device/vulkan/vulkan_allocator.hpp | 284 +++ source/device/vulkan/vulkan_command.cpp | 1782 +++++++++++++++ source/device/vulkan/vulkan_command.hpp | 168 ++ source/device/vulkan/vulkan_define.h | 34 + source/device/vulkan/vulkan_device.cc | 234 ++ source/device/vulkan/vulkan_device.hpp | 40 + source/device/vulkan/vulkan_executor.cc | 98 + source/device/vulkan/vulkan_executor.hpp | 89 + source/device/vulkan/vulkan_gpu.cpp | 2036 +++++++++++++++++ 
source/device/vulkan/vulkan_gpu.hpp | 349 +++ source/device/vulkan/vulkan_graph.cc | 545 +++++ source/device/vulkan/vulkan_graph.hpp | 139 ++ source/device/vulkan/vulkan_helper.cc | 311 +++ source/device/vulkan/vulkan_helper.hpp | 63 + source/device/vulkan/vulkan_layer.cpp | 84 + source/device/vulkan/vulkan_layer.hpp | 119 + source/device/vulkan/vulkan_limit.hpp | 160 ++ source/device/vulkan/vulkan_option.cpp | 73 + source/device/vulkan/vulkan_option.hpp | 128 ++ source/device/vulkan/vulkan_pipeline.cpp | 568 +++++ source/device/vulkan/vulkan_pipeline.hpp | 130 ++ source/device/vulkan/vulkan_platform.hpp | 92 + source/device/vulkan/vulkan_tensor.cpp | 374 +++ source/device/vulkan/vulkan_tensor.hpp | 1817 +++++++++++++++ 212 files changed, 46448 insertions(+), 1 deletion(-) create mode 100644 source/device/vulkan/CMakeLists.txt create mode 100644 source/device/vulkan/layer/concat_vulkan.cpp create mode 100644 source/device/vulkan/layer/concat_vulkan.hpp create mode 100644 source/device/vulkan/layer/convolution_vulkan.cpp create mode 100644 source/device/vulkan/layer/convolution_vulkan.hpp create mode 100644 source/device/vulkan/layer/convolutiondepthwise_vulkan.cpp create mode 100644 source/device/vulkan/layer/convolutiondepthwise_vulkan.hpp create mode 100644 source/device/vulkan/layer/crop_vulkan.cpp create mode 100644 source/device/vulkan/layer/crop_vulkan.hpp create mode 100644 source/device/vulkan/layer/dropout_vulkan.cpp create mode 100644 source/device/vulkan/layer/dropout_vulkan.hpp create mode 100644 source/device/vulkan/layer/eltwise_vulkan.cpp create mode 100644 source/device/vulkan/layer/eltwise_vulkan.hpp create mode 100644 source/device/vulkan/layer/flatten_vulkan.cpp create mode 100644 source/device/vulkan/layer/flatten_vulkan.hpp create mode 100644 source/device/vulkan/layer/innerproduct_vulkan.cpp create mode 100644 source/device/vulkan/layer/innerproduct_vulkan.hpp create mode 100644 source/device/vulkan/layer/interp_vulkan.cpp create mode 100644 source/device/vulkan/layer/interp_vulkan.hpp create mode 100644 source/device/vulkan/layer/packing_vulkan.cpp create mode 100644 source/device/vulkan/layer/packing_vulkan.hpp create mode 100644 source/device/vulkan/layer/padding_vulkan.cpp create mode 100644 source/device/vulkan/layer/padding_vulkan.hpp create mode 100644 source/device/vulkan/layer/permute_vulkan.cpp create mode 100644 source/device/vulkan/layer/permute_vulkan.hpp create mode 100644 source/device/vulkan/layer/pooling_vulkan.cpp create mode 100644 source/device/vulkan/layer/pooling_vulkan.hpp create mode 100644 source/device/vulkan/layer/priorbox_vulkan.cpp create mode 100644 source/device/vulkan/layer/priorbox_vulkan.hpp create mode 100644 source/device/vulkan/layer/relu_vulkan.cpp create mode 100644 source/device/vulkan/layer/relu_vulkan.hpp create mode 100644 source/device/vulkan/layer/reshape_vulkan.cpp create mode 100644 source/device/vulkan/layer/reshape_vulkan.hpp create mode 100644 source/device/vulkan/layer/softmax_vulkan.cpp create mode 100644 source/device/vulkan/layer/softmax_vulkan.hpp create mode 100644 source/device/vulkan/layer_shader_registry.h.in create mode 100644 source/device/vulkan/layer_shader_spv_data.h.in create mode 100644 source/device/vulkan/layer_shader_type.h create mode 100644 source/device/vulkan/layer_shader_type_enum.h.in create mode 100644 source/device/vulkan/layer_type_enum.h.in create mode 100644 source/device/vulkan/shaders/concat.comp create mode 100644 source/device/vulkan/shaders/concat_pack4.comp create mode 100644 
source/device/vulkan/shaders/concat_pack4to1.comp create mode 100644 source/device/vulkan/shaders/concat_pack8.comp create mode 100644 source/device/vulkan/shaders/concat_pack8to1.comp create mode 100644 source/device/vulkan/shaders/concat_pack8to4.comp create mode 100644 source/device/vulkan/shaders/convolution.comp create mode 100644 source/device/vulkan/shaders/convolution_1x1s1d1.comp create mode 100644 source/device/vulkan/shaders/convolution_pack1to4.comp create mode 100644 source/device/vulkan/shaders/convolution_pack1to8.comp create mode 100644 source/device/vulkan/shaders/convolution_pack4.comp create mode 100644 source/device/vulkan/shaders/convolution_pack4_1x1s1d1.comp create mode 100644 source/device/vulkan/shaders/convolution_pack4_3x3s1d1_winograd23_gemm.comp create mode 100644 source/device/vulkan/shaders/convolution_pack4_3x3s1d1_winograd23_transform_input.comp create mode 100644 source/device/vulkan/shaders/convolution_pack4_3x3s1d1_winograd23_transform_output.comp create mode 100644 source/device/vulkan/shaders/convolution_pack4to1.comp create mode 100644 source/device/vulkan/shaders/convolution_pack4to8.comp create mode 100644 source/device/vulkan/shaders/convolution_pack8.comp create mode 100644 source/device/vulkan/shaders/convolution_pack8_1x1s1d1.comp create mode 100644 source/device/vulkan/shaders/convolution_pack8_3x3s1d1_winograd23_gemm.comp create mode 100644 source/device/vulkan/shaders/convolution_pack8_3x3s1d1_winograd23_transform_input.comp create mode 100644 source/device/vulkan/shaders/convolution_pack8_3x3s1d1_winograd23_transform_output.comp create mode 100644 source/device/vulkan/shaders/convolution_pack8to1.comp create mode 100644 source/device/vulkan/shaders/convolution_pack8to4.comp create mode 100644 source/device/vulkan/shaders/convolutiondepthwise.comp create mode 100644 source/device/vulkan/shaders/convolutiondepthwise_group.comp create mode 100644 source/device/vulkan/shaders/convolutiondepthwise_group_pack1to4.comp create mode 100644 source/device/vulkan/shaders/convolutiondepthwise_group_pack1to8.comp create mode 100644 source/device/vulkan/shaders/convolutiondepthwise_group_pack4.comp create mode 100644 source/device/vulkan/shaders/convolutiondepthwise_group_pack4to1.comp create mode 100644 source/device/vulkan/shaders/convolutiondepthwise_group_pack4to8.comp create mode 100644 source/device/vulkan/shaders/convolutiondepthwise_group_pack8.comp create mode 100644 source/device/vulkan/shaders/convolutiondepthwise_group_pack8to1.comp create mode 100644 source/device/vulkan/shaders/convolutiondepthwise_group_pack8to4.comp create mode 100644 source/device/vulkan/shaders/convolutiondepthwise_pack4.comp create mode 100644 source/device/vulkan/shaders/convolutiondepthwise_pack8.comp create mode 100644 source/device/vulkan/shaders/crop.comp create mode 100644 source/device/vulkan/shaders/crop_pack1to4.comp create mode 100644 source/device/vulkan/shaders/crop_pack1to8.comp create mode 100644 source/device/vulkan/shaders/crop_pack4.comp create mode 100644 source/device/vulkan/shaders/crop_pack4to1.comp create mode 100644 source/device/vulkan/shaders/crop_pack4to8.comp create mode 100644 source/device/vulkan/shaders/crop_pack8.comp create mode 100644 source/device/vulkan/shaders/crop_pack8to1.comp create mode 100644 source/device/vulkan/shaders/crop_pack8to4.comp create mode 100644 source/device/vulkan/shaders/depthwiseconvolution.comp create mode 100644 source/device/vulkan/shaders/dropout.comp create mode 100644 
source/device/vulkan/shaders/dropout_pack4.comp create mode 100644 source/device/vulkan/shaders/dropout_pack8.comp create mode 100644 source/device/vulkan/shaders/eltwise.comp create mode 100644 source/device/vulkan/shaders/eltwise_pack4.comp create mode 100644 source/device/vulkan/shaders/eltwise_pack8.comp create mode 100644 source/device/vulkan/shaders/flatten.comp create mode 100644 source/device/vulkan/shaders/flatten_pack1to4.comp create mode 100644 source/device/vulkan/shaders/flatten_pack1to8.comp create mode 100644 source/device/vulkan/shaders/flatten_pack4.comp create mode 100644 source/device/vulkan/shaders/flatten_pack4to8.comp create mode 100644 source/device/vulkan/shaders/flatten_pack8.comp create mode 100644 source/device/vulkan/shaders/innerproduct.comp create mode 100644 source/device/vulkan/shaders/innerproduct_pack1to4.comp create mode 100644 source/device/vulkan/shaders/innerproduct_pack1to8.comp create mode 100644 source/device/vulkan/shaders/innerproduct_pack4.comp create mode 100644 source/device/vulkan/shaders/innerproduct_pack4to1.comp create mode 100644 source/device/vulkan/shaders/innerproduct_pack4to8.comp create mode 100644 source/device/vulkan/shaders/innerproduct_pack8.comp create mode 100644 source/device/vulkan/shaders/innerproduct_pack8to1.comp create mode 100644 source/device/vulkan/shaders/innerproduct_pack8to4.comp create mode 100644 source/device/vulkan/shaders/interp.comp create mode 100644 source/device/vulkan/shaders/interp_bicubic.comp create mode 100644 source/device/vulkan/shaders/interp_bicubic_coeffs.comp create mode 100644 source/device/vulkan/shaders/interp_bicubic_pack4.comp create mode 100644 source/device/vulkan/shaders/interp_bicubic_pack8.comp create mode 100644 source/device/vulkan/shaders/interp_pack4.comp create mode 100644 source/device/vulkan/shaders/interp_pack8.comp create mode 100644 source/device/vulkan/shaders/packing.comp create mode 100644 source/device/vulkan/shaders/packing_fp16_to_fp32.comp create mode 100644 source/device/vulkan/shaders/packing_fp32_to_fp16.comp create mode 100644 source/device/vulkan/shaders/packing_pack1to4.comp create mode 100644 source/device/vulkan/shaders/packing_pack1to4_fp16_to_fp32.comp create mode 100644 source/device/vulkan/shaders/packing_pack1to4_fp32_to_fp16.comp create mode 100644 source/device/vulkan/shaders/packing_pack1to8.comp create mode 100644 source/device/vulkan/shaders/packing_pack1to8_fp16_to_fp32.comp create mode 100644 source/device/vulkan/shaders/packing_pack1to8_fp32_to_fp16.comp create mode 100644 source/device/vulkan/shaders/packing_pack4.comp create mode 100644 source/device/vulkan/shaders/packing_pack4_fp16_to_fp32.comp create mode 100644 source/device/vulkan/shaders/packing_pack4_fp32_to_fp16.comp create mode 100644 source/device/vulkan/shaders/packing_pack4to1.comp create mode 100644 source/device/vulkan/shaders/packing_pack4to1_fp16_to_fp32.comp create mode 100644 source/device/vulkan/shaders/packing_pack4to1_fp32_to_fp16.comp create mode 100644 source/device/vulkan/shaders/packing_pack4to8.comp create mode 100644 source/device/vulkan/shaders/packing_pack4to8_fp16_to_fp32.comp create mode 100644 source/device/vulkan/shaders/packing_pack4to8_fp32_to_fp16.comp create mode 100644 source/device/vulkan/shaders/packing_pack8.comp create mode 100644 source/device/vulkan/shaders/packing_pack8_fp16_to_fp32.comp create mode 100644 source/device/vulkan/shaders/packing_pack8_fp32_to_fp16.comp create mode 100644 source/device/vulkan/shaders/packing_pack8to1.comp create mode 100644 
source/device/vulkan/shaders/packing_pack8to1_fp16_to_fp32.comp create mode 100644 source/device/vulkan/shaders/packing_pack8to1_fp32_to_fp16.comp create mode 100644 source/device/vulkan/shaders/packing_pack8to4.comp create mode 100644 source/device/vulkan/shaders/packing_pack8to4_fp16_to_fp32.comp create mode 100644 source/device/vulkan/shaders/packing_pack8to4_fp32_to_fp16.comp create mode 100644 source/device/vulkan/shaders/padding.comp create mode 100644 source/device/vulkan/shaders/padding_pack4.comp create mode 100644 source/device/vulkan/shaders/padding_pack8.comp create mode 100644 source/device/vulkan/shaders/permute.comp create mode 100644 source/device/vulkan/shaders/permute_pack1to4.comp create mode 100644 source/device/vulkan/shaders/permute_pack1to8.comp create mode 100644 source/device/vulkan/shaders/permute_pack4.comp create mode 100644 source/device/vulkan/shaders/permute_pack4to1.comp create mode 100644 source/device/vulkan/shaders/permute_pack4to8.comp create mode 100644 source/device/vulkan/shaders/permute_pack8.comp create mode 100644 source/device/vulkan/shaders/permute_pack8to1.comp create mode 100644 source/device/vulkan/shaders/permute_pack8to4.comp create mode 100644 source/device/vulkan/shaders/pooling.comp create mode 100644 source/device/vulkan/shaders/pooling_global.comp create mode 100644 source/device/vulkan/shaders/pooling_global_pack4.comp create mode 100644 source/device/vulkan/shaders/pooling_global_pack8.comp create mode 100644 source/device/vulkan/shaders/pooling_pack4.comp create mode 100644 source/device/vulkan/shaders/pooling_pack8.comp create mode 100644 source/device/vulkan/shaders/priorbox.comp create mode 100644 source/device/vulkan/shaders/priorbox_mxnet.comp create mode 100644 source/device/vulkan/shaders/relu.comp create mode 100644 source/device/vulkan/shaders/relu_pack4.comp create mode 100644 source/device/vulkan/shaders/relu_pack8.comp create mode 100644 source/device/vulkan/shaders/reshape.comp create mode 100644 source/device/vulkan/shaders/reshape_pack1to4.comp create mode 100644 source/device/vulkan/shaders/reshape_pack1to8.comp create mode 100644 source/device/vulkan/shaders/reshape_pack4.comp create mode 100644 source/device/vulkan/shaders/reshape_pack4to1.comp create mode 100644 source/device/vulkan/shaders/reshape_pack4to8.comp create mode 100644 source/device/vulkan/shaders/reshape_pack8.comp create mode 100644 source/device/vulkan/shaders/reshape_pack8to1.comp create mode 100644 source/device/vulkan/shaders/reshape_pack8to4.comp create mode 100644 source/device/vulkan/shaders/softmax_div_sum.comp create mode 100644 source/device/vulkan/shaders/softmax_div_sum_pack4.comp create mode 100644 source/device/vulkan/shaders/softmax_div_sum_pack8.comp create mode 100644 source/device/vulkan/shaders/softmax_exp_sub_max.comp create mode 100644 source/device/vulkan/shaders/softmax_exp_sub_max_pack4.comp create mode 100644 source/device/vulkan/shaders/softmax_exp_sub_max_pack8.comp create mode 100644 source/device/vulkan/shaders/softmax_reduce_max.comp create mode 100644 source/device/vulkan/shaders/softmax_reduce_max_pack4.comp create mode 100644 source/device/vulkan/shaders/softmax_reduce_max_pack8.comp create mode 100644 source/device/vulkan/shaders/softmax_reduce_sum.comp create mode 100644 source/device/vulkan/shaders/softmax_reduce_sum_pack4.comp create mode 100644 source/device/vulkan/shaders/softmax_reduce_sum_pack8.comp create mode 100644 source/device/vulkan/vulkan_allocator.cpp create mode 100644 
source/device/vulkan/vulkan_allocator.hpp create mode 100644 source/device/vulkan/vulkan_command.cpp create mode 100644 source/device/vulkan/vulkan_command.hpp create mode 100644 source/device/vulkan/vulkan_define.h create mode 100644 source/device/vulkan/vulkan_device.cc create mode 100644 source/device/vulkan/vulkan_device.hpp create mode 100644 source/device/vulkan/vulkan_executor.cc create mode 100644 source/device/vulkan/vulkan_executor.hpp create mode 100644 source/device/vulkan/vulkan_gpu.cpp create mode 100644 source/device/vulkan/vulkan_gpu.hpp create mode 100644 source/device/vulkan/vulkan_graph.cc create mode 100644 source/device/vulkan/vulkan_graph.hpp create mode 100644 source/device/vulkan/vulkan_helper.cc create mode 100644 source/device/vulkan/vulkan_helper.hpp create mode 100644 source/device/vulkan/vulkan_layer.cpp create mode 100644 source/device/vulkan/vulkan_layer.hpp create mode 100644 source/device/vulkan/vulkan_limit.hpp create mode 100644 source/device/vulkan/vulkan_option.cpp create mode 100644 source/device/vulkan/vulkan_option.hpp create mode 100644 source/device/vulkan/vulkan_pipeline.cpp create mode 100644 source/device/vulkan/vulkan_pipeline.hpp create mode 100644 source/device/vulkan/vulkan_platform.hpp create mode 100644 source/device/vulkan/vulkan_tensor.cpp create mode 100644 source/device/vulkan/vulkan_tensor.hpp diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index 491498ab7..09065f7f8 100644 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -385,6 +385,12 @@ IF (TENGINE_ENABLE_CUDA) ENDIF() ENDIF() +# deal with depends + +FOREACH(_var ${TENGINE_DEVICE_DEPENDS_FORWARD}) + ADD_DEPENDENCIES(${TENGINE_LITE_NAME}-static ${_var}) + ADD_DEPENDENCIES(${TENGINE_LITE_NAME} ${_var}) +ENDFOREACH() # debug macro information IF (TENGINE_DEBUG_MEM_STAT) diff --git a/source/device/CMakeLists.txt b/source/device/CMakeLists.txt index 54b3103c4..e651ac6e0 100644 --- a/source/device/CMakeLists.txt +++ b/source/device/CMakeLists.txt @@ -133,6 +133,21 @@ IF (TENGINE_ENABLE_TIM_VX) LIST (APPEND _REGISTER_DEVICE_LIST "${CMAKE_SOURCE_DIR}/source/device/tim-vx/timvx_device.cc") ENDIF() +# Khronos Vulkan +IF (TENGINE_ENABLE_VULKAN) + ADD_SUBDIRECTORY (vulkan) + + LIST (APPEND _TENGINE_DEVICE_HEADER_PATH ${TENGINE_VULKAN_HEADER_PATH}) + LIST (APPEND _TENGINE_DEVICE_LINK_PATH ${TENGINE_VULKAN_LINK_PATH}) + LIST (APPEND _TENGINE_DEVICE_COMPILER_DEFINES ${TENGINE_VULKAN_COMPILER_DEFINES}) + LIST (APPEND _TENGINE_DEVICE_COMPILER_OPTIONS ${TENGINE_VULKAN_COMPILER_OPTIONS}) + LIST (APPEND _TENGINE_DEVICE_LINKER_OPTIONS ${TENGINE_VULKAN_LINKER_OPTIONS}) + LIST (APPEND _TENGINE_DEVICE_LINK_LIBRARIES ${TENGINE_VULKAN_LINK_LIBRARIES}) + LIST (APPEND _TENGINE_DEVICE_SOURCE ${TENGINE_VULKAN_DEVICE_SOURCE}) + LIST (APPEND _TENGINE_DEVICE_DEPENDS_FORWARD ${TENGINE_VULKAN_DEPENDS_FORWARD}) + LIST (APPEND _REGISTER_DEVICE_LIST "${CMAKE_SOURCE_DIR}/source/device/vulkan/vulkan_device.cc") +ENDIF() + # set var to cache SET (TENGINE_DEVICE_HEADER_PATH ${_TENGINE_DEVICE_HEADER_PATH} CACHE INTERNAL "Tengine device level header files searching path" FORCE) @@ -142,7 +157,7 @@ SET (TENGINE_DEVICE_COMPILER_DEFINES ${_TENGINE_DEVICE_COMPILER_DEFINES} CACH SET (TENGINE_DEVICE_COMPILER_OPTIONS ${_TENGINE_DEVICE_COMPILER_OPTIONS} CACHE INTERNAL "Tengine device about compiler options" FORCE) SET (TENGINE_DEVICE_LINKER_OPTIONS ${_TENGINE_DEVICE_LINKER_OPTIONS} CACHE INTERNAL "Tengine device about linker options" FORCE) SET (TENGINE_DEVICE_LINK_LIBRARIES ${_TENGINE_DEVICE_LINK_LIBRARIES} 
CACHE INTERNAL "Tengine device about link libraries" FORCE) - +SET (TENGINE_DEVICE_DEPENDS_FORWARD ${_TENGINE_DEVICE_DEPENDS_FORWARD} CACHE INTERNAL "Tengine device about depends project" FORCE) # generate device register configuration GENERATE_REGISTER_HEADER_FILE ("register_" "unregister_" "" "${CMAKE_SOURCE_DIR}/source/device/register.h.in" "${CMAKE_BINARY_DIR}/source/device/register.h" "${_REGISTER_DEVICE_LIST}") diff --git a/source/device/vulkan/CMakeLists.txt b/source/device/vulkan/CMakeLists.txt new file mode 100644 index 000000000..9273bb39e --- /dev/null +++ b/source/device/vulkan/CMakeLists.txt @@ -0,0 +1,171 @@ +# 0. clear var +UNSET (_DEV_VULKAN_HEADER_PATH) +UNSET (_VULKAN_BASE_SOURCE) +UNSET (_VULKAN_OPS_SOURCE) +UNSET (_DEV_VULKAN_DEVICE_SOURCE) +UNSET (_DEV_VULKAN_COMPILER_DEFINES) +UNSET (_DEV_VULKAN_COMPILER_OPTIONS) +UNSET (_DEV_VULKAN_LINKER_OPTIONS) +UNSET (_DEV_VULKAN_LINK_LIBRARIES) + + + +find_program(GLSLANGVALIDATOR_EXECUTABLE NAMES glslangValidator PATHS $ENV{VULKAN_SDK}/bin NO_CMAKE_FIND_ROOT_PATH REQUIRED) +message(STATUS "Tengine: found glslangValidator: ${GLSLANGVALIDATOR_EXECUTABLE}") + +# add shader spv header generate macro +include(${CMAKE_SOURCE_DIR}/cmake/generate_shader_spv_header.cmake) + +macro(add_shader SHADER_SRC) + message(STATUS "SHADER_SRC: ${SHADER_SRC}") + generate_shader_spv_header(SHADER_SPV_HEADER SHADER_SPV_HEX_HEADERS ${SHADER_SRC}) + + + get_filename_component(SHADER_SPV_HEADER_NAME ${SHADER_SPV_HEADER} NAME) + string(APPEND layer_shader_spv_data "#include \"${SHADER_SPV_HEADER_NAME}\"\n") + + get_filename_component(SHADER_SRC_NAME_WE ${SHADER_SRC} NAME_WE) + string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_spv_data,sizeof(${SHADER_SRC_NAME_WE}_spv_data)},\n") + string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_fp16p_spv_data,sizeof(${SHADER_SRC_NAME_WE}_fp16p_spv_data)},\n") + string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_fp16pa_spv_data,sizeof(${SHADER_SRC_NAME_WE}_fp16pa_spv_data)},\n") + string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_fp16s_spv_data,sizeof(${SHADER_SRC_NAME_WE}_fp16s_spv_data)},\n") + string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_fp16sa_spv_data,sizeof(${SHADER_SRC_NAME_WE}_fp16sa_spv_data)},\n") + string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_spv_data)},\n") + string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_fp16p_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_fp16p_spv_data)},\n") + string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_fp16pa_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_fp16pa_spv_data)},\n") + string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_fp16s_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_fp16s_spv_data)},\n") + string(APPEND layer_shader_registry "{${SHADER_SRC_NAME_WE}_image_fp16sa_spv_data,sizeof(${SHADER_SRC_NAME_WE}_image_fp16sa_spv_data)},\n") + + list(APPEND SHADER_SPV_HEX_FILES ${SHADER_SPV_HEADER}) + list(APPEND SHADER_SPV_HEX_FILES ${SHADER_SPV_HEX_HEADERS}) + + # generate layer_shader_type_enum file + set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE} = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") + math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") + set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_fp16p = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") + math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") + 
set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_fp16pa = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") + math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") + set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_fp16s = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") + math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") + set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_fp16sa = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") + math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") + set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") + math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") + set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image_fp16p = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") + math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") + set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image_fp16pa = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") + math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") + set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image_fp16s = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") + math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") + set(layer_shader_type_enum "${layer_shader_type_enum}${SHADER_SRC_NAME_WE}_image_fp16sa = ${__LAYER_SHADER_TYPE_ENUM_INDEX},\n") + math(EXPR __LAYER_SHADER_TYPE_ENUM_INDEX "${__LAYER_SHADER_TYPE_ENUM_INDEX}+1") + +endmacro() + +macro(add_layer class) + string(TOLOWER ${class} name) + + file(GLOB_RECURSE SHADER_SRCS "shaders/${name}.comp") + file(GLOB_RECURSE SHADER_SUBSRCS "shaders/${name}_*.comp") + list(APPEND SHADER_SRCS ${SHADER_SUBSRCS}) + foreach(SHADER_SRC ${SHADER_SRCS}) + add_shader(${SHADER_SRC}) + endforeach() + + # generate layer_type_enum file + set(layer_type_enum "${layer_type_enum}${class} = ${__LAYER_TYPE_ENUM_INDEX},\n") + math(EXPR __LAYER_TYPE_ENUM_INDEX "${__LAYER_TYPE_ENUM_INDEX}+1") +endmacro() + +set(SHADER_SPV_HEX_FILES) + +set(__LAYER_TYPE_ENUM_INDEX 0) +set(__LAYER_SHADER_TYPE_ENUM_INDEX 0) + +add_layer(Convolution) +add_layer(ConvolutionDepthWise) +add_layer(Pooling) +add_layer(Padding) +add_layer(Packing) +add_layer(InnerProduct) +add_layer(Flatten) +add_layer(Relu) +add_layer(Eltwise) +add_layer(Softmax) +add_layer(Dropout) +add_layer(PriorBox) +add_layer(Permute) +add_layer(Reshape) +add_layer(Concat) +add_layer(Interp) +add_layer(Crop) + +add_custom_target(generate-spirv DEPENDS ${SHADER_SPV_HEX_FILES}) + +# create new registry file +configure_file(layer_shader_registry.h.in ${CMAKE_CURRENT_BINARY_DIR}/layer_shader_registry.h) +configure_file(layer_shader_spv_data.h.in ${CMAKE_CURRENT_BINARY_DIR}/layer_shader_spv_data.h) +configure_file(layer_type_enum.h.in ${CMAKE_CURRENT_BINARY_DIR}/layer_type_enum.h) +configure_file(layer_shader_type_enum.h.in ${CMAKE_CURRENT_BINARY_DIR}/layer_shader_type_enum.h) + +# find_package(Vulkan QUIET) +set(VULKAN_LIBRARY "/usr/lib/x86_64-linux-gnu/" CACHE INTERNAL " " FORCE) +set(VULKAN_INCLUDE_DIRS "/usr/include/vulkan/" CACHE INTERNAL " " FORCE) + +# 1. set source root path +SET(_VULKAN_ROOT ${CMAKE_SOURCE_DIR}/source/device/vulkan) +SET(_VULKAN_BUILD_ROOT ${CMAKE_CURRENT_BINARY_DIR}) + + +# 2. 
add header file path +LIST (APPEND _DEV_VULKAN_HEADER_PATH ${_VULKAN_BUILD_ROOT}) +LIST (APPEND _DEV_VULKAN_HEADER_PATH ${_VULKAN_ROOT}) +LIST (APPEND _DEV_VULKAN_HEADER_PATH ${VULKAN_INCLUDE_DIRS}) + + +# 3. add linking lib searching path +LIST (APPEND _DEV_VULKAN_LINK_PATH ${VULKAN_LIBRARY}) + + +# 4. add source files +AUX_SOURCE_DIRECTORY("${_VULKAN_ROOT}" _VULKAN_BASE_SOURCE) +AUX_SOURCE_DIRECTORY("${_VULKAN_ROOT}/layer" _VULKAN_OPS_SOURCE) +LIST (APPEND _DEV_VULKAN_DEVICE_SOURCE ${_VULKAN_BASE_SOURCE}) +LIST (APPEND _DEV_VULKAN_DEVICE_SOURCE ${_VULKAN_OPS_SOURCE}) + + +# 5. add build options for vulkan device +# 5.1 is a gcc or clang like compiler +IF (TENGINE_COMPILER_GCC OR TENGINE_COMPILER_CLANG) + IF (TENGINE_COMPILER_GCC AND (${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER_EQUAL "6.1")) + LIST (APPEND _DEV_VULKAN_COMPILER_OPTIONS -Wno-ignored-attributes) + ENDIF() +ENDIF() + + +# 5.2 is Microsoft Visual C++ +IF (TENGINE_COMPILER_MSVC) +ENDIF() + + +# 6. add link options + + +# 7. add link libs +LIST (APPEND _DEV_VULKAN_LINK_LIBRARIES "libvulkan.so") + + +# 8. set all to cmake cache +SET (TENGINE_VULKAN_HEADER_PATH ${_DEV_VULKAN_HEADER_PATH} CACHE INTERNAL "Tengine VULKAN device header files searching path" FORCE) +SET (TENGINE_VULKAN_LINK_PATH ${_DEV_VULKAN_LINK_PATH} CACHE INTERNAL "Tengine VULKAN device link libraries searching path" FORCE) +SET (TENGINE_VULKAN_DEVICE_SOURCE ${_DEV_VULKAN_DEVICE_SOURCE} CACHE INTERNAL "Tengine VULKAN device main source files" FORCE) +SET (TENGINE_VULKAN_COMPILER_DEFINES ${_DEV_VULKAN_COMPILER_DEFINES} CACHE INTERNAL "Tengine VULKAN about compiler defines" FORCE) +SET (TENGINE_VULKAN_COMPILER_OPTIONS ${_DEV_VULKAN_COMPILER_OPTIONS} CACHE INTERNAL "Tengine VULKAN about compiler options" FORCE) +SET (TENGINE_VULKAN_LINKER_OPTIONS ${_DEV_VULKAN_LINKER_OPTIONS} CACHE INTERNAL "Tengine VULKAN about linker options" FORCE) +SET (TENGINE_VULKAN_LINK_LIBRARIES ${_DEV_VULKAN_LINK_LIBRARIES} CACHE INTERNAL "Tengine VULKAN about link libraries" FORCE) +SET (TENGINE_VULKAN_DEPENDS_FORWARD generate-spirv CACHE INTERNAL "Tengine VULKAN about depends project" FORCE) + + +# 9. install device option +INSTALL (FILES ${_VULKAN_ROOT}/vulkan_define.h DESTINATION include/tengine RENAME vulkan_device.h) diff --git a/source/device/vulkan/layer/concat_vulkan.cpp b/source/device/vulkan/layer/concat_vulkan.cpp new file mode 100644 index 000000000..926e7b19a --- /dev/null +++ b/source/device/vulkan/layer/concat_vulkan.cpp @@ -0,0 +1,788 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available.
+ * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#include "concat_vulkan.hpp" +#include "../layer_shader_type.h" + +namespace TEngine { + +Concat_vulkan::Concat_vulkan() +{ + support_vulkan = true; + support_image_storage = false; + + pipeline_concat[0] = 0; + pipeline_concat[1] = 0; + pipeline_concat_pack4[0] = 0; + pipeline_concat_pack4[1] = 0; + pipeline_concat_pack4to1[0] = 0; + pipeline_concat_pack4to1[1] = 0; + pipeline_concat_pack8[0] = 0; + pipeline_concat_pack8[1] = 0; + pipeline_concat_pack8to4[0] = 0; + pipeline_concat_pack8to4[1] = 0; + pipeline_concat_pack8to1[0] = 0; + pipeline_concat_pack8to1[1] = 0; +} + +Concat_vulkan::Concat_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) +{ + support_vulkan = true; + support_image_storage = false; + + pipeline_concat[0] = 0; + pipeline_concat[1] = 0; + pipeline_concat_pack4[0] = 0; + pipeline_concat_pack4[1] = 0; + pipeline_concat_pack4to1[0] = 0; + pipeline_concat_pack4to1[1] = 0; + pipeline_concat_pack8[0] = 0; + pipeline_concat_pack8[1] = 0; + pipeline_concat_pack8to4[0] = 0; + pipeline_concat_pack8to4[1] = 0; + pipeline_concat_pack8to1[0] = 0; + pipeline_concat_pack8to1[1] = 0; + + graph = ir_graph; + node = ir_node; + + for(int i = 0; i < ir_node->input_num; i++) + { + struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[i]); + std::string name = input->name; + bottoms.push_back(name); + } + + for(int i = 0; i < ir_node->output_num; i++) + { + struct tensor *output = get_ir_graph_tensor(graph, node->output_tensors[i]); + std::string name = output->name; + tops.push_back(name); + } + + // params + struct tensor *input_tensor = get_ir_graph_tensor(graph, node->input_tensors[0]); + struct tensor *output_tensor = get_ir_graph_tensor(graph, node->output_tensors[0]); + input_c = input_tensor->dims[1]; // param->input_channel; + input_h = input_tensor->dims[2]; + input_w = input_tensor->dims[3]; + output_c = output_tensor->dims[1]; // param->output_channel; + output_h = output_tensor->dims[2]; + output_w = output_tensor->dims[3]; + + struct concat_param *param = (struct concat_param *)ir_node->op.param_mem; + axis = param->axis - 1; +} + +int Concat_vulkan::create_pipeline(const Option& _opt) +{ + Option opt = _opt; + + const Tensor& shape = Tensor(input_w, input_h, input_c, (void*)0); // bottom_shapes.empty() ? Tensor() : bottom_shapes[0]; + const Tensor& out_shape = Tensor(output_w, output_h, output_c, (void*)0); // top_shapes.empty() ? Tensor() : top_shapes[0]; + + int out_elempack = 1; + if (out_shape.dims == 1) out_elempack = opt.use_shader_pack8 && out_shape.w % 8 == 0 ? 8 : out_shape.w % 4 == 0 ? 4 : 1; + if (out_shape.dims == 2) out_elempack = opt.use_shader_pack8 && out_shape.h % 8 == 0 ? 8 : out_shape.h % 4 == 0 ? 4 : 1; + if (out_shape.dims == 3) out_elempack = opt.use_shader_pack8 && out_shape.c % 8 == 0 ? 8 : out_shape.c % 4 == 0 ? 4 : 1; + + int elempack = 1; + if (axis == 0) + { + if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1; + if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ?
4 : 1; + if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1; + + // TODO fix other input data shape to set elempack + // for (size_t b = 1; b < bottom_shapes.size(); b++) + // { + // const Tensor& shape1 = bottom_shapes[b]; + + // int elempack1 = 1; + // if (shape1.dims == 1) elempack1 = opt.use_shader_pack8 && shape1.w % 8 == 0 ? 8 : shape1.w % 4 == 0 ? 4 : 1; + // if (shape1.dims == 2) elempack1 = opt.use_shader_pack8 && shape1.h % 8 == 0 ? 8 : shape1.h % 4 == 0 ? 4 : 1; + // if (shape1.dims == 3) elempack1 = opt.use_shader_pack8 && shape1.c % 8 == 0 ? 8 : shape1.c % 4 == 0 ? 4 : 1; + + // elempack = std::min(elempack, elempack1); + // } + } + else + { + elempack = out_elempack; + } + + size_t elemsize; + if (opt.use_fp16_storage) + { + elemsize = elempack * 2u; + } + else if (opt.use_fp16_packed) + { + elemsize = elempack == 1 ? 4u : elempack * 2u; + } + else + { + elemsize = elempack * 4u; + } + + Tensor out_shape_unpacked; + if (out_shape.dims == 1) out_shape_unpacked = Tensor(out_shape.w / elempack, (void*)0, elemsize, elempack); + if (out_shape.dims == 2) out_shape_unpacked = Tensor(out_shape.w, out_shape.h / elempack, (void*)0, elemsize, elempack); + if (out_shape.dims == 3) out_shape_unpacked = Tensor(out_shape.w, out_shape.h, out_shape.c / elempack, (void*)0, elemsize, elempack); + + // if (!vkdev->shape_support_image_storage(out_shape_unpacked)) + { + support_image_storage = false; + opt.use_image_storage = false; + } + + std::vector specializations(1 + 10); + specializations[0].i = axis; + specializations[1 + 0].i = 0; // TODO handle shape_packed for concat2 + specializations[1 + 1].i = 0; + specializations[1 + 2].i = 0; + specializations[1 + 3].i = 0; + specializations[1 + 4].i = 0; + specializations[1 + 5].i = out_shape_unpacked.dims; + specializations[1 + 6].i = out_shape_unpacked.w; + specializations[1 + 7].i = out_shape_unpacked.h; + specializations[1 + 8].i = out_shape_unpacked.c; + specializations[1 + 9].i = out_shape_unpacked.cstep; + + Tensor local_size_xyz; // TODO more precise group size guessed from out_shape_unpacked + if (out_shape_unpacked.dims == 1) + { + local_size_xyz.w = 64; + local_size_xyz.h = 1; + local_size_xyz.c = 1; + } + if (out_shape_unpacked.dims == 2) + { + local_size_xyz.w = 8; + local_size_xyz.h = 8; + local_size_xyz.c = 1; + } + if (out_shape_unpacked.dims == 3) + { + local_size_xyz.w = 4; + local_size_xyz.h = 4; + local_size_xyz.c = 4; + } + + // pack1 + if (shape.dims == 0 || elempack == 1) + { + pipeline_concat[0] = new Pipeline(vkdev); + pipeline_concat[0]->set_optimal_local_size_xyz(local_size_xyz); + pipeline_concat[0]->create(LayerShaderType::concat, opt, specializations); + pipeline_concat[1] = new Pipeline(vkdev); + pipeline_concat[1]->set_optimal_local_size_xyz(local_size_xyz); + pipeline_concat[1]->create(LayerShaderType::concat, opt, specializations); + } + + // pack4 + if (shape.dims == 0 || elempack == 4) + { + pipeline_concat_pack4[0] = new Pipeline(vkdev); + pipeline_concat_pack4[0]->set_optimal_local_size_xyz(local_size_xyz); + pipeline_concat_pack4[0]->create(LayerShaderType::concat_pack4, opt, specializations); + pipeline_concat_pack4[1] = new Pipeline(vkdev); + pipeline_concat_pack4[1]->set_optimal_local_size_xyz(local_size_xyz); + pipeline_concat_pack4[1]->create(LayerShaderType::concat_pack4, opt, specializations); + } + + // pack4to1 + if ((axis == 0 && shape.dims == 0) || elempack == 1) + { + pipeline_concat_pack4to1[0] = new Pipeline(vkdev); + 
pipeline_concat_pack4to1[0]->set_optimal_local_size_xyz(local_size_xyz); + pipeline_concat_pack4to1[0]->create(LayerShaderType::concat_pack4to1, opt, specializations); + pipeline_concat_pack4to1[1] = new Pipeline(vkdev); + pipeline_concat_pack4to1[1]->set_optimal_local_size_xyz(local_size_xyz); + pipeline_concat_pack4to1[1]->create(LayerShaderType::concat_pack4to1, opt, specializations); + } + + // pack8 + if (opt.use_shader_pack8 && (shape.dims == 0 || elempack == 8)) + { + pipeline_concat_pack8[0] = new Pipeline(vkdev); + pipeline_concat_pack8[0]->set_optimal_local_size_xyz(local_size_xyz); + pipeline_concat_pack8[0]->create(LayerShaderType::concat_pack8, opt, specializations); + pipeline_concat_pack8[1] = new Pipeline(vkdev); + pipeline_concat_pack8[1]->set_optimal_local_size_xyz(local_size_xyz); + pipeline_concat_pack8[1]->create(LayerShaderType::concat_pack8, opt, specializations); + } + + // pack8to4 + if (opt.use_shader_pack8 && ((axis == 0 && shape.dims == 0) || elempack == 4)) + { + pipeline_concat_pack8to4[0] = new Pipeline(vkdev); + pipeline_concat_pack8to4[0]->set_optimal_local_size_xyz(local_size_xyz); + pipeline_concat_pack8to4[0]->create(LayerShaderType::concat_pack8to4, opt, specializations); + pipeline_concat_pack8to4[1] = new Pipeline(vkdev); + pipeline_concat_pack8to4[1]->set_optimal_local_size_xyz(local_size_xyz); + pipeline_concat_pack8to4[1]->create(LayerShaderType::concat_pack8to4, opt, specializations); + } + + // pack8to1 + if (opt.use_shader_pack8 && ((axis == 0 && shape.dims == 0) || elempack == 1)) + { + pipeline_concat_pack8to1[0] = new Pipeline(vkdev); + pipeline_concat_pack8to1[0]->set_optimal_local_size_xyz(local_size_xyz); + pipeline_concat_pack8to1[0]->create(LayerShaderType::concat_pack8to1, opt, specializations); + pipeline_concat_pack8to1[1] = new Pipeline(vkdev); + pipeline_concat_pack8to1[1]->set_optimal_local_size_xyz(local_size_xyz); + pipeline_concat_pack8to1[1]->create(LayerShaderType::concat_pack8to1, opt, specializations); + } + + return 0; +} + +int Concat_vulkan::destroy_pipeline(const Option& /*opt*/) +{ + delete pipeline_concat[0]; + delete pipeline_concat[1]; + pipeline_concat[0] = 0; + pipeline_concat[1] = 0; + + delete pipeline_concat_pack4[0]; + delete pipeline_concat_pack4[1]; + pipeline_concat_pack4[0] = 0; + pipeline_concat_pack4[1] = 0; + + delete pipeline_concat_pack4to1[0]; + delete pipeline_concat_pack4to1[1]; + pipeline_concat_pack4to1[0] = 0; + pipeline_concat_pack4to1[1] = 0; + + delete pipeline_concat_pack8[0]; + delete pipeline_concat_pack8[1]; + pipeline_concat_pack8[0] = 0; + pipeline_concat_pack8[1] = 0; + + delete pipeline_concat_pack8to4[0]; + delete pipeline_concat_pack8to4[1]; + pipeline_concat_pack8to4[0] = 0; + pipeline_concat_pack8to4[1] = 0; + + delete pipeline_concat_pack8to1[0]; + delete pipeline_concat_pack8to1[1]; + pipeline_concat_pack8to1[0] = 0; + pipeline_concat_pack8to1[1] = 0; + + return 0; +} + +int Concat_vulkan::record_pipeline(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const +{ + int dims = bottom_blobs[0].dims; + + if (dims == 1) // axis == 0 + { + // concat vector + // total length + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + int top_w = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const VkTensor& bottom_blob = bottom_blobs[b]; + elemsize = std::min(elemsize, bottom_blob.elemsize); + elempack = std::min(elempack, bottom_blob.elempack); + top_w += bottom_blob.w * bottom_blob.elempack; + } + + 
int out_elempack = opt.use_shader_pack8 && top_w % 8 == 0 ? 8 : top_w % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (opt.use_fp16_packed && !opt.use_fp16_storage) + { + if (out_elempack == 8) out_elemsize = 8 * 2u; + if (out_elempack == 4) out_elemsize = 4 * 2u; + if (out_elempack == 1) out_elemsize = 4u; + } + + VkTensor& top_blob = top_blobs[0]; + top_blob.create(top_w / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + VkTensor top_blob_unpacked = top_blob; + if (elempack < out_elempack) + { + top_blob_unpacked.create(top_w / elempack, elemsize, elempack, opt.workspace_vkallocator); + if (top_blob_unpacked.empty()) + return -100; + } + + int woffset = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const VkTensor& bottom_blob = bottom_blobs[b]; + + std::vector bindings(2); + bindings[0] = bottom_blob; + bindings[1] = top_blob_unpacked; + + std::vector constants(11); + constants[0].i = bottom_blob.dims; + constants[1].i = bottom_blob.w; + constants[2].i = bottom_blob.h; + constants[3].i = bottom_blob.c; + constants[4].i = bottom_blob.cstep; + constants[5].i = top_blob_unpacked.dims; + constants[6].i = top_blob_unpacked.w; + constants[7].i = top_blob_unpacked.h; + constants[8].i = top_blob_unpacked.c; + constants[9].i = top_blob_unpacked.cstep; + constants[10].i = woffset; + + const Pipeline* pipeline = 0; + if (bottom_blob.elempack == 1 && elempack == 1) + { + pipeline = pipeline_concat[b % 2]; + } + else if (bottom_blob.elempack == 4 && elempack == 4) + { + pipeline = pipeline_concat_pack4[b % 2]; + } + else if (bottom_blob.elempack == 4 && elempack == 1) + { + pipeline = pipeline_concat_pack4to1[b % 2]; + } + else if (bottom_blob.elempack == 8 && elempack == 8) + { + pipeline = pipeline_concat_pack8[b % 2]; + } + else if (bottom_blob.elempack == 8 && elempack == 4) + { + pipeline = pipeline_concat_pack8to4[b % 2]; + } + else if (bottom_blob.elempack == 8 && elempack == 1) + { + pipeline = pipeline_concat_pack8to1[b % 2]; + } + + cmd.record_pipeline(pipeline, bindings, constants, bottom_blob); + + woffset += bottom_blob.w * bottom_blob.elempack / elempack; + } + + // packing + if (elempack < out_elempack) + { + vkdev->convert_packing(top_blob_unpacked, top_blob, out_elempack, cmd, opt); + } + + return 0; + } + + if (dims == 2 && axis == 0) + { + // concat image + int w = bottom_blobs[0].w; + + // total height + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + int top_h = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const VkTensor& bottom_blob = bottom_blobs[b]; + elemsize = std::min(elemsize, bottom_blob.elemsize); + elempack = std::min(elempack, bottom_blob.elempack); + top_h += bottom_blob.h * bottom_blob.elempack; + } + + int out_elempack = opt.use_shader_pack8 && top_h % 8 == 0 ? 8 : top_h % 4 == 0 ? 
4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (opt.use_fp16_packed && !opt.use_fp16_storage) + { + if (out_elempack == 8) out_elemsize = 8 * 2u; + if (out_elempack == 4) out_elemsize = 4 * 2u; + if (out_elempack == 1) out_elemsize = 4u; + } + + VkTensor& top_blob = top_blobs[0]; + top_blob.create(w, top_h / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + VkTensor top_blob_unpacked = top_blob; + if (elempack < out_elempack) + { + top_blob_unpacked.create(w, top_h / elempack, elemsize, elempack, opt.workspace_vkallocator); + if (top_blob_unpacked.empty()) + return -100; + } + + int hoffset = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const VkTensor& bottom_blob = bottom_blobs[b]; + + std::vector bindings(2); + bindings[0] = bottom_blob; + bindings[1] = top_blob_unpacked; + + std::vector constants(11); + constants[0].i = bottom_blob.dims; + constants[1].i = bottom_blob.w; + constants[2].i = bottom_blob.h; + constants[3].i = bottom_blob.c; + constants[4].i = bottom_blob.cstep; + constants[5].i = top_blob_unpacked.dims; + constants[6].i = top_blob_unpacked.w; + constants[7].i = top_blob_unpacked.h; + constants[8].i = top_blob_unpacked.c; + constants[9].i = top_blob_unpacked.cstep; + constants[10].i = hoffset; + + const Pipeline* pipeline = 0; + if (bottom_blob.elempack == 1 && elempack == 1) + { + pipeline = pipeline_concat[b % 2]; + } + else if (bottom_blob.elempack == 4 && elempack == 4) + { + pipeline = pipeline_concat_pack4[b % 2]; + } + else if (bottom_blob.elempack == 4 && elempack == 1) + { + pipeline = pipeline_concat_pack4to1[b % 2]; + } + else if (bottom_blob.elempack == 8 && elempack == 8) + { + pipeline = pipeline_concat_pack8[b % 2]; + } + else if (bottom_blob.elempack == 8 && elempack == 4) + { + pipeline = pipeline_concat_pack8to4[b % 2]; + } + else if (bottom_blob.elempack == 8 && elempack == 1) + { + pipeline = pipeline_concat_pack8to1[b % 2]; + } + + cmd.record_pipeline(pipeline, bindings, constants, bottom_blob); + + hoffset += bottom_blob.h * bottom_blob.elempack / elempack; + } + + // packing + if (elempack < out_elempack) + { + vkdev->convert_packing(top_blob_unpacked, top_blob, out_elempack, cmd, opt); + } + + return 0; + } + + if (dims == 2 && axis == 1) + { + // interleave image row + int h = bottom_blobs[0].h; + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + + // total width + int top_w = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const VkTensor& bottom_blob = bottom_blobs[b]; + top_w += bottom_blob.w; + } + + VkTensor& top_blob = top_blobs[0]; + top_blob.create(top_w, h, elemsize, elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + int woffset = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const VkTensor& bottom_blob = bottom_blobs[b]; + + std::vector bindings(2); + bindings[0] = bottom_blob; + bindings[1] = top_blob; + + std::vector constants(11); + constants[0].i = bottom_blob.dims; + constants[1].i = bottom_blob.w; + constants[2].i = bottom_blob.h; + constants[3].i = bottom_blob.c; + constants[4].i = bottom_blob.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = top_blob.cstep; + constants[10].i = woffset; + + const Pipeline* pipeline = elempack == 8 ? pipeline_concat_pack8[b % 2] + : elempack == 4 ? 
pipeline_concat_pack4[b % 2] + : pipeline_concat[b % 2]; + + cmd.record_pipeline(pipeline, bindings, constants, bottom_blob); + + woffset += bottom_blob.w; + } + + return 0; + } + + if (dims == 3 && axis == 0) + { + // concat dim + int w = bottom_blobs[0].w; + int h = bottom_blobs[0].h; + + // total channels + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + int top_channels = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const VkTensor& bottom_blob = bottom_blobs[b]; + elemsize = std::min(elemsize, bottom_blob.elemsize); + elempack = std::min(elempack, bottom_blob.elempack); + top_channels += bottom_blob.c * bottom_blob.elempack; + } + + int out_elempack = opt.use_shader_pack8 && top_channels % 8 == 0 ? 8 : top_channels % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (opt.use_fp16_packed && !opt.use_fp16_storage) + { + if (out_elempack == 8) out_elemsize = 8 * 2u; + if (out_elempack == 4) out_elemsize = 4 * 2u; + if (out_elempack == 1) out_elemsize = 4u; + } + + VkTensor& top_blob = top_blobs[0]; + top_blob.create(w, h, top_channels / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + VkTensor top_blob_unpacked = top_blob; + if (elempack < out_elempack) + { + top_blob_unpacked.create(w, h, top_channels / elempack, elemsize, elempack, opt.workspace_vkallocator); + if (top_blob_unpacked.empty()) + return -100; + } + + int coffset = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const VkTensor& bottom_blob = bottom_blobs[b]; + + std::vector bindings(2); + bindings[0] = bottom_blob; + bindings[1] = top_blob_unpacked; + + std::vector constants(11); + constants[0].i = bottom_blob.dims; + constants[1].i = bottom_blob.w; + constants[2].i = bottom_blob.h; + constants[3].i = bottom_blob.c; + constants[4].i = bottom_blob.cstep; + constants[5].i = top_blob_unpacked.dims; + constants[6].i = top_blob_unpacked.w; + constants[7].i = top_blob_unpacked.h; + constants[8].i = top_blob_unpacked.c; + constants[9].i = top_blob_unpacked.cstep; + constants[10].i = coffset; + + const Pipeline* pipeline = 0; + if (bottom_blob.elempack == 1 && elempack == 1) + { + pipeline = pipeline_concat[b % 2]; + } + else if (bottom_blob.elempack == 4 && elempack == 4) + { + pipeline = pipeline_concat_pack4[b % 2]; + } + else if (bottom_blob.elempack == 4 && elempack == 1) + { + pipeline = pipeline_concat_pack4to1[b % 2]; + } + else if (bottom_blob.elempack == 8 && elempack == 8) + { + pipeline = pipeline_concat_pack8[b % 2]; + } + else if (bottom_blob.elempack == 8 && elempack == 4) + { + pipeline = pipeline_concat_pack8to4[b % 2]; + } + else if (bottom_blob.elempack == 8 && elempack == 1) + { + pipeline = pipeline_concat_pack8to1[b % 2]; + } + + cmd.record_pipeline(pipeline, bindings, constants, bottom_blob); + + coffset += bottom_blob.c * bottom_blob.elempack / elempack; + } + + // packing + if (elempack < out_elempack) + { + vkdev->convert_packing(top_blob_unpacked, top_blob, out_elempack, cmd, opt); + } + + return 0; + } + + if (dims == 3 && axis == 1) + { + // interleave dim height + int w = bottom_blobs[0].w; + int channels = bottom_blobs[0].c; + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + + // total height + int top_h = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const VkTensor& bottom_blob = bottom_blobs[b]; + top_h += bottom_blob.h; + } + + VkTensor& top_blob = top_blobs[0]; + top_blob.create(w, 
top_h, channels, elemsize, elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + int hoffset = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const VkTensor& bottom_blob = bottom_blobs[b]; + + std::vector bindings(2); + bindings[0] = bottom_blob; + bindings[1] = top_blob; + + std::vector constants(11); + constants[0].i = bottom_blob.dims; + constants[1].i = bottom_blob.w; + constants[2].i = bottom_blob.h; + constants[3].i = bottom_blob.c; + constants[4].i = bottom_blob.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = top_blob.cstep; + constants[10].i = hoffset; + + const Pipeline* pipeline = elempack == 8 ? pipeline_concat_pack8[b % 2] + : elempack == 4 ? pipeline_concat_pack4[b % 2] + : pipeline_concat[b % 2]; + + cmd.record_pipeline(pipeline, bindings, constants, bottom_blob); + + hoffset += bottom_blob.h; + } + + return 0; + } + + if (dims == 3 && axis == 2) + { + // interleave dim width + int h = bottom_blobs[0].h; + int channels = bottom_blobs[0].c; + size_t elemsize = bottom_blobs[0].elemsize; + int elempack = bottom_blobs[0].elempack; + + // total height + int top_w = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const VkTensor& bottom_blob = bottom_blobs[b]; + top_w += bottom_blob.w; + } + + VkTensor& top_blob = top_blobs[0]; + top_blob.create(top_w, h, channels, elemsize, elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + int woffset = 0; + for (size_t b = 0; b < bottom_blobs.size(); b++) + { + const VkTensor& bottom_blob = bottom_blobs[b]; + + std::vector bindings(2); + bindings[0] = bottom_blob; + bindings[1] = top_blob; + + std::vector constants(11); + constants[0].i = bottom_blob.dims; + constants[1].i = bottom_blob.w; + constants[2].i = bottom_blob.h; + constants[3].i = bottom_blob.c; + constants[4].i = bottom_blob.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = top_blob.cstep; + constants[10].i = woffset; + + const Pipeline* pipeline = elempack == 8 ? pipeline_concat_pack8[b % 2] + : elempack == 4 ? pipeline_concat_pack4[b % 2] + : pipeline_concat[b % 2]; + + cmd.record_pipeline(pipeline, bindings, constants, bottom_blob); + + woffset += bottom_blob.w; + } + + return 0; + } + + return 0; +} + +} // namespace TEngine \ No newline at end of file diff --git a/source/device/vulkan/layer/concat_vulkan.hpp b/source/device/vulkan/layer/concat_vulkan.hpp new file mode 100644 index 000000000..6476fc997 --- /dev/null +++ b/source/device/vulkan/layer/concat_vulkan.hpp @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#ifndef LAYER_CONCAT_HPP +#define LAYER_CONCAT_HPP + +#include "../vulkan_layer.hpp" +#include "../vulkan_command.hpp" + +#include "concat_param.h" + +namespace TEngine{ + +class Concat_vulkan : public Layer +{ +public: + Concat_vulkan(); + Concat_vulkan(ir_graph_t* graph, ir_node_t* ir_node); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + virtual int record_pipeline(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const; + +public: + Pipeline* pipeline_concat[2]; + Pipeline* pipeline_concat_pack4[2]; + Pipeline* pipeline_concat_pack4to1[2]; + Pipeline* pipeline_concat_pack8[2]; + Pipeline* pipeline_concat_pack8to4[2]; + Pipeline* pipeline_concat_pack8to1[2]; + +public: + int input_c; + int input_h; + int input_w; + int output_c; + int output_h; + int output_w; + int axis; +}; + +} // namespace TEngine + +#endif \ No newline at end of file diff --git a/source/device/vulkan/layer/convolution_vulkan.cpp b/source/device/vulkan/layer/convolution_vulkan.cpp new file mode 100644 index 000000000..5f135feba --- /dev/null +++ b/source/device/vulkan/layer/convolution_vulkan.cpp @@ -0,0 +1,616 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. 
You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#include "convolution_vulkan.hpp" +#include "../layer_shader_type.h" + +namespace TEngine { + +Convolution_vulkan::Convolution_vulkan() +{ + support_vulkan = true; + pipeline_convolution = 0; +} + +Convolution_vulkan::Convolution_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) +{ + support_vulkan = true; + padding = 0; + innerproduct = 0; + + pipeline_convolution = 0; + pipeline_convolution_pack4 = 0; + pipeline_convolution_pack8 = 0; + pipeline_convolution_pack1to4 = 0; + pipeline_convolution_pack4to1 = 0; + pipeline_convolution_pack1to8 = 0; + pipeline_convolution_pack4to8 = 0; + pipeline_convolution_pack8to1 = 0; + pipeline_convolution_pack8to4 = 0; + pipeline_convolution_1x1s1d1 = 0; + pipeline_convolution_pack4_1x1s1d1 = 0; + pipeline_convolution_pack8_1x1s1d1 = 0; + + graph = ir_graph; + node = ir_node; + + struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[0]); + std::string name = input->name; + bottoms.push_back(name); + + // Tensor* output_tensor = t_node->GetOutputTensor(0); + struct tensor *output = get_ir_graph_tensor(graph, node->output_tensors[0]); + name = output->name; + tops.push_back(name); + + // Convolution* conv_op = dynamic_cast(node->GetOp()); + // ConvParam* param = conv_op->GetParam(); + struct conv_param *param = (struct conv_param *)ir_node->op.param_mem; + + group = param->group; + input_c = input->dims[1]; // param->input_channel; + input_h = input->dims[2]; + input_w = input->dims[3]; + pad_w0 = param->pad_w0; // left padding columns + pad_w1 = param->pad_w1; // right padding columns + pad_h0 = param->pad_h0; // top padding rows + pad_h1 = param->pad_h1; // bottom padding rows + stride_w = param->stride_w; + stride_h = param->stride_h; + dilation_w = param->dilation_w; + dilation_h = param->dilation_h; + kernel_w = param->kernel_w; + kernel_h = param->kernel_h; + activation = param->activation == 0 ? 1 : -1; + output_c = output->dims[1]; // param->output_channel; + output_h = output->dims[2]; + output_w = output->dims[3]; + struct tensor *weight = get_ir_graph_tensor(graph, node->input_tensors[1]); + weight_data_size = weight->elem_num; +} + +int Convolution_vulkan::create_pipeline(const Option& _opt) +{ + Option opt = _opt; + + // const Tshape& shape = bottom_shapes.empty() ? Tshape() : bottom_shapes[0]; + // const Tshape& out_shape = top_shapes.empty() ? 
Tshape() : top_shapes[0]; + + // const int maxk = kernel_w * kernel_h; + // // int num_input = weight_data_size / maxk / num_output; + // int num_output = output_c; + // int num_input = input_c; + const Tshape& shape = Tshape(input_w, input_h, input_c); + const Tshape& out_shape = Tshape(output_w, output_h, output_c); + const int maxk = kernel_w * kernel_h; + int num_output = output_c; + int num_input = input_c; + int pad_left = pad_w0; + int pad_right = pad_w1; + int pad_top = pad_h0; + int pad_bottom = pad_h1; + + // TLOG_INFO("%d %d %d -> %d %d %d\n", shape.c, shape.h, shape.w, out_shape.c, out_shape.h, out_shape.w); + // fc + // if (kernel_w == 1 && kernel_h == 1) + // { + // innerproduct = new InnerProduct_vulkan(graph, node); + // innerproduct->vkdev = vkdev; + + // innerproduct->create_pipeline(opt); + + // if (shape.dims == 1 && shape.w == num_input) + // { + // return 0; + // } + // } + + Tshape shape_bordered = Tshape(); + + if (shape.dims != 0) + { + if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0) + { + shape_bordered = Tshape(shape.w + pad_left + pad_right, shape.h + pad_top + pad_bottom, shape.c); + } + else if ((pad_left == -233 && pad_right == -233 && pad_top == -233 && pad_bottom == -233) + || (pad_left == -234 && pad_right == -234 && pad_top == -234 && pad_bottom == -234)) + { + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + int wpad = kernel_extent_w + (shape.w - 1) / stride_w * stride_w - shape.w; + int hpad = kernel_extent_h + (shape.h - 1) / stride_h * stride_h - shape.h; + if (wpad > 0 || hpad > 0) + { + shape_bordered = Tshape(shape.w + wpad, shape.h + hpad, shape.c); + } + } + else + { + shape_bordered = shape; + } + } + + int elempack = opt.use_shader_pack8 && num_input % 8 == 0 ? 8 : num_input % 4 == 0 ? 4 : 1; + int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; + + size_t elemsize; + size_t out_elemsize; + if (opt.use_fp16_storage) + { + elemsize = elempack * 2u; + out_elemsize = out_elempack * 2u; + } + else if (opt.use_fp16_packed) + { + elemsize = elempack == 1 ? 4u : elempack * 2u; + out_elemsize = out_elempack == 1 ? 4u : out_elempack * 2u; + } + else + { + elemsize = elempack * 4u; + out_elemsize = out_elempack * 4u; + } + + // TLOG_INFO("elemsize out_elemsize:%d %d\n", elemsize, out_elemsize); + + Tshape shape_bordered_packed; + // if (shape_bordered.dims == 3) shape_bordered_packed = Mat(shape_bordered.w, shape_bordered.h, num_input / elempack, (void*)0, elemsize, elempack); + if (shape_bordered.dims == 3) shape_bordered_packed = Tshape(shape_bordered.w, shape_bordered.h, num_input / elempack); + + Tshape out_shape_packed; + // if (out_shape.dims == 3) out_shape_packed = Mat(out_shape.w, out_shape.h, num_output / out_elempack, (void*)0, out_elemsize, out_elempack); + if (out_shape.dims == 3) out_shape_packed = Tshape(out_shape.w, out_shape.h, num_output / out_elempack); + + bool is_conv1x1s1d1 = kernel_w == 1 && kernel_h == 1 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1; + // bool is_conv3x3s1d1 = kernel_w == 3 && kernel_h == 3 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1; + // bool is_conv1x1s1d1 = false; + bool is_conv3x3s1d1 = false; + + // if (is_conv3x3s1d1 && num_input >= 16 && num_output >= 16 && ((elempack == 4 && out_elempack == 4) || (elempack == 8 && out_elempack == 8))) + { + // TODO do nothing for wino fix me!!!!! 
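+        // With is_conv3x3s1d1 hard-coded to false above, this winograd23 branch is never taken;
+        // 3x3 stride-1 convolutions currently fall through to the generic pack1/pack4/pack8 pipelines created below.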
+ } + // else + { + support_image_storage = false; + opt.use_image_storage = false; + } + + { + padding = new Padding_vulkan(); + padding->vkdev = vkdev; + + padding->top = pad_h0; + padding->bottom = pad_h1; + padding->left = pad_w0; + padding->right = pad_w1; + padding->type = 0; + padding->value = 0; + + padding->input_w = input_w; + padding->input_h = input_h; + padding->input_c = input_c; + padding->output_w = input_w + pad_w0 + pad_w1; + padding->output_h = input_h + pad_h0 + pad_h1; + padding->output_c = input_c; + + padding->create_pipeline(opt); + } + + std::vector specializations(10 + 10); + specializations[0].i = kernel_w; // kernel_w; + specializations[1].i = kernel_h; // kernel_h + specializations[2].i = dilation_w; // dilation_w; + specializations[3].i = dilation_h; // dilation_h; + specializations[4].i = stride_w; // stride_w; + specializations[5].i = stride_h; // stride_h; + specializations[6].i = node->input_num>2 ? 1 : 0; // bias_term; + specializations[7].i = activation; // activation_type; + specializations[8].f = 0;//param->activation; // activation_params.w >= 1 ? activation_params[0] : 0.f; + specializations[9].f = 0;//param->activation; // activation_params.w == 2 ? activation_params[1] : 0.f; + specializations[10 + 0].i = 0;//3; // shape_bordered_packed.dims; + specializations[10 + 1].i = 0;//input_w + pad_w0 + pad_w1; // shape_bordered_packed.w; + specializations[10 + 2].i = 0;//input_h + pad_h0 + pad_h1; // shape_bordered_packed.h; + specializations[10 + 3].i = 0;//input_c; // shape_bordered_packed.c; + specializations[10 + 4].i = 0;//(input_w + pad_w0 + pad_w1) * (input_h + pad_h0 + pad_h1); // shape_bordered_packed.cstep; + specializations[10 + 5].i = 0; // out_shape_packed.dims; + specializations[10 + 6].i = 0;//output_w; // out_shape_packed.w; + specializations[10 + 7].i = 0;//output_h; // out_shape_packed.h; + specializations[10 + 8].i = 0;//output_c; // out_shape_packed.c; + specializations[10 + 9].i = 0;//output_w * output_h; // out_shape_packed.cstep; + + // TODO with local_size_xyz and shader_index options + + VkTensor local_size_xyz; + local_size_xyz.w = std::min(8, out_shape_packed.w); + local_size_xyz.h = std::min(8, out_shape_packed.h); + local_size_xyz.c = std::min(4, out_shape_packed.c); + + // TLOG_INFO("create pipeline elempack out_elempack:%d %d\n", elempack, out_elempack); + + + if (elempack == 1 && out_elempack == 1) + { + // TODO deal with conv1x1s1d1 + if (is_conv1x1s1d1) + { + pipeline_convolution_1x1s1d1 = new Pipeline(vkdev); + pipeline_convolution_1x1s1d1->set_local_size_xyz(8, 1, std::min(8, num_output)); + pipeline_convolution_1x1s1d1->create(LayerShaderType::convolution_1x1s1d1, opt, specializations); + } + else + { + // TLOG_INFO("create pipeline pack1to1\n"); + pipeline_convolution = new Pipeline(vkdev); + pipeline_convolution->set_optimal_local_size_xyz(local_size_xyz); + pipeline_convolution->create(LayerShaderType::convolution, opt, specializations); + } + } + + // pack4 + if (elempack == 4 && out_elempack == 4) + { + if (is_conv1x1s1d1) + { + pipeline_convolution_pack4_1x1s1d1 = new Pipeline(vkdev); + pipeline_convolution_pack4_1x1s1d1->set_local_size_xyz(8, 1, std::min(8, num_output / 4)); + pipeline_convolution_pack4_1x1s1d1->create(LayerShaderType::convolution_pack4_1x1s1d1, opt, specializations); + } + else if (is_conv3x3s1d1 && num_input >= 16 && num_output >= 16) + { + // winograd23 + } + else + { + pipeline_convolution_pack4 = new Pipeline(vkdev); + pipeline_convolution_pack4->set_optimal_local_size_xyz(local_size_xyz); + 
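+            // local_size_xyz was clamped above to at most 8x8x4 of the packed output shape and is
+            // passed to set_optimal_local_size_xyz() as a workload hint when picking the workgroup size.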
pipeline_convolution_pack4->create(LayerShaderType::convolution_pack4, opt, specializations); + } + } + + // pack1to4 + if (elempack == 1 && out_elempack == 4) + { + pipeline_convolution_pack1to4 = new Pipeline(vkdev); + pipeline_convolution_pack1to4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_convolution_pack1to4->create(LayerShaderType::convolution_pack1to4, opt, specializations); + } + + // pack4to1 + if (elempack == 4 && out_elempack == 1) + { + pipeline_convolution_pack4to1 = new Pipeline(vkdev); + pipeline_convolution_pack4to1->set_optimal_local_size_xyz(local_size_xyz); + pipeline_convolution_pack4to1->create(LayerShaderType::convolution_pack4to1, opt, specializations); + } + + // pack8 + if (elempack == 8 && out_elempack == 8) + { + if (is_conv1x1s1d1) + { + pipeline_convolution_pack8_1x1s1d1 = new Pipeline(vkdev); + pipeline_convolution_pack8_1x1s1d1->set_local_size_xyz(8, 1, std::min(8, num_output / 8)); + pipeline_convolution_pack8_1x1s1d1->create(LayerShaderType::convolution_pack8_1x1s1d1, opt, specializations); + } + else if (is_conv3x3s1d1 && num_input >= 16 && num_output >= 16) + { + // winograd23 + } + else + { + pipeline_convolution_pack8 = new Pipeline(vkdev); + pipeline_convolution_pack8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_convolution_pack8->create(LayerShaderType::convolution_pack8, opt, specializations); + } + } + + // pack1to8 + if (elempack == 1 && out_elempack == 8) + { + pipeline_convolution_pack1to8 = new Pipeline(vkdev); + pipeline_convolution_pack1to8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_convolution_pack1to8->create(LayerShaderType::convolution_pack1to8, opt, specializations); + } + + // pack4to8 + if (elempack == 4 && out_elempack == 8) + { + pipeline_convolution_pack4to8 = new Pipeline(vkdev); + pipeline_convolution_pack4to8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_convolution_pack4to8->create(LayerShaderType::convolution_pack4to8, opt, specializations); + } + + // pack8to4 + if (elempack == 8 && out_elempack == 4) + { + pipeline_convolution_pack8to4 = new Pipeline(vkdev); + pipeline_convolution_pack8to4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_convolution_pack8to4->create(LayerShaderType::convolution_pack8to4, opt, specializations); + } + + // pack8to1 + if (elempack == 8 && out_elempack == 1) + { + pipeline_convolution_pack8to1 = new Pipeline(vkdev); + pipeline_convolution_pack8to1->set_optimal_local_size_xyz(local_size_xyz); + pipeline_convolution_pack8to1->create(LayerShaderType::convolution_pack8to1, opt, specializations); + } + + return 0; +} + +int Convolution_vulkan::destroy_pipeline(const Option& /*opt*/) +{ + return 0; +} + +int Convolution_vulkan::upload_model(VkTransfer& cmd, const Option& opt) +{ + tensor* weight_tensor = get_ir_graph_tensor(graph, node->input_tensors[1]); + + // Tensor weight_data = Tensor(weight_tensor->elem_num, 1, 1, weight_tensor->data); + Tensor weight_data = Tensor(weight_tensor->elem_num, weight_tensor->data); + + // if (padding) + // { + // padding->upload_model(cmd, opt); + // } + + const int maxk = kernel_w * kernel_h; + int num_output = output_c; + int num_input = input_c; //weight_data_size / maxk / num_output; + + int elempack = opt.use_shader_pack8 && num_input % 8 == 0 ? 8 : num_input % 4 == 0 ? 4 : 1; + // int elempack = 1; + int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 
4 : 1; + + // TLOG_INFO("conv upload model pack:%d %d\n", elempack, out_elempack); + + Tensor weight_data_packed; + { + Tensor weight_data_r2 = weight_data.reshape(maxk, num_input, num_output); + + weight_data_packed.create(maxk, num_input/elempack, num_output/out_elempack, (size_t)4*elempack*out_elempack, elempack*out_elempack); + for (int q=0; q+(out_elempack-1)input_tensors[1]); + // cmd.record_upload(weight_tensor, weight_data_gpu, opt); + if (support_image_storage && opt.use_image_storage) + { + TLOG_INFO("not record_upload weight_data_gpu_image, fix me\n"); + // cmd.record_upload(weight_data_packed, weight_data_gpu_image, opt); + } + else + { + cmd.record_upload(weight_data_packed, weight_data_gpu, opt); + } + + // upload bias data + if(node->input_num > 2) + { + tensor* bias_tensor = get_ir_graph_tensor(graph, node->input_tensors[2]); + Tensor bias_data = Tensor(bias_tensor->elem_num, bias_tensor->data); + + // TLOG_INFO("bias data shape:%d %d %d\n", bias_data.c, bias_data.h, bias_data.w); + + Tensor bias_data_packed; + convert_packing(bias_data, bias_data_packed, out_elempack); + + if (support_image_storage && opt.use_image_storage) + { + // cmd.record_upload(bias_data_packed, bias_data_gpu_image, opt); + } + else + { + cmd.record_upload(bias_data_packed, bias_data_gpu, opt); + } + + } + + // if (innerproduct) + // { + // innerproduct->upload_model(cmd, opt); + // } + + return 0; +} + +int Convolution_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const +{ + // TLOG_INFO("in_c in_h in_w k_h k_w s p dilation group:%d %d %d %d %d %d %d %d %d\n", input_c, input_h, input_w, kernel_h, kernel_w, stride_h, pad_w0, dilation_h, group); + VkTensor bottom_blob_dim3 = bottom_blob; + if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1) + { + bottom_blob_dim3.dims = 3; + bottom_blob_dim3.c = bottom_blob_dim3.w; + bottom_blob_dim3.w = 1; + bottom_blob_dim3.cstep = 1; + } + + int w = bottom_blob_dim3.w; + int h = bottom_blob_dim3.h; + int channels = bottom_blob_dim3.c; + size_t elemsize = bottom_blob_dim3.elemsize; + int elempack = bottom_blob_dim3.elempack; + // TLOG_INFO("botom shape:%d %d %d %d %d %d %d\n", bottom_blob.dims, bottom_blob.c, bottom_blob.h, bottom_blob.w, bottom_blob.elemsize, bottom_blob.elempack, bottom_blob.cstep); + + int out_elempack = opt.use_shader_pack8 && output_c % 8 == 0 ? 8 : output_c % 4 == 0 ? 
4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + VkTensor bottom_blob_bordered = bottom_blob_dim3; + if (pad_h0 > 0 || pad_h1 > 0 || pad_w0 > 0 || pad_w1 > 0) + { + Option opt_pad = opt; + opt_pad.blob_vkallocator = opt.workspace_vkallocator; + + padding->record_pipeline(bottom_blob, bottom_blob_bordered, cmd, opt_pad); + } + + // TLOG_INFO("forward convolution, w h c elemsize, elempack:%d %d %d %d %d\n", output_w, output_h, channels, elemsize, elempack); + top_blob.create(output_w, output_h, output_c / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + + // TLOG_INFO("convolution bottom shape:%d %d %d %d %d, top shape:%d %d %d %d %d\n", bottom_blob_bordered.dims, bottom_blob_bordered.w, bottom_blob_bordered.h, bottom_blob_bordered.c, bottom_blob_bordered.cstep, top_blob.dims, top_blob.w, top_blob.h, top_blob.c, top_blob.cstep); + + std::vector bindings(4); + bindings[0] = bottom_blob_bordered; + bindings[1] = top_blob; + bindings[2] = weight_data_gpu; + bindings[3] = bias_data_gpu; + + std::vector constants(10); + constants[0].i = bottom_blob_bordered.dims; + constants[1].i = bottom_blob_bordered.w; + constants[2].i = bottom_blob_bordered.h; + constants[3].i = bottom_blob_bordered.c; + constants[4].i = bottom_blob_bordered.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = top_blob.cstep; + + // record + if (elempack == 1 && out_elempack == 1 && kernel_w == 1 && kernel_h == 1 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1) + { + VkTensor dispatcher; + dispatcher.w = (top_blob.w * top_blob.h + 3) / 4; + dispatcher.h = 1; + dispatcher.c = top_blob.c; + + cmd.record_pipeline(pipeline_convolution_1x1s1d1, bindings, constants, dispatcher); + } + else if (elempack == 4 && out_elempack == 4 && kernel_w == 1 && kernel_h == 1 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1) + { + VkTensor dispatcher; + dispatcher.w = (top_blob.w * top_blob.h + 3) / 4; + dispatcher.h = 1; + dispatcher.c = top_blob.c; + + cmd.record_pipeline(pipeline_convolution_pack4_1x1s1d1, bindings, constants, dispatcher); + } + else if (elempack == 8 && out_elempack == 8 && kernel_w == 1 && kernel_h == 1 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1) + { + VkTensor dispatcher; + dispatcher.w = (top_blob.w * top_blob.h + 3) / 4; + dispatcher.h = 1; + dispatcher.c = top_blob.c; + + cmd.record_pipeline(pipeline_convolution_pack8_1x1s1d1, bindings, constants, dispatcher); + } + else + { + const Pipeline* pipeline = 0; + if (elempack == 1 && out_elempack == 1) + { + pipeline = pipeline_convolution; + } + else if (elempack == 4 && out_elempack == 4) + { + // TLOG_INFO("pipeline is pipeline_convolution_pack4\n"); + pipeline = pipeline_convolution_pack4; + } + else if (elempack == 1 && out_elempack == 4) + { + pipeline = pipeline_convolution_pack1to4; + } + else if (elempack == 4 && out_elempack == 1) + { + pipeline = pipeline_convolution_pack4to1; + } + else if (elempack == 8 && out_elempack == 8) + { + pipeline = pipeline_convolution_pack8; + } + else if (elempack == 1 && out_elempack == 8) + { + pipeline = pipeline_convolution_pack1to8; + } + else if (elempack == 4 && out_elempack == 8) + { + pipeline = pipeline_convolution_pack4to8; + } + else if (elempack == 8 && out_elempack == 4) + { + pipeline = pipeline_convolution_pack8to4; + } + else if (elempack == 8 && out_elempack == 1) + { + pipeline = 
pipeline_convolution_pack8to1; + } + + cmd.record_pipeline(pipeline, bindings, constants, top_blob); + } + + // TLOG_INFO("top shape:%d %d %d\n", top_blob.c, top_blob.h, top_blob.w); + // cmd.record_pipeline(pipeline_convolution, bindings, constants, top_blob); + // TLOG_INFO("run record convolution\n"); + return 0; +} + +} // namespace TEngine \ No newline at end of file diff --git a/source/device/vulkan/layer/convolution_vulkan.hpp b/source/device/vulkan/layer/convolution_vulkan.hpp new file mode 100644 index 000000000..a1e7c1ad8 --- /dev/null +++ b/source/device/vulkan/layer/convolution_vulkan.hpp @@ -0,0 +1,115 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. 
You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#ifndef LAYER_CONVOLUTION_HPP +#define LAYER_CONVOLUTION_HPP + +#include "padding_vulkan.hpp" +#include "innerproduct_vulkan.hpp" +#include "../vulkan_layer.hpp" +#include "../vulkan_command.hpp" + +#include "convolution_param.h" + +namespace TEngine { + +class Convolution_vulkan : public Layer +{ +public: + Convolution_vulkan(); + // Convolution_vulkan(ir_node* node); + Convolution_vulkan(ir_graph_t* graph, ir_node_t* node); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + virtual int upload_model(VkTransfer& cmd, const Option& opt); + + // virtual int record_pipeline(VkCompute& cmd, const Option& opt) const; + virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const; + + +public: + int group; + int input_c; + int input_h; + int input_w; + int pad_w0; // left padding columns + int pad_w1; // right padding columns + int pad_h0; // top padding rows + int pad_h1; // bottom padding rows + int stride_h; + int stride_w; + int dilation_h; + int dilation_w; + int kernel_h; + int kernel_w; + int activation; + int output_c; + int output_h; + int output_w; + + int weight_data_size; + +public: + Padding_vulkan* padding; + InnerProduct_vulkan* innerproduct; + + VkTensor weight_data_gpu; + VkImageTensor weight_data_gpu_image; + VkTensor bias_data_gpu; + + Pipeline* pipeline_convolution; + Pipeline* pipeline_convolution_pack4; + Pipeline* pipeline_convolution_pack8; + Pipeline* pipeline_convolution_pack1to4; + Pipeline* pipeline_convolution_pack4to1; + Pipeline* pipeline_convolution_pack1to8; + Pipeline* pipeline_convolution_pack4to8; + Pipeline* pipeline_convolution_pack8to1; + Pipeline* pipeline_convolution_pack8to4; + + Pipeline* pipeline_convolution_1x1s1d1; + Pipeline* pipeline_convolution_pack4_1x1s1d1; + Pipeline* pipeline_convolution_pack8_1x1s1d1; +}; + +} // namespace TEngine + + +#endif diff --git a/source/device/vulkan/layer/convolutiondepthwise_vulkan.cpp b/source/device/vulkan/layer/convolutiondepthwise_vulkan.cpp new file mode 100644 index 000000000..bc950cf38 --- /dev/null +++ b/source/device/vulkan/layer/convolutiondepthwise_vulkan.cpp @@ -0,0 +1,301 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. 
+ * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#include "convolutiondepthwise_vulkan.hpp" +#include "../layer_shader_type.h" + +namespace TEngine { + + ConvolutionDepthWise_vulkan::ConvolutionDepthWise_vulkan() + { + support_vulkan = true; + pipeline_convolutiondepthwise = 0; + } + + ConvolutionDepthWise_vulkan::ConvolutionDepthWise_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) + { + support_vulkan = true; + + padding = 0; + + pipeline_convolutiondepthwise = 0; + pipeline_convolutiondepthwise_pack4 = 0; + pipeline_convolutiondepthwise_pack8 = 0; + graph = ir_graph; + node = ir_node; + + struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[0]); + std::string name = input->name; + bottoms.push_back(name); + + struct tensor *output = get_ir_graph_tensor(graph, node->output_tensors[0]); + name = output->name; + tops.push_back(name); + + struct conv_param *param = (struct conv_param *)ir_node->op.param_mem; + + group = param->group; + input_c = input->dims[1]; // param->input_channel; + input_h = input->dims[2]; + input_w = input->dims[3]; + pad_w0 = param->pad_w0; // left padding columns + pad_w1 = param->pad_w1; // right padding columns + pad_h0 = param->pad_h0; // top padding rows + pad_h1 = param->pad_h1; // bottom padding rows + stride_w = param->stride_w; + stride_h = param->stride_h; + dilation_w = param->dilation_w; + dilation_h = param->dilation_h; + kernel_w = param->kernel_w; + kernel_h = param->kernel_h; + output_c = output->dims[1]; // param->output_channel; + output_h = output->dims[2]; + output_w = output->dims[3]; + } + +int ConvolutionDepthWise_vulkan::create_pipeline(const Option& _opt) +{ + Option opt = _opt; + + { + padding = new Padding_vulkan(); + padding->vkdev = vkdev; + + padding->top = pad_h0; + padding->bottom = pad_h1; + padding->left = pad_w0; + padding->right = pad_w1; + padding->type = 0; + padding->value = 0; + + padding->input_w = input_w; + padding->input_h = input_h; + padding->input_c = input_c; + padding->output_w = input_w + pad_w0 + pad_w1; + padding->output_h = input_h + pad_h0 + pad_h1; + padding->output_c = input_c; + + padding->create_pipeline(opt); + } + + + // const int maxk = kernel_w * kernel_h; + int channels = input_c; // (weight_data_size / group) / maxk / (num_output / group) * group; + int num_output = output_c; + + int elempack = opt.use_shader_pack8 && channels % 8 == 0 ? 8 : channels % 4 == 0 ? 4 : 1; + int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; + + size_t elemsize; + size_t out_elemsize; + if (opt.use_fp16_storage) + { + elemsize = elempack * 2u; + out_elemsize = out_elempack * 2u; + } + else if (opt.use_fp16_packed) + { + elemsize = elempack == 1 ? 4u : elempack * 2u; + out_elemsize = out_elempack == 1 ? 4u : out_elempack * 2u; + } + else + { + elemsize = elempack * 4u; + out_elemsize = out_elempack * 4u; + } + + std::vector specializations(11 + 10); + specializations[0].i = kernel_w; // kernel_w; + specializations[1].i = kernel_h; // kernel_h + specializations[2].i = dilation_w; // dilation_w; + specializations[3].i = dilation_h; // dilation_h; + specializations[4].i = stride_w; // stride_w; + specializations[5].i = stride_h; // stride_h; + specializations[6].i = node->input_num >2 ? 
1 : 0; // bias_term; + specializations[7].i = group; + specializations[8].i = 1;//param->activation; // activation_type; + specializations[9].f = 0;//param->activation; // activation_params.w >= 1 ? activation_params[0] : 0.f; + specializations[10].f = 0;//param->activation; // activation_params.w == 2 ? activation_params[1] : 0.f; + specializations[11 + 0].i = 0; // 3; // shape_bordered_packed.dims; + specializations[11 + 1].i = 0; // input_w + pad_w0 + pad_w1; // shape_bordered_packed.w; + specializations[11 + 2].i = 0; // input_h + pad_h0 + pad_h1; // shape_bordered_packed.h; + specializations[11 + 3].i = 0; // input_c; // shape_bordered_packed.c; + specializations[11 + 4].i = 0; // (input_w + pad_w0 + pad_w1) * (input_h + pad_h0 + pad_h1); // shape_bordered_packed.cstep; + specializations[11 + 5].i = 0; // 3; // out_shape_packed.dims; + specializations[11 + 6].i = 0; // output_w; // out_shape_packed.w; + specializations[11 + 7].i = 0; // output_h; // out_shape_packed.h; + specializations[11 + 8].i = 0; // output_c; // out_shape_packed.c; + specializations[11 + 9].i = 0; // output_w * output_h; // out_shape_packed.cstep; + + VkTensor local_size_xyz; + local_size_xyz.w = std::min(4, output_w); + local_size_xyz.h = std::min(4, output_h); + local_size_xyz.c = std::min(4, output_c); + + // pack1 + if (elempack == 1) + { + pipeline_convolutiondepthwise = new Pipeline(vkdev); + pipeline_convolutiondepthwise->set_optimal_local_size_xyz(local_size_xyz); + pipeline_convolutiondepthwise->create(LayerShaderType::convolutiondepthwise, opt, specializations); + } + + // pack4 + if (elempack == 4) + { + pipeline_convolutiondepthwise_pack4 = new Pipeline(vkdev); + pipeline_convolutiondepthwise_pack4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_convolutiondepthwise_pack4->create(LayerShaderType::convolutiondepthwise_pack4, opt, specializations); + } + + // pack8 + if (elempack == 8) + { + pipeline_convolutiondepthwise_pack8 = new Pipeline(vkdev); + pipeline_convolutiondepthwise_pack8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_convolutiondepthwise_pack8->create(LayerShaderType::convolutiondepthwise_pack8, opt, specializations); + } + + return 0; +} + +int ConvolutionDepthWise_vulkan::destroy_pipeline(const Option& opt) +{ + if (padding) + { + padding->destroy_pipeline(opt); + delete padding; + padding = 0; + } + + delete pipeline_convolutiondepthwise; + pipeline_convolutiondepthwise = 0; + + delete pipeline_convolutiondepthwise_pack4; + pipeline_convolutiondepthwise_pack4 = 0; + + delete pipeline_convolutiondepthwise_pack8; + pipeline_convolutiondepthwise_pack8 = 0; + return 0; +} + +int ConvolutionDepthWise_vulkan::upload_model(VkTransfer& cmd, const Option& opt) +{ + // upload kernel data + const int maxk = kernel_w * kernel_h; + int channels = input_c; // (weight_data_size / group) / maxk / (num_output / group) * group; + int num_output = output_c; + + int elempack = opt.use_shader_pack8 && channels % 8 == 0 ? 8 : channels % 4 == 0 ? 4 : 1; + int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 
4 : 1; + + + tensor* weight_tensor = get_ir_graph_tensor(graph, node->input_tensors[1]); + Tensor weight_data = Tensor(weight_tensor->elem_num, weight_tensor->data); + + Tensor weight_data_packed; + Tensor weight_data_r2 = weight_data.reshape(maxk, group); + TEngine::convert_packing(weight_data_r2, weight_data_packed, elempack); + + cmd.record_upload(weight_data_packed, weight_data_gpu, opt); + + // upload bias data + if(node->input_num > 2) + { + tensor* bias_tensor = get_ir_graph_tensor(graph, node->input_tensors[2]); + Tensor bias_data = Tensor(bias_tensor->elem_num, bias_tensor->data); + Tensor bias_data_packed; + convert_packing(bias_data, bias_data_packed, out_elempack); + cmd.record_upload(bias_data_packed, bias_data_gpu, opt); + } + return 0; +} + +int ConvolutionDepthWise_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + + VkTensor bottom_blob_bordered = bottom_blob; + if (pad_h0 > 0 || pad_h1 > 0 || pad_w0 > 0 || pad_w1 > 0) + { + // bottom_blob_bordered.w = bottom_blob_bordered.w + pad_w0 + pad_w1; + // bottom_blob_bordered.h = bottom_blob_bordered.h + pad_h0 + pad_h1; + // bottom_blob_bordered.cstep = bottom_blob_bordered.w * bottom_blob_bordered.h; + Option opt_pad = opt; + opt_pad.blob_vkallocator = opt.workspace_vkallocator; + + padding->record_pipeline(bottom_blob, bottom_blob_bordered, cmd, opt_pad); + } + + top_blob.create(output_w, output_h, output_c/elempack, elemsize, elempack, opt.blob_vkallocator); + + std::vector bindings(4); + bindings[0] = bottom_blob_bordered; + bindings[1] = top_blob; + bindings[2] = weight_data_gpu; + bindings[3] = bias_data_gpu; + + std::vector constants(10); + constants[0].i = bottom_blob_bordered.dims; + constants[1].i = bottom_blob_bordered.w; + constants[2].i = bottom_blob_bordered.h; + constants[3].i = bottom_blob_bordered.c; + constants[4].i = bottom_blob_bordered.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = top_blob.cstep; + + // printf("top shape:%d %d %d\n", top_blob.c, top_blob.h, top_blob.w); + const Pipeline* pipeline = elempack == 8 ? pipeline_convolutiondepthwise_pack8 + : elempack == 4 ? pipeline_convolutiondepthwise_pack4 + : pipeline_convolutiondepthwise; + + cmd.record_pipeline(pipeline, bindings, constants, top_blob); + + return 0; +} + +} \ No newline at end of file diff --git a/source/device/vulkan/layer/convolutiondepthwise_vulkan.hpp b/source/device/vulkan/layer/convolutiondepthwise_vulkan.hpp new file mode 100644 index 000000000..05f78f22c --- /dev/null +++ b/source/device/vulkan/layer/convolutiondepthwise_vulkan.hpp @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#ifndef LAYER_CONVOLUTIONDEPTHWISE_HPP +#define LAYER_CONVOLUTIONDEPTHWISE_HPP + +#include "padding_vulkan.hpp" +#include "../vulkan_layer.hpp" +#include "../vulkan_command.hpp" + +#include "convolution_param.h" + +namespace TEngine { + +class ConvolutionDepthWise_vulkan : public Layer +{ +public: + ConvolutionDepthWise_vulkan(); + ConvolutionDepthWise_vulkan(ir_graph_t* ir_graph, ir_node_t* node); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + virtual int upload_model(VkTransfer& cmd, const Option& opt); + + virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const; + +public: + int group; + int input_c; + int input_h; + int input_w; + int pad_w0; // left padding columns + int pad_w1; // right padding columns + int pad_h0; // top padding rows + int pad_h1; // bottom padding rows + int stride_h; + int stride_w; + int dilation_h; + int dilation_w; + int kernel_h; + int kernel_w; + int output_c; + int output_h; + int output_w; + +public: + Padding_vulkan* padding; + + VkTensor weight_data_gpu; + VkTensor bias_data_gpu; + + Pipeline* pipeline_convolutiondepthwise; + Pipeline* pipeline_convolutiondepthwise_pack4; + Pipeline* pipeline_convolutiondepthwise_pack8; +}; + +} // namespace TEngine + + +#endif diff --git a/source/device/vulkan/layer/crop_vulkan.cpp b/source/device/vulkan/layer/crop_vulkan.cpp new file mode 100644 index 000000000..26f8768e8 --- /dev/null +++ b/source/device/vulkan/layer/crop_vulkan.cpp @@ -0,0 +1,607 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#include "crop_vulkan.hpp" +#include "../layer_shader_type.h" + +namespace TEngine { + +Crop_vulkan::Crop_vulkan() +{ + support_vulkan = true; + support_image_storage = false; + + pipeline_crop = 0; + pipeline_crop_pack4 = 0; + pipeline_crop_pack1to4 = 0; + pipeline_crop_pack4to1 = 0; + pipeline_crop_pack8 = 0; + pipeline_crop_pack1to8 = 0; + pipeline_crop_pack4to8 = 0; + pipeline_crop_pack8to4 = 0; + pipeline_crop_pack8to1 = 0; +} + +Crop_vulkan::Crop_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) +{ + support_vulkan = true; + support_image_storage = false; + + pipeline_crop = 0; + pipeline_crop_pack4 = 0; + pipeline_crop_pack1to4 = 0; + pipeline_crop_pack4to1 = 0; + pipeline_crop_pack8 = 0; + pipeline_crop_pack1to8 = 0; + pipeline_crop_pack4to8 = 0; + pipeline_crop_pack8to4 = 0; + pipeline_crop_pack8to1 = 0; + + graph = ir_graph; + node = ir_node; + + for(int i = 0; i < ir_node->input_num; i++) + { + struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[i]); + std::string name = input->name; + bottoms.push_back(name); + } + + for(int i = 0; i < ir_node->output_num; i++) + { + struct tensor *output = get_ir_graph_tensor(graph, node->input_tensors[i]); + std::string name = output->name; + tops.push_back(name); + } + + // params + struct tensor *input_tensor = get_ir_graph_tensor(graph, node->input_tensors[0]); + struct tensor *output_tensor = get_ir_graph_tensor(graph, node->output_tensors[0]); + input_c = input_tensor->dims[1]; // param->input_channel; + input_h = input_tensor->dims[2]; + input_w = input_tensor->dims[3]; + output_c = output_tensor->dims[1]; // param->output_channel; + output_h = output_tensor->dims[2]; + output_w = output_tensor->dims[3]; + + struct crop_param *param = (struct crop_param *)ir_node->op.param_mem; + + int num_args = param->num_args; + int offset_c = 0; // param->offset_c; + int offset_h = 0; // param->offset_h; + int offset_w = 0; // param->offset_w; + int crop_h = param->crop_h; + int crop_w = param->crop_w; + int center_crop = param->center_crop; + int axis = param->axis; + int flag = param->flag; +} + +int Crop_vulkan::create_pipeline(const Option& _opt) +{ + Option opt = _opt; + + const Tensor& shape = Tensor(input_w, input_h, input_c, (void*)0); // bottom_shapes.empty() ? Tensor() : bottom_shapes[0]; + const Tensor& out_shape = Tensor(output_w, output_h, output_c, (void*)0); // top_shapes.empty() ? Tensor() : top_shapes[0]; + + int elempack = 1; + if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1; + if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1; + if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1; + + int out_elempack = 1; + if (out_shape.dims == 1) out_elempack = opt.use_shader_pack8 && out_shape.w % 8 == 0 ? 8 : out_shape.w % 4 == 0 ? 
4 : 1; + if (out_shape.dims == 2) out_elempack = opt.use_shader_pack8 && out_shape.h % 8 == 0 ? 8 : out_shape.h % 4 == 0 ? 4 : 1; + if (out_shape.dims == 3) out_elempack = opt.use_shader_pack8 && out_shape.c % 8 == 0 ? 8 : out_shape.c % 4 == 0 ? 4 : 1; + + int offset_elempack = 1; + + { + // TODO vec and image crop + if (offset_c == 0) + offset_elempack = elempack; + else + offset_elempack = opt.use_shader_pack8 && offset_c % 8 == 0 ? 8 : offset_c % 4 == 0 ? 4 : 1; + } + + size_t elemsize; + size_t out_elemsize; + if (opt.use_fp16_storage) + { + elemsize = elempack * 2u; + out_elemsize = out_elempack * 2u; + } + else if (opt.use_fp16_packed) + { + elemsize = elempack == 1 ? 4u : elempack * 2u; + out_elemsize = out_elempack == 1 ? 4u : out_elempack * 2u; + } + else + { + elemsize = elempack * 4u; + out_elemsize = out_elempack * 4u; + } + + Tensor shape_packed; + if (shape.dims == 1) shape_packed = Tensor(shape.w / elempack, (void*)0, elemsize, elempack); + if (shape.dims == 2) shape_packed = Tensor(shape.w, shape.h / elempack, (void*)0, elemsize, elempack); + if (shape.dims == 3) shape_packed = Tensor(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack); + + Tensor out_shape_packed; + if (out_shape.dims == 1) out_shape_packed = Tensor(out_shape.w / out_elempack, (void*)0, out_elemsize, out_elempack); + if (out_shape.dims == 2) out_shape_packed = Tensor(out_shape.w, out_shape.h / out_elempack, (void*)0, out_elemsize, out_elempack); + if (out_shape.dims == 3) out_shape_packed = Tensor(out_shape.w, out_shape.h, out_shape.c / out_elempack, (void*)0, out_elemsize, out_elempack); + + Tensor shape_unpacked = shape_packed; + if (bottoms.size() == 1 && shape.dims != 0 && elempack == out_elempack && elempack > offset_elempack) + { + size_t offset_elemsize; + if (opt.use_fp16_storage) + { + offset_elemsize = offset_elempack * 2u; + } + else if (opt.use_fp16_packed) + { + offset_elemsize = offset_elempack == 1 ? 
4u : offset_elempack * 2u; + } + else + { + offset_elemsize = offset_elempack * 4u; + } + + if (shape.dims == 1) shape_unpacked = Tensor(shape.w / offset_elempack, (void*)0, offset_elemsize, offset_elempack); + if (shape.dims == 2) shape_unpacked = Tensor(shape.w, shape.h / offset_elempack, (void*)0, offset_elemsize, offset_elempack); + if (shape.dims == 3) shape_unpacked = Tensor(shape.w, shape.h, shape.c / offset_elempack, (void*)0, offset_elemsize, offset_elempack); + } + + std::vector specializations(1 + 10); + specializations[0].i = vkdev->info.bug_implicit_fp16_arithmetic; + specializations[1 + 0].i = 0; // shape_unpacked.dims; + specializations[1 + 1].i = 0; // shape_unpacked.w; + specializations[1 + 2].i = 0; // shape_unpacked.h; + specializations[1 + 3].i = 0; // shape_unpacked.c; + specializations[1 + 4].i = 0; // shape_unpacked.cstep; + specializations[1 + 5].i = 0; // out_shape_packed.dims; + specializations[1 + 6].i = 0; // out_shape_packed.w; + specializations[1 + 7].i = 0; // out_shape_packed.h; + specializations[1 + 8].i = 0; // out_shape_packed.c; + specializations[1 + 9].i = 0; // out_shape_packed.cstep; + + Tensor local_size_xyz; + if (out_shape_packed.dims == 1) + { + local_size_xyz.w = std::min(64, out_shape_packed.w); + local_size_xyz.h = 1; + local_size_xyz.c = 1; + } + if (out_shape_packed.dims == 2) + { + local_size_xyz.w = std::min(8, out_shape_packed.w); + local_size_xyz.h = std::min(8, out_shape_packed.h); + local_size_xyz.c = 1; + } + if (out_shape_packed.dims == 3) + { + local_size_xyz.w = std::min(4, out_shape_packed.w); + local_size_xyz.h = std::min(4, out_shape_packed.h); + local_size_xyz.c = std::min(4, out_shape_packed.c); + } + + // pack1 + if (out_shape.dims == 0 || out_elempack == 1) + { + pipeline_crop = new Pipeline(vkdev); + pipeline_crop->set_optimal_local_size_xyz(local_size_xyz); + pipeline_crop->create(LayerShaderType::crop, opt, specializations); + } + + // pack4 + if (out_shape.dims == 0 || out_elempack == 4) + { + pipeline_crop_pack4 = new Pipeline(vkdev); + pipeline_crop_pack4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_crop_pack4->create(LayerShaderType::crop_pack4, opt, specializations); + } + + // pack1to4 + if (out_shape.dims == 0 || out_elempack == 4) + { + pipeline_crop_pack1to4 = new Pipeline(vkdev); + pipeline_crop_pack1to4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_crop_pack1to4->create(LayerShaderType::crop_pack1to4, opt, specializations); + } + + // pack4to1 + if (out_shape.dims == 0 || out_elempack == 1) + { + pipeline_crop_pack4to1 = new Pipeline(vkdev); + pipeline_crop_pack4to1->set_optimal_local_size_xyz(local_size_xyz); + pipeline_crop_pack4to1->create(LayerShaderType::crop_pack4to1, opt, specializations); + } + + // pack8 + if ((opt.use_shader_pack8 && out_shape.dims == 0) || (elempack == 8 && out_elempack == 8)) + { + pipeline_crop_pack8 = new Pipeline(vkdev); + pipeline_crop_pack8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_crop_pack8->create(LayerShaderType::crop_pack8, opt, specializations); + } + + // pack1to8 + if ((opt.use_shader_pack8 && out_shape.dims == 0) || out_elempack == 8) + { + pipeline_crop_pack1to8 = new Pipeline(vkdev); + pipeline_crop_pack1to8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_crop_pack1to8->create(LayerShaderType::crop_pack1to8, opt, specializations); + } + + // pack4to8 + if ((opt.use_shader_pack8 && out_shape.dims == 0) || out_elempack == 8) + { + pipeline_crop_pack4to8 = new Pipeline(vkdev); + 
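+        // Cross-packing variants (pack1to4, pack4to8, pack8to4, pack8to1, ...) cover the case where the
+        // crop offset forces the input to be read at a smaller elempack than the output; record_pipeline()
+        // selects the matching pipeline per blob from elempack / offset_elempack / out_elempack.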
pipeline_crop_pack4to8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_crop_pack4to8->create(LayerShaderType::crop_pack4to8, opt, specializations); + } + + // pack8to4 + if ((opt.use_shader_pack8 && out_shape.dims == 0) || (elempack == 8 && out_elempack == 4)) + { + pipeline_crop_pack8to4 = new Pipeline(vkdev); + pipeline_crop_pack8to4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_crop_pack8to4->create(LayerShaderType::crop_pack8to4, opt, specializations); + } + + // pack8to1 + if ((opt.use_shader_pack8 && out_shape.dims == 0) || (elempack == 8 && out_elempack == 1)) + { + pipeline_crop_pack8to1 = new Pipeline(vkdev); + pipeline_crop_pack8to1->set_optimal_local_size_xyz(local_size_xyz); + pipeline_crop_pack8to1->create(LayerShaderType::crop_pack8to1, opt, specializations); + } + + + return 0; +} + +int Crop_vulkan::destroy_pipeline(const Option& /*opt*/) +{ + delete pipeline_crop; + pipeline_crop = 0; + + delete pipeline_crop_pack4; + pipeline_crop_pack4 = 0; + + delete pipeline_crop_pack1to4; + pipeline_crop_pack1to4 = 0; + + delete pipeline_crop_pack4to1; + pipeline_crop_pack4to1 = 0; + + delete pipeline_crop_pack8; + pipeline_crop_pack8 = 0; + + delete pipeline_crop_pack1to8; + pipeline_crop_pack1to8 = 0; + + delete pipeline_crop_pack4to8; + pipeline_crop_pack4to8 = 0; + + delete pipeline_crop_pack8to4; + pipeline_crop_pack8to4 = 0; + + delete pipeline_crop_pack8to1; + pipeline_crop_pack8to1 = 0; + + return 0; +} + +int Crop_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const +{ + int dims = bottom_blob.dims; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int _woffset, _hoffset, _coffset; + int _outw, _outh, _outc; + // resolve_crop_roi(bottom_blob.shape(), _woffset, _hoffset, _coffset, _outw, _outh, _outc); + _outw = output_w; + _outh = output_h; + _outc = output_c; + _woffset = offset_w; + _hoffset = offset_h; + _coffset = offset_c; + + // TODO vec and image crop + + if (dims == 3) + { + if (_woffset == 0 && _hoffset == 0 && _coffset == 0 && _outw == bottom_blob.w && _outh == bottom_blob.h && _outc == bottom_blob.c * elempack) + { + top_blob = bottom_blob; + return 0; + } + + int offset_elempack = _coffset == 0 ? elempack : opt.use_shader_pack8 && _coffset % 8 == 0 ? 8 : _coffset % 4 == 0 ? 4 : 1; + + int out_elempack = opt.use_shader_pack8 && _outc % 8 == 0 ? 8 : _outc % 4 == 0 ? 
4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (opt.use_fp16_packed && !opt.use_fp16_storage) + { + if (out_elempack == 8) out_elemsize = 8 * 2u; + if (out_elempack == 4) out_elemsize = 4 * 2u; + if (out_elempack == 1) out_elemsize = 4u; + } + + // unpacking + VkTensor bottom_blob_unpacked = bottom_blob; + if (elempack == out_elempack && elempack > offset_elempack) + { + Option opt_pack1 = opt; + opt_pack1.blob_vkallocator = opt.workspace_vkallocator; + + vkdev->convert_packing(bottom_blob, bottom_blob_unpacked, offset_elempack, cmd, opt_pack1); + } + + top_blob.create(_outw, _outh, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + std::vector bindings(2); + bindings[0] = bottom_blob_unpacked; + bindings[1] = top_blob; + + std::vector constants(13); + constants[0].i = bottom_blob_unpacked.dims; + constants[1].i = bottom_blob_unpacked.w; + constants[2].i = bottom_blob_unpacked.h; + constants[3].i = bottom_blob_unpacked.c; + constants[4].i = bottom_blob_unpacked.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = top_blob.cstep; + constants[10].i = _woffset; + constants[11].i = _hoffset; + constants[12].i = _coffset; + + const Pipeline* pipeline = 0; + if (elempack == 1 && out_elempack == 1) + { + pipeline = pipeline_crop; + } + else if (elempack == 4 && offset_elempack == 4 && out_elempack == 4) + { + constants[12].i = _coffset / 4; + + pipeline = pipeline_crop_pack4; + } + else if (elempack == 4 && offset_elempack == 1 && out_elempack == 4) + { + pipeline = pipeline_crop_pack1to4; + } + else if (elempack == 1 && out_elempack == 4) + { + pipeline = pipeline_crop_pack1to4; + } + else if (elempack == 4 && out_elempack == 1) + { + pipeline = pipeline_crop_pack4to1; + } + else if (elempack == 8 && offset_elempack == 8 && out_elempack == 8) + { + constants[12].i = _coffset / 8; + + pipeline = pipeline_crop_pack8; + } + else if (elempack == 8 && offset_elempack == 4 && out_elempack == 8) + { + pipeline = pipeline_crop_pack4to8; + } + else if (elempack == 8 && offset_elempack == 1 && out_elempack == 8) + { + pipeline = pipeline_crop_pack1to8; + } + else if (elempack == 1 && out_elempack == 8) + { + pipeline = pipeline_crop_pack1to8; + } + else if (elempack == 4 && out_elempack == 8) + { + pipeline = pipeline_crop_pack4to8; + } + else if (elempack == 8 && out_elempack == 4) + { + pipeline = pipeline_crop_pack8to4; + } + else if (elempack == 8 && out_elempack == 1) + { + pipeline = pipeline_crop_pack8to1; + } + cmd.record_pipeline(pipeline, bindings, constants, top_blob); + } + + return 0; +} + +int Crop_vulkan::record_pipeline(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const +{ + const VkTensor& bottom_blob = bottom_blobs[0]; + const VkTensor& reference_blob = bottom_blobs[1]; + + int dims = bottom_blob.dims; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int _woffset, _hoffset, _coffset; + int _outw, _outh, _outc; + // if (woffset == -233) + // { + // resolve_crop_roi(bottom_blob.shape(), (const int*)reference_blob.mapped(), _woffset, _hoffset, _coffset, _outw, _outh, _outc); + // } + // else + // { + // resolve_crop_roi(bottom_blob.shape(), reference_blob.shape(), _woffset, _hoffset, _coffset, _outw, _outh, _outc); + // } + _outw = output_w; + _outh = output_h; + _outc = output_c; + _woffset = 0; // offset_w; + _hoffset 
= 0; // offset_h; + _coffset = 0; // offset_c; + + // TODO vec and image crop + + if (dims == 3) + { + if (_woffset == 0 && _hoffset == 0 && _coffset == 0 && _outw == bottom_blob.w && _outh == bottom_blob.h && _outc == bottom_blob.c * elempack) + { + top_blobs[0] = bottom_blob; + return 0; + } + + int offset_elempack = _coffset == 0 ? elempack : opt.use_shader_pack8 && _coffset % 8 == 0 ? 8 : _coffset % 4 == 0 ? 4 : 1; + + int out_elempack = opt.use_shader_pack8 && _outc % 8 == 0 ? 8 : _outc % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (opt.use_fp16_packed && !opt.use_fp16_storage) + { + if (out_elempack == 8) out_elemsize = 8 * 2u; + if (out_elempack == 4) out_elemsize = 4 * 2u; + if (out_elempack == 1) out_elemsize = 4u; + } + + // unpacking + VkTensor bottom_blob_unpacked = bottom_blob; + if (elempack == out_elempack && elempack > offset_elempack) + { + Option opt_pack1 = opt; + opt_pack1.blob_vkallocator = opt.workspace_vkallocator; + + vkdev->convert_packing(bottom_blob, bottom_blob_unpacked, offset_elempack, cmd, opt_pack1); + } + + VkTensor& top_blob = top_blobs[0]; + + top_blob.create(_outw, _outh, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + std::vector bindings(2); + bindings[0] = bottom_blob_unpacked; + bindings[1] = top_blob; + + std::vector constants(13); + constants[0].i = bottom_blob_unpacked.dims; + constants[1].i = bottom_blob_unpacked.w; + constants[2].i = bottom_blob_unpacked.h; + constants[3].i = bottom_blob_unpacked.c; + constants[4].i = bottom_blob_unpacked.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = top_blob.cstep; + constants[10].i = _woffset; + constants[11].i = _hoffset; + constants[12].i = _coffset; + + const Pipeline* pipeline = 0; + if (elempack == 1 && out_elempack == 1) + { + pipeline = pipeline_crop; + } + else if (elempack == 4 && offset_elempack == 4 && out_elempack == 4) + { + constants[12].i = _coffset / 4; + + pipeline = pipeline_crop_pack4; + } + else if (elempack == 4 && offset_elempack == 1 && out_elempack == 4) + { + pipeline = pipeline_crop_pack1to4; + } + else if (elempack == 1 && out_elempack == 4) + { + pipeline = pipeline_crop_pack1to4; + } + else if (elempack == 4 && out_elempack == 1) + { + pipeline = pipeline_crop_pack4to1; + } + else if (elempack == 8 && offset_elempack == 8 && out_elempack == 8) + { + constants[12].i = _coffset / 8; + + pipeline = pipeline_crop_pack8; + } + else if (elempack == 8 && offset_elempack == 4 && out_elempack == 8) + { + pipeline = pipeline_crop_pack4to8; + } + else if (elempack == 8 && offset_elempack == 1 && out_elempack == 8) + { + pipeline = pipeline_crop_pack1to8; + } + else if (elempack == 1 && out_elempack == 8) + { + pipeline = pipeline_crop_pack1to8; + } + else if (elempack == 4 && out_elempack == 8) + { + pipeline = pipeline_crop_pack4to8; + } + else if (elempack == 8 && out_elempack == 4) + { + pipeline = pipeline_crop_pack8to4; + } + else if (elempack == 8 && out_elempack == 1) + { + pipeline = pipeline_crop_pack8to1; + } + + cmd.record_pipeline(pipeline, bindings, constants, top_blob); + } + + return 0; +} + +} // namespace TEngine \ No newline at end of file diff --git a/source/device/vulkan/layer/crop_vulkan.hpp b/source/device/vulkan/layer/crop_vulkan.hpp new file mode 100644 index 000000000..1a55f3ca1 --- /dev/null +++ b/source/device/vulkan/layer/crop_vulkan.hpp @@ -0,0 +1,95 @@ 
+/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#ifndef LAYER_CROP_HPP +#define LAYER_CROP_HPP + +#include "../vulkan_layer.hpp" +#include "../vulkan_command.hpp" + +#include "crop_param.h" + +namespace TEngine{ + +class Crop_vulkan : public Layer +{ +public: + Crop_vulkan(); + Crop_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + void resolve_crop_roi(const Tensor& bottom_blob, int& _woffset, int& _hoffset, int& _coffset, int& _outw, int& _outh, int& _outc) const; + virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const; + virtual int record_pipeline(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const; + +public: + Pipeline* pipeline_crop; + Pipeline* pipeline_crop_pack4; + Pipeline* pipeline_crop_pack1to4; + Pipeline* pipeline_crop_pack4to1; + Pipeline* pipeline_crop_pack8; + Pipeline* pipeline_crop_pack1to8; + Pipeline* pipeline_crop_pack4to8; + Pipeline* pipeline_crop_pack8to4; + Pipeline* pipeline_crop_pack8to1; + +public: + int input_c; + int input_h; + int input_w; + int output_c; + int output_h; + int output_w; + + int num_args; + int offset_c; + int offset_h; + int offset_w; + int crop_h; + int crop_w; + int center_crop; + int axis; + int flag; +}; + +} // namespace TEngine + +#endif \ No newline at end of file diff --git a/source/device/vulkan/layer/dropout_vulkan.cpp b/source/device/vulkan/layer/dropout_vulkan.cpp new file mode 100644 index 000000000..a6c3e0724 --- /dev/null +++ b/source/device/vulkan/layer/dropout_vulkan.cpp @@ -0,0 +1,216 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#include "dropout_vulkan.hpp" +#include "../layer_shader_type.h" + +namespace TEngine { + +Dropout_vulkan::Dropout_vulkan() +{ + support_vulkan = true; + support_image_storage = false; + + pipeline_dropout = 0; + pipeline_dropout_pack4 = 0; + pipeline_dropout_pack8 = 0; +} + +Dropout_vulkan::Dropout_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) +{ + support_vulkan = true; + support_image_storage = false; + + pipeline_dropout = 0; + pipeline_dropout_pack4 = 0; + pipeline_dropout_pack8 = 0; + + graph = ir_graph; + node = ir_node; + + struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[0]); + std::string name = input->name; + bottoms.push_back(name); + + struct tensor *output = get_ir_graph_tensor(graph, node->output_tensors[0]); + name = output->name; + tops.push_back(name); + + // params + input_c = input->dims[1]; // param->input_channel; + input_h = input->dims[2]; + input_w = input->dims[3]; + output_c = output->dims[1]; // param->output_channel; + output_h = output->dims[2]; + output_w = output->dims[3]; + + if(input->scale != 0) + scale = input->scale; + else + scale = 1.0f; +} + +int Dropout_vulkan::create_pipeline(const Option& opt) +{ + const Tensor& shape = Tensor(output_w, output_h, output_c, (void*)0); // top_shapes.empty() ? Tensor() : top_shapes[0]; + + int elempack = 1; + if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1; + if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1; + if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1; + + size_t elemsize; + if (opt.use_fp16_storage) + { + elemsize = elempack * 2u; + } + else if (opt.use_fp16_packed) + { + elemsize = elempack == 1 ? 
4u : elempack * 2u; + } + else + { + elemsize = elempack * 4u; + } + + Tensor shape_packed; + if (shape.dims == 1) shape_packed = Tensor(shape.w / elempack, (void*)0, elemsize, elempack); + if (shape.dims == 2) shape_packed = Tensor(shape.w, shape.h / elempack, (void*)0, elemsize, elempack); + if (shape.dims == 3) shape_packed = Tensor(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack); + + std::vector specializations(1 + 5); + specializations[0].f = scale; + specializations[1 + 0].i = shape_packed.dims; + specializations[1 + 1].i = shape_packed.w; + specializations[1 + 2].i = shape_packed.h; + specializations[1 + 3].i = shape_packed.c; + specializations[1 + 4].i = shape_packed.cstep; + + Tensor local_size_xyz; + if (shape_packed.dims == 1) + { + local_size_xyz.w = std::min(64, shape_packed.w); + local_size_xyz.h = 1; + local_size_xyz.c = 1; + } + if (shape_packed.dims == 2) + { + local_size_xyz.w = std::min(8, shape_packed.w); + local_size_xyz.h = std::min(8, shape_packed.h); + local_size_xyz.c = 1; + } + if (shape_packed.dims == 3) + { + local_size_xyz.w = std::min(4, shape_packed.w); + local_size_xyz.h = std::min(4, shape_packed.h); + local_size_xyz.c = std::min(4, shape_packed.c); + } + + // pack1 + if (shape.dims == 0 || elempack == 1) + { + pipeline_dropout = new Pipeline(vkdev); + pipeline_dropout->set_optimal_local_size_xyz(local_size_xyz); + pipeline_dropout->create(LayerShaderType::dropout, opt, specializations); + } + + // pack4 + if (shape.dims == 0 || elempack == 4) + { + pipeline_dropout_pack4 = new Pipeline(vkdev); + pipeline_dropout_pack4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_dropout_pack4->create(LayerShaderType::dropout_pack4, opt, specializations); + } + + // pack8 + if ((opt.use_shader_pack8 && shape.dims == 0) || elempack == 8) + { + pipeline_dropout_pack8 = new Pipeline(vkdev); + pipeline_dropout_pack8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_dropout_pack8->create(LayerShaderType::dropout_pack8, opt, specializations); + } + + return 0; +} + +int Dropout_vulkan::destroy_pipeline(const Option& /*opt*/) +{ + delete pipeline_dropout; + pipeline_dropout = 0; + + delete pipeline_dropout_pack4; + pipeline_dropout_pack4 = 0; + + delete pipeline_dropout_pack8; + pipeline_dropout_pack8 = 0; + + return 0; +} + +int Dropout_vulkan::record_pipeline(VkTensor& bottom_top_blob, VkCompute& cmd, const Option& /*opt*/) const +{ + if (scale == 1.f) + { + return 0; + } + + int elempack = bottom_top_blob.elempack; + + std::vector bindings(1); + bindings[0] = bottom_top_blob; + + std::vector constants(5); + constants[0].i = bottom_top_blob.dims; + constants[1].i = bottom_top_blob.w; + constants[2].i = bottom_top_blob.h; + constants[3].i = bottom_top_blob.c; + constants[4].i = bottom_top_blob.cstep; + + const Pipeline* pipeline = elempack == 8 ? pipeline_dropout_pack8 + : elempack == 4 ? pipeline_dropout_pack4 + : pipeline_dropout; + + cmd.record_pipeline(pipeline, bindings, constants, bottom_top_blob); + + return 0; +} + + + +} // namespace TEngine \ No newline at end of file diff --git a/source/device/vulkan/layer/dropout_vulkan.hpp b/source/device/vulkan/layer/dropout_vulkan.hpp new file mode 100644 index 000000000..b6e943889 --- /dev/null +++ b/source/device/vulkan/layer/dropout_vulkan.hpp @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#ifndef LAYER_DROPOUT_HPP +#define LAYER_DROPOUT_HPP + +#include "../vulkan_layer.hpp" +#include "../vulkan_command.hpp" + +namespace TEngine{ + +class Dropout_vulkan : public Layer +{ +public: + Dropout_vulkan(); + Dropout_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + // virtual int upload_model(VkTransfer& cmd, const Option& opt); + + virtual int record_pipeline(VkTensor& bottom_top_blob, VkCompute& cmd, const Option& opt) const; + +public: + Pipeline* pipeline_dropout; + Pipeline* pipeline_dropout_pack4; + Pipeline* pipeline_dropout_pack8; + +public: + int input_c; + int input_h; + int input_w; + int output_c; + int output_h; + int output_w; + float scale; + +}; + +} // namespace TEngine + +#endif \ No newline at end of file diff --git a/source/device/vulkan/layer/eltwise_vulkan.cpp b/source/device/vulkan/layer/eltwise_vulkan.cpp new file mode 100644 index 000000000..9fc322bc9 --- /dev/null +++ b/source/device/vulkan/layer/eltwise_vulkan.cpp @@ -0,0 +1,266 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. 
You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#include "eltwise_vulkan.hpp" +#include "../layer_shader_type.h" + +namespace TEngine { + +Eltwise_vulkan::Eltwise_vulkan() +{ + support_vulkan = true; + support_image_storage = false; + + pipeline_eltwise[0] = 0; + pipeline_eltwise[1] = 0; + pipeline_eltwise_pack4[0] = 0; + pipeline_eltwise_pack4[1] = 0; + pipeline_eltwise_pack8[0] = 0; + pipeline_eltwise_pack8[1] = 0; +} + +Eltwise_vulkan::Eltwise_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) +{ + support_vulkan = true; + support_image_storage = true; + + pipeline_eltwise[0] = 0; + pipeline_eltwise[1] = 0; + pipeline_eltwise_pack4[0] = 0; + pipeline_eltwise_pack4[1] = 0; + pipeline_eltwise_pack8[0] = 0; + pipeline_eltwise_pack8[1] = 0; + + graph = ir_graph; + node = ir_node; + + for(int i = 0; i < ir_node->input_num; i++) + { + struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[i]); + std::string name = input->name; + bottoms.push_back(name); + } + + for(int i = 0; i < ir_node->output_num; i++) + { + struct tensor *output = get_ir_graph_tensor(graph, node->output_tensors[i]); + std::string name = output->name; + tops.push_back(name); + } + + struct eltwise_param *param = (struct eltwise_param *)ir_node->op.param_mem; + op_type = (param->type) / 2; +} + +int Eltwise_vulkan::create_pipeline(const Option& opt) +{ + const Tensor& shape = Tensor(output_w, output_h, output_c, (void*)0); // top_shapes.empty() ? Tensor() : top_shapes[0]; + + int elempack = 1; + if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1; + if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1; + if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1; + + size_t elemsize; + if (opt.use_fp16_storage) + { + elemsize = elempack * 2u; + } + else if (opt.use_fp16_packed) + { + elemsize = elempack == 1 ? 4u : elempack * 2u; + } + else + { + elemsize = elempack * 4u; + } + + Tensor shape_packed; + if (shape.dims == 1) shape_packed = Tensor(shape.w / elempack, (void*)0, elemsize, elempack); + if (shape.dims == 2) shape_packed = Tensor(shape.w, shape.h / elempack, (void*)0, elemsize, elempack); + if (shape.dims == 3) shape_packed = Tensor(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack); + + std::vector specializations(2 + 5); + specializations[0].i = op_type; + specializations[1].i = 0; // coeffs.w == 0 ? 
0 : 1; TODO fix coeffs value + specializations[2 + 0].i = 0; // shape_packed.dims; + specializations[2 + 1].i = 0; // shape_packed.w; + specializations[2 + 2].i = 0; // shape_packed.h; + specializations[2 + 3].i = 0; // shape_packed.c; + specializations[2 + 4].i = 0; // shape_packed.cstep; + + Tensor local_size_xyz; + if (shape_packed.dims == 1) + { + local_size_xyz.w = std::min(64, shape_packed.w); + local_size_xyz.h = 1; + local_size_xyz.c = 1; + } + if (shape_packed.dims == 2) + { + local_size_xyz.w = std::min(8, shape_packed.w); + local_size_xyz.h = std::min(8, shape_packed.h); + local_size_xyz.c = 1; + } + if (shape_packed.dims == 3) + { + local_size_xyz.w = std::min(4, shape_packed.w); + local_size_xyz.h = std::min(4, shape_packed.h); + local_size_xyz.c = std::min(4, shape_packed.c); + } + + // pack1 + if (shape.dims == 0 || elempack == 1) + { + pipeline_eltwise[0] = new Pipeline(vkdev); + pipeline_eltwise[0]->set_optimal_local_size_xyz(local_size_xyz); + pipeline_eltwise[0]->create(LayerShaderType::eltwise, opt, specializations); + pipeline_eltwise[1] = new Pipeline(vkdev); + pipeline_eltwise[1]->set_optimal_local_size_xyz(local_size_xyz); + pipeline_eltwise[1]->create(LayerShaderType::eltwise, opt, specializations); + } + + // pack4 + if (shape.dims == 0 || elempack == 4) + { + pipeline_eltwise_pack4[0] = new Pipeline(vkdev); + pipeline_eltwise_pack4[0]->set_optimal_local_size_xyz(local_size_xyz); + pipeline_eltwise_pack4[0]->create(LayerShaderType::eltwise_pack4, opt, specializations); + pipeline_eltwise_pack4[1] = new Pipeline(vkdev); + pipeline_eltwise_pack4[1]->set_optimal_local_size_xyz(local_size_xyz); + pipeline_eltwise_pack4[1]->create(LayerShaderType::eltwise_pack4, opt, specializations); + } + + // pack8 + if ((opt.use_shader_pack8 && shape.dims == 0) || elempack == 8) + { + pipeline_eltwise_pack8[0] = new Pipeline(vkdev); + pipeline_eltwise_pack8[0]->set_optimal_local_size_xyz(local_size_xyz); + pipeline_eltwise_pack8[0]->create(LayerShaderType::eltwise_pack8, opt, specializations); + pipeline_eltwise_pack8[1] = new Pipeline(vkdev); + pipeline_eltwise_pack8[1]->set_optimal_local_size_xyz(local_size_xyz); + pipeline_eltwise_pack8[1]->create(LayerShaderType::eltwise_pack8, opt, specializations); + } + + return 0; +} + +int Eltwise_vulkan::destroy_pipeline(const Option& /*opt*/) +{ + delete pipeline_eltwise[0]; + delete pipeline_eltwise[1]; + pipeline_eltwise[0] = 0; + pipeline_eltwise[1] = 0; + + delete pipeline_eltwise_pack4[0]; + delete pipeline_eltwise_pack4[1]; + pipeline_eltwise_pack4[0] = 0; + pipeline_eltwise_pack4[1] = 0; + + delete pipeline_eltwise_pack8[0]; + delete pipeline_eltwise_pack8[1]; + pipeline_eltwise_pack8[0] = 0; + pipeline_eltwise_pack8[1] = 0; + + return 0; +} + +int Eltwise_vulkan::record_pipeline(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const +{ + const VkTensor& bottom_blob = bottom_blobs[0]; + const VkTensor& bottom_blob1 = bottom_blobs[1]; + + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + VkTensor& top_blob = top_blobs[0]; + top_blob.create(w, h, channels, elemsize, elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + std::vector bindings(3); + bindings[0] = bottom_blob; + bindings[1] = bottom_blob1; + bindings[2] = top_blob; + + std::vector constants(5 + 2); + constants[0].i = top_blob.dims; + constants[1].i = top_blob.w; + constants[2].i = top_blob.h; + 
constants[3].i = top_blob.c; + constants[4].i = top_blob.cstep; + constants[5].f = 1.0f; // coeffs.w == 0 ? 1.f : coeffs[0]; TODO fix coeffs value + constants[6].f = 1.0f; // coeffs.w == 0 ? 1.f : coeffs[1]; + + const Pipeline* pipeline = elempack == 8 ? pipeline_eltwise_pack8[1] + : elempack == 4 ? pipeline_eltwise_pack4[1] + : pipeline_eltwise[1]; + + cmd.record_pipeline(pipeline, bindings, constants, top_blob); + + for (size_t b = 2; b < bottom_blobs.size(); b++) + { + std::vector bindings(3); + bindings[0] = top_blob; + bindings[1] = bottom_blobs[b]; + bindings[2] = top_blob; // TODO use separated pipeline ? + + std::vector constants(5 + 2); + constants[0].i = top_blob.dims; + constants[1].i = top_blob.w; + constants[2].i = top_blob.h; + constants[3].i = top_blob.c; + constants[4].i = top_blob.cstep; + constants[5].f = 1.f; + constants[6].f = 1.0f; // coeffs.w == 0 ? 1 : coeffs[b]; TODO fixcoeffs value + + const Pipeline* pipeline = elempack == 8 ? pipeline_eltwise_pack8[b % 2] + : elempack == 4 ? pipeline_eltwise_pack4[b % 2] + : pipeline_eltwise[b % 2]; + + cmd.record_pipeline(pipeline, bindings, constants, top_blob); + } + + return 0; +} + +} // namespace TEngine \ No newline at end of file diff --git a/source/device/vulkan/layer/eltwise_vulkan.hpp b/source/device/vulkan/layer/eltwise_vulkan.hpp new file mode 100644 index 000000000..5830b076d --- /dev/null +++ b/source/device/vulkan/layer/eltwise_vulkan.hpp @@ -0,0 +1,99 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. 
You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#ifndef LAYER_ELTWISE_HPP +#define LAYER_ELTWISE_HPP + +#include "../vulkan_layer.hpp" +#include "../vulkan_command.hpp" + +#include "eltwise_param.h" + +namespace TEngine{ + +class Eltwise_vulkan : public Layer +{ +public: + Eltwise_vulkan(); + Eltwise_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + virtual int record_pipeline(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const; + +public: + Pipeline* pipeline_eltwise[2]; + Pipeline* pipeline_eltwise_pack4[2]; + Pipeline* pipeline_eltwise_pack8[2]; + +public: + enum EltType + { + ELT_PROD, + ELT_PROD_SCALAR, + ELT_SUM, + ELT_SUM_SCALAR, + ELT_SUB, + ELT_SUB_SCALAR, + ELT_MAX, + ELT_RSQRT, + ELT_MIN_SCALAR, + ELT_LAST, + ELT_DIV, + ELT_LOG, + ELT_EXP, + ELT_SQRT, + ELT_FLOOR, + ELT_SQUARE, + ELT_POW + }; + int op_type; // Operation_PROD = 0, Operation_SUM = 1, Operation_MAX = 2 + + int input_c; + int input_h; + int input_w; + int output_c; + int output_h; + int output_w; +}; + +} // namespace TEngine + +#endif \ No newline at end of file diff --git a/source/device/vulkan/layer/flatten_vulkan.cpp b/source/device/vulkan/layer/flatten_vulkan.cpp new file mode 100644 index 000000000..589b7d5d4 --- /dev/null +++ b/source/device/vulkan/layer/flatten_vulkan.cpp @@ -0,0 +1,326 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. 
You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#include "flatten_vulkan.hpp" +#include "../layer_shader_type.h" + +namespace TEngine { + +Flatten_vulkan::Flatten_vulkan() +{ + support_vulkan = true; + support_image_storage = false; + + pipeline_flatten = 0; + pipeline_flatten_pack4 = 0; + pipeline_flatten_pack1to4 = 0; + pipeline_flatten_pack8 = 0; + pipeline_flatten_pack1to8 = 0; + pipeline_flatten_pack4to8 = 0; +} + +Flatten_vulkan::Flatten_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) +{ + support_vulkan = true; + support_image_storage = true; + + pipeline_flatten = 0; + pipeline_flatten_pack4 = 0; + pipeline_flatten_pack1to4 = 0; + pipeline_flatten_pack8 = 0; + pipeline_flatten_pack1to8 = 0; + pipeline_flatten_pack4to8 = 0; + + graph = ir_graph; + node = ir_node; + + struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[0]); + std::string name = input->name; + bottoms.push_back(name); + + struct tensor *output = get_ir_graph_tensor(graph, node->output_tensors[0]); + name = output->name; + tops.push_back(name); + + // params + input_c = input->dims[1]; // param->input_channel; + input_h = input->dims[2]; + input_w = input->dims[3]; + output_c = output->dims[1]; // param->output_channel; + output_h = output->dims[2]; + output_w = output->dims[3]; + output_size = output->dims[3]*output->dims[2]*output->dims[1]; +} + +int Flatten_vulkan::create_pipeline(const Option& _opt) +{ + Option opt = _opt; + const Tensor& shape = Tensor(input_w, input_h, input_c, (void*)0); // bottom_shapes.empty() ? Mat() : bottom_shapes[0]; + // const Tensor& out_shape = Tensor(output_w, output_h, output_c, (void*)0); // top_shapes.empty() ? Mat() : top_shapes[0]; + const Tensor& out_shape = Tensor(output_size, (void*)0); // top_shapes.empty() ? Mat() : top_shapes[0]; + + + int elempack = 1; + if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1; + if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1; + if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1; + + int out_elempack = 1; + if (out_shape.dims == 1) out_elempack = opt.use_shader_pack8 && out_shape.w % 8 == 0 ? 8 : out_shape.w % 4 == 0 ? 4 : 1; + + size_t elemsize; + size_t out_elemsize; + if (opt.use_fp16_storage) + { + elemsize = elempack * 2u; + out_elemsize = out_elempack * 2u; + } + else if (opt.use_fp16_packed) + { + elemsize = elempack == 1 ? 4u : elempack * 2u; + out_elemsize = out_elempack == 1 ? 
4u : out_elempack * 2u; + } + else + { + elemsize = elempack * 4u; + out_elemsize = out_elempack * 4u; + } + + Tensor shape_packed; + if (shape.dims == 1) shape_packed = Tensor(shape.w / elempack, (void*)0, elemsize, elempack); + if (shape.dims == 2) shape_packed = Tensor(shape.w, shape.h / elempack, (void*)0, elemsize, elempack); + if (shape.dims == 3) shape_packed = Tensor(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack); + + Tensor out_shape_packed; + if (out_shape.dims == 1) out_shape_packed = Tensor(out_shape.w / out_elempack, (void*)0, out_elemsize, out_elempack); + + // if (!vkdev->shape_support_image_storage(shape_packed) || !vkdev->shape_support_image_storage(out_shape_packed)) + { + support_image_storage = false; + opt.use_image_storage = false; + } + + std::vector specializations(0 + 10); + specializations[0 + 0].i = 0; // shape_packed.dims; + specializations[0 + 1].i = 0; // shape_packed.w; + specializations[0 + 2].i = 0; // shape_packed.h; + specializations[0 + 3].i = 0; // shape_packed.c; + specializations[0 + 4].i = 0; // shape_packed.cstep; + specializations[0 + 5].i = 0; // out_shape_packed.dims; + specializations[0 + 6].i = 0; // out_shape_packed.w; + specializations[0 + 7].i = 0; // out_shape_packed.h; + specializations[0 + 8].i = 0; // out_shape_packed.c; + specializations[0 + 9].i = 0; // out_shape_packed.cstep; + + Tensor local_size_xyz(64, 1, 1, (void*)0); + if (out_shape_packed.dims != 0) + { + local_size_xyz.w = std::min(64, out_shape_packed.w); + local_size_xyz.h = 1; + local_size_xyz.c = 1; + } + + // pack1 + if (shape.dims == 0 || (elempack == 1 && out_elempack == 1)) + { + pipeline_flatten = new Pipeline(vkdev); + pipeline_flatten->set_optimal_local_size_xyz(local_size_xyz); + pipeline_flatten->create(LayerShaderType::flatten, opt, specializations); + } + + // pack4 + if (shape.dims == 0 || (elempack == 4 && out_elempack == 4)) + { + pipeline_flatten_pack4 = new Pipeline(vkdev); + pipeline_flatten_pack4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_flatten_pack4->create(LayerShaderType::flatten_pack4, opt, specializations); + } + + // pack1to4 + if (shape.dims == 0 || (elempack == 1 && out_elempack == 4)) + { + pipeline_flatten_pack1to4 = new Pipeline(vkdev); + pipeline_flatten_pack1to4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_flatten_pack1to4->create(LayerShaderType::flatten_pack1to4, opt, specializations); + } + + // pack8 + if ((opt.use_shader_pack8 && shape.dims == 0) || (elempack == 8 && out_elempack == 8)) + { + pipeline_flatten_pack8 = new Pipeline(vkdev); + pipeline_flatten_pack8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_flatten_pack8->create(LayerShaderType::flatten_pack8, opt, specializations); + } + + // pack1to8 + if ((opt.use_shader_pack8 && shape.dims == 0) || (elempack == 1 && out_elempack == 8)) + { + pipeline_flatten_pack1to8 = new Pipeline(vkdev); + pipeline_flatten_pack1to8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_flatten_pack1to8->create(LayerShaderType::flatten_pack1to8, opt, specializations); + } + + // pack4to8 + if ((opt.use_shader_pack8 && shape.dims == 0) || (elempack == 4 && out_elempack == 8)) + { + pipeline_flatten_pack4to8 = new Pipeline(vkdev); + pipeline_flatten_pack4to8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_flatten_pack4to8->create(LayerShaderType::flatten_pack4to8, opt, specializations); + } + + return 0; +} + + + +int Flatten_vulkan::destroy_pipeline(const Option& /*opt*/) +{ + delete pipeline_flatten; + pipeline_flatten = 0; + + delete 
pipeline_flatten_pack4; + pipeline_flatten_pack4 = 0; + + delete pipeline_flatten_pack1to4; + pipeline_flatten_pack1to4 = 0; + + delete pipeline_flatten_pack8; + pipeline_flatten_pack8 = 0; + + delete pipeline_flatten_pack1to8; + pipeline_flatten_pack1to8 = 0; + + delete pipeline_flatten_pack4to8; + pipeline_flatten_pack4to8 = 0; + + return 0; +} + +int Flatten_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const +{ + int dims = bottom_blob.dims; + + if (dims == 1) + { + top_blob = bottom_blob; + return 0; + } + + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int total = w * h * channels * elempack; + + int out_elempack = opt.use_shader_pack8 && total % 8 == 0 ? 8 : total % 4 == 0 ? 4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (opt.use_fp16_packed && !opt.use_fp16_storage) + { + if (out_elempack == 8) out_elemsize = 8 * 2u; + if (out_elempack == 4) out_elemsize = 4 * 2u; + if (out_elempack == 1) out_elemsize = 4u; + } + + if (dims == 2 && elempack == 1 && !(opt.use_fp16_packed && !opt.use_fp16_storage && out_elempack != 1)) + { + top_blob = bottom_blob; + top_blob.dims = 1; + top_blob.w = total / out_elempack; + top_blob.h = 1; + top_blob.cstep = top_blob.w; + top_blob.elemsize = out_elemsize; + top_blob.elempack = out_elempack; + return 0; + } + + top_blob.create(total / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + std::vector bindings(2); + bindings[0] = bottom_blob; + bindings[1] = top_blob; + + std::vector constants(10); + constants[0].i = bottom_blob.dims; + constants[1].i = bottom_blob.w; + constants[2].i = bottom_blob.h; + constants[3].i = bottom_blob.c; + constants[4].i = bottom_blob.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = top_blob.cstep; + + const Pipeline* pipeline = 0; + if (elempack == 1 && out_elempack == 1) + { + pipeline = pipeline_flatten; + } + else if (elempack == 4 && out_elempack == 4) + { + pipeline = pipeline_flatten_pack4; + } + else if (elempack == 1 && out_elempack == 4) + { + pipeline = pipeline_flatten_pack1to4; + } + else if (elempack == 8 /*&& out_elempack == 8*/) + { + pipeline = pipeline_flatten_pack8; + } + else if (elempack == 1 && out_elempack == 8) + { + pipeline = pipeline_flatten_pack1to8; + } + else if (elempack == 4 && out_elempack == 8) + { + pipeline = pipeline_flatten_pack4to8; + } + + cmd.record_pipeline(pipeline, bindings, constants, top_blob); + + return 0; +} + +} // namespace TEngine \ No newline at end of file diff --git a/source/device/vulkan/layer/flatten_vulkan.hpp b/source/device/vulkan/layer/flatten_vulkan.hpp new file mode 100644 index 000000000..91de06f9f --- /dev/null +++ b/source/device/vulkan/layer/flatten_vulkan.hpp @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#ifndef LAYER_FLATTEN_HPP +#define LAYER_FLATTEN_HPP + +#include "../vulkan_layer.hpp" +#include "../vulkan_command.hpp" + +#include "flatten_param.h" + +namespace TEngine{ + +class Flatten_vulkan : public Layer +{ +public: + Flatten_vulkan(); + Flatten_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const; + +public: + Pipeline* pipeline_flatten; + Pipeline* pipeline_flatten_pack4; + Pipeline* pipeline_flatten_pack1to4; + Pipeline* pipeline_flatten_pack8; + Pipeline* pipeline_flatten_pack1to8; + Pipeline* pipeline_flatten_pack4to8; + +public: + int input_c; + int input_h; + int input_w; + int output_c; + int output_h; + int output_w; + int output_size; + +}; + +} // namespace TEngine + +#endif \ No newline at end of file diff --git a/source/device/vulkan/layer/innerproduct_vulkan.cpp b/source/device/vulkan/layer/innerproduct_vulkan.cpp new file mode 100644 index 000000000..c4ba14e99 --- /dev/null +++ b/source/device/vulkan/layer/innerproduct_vulkan.cpp @@ -0,0 +1,464 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. 
You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#include "innerproduct_vulkan.hpp" +#include "../layer_shader_type.h" + +namespace TEngine { + +InnerProduct_vulkan::InnerProduct_vulkan() +{ + support_vulkan = true; + support_image_storage = true; + + flatten = 0; + + pipeline_innerproduct = 0; + pipeline_innerproduct_pack4 = 0; + pipeline_innerproduct_pack1to4 = 0; + pipeline_innerproduct_pack4to1 = 0; + pipeline_innerproduct_pack8 = 0; + pipeline_innerproduct_pack1to8 = 0; + pipeline_innerproduct_pack4to8 = 0; + pipeline_innerproduct_pack8to4 = 0; + pipeline_innerproduct_pack8to1 = 0; +} + +InnerProduct_vulkan::InnerProduct_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) +{ + support_vulkan = true; + support_image_storage = false; + + flatten = 0; + + pipeline_innerproduct = 0; + pipeline_innerproduct_pack4 = 0; + pipeline_innerproduct_pack1to4 = 0; + pipeline_innerproduct_pack4to1 = 0; + pipeline_innerproduct_pack8 = 0; + pipeline_innerproduct_pack1to8 = 0; + pipeline_innerproduct_pack4to8 = 0; + pipeline_innerproduct_pack8to4 = 0; + pipeline_innerproduct_pack8to1 = 0; + + graph = ir_graph; + node = ir_node; + + struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[0]); + std::string name = input->name; + bottoms.push_back(name); + + struct tensor *output = get_ir_graph_tensor(graph, node->output_tensors[0]); + name = output->name; + tops.push_back(name); + + struct fc_param *param = (struct fc_param *)ir_node->op.param_mem; + + num_output = param->num_output; + input_c = input->dims[1]; // param->input_channel; + input_h = input->dims[2]; + input_w = input->dims[3]; + output_c = output->dims[1]; // param->output_channel; + output_h = output->dims[2]; + output_w = output->dims[3]; + + struct tensor *weight = get_ir_graph_tensor(graph, node->input_tensors[1]); + weight_data_size = weight->elem_num; + + activation_type = -1; + +} + +int InnerProduct_vulkan::create_pipeline(const Option& _opt) +{ + Option opt = _opt; + const Tensor& shape = Tensor(input_w, input_h, input_c, (void*)0); // bottom_shapes.empty() ? Tensor() : bottom_shapes[0]; + const Tensor& out_shape = Tensor(output_w, output_h, output_c, (void*)0); // top_shapes.empty() ? Tensor() : top_shapes[0]; + + Tensor shape_flatten; + if (shape.dims != 0) + { + shape_flatten = Tensor(shape.w * shape.h * shape.c, (void*)0); + } + + int num_input = weight_data_size / num_output; + + int elempack = opt.use_shader_pack8 && num_input % 8 == 0 ? 8 : num_input % 4 == 0 ? 4 : 1; + int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; + + size_t elemsize; + size_t out_elemsize; + if (opt.use_fp16_storage) + { + elemsize = elempack * 2u; + out_elemsize = out_elempack * 2u; + } + else if (opt.use_fp16_packed) + { + elemsize = elempack == 1 ? 4u : elempack * 2u; + out_elemsize = out_elempack == 1 ? 
4u : out_elempack * 2u; + } + else + { + elemsize = elempack * 4u; + out_elemsize = out_elempack * 4u; + } + + Tensor shape_flatten_packed; + if (shape_flatten.dims == 1) shape_flatten_packed = Tensor(shape_flatten.w / elempack, (void*)0, elemsize, elempack); + + Tensor out_shape_packed; + if (out_shape.dims == 1) out_shape_packed = Tensor(out_shape.w / out_elempack, (void*)0, out_elemsize, out_elempack); + + { + support_image_storage = false; + opt.use_image_storage = false; + } + + { + flatten = new Flatten_vulkan(); + flatten->vkdev = vkdev; + + flatten->input_w = shape.w; + flatten->input_h = shape.h; + flatten->input_c = shape.c; + flatten->output_w = shape_flatten.w; + flatten->output_h = shape_flatten.h; + flatten->output_c = shape_flatten.c; + flatten->output_size = shape_flatten.w*shape_flatten.h*shape_flatten.c; + + flatten->create_pipeline(opt); + } + + + std::vector specializations(4 + 10); + specializations[0].i = bias_term; + specializations[1].i = activation_type; + specializations[2].f = 0.f; // activation_params.w >= 1 ? activation_params[0] : 0.f; + specializations[3].f = 0.f; // activation_params.w == 2 ? activation_params[1] : 0.f; + specializations[4 + 0].i = 0; // shape_flatten_packed.dims; + specializations[4 + 1].i = 0; // shape_flatten_packed.w; + specializations[4 + 2].i = 0; // shape_flatten_packed.h; + specializations[4 + 3].i = 0; // shape_flatten_packed.c; + specializations[4 + 4].i = 0; // shape_flatten_packed.cstep; + specializations[4 + 5].i = 0; // out_shape_packed.dims; + specializations[4 + 6].i = 0; // out_shape_packed.w; + specializations[4 + 7].i = 0; // out_shape_packed.h; + specializations[4 + 8].i = 0; // out_shape_packed.c; + specializations[4 + 9].i = 0; // out_shape_packed.cstep; + + Tensor local_size_xyz(std::min(64, num_output / out_elempack), 1, 1, (void*)0); + if (out_shape_packed.dims != 0) + { + local_size_xyz.w = std::min(64, out_shape_packed.w); + local_size_xyz.h = 1; + local_size_xyz.c = 1; + } + + // pack1 + if (elempack == 1 && out_elempack == 1) + { + pipeline_innerproduct = new Pipeline(vkdev); + pipeline_innerproduct->set_optimal_local_size_xyz(local_size_xyz); + pipeline_innerproduct->create(LayerShaderType::innerproduct, opt, specializations); + } + + // pack4 + if (elempack == 4 && out_elempack == 4) + { + pipeline_innerproduct_pack4 = new Pipeline(vkdev); + pipeline_innerproduct_pack4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_innerproduct_pack4->create(LayerShaderType::innerproduct_pack4, opt, specializations); + } + + // pack1to4 + if (elempack == 1 && out_elempack == 4) + { + pipeline_innerproduct_pack1to4 = new Pipeline(vkdev); + pipeline_innerproduct_pack1to4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_innerproduct_pack1to4->create(LayerShaderType::innerproduct_pack1to4, opt, specializations); + } + + // pack4to1 + if (elempack == 4 && out_elempack == 1) + { + pipeline_innerproduct_pack4to1 = new Pipeline(vkdev); + pipeline_innerproduct_pack4to1->set_optimal_local_size_xyz(local_size_xyz); + pipeline_innerproduct_pack4to1->create(LayerShaderType::innerproduct_pack4to1, opt, specializations); + } + + // pack8 + if (elempack == 8 && out_elempack == 8) + { + pipeline_innerproduct_pack8 = new Pipeline(vkdev); + pipeline_innerproduct_pack8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_innerproduct_pack8->create(LayerShaderType::innerproduct_pack8, opt, specializations); + } + + // pack1to8 + if (elempack == 1 && out_elempack == 8) + { + pipeline_innerproduct_pack1to8 = new Pipeline(vkdev); + 
pipeline_innerproduct_pack1to8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_innerproduct_pack1to8->create(LayerShaderType::innerproduct_pack1to8, opt, specializations); + } + + // pack4to8 + if (elempack == 4 && out_elempack == 8) + { + pipeline_innerproduct_pack4to8 = new Pipeline(vkdev); + pipeline_innerproduct_pack4to8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_innerproduct_pack4to8->create(LayerShaderType::innerproduct_pack4to8, opt, specializations); + } + + // pack8to4 + if (elempack == 8 && out_elempack == 4) + { + pipeline_innerproduct_pack8to4 = new Pipeline(vkdev); + pipeline_innerproduct_pack8to4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_innerproduct_pack8to4->create(LayerShaderType::innerproduct_pack8to4, opt, specializations); + } + + // pack8to1 + if (elempack == 8 && out_elempack == 1) + { + pipeline_innerproduct_pack8to1 = new Pipeline(vkdev); + pipeline_innerproduct_pack8to1->set_optimal_local_size_xyz(local_size_xyz); + pipeline_innerproduct_pack8to1->create(LayerShaderType::innerproduct_pack8to1, opt, specializations); + } + + return 0; +} + +int InnerProduct_vulkan::destroy_pipeline(const Option& opt) +{ + if (flatten) + { + flatten->destroy_pipeline(opt); + delete flatten; + flatten = 0; + } + + delete pipeline_innerproduct; + pipeline_innerproduct = 0; + + delete pipeline_innerproduct_pack4; + pipeline_innerproduct_pack4 = 0; + + delete pipeline_innerproduct_pack1to4; + pipeline_innerproduct_pack1to4 = 0; + + delete pipeline_innerproduct_pack4to1; + pipeline_innerproduct_pack4to1 = 0; + + delete pipeline_innerproduct_pack8; + pipeline_innerproduct_pack8 = 0; + + delete pipeline_innerproduct_pack1to8; + pipeline_innerproduct_pack1to8 = 0; + + delete pipeline_innerproduct_pack4to8; + pipeline_innerproduct_pack4to8 = 0; + + delete pipeline_innerproduct_pack8to4; + pipeline_innerproduct_pack8to4 = 0; + + delete pipeline_innerproduct_pack8to1; + pipeline_innerproduct_pack8to1 = 0; + + return 0; +} + +int InnerProduct_vulkan::upload_model(VkTransfer& cmd, const Option& opt) +{ + int num_input = weight_data_size / num_output; + + int elempack = opt.use_shader_pack8 && num_input % 8 == 0 ? 8 : num_input % 4 == 0 ? 4 : 1; + int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 
4 : 1; + + // src = inch-outch + // dst = pa-pb-inch/pa-outch/pb + tensor* weight_tensor = get_ir_graph_tensor(graph, node->input_tensors[1]); + Tensor weight_data = Tensor(weight_tensor->elem_num, weight_tensor->data); + Tensor weight_data_packed; + { + Tensor weight_data_r2 = weight_data.reshape(num_input, num_output); + + weight_data_packed.create(num_input / elempack, num_output / out_elempack, (size_t)4 * elempack * out_elempack, elempack * out_elempack); + + for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) + { + float* g00 = weight_data_packed.row(q / out_elempack); + + for (int p = 0; p + (elempack - 1) < num_input; p += elempack) + { + for (int i = 0; i < out_elempack; i++) + { + const float* k0 = weight_data_r2.row(q + i); + k0 += p; + + for (int j = 0; j < elempack; j++) + { + g00[0] = k0[j]; + + g00++; + } + } + } + } + } + + if (support_image_storage && opt.use_image_storage) + { + // cmd.record_upload(weight_data_packed, weight_data_gpu_image, opt); + } + else + { + cmd.record_upload(weight_data_packed, weight_data_gpu, opt); + } + + if (bias_term) + { + tensor* bias_tensor = get_ir_graph_tensor(graph, node->input_tensors[2]); + Tensor bias_data = Tensor(bias_tensor->elem_num, bias_tensor->data); + Tensor bias_data_packed; + convert_packing(bias_data, bias_data_packed, out_elempack); + + if (support_image_storage && opt.use_image_storage) + { + // cmd.record_upload(bias_data_packed, bias_data_gpu_image, opt); + } + else + { + cmd.record_upload(bias_data_packed, bias_data_gpu, opt); + } + } + return 0; +} + +int InnerProduct_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const +{ + // flatten + VkTensor bottom_blob_flattened = bottom_blob; + { + Option opt_flatten = opt; + opt_flatten.blob_vkallocator = opt.workspace_vkallocator; + + flatten->record_pipeline(bottom_blob, bottom_blob_flattened, cmd, opt_flatten); + } + + size_t elemsize = bottom_blob_flattened.elemsize; + int elempack = bottom_blob_flattened.elempack; + + int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 
4 : 1; + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (opt.use_fp16_packed && !opt.use_fp16_storage) + { + if (out_elempack == 8) out_elemsize = 8 * 2u; + if (out_elempack == 4) out_elemsize = 4 * 2u; + if (out_elempack == 1) out_elemsize = 4u; + } + + top_blob.create(num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + std::vector bindings(4); + bindings[0] = bottom_blob_flattened; + bindings[1] = top_blob; + bindings[2] = weight_data_gpu; + bindings[3] = bias_data_gpu; + + std::vector constants(10); + constants[0].i = bottom_blob_flattened.dims; + constants[1].i = bottom_blob_flattened.w; + constants[2].i = bottom_blob_flattened.h; + constants[3].i = bottom_blob_flattened.c; + constants[4].i = bottom_blob_flattened.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = top_blob.cstep; + + const Pipeline* pipeline = 0; + if (elempack == 1 && out_elempack == 1) + { + pipeline = pipeline_innerproduct; + } + else if (elempack == 4 && out_elempack == 4) + { + pipeline = pipeline_innerproduct_pack4; + } + else if (elempack == 1 && out_elempack == 4) + { + pipeline = pipeline_innerproduct_pack1to4; + } + else if (elempack == 4 && out_elempack == 1) + { + pipeline = pipeline_innerproduct_pack4to1; + } + else if (elempack == 8 && out_elempack == 8) + { + pipeline = pipeline_innerproduct_pack8; + } + else if (elempack == 1 && out_elempack == 8) + { + pipeline = pipeline_innerproduct_pack1to8; + } + else if (elempack == 4 && out_elempack == 8) + { + pipeline = pipeline_innerproduct_pack4to8; + } + else if (elempack == 8 && out_elempack == 4) + { + pipeline = pipeline_innerproduct_pack8to4; + } + else if (elempack == 8 && out_elempack == 1) + { + pipeline = pipeline_innerproduct_pack8to1; + } + + cmd.record_pipeline(pipeline, bindings, constants, top_blob); + + return 0; +} + +} // namespace TEngine \ No newline at end of file diff --git a/source/device/vulkan/layer/innerproduct_vulkan.hpp b/source/device/vulkan/layer/innerproduct_vulkan.hpp new file mode 100644 index 000000000..c682bcb46 --- /dev/null +++ b/source/device/vulkan/layer/innerproduct_vulkan.hpp @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. 
You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#ifndef LAYER_INNERPRODUCT_HPP +#define LAYER_INNERPRODUCT_HPP + +#include "../vulkan_layer.hpp" +#include "../vulkan_command.hpp" +#include "padding_vulkan.hpp" +#include "flatten_vulkan.hpp" + +#include "fc_param.h" + +namespace TEngine { + +class InnerProduct_vulkan : public Layer +{ +public: + InnerProduct_vulkan(); + InnerProduct_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + virtual int upload_model(VkTransfer& cmd, const Option& opt); + + virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const; + +public: + Flatten_vulkan* flatten; + + VkTensor weight_data_gpu; + VkTensor bias_data_gpu; + + Pipeline* pipeline_innerproduct; + Pipeline* pipeline_innerproduct_pack4; + Pipeline* pipeline_innerproduct_pack1to4; + Pipeline* pipeline_innerproduct_pack4to1; + Pipeline* pipeline_innerproduct_pack8; + Pipeline* pipeline_innerproduct_pack1to8; + Pipeline* pipeline_innerproduct_pack4to8; + Pipeline* pipeline_innerproduct_pack8to4; + Pipeline* pipeline_innerproduct_pack8to1; + +public: + // param + int num_output; + int bias_term; + + int weight_data_size; + + int int8_scale_term; + + // 0=none 1=relu 2=leakyrelu 3=clip 4=sigmoid + int activation_type; + Tensor activation_params; + + int input_c; + int input_h; + int input_w; + int output_c; + int output_h; + int output_w; +}; + +} // namespace TEngine + +#endif // LAYER_INNERPRODUCT_VULKAN_H \ No newline at end of file diff --git a/source/device/vulkan/layer/interp_vulkan.cpp b/source/device/vulkan/layer/interp_vulkan.cpp new file mode 100644 index 000000000..586846b72 --- /dev/null +++ b/source/device/vulkan/layer/interp_vulkan.cpp @@ -0,0 +1,464 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. 
You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#include "interp_vulkan.hpp" +#include "../layer_shader_type.h" + +namespace TEngine { + +Interp_vulkan::Interp_vulkan() +{ + support_vulkan = true; + support_image_storage = false; + + pipeline_interp = 0; + pipeline_interp_pack4 = 0; + pipeline_interp_pack8 = 0; + + pipeline_interp_bicubic_coeffs_x = 0; + pipeline_interp_bicubic_coeffs_y = 0; + pipeline_interp_bicubic = 0; + pipeline_interp_bicubic_pack4 = 0; + pipeline_interp_bicubic_pack8 = 0; +} + +Interp_vulkan::Interp_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) +{ + support_vulkan = true; + support_image_storage = false; + + pipeline_interp = 0; + pipeline_interp_pack4 = 0; + pipeline_interp_pack8 = 0; + + pipeline_interp_bicubic_coeffs_x = 0; + pipeline_interp_bicubic_coeffs_y = 0; + pipeline_interp_bicubic = 0; + pipeline_interp_bicubic_pack4 = 0; + pipeline_interp_bicubic_pack8 = 0; + + graph = ir_graph; + node = ir_node; + + struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[0]); + std::string name = input->name; + bottoms.push_back(name); + + struct tensor *output = get_ir_graph_tensor(graph, node->output_tensors[0]); + name = output->name; + tops.push_back(name); + + // params + input_c = input->dims[1]; // param->input_channel; + input_h = input->dims[2]; + input_w = input->dims[3]; + output_c = output->dims[1]; // param->output_channel; + output_h = output->dims[2]; + output_w = output->dims[3]; + + struct interp_param *param = (struct interp_param *)ir_node->op.param_mem; + + if (param->height_scale != 0 && param->width_scale != 0) + { + output_height = input_h * param->height_scale; + output_width = input_w * param->width_scale; + } + else + { + height_scale = (float )output->dims[2] / (float )input_h; + width_scale = (float )output->dims[2] / (float )input_w; + } + resize_type = 2;//param->resize_type; +} + +int Interp_vulkan::create_pipeline(const Option& _opt) +{ + Option opt = _opt; + const Tensor& shape = Tensor(input_w, input_h, input_c, (void*)0); // bottom_shapes.empty() ? Mat() : bottom_shapes[0]; + const Tensor& out_shape = Tensor(output_w, output_h, output_c, (void*)0); // top_shapes.empty() ? Mat() : top_shapes[0]; + + int elempack = 1; + if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1; + if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1; + if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1; + + int out_elempack = 1; + if (out_shape.dims == 1) out_elempack = opt.use_shader_pack8 && out_shape.w % 8 == 0 ? 8 : out_shape.w % 4 == 0 ? 4 : 1; + if (out_shape.dims == 2) out_elempack = opt.use_shader_pack8 && out_shape.h % 8 == 0 ? 8 : out_shape.h % 4 == 0 ? 4 : 1; + if (out_shape.dims == 3) out_elempack = opt.use_shader_pack8 && out_shape.c % 8 == 0 ? 8 : out_shape.c % 4 == 0 ? 4 : 1; + + size_t elemsize; + size_t out_elemsize; + if (opt.use_fp16_storage) + { + elemsize = elempack * 2u; + out_elemsize = out_elempack * 2u; + } + else if (opt.use_fp16_packed) + { + elemsize = elempack == 1 ? 4u : elempack * 2u; + out_elemsize = out_elempack == 1 ? 
4u : out_elempack * 2u; + } + else + { + elemsize = elempack * 4u; + out_elemsize = out_elempack * 4u; + } + + Tensor shape_packed; + if (shape.dims == 1) shape_packed = Tensor(shape.w / elempack, (void*)0, elemsize, elempack); + if (shape.dims == 2) shape_packed = Tensor(shape.w, shape.h / elempack, (void*)0, elemsize, elempack); + if (shape.dims == 3) shape_packed = Tensor(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack); + + Tensor out_shape_packed; + if (out_shape.dims == 1) out_shape_packed = Tensor(out_shape.w / out_elempack, (void*)0, out_elemsize, out_elempack); + if (out_shape.dims == 2) out_shape_packed = Tensor(out_shape.w, out_shape.h / out_elempack, (void*)0, out_elemsize, out_elempack); + if (out_shape.dims == 3) out_shape_packed = Tensor(out_shape.w, out_shape.h, out_shape.c / out_elempack, (void*)0, out_elemsize, out_elempack); + + // check blob shape + // if (!vkdev->shape_support_image_storage(shape_packed) || !vkdev->shape_support_image_storage(out_shape_packed)) + { + support_image_storage = false; + opt.use_image_storage = false; + } + + if (resize_type == 1 || resize_type == 2) + { + std::vector specializations(1 + 10); + specializations[0].i = resize_type; + specializations[1 + 0].i = 0; // shape_packed.dims; + specializations[1 + 1].i = 0; // shape_packed.w; + specializations[1 + 2].i = 0; // shape_packed.h; + specializations[1 + 3].i = 0; // shape_packed.c; + specializations[1 + 4].i = 0; // shape_packed.cstep; + specializations[1 + 5].i = 0; // out_shape_packed.dims; + specializations[1 + 6].i = 0; // out_shape_packed.w; + specializations[1 + 7].i = 0; // out_shape_packed.h; + specializations[1 + 8].i = 0; // out_shape_packed.c; + specializations[1 + 9].i = 0; // out_shape_packed.cstep; + + Tensor local_size_xyz; + if (out_shape_packed.dims == 2) + { + local_size_xyz.w = std::min(8, out_shape_packed.w); + local_size_xyz.h = std::min(8, out_shape_packed.h); + local_size_xyz.c = 1; + } + if (out_shape_packed.dims == 3) + { + local_size_xyz.w = std::min(4, out_shape_packed.w); + local_size_xyz.h = std::min(4, out_shape_packed.h); + local_size_xyz.c = std::min(4, out_shape_packed.c); + } + + // pack1 + if (shape.dims == 0 || elempack == 1) + { + pipeline_interp = new Pipeline(vkdev); + pipeline_interp->set_optimal_local_size_xyz(local_size_xyz); + pipeline_interp->create(LayerShaderType::interp, opt, specializations); + } + + // pack4 + if (shape.dims == 0 || elempack == 4) + { + pipeline_interp_pack4 = new Pipeline(vkdev); + pipeline_interp_pack4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_interp_pack4->create(LayerShaderType::interp_pack4, opt, specializations); + } + + // pack8 + if ((opt.use_shader_pack8 && shape.dims == 0) || elempack == 8) + { + pipeline_interp_pack8 = new Pipeline(vkdev); + pipeline_interp_pack8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_interp_pack8->create(LayerShaderType::interp_pack8, opt, specializations); + } + } + + if (resize_type == 3) + { + { + std::vector specializations(0 + 2); + specializations[0 + 0].i = shape_packed.w; + specializations[0 + 1].i = out_shape_packed.w; + + Tensor local_size_xyz(64, 1, 1, (void*)0); + if (out_shape_packed.dims != 0) + { + local_size_xyz.w = std::min(64, out_shape_packed.w); + local_size_xyz.h = 1; + local_size_xyz.c = 1; + } + + pipeline_interp_bicubic_coeffs_x = new Pipeline(vkdev); + pipeline_interp_bicubic_coeffs_x->set_optimal_local_size_xyz(local_size_xyz); + pipeline_interp_bicubic_coeffs_x->create(LayerShaderType::interp_bicubic_coeffs, opt, 
specializations); + } + { + std::vector specializations(0 + 2); + specializations[0 + 0].i = shape_packed.h; + specializations[0 + 1].i = out_shape_packed.h; + + Tensor local_size_xyz(64, 1, 1, (void*)0); + if (out_shape_packed.dims != 0) + { + local_size_xyz.w = std::min(64, out_shape_packed.h); + local_size_xyz.h = 1; + local_size_xyz.c = 1; + } + + pipeline_interp_bicubic_coeffs_y = new Pipeline(vkdev); + pipeline_interp_bicubic_coeffs_y->set_optimal_local_size_xyz(local_size_xyz); + pipeline_interp_bicubic_coeffs_y->create(LayerShaderType::interp_bicubic_coeffs, opt, specializations); + } + + std::vector specializations(0 + 10); + specializations[0 + 0].i = 0; // shape_packed.dims; + specializations[0 + 1].i = 0; // shape_packed.w; + specializations[0 + 2].i = 0; // shape_packed.h; + specializations[0 + 3].i = 0; // shape_packed.c; + specializations[0 + 4].i = 0; // shape_packed.cstep; + specializations[0 + 5].i = 0; // out_shape_packed.dims; + specializations[0 + 6].i = 0; // out_shape_packed.w; + specializations[0 + 7].i = 0; // out_shape_packed.h; + specializations[0 + 8].i = 0; // out_shape_packed.c; + specializations[0 + 9].i = 0; // out_shape_packed.cstep; + + Tensor local_size_xyz; + if (out_shape_packed.dims == 2) + { + local_size_xyz.w = std::min(8, out_shape_packed.w); + local_size_xyz.h = std::min(8, out_shape_packed.h); + local_size_xyz.c = 1; + } + if (out_shape_packed.dims == 3) + { + local_size_xyz.w = std::min(4, out_shape_packed.w); + local_size_xyz.h = std::min(4, out_shape_packed.h); + local_size_xyz.c = std::min(4, out_shape_packed.c); + } + + // pack1 + if (shape.dims == 0 || elempack == 1) + { + pipeline_interp_bicubic = new Pipeline(vkdev); + pipeline_interp_bicubic->set_optimal_local_size_xyz(local_size_xyz); + pipeline_interp_bicubic->create(LayerShaderType::interp_bicubic, opt, specializations); + } + + // pack4 + if (shape.dims == 0 || elempack == 4) + { + pipeline_interp_bicubic_pack4 = new Pipeline(vkdev); + pipeline_interp_bicubic_pack4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_interp_bicubic_pack4->create(LayerShaderType::interp_bicubic_pack4, opt, specializations); + } + + // pack8 + if ((opt.use_shader_pack8 && shape.dims == 0) || elempack == 8) + { + pipeline_interp_bicubic_pack8 = new Pipeline(vkdev); + pipeline_interp_bicubic_pack8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_interp_bicubic_pack8->create(LayerShaderType::interp_bicubic_pack8, opt, specializations); + } + } + + return 0; +} + +int Interp_vulkan::destroy_pipeline(const Option& /*opt*/) +{ + delete pipeline_interp; + pipeline_interp = 0; + + delete pipeline_interp_pack4; + pipeline_interp_pack4 = 0; + + delete pipeline_interp_pack8; + pipeline_interp_pack8 = 0; + + delete pipeline_interp_bicubic_coeffs_x; + pipeline_interp_bicubic_coeffs_x = 0; + + delete pipeline_interp_bicubic_coeffs_y; + pipeline_interp_bicubic_coeffs_y = 0; + + delete pipeline_interp_bicubic; + pipeline_interp_bicubic = 0; + + delete pipeline_interp_bicubic_pack4; + pipeline_interp_bicubic_pack4 = 0; + + delete pipeline_interp_bicubic_pack8; + pipeline_interp_bicubic_pack8 = 0; + + return 0; +} + +int Interp_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = output_width; + int outh = output_height; + if (outw == 0 || outh == 0) + { + outw = w * 
width_scale; + outh = h * height_scale; + } + + if (outh == h && outw == w) + { + top_blob = bottom_blob; + return 0; + } + + top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + if (resize_type == 1 || resize_type == 2) // nearest or bilinear + { + std::vector bindings(2); + bindings[0] = bottom_blob; + bindings[1] = top_blob; + + std::vector constants(12); + constants[0].i = bottom_blob.dims; + constants[1].i = bottom_blob.w; + constants[2].i = bottom_blob.h; + constants[3].i = bottom_blob.c; + constants[4].i = bottom_blob.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = top_blob.cstep; + constants[10].f = w / (float)outw; + constants[11].f = h / (float)outh; + + const Pipeline* pipeline = elempack == 8 ? pipeline_interp_pack8 + : elempack == 4 ? pipeline_interp_pack4 + : pipeline_interp; + + cmd.record_pipeline(pipeline, bindings, constants, top_blob); + } + + else if (resize_type == 3) // bicubic + { + VkTensor alpha(outw, (size_t)(elemsize / elempack * 4), 4, opt.workspace_vkallocator); + if (alpha.empty()) + return -100; + + VkTensor xofs(outw, (size_t)4u, 1, opt.workspace_vkallocator); + if (xofs.empty()) + return -100; + + { + std::vector bindings(2); + bindings[0] = alpha; + bindings[1] = xofs; + + std::vector constants(3); + constants[0].i = bottom_blob.w; + constants[1].i = outw; + constants[2].f = (float)bottom_blob.w / outw; + + // record + cmd.record_pipeline(pipeline_interp_bicubic_coeffs_x, bindings, constants, alpha); + } + + VkTensor beta(outh, (size_t)(elemsize / elempack * 4), 4, opt.workspace_vkallocator); + if (beta.empty()) + return -100; + + VkTensor yofs(outh, (size_t)4u, 1, opt.workspace_vkallocator); + if (yofs.empty()) + return -100; + + { + std::vector bindings(2); + bindings[0] = beta; + bindings[1] = yofs; + + std::vector constants(3); + constants[0].i = bottom_blob.h; + constants[1].i = outh; + constants[2].f = (float)bottom_blob.h / outh; + + // record + cmd.record_pipeline(pipeline_interp_bicubic_coeffs_y, bindings, constants, beta); + } + + std::vector bindings(6); + bindings[0] = bottom_blob; + bindings[1] = top_blob; + bindings[2] = alpha; + bindings[3] = xofs; + bindings[4] = beta; + bindings[5] = yofs; + + std::vector constants(10); + constants[0].i = bottom_blob.dims; + constants[1].i = bottom_blob.w; + constants[2].i = bottom_blob.h; + constants[3].i = bottom_blob.c; + constants[4].i = bottom_blob.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = top_blob.cstep; + + const Pipeline* pipeline = elempack == 8 ? pipeline_interp_bicubic_pack8 + : elempack == 4 ? pipeline_interp_bicubic_pack4 + : pipeline_interp_bicubic; + + cmd.record_pipeline(pipeline, bindings, constants, top_blob); + } + + return 0; +} + +} // TEngine \ No newline at end of file diff --git a/source/device/vulkan/layer/interp_vulkan.hpp b/source/device/vulkan/layer/interp_vulkan.hpp new file mode 100644 index 000000000..ef3886f45 --- /dev/null +++ b/source/device/vulkan/layer/interp_vulkan.hpp @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#ifndef LAYER_INTERP_HPP +#define LAYER_INTERP_HPP + +#include "../vulkan_layer.hpp" +#include "../vulkan_command.hpp" + +#include "interp_param.h" + +namespace TEngine{ + +class Interp_vulkan : public Layer +{ +public: + Interp_vulkan(); + Interp_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + // virtual int upload_model(VkTransfer& cmd, const Option& opt); + + virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const; + +public: + Pipeline* pipeline_interp; + Pipeline* pipeline_interp_pack4; + Pipeline* pipeline_interp_pack8; + + Pipeline* pipeline_interp_bicubic_coeffs_x; + Pipeline* pipeline_interp_bicubic_coeffs_y; + Pipeline* pipeline_interp_bicubic; + Pipeline* pipeline_interp_bicubic_pack4; + Pipeline* pipeline_interp_bicubic_pack8; + +public: + int input_c; + int input_h; + int input_w; + int output_c; + int output_h; + int output_w; + + int resize_type; //1=nearest 2=bilinear 3=bicubic + int output_height; + int output_width; + float height_scale; + float width_scale; + + +}; + +} // namespace TEngine + +#endif \ No newline at end of file diff --git a/source/device/vulkan/layer/packing_vulkan.cpp b/source/device/vulkan/layer/packing_vulkan.cpp new file mode 100644 index 000000000..86a6c9538 --- /dev/null +++ b/source/device/vulkan/layer/packing_vulkan.cpp @@ -0,0 +1,495 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#include "packing_vulkan.hpp" +#include "../layer_shader_type.h" + +namespace TEngine { + +Packing_vulkan::Packing_vulkan() +{ + support_vulkan = true; + // support_image_storage = true; + + pipeline_packing = 0; + pipeline_packing_pack4 = 0; + pipeline_packing_pack8 = 0; + pipeline_packing_pack1to4 = 0; + pipeline_packing_pack4to1 = 0; + pipeline_packing_pack1to8 = 0; + pipeline_packing_pack4to8 = 0; + pipeline_packing_pack8to4 = 0; + pipeline_packing_pack8to1 = 0; +} + +int Packing_vulkan::create_pipeline(const Option& _opt) +{ + + + Option opt = _opt; + // const Mat& shape = bottom_shapes.empty() ? Mat() : bottom_shapes[0]; + // const Mat& out_shape = top_shapes.empty() ? Mat() : top_shapes[0]; + + size_t out_elemsize; + if (opt.use_fp16_storage) + { + out_elemsize = out_elempack * 2u; + } + else if (opt.use_fp16_packed) + { + out_elemsize = out_elempack == 1 ? 4u : out_elempack * 2u; + } + else + { + out_elemsize = out_elempack * 4u; + } + + // type casting override + if (cast_type_to == 1) + { + out_elemsize = out_elempack * 4u; + } + + // Mat out_shape_packed; + // if (out_shape.dims == 1) out_shape_packed = Mat(out_shape.w / out_elempack, (void*)0, out_elemsize, out_elempack); + // if (out_shape.dims == 2) out_shape_packed = Mat(out_shape.w, out_shape.h / out_elempack, (void*)0, out_elemsize, out_elempack); + // if (out_shape.dims == 3) out_shape_packed = Mat(out_shape.w, out_shape.h, out_shape.c / out_elempack, (void*)0, out_elemsize, out_elempack); + + + // check blob shape + // if (!vkdev->shape_support_image_storage(out_shape_packed)) + { + // support_image_storage = false; + opt.use_image_storage = false; + } + + std::vector specializations(2 + 10); + specializations[0].i = storage_type_from; + specializations[1].i = storage_type_to; + specializations[2 + 0].i = 0;// FIXME shape elempack may be dynamic + specializations[2 + 1].i = 0; + specializations[2 + 2].i = 0; + specializations[2 + 3].i = 0; + specializations[2 + 4].i = 0; + specializations[2 + 5].i = 0; //out_shape_packed_dims; + specializations[2 + 6].i = 0; //out_shape_packed_w; + specializations[2 + 7].i = 0; //out_shape_packed_h; + specializations[2 + 8].i = 0; //out_shape_packed_c; + specializations[2 + 9].i = 0; //out_shape_packed_cstep; + + + // printf("out shape dims:%d ---------------------------------\n", out_shape_packed_dims); + + VkTensor local_size_xyz;// TODO more precise group size guessed from out_shape_packed + if (out_shape_packed_dims == 1) + { + local_size_xyz.w = 64; + local_size_xyz.h = 1; + local_size_xyz.c = 1; + } + if (out_shape_packed_dims == 2) + { + local_size_xyz.w = 8; + local_size_xyz.h = 8; + local_size_xyz.c = 1; + } + if (out_shape_packed_dims == 3) + { + local_size_xyz.w = 4; + local_size_xyz.h = 4; + local_size_xyz.c = 4; + } + + if (out_elempack == 8) + { + pipeline_packing_pack8 = new Pipeline(vkdev); + pipeline_packing_pack8->set_optimal_local_size_xyz(local_size_xyz); + 
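+ // the pack1to8 / pack4to8 pipelines below reuse the same optimal local work-group size;
+ // the shader variant (plain, fp32_to_fp16 or fp16_to_fp32) is selected from cast_type_from / cast_type_to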
+ pipeline_packing_pack1to8 = new Pipeline(vkdev); + pipeline_packing_pack1to8->set_optimal_local_size_xyz(local_size_xyz); + + pipeline_packing_pack4to8 = new Pipeline(vkdev); + pipeline_packing_pack4to8->set_optimal_local_size_xyz(local_size_xyz); + + if (cast_type_from == cast_type_to) + { + pipeline_packing_pack8->create(LayerShaderType::packing_pack8, opt, specializations); + pipeline_packing_pack1to8->create(LayerShaderType::packing_pack1to8, opt, specializations); + pipeline_packing_pack4to8->create(LayerShaderType::packing_pack4to8, opt, specializations); + } + else if (cast_type_from == 1) + { + pipeline_packing_pack8->create(LayerShaderType::packing_pack8_fp32_to_fp16, opt, specializations); + pipeline_packing_pack1to8->create(LayerShaderType::packing_pack1to8_fp32_to_fp16, opt, specializations); + pipeline_packing_pack4to8->create(LayerShaderType::packing_pack4to8_fp32_to_fp16, opt, specializations); + } + else if (cast_type_to == 1) + { + pipeline_packing_pack8->create(LayerShaderType::packing_pack8_fp16_to_fp32, opt, specializations); + pipeline_packing_pack1to8->create(LayerShaderType::packing_pack1to8_fp16_to_fp32, opt, specializations); + pipeline_packing_pack4to8->create(LayerShaderType::packing_pack4to8_fp16_to_fp32, opt, specializations); + } + } + + if (out_elempack == 4) + { + pipeline_packing_pack4 = new Pipeline(vkdev); + pipeline_packing_pack4->set_optimal_local_size_xyz(local_size_xyz); + + pipeline_packing_pack1to4 = new Pipeline(vkdev); + pipeline_packing_pack1to4->set_optimal_local_size_xyz(local_size_xyz); + + pipeline_packing_pack8to4 = new Pipeline(vkdev); + pipeline_packing_pack8to4->set_optimal_local_size_xyz(local_size_xyz); + + if (cast_type_from == cast_type_to) + { + pipeline_packing_pack4->create(LayerShaderType::packing_pack4, opt, specializations); + pipeline_packing_pack1to4->create(LayerShaderType::packing_pack1to4, opt, specializations); + pipeline_packing_pack8to4->create(LayerShaderType::packing_pack8to4, opt, specializations); + } + else if (cast_type_from == 1) + { + pipeline_packing_pack4->create(LayerShaderType::packing_pack4_fp32_to_fp16, opt, specializations); + pipeline_packing_pack1to4->create(LayerShaderType::packing_pack1to4_fp32_to_fp16, opt, specializations); + pipeline_packing_pack8to4->create(LayerShaderType::packing_pack8to4_fp32_to_fp16, opt, specializations); + } + else if (cast_type_to == 1) + { + pipeline_packing_pack4->create(LayerShaderType::packing_pack4_fp16_to_fp32, opt, specializations); + pipeline_packing_pack1to4->create(LayerShaderType::packing_pack1to4_fp16_to_fp32, opt, specializations); + pipeline_packing_pack8to4->create(LayerShaderType::packing_pack8to4_fp16_to_fp32, opt, specializations); + } + } + + if (out_elempack == 1) + { + pipeline_packing = new Pipeline(vkdev); + pipeline_packing->set_optimal_local_size_xyz(local_size_xyz); + + pipeline_packing_pack4to1 = new Pipeline(vkdev); + pipeline_packing_pack4to1->set_optimal_local_size_xyz(local_size_xyz); + + pipeline_packing_pack8to1 = new Pipeline(vkdev); + pipeline_packing_pack8to1->set_optimal_local_size_xyz(local_size_xyz); + + if (cast_type_from == cast_type_to) + { + pipeline_packing->create(LayerShaderType::packing, opt, specializations); + pipeline_packing_pack4to1->create(LayerShaderType::packing_pack4to1, opt, specializations); + pipeline_packing_pack8to1->create(LayerShaderType::packing_pack8to1, opt, specializations); + } + else if (cast_type_from == 1) + { + pipeline_packing->create(LayerShaderType::packing_fp32_to_fp16, opt, specializations); + 
pipeline_packing_pack4to1->create(LayerShaderType::packing_pack4to1_fp32_to_fp16, opt, specializations); + pipeline_packing_pack8to1->create(LayerShaderType::packing_pack8to1_fp32_to_fp16, opt, specializations); + } + else if (cast_type_to == 1) + { + pipeline_packing->create(LayerShaderType::packing_fp16_to_fp32, opt, specializations); + pipeline_packing_pack4to1->create(LayerShaderType::packing_pack4to1_fp16_to_fp32, opt, specializations); + pipeline_packing_pack8to1->create(LayerShaderType::packing_pack8to1_fp16_to_fp32, opt, specializations); + } + } + + return 0; +} + +int Packing_vulkan::destroy_pipeline(const Option& /*opt*/) +{ + delete pipeline_packing; + pipeline_packing = 0; + + delete pipeline_packing_pack4; + pipeline_packing_pack4 = 0; + + delete pipeline_packing_pack8; + pipeline_packing_pack8 = 0; + + delete pipeline_packing_pack1to4; + pipeline_packing_pack1to4 = 0; + + delete pipeline_packing_pack4to1; + pipeline_packing_pack4to1 = 0; + + delete pipeline_packing_pack1to8; + pipeline_packing_pack1to8 = 0; + + delete pipeline_packing_pack4to8; + pipeline_packing_pack4to8 = 0; + + delete pipeline_packing_pack8to4; + pipeline_packing_pack8to4 = 0; + + delete pipeline_packing_pack8to1; + pipeline_packing_pack8to1 = 0; + + return 0; +} + +int Packing_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const +{ + int elempack = bottom_blob.elempack; + // printf("Packing_vulkan b2b %d %d %d %d %d %d\n", elempack, out_elempack, cast_type_from, cast_type_to, storage_type_from, storage_type_to); + + if (elempack == out_elempack && cast_type_from == cast_type_to && bottom_blob.allocator == opt.blob_vkallocator) + { + top_blob = bottom_blob; + return 0; + } + + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int dims = bottom_blob.dims; + size_t elemsize = bottom_blob.elemsize; + + if (!use_padding) + { + // identity if use_padding not allowed + if (dims == 1 && w * elempack % out_elempack != 0) + { + top_blob = bottom_blob; + return 0; + } + if (dims == 2 && h * elempack % out_elempack != 0) + { + top_blob = bottom_blob; + return 0; + } + if (dims == 3 && channels * elempack % out_elempack != 0) + { + top_blob = bottom_blob; + return 0; + } + } + + size_t out_elemsize; + if (cast_type_to == 0) + { + if (opt.use_fp16_storage) + { + out_elemsize = out_elempack * 2u; + } + else if (opt.use_fp16_packed) + { + if (out_elempack == 8) out_elemsize = 8 * 2u; + if (out_elempack == 4) out_elemsize = 4 * 2u; + if (out_elempack == 1) out_elemsize = 4u; + } + else + { + out_elemsize = out_elempack * 4u; + } + } + else if (cast_type_to == 1) + { + out_elemsize = out_elempack * 4u; + } + else if (cast_type_to == 2) + { + if (out_elempack == 8) out_elemsize = 8 * 2u; + if (out_elempack == 4) out_elemsize = 4 * 2u; + if (out_elempack == 1) out_elemsize = 4u; + } + else // if (cast_type_to == 3) + { + out_elemsize = out_elempack * 2u; + } + + if (dims == 1) + { + if (opt.use_fp16_storage && out_elempack == 1 && cast_type_from == cast_type_to && bottom_blob.allocator == opt.blob_vkallocator) + { + top_blob = bottom_blob; + top_blob.w = w * elempack; + top_blob.cstep = w * elempack; + top_blob.elemsize = elemsize / elempack; + top_blob.elempack = out_elempack; + return 0; + } + + int outw = (w * elempack + out_elempack - 1) / out_elempack; + + top_blob.create(outw, out_elemsize, out_elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + // int outw = (w * elempack + out_elempack - 1) / 
out_elempack; + + // if (opt.use_fp16_packed && !opt.use_fp16_storage) + // { + // if (out_elempack == 8) out_elemsize = 8*2u; + // if (out_elempack == 4) out_elemsize = 4*2u; + // if (out_elempack == 1) out_elemsize = 4u; + // } + + // // type casting override + // if (cast_type_to == 1) + // { + // out_elemsize = out_elempack * 4u; + // } + + // top_blob.create(outw, out_elemsize, out_elempack, opt.blob_vkallocator); + // if (top_blob.empty()) + // return -100; + } + + if (dims == 2) + { + int outh = (h * elempack + out_elempack - 1) / out_elempack; + + top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + // int outh = (h * elempack + out_elempack - 1) / out_elempack; + // size_t out_elemsize = elemsize / elempack * out_elempack; + // if (opt.use_fp16_packed && !opt.use_fp16_storage) + // { + // if (out_elempack == 8) out_elemsize = 8*2u; + // if (out_elempack == 4) out_elemsize = 4*2u; + // if (out_elempack == 1) out_elemsize = 4u; + // } + + // // type casting override + // if (cast_type_to == 1) + // { + // out_elemsize = out_elempack * 4u; + // } + + // top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_vkallocator); + // if (top_blob.empty()) + // return -100; + } + + if (dims == 3) + { + int outc = (channels * elempack + out_elempack - 1) / out_elempack; + + top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + // int outc = (channels * elempack + out_elempack - 1) / out_elempack; + // size_t out_elemsize = elemsize / elempack * out_elempack; + // if (opt.use_fp16_packed && !opt.use_fp16_storage) + // { + // if (out_elempack == 8) out_elemsize = 8*2u; + // if (out_elempack == 4) out_elemsize = 4*2u; + // if (out_elempack == 1) out_elemsize = 4u; + // } + + // // type casting override + // if (cast_type_to == 1) + // { + // out_elemsize = out_elempack * 4u; + // } + + // top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_vkallocator); + // if (top_blob.empty()) + // return -100; + } + + std::vector buffer_bindings(2); + buffer_bindings[0] = bottom_blob; + buffer_bindings[1] = top_blob; + + std::vector image_bindings(2); + if (!opt.use_image_storage) + { + image_bindings.clear(); + } + + std::vector constants(10); + constants[0].i = bottom_blob.dims; + constants[1].i = bottom_blob.w; + constants[2].i = bottom_blob.h; + constants[3].i = bottom_blob.c; + constants[4].i = bottom_blob.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = top_blob.cstep; + + // printf("record packing pipeline:%d %d %d %d %d %d %d %d %d\n", top_blob.dims, top_blob.c, top_blob.h, top_blob.w, top_blob.cstep, top_blob.elempack, top_blob.elemsize, elempack, out_elempack); + + if (elempack == 1 && out_elempack == 1) + { + cmd.record_pipeline(pipeline_packing, buffer_bindings, image_bindings, constants, top_blob); + } + if (elempack == 4 && out_elempack == 4) + { + cmd.record_pipeline(pipeline_packing_pack4, buffer_bindings, image_bindings, constants, top_blob); + } + if (elempack == 1 && out_elempack == 4) + { + cmd.record_pipeline(pipeline_packing_pack1to4, buffer_bindings, image_bindings, constants, top_blob); + } + if (elempack == 4 && out_elempack == 1) + { + cmd.record_pipeline(pipeline_packing_pack4to1, buffer_bindings, image_bindings, constants, bottom_blob); + } + if (elempack == 8 && out_elempack == 8) + { + cmd.record_pipeline(pipeline_packing_pack8, 
buffer_bindings, image_bindings, constants, top_blob); + } + if (elempack == 1 && out_elempack == 8) + { + cmd.record_pipeline(pipeline_packing_pack1to8, buffer_bindings, image_bindings, constants, top_blob); + } + if (elempack == 4 && out_elempack == 8) + { + cmd.record_pipeline(pipeline_packing_pack4to8, buffer_bindings, image_bindings, constants, top_blob); + } + if (elempack == 8 && out_elempack == 4) + { + cmd.record_pipeline(pipeline_packing_pack8to4, buffer_bindings, image_bindings, constants, bottom_blob); + } + if (elempack == 8 && out_elempack == 1) + { + cmd.record_pipeline(pipeline_packing_pack8to1, buffer_bindings, image_bindings, constants, bottom_blob); + } + + + // printf("run packing vulkan record pipeline\n"); + return 0; +} + +} // namespace TEngine \ No newline at end of file diff --git a/source/device/vulkan/layer/packing_vulkan.hpp b/source/device/vulkan/layer/packing_vulkan.hpp new file mode 100644 index 000000000..10b748020 --- /dev/null +++ b/source/device/vulkan/layer/packing_vulkan.hpp @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. 
You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#ifndef LAYER_PACKING_HPP +#define LAYER_PACKING_HPP + +#include "../vulkan_layer.hpp" +#include "../vulkan_command.hpp" + +namespace TEngine { + +class Packing_vulkan : public Layer +{ +public: + Packing_vulkan(); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const; + +public: + int out_shape_packed_dims; + int out_shape_packed_w; + int out_shape_packed_h; + int out_shape_packed_c; + int out_shape_packed_cstep; + + int out_elempack; + int use_padding; + + // element type + // 0 = auto + // 1 = fp32 + // 2 = fp16p + // 3 = fp16s + int cast_type_from; + int cast_type_to; + + // storage type + // 0 = buffer + // 1 = image + int storage_type_from; + int storage_type_to; + + Pipeline* pipeline_packing; + Pipeline* pipeline_packing_pack4; + Pipeline* pipeline_packing_pack8; + Pipeline* pipeline_packing_pack1to4; + Pipeline* pipeline_packing_pack4to1; + Pipeline* pipeline_packing_pack1to8; + Pipeline* pipeline_packing_pack4to8; + Pipeline* pipeline_packing_pack8to4; + Pipeline* pipeline_packing_pack8to1; +}; + +} // namespace TEngine + + +#endif diff --git a/source/device/vulkan/layer/padding_vulkan.cpp b/source/device/vulkan/layer/padding_vulkan.cpp new file mode 100644 index 000000000..756fb05c9 --- /dev/null +++ b/source/device/vulkan/layer/padding_vulkan.cpp @@ -0,0 +1,174 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#include "padding_vulkan.hpp" +#include "../layer_shader_type.h" + +namespace TEngine { + +Padding_vulkan::Padding_vulkan() +{ + support_vulkan = true; + pipeline_padding = 0; + pipeline_padding_pack4 = 0; + pipeline_padding_pack8 = 0; +} + + + +int Padding_vulkan::create_pipeline(const Option& opt) +{ + int elempack = 1; + elempack = opt.use_shader_pack8 && input_c % 8 == 0 ? 8 : input_c % 4 == 0 ? 
4 : 1; + int out_elempack; + out_elempack = opt.use_shader_pack8 && output_c % 8 == 0 ? 8 : output_c % 4 == 0 ? 4 : 1; + + // printf("create padding pipeline elempack:%d %d \n", elempack, out_elempack); + + + std::vector specializations(3 + 10); + specializations[0].i = type; + specializations[1].f = value; + specializations[2].i = 0; // per_channel_pad_data_size ? 1 : 0; + specializations[3 + 0].i = 3; // shape_packed.dims; + specializations[3 + 1].i = input_w; // shape_packed.w; + specializations[3 + 2].i = input_h; // shape_packed.h; + specializations[3 + 3].i = input_c; // shape_packed.c; + specializations[3 + 4].i = input_w * input_h; // shape_packed.cstep; + specializations[3 + 5].i = 3; // out_shape_packed.dims; + specializations[3 + 6].i = output_w; // out_shape_packed.w; + specializations[3 + 7].i = output_h; // out_shape_packed.h; + specializations[3 + 8].i = output_c; // out_shape_packed.c; + specializations[3 + 9].i = output_w * output_h; // out_shape_packed.cstep; + + VkTensor local_size_xyz; + // if (out_shape_packed.dims != 0) + { + local_size_xyz.w = std::min(4, output_w); + local_size_xyz.h = std::min(4, output_h); + local_size_xyz.c = std::min(4, output_c); + } + + // pack1 + // if (shape.dims == 0 || elempack == 1) + if(elempack == 1) + { + pipeline_padding = new Pipeline(vkdev); + pipeline_padding->set_optimal_local_size_xyz(local_size_xyz); + pipeline_padding->create(LayerShaderType::padding, opt, specializations); + } + + // pack4 + // if (shape.dims == 0 || elempack == 4) + if(elempack == 4) + { + pipeline_padding_pack4 = new Pipeline(vkdev); + pipeline_padding_pack4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_padding_pack4->create(LayerShaderType::padding_pack4, opt, specializations); + } + + // pack8 + // if ((opt.use_shader_pack8 && shape.dims == 0) || elempack == 8) + if (opt.use_shader_pack8 || elempack == 8) + { + pipeline_padding_pack8 = new Pipeline(vkdev); + pipeline_padding_pack8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_padding_pack8->create(LayerShaderType::padding_pack8, opt, specializations); + } + + return 0; +} + +int Padding_vulkan::destroy_pipeline(const Option& /*opt*/) +{ + return 0; +} + + +int Padding_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const +{ + if (top == 0 && bottom == 0 && left == 0 && right == 0) + { + top_blob = bottom_blob; + return 0; + } + + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int outw = w + left + right; + int outh = h + top + bottom; + + // printf("create padding top_blob vktensor, w, h, c:%d %d %d\n", outw, outh, channels); + top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + std::vector bindings(2); + bindings[0] = bottom_blob; + bindings[1] = top_blob; + + std::vector constants(12); + constants[0].i = bottom_blob.dims; + constants[1].i = bottom_blob.w; + constants[2].i = bottom_blob.h; + constants[3].i = bottom_blob.c; + constants[4].i = bottom_blob.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = top_blob.cstep; + constants[10].i = left; + constants[11].i = top; + + // printf("padding shape:%d %d %d %d %d %d %d %d %d\n", top_blob.c, top_blob.h, top_blob.w, top_blob.cstep, bottom_blob.c, bottom_blob.h, bottom_blob.w, 
bottom_blob.cstep, elempack); + const Pipeline* pipeline = elempack == 8 ? pipeline_padding_pack8 + : elempack == 4 ? pipeline_padding_pack4 + : pipeline_padding; + + cmd.record_pipeline(pipeline, bindings, constants, top_blob); + + return 0; +} + +} // namespace TEngine \ No newline at end of file diff --git a/source/device/vulkan/layer/padding_vulkan.hpp b/source/device/vulkan/layer/padding_vulkan.hpp new file mode 100644 index 000000000..f6aabe066 --- /dev/null +++ b/source/device/vulkan/layer/padding_vulkan.hpp @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#ifndef LAYER_PADDING_HPP +#define LAYER_PADDING_HPP + +#include "../vulkan_layer.hpp" +#include "../vulkan_command.hpp" + +namespace TEngine { + +class Padding_vulkan : public Layer +{ +public: + Padding_vulkan(); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const; + +public: + int top; + int bottom; + int left; + int right; + int type;// 0=CONSTANT 1=REPLICATE 2=REFLECT + float value; + int input_w; + int input_h; + int input_c; + int output_w; + int output_h; + int output_c; + +public: + Pipeline* pipeline_padding; + Pipeline* pipeline_padding_pack4; + Pipeline* pipeline_padding_pack8; +}; + +} // namespace TEngine + + +#endif diff --git a/source/device/vulkan/layer/permute_vulkan.cpp b/source/device/vulkan/layer/permute_vulkan.cpp new file mode 100644 index 000000000..461b3cc25 --- /dev/null +++ b/source/device/vulkan/layer/permute_vulkan.cpp @@ -0,0 +1,475 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#include "permute_vulkan.hpp" +#include "../layer_shader_type.h" + +namespace TEngine { + +Permute_vulkan::Permute_vulkan() +{ + support_vulkan = true; + support_image_storage = true; + + pipeline_permute = 0; + pipeline_permute_pack4 = 0; + pipeline_permute_pack1to4 = 0; + pipeline_permute_pack4to1 = 0; + pipeline_permute_pack8 = 0; + pipeline_permute_pack1to8 = 0; + pipeline_permute_pack4to8 = 0; + pipeline_permute_pack8to4 = 0; + pipeline_permute_pack8to1 = 0; +} + +Permute_vulkan::Permute_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) +{ + support_vulkan = true; + support_image_storage = true; + + pipeline_permute = 0; + pipeline_permute_pack4 = 0; + pipeline_permute_pack1to4 = 0; + pipeline_permute_pack4to1 = 0; + pipeline_permute_pack8 = 0; + pipeline_permute_pack1to8 = 0; + pipeline_permute_pack4to8 = 0; + pipeline_permute_pack8to4 = 0; + pipeline_permute_pack8to1 = 0; + + graph = ir_graph; + node = ir_node; + + struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[0]); + std::string name = input->name; + bottoms.push_back(name); + + struct tensor *output = get_ir_graph_tensor(graph, node->output_tensors[0]); + name = output->name; + tops.push_back(name); + + // params + input_c = input->dims[1]; // param->input_channel; + input_h = input->dims[2]; + input_w = input->dims[3]; + output_c = output->dims[1]; // param->output_channel; + output_h = output->dims[2]; + output_w = output->dims[3]; + + // TODO fix order_type value + struct permute_param *param = (struct permute_param *)ir_node->op.param_mem; + if ((param->order0 == 0) && (param->order1 == 2) && (param->order2 == 3) && (param->order3 == 1)) + { + order_type = 3; + } + else if ((param->order0 == 1) && (param->order1 == 0) && (param->order2 == 2) && input->dim_num == 3) + { + order_type = 1; + } + else + { + order_type = 0; + } + +} + +int Permute_vulkan::create_pipeline(const Option& _opt) +{ + Option opt = _opt; + const Tensor& shape = Tensor(input_w, input_h, input_c, (void*)0); // bottom_shapes.empty() ? Tensor() : bottom_shapes[0]; + const Tensor& out_shape = Tensor(output_w, output_h, output_c, (void*)0); // top_shapes.empty() ? Tensor() : top_shapes[0]; + + int elempack = 1; + if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1; + if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1; + if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 
8 : shape.c % 4 == 0 ? 4 : 1; + + int out_elempack = 1; + if (out_shape.dims == 1) out_elempack = opt.use_shader_pack8 && out_shape.w % 8 == 0 ? 8 : out_shape.w % 4 == 0 ? 4 : 1; + if (out_shape.dims == 2) out_elempack = opt.use_shader_pack8 && out_shape.h % 8 == 0 ? 8 : out_shape.h % 4 == 0 ? 4 : 1; + if (out_shape.dims == 3) out_elempack = opt.use_shader_pack8 && out_shape.c % 8 == 0 ? 8 : out_shape.c % 4 == 0 ? 4 : 1; + + size_t elemsize; + size_t out_elemsize; + if (opt.use_fp16_storage) + { + elemsize = elempack * 2u; + out_elemsize = out_elempack * 2u; + } + else if (opt.use_fp16_packed) + { + elemsize = elempack == 1 ? 4u : elempack * 2u; + out_elemsize = out_elempack == 1 ? 4u : out_elempack * 2u; + } + else + { + elemsize = elempack * 4u; + out_elemsize = out_elempack * 4u; + } + + Tensor shape_packed; + if (shape.dims == 1) shape_packed = Tensor(shape.w / elempack, (void*)0, elemsize, elempack); + if (shape.dims == 2) shape_packed = Tensor(shape.w, shape.h / elempack, (void*)0, elemsize, elempack); + if (shape.dims == 3) shape_packed = Tensor(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack); + + Tensor out_shape_packed; + if (out_shape.dims == 1) out_shape_packed = Tensor(out_shape.w / out_elempack, (void*)0, out_elemsize, out_elempack); + if (out_shape.dims == 2) out_shape_packed = Tensor(out_shape.w, out_shape.h / out_elempack, (void*)0, out_elemsize, out_elempack); + if (out_shape.dims == 3) out_shape_packed = Tensor(out_shape.w, out_shape.h, out_shape.c / out_elempack, (void*)0, out_elemsize, out_elempack); + + // check blob shape + // if (!vkdev->shape_support_image_storage(shape_packed) || !vkdev->shape_support_image_storage(out_shape_packed)) + { + support_image_storage = false; + opt.use_image_storage = false; + } + + std::vector specializations(1 + 10); + specializations[0].i = order_type; + specializations[1 + 0].i = 0; // shape_packed.dims; + specializations[1 + 1].i = 0; // shape_packed.w; + specializations[1 + 2].i = 0; // shape_packed.h; + specializations[1 + 3].i = 0; // shape_packed.c; + specializations[1 + 4].i = 0; // shape_packed.cstep; + specializations[1 + 5].i = 0; // out_shape_packed.dims; + specializations[1 + 6].i = 0; // out_shape_packed.w; + specializations[1 + 7].i = 0; // out_shape_packed.h; + specializations[1 + 8].i = 0; // out_shape_packed.c; + specializations[1 + 9].i = 0; // out_shape_packed.cstep; + + Tensor local_size_xyz_bottom; // pack4to1 and pack8to1 + if (shape_packed.dims == 2) + { + local_size_xyz_bottom.w = std::min(8, shape_packed.w); + local_size_xyz_bottom.h = std::min(8, shape_packed.h); + local_size_xyz_bottom.c = 1; + } + if (shape_packed.dims == 3) + { + local_size_xyz_bottom.w = std::min(4, shape_packed.w); + local_size_xyz_bottom.h = std::min(4, shape_packed.h); + local_size_xyz_bottom.c = std::min(4, shape_packed.c); + } + + Tensor local_size_xyz; + if (out_shape_packed.dims == 2) + { + local_size_xyz.w = std::min(8, out_shape_packed.w); + local_size_xyz.h = std::min(8, out_shape_packed.h); + local_size_xyz.c = 1; + } + if (out_shape_packed.dims == 3) + { + local_size_xyz.w = std::min(4, out_shape_packed.w); + local_size_xyz.h = std::min(4, out_shape_packed.h); + local_size_xyz.c = std::min(4, out_shape_packed.c); + } + + // pack1 + if (shape.dims == 0 || (elempack == 1 && out_elempack == 1)) + { + pipeline_permute = new Pipeline(vkdev); + pipeline_permute->set_optimal_local_size_xyz(local_size_xyz); + pipeline_permute->create(LayerShaderType::permute, opt, specializations); + } + + // pack4 + if (shape.dims 
== 0 || (elempack == 4 && out_elempack == 4)) + { + pipeline_permute_pack4 = new Pipeline(vkdev); + pipeline_permute_pack4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_permute_pack4->create(LayerShaderType::permute_pack4, opt, specializations); + } + + // pack1to4 + if (shape.dims == 0 || (elempack == 1 && out_elempack == 4)) + { + pipeline_permute_pack1to4 = new Pipeline(vkdev); + pipeline_permute_pack1to4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_permute_pack1to4->create(LayerShaderType::permute_pack1to4, opt, specializations); + } + + // pack4to1 + if (shape.dims == 0 || (elempack == 4 && out_elempack == 1)) + { + pipeline_permute_pack4to1 = new Pipeline(vkdev); + pipeline_permute_pack4to1->set_optimal_local_size_xyz(local_size_xyz_bottom); + pipeline_permute_pack4to1->create(LayerShaderType::permute_pack4to1, opt, specializations); + } + + // pack8 + if ((opt.use_shader_pack8 && shape.dims == 0) || (elempack == 8 && out_elempack == 8)) + { + pipeline_permute_pack8 = new Pipeline(vkdev); + pipeline_permute_pack8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_permute_pack8->create(LayerShaderType::permute_pack8, opt, specializations); + } + + // pack1to8 + if ((opt.use_shader_pack8 && shape.dims == 0) || (elempack == 1 && out_elempack == 8)) + { + pipeline_permute_pack1to8 = new Pipeline(vkdev); + pipeline_permute_pack1to8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_permute_pack1to8->create(LayerShaderType::permute_pack1to8, opt, specializations); + } + + // pack4to8 + if ((opt.use_shader_pack8 && shape.dims == 0) || (elempack == 4 && out_elempack == 8)) + { + pipeline_permute_pack4to8 = new Pipeline(vkdev); + pipeline_permute_pack4to8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_permute_pack4to8->create(LayerShaderType::permute_pack4to8, opt, specializations); + } + + // pack8to4 + if ((opt.use_shader_pack8 && shape.dims == 0) || (elempack == 8 && out_elempack == 4)) + { + pipeline_permute_pack8to4 = new Pipeline(vkdev); + pipeline_permute_pack8to4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_permute_pack8to4->create(LayerShaderType::permute_pack8to4, opt, specializations); + } + + // pack8to1 + if ((opt.use_shader_pack8 && shape.dims == 0) || (elempack == 8 && out_elempack == 1)) + { + pipeline_permute_pack8to1 = new Pipeline(vkdev); + pipeline_permute_pack8to1->set_optimal_local_size_xyz(local_size_xyz_bottom); + pipeline_permute_pack8to1->create(LayerShaderType::permute_pack8to1, opt, specializations); + } + + return 0; +} + +int Permute_vulkan::destroy_pipeline(const Option& /*opt*/) +{ + delete pipeline_permute; + pipeline_permute = 0; + + delete pipeline_permute_pack4; + pipeline_permute_pack4 = 0; + + delete pipeline_permute_pack1to4; + pipeline_permute_pack1to4 = 0; + + delete pipeline_permute_pack4to1; + pipeline_permute_pack4to1 = 0; + + delete pipeline_permute_pack8; + pipeline_permute_pack8 = 0; + + delete pipeline_permute_pack1to8; + pipeline_permute_pack1to8 = 0; + + delete pipeline_permute_pack4to8; + pipeline_permute_pack4to8 = 0; + + delete pipeline_permute_pack8to4; + pipeline_permute_pack8to4 = 0; + + delete pipeline_permute_pack8to1; + pipeline_permute_pack8to1 = 0; + + return 0; +} + +int Permute_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + int dims = 
bottom_blob.dims; + + if (dims == 1 || order_type == 0) + { + top_blob = bottom_blob; + return 0; + } + + int out_elempack; + size_t out_elemsize; + + if (dims == 2) + { + // order_type + // 0 = w h + // 1 = h w + + int outw; + int outh; + + // if (order_type == 1) + { + outw = h * elempack; + outh = w; + } + + out_elempack = opt.use_shader_pack8 && outh % 8 == 0 ? 8 : outh % 4 == 0 ? 4 : 1; + out_elemsize = elemsize / elempack * out_elempack; + + if (opt.use_fp16_packed && !opt.use_fp16_storage) + { + if (out_elempack == 8) out_elemsize = 8 * 2u; + if (out_elempack == 4) out_elemsize = 4 * 2u; + if (out_elempack == 1) out_elemsize = 4u; + } + + top_blob.create(outw, outh / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + } + else // if (dims == 3) + { + // order_type + // 0 = w h c + // 1 = h w c + // 2 = w c h + // 3 = c w h + // 4 = h c w + // 5 = c h w + + int outw; + int outh; + int outc; + + if (order_type == 1) + { + outw = h; + outh = w; + outc = channels * elempack; + } + else if (order_type == 2) + { + outw = w; + outh = channels * elempack; + outc = h; + } + else if (order_type == 3) + { + outw = channels * elempack; + outh = w; + outc = h; + } + else if (order_type == 4) + { + outw = h; + outh = channels * elempack; + outc = w; + } + else // if (order_type == 5) + { + outw = channels * elempack; + outh = h; + outc = w; + } + + out_elempack = opt.use_shader_pack8 && outc % 8 == 0 ? 8 : outc % 4 == 0 ? 4 : 1; + out_elemsize = elemsize / elempack * out_elempack; + + if (opt.use_fp16_packed && !opt.use_fp16_storage) + { + if (out_elempack == 8) out_elemsize = 8 * 2u; + if (out_elempack == 4) out_elemsize = 4 * 2u; + if (out_elempack == 1) out_elemsize = 4u; + } + + top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + } + + std::vector bindings(2); + bindings[0] = bottom_blob; + bindings[1] = top_blob; + + std::vector constants(10); + constants[0].i = bottom_blob.dims; + constants[1].i = bottom_blob.w; + constants[2].i = bottom_blob.h; + constants[3].i = bottom_blob.c; + constants[4].i = bottom_blob.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = top_blob.cstep; + + if (elempack == 1 && out_elempack == 1) + { + cmd.record_pipeline(pipeline_permute, bindings, constants, top_blob); + } + else if (elempack == 4 && out_elempack == 4) + { + cmd.record_pipeline(pipeline_permute_pack4, bindings, constants, top_blob); + } + else if (elempack == 1 && out_elempack == 4) + { + cmd.record_pipeline(pipeline_permute_pack1to4, bindings, constants, top_blob); + } + else if (elempack == 4 && out_elempack == 1) + { + cmd.record_pipeline(pipeline_permute_pack4to1, bindings, constants, bottom_blob); + } + else if (elempack == 8 && out_elempack == 8) + { + cmd.record_pipeline(pipeline_permute_pack8, bindings, constants, top_blob); + } + else if (elempack == 1 && out_elempack == 8) + { + cmd.record_pipeline(pipeline_permute_pack1to8, bindings, constants, top_blob); + } + else if (elempack == 4 && out_elempack == 8) + { + cmd.record_pipeline(pipeline_permute_pack4to8, bindings, constants, top_blob); + } + else if (elempack == 8 && out_elempack == 4) + { + cmd.record_pipeline(pipeline_permute_pack8to4, bindings, constants, top_blob); + } + else if (elempack == 8 && out_elempack == 1) + { + cmd.record_pipeline(pipeline_permute_pack8to1, bindings, constants, 
bottom_blob); + } + + return 0; +} + +} // namespace TEngine \ No newline at end of file diff --git a/source/device/vulkan/layer/permute_vulkan.hpp b/source/device/vulkan/layer/permute_vulkan.hpp new file mode 100644 index 000000000..5ea17c635 --- /dev/null +++ b/source/device/vulkan/layer/permute_vulkan.hpp @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#ifndef LAYER_PERMUTE_HPP +#define LAYER_PERMUTE_HPP + +#include "../vulkan_layer.hpp" +#include "../vulkan_command.hpp" + +#include "permute_param.h" + +namespace TEngine{ + +class Permute_vulkan : public Layer +{ +public: + Permute_vulkan(); + Permute_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const; + +public: + Pipeline* pipeline_permute; + Pipeline* pipeline_permute_pack4; + Pipeline* pipeline_permute_pack1to4; + Pipeline* pipeline_permute_pack4to1; + Pipeline* pipeline_permute_pack8; + Pipeline* pipeline_permute_pack1to8; + Pipeline* pipeline_permute_pack4to8; + Pipeline* pipeline_permute_pack8to4; + Pipeline* pipeline_permute_pack8to1; + +public: + int input_c; + int input_h; + int input_w; + int output_c; + int output_h; + int output_w; + int order_type; +}; + +} // namespace TEngine + +#endif \ No newline at end of file diff --git a/source/device/vulkan/layer/pooling_vulkan.cpp b/source/device/vulkan/layer/pooling_vulkan.cpp new file mode 100644 index 000000000..eb50b1704 --- /dev/null +++ b/source/device/vulkan/layer/pooling_vulkan.cpp @@ -0,0 +1,338 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#include "pooling_vulkan.hpp" +#include "../layer_shader_type.h" + +namespace TEngine { + +Pooling_vulkan::Pooling_vulkan() +{ + support_vulkan = true; + pipeline_pooling = 0; + pipeline_pooling_pack4 = 0; + pipeline_pooling_pack8 = 0; + pipeline_pooling_global = 0; + pipeline_pooling_global_pack4 = 0; + pipeline_pooling_global_pack8 = 0; + +} + +Pooling_vulkan::Pooling_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) +{ + support_vulkan = true; + pipeline_pooling = 0; + pipeline_pooling_pack4 = 0; + pipeline_pooling_pack8 = 0; + pipeline_pooling_global = 0; + pipeline_pooling_global_pack4 = 0; + pipeline_pooling_global_pack8 = 0; + + graph = ir_graph; + node = ir_node; + + struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[0]); + std::string name = input->name; + bottoms.push_back(name); + + // Tensor* output_tensor = t_node->GetOutputTensor(0); + struct tensor *output = get_ir_graph_tensor(graph, node->output_tensors[0]); + name = output->name; + tops.push_back(name); + + struct pool_param *param_ = (struct pool_param *)ir_node->op.param_mem; + + pooling_type = param_->pool_method; // 0:max 1:avg + kernel_h = param_->kernel_h; + kernel_w = param_->kernel_w; + stride_h = param_->stride_h; + stride_w = param_->stride_w; + global = param_->global; + caffe_flavor = param_->caffe_flavor; + pad_h0 = param_->pad_h0; + pad_w0 = param_->pad_w0; + pad_h1 = param_->pad_h1; + pad_w1 = param_->pad_w1; + input_c = input->dims[1]; + input_h = input->dims[2]; + input_w = input->dims[3]; + output_c = output->dims[1]; + output_h = output->dims[2]; + output_w = output->dims[3]; + // printf("create pooling layer with param:%d %d %d %d %d %d %d %d %d %d\n", kernel_h, kernel_w, stride_h, stride_w, global, pad_h0, pad_h1, pad_w0, pad_w1, param_->alg); +} + + +int Pooling_vulkan::create_pipeline(const Option& opt) +{ + int elempack = opt.use_shader_pack8 && input_c % 8 == 0 ? 8 : input_c % 4 == 0 ? 4 : 1; + int out_elempack = opt.use_shader_pack8 && output_c % 8 == 0 ? 8 : output_c % 4 == 0 ? 4 : 1; + + size_t elemsize; + size_t out_elemsize; + if (opt.use_fp16_storage) + { + elemsize = elempack * 2u; + out_elemsize = out_elempack * 2u; + } + else if (opt.use_fp16_packed) + { + elemsize = elempack == 1 ? 4u : elempack * 2u; + out_elemsize = out_elempack == 1 ? 
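+ // fp16_packed keeps unpacked (pack1) data at fp32, so only the packed layouts
+ // drop to 2 bytes per lane; e.g. out_elempack == 4 gives 4 * 2u = 8 bytes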
4u : out_elempack * 2u; + } + else + { + elemsize = elempack * 4u; + out_elemsize = out_elempack * 4u; + } + + { + padding = new Padding_vulkan(); + padding->vkdev = vkdev; + + padding->top = pad_h0; + padding->bottom = pad_h1; + padding->left = pad_w0; + padding->right = pad_w1; + padding->type = 0; + padding->value = 0; + + padding->input_w = input_w; + padding->input_h = input_h; + padding->input_c = input_c; + padding->output_w = input_w + pad_w0 + pad_w1; + padding->output_h = input_h + pad_h0 + pad_h1; + padding->output_c = input_c; + + padding->create_pipeline(opt); + } + + if(global) + { + std::vector specializations(1 + 10); + specializations[0].i = pooling_type; + specializations[1 + 0].i = 3; + specializations[1 + 1].i = input_w + pad_w0 + pad_w1; + specializations[1 + 2].i = input_h + pad_h0 + pad_h1; + specializations[1 + 3].i = input_c; + specializations[1 + 4].i = (input_w + pad_w0 + pad_w1) * (input_h + pad_h0 + pad_h1); + specializations[1 + 5].i = 3; + specializations[1 + 6].i = output_c; + specializations[1 + 7].i = output_h; + specializations[1 + 8].i = output_w; + specializations[1 + 9].i = output_h * output_w; + + VkTensor local_size_xyz; + // if (out_shape_packed.dims != 0) + { + local_size_xyz.w = std::min(4, output_w); + local_size_xyz.h = std::min(4, output_h); + local_size_xyz.c = std::min(4, output_c); + } + + // pack1 + if (elempack == 1) + { + pipeline_pooling_global = new Pipeline(vkdev); + pipeline_pooling_global->set_optimal_local_size_xyz(local_size_xyz); + pipeline_pooling_global->create(LayerShaderType::pooling_global, opt, specializations); + } + + // pack4 + if (elempack == 4) + { + pipeline_pooling_global_pack4 = new Pipeline(vkdev); + pipeline_pooling_global_pack4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_pooling_global_pack4->create(LayerShaderType::pooling_global_pack4, opt, specializations); + } + + // pack8 + if (opt.use_shader_pack8 || elempack == 8) + { + pipeline_pooling_global_pack8 = new Pipeline(vkdev); + pipeline_pooling_global_pack8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_pooling_global_pack8->create(LayerShaderType::pooling_global_pack8, opt, specializations); + } + } + else + { + std::vector specializations(12 + 10); + specializations[0].i = pooling_type; + specializations[1].i = kernel_w; + specializations[2].i = kernel_h; + specializations[3].i = stride_w; + specializations[4].i = stride_h; + specializations[5].i = pad_w0; + specializations[6].i = pad_w1; + specializations[7].i = pad_h0; + specializations[8].i = pad_h1; + specializations[9].i = global; + specializations[10].i = 0; // pad_mode; + specializations[11].i = 0; // avgpool_count_include_pad; + specializations[12 + 0].i = 0; // 3; // shape_bordered_packed.dims; + specializations[12 + 1].i = 0; // input_w; // shape_bordered_packed.w; + specializations[12 + 2].i = 0; // input_h; // shape_bordered_packed.h; + specializations[12 + 3].i = 0; // input_c; // shape_bordered_packed.c; + specializations[12 + 4].i = 0; // input_w * input_h; // shape_bordered_packed.cstep; + specializations[12 + 5].i = 0; // 3; // out_shape_packed.dims; + specializations[12 + 6].i = 0; // output_w; // out_shape_packed.w; + specializations[12 + 7].i = 0; // output_h; // out_shape_packed.h; + specializations[12 + 8].i = 0; // output_c; // out_shape_packed.c; + specializations[12 + 9].i = 0; // output_h * output_c; // out_shape_packed.cstep; + + VkTensor local_size_xyz; + local_size_xyz.w = std::min(4, output_w); + local_size_xyz.h = std::min(4, output_h); + local_size_xyz.c = 
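+ // cap the local workgroup at 4 invocations per axis, clamped to the output extent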
std::min(4, output_c); + + // pack1 + if (elempack == 1) + { + pipeline_pooling = new Pipeline(vkdev); + pipeline_pooling->set_optimal_local_size_xyz(local_size_xyz); + pipeline_pooling->create(LayerShaderType::pooling, opt, specializations); + } + + // pack4 + if (elempack == 4) + { + pipeline_pooling_pack4 = new Pipeline(vkdev); + pipeline_pooling_pack4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_pooling_pack4->create(LayerShaderType::pooling_pack4, opt, specializations); + } + + // pack8 + if (opt.use_shader_pack8 || elempack == 8) + { + pipeline_pooling_pack8 = new Pipeline(vkdev); + pipeline_pooling_pack8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_pooling_pack8->create(LayerShaderType::pooling_pack8, opt, specializations); + } + } + + return 0; +} + +int Pooling_vulkan::destroy_pipeline(const Option& /*opt*/) +{ + return 0; +} + +int Pooling_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + if(global) + { + // printf("input shape: %d %d %d, out shape: %d %d %d\n", input_c, input_h, input_w, output_c, output_h, output_w); + top_blob.create(output_c/elempack, elemsize, elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + // printf("top shape:%d %d %d\n", top_blob.c, top_blob.h, top_blob.w); + std::vector bindings(2); + bindings[0] = bottom_blob; + bindings[1] = top_blob; + + std::vector constants(10); + constants[0].i = bottom_blob.dims; + constants[1].i = bottom_blob.w; + constants[2].i = bottom_blob.h; + constants[3].i = bottom_blob.c; + constants[4].i = bottom_blob.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = top_blob.cstep; + + const Pipeline* pipeline = elempack == 8 ? pipeline_pooling_global_pack8 + : elempack == 4 ? pipeline_pooling_global_pack4 + : pipeline_pooling_global; + + cmd.record_pipeline(pipeline, bindings, constants, top_blob); + + return 0; + } + + VkTensor bottom_blob_bordered = bottom_blob; + if (pad_h0 > 0 || pad_h1 > 0 || pad_w0 > 0 || pad_w1 > 0) + { + bottom_blob_bordered.w = bottom_blob_bordered.w + pad_w0 + pad_w1; + bottom_blob_bordered.h = bottom_blob_bordered.h + pad_h0 + pad_h1; + bottom_blob_bordered.cstep = bottom_blob_bordered.w * bottom_blob_bordered.h; + Option opt_pad = opt; + opt_pad.blob_vkallocator = opt.workspace_vkallocator; + + padding->record_pipeline(bottom_blob, bottom_blob_bordered, cmd, opt_pad); + } + + top_blob.create(output_w, output_h, output_c/elempack, elemsize, elempack, opt.blob_vkallocator); + + + std::vector bindings(2); + bindings[0] = bottom_blob_bordered; + bindings[1] = top_blob; + + std::vector constants(12); + constants[0].i = bottom_blob_bordered.dims; + constants[1].i = bottom_blob_bordered.w; + constants[2].i = bottom_blob_bordered.h; + constants[3].i = bottom_blob_bordered.c; + constants[4].i = bottom_blob_bordered.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = top_blob.cstep; + constants[10].i = 0; + constants[11].i = 0; + + const Pipeline* pipeline = elempack == 8 ? pipeline_pooling_pack8 + : elempack == 4 ? 
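+ // select the shader variant that matches the input packing (8, 4 or 1 lanes)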
pipeline_pooling_pack4 + : pipeline_pooling; + + cmd.record_pipeline(pipeline, bindings, constants, top_blob); + return 0; +} + +} // namespace TEngine diff --git a/source/device/vulkan/layer/pooling_vulkan.hpp b/source/device/vulkan/layer/pooling_vulkan.hpp new file mode 100644 index 000000000..e4a823e9e --- /dev/null +++ b/source/device/vulkan/layer/pooling_vulkan.hpp @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#ifndef LAYER_POOLING_HPP +#define LAYER_POOLING_HPP + +#include "../vulkan_layer.hpp" +#include "../vulkan_command.hpp" +#include "padding_vulkan.hpp" + +#include "pooling_param.h" + +namespace TEngine { + +class Pooling_vulkan : public Layer +{ +public: + Pooling_vulkan(); + Pooling_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const; + +public: + int pooling_type; // // 0:max 1:avg + int kernel_h; // = param_->kernel_h; + int kernel_w; // = param_->kernel_w; + int stride_h; // = param_->stride_h; + int stride_w; // = param_->stride_w; + int global; // = param_->global; + int caffe_flavor; // = param_->caffe_flavor; + int pad_h0; // = param_->pad_h0; + int pad_w0; // = param_->pad_w0; + int pad_h1; // = param_->pad_h1; + int pad_w1; // = param_->pad_w1; + int input_c; + int input_h; + int input_w; + int output_c; + int output_h; + int output_w; + +public: + Padding_vulkan* padding; + + Pipeline* pipeline_pooling; + Pipeline* pipeline_pooling_pack4; + Pipeline* pipeline_pooling_pack8; + Pipeline* pipeline_pooling_global; + Pipeline* pipeline_pooling_global_pack4; + Pipeline* pipeline_pooling_global_pack8; +}; + +} // namespace TEngine + + +#endif diff --git a/source/device/vulkan/layer/priorbox_vulkan.cpp b/source/device/vulkan/layer/priorbox_vulkan.cpp new file mode 100644 index 000000000..de81aec7a --- /dev/null +++ b/source/device/vulkan/layer/priorbox_vulkan.cpp @@ -0,0 +1,351 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license 
agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#include "priorbox_vulkan.hpp" +#include "../layer_shader_type.h" + +namespace TEngine { + +PriorBox_vulkan::PriorBox_vulkan() +{ + support_vulkan = true; + + pipeline_priorbox = 0; + pipeline_priorbox_mxnet = 0; +} + +PriorBox_vulkan::PriorBox_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) +{ + support_vulkan = true; + + pipeline_priorbox = 0; + pipeline_priorbox_mxnet = 0; + + graph = ir_graph; + node = ir_node; + + for(int i = 0; i < ir_node->input_num; i++) + { + struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[i]); + std::string name = input->name; + bottoms.push_back(name); + } + + for(int i = 0; i < ir_node->output_num; i++) + { + struct tensor *output = get_ir_graph_tensor(graph, node->input_tensors[i]); + std::string name = output->name; + tops.push_back(name); + } + + // params + struct tensor *featmap_tensor = get_ir_graph_tensor(graph, node->input_tensors[0]); + struct tensor *data_tensor = get_ir_graph_tensor(graph, node->input_tensors[1]); + struct tensor *output_tensor = get_ir_graph_tensor(graph, node->output_tensors[0]); + input_c = data_tensor->dims[1]; // param->input_channel; + input_h = data_tensor->dims[2]; + input_w = data_tensor->dims[3]; + output_c = output_tensor->dims[1]; // param->output_channel; + output_h = output_tensor->dims[2]; + output_w = output_tensor->dims[3]; + + const int data_height = data_tensor->dims[2]; + const int data_width = data_tensor->dims[3]; + const int feat_height = featmap_tensor->dims[2]; + const int feat_width = featmap_tensor->dims[3]; + + struct priorbox_param *param = (struct priorbox_param *)ir_node->op.param_mem; + + variances[0] = (param->variance)[0]; + variances[1] = (param->variance)[1]; + variances[2] = (param->variance)[2]; + variances[3] = (param->variance)[3]; + flip = param->flip; + clip = param->clip; + + if (param->image_h == 0 || param->image_w == 0) + { + image_width = data_width; + image_height = data_height; + } + else + { + image_width = param->image_w; + image_height = param->image_h; + } + + if (param->step_h == 0 || param->step_w == 0) + { + step_width = ( float )(image_width) / feat_width; + step_height = ( float )(image_height) / feat_height; + } + else + { + step_width = 
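+ // a non-zero step in the param overrides the image / feature-map ratio above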
param->step_w; + step_height = param->step_h; + } + int num_priors = param->num_priors; + + offset = param->offset; + step_mmdetection = 0; // TODO fix step_mmdetection value + center_mmdetection = 0; // TODO fix center_mmdetection value + + min_sizes = Tensor(param->min_size_num, param->min_size); + max_sizes = Tensor(param->max_size_num, param->max_size); + aspect_ratios = Tensor(param->aspect_ratio_size, param->aspect_ratio); + TLOG_INFO("size min max aspect:%d %d %d\n", param->min_size_num, param->max_size_num, param->aspect_ratio_size); +} + +int PriorBox_vulkan::create_pipeline(const Option& opt) +{ + const Tensor& shape = Tensor(input_w, input_h, input_c, (void*)0); // bottom_shapes.empty() ? Tensor() : bottom_shapes[0]; + + int elempack = 1; + if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1; + if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1; + if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1; + + size_t elemsize; + if (opt.use_fp16_storage) + { + elemsize = elempack * 2u; + } + else if (opt.use_fp16_packed) + { + elemsize = elempack == 1 ? 4u : elempack * 2u; + } + else + { + elemsize = elempack * 4u; + } + + Tensor shape_packed; + if (shape.dims == 1) shape_packed = Tensor(shape.w / elempack, (void*)0, elemsize, elempack); + if (shape.dims == 2) shape_packed = Tensor(shape.w, shape.h / elempack, (void*)0, elemsize, elempack); + if (shape.dims == 3) shape_packed = Tensor(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack); + + // caffe style + { + int num_min_size = min_sizes.w; + int num_max_size = max_sizes.w; + int num_aspect_ratio = aspect_ratios.w; + + int num_prior = num_min_size * num_aspect_ratio + num_min_size + num_max_size; + if (flip) + num_prior += num_min_size * num_aspect_ratio; + + std::vector specializations(11 + 2); + specializations[0].i = flip; + specializations[1].i = clip; + specializations[2].f = offset; + specializations[3].f = variances[0]; + specializations[4].f = variances[1]; + specializations[5].f = variances[2]; + specializations[6].f = variances[3]; + specializations[7].i = num_min_size; + specializations[8].i = num_max_size; + specializations[9].i = num_aspect_ratio; + specializations[10].i = num_prior; + specializations[11 + 0].i = 0;//shape_packed.w; + specializations[11 + 1].i = 0;//shape_packed.h; + + pipeline_priorbox = new Pipeline(vkdev); + pipeline_priorbox->set_optimal_local_size_xyz(); + pipeline_priorbox->create(LayerShaderType::priorbox, opt, specializations); + } + + // mxnet style + { + int num_sizes = min_sizes.w; + int num_ratios = aspect_ratios.w; + + int num_prior = num_sizes - 1 + num_ratios; + + std::vector specializations(5 + 2); + specializations[0].i = clip; + specializations[1].f = offset; + specializations[2].i = num_sizes; + specializations[3].i = num_ratios; + specializations[4].i = num_prior; + specializations[5 + 0].i = shape_packed.w; + specializations[5 + 1].i = shape_packed.h; + + pipeline_priorbox_mxnet = new Pipeline(vkdev); + pipeline_priorbox_mxnet->set_optimal_local_size_xyz(); + pipeline_priorbox_mxnet->create(LayerShaderType::priorbox_mxnet, opt, specializations); + } + + return 0; +} + +int PriorBox_vulkan::destroy_pipeline(const Option& /*opt*/) +{ + delete pipeline_priorbox; + pipeline_priorbox = 0; + + delete pipeline_priorbox_mxnet; + pipeline_priorbox_mxnet = 0; + + return 0; +} + +int PriorBox_vulkan::upload_model(VkTransfer& 
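+// stages min_sizes / max_sizes / aspect_ratios into their *_gpu VkTensor copies
+// so the priorbox shaders can read them when the pipelines are recorded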
cmd, const Option& opt) +{ + cmd.record_upload(min_sizes, min_sizes_gpu, opt); + + if (max_sizes.w > 0) + cmd.record_upload(max_sizes, max_sizes_gpu, opt); + + cmd.record_upload(aspect_ratios, aspect_ratios_gpu, opt); + + return 0; +} + +int PriorBox_vulkan::record_pipeline(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const +{ + int w = bottom_blobs[0].w; + int h = bottom_blobs[0].h; + + if (bottom_blobs.size() == 1 && image_width == -233 && image_height == -233 && max_sizes.empty()) + { + // mxnet style _contrib_MultiBoxPrior + float step_w = step_width; + float step_h = step_height; + if (step_w == -233) + step_w = 1.f / (float)w; + if (step_h == -233) + step_h = 1.f / (float)h; + + int num_sizes = min_sizes.w; + int num_ratios = aspect_ratios.w; + + int num_prior = num_sizes - 1 + num_ratios; + + int elempack = 4; + + size_t elemsize = elempack * 4u; + if (opt.use_fp16_packed || opt.use_fp16_storage) + { + elemsize = elempack * 2u; + } + + VkTensor& top_blob = top_blobs[0]; + top_blob.create(4 * w * h * num_prior / elempack, elemsize, elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + std::vector bindings(3); + bindings[0] = top_blob; + bindings[1] = min_sizes_gpu; + bindings[2] = aspect_ratios_gpu; + + std::vector constants(4); + constants[0].i = w; + constants[1].i = h; + constants[2].f = step_w; + constants[3].f = step_h; + + VkTensor dispatcher; + dispatcher.w = num_sizes; + dispatcher.h = w; + dispatcher.c = h; + + cmd.record_pipeline(pipeline_priorbox_mxnet, bindings, constants, dispatcher); + + return 0; + } + + int image_w = image_width; + int image_h = image_height; + if (image_w == -233) + image_w = bottom_blobs[1].w; + if (image_h == -233) + image_h = bottom_blobs[1].h; + + float step_w = step_width; + float step_h = step_height; + if (step_w == -233) + step_w = (float)image_w / w; + if (step_h == -233) + step_h = (float)image_h / h; + + int num_min_size = min_sizes.w; + int num_max_size = max_sizes.w; + int num_aspect_ratio = aspect_ratios.w; + + int num_prior = num_min_size * num_aspect_ratio + num_min_size + num_max_size; + if (flip) + num_prior += num_min_size * num_aspect_ratio; + + size_t elemsize = 4u; + if (opt.use_fp16_storage) + { + elemsize = 2u; + } + + VkTensor& top_blob = top_blobs[0]; + top_blob.create(4 * w * h * num_prior, 2, elemsize, 1, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + std::vector bindings(4); + bindings[0] = top_blob; + bindings[1] = min_sizes_gpu; + bindings[2] = num_max_size > 0 ? max_sizes_gpu : min_sizes_gpu; + bindings[3] = aspect_ratios_gpu; + + std::vector constants(6); + constants[0].i = w; + constants[1].i = h; + constants[2].f = image_w; + constants[3].f = image_h; + constants[4].f = step_w; + constants[5].f = step_h; + + VkTensor dispatcher; + dispatcher.w = num_min_size; + dispatcher.h = w; + dispatcher.c = h; + + cmd.record_pipeline(pipeline_priorbox, bindings, constants, dispatcher); + + return 0; +} + +} // namespace TEngine \ No newline at end of file diff --git a/source/device/vulkan/layer/priorbox_vulkan.hpp b/source/device/vulkan/layer/priorbox_vulkan.hpp new file mode 100644 index 000000000..69b8f8bb7 --- /dev/null +++ b/source/device/vulkan/layer/priorbox_vulkan.hpp @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#ifndef LAYER_PRIORBOX_HPP +#define LAYER_PRIORBOX_HPP + +#include "../vulkan_layer.hpp" +#include "../vulkan_command.hpp" + +#include "priorbox_param.h" + +namespace TEngine{ + +class PriorBox_vulkan : public Layer +{ +public: + PriorBox_vulkan(); + PriorBox_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + virtual int upload_model(VkTransfer& cmd, const Option& opt); + + virtual int record_pipeline(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const; + +public: + Pipeline* pipeline_priorbox; + Pipeline* pipeline_priorbox_mxnet; + +public: + int input_c; + int input_h; + int input_w; + int output_c; + int output_h; + int output_w; + + float variances[4]; + int flip; + int clip; + int image_width; + int image_height; + float step_width; + float step_height; + float offset; + int num_priors; + bool step_mmdetection; + bool center_mmdetection; + + Tensor min_sizes; + Tensor max_sizes; + Tensor aspect_ratios; + VkTensor min_sizes_gpu; + VkTensor max_sizes_gpu; + VkTensor aspect_ratios_gpu; +}; + +} // namespace TEngine + +#endif \ No newline at end of file diff --git a/source/device/vulkan/layer/relu_vulkan.cpp b/source/device/vulkan/layer/relu_vulkan.cpp new file mode 100644 index 000000000..f541806cf --- /dev/null +++ b/source/device/vulkan/layer/relu_vulkan.cpp @@ -0,0 +1,214 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#include "relu_vulkan.hpp" +#include "../layer_shader_type.h" + +namespace TEngine { + +ReLU_vulkan::ReLU_vulkan() +{ + support_vulkan = true; + support_image_storage = true; + + pipeline_relu = 0; + pipeline_relu_pack4 = 0; + pipeline_relu_pack8 = 0; +} + +ReLU_vulkan::ReLU_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) +{ + support_vulkan = true; + support_image_storage = false; + + pipeline_relu = 0; + pipeline_relu_pack4 = 0; + pipeline_relu_pack8 = 0; + + graph = ir_graph; + node = ir_node; + + struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[0]); + std::string name = input->name; + bottoms.push_back(name); + + struct tensor *output = get_ir_graph_tensor(graph, node->output_tensors[0]); + name = output->name; + tops.push_back(name); + + // params + input_c = input->dims[1]; // param->input_channel; + input_h = input->dims[2]; + input_w = input->dims[3]; + output_c = output->dims[1]; // param->output_channel; + output_h = output->dims[2]; + output_w = output->dims[3]; + + struct relu_param *param = (struct relu_param *)ir_node->op.param_mem; + negative_slope = param->negative_slope; +} + +int ReLU_vulkan::create_pipeline(const Option& opt) +{ + const Tensor& shape = Tensor(output_w, output_h, output_c, (void*)0); // top_shapes.empty() ? Mat() : top_shapes[0]; + + int elempack = 1; + if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1; + if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1; + if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1; + + size_t elemsize; + if (opt.use_fp16_storage) + { + elemsize = elempack * 2u; + } + else if (opt.use_fp16_packed) + { + elemsize = elempack == 1 ? 
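+ // e.g. a 64-channel output with use_shader_pack8 gets elempack 8 and, under
+ // fp16_packed, 8 * 2u = 16 bytes per packed element, while pack1 stays at 4u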
4u : elempack * 2u; + } + else + { + elemsize = elempack * 4u; + } + + Tensor shape_packed; + if (shape.dims == 1) shape_packed = Tensor(shape.w / elempack, (void*)0, elemsize, elempack); + if (shape.dims == 2) shape_packed = Tensor(shape.w, shape.h / elempack, (void*)0, elemsize, elempack); + if (shape.dims == 3) shape_packed = Tensor(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack); + + std::vector specializations(1 + 5); + specializations[0].f = negative_slope; // slope; + specializations[1 + 0].i = 0; // shape_packed.dims; + specializations[1 + 1].i = 0; // shape_packed.w; + specializations[1 + 2].i = 0; // shape_packed.h; + specializations[1 + 3].i = 0; // shape_packed.c; + specializations[1 + 4].i = 0; // shape_packed.cstep; + + Tensor local_size_xyz; + if (shape_packed.dims == 1) + { + local_size_xyz.w = std::min(64, shape_packed.w); + local_size_xyz.h = 1; + local_size_xyz.c = 1; + } + if (shape_packed.dims == 2) + { + local_size_xyz.w = std::min(8, shape_packed.w); + local_size_xyz.h = std::min(8, shape_packed.h); + local_size_xyz.c = 1; + } + if (shape_packed.dims == 3) + { + local_size_xyz.w = std::min(4, shape_packed.w); + local_size_xyz.h = std::min(4, shape_packed.h); + local_size_xyz.c = std::min(4, shape_packed.c); + } + + // pack1 + if (shape.dims == 0 || elempack == 1) + { + pipeline_relu = new Pipeline(vkdev); + pipeline_relu->set_optimal_local_size_xyz(local_size_xyz); + pipeline_relu->create(LayerShaderType::relu, opt, specializations); + } + + // pack4 + if (shape.dims == 0 || elempack == 4) + { + pipeline_relu_pack4 = new Pipeline(vkdev); + pipeline_relu_pack4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_relu_pack4->create(LayerShaderType::relu_pack4, opt, specializations); + } + + // pack8 + if ((opt.use_shader_pack8 && shape.dims == 0) || elempack == 8) + { + pipeline_relu_pack8 = new Pipeline(vkdev); + pipeline_relu_pack8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_relu_pack8->create(LayerShaderType::relu_pack8, opt, specializations); + } + + return 0; +} + + +int ReLU_vulkan::destroy_pipeline(const Option& /*opt*/) +{ + delete pipeline_relu; + pipeline_relu = 0; + + delete pipeline_relu_pack4; + pipeline_relu_pack4 = 0; + + delete pipeline_relu_pack8; + pipeline_relu_pack8 = 0; + + return 0; +} + +int ReLU_vulkan::record_pipeline(VkTensor& bottom_top_blob, VkCompute& cmd, const Option& opt) const +{ + int elempack = bottom_top_blob.elempack; + + std::vector bindings(1); + bindings[0] = bottom_top_blob; + + std::vector constants(5); + constants[0].i = bottom_top_blob.dims; + constants[1].i = bottom_top_blob.w; + constants[2].i = bottom_top_blob.h; + constants[3].i = bottom_top_blob.c; + constants[4].i = bottom_top_blob.cstep; + + const Pipeline* pipeline = elempack == 8 ? pipeline_relu_pack8 + : elempack == 4 ? pipeline_relu_pack4 + : pipeline_relu; + + cmd.record_pipeline(pipeline, bindings, constants, bottom_top_blob); + + return 0; +} + +int ReLU_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const +{ + printf("run record_pipeline relu!\n"); + return 0; +} + +} \ No newline at end of file diff --git a/source/device/vulkan/layer/relu_vulkan.hpp b/source/device/vulkan/layer/relu_vulkan.hpp new file mode 100644 index 000000000..c928a756f --- /dev/null +++ b/source/device/vulkan/layer/relu_vulkan.hpp @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#ifndef LAYER_RELU_HPP +#define LAYER_RELU_HPP + +#include "../vulkan_layer.hpp" +#include "../vulkan_command.hpp" + +#include "relu_param.h" + +namespace TEngine{ + +class ReLU_vulkan : public Layer +{ +public: + ReLU_vulkan(); + ReLU_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const; + virtual int record_pipeline(VkTensor& bottom_top_blob, VkCompute& cmd, const Option& opt) const; + +public: + Pipeline* pipeline_relu; + Pipeline* pipeline_relu_pack4; + Pipeline* pipeline_relu_pack8; + +public: + int input_c; + int input_h; + int input_w; + int output_c; + int output_h; + int output_w; + float negative_slope; +}; + +} // namespace TEngine + +#endif \ No newline at end of file diff --git a/source/device/vulkan/layer/reshape_vulkan.cpp b/source/device/vulkan/layer/reshape_vulkan.cpp new file mode 100644 index 000000000..7e36dca8f --- /dev/null +++ b/source/device/vulkan/layer/reshape_vulkan.cpp @@ -0,0 +1,580 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. 
+ * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#include "reshape_vulkan.hpp" +#include "../layer_shader_type.h" + +namespace TEngine { + +Reshape_vulkan::Reshape_vulkan() +{ + support_vulkan = true; + support_image_storage = true; + + permute_hwc = 0; + permute_hc = 0; + permute_hw = 0; + permute_chw = 0; + + pipeline_reshape = 0; + pipeline_reshape_pack4 = 0; + pipeline_reshape_pack1to4 = 0; + pipeline_reshape_pack4to1 = 0; + pipeline_reshape_pack8 = 0; + pipeline_reshape_pack1to8 = 0; + pipeline_reshape_pack4to8 = 0; + pipeline_reshape_pack8to4 = 0; + pipeline_reshape_pack8to1 = 0; +} + +Reshape_vulkan::Reshape_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) +{ + support_vulkan = true; + support_image_storage = true; + + permute_hwc = 0; + permute_hc = 0; + permute_hw = 0; + permute_chw = 0; + + pipeline_reshape = 0; + pipeline_reshape_pack4 = 0; + pipeline_reshape_pack1to4 = 0; + pipeline_reshape_pack4to1 = 0; + pipeline_reshape_pack8 = 0; + pipeline_reshape_pack1to8 = 0; + pipeline_reshape_pack4to8 = 0; + pipeline_reshape_pack8to4 = 0; + pipeline_reshape_pack8to1 = 0; + + graph = ir_graph; + node = ir_node; + + struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[0]); + std::string name = input->name; + bottoms.push_back(name); + + struct tensor *output = get_ir_graph_tensor(graph, node->output_tensors[0]); + name = output->name; + tops.push_back(name); + + // params + input_c = input->dims[1]; // param->input_channel; + input_h = input->dims[2]; + input_w = input->dims[3]; + + struct reshape_param *param = (struct reshape_param *)ir_node->op.param_mem; + + ndim = param->dim_size; + permute = param->reverse; + // TODO fix + // c = param->re_shape[0]; + // w = param->re_shape[1]; + // h = param->re_shape[2]; + if(param->dim_size == 4) + { + ndim = 3; + output_c = output->dims[1]; // param->output_channel; + output_h = output->dims[2]; + output_w = output->dims[3]; + + c = output->dims[1]; // param->output_channel; + h = output->dims[2]; + w = output->dims[3]; + } + else + { + ndim = param->dim_size; + + output_c = output->dims[0]; // param->output_channel; + output_h = output->dims[1]; + output_w = output->dims[2]; + + c = output_c; // param->output_channel; + h = output_h; + w = output_w; + } + + + +} + +int Reshape_vulkan::create_pipeline(const Option& _opt) +{ + Option opt = _opt; + const Tensor& shape = Tensor(input_w, input_h, input_c, (void*)0); // bottom_shapes.empty() ? Tensor() : bottom_shapes[0]; + const Tensor& out_shape = Tensor(output_w, output_h, output_c, (void*)0); // top_shapes.empty() ? 
Tensor() : top_shapes[0]; + + bool need_permute = permute == 1; + if (shape.dims == 2 && ndim == 2 && shape.h == out_shape.h) + need_permute = false; + if (shape.dims == 3 && ndim == 3 && shape.c == out_shape.c) + need_permute = false; + + Tensor shape_permuted = shape; + Tensor out_shape_permuted = out_shape; + if (need_permute) + { + if (shape.dims == 1) shape_permuted = Tensor(shape.w, (void*)0); + if (shape.dims == 2) shape_permuted = Tensor(shape.h, shape.w, (void*)0); + if (shape.dims == 3) shape_permuted = Tensor(shape.c, shape.w, shape.h, (void*)0); + + if (out_shape.dims == 1) out_shape_permuted = Tensor(out_shape.w, (void*)0); + if (out_shape.dims == 2) out_shape_permuted = Tensor(out_shape.h, out_shape.w, (void*)0); + if (out_shape.dims == 3) out_shape_permuted = Tensor(out_shape.c, out_shape.w, out_shape.h, (void*)0); + } + + int elempack = 1; + if (shape_permuted.dims == 1) elempack = opt.use_shader_pack8 && shape_permuted.w % 8 == 0 ? 8 : shape_permuted.w % 4 == 0 ? 4 : 1; + if (shape_permuted.dims == 2) elempack = opt.use_shader_pack8 && shape_permuted.h % 8 == 0 ? 8 : shape_permuted.h % 4 == 0 ? 4 : 1; + if (shape_permuted.dims == 3) elempack = opt.use_shader_pack8 && shape_permuted.c % 8 == 0 ? 8 : shape_permuted.c % 4 == 0 ? 4 : 1; + + int out_elempack = 1; + if (out_shape_permuted.dims == 1) out_elempack = opt.use_shader_pack8 && out_shape_permuted.w % 8 == 0 ? 8 : out_shape_permuted.w % 4 == 0 ? 4 : 1; + if (out_shape_permuted.dims == 2) out_elempack = opt.use_shader_pack8 && out_shape_permuted.h % 8 == 0 ? 8 : out_shape_permuted.h % 4 == 0 ? 4 : 1; + if (out_shape_permuted.dims == 3) out_elempack = opt.use_shader_pack8 && out_shape_permuted.c % 8 == 0 ? 8 : out_shape_permuted.c % 4 == 0 ? 4 : 1; + + size_t elemsize; + size_t out_elemsize; + if (opt.use_fp16_storage) + { + elemsize = elempack * 2u; + out_elemsize = out_elempack * 2u; + } + else if (opt.use_fp16_packed) + { + elemsize = elempack == 1 ? 4u : elempack * 2u; + out_elemsize = out_elempack == 1 ? 
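+ // elempack / out_elempack were derived from the (possibly permuted) shapes above,
+ // so these byte sizes already reflect the permute handling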
4u : out_elempack * 2u; + } + else + { + elemsize = elempack * 4u; + out_elemsize = out_elempack * 4u; + } + + Tensor shape_packed; + if (shape_permuted.dims == 1) shape_packed = Tensor(shape_permuted.w / elempack, (void*)0, elemsize, elempack); + if (shape_permuted.dims == 2) shape_packed = Tensor(shape_permuted.w, shape_permuted.h / elempack, (void*)0, elemsize, elempack); + if (shape_permuted.dims == 3) shape_packed = Tensor(shape_permuted.w, shape_permuted.h, shape_permuted.c / elempack, (void*)0, elemsize, elempack); + + Tensor out_shape_packed; + if (out_shape_permuted.dims == 1) out_shape_packed = Tensor(out_shape_permuted.w / out_elempack, (void*)0, out_elemsize, out_elempack); + if (out_shape_permuted.dims == 2) out_shape_packed = Tensor(out_shape_permuted.w, out_shape_permuted.h / out_elempack, (void*)0, out_elemsize, out_elempack); + if (out_shape_permuted.dims == 3) out_shape_packed = Tensor(out_shape_permuted.w, out_shape_permuted.h, out_shape_permuted.c / out_elempack, (void*)0, out_elemsize, out_elempack); + + // check blob shape + // if (!vkdev->shape_support_image_storage(shape_packed) || !vkdev->shape_support_image_storage(out_shape_packed)) + { + support_image_storage = false; + opt.use_image_storage = false; + } + + std::vector specializations(1 + 10); + specializations[0].i = ndim; + specializations[1 + 0].i = 0; // shape_packed.dims; + specializations[1 + 1].i = 0; // shape_packed.w; + specializations[1 + 2].i = 0; // shape_packed.h; + specializations[1 + 3].i = 0; // shape_packed.c; + specializations[1 + 4].i = 0; // shape_packed.cstep; + specializations[1 + 5].i = 0; // out_shape_packed.dims; + specializations[1 + 6].i = 0; // out_shape_packed.w; + specializations[1 + 7].i = 0; // out_shape_packed.h; + specializations[1 + 8].i = 0; // out_shape_packed.c; + specializations[1 + 9].i = 0; // out_shape_packed.cstep; + + Tensor local_size_xyz_bottom; // pack4to1 and pack8to1 + if (shape_packed.dims == 1) + { + local_size_xyz_bottom.w = std::min(64, shape_packed.w); + local_size_xyz_bottom.h = 1; + local_size_xyz_bottom.c = 1; + } + if (shape_packed.dims == 2) + { + local_size_xyz_bottom.w = std::min(8, shape_packed.w); + local_size_xyz_bottom.h = std::min(8, shape_packed.h); + local_size_xyz_bottom.c = 1; + } + if (shape_packed.dims == 3) + { + local_size_xyz_bottom.w = std::min(4, shape_packed.w); + local_size_xyz_bottom.h = std::min(4, shape_packed.h); + local_size_xyz_bottom.c = std::min(4, shape_packed.c); + } + + Tensor local_size_xyz; + if (out_shape_packed.dims == 1) + { + local_size_xyz.w = std::min(64, out_shape_packed.w); + local_size_xyz.h = 1; + local_size_xyz.c = 1; + } + if (out_shape_packed.dims == 2) + { + local_size_xyz.w = std::min(8, out_shape_packed.w); + local_size_xyz.h = std::min(8, out_shape_packed.h); + local_size_xyz.c = 1; + } + if (out_shape_packed.dims == 3) + { + local_size_xyz.w = std::min(4, out_shape_packed.w); + local_size_xyz.h = std::min(4, out_shape_packed.h); + local_size_xyz.c = std::min(4, out_shape_packed.c); + } + + // pack1 + if (shape_permuted.dims == 0 || (elempack == 1 && out_elempack == 1)) + { + pipeline_reshape = new Pipeline(vkdev); + pipeline_reshape->set_optimal_local_size_xyz(local_size_xyz); + pipeline_reshape->create(LayerShaderType::reshape, opt, specializations); + } + + // pack4 + if (shape_permuted.dims == 0 || (elempack == 4 && out_elempack == 4)) + { + pipeline_reshape_pack4 = new Pipeline(vkdev); + pipeline_reshape_pack4->set_optimal_local_size_xyz(local_size_xyz); + 
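+ // every variant is created against the zeroed shape specializations; the real
+ // extents are supplied through the constants vector in record_pipeline()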
pipeline_reshape_pack4->create(LayerShaderType::reshape_pack4, opt, specializations); + } + + // pack1to4 + if (shape_permuted.dims == 0 || (elempack == 1 && out_elempack == 4)) + { + pipeline_reshape_pack1to4 = new Pipeline(vkdev); + pipeline_reshape_pack1to4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_reshape_pack1to4->create(LayerShaderType::reshape_pack1to4, opt, specializations); + } + + // pack4to1 + if (shape_permuted.dims == 0 || (elempack == 4 && out_elempack == 1)) + { + pipeline_reshape_pack4to1 = new Pipeline(vkdev); + pipeline_reshape_pack4to1->set_optimal_local_size_xyz(local_size_xyz_bottom); + pipeline_reshape_pack4to1->create(LayerShaderType::reshape_pack4to1, opt, specializations); + } + + // pack8 + if ((opt.use_shader_pack8 && shape_permuted.dims == 0) || (elempack == 8 && out_elempack == 8)) + { + pipeline_reshape_pack8 = new Pipeline(vkdev); + pipeline_reshape_pack8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_reshape_pack8->create(LayerShaderType::reshape_pack8, opt, specializations); + } + + // pack1to8 + if ((opt.use_shader_pack8 && shape_permuted.dims == 0) || (elempack == 1 && out_elempack == 8)) + { + pipeline_reshape_pack1to8 = new Pipeline(vkdev); + pipeline_reshape_pack1to8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_reshape_pack1to8->create(LayerShaderType::reshape_pack1to8, opt, specializations); + } + + // pack4to8 + if ((opt.use_shader_pack8 && shape_permuted.dims == 0) || (elempack == 4 && out_elempack == 8)) + { + pipeline_reshape_pack4to8 = new Pipeline(vkdev); + pipeline_reshape_pack4to8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_reshape_pack4to8->create(LayerShaderType::reshape_pack4to8, opt, specializations); + } + + // pack8to4 + if ((opt.use_shader_pack8 && shape_permuted.dims == 0) || (elempack == 8 && out_elempack == 4)) + { + pipeline_reshape_pack8to4 = new Pipeline(vkdev); + pipeline_reshape_pack8to4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_reshape_pack8to4->create(LayerShaderType::reshape_pack8to4, opt, specializations); + } + + // pack8to1 + if ((opt.use_shader_pack8 && shape_permuted.dims == 0) || (elempack == 8 && out_elempack == 1)) + { + pipeline_reshape_pack8to1 = new Pipeline(vkdev); + pipeline_reshape_pack8to1->set_optimal_local_size_xyz(local_size_xyz_bottom); + pipeline_reshape_pack8to1->create(LayerShaderType::reshape_pack8to1, opt, specializations); + } + + return 0; +} + +int Reshape_vulkan::destroy_pipeline(const Option& opt) +{ + if (permute_hwc) + { + permute_hwc->destroy_pipeline(opt); + delete permute_hwc; + permute_hwc = 0; + } + + if (permute_hc) + { + permute_hc->destroy_pipeline(opt); + delete permute_hc; + permute_hc = 0; + } + + if (permute_hw) + { + permute_hw->destroy_pipeline(opt); + delete permute_hw; + permute_hw = 0; + } + + if (permute_chw) + { + permute_chw->destroy_pipeline(opt); + delete permute_chw; + permute_chw = 0; + } + + delete pipeline_reshape; + pipeline_reshape = 0; + + delete pipeline_reshape_pack4; + pipeline_reshape_pack4 = 0; + + delete pipeline_reshape_pack1to4; + pipeline_reshape_pack1to4 = 0; + + delete pipeline_reshape_pack4to1; + pipeline_reshape_pack4to1 = 0; + + delete pipeline_reshape_pack8; + pipeline_reshape_pack8 = 0; + + delete pipeline_reshape_pack1to8; + pipeline_reshape_pack1to8 = 0; + + delete pipeline_reshape_pack4to8; + pipeline_reshape_pack4to8 = 0; + + delete pipeline_reshape_pack8to4; + pipeline_reshape_pack8to4 = 0; + + delete pipeline_reshape_pack8to1; + pipeline_reshape_pack8to1 = 0; + + return 0; +} + +int 
Reshape_vulkan::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const +{ + int dims = bottom_blob.dims; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + int out_elempack; + + int total = bottom_blob.w * bottom_blob.h * bottom_blob.c * elempack; + + // resolve out shape + int outw = w; + int outh = h; + int outc = c; + + if (ndim == 1) + { + if (outw == 0) + outw = dims == 1 ? bottom_blob.w * elempack : bottom_blob.w; + + if (outw == -1) + outw = total; + + out_elempack = opt.use_shader_pack8 && outw % 8 == 0 ? 8 : outw % 4 == 0 ? 4 : 1; + + if (dims == 1 && bottom_blob.w == outw && elempack == out_elempack) + { + top_blob = bottom_blob; + return 0; + } + } + if (ndim == 2) + { + if (outw == 0) + outw = dims == 1 ? bottom_blob.w * elempack : bottom_blob.w; + if (outh == 0) + outh = dims == 2 ? bottom_blob.h * elempack : bottom_blob.h; + + if (outw == -1) + outw = total / outh; + if (outh == -1) + outh = total / outw; + + out_elempack = opt.use_shader_pack8 && outh % 8 == 0 ? 8 : outh % 4 == 0 ? 4 : 1; + + if (dims == 2 && bottom_blob.h == outh && elempack == out_elempack) + { + top_blob = bottom_blob; + return 0; + } + } + + if (ndim == 3) + { + if (outw == 0) + outw = dims == 1 ? bottom_blob.w * elempack : bottom_blob.w; + if (outh == 0) + outh = dims == 2 ? bottom_blob.h * elempack : bottom_blob.h; + if (outc == 0) + outc = dims == 3 ? bottom_blob.c * elempack : bottom_blob.c; + + if (outw == -1) + outw = total / outc / outh; + if (outh == -1) + outh = total / outc / outw; + if (outc == -1) + outc = total / outh / outw; + + out_elempack = opt.use_shader_pack8 && outc % 8 == 0 ? 8 : outc % 4 == 0 ? 4 : 1; + + if (dims == 3 && bottom_blob.c == outc && elempack == out_elempack) + { + top_blob = bottom_blob; + top_blob.w = outw; + top_blob.h = outh; + return 0; + } + } + + bool need_permute = permute == 1; + if (dims == 2 && ndim == 2 && bottom_blob.h * elempack == outh) + need_permute = false; + if (dims == 3 && ndim == 3 && bottom_blob.c * elempack == outc) + need_permute = false; + + if (ndim == 1) + { + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (opt.use_fp16_packed && !opt.use_fp16_storage) + { + if (out_elempack == 8) out_elemsize = 8 * 2u; + if (out_elempack == 4) out_elemsize = 4 * 2u; + if (out_elempack == 1) out_elemsize = 4u; + } + + top_blob.create(outw / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + } + if (ndim == 2) + { + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (opt.use_fp16_packed && !opt.use_fp16_storage) + { + if (out_elempack == 8) out_elemsize = 8 * 2u; + if (out_elempack == 4) out_elemsize = 4 * 2u; + if (out_elempack == 1) out_elemsize = 4u; + } + + top_blob.create(outw, outh / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + } + if (ndim == 3) + { + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (opt.use_fp16_packed && !opt.use_fp16_storage) + { + if (out_elempack == 8) out_elemsize = 8 * 2u; + if (out_elempack == 4) out_elemsize = 4 * 2u; + if (out_elempack == 1) out_elemsize = 4u; + } + + top_blob.create(outw, outh, outc / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + } + + if (top_blob.empty()) + return -100; + + std::vector bindings(2); + bindings[0] = bottom_blob; + bindings[1] = top_blob; + + std::vector constants(10); + constants[0].i = bottom_blob.dims; + constants[1].i = bottom_blob.w; + constants[2].i = bottom_blob.h; + constants[3].i = bottom_blob.c; + 
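+ // constants 0-4 describe the bottom blob, 5-9 the freshly created top blob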
constants[4].i = bottom_blob.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = top_blob.cstep; + + if (elempack == 1 && out_elempack == 1) + { + cmd.record_pipeline(pipeline_reshape, bindings, constants, top_blob); + } + else if (elempack == 4 && out_elempack == 4) + { + cmd.record_pipeline(pipeline_reshape_pack4, bindings, constants, top_blob); + } + else if (elempack == 1 && out_elempack == 4) + { + cmd.record_pipeline(pipeline_reshape_pack1to4, bindings, constants, top_blob); + } + else if (elempack == 4 && out_elempack == 1) + { + cmd.record_pipeline(pipeline_reshape_pack4to1, bindings, constants, bottom_blob); + } + else if (elempack == 8 && out_elempack == 8) + { + cmd.record_pipeline(pipeline_reshape_pack8, bindings, constants, top_blob); + } + else if (elempack == 1 && out_elempack == 8) + { + cmd.record_pipeline(pipeline_reshape_pack1to8, bindings, constants, top_blob); + } + else if (elempack == 4 && out_elempack == 8) + { + cmd.record_pipeline(pipeline_reshape_pack4to8, bindings, constants, top_blob); + } + else if (elempack == 8 && out_elempack == 4) + { + cmd.record_pipeline(pipeline_reshape_pack8to4, bindings, constants, top_blob); + } + else if (elempack == 8 && out_elempack == 1) + { + cmd.record_pipeline(pipeline_reshape_pack8to1, bindings, constants, bottom_blob); + } + + return 0; +} + + +} // namespace TEngine \ No newline at end of file diff --git a/source/device/vulkan/layer/reshape_vulkan.hpp b/source/device/vulkan/layer/reshape_vulkan.hpp new file mode 100644 index 000000000..33bc2be41 --- /dev/null +++ b/source/device/vulkan/layer/reshape_vulkan.hpp @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. 
You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#ifndef LAYER_RESHAPE_HPP +#define LAYER_RESHAPE_HPP + +#include "../vulkan_layer.hpp" +#include "../vulkan_command.hpp" + +#include "reshape_param.h" + +namespace TEngine{ + +class Reshape_vulkan : public Layer +{ +public: + Reshape_vulkan(); + Reshape_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const; + +public: + TEngine::Layer* permute_hwc; + TEngine::Layer* permute_hc; + TEngine::Layer* permute_hw; + TEngine::Layer* permute_chw; + + Pipeline* pipeline_reshape; + Pipeline* pipeline_reshape_pack4; + Pipeline* pipeline_reshape_pack1to4; + Pipeline* pipeline_reshape_pack4to1; + Pipeline* pipeline_reshape_pack8; + Pipeline* pipeline_reshape_pack1to8; + Pipeline* pipeline_reshape_pack4to8; + Pipeline* pipeline_reshape_pack8to4; + Pipeline* pipeline_reshape_pack8to1; + +public: + int input_c; + int input_h; + int input_w; + int output_c; + int output_h; + int output_w; + + int w; + int h; + int c; + + // flag permute chw->hwc or hw->wh before and after reshape + int permute; + + int ndim; + +}; + +} // namespace TEngine + +#endif \ No newline at end of file diff --git a/source/device/vulkan/layer/softmax_vulkan.cpp b/source/device/vulkan/layer/softmax_vulkan.cpp new file mode 100644 index 000000000..970e03295 --- /dev/null +++ b/source/device/vulkan/layer/softmax_vulkan.cpp @@ -0,0 +1,486 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. 
You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#include "softmax_vulkan.hpp" +#include "../layer_shader_type.h" + +namespace TEngine { + +Softmax_vulkan::Softmax_vulkan() +{ + support_vulkan = true; + support_image_storage = true; + + pipeline_softmax_reduce_max = 0; + pipeline_softmax_exp_sub_max = 0; + pipeline_softmax_reduce_sum = 0; + pipeline_softmax_div_sum = 0; + + pipeline_softmax_reduce_max_pack4 = 0; + pipeline_softmax_exp_sub_max_pack4 = 0; + pipeline_softmax_reduce_sum_pack4 = 0; + pipeline_softmax_div_sum_pack4 = 0; + + pipeline_softmax_reduce_max_pack8 = 0; + pipeline_softmax_exp_sub_max_pack8 = 0; + pipeline_softmax_reduce_sum_pack8 = 0; + pipeline_softmax_div_sum_pack8 = 0; +} + +Softmax_vulkan::Softmax_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node) +{ + support_vulkan = true; + support_image_storage = true; + + pipeline_softmax_reduce_max = 0; + pipeline_softmax_exp_sub_max = 0; + pipeline_softmax_reduce_sum = 0; + pipeline_softmax_div_sum = 0; + + pipeline_softmax_reduce_max_pack4 = 0; + pipeline_softmax_exp_sub_max_pack4 = 0; + pipeline_softmax_reduce_sum_pack4 = 0; + pipeline_softmax_div_sum_pack4 = 0; + + pipeline_softmax_reduce_max_pack8 = 0; + pipeline_softmax_exp_sub_max_pack8 = 0; + pipeline_softmax_reduce_sum_pack8 = 0; + pipeline_softmax_div_sum_pack8 = 0; + + graph = ir_graph; + node = ir_node; + + struct tensor *input = get_ir_graph_tensor(graph, node->input_tensors[0]); + std::string name = input->name; + bottoms.push_back(name); + + struct tensor *output = get_ir_graph_tensor(graph, node->output_tensors[0]); + name = output->name; + tops.push_back(name); + + // params + input_c = input->dims[1]; // param->input_channel; + input_h = input->dims[2]; + input_w = input->dims[3]; + output_c = output->dims[1]; // param->output_channel; + output_h = output->dims[2]; + output_w = output->dims[3]; + + struct softmax_param *param = (struct softmax_param *)ir_node->op.param_mem; + axis = param->axis-1; +} + +int Softmax_vulkan::create_pipeline(const Option& opt) +{ + const Tensor& shape = Tensor(output_w, output_h, output_c, (void*)0); // top_shapes.empty() ? Tensor() : top_shapes[0]; + + int elempack = 1; + if (shape.dims == 1) elempack = opt.use_shader_pack8 && shape.w % 8 == 0 ? 8 : shape.w % 4 == 0 ? 4 : 1; + if (shape.dims == 2) elempack = opt.use_shader_pack8 && shape.h % 8 == 0 ? 8 : shape.h % 4 == 0 ? 4 : 1; + if (shape.dims == 3) elempack = opt.use_shader_pack8 && shape.c % 8 == 0 ? 8 : shape.c % 4 == 0 ? 4 : 1; + + size_t elemsize; + if (opt.use_fp16_storage) + { + elemsize = elempack * 2u; + } + else if (opt.use_fp16_packed) + { + elemsize = elempack == 1 ? 
4u : elempack * 2u; + } + else + { + elemsize = elempack * 4u; + } + + Tensor shape_packed; + if (shape.dims == 1) shape_packed = Tensor(shape.w / elempack, (void*)0, elemsize, elempack); + if (shape.dims == 2) shape_packed = Tensor(shape.w, shape.h / elempack, (void*)0, elemsize, elempack); + if (shape.dims == 3) shape_packed = Tensor(shape.w, shape.h, shape.c / elempack, (void*)0, elemsize, elempack); + + Tensor workspace_shape_packed; + if (shape.dims == 1) // axis == 0 + { + workspace_shape_packed = Tensor(1, (void*)0, elemsize, elempack); + } + else if (shape.dims == 2 && axis == 0) + { + workspace_shape_packed = Tensor(shape.w, (void*)0, elemsize, elempack); + } + else if (shape.dims == 2 && axis == 1) + { + workspace_shape_packed = Tensor(shape.h / elempack, (void*)0, elemsize, elempack); + } + else if (shape.dims == 3 && axis == 0) + { + workspace_shape_packed = Tensor(shape.w, shape.h, (void*)0, elemsize, elempack); + } + else if (shape.dims == 3 && axis == 1) + { + workspace_shape_packed = Tensor(shape.w, shape.c / elempack, (void*)0, elemsize, elempack); + } + else if (shape.dims == 3 && axis == 2) + { + workspace_shape_packed = Tensor(shape.h, shape.c / elempack, (void*)0, elemsize, elempack); + } + + std::vector specializations(1 + 10); + specializations[0].i = axis; + specializations[1 + 0].i = 0; // shape_packed.dims; + specializations[1 + 1].i = 0; // shape_packed.w; + specializations[1 + 2].i = 0; // shape_packed.h; + specializations[1 + 3].i = 0; // shape_packed.c; + specializations[1 + 4].i = 0; // shape_packed.cstep; + specializations[1 + 5].i = 0; // workspace_shape_packed.dims; + specializations[1 + 6].i = 0; // workspace_shape_packed.w; + specializations[1 + 7].i = 0; // workspace_shape_packed.h; + specializations[1 + 8].i = 0; // workspace_shape_packed.c; + specializations[1 + 9].i = 0; // workspace_shape_packed.cstep; + + { + Tensor local_size_xyz; + if (workspace_shape_packed.dims == 1) + { + local_size_xyz.w = std::min(64, workspace_shape_packed.w); + local_size_xyz.h = 1; + local_size_xyz.c = 1; + } + if (workspace_shape_packed.dims == 2) + { + local_size_xyz.w = std::min(8, workspace_shape_packed.w); + local_size_xyz.h = std::min(8, workspace_shape_packed.h); + local_size_xyz.c = 1; + } + if (workspace_shape_packed.dims != 0) + { + local_size_xyz.w = std::min(4, workspace_shape_packed.w); + local_size_xyz.h = std::min(4, workspace_shape_packed.h); + local_size_xyz.c = std::min(4, workspace_shape_packed.c); + } + + // pack1 + { + pipeline_softmax_reduce_max = new Pipeline(vkdev); + pipeline_softmax_reduce_sum = new Pipeline(vkdev); + + pipeline_softmax_reduce_max->set_optimal_local_size_xyz(local_size_xyz); + pipeline_softmax_reduce_sum->set_optimal_local_size_xyz(local_size_xyz); + + pipeline_softmax_reduce_max->create(LayerShaderType::softmax_reduce_max, opt, specializations); + pipeline_softmax_reduce_sum->create(LayerShaderType::softmax_reduce_sum, opt, specializations); + } + + // pack4 + { + pipeline_softmax_reduce_max_pack4 = new Pipeline(vkdev); + pipeline_softmax_reduce_sum_pack4 = new Pipeline(vkdev); + + pipeline_softmax_reduce_max_pack4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_softmax_reduce_sum_pack4->set_optimal_local_size_xyz(local_size_xyz); + + pipeline_softmax_reduce_max_pack4->create(LayerShaderType::softmax_reduce_max_pack4, opt, specializations); + pipeline_softmax_reduce_sum_pack4->create(LayerShaderType::softmax_reduce_sum_pack4, opt, specializations); + } + + // pack8 + if (opt.use_shader_pack8) + { + 
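// pack8 variants of the two reduce passes; the matching exp_sub_max / div_sum pack8 pipelines are created in the second block below +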
pipeline_softmax_reduce_max_pack8 = new Pipeline(vkdev); + pipeline_softmax_reduce_sum_pack8 = new Pipeline(vkdev); + + pipeline_softmax_reduce_max_pack8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_softmax_reduce_sum_pack8->set_optimal_local_size_xyz(local_size_xyz); + + pipeline_softmax_reduce_max_pack8->create(LayerShaderType::softmax_reduce_max_pack8, opt, specializations); + pipeline_softmax_reduce_sum_pack8->create(LayerShaderType::softmax_reduce_sum_pack8, opt, specializations); + } + } + + { + Tensor local_size_xyz; + if (shape_packed.dims == 1) + { + local_size_xyz.w = std::min(64, shape_packed.w); + local_size_xyz.h = 1; + local_size_xyz.c = 1; + } + if (shape_packed.dims == 2) + { + local_size_xyz.w = std::min(8, shape_packed.w); + local_size_xyz.h = std::min(8, shape_packed.h); + local_size_xyz.c = 1; + } + if (shape_packed.dims == 3) + { + local_size_xyz.w = std::min(4, shape_packed.w); + local_size_xyz.h = std::min(4, shape_packed.h); + local_size_xyz.c = std::min(4, shape_packed.c); + } + + // pack1 + { + pipeline_softmax_exp_sub_max = new Pipeline(vkdev); + pipeline_softmax_div_sum = new Pipeline(vkdev); + + pipeline_softmax_exp_sub_max->set_optimal_local_size_xyz(local_size_xyz); + pipeline_softmax_div_sum->set_optimal_local_size_xyz(local_size_xyz); + + pipeline_softmax_exp_sub_max->create(LayerShaderType::softmax_exp_sub_max, opt, specializations); + pipeline_softmax_div_sum->create(LayerShaderType::softmax_div_sum, opt, specializations); + } + + // pack4 + { + pipeline_softmax_exp_sub_max_pack4 = new Pipeline(vkdev); + pipeline_softmax_div_sum_pack4 = new Pipeline(vkdev); + + pipeline_softmax_exp_sub_max_pack4->set_optimal_local_size_xyz(local_size_xyz); + pipeline_softmax_div_sum_pack4->set_optimal_local_size_xyz(local_size_xyz); + + pipeline_softmax_exp_sub_max_pack4->create(LayerShaderType::softmax_exp_sub_max_pack4, opt, specializations); + pipeline_softmax_div_sum_pack4->create(LayerShaderType::softmax_div_sum_pack4, opt, specializations); + } + + // pack8 + if (opt.use_shader_pack8) + { + pipeline_softmax_exp_sub_max_pack8 = new Pipeline(vkdev); + pipeline_softmax_div_sum_pack8 = new Pipeline(vkdev); + + pipeline_softmax_exp_sub_max_pack8->set_optimal_local_size_xyz(local_size_xyz); + pipeline_softmax_div_sum_pack8->set_optimal_local_size_xyz(local_size_xyz); + + pipeline_softmax_exp_sub_max_pack8->create(LayerShaderType::softmax_exp_sub_max_pack8, opt, specializations); + pipeline_softmax_div_sum_pack8->create(LayerShaderType::softmax_div_sum_pack8, opt, specializations); + } + } + + return 0; +} + + +int Softmax_vulkan::destroy_pipeline(const Option& /*opt*/) +{ + delete pipeline_softmax_reduce_max; + pipeline_softmax_reduce_max = 0; + + delete pipeline_softmax_exp_sub_max; + pipeline_softmax_exp_sub_max = 0; + + delete pipeline_softmax_reduce_sum; + pipeline_softmax_reduce_sum = 0; + + delete pipeline_softmax_div_sum; + pipeline_softmax_div_sum = 0; + + delete pipeline_softmax_reduce_max_pack4; + pipeline_softmax_reduce_max_pack4 = 0; + + delete pipeline_softmax_exp_sub_max_pack4; + pipeline_softmax_exp_sub_max_pack4 = 0; + + delete pipeline_softmax_reduce_sum_pack4; + pipeline_softmax_reduce_sum_pack4 = 0; + + delete pipeline_softmax_div_sum_pack4; + pipeline_softmax_div_sum_pack4 = 0; + + delete pipeline_softmax_reduce_max_pack8; + pipeline_softmax_reduce_max_pack8 = 0; + + delete pipeline_softmax_exp_sub_max_pack8; + pipeline_softmax_exp_sub_max_pack8 = 0; + + delete pipeline_softmax_reduce_sum_pack8; + pipeline_softmax_reduce_sum_pack8 = 0; + + 
delete pipeline_softmax_div_sum_pack8; + pipeline_softmax_div_sum_pack8 = 0; + + return 0; +} + +int Softmax_vulkan::record_pipeline(VkTensor& bottom_top_blob, VkCompute& cmd, const Option& opt) const +{ + int dims = bottom_top_blob.dims; + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int channels = bottom_top_blob.c; + size_t elemsize = bottom_top_blob.elemsize; + int elempack = bottom_top_blob.elempack; + + VkTensor max_workspace; + VkTensor sum_workspace; + + if (dims == 1) // axis == 0 + { + max_workspace.create(1, elemsize, elempack, opt.workspace_vkallocator); + sum_workspace.create(1, elemsize, elempack, opt.workspace_vkallocator); + } + else if (dims == 2 && axis == 0) + { + max_workspace.create(w, elemsize, elempack, opt.workspace_vkallocator); + sum_workspace.create(w, elemsize, elempack, opt.workspace_vkallocator); + } + else if (dims == 2 && axis == 1) + { + max_workspace.create(h, elemsize, elempack, opt.workspace_vkallocator); + sum_workspace.create(h, elemsize, elempack, opt.workspace_vkallocator); + } + else if (dims == 3 && axis == 0) + { + max_workspace.create(w, h, elemsize, elempack, opt.workspace_vkallocator); + sum_workspace.create(w, h, elemsize, elempack, opt.workspace_vkallocator); + } + else if (dims == 3 && axis == 1) + { + max_workspace.create(w, channels, elemsize, elempack, opt.workspace_vkallocator); + sum_workspace.create(w, channels, elemsize, elempack, opt.workspace_vkallocator); + } + else if (dims == 3 && axis == 2) + { + max_workspace.create(h, channels, elemsize, elempack, opt.workspace_vkallocator); + sum_workspace.create(h, channels, elemsize, elempack, opt.workspace_vkallocator); + } + + // reduce max + { + std::vector bindings(2); + bindings[0] = bottom_top_blob; + bindings[1] = max_workspace; + + std::vector constants(10); + constants[0].i = bottom_top_blob.dims; + constants[1].i = bottom_top_blob.w; + constants[2].i = bottom_top_blob.h; + constants[3].i = bottom_top_blob.c; + constants[4].i = bottom_top_blob.cstep; + constants[5].i = max_workspace.dims; + constants[6].i = max_workspace.w; + constants[7].i = max_workspace.h; + constants[8].i = max_workspace.c; + constants[9].i = max_workspace.cstep; + + const Pipeline* pipeline = elempack == 8 ? pipeline_softmax_reduce_max_pack8 + : elempack == 4 ? pipeline_softmax_reduce_max_pack4 + : pipeline_softmax_reduce_max; + + cmd.record_pipeline(pipeline, bindings, constants, max_workspace); + } + + // exp( v - max ) + { + std::vector bindings(2); + bindings[0] = bottom_top_blob; + bindings[1] = max_workspace; + + std::vector constants(10); + constants[0].i = bottom_top_blob.dims; + constants[1].i = bottom_top_blob.w; + constants[2].i = bottom_top_blob.h; + constants[3].i = bottom_top_blob.c; + constants[4].i = bottom_top_blob.cstep; + constants[5].i = max_workspace.dims; + constants[6].i = max_workspace.w; + constants[7].i = max_workspace.h; + constants[8].i = max_workspace.c; + constants[9].i = max_workspace.cstep; + + const Pipeline* pipeline = elempack == 8 ? pipeline_softmax_exp_sub_max_pack8 + : elempack == 4 ? 
pipeline_softmax_exp_sub_max_pack4 + : pipeline_softmax_exp_sub_max; + + cmd.record_pipeline(pipeline, bindings, constants, bottom_top_blob); + } + + // reduce sum + { + std::vector bindings(2); + bindings[0] = bottom_top_blob; + bindings[1] = sum_workspace; + + std::vector constants(10); + constants[0].i = bottom_top_blob.dims; + constants[1].i = bottom_top_blob.w; + constants[2].i = bottom_top_blob.h; + constants[3].i = bottom_top_blob.c; + constants[4].i = bottom_top_blob.cstep; + constants[5].i = sum_workspace.dims; + constants[6].i = sum_workspace.w; + constants[7].i = sum_workspace.h; + constants[8].i = sum_workspace.c; + constants[9].i = sum_workspace.cstep; + + const Pipeline* pipeline = elempack == 8 ? pipeline_softmax_reduce_sum_pack8 + : elempack == 4 ? pipeline_softmax_reduce_sum_pack4 + : pipeline_softmax_reduce_sum; + + cmd.record_pipeline(pipeline, bindings, constants, sum_workspace); + } + + // div sum + { + std::vector bindings(2); + bindings[0] = bottom_top_blob; + bindings[1] = sum_workspace; + + std::vector constants(10); + constants[0].i = bottom_top_blob.dims; + constants[1].i = bottom_top_blob.w; + constants[2].i = bottom_top_blob.h; + constants[3].i = bottom_top_blob.c; + constants[4].i = bottom_top_blob.cstep; + constants[5].i = sum_workspace.dims; + constants[6].i = sum_workspace.w; + constants[7].i = sum_workspace.h; + constants[8].i = sum_workspace.c; + constants[9].i = sum_workspace.cstep; + + const Pipeline* pipeline = elempack == 8 ? pipeline_softmax_div_sum_pack8 + : elempack == 4 ? pipeline_softmax_div_sum_pack4 + : pipeline_softmax_div_sum; + + cmd.record_pipeline(pipeline, bindings, constants, bottom_top_blob); + } + + return 0; +} + + +} // namespace TEngine diff --git a/source/device/vulkan/layer/softmax_vulkan.hpp b/source/device/vulkan/layer/softmax_vulkan.hpp new file mode 100644 index 000000000..108ea5d62 --- /dev/null +++ b/source/device/vulkan/layer/softmax_vulkan.hpp @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. 
You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#ifndef LAYER_SOFTMAX_HPP +#define LAYER_SOFTMAX_HPP + +#include "../vulkan_layer.hpp" +#include "../vulkan_command.hpp" + +#include "softmax_param.h" + +namespace TEngine{ + +class Softmax_vulkan : public Layer +{ +public: + Softmax_vulkan(); + Softmax_vulkan(ir_graph_t* ir_graph, ir_node_t* ir_node); + + virtual int create_pipeline(const Option& opt); + virtual int destroy_pipeline(const Option& opt); + + virtual int record_pipeline(VkTensor& bottom_top_blob, VkCompute& cmd, const Option& opt) const; + +public: + Pipeline* pipeline_softmax_reduce_max; + Pipeline* pipeline_softmax_exp_sub_max; + Pipeline* pipeline_softmax_reduce_sum; + Pipeline* pipeline_softmax_div_sum; + + Pipeline* pipeline_softmax_reduce_max_pack4; + Pipeline* pipeline_softmax_exp_sub_max_pack4; + Pipeline* pipeline_softmax_reduce_sum_pack4; + Pipeline* pipeline_softmax_div_sum_pack4; + + Pipeline* pipeline_softmax_reduce_max_pack8; + Pipeline* pipeline_softmax_exp_sub_max_pack8; + Pipeline* pipeline_softmax_reduce_sum_pack8; + Pipeline* pipeline_softmax_div_sum_pack8; + +public: + int axis; + int input_c; + int input_h; + int input_w; + int output_c; + int output_h; + int output_w; + +}; + +} // namespace TEngine + +#endif \ No newline at end of file diff --git a/source/device/vulkan/layer_shader_registry.h.in b/source/device/vulkan/layer_shader_registry.h.in new file mode 100644 index 000000000..9a88eb460 --- /dev/null +++ b/source/device/vulkan/layer_shader_registry.h.in @@ -0,0 +1,6 @@ +// Layer Shader Registry header +// +// This file is auto-generated by cmake, don't edit it. + +@layer_shader_registry@ + diff --git a/source/device/vulkan/layer_shader_spv_data.h.in b/source/device/vulkan/layer_shader_spv_data.h.in new file mode 100644 index 000000000..ab1b7b8aa --- /dev/null +++ b/source/device/vulkan/layer_shader_spv_data.h.in @@ -0,0 +1,6 @@ +// Layer Shader Spv Data header +// +// This file is auto-generated by cmake, don't edit it. + +@layer_shader_spv_data@ + diff --git a/source/device/vulkan/layer_shader_type.h b/source/device/vulkan/layer_shader_type.h new file mode 100644 index 000000000..e9c713062 --- /dev/null +++ b/source/device/vulkan/layer_shader_type.h @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. 
+ * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#ifndef LAYER_SHADER_TYPE_H +#define LAYER_SHADER_TYPE_H + +namespace TEngine { + +namespace LayerShaderType { +enum LayerShaderType +{ +#include "layer_shader_type_enum.h" +}; +} // namespace LayerType + +} // namespace TEngine + +#endif // LAYER_SHADER_TYPE_H \ No newline at end of file diff --git a/source/device/vulkan/layer_shader_type_enum.h.in b/source/device/vulkan/layer_shader_type_enum.h.in new file mode 100644 index 000000000..1d3db77e9 --- /dev/null +++ b/source/device/vulkan/layer_shader_type_enum.h.in @@ -0,0 +1,5 @@ +// Layer Shader Enum header +// +// This file is auto-generated by cmake, don't edit it. + +@layer_shader_type_enum@ \ No newline at end of file diff --git a/source/device/vulkan/layer_type_enum.h.in b/source/device/vulkan/layer_type_enum.h.in new file mode 100644 index 000000000..88fa1a51b --- /dev/null +++ b/source/device/vulkan/layer_type_enum.h.in @@ -0,0 +1,5 @@ +// Layer Type Enum header +// +// This file is auto-generated by cmake, don't edit it. + +@layer_type_enum@ \ No newline at end of file diff --git a/source/device/vulkan/shaders/concat.comp b/source/device/vulkan/shaders/concat.comp new file mode 100644 index 000000000..5c904b42e --- /dev/null +++ b/source/device/vulkan/shaders/concat.comp @@ -0,0 +1,108 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
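+// concat: copy this input blob into the concatenated output, shifted by p.offset along the
+// concat axis; each invocation handles one element.
+// Buffer-path index sketch (assuming the psc()/buffer_cp1 helpers behave as named):
+//   gi       = gz * cstep + gy * w + gx                      flat index into the input
+//   gxyz[dims - 1 - axis] += p.offset                        shift along the concat axis
+//   v_offset = gxyz.z * outcstep + gxyz.y * outw + gxyz.x    flat index into the output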
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int axis = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; + + int offset; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_cp1(top_blob_1d, gx + p.offset, bottom_blob_1d, gx); + } + else if (psc(dims) == 2) + { + if (axis == 0) image2d_cp1(top_blob_2d, ivec2(gx, gy + p.offset), bottom_blob_2d, ivec2(gx, gy)); + if (axis == 1) image2d_cp1(top_blob_2d, ivec2(gx + p.offset, gy), bottom_blob_2d, ivec2(gx, gy)); + } + else // if (psc(dims) == 3) + { + if (axis == 0) image3d_cp1(top_blob_3d, ivec3(gx, gy, gz + p.offset), bottom_blob_3d, ivec3(gx, gy, gz)); + if (axis == 1) image3d_cp1(top_blob_3d, ivec3(gx, gy + p.offset, gz), bottom_blob_3d, ivec3(gx, gy, gz)); + if (axis == 2) image3d_cp1(top_blob_3d, ivec3(gx + p.offset, gy, gz), bottom_blob_3d, ivec3(gx, gy, gz)); + } +#else + const int gi = gz * psc(cstep) + gy * psc(w) + gx; + + ivec3 gxyz = ivec3(gx, gy, gz); + + gxyz[psc(dims) - 1 - axis] += p.offset; + + int v_offset = gxyz.z * psc(outcstep) + gxyz.y * psc(outw) + gxyz.x; + + buffer_cp1(top_blob_data, v_offset, bottom_blob_data, gi); +#endif +} diff --git a/source/device/vulkan/shaders/concat_pack4.comp b/source/device/vulkan/shaders/concat_pack4.comp new file mode 100644 index 000000000..e904aec55 --- /dev/null +++ b/source/device/vulkan/shaders/concat_pack4.comp @@ -0,0 +1,108 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. 
+// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int axis = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; + + int offset; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_cp4(top_blob_1d, gx + p.offset, bottom_blob_1d, gx); + } + else if (psc(dims) == 2) + { + if (axis == 0) image2d_cp4(top_blob_2d, ivec2(gx, gy + p.offset), bottom_blob_2d, ivec2(gx, gy)); + if (axis == 1) image2d_cp4(top_blob_2d, ivec2(gx + p.offset, gy), bottom_blob_2d, ivec2(gx, gy)); + } + else // if (psc(dims) == 3) + { + if (axis == 0) image3d_cp4(top_blob_3d, ivec3(gx, gy, gz + p.offset), bottom_blob_3d, ivec3(gx, gy, gz)); + if (axis == 1) image3d_cp4(top_blob_3d, ivec3(gx, gy + p.offset, gz), bottom_blob_3d, ivec3(gx, gy, gz)); + if (axis == 2) image3d_cp4(top_blob_3d, ivec3(gx + p.offset, gy, gz), bottom_blob_3d, ivec3(gx, gy, gz)); + } +#else + const int gi = gz * psc(cstep) + gy * psc(w) + gx; + + ivec3 gxyz = ivec3(gx, gy, gz); + + gxyz[psc(dims) - 1 - axis] += p.offset; + + int 
v_offset = gxyz.z * psc(outcstep) + gxyz.y * psc(outw) + gxyz.x; + + buffer_cp4(top_blob_data, v_offset, bottom_blob_data, gi); +#endif +} diff --git a/source/device/vulkan/shaders/concat_pack4to1.comp b/source/device/vulkan/shaders/concat_pack4to1.comp new file mode 100644 index 000000000..bf69cebab --- /dev/null +++ b/source/device/vulkan/shaders/concat_pack4to1.comp @@ -0,0 +1,164 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int axis = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; + + int offset; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + if (psc(dims) == 1) + { + afpvec4 v = image1d_ld4(bottom_blob_1d, gx); + + int gx4 = gx * 4 + p.offset; + + image1d_st1(top_blob_1d, gx4 + 0, v.r); + image1d_st1(top_blob_1d, gx4 + 1, v.g); + image1d_st1(top_blob_1d, gx4 + 2, v.b); + image1d_st1(top_blob_1d, gx4 + 3, v.a); + } + else 
if (psc(dims) == 2) + { + afpvec4 v = image2d_ld4(bottom_blob_2d, ivec2(gx, gy)); + + if (axis == 0) + { + int gy4 = gy * 4 + p.offset; + + image2d_st1(top_blob_2d, ivec2(gx, gy4 + 0), v.r); + image2d_st1(top_blob_2d, ivec2(gx, gy4 + 1), v.g); + image2d_st1(top_blob_2d, ivec2(gx, gy4 + 2), v.b); + image2d_st1(top_blob_2d, ivec2(gx, gy4 + 3), v.a); + } + if (axis == 1) + { + int gx4 = gx * 4 + p.offset; + + image2d_st1(top_blob_2d, ivec2(gx4 + 0, gy), v.r); + image2d_st1(top_blob_2d, ivec2(gx4 + 1, gy), v.g); + image2d_st1(top_blob_2d, ivec2(gx4 + 2, gy), v.b); + image2d_st1(top_blob_2d, ivec2(gx4 + 3, gy), v.a); + } + } + else // if (psc(dims) == 3) + { + afpvec4 v = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, gz)); + + if (axis == 0) + { + int gz4 = gz * 4 + p.offset; + + image3d_st1(top_blob_3d, ivec3(gx, gy, gz4 + 0), v.r); + image3d_st1(top_blob_3d, ivec3(gx, gy, gz4 + 1), v.g); + image3d_st1(top_blob_3d, ivec3(gx, gy, gz4 + 2), v.b); + image3d_st1(top_blob_3d, ivec3(gx, gy, gz4 + 3), v.a); + } + if (axis == 1) + { + int gy4 = gy * 4 + p.offset; + + image3d_st1(top_blob_3d, ivec3(gx, gy4 + 0, gz), v.r); + image3d_st1(top_blob_3d, ivec3(gx, gy4 + 1, gz), v.g); + image3d_st1(top_blob_3d, ivec3(gx, gy4 + 2, gz), v.b); + image3d_st1(top_blob_3d, ivec3(gx, gy4 + 3, gz), v.a); + } + if (axis == 2) + { + int gx4 = gx * 4 + p.offset; + + image3d_st1(top_blob_3d, ivec3(gx4 + 0, gy, gz), v.r); + image3d_st1(top_blob_3d, ivec3(gx4 + 1, gy, gz), v.g); + image3d_st1(top_blob_3d, ivec3(gx4 + 2, gy, gz), v.b); + image3d_st1(top_blob_3d, ivec3(gx4 + 3, gy, gz), v.a); + } + } +#else + const int gi = gz * psc(cstep) + gy * psc(w) + gx; + + ivec3 gxyz = ivec3(gx, gy, gz); + + gxyz[psc(dims) - 1] *= 4; + gxyz[psc(dims) - 1 - axis] += p.offset; + + int v_offset_0 = gxyz.z * psc(outcstep) + gxyz.y * psc(outw) + gxyz.x; + + ivec3 gxyz4 = ivec3(1, psc(outw), psc(outcstep)); + + ivec4 v_offset = v_offset_0 + ivec4(0, 1, 2, 3) * gxyz4[psc(dims) - 1 - axis]; + + buffer_cp4to1(top_blob_data, v_offset, bottom_blob_data, gi); +#endif +} diff --git a/source/device/vulkan/shaders/concat_pack8.comp b/source/device/vulkan/shaders/concat_pack8.comp new file mode 100644 index 000000000..6353705a5 --- /dev/null +++ b/source/device/vulkan/shaders/concat_pack8.comp @@ -0,0 +1,109 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
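+// concat_pack8: same indexing scheme as the pack1 concat shader above, but every element
+// is a packed group of 8 values (sfpvec8), so one buffer_cp8 call copies all 8 at once.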
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int axis = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; + + int offset; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_cp8(top_blob_1d, gx + p.offset, bottom_blob_1d, gx); + } + else if (psc(dims) == 2) + { + if (axis == 0) image2d_cp8(top_blob_2d, ivec2(gx, gy + p.offset), bottom_blob_2d, ivec2(gx, gy)); + if (axis == 1) image2d_cp8(top_blob_2d, ivec2(gx + p.offset, gy), bottom_blob_2d, ivec2(gx, gy)); + } + else // if (psc(dims) == 3) + { + if (axis == 0) image3d_cp8(top_blob_3d, ivec3(gx, gy, gz + p.offset), bottom_blob_3d, ivec3(gx, gy, gz)); + if (axis == 1) image3d_cp8(top_blob_3d, ivec3(gx, gy + p.offset, gz), bottom_blob_3d, ivec3(gx, gy, gz)); + if (axis == 2) image3d_cp8(top_blob_3d, ivec3(gx + p.offset, gy, gz), bottom_blob_3d, ivec3(gx, gy, gz)); + } +#else + const int gi = gz * psc(cstep) + gy * psc(w) + gx; + + ivec3 gxyz = ivec3(gx, gy, gz); + + gxyz[psc(dims) - 1 - axis] += p.offset; + + int v_offset = gxyz.z * psc(outcstep) + gxyz.y * psc(outw) + gxyz.x; + + buffer_cp8(top_blob_data, v_offset, bottom_blob_data, gi); +#endif +} diff --git a/source/device/vulkan/shaders/concat_pack8to1.comp b/source/device/vulkan/shaders/concat_pack8to1.comp new file mode 100644 index 000000000..ffeedd8c9 --- /dev/null +++ b/source/device/vulkan/shaders/concat_pack8to1.comp @@ -0,0 +1,190 @@ +// Tencent is pleased to support the open source community by making ncnn available. 
+// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int axis = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; + + int offset; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + if (psc(dims) == 1) + { + afpvec8 v = image1d_ld8(bottom_blob_1d, gx); + + int gx8 = gx * 8 + p.offset; + + image1d_st1(top_blob_1d, gx8 + 0, v[0].r); + image1d_st1(top_blob_1d, gx8 + 1, v[0].g); + image1d_st1(top_blob_1d, gx8 + 2, v[0].b); + image1d_st1(top_blob_1d, gx8 + 3, v[0].a); + image1d_st1(top_blob_1d, gx8 + 4, v[1].r); + image1d_st1(top_blob_1d, gx8 + 5, v[1].g); + image1d_st1(top_blob_1d, gx8 + 6, v[1].b); + image1d_st1(top_blob_1d, gx8 + 7, v[1].a); + } + else if (psc(dims) == 2) + { + afpvec8 v = image2d_ld8(bottom_blob_2d, ivec2(gx, gy)); + + if (axis == 0) + { + int gy8 = gy * 8 + p.offset; + + image2d_st1(top_blob_2d, ivec2(gx, gy8 + 0), v[0].r); + image2d_st1(top_blob_2d, ivec2(gx, gy8 + 
1), v[0].g); + image2d_st1(top_blob_2d, ivec2(gx, gy8 + 2), v[0].b); + image2d_st1(top_blob_2d, ivec2(gx, gy8 + 3), v[0].a); + image2d_st1(top_blob_2d, ivec2(gx, gy8 + 4), v[1].r); + image2d_st1(top_blob_2d, ivec2(gx, gy8 + 5), v[1].g); + image2d_st1(top_blob_2d, ivec2(gx, gy8 + 6), v[1].b); + image2d_st1(top_blob_2d, ivec2(gx, gy8 + 7), v[1].a); + } + if (axis == 1) + { + int gx8 = gx * 8 + p.offset; + + image2d_st1(top_blob_2d, ivec2(gx8 + 0, gy), v[0].r); + image2d_st1(top_blob_2d, ivec2(gx8 + 1, gy), v[0].g); + image2d_st1(top_blob_2d, ivec2(gx8 + 2, gy), v[0].b); + image2d_st1(top_blob_2d, ivec2(gx8 + 3, gy), v[0].a); + image2d_st1(top_blob_2d, ivec2(gx8 + 4, gy), v[1].r); + image2d_st1(top_blob_2d, ivec2(gx8 + 5, gy), v[1].g); + image2d_st1(top_blob_2d, ivec2(gx8 + 6, gy), v[1].b); + image2d_st1(top_blob_2d, ivec2(gx8 + 7, gy), v[1].a); + } + } + else // if (psc(dims) == 3) + { + afpvec8 v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); + + if (axis == 0) + { + int gz8 = gz * 8 + p.offset; + + image3d_st1(top_blob_3d, ivec3(gx, gy, gz8 + 0), v[0].r); + image3d_st1(top_blob_3d, ivec3(gx, gy, gz8 + 1), v[0].g); + image3d_st1(top_blob_3d, ivec3(gx, gy, gz8 + 2), v[0].b); + image3d_st1(top_blob_3d, ivec3(gx, gy, gz8 + 3), v[0].a); + image3d_st1(top_blob_3d, ivec3(gx, gy, gz8 + 4), v[1].r); + image3d_st1(top_blob_3d, ivec3(gx, gy, gz8 + 5), v[1].g); + image3d_st1(top_blob_3d, ivec3(gx, gy, gz8 + 6), v[1].b); + image3d_st1(top_blob_3d, ivec3(gx, gy, gz8 + 7), v[1].a); + } + if (axis == 1) + { + int gy8 = gy * 8 + p.offset; + + image3d_st1(top_blob_3d, ivec3(gx, gy8 + 0, gz), v[0].r); + image3d_st1(top_blob_3d, ivec3(gx, gy8 + 1, gz), v[0].g); + image3d_st1(top_blob_3d, ivec3(gx, gy8 + 2, gz), v[0].b); + image3d_st1(top_blob_3d, ivec3(gx, gy8 + 3, gz), v[0].a); + image3d_st1(top_blob_3d, ivec3(gx, gy8 + 4, gz), v[1].r); + image3d_st1(top_blob_3d, ivec3(gx, gy8 + 5, gz), v[1].g); + image3d_st1(top_blob_3d, ivec3(gx, gy8 + 6, gz), v[1].b); + image3d_st1(top_blob_3d, ivec3(gx, gy8 + 7, gz), v[1].a); + } + if (axis == 2) + { + int gx8 = gx * 8 + p.offset; + + image3d_st1(top_blob_3d, ivec3(gx8 + 0, gy, gz), v[0].r); + image3d_st1(top_blob_3d, ivec3(gx8 + 1, gy, gz), v[0].g); + image3d_st1(top_blob_3d, ivec3(gx8 + 2, gy, gz), v[0].b); + image3d_st1(top_blob_3d, ivec3(gx8 + 3, gy, gz), v[0].a); + image3d_st1(top_blob_3d, ivec3(gx8 + 4, gy, gz), v[1].r); + image3d_st1(top_blob_3d, ivec3(gx8 + 5, gy, gz), v[1].g); + image3d_st1(top_blob_3d, ivec3(gx8 + 6, gy, gz), v[1].b); + image3d_st1(top_blob_3d, ivec3(gx8 + 7, gy, gz), v[1].a); + } + } +#else + const int gi = gz * psc(cstep) + gy * psc(w) + gx; + + ivec3 gxyz = ivec3(gx, gy, gz); + + gxyz[psc(dims) - 1] *= 8; + gxyz[psc(dims) - 1 - axis] += p.offset; + + int v_offset_0 = gxyz.z * psc(outcstep) + gxyz.y * psc(outw) + gxyz.x; + + ivec3 gxyz4 = ivec3(1, psc(outw), psc(outcstep)); + + ivec4 v_offset = v_offset_0 + ivec4(0, 1, 2, 3) * gxyz4[psc(dims) - 1 - axis]; + ivec4 vv_offset = v_offset + 4 * gxyz4[psc(dims) - 1 - axis]; + + buffer_cp8to1(top_blob_data, v_offset, vv_offset, bottom_blob_data, gi); +#endif +} diff --git a/source/device/vulkan/shaders/concat_pack8to4.comp b/source/device/vulkan/shaders/concat_pack8to4.comp new file mode 100644 index 000000000..6890e0f14 --- /dev/null +++ b/source/device/vulkan/shaders/concat_pack8to4.comp @@ -0,0 +1,154 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. 
+// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int axis = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; + + int offset; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + if (psc(dims) == 1) + { + afpvec8 v = image1d_ld8(bottom_blob_1d, gx); + + int gx2 = gx * 2 + p.offset; + + image1d_st4(top_blob_1d, gx2 + 0, v[0]); + image1d_st4(top_blob_1d, gx2 + 1, v[1]); + + } + else if (psc(dims) == 2) + { + afpvec8 v = image2d_ld8(bottom_blob_2d, ivec2(gx, gy)); + + if (axis == 0) + { + int gy2 = gy * 2 + p.offset; + + image2d_st4(top_blob_2d, ivec2(gx, gy2 + 0), v[0]); + image2d_st4(top_blob_2d, ivec2(gx, gy2 + 1), v[1]); + } + if (axis == 1) + { + int gx2 = gx * 2 + p.offset; + + image2d_st4(top_blob_2d, ivec2(gx2 + 0, gy), v[0]); + image2d_st4(top_blob_2d, ivec2(gx2 + 1, gy), v[1]); + } + } + else // if (psc(dims) == 3) + { + afpvec8 v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); + + if (axis == 0) + { + int gz2 = gz * 2 + p.offset; + + 
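// each pack8 input value is written as two consecutive pack4 values in the output +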
image3d_st4(top_blob_3d, ivec3(gx, gy, gz2 + 0), v[0]); + image3d_st4(top_blob_3d, ivec3(gx, gy, gz2 + 1), v[1]); + } + if (axis == 1) + { + int gy2 = gy * 2 + p.offset; + + image3d_st4(top_blob_3d, ivec3(gx, gy2 + 0, gz), v[0]); + image3d_st4(top_blob_3d, ivec3(gx, gy2 + 1, gz), v[1]); + } + if (axis == 2) + { + int gx2 = gx * 2 + p.offset; + + image3d_st4(top_blob_3d, ivec3(gx2 + 0, gy, gz), v[0]); + image3d_st4(top_blob_3d, ivec3(gx2 + 1, gy, gz), v[1]); + } + } +#else + const int gi = gz * psc(cstep) + gy * psc(w) + gx; + + ivec3 gxyz = ivec3(gx, gy, gz); + + gxyz[psc(dims) - 1] *= 2; + gxyz[psc(dims) - 1 - axis] += p.offset; + + int v_offset_0 = gxyz.z * psc(outcstep) + gxyz.y * psc(outw) + gxyz.x; + + ivec3 gxyz4 = ivec3(1, psc(outw), psc(outcstep)); + + ivec2 v_offset = v_offset_0 + ivec2(0, 1) * gxyz4[psc(dims) - 1 - axis]; + + buffer_cp8to4(top_blob_data, v_offset, bottom_blob_data, gi); +#endif +} diff --git a/source/device/vulkan/shaders/convolution.comp b/source/device/vulkan/shaders/convolution.comp new file mode 100644 index 000000000..1d1070950 --- /dev/null +++ b/source/device/vulkan/shaders/convolution.comp @@ -0,0 +1,175 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
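+// convolution (naive direct form): one invocation computes one output element by walking
+// all input channels and the kernel_w x kernel_h window, adds the optional bias, then
+// applies the activation selected by activation_type (1 ReLU, 2 leaky ReLU, 3 clip,
+// 4 sigmoid, 5 mish).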
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int activation_type = 0; +layout (constant_id = 8) const float activation_param_0 = 0; +layout (constant_id = 9) const float activation_param_1 = 0; + +#define shape_constant_id_offset 10 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfp weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afp sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld1(bias_blob, gz); +#else + sum = buffer_ld1(bias_data, gz); +#endif + } + else + { + sum = afp(0.f); + } + +#if NCNN_image_shader + for (int z = 0; z < psc(c); z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + sum += image3d_ld1(weight_blob, ivec3(wx, z, gz)) * image3d_ld1(bottom_blob, ivec3(sx, sy, z)); + + sx += dilation_w; + wx += 1; + } + + sy += dilation_h; + } + } +#else + int w_offset = gz * psc(c) * kernel_w * kernel_h; + + for (int z = 0; z < psc(c); z++) + { + int v_offset = z * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + sum += buffer_ld1(weight_data, w_offset + x) * buffer_ld1(bottom_blob_data, v_offset + x * dilation_w); + } + + v_offset += dilation_h * 
psc(w); + w_offset += kernel_w; + } + } +#endif + + if (activation_type == 1) + { + sum = max(sum, afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum = sum < afp(0.f) ? sum * slope : sum; + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum = clamp(sum, const_min, const_max); + } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } + if (activation_type == 5) + { + sum = sum * tanh(log(exp(sum) + afp(1.f))); + } + +#if NCNN_image_shader + image3d_st1(top_blob, ivec3(gx, gy, gz), sum); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st1(top_blob_data, gi, sum); +#endif +} diff --git a/source/device/vulkan/shaders/convolution_1x1s1d1.comp b/source/device/vulkan/shaders/convolution_1x1s1d1.comp new file mode 100644 index 000000000..947f21fbe --- /dev/null +++ b/source/device/vulkan/shaders/convolution_1x1s1d1.comp @@ -0,0 +1,187 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int activation_type = 0; +layout (constant_id = 8) const float activation_param_0 = 0; +layout (constant_id = 9) const float activation_param_1 = 0; + +#define shape_constant_id_offset 10 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +layout (binding = 2) 
uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +#if NCNN_fp16_packed +layout (binding = 0) readonly buffer bottom_blob { vec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { vec4 top_blob_data[]; }; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif +layout (binding = 2) readonly buffer weight_blob { sfp weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ +#if NCNN_image_shader + int gx = int(gl_GlobalInvocationID.x) * 4; + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc)) + return; +#else + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx * 4 >= psc(outcstep) || gy >= 1 || gz >= psc(outc)) + return; +#endif + + afpvec4 sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = afpvec4(image1d_ld1(bias_blob, gz)); +#else + sum = afpvec4(buffer_ld1(bias_data, gz)); +#endif + } + else + { + sum = afpvec4(0.f); + } + +#if NCNN_image_shader + ivec4 gx4 = gx + ivec4(0, 1, 2, 3); + + ivec4 sy4 = gx4 / psc(w); + ivec4 sx4 = gx4 % psc(w); + + for (int z = 0; z < psc(c); z++) + { + afp k = image3d_ld1(weight_blob, ivec3(0, z, gz)); + + sum.r += k * image3d_ld1(bottom_blob, ivec3(sx4.r, sy4.r, z)); + sum.g += k * image3d_ld1(bottom_blob, ivec3(sx4.g, sy4.g, z)); + sum.b += k * image3d_ld1(bottom_blob, ivec3(sx4.b, sy4.b, z)); + sum.a += k * image3d_ld1(bottom_blob, ivec3(sx4.a, sy4.a, z)); + } +#else + int w_offset = gz * psc(c); + int v_offset = gx; + + for (int z = 0; z < psc(c); z++) + { +#if NCNN_fp16_packed + sum += afp(weight_data[w_offset]) * afpvec4(bottom_blob_data[v_offset]); +#else + sum += buffer_ld1(weight_data, w_offset) * buffer_ld4(bottom_blob_data, v_offset); +#endif + + w_offset += 1; + v_offset += psc(cstep) / 4; + } +#endif + + if (activation_type == 1) + { + sum = max(sum, afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum = mix(sum, sum * afp(slope), lessThan(sum, afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum = clamp(sum, const_min, const_max); + } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } + if (activation_type == 5) + { + sum = sum * tanh(log(exp(sum) + afp(1.f))); + } + +#if NCNN_image_shader + image3d_st1(top_blob, ivec3(sx4.r, sy4.r, gz), sum.r); + image3d_st1(top_blob, ivec3(sx4.g, sy4.g, gz), sum.g); + image3d_st1(top_blob, ivec3(sx4.b, sy4.b, gz), sum.b); + image3d_st1(top_blob, ivec3(sx4.a, sy4.a, gz), sum.a); +#else + const int gi = gz * psc(outcstep) + gx; + +#if NCNN_fp16_packed + top_blob_data[gi] = sum; +#else + buffer_st4(top_blob_data, gi, sum); +#endif +#endif +} diff --git a/source/device/vulkan/shaders/convolution_pack1to4.comp b/source/device/vulkan/shaders/convolution_pack1to4.comp new file mode 100644 index 000000000..711f44aa9 --- /dev/null +++ b/source/device/vulkan/shaders/convolution_pack1to4.comp @@ -0,0 +1,183 @@ +// Tencent is pleased to support the 
open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int activation_type = 0; +layout (constant_id = 8) const float activation_param_0 = 0; +layout (constant_id = 9) const float activation_param_1 = 0; + +#define shape_constant_id_offset 10 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec4 sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld4(bias_blob, gz); +#else + sum = buffer_ld4(bias_data, gz); +#endif + } + else + { + sum = afpvec4(0.f); + } + +#if NCNN_image_shader + for (int z = 0; z < psc(c); z++) + { + int sy = 
gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + afp v = image3d_ld1(bottom_blob, ivec3(sx, sy, z)); + + afpvec4 k = image3d_ld4(weight_blob, ivec3(wx, z, gz)); + + sum += v * k; + + sx += dilation_w; + wx += 1; + } + + sy += dilation_h; + } + } +#else + int w_offset = gz * psc(c) * kernel_w * kernel_h; + + for (int z = 0; z < psc(c); z++) + { + int v_offset = z * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + afp v = buffer_ld1(bottom_blob_data, v_offset + x * dilation_w); + + afpvec4 k = buffer_ld4(weight_data, w_offset + x); + + sum += v * k; + } + + v_offset += dilation_h * psc(w); + w_offset += kernel_w; + } + } +#endif + + if (activation_type == 1) + { + sum = max(sum, afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum = mix(sum, sum * afp(slope), lessThan(sum, afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum = clamp(sum, const_min, const_max); + } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } + if (activation_type == 5) + { + sum = sum * tanh(log(exp(sum) + afp(1.f))); + } + +#if NCNN_image_shader + image3d_st4(top_blob, ivec3(gx, gy, gz), sum); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st4(top_blob_data, gi, sum); +#endif +} diff --git a/source/device/vulkan/shaders/convolution_pack1to8.comp b/source/device/vulkan/shaders/convolution_pack1to8.comp new file mode 100644 index 000000000..d9849b8fa --- /dev/null +++ b/source/device/vulkan/shaders/convolution_pack1to8.comp @@ -0,0 +1,193 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
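+// pack1-to-8 direct convolution: the input stores one channel per element (sfp) while the
+// output is packed eight channels per element (sfpvec8), so every kernel tap loads an
+// 8-wide weight vector and the scalar input value scales both halves of the accumulator.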
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int activation_type = 0; +layout (constant_id = 8) const float activation_param_0 = 0; +layout (constant_id = 9) const float activation_param_1 = 0; + +#define shape_constant_id_offset 10 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec8 sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld8(bias_blob, gz); +#else + sum = buffer_ld8(bias_data, gz); +#endif + } + else + { + sum = afpvec8(afpvec4(0.f), afpvec4(0.f)); + } + +#if NCNN_image_shader + for (int z = 0; z < psc(c); z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + afp v = image3d_ld1(bottom_blob, ivec3(sx, sy, z)); + + afpvec8 k = image3d_ld8(weight_blob, ivec3(wx, z, gz)); + + // sum += v * k; + sum[0] += v * k[0]; + sum[1] += v * k[1]; + + sx += dilation_w; + wx += 1; + } + + sy += dilation_h; + } + } +#else + int w_offset = gz * psc(c) * kernel_w * kernel_h; + + for (int z = 0; z < psc(c); z++) + { + int v_offset = z * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for 
(int x = 0; x < kernel_w; x++) + { + afp v = buffer_ld1(bottom_blob_data, v_offset + x * dilation_w); + + afpvec8 k = buffer_ld8(weight_data, w_offset + x); + + // sum += v * k; + sum[0] += v * k[0]; + sum[1] += v * k[1]; + } + + v_offset += dilation_h * psc(w); + w_offset += kernel_w; + } + } +#endif + + if (activation_type == 1) + { + sum[0] = max(sum[0], afp(0.f)); + sum[1] = max(sum[1], afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum[0] = mix(sum[0], sum[0] * afp(slope), lessThan(sum[0], afpvec4(0.f))); + sum[1] = mix(sum[1], sum[1] * afp(slope), lessThan(sum[1], afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum[0] = clamp(sum[0], const_min, const_max); + sum[1] = clamp(sum[1], const_min, const_max); + } + if (activation_type == 4) + { + sum[0] = afp(1.f) / (afp(1.f) + exp(-sum[0])); + sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1])); + } + if (activation_type == 5) + { + sum[0] = sum[0] * tanh(log(exp(sum[0]) + afp(1.f))); + sum[1] = sum[1] * tanh(log(exp(sum[1]) + afp(1.f))); + } + +#if NCNN_image_shader + image3d_st8(top_blob, ivec3(gx, gy, gz), sum); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st8(top_blob_data, gi, sum); +#endif +} diff --git a/source/device/vulkan/shaders/convolution_pack4.comp b/source/device/vulkan/shaders/convolution_pack4.comp new file mode 100644 index 000000000..5a714f86d --- /dev/null +++ b/source/device/vulkan/shaders/convolution_pack4.comp @@ -0,0 +1,203 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
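+// pack4 direct convolution: input and output both store four channels per element, so each
+// kernel tap is a 4x4 weight matrix and the inner loop is a vec4-by-mat4 multiply-accumulate.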
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int activation_type = 0; +layout (constant_id = 8) const float activation_param_0 = 0; +layout (constant_id = 9) const float activation_param_1 = 0; + +#define shape_constant_id_offset 10 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic) +// GL_EXT_shader_16bit_storage does not define f16mat4 type :( +layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; +#else +layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; }; +#endif +layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec4 sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld4(bias_blob, gz); +#else + sum = buffer_ld4(bias_data, gz); +#endif + } + else + { + sum = afpvec4(0.f); + } + +#if NCNN_image_shader + for (int z = 0; z < psc(c); z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v = image3d_ld4(bottom_blob, ivec3(sx, sy, z)); + + afpmat4 k = afpmat4( + image3d_ld4(weight_blob, ivec3(wx + 0, z, gz)), + image3d_ld4(weight_blob, ivec3(wx + 1, z, gz)), + image3d_ld4(weight_blob, ivec3(wx + 2, z, gz)), + image3d_ld4(weight_blob, ivec3(wx + 3, z, gz)) + ); + + sum += v * k; + + sx 
+= dilation_w; + wx += 4; + } + + sy += dilation_h; + } + } +#else + int w_offset = gz * psc(c) * kernel_w * kernel_h; + + for (int z = 0; z < psc(c); z++) + { + int v_offset = z * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v = buffer_ld4(bottom_blob_data, v_offset + x * dilation_w); + +#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic) + // GL_EXT_shader_16bit_storage does not define f16mat4 type :( + afpmat4 k = afpmat4( + buffer_ld4(weight_data, (w_offset + x) * 4 + 0), + buffer_ld4(weight_data, (w_offset + x) * 4 + 1), + buffer_ld4(weight_data, (w_offset + x) * 4 + 2), + buffer_ld4(weight_data, (w_offset + x) * 4 + 3) + ); +#else + afpmat4 k = sfp2afpmat4(weight_data[w_offset + x]); +#endif + + sum += v * k; + } + + v_offset += dilation_h * psc(w); + w_offset += kernel_w; + } + } +#endif + + if (activation_type == 1) + { + sum = max(sum, afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum = mix(sum, sum * afp(slope), lessThan(sum, afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum = clamp(sum, const_min, const_max); + } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } + if (activation_type == 5) + { + sum = sum * tanh(log(exp(sum) + afp(1.f))); + } + +#if NCNN_image_shader + image3d_st4(top_blob, ivec3(gx, gy, gz), sum); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st4(top_blob_data, gi, sum); +#endif +} diff --git a/source/device/vulkan/shaders/convolution_pack4_1x1s1d1.comp b/source/device/vulkan/shaders/convolution_pack4_1x1s1d1.comp new file mode 100644 index 000000000..a7efaefd7 --- /dev/null +++ b/source/device/vulkan/shaders/convolution_pack4_1x1s1d1.comp @@ -0,0 +1,237 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
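+// Specialized pack4 convolution for 1x1 kernels with stride 1 and dilation 1: the spatial
+// plane is treated as a flat array and each invocation computes four consecutive output
+// positions (sum0..sum3) that share one 4x4 weight matrix per input channel.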
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int activation_type = 0; +layout (constant_id = 8) const float activation_param_0 = 0; +layout (constant_id = 9) const float activation_param_1 = 0; + +#define shape_constant_id_offset 10 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic) +// GL_EXT_shader_16bit_storage does not define f16mat4 type :( +layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; +#else +layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; }; +#endif +layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ +#if NCNN_image_shader + int gx = int(gl_GlobalInvocationID.x) * 4; + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc)) + return; +#else + int gx = int(gl_GlobalInvocationID.x) * 4; + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc)) + return; +#endif + + afpvec4 sum0; + afpvec4 sum1; + afpvec4 sum2; + afpvec4 sum3; + + if (bias_term == 1) + { +#if NCNN_image_shader + afpvec4 b = image1d_ld4(bias_blob, gz); +#else + afpvec4 b = buffer_ld4(bias_data, gz); +#endif + sum0 = b; + sum1 = b; + sum2 = b; + sum3 = b; + } + else + { + sum0 = afpvec4(0.f); + sum1 = afpvec4(0.f); + sum2 = afpvec4(0.f); + sum3 = afpvec4(0.f); + } + +#if NCNN_image_shader + ivec4 gx4 = gx + ivec4(0, 1, 2, 3); + + ivec4 sy4 = gx4 / psc(w); + ivec4 sx4 = 
gx4 % psc(w); + + for (int z = 0; z < psc(c); z++) + { + afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(sx4.r, sy4.r, z)); + afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(sx4.g, sy4.g, z)); + afpvec4 v2 = image3d_ld4(bottom_blob, ivec3(sx4.b, sy4.b, z)); + afpvec4 v3 = image3d_ld4(bottom_blob, ivec3(sx4.a, sy4.a, z)); + + afpmat4 k = afpmat4( + image3d_ld4(weight_blob, ivec3(0, z, gz)), + image3d_ld4(weight_blob, ivec3(1, z, gz)), + image3d_ld4(weight_blob, ivec3(2, z, gz)), + image3d_ld4(weight_blob, ivec3(3, z, gz)) + ); + + sum0 += v0 * k; + sum1 += v1 * k; + sum2 += v2 * k; + sum3 += v3 * k; + } +#else + int w_offset = gz * psc(c); + int v_offset = gx; + + for (int z = 0; z < psc(c); z++) + { + afpvec4 v0 = buffer_ld4(bottom_blob_data, v_offset + 0); + afpvec4 v1 = buffer_ld4(bottom_blob_data, v_offset + 1); + afpvec4 v2 = buffer_ld4(bottom_blob_data, v_offset + 2); + afpvec4 v3 = buffer_ld4(bottom_blob_data, v_offset + 3); + +#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic) + // GL_EXT_shader_16bit_storage does not define f16mat4 type :( + afpmat4 k = afpmat4( + buffer_ld4(weight_data, w_offset * 4 + 0), + buffer_ld4(weight_data, w_offset * 4 + 1), + buffer_ld4(weight_data, w_offset * 4 + 2), + buffer_ld4(weight_data, w_offset * 4 + 3) + ); +#else + afpmat4 k = sfp2afpmat4(weight_data[w_offset]); +#endif + + sum0 += v0 * k; + sum1 += v1 * k; + sum2 += v2 * k; + sum3 += v3 * k; + + w_offset += 1; + v_offset += psc(cstep); + } +#endif + + if (activation_type == 1) + { + sum0 = max(sum0, afp(0.f)); + sum1 = max(sum1, afp(0.f)); + sum2 = max(sum2, afp(0.f)); + sum3 = max(sum3, afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum0 = mix(sum0, sum0 * afp(slope), lessThan(sum0, afpvec4(0.f))); + sum1 = mix(sum1, sum1 * afp(slope), lessThan(sum1, afpvec4(0.f))); + sum2 = mix(sum2, sum2 * afp(slope), lessThan(sum2, afpvec4(0.f))); + sum3 = mix(sum3, sum3 * afp(slope), lessThan(sum3, afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum0 = clamp(sum0, const_min, const_max); + sum1 = clamp(sum1, const_min, const_max); + sum2 = clamp(sum2, const_min, const_max); + sum3 = clamp(sum3, const_min, const_max); + } + if (activation_type == 4) + { + sum0 = afp(1.f) / (afp(1.f) + exp(-sum0)); + sum1 = afp(1.f) / (afp(1.f) + exp(-sum1)); + sum2 = afp(1.f) / (afp(1.f) + exp(-sum2)); + sum3 = afp(1.f) / (afp(1.f) + exp(-sum3)); + } + if (activation_type == 5) + { + sum0 = sum0 * tanh(log(exp(sum0) + afp(1.f))); + sum1 = sum1 * tanh(log(exp(sum1) + afp(1.f))); + sum2 = sum2 * tanh(log(exp(sum2) + afp(1.f))); + sum3 = sum3 * tanh(log(exp(sum3) + afp(1.f))); + } + +#if NCNN_image_shader + image3d_st4(top_blob, ivec3(sx4.r, sy4.r, gz), sum0); + image3d_st4(top_blob, ivec3(sx4.g, sy4.g, gz), sum1); + image3d_st4(top_blob, ivec3(sx4.b, sy4.b, gz), sum2); + image3d_st4(top_blob, ivec3(sx4.a, sy4.a, gz), sum3); +#else + int gi = gz * psc(outcstep) + gx; + + buffer_st4(top_blob_data, gi + 0, sum0); + if (gx + 1 < psc(outcstep)) buffer_st4(top_blob_data, gi + 1, sum1); + if (gx + 2 < psc(outcstep)) buffer_st4(top_blob_data, gi + 2, sum2); + if (gx + 3 < psc(outcstep)) buffer_st4(top_blob_data, gi + 3, sum3); +#endif +} diff --git a/source/device/vulkan/shaders/convolution_pack4_3x3s1d1_winograd23_gemm.comp b/source/device/vulkan/shaders/convolution_pack4_3x3s1d1_winograd23_gemm.comp new file mode 100644 index 000000000..40211c64f --- /dev/null +++ 
b/source/device/vulkan/shaders/convolution_pack4_3x3s1d1_winograd23_gemm.comp @@ -0,0 +1,139 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#define shape_constant_id_offset 0 +layout (constant_id = shape_constant_id_offset + 0) const int c = 0; +layout (constant_id = shape_constant_id_offset + 1) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 2) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 3) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 4) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_tm_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_tm_blob; +layout (binding = 2) uniform unfp sampler3D weight_tm_blob; +#else +layout (binding = 0) readonly buffer bottom_tm_blob { sfpvec4 bottom_tm_blob_data[]; }; +layout (binding = 1) writeonly buffer top_tm_blob { sfpvec4 top_tm_blob_data[]; }; +#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic) +// GL_EXT_shader_16bit_storage does not define f16mat4 type :( +layout (binding = 2) readonly buffer weight_tm_blob { sfpvec4 weight_tm_data[]; }; +#else +layout (binding = 2) readonly buffer weight_tm_blob { sfpmat4 weight_tm_data[]; }; +#endif +#endif + +layout (push_constant) uniform parameter +{ + int c; + int cstep; + + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y) * 4; + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= 16 || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec4 sum0 = afpvec4(0.f); + afpvec4 sum1 = afpvec4(0.f); + afpvec4 sum2 = afpvec4(0.f); + afpvec4 sum3 = afpvec4(0.f); + +#if NCNN_image_shader + int wx = gx * 4; + + for (int z = 0; z < psc(c); z++) + { + afpvec4 v0 = image3d_ld4(bottom_tm_blob, ivec3(gx, gy + 0, z)); + afpvec4 v1 = image3d_ld4(bottom_tm_blob, ivec3(gx, gy + 1, z)); + afpvec4 v2 = image3d_ld4(bottom_tm_blob, ivec3(gx, gy + 2, z)); + afpvec4 v3 = image3d_ld4(bottom_tm_blob, ivec3(gx, gy + 3, z)); + + afpmat4 k = afpmat4( + image3d_ld4(weight_tm_blob, ivec3(wx + 0, z, gz)), + image3d_ld4(weight_tm_blob, ivec3(wx + 1, z, gz)), + image3d_ld4(weight_tm_blob, ivec3(wx + 2, z, gz)), + image3d_ld4(weight_tm_blob, ivec3(wx + 3, z, gz)) + ); + + sum0 += v0 * k; + sum1 += v1 * k; + sum2 += v2 * k; + sum3 += v3 * k; + } +#else + int v_offset = gy * 16 + gx; + int w_offset = gz * psc(c) * 16 + gx; + + for (int z = 0; z < psc(c); z++) + { + afpvec4 v0 = buffer_ld4(bottom_tm_blob_data, v_offset + 0); + afpvec4 v1 = buffer_ld4(bottom_tm_blob_data, v_offset + 16); + afpvec4 v2 = 
buffer_ld4(bottom_tm_blob_data, v_offset + 32); + afpvec4 v3 = buffer_ld4(bottom_tm_blob_data, v_offset + 48); + +#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic) + // GL_EXT_shader_16bit_storage does not define f16mat4 type :( + afpmat4 k = afpmat4( + buffer_ld4(weight_tm_data, w_offset * 4 + 0), + buffer_ld4(weight_tm_data, w_offset * 4 + 1), + buffer_ld4(weight_tm_data, w_offset * 4 + 2), + buffer_ld4(weight_tm_data, w_offset * 4 + 3) + ); +#else + afpmat4 k = sfpmat4(weight_tm_data[w_offset]); +#endif + + sum0 += v0 * k; + sum1 += v1 * k; + sum2 += v2 * k; + sum3 += v3 * k; + + v_offset += psc(cstep); + w_offset += 16; + } +#endif + +#if NCNN_image_shader + image3d_st4(top_tm_blob, ivec3(gx, gy + 0, gz), sum0); + image3d_st4(top_tm_blob, ivec3(gx, gy + 1, gz), sum1); + image3d_st4(top_tm_blob, ivec3(gx, gy + 2, gz), sum2); + image3d_st4(top_tm_blob, ivec3(gx, gy + 3, gz), sum3); +#else + int gi = gz * psc(outcstep) + gy * 16 + gx; + + buffer_st4(top_tm_blob_data, gi + 0, sum0); + if (gy + 1 < psc(outh)) buffer_st4(top_tm_blob_data, gi + 16, sum1); + if (gy + 2 < psc(outh)) buffer_st4(top_tm_blob_data, gi + 32, sum2); + if (gy + 3 < psc(outh)) buffer_st4(top_tm_blob_data, gi + 48, sum3); +#endif +} diff --git a/source/device/vulkan/shaders/convolution_pack4_3x3s1d1_winograd23_transform_input.comp b/source/device/vulkan/shaders/convolution_pack4_3x3s1d1_winograd23_transform_input.comp new file mode 100644 index 000000000..8734d01de --- /dev/null +++ b/source/device/vulkan/shaders/convolution_pack4_3x3s1d1_winograd23_transform_input.comp @@ -0,0 +1,202 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
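+// Winograd F(2x2, 3x3) input transform: each invocation loads a 4x4 tile of the pack4 input
+// with a step of 2 and applies the transform B^T * d * B given by the itm matrix noted below,
+// storing the 16 transformed vec4 values of that tile for the winograd23 gemm pass.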
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#define shape_constant_id_offset 0 +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; +layout (constant_id = shape_constant_id_offset + 2) const int c = 0; +layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 4) const int outcstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int block_x = 0; +layout (constant_id = shape_constant_id_offset + 6) const int block_y = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D bottom_tm_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer bottom_tm_blob { sfpvec4 bottom_tm_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int w; + int h; + int c; + int cstep; + + int outcstep; + + int block_x; + int block_y; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= p.block_x || gy >= p.block_y || gz >= psc(c)) + return; + + // load 4x4 +#if NCNN_image_shader + int sx = gx * 2; + int sy = gy * 2; + + afpvec4 v00 = image3d_ld4(bottom_blob, ivec3(sx + 0, sy + 0, gz)); + afpvec4 v01 = image3d_ld4(bottom_blob, ivec3(sx + 1, sy + 0, gz)); + afpvec4 v02 = image3d_ld4(bottom_blob, ivec3(sx + 2, sy + 0, gz)); + afpvec4 v03 = image3d_ld4(bottom_blob, ivec3(sx + 3, sy + 0, gz)); + + afpvec4 v10 = image3d_ld4(bottom_blob, ivec3(sx + 0, sy + 1, gz)); + afpvec4 v11 = image3d_ld4(bottom_blob, ivec3(sx + 1, sy + 1, gz)); + afpvec4 v12 = image3d_ld4(bottom_blob, ivec3(sx + 2, sy + 1, gz)); + afpvec4 v13 = image3d_ld4(bottom_blob, ivec3(sx + 3, sy + 1, gz)); + + afpvec4 v20 = image3d_ld4(bottom_blob, ivec3(sx + 0, sy + 2, gz)); + afpvec4 v21 = image3d_ld4(bottom_blob, ivec3(sx + 1, sy + 2, gz)); + afpvec4 v22 = image3d_ld4(bottom_blob, ivec3(sx + 2, sy + 2, gz)); + afpvec4 v23 = image3d_ld4(bottom_blob, ivec3(sx + 3, sy + 2, gz)); + + afpvec4 v30 = image3d_ld4(bottom_blob, ivec3(sx + 0, sy + 3, gz)); + afpvec4 v31 = image3d_ld4(bottom_blob, ivec3(sx + 1, sy + 3, gz)); + afpvec4 v32 = image3d_ld4(bottom_blob, ivec3(sx + 2, sy + 3, gz)); + afpvec4 v33 = image3d_ld4(bottom_blob, ivec3(sx + 3, sy + 3, gz)); +#else + int v_offset_0 = gz * psc(cstep) + gy * 2 * psc(w) + gx * 2; + ivec4 v_offset = v_offset_0 + ivec4(0, 1, 2, 3) * psc(w); + + afpvec4 v00 = buffer_ld4(bottom_blob_data, v_offset.r + 0); + afpvec4 v01 = buffer_ld4(bottom_blob_data, v_offset.r + 1); + afpvec4 v02 = buffer_ld4(bottom_blob_data, v_offset.r + 2); + afpvec4 v03 = buffer_ld4(bottom_blob_data, v_offset.r + 3); + + afpvec4 v10 = buffer_ld4(bottom_blob_data, v_offset.g + 0); + afpvec4 v11 = buffer_ld4(bottom_blob_data, v_offset.g + 1); + afpvec4 v12 = buffer_ld4(bottom_blob_data, v_offset.g + 2); + afpvec4 v13 = buffer_ld4(bottom_blob_data, v_offset.g + 3); + + afpvec4 v20 = buffer_ld4(bottom_blob_data, v_offset.b + 0); + afpvec4 v21 = buffer_ld4(bottom_blob_data, v_offset.b + 1); + afpvec4 v22 = buffer_ld4(bottom_blob_data, v_offset.b + 2); + afpvec4 v23 = buffer_ld4(bottom_blob_data, v_offset.b + 3); + + afpvec4 v30 = 
buffer_ld4(bottom_blob_data, v_offset.a + 0); + afpvec4 v31 = buffer_ld4(bottom_blob_data, v_offset.a + 1); + afpvec4 v32 = buffer_ld4(bottom_blob_data, v_offset.a + 2); + afpvec4 v33 = buffer_ld4(bottom_blob_data, v_offset.a + 3); +#endif + + // const float itm[4][4] = { + // {1.0f, 0.0f, -1.0f, 0.0f}, + // {0.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, -1.0f, 1.0f, 0.0f}, + // {0.0f, -1.0f, 0.0f, 1.0f} + // }; + + // implicit transpose + afpvec4 m00 = v00 - v02; + afpvec4 m01 = v10 - v12; + afpvec4 m02 = v20 - v22; + afpvec4 m03 = v30 - v32; + + afpvec4 m10 = v02 + v01; + afpvec4 m11 = v12 + v11; + afpvec4 m12 = v22 + v21; + afpvec4 m13 = v32 + v31; + + afpvec4 m20 = v02 - v01; + afpvec4 m21 = v12 - v11; + afpvec4 m22 = v22 - v21; + afpvec4 m23 = v32 - v31; + + afpvec4 m30 = v03 - v01; + afpvec4 m31 = v13 - v11; + afpvec4 m32 = v23 - v21; + afpvec4 m33 = v33 - v31; + + v00 = m00 - m02; + v10 = m10 - m12; + v20 = m20 - m22; + v30 = m30 - m32; + + v01 = m02 + m01; + v11 = m12 + m11; + v21 = m22 + m21; + v31 = m32 + m31; + + v02 = m02 - m01; + v12 = m12 - m11; + v22 = m22 - m21; + v32 = m32 - m31; + + v03 = m03 - m01; + v13 = m13 - m11; + v23 = m23 - m21; + v33 = m33 - m31; + + // store 16 +#if NCNN_image_shader + int y = gy * p.block_x + gx; + + image3d_st4(bottom_tm_blob, ivec3(0, y, gz), v00); + image3d_st4(bottom_tm_blob, ivec3(1, y, gz), v01); + image3d_st4(bottom_tm_blob, ivec3(2, y, gz), v02); + image3d_st4(bottom_tm_blob, ivec3(3, y, gz), v03); + image3d_st4(bottom_tm_blob, ivec3(4, y, gz), v10); + image3d_st4(bottom_tm_blob, ivec3(5, y, gz), v11); + image3d_st4(bottom_tm_blob, ivec3(6, y, gz), v12); + image3d_st4(bottom_tm_blob, ivec3(7, y, gz), v13); + image3d_st4(bottom_tm_blob, ivec3(8, y, gz), v20); + image3d_st4(bottom_tm_blob, ivec3(9, y, gz), v21); + image3d_st4(bottom_tm_blob, ivec3(10, y, gz), v22); + image3d_st4(bottom_tm_blob, ivec3(11, y, gz), v23); + image3d_st4(bottom_tm_blob, ivec3(12, y, gz), v30); + image3d_st4(bottom_tm_blob, ivec3(13, y, gz), v31); + image3d_st4(bottom_tm_blob, ivec3(14, y, gz), v32); + image3d_st4(bottom_tm_blob, ivec3(15, y, gz), v33); +#else + int v_tm_offset = gz * psc(outcstep) + (gy * p.block_x + gx) * 16; + + buffer_st4(bottom_tm_blob_data, v_tm_offset + 0, v00); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 1, v01); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 2, v02); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 3, v03); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 4, v10); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 5, v11); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 6, v12); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 7, v13); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 8, v20); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 9, v21); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 10, v22); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 11, v23); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 12, v30); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 13, v31); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 14, v32); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 15, v33); +#endif +} diff --git a/source/device/vulkan/shaders/convolution_pack4_3x3s1d1_winograd23_transform_output.comp b/source/device/vulkan/shaders/convolution_pack4_3x3s1d1_winograd23_transform_output.comp new file mode 100644 index 000000000..c693e74a4 --- /dev/null +++ b/source/device/vulkan/shaders/convolution_pack4_3x3s1d1_winograd23_transform_output.comp @@ -0,0 +1,209 @@ +// Tencent is pleased to support the open source community by making 
ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int bias_term = 0; +layout (constant_id = 1) const int activation_type = 0; +layout (constant_id = 2) const float activation_param_0 = 0; +layout (constant_id = 3) const float activation_param_1 = 0; + +#define shape_constant_id_offset 4 +layout (constant_id = shape_constant_id_offset + 0) const int c = 0; +layout (constant_id = shape_constant_id_offset + 1) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 2) const int block_x = 0; +layout (constant_id = shape_constant_id_offset + 3) const int block_y = 0; + +layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D top_tm_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer top_tm_blob { sfpvec4 top_tm_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +layout (binding = 2) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int c; + int cstep; + + int block_x; + int block_y; + + int outw; + int outh; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= p.block_x || gy >= p.block_y || gz >= psc(c)) + return; + + // load 16 +#if NCNN_image_shader + int sy = gy * p.block_x + gx; + + afpvec4 v00 = image3d_ld4(top_tm_blob, ivec3(0, sy, gz)); + afpvec4 v01 = image3d_ld4(top_tm_blob, ivec3(1, sy, gz)); + afpvec4 v02 = image3d_ld4(top_tm_blob, ivec3(2, sy, gz)); + afpvec4 v03 = image3d_ld4(top_tm_blob, ivec3(3, sy, gz)); + afpvec4 v10 = image3d_ld4(top_tm_blob, ivec3(4, sy, gz)); + afpvec4 v11 = image3d_ld4(top_tm_blob, ivec3(5, sy, gz)); + afpvec4 v12 = image3d_ld4(top_tm_blob, ivec3(6, sy, gz)); + afpvec4 v13 = image3d_ld4(top_tm_blob, ivec3(7, sy, gz)); + afpvec4 v20 = image3d_ld4(top_tm_blob, ivec3(8, sy, gz)); + afpvec4 v21 = image3d_ld4(top_tm_blob, ivec3(9, sy, gz)); + afpvec4 v22 = image3d_ld4(top_tm_blob, ivec3(10, sy, gz)); + afpvec4 v23 = image3d_ld4(top_tm_blob, ivec3(11, sy, gz)); + afpvec4 v30 = image3d_ld4(top_tm_blob, ivec3(12, sy, gz)); + afpvec4 v31 = image3d_ld4(top_tm_blob, ivec3(13, sy, gz)); + afpvec4 v32 = image3d_ld4(top_tm_blob, ivec3(14, sy, gz)); + afpvec4 v33 = image3d_ld4(top_tm_blob, ivec3(15, sy, gz)); +#else + int v_tm_offset = gz * psc(cstep) + 
(gy * p.block_x + gx) * 16; + + afpvec4 v00 = buffer_ld4(top_tm_blob_data, v_tm_offset + 0); + afpvec4 v01 = buffer_ld4(top_tm_blob_data, v_tm_offset + 1); + afpvec4 v02 = buffer_ld4(top_tm_blob_data, v_tm_offset + 2); + afpvec4 v03 = buffer_ld4(top_tm_blob_data, v_tm_offset + 3); + afpvec4 v10 = buffer_ld4(top_tm_blob_data, v_tm_offset + 4); + afpvec4 v11 = buffer_ld4(top_tm_blob_data, v_tm_offset + 5); + afpvec4 v12 = buffer_ld4(top_tm_blob_data, v_tm_offset + 6); + afpvec4 v13 = buffer_ld4(top_tm_blob_data, v_tm_offset + 7); + afpvec4 v20 = buffer_ld4(top_tm_blob_data, v_tm_offset + 8); + afpvec4 v21 = buffer_ld4(top_tm_blob_data, v_tm_offset + 9); + afpvec4 v22 = buffer_ld4(top_tm_blob_data, v_tm_offset + 10); + afpvec4 v23 = buffer_ld4(top_tm_blob_data, v_tm_offset + 11); + afpvec4 v30 = buffer_ld4(top_tm_blob_data, v_tm_offset + 12); + afpvec4 v31 = buffer_ld4(top_tm_blob_data, v_tm_offset + 13); + afpvec4 v32 = buffer_ld4(top_tm_blob_data, v_tm_offset + 14); + afpvec4 v33 = buffer_ld4(top_tm_blob_data, v_tm_offset + 15); +#endif + + // const float itm[2][4] = { + // {1.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 1.0f} + // }; + + // implicit transpose + afpvec4 m00 = v00 + v01 + v02; + afpvec4 m01 = v10 + v11 + v12; + afpvec4 m02 = v20 + v21 + v22; + afpvec4 m03 = v30 + v31 + v32; + + afpvec4 m10 = v01 - v02 + v03; + afpvec4 m11 = v11 - v12 + v13; + afpvec4 m12 = v21 - v22 + v23; + afpvec4 m13 = v31 - v32 + v33; + + if (bias_term == 1) + { +#if NCNN_image_shader + const afpvec4 bias_value = image1d_ld4(bias_blob, gz); +#else + const afpvec4 bias_value = buffer_ld4(bias_data, gz); +#endif + + v00 = bias_value + m00 + m01 + m02; + v10 = bias_value + m10 + m11 + m12; + + v01 = bias_value + m01 - m02 + m03; + v11 = bias_value + m11 - m12 + m13; + } + else + { + v00 = m00 + m01 + m02; + v10 = m10 + m11 + m12; + + v01 = m01 - m02 + m03; + v11 = m11 - m12 + m13; + } + + if (activation_type == 1) + { + v00 = max(v00, afp(0.f)); + v10 = max(v10, afp(0.f)); + v01 = max(v01, afp(0.f)); + v11 = max(v11, afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + v00 = mix(v00, v00 * afp(slope), lessThan(v00, afpvec4(0.f))); + v10 = mix(v10, v10 * afp(slope), lessThan(v10, afpvec4(0.f))); + v01 = mix(v01, v01 * afp(slope), lessThan(v01, afpvec4(0.f))); + v11 = mix(v11, v11 * afp(slope), lessThan(v11, afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + v00 = clamp(v00, const_min, const_max); + v10 = clamp(v10, const_min, const_max); + v01 = clamp(v01, const_min, const_max); + v11 = clamp(v11, const_min, const_max); + } + if (activation_type == 4) + { + v00 = afp(1.f) / (afp(1.f) + exp(-v00)); + v10 = afp(1.f) / (afp(1.f) + exp(-v10)); + v01 = afp(1.f) / (afp(1.f) + exp(-v01)); + v11 = afp(1.f) / (afp(1.f) + exp(-v11)); + } + if (activation_type == 5) + { + v00 = v00 * tanh(log(exp(v00) + afp(1.f))); + v01 = v01 * tanh(log(exp(v01) + afp(1.f))); + v10 = v10 * tanh(log(exp(v10) + afp(1.f))); + v11 = v11 * tanh(log(exp(v11) + afp(1.f))); + } + + // store 2x2 +#if NCNN_image_shader + int x = gx * 2; + int y = gy * 2; + + image3d_st4(top_blob, ivec3(x, y, gz), v00); + image3d_st4(top_blob, ivec3(x + 1, y, gz), v01); + image3d_st4(top_blob, ivec3(x, y + 1, gz), v10); + image3d_st4(top_blob, ivec3(x + 1, y + 1, gz), v11); +#else + int v_offset_0 = gz * psc(outcstep) + gy * 2 * psc(outw) + gx * 2; + int v_offset_1 = v_offset_0 + psc(outw); + + buffer_st4(top_blob_data, 
v_offset_0 + 0, v00); + buffer_st4(top_blob_data, v_offset_0 + 1, v01); + buffer_st4(top_blob_data, v_offset_1 + 0, v10); + buffer_st4(top_blob_data, v_offset_1 + 1, v11); +#endif +} diff --git a/source/device/vulkan/shaders/convolution_pack4to1.comp b/source/device/vulkan/shaders/convolution_pack4to1.comp new file mode 100644 index 000000000..b318f7562 --- /dev/null +++ b/source/device/vulkan/shaders/convolution_pack4to1.comp @@ -0,0 +1,183 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int activation_type = 0; +layout (constant_id = 8) const float activation_param_0 = 0; +layout (constant_id = 9) const float activation_param_1 = 0; + +#define shape_constant_id_offset 10 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; 
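+
+// pack4-to-1: the input is pack4 (sfpvec4) and the output is unpacked (sfp); each kernel tap
+// contributes a single scalar via dot(v, k), reducing four packed channels to one output channel.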
+ +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afp sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld1(bias_blob, gz); +#else + sum = buffer_ld1(bias_data, gz); +#endif + } + else + { + sum = afp(0.f); + } + +#if NCNN_image_shader + for (int z = 0; z < psc(c); z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v = image3d_ld4(bottom_blob, ivec3(sx, sy, z)); + + afpvec4 k = image3d_ld4(weight_blob, ivec3(wx, z, gz)); + + sum += dot(v, k); + + sx += dilation_w; + wx += 1; + } + + sy += dilation_h; + } + } +#else + int w_offset = gz * psc(c) * kernel_w * kernel_h; + + for (int z = 0; z < psc(c); z++) + { + int v_offset = z * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v = buffer_ld4(bottom_blob_data, v_offset + x * dilation_w); + + afpvec4 k = buffer_ld4(weight_data, w_offset + x); + + sum += dot(v, k); + } + + v_offset += dilation_h * psc(w); + w_offset += kernel_w; + } + } +#endif + + if (activation_type == 1) + { + sum = max(sum, afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum = sum < afp(0.f) ? sum * slope : sum; + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum = clamp(sum, const_min, const_max); + } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } + if (activation_type == 5) + { + sum = sum * tanh(log(exp(sum) + afp(1.f))); + } + +#if NCNN_image_shader + image3d_st1(top_blob, ivec3(gx, gy, gz), sum); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st1(top_blob_data, gi, sum); +#endif +} diff --git a/source/device/vulkan/shaders/convolution_pack4to8.comp b/source/device/vulkan/shaders/convolution_pack4to8.comp new file mode 100644 index 000000000..aed8ad6a9 --- /dev/null +++ b/source/device/vulkan/shaders/convolution_pack4to8.comp @@ -0,0 +1,219 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
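+// pack4-to-8: the input is pack4 (sfpvec4) and the output is pack8 (sfpvec8); every kernel tap
+// loads eight vec4 weight columns (k0..k7) and accumulates eight dot(v, k_i) products, one per
+// packed output channel.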
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int activation_type = 0; +layout (constant_id = 8) const float activation_param_0 = 0; +layout (constant_id = 9) const float activation_param_1 = 0; + +#define shape_constant_id_offset 10 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec8 sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld8(bias_blob, gz); +#else + sum = buffer_ld8(bias_data, gz); +#endif + } + else + { + sum = afpvec8(afpvec4(0.f), afpvec4(0.f)); + } + +#if NCNN_image_shader + for (int z = 0; z < psc(c); z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v = image3d_ld4(bottom_blob, ivec3(sx, sy, z)); + + afpvec4 k0 = image3d_ld4(weight_blob, ivec3(wx + 0, z, gz)); + afpvec4 k1 = image3d_ld4(weight_blob, ivec3(wx + 1, z, gz)); + afpvec4 k2 = image3d_ld4(weight_blob, ivec3(wx + 2, z, gz)); + afpvec4 k3 = image3d_ld4(weight_blob, ivec3(wx + 3, z, gz)); + afpvec4 k4 = image3d_ld4(weight_blob, ivec3(wx + 4, z, gz)); + afpvec4 k5 = image3d_ld4(weight_blob, ivec3(wx + 5, z, gz)); + afpvec4 k6 = 
image3d_ld4(weight_blob, ivec3(wx + 6, z, gz)); + afpvec4 k7 = image3d_ld4(weight_blob, ivec3(wx + 7, z, gz)); + + // sum += v * k; + sum[0].r += dot(v, k0); + sum[0].g += dot(v, k1); + sum[0].b += dot(v, k2); + sum[0].a += dot(v, k3); + sum[1].r += dot(v, k4); + sum[1].g += dot(v, k5); + sum[1].b += dot(v, k6); + sum[1].a += dot(v, k7); + + sx += dilation_w; + wx += 8; + } + + sy += dilation_h; + } + } +#else + int w_offset = gz * psc(c) * kernel_w * kernel_h; + + for (int z = 0; z < psc(c); z++) + { + int v_offset = z * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v = buffer_ld4(bottom_blob_data, v_offset + x * dilation_w); + + afpvec4 k0 = buffer_ld4(weight_data, (w_offset + x) * 8 + 0); + afpvec4 k1 = buffer_ld4(weight_data, (w_offset + x) * 8 + 1); + afpvec4 k2 = buffer_ld4(weight_data, (w_offset + x) * 8 + 2); + afpvec4 k3 = buffer_ld4(weight_data, (w_offset + x) * 8 + 3); + afpvec4 k4 = buffer_ld4(weight_data, (w_offset + x) * 8 + 4); + afpvec4 k5 = buffer_ld4(weight_data, (w_offset + x) * 8 + 5); + afpvec4 k6 = buffer_ld4(weight_data, (w_offset + x) * 8 + 6); + afpvec4 k7 = buffer_ld4(weight_data, (w_offset + x) * 8 + 7); + + // sum += v * k; + sum[0].r += dot(v, k0); + sum[0].g += dot(v, k1); + sum[0].b += dot(v, k2); + sum[0].a += dot(v, k3); + sum[1].r += dot(v, k4); + sum[1].g += dot(v, k5); + sum[1].b += dot(v, k6); + sum[1].a += dot(v, k7); + } + + v_offset += dilation_h * psc(w); + w_offset += kernel_w; + } + } +#endif + + if (activation_type == 1) + { + sum[0] = max(sum[0], afp(0.f)); + sum[1] = max(sum[1], afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum[0] = mix(sum[0], sum[0] * afp(slope), lessThan(sum[0], afpvec4(0.f))); + sum[1] = mix(sum[1], sum[1] * afp(slope), lessThan(sum[1], afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum[0] = clamp(sum[0], const_min, const_max); + sum[1] = clamp(sum[1], const_min, const_max); + } + if (activation_type == 4) + { + sum[0] = afp(1.f) / (afp(1.f) + exp(-sum[0])); + sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1])); + } + if (activation_type == 5) + { + sum[0] = sum[0] * tanh(log(exp(sum[0]) + afp(1.f))); + sum[1] = sum[1] * tanh(log(exp(sum[1]) + afp(1.f))); + } + +#if NCNN_image_shader + image3d_st8(top_blob, ivec3(gx, gy, gz), sum); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st8(top_blob_data, gi, sum); +#endif +} diff --git a/source/device/vulkan/shaders/convolution_pack8.comp b/source/device/vulkan/shaders/convolution_pack8.comp new file mode 100644 index 000000000..7c1d5cbc2 --- /dev/null +++ b/source/device/vulkan/shaders/convolution_pack8.comp @@ -0,0 +1,219 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int activation_type = 0; +layout (constant_id = 8) const float activation_param_0 = 0; +layout (constant_id = 9) const float activation_param_1 = 0; + +#define shape_constant_id_offset 10 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec8 sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld8(bias_blob, gz); +#else + sum = buffer_ld8(bias_data, gz); +#endif + } + else + { + sum = afpvec8(afpvec4(0.f), afpvec4(0.f)); + } + +#if NCNN_image_shader + for (int z = 0; z < psc(c); z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx, sy, z)); + + afpvec8 k0 = image3d_ld8(weight_blob, ivec3(wx + 0, z, gz)); + afpvec8 k1 = image3d_ld8(weight_blob, ivec3(wx + 1, z, gz)); + afpvec8 k2 = image3d_ld8(weight_blob, ivec3(wx + 2, z, gz)); + afpvec8 k3 = image3d_ld8(weight_blob, ivec3(wx + 3, z, gz)); + afpvec8 k4 = image3d_ld8(weight_blob, ivec3(wx + 
4, z, gz)); + afpvec8 k5 = image3d_ld8(weight_blob, ivec3(wx + 5, z, gz)); + afpvec8 k6 = image3d_ld8(weight_blob, ivec3(wx + 6, z, gz)); + afpvec8 k7 = image3d_ld8(weight_blob, ivec3(wx + 7, z, gz)); + + // sum += v * k; + sum[0].r += dot(v[0], k0[0]) + dot(v[1], k0[1]); + sum[0].g += dot(v[0], k1[0]) + dot(v[1], k1[1]); + sum[0].b += dot(v[0], k2[0]) + dot(v[1], k2[1]); + sum[0].a += dot(v[0], k3[0]) + dot(v[1], k3[1]); + sum[1].r += dot(v[0], k4[0]) + dot(v[1], k4[1]); + sum[1].g += dot(v[0], k5[0]) + dot(v[1], k5[1]); + sum[1].b += dot(v[0], k6[0]) + dot(v[1], k6[1]); + sum[1].a += dot(v[0], k7[0]) + dot(v[1], k7[1]); + + sx += dilation_w; + wx += 8; + } + + sy += dilation_h; + } + } +#else + int w_offset = gz * psc(c) * kernel_w * kernel_h; + + for (int z = 0; z < psc(c); z++) + { + int v_offset = z * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v = buffer_ld8(bottom_blob_data, v_offset + x * dilation_w); + + afpvec8 k0 = buffer_ld8(weight_data, (w_offset + x) * 8 + 0); + afpvec8 k1 = buffer_ld8(weight_data, (w_offset + x) * 8 + 1); + afpvec8 k2 = buffer_ld8(weight_data, (w_offset + x) * 8 + 2); + afpvec8 k3 = buffer_ld8(weight_data, (w_offset + x) * 8 + 3); + afpvec8 k4 = buffer_ld8(weight_data, (w_offset + x) * 8 + 4); + afpvec8 k5 = buffer_ld8(weight_data, (w_offset + x) * 8 + 5); + afpvec8 k6 = buffer_ld8(weight_data, (w_offset + x) * 8 + 6); + afpvec8 k7 = buffer_ld8(weight_data, (w_offset + x) * 8 + 7); + + // sum += v * k + sum[0].r += dot(v[0], k0[0]) + dot(v[1], k0[1]); + sum[0].g += dot(v[0], k1[0]) + dot(v[1], k1[1]); + sum[0].b += dot(v[0], k2[0]) + dot(v[1], k2[1]); + sum[0].a += dot(v[0], k3[0]) + dot(v[1], k3[1]); + sum[1].r += dot(v[0], k4[0]) + dot(v[1], k4[1]); + sum[1].g += dot(v[0], k5[0]) + dot(v[1], k5[1]); + sum[1].b += dot(v[0], k6[0]) + dot(v[1], k6[1]); + sum[1].a += dot(v[0], k7[0]) + dot(v[1], k7[1]); + } + + v_offset += dilation_h * psc(w); + w_offset += kernel_w; + } + } +#endif + + if (activation_type == 1) + { + sum[0] = max(sum[0], afp(0.f)); + sum[1] = max(sum[1], afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum[0] = mix(sum[0], sum[0] * afp(slope), lessThan(sum[0], afpvec4(0.f))); + sum[1] = mix(sum[1], sum[1] * afp(slope), lessThan(sum[1], afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum[0] = clamp(sum[0], const_min, const_max); + sum[1] = clamp(sum[1], const_min, const_max); + } + if (activation_type == 4) + { + sum[0] = afp(1.f) / (afp(1.f) + exp(-sum[0])); + sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1])); + } + if (activation_type == 5) + { + sum[0] = sum[0] * tanh(log(exp(sum[0]) + afp(1.f))); + sum[1] = sum[1] * tanh(log(exp(sum[1]) + afp(1.f))); + } + +#if NCNN_image_shader + image3d_st8(top_blob, ivec3(gx, gy, gz), sum); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st8(top_blob_data, gi, sum); +#endif +} diff --git a/source/device/vulkan/shaders/convolution_pack8_1x1s1d1.comp b/source/device/vulkan/shaders/convolution_pack8_1x1s1d1.comp new file mode 100644 index 000000000..48c548efb --- /dev/null +++ b/source/device/vulkan/shaders/convolution_pack8_1x1s1d1.comp @@ -0,0 +1,327 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. 
+// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int activation_type = 0; +layout (constant_id = 8) const float activation_param_0 = 0; +layout (constant_id = 9) const float activation_param_1 = 0; + +#define shape_constant_id_offset 10 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ +#if NCNN_image_shader + int gx = int(gl_GlobalInvocationID.x) * 4; + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc)) + return; +#else + int gx = int(gl_GlobalInvocationID.x) * 4; + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc)) + return; +#endif + + afpvec8 sum0; + afpvec8 sum1; + afpvec8 sum2; + afpvec8 sum3; + + if (bias_term == 1) + { +#if 
NCNN_image_shader + afpvec8 b = image1d_ld8(bias_blob, gz); +#else + afpvec8 b = buffer_ld8(bias_data, gz); +#endif + sum0 = b; + sum1 = b; + sum2 = b; + sum3 = b; + } + else + { + sum0 = afpvec8(afpvec4(0.f), afpvec4(0.f)); + sum1 = afpvec8(afpvec4(0.f), afpvec4(0.f)); + sum2 = afpvec8(afpvec4(0.f), afpvec4(0.f)); + sum3 = afpvec8(afpvec4(0.f), afpvec4(0.f)); + } + +#if NCNN_image_shader + ivec4 gx4 = gx + ivec4(0, 1, 2, 3); + + ivec4 sy4 = gx4 / psc(w); + ivec4 sx4 = gx4 % psc(w); + + for (int z = 0; z < psc(c); z++) + { + afpvec8 v0 = image3d_ld8(bottom_blob, ivec3(sx4.r, sy4.r, z)); + afpvec8 v1 = image3d_ld8(bottom_blob, ivec3(sx4.g, sy4.g, z)); + afpvec8 v2 = image3d_ld8(bottom_blob, ivec3(sx4.b, sy4.b, z)); + afpvec8 v3 = image3d_ld8(bottom_blob, ivec3(sx4.a, sy4.a, z)); + + afpvec8 k0 = image3d_ld8(weight_blob, ivec3(0, z, gz)); + afpvec8 k1 = image3d_ld8(weight_blob, ivec3(1, z, gz)); + afpvec8 k2 = image3d_ld8(weight_blob, ivec3(2, z, gz)); + afpvec8 k3 = image3d_ld8(weight_blob, ivec3(3, z, gz)); + afpvec8 k4 = image3d_ld8(weight_blob, ivec3(4, z, gz)); + afpvec8 k5 = image3d_ld8(weight_blob, ivec3(5, z, gz)); + afpvec8 k6 = image3d_ld8(weight_blob, ivec3(6, z, gz)); + afpvec8 k7 = image3d_ld8(weight_blob, ivec3(7, z, gz)); + + // sum += v * k + sum0[0].r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]); + sum0[0].g += dot(v0[0], k1[0]) + dot(v0[1], k1[1]); + sum0[0].b += dot(v0[0], k2[0]) + dot(v0[1], k2[1]); + sum0[0].a += dot(v0[0], k3[0]) + dot(v0[1], k3[1]); + sum0[1].r += dot(v0[0], k4[0]) + dot(v0[1], k4[1]); + sum0[1].g += dot(v0[0], k5[0]) + dot(v0[1], k5[1]); + sum0[1].b += dot(v0[0], k6[0]) + dot(v0[1], k6[1]); + sum0[1].a += dot(v0[0], k7[0]) + dot(v0[1], k7[1]); + + sum1[0].r += dot(v1[0], k0[0]) + dot(v1[1], k0[1]); + sum1[0].g += dot(v1[0], k1[0]) + dot(v1[1], k1[1]); + sum1[0].b += dot(v1[0], k2[0]) + dot(v1[1], k2[1]); + sum1[0].a += dot(v1[0], k3[0]) + dot(v1[1], k3[1]); + sum1[1].r += dot(v1[0], k4[0]) + dot(v1[1], k4[1]); + sum1[1].g += dot(v1[0], k5[0]) + dot(v1[1], k5[1]); + sum1[1].b += dot(v1[0], k6[0]) + dot(v1[1], k6[1]); + sum1[1].a += dot(v1[0], k7[0]) + dot(v1[1], k7[1]); + + sum2[0].r += dot(v2[0], k0[0]) + dot(v2[1], k0[1]); + sum2[0].g += dot(v2[0], k1[0]) + dot(v2[1], k1[1]); + sum2[0].b += dot(v2[0], k2[0]) + dot(v2[1], k2[1]); + sum2[0].a += dot(v2[0], k3[0]) + dot(v2[1], k3[1]); + sum2[1].r += dot(v2[0], k4[0]) + dot(v2[1], k4[1]); + sum2[1].g += dot(v2[0], k5[0]) + dot(v2[1], k5[1]); + sum2[1].b += dot(v2[0], k6[0]) + dot(v2[1], k6[1]); + sum2[1].a += dot(v2[0], k7[0]) + dot(v2[1], k7[1]); + + sum3[0].r += dot(v3[0], k0[0]) + dot(v3[1], k0[1]); + sum3[0].g += dot(v3[0], k1[0]) + dot(v3[1], k1[1]); + sum3[0].b += dot(v3[0], k2[0]) + dot(v3[1], k2[1]); + sum3[0].a += dot(v3[0], k3[0]) + dot(v3[1], k3[1]); + sum3[1].r += dot(v3[0], k4[0]) + dot(v3[1], k4[1]); + sum3[1].g += dot(v3[0], k5[0]) + dot(v3[1], k5[1]); + sum3[1].b += dot(v3[0], k6[0]) + dot(v3[1], k6[1]); + sum3[1].a += dot(v3[0], k7[0]) + dot(v3[1], k7[1]); + } +#else + int w_offset = gz * psc(c) * 8; + int v_offset = gx; + + for (int z = 0; z < psc(c); z++) + { + afpvec8 v0 = buffer_ld8(bottom_blob_data, v_offset + 0); + afpvec8 v1 = buffer_ld8(bottom_blob_data, v_offset + 1); + afpvec8 v2 = buffer_ld8(bottom_blob_data, v_offset + 2); + afpvec8 v3 = buffer_ld8(bottom_blob_data, v_offset + 3); + + afpvec8 k0 = buffer_ld8(weight_data, w_offset + 0); + afpvec8 k1 = buffer_ld8(weight_data, w_offset + 1); + afpvec8 k2 = buffer_ld8(weight_data, w_offset + 2); + afpvec8 k3 = buffer_ld8(weight_data, 
w_offset + 3); + afpvec8 k4 = buffer_ld8(weight_data, w_offset + 4); + afpvec8 k5 = buffer_ld8(weight_data, w_offset + 5); + afpvec8 k6 = buffer_ld8(weight_data, w_offset + 6); + afpvec8 k7 = buffer_ld8(weight_data, w_offset + 7); + + // sum += v * k + sum0[0].r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]); + sum0[0].g += dot(v0[0], k1[0]) + dot(v0[1], k1[1]); + sum0[0].b += dot(v0[0], k2[0]) + dot(v0[1], k2[1]); + sum0[0].a += dot(v0[0], k3[0]) + dot(v0[1], k3[1]); + sum0[1].r += dot(v0[0], k4[0]) + dot(v0[1], k4[1]); + sum0[1].g += dot(v0[0], k5[0]) + dot(v0[1], k5[1]); + sum0[1].b += dot(v0[0], k6[0]) + dot(v0[1], k6[1]); + sum0[1].a += dot(v0[0], k7[0]) + dot(v0[1], k7[1]); + + sum1[0].r += dot(v1[0], k0[0]) + dot(v1[1], k0[1]); + sum1[0].g += dot(v1[0], k1[0]) + dot(v1[1], k1[1]); + sum1[0].b += dot(v1[0], k2[0]) + dot(v1[1], k2[1]); + sum1[0].a += dot(v1[0], k3[0]) + dot(v1[1], k3[1]); + sum1[1].r += dot(v1[0], k4[0]) + dot(v1[1], k4[1]); + sum1[1].g += dot(v1[0], k5[0]) + dot(v1[1], k5[1]); + sum1[1].b += dot(v1[0], k6[0]) + dot(v1[1], k6[1]); + sum1[1].a += dot(v1[0], k7[0]) + dot(v1[1], k7[1]); + + sum2[0].r += dot(v2[0], k0[0]) + dot(v2[1], k0[1]); + sum2[0].g += dot(v2[0], k1[0]) + dot(v2[1], k1[1]); + sum2[0].b += dot(v2[0], k2[0]) + dot(v2[1], k2[1]); + sum2[0].a += dot(v2[0], k3[0]) + dot(v2[1], k3[1]); + sum2[1].r += dot(v2[0], k4[0]) + dot(v2[1], k4[1]); + sum2[1].g += dot(v2[0], k5[0]) + dot(v2[1], k5[1]); + sum2[1].b += dot(v2[0], k6[0]) + dot(v2[1], k6[1]); + sum2[1].a += dot(v2[0], k7[0]) + dot(v2[1], k7[1]); + + sum3[0].r += dot(v3[0], k0[0]) + dot(v3[1], k0[1]); + sum3[0].g += dot(v3[0], k1[0]) + dot(v3[1], k1[1]); + sum3[0].b += dot(v3[0], k2[0]) + dot(v3[1], k2[1]); + sum3[0].a += dot(v3[0], k3[0]) + dot(v3[1], k3[1]); + sum3[1].r += dot(v3[0], k4[0]) + dot(v3[1], k4[1]); + sum3[1].g += dot(v3[0], k5[0]) + dot(v3[1], k5[1]); + sum3[1].b += dot(v3[0], k6[0]) + dot(v3[1], k6[1]); + sum3[1].a += dot(v3[0], k7[0]) + dot(v3[1], k7[1]); + + w_offset += 8; + v_offset += psc(cstep); + } +#endif + + if (activation_type == 1) + { + sum0[0] = max(sum0[0], afp(0.f)); + sum0[1] = max(sum0[1], afp(0.f)); + sum1[0] = max(sum1[0], afp(0.f)); + sum1[1] = max(sum1[1], afp(0.f)); + sum2[0] = max(sum2[0], afp(0.f)); + sum2[1] = max(sum2[1], afp(0.f)); + sum3[0] = max(sum3[0], afp(0.f)); + sum3[1] = max(sum3[1], afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum0[0] = mix(sum0[0], sum0[0] * afp(slope), lessThan(sum0[0], afpvec4(0.f))); + sum0[1] = mix(sum0[1], sum0[1] * afp(slope), lessThan(sum0[1], afpvec4(0.f))); + sum1[0] = mix(sum1[0], sum1[0] * afp(slope), lessThan(sum1[0], afpvec4(0.f))); + sum1[1] = mix(sum1[1], sum1[1] * afp(slope), lessThan(sum1[1], afpvec4(0.f))); + sum2[0] = mix(sum2[0], sum2[0] * afp(slope), lessThan(sum2[0], afpvec4(0.f))); + sum2[1] = mix(sum2[1], sum2[1] * afp(slope), lessThan(sum2[1], afpvec4(0.f))); + sum3[0] = mix(sum3[0], sum3[0] * afp(slope), lessThan(sum3[0], afpvec4(0.f))); + sum3[1] = mix(sum3[1], sum3[1] * afp(slope), lessThan(sum3[1], afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum0[0] = clamp(sum0[0], const_min, const_max); + sum0[1] = clamp(sum0[1], const_min, const_max); + sum1[0] = clamp(sum1[0], const_min, const_max); + sum1[1] = clamp(sum1[1], const_min, const_max); + sum2[0] = clamp(sum2[0], const_min, const_max); + sum2[1] = clamp(sum2[1], const_min, const_max); + sum3[0] = clamp(sum3[0], 
const_min, const_max); + sum3[1] = clamp(sum3[1], const_min, const_max); + } + if (activation_type == 4) + { + sum0[0] = afp(1.f) / (afp(1.f) + exp(-sum0[0])); + sum0[1] = afp(1.f) / (afp(1.f) + exp(-sum0[1])); + sum1[0] = afp(1.f) / (afp(1.f) + exp(-sum1[0])); + sum1[1] = afp(1.f) / (afp(1.f) + exp(-sum1[1])); + sum2[0] = afp(1.f) / (afp(1.f) + exp(-sum2[0])); + sum2[1] = afp(1.f) / (afp(1.f) + exp(-sum2[1])); + sum3[0] = afp(1.f) / (afp(1.f) + exp(-sum3[0])); + sum3[1] = afp(1.f) / (afp(1.f) + exp(-sum3[1])); + } + if (activation_type == 5) + { + sum0[0] = sum0[0] * tanh(log(exp(sum0[0]) + afp(1.f))); + sum0[1] = sum0[1] * tanh(log(exp(sum0[1]) + afp(1.f))); + sum1[0] = sum1[0] * tanh(log(exp(sum1[0]) + afp(1.f))); + sum1[1] = sum1[1] * tanh(log(exp(sum1[1]) + afp(1.f))); + sum2[0] = sum2[0] * tanh(log(exp(sum2[0]) + afp(1.f))); + sum2[1] = sum2[1] * tanh(log(exp(sum2[1]) + afp(1.f))); + sum3[0] = sum3[0] * tanh(log(exp(sum3[0]) + afp(1.f))); + sum3[1] = sum3[1] * tanh(log(exp(sum3[1]) + afp(1.f))); + } + +#if NCNN_image_shader + image3d_st8(top_blob, ivec3(sx4.r, sy4.r, gz), sum0); + image3d_st8(top_blob, ivec3(sx4.g, sy4.g, gz), sum1); + image3d_st8(top_blob, ivec3(sx4.b, sy4.b, gz), sum2); + image3d_st8(top_blob, ivec3(sx4.a, sy4.a, gz), sum3); +#else + int gi = gz * psc(outcstep) + gx; + + // afp tmp = afp(1.0f); + // sum0 = afpvec8(afpvec4(tmp), afpvec4(tmp)); + // sum1 = afpvec8(afpvec4(tmp), afpvec4(tmp)); + // sum2 = afpvec8(afpvec4(tmp), afpvec4(tmp)); + // sum3 = afpvec8(afpvec4(tmp), afpvec4(tmp)); + // w_offset = 0; + // sum0 = buffer_ld8(weight_data, w_offset + 0); + // sum1 = buffer_ld8(weight_data, w_offset + 1); + // sum2 = buffer_ld8(weight_data, w_offset + 2); + // sum3 = buffer_ld8(weight_data, w_offset + 3); + + buffer_st8(top_blob_data, gi + 0, sum0); + if (gx + 1 < psc(outcstep)) buffer_st8(top_blob_data, gi + 1, sum1); + if (gx + 2 < psc(outcstep)) buffer_st8(top_blob_data, gi + 2, sum2); + if (gx + 3 < psc(outcstep)) buffer_st8(top_blob_data, gi + 3, sum3); +#endif +} diff --git a/source/device/vulkan/shaders/convolution_pack8_3x3s1d1_winograd23_gemm.comp b/source/device/vulkan/shaders/convolution_pack8_3x3s1d1_winograd23_gemm.comp new file mode 100644 index 000000000..e5f619fd3 --- /dev/null +++ b/source/device/vulkan/shaders/convolution_pack8_3x3s1d1_winograd23_gemm.comp @@ -0,0 +1,198 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
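+
+// Winograd F(2x2, 3x3) GEMM stage: the input transform emits 16 values per 4x4 tile, so gx
+// indexes the 16 tile positions, gy walks the tiles four at a time and gz the packed output
+// channels. For each input channel, one packed input value per tile is multiplied against
+// eight packed weight values (an 8-wide dot product per output lane) and accumulated into
+// the transformed output tiles consumed by the output-transform shader.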
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#define shape_constant_id_offset 0 +layout (constant_id = shape_constant_id_offset + 0) const int c = 0; +layout (constant_id = shape_constant_id_offset + 1) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 2) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 3) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 4) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_tm_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_tm_blob; +layout (binding = 2) uniform unfp sampler3D weight_tm_blob; +#else +layout (binding = 0) readonly buffer bottom_tm_blob { sfpvec8 bottom_tm_blob_data[]; }; +layout (binding = 1) writeonly buffer top_tm_blob { sfpvec8 top_tm_blob_data[]; }; +layout (binding = 2) readonly buffer weight_tm_blob { sfpvec8 weight_tm_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int c; + int cstep; + + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y) * 4; + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= 16 || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec8 sum0 = afpvec8(afpvec4(0.f), afpvec4(0.f)); + afpvec8 sum1 = afpvec8(afpvec4(0.f), afpvec4(0.f)); + afpvec8 sum2 = afpvec8(afpvec4(0.f), afpvec4(0.f)); + afpvec8 sum3 = afpvec8(afpvec4(0.f), afpvec4(0.f)); + +#if NCNN_image_shader + int wx = gx * 8; + + for (int z = 0; z < psc(c); z++) + { + afpvec8 v0 = image3d_ld8(bottom_tm_blob, ivec3(gx, gy + 0, z)); + afpvec8 v1 = image3d_ld8(bottom_tm_blob, ivec3(gx, gy + 1, z)); + afpvec8 v2 = image3d_ld8(bottom_tm_blob, ivec3(gx, gy + 2, z)); + afpvec8 v3 = image3d_ld8(bottom_tm_blob, ivec3(gx, gy + 3, z)); + + afpvec8 k0 = image3d_ld8(weight_tm_blob, ivec3(wx + 0, z, gz)); + afpvec8 k1 = image3d_ld8(weight_tm_blob, ivec3(wx + 1, z, gz)); + afpvec8 k2 = image3d_ld8(weight_tm_blob, ivec3(wx + 2, z, gz)); + afpvec8 k3 = image3d_ld8(weight_tm_blob, ivec3(wx + 3, z, gz)); + afpvec8 k4 = image3d_ld8(weight_tm_blob, ivec3(wx + 4, z, gz)); + afpvec8 k5 = image3d_ld8(weight_tm_blob, ivec3(wx + 5, z, gz)); + afpvec8 k6 = image3d_ld8(weight_tm_blob, ivec3(wx + 6, z, gz)); + afpvec8 k7 = image3d_ld8(weight_tm_blob, ivec3(wx + 7, z, gz)); + + // sum += v * k + sum0[0].r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]); + sum0[0].g += dot(v0[0], k1[0]) + dot(v0[1], k1[1]); + sum0[0].b += dot(v0[0], k2[0]) + dot(v0[1], k2[1]); + sum0[0].a += dot(v0[0], k3[0]) + dot(v0[1], k3[1]); + sum0[1].r += dot(v0[0], k4[0]) + dot(v0[1], k4[1]); + sum0[1].g += dot(v0[0], k5[0]) + dot(v0[1], k5[1]); + sum0[1].b += dot(v0[0], k6[0]) + dot(v0[1], k6[1]); + sum0[1].a += dot(v0[0], k7[0]) + dot(v0[1], k7[1]); + + sum1[0].r += dot(v1[0], k0[0]) + dot(v1[1], k0[1]); + sum1[0].g += dot(v1[0], k1[0]) + dot(v1[1], k1[1]); + sum1[0].b += dot(v1[0], k2[0]) + dot(v1[1], k2[1]); + sum1[0].a += dot(v1[0], k3[0]) + dot(v1[1], k3[1]); + sum1[1].r += dot(v1[0], k4[0]) + dot(v1[1], k4[1]); + sum1[1].g += dot(v1[0], k5[0]) + dot(v1[1], k5[1]); + sum1[1].b += dot(v1[0], k6[0]) + dot(v1[1], k6[1]); + sum1[1].a += dot(v1[0], k7[0]) + dot(v1[1], k7[1]); + + sum2[0].r += dot(v2[0], k0[0]) + dot(v2[1], k0[1]); + sum2[0].g += dot(v2[0], k1[0]) + 
dot(v2[1], k1[1]); + sum2[0].b += dot(v2[0], k2[0]) + dot(v2[1], k2[1]); + sum2[0].a += dot(v2[0], k3[0]) + dot(v2[1], k3[1]); + sum2[1].r += dot(v2[0], k4[0]) + dot(v2[1], k4[1]); + sum2[1].g += dot(v2[0], k5[0]) + dot(v2[1], k5[1]); + sum2[1].b += dot(v2[0], k6[0]) + dot(v2[1], k6[1]); + sum2[1].a += dot(v2[0], k7[0]) + dot(v2[1], k7[1]); + + sum3[0].r += dot(v3[0], k0[0]) + dot(v3[1], k0[1]); + sum3[0].g += dot(v3[0], k1[0]) + dot(v3[1], k1[1]); + sum3[0].b += dot(v3[0], k2[0]) + dot(v3[1], k2[1]); + sum3[0].a += dot(v3[0], k3[0]) + dot(v3[1], k3[1]); + sum3[1].r += dot(v3[0], k4[0]) + dot(v3[1], k4[1]); + sum3[1].g += dot(v3[0], k5[0]) + dot(v3[1], k5[1]); + sum3[1].b += dot(v3[0], k6[0]) + dot(v3[1], k6[1]); + sum3[1].a += dot(v3[0], k7[0]) + dot(v3[1], k7[1]); + } +#else + int v_offset = gy * 16 + gx; + int w_offset = (gz * psc(c) * 16 + gx) * 8; + + for (int z = 0; z < psc(c); z++) + { + afpvec8 v0 = buffer_ld8(bottom_tm_blob_data, v_offset + 0); + afpvec8 v1 = buffer_ld8(bottom_tm_blob_data, v_offset + 16); + afpvec8 v2 = buffer_ld8(bottom_tm_blob_data, v_offset + 32); + afpvec8 v3 = buffer_ld8(bottom_tm_blob_data, v_offset + 48); + + afpvec8 k0 = buffer_ld8(weight_tm_data, w_offset + 0); + afpvec8 k1 = buffer_ld8(weight_tm_data, w_offset + 1); + afpvec8 k2 = buffer_ld8(weight_tm_data, w_offset + 2); + afpvec8 k3 = buffer_ld8(weight_tm_data, w_offset + 3); + afpvec8 k4 = buffer_ld8(weight_tm_data, w_offset + 4); + afpvec8 k5 = buffer_ld8(weight_tm_data, w_offset + 5); + afpvec8 k6 = buffer_ld8(weight_tm_data, w_offset + 6); + afpvec8 k7 = buffer_ld8(weight_tm_data, w_offset + 7); + + // sum += v * k + sum0[0].r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]); + sum0[0].g += dot(v0[0], k1[0]) + dot(v0[1], k1[1]); + sum0[0].b += dot(v0[0], k2[0]) + dot(v0[1], k2[1]); + sum0[0].a += dot(v0[0], k3[0]) + dot(v0[1], k3[1]); + sum0[1].r += dot(v0[0], k4[0]) + dot(v0[1], k4[1]); + sum0[1].g += dot(v0[0], k5[0]) + dot(v0[1], k5[1]); + sum0[1].b += dot(v0[0], k6[0]) + dot(v0[1], k6[1]); + sum0[1].a += dot(v0[0], k7[0]) + dot(v0[1], k7[1]); + + sum1[0].r += dot(v1[0], k0[0]) + dot(v1[1], k0[1]); + sum1[0].g += dot(v1[0], k1[0]) + dot(v1[1], k1[1]); + sum1[0].b += dot(v1[0], k2[0]) + dot(v1[1], k2[1]); + sum1[0].a += dot(v1[0], k3[0]) + dot(v1[1], k3[1]); + sum1[1].r += dot(v1[0], k4[0]) + dot(v1[1], k4[1]); + sum1[1].g += dot(v1[0], k5[0]) + dot(v1[1], k5[1]); + sum1[1].b += dot(v1[0], k6[0]) + dot(v1[1], k6[1]); + sum1[1].a += dot(v1[0], k7[0]) + dot(v1[1], k7[1]); + + sum2[0].r += dot(v2[0], k0[0]) + dot(v2[1], k0[1]); + sum2[0].g += dot(v2[0], k1[0]) + dot(v2[1], k1[1]); + sum2[0].b += dot(v2[0], k2[0]) + dot(v2[1], k2[1]); + sum2[0].a += dot(v2[0], k3[0]) + dot(v2[1], k3[1]); + sum2[1].r += dot(v2[0], k4[0]) + dot(v2[1], k4[1]); + sum2[1].g += dot(v2[0], k5[0]) + dot(v2[1], k5[1]); + sum2[1].b += dot(v2[0], k6[0]) + dot(v2[1], k6[1]); + sum2[1].a += dot(v2[0], k7[0]) + dot(v2[1], k7[1]); + + sum3[0].r += dot(v3[0], k0[0]) + dot(v3[1], k0[1]); + sum3[0].g += dot(v3[0], k1[0]) + dot(v3[1], k1[1]); + sum3[0].b += dot(v3[0], k2[0]) + dot(v3[1], k2[1]); + sum3[0].a += dot(v3[0], k3[0]) + dot(v3[1], k3[1]); + sum3[1].r += dot(v3[0], k4[0]) + dot(v3[1], k4[1]); + sum3[1].g += dot(v3[0], k5[0]) + dot(v3[1], k5[1]); + sum3[1].b += dot(v3[0], k6[0]) + dot(v3[1], k6[1]); + sum3[1].a += dot(v3[0], k7[0]) + dot(v3[1], k7[1]); + + v_offset += psc(cstep); + w_offset += 16 * 8; + } +#endif + +#if NCNN_image_shader + image3d_st8(top_tm_blob, ivec3(gx, gy + 0, gz), sum0); + image3d_st8(top_tm_blob, ivec3(gx, gy + 
1, gz), sum1); + image3d_st8(top_tm_blob, ivec3(gx, gy + 2, gz), sum2); + image3d_st8(top_tm_blob, ivec3(gx, gy + 3, gz), sum3); +#else + int gi = gz * psc(outcstep) + gy * 16 + gx; + + buffer_st8(top_tm_blob_data, gi + 0, sum0); + if (gy + 1 < psc(outh)) buffer_st8(top_tm_blob_data, gi + 16, sum1); + if (gy + 2 < psc(outh)) buffer_st8(top_tm_blob_data, gi + 32, sum2); + if (gy + 3 < psc(outh)) buffer_st8(top_tm_blob_data, gi + 48, sum3); +#endif +} diff --git a/source/device/vulkan/shaders/convolution_pack8_3x3s1d1_winograd23_transform_input.comp b/source/device/vulkan/shaders/convolution_pack8_3x3s1d1_winograd23_transform_input.comp new file mode 100644 index 000000000..23b89c572 --- /dev/null +++ b/source/device/vulkan/shaders/convolution_pack8_3x3s1d1_winograd23_transform_input.comp @@ -0,0 +1,203 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#define shape_constant_id_offset 0 +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; +layout (constant_id = shape_constant_id_offset + 2) const int c = 0; +layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 4) const int outcstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int block_x = 0; +layout (constant_id = shape_constant_id_offset + 6) const int block_y = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D bottom_tm_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer bottom_tm_blob { sfpvec8 bottom_tm_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int w; + int h; + int c; + int cstep; + + int outcstep; + + int block_x; + int block_y; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= p.block_x || gy >= p.block_y || gz >= psc(c)) + return; + + // load 4x4 +#if NCNN_image_shader + int sx = gx * 2; + int sy = gy * 2; + + afpvec8 v00 = image3d_ld8(bottom_blob, ivec3(sx + 0, sy + 0, gz)); + afpvec8 v01 = image3d_ld8(bottom_blob, ivec3(sx + 1, sy + 0, gz)); + afpvec8 v02 = image3d_ld8(bottom_blob, ivec3(sx + 2, sy + 0, gz)); + afpvec8 v03 = image3d_ld8(bottom_blob, ivec3(sx + 3, sy + 0, gz)); + + afpvec8 v10 = image3d_ld8(bottom_blob, ivec3(sx + 0, sy + 1, gz)); + afpvec8 v11 = image3d_ld8(bottom_blob, ivec3(sx + 1, sy + 1, gz)); + afpvec8 
v12 = image3d_ld8(bottom_blob, ivec3(sx + 2, sy + 1, gz)); + afpvec8 v13 = image3d_ld8(bottom_blob, ivec3(sx + 3, sy + 1, gz)); + + afpvec8 v20 = image3d_ld8(bottom_blob, ivec3(sx + 0, sy + 2, gz)); + afpvec8 v21 = image3d_ld8(bottom_blob, ivec3(sx + 1, sy + 2, gz)); + afpvec8 v22 = image3d_ld8(bottom_blob, ivec3(sx + 2, sy + 2, gz)); + afpvec8 v23 = image3d_ld8(bottom_blob, ivec3(sx + 3, sy + 2, gz)); + + afpvec8 v30 = image3d_ld8(bottom_blob, ivec3(sx + 0, sy + 3, gz)); + afpvec8 v31 = image3d_ld8(bottom_blob, ivec3(sx + 1, sy + 3, gz)); + afpvec8 v32 = image3d_ld8(bottom_blob, ivec3(sx + 2, sy + 3, gz)); + afpvec8 v33 = image3d_ld8(bottom_blob, ivec3(sx + 3, sy + 3, gz)); +#else + int v_offset_0 = gz * psc(cstep) + gy * 2 * psc(w) + gx * 2; + ivec4 v_offset = v_offset_0 + ivec4(0, 1, 2, 3) * psc(w); + + afpvec8 v00 = buffer_ld8(bottom_blob_data, v_offset.r + 0); + afpvec8 v01 = buffer_ld8(bottom_blob_data, v_offset.r + 1); + afpvec8 v02 = buffer_ld8(bottom_blob_data, v_offset.r + 2); + afpvec8 v03 = buffer_ld8(bottom_blob_data, v_offset.r + 3); + + afpvec8 v10 = buffer_ld8(bottom_blob_data, v_offset.g + 0); + afpvec8 v11 = buffer_ld8(bottom_blob_data, v_offset.g + 1); + afpvec8 v12 = buffer_ld8(bottom_blob_data, v_offset.g + 2); + afpvec8 v13 = buffer_ld8(bottom_blob_data, v_offset.g + 3); + + afpvec8 v20 = buffer_ld8(bottom_blob_data, v_offset.b + 0); + afpvec8 v21 = buffer_ld8(bottom_blob_data, v_offset.b + 1); + afpvec8 v22 = buffer_ld8(bottom_blob_data, v_offset.b + 2); + afpvec8 v23 = buffer_ld8(bottom_blob_data, v_offset.b + 3); + + afpvec8 v30 = buffer_ld8(bottom_blob_data, v_offset.a + 0); + afpvec8 v31 = buffer_ld8(bottom_blob_data, v_offset.a + 1); + afpvec8 v32 = buffer_ld8(bottom_blob_data, v_offset.a + 2); + afpvec8 v33 = buffer_ld8(bottom_blob_data, v_offset.a + 3); +#endif + + // const float itm[4][4] = { + // {1.0f, 0.0f, -1.0f, 0.0f}, + // {0.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, -1.0f, 1.0f, 0.0f}, + // {0.0f, -1.0f, 0.0f, 1.0f} + // }; + + // implicit transpose + afpvec8 m00 = v00 - v02; + afpvec8 m01 = v10 - v12; + afpvec8 m02 = v20 - v22; + afpvec8 m03 = v30 - v32; + + afpvec8 m10 = v02 + v01; + afpvec8 m11 = v12 + v11; + afpvec8 m12 = v22 + v21; + afpvec8 m13 = v32 + v31; + + afpvec8 m20 = v02 - v01; + afpvec8 m21 = v12 - v11; + afpvec8 m22 = v22 - v21; + afpvec8 m23 = v32 - v31; + + afpvec8 m30 = v03 - v01; + afpvec8 m31 = v13 - v11; + afpvec8 m32 = v23 - v21; + afpvec8 m33 = v33 - v31; + + v00 = m00 - m02; + v10 = m10 - m12; + v20 = m20 - m22; + v30 = m30 - m32; + + v01 = m02 + m01; + v11 = m12 + m11; + v21 = m22 + m21; + v31 = m32 + m31; + + v02 = m02 - m01; + v12 = m12 - m11; + v22 = m22 - m21; + v32 = m32 - m31; + + v03 = m03 - m01; + v13 = m13 - m11; + v23 = m23 - m21; + v33 = m33 - m31; + + // store 16 +#if NCNN_image_shader + int y = gy * p.block_x + gx; + + image3d_st8(bottom_tm_blob, ivec3(0, y, gz), v00); + image3d_st8(bottom_tm_blob, ivec3(1, y, gz), v01); + image3d_st8(bottom_tm_blob, ivec3(2, y, gz), v02); + image3d_st8(bottom_tm_blob, ivec3(3, y, gz), v03); + image3d_st8(bottom_tm_blob, ivec3(4, y, gz), v10); + image3d_st8(bottom_tm_blob, ivec3(5, y, gz), v11); + image3d_st8(bottom_tm_blob, ivec3(6, y, gz), v12); + image3d_st8(bottom_tm_blob, ivec3(7, y, gz), v13); + image3d_st8(bottom_tm_blob, ivec3(8, y, gz), v20); + image3d_st8(bottom_tm_blob, ivec3(9, y, gz), v21); + image3d_st8(bottom_tm_blob, ivec3(10, y, gz), v22); + image3d_st8(bottom_tm_blob, ivec3(11, y, gz), v23); + image3d_st8(bottom_tm_blob, ivec3(12, y, gz), v30); + 
image3d_st8(bottom_tm_blob, ivec3(13, y, gz), v31); + image3d_st8(bottom_tm_blob, ivec3(14, y, gz), v32); + image3d_st8(bottom_tm_blob, ivec3(15, y, gz), v33); +#else + int v_tm_offset = gz * psc(outcstep) + (gy * p.block_x + gx) * 16; + + buffer_st8(bottom_tm_blob_data, v_tm_offset + 0, v00); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 1, v01); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 2, v02); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 3, v03); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 4, v10); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 5, v11); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 6, v12); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 7, v13); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 8, v20); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 9, v21); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 10, v22); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 11, v23); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 12, v30); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 13, v31); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 14, v32); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 15, v33); +#endif +} diff --git a/source/device/vulkan/shaders/convolution_pack8_3x3s1d1_winograd23_transform_output.comp b/source/device/vulkan/shaders/convolution_pack8_3x3s1d1_winograd23_transform_output.comp new file mode 100644 index 000000000..f15f48b8e --- /dev/null +++ b/source/device/vulkan/shaders/convolution_pack8_3x3s1d1_winograd23_transform_output.comp @@ -0,0 +1,230 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
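+
+// Winograd F(2x2, 3x3) output transform: every 4x4 tile produced by the GEMM stage is
+// reduced to a 2x2 output block by applying the 2x4 matrix
+//   A^T = [[1, 1,  1, 0],
+//          [0, 1, -1, 1]]
+// along both tile dimensions (the itm matrix commented in main below). Bias and activation
+// are fused here before the 2x2 block is written to the output blob.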
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int bias_term = 0; +layout (constant_id = 1) const int activation_type = 0; +layout (constant_id = 2) const float activation_param_0 = 0; +layout (constant_id = 3) const float activation_param_1 = 0; + +#define shape_constant_id_offset 4 +layout (constant_id = shape_constant_id_offset + 0) const int c = 0; +layout (constant_id = shape_constant_id_offset + 1) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 2) const int block_x = 0; +layout (constant_id = shape_constant_id_offset + 3) const int block_y = 0; + +layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D top_tm_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer top_tm_blob { sfpvec8 top_tm_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +layout (binding = 2) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int c; + int cstep; + + int block_x; + int block_y; + + int outw; + int outh; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= p.block_x || gy >= p.block_y || gz >= psc(c)) + return; + + // load 16 +#if NCNN_image_shader + int sy = gy * p.block_x + gx; + + afpvec8 v00 = image3d_ld8(top_tm_blob, ivec3(0, sy, gz)); + afpvec8 v01 = image3d_ld8(top_tm_blob, ivec3(1, sy, gz)); + afpvec8 v02 = image3d_ld8(top_tm_blob, ivec3(2, sy, gz)); + afpvec8 v03 = image3d_ld8(top_tm_blob, ivec3(3, sy, gz)); + afpvec8 v10 = image3d_ld8(top_tm_blob, ivec3(4, sy, gz)); + afpvec8 v11 = image3d_ld8(top_tm_blob, ivec3(5, sy, gz)); + afpvec8 v12 = image3d_ld8(top_tm_blob, ivec3(6, sy, gz)); + afpvec8 v13 = image3d_ld8(top_tm_blob, ivec3(7, sy, gz)); + afpvec8 v20 = image3d_ld8(top_tm_blob, ivec3(8, sy, gz)); + afpvec8 v21 = image3d_ld8(top_tm_blob, ivec3(9, sy, gz)); + afpvec8 v22 = image3d_ld8(top_tm_blob, ivec3(10, sy, gz)); + afpvec8 v23 = image3d_ld8(top_tm_blob, ivec3(11, sy, gz)); + afpvec8 v30 = image3d_ld8(top_tm_blob, ivec3(12, sy, gz)); + afpvec8 v31 = image3d_ld8(top_tm_blob, ivec3(13, sy, gz)); + afpvec8 v32 = image3d_ld8(top_tm_blob, ivec3(14, sy, gz)); + afpvec8 v33 = image3d_ld8(top_tm_blob, ivec3(15, sy, gz)); +#else + int v_tm_offset = gz * psc(cstep) + (gy * p.block_x + gx) * 16; + + afpvec8 v00 = buffer_ld8(top_tm_blob_data, v_tm_offset + 0); + afpvec8 v01 = buffer_ld8(top_tm_blob_data, v_tm_offset + 1); + afpvec8 v02 = buffer_ld8(top_tm_blob_data, v_tm_offset + 2); + afpvec8 v03 = buffer_ld8(top_tm_blob_data, v_tm_offset + 3); + afpvec8 v10 = buffer_ld8(top_tm_blob_data, v_tm_offset + 4); + afpvec8 v11 = buffer_ld8(top_tm_blob_data, v_tm_offset + 5); + afpvec8 v12 = buffer_ld8(top_tm_blob_data, v_tm_offset + 6); + afpvec8 v13 = buffer_ld8(top_tm_blob_data, v_tm_offset + 7); + afpvec8 v20 = buffer_ld8(top_tm_blob_data, v_tm_offset + 8); + 
afpvec8 v21 = buffer_ld8(top_tm_blob_data, v_tm_offset + 9); + afpvec8 v22 = buffer_ld8(top_tm_blob_data, v_tm_offset + 10); + afpvec8 v23 = buffer_ld8(top_tm_blob_data, v_tm_offset + 11); + afpvec8 v30 = buffer_ld8(top_tm_blob_data, v_tm_offset + 12); + afpvec8 v31 = buffer_ld8(top_tm_blob_data, v_tm_offset + 13); + afpvec8 v32 = buffer_ld8(top_tm_blob_data, v_tm_offset + 14); + afpvec8 v33 = buffer_ld8(top_tm_blob_data, v_tm_offset + 15); +#endif + + // const float itm[2][4] = { + // {1.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 1.0f} + // }; + + // implicit transpose + afpvec8 m00 = v00 + v01 + v02; + afpvec8 m01 = v10 + v11 + v12; + afpvec8 m02 = v20 + v21 + v22; + afpvec8 m03 = v30 + v31 + v32; + + afpvec8 m10 = v01 - v02 + v03; + afpvec8 m11 = v11 - v12 + v13; + afpvec8 m12 = v21 - v22 + v23; + afpvec8 m13 = v31 - v32 + v33; + + if (bias_term == 1) + { +#if NCNN_image_shader + const afpvec8 bias_value = image1d_ld8(bias_blob, gz); +#else + const afpvec8 bias_value = buffer_ld8(bias_data, gz); +#endif + + v00 = bias_value + m00 + m01 + m02; + v10 = bias_value + m10 + m11 + m12; + + v01 = bias_value + m01 - m02 + m03; + v11 = bias_value + m11 - m12 + m13; + } + else + { + v00 = m00 + m01 + m02; + v10 = m10 + m11 + m12; + + v01 = m01 - m02 + m03; + v11 = m11 - m12 + m13; + } + + if (activation_type == 1) + { + v00[0] = max(v00[0], afp(0.f)); + v00[1] = max(v00[1], afp(0.f)); + v10[0] = max(v10[0], afp(0.f)); + v10[1] = max(v10[1], afp(0.f)); + v01[0] = max(v01[0], afp(0.f)); + v01[1] = max(v01[1], afp(0.f)); + v11[0] = max(v11[0], afp(0.f)); + v11[1] = max(v11[1], afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + v00[0] = mix(v00[0], v00[0] * afp(slope), lessThan(v00[0], afpvec4(0.f))); + v00[1] = mix(v00[1], v00[1] * afp(slope), lessThan(v00[1], afpvec4(0.f))); + v10[0] = mix(v10[0], v10[0] * afp(slope), lessThan(v10[0], afpvec4(0.f))); + v10[1] = mix(v10[1], v10[1] * afp(slope), lessThan(v10[1], afpvec4(0.f))); + v01[0] = mix(v01[0], v01[0] * afp(slope), lessThan(v01[0], afpvec4(0.f))); + v01[1] = mix(v01[1], v01[1] * afp(slope), lessThan(v01[1], afpvec4(0.f))); + v11[0] = mix(v11[0], v11[0] * afp(slope), lessThan(v11[0], afpvec4(0.f))); + v11[1] = mix(v11[1], v11[1] * afp(slope), lessThan(v11[1], afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + v00[0] = clamp(v00[0], const_min, const_max); + v00[1] = clamp(v00[1], const_min, const_max); + v10[0] = clamp(v10[0], const_min, const_max); + v10[1] = clamp(v10[1], const_min, const_max); + v01[0] = clamp(v01[0], const_min, const_max); + v01[1] = clamp(v01[1], const_min, const_max); + v11[0] = clamp(v11[0], const_min, const_max); + v11[1] = clamp(v11[1], const_min, const_max); + } + if (activation_type == 4) + { + v00[0] = afp(1.f) / (afp(1.f) + exp(-v00[0])); + v00[1] = afp(1.f) / (afp(1.f) + exp(-v00[1])); + v10[0] = afp(1.f) / (afp(1.f) + exp(-v10[0])); + v10[1] = afp(1.f) / (afp(1.f) + exp(-v10[1])); + v01[0] = afp(1.f) / (afp(1.f) + exp(-v01[0])); + v01[1] = afp(1.f) / (afp(1.f) + exp(-v01[1])); + v11[0] = afp(1.f) / (afp(1.f) + exp(-v11[0])); + v11[1] = afp(1.f) / (afp(1.f) + exp(-v11[1])); + } + if (activation_type == 5) + { + v00[0] = v00[0] * tanh(log(exp(v00[0]) + afp(1.f))); + v00[1] = v00[1] * tanh(log(exp(v00[1]) + afp(1.f))); + v10[0] = v10[0] * tanh(log(exp(v10[0]) + afp(1.f))); + v10[1] = v10[1] * tanh(log(exp(v10[1]) + afp(1.f))); + v01[0] = v01[0] * tanh(log(exp(v01[0]) + 
afp(1.f))); + v01[1] = v01[1] * tanh(log(exp(v01[1]) + afp(1.f))); + v11[0] = v11[0] * tanh(log(exp(v11[0]) + afp(1.f))); + v11[1] = v11[1] * tanh(log(exp(v11[1]) + afp(1.f))); + } + + // store 2x2 +#if NCNN_image_shader + int x = gx * 2; + int y = gy * 2; + + image3d_st8(top_blob, ivec3(x, y, gz), v00); + image3d_st8(top_blob, ivec3(x + 1, y, gz), v01); + image3d_st8(top_blob, ivec3(x, y + 1, gz), v10); + image3d_st8(top_blob, ivec3(x + 1, y + 1, gz), v11); +#else + int v_offset_0 = gz * psc(outcstep) + gy * 2 * psc(outw) + gx * 2; + int v_offset_1 = v_offset_0 + psc(outw); + + buffer_st8(top_blob_data, v_offset_0 + 0, v00); + buffer_st8(top_blob_data, v_offset_0 + 1, v01); + buffer_st8(top_blob_data, v_offset_1 + 0, v10); + buffer_st8(top_blob_data, v_offset_1 + 1, v11); +#endif +} diff --git a/source/device/vulkan/shaders/convolution_pack8to1.comp b/source/device/vulkan/shaders/convolution_pack8to1.comp new file mode 100644 index 000000000..8d5afd5d5 --- /dev/null +++ b/source/device/vulkan/shaders/convolution_pack8to1.comp @@ -0,0 +1,186 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
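+
+// convolution_pack8to1: input elements pack 8 channels (indexed as two 4-lane halves v[0]
+// and v[1]), the output stores a single channel per element (sfp). Each kernel tap adds
+// dot(v[0], k[0]) + dot(v[1], k[1]), i.e. a full 8-wide dot product per output scalar.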
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int activation_type = 0; +layout (constant_id = 8) const float activation_param_0 = 0; +layout (constant_id = 9) const float activation_param_1 = 0; + +#define shape_constant_id_offset 10 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afp sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld1(bias_blob, gz); +#else + sum = buffer_ld1(bias_data, gz); +#endif + } + else + { + sum = afp(0.f); + } + +#if NCNN_image_shader + for (int z = 0; z < psc(c); z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx, sy, z)); + + afpvec8 k = image3d_ld8(weight_blob, ivec3(wx, z, gz)); + + // sum += dot(v, k); + sum += dot(v[0], k[0]) + dot(v[1], k[1]); + + sx += dilation_w; + wx += 1; + } + + sy += dilation_h; + } + } +#else + int w_offset = gz * psc(c) * kernel_w * kernel_h; + + for (int z = 0; z < psc(c); z++) + { + int v_offset = z * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; 
x++) + { + afpvec8 v = buffer_ld8(bottom_blob_data, v_offset + x * dilation_w); + + afpvec8 k = buffer_ld8(weight_data, w_offset + x); + + // sum += dot(v, k); + sum += dot(v[0], k[0]) + dot(v[1], k[1]); + } + + v_offset += dilation_h * psc(w); + w_offset += kernel_w; + } + } +#endif + + if (activation_type == 1) + { + sum = max(sum, afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum = sum < afp(0.f) ? sum * slope : sum; + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum = clamp(sum, const_min, const_max); + } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } + if (activation_type == 5) + { + sum = sum * tanh(log(exp(sum) + afp(1.f))); + } + +#if NCNN_image_shader + image3d_st1(top_blob, ivec3(gx, gy, gz), sum); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st1(top_blob_data, gi, sum); +#endif +} diff --git a/source/device/vulkan/shaders/convolution_pack8to4.comp b/source/device/vulkan/shaders/convolution_pack8to4.comp new file mode 100644 index 000000000..a60bbffe8 --- /dev/null +++ b/source/device/vulkan/shaders/convolution_pack8to4.comp @@ -0,0 +1,198 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
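+
+// convolution_pack8to4: input elements pack 8 channels, output elements pack 4. Each kernel
+// tap loads one packed input value and four packed weight values (k0..k3); each output lane
+// accumulates an 8-wide dot product split across the two 4-lane halves.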
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int activation_type = 0; +layout (constant_id = 8) const float activation_param_0 = 0; +layout (constant_id = 9) const float activation_param_1 = 0; + +#define shape_constant_id_offset 10 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec4 sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld4(bias_blob, gz); +#else + sum = buffer_ld4(bias_data, gz); +#endif + } + else + { + sum = afpvec4(0.f); + } + +#if NCNN_image_shader + for (int z = 0; z < psc(c); z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx, sy, z)); + + afpvec8 k0 = image3d_ld8(weight_blob, ivec3(wx + 0, z, gz)); + afpvec8 k1 = image3d_ld8(weight_blob, ivec3(wx + 1, z, gz)); + afpvec8 k2 = image3d_ld8(weight_blob, ivec3(wx + 2, z, gz)); + afpvec8 k3 = image3d_ld8(weight_blob, ivec3(wx + 3, z, gz)); + + // sum += v * k + sum.r += dot(v[0], k0[0]) + dot(v[1], k0[1]); + sum.g += dot(v[0], k1[0]) + dot(v[1], k1[1]); + sum.b += dot(v[0], k2[0]) + dot(v[1], k2[1]); + sum.a += 
dot(v[0], k3[0]) + dot(v[1], k3[1]); + + sx += dilation_w; + wx += 4; + } + + sy += dilation_h; + } + } +#else + int w_offset = gz * psc(c) * kernel_w * kernel_h; + + for (int z = 0; z < psc(c); z++) + { + int v_offset = z * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v = buffer_ld8(bottom_blob_data, v_offset + x * dilation_w); + + afpvec8 k0 = buffer_ld8(weight_data, (w_offset + x) * 4 + 0); + afpvec8 k1 = buffer_ld8(weight_data, (w_offset + x) * 4 + 1); + afpvec8 k2 = buffer_ld8(weight_data, (w_offset + x) * 4 + 2); + afpvec8 k3 = buffer_ld8(weight_data, (w_offset + x) * 4 + 3); + + // sum += v * k + sum.r += dot(v[0], k0[0]) + dot(v[1], k0[1]); + sum.g += dot(v[0], k1[0]) + dot(v[1], k1[1]); + sum.b += dot(v[0], k2[0]) + dot(v[1], k2[1]); + sum.a += dot(v[0], k3[0]) + dot(v[1], k3[1]); + } + + v_offset += dilation_h * psc(w); + w_offset += kernel_w; + } + } +#endif + + if (activation_type == 1) + { + sum = max(sum, afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum = mix(sum, sum * afp(slope), lessThan(sum, afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum = clamp(sum, const_min, const_max); + } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } + if (activation_type == 5) + { + sum = sum * tanh(log(exp(sum) + afp(1.f))); + } + +#if NCNN_image_shader + image3d_st4(top_blob, ivec3(gx, gy, gz), sum); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st4(top_blob_data, gi, sum); +#endif +} diff --git a/source/device/vulkan/shaders/convolutiondepthwise.comp b/source/device/vulkan/shaders/convolutiondepthwise.comp new file mode 100644 index 000000000..b4316deb3 --- /dev/null +++ b/source/device/vulkan/shaders/convolutiondepthwise.comp @@ -0,0 +1,170 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
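+
+// depthwise convolution, pack1: output channel gz is convolved only with input
+// channel gz using its own kernel_w x kernel_h filter; one invocation per output element.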
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int group = 1; +layout (constant_id = 8) const int activation_type = 0; +layout (constant_id = 9) const float activation_param_0 = 0; +layout (constant_id = 10) const float activation_param_1 = 0; + +#define shape_constant_id_offset 11 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler2D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfp weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afp sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld1(bias_blob, gz); +#else + sum = buffer_ld1(bias_data, gz); +#endif + } + else + { + sum = afp(0.f); + } + + // depth-wise convolution +#if NCNN_image_shader + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + sum += image2d_ld1(weight_blob, ivec2(wx, gz)) * image3d_ld1(bottom_blob, ivec3(sx, sy, gz)); + + sx += dilation_w; + wx += 1; + } + + sy += dilation_h; + } +#else + int w_offset = gz * kernel_w * kernel_h; + int v_offset = gz * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + sum += buffer_ld1(weight_data, w_offset + x) * buffer_ld1(bottom_blob_data, v_offset + x * dilation_w); + } + + v_offset += dilation_h * psc(w); + w_offset 
+= kernel_w; + } +#endif + + if (activation_type == 1) + { + sum = max(sum, afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum = sum < afp(0.f) ? sum * slope : sum; + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum = clamp(sum, const_min, const_max); + } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } + if (activation_type == 5) + { + sum = sum * tanh(log(exp(sum) + afp(1.f))); + } + +#if NCNN_image_shader + image3d_st1(top_blob, ivec3(gx, gy, gz), sum); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st1(top_blob_data, gi, sum); +#endif +} diff --git a/source/device/vulkan/shaders/convolutiondepthwise_group.comp b/source/device/vulkan/shaders/convolutiondepthwise_group.comp new file mode 100644 index 000000000..32069bf5a --- /dev/null +++ b/source/device/vulkan/shaders/convolutiondepthwise_group.comp @@ -0,0 +1,186 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int group = 1; +layout (constant_id = 8) const int activation_type = 0; +layout (constant_id = 9) const float activation_param_0 = 0; +layout (constant_id = 10) const float activation_param_1 = 0; + +#define shape_constant_id_offset 11 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform 
unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfp weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afp sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld1(bias_blob, gz); +#else + sum = buffer_ld1(bias_data, gz); +#endif + } + else + { + sum = afp(0.f); + } + + // group convolution + const int channels_g = psc(c) / group; + const int num_output_g = psc(outc) / group; + + // group id + const int gg = gz / num_output_g; + +#if NCNN_image_shader + int sz = gg * channels_g; + + for (int z = 0; z < channels_g; z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + sum += image3d_ld1(weight_blob, ivec3(wx, z, gz)) * image3d_ld1(bottom_blob, ivec3(sx, sy, sz)); + + sx += dilation_w; + wx += 1; + } + + sy += dilation_h; + } + + sz += 1; + } +#else + int w_offset = gz * channels_g * kernel_w * kernel_h; + int v_offset_0 = gg * channels_g * psc(cstep); + + for (int z = 0; z < channels_g; z++) + { + int v_offset = v_offset_0 + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + sum += buffer_ld1(weight_data, w_offset + x) * buffer_ld1(bottom_blob_data, v_offset + x * dilation_w); + } + + v_offset += dilation_h * psc(w); + w_offset += kernel_w; + } + + v_offset_0 += psc(cstep); + } +#endif + + if (activation_type == 1) + { + sum = max(sum, afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum = sum < afp(0.f) ? sum * slope : sum; + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum = clamp(sum, const_min, const_max); + } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } + if (activation_type == 5) + { + sum = sum * tanh(log(exp(sum) + afp(1.f))); + } + +#if NCNN_image_shader + image3d_st1(top_blob, ivec3(gx, gy, gz), sum); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st1(top_blob_data, gi, sum); +#endif +} diff --git a/source/device/vulkan/shaders/convolutiondepthwise_group_pack1to4.comp b/source/device/vulkan/shaders/convolutiondepthwise_group_pack1to4.comp new file mode 100644 index 000000000..a3e9eb2e5 --- /dev/null +++ b/source/device/vulkan/shaders/convolutiondepthwise_group_pack1to4.comp @@ -0,0 +1,194 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int group = 1; +layout (constant_id = 8) const int activation_type = 0; +layout (constant_id = 9) const float activation_param_0 = 0; +layout (constant_id = 10) const float activation_param_1 = 0; + +#define shape_constant_id_offset 11 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec4 sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld4(bias_blob, gz); +#else + sum = buffer_ld4(bias_data, gz); +#endif + } + else + { + sum = afpvec4(0.f); + } + + // group convolution + const int channels_g = psc(c) / group; + const int num_output_g = psc(outc) / group; + + // group id + const int gg = gz / num_output_g; + +#if NCNN_image_shader + int sz = gg * channels_g; + + for (int z = 0; z < channels_g; z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < 
kernel_w; x++) + { + afp v = image3d_ld1(bottom_blob, ivec3(sx, sy, sz)); + + afpvec4 k = image3d_ld4(weight_blob, ivec3(wx, z, gz)); + + sum += v * k; + + sx += dilation_w; + wx += 1; + } + + sy += dilation_h; + } + + sz += 1; + } +#else + int w_offset = gz * channels_g * kernel_w * kernel_h; + int v_offset_0 = gg * channels_g * psc(cstep); + + for (int z = 0; z < channels_g; z++) + { + int v_offset = v_offset_0 + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + afp v = buffer_ld1(bottom_blob_data, v_offset + x * dilation_w); + + afpvec4 k = buffer_ld4(weight_data, w_offset + x); + + sum += v * k; + } + + v_offset += dilation_h * psc(w); + w_offset += kernel_w; + } + + v_offset_0 += psc(cstep); + } +#endif + + if (activation_type == 1) + { + sum = max(sum, afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum = mix(sum, sum * afp(slope), lessThan(sum, afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum = clamp(sum, const_min, const_max); + } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } + if (activation_type == 5) + { + sum = sum * tanh(log(exp(sum) + afp(1.f))); + } + +#if NCNN_image_shader + image3d_st4(top_blob, ivec3(gx, gy, gz), sum); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st4(top_blob_data, gi, sum); +#endif +} diff --git a/source/device/vulkan/shaders/convolutiondepthwise_group_pack1to8.comp b/source/device/vulkan/shaders/convolutiondepthwise_group_pack1to8.comp new file mode 100644 index 000000000..b32a6aa87 --- /dev/null +++ b/source/device/vulkan/shaders/convolutiondepthwise_group_pack1to8.comp @@ -0,0 +1,204 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
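+
+// grouped convolution, pack1 input -> pack8 output: channels_g and num_output_g are
+// the per-group input and output channel counts (in the blob's packing units),
+// gg = gz / num_output_g selects the group, and each scalar input sample is
+// multiplied into both vec4 halves of the vec8 accumulator.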
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int group = 1; +layout (constant_id = 8) const int activation_type = 0; +layout (constant_id = 9) const float activation_param_0 = 0; +layout (constant_id = 10) const float activation_param_1 = 0; + +#define shape_constant_id_offset 11 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec8 sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld8(bias_blob, gz); +#else + sum = buffer_ld8(bias_data, gz); +#endif + } + else + { + sum = afpvec8(afpvec4(0.f), afpvec4(0.f)); + } + + // group convolution + const int channels_g = psc(c) / group; + const int num_output_g = psc(outc) / group; + + // group id + const int gg = gz / num_output_g; + +#if NCNN_image_shader + int sz = gg * channels_g; + + for (int z = 0; z < channels_g; z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + afp v = image3d_ld1(bottom_blob, ivec3(sx, sy, sz)); + + afpvec8 k = image3d_ld8(weight_blob, ivec3(wx, z, gz)); + + // sum += v * k; + sum[0] += v * k[0]; + sum[1] += v * k[1]; + + sx += dilation_w; + wx += 1; + } + + sy += dilation_h; + } + + sz += 1; + } +#else + int w_offset = gz * channels_g * kernel_w * kernel_h; + int 
v_offset_0 = gg * channels_g * psc(cstep); + + for (int z = 0; z < channels_g; z++) + { + int v_offset = v_offset_0 + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + afp v = buffer_ld1(bottom_blob_data, v_offset + x * dilation_w); + + afpvec8 k = buffer_ld8(weight_data, w_offset + x); + + // sum += v * k; + sum[0] += v * k[0]; + sum[1] += v * k[1]; + } + + v_offset += dilation_h * psc(w); + w_offset += kernel_w; + } + + v_offset_0 += psc(cstep); + } +#endif + + if (activation_type == 1) + { + sum[0] = max(sum[0], afp(0.f)); + sum[1] = max(sum[1], afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum[0] = mix(sum[0], sum[0] * afp(slope), lessThan(sum[0], afpvec4(0.f))); + sum[1] = mix(sum[1], sum[1] * afp(slope), lessThan(sum[1], afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum[0] = clamp(sum[0], const_min, const_max); + sum[1] = clamp(sum[1], const_min, const_max); + } + if (activation_type == 4) + { + sum[0] = afp(1.f) / (afp(1.f) + exp(-sum[0])); + sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1])); + } + if (activation_type == 5) + { + sum[0] = sum[0] * tanh(log(exp(sum[0]) + afp(1.f))); + sum[1] = sum[1] * tanh(log(exp(sum[1]) + afp(1.f))); + } + +#if NCNN_image_shader + image3d_st8(top_blob, ivec3(gx, gy, gz), sum); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st8(top_blob_data, gi, sum); +#endif +} diff --git a/source/device/vulkan/shaders/convolutiondepthwise_group_pack4.comp b/source/device/vulkan/shaders/convolutiondepthwise_group_pack4.comp new file mode 100644 index 000000000..2c9661fa6 --- /dev/null +++ b/source/device/vulkan/shaders/convolutiondepthwise_group_pack4.comp @@ -0,0 +1,214 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
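+
+// grouped convolution, pack4 input -> pack4 output: the weights form a 4x4 matrix per
+// kernel tap, stored as four sfpvec4 columns when f16mat4 is unavailable and as
+// sfpmat4 otherwise, and applied as sum += v * k.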
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int group = 1; +layout (constant_id = 8) const int activation_type = 0; +layout (constant_id = 9) const float activation_param_0 = 0; +layout (constant_id = 10) const float activation_param_1 = 0; + +#define shape_constant_id_offset 11 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic) +// GL_EXT_shader_16bit_storage does not define f16mat4 type :( +layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; +#else +layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; }; +#endif +layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec4 sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld4(bias_blob, gz); +#else + sum = buffer_ld4(bias_data, gz); +#endif + } + else + { + sum = afpvec4(0.f); + } + + // group convolution + const int channels_g = psc(c) / group; + const int num_output_g = psc(outc) / group; + + // group id + const int gg = gz / num_output_g; + +#if NCNN_image_shader + int sz = gg * channels_g; + + for (int z = 0; z < channels_g; z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v = image3d_ld4(bottom_blob, ivec3(sx, sy, sz)); + + afpmat4 k = afpmat4( + image3d_ld4(weight_blob, ivec3(wx + 0, z, gz)), + image3d_ld4(weight_blob, ivec3(wx + 1, 
z, gz)), + image3d_ld4(weight_blob, ivec3(wx + 2, z, gz)), + image3d_ld4(weight_blob, ivec3(wx + 3, z, gz)) + ); + + sum += v * k; + + sx += dilation_w; + wx += 4; + } + + sy += dilation_h; + } + + sz += 1; + } +#else + int w_offset = gz * channels_g * kernel_w * kernel_h; + int v_offset_0 = gg * channels_g * psc(cstep); + + for (int z = 0; z < channels_g; z++) + { + int v_offset = v_offset_0 + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v = buffer_ld4(bottom_blob_data, v_offset + x * dilation_w); + +#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic) + // GL_EXT_shader_16bit_storage does not define f16mat4 type :( + afpmat4 k = afpmat4( + buffer_ld4(weight_data, (w_offset + x) * 4 + 0), + buffer_ld4(weight_data, (w_offset + x) * 4 + 1), + buffer_ld4(weight_data, (w_offset + x) * 4 + 2), + buffer_ld4(weight_data, (w_offset + x) * 4 + 3) + ); +#else + afpmat4 k = afpmat4(weight_data[w_offset + x]); +#endif + + sum += v * k; + } + + v_offset += dilation_h * psc(w); + w_offset += kernel_w; + } + + v_offset_0 += psc(cstep); + } +#endif + + if (activation_type == 1) + { + sum = max(sum, afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum = mix(sum, sum * afp(slope), lessThan(sum, afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum = clamp(sum, const_min, const_max); + } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } + if (activation_type == 5) + { + sum = sum * tanh(log(exp(sum) + afp(1.f))); + } + +#if NCNN_image_shader + image3d_st4(top_blob, ivec3(gx, gy, gz), sum); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st4(top_blob_data, gi, sum); +#endif +} diff --git a/source/device/vulkan/shaders/convolutiondepthwise_group_pack4to1.comp b/source/device/vulkan/shaders/convolutiondepthwise_group_pack4to1.comp new file mode 100644 index 000000000..7871cccb1 --- /dev/null +++ b/source/device/vulkan/shaders/convolutiondepthwise_group_pack4to1.comp @@ -0,0 +1,194 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
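+
+// grouped convolution, pack4 input -> pack1 output: each output scalar accumulates
+// dot(v, k) over the kernel window and the group's input channels.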
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int group = 1; +layout (constant_id = 8) const int activation_type = 0; +layout (constant_id = 9) const float activation_param_0 = 0; +layout (constant_id = 10) const float activation_param_1 = 0; + +#define shape_constant_id_offset 11 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afp sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld1(bias_blob, gz); +#else + sum = buffer_ld1(bias_data, gz); +#endif + } + else + { + sum = afp(0.f); + } + + // group convolution + const int channels_g = psc(c) / group; + const int num_output_g = psc(outc) / group; + + // group id + const int gg = gz / num_output_g; + +#if NCNN_image_shader + int sz = gg * channels_g; + + for (int z = 0; z < channels_g; z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v = image3d_ld4(bottom_blob, ivec3(sx, sy, sz)); + + afpvec4 k = image3d_ld4(weight_blob, ivec3(wx, z, gz)); + + sum += dot(v, k); + + sx += dilation_w; + wx += 1; + } + + sy += dilation_h; + } + + sz += 1; + } +#else + int w_offset = gz * channels_g * kernel_w * kernel_h; + int v_offset_0 = gg * channels_g * psc(cstep); + + for (int z = 0; z < channels_g; z++) + { + int v_offset = v_offset_0 + gy * 
stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v = buffer_ld4(bottom_blob_data, v_offset + x * dilation_w); + + afpvec4 k = buffer_ld4(weight_data, w_offset + x); + + sum += dot(v, k); + } + + v_offset += dilation_h * psc(w); + w_offset += kernel_w; + } + + v_offset_0 += psc(cstep); + } +#endif + + if (activation_type == 1) + { + sum = max(sum, afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum = sum < afp(0.f) ? sum * slope : sum; + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum = clamp(sum, const_min, const_max); + } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } + if (activation_type == 5) + { + sum = sum * tanh(log(exp(sum) + afp(1.f))); + } + +#if NCNN_image_shader + image3d_st1(top_blob, ivec3(gx, gy, gz), sum); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st1(top_blob_data, gi, sum); +#endif +} diff --git a/source/device/vulkan/shaders/convolutiondepthwise_group_pack4to8.comp b/source/device/vulkan/shaders/convolutiondepthwise_group_pack4to8.comp new file mode 100644 index 000000000..f369a244d --- /dev/null +++ b/source/device/vulkan/shaders/convolutiondepthwise_group_pack4to8.comp @@ -0,0 +1,230 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
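+
+// grouped convolution, pack4 input -> pack8 output: eight sfpvec4 weight vectors per
+// kernel tap, one per output lane; lane n accumulates dot(v, kn).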
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int group = 1; +layout (constant_id = 8) const int activation_type = 0; +layout (constant_id = 9) const float activation_param_0 = 0; +layout (constant_id = 10) const float activation_param_1 = 0; + +#define shape_constant_id_offset 11 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec8 sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld8(bias_blob, gz); +#else + sum = buffer_ld8(bias_data, gz); +#endif + } + else + { + sum = afpvec8(afpvec4(0.f), afpvec4(0.f)); + } + + // group convolution + const int channels_g = psc(c) / group; + const int num_output_g = psc(outc) / group; + + // group id + const int gg = gz / num_output_g; + +#if NCNN_image_shader + int sz = gg * channels_g; + + for (int z = 0; z < channels_g; z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v = image3d_ld4(bottom_blob, ivec3(sx, sy, sz)); + + afpvec4 k0 = image3d_ld4(weight_blob, ivec3(wx + 0, z, gz)); + afpvec4 k1 = image3d_ld4(weight_blob, ivec3(wx + 1, z, gz)); + afpvec4 k2 = image3d_ld4(weight_blob, ivec3(wx + 2, z, gz)); + afpvec4 k3 = image3d_ld4(weight_blob, ivec3(wx + 3, z, gz)); + afpvec4 k4 = 
image3d_ld4(weight_blob, ivec3(wx + 4, z, gz)); + afpvec4 k5 = image3d_ld4(weight_blob, ivec3(wx + 5, z, gz)); + afpvec4 k6 = image3d_ld4(weight_blob, ivec3(wx + 6, z, gz)); + afpvec4 k7 = image3d_ld4(weight_blob, ivec3(wx + 7, z, gz)); + + // sum += v * k; + sum[0].r += dot(v, k0); + sum[0].g += dot(v, k1); + sum[0].b += dot(v, k2); + sum[0].a += dot(v, k3); + sum[1].r += dot(v, k4); + sum[1].g += dot(v, k5); + sum[1].b += dot(v, k6); + sum[1].a += dot(v, k7); + + sx += dilation_w; + wx += 8; + } + + sy += dilation_h; + } + + sz += 1; + } +#else + int w_offset = gz * channels_g * kernel_w * kernel_h; + int v_offset_0 = gg * channels_g * psc(cstep); + + for (int z = 0; z < channels_g; z++) + { + int v_offset = v_offset_0 + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v = buffer_ld4(bottom_blob_data, v_offset + x * dilation_w); + + afpvec4 k0 = buffer_ld4(weight_data, (w_offset + x) * 8 + 0); + afpvec4 k1 = buffer_ld4(weight_data, (w_offset + x) * 8 + 1); + afpvec4 k2 = buffer_ld4(weight_data, (w_offset + x) * 8 + 2); + afpvec4 k3 = buffer_ld4(weight_data, (w_offset + x) * 8 + 3); + afpvec4 k4 = buffer_ld4(weight_data, (w_offset + x) * 8 + 4); + afpvec4 k5 = buffer_ld4(weight_data, (w_offset + x) * 8 + 5); + afpvec4 k6 = buffer_ld4(weight_data, (w_offset + x) * 8 + 6); + afpvec4 k7 = buffer_ld4(weight_data, (w_offset + x) * 8 + 7); + + // sum += v * k; + sum[0].r += dot(v, k0); + sum[0].g += dot(v, k1); + sum[0].b += dot(v, k2); + sum[0].a += dot(v, k3); + sum[1].r += dot(v, k4); + sum[1].g += dot(v, k5); + sum[1].b += dot(v, k6); + sum[1].a += dot(v, k7); + } + + v_offset += dilation_h * psc(w); + w_offset += kernel_w; + } + + v_offset_0 += psc(cstep); + } +#endif + + if (activation_type == 1) + { + sum[0] = max(sum[0], afp(0.f)); + sum[1] = max(sum[1], afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum[0] = mix(sum[0], sum[0] * afp(slope), lessThan(sum[0], afpvec4(0.f))); + sum[1] = mix(sum[1], sum[1] * afp(slope), lessThan(sum[1], afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum[0] = clamp(sum[0], const_min, const_max); + sum[1] = clamp(sum[1], const_min, const_max); + } + if (activation_type == 4) + { + sum[0] = afp(1.f) / (afp(1.f) + exp(-sum[0])); + sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1])); + } + if (activation_type == 5) + { + sum[0] = sum[0] * tanh(log(exp(sum[0]) + afp(1.f))); + sum[1] = sum[1] * tanh(log(exp(sum[1]) + afp(1.f))); + } + +#if NCNN_image_shader + image3d_st8(top_blob, ivec3(gx, gy, gz), sum); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st8(top_blob_data, gi, sum); +#endif +} diff --git a/source/device/vulkan/shaders/convolutiondepthwise_group_pack8.comp b/source/device/vulkan/shaders/convolutiondepthwise_group_pack8.comp new file mode 100644 index 000000000..abd16aed8 --- /dev/null +++ b/source/device/vulkan/shaders/convolutiondepthwise_group_pack8.comp @@ -0,0 +1,230 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int group = 1; +layout (constant_id = 8) const int activation_type = 0; +layout (constant_id = 9) const float activation_param_0 = 0; +layout (constant_id = 10) const float activation_param_1 = 0; + +#define shape_constant_id_offset 11 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec8 sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld8(bias_blob, gz); +#else + sum = buffer_ld8(bias_data, gz); +#endif + } + else + { + sum = afpvec8(afpvec4(0.f), afpvec4(0.f)); + } + + // group convolution + const int channels_g = psc(c) / group; + const int num_output_g = psc(outc) / group; + + // group id + const int gg = gz / num_output_g; + +#if NCNN_image_shader + int sz = gg * channels_g; + + for (int z = 0; z < channels_g; z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < 
kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx, sy, sz)); + + afpvec8 k0 = image3d_ld8(weight_blob, ivec3(wx + 0, z, gz)); + afpvec8 k1 = image3d_ld8(weight_blob, ivec3(wx + 1, z, gz)); + afpvec8 k2 = image3d_ld8(weight_blob, ivec3(wx + 2, z, gz)); + afpvec8 k3 = image3d_ld8(weight_blob, ivec3(wx + 3, z, gz)); + afpvec8 k4 = image3d_ld8(weight_blob, ivec3(wx + 4, z, gz)); + afpvec8 k5 = image3d_ld8(weight_blob, ivec3(wx + 5, z, gz)); + afpvec8 k6 = image3d_ld8(weight_blob, ivec3(wx + 6, z, gz)); + afpvec8 k7 = image3d_ld8(weight_blob, ivec3(wx + 7, z, gz)); + + // sum += v * k + sum[0].r += dot(v[0], k0[0]) + dot(v[1], k0[1]); + sum[0].g += dot(v[0], k1[0]) + dot(v[1], k1[1]); + sum[0].b += dot(v[0], k2[0]) + dot(v[1], k2[1]); + sum[0].a += dot(v[0], k3[0]) + dot(v[1], k3[1]); + sum[1].r += dot(v[0], k4[0]) + dot(v[1], k4[1]); + sum[1].g += dot(v[0], k5[0]) + dot(v[1], k5[1]); + sum[1].b += dot(v[0], k6[0]) + dot(v[1], k6[1]); + sum[1].a += dot(v[0], k7[0]) + dot(v[1], k7[1]); + + sx += dilation_w; + wx += 8; + } + + sy += dilation_h; + } + + sz += 1; + } +#else + int w_offset = gz * channels_g * kernel_w * kernel_h; + int v_offset_0 = gg * channels_g * psc(cstep); + + for (int z = 0; z < channels_g; z++) + { + int v_offset = v_offset_0 + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v = buffer_ld8(bottom_blob_data, v_offset + x * dilation_w); + + afpvec8 k0 = buffer_ld8(weight_data, (w_offset + x) * 8 + 0); + afpvec8 k1 = buffer_ld8(weight_data, (w_offset + x) * 8 + 1); + afpvec8 k2 = buffer_ld8(weight_data, (w_offset + x) * 8 + 2); + afpvec8 k3 = buffer_ld8(weight_data, (w_offset + x) * 8 + 3); + afpvec8 k4 = buffer_ld8(weight_data, (w_offset + x) * 8 + 4); + afpvec8 k5 = buffer_ld8(weight_data, (w_offset + x) * 8 + 5); + afpvec8 k6 = buffer_ld8(weight_data, (w_offset + x) * 8 + 6); + afpvec8 k7 = buffer_ld8(weight_data, (w_offset + x) * 8 + 7); + + // sum += v * k + sum[0].r += dot(v[0], k0[0]) + dot(v[1], k0[1]); + sum[0].g += dot(v[0], k1[0]) + dot(v[1], k1[1]); + sum[0].b += dot(v[0], k2[0]) + dot(v[1], k2[1]); + sum[0].a += dot(v[0], k3[0]) + dot(v[1], k3[1]); + sum[1].r += dot(v[0], k4[0]) + dot(v[1], k4[1]); + sum[1].g += dot(v[0], k5[0]) + dot(v[1], k5[1]); + sum[1].b += dot(v[0], k6[0]) + dot(v[1], k6[1]); + sum[1].a += dot(v[0], k7[0]) + dot(v[1], k7[1]); + } + + v_offset += dilation_h * psc(w); + w_offset += kernel_w; + } + + v_offset_0 += psc(cstep); + } +#endif + + if (activation_type == 1) + { + sum[0] = max(sum[0], afp(0.f)); + sum[1] = max(sum[1], afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum[0] = mix(sum[0], sum[0] * afp(slope), lessThan(sum[0], afpvec4(0.f))); + sum[1] = mix(sum[1], sum[1] * afp(slope), lessThan(sum[1], afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum[0] = clamp(sum[0], const_min, const_max); + sum[1] = clamp(sum[1], const_min, const_max); + } + if (activation_type == 4) + { + sum[0] = afp(1.f) / (afp(1.f) + exp(-sum[0])); + sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1])); + } + if (activation_type == 5) + { + sum[0] = sum[0] * tanh(log(exp(sum[0]) + afp(1.f))); + sum[1] = sum[1] * tanh(log(exp(sum[1]) + afp(1.f))); + } + +#if NCNN_image_shader + image3d_st8(top_blob, ivec3(gx, gy, gz), sum); +#else + const int gi = gz * 
psc(outcstep) + gy * psc(outw) + gx; + + buffer_st8(top_blob_data, gi, sum); +#endif +} diff --git a/source/device/vulkan/shaders/convolutiondepthwise_group_pack8to1.comp b/source/device/vulkan/shaders/convolutiondepthwise_group_pack8to1.comp new file mode 100644 index 000000000..c77771154 --- /dev/null +++ b/source/device/vulkan/shaders/convolutiondepthwise_group_pack8to1.comp @@ -0,0 +1,197 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int group = 1; +layout (constant_id = 8) const int activation_type = 0; +layout (constant_id = 9) const float activation_param_0 = 0; +layout (constant_id = 10) const float activation_param_1 = 0; + +#define shape_constant_id_offset 11 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + 
int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afp sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld1(bias_blob, gz); +#else + sum = buffer_ld1(bias_data, gz); +#endif + } + else + { + sum = afp(0.f); + } + + // group convolution + const int channels_g = psc(c) / group; + const int num_output_g = psc(outc) / group; + + // group id + const int gg = gz / num_output_g; + +#if NCNN_image_shader + int sz = gg * channels_g; + + for (int z = 0; z < channels_g; z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx, sy, sz)); + + afpvec8 k = image3d_ld8(weight_blob, ivec3(wx, z, gz)); + + // sum += dot(v, k); + sum += dot(v[0], k[0]) + dot(v[1], k[1]); + + sx += dilation_w; + wx += 1; + } + + sy += dilation_h; + } + + sz += 1; + } +#else + int w_offset = gz * channels_g * kernel_w * kernel_h; + int v_offset_0 = gg * channels_g * psc(cstep); + + for (int z = 0; z < channels_g; z++) + { + int v_offset = v_offset_0 + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v = buffer_ld8(bottom_blob_data, v_offset + x * dilation_w); + + afpvec8 k = buffer_ld8(weight_data, w_offset + x); + + // sum += dot(v, k); + sum += dot(v[0], k[0]) + dot(v[1], k[1]); + } + + v_offset += dilation_h * psc(w); + w_offset += kernel_w; + } + + v_offset_0 += psc(cstep); + } +#endif + + if (activation_type == 1) + { + sum = max(sum, afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum = sum < afp(0.f) ? sum * slope : sum; + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum = clamp(sum, const_min, const_max); + } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } + if (activation_type == 5) + { + sum = sum * tanh(log(exp(sum) + afp(1.f))); + } + +#if NCNN_image_shader + image3d_st1(top_blob, ivec3(gx, gy, gz), sum); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st1(top_blob_data, gi, sum); +#endif +} diff --git a/source/device/vulkan/shaders/convolutiondepthwise_group_pack8to4.comp b/source/device/vulkan/shaders/convolutiondepthwise_group_pack8to4.comp new file mode 100644 index 000000000..9c9f43a89 --- /dev/null +++ b/source/device/vulkan/shaders/convolutiondepthwise_group_pack8to4.comp @@ -0,0 +1,209 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
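+// Overview of the shader below: grouped convolution reading pack8 input channels and
+// writing pack4 output channels. Each invocation owns one output texel (gx, gy, gz);
+// for every input channel of its group it loads an 8-wide input vector v and four
+// 8-wide kernel vectors k0..k3, and the output lanes sum.r/g/b/a accumulate
+// dot(v, k0) .. dot(v, k3), expressed as dot(v[0], k[0]) + dot(v[1], k[1]) because
+// afpvec8 is handled as two vec4 halves. In the buffer path the four kernel vectors
+// for one tap sit at consecutive indices (w_offset + x) * 4 + {0, 1, 2, 3}.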
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int group = 1; +layout (constant_id = 8) const int activation_type = 0; +layout (constant_id = 9) const float activation_param_0 = 0; +layout (constant_id = 10) const float activation_param_1 = 0; + +#define shape_constant_id_offset 11 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec4 sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld4(bias_blob, gz); +#else + sum = buffer_ld4(bias_data, gz); +#endif + } + else + { + sum = afpvec4(0.f); + } + + // group convolution + const int channels_g = psc(c) / group; + const int num_output_g = psc(outc) / group; + + // group id + const int gg = gz / num_output_g; + +#if NCNN_image_shader + int sz = gg * channels_g; + + for (int z = 0; z < channels_g; z++) + { + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx, sy, sz)); + + afpvec8 k0 = image3d_ld8(weight_blob, ivec3(wx + 0, z, gz)); + afpvec8 k1 = image3d_ld8(weight_blob, ivec3(wx + 1, z, gz)); + afpvec8 k2 = image3d_ld8(weight_blob, ivec3(wx + 2, z, gz)); + afpvec8 k3 = image3d_ld8(weight_blob, ivec3(wx + 3, z, gz)); + + // sum += v * k + sum.r += dot(v[0], 
k0[0]) + dot(v[1], k0[1]); + sum.g += dot(v[0], k1[0]) + dot(v[1], k1[1]); + sum.b += dot(v[0], k2[0]) + dot(v[1], k2[1]); + sum.a += dot(v[0], k3[0]) + dot(v[1], k3[1]); + + sx += dilation_w; + wx += 4; + } + + sy += dilation_h; + } + + sz += 1; + } +#else + int w_offset = gz * channels_g * kernel_w * kernel_h; + int v_offset_0 = gg * channels_g * psc(cstep); + + for (int z = 0; z < channels_g; z++) + { + int v_offset = v_offset_0 + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v = buffer_ld8(bottom_blob_data, v_offset + x * dilation_w); + + afpvec8 k0 = buffer_ld8(weight_data, (w_offset + x) * 4 + 0); + afpvec8 k1 = buffer_ld8(weight_data, (w_offset + x) * 4 + 1); + afpvec8 k2 = buffer_ld8(weight_data, (w_offset + x) * 4 + 2); + afpvec8 k3 = buffer_ld8(weight_data, (w_offset + x) * 4 + 3); + + // sum += v * k + sum.r += dot(v[0], k0[0]) + dot(v[1], k0[1]); + sum.g += dot(v[0], k1[0]) + dot(v[1], k1[1]); + sum.b += dot(v[0], k2[0]) + dot(v[1], k2[1]); + sum.a += dot(v[0], k3[0]) + dot(v[1], k3[1]); + } + + v_offset += dilation_h * psc(w); + w_offset += kernel_w; + } + + v_offset_0 += psc(cstep); + } +#endif + + if (activation_type == 1) + { + sum = max(sum, afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum = mix(sum, sum * afp(slope), lessThan(sum, afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum = clamp(sum, const_min, const_max); + } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } + if (activation_type == 5) + { + sum = sum * tanh(log(exp(sum) + afp(1.f))); + } + +#if NCNN_image_shader + image3d_st4(top_blob, ivec3(gx, gy, gz), sum); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st4(top_blob_data, gi, sum); +#endif +} diff --git a/source/device/vulkan/shaders/convolutiondepthwise_pack4.comp b/source/device/vulkan/shaders/convolutiondepthwise_pack4.comp new file mode 100644 index 000000000..0bd4929bf --- /dev/null +++ b/source/device/vulkan/shaders/convolutiondepthwise_pack4.comp @@ -0,0 +1,178 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
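+// Overview of the shader below: depth-wise convolution on pack4 data. Each output
+// channel group gz reads only its own input channel group, so the kernel window is
+// walked once and the element-wise product v * k is accumulated into a single
+// afpvec4 sum; there is no reduction across channels as in the grouped variants.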
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int group = 1; +layout (constant_id = 8) const int activation_type = 0; +layout (constant_id = 9) const float activation_param_0 = 0; +layout (constant_id = 10) const float activation_param_1 = 0; + +#define shape_constant_id_offset 11 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler2D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec4 sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld4(bias_blob, gz); +#else + sum = buffer_ld4(bias_data, gz); +#endif + } + else + { + sum = afpvec4(0.f); + } + + // depth-wise convolution +#if NCNN_image_shader + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v = image3d_ld4(bottom_blob, ivec3(sx, sy, gz)); + + afpvec4 k = image2d_ld4(weight_blob, ivec2(wx, gz)); + + sum += v * k; + + sx += dilation_w; + wx += 1; + } + + sy += dilation_h; + } +#else + int w_offset = gz * kernel_w * kernel_h; + int v_offset = gz * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v = buffer_ld4(bottom_blob_data, v_offset + x * dilation_w); + + afpvec4 k = 
buffer_ld4(weight_data, w_offset + x); + + sum += v * k; + } + + v_offset += dilation_h * psc(w); + w_offset += kernel_w; + } +#endif + + if (activation_type == 1) + { + sum = max(sum, afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum = mix(sum, sum * afp(slope), lessThan(sum, afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum = clamp(sum, const_min, const_max); + } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } + if (activation_type == 5) + { + sum = sum * tanh(log(exp(sum) + afp(1.f))); + } + +#if NCNN_image_shader + image3d_st4(top_blob, ivec3(gx, gy, gz), sum); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st4(top_blob_data, gi, sum); +#endif +} diff --git a/source/device/vulkan/shaders/convolutiondepthwise_pack8.comp b/source/device/vulkan/shaders/convolutiondepthwise_pack8.comp new file mode 100644 index 000000000..d19c97053 --- /dev/null +++ b/source/device/vulkan/shaders/convolutiondepthwise_pack8.comp @@ -0,0 +1,191 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
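+// Overview of the shader below: depth-wise convolution on pack8 data, structurally
+// the same as the pack4 variant. The 8-wide vectors are processed as two vec4
+// halves, so the multiply-accumulate and every activation branch are applied to
+// sum[0] and sum[1] separately.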
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int kernel_h = 1; +layout (constant_id = 2) const int dilation_w = 1; +layout (constant_id = 3) const int dilation_h = 1; +layout (constant_id = 4) const int stride_w = 1; +layout (constant_id = 5) const int stride_h = 1; +layout (constant_id = 6) const int bias_term = 0; +layout (constant_id = 7) const int group = 1; +layout (constant_id = 8) const int activation_type = 0; +layout (constant_id = 9) const float activation_param_0 = 0; +layout (constant_id = 10) const float activation_param_1 = 0; + +#define shape_constant_id_offset 11 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler2D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec8 sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld8(bias_blob, gz); +#else + sum = buffer_ld8(bias_data, gz); +#endif + } + else + { + sum = afpvec8(afpvec4(0.f), afpvec4(0.f)); + } + + // depth-wise convolution +#if NCNN_image_shader + int sy = gy * stride_h; + int wx = 0; + + for (int y = 0; y < kernel_h; y++) + { + int sx = gx * stride_w; + + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx, sy, gz)); + + afpvec8 k = image2d_ld8(weight_blob, ivec2(wx, gz)); + + // sum += v * k; + sum[0] += v[0] * k[0]; + sum[1] += v[1] * k[1]; + + sx += dilation_w; + wx += 1; + } + + sy += dilation_h; + } +#else + int w_offset = gz * kernel_w * kernel_h; + int v_offset = gz * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x 
= 0; x < kernel_w; x++) + { + afpvec8 v = buffer_ld8(bottom_blob_data, v_offset + x * dilation_w); + + afpvec8 k = buffer_ld8(weight_data, w_offset + x); + + // sum += v * k; + sum[0] += v[0] * k[0]; + sum[1] += v[1] * k[1]; + } + + v_offset += dilation_h * psc(w); + w_offset += kernel_w; + } +#endif + + if (activation_type == 1) + { + sum[0] = max(sum[0], afp(0.f)); + sum[1] = max(sum[1], afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum[0] = mix(sum[0], sum[0] * afp(slope), lessThan(sum[0], afpvec4(0.f))); + sum[1] = mix(sum[1], sum[1] * afp(slope), lessThan(sum[1], afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum[0] = clamp(sum[0], const_min, const_max); + sum[1] = clamp(sum[1], const_min, const_max); + } + if (activation_type == 4) + { + sum[0] = afp(1.f) / (afp(1.f) + exp(-sum[0])); + sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1])); + } + if (activation_type == 5) + { + sum[0] = sum[0] * tanh(log(exp(sum[0]) + afp(1.f))); + sum[1] = sum[1] * tanh(log(exp(sum[1]) + afp(1.f))); + } + +#if NCNN_image_shader + image3d_st8(top_blob, ivec3(gx, gy, gz), sum); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + // sum = afpvec8(afpvec4(gi), afpvec4(gi)); + // sum = buffer_ld8(bias_data, gz); + + buffer_st8(top_blob_data, gi, sum); +#endif +} diff --git a/source/device/vulkan/shaders/crop.comp b/source/device/vulkan/shaders/crop.comp new file mode 100644 index 000000000..234983eb0 --- /dev/null +++ b/source/device/vulkan/shaders/crop.comp @@ -0,0 +1,92 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
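+// Overview of the shader below: crop is an offset copy. For an output element
+// (gx, gy, gz) the source element is (gx + woffset, gy + hoffset, gz + coffset);
+// in the buffer path this is
+//     v_offset = (gz + coffset) * cstep + (gy + hoffset) * w + (gx + woffset)
+//     gi       = gz * outcstep + gy * outw + gx
+// and a single sfp value is copied per invocation.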
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int bugihfa = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; + + int woffset; + int hoffset; + int coffset; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + int x = gx + p.woffset; + int y = gy + p.hoffset; + int z = gz + p.coffset; + +#if NCNN_image_shader + image3d_cp1(top_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(x, y, z)); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + int v_offset = z * psc(cstep) + y * psc(w) + x; + + buffer_cp1(top_blob_data, gi, bottom_blob_data, v_offset); +#endif +} diff --git a/source/device/vulkan/shaders/crop_pack1to4.comp b/source/device/vulkan/shaders/crop_pack1to4.comp new file mode 100644 index 000000000..27056a11c --- /dev/null +++ b/source/device/vulkan/shaders/crop_pack1to4.comp @@ -0,0 +1,98 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
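+// Overview of the shader below: crop that also repacks. Four consecutive pack1 input
+// channels starting at z = gz * 4 + coffset are gathered into one pack4 output
+// element; the buffer path forms the four source indices at once as an ivec4 by
+// adding ivec4(0, 1, 2, 3) * cstep.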
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int bugihfa = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; + + int woffset; + int hoffset; + int coffset; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + int x = gx + p.woffset; + int y = gy + p.hoffset; + int z = gz * 4 + p.coffset; + +#if NCNN_image_shader + afpvec4 v; + v.r = image3d_ld1(bottom_blob, ivec3(x, y, z + 0)); + v.g = image3d_ld1(bottom_blob, ivec3(x, y, z + 1)); + v.b = image3d_ld1(bottom_blob, ivec3(x, y, z + 2)); + v.a = image3d_ld1(bottom_blob, ivec3(x, y, z + 3)); + + image3d_st4(top_blob, ivec3(gx, gy, gz), v); +#else + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + ivec4 v_offset = z * psc(cstep) + y * psc(w) + x + ivec4(0, 1, 2, 3) * psc(cstep); + + buffer_cp1to4(top_blob_data, gi, bottom_blob_data, v_offset); +#endif +} diff --git a/source/device/vulkan/shaders/crop_pack1to8.comp b/source/device/vulkan/shaders/crop_pack1to8.comp new file mode 100644 index 000000000..5116a6995 --- /dev/null +++ b/source/device/vulkan/shaders/crop_pack1to8.comp @@ -0,0 +1,104 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
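+// Overview of the shader below: same idea as the pack1-to-pack4 crop, but eight
+// consecutive pack1 channels starting at z = gz * 8 + coffset are gathered into one
+// pack8 output element; the buffer path uses two ivec4 index groups, v_offset and
+// vv_offset = v_offset + 4 * cstep, one per vec4 half.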
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int bugihfa = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; + + int woffset; + int hoffset; + int coffset; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + int x = gx + p.woffset; + int y = gy + p.hoffset; + int z = gz * 8 + p.coffset; + +#if NCNN_image_shader + afpvec8 v; + v[0].r = image3d_ld1(bottom_blob, ivec3(x, y, z + 0)); + v[0].g = image3d_ld1(bottom_blob, ivec3(x, y, z + 1)); + v[0].b = image3d_ld1(bottom_blob, ivec3(x, y, z + 2)); + v[0].a = image3d_ld1(bottom_blob, ivec3(x, y, z + 3)); + v[1].r = image3d_ld1(bottom_blob, ivec3(x, y, z + 4)); + v[1].g = image3d_ld1(bottom_blob, ivec3(x, y, z + 5)); + v[1].b = image3d_ld1(bottom_blob, ivec3(x, y, z + 6)); + v[1].a = image3d_ld1(bottom_blob, ivec3(x, y, z + 7)); + + image3d_st8(top_blob, ivec3(gx, gy, gz), v); +#else + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + ivec4 v_offset = z * psc(cstep) + y * psc(w) + x + ivec4(0, 1, 2, 3) * psc(cstep); + ivec4 vv_offset = v_offset + 4 * psc(cstep); + + buffer_cp1to8(top_blob_data, gi, bottom_blob_data, v_offset, vv_offset); +#endif +} diff --git a/source/device/vulkan/shaders/crop_pack4.comp b/source/device/vulkan/shaders/crop_pack4.comp new file mode 100644 index 000000000..d9262c217 --- /dev/null +++ b/source/device/vulkan/shaders/crop_pack4.comp @@ -0,0 +1,92 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int bugihfa = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; + + int woffset; + int hoffset; + int coffset; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + int x = gx + p.woffset; + int y = gy + p.hoffset; + int z = gz + p.coffset; + +#if NCNN_image_shader + image3d_cp4(top_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(x, y, z)); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + int v_offset = z * psc(cstep) + y * psc(w) + x; + + buffer_cp4(top_blob_data, gi, bottom_blob_data, v_offset); +#endif +} diff --git a/source/device/vulkan/shaders/crop_pack4to1.comp b/source/device/vulkan/shaders/crop_pack4to1.comp new file mode 100644 index 000000000..69bf5069c --- /dev/null +++ b/source/device/vulkan/shaders/crop_pack4to1.comp @@ -0,0 +1,107 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int bugihfa = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +#else +#if NCNN_fp16_packed +layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; }; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +#endif +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; + + int woffset; + int hoffset; + int coffset; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + int x = gx + p.woffset; + int y = gy + p.hoffset; + int z = gz + p.coffset; + +#if NCNN_image_shader + afpvec4 v = image3d_ld4(bottom_blob, ivec3(x, y, z / 4)); + + image3d_st1(top_blob, ivec3(gx, gy, gz), v[z % 4]); +#else + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + +#if NCNN_fp16_packed + int v_offset = ((z / 4) * psc(cstep) + y * psc(w) + x) * 2 + (z % 4) / 2; + int lane2 = z % 2; + + afpvec2 v = buffer_ld2(bottom_blob_data, v_offset); + + buffer_st1(top_blob_data, gi, v[lane2]); +#else + int v_offset = ((z / 4) * psc(cstep) + y * psc(w) + x) * 4 + z % 4; + + buffer_cp1(top_blob_data, gi, bottom_blob_data, v_offset); +#endif +#endif +} diff --git a/source/device/vulkan/shaders/crop_pack4to8.comp b/source/device/vulkan/shaders/crop_pack4to8.comp new file mode 100644 index 000000000..6b46cdf26 --- /dev/null +++ b/source/device/vulkan/shaders/crop_pack4to8.comp @@ -0,0 +1,182 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int bugihfa = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +#else +#if NCNN_fp16_packed +layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; }; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +#endif +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; + + int woffset; + int hoffset; + int coffset; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + int x = gx + p.woffset; + int y = gy + p.hoffset; + ivec4 z4 = gz * 8 + p.coffset + ivec4(0, 1, 2, 3); + ivec4 zz4 = z4 + 4; + +#if NCNN_image_shader + afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(x, y, z4.r / 4)); + afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(x, y, z4.g / 4)); + afpvec4 v2 = image3d_ld4(bottom_blob, ivec3(x, y, z4.b / 4)); + afpvec4 v3 = image3d_ld4(bottom_blob, ivec3(x, y, z4.a / 4)); + afpvec4 v4 = image3d_ld4(bottom_blob, ivec3(x, y, zz4.r / 4)); + afpvec4 v5 = image3d_ld4(bottom_blob, ivec3(x, y, zz4.g / 4)); + afpvec4 v6 = image3d_ld4(bottom_blob, ivec3(x, y, zz4.b / 4)); + afpvec4 v7 = image3d_ld4(bottom_blob, ivec3(x, y, zz4.a / 4)); + + afpvec8 v; +#if NCNN_fp16_arithmetic + if (bugihfa == 1) + { + ivec4 z4m4 = z4 % 4; + ivec4 zz4m4 = zz4 % 4; + + if (z4m4.r == 0) v[0].r = v0.r; + if (z4m4.r == 1) v[0].r = v0.g; + if (z4m4.r == 2) v[0].r = v0.b; + if (z4m4.r == 3) v[0].r = v0.a; + if (z4m4.g == 0) v[0].g = v1.r; + if (z4m4.g == 1) v[0].g = v1.g; + if (z4m4.g == 2) v[0].g = 
v1.b; + if (z4m4.g == 3) v[0].g = v1.a; + if (z4m4.b == 0) v[0].b = v2.r; + if (z4m4.b == 1) v[0].b = v2.g; + if (z4m4.b == 2) v[0].b = v2.b; + if (z4m4.b == 3) v[0].b = v2.a; + if (z4m4.a == 0) v[0].a = v3.r; + if (z4m4.a == 1) v[0].a = v3.g; + if (z4m4.a == 2) v[0].a = v3.b; + if (z4m4.a == 3) v[0].a = v3.a; + if (zz4m4.r == 0) v[1].r = v4.r; + if (zz4m4.r == 1) v[1].r = v4.g; + if (zz4m4.r == 2) v[1].r = v4.b; + if (zz4m4.r == 3) v[1].r = v4.a; + if (zz4m4.g == 0) v[1].g = v5.r; + if (zz4m4.g == 1) v[1].g = v5.g; + if (zz4m4.g == 2) v[1].g = v5.b; + if (zz4m4.g == 3) v[1].g = v5.a; + if (zz4m4.b == 0) v[1].b = v6.r; + if (zz4m4.b == 1) v[1].b = v6.g; + if (zz4m4.b == 2) v[1].b = v6.b; + if (zz4m4.b == 3) v[1].b = v6.a; + if (zz4m4.a == 0) v[1].a = v7.r; + if (zz4m4.a == 1) v[1].a = v7.g; + if (zz4m4.a == 2) v[1].a = v7.b; + if (zz4m4.a == 3) v[1].a = v7.a; + } + else +#endif + { + v[0].r = v0[z4.r % 4]; + v[0].g = v1[z4.g % 4]; + v[0].b = v2[z4.b % 4]; + v[0].a = v3[z4.a % 4]; + v[1].r = v4[zz4.r % 4]; + v[1].g = v5[zz4.g % 4]; + v[1].b = v6[zz4.b % 4]; + v[1].a = v7[zz4.a % 4]; + } + + image3d_st8(top_blob, ivec3(gx, gy, gz), v); +#else +#if NCNN_fp16_packed + ivec4 v_offset = ((z4 / 4) * psc(cstep) + y * psc(w) + x) * 2 + (z4 % 4) / 2; + ivec4 lane2 = z4 % 2; + ivec4 vv_offset = ((zz4 / 4) * psc(cstep) + y * psc(w) + x) * 2 + (zz4 % 4) / 2; + ivec4 lane4 = zz4 % 2; + + afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r); + afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g); + afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b); + afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a); + + afpvec2 vvr = buffer_ld2(bottom_blob_data, vv_offset.r); + afpvec2 vvg = buffer_ld2(bottom_blob_data, vv_offset.g); + afpvec2 vvb = buffer_ld2(bottom_blob_data, vv_offset.b); + afpvec2 vva = buffer_ld2(bottom_blob_data, vv_offset.a); + + afpvec8 v = afpvec8(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a], vvr[lane4.r], vvg[lane4.g], vvb[lane4.b], vva[lane4.a]); + + buffer_st8(top_blob_data, gi, v); +#else + ivec4 v_offset = ((z4 / 4) * psc(cstep) + y * psc(w) + x) * 4 + z4 % 4; + ivec4 vv_offset = ((zz4 / 4) * psc(cstep) + y * psc(w) + x) * 4 + zz4 % 4; + + buffer_cp1to8(top_blob_data, gi, bottom_blob_data, v_offset, vv_offset); +#endif +#endif +} diff --git a/source/device/vulkan/shaders/crop_pack8.comp b/source/device/vulkan/shaders/crop_pack8.comp new file mode 100644 index 000000000..3465c79fe --- /dev/null +++ b/source/device/vulkan/shaders/crop_pack8.comp @@ -0,0 +1,93 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
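+// Overview of the shader below: pack8-to-pack8 crop, a straight copy of one sfpvec8
+// element per invocation with the source index shifted by (woffset, hoffset, coffset).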
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int bugihfa = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; + + int woffset; + int hoffset; + int coffset; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + int x = gx + p.woffset; + int y = gy + p.hoffset; + int z = gz + p.coffset; + +#if NCNN_image_shader + image3d_cp8(top_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(x, y, z)); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + int v_offset = z * psc(cstep) + y * psc(w) + x; + + buffer_cp8(top_blob_data, gi, bottom_blob_data, v_offset); +#endif +} diff --git a/source/device/vulkan/shaders/crop_pack8to1.comp b/source/device/vulkan/shaders/crop_pack8to1.comp new file mode 100644 index 000000000..885f9260b --- /dev/null +++ b/source/device/vulkan/shaders/crop_pack8to1.comp @@ -0,0 +1,108 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
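+// Overview of the shader below: crop that unpacks pack8 input to pack1 output. Output
+// channel gz maps to source channel z = gz + coffset; the shader reads the containing
+// 8-wide element (channel z / 8) and extracts lane z % 8. In the fp16-packed buffer
+// path that element is addressed as four sfpvec2 pairs, hence the
+// * 4 + (z % 8) / 2 indexing followed by a z % 2 lane select.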
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int bugihfa = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +#else +#if NCNN_fp16_packed +layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; }; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +#endif +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; + + int woffset; + int hoffset; + int coffset; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + int x = gx + p.woffset; + int y = gy + p.hoffset; + int z = gz + p.coffset; + +#if NCNN_image_shader + afpvec8 v = image3d_ld8(bottom_blob, ivec3(x, y, z / 8)); + + image3d_st1(top_blob, ivec3(gx, gy, gz), v[(z % 8) / 4][z % 4]); +#else + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + +#if NCNN_fp16_packed + int v_offset = ((z / 8) * psc(cstep) + y * psc(w) + x) * 4 + (z % 8) / 2; + int lane2 = z % 2; + + afpvec2 v = buffer_ld2(bottom_blob_data, v_offset); + + buffer_st1(top_blob_data, gi, v[lane2]); +#else + int v_offset = ((z / 8) * psc(cstep) + y * psc(w) + x) * 8 + z % 8; + + buffer_cp1(top_blob_data, gi, bottom_blob_data, v_offset); +#endif +#endif +} diff --git a/source/device/vulkan/shaders/crop_pack8to4.comp b/source/device/vulkan/shaders/crop_pack8to4.comp new file mode 100644 index 000000000..e102ce724 --- /dev/null +++ b/source/device/vulkan/shaders/crop_pack8to4.comp @@ -0,0 +1,149 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+
+#if NCNN_fp16_storage
+#extension GL_EXT_shader_16bit_storage: require
+struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
+#endif
+#if NCNN_fp16_arithmetic
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#endif
+
+layout (constant_id = 0) const int bugihfa = 0;
+
+#define shape_constant_id_offset 1
+layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
+layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
+layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
+layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
+layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;
+
+layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
+layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
+layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
+layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
+layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
+
+layout (local_size_x_id = 233) in;
+layout (local_size_y_id = 234) in;
+layout (local_size_z_id = 235) in;
+
+#if NCNN_image_shader
+layout (binding = 0) uniform unfp sampler3D bottom_blob;
+layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob;
+#else
+#if NCNN_fp16_packed
+layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; };
+#else
+layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
+#endif
+layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
+#endif
+
+layout (push_constant) uniform parameter
+{
+    int dims;
+    int w;
+    int h;
+    int c;
+    int cstep;
+
+    int outdims;
+    int outw;
+    int outh;
+    int outc;
+    int outcstep;
+
+    int woffset;
+    int hoffset;
+    int coffset;
+} p;
+
+void main()
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int gy = int(gl_GlobalInvocationID.y);
+    int gz = int(gl_GlobalInvocationID.z);
+
+    if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc))
+        return;
+
+    int gi = gz * psc(outcstep) + gy * psc(outw) + gx;
+
+    int x = gx + p.woffset;
+    int y = gy + p.hoffset;
+    ivec4 z4 = gz * 4 + p.coffset + ivec4(0, 1, 2, 3);
+
+#if NCNN_image_shader
+    afpvec8 v0 = image3d_ld8(bottom_blob, ivec3(x, y, z4.r / 8));
+    afpvec8 v1 = image3d_ld8(bottom_blob, ivec3(x, y, z4.g / 8));
+    afpvec8 v2 = image3d_ld8(bottom_blob, ivec3(x, y, z4.b / 8));
+    afpvec8 v3 = image3d_ld8(bottom_blob, ivec3(x, y, z4.a / 8));
+
+    afpvec4 v;
+#if NCNN_fp16_arithmetic
+    if (bugihfa == 1)
+    {
+        ivec4 z4lane2 = (z4 % 8) / 4;
+        ivec4 z4m4 = z4 % 4;
+
+        // each output lane must read from the vector loaded for its own channel
+        // (v0..v3), matching the non-bugihfa branch below
+        if (z4m4.r == 0) v.r = v0[z4lane2.r].r;
+        if (z4m4.r == 1) v.r = v0[z4lane2.r].g;
+        if (z4m4.r == 2) v.r = v0[z4lane2.r].b;
+        if (z4m4.r == 3) v.r = v0[z4lane2.r].a;
+        if (z4m4.g == 0) v.g = v1[z4lane2.g].r;
+        if (z4m4.g == 1) v.g = v1[z4lane2.g].g;
+        if (z4m4.g == 2) v.g = v1[z4lane2.g].b;
+        if (z4m4.g == 3) v.g = v1[z4lane2.g].a;
+        if (z4m4.b == 0) v.b = v2[z4lane2.b].r;
+        if (z4m4.b == 1) v.b = v2[z4lane2.b].g;
+        if (z4m4.b == 2) v.b = v2[z4lane2.b].b;
+        if (z4m4.b == 3) v.b = v2[z4lane2.b].a;
+        if (z4m4.a == 0) v.a = v3[z4lane2.a].r;
+        if (z4m4.a == 1) v.a = v3[z4lane2.a].g;
+        if (z4m4.a == 2) v.a = v3[z4lane2.a].b;
+        if (z4m4.a == 3) v.a = v3[z4lane2.a].a;
+    }
+    else
+#endif
+    {
+        v.r = v0[(z4.r % 8) / 4][z4.r % 4];
+        v.g = v1[(z4.g % 8) / 4][z4.g % 4];
+        v.b = v2[(z4.b % 8) / 4][z4.b % 4];
+        v.a = v3[(z4.a % 8) / 4][z4.a % 4];
+    }
+
+    image3d_st4(top_blob, ivec3(gx, gy, gz), v);
+#else
+#if NCNN_fp16_packed
+    ivec4 v_offset = ((z4 / 8) * psc(cstep) + y * psc(w) + x) * 4 + (z4 % 8) / 2;
+    ivec4 lane2 = z4 % 2;
+
+    afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r);
+    afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g);
+    afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b);
+    afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a);
+
+    afpvec4 v = afpvec4(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a]);
+
+    buffer_st4(top_blob_data, gi, v);
+#else
+    ivec4 v_offset = ((z4 / 8) * psc(cstep) + y * psc(w) + x) * 8 + z4 % 8;
+
+    buffer_cp1to4(top_blob_data, gi, bottom_blob_data, v_offset);
+#endif
+#endif
+}
diff --git a/source/device/vulkan/shaders/depthwiseconvolution.comp b/source/device/vulkan/shaders/depthwiseconvolution.comp
new file mode 100644
index 000000000..bbbabf1c9
--- /dev/null
+++ b/source/device/vulkan/shaders/depthwiseconvolution.comp
@@ -0,0 +1,121 @@
+#version 450
+
+#if NCNN_fp16_storage
+#extension GL_EXT_shader_16bit_storage: require
+#endif
+#if NCNN_fp16_arithmetic
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#endif
+
+layout (constant_id = 0) const int kernel_w = 1;
+layout (constant_id = 1) const int kernel_h = 1;
+layout (constant_id = 2) const int dilation_w = 1;
+layout (constant_id = 3) const int dilation_h = 1;
+layout (constant_id = 4) const int stride_w = 1;
+layout (constant_id = 5) const int stride_h = 1;
+layout (constant_id = 6) const int bias_term = 0;
+layout (constant_id = 7) const int group = 1;
+layout (constant_id = 8) const int activation_type = 0;
+layout (constant_id = 9) const float activation_param_0 = 0;
+layout (constant_id = 10) const float activation_param_1 = 0;
+
+#define shape_constant_id_offset 11
+layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
+layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
+layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
+layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
+layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;
+
+layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
+layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
+layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
+layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
+layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
+
+layout (local_size_x_id = 233) in;
+layout (local_size_y_id = 234) in;
+layout (local_size_z_id = 235) in;
+
+layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
+layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; };
+layout (binding = 2) readonly buffer weight_blob { sfp weight_data[]; };
+layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; };
+
+layout (push_constant) uniform parameter
+{
+    int dims;
+    int w;
+    int h;
+    int c;
+    int cstep;
+
+    int outdims;
+    int outw;
+    int outh;
+    int outc;
+    int outcstep;
+} p;
+
+void main()
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int gy = int(gl_GlobalInvocationID.y);
+    int gz =
int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afp sum; + + if (bias_term == 1) + { + sum = buffer_ld1(bias_data, gz); + } + else + { + sum = afp(0.f); + } + + // depth-wise convolution + int w_offset = gz * kernel_w * kernel_h; + int v_offset = gz * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + sum += buffer_ld1(weight_data, w_offset + x) * buffer_ld1(bottom_blob_data, v_offset + x * dilation_w); + } + + v_offset += dilation_h * psc(w); + w_offset += kernel_w; + } + + if (activation_type == 1) + { + sum = max(sum, afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum = sum < afp(0.f) ? sum * slope : sum; + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum = clamp(sum, const_min, const_max); + } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } + if (activation_type == 5) + { + sum = sum * tanh(log(exp(sum) + afp(1.f))); + } + + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + // sum = gi;//bottom_blob_data[gi]; + buffer_st1(top_blob_data, gi, sum); +} \ No newline at end of file diff --git a/source/device/vulkan/shaders/dropout.comp b/source/device/vulkan/shaders/dropout.comp new file mode 100644 index 000000000..53bf43a38 --- /dev/null +++ b/source/device/vulkan/shaders/dropout.comp @@ -0,0 +1,104 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
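For reference, the depthwise convolution loop in depthwiseconvolution.comp above computes one output element per invocation: each channel gz has its own kernel_w x kernel_h filter, the window walks an already-padded input with the given stride and dilation, and the optional bias plus the activation_type switch are applied to the accumulated sum before the store. A minimal plain-float sketch of that indexing, with the shape passed as push constants instead of the sfp/afp storage macros and specialization constants, and with bias and activation omitted (illustrative only, not part of the patch):

#version 450
layout (local_size_x = 8, local_size_y = 8) in;
layout (binding = 0) readonly buffer bottom_blob { float bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { float top_blob_data[]; };
layout (binding = 2) readonly buffer weight_blob { float weight_data[]; };
layout (push_constant) uniform parameter
{
    int w; int cstep;                 // padded input width and per-channel stride
    int outw; int outh; int outc; int outcstep;
    int kernel_w; int kernel_h;
    int dilation_w; int dilation_h;
    int stride_w; int stride_h;
} p;
void main()
{
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);
    if (gx >= p.outw || gy >= p.outh || gz >= p.outc) return;

    float sum = 0.0;
    // depthwise: output channel gz reads only input channel gz and its own filter
    int w_offset = gz * p.kernel_w * p.kernel_h;
    int v_offset = gz * p.cstep + gy * p.stride_h * p.w + gx * p.stride_w;
    for (int y = 0; y < p.kernel_h; y++)
    {
        for (int x = 0; x < p.kernel_w; x++)
            sum += weight_data[w_offset + x] * bottom_blob_data[v_offset + x * p.dilation_w];
        v_offset += p.dilation_h * p.w;   // move the window one (dilated) kernel row down
        w_offset += p.kernel_w;
    }
    top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sum;
}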
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const float scale = 1; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) buffer bottom_top_blob { sfp bottom_top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + afp v; + if (psc(dims) == 1) + { + v = image1d_ld1(bottom_blob_1d, gx); + } + else if (psc(dims) == 2) + { + v = image2d_ld1(bottom_blob_2d, ivec2(gx, gy)); + } + else // if (psc(dims) == 3) + { + v = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, gz)); + } +#else + const int gi = gz * psc(cstep) + gy * psc(w) + gx; + + afp v = buffer_ld1(bottom_top_blob_data, gi); +#endif + + v *= afp(scale); + +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_st1(top_blob_1d, gx, v); + } + else if (psc(dims) == 2) + { + image2d_st1(top_blob_2d, ivec2(gx, gy), v); + } + else // if (psc(dims) == 3) + { + image3d_st1(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else + buffer_st1(bottom_top_blob_data, gi, v); +#endif +} diff --git a/source/device/vulkan/shaders/dropout_pack4.comp b/source/device/vulkan/shaders/dropout_pack4.comp new file mode 100644 index 000000000..c71d37966 --- /dev/null +++ b/source/device/vulkan/shaders/dropout_pack4.comp @@ -0,0 +1,104 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const float scale = 1; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) buffer bottom_top_blob { sfpvec4 bottom_top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + afpvec4 v; + if (psc(dims) == 1) + { + v = image1d_ld4(bottom_blob_1d, gx); + } + else if (psc(dims) == 2) + { + v = image2d_ld4(bottom_blob_2d, ivec2(gx, gy)); + } + else // if (psc(dims) == 3) + { + v = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, gz)); + } +#else + const int gi = gz * psc(cstep) + gy * psc(w) + gx; + + afpvec4 v = buffer_ld4(bottom_top_blob_data, gi); +#endif + + v *= afp(scale); + +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_st4(top_blob_1d, gx, v); + } + else if (psc(dims) == 2) + { + image2d_st4(top_blob_2d, ivec2(gx, gy), v); + } + else // if (psc(dims) == 3) + { + image3d_st4(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else + buffer_st4(bottom_top_blob_data, gi, v); +#endif +} diff --git a/source/device/vulkan/shaders/dropout_pack8.comp b/source/device/vulkan/shaders/dropout_pack8.comp new file mode 100644 index 000000000..acecba62d --- /dev/null +++ b/source/device/vulkan/shaders/dropout_pack8.comp @@ -0,0 +1,106 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
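The _pack8 shaders that follow (starting with dropout_pack8.comp) lean on the sfpvec8 struct declared when fp16 storage is enabled: one packed element is two f16vec4 halves, and the arithmetic-side afpvec8 is indexed as v[0] and v[1], so every element-wise operation is simply applied to both halves. A rough plain-float equivalent of that layout and of the dropout scale, using an explicit struct in place of the macro types (illustrative only, not part of the patch):

#version 450
layout (local_size_x = 64) in;
// one pack8 element = two 4-wide halves, mirroring sfpvec8 { abcd; efgh; } / afpvec8 v[0], v[1]
struct vec8 { vec4 lo; vec4 hi; };
layout (binding = 0) buffer bottom_top_blob { vec8 bottom_top_blob_data[]; };
layout (push_constant) uniform parameter { int total; float scale; } p;
void main()
{
    int gx = int(gl_GlobalInvocationID.x);
    if (gx >= p.total) return;
    vec8 v = bottom_top_blob_data[gx];
    v.lo *= p.scale;   // corresponds to v[0] in dropout_pack8.comp
    v.hi *= p.scale;   // corresponds to v[1]
    bottom_top_blob_data[gx] = v;
}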
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const float scale = 1; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) buffer bottom_top_blob { sfpvec8 bottom_top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + afpvec8 v; + if (psc(dims) == 1) + { + v = image1d_ld8(bottom_blob_1d, gx); + } + else if (psc(dims) == 2) + { + v = image2d_ld8(bottom_blob_2d, ivec2(gx, gy)); + } + else // if (psc(dims) == 3) + { + v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); + } +#else + const int gi = gz * psc(cstep) + gy * psc(w) + gx; + + afpvec8 v = buffer_ld8(bottom_top_blob_data, gi); +#endif + + v[0] = v[0] * afp(scale); + v[1] = v[1] * afp(scale); + +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_st8(top_blob_1d, gx, v); + } + else if (psc(dims) == 2) + { + image2d_st8(top_blob_2d, ivec2(gx, gy), v); + } + else // if (psc(dims) == 3) + { + image3d_st8(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else + buffer_st8(bottom_top_blob_data, gi, v); +#endif +} diff --git a/source/device/vulkan/shaders/eltwise.comp b/source/device/vulkan/shaders/eltwise.comp new file mode 100644 index 000000000..addb1bfb0 --- /dev/null +++ b/source/device/vulkan/shaders/eltwise.comp @@ -0,0 +1,141 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
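eltwise.comp and its pack4/pack8 variants below all encode the same operator table: op_type 0 is the element-wise product, 1 is the sum, 2 is the max, and the coeff_term branch only changes the sum case, which becomes v1 * coeff0 + v2 * coeff1. A condensed plain-float sketch of that selection logic, with the shape flattened to a single index and the storage macros dropped (illustrative only, not part of the patch):

#version 450
layout (local_size_x = 64) in;
layout (constant_id = 0) const int op_type = 0;
layout (constant_id = 1) const int coeff_term = 0;
layout (binding = 0) readonly buffer bottom_blob1 { float a_data[]; };
layout (binding = 1) readonly buffer bottom_blob2 { float b_data[]; };
layout (binding = 2) writeonly buffer top_blob { float top_data[]; };
layout (push_constant) uniform parameter { int total; float coeff0; float coeff1; } p;
void main()
{
    int gx = int(gl_GlobalInvocationID.x);
    if (gx >= p.total) return;
    float v1 = a_data[gx];
    float v2 = b_data[gx];
    float res = 0.0;
    if (op_type == 0) res = v1 * v2;                                                    // product
    if (op_type == 1) res = coeff_term == 0 ? v1 + v2 : v1 * p.coeff0 + v2 * p.coeff1;  // sum
    if (op_type == 2) res = max(v1, v2);                                                // max
    top_data[gx] = res;
}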
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int op_type = 0; +layout (constant_id = 1) const int coeff_term = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob1_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob1_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob1_3d; +layout (binding = 1) uniform unfp sampler1D bottom_blob2_1d; +layout (binding = 1) uniform unfp sampler2D bottom_blob2_2d; +layout (binding = 1) uniform unfp sampler3D bottom_blob2_3d; +layout (binding = 2, imfmtc1) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 2, imfmtc1) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 2, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) readonly buffer bottom_blob1 { sfp bottom_blob1_data[]; }; +layout (binding = 1) readonly buffer bottom_blob2 { sfp bottom_blob2_data[]; }; +layout (binding = 2) writeonly buffer top_blob { sfp top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + float coeff0; + float coeff1; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + afp v1; + afp v2; + if (psc(dims) == 1) + { + v1 = image1d_ld1(bottom_blob1_1d, gx); + v2 = image1d_ld1(bottom_blob2_1d, gx); + } + else if (psc(dims) == 2) + { + v1 = image2d_ld1(bottom_blob1_2d, ivec2(gx, gy)); + v2 = image2d_ld1(bottom_blob2_2d, ivec2(gx, gy)); + } + else // if (psc(dims) == 3) + { + v1 = image3d_ld1(bottom_blob1_3d, ivec3(gx, gy, gz)); + v2 = image3d_ld1(bottom_blob2_3d, ivec3(gx, gy, gz)); + } +#else + const int gi = gz * psc(cstep) + gy * psc(w) + gx; + + afp v1 = buffer_ld1(bottom_blob1_data, gi); + afp v2 = buffer_ld1(bottom_blob2_data, gi); +#endif + + afp res; + + if (coeff_term == 0) + { + if (op_type == 0) + res = v1 * v2; + + if (op_type == 1) + res = v1 + v2; + + if (op_type == 2) + res = max(v1, v2); + } + else + { + if (op_type == 0) + res = v1 * v2; + + if (op_type == 1) + res = v1 * afp(p.coeff0) + v2 * afp(p.coeff1); + + if (op_type == 2) + res = max(v1, v2); + } + +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_st1(top_blob_1d, gx, res); + } + else if (psc(dims) == 2) + { + image2d_st1(top_blob_2d, ivec2(gx, gy), res); + } + else // if (psc(dims) == 3) + { + image3d_st1(top_blob_3d, ivec3(gx, gy, gz), res); + } +#else + buffer_st1(top_blob_data, gi, res); +#endif +} diff --git a/source/device/vulkan/shaders/eltwise_pack4.comp b/source/device/vulkan/shaders/eltwise_pack4.comp new file mode 100644 index 000000000..c93d1000b --- /dev/null +++ b/source/device/vulkan/shaders/eltwise_pack4.comp @@ -0,0 +1,141 @@ +// Tencent is pleased 
to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int op_type = 0; +layout (constant_id = 1) const int coeff_term = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob1_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob1_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob1_3d; +layout (binding = 1) uniform unfp sampler1D bottom_blob2_1d; +layout (binding = 1) uniform unfp sampler2D bottom_blob2_2d; +layout (binding = 1) uniform unfp sampler3D bottom_blob2_3d; +layout (binding = 2, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 2, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 2, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) readonly buffer bottom_blob1 { sfpvec4 bottom_blob1_data[]; }; +layout (binding = 1) readonly buffer bottom_blob2 { sfpvec4 bottom_blob2_data[]; }; +layout (binding = 2) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + float coeff0; + float coeff1; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + afpvec4 v1; + afpvec4 v2; + if (psc(dims) == 1) + { + v1 = image1d_ld4(bottom_blob1_1d, gx); + v2 = image1d_ld4(bottom_blob2_1d, gx); + } + else if (psc(dims) == 2) + { + v1 = image2d_ld4(bottom_blob1_2d, ivec2(gx, gy)); + v2 = image2d_ld4(bottom_blob2_2d, ivec2(gx, gy)); + } + else // if (psc(dims) == 3) + { + v1 = image3d_ld4(bottom_blob1_3d, ivec3(gx, gy, gz)); + v2 = image3d_ld4(bottom_blob2_3d, ivec3(gx, gy, gz)); + } +#else + const int gi = gz * psc(cstep) + gy * psc(w) + gx; + + afpvec4 v1 = buffer_ld4(bottom_blob1_data, gi); + afpvec4 v2 = buffer_ld4(bottom_blob2_data, gi); +#endif + + afpvec4 res; + + if (coeff_term == 0) + { + if (op_type == 0) + res = v1 * v2; + + if (op_type == 1) + res = v1 + v2; + + if (op_type == 2) + res = max(v1, v2); + } + else + { + if 
(op_type == 0) + res = v1 * v2; + + if (op_type == 1) + res = v1 * afp(p.coeff0) + v2 * afp(p.coeff1); + + if (op_type == 2) + res = max(v1, v2); + } + +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_st4(top_blob_1d, gx, res); + } + else if (psc(dims) == 2) + { + image2d_st4(top_blob_2d, ivec2(gx, gy), res); + } + else // if (psc(dims) == 3) + { + image3d_st4(top_blob_3d, ivec3(gx, gy, gz), res); + } +#else + buffer_st4(top_blob_data, gi, res); +#endif +} diff --git a/source/device/vulkan/shaders/eltwise_pack8.comp b/source/device/vulkan/shaders/eltwise_pack8.comp new file mode 100644 index 000000000..5f767b82f --- /dev/null +++ b/source/device/vulkan/shaders/eltwise_pack8.comp @@ -0,0 +1,160 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int op_type = 0; +layout (constant_id = 1) const int coeff_term = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob1_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob1_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob1_3d; +layout (binding = 1) uniform unfp sampler1D bottom_blob2_1d; +layout (binding = 1) uniform unfp sampler2D bottom_blob2_2d; +layout (binding = 1) uniform unfp sampler3D bottom_blob2_3d; +layout (binding = 2, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 2, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 2, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) readonly buffer bottom_blob1 { sfpvec8 bottom_blob1_data[]; }; +layout (binding = 1) readonly buffer bottom_blob2 { sfpvec8 bottom_blob2_data[]; }; +layout (binding = 2) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + float coeff0; + float coeff1; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + 
afpvec8 v1; + afpvec8 v2; + if (psc(dims) == 1) + { + v1 = image1d_ld8(bottom_blob1_1d, gx); + v2 = image1d_ld8(bottom_blob2_1d, gx); + } + else if (psc(dims) == 2) + { + v1 = image2d_ld8(bottom_blob1_2d, ivec2(gx, gy)); + v2 = image2d_ld8(bottom_blob2_2d, ivec2(gx, gy)); + } + else // if (psc(dims) == 3) + { + v1 = image3d_ld8(bottom_blob1_3d, ivec3(gx, gy, gz)); + v2 = image3d_ld8(bottom_blob2_3d, ivec3(gx, gy, gz)); + } +#else + const int gi = gz * psc(cstep) + gy * psc(w) + gx; + + afpvec8 v1 = buffer_ld8(bottom_blob1_data, gi); + afpvec8 v2 = buffer_ld8(bottom_blob2_data, gi); +#endif + + afpvec8 res; + + if (coeff_term == 0) + { + if (op_type == 0) + { + res[0] = v1[0] * v2[0]; + res[1] = v1[1] * v2[1]; + } + + if (op_type == 1) + { + res[0] = v1[0] + v2[0]; + res[1] = v1[1] + v2[1]; + } + + if (op_type == 2) + { + res[0] = max(v1[0], v2[0]); + res[1] = max(v1[1], v2[1]); + } + } + else + { + if (op_type == 0) + { + res[0] = v1[0] * v2[0]; + res[1] = v1[1] * v2[1]; + } + + if (op_type == 1) + { + res[0] = v1[0] * afp(p.coeff0) + v2[0] * afp(p.coeff1); + res[1] = v1[1] * afp(p.coeff0) + v2[1] * afp(p.coeff1); + } + + if (op_type == 2) + { + res[0] = max(v1[0], v2[0]); + res[1] = max(v1[1], v2[1]); + } + } + +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_st8(top_blob_1d, gx, res); + } + else if (psc(dims) == 2) + { + image2d_st8(top_blob_2d, ivec2(gx, gy), res); + } + else // if (psc(dims) == 3) + { + image3d_st8(top_blob_3d, ivec3(gx, gy, gz), res); + } +#else + buffer_st8(top_blob_data, gi, res); +#endif +} diff --git a/source/device/vulkan/shaders/flatten.comp b/source/device/vulkan/shaders/flatten.comp new file mode 100644 index 000000000..8cc137789 --- /dev/null +++ b/source/device/vulkan/shaders/flatten.comp @@ -0,0 +1,98 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
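flatten.comp below maps a 2-D or 3-D blob onto a 1-D output of length w*h*c: each output index gx is decomposed back into (x, y, z) with size = w*h, and the source offset is built from cstep, the per-channel stride (which may exceed w*h because of alignment padding), rather than from w*h itself. A plain-float sketch of the 3-D buffer path, without the image-shader branch or the storage macros (illustrative only, not part of the patch):

#version 450
layout (local_size_x = 64) in;
layout (binding = 0) readonly buffer bottom_blob { float bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { float top_blob_data[]; };
layout (push_constant) uniform parameter { int w; int h; int c; int cstep; } p;
void main()
{
    int gx = int(gl_GlobalInvocationID.x);
    if (gx >= p.w * p.h * p.c) return;
    int size = p.w * p.h;
    int z = gx / size;          // channel
    int y = gx % size / p.w;    // row
    int x = gx % size % p.w;    // column
    // cstep is the padded per-channel stride, so it cannot simply be replaced by w*h
    top_blob_data[gx] = bottom_blob_data[z * p.cstep + y * p.w + x];
}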
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#define shape_constant_id_offset 0 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= 1 || gz >= 1) + return; + + int size = psc(w) * psc(h); + + int z = gx / size; + int y = gx % size / psc(w); + int x = gx % size % psc(w); + +#if NCNN_image_shader + afp v; + + if (psc(dims) == 2) + { + v = image2d_ld1(bottom_blob_2d, ivec2(x, y)); + } + else // if (psc(dims) == 3) + { + v = image3d_ld1(bottom_blob_3d, ivec3(x, y, z)); + } + + image1d_st1(top_blob, gx, v); +#else + int v_offset = z * psc(cstep) + y * psc(w) + x; + + buffer_cp1(top_blob_data, gx, bottom_blob_data, v_offset); +#endif +} diff --git a/source/device/vulkan/shaders/flatten_pack1to4.comp b/source/device/vulkan/shaders/flatten_pack1to4.comp new file mode 100644 index 000000000..b0ff244e5 --- /dev/null +++ b/source/device/vulkan/shaders/flatten_pack1to4.comp @@ -0,0 +1,127 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#define shape_constant_id_offset 0 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= 1 || gz >= 1) + return; + + ivec4 i4 = gx * 4 + ivec4(0, 1, 2, 3); + +#if NCNN_image_shader + afpvec4 v; + + if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + + v.r = image2d_ld1(bottom_blob_2d, ivec2(x4.r, y4.r)); + v.g = image2d_ld1(bottom_blob_2d, ivec2(x4.g, y4.g)); + v.b = image2d_ld1(bottom_blob_2d, ivec2(x4.b, y4.b)); + v.a = image2d_ld1(bottom_blob_2d, ivec2(x4.a, y4.a)); + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + + v.r = image3d_ld1(bottom_blob_3d, ivec3(x4.r, y4.r, z4.r)); + v.g = image3d_ld1(bottom_blob_3d, ivec3(x4.g, y4.g, z4.g)); + v.b = image3d_ld1(bottom_blob_3d, ivec3(x4.b, y4.b, z4.b)); + v.a = image3d_ld1(bottom_blob_3d, ivec3(x4.a, y4.a, z4.a)); + } + + image1d_st4(top_blob, gx, v); +#else + ivec4 v_offset; + + if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + + v_offset = y4 * psc(w) + x4; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + + v_offset = z4 * psc(cstep) + y4 * psc(w) + x4; + } + + buffer_cp1to4(top_blob_data, gx, bottom_blob_data, v_offset); +#endif +} diff --git a/source/device/vulkan/shaders/flatten_pack1to8.comp b/source/device/vulkan/shaders/flatten_pack1to8.comp new file mode 100644 index 000000000..38f3f89d3 --- /dev/null +++ b/source/device/vulkan/shaders/flatten_pack1to8.comp @@ -0,0 +1,154 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. 
+// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#define shape_constant_id_offset 0 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= 1 || gz >= 1) + return; + + ivec4 i4 = gx * 8 + ivec4(0, 1, 2, 3); + ivec4 ii4 = i4 + 4; + +#if NCNN_image_shader + afpvec8 v; + + if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + + ivec4 yy4 = ii4 / psc(w); + ivec4 xx4 = ii4 % psc(w); + + v[0].r = image2d_ld1(bottom_blob_2d, ivec2(x4.r, y4.r)); + v[0].g = image2d_ld1(bottom_blob_2d, ivec2(x4.g, y4.g)); + v[0].b = image2d_ld1(bottom_blob_2d, ivec2(x4.b, y4.b)); + v[0].a = image2d_ld1(bottom_blob_2d, ivec2(x4.a, y4.a)); + v[1].r = image2d_ld1(bottom_blob_2d, ivec2(xx4.r, yy4.r)); + v[1].g = image2d_ld1(bottom_blob_2d, ivec2(xx4.g, yy4.g)); + v[1].b = image2d_ld1(bottom_blob_2d, ivec2(xx4.b, yy4.b)); + v[1].a = image2d_ld1(bottom_blob_2d, ivec2(xx4.a, yy4.a)); + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + + ivec4 zz4 = ii4 / size; + ivec4 yy4 = ii4 % size / psc(w); + ivec4 xx4 = ii4 % size % psc(w); + + v[0].r = image3d_ld1(bottom_blob_3d, ivec3(x4.r, y4.r, z4.r)); + v[0].g = image3d_ld1(bottom_blob_3d, ivec3(x4.g, y4.g, 
z4.g)); + v[0].b = image3d_ld1(bottom_blob_3d, ivec3(x4.b, y4.b, z4.b)); + v[0].a = image3d_ld1(bottom_blob_3d, ivec3(x4.a, y4.a, z4.a)); + v[1].r = image3d_ld1(bottom_blob_3d, ivec3(xx4.r, yy4.r, zz4.r)); + v[1].g = image3d_ld1(bottom_blob_3d, ivec3(xx4.g, yy4.g, zz4.g)); + v[1].b = image3d_ld1(bottom_blob_3d, ivec3(xx4.b, yy4.b, zz4.b)); + v[1].a = image3d_ld1(bottom_blob_3d, ivec3(xx4.a, yy4.a, zz4.a)); + } + + image1d_st8(top_blob, gx, v); +#else + ivec4 v_offset; + ivec4 vv_offset; + + if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + + ivec4 yy4 = ii4 / psc(w); + ivec4 xx4 = ii4 % psc(w); + + v_offset = y4 * psc(w) + x4; + vv_offset = yy4 * psc(w) + xx4; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + + ivec4 zz4 = ii4 / size; + ivec4 yy4 = ii4 % size / psc(w); + ivec4 xx4 = ii4 % size % psc(w); + + v_offset = z4 * psc(cstep) + y4 * psc(w) + x4; + vv_offset = zz4 * psc(cstep) + yy4 * psc(w) + xx4; + } + + buffer_cp1to8(top_blob_data, gx, bottom_blob_data, v_offset, vv_offset); +#endif +} diff --git a/source/device/vulkan/shaders/flatten_pack4.comp b/source/device/vulkan/shaders/flatten_pack4.comp new file mode 100644 index 000000000..a6827efd4 --- /dev/null +++ b/source/device/vulkan/shaders/flatten_pack4.comp @@ -0,0 +1,175 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
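flatten_pack4.comp below performs the same flattening but reads from a channel-packed (pack4) source, so each output vec4 gathers four consecutive flattened indices, and a logical channel z has to be located at packed channel z / 4, lane z % 4; on the plain (non-fp16_packed) buffer path the scalar offset is therefore ((z / 4) * cstep + y * w + x) * 4 + z % 4. A compact sketch of that gather for the 3-D case, with the unrolled .r/.g/.b/.a accesses folded into a loop (plain floats, illustrative only, not part of the patch):

#version 450
layout (local_size_x = 64) in;
// pack4 source stored lane-major: 4 floats per packed element
layout (binding = 0) readonly buffer bottom_blob { float bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { vec4 top_blob_data[]; };
layout (push_constant) uniform parameter { int w; int h; int cstep; int outw; } p;
void main()
{
    int gx = int(gl_GlobalInvocationID.x);
    if (gx >= p.outw) return;
    int size = p.w * p.h;
    vec4 v;
    for (int k = 0; k < 4; k++)
    {
        int i = gx * 4 + k;        // flattened logical index
        int z = i / size;          // logical channel
        int y = i % size / p.w;
        int x = i % size % p.w;
        // logical channel z lives in packed channel z / 4, lane z % 4
        v[k] = bottom_blob_data[((z / 4) * p.cstep + y * p.w + x) * 4 + z % 4];
    }
    top_blob_data[gx] = v;
}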
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#define shape_constant_id_offset 0 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob; +#else +#if NCNN_fp16_packed +layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; }; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +#endif +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= 1 || gz >= 1) + return; + + ivec4 i4 = gx * 4 + ivec4(0, 1, 2, 3); + +#if NCNN_image_shader + afpvec4 v; + + if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + + afpvec4 v0 = image2d_ld4(bottom_blob_2d, ivec2(x4.r, y4.r / 4)); + afpvec4 v1 = image2d_ld4(bottom_blob_2d, ivec2(x4.g, y4.g / 4)); + afpvec4 v2 = image2d_ld4(bottom_blob_2d, ivec2(x4.b, y4.b / 4)); + afpvec4 v3 = image2d_ld4(bottom_blob_2d, ivec2(x4.a, y4.a / 4)); + + v.r = v0[y4.r % 4]; + v.g = v1[y4.g % 4]; + v.b = v2[y4.b % 4]; + v.a = v3[y4.a % 4]; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + + afpvec4 v0 = image3d_ld4(bottom_blob_3d, ivec3(x4.r, y4.r, z4.r / 4)); + afpvec4 v1 = image3d_ld4(bottom_blob_3d, ivec3(x4.g, y4.g, z4.g / 4)); + afpvec4 v2 = image3d_ld4(bottom_blob_3d, ivec3(x4.b, y4.b, z4.b / 4)); + afpvec4 v3 = image3d_ld4(bottom_blob_3d, ivec3(x4.a, y4.a, z4.a / 4)); + + v.r = v0[z4.r % 4]; + v.g = v1[z4.g % 4]; + v.b = v2[z4.b % 4]; + v.a = v3[z4.a % 4]; + } + + image1d_st4(top_blob, gx, v); +#else +#if NCNN_fp16_packed + ivec4 v_offset; + ivec4 lane2; + + if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + + v_offset = ((y4 / 4) * psc(w) + x4) * 2 + (y4 % 4) / 2; + lane2 = y4 % 2; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + + v_offset = ((z4 / 4) * psc(cstep) + y4 * psc(w) + x4) * 2 + (z4 % 4) / 2; + lane2 = z4 % 2; + } + + 
afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r); + afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g); + afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b); + afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a); + + afpvec4 v = afpvec4(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a]); + + buffer_st4(top_blob_data, gx, v); +#else + ivec4 v_offset; + + if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + + v_offset = ((y4 / 4) * psc(w) + x4) * 4 + y4 % 4; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + + v_offset = ((z4 / 4) * psc(cstep) + y4 * psc(w) + x4) * 4 + z4 % 4; + } + + buffer_cp1to4(top_blob_data, gx, bottom_blob_data, v_offset); +#endif +#endif +} diff --git a/source/device/vulkan/shaders/flatten_pack4to8.comp b/source/device/vulkan/shaders/flatten_pack4to8.comp new file mode 100644 index 000000000..8dfaf3b15 --- /dev/null +++ b/source/device/vulkan/shaders/flatten_pack4to8.comp @@ -0,0 +1,222 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
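When NCNN_fp16_packed is enabled, the same pack4 source is declared as an array of sfpvec2, so the flatten_pack4 path above and the flatten_pack4to8 shader below first compute which 2-wide slot holds the value, ((z / 4) * cstep + y * w + x) * 2 + (z % 4) / 2, and then pick lane z % 2 inside that slot. A small sketch of the addressing with vec2 standing in for the packed half pairs (illustrative only, not part of the patch):

#version 450
layout (local_size_x = 64) in;
// fp16_packed pack4 storage sketch: each packed element occupies two 2-wide slots
layout (binding = 0) readonly buffer bottom_blob { vec2 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { float top_blob_data[]; };
layout (push_constant) uniform parameter { int w; int h; int cstep; int total; } p;
void main()
{
    int i = int(gl_GlobalInvocationID.x);
    if (i >= p.total) return;
    int size = p.w * p.h;
    int z = i / size;
    int y = i % size / p.w;
    int x = i % size % p.w;
    int slot = ((z / 4) * p.cstep + y * p.w + x) * 2 + (z % 4) / 2;  // which vec2 pair
    int lane = z % 2;                                                // which half of the pair
    top_blob_data[i] = bottom_blob_data[slot][lane];
}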
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#define shape_constant_id_offset 0 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob; +#else +#if NCNN_fp16_packed +layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; }; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +#endif +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= 1 || gz >= 1) + return; + + ivec4 i4 = gx * 8 + ivec4(0, 1, 2, 3); + ivec4 ii4 = i4 + 4; + +#if NCNN_image_shader + afpvec8 v; + + if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + ivec4 yy4 = ii4 / psc(w); + ivec4 xx4 = ii4 % psc(w); + + afpvec4 v0 = image2d_ld4(bottom_blob_2d, ivec2(x4.r, y4.r / 4)); + afpvec4 v1 = image2d_ld4(bottom_blob_2d, ivec2(x4.g, y4.g / 4)); + afpvec4 v2 = image2d_ld4(bottom_blob_2d, ivec2(x4.b, y4.b / 4)); + afpvec4 v3 = image2d_ld4(bottom_blob_2d, ivec2(x4.a, y4.a / 4)); + afpvec4 v4 = image2d_ld4(bottom_blob_2d, ivec2(xx4.r, yy4.r / 4)); + afpvec4 v5 = image2d_ld4(bottom_blob_2d, ivec2(xx4.g, yy4.g / 4)); + afpvec4 v6 = image2d_ld4(bottom_blob_2d, ivec2(xx4.b, yy4.b / 4)); + afpvec4 v7 = image2d_ld4(bottom_blob_2d, ivec2(xx4.a, yy4.a / 4)); + + v[0].r = v0[y4.r % 4]; + v[0].g = v1[y4.g % 4]; + v[0].b = v2[y4.b % 4]; + v[0].a = v3[y4.a % 4]; + v[1].r = v4[yy4.r % 4]; + v[1].g = v5[yy4.g % 4]; + v[1].b = v6[yy4.b % 4]; + v[1].a = v7[yy4.a % 4]; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + ivec4 zz4 = ii4 / size; + ivec4 yy4 = ii4 % size / psc(w); + ivec4 xx4 = ii4 % size % psc(w); + + afpvec4 v0 = image3d_ld4(bottom_blob_3d, ivec3(x4.r, y4.r, z4.r / 4)); + afpvec4 v1 = image3d_ld4(bottom_blob_3d, ivec3(x4.g, y4.g, z4.g / 4)); + afpvec4 v2 = image3d_ld4(bottom_blob_3d, ivec3(x4.b, y4.b, z4.b / 4)); + afpvec4 v3 = image3d_ld4(bottom_blob_3d, ivec3(x4.a, y4.a, z4.a / 4)); + 
afpvec4 v4 = image3d_ld4(bottom_blob_3d, ivec3(xx4.r, yy4.r, zz4.r / 4)); + afpvec4 v5 = image3d_ld4(bottom_blob_3d, ivec3(xx4.g, yy4.g, zz4.g / 4)); + afpvec4 v6 = image3d_ld4(bottom_blob_3d, ivec3(xx4.b, yy4.b, zz4.b / 4)); + afpvec4 v7 = image3d_ld4(bottom_blob_3d, ivec3(xx4.a, yy4.a, zz4.a / 4)); + + v[0].r = v0[z4.r % 4]; + v[0].g = v1[z4.g % 4]; + v[0].b = v2[z4.b % 4]; + v[0].a = v3[z4.a % 4]; + v[1].r = v4[zz4.r % 4]; + v[1].g = v5[zz4.g % 4]; + v[1].b = v6[zz4.b % 4]; + v[1].a = v7[zz4.a % 4]; + } + + image1d_st8(top_blob, gx, v); +#else +#if NCNN_fp16_packed + ivec4 v_offset; + ivec4 lane4; + ivec4 vv_offset; + ivec4 lane8; + + if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + ivec4 yy4 = ii4 / psc(w); + ivec4 xx4 = ii4 % psc(w); + + v_offset = ((y4 / 4) * psc(w) + x4) * 2 + (y4 % 4) / 2; + lane4 = y4 % 2; + vv_offset = ((yy4 / 4) * psc(w) + xx4) * 2 + (yy4 % 4) / 2; + lane8 = yy4 % 2; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + ivec4 zz4 = ii4 / size; + ivec4 yy4 = ii4 % size / psc(w); + ivec4 xx4 = ii4 % size % psc(w); + + v_offset = ((z4 / 4) * psc(cstep) + y4 * psc(w) + x4) * 2 + (z4 % 4) / 2; + lane4 = z4 % 2; + vv_offset = ((zz4 / 4) * psc(cstep) + yy4 * psc(w) + xx4) * 2 + (zz4 % 4) / 2; + lane8 = zz4 % 2; + } + + afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r); + afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g); + afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b); + afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a); + + afpvec2 vvr = buffer_ld2(bottom_blob_data, vv_offset.r); + afpvec2 vvg = buffer_ld2(bottom_blob_data, vv_offset.g); + afpvec2 vvb = buffer_ld2(bottom_blob_data, vv_offset.b); + afpvec2 vva = buffer_ld2(bottom_blob_data, vv_offset.a); + + afpvec8 v = afpvec8(vr[lane4.r], vg[lane4.g], vb[lane4.b], va[lane4.a], vvr[lane8.r], vvg[lane8.g], vvb[lane8.b], vva[lane8.a]); + + buffer_st8(top_blob_data, gx, v); +#else + ivec4 v_offset; + ivec4 vv_offset; + + if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + ivec4 yy4 = ii4 / psc(w); + ivec4 xx4 = ii4 % psc(w); + + v_offset = ((y4 / 4) * psc(w) + x4) * 4 + y4 % 4; + vv_offset = ((yy4 / 4) * psc(w) + xx4) * 4 + yy4 % 4; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + ivec4 zz4 = ii4 / size; + ivec4 yy4 = ii4 % size / psc(w); + ivec4 xx4 = ii4 % size % psc(w); + + v_offset = ((z4 / 4) * psc(cstep) + y4 * psc(w) + x4) * 4 + z4 % 4; + vv_offset = ((zz4 / 4) * psc(cstep) + yy4 * psc(w) + xx4) * 4 + zz4 % 4; + } + + buffer_cp1to8(top_blob_data, gx, bottom_blob_data, v_offset, vv_offset); +#endif +#endif +} diff --git a/source/device/vulkan/shaders/flatten_pack8.comp b/source/device/vulkan/shaders/flatten_pack8.comp new file mode 100644 index 000000000..01a06f451 --- /dev/null +++ b/source/device/vulkan/shaders/flatten_pack8.comp @@ -0,0 +1,222 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#define shape_constant_id_offset 0 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob; +#else +#if NCNN_fp16_packed +layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; }; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +#endif +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= 1 || gz >= 1) + return; + + ivec4 i4 = gx * 8 + ivec4(0, 1, 2, 3); + ivec4 ii4 = i4 + 4; + +#if NCNN_image_shader + afpvec8 v; + + if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + ivec4 yy4 = ii4 / psc(w); + ivec4 xx4 = ii4 % psc(w); + + afpvec8 v0 = image2d_ld8(bottom_blob_2d, ivec2(x4.r, y4.r / 8)); + afpvec8 v1 = image2d_ld8(bottom_blob_2d, ivec2(x4.g, y4.g / 8)); + afpvec8 v2 = image2d_ld8(bottom_blob_2d, ivec2(x4.b, y4.b / 8)); + afpvec8 v3 = image2d_ld8(bottom_blob_2d, ivec2(x4.a, y4.a / 8)); + afpvec8 v4 = image2d_ld8(bottom_blob_2d, ivec2(xx4.r, yy4.r / 8)); + afpvec8 v5 = image2d_ld8(bottom_blob_2d, ivec2(xx4.g, yy4.g / 8)); + afpvec8 v6 = image2d_ld8(bottom_blob_2d, ivec2(xx4.b, yy4.b / 8)); + afpvec8 v7 = image2d_ld8(bottom_blob_2d, ivec2(xx4.a, yy4.a / 8)); + + v[0].r = v0[(y4.r % 8) / 4][y4.r % 4]; + v[0].g = v1[(y4.g % 8) / 4][y4.g % 4]; + v[0].b = v2[(y4.b % 8) / 4][y4.b % 4]; + v[0].a = v3[(y4.a % 8) / 4][y4.a % 4]; + v[1].r = v4[(yy4.r % 8) / 4][yy4.r % 4]; + v[1].g = v5[(yy4.g % 8) / 4][yy4.g % 4]; + v[1].b = v6[(yy4.b % 8) / 4][yy4.b % 4]; + v[1].a = v7[(yy4.a % 8) / 4][yy4.a % 4]; 
+ } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + ivec4 zz4 = ii4 / size; + ivec4 yy4 = ii4 % size / psc(w); + ivec4 xx4 = ii4 % size % psc(w); + + afpvec8 v0 = image3d_ld8(bottom_blob_3d, ivec3(x4.r, y4.r, z4.r / 8)); + afpvec8 v1 = image3d_ld8(bottom_blob_3d, ivec3(x4.g, y4.g, z4.g / 8)); + afpvec8 v2 = image3d_ld8(bottom_blob_3d, ivec3(x4.b, y4.b, z4.b / 8)); + afpvec8 v3 = image3d_ld8(bottom_blob_3d, ivec3(x4.a, y4.a, z4.a / 8)); + afpvec8 v4 = image3d_ld8(bottom_blob_3d, ivec3(xx4.r, yy4.r, zz4.r / 8)); + afpvec8 v5 = image3d_ld8(bottom_blob_3d, ivec3(xx4.g, yy4.g, zz4.g / 8)); + afpvec8 v6 = image3d_ld8(bottom_blob_3d, ivec3(xx4.b, yy4.b, zz4.b / 8)); + afpvec8 v7 = image3d_ld8(bottom_blob_3d, ivec3(xx4.a, yy4.a, zz4.a / 8)); + + v[0].r = v0[(z4.r % 8) / 4][z4.r % 4]; + v[0].g = v1[(z4.g % 8) / 4][z4.g % 4]; + v[0].b = v2[(z4.b % 8) / 4][z4.b % 4]; + v[0].a = v3[(z4.a % 8) / 4][z4.a % 4]; + v[1].r = v4[(zz4.r % 8) / 4][zz4.r % 4]; + v[1].g = v5[(zz4.g % 8) / 4][zz4.g % 4]; + v[1].b = v6[(zz4.b % 8) / 4][zz4.b % 4]; + v[1].a = v7[(zz4.a % 8) / 4][zz4.a % 4]; + } + + image1d_st8(top_blob, gx, v); +#else +#if NCNN_fp16_packed + ivec4 v_offset; + ivec4 lane4; + ivec4 vv_offset; + ivec4 lane8; + + if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + ivec4 yy4 = ii4 / psc(w); + ivec4 xx4 = ii4 % psc(w); + + v_offset = ((y4 / 8) * psc(w) + x4) * 4 + (y4 % 8) / 2; + lane4 = y4 % 2; + vv_offset = ((yy4 / 8) * psc(w) + xx4) * 4 + (yy4 % 8) / 2; + lane8 = yy4 % 2; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + ivec4 zz4 = ii4 / size; + ivec4 yy4 = ii4 % size / psc(w); + ivec4 xx4 = ii4 % size % psc(w); + + v_offset = ((z4 / 8) * psc(cstep) + y4 * psc(w) + x4) * 4 + (z4 % 8) / 2; + lane4 = z4 % 2; + vv_offset = ((zz4 / 8) * psc(cstep) + yy4 * psc(w) + xx4) * 4 + (zz4 % 8) / 2; + lane8 = zz4 % 2; + } + + afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r); + afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g); + afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b); + afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a); + + afpvec2 vvr = buffer_ld2(bottom_blob_data, vv_offset.r); + afpvec2 vvg = buffer_ld2(bottom_blob_data, vv_offset.g); + afpvec2 vvb = buffer_ld2(bottom_blob_data, vv_offset.b); + afpvec2 vva = buffer_ld2(bottom_blob_data, vv_offset.a); + + afpvec8 v = afpvec8(vr[lane4.r], vg[lane4.g], vb[lane4.b], va[lane4.a], vvr[lane8.r], vvg[lane8.g], vvb[lane8.b], vva[lane8.a]); + + buffer_st8(top_blob_data, gx, v); +#else + ivec4 v_offset; + ivec4 vv_offset; + + if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + ivec4 yy4 = ii4 / psc(w); + ivec4 xx4 = ii4 % psc(w); + + v_offset = ((y4 / 8) * psc(w) + x4) * 8 + y4 % 8; + vv_offset = ((yy4 / 8) * psc(w) + xx4) * 8 + yy4 % 8; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + ivec4 zz4 = ii4 / size; + ivec4 yy4 = ii4 % size / psc(w); + ivec4 xx4 = ii4 % size % psc(w); + + v_offset = ((z4 / 8) * psc(cstep) + y4 * psc(w) + x4) * 8 + z4 % 8; + vv_offset = ((zz4 / 8) * psc(cstep) + yy4 * psc(w) + xx4) * 8 + zz4 % 8; + } + + buffer_cp1to8(top_blob_data, gx, bottom_blob_data, v_offset, vv_offset); +#endif +#endif +} diff --git 
a/source/device/vulkan/shaders/innerproduct.comp b/source/device/vulkan/shaders/innerproduct.comp new file mode 100644 index 000000000..baa8c4b9e --- /dev/null +++ b/source/device/vulkan/shaders/innerproduct.comp @@ -0,0 +1,140 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int bias_term = 0; +layout (constant_id = 1) const int activation_type = 0; +layout (constant_id = 2) const float activation_param_0 = 0; +layout (constant_id = 3) const float activation_param_1 = 0; + +#define shape_constant_id_offset 4 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob; +layout (binding = 2) uniform unfp sampler2D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfp weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= 1 || gz >= 1) + return; + + afp sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld1(bias_blob, gx); +#else + sum = buffer_ld1(bias_data, gx); +#endif + } + else + { + sum = afp(0.f); + } + +#if NCNN_image_shader + for (int i = 0; i < psc(w); i++) + { + sum += image2d_ld1(weight_blob, ivec2(i, gx)) * image1d_ld1(bottom_blob, i); + } 
+#else + int w_offset = gx * psc(w); + + for (int i = 0; i < psc(w); i++) + { + sum += buffer_ld1(weight_data, w_offset + i) * buffer_ld1(bottom_blob_data, i); + } +#endif + + if (activation_type == 1) + { + sum = max(sum, afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum = sum < afp(0.f) ? sum * slope : sum; + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum = clamp(sum, const_min, const_max); + } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } + if (activation_type == 5) + { + sum = sum * tanh(log(exp(sum) + afp(1.f))); + } + +#if NCNN_image_shader + image1d_st1(top_blob, gx, sum); +#else + buffer_st1(top_blob_data, gx, sum); +#endif +} diff --git a/source/device/vulkan/shaders/innerproduct_pack1to4.comp b/source/device/vulkan/shaders/innerproduct_pack1to4.comp new file mode 100644 index 000000000..d2f96e4ec --- /dev/null +++ b/source/device/vulkan/shaders/innerproduct_pack1to4.comp @@ -0,0 +1,148 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
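The plain innerproduct.comp above is the scalar (pack1) fully connected kernel: one invocation per output channel gx, accumulating weight[gx * w + i] * input[i] over all w input elements, with an optional bias and a final activation. The innerproduct_pack1to4.comp shader whose body follows produces four output channels per invocation from the same scalar-packed input. As an editor's illustration only (function and variable names are not part of the patch), its buffer path is equivalent to this CPU sketch:

#include <array>
#include <vector>

// CPU sketch of innerproduct_pack1to4: output group gx covers 4 channels and
// weights[gx * w + i] holds the 4 weights that input element i feeds into them.
std::array<float, 4> innerproduct_pack1to4(int gx, int w,
                                           const std::vector<float>& input,                  // pack1 input, length w
                                           const std::vector<std::array<float, 4>>& weights, // gx * w + i addressing
                                           const std::array<float, 4>& bias)                 // used when bias_term == 1
{
    std::array<float, 4> sum = bias;
    for (int i = 0; i < w; i++)
    {
        const float v = input[i];                             // buffer_ld1(bottom_blob_data, i)
        const std::array<float, 4>& k = weights[gx * w + i];  // buffer_ld4(weight_data, w_offset + i)
        for (int j = 0; j < 4; j++)
            sum[j] += v * k[j];                               // sum += v * k
    }
    return sum;                                               // activation is applied afterwards
}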
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int bias_term = 0; +layout (constant_id = 1) const int activation_type = 0; +layout (constant_id = 2) const float activation_param_0 = 0; +layout (constant_id = 3) const float activation_param_1 = 0; + +#define shape_constant_id_offset 4 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob; +layout (binding = 2) uniform unfp sampler2D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= 1 || gz >= 1) + return; + + afpvec4 sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld4(bias_blob, gx); +#else + sum = buffer_ld4(bias_data, gx); +#endif + } + else + { + sum = afpvec4(0.f); + } + +#if NCNN_image_shader + for (int i = 0; i < psc(w); i++) + { + afp v = image1d_ld1(bottom_blob, i); + + afpvec4 k = image2d_ld4(weight_blob, ivec2(i, gx)); + + sum += v * k; + } +#else + int w_offset = gx * psc(w); + + for (int i = 0; i < psc(w); i++) + { + afp v = buffer_ld1(bottom_blob_data, i); + + afpvec4 k = buffer_ld4(weight_data, w_offset + i); + + sum += v * k; + } +#endif + + if (activation_type == 1) + { + sum = max(sum, afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum = mix(sum, sum * afp(slope), lessThan(sum, afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum = clamp(sum, const_min, const_max); + } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } + if (activation_type == 5) + { + sum = sum * tanh(log(exp(sum) + afp(1.f))); + } + +#if NCNN_image_shader + image1d_st4(top_blob, gx, sum); +#else + buffer_st4(top_blob_data, gx, sum); +#endif +} diff --git 
a/source/device/vulkan/shaders/innerproduct_pack1to8.comp b/source/device/vulkan/shaders/innerproduct_pack1to8.comp new file mode 100644 index 000000000..5bb3ffd84 --- /dev/null +++ b/source/device/vulkan/shaders/innerproduct_pack1to8.comp @@ -0,0 +1,160 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int bias_term = 0; +layout (constant_id = 1) const int activation_type = 0; +layout (constant_id = 2) const float activation_param_0 = 0; +layout (constant_id = 3) const float activation_param_1 = 0; + +#define shape_constant_id_offset 4 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob; +layout (binding = 2) uniform unfp sampler2D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= 1 || gz >= 1) + return; + + afpvec8 sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld8(bias_blob, gx); +#else + sum = buffer_ld8(bias_data, gx); +#endif + } + else + { + sum = afpvec8(afpvec4(0.f), afpvec4(0.f)); + } + +#if NCNN_image_shader + 
int wx = 0; + + for (int i = 0; i < psc(w); i++) + { + afp v = image1d_ld1(bottom_blob, i); + + afpvec8 k = image2d_ld8(weight_blob, ivec2(i, gx)); + + // sum += v * k; + sum[0] += v * k[0]; + sum[1] += v * k[1]; + } +#else + int w_offset = gx * psc(w); + + for (int i = 0; i < psc(w); i++) + { + afp v = buffer_ld1(bottom_blob_data, i); + + afpvec8 k = buffer_ld8(weight_data, w_offset + i); + + // sum += v * k; + sum[0] += v * k[0]; + sum[1] += v * k[1]; + } +#endif + + if (activation_type == 1) + { + sum[0] = max(sum[0], afp(0.f)); + sum[1] = max(sum[1], afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum[0] = mix(sum[0], sum[0] * afp(slope), lessThan(sum[0], afpvec4(0.f))); + sum[1] = mix(sum[1], sum[1] * afp(slope), lessThan(sum[1], afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum[0] = clamp(sum[0], const_min, const_max); + sum[1] = clamp(sum[1], const_min, const_max); + } + if (activation_type == 4) + { + sum[0] = afp(1.f) / (afp(1.f) + exp(-sum[0])); + sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1])); + } + if (activation_type == 5) + { + sum[0] = sum[0] * tanh(log(exp(sum[0]) + afp(1.f))); + sum[1] = sum[1] * tanh(log(exp(sum[1]) + afp(1.f))); + } + +#if NCNN_image_shader + image1d_st8(top_blob, gx, sum); +#else + buffer_st8(top_blob_data, gx, sum); +#endif +} diff --git a/source/device/vulkan/shaders/innerproduct_pack4.comp b/source/device/vulkan/shaders/innerproduct_pack4.comp new file mode 100644 index 000000000..b8d4d7554 --- /dev/null +++ b/source/device/vulkan/shaders/innerproduct_pack4.comp @@ -0,0 +1,171 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
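innerproduct_pack4.comp below keeps both input and output in groups of four channels, so each input element carries a 4x4 weight block. The shader builds that block as an afpmat4 from four consecutive vec4 loads and uses GLSL's row-vector product, where sum += v * k expands to sum[j] += dot(v, k[j]) with k[j] being the j-th loaded column. A hedged CPU equivalent of the fp16-packed buffer path (names are illustrative, not from the patch):

#include <array>
#include <vector>

using vec4 = std::array<float, 4>;

static float dot4(const vec4& a, const vec4& b)
{
    return a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3];
}

// CPU sketch of innerproduct_pack4: weights[(gx * w + i) * 4 + j] is the j-th
// column of the 4x4 block for input element i; each column feeds one output lane.
vec4 innerproduct_pack4(int gx, int w,
                        const std::vector<vec4>& input,
                        const std::vector<vec4>& weights,
                        const vec4& bias)
{
    vec4 sum = bias;
    for (int i = 0; i < w; i++)
        for (int j = 0; j < 4; j++)
            sum[j] += dot4(input[i], weights[(gx * w + i) * 4 + j]); // sum += v * k
    return sum;
}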
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int bias_term = 0; +layout (constant_id = 1) const int activation_type = 0; +layout (constant_id = 2) const float activation_param_0 = 0; +layout (constant_id = 3) const float activation_param_1 = 0; + +#define shape_constant_id_offset 4 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob; +layout (binding = 2) uniform unfp sampler2D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic) +// GL_EXT_shader_16bit_storage does not define f16mat4 type :( +layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; +#else +layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; }; +#endif +layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= 1 || gz >= 1) + return; + + afpvec4 sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld4(bias_blob, gx); +#else + sum = buffer_ld4(bias_data, gx); +#endif + } + else + { + sum = afpvec4(0.f); + } + +#if NCNN_image_shader + int wx = 0; + + for (int i = 0; i < psc(w); i++) + { + afpvec4 v = image1d_ld4(bottom_blob, i); + afpmat4 k = afpmat4( + image2d_ld4(weight_blob, ivec2(wx + 0, gx)), + image2d_ld4(weight_blob, ivec2(wx + 1, gx)), + image2d_ld4(weight_blob, ivec2(wx + 2, gx)), + image2d_ld4(weight_blob, ivec2(wx + 3, gx)) + ); + + sum += v * k; + + wx += 4; + } +#else + int w_offset = gx * psc(w); + + for (int i = 0; i < psc(w); i++) + { + afpvec4 v = buffer_ld4(bottom_blob_data, i); + +#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic) + // GL_EXT_shader_16bit_storage does not define f16mat4 type :( + afpmat4 k = afpmat4( + buffer_ld4(weight_data, (w_offset + i) * 4 + 0), + buffer_ld4(weight_data, (w_offset + i) * 4 + 1), + buffer_ld4(weight_data, (w_offset + i) * 4 + 2), + buffer_ld4(weight_data, (w_offset + 
i) * 4 + 3) + ); +#else + afpmat4 k = afpmat4(weight_data[w_offset + i]); +#endif + + sum += v * k; + } +#endif + + if (activation_type == 1) + { + sum = max(sum, afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum = mix(sum, sum * afp(slope), lessThan(sum, afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum = clamp(sum, const_min, const_max); + } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } + if (activation_type == 5) + { + sum = sum * tanh(log(exp(sum) + afp(1.f))); + } + +#if NCNN_image_shader + image1d_st4(top_blob, gx, sum); +#else + buffer_st4(top_blob_data, gx, sum); +#endif +} diff --git a/source/device/vulkan/shaders/innerproduct_pack4to1.comp b/source/device/vulkan/shaders/innerproduct_pack4to1.comp new file mode 100644 index 000000000..9faf8100f --- /dev/null +++ b/source/device/vulkan/shaders/innerproduct_pack4to1.comp @@ -0,0 +1,148 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int bias_term = 0; +layout (constant_id = 1) const int activation_type = 0; +layout (constant_id = 2) const float activation_param_0 = 0; +layout (constant_id = 3) const float activation_param_1 = 0; + +#define shape_constant_id_offset 4 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob; +layout (binding = 2) uniform unfp sampler2D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +layout (binding = 2) readonly buffer 
weight_blob { sfpvec4 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= 1 || gz >= 1) + return; + + afp sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld1(bias_blob, gx); +#else + sum = buffer_ld1(bias_data, gx); +#endif + } + else + { + sum = afp(0.f); + } + +#if NCNN_image_shader + for (int i = 0; i < psc(w); i++) + { + afpvec4 v = image1d_ld4(bottom_blob, i); + + afpvec4 k = image2d_ld4(weight_blob, ivec2(i, gx)); + + sum += dot(v, k); + } +#else + int w_offset = gx * psc(w); + + for (int i = 0; i < psc(w); i++) + { + afpvec4 v = buffer_ld4(bottom_blob_data, i); + + afpvec4 k = buffer_ld4(weight_data, w_offset + i); + + sum += dot(v, k); + } +#endif + + if (activation_type == 1) + { + sum = max(sum, afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum = sum < afp(0.f) ? sum * slope : sum; + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum = clamp(sum, const_min, const_max); + } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } + if (activation_type == 5) + { + sum = sum * tanh(log(exp(sum) + afp(1.f))); + } + +#if NCNN_image_shader + image1d_st1(top_blob, gx, sum); +#else + buffer_st1(top_blob_data, gx, sum); +#endif +} diff --git a/source/device/vulkan/shaders/innerproduct_pack4to8.comp b/source/device/vulkan/shaders/innerproduct_pack4to8.comp new file mode 100644 index 000000000..a8ee4a309 --- /dev/null +++ b/source/device/vulkan/shaders/innerproduct_pack4to8.comp @@ -0,0 +1,188 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
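innerproduct_pack4to8.comp, whose body follows, widens the output pack: for every pack4 input element it loads eight consecutive vec4 weight rows at (w_offset + i) * 8 + 0..7, and each dot product fills one of the eight output lanes. The addressing rule is shared by all the variants in this patch; a small editor's summary of it (not code from the patch):

// Flat index of the j-th packed weight row for input element i of output group gx.
// rows_per_input is 1 for pack1to4, pack1to8, pack4to1 and pack8to1 (j is always 0),
// 4 for pack4 (fp16-packed path) and pack8to4, and 8 for pack4to8 and pack8.
int weight_index(int gx, int w, int i, int rows_per_input, int j)
{
    const int w_offset = gx * w;                 // as computed in every shader
    return (w_offset + i) * rows_per_input + j;
}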
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int bias_term = 0; +layout (constant_id = 1) const int activation_type = 0; +layout (constant_id = 2) const float activation_param_0 = 0; +layout (constant_id = 3) const float activation_param_1 = 0; + +#define shape_constant_id_offset 4 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob; +layout (binding = 2) uniform unfp sampler2D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= 1 || gz >= 1) + return; + + afpvec8 sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld8(bias_blob, gx); +#else + sum = buffer_ld8(bias_data, gx); +#endif + } + else + { + sum = afpvec8(afpvec4(0.f), afpvec4(0.f)); + } + +#if NCNN_image_shader + int wx = 0; + + for (int i = 0; i < psc(w); i++) + { + afpvec4 v = image1d_ld4(bottom_blob, i); + + afpvec4 k0 = image2d_ld4(weight_blob, ivec2(wx + 0, gx)); + afpvec4 k1 = image2d_ld4(weight_blob, ivec2(wx + 1, gx)); + afpvec4 k2 = image2d_ld4(weight_blob, ivec2(wx + 2, gx)); + afpvec4 k3 = image2d_ld4(weight_blob, ivec2(wx + 3, gx)); + afpvec4 k4 = image2d_ld4(weight_blob, ivec2(wx + 4, gx)); + afpvec4 k5 = image2d_ld4(weight_blob, ivec2(wx + 5, gx)); + afpvec4 k6 = image2d_ld4(weight_blob, ivec2(wx + 6, gx)); + afpvec4 k7 = image2d_ld4(weight_blob, ivec2(wx + 7, gx)); + + // sum += v * k; + sum[0].r += dot(v, k0); + sum[0].g += dot(v, k1); + sum[0].b += dot(v, k2); + sum[0].a += dot(v, k3); + sum[1].r += dot(v, k4); + sum[1].g += dot(v, k5); + sum[1].b += dot(v, k6); + sum[1].a += dot(v, k7); + + wx += 8; + } +#else + int w_offset = gx * psc(w); + + for (int i = 0; i < psc(w); i++) + { + afpvec4 v = buffer_ld4(bottom_blob_data, i); + + afpvec4 k0 = 
buffer_ld4(weight_data, (w_offset + i) * 8 + 0); + afpvec4 k1 = buffer_ld4(weight_data, (w_offset + i) * 8 + 1); + afpvec4 k2 = buffer_ld4(weight_data, (w_offset + i) * 8 + 2); + afpvec4 k3 = buffer_ld4(weight_data, (w_offset + i) * 8 + 3); + afpvec4 k4 = buffer_ld4(weight_data, (w_offset + i) * 8 + 4); + afpvec4 k5 = buffer_ld4(weight_data, (w_offset + i) * 8 + 5); + afpvec4 k6 = buffer_ld4(weight_data, (w_offset + i) * 8 + 6); + afpvec4 k7 = buffer_ld4(weight_data, (w_offset + i) * 8 + 7); + + // sum += v * k; + sum[0].r += dot(v, k0); + sum[0].g += dot(v, k1); + sum[0].b += dot(v, k2); + sum[0].a += dot(v, k3); + sum[1].r += dot(v, k4); + sum[1].g += dot(v, k5); + sum[1].b += dot(v, k6); + sum[1].a += dot(v, k7); + } +#endif + + if (activation_type == 1) + { + sum[0] = max(sum[0], afp(0.f)); + sum[1] = max(sum[1], afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum[0] = mix(sum[0], sum[0] * afp(slope), lessThan(sum[0], afpvec4(0.f))); + sum[1] = mix(sum[1], sum[1] * afp(slope), lessThan(sum[1], afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum[0] = clamp(sum[0], const_min, const_max); + sum[1] = clamp(sum[1], const_min, const_max); + } + if (activation_type == 4) + { + sum[0] = afp(1.f) / (afp(1.f) + exp(-sum[0])); + sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1])); + } + if (activation_type == 5) + { + sum[0] = sum[0] * tanh(log(exp(sum[0]) + afp(1.f))); + sum[1] = sum[1] * tanh(log(exp(sum[1]) + afp(1.f))); + } + +#if NCNN_image_shader + image1d_st8(top_blob, gx, sum); +#else + buffer_st8(top_blob_data, gx, sum); +#endif +} diff --git a/source/device/vulkan/shaders/innerproduct_pack8.comp b/source/device/vulkan/shaders/innerproduct_pack8.comp new file mode 100644 index 000000000..50f7f4139 --- /dev/null +++ b/source/device/vulkan/shaders/innerproduct_pack8.comp @@ -0,0 +1,188 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
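innerproduct_pack8.comp below works on eight-channel packs. With fp16 storage a pack8 value is the two-half struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }, so an 8-wide dot product is written as dot(v[0], k[0]) + dot(v[1], k[1]). A CPU sketch of one output group under that layout (names are the editor's, not the patch's):

#include <array>
#include <vector>

using vec4 = std::array<float, 4>;
using vec8 = std::array<vec4, 2>; // two float4 halves, mirroring sfpvec8

static float dot4(const vec4& a, const vec4& b)
{
    return a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3];
}

static float dot8(const vec8& v, const vec8& k)
{
    return dot4(v[0], k[0]) + dot4(v[1], k[1]); // dot(v[0], k[0]) + dot(v[1], k[1])
}

// CPU sketch of innerproduct_pack8: eight pack8 weight rows per input element,
// one dot product per output lane of the group.
vec8 innerproduct_pack8(int gx, int w,
                        const std::vector<vec8>& input,
                        const std::vector<vec8>& weights,
                        const vec8& bias)
{
    vec8 sum = bias;
    for (int i = 0; i < w; i++)
        for (int j = 0; j < 8; j++)
            sum[j / 4][j % 4] += dot8(input[i], weights[(gx * w + i) * 8 + j]);
    return sum;
}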
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int bias_term = 0; +layout (constant_id = 1) const int activation_type = 0; +layout (constant_id = 2) const float activation_param_0 = 0; +layout (constant_id = 3) const float activation_param_1 = 0; + +#define shape_constant_id_offset 4 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob; +layout (binding = 2) uniform unfp sampler2D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= 1 || gz >= 1) + return; + + afpvec8 sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld8(bias_blob, gx); +#else + sum = buffer_ld8(bias_data, gx); +#endif + } + else + { + sum = afpvec8(afpvec4(0.f), afpvec4(0.f)); + } + +#if NCNN_image_shader + int wx = 0; + + for (int i = 0; i < psc(w); i++) + { + afpvec8 v = image1d_ld8(bottom_blob, i); + + afpvec8 k0 = image2d_ld8(weight_blob, ivec2(wx + 0, gx)); + afpvec8 k1 = image2d_ld8(weight_blob, ivec2(wx + 1, gx)); + afpvec8 k2 = image2d_ld8(weight_blob, ivec2(wx + 2, gx)); + afpvec8 k3 = image2d_ld8(weight_blob, ivec2(wx + 3, gx)); + afpvec8 k4 = image2d_ld8(weight_blob, ivec2(wx + 4, gx)); + afpvec8 k5 = image2d_ld8(weight_blob, ivec2(wx + 5, gx)); + afpvec8 k6 = image2d_ld8(weight_blob, ivec2(wx + 6, gx)); + afpvec8 k7 = image2d_ld8(weight_blob, ivec2(wx + 7, gx)); + + // sum += v * k + sum[0].r += dot(v[0], k0[0]) + dot(v[1], k0[1]); + sum[0].g += dot(v[0], k1[0]) + dot(v[1], k1[1]); + sum[0].b += dot(v[0], k2[0]) + dot(v[1], k2[1]); + sum[0].a += dot(v[0], k3[0]) + dot(v[1], k3[1]); + sum[1].r += dot(v[0], k4[0]) + dot(v[1], k4[1]); + sum[1].g += dot(v[0], k5[0]) + dot(v[1], k5[1]); + sum[1].b += dot(v[0], k6[0]) + dot(v[1], k6[1]); + sum[1].a += 
dot(v[0], k7[0]) + dot(v[1], k7[1]); + + wx += 8; + } +#else + int w_offset = gx * psc(w); + + for (int i = 0; i < psc(w); i++) + { + afpvec8 v = buffer_ld8(bottom_blob_data, i); + + afpvec8 k0 = buffer_ld8(weight_data, (w_offset + i) * 8 + 0); + afpvec8 k1 = buffer_ld8(weight_data, (w_offset + i) * 8 + 1); + afpvec8 k2 = buffer_ld8(weight_data, (w_offset + i) * 8 + 2); + afpvec8 k3 = buffer_ld8(weight_data, (w_offset + i) * 8 + 3); + afpvec8 k4 = buffer_ld8(weight_data, (w_offset + i) * 8 + 4); + afpvec8 k5 = buffer_ld8(weight_data, (w_offset + i) * 8 + 5); + afpvec8 k6 = buffer_ld8(weight_data, (w_offset + i) * 8 + 6); + afpvec8 k7 = buffer_ld8(weight_data, (w_offset + i) * 8 + 7); + + // sum += v * k + sum[0].r += dot(v[0], k0[0]) + dot(v[1], k0[1]); + sum[0].g += dot(v[0], k1[0]) + dot(v[1], k1[1]); + sum[0].b += dot(v[0], k2[0]) + dot(v[1], k2[1]); + sum[0].a += dot(v[0], k3[0]) + dot(v[1], k3[1]); + sum[1].r += dot(v[0], k4[0]) + dot(v[1], k4[1]); + sum[1].g += dot(v[0], k5[0]) + dot(v[1], k5[1]); + sum[1].b += dot(v[0], k6[0]) + dot(v[1], k6[1]); + sum[1].a += dot(v[0], k7[0]) + dot(v[1], k7[1]); + } +#endif + + if (activation_type == 1) + { + sum[0] = max(sum[0], afp(0.f)); + sum[1] = max(sum[1], afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum[0] = mix(sum[0], sum[0] * afp(slope), lessThan(sum[0], afpvec4(0.f))); + sum[1] = mix(sum[1], sum[1] * afp(slope), lessThan(sum[1], afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum[0] = clamp(sum[0], const_min, const_max); + sum[1] = clamp(sum[1], const_min, const_max); + } + if (activation_type == 4) + { + sum[0] = afp(1.f) / (afp(1.f) + exp(-sum[0])); + sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1])); + } + if (activation_type == 5) + { + sum[0] = sum[0] * tanh(log(exp(sum[0]) + afp(1.f))); + sum[1] = sum[1] * tanh(log(exp(sum[1]) + afp(1.f))); + } + +#if NCNN_image_shader + image1d_st8(top_blob, gx, sum); +#else + buffer_st8(top_blob_data, gx, sum); +#endif +} diff --git a/source/device/vulkan/shaders/innerproduct_pack8to1.comp b/source/device/vulkan/shaders/innerproduct_pack8to1.comp new file mode 100644 index 000000000..6fa3b1adc --- /dev/null +++ b/source/device/vulkan/shaders/innerproduct_pack8to1.comp @@ -0,0 +1,151 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
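Every inner-product kernel in this patch, including innerproduct_pack8to1.comp below, finishes with the same activation_type ladder: 1 is ReLU, 2 is leaky ReLU with slope activation_param_0, 3 clamps to [activation_param_0, activation_param_1], 4 is the logistic sigmoid, and 5, written as x * tanh(log(exp(x) + 1)), corresponds to Mish. A scalar CPU sketch of that ladder (an illustrative helper, not part of the patch):

#include <algorithm>
#include <cmath>

// CPU sketch of the activation switch applied to each accumulator lane.
float apply_activation(float x, int activation_type, float param0, float param1)
{
    if (activation_type == 1) x = std::max(x, 0.f);                           // ReLU
    if (activation_type == 2) x = x < 0.f ? x * param0 : x;                   // leaky ReLU
    if (activation_type == 3) x = std::min(std::max(x, param0), param1);      // clip
    if (activation_type == 4) x = 1.f / (1.f + std::exp(-x));                 // sigmoid
    if (activation_type == 5) x = x * std::tanh(std::log(std::exp(x) + 1.f)); // Mish
    return x;
}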
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int bias_term = 0; +layout (constant_id = 1) const int activation_type = 0; +layout (constant_id = 2) const float activation_param_0 = 0; +layout (constant_id = 3) const float activation_param_1 = 0; + +#define shape_constant_id_offset 4 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob; +layout (binding = 2) uniform unfp sampler2D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= 1 || gz >= 1) + return; + + afp sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld1(bias_blob, gx); +#else + sum = buffer_ld1(bias_data, gx); +#endif + } + else + { + sum = afp(0.f); + } + +#if NCNN_image_shader + for (int i = 0; i < psc(w); i++) + { + afpvec8 v = image1d_ld8(bottom_blob, i); + + afpvec8 k = image2d_ld8(weight_blob, ivec2(i, gx)); + + // sum += dot(v, k); + sum += dot(v[0], k[0]) + dot(v[1], k[1]); + } +#else + int w_offset = gx * psc(w); + + for (int i = 0; i < psc(w); i++) + { + afpvec8 v = buffer_ld8(bottom_blob_data, i); + + afpvec8 k = buffer_ld8(weight_data, w_offset + i); + + // sum += dot(v, k); + sum += dot(v[0], k[0]) + dot(v[1], k[1]); + } +#endif + + if (activation_type == 1) + { + sum = max(sum, afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum = sum < afp(0.f) ? 
sum * slope : sum; + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum = clamp(sum, const_min, const_max); + } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } + if (activation_type == 5) + { + sum = sum * tanh(log(exp(sum) + afp(1.f))); + } + +#if NCNN_image_shader + image1d_st1(top_blob, gx, sum); +#else + buffer_st1(top_blob_data, gx, sum); +#endif +} diff --git a/source/device/vulkan/shaders/innerproduct_pack8to4.comp b/source/device/vulkan/shaders/innerproduct_pack8to4.comp new file mode 100644 index 000000000..0fb99082b --- /dev/null +++ b/source/device/vulkan/shaders/innerproduct_pack8to4.comp @@ -0,0 +1,167 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int bias_term = 0; +layout (constant_id = 1) const int activation_type = 0; +layout (constant_id = 2) const float activation_param_0 = 0; +layout (constant_id = 3) const float activation_param_1 = 0; + +#define shape_constant_id_offset 4 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob; +layout (binding = 2) uniform unfp sampler2D weight_blob; +layout (binding = 3) uniform unfp sampler1D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; 
+ int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= 1 || gz >= 1) + return; + + afpvec4 sum; + + if (bias_term == 1) + { +#if NCNN_image_shader + sum = image1d_ld4(bias_blob, gx); +#else + sum = buffer_ld4(bias_data, gx); +#endif + } + else + { + sum = afpvec4(0.f); + } + +#if NCNN_image_shader + int wx = 0; + + for (int i = 0; i < psc(w); i++) + { + afpvec8 v = image1d_ld8(bottom_blob, i); + + afpvec8 k0 = image2d_ld8(weight_blob, ivec2(wx + 0, gx)); + afpvec8 k1 = image2d_ld8(weight_blob, ivec2(wx + 1, gx)); + afpvec8 k2 = image2d_ld8(weight_blob, ivec2(wx + 2, gx)); + afpvec8 k3 = image2d_ld8(weight_blob, ivec2(wx + 3, gx)); + + // sum += v * k + sum.r += dot(v[0], k0[0]) + dot(v[1], k0[1]); + sum.g += dot(v[0], k1[0]) + dot(v[1], k1[1]); + sum.b += dot(v[0], k2[0]) + dot(v[1], k2[1]); + sum.a += dot(v[0], k3[0]) + dot(v[1], k3[1]); + + wx += 4; + } +#else + int w_offset = gx * psc(w); + + for (int i = 0; i < psc(w); i++) + { + afpvec8 v = buffer_ld8(bottom_blob_data, i); + + afpvec8 k0 = buffer_ld8(weight_data, (w_offset + i) * 4 + 0); + afpvec8 k1 = buffer_ld8(weight_data, (w_offset + i) * 4 + 1); + afpvec8 k2 = buffer_ld8(weight_data, (w_offset + i) * 4 + 2); + afpvec8 k3 = buffer_ld8(weight_data, (w_offset + i) * 4 + 3); + + // sum += v * k + sum.r += dot(v[0], k0[0]) + dot(v[1], k0[1]); + sum.g += dot(v[0], k1[0]) + dot(v[1], k1[1]); + sum.b += dot(v[0], k2[0]) + dot(v[1], k2[1]); + sum.a += dot(v[0], k3[0]) + dot(v[1], k3[1]); + } +#endif + + if (activation_type == 1) + { + sum = max(sum, afp(0.f)); + } + if (activation_type == 2) + { + const afp slope = afp(activation_param_0); + sum = mix(sum, sum * afp(slope), lessThan(sum, afpvec4(0.f))); + } + if (activation_type == 3) + { + const afp const_min = afp(activation_param_0); + const afp const_max = afp(activation_param_1); + sum = clamp(sum, const_min, const_max); + } + if (activation_type == 4) + { + sum = afp(1.f) / (afp(1.f) + exp(-sum)); + } + if (activation_type == 5) + { + sum = sum * tanh(log(exp(sum) + afp(1.f))); + } + +#if NCNN_image_shader + image1d_st4(top_blob, gx, sum); +#else + buffer_st4(top_blob_data, gx, sum); +#endif +} diff --git a/source/device/vulkan/shaders/interp.comp b/source/device/vulkan/shaders/interp.comp new file mode 100644 index 000000000..f0f24fa33 --- /dev/null +++ b/source/device/vulkan/shaders/interp.comp @@ -0,0 +1,149 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
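interp.comp, whose body follows, handles resize_type 1 (nearest) and 2 (bilinear). Nearest truncates gx * scale_x and clamps to the last source column; bilinear uses the half-pixel mapping (gx + 0.5) * scale_x - 0.5, clamps the top-left sample to [0, w - 2] and snaps the fractional weight to 0 or 1 at the borders. A CPU sketch of that coordinate math (helper names are the editor's, not the patch's):

#include <algorithm>
#include <cmath>

// Nearest-neighbour source column for output column gx (scale = src_w / out_w).
int nearest_sx(int gx, float scale, int src_w)
{
    return std::min((int)std::floor(gx * scale), src_w - 1);
}

// Bilinear source column and fractional weight for output column gx;
// the same math is applied to rows with scale_y and h.
void bilinear_sx(int gx, float scale, int src_w, int& sx, float& fx)
{
    fx = (gx + 0.5f) * scale - 0.5f;
    sx = (int)std::floor(fx);
    fx -= sx;
    if (sx < 0)         { sx = 0;         fx = 0.f; } // underflow: stick to the left border
    if (sx > src_w - 2) { sx = src_w - 2; fx = 1.f; } // overflow: stick to the right border
}

// The sampled value is then lerp(lerp(a0, a1, fx), lerp(b0, b1, fx), fy),
// with a0/a1 on row sy and b0/b1 on row sy + 1, matching the shader's two-step mix.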
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int resize_type = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; + + float scale_x; + float scale_y; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + if (resize_type == 1) // nearest + { + afpvec2 gxy = afpvec2(gx, gy); + ivec2 sxy_max = ivec2(psc(w) - 1, psc(h) - 1); + ivec2 sxy = min(ivec2(floor(gxy * afpvec2(p.scale_x, p.scale_y))), sxy_max); + + int sx = sxy.r; + int sy = sxy.g; + +#if NCNN_image_shader + image3d_cp1(top_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(sx, sy, gz)); +#else + int v_offset = gz * psc(cstep) + sy * psc(w) + sx; + + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_cp1(top_blob_data, gi, bottom_blob_data, v_offset); +#endif + } + if (resize_type == 2) // bilinear + { + afpvec2 gxy = afpvec2(gx, gy); + afpvec2 fxy = (gxy + afp(0.5f)) * afpvec2(p.scale_x, p.scale_y) - afp(0.5f); + + ivec2 sxy = ivec2(floor(fxy)); + + fxy -= afpvec2(sxy); + + ivec2 sxy_max = ivec2(psc(w) - 2, psc(h) - 2); + + bvec2 underflow = lessThan(sxy, ivec2(0)); + bvec2 overflow = greaterThan(sxy, sxy_max); + + sxy = clamp(sxy, ivec2(0), sxy_max); + + fxy = mix(fxy, afpvec2(0.f), underflow); + fxy = mix(fxy, afpvec2(1.f), overflow); + + int sx = sxy.r; + int sy = sxy.g; + +#if NCNN_image_shader + afp a0 = image3d_ld1(bottom_blob, ivec3(sx, sy, gz)); + afp a1 = image3d_ld1(bottom_blob, ivec3(sx + 1, sy, gz)); + afp b0 = image3d_ld1(bottom_blob, ivec3(sx, sy + 1, gz)); + afp b1 = image3d_ld1(bottom_blob, ivec3(sx + 1, sy + 1, gz)); +#else + int v_offset_0 = gz * psc(cstep) + sy * psc(w) + sx; + int v_offset_1 = gz * psc(cstep) + (sy + 1) * psc(w) + sx; + + afp a0 = buffer_ld1(bottom_blob_data, v_offset_0); + afp a1 = buffer_ld1(bottom_blob_data, v_offset_0 + 1); + afp b0 = buffer_ld1(bottom_blob_data, v_offset_1); + afp b1 = buffer_ld1(bottom_blob_data, v_offset_1 + 1); 
+#endif + + afp fx = fxy.r; + afp fy = fxy.g; + + afpvec2 ab = afpvec2(a0, b0) * (afp(1.f) - fx) + afpvec2(a1, b1) * fx; + + afp res = ab.r * (afp(1.f) - fy) + ab.g * fy; + +#if NCNN_image_shader + image3d_st1(top_blob, ivec3(gx, gy, gz), res); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st1(top_blob_data, gi, res); +#endif + } +} diff --git a/source/device/vulkan/shaders/interp_bicubic.comp b/source/device/vulkan/shaders/interp_bicubic.comp new file mode 100644 index 000000000..2f4e26886 --- /dev/null +++ b/source/device/vulkan/shaders/interp_bicubic.comp @@ -0,0 +1,149 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#define shape_constant_id_offset 0 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#endif +layout (binding = 2) readonly buffer alpha_blob { sfpvec4 alpha_blob_data[]; }; +layout (binding = 3) readonly buffer xofs_blob { int xofs_blob_data[]; }; +layout (binding = 4) readonly buffer beta_blob { sfpvec4 beta_blob_data[]; }; +layout (binding = 5) readonly buffer yofs_blob { int yofs_blob_data[]; }; + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + int sx = xofs_blob_data[gx]; + int sy = yofs_blob_data[gy]; + +#if NCNN_image_shader + afp a0 = 
image3d_ld1(bottom_blob, ivec3(sx - 1, sy - 1, gz)); + afp a1 = image3d_ld1(bottom_blob, ivec3(sx + 0, sy - 1, gz)); + afp a2 = image3d_ld1(bottom_blob, ivec3(sx + 1, sy - 1, gz)); + afp a3 = image3d_ld1(bottom_blob, ivec3(sx + 2, sy - 1, gz)); + + afp b0 = image3d_ld1(bottom_blob, ivec3(sx - 1, sy + 0, gz)); + afp b1 = image3d_ld1(bottom_blob, ivec3(sx + 0, sy + 0, gz)); + afp b2 = image3d_ld1(bottom_blob, ivec3(sx + 1, sy + 0, gz)); + afp b3 = image3d_ld1(bottom_blob, ivec3(sx + 2, sy + 0, gz)); + + afp c0 = image3d_ld1(bottom_blob, ivec3(sx - 1, sy + 1, gz)); + afp c1 = image3d_ld1(bottom_blob, ivec3(sx + 0, sy + 1, gz)); + afp c2 = image3d_ld1(bottom_blob, ivec3(sx + 1, sy + 1, gz)); + afp c3 = image3d_ld1(bottom_blob, ivec3(sx + 2, sy + 1, gz)); + + afp d0 = image3d_ld1(bottom_blob, ivec3(sx - 1, sy + 2, gz)); + afp d1 = image3d_ld1(bottom_blob, ivec3(sx + 0, sy + 2, gz)); + afp d2 = image3d_ld1(bottom_blob, ivec3(sx + 1, sy + 2, gz)); + afp d3 = image3d_ld1(bottom_blob, ivec3(sx + 2, sy + 2, gz)); +#else + int v_offset_0 = gz * psc(cstep) + (sy - 1) * psc(w) + sx; + int v_offset_1 = gz * psc(cstep) + (sy + 0) * psc(w) + sx; + int v_offset_2 = gz * psc(cstep) + (sy + 1) * psc(w) + sx; + int v_offset_3 = gz * psc(cstep) + (sy + 2) * psc(w) + sx; + + afp a0 = buffer_ld1(bottom_blob_data, v_offset_0 - 1); + afp a1 = buffer_ld1(bottom_blob_data, v_offset_0 + 0); + afp a2 = buffer_ld1(bottom_blob_data, v_offset_0 + 1); + afp a3 = buffer_ld1(bottom_blob_data, v_offset_0 + 2); + + afp b0 = buffer_ld1(bottom_blob_data, v_offset_1 - 1); + afp b1 = buffer_ld1(bottom_blob_data, v_offset_1 + 0); + afp b2 = buffer_ld1(bottom_blob_data, v_offset_1 + 1); + afp b3 = buffer_ld1(bottom_blob_data, v_offset_1 + 2); + + afp c0 = buffer_ld1(bottom_blob_data, v_offset_2 - 1); + afp c1 = buffer_ld1(bottom_blob_data, v_offset_2 + 0); + afp c2 = buffer_ld1(bottom_blob_data, v_offset_2 + 1); + afp c3 = buffer_ld1(bottom_blob_data, v_offset_2 + 2); + + afp d0 = buffer_ld1(bottom_blob_data, v_offset_3 - 1); + afp d1 = buffer_ld1(bottom_blob_data, v_offset_3 + 0); + afp d2 = buffer_ld1(bottom_blob_data, v_offset_3 + 1); + afp d3 = buffer_ld1(bottom_blob_data, v_offset_3 + 2); +#endif + + afpmat4 abcd0123 = afpmat4( + a0, a1, a2, a3, + b0, b1, b2, b3, + c0, c1, c2, c3, + d0, d1, d2, d3 + ); + + afpvec4 alpha = buffer_ld4(alpha_blob_data, gx); + + afpvec4 abcd = alpha * abcd0123; + + afpvec4 beta = buffer_ld4(beta_blob_data, gy); + + afp v = dot(abcd, beta); + +#if NCNN_image_shader + image3d_st1(top_blob, ivec3(gx, gy, gz), v); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st1(top_blob_data, gi, v); +#endif +} diff --git a/source/device/vulkan/shaders/interp_bicubic_coeffs.comp b/source/device/vulkan/shaders/interp_bicubic_coeffs.comp new file mode 100644 index 000000000..1de3ce938 --- /dev/null +++ b/source/device/vulkan/shaders/interp_bicubic_coeffs.comp @@ -0,0 +1,107 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#define shape_constant_id_offset 0 +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int outw = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) writeonly buffer alpha_blob { sfpvec4 alpha_blob_data[]; }; +layout (binding = 1) writeonly buffer xofs_blob { int xofs_blob_data[]; }; + +layout (push_constant) uniform parameter +{ + int w; + int outw; + float scale; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= 1 || gz >= 1) + return; + + afp fx = (afp(gx) + afp(0.5f)) * afp(p.scale) - afp(0.5f); + int sx = int(floor(fx)); + fx -= afp(sx); + + // interpolate_cubic(fx, coeffs); + afpvec4 coeffs; + { + const afp A = afp(-0.75f); + + afp fx0 = fx + afp(1.f); + afp fx1 = fx; + afp fx2 = afp(1.f) - fx; + // afp fx3 = afp(2.f) - fx; + + coeffs.r = A * fx0*fx0*fx0 - afp(5.f)*A * fx0*fx0 + afp(8.f)*A * fx0 - afp(4.f)*A; + coeffs.g = (A+afp(2.f)) * fx1*fx1*fx1 - (A+afp(3.f)) * fx1*fx1 + afp(1.f); + coeffs.b = (A+afp(2.f)) * fx2*fx2*fx2 - (A+afp(3.f)) * fx2*fx2 + afp(1.f); + coeffs.a = afp(1.f) - coeffs.r - coeffs.g - coeffs.b; + } + + if (sx <= -1) + { + sx = 1; + coeffs.r = afp(1.f) - coeffs.a; + coeffs.g = coeffs.a; + coeffs.b = afp(0.f); + coeffs.a = afp(0.f); + } + if (sx == 0) + { + sx = 1; + coeffs.r = coeffs.r + coeffs.g; + coeffs.g = coeffs.b; + coeffs.b = coeffs.a; + coeffs.a = afp(0.f); + } + if (sx == psc(w) - 2) + { + sx = psc(w) - 3; + coeffs.a = coeffs.b + coeffs.a; + coeffs.b = coeffs.g; + coeffs.g = coeffs.r; + coeffs.r = afp(0.f); + } + if (sx >= psc(w) - 1) + { + sx = psc(w) - 3; + coeffs.a = afp(1.f) - coeffs.r; + coeffs.b = coeffs.r; + coeffs.g = afp(0.f); + coeffs.r = afp(0.f); + } + + buffer_st4(alpha_blob_data, gx, coeffs); + + xofs_blob_data[gx] = sx; +} diff --git a/source/device/vulkan/shaders/interp_bicubic_pack4.comp b/source/device/vulkan/shaders/interp_bicubic_pack4.comp new file mode 100644 index 000000000..e89d6b141 --- /dev/null +++ b/source/device/vulkan/shaders/interp_bicubic_pack4.comp @@ -0,0 +1,163 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
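+
+// Bicubic resize for pack4 data. For each output element the 4x4 input
+// neighborhood around (xofs[gx], yofs[gy]) is loaded, each row of four
+// pack4 values is blended with the horizontal weights in alpha_blob, and
+// the four row results are blended with the vertical weights in beta_blob.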
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#define shape_constant_id_offset 0 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif +layout (binding = 2) readonly buffer alpha_blob { sfpvec4 alpha_blob_data[]; }; +layout (binding = 3) readonly buffer xofs_blob { int xofs_blob_data[]; }; +layout (binding = 4) readonly buffer beta_blob { sfpvec4 beta_blob_data[]; }; +layout (binding = 5) readonly buffer yofs_blob { int yofs_blob_data[]; }; + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + int sx = xofs_blob_data[gx]; + int sy = yofs_blob_data[gy]; + +#if NCNN_image_shader + afpvec4 a0 = image3d_ld4(bottom_blob, ivec3(sx - 1, sy - 1, gz)); + afpvec4 a1 = image3d_ld4(bottom_blob, ivec3(sx + 0, sy - 1, gz)); + afpvec4 a2 = image3d_ld4(bottom_blob, ivec3(sx + 1, sy - 1, gz)); + afpvec4 a3 = image3d_ld4(bottom_blob, ivec3(sx + 2, sy - 1, gz)); + + afpmat4 a0123 = afpmat4(a0, a1, a2, a3); + + afpvec4 b0 = image3d_ld4(bottom_blob, ivec3(sx - 1, sy + 0, gz)); + afpvec4 b1 = image3d_ld4(bottom_blob, ivec3(sx + 0, sy + 0, gz)); + afpvec4 b2 = image3d_ld4(bottom_blob, ivec3(sx + 1, sy + 0, gz)); + afpvec4 b3 = image3d_ld4(bottom_blob, ivec3(sx + 2, sy + 0, gz)); + + afpmat4 b0123 = afpmat4(b0, b1, b2, b3); + + afpvec4 c0 = image3d_ld4(bottom_blob, ivec3(sx - 1, sy + 1, gz)); + afpvec4 c1 = image3d_ld4(bottom_blob, ivec3(sx + 0, sy + 1, gz)); + afpvec4 c2 = image3d_ld4(bottom_blob, ivec3(sx + 1, sy + 1, gz)); + afpvec4 c3 = image3d_ld4(bottom_blob, ivec3(sx + 2, sy + 1, gz)); + + afpmat4 c0123 = afpmat4(c0, c1, c2, c3); + + afpvec4 d0 = image3d_ld4(bottom_blob, ivec3(sx - 1, sy + 2, gz)); + afpvec4 d1 = image3d_ld4(bottom_blob, ivec3(sx + 0, sy + 2, gz)); + afpvec4 d2 = image3d_ld4(bottom_blob, ivec3(sx + 1, sy + 2, gz)); + afpvec4 d3 = image3d_ld4(bottom_blob, ivec3(sx + 2, sy + 2, gz)); + + afpmat4 d0123 = afpmat4(d0, d1, d2, d3); +#else + int v_offset_0 = gz * psc(cstep) + (sy - 1) * psc(w) + sx; + int 
v_offset_1 = gz * psc(cstep) + (sy + 0) * psc(w) + sx; + int v_offset_2 = gz * psc(cstep) + (sy + 1) * psc(w) + sx; + int v_offset_3 = gz * psc(cstep) + (sy + 2) * psc(w) + sx; + + afpvec4 a0 = buffer_ld4(bottom_blob_data, v_offset_0 - 1); + afpvec4 a1 = buffer_ld4(bottom_blob_data, v_offset_0 + 0); + afpvec4 a2 = buffer_ld4(bottom_blob_data, v_offset_0 + 1); + afpvec4 a3 = buffer_ld4(bottom_blob_data, v_offset_0 + 2); + + afpmat4 a0123 = afpmat4(a0, a1, a2, a3); + + afpvec4 b0 = buffer_ld4(bottom_blob_data, v_offset_1 - 1); + afpvec4 b1 = buffer_ld4(bottom_blob_data, v_offset_1 + 0); + afpvec4 b2 = buffer_ld4(bottom_blob_data, v_offset_1 + 1); + afpvec4 b3 = buffer_ld4(bottom_blob_data, v_offset_1 + 2); + + afpmat4 b0123 = afpmat4(b0, b1, b2, b3); + + afpvec4 c0 = buffer_ld4(bottom_blob_data, v_offset_2 - 1); + afpvec4 c1 = buffer_ld4(bottom_blob_data, v_offset_2 + 0); + afpvec4 c2 = buffer_ld4(bottom_blob_data, v_offset_2 + 1); + afpvec4 c3 = buffer_ld4(bottom_blob_data, v_offset_2 + 2); + + afpmat4 c0123 = afpmat4(c0, c1, c2, c3); + + afpvec4 d0 = buffer_ld4(bottom_blob_data, v_offset_3 - 1); + afpvec4 d1 = buffer_ld4(bottom_blob_data, v_offset_3 + 0); + afpvec4 d2 = buffer_ld4(bottom_blob_data, v_offset_3 + 1); + afpvec4 d3 = buffer_ld4(bottom_blob_data, v_offset_3 + 2); + + afpmat4 d0123 = afpmat4(d0, d1, d2, d3); +#endif + + afpvec4 alpha = buffer_ld4(alpha_blob_data, gx); + + afpvec4 a = a0123 * alpha; + afpvec4 b = b0123 * alpha; + afpvec4 c = c0123 * alpha; + afpvec4 d = d0123 * alpha; + + afpmat4 abcd = afpmat4(a, b, c, d); + + afpvec4 beta = buffer_ld4(beta_blob_data, gy); + + afpvec4 v = abcd * beta; + +#if NCNN_image_shader + image3d_st4(top_blob, ivec3(gx, gy, gz), v); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st4(top_blob_data, gi, v); +#endif +} diff --git a/source/device/vulkan/shaders/interp_bicubic_pack8.comp b/source/device/vulkan/shaders/interp_bicubic_pack8.comp new file mode 100644 index 000000000..f51bd3bee --- /dev/null +++ b/source/device/vulkan/shaders/interp_bicubic_pack8.comp @@ -0,0 +1,175 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
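+
+// Bicubic resize for pack8 data. Same scheme as the pack4 shader, but the
+// two vec4 halves of every pack8 element are weighted separately with the
+// alpha_blob (horizontal) and beta_blob (vertical) coefficients.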
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#define shape_constant_id_offset 0 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#endif +layout (binding = 2) readonly buffer alpha_blob { sfpvec4 alpha_blob_data[]; }; +layout (binding = 3) readonly buffer xofs_blob { int xofs_blob_data[]; }; +layout (binding = 4) readonly buffer beta_blob { sfpvec4 beta_blob_data[]; }; +layout (binding = 5) readonly buffer yofs_blob { int yofs_blob_data[]; }; + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + int sx = xofs_blob_data[gx]; + int sy = yofs_blob_data[gy]; + + afpvec4 alpha = buffer_ld4(alpha_blob_data, gx); + +#if NCNN_image_shader + afpvec8 a0 = image3d_ld8(bottom_blob, ivec3(sx - 1, sy - 1, gz)); + afpvec8 a1 = image3d_ld8(bottom_blob, ivec3(sx + 0, sy - 1, gz)); + afpvec8 a2 = image3d_ld8(bottom_blob, ivec3(sx + 1, sy - 1, gz)); + afpvec8 a3 = image3d_ld8(bottom_blob, ivec3(sx + 2, sy - 1, gz)); + + afpvec8 a; + a[0] = a0[0] * alpha.r + a1[0] * alpha.g + a2[0] * alpha.b + a3[0] * alpha.a; + a[1] = a0[1] * alpha.r + a1[1] * alpha.g + a2[1] * alpha.b + a3[1] * alpha.a; + + afpvec8 b0 = image3d_ld8(bottom_blob, ivec3(sx - 1, sy + 0, gz)); + afpvec8 b1 = image3d_ld8(bottom_blob, ivec3(sx + 0, sy + 0, gz)); + afpvec8 b2 = image3d_ld8(bottom_blob, ivec3(sx + 1, sy + 0, gz)); + afpvec8 b3 = image3d_ld8(bottom_blob, ivec3(sx + 2, sy + 0, gz)); + + afpvec8 b; + b[0] = b0[0] * alpha.r + b1[0] * alpha.g + b2[0] * alpha.b + b3[0] * alpha.a; + b[1] = b0[1] * alpha.r + b1[1] * alpha.g + b2[1] * alpha.b + b3[1] * alpha.a; + + afpvec8 c0 = image3d_ld8(bottom_blob, ivec3(sx - 1, sy + 1, gz)); + afpvec8 c1 = image3d_ld8(bottom_blob, ivec3(sx + 0, sy + 1, gz)); + afpvec8 c2 = image3d_ld8(bottom_blob, ivec3(sx + 1, sy + 1, gz)); + afpvec8 c3 = image3d_ld8(bottom_blob, ivec3(sx + 2, sy + 1, gz)); + + afpvec8 c; + c[0] = c0[0] * alpha.r + c1[0] * alpha.g + c2[0] * alpha.b + c3[0] 
* alpha.a; + c[1] = c0[1] * alpha.r + c1[1] * alpha.g + c2[1] * alpha.b + c3[1] * alpha.a; + + afpvec8 d0 = image3d_ld8(bottom_blob, ivec3(sx - 1, sy + 2, gz)); + afpvec8 d1 = image3d_ld8(bottom_blob, ivec3(sx + 0, sy + 2, gz)); + afpvec8 d2 = image3d_ld8(bottom_blob, ivec3(sx + 1, sy + 2, gz)); + afpvec8 d3 = image3d_ld8(bottom_blob, ivec3(sx + 2, sy + 2, gz)); + + afpvec8 d; + d[0] = d0[0] * alpha.r + d1[0] * alpha.g + d2[0] * alpha.b + d3[0] * alpha.a; + d[1] = d0[1] * alpha.r + d1[1] * alpha.g + d2[1] * alpha.b + d3[1] * alpha.a; +#else + int v_offset_0 = gz * psc(cstep) + (sy - 1) * psc(w) + sx; + int v_offset_1 = gz * psc(cstep) + (sy + 0) * psc(w) + sx; + int v_offset_2 = gz * psc(cstep) + (sy + 1) * psc(w) + sx; + int v_offset_3 = gz * psc(cstep) + (sy + 2) * psc(w) + sx; + + afpvec8 a0 = buffer_ld8(bottom_blob_data, v_offset_0 - 1); + afpvec8 a1 = buffer_ld8(bottom_blob_data, v_offset_0 + 0); + afpvec8 a2 = buffer_ld8(bottom_blob_data, v_offset_0 + 1); + afpvec8 a3 = buffer_ld8(bottom_blob_data, v_offset_0 + 2); + + afpvec8 a; + a[0] = a0[0] * alpha.r + a1[0] * alpha.g + a2[0] * alpha.b + a3[0] * alpha.a; + a[1] = a0[1] * alpha.r + a1[1] * alpha.g + a2[1] * alpha.b + a3[1] * alpha.a; + + afpvec8 b0 = buffer_ld8(bottom_blob_data, v_offset_1 - 1); + afpvec8 b1 = buffer_ld8(bottom_blob_data, v_offset_1 + 0); + afpvec8 b2 = buffer_ld8(bottom_blob_data, v_offset_1 + 1); + afpvec8 b3 = buffer_ld8(bottom_blob_data, v_offset_1 + 2); + + afpvec8 b; + b[0] = b0[0] * alpha.r + b1[0] * alpha.g + b2[0] * alpha.b + b3[0] * alpha.a; + b[1] = b0[1] * alpha.r + b1[1] * alpha.g + b2[1] * alpha.b + b3[1] * alpha.a; + + afpvec8 c0 = buffer_ld8(bottom_blob_data, v_offset_2 - 1); + afpvec8 c1 = buffer_ld8(bottom_blob_data, v_offset_2 + 0); + afpvec8 c2 = buffer_ld8(bottom_blob_data, v_offset_2 + 1); + afpvec8 c3 = buffer_ld8(bottom_blob_data, v_offset_2 + 2); + + afpvec8 c; + c[0] = c0[0] * alpha.r + c1[0] * alpha.g + c2[0] * alpha.b + c3[0] * alpha.a; + c[1] = c0[1] * alpha.r + c1[1] * alpha.g + c2[1] * alpha.b + c3[1] * alpha.a; + + afpvec8 d0 = buffer_ld8(bottom_blob_data, v_offset_3 - 1); + afpvec8 d1 = buffer_ld8(bottom_blob_data, v_offset_3 + 0); + afpvec8 d2 = buffer_ld8(bottom_blob_data, v_offset_3 + 1); + afpvec8 d3 = buffer_ld8(bottom_blob_data, v_offset_3 + 2); + + afpvec8 d; + d[0] = d0[0] * alpha.r + d1[0] * alpha.g + d2[0] * alpha.b + d3[0] * alpha.a; + d[1] = d0[1] * alpha.r + d1[1] * alpha.g + d2[1] * alpha.b + d3[1] * alpha.a; +#endif + + afpvec4 beta = buffer_ld4(beta_blob_data, gy); + + afpvec8 v; + v[0] = a[0] * beta.r + b[0] * beta.g + c[0] * beta.b + d[0] * beta.a; + v[1] = a[1] * beta.r + b[1] * beta.g + c[1] * beta.b + d[1] * beta.a; + +#if NCNN_image_shader + image3d_st8(top_blob, ivec3(gx, gy, gz), v); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st8(top_blob_data, gi, v); +#endif +} diff --git a/source/device/vulkan/shaders/interp_pack4.comp b/source/device/vulkan/shaders/interp_pack4.comp new file mode 100644 index 000000000..47d652e5f --- /dev/null +++ b/source/device/vulkan/shaders/interp_pack4.comp @@ -0,0 +1,150 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int resize_type = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; + + float scale_x; + float scale_y; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + if (resize_type == 1) // nearest + { + afpvec2 gxy = afpvec2(gx, gy); + ivec2 sxy_max = ivec2(psc(w) - 1, psc(h) - 1); + ivec2 sxy = min(ivec2(floor(gxy * afpvec2(p.scale_x, p.scale_y))), sxy_max); + + int sx = sxy.r; + int sy = sxy.g; + +#if NCNN_image_shader + image3d_cp4(top_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(sx, sy, gz)); +#else + int v_offset = gz * psc(cstep) + sy * psc(w) + sx; + + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_cp4(top_blob_data, gi, bottom_blob_data, v_offset); +#endif + } + if (resize_type == 2) // bilinear + { + afpvec2 gxy = afpvec2(gx, gy); + afpvec2 fxy = (gxy + afp(0.5f)) * afpvec2(p.scale_x, p.scale_y) - afp(0.5f); + + ivec2 sxy = ivec2(floor(fxy)); + + fxy -= afpvec2(sxy); + + ivec2 sxy_max = ivec2(psc(w) - 2, psc(h) - 2); + + bvec2 underflow = lessThan(sxy, ivec2(0)); + bvec2 overflow = greaterThan(sxy, sxy_max); + + sxy = clamp(sxy, ivec2(0), sxy_max); + + fxy = mix(fxy, afpvec2(0.f), underflow); + fxy = mix(fxy, afpvec2(1.f), overflow); + + int sx = sxy.r; + int sy = sxy.g; + +#if NCNN_image_shader + afpvec4 a0 = image3d_ld4(bottom_blob, ivec3(sx, sy, gz)); + afpvec4 a1 = image3d_ld4(bottom_blob, ivec3(sx + 1, sy, gz)); + afpvec4 b0 = 
image3d_ld4(bottom_blob, ivec3(sx, sy + 1, gz)); + afpvec4 b1 = image3d_ld4(bottom_blob, ivec3(sx + 1, sy + 1, gz)); +#else + int v_offset_0 = gz * psc(cstep) + sy * psc(w) + sx; + int v_offset_1 = gz * psc(cstep) + (sy + 1) * psc(w) + sx; + + afpvec4 a0 = buffer_ld4(bottom_blob_data, v_offset_0); + afpvec4 a1 = buffer_ld4(bottom_blob_data, v_offset_0 + 1); + afpvec4 b0 = buffer_ld4(bottom_blob_data, v_offset_1); + afpvec4 b1 = buffer_ld4(bottom_blob_data, v_offset_1 + 1); +#endif + + afp fx = fxy.r; + afp fy = fxy.g; + + afpvec4 a = a0 * (afp(1.f) - fx) + a1 * fx; + afpvec4 b = b0 * (afp(1.f) - fx) + b1 * fx; + + afpvec4 res = a * (afp(1.f) - fy) + b * fy; + +#if NCNN_image_shader + image3d_st4(top_blob, ivec3(gx, gy, gz), res); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st4(top_blob_data, gi, res); +#endif + } +} diff --git a/source/device/vulkan/shaders/interp_pack8.comp b/source/device/vulkan/shaders/interp_pack8.comp new file mode 100644 index 000000000..e62c831e8 --- /dev/null +++ b/source/device/vulkan/shaders/interp_pack8.comp @@ -0,0 +1,238 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
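+
+// Resize for pack8 data. resize_type 1 copies the nearest source element;
+// the other branches blend the surrounding 2x2 neighborhood with bilinear
+// weights, clamping the sample coordinates at the blob borders.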
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int resize_type = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; + + float scale_x; + float scale_y; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + if (resize_type == 1) // nearest + { + afpvec2 gxy = afpvec2(gx, gy); + ivec2 sxy_max = ivec2(psc(w) - 1, psc(h) - 1); + ivec2 sxy = min(ivec2(floor(gxy * afpvec2(p.scale_x, p.scale_y))), sxy_max); + + int sx = sxy.r; + int sy = sxy.g; + +#if NCNN_image_shader + image3d_cp8(top_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(sx, sy, gz)); +#else + int v_offset = gz * psc(cstep) + sy * psc(w) + sx; + + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_cp8(top_blob_data, gi, bottom_blob_data, v_offset); +#endif + } + if (resize_type == 5) // bilinear + { + afpvec2 gxy = afpvec2(gx, gy); + afpvec2 fxy = (gxy + afp(0.5f)) * afpvec2(p.scale_x, p.scale_y) - afp(0.5f); + + ivec2 sxy = ivec2(floor(fxy)); + + fxy -= afpvec2(sxy); + + ivec2 sxy_max = ivec2(psc(w) - 2, psc(h) - 2); + + bvec2 underflow = lessThan(sxy, ivec2(0)); + bvec2 overflow = greaterThan(sxy, sxy_max); + + sxy = clamp(sxy, ivec2(0), sxy_max); + + fxy = mix(fxy, afpvec2(0.f), underflow); + fxy = mix(fxy, afpvec2(1.f), overflow); + + int sx = sxy.r; + int sy = sxy.g; + +#if NCNN_image_shader + afpvec8 a0 = image3d_ld8(bottom_blob, ivec3(sx, sy, gz)); + afpvec8 a1 = image3d_ld8(bottom_blob, ivec3(sx + 1, sy, gz)); + afpvec8 b0 = image3d_ld8(bottom_blob, ivec3(sx, sy + 1, gz)); + afpvec8 b1 = image3d_ld8(bottom_blob, ivec3(sx + 1, sy + 1, gz)); +#else + int v_offset_0 = gz * psc(cstep) + sy * psc(w) + sx; + int v_offset_1 = gz * psc(cstep) + (sy + 1) * psc(w) + sx; + + afpvec8 a0 = buffer_ld8(bottom_blob_data, v_offset_0); + afpvec8 a1 = buffer_ld8(bottom_blob_data, v_offset_0 + 1); + afpvec8 b0 = 
buffer_ld8(bottom_blob_data, v_offset_1); + afpvec8 b1 = buffer_ld8(bottom_blob_data, v_offset_1 + 1); +#endif + + afp fx = fxy.r; + afp fy = fxy.g; + + afpvec8 a; + afpvec8 b; + a[0] = a0[0] * (afp(1.f) - fx) + a1[0] * fx; + a[1] = a0[1] * (afp(1.f) - fx) + a1[1] * fx; + b[0] = b0[0] * (afp(1.f) - fx) + b1[0] * fx; + b[1] = b0[1] * (afp(1.f) - fx) + b1[1] * fx; + + afpvec8 res; + res[0] = a[0] * (afp(1.f) - fy) + b[0] * fy; + res[1] = a[1] * (afp(1.f) - fy) + b[1] * fy; + +#if NCNN_image_shader + image3d_st8(top_blob, ivec3(gx, gy, gz), res); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st8(top_blob_data, gi, res); +#endif + } + else + { + afpvec2 gxy = afpvec2(gx, gy); + ivec2 sxy_max = ivec2(psc(w) - 1, psc(h) - 1); + // ivec2 in_xy = min(ivec2(floor(gxy * afpvec2(p.scale_x, p.scale_y))), sxy_max); + ivec2 in_xy = min(ivec2(floor(gxy / afpvec2(2.0f, 2.0f))), sxy_max); + + afpvec2 ff_sxy_max = afpvec2(psc(w) - 1, psc(h) - 1); + afpvec2 ffin_xy = afpvec2(gxy * afpvec2(p.scale_x, p.scale_y)); + + afp ff_in_x = ffin_xy.r; + afp ff_in_y = ffin_xy.g; + + int in_x = in_xy.r; + int in_y = in_xy.g; + + int in_y1 = min(in_y, psc(h) - 1); + int in_y2 = min(in_y1 + 1, psc(h) - 1); + + float dy1 = abs(in_y - in_y1); + float dy2 = abs(in_y - in_y2); + + afp ff_dy1 = abs(ff_in_y - afp(in_y1)); + afp ff_dy2 = abs(ff_in_y - afp(in_y2)); + + if (in_y1 == in_y2) + { + dy1 = 0.5f; + dy2 = 0.5f; + } + + if (ff_dy1 == ff_dy2) + { + dy1 = 0.5f; + dy2 = 0.5f; + } + + int in_x1 = min(in_x, psc(w) - 1); + int in_x2 = min(in_x1 + 1, psc(w) - 1); + + float dx1 = abs(in_x - in_x1); + float dx2 = abs(in_x - in_x2); + + afp ff_dx1 = abs(ff_in_x - afp(in_x1)); + afp ff_dx2 = abs(ff_in_x - afp(in_x2)); + if (in_x1 == in_x2) + { + dx1 = 0.5f; + dx2 = 0.5f; + } + if (ff_dx1 == ff_dx2) + { + dx1 = 0.5f; + dx2 = 0.5f; + } + + +#if NCNN_image_shader +#else + int v_offset_0 = gz * psc(cstep) + in_y1 * psc(w) + in_x1; + int v_offset_1 = gz * psc(cstep) + in_y1 * psc(w) + in_x2; + int v_offset_2 = gz * psc(cstep) + in_y2 * psc(w) + in_x1; + int v_offset_3 = gz * psc(cstep) + in_y2 * psc(w) + in_x2; + + afpvec8 a0 = buffer_ld8(bottom_blob_data, v_offset_0); + afpvec8 a1 = buffer_ld8(bottom_blob_data, v_offset_1); + afpvec8 b0 = buffer_ld8(bottom_blob_data, v_offset_2); + afpvec8 b1 = buffer_ld8(bottom_blob_data, v_offset_3); + + afpvec8 res; + res[0] = afp(dx2 * dy2) * a0[0] + afp(dx1 * dy2) * a1[0] + afp(dx2 * dy1) * b0[0] + afp(dx1 * dy1) * b1[0]; + res[1] = afp(dx2 * dy2) * a0[1] + afp(dx1 * dy2) * a1[1] + afp(dx2 * dy1) * b0[1] + afp(dx1 * dy1) * b1[1]; + + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + // res = afpvec8(afpvec4(ff_dy1), afpvec4(ff_dy1)); + + buffer_st8(top_blob_data, gi, res); +#endif + + } +} diff --git a/source/device/vulkan/shaders/packing.comp b/source/device/vulkan/shaders/packing.comp new file mode 100644 index 000000000..f018ab5fe --- /dev/null +++ b/source/device/vulkan/shaders/packing.comp @@ -0,0 +1,165 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 2) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 2) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 3, imfmtc1) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 3, imfmtc1) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 3, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afp v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + int gi = gx; + + v = buffer_ld1(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image1d_ld1(bottom_blob_1d, gx); + } +#endif + + if (storage_type_to == 0) + { + int gi = gx; + + buffer_st1(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image1d_st1(top_blob_1d, gx, v); + } +#endif + } + else if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + int gi = gy * psc(w) + gx; + + v = buffer_ld1(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image2d_ld1(bottom_blob_2d, ivec2(gx, gy)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gy * psc(outw) + gx; + + buffer_st1(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image2d_st1(top_blob_2d, ivec2(gx, gy), v); + } +#endif + } + else // if (psc(dims) == 3) + 
{ + if (storage_type_from == 0) + { + int gi = gz * psc(cstep) + gy * psc(w) + gx; + + v = buffer_ld1(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, gz)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st1(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image3d_st1(top_blob_3d, ivec3(gx, gy, gz), v); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_fp16_to_fp32.comp b/source/device/vulkan/shaders/packing_fp16_to_fp32.comp new file mode 100644 index 000000000..7a2b2bf9d --- /dev/null +++ b/source/device/vulkan/shaders/packing_fp16_to_fp32.comp @@ -0,0 +1,165 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob_fp32 { float top_blob_fp32_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 2) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 2) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 3, r32f) writeonly uniform highp image1D top_blob_1d_fp32; +layout (binding = 3, r32f) writeonly uniform highp image2D top_blob_2d_fp32; +layout (binding = 3, r32f) writeonly uniform highp image3D top_blob_3d_fp32; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = 
int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afp v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + int gi = gx; + + v = buffer_ld1(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image1d_ld1(bottom_blob_1d, gx); + } +#endif + + if (storage_type_to == 0) + { + int gi = gx; + + top_blob_fp32_data[gi] = float(v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + imageStore(top_blob_1d_fp32, gx, vec4(v)); + } +#endif + } + else if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + int gi = gy * psc(w) + gx; + + v = buffer_ld1(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image2d_ld1(bottom_blob_2d, ivec2(gx, gy)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gy * psc(outw) + gx; + + top_blob_fp32_data[gi] = float(v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + imageStore(top_blob_2d_fp32, ivec2(gx, gy), vec4(v)); + } +#endif + } + else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + int gi = gz * psc(cstep) + gy * psc(w) + gx; + + v = buffer_ld1(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, gz)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + top_blob_fp32_data[gi] = float(v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + imageStore(top_blob_3d_fp32, ivec3(gx, gy, gz), vec4(v)); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_fp32_to_fp16.comp b/source/device/vulkan/shaders/packing_fp32_to_fp16.comp new file mode 100644 index 000000000..213f62cd2 --- /dev/null +++ b/source/device/vulkan/shaders/packing_fp32_to_fp16.comp @@ -0,0 +1,165 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
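+
+// Storage conversion: reads raw float32 input and stores each element in
+// the sfp storage type (fp16 when fp16 storage is enabled), covering 1-D,
+// 2-D and 3-D blobs with either buffer or image storage.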
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob_fp32 { float bottom_blob_fp32_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform highp sampler1D bottom_blob_1d_fp32; +layout (binding = 2) uniform highp sampler2D bottom_blob_2d_fp32; +layout (binding = 2) uniform highp sampler3D bottom_blob_3d_fp32; +layout (binding = 3, imfmtc1) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 3, imfmtc1) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 3, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afp v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + int gi = gx; + + v = afp(bottom_blob_fp32_data[gi]); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = afp(texelFetch(bottom_blob_1d_fp32, gx, 0).r); + } +#endif + + if (storage_type_to == 0) + { + int gi = gx; + + buffer_st1(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image1d_st1(top_blob_1d, gx, v); + } +#endif + } + else if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + int gi = gy * psc(w) + gx; + + v = afp(bottom_blob_fp32_data[gi]); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = afp(texelFetch(bottom_blob_2d_fp32, ivec2(gx, gy), 0).r); + } +#endif + + if (storage_type_to == 0) + { + int gi = gy * psc(outw) + gx; + + buffer_st1(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image2d_st1(top_blob_2d, ivec2(gx, gy), v); + } +#endif + } + else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + int gi = gz * psc(cstep) + gy * psc(w) + gx; + + v = afp(bottom_blob_fp32_data[gi]); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = afp(texelFetch(bottom_blob_3d_fp32, ivec3(gx, gy, gz), 0).r); + } +#endif + + if (storage_type_to == 0) + { + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + 
buffer_st1(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image3d_st1(top_blob_3d, ivec3(gx, gy, gz), v); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_pack1to4.comp b/source/device/vulkan/shaders/packing_pack1to4.comp new file mode 100644 index 000000000..ba270d696 --- /dev/null +++ b/source/device/vulkan/shaders/packing_pack1to4.comp @@ -0,0 +1,195 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 2) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 2) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec4 v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + ivec4 x4 = ivec4(gx * 4) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = x4; + + v.r = buffer_ld1(bottom_blob_data, v_offset.r); + v.g = 
buffer_ld1(bottom_blob_data, v_offset.g); + v.b = buffer_ld1(bottom_blob_data, v_offset.b); + v.a = buffer_ld1(bottom_blob_data, v_offset.a); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int x4 = gx * 4; + + v.r = image1d_ld1(bottom_blob_1d, x4 + 0); + v.g = image1d_ld1(bottom_blob_1d, x4 + 1); + v.b = image1d_ld1(bottom_blob_1d, x4 + 2); + v.a = image1d_ld1(bottom_blob_1d, x4 + 3); + } +#endif + + if (storage_type_to == 0) + { + int gi = gx; + + buffer_st4(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image1d_st4(top_blob_1d, gx, v); + } +#endif + } + else if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + ivec4 y4 = ivec4(gy * 4) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = y4 * psc(w) + gx; + + v.r = buffer_ld1(bottom_blob_data, v_offset.r); + v.g = buffer_ld1(bottom_blob_data, v_offset.g); + v.b = buffer_ld1(bottom_blob_data, v_offset.b); + v.a = buffer_ld1(bottom_blob_data, v_offset.a); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int y4 = gy * 4; + + v.r = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 0)); + v.g = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 1)); + v.b = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 2)); + v.a = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 3)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gy * psc(outw) + gx; + + buffer_st4(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image2d_st4(top_blob_2d, ivec2(gx, gy), v); + } +#endif + } + else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + ivec4 z4 = ivec4(gz * 4) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = z4 * psc(cstep) + ivec4(gy * psc(w) + gx); + + v.r = buffer_ld1(bottom_blob_data, v_offset.r); + v.g = buffer_ld1(bottom_blob_data, v_offset.g); + v.b = buffer_ld1(bottom_blob_data, v_offset.b); + v.a = buffer_ld1(bottom_blob_data, v_offset.a); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int z4 = gz * 4; + + v.r = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 0)); + v.g = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 1)); + v.b = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 2)); + v.a = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 3)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st4(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image3d_st4(top_blob_3d, ivec3(gx, gy, gz), v); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_pack1to4_fp16_to_fp32.comp b/source/device/vulkan/shaders/packing_pack1to4_fp16_to_fp32.comp new file mode 100644 index 000000000..f75546fbb --- /dev/null +++ b/source/device/vulkan/shaders/packing_pack1to4_fp16_to_fp32.comp @@ -0,0 +1,195 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
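+
+// Repack pack1 -> pack4 with conversion to raw float32 output: four
+// consecutive elements along the packed axis (x, row or channel depending
+// on dims) are gathered into one vec4 and written as plain float data.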
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob_fp32 { vec4 top_blob_fp32_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 2) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 2) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 3, rgba32f) writeonly uniform highp image1D top_blob_1d_fp32; +layout (binding = 3, rgba32f) writeonly uniform highp image2D top_blob_2d_fp32; +layout (binding = 3, rgba32f) writeonly uniform highp image3D top_blob_3d_fp32; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec4 v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + ivec4 x4 = ivec4(gx * 4) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = x4; + + v.r = buffer_ld1(bottom_blob_data, v_offset.r); + v.g = buffer_ld1(bottom_blob_data, v_offset.g); + v.b = buffer_ld1(bottom_blob_data, v_offset.b); + v.a = buffer_ld1(bottom_blob_data, v_offset.a); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int x4 = gx * 4; + + v.r = image1d_ld1(bottom_blob_1d, x4 + 0); + v.g = image1d_ld1(bottom_blob_1d, x4 + 1); + v.b = image1d_ld1(bottom_blob_1d, x4 + 2); + v.a = image1d_ld1(bottom_blob_1d, x4 + 3); + } +#endif + + if (storage_type_to == 0) + { + int gi = gx; + + top_blob_fp32_data[gi] = vec4(v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + imageStore(top_blob_1d_fp32, gx, v); + } +#endif + } + else if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + ivec4 y4 = ivec4(gy * 4) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = y4 * psc(w) + gx; + + v.r = buffer_ld1(bottom_blob_data, v_offset.r); + v.g = buffer_ld1(bottom_blob_data, v_offset.g); + v.b = buffer_ld1(bottom_blob_data, v_offset.b); + v.a = buffer_ld1(bottom_blob_data, v_offset.a); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int y4 = gy * 4; + + v.r = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 0)); + v.g = 
image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 1)); + v.b = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 2)); + v.a = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 3)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gy * psc(outw) + gx; + + top_blob_fp32_data[gi] = vec4(v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + imageStore(top_blob_2d_fp32, ivec2(gx, gy), v); + } +#endif + } + else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + ivec4 z4 = ivec4(gz * 4) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = z4 * psc(cstep) + ivec4(gy * psc(w) + gx); + + v.r = buffer_ld1(bottom_blob_data, v_offset.r); + v.g = buffer_ld1(bottom_blob_data, v_offset.g); + v.b = buffer_ld1(bottom_blob_data, v_offset.b); + v.a = buffer_ld1(bottom_blob_data, v_offset.a); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int z4 = gz * 4; + + v.r = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 0)); + v.g = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 1)); + v.b = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 2)); + v.a = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 3)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + top_blob_fp32_data[gi] = vec4(v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + imageStore(top_blob_3d_fp32, ivec3(gx, gy, gz), v); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_pack1to4_fp32_to_fp16.comp b/source/device/vulkan/shaders/packing_pack1to4_fp32_to_fp16.comp new file mode 100644 index 000000000..48569c9f1 --- /dev/null +++ b/source/device/vulkan/shaders/packing_pack1to4_fp32_to_fp16.comp @@ -0,0 +1,195 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
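+
+// Repack pack1 -> pack4 with conversion from raw float32 input: four
+// consecutive float elements are gathered and stored as one sfpvec4.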
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob_fp32 { float bottom_blob_fp32_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform highp sampler1D bottom_blob_1d_fp32; +layout (binding = 2) uniform highp sampler2D bottom_blob_2d_fp32; +layout (binding = 2) uniform highp sampler3D bottom_blob_3d_fp32; +layout (binding = 3, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec4 v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + ivec4 x4 = ivec4(gx * 4) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = x4; + + v.r = afp(bottom_blob_fp32_data[v_offset.r]); + v.g = afp(bottom_blob_fp32_data[v_offset.g]); + v.b = afp(bottom_blob_fp32_data[v_offset.b]); + v.a = afp(bottom_blob_fp32_data[v_offset.a]); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int x4 = gx * 4; + + v.r = afp(texelFetch(bottom_blob_1d_fp32, x4 + 0, 0).r); + v.g = afp(texelFetch(bottom_blob_1d_fp32, x4 + 1, 0).r); + v.b = afp(texelFetch(bottom_blob_1d_fp32, x4 + 2, 0).r); + v.a = afp(texelFetch(bottom_blob_1d_fp32, x4 + 3, 0).r); + } +#endif + + if (storage_type_to == 0) + { + int gi = gx; + + buffer_st4(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image1d_st4(top_blob_1d, gx, v); + } +#endif + } + else if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + ivec4 y4 = ivec4(gy * 4) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = y4 * psc(w) + gx; + + v.r = afp(bottom_blob_fp32_data[v_offset.r]); + v.g = afp(bottom_blob_fp32_data[v_offset.g]); + v.b = afp(bottom_blob_fp32_data[v_offset.b]); + v.a = afp(bottom_blob_fp32_data[v_offset.a]); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int y4 = gy * 4; + + v.r = 
afp(texelFetch(bottom_blob_2d_fp32, ivec2(gx, y4 + 0), 0).r); + v.g = afp(texelFetch(bottom_blob_2d_fp32, ivec2(gx, y4 + 1), 0).r); + v.b = afp(texelFetch(bottom_blob_2d_fp32, ivec2(gx, y4 + 2), 0).r); + v.a = afp(texelFetch(bottom_blob_2d_fp32, ivec2(gx, y4 + 3), 0).r); + } +#endif + + if (storage_type_to == 0) + { + int gi = gy * psc(outw) + gx; + + buffer_st4(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image2d_st4(top_blob_2d, ivec2(gx, gy), v); + } +#endif + } + else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + ivec4 z4 = ivec4(gz * 4) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = z4 * psc(cstep) + ivec4(gy * psc(w) + gx); + + v.r = afp(bottom_blob_fp32_data[v_offset.r]); + v.g = afp(bottom_blob_fp32_data[v_offset.g]); + v.b = afp(bottom_blob_fp32_data[v_offset.b]); + v.a = afp(bottom_blob_fp32_data[v_offset.a]); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int z4 = gz * 4; + + v.r = afp(texelFetch(bottom_blob_3d_fp32, ivec3(gx, gy, z4 + 0), 0).r); + v.g = afp(texelFetch(bottom_blob_3d_fp32, ivec3(gx, gy, z4 + 1), 0).r); + v.b = afp(texelFetch(bottom_blob_3d_fp32, ivec3(gx, gy, z4 + 2), 0).r); + v.a = afp(texelFetch(bottom_blob_3d_fp32, ivec3(gx, gy, z4 + 3), 0).r); + } +#endif + + if (storage_type_to == 0) + { + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st4(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image3d_st4(top_blob_3d, ivec3(gx, gy, gz), v); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_pack1to8.comp b/source/device/vulkan/shaders/packing_pack1to8.comp new file mode 100644 index 000000000..a97fbe923 --- /dev/null +++ b/source/device/vulkan/shaders/packing_pack1to8.comp @@ -0,0 +1,223 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 2) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 2) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec8 v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + ivec4 x4 = ivec4(gx * 8) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = x4; + ivec4 vv_offset = x4 + 4; + + v[0].r = buffer_ld1(bottom_blob_data, v_offset.r); + v[0].g = buffer_ld1(bottom_blob_data, v_offset.g); + v[0].b = buffer_ld1(bottom_blob_data, v_offset.b); + v[0].a = buffer_ld1(bottom_blob_data, v_offset.a); + v[1].r = buffer_ld1(bottom_blob_data, vv_offset.r); + v[1].g = buffer_ld1(bottom_blob_data, vv_offset.g); + v[1].b = buffer_ld1(bottom_blob_data, vv_offset.b); + v[1].a = buffer_ld1(bottom_blob_data, vv_offset.a); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int x4 = gx * 8; + + v[0].r = image1d_ld1(bottom_blob_1d, x4 + 0); + v[0].g = image1d_ld1(bottom_blob_1d, x4 + 1); + v[0].b = image1d_ld1(bottom_blob_1d, x4 + 2); + v[0].a = image1d_ld1(bottom_blob_1d, x4 + 3); + v[1].r = image1d_ld1(bottom_blob_1d, x4 + 4); + v[1].g = image1d_ld1(bottom_blob_1d, x4 + 5); + v[1].b = image1d_ld1(bottom_blob_1d, x4 + 6); + v[1].a = image1d_ld1(bottom_blob_1d, x4 + 7); + } +#endif + + if (storage_type_to == 0) + { + int gi = gx; + + buffer_st8(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image1d_st8(top_blob_1d, gx, v); + } +#endif + } + else if (psc(dims) == 2) + { + if 
(storage_type_from == 0) + { + ivec4 y4 = ivec4(gy * 8) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = y4 * psc(w) + gx; + ivec4 vv_offset = (y4 + 4) * psc(w) + gx; + + v[0].r = buffer_ld1(bottom_blob_data, v_offset.r); + v[0].g = buffer_ld1(bottom_blob_data, v_offset.g); + v[0].b = buffer_ld1(bottom_blob_data, v_offset.b); + v[0].a = buffer_ld1(bottom_blob_data, v_offset.a); + v[1].r = buffer_ld1(bottom_blob_data, vv_offset.r); + v[1].g = buffer_ld1(bottom_blob_data, vv_offset.g); + v[1].b = buffer_ld1(bottom_blob_data, vv_offset.b); + v[1].a = buffer_ld1(bottom_blob_data, vv_offset.a); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int y4 = gy * 8; + + v[0].r = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 0)); + v[0].g = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 1)); + v[0].b = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 2)); + v[0].a = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 3)); + v[1].r = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 4)); + v[1].g = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 5)); + v[1].b = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 6)); + v[1].a = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 7)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gy * psc(outw) + gx; + + buffer_st8(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image2d_st8(top_blob_2d, ivec2(gx, gy), v); + } +#endif + } + else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + ivec4 z4 = ivec4(gz * 8) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = z4 * psc(cstep) + ivec4(gy * psc(w) + gx); + ivec4 vv_offset = (z4 + 4) * psc(cstep) + ivec4(gy * psc(w) + gx); + + v[0].r = buffer_ld1(bottom_blob_data, v_offset.r); + v[0].g = buffer_ld1(bottom_blob_data, v_offset.g); + v[0].b = buffer_ld1(bottom_blob_data, v_offset.b); + v[0].a = buffer_ld1(bottom_blob_data, v_offset.a); + v[1].r = buffer_ld1(bottom_blob_data, vv_offset.r); + v[1].g = buffer_ld1(bottom_blob_data, vv_offset.g); + v[1].b = buffer_ld1(bottom_blob_data, vv_offset.b); + v[1].a = buffer_ld1(bottom_blob_data, vv_offset.a); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int z4 = gz * 8; + + v[0].r = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 0)); + v[0].g = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 1)); + v[0].b = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 2)); + v[0].a = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 3)); + v[1].r = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 4)); + v[1].g = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 5)); + v[1].b = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 6)); + v[1].a = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 7)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st8(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image3d_st8(top_blob_3d, ivec3(gx, gy, gz), v); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_pack1to8_fp16_to_fp32.comp b/source/device/vulkan/shaders/packing_pack1to8_fp16_to_fp32.comp new file mode 100644 index 000000000..62a980788 --- /dev/null +++ b/source/device/vulkan/shaders/packing_pack1to8_fp16_to_fp32.comp @@ -0,0 +1,226 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob_fp32 { mat2x4 top_blob_fp32_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 2) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 2) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 3, rgba32f) writeonly uniform highp image1D top_blob_1d_fp32; +layout (binding = 3, rgba32f) writeonly uniform highp image2D top_blob_2d_fp32; +layout (binding = 3, rgba32f) writeonly uniform highp image3D top_blob_3d_fp32; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec8 v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + ivec4 x4 = ivec4(gx * 8) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = x4; + ivec4 vv_offset = x4 + 4; + + v[0].r = buffer_ld1(bottom_blob_data, v_offset.r); + v[0].g = buffer_ld1(bottom_blob_data, v_offset.g); + v[0].b = buffer_ld1(bottom_blob_data, v_offset.b); + v[0].a = buffer_ld1(bottom_blob_data, v_offset.a); + v[1].r = buffer_ld1(bottom_blob_data, vv_offset.r); + v[1].g = buffer_ld1(bottom_blob_data, vv_offset.g); + v[1].b = buffer_ld1(bottom_blob_data, vv_offset.b); + v[1].a = buffer_ld1(bottom_blob_data, vv_offset.a); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int x4 = gx * 8; + + v[0].r = image1d_ld1(bottom_blob_1d, x4 + 0); + v[0].g = image1d_ld1(bottom_blob_1d, x4 + 1); + v[0].b = image1d_ld1(bottom_blob_1d, x4 + 2); + v[0].a = image1d_ld1(bottom_blob_1d, x4 
+ 3); + v[1].r = image1d_ld1(bottom_blob_1d, x4 + 4); + v[1].g = image1d_ld1(bottom_blob_1d, x4 + 5); + v[1].b = image1d_ld1(bottom_blob_1d, x4 + 6); + v[1].a = image1d_ld1(bottom_blob_1d, x4 + 7); + } +#endif + + if (storage_type_to == 0) + { + int gi = gx; + + top_blob_fp32_data[gi] = mat2x4(vec4(v[0]), vec4(v[1])); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + imageStore(top_blob_1d_fp32, gx * 2, v[0]); + imageStore(top_blob_1d_fp32, gx * 2 + 1, v[1]); + } +#endif + } + else if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + ivec4 y4 = ivec4(gy * 8) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = y4 * psc(w) + gx; + ivec4 vv_offset = (y4 + 4) * psc(w) + gx; + + v[0].r = buffer_ld1(bottom_blob_data, v_offset.r); + v[0].g = buffer_ld1(bottom_blob_data, v_offset.g); + v[0].b = buffer_ld1(bottom_blob_data, v_offset.b); + v[0].a = buffer_ld1(bottom_blob_data, v_offset.a); + v[1].r = buffer_ld1(bottom_blob_data, vv_offset.r); + v[1].g = buffer_ld1(bottom_blob_data, vv_offset.g); + v[1].b = buffer_ld1(bottom_blob_data, vv_offset.b); + v[1].a = buffer_ld1(bottom_blob_data, vv_offset.a); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int y4 = gy * 8; + + v[0].r = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 0)); + v[0].g = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 1)); + v[0].b = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 2)); + v[0].a = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 3)); + v[1].r = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 4)); + v[1].g = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 5)); + v[1].b = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 6)); + v[1].a = image2d_ld1(bottom_blob_2d, ivec2(gx, y4 + 7)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gy * psc(outw) + gx; + + top_blob_fp32_data[gi] = mat2x4(vec4(v[0]), vec4(v[1])); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + imageStore(top_blob_2d_fp32, ivec2(gx * 2, gy), v[0]); + imageStore(top_blob_2d_fp32, ivec2(gx * 2 + 1, gy), v[1]); + } +#endif + } + else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + ivec4 z4 = ivec4(gz * 8) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = z4 * psc(cstep) + ivec4(gy * psc(w) + gx); + ivec4 vv_offset = (z4 + 4) * psc(cstep) + ivec4(gy * psc(w) + gx); + + v[0].r = buffer_ld1(bottom_blob_data, v_offset.r); + v[0].g = buffer_ld1(bottom_blob_data, v_offset.g); + v[0].b = buffer_ld1(bottom_blob_data, v_offset.b); + v[0].a = buffer_ld1(bottom_blob_data, v_offset.a); + v[1].r = buffer_ld1(bottom_blob_data, vv_offset.r); + v[1].g = buffer_ld1(bottom_blob_data, vv_offset.g); + v[1].b = buffer_ld1(bottom_blob_data, vv_offset.b); + v[1].a = buffer_ld1(bottom_blob_data, vv_offset.a); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int z4 = gz * 8; + + v[0].r = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 0)); + v[0].g = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 1)); + v[0].b = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 2)); + v[0].a = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 3)); + v[1].r = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 4)); + v[1].g = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 5)); + v[1].b = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 6)); + v[1].a = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4 + 7)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + top_blob_fp32_data[gi] = mat2x4(vec4(v[0]), vec4(v[1])); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + imageStore(top_blob_3d_fp32, ivec3(gx * 2, gy, gz), v[0]); + 
imageStore(top_blob_3d_fp32, ivec3(gx * 2 + 1, gy, gz), v[1]); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_pack1to8_fp32_to_fp16.comp b/source/device/vulkan/shaders/packing_pack1to8_fp32_to_fp16.comp new file mode 100644 index 000000000..6b3a405e7 --- /dev/null +++ b/source/device/vulkan/shaders/packing_pack1to8_fp32_to_fp16.comp @@ -0,0 +1,223 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob_fp32 { float bottom_blob_fp32_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform highp sampler1D bottom_blob_1d_fp32; +layout (binding = 2) uniform highp sampler2D bottom_blob_2d_fp32; +layout (binding = 2) uniform highp sampler3D bottom_blob_3d_fp32; +layout (binding = 3, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec8 v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + ivec4 x4 = ivec4(gx * 8) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = x4; + ivec4 vv_offset = x4 + 4; + + v[0].r = 
afp(bottom_blob_fp32_data[v_offset.r]); + v[0].g = afp(bottom_blob_fp32_data[v_offset.g]); + v[0].b = afp(bottom_blob_fp32_data[v_offset.b]); + v[0].a = afp(bottom_blob_fp32_data[v_offset.a]); + v[1].r = afp(bottom_blob_fp32_data[vv_offset.r]); + v[1].g = afp(bottom_blob_fp32_data[vv_offset.g]); + v[1].b = afp(bottom_blob_fp32_data[vv_offset.b]); + v[1].a = afp(bottom_blob_fp32_data[vv_offset.a]); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int x4 = gx * 8; + + v[0].r = afp(texelFetch(bottom_blob_1d_fp32, x4 + 0, 0).r); + v[0].g = afp(texelFetch(bottom_blob_1d_fp32, x4 + 1, 0).r); + v[0].b = afp(texelFetch(bottom_blob_1d_fp32, x4 + 2, 0).r); + v[0].a = afp(texelFetch(bottom_blob_1d_fp32, x4 + 3, 0).r); + v[1].r = afp(texelFetch(bottom_blob_1d_fp32, x4 + 4, 0).r); + v[1].g = afp(texelFetch(bottom_blob_1d_fp32, x4 + 5, 0).r); + v[1].b = afp(texelFetch(bottom_blob_1d_fp32, x4 + 6, 0).r); + v[1].a = afp(texelFetch(bottom_blob_1d_fp32, x4 + 7, 0).r); + } +#endif + + if (storage_type_to == 0) + { + int gi = gx; + + buffer_st8(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image1d_st8(top_blob_1d, gx, v); + } +#endif + } + else if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + ivec4 y4 = ivec4(gy * 8) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = y4 * psc(w) + gx; + ivec4 vv_offset = (y4 + 4) * psc(w) + gx; + + v[0].r = afp(bottom_blob_fp32_data[v_offset.r]); + v[0].g = afp(bottom_blob_fp32_data[v_offset.g]); + v[0].b = afp(bottom_blob_fp32_data[v_offset.b]); + v[0].a = afp(bottom_blob_fp32_data[v_offset.a]); + v[1].r = afp(bottom_blob_fp32_data[vv_offset.r]); + v[1].g = afp(bottom_blob_fp32_data[vv_offset.g]); + v[1].b = afp(bottom_blob_fp32_data[vv_offset.b]); + v[1].a = afp(bottom_blob_fp32_data[vv_offset.a]); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int y4 = gy * 8; + + v[0].r = afp(texelFetch(bottom_blob_2d_fp32, ivec2(gx, y4 + 0), 0).r); + v[0].g = afp(texelFetch(bottom_blob_2d_fp32, ivec2(gx, y4 + 1), 0).r); + v[0].b = afp(texelFetch(bottom_blob_2d_fp32, ivec2(gx, y4 + 2), 0).r); + v[0].a = afp(texelFetch(bottom_blob_2d_fp32, ivec2(gx, y4 + 3), 0).r); + v[1].r = afp(texelFetch(bottom_blob_2d_fp32, ivec2(gx, y4 + 4), 0).r); + v[1].g = afp(texelFetch(bottom_blob_2d_fp32, ivec2(gx, y4 + 5), 0).r); + v[1].b = afp(texelFetch(bottom_blob_2d_fp32, ivec2(gx, y4 + 6), 0).r); + v[1].a = afp(texelFetch(bottom_blob_2d_fp32, ivec2(gx, y4 + 7), 0).r); + } +#endif + + if (storage_type_to == 0) + { + int gi = gy * psc(outw) + gx; + + buffer_st8(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image2d_st8(top_blob_2d, ivec2(gx, gy), v); + } +#endif + } + else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + ivec4 z4 = ivec4(gz * 8) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = z4 * psc(cstep) + ivec4(gy * psc(w) + gx); + ivec4 vv_offset = (z4 + 4) * psc(cstep) + ivec4(gy * psc(w) + gx); + + v[0].r = afp(bottom_blob_fp32_data[v_offset.r]); + v[0].g = afp(bottom_blob_fp32_data[v_offset.g]); + v[0].b = afp(bottom_blob_fp32_data[v_offset.b]); + v[0].a = afp(bottom_blob_fp32_data[v_offset.a]); + v[1].r = afp(bottom_blob_fp32_data[vv_offset.r]); + v[1].g = afp(bottom_blob_fp32_data[vv_offset.g]); + v[1].b = afp(bottom_blob_fp32_data[vv_offset.b]); + v[1].a = afp(bottom_blob_fp32_data[vv_offset.a]); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int z4 = gz * 8; + + v[0].r = afp(texelFetch(bottom_blob_3d_fp32, ivec3(gx, gy, z4 + 0), 0).r); + v[0].g = 
afp(texelFetch(bottom_blob_3d_fp32, ivec3(gx, gy, z4 + 1), 0).r); + v[0].b = afp(texelFetch(bottom_blob_3d_fp32, ivec3(gx, gy, z4 + 2), 0).r); + v[0].a = afp(texelFetch(bottom_blob_3d_fp32, ivec3(gx, gy, z4 + 3), 0).r); + v[1].r = afp(texelFetch(bottom_blob_3d_fp32, ivec3(gx, gy, z4 + 4), 0).r); + v[1].g = afp(texelFetch(bottom_blob_3d_fp32, ivec3(gx, gy, z4 + 5), 0).r); + v[1].b = afp(texelFetch(bottom_blob_3d_fp32, ivec3(gx, gy, z4 + 6), 0).r); + v[1].a = afp(texelFetch(bottom_blob_3d_fp32, ivec3(gx, gy, z4 + 7), 0).r); + } +#endif + + if (storage_type_to == 0) + { + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st8(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image3d_st8(top_blob_3d, ivec3(gx, gy, gz), v); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_pack4.comp b/source/device/vulkan/shaders/packing_pack4.comp new file mode 100644 index 000000000..c0c64e5a2 --- /dev/null +++ b/source/device/vulkan/shaders/packing_pack4.comp @@ -0,0 +1,165 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 2) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 2) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image3D top_blob_3d; 
+#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec4 v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + int gi = gx; + + v = buffer_ld4(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image1d_ld4(bottom_blob_1d, gx); + } +#endif + + if (storage_type_to == 0) + { + int gi = gx; + + buffer_st4(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image1d_st4(top_blob_1d, gx, v); + } +#endif + } + else if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + int gi = gy * psc(w) + gx; + + v = buffer_ld4(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image2d_ld4(bottom_blob_2d, ivec2(gx, gy)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gy * psc(outw) + gx; + + buffer_st4(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image2d_st4(top_blob_2d, ivec2(gx, gy), v); + } +#endif + } + else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + int gi = gz * psc(cstep) + gy * psc(w) + gx; + + v = buffer_ld4(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, gz)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st4(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image3d_st4(top_blob_3d, ivec3(gx, gy, gz), v); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_pack4_fp16_to_fp32.comp b/source/device/vulkan/shaders/packing_pack4_fp16_to_fp32.comp new file mode 100644 index 000000000..b05b9eda0 --- /dev/null +++ b/source/device/vulkan/shaders/packing_pack4_fp16_to_fp32.comp @@ -0,0 +1,165 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob_fp32 { vec4 top_blob_fp32_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 2) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 2) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 3, rgba32f) writeonly uniform highp image1D top_blob_1d_fp32; +layout (binding = 3, rgba32f) writeonly uniform highp image2D top_blob_2d_fp32; +layout (binding = 3, rgba32f) writeonly uniform highp image3D top_blob_3d_fp32; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec4 v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + int gi = gx; + + v = buffer_ld4(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image1d_ld4(bottom_blob_1d, gx); + } +#endif + + if (storage_type_to == 0) + { + int gi = gx; + + top_blob_fp32_data[gi] = vec4(v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + imageStore(top_blob_1d_fp32, gx, v); + } +#endif + } + else if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + int gi = gy * psc(w) + gx; + + v = buffer_ld4(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image2d_ld4(bottom_blob_2d, ivec2(gx, gy)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gy * psc(outw) + gx; + + top_blob_fp32_data[gi] = vec4(v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + imageStore(top_blob_2d_fp32, ivec2(gx, gy), v); + } +#endif + } + else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + int gi = gz * psc(cstep) + gy * psc(w) + gx; + + v = buffer_ld4(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, gz)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + top_blob_fp32_data[gi] = 
vec4(v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + imageStore(top_blob_3d_fp32, ivec3(gx, gy, gz), v); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_pack4_fp32_to_fp16.comp b/source/device/vulkan/shaders/packing_pack4_fp32_to_fp16.comp new file mode 100644 index 000000000..fcd96950f --- /dev/null +++ b/source/device/vulkan/shaders/packing_pack4_fp32_to_fp16.comp @@ -0,0 +1,165 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob_fp32 { vec4 bottom_blob_fp32_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform highp sampler1D bottom_blob_1d_fp32; +layout (binding = 2) uniform highp sampler2D bottom_blob_2d_fp32; +layout (binding = 2) uniform highp sampler3D bottom_blob_3d_fp32; +layout (binding = 3, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec4 v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + int gi = gx; + + v = afpvec4(bottom_blob_fp32_data[gi]); + } +#if NCNN_image_shader + if (storage_type_from == 1) 
+ { + v = afpvec4(texelFetch(bottom_blob_1d_fp32, gx, 0)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gx; + + buffer_st4(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image1d_st4(top_blob_1d, gx, v); + } +#endif + } + else if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + int gi = gy * psc(w) + gx; + + v = afpvec4(bottom_blob_fp32_data[gi]); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = afpvec4(texelFetch(bottom_blob_2d_fp32, ivec2(gx, gy), 0)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gy * psc(outw) + gx; + + buffer_st4(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image2d_st4(top_blob_2d, ivec2(gx, gy), v); + } +#endif + } + else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + int gi = gz * psc(cstep) + gy * psc(w) + gx; + + v = afpvec4(bottom_blob_fp32_data[gi]); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = afpvec4(texelFetch(bottom_blob_3d_fp32, ivec3(gx, gy, gz), 0)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st4(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image3d_st4(top_blob_3d, ivec3(gx, gy, gz), v); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_pack4to1.comp b/source/device/vulkan/shaders/packing_pack4to1.comp new file mode 100644 index 000000000..ef070eaab --- /dev/null +++ b/source/device/vulkan/shaders/packing_pack4to1.comp @@ -0,0 +1,195 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 2) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 2) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 3, imfmtc1) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 3, imfmtc1) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 3, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + + afpvec4 v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + int gi = gx; + + v = buffer_ld4(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image1d_ld4(bottom_blob_1d, gx); + } +#endif + + if (storage_type_to == 0) + { + ivec4 x4 = ivec4(gx * 4) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = x4; + + buffer_st1(top_blob_data, v_offset.r, v.r); + buffer_st1(top_blob_data, v_offset.g, v.g); + buffer_st1(top_blob_data, v_offset.b, v.b); + buffer_st1(top_blob_data, v_offset.a, v.a); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int x4 = gx * 4; + + image1d_st1(top_blob_1d, x4 + 0, v.r); + image1d_st1(top_blob_1d, x4 + 1, v.g); + image1d_st1(top_blob_1d, x4 + 2, v.b); + image1d_st1(top_blob_1d, x4 + 3, v.a); + } +#endif + } + else if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + int gi = gy * psc(w) + gx; + + v = buffer_ld4(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image2d_ld4(bottom_blob_2d, ivec2(gx, gy)); + } +#endif + + if (storage_type_to == 0) + { + ivec4 y4 = ivec4(gy * 4) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = y4 * psc(outw) + gx; + + buffer_st1(top_blob_data, v_offset.r, v.r); + buffer_st1(top_blob_data, v_offset.g, v.g); + buffer_st1(top_blob_data, v_offset.b, v.b); + buffer_st1(top_blob_data, v_offset.a, v.a); + } +#if 
NCNN_image_shader + if (storage_type_to == 1) + { + int y4 = gy * 4; + + image2d_st1(top_blob_2d, ivec2(gx, y4 + 0), v.r); + image2d_st1(top_blob_2d, ivec2(gx, y4 + 1), v.g); + image2d_st1(top_blob_2d, ivec2(gx, y4 + 2), v.b); + image2d_st1(top_blob_2d, ivec2(gx, y4 + 3), v.a); + } +#endif + } + else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + int gi = gz * psc(cstep) + gy * psc(w) + gx; + + v = buffer_ld4(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, gz)); + } +#endif + + if (storage_type_to == 0) + { + ivec4 z4 = ivec4(gz * 4) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = z4 * psc(outcstep) + ivec4(gy * psc(outw) + gx); + + buffer_st1(top_blob_data, v_offset.r, v.r); + buffer_st1(top_blob_data, v_offset.g, v.g); + buffer_st1(top_blob_data, v_offset.b, v.b); + buffer_st1(top_blob_data, v_offset.a, v.a); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int z4 = gz * 4; + + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 0), v.r); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 1), v.g); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 2), v.b); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 3), v.a); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_pack4to1_fp16_to_fp32.comp b/source/device/vulkan/shaders/packing_pack4to1_fp16_to_fp32.comp new file mode 100644 index 000000000..7fd911969 --- /dev/null +++ b/source/device/vulkan/shaders/packing_pack4to1_fp16_to_fp32.comp @@ -0,0 +1,195 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob_fp32 { float top_blob_fp32_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 2) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 2) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 3, r32f) writeonly uniform highp image1D top_blob_1d_fp32; +layout (binding = 3, r32f) writeonly uniform highp image2D top_blob_2d_fp32; +layout (binding = 3, r32f) writeonly uniform highp image3D top_blob_3d_fp32; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + + afpvec4 v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + int gi = gx; + + v = buffer_ld4(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image1d_ld4(bottom_blob_1d, gx); + } +#endif + + if (storage_type_to == 0) + { + ivec4 x4 = ivec4(gx * 4) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = x4; + + top_blob_fp32_data[v_offset.r] = float(v.r); + top_blob_fp32_data[v_offset.g] = float(v.g); + top_blob_fp32_data[v_offset.b] = float(v.b); + top_blob_fp32_data[v_offset.a] = float(v.a); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int x4 = gx * 4; + + image1d_st1(top_blob_1d_fp32, x4 + 0, v.r); + image1d_st1(top_blob_1d_fp32, x4 + 1, v.g); + image1d_st1(top_blob_1d_fp32, x4 + 2, v.b); + image1d_st1(top_blob_1d_fp32, x4 + 3, v.a); + } +#endif + } + else if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + int gi = gy * psc(w) + gx; + + v = buffer_ld4(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image2d_ld4(bottom_blob_2d, ivec2(gx, gy)); + } +#endif + + if (storage_type_to == 0) + { + ivec4 y4 = ivec4(gy * 4) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = y4 * psc(outw) + gx; + + top_blob_fp32_data[v_offset.r] = float(v.r); + top_blob_fp32_data[v_offset.g] = float(v.g); + top_blob_fp32_data[v_offset.b] = float(v.b); + 
top_blob_fp32_data[v_offset.a] = float(v.a); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int y4 = gy * 4; + + image2d_st1(top_blob_2d_fp32, ivec2(gx, y4 + 0), v.r); + image2d_st1(top_blob_2d_fp32, ivec2(gx, y4 + 1), v.g); + image2d_st1(top_blob_2d_fp32, ivec2(gx, y4 + 2), v.b); + image2d_st1(top_blob_2d_fp32, ivec2(gx, y4 + 3), v.a); + } +#endif + } + else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + int gi = gz * psc(cstep) + gy * psc(w) + gx; + + v = buffer_ld4(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, gz)); + } +#endif + + if (storage_type_to == 0) + { + ivec4 z4 = ivec4(gz * 4) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = z4 * psc(outcstep) + ivec4(gy * psc(outw) + gx); + + top_blob_fp32_data[v_offset.r] = float(v.r); + top_blob_fp32_data[v_offset.g] = float(v.g); + top_blob_fp32_data[v_offset.b] = float(v.b); + top_blob_fp32_data[v_offset.a] = float(v.a); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int z4 = gz * 4; + + image3d_st1(top_blob_3d_fp32, ivec3(gx, gy, z4 + 0), v.r); + image3d_st1(top_blob_3d_fp32, ivec3(gx, gy, z4 + 1), v.g); + image3d_st1(top_blob_3d_fp32, ivec3(gx, gy, z4 + 2), v.b); + image3d_st1(top_blob_3d_fp32, ivec3(gx, gy, z4 + 3), v.a); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_pack4to1_fp32_to_fp16.comp b/source/device/vulkan/shaders/packing_pack4to1_fp32_to_fp16.comp new file mode 100644 index 000000000..6a0d0346d --- /dev/null +++ b/source/device/vulkan/shaders/packing_pack4to1_fp32_to_fp16.comp @@ -0,0 +1,195 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob_fp32 { vec4 bottom_blob_fp32_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform highp sampler1D bottom_blob_1d_fp32; +layout (binding = 2) uniform highp sampler2D bottom_blob_2d_fp32; +layout (binding = 2) uniform highp sampler3D bottom_blob_3d_fp32; +layout (binding = 3, imfmtc1) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 3, imfmtc1) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 3, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + + afpvec4 v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + int gi = gx; + + v = afpvec4(bottom_blob_fp32_data[gi]); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = afpvec4(texelFetch(bottom_blob_1d_fp32, gx, 0)); + } +#endif + + if (storage_type_to == 0) + { + ivec4 x4 = ivec4(gx * 4) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = x4; + + buffer_st1(top_blob_data, v_offset.r, v.r); + buffer_st1(top_blob_data, v_offset.g, v.g); + buffer_st1(top_blob_data, v_offset.b, v.b); + buffer_st1(top_blob_data, v_offset.a, v.a); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int x4 = gx * 4; + + image1d_st1(top_blob_1d, x4 + 0, v.r); + image1d_st1(top_blob_1d, x4 + 1, v.g); + image1d_st1(top_blob_1d, x4 + 2, v.b); + image1d_st1(top_blob_1d, x4 + 3, v.a); + } +#endif + } + else if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + int gi = gy * psc(w) + gx; + + v = afpvec4(bottom_blob_fp32_data[gi]); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = afpvec4(texelFetch(bottom_blob_2d_fp32, ivec2(gx, gy), 0)); + } +#endif + + if (storage_type_to == 0) + { + ivec4 y4 = ivec4(gy * 4) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = y4 * psc(outw) + gx; + + buffer_st1(top_blob_data, v_offset.r, v.r); + buffer_st1(top_blob_data, v_offset.g, v.g); + buffer_st1(top_blob_data, v_offset.b, 
v.b); + buffer_st1(top_blob_data, v_offset.a, v.a); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int y4 = gy * 4; + + image2d_st1(top_blob_2d, ivec2(gx, y4 + 0), v.r); + image2d_st1(top_blob_2d, ivec2(gx, y4 + 1), v.g); + image2d_st1(top_blob_2d, ivec2(gx, y4 + 2), v.b); + image2d_st1(top_blob_2d, ivec2(gx, y4 + 3), v.a); + } +#endif + } + else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + int gi = gz * psc(cstep) + gy * psc(w) + gx; + + v = afpvec4(bottom_blob_fp32_data[gi]); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = afpvec4(texelFetch(bottom_blob_3d_fp32, ivec3(gx, gy, gz), 0)); + } +#endif + + if (storage_type_to == 0) + { + ivec4 z4 = ivec4(gz * 4) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = z4 * psc(outcstep) + ivec4(gy * psc(outw) + gx); + + buffer_st1(top_blob_data, v_offset.r, v.r); + buffer_st1(top_blob_data, v_offset.g, v.g); + buffer_st1(top_blob_data, v_offset.b, v.b); + buffer_st1(top_blob_data, v_offset.a, v.a); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int z4 = gz * 4; + + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 0), v.r); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 1), v.g); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 2), v.b); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 3), v.a); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_pack4to8.comp b/source/device/vulkan/shaders/packing_pack4to8.comp new file mode 100644 index 000000000..4dd23773e --- /dev/null +++ b/source/device/vulkan/shaders/packing_pack4to8.comp @@ -0,0 +1,184 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 2) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 2) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec8 v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + ivec2 x2 = ivec2(gx * 2) + ivec2(0, 1); + + ivec2 v_offset = x2; + + v[0] = buffer_ld4(bottom_blob_data, v_offset.r); + v[1] = buffer_ld4(bottom_blob_data, v_offset.g); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int x2 = gx * 2; + + v[0] = image1d_ld4(bottom_blob_1d, x2 + 0); + v[1] = image1d_ld4(bottom_blob_1d, x2 + 1); + } +#endif + + if (storage_type_to == 0) + { + int gi = gx; + + buffer_st8(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image1d_st8(top_blob_1d, gx, v); + } +#endif + } + else if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + ivec2 y2 = ivec2(gy * 2) + ivec2(0, 1); + + ivec2 v_offset = y2 * psc(w) + gx; + + v[0] = buffer_ld4(bottom_blob_data, v_offset.r); + v[1] = buffer_ld4(bottom_blob_data, v_offset.g); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int y2 = gy * 2; + + v[0] = image2d_ld4(bottom_blob_2d, ivec2(gx, y2 + 0)); + v[1] = image2d_ld4(bottom_blob_2d, ivec2(gx, y2 + 1)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gy * psc(outw) + gx; + + buffer_st8(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image2d_st8(top_blob_2d, ivec2(gx, gy), v); + } +#endif + } + 
else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + ivec2 z2 = ivec2(gz * 2) + ivec2(0, 1); + + ivec2 v_offset = z2 * psc(cstep) + ivec2(gy * psc(w) + gx); + + v[0] = buffer_ld4(bottom_blob_data, v_offset.r); + v[1] = buffer_ld4(bottom_blob_data, v_offset.g); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int z2 = gz * 2; + + v[0] = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, z2 + 0)); + v[1] = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, z2 + 1)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st8(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image3d_st8(top_blob_3d, ivec3(gx, gy, gz), v); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_pack4to8_fp16_to_fp32.comp b/source/device/vulkan/shaders/packing_pack4to8_fp16_to_fp32.comp new file mode 100644 index 000000000..defc14089 --- /dev/null +++ b/source/device/vulkan/shaders/packing_pack4to8_fp16_to_fp32.comp @@ -0,0 +1,184 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob_fp32 { mat2x4 top_blob_fp32_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 2) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 2) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 3, rgba32f) writeonly uniform highp image1D top_blob_1d_fp32; +layout (binding = 3, rgba32f) writeonly uniform highp image2D top_blob_2d_fp32; 
+layout (binding = 3, rgba32f) writeonly uniform highp image3D top_blob_3d_fp32; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec8 v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + ivec2 x2 = ivec2(gx * 2) + ivec2(0, 1); + + ivec2 v_offset = x2; + + v[0] = buffer_ld4(bottom_blob_data, v_offset.r); + v[1] = buffer_ld4(bottom_blob_data, v_offset.g); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int x2 = gx * 2; + + v[0] = image1d_ld4(bottom_blob_1d, x2 + 0); + v[1] = image1d_ld4(bottom_blob_1d, x2 + 1); + } +#endif + + if (storage_type_to == 0) + { + int gi = gx; + + top_blob_fp32_data[gi] = mat2x4(vec4(v[0]), vec4(v[1])); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image1d_st8(top_blob_1d_fp32, gx, v); + } +#endif + } + else if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + ivec2 y2 = ivec2(gy * 2) + ivec2(0, 1); + + ivec2 v_offset = y2 * psc(w) + gx; + + v[0] = buffer_ld4(bottom_blob_data, v_offset.r); + v[1] = buffer_ld4(bottom_blob_data, v_offset.g); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int y2 = gy * 2; + + v[0] = image2d_ld4(bottom_blob_2d, ivec2(gx, y2 + 0)); + v[1] = image2d_ld4(bottom_blob_2d, ivec2(gx, y2 + 1)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gy * psc(outw) + gx; + + top_blob_fp32_data[gi] = mat2x4(vec4(v[0]), vec4(v[1])); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image2d_st8(top_blob_2d_fp32, ivec2(gx, gy), v); + } +#endif + } + else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + ivec2 z2 = ivec2(gz * 2) + ivec2(0, 1); + + ivec2 v_offset = z2 * psc(cstep) + ivec2(gy * psc(w) + gx); + + v[0] = buffer_ld4(bottom_blob_data, v_offset.r); + v[1] = buffer_ld4(bottom_blob_data, v_offset.g); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int z2 = gz * 2; + + v[0] = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, z2 + 0)); + v[1] = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, z2 + 1)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + top_blob_fp32_data[gi] = mat2x4(vec4(v[0]), vec4(v[1])); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image3d_st8(top_blob_3d_fp32, ivec3(gx, gy, gz), v); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_pack4to8_fp32_to_fp16.comp b/source/device/vulkan/shaders/packing_pack4to8_fp32_to_fp16.comp new file mode 100644 index 000000000..48ac54ca9 --- /dev/null +++ b/source/device/vulkan/shaders/packing_pack4to8_fp32_to_fp16.comp @@ -0,0 +1,184 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob_fp32 { vec4 bottom_blob_fp32_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform highp sampler1D bottom_blob_1d_fp32; +layout (binding = 2) uniform highp sampler2D bottom_blob_2d_fp32; +layout (binding = 2) uniform highp sampler3D bottom_blob_3d_fp32; +layout (binding = 3, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec8 v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + ivec2 x2 = ivec2(gx * 2) + ivec2(0, 1); + + ivec2 v_offset = x2; + + v[0] = afpvec4(bottom_blob_fp32_data[v_offset.r]); + v[1] = afpvec4(bottom_blob_fp32_data[v_offset.g]); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int x2 = gx * 2; + + v[0] = image1d_ld4(bottom_blob_1d_fp32, x2 + 0); + v[1] = image1d_ld4(bottom_blob_1d_fp32, x2 + 1); + } +#endif + + if (storage_type_to == 0) + { + int gi = gx; + + buffer_st8(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image1d_st8(top_blob_1d, gx, v); + } +#endif + } + else if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + ivec2 y2 = ivec2(gy * 2) + ivec2(0, 1); + + ivec2 v_offset = y2 * psc(w) + gx; + + v[0] = afpvec4(bottom_blob_fp32_data[v_offset.r]); + v[1] = afpvec4(bottom_blob_fp32_data[v_offset.g]); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int y2 = gy * 2; + + v[0] = image2d_ld4(bottom_blob_2d_fp32, ivec2(gx, y2 + 0)); + v[1] = image2d_ld4(bottom_blob_2d_fp32, ivec2(gx, y2 + 1)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gy * psc(outw) + gx; + + 
buffer_st8(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image2d_st8(top_blob_2d, ivec2(gx, gy), v); + } +#endif + } + else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + ivec2 z2 = ivec2(gz * 2) + ivec2(0, 1); + + ivec2 v_offset = z2 * psc(cstep) + ivec2(gy * psc(w) + gx); + + v[0] = afpvec4(bottom_blob_fp32_data[v_offset.r]); + v[1] = afpvec4(bottom_blob_fp32_data[v_offset.g]); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + int z2 = gz * 2; + + v[0] = image3d_ld4(bottom_blob_3d_fp32, ivec3(gx, gy, z2 + 0)); + v[1] = image3d_ld4(bottom_blob_3d_fp32, ivec3(gx, gy, z2 + 1)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st8(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image3d_st8(top_blob_3d, ivec3(gx, gy, gz), v); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_pack8.comp b/source/device/vulkan/shaders/packing_pack8.comp new file mode 100644 index 000000000..5b53e5b55 --- /dev/null +++ b/source/device/vulkan/shaders/packing_pack8.comp @@ -0,0 +1,166 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
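+
+// packing_pack8: copy a pack8 blob without changing its packing. Each
+// invocation moves one sfpvec8 element, reading from and writing to either
+// buffer or image storage depending on storage_type_from / storage_type_to.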
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 2) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 2) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec8 v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + int gi = gx; + + v = buffer_ld8(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image1d_ld8(bottom_blob_1d, gx); + } +#endif + + if (storage_type_to == 0) + { + int gi = gx; + + buffer_st8(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image1d_st8(top_blob_1d, gx, v); + } +#endif + } + else if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + int gi = gy * psc(w) + gx; + + v = buffer_ld8(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image2d_ld8(bottom_blob_2d, ivec2(gx, gy)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gy * psc(outw) + gx; + + buffer_st8(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image2d_st8(top_blob_2d, ivec2(gx, gy), v); + } +#endif + } + else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + int gi = gz * psc(cstep) + gy * psc(w) + gx; + + v = buffer_ld8(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + 
buffer_st8(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image3d_st8(top_blob_3d, ivec3(gx, gy, gz), v); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_pack8_fp16_to_fp32.comp b/source/device/vulkan/shaders/packing_pack8_fp16_to_fp32.comp new file mode 100644 index 000000000..9576e59a6 --- /dev/null +++ b/source/device/vulkan/shaders/packing_pack8_fp16_to_fp32.comp @@ -0,0 +1,169 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob_fp32 { mat2x4 top_blob_fp32_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 2) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 2) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 3, rgba32f) writeonly uniform highp image1D top_blob_1d_fp32; +layout (binding = 3, rgba32f) writeonly uniform highp image2D top_blob_2d_fp32; +layout (binding = 3, rgba32f) writeonly uniform highp image3D top_blob_3d_fp32; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec8 v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + int gi = gx; + + v = 
buffer_ld8(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image1d_ld8(bottom_blob_1d, gx); + } +#endif + + if (storage_type_to == 0) + { + int gi = gx; + + top_blob_fp32_data[gi] = mat2x4(v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + imageStore(top_blob_1d_fp32, gx * 2, v[0]); + imageStore(top_blob_1d_fp32, gx * 2 + 1, v[1]); + } +#endif + } + else if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + int gi = gy * psc(w) + gx; + + v = buffer_ld8(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image2d_ld8(bottom_blob_2d, ivec2(gx, gy)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gy * psc(outw) + gx; + + top_blob_fp32_data[gi] = mat2x4(v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + imageStore(top_blob_2d_fp32, ivec2(gx * 2, gy), v[0]); + imageStore(top_blob_2d_fp32, ivec2(gx * 2 + 1, gy), v[1]); + } +#endif + } + else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + int gi = gz * psc(cstep) + gy * psc(w) + gx; + + v = buffer_ld8(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + top_blob_fp32_data[gi] = mat2x4(v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + imageStore(top_blob_3d_fp32, ivec3(gx * 2, gy, gz), v[0]); + imageStore(top_blob_3d_fp32, ivec3(gx * 2 + 1, gy, gz), v[1]); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_pack8_fp32_to_fp16.comp b/source/device/vulkan/shaders/packing_pack8_fp32_to_fp16.comp new file mode 100644 index 000000000..b78422346 --- /dev/null +++ b/source/device/vulkan/shaders/packing_pack8_fp32_to_fp16.comp @@ -0,0 +1,166 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
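+
+// packing_pack8_fp32_to_fp16: narrow a pack8 blob from fp32 storage (one
+// mat2x4 per element) to sfpvec8 storage (fp16 when fp16 storage is enabled).
+// The packing layout is unchanged; only the element precision is converted.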
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob_fp32 { mat2x4 bottom_blob_fp32_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform highp sampler1D bottom_blob_1d_fp32; +layout (binding = 2) uniform highp sampler2D bottom_blob_2d_fp32; +layout (binding = 2) uniform highp sampler3D bottom_blob_3d_fp32; +layout (binding = 3, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec8 v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + int gi = gx; + + v = afpvec8(bottom_blob_fp32_data[gi]); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = afpvec8(texelFetch(bottom_blob_1d_fp32, gx * 2, 0), texelFetch(bottom_blob_1d_fp32, gx * 2 + 1, 0)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gx; + + buffer_st8(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image1d_st8(top_blob_1d, gx, v); + } +#endif + } + else if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + int gi = gy * psc(w) + gx; + + v = afpvec8(bottom_blob_fp32_data[gi]); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = afpvec8(texelFetch(bottom_blob_2d_fp32, ivec2(gx * 2, gy), 0), texelFetch(bottom_blob_2d_fp32, ivec2(gx * 2 + 1, gy), 0)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gy * psc(outw) + gx; + + buffer_st8(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image2d_st8(top_blob_2d, ivec2(gx, gy), v); + } +#endif + } + else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + int gi = gz * psc(cstep) + gy * psc(w) + gx; + + v = afpvec8(bottom_blob_fp32_data[gi]); + } +#if NCNN_image_shader + if (storage_type_from == 
1) + { + v = afpvec8(texelFetch(bottom_blob_3d_fp32, ivec3(gx * 2, gy, gz), 0), texelFetch(bottom_blob_3d_fp32, ivec3(gx * 2 + 1, gy, gz), 0)); + } +#endif + + if (storage_type_to == 0) + { + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st8(top_blob_data, gi, v); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + image3d_st8(top_blob_3d, ivec3(gx, gy, gz), v); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_pack8to1.comp b/source/device/vulkan/shaders/packing_pack8to1.comp new file mode 100644 index 000000000..6eed4ce56 --- /dev/null +++ b/source/device/vulkan/shaders/packing_pack8to1.comp @@ -0,0 +1,223 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 2) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 2) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 3, imfmtc1) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 3, imfmtc1) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 3, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if 
(gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + + afpvec8 v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + int gi = gx; + + v = buffer_ld8(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image1d_ld8(bottom_blob_1d, gx); + } +#endif + + if (storage_type_to == 0) + { + ivec4 x4 = ivec4(gx * 8) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = x4; + ivec4 vv_offset = x4 + 4; + + buffer_st1(top_blob_data, v_offset.r, v[0].r); + buffer_st1(top_blob_data, v_offset.g, v[0].g); + buffer_st1(top_blob_data, v_offset.b, v[0].b); + buffer_st1(top_blob_data, v_offset.a, v[0].a); + buffer_st1(top_blob_data, vv_offset.r, v[1].r); + buffer_st1(top_blob_data, vv_offset.g, v[1].g); + buffer_st1(top_blob_data, vv_offset.b, v[1].b); + buffer_st1(top_blob_data, vv_offset.a, v[1].a); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int x4 = gx * 8; + + image1d_st1(top_blob_1d, x4 + 0, v[0].r); + image1d_st1(top_blob_1d, x4 + 1, v[0].g); + image1d_st1(top_blob_1d, x4 + 2, v[0].b); + image1d_st1(top_blob_1d, x4 + 3, v[0].a); + image1d_st1(top_blob_1d, x4 + 4, v[1].r); + image1d_st1(top_blob_1d, x4 + 5, v[1].g); + image1d_st1(top_blob_1d, x4 + 6, v[1].b); + image1d_st1(top_blob_1d, x4 + 7, v[1].a); + } +#endif + } + else if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + int gi = gy * psc(w) + gx; + + v = buffer_ld8(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image2d_ld8(bottom_blob_2d, ivec2(gx, gy)); + } +#endif + + if (storage_type_to == 0) + { + ivec4 y4 = ivec4(gy * 8) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = y4 * psc(outw) + gx; + ivec4 vv_offset = (y4 + 4) * psc(outw) + gx; + + buffer_st1(top_blob_data, v_offset.r, v[0].r); + buffer_st1(top_blob_data, v_offset.g, v[0].g); + buffer_st1(top_blob_data, v_offset.b, v[0].b); + buffer_st1(top_blob_data, v_offset.a, v[0].a); + buffer_st1(top_blob_data, vv_offset.r, v[1].r); + buffer_st1(top_blob_data, vv_offset.g, v[1].g); + buffer_st1(top_blob_data, vv_offset.b, v[1].b); + buffer_st1(top_blob_data, vv_offset.a, v[1].a); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int y4 = gy * 8; + + image2d_st1(top_blob_2d, ivec2(gx, y4 + 0), v[0].r); + image2d_st1(top_blob_2d, ivec2(gx, y4 + 1), v[0].g); + image2d_st1(top_blob_2d, ivec2(gx, y4 + 2), v[0].b); + image2d_st1(top_blob_2d, ivec2(gx, y4 + 3), v[0].a); + image2d_st1(top_blob_2d, ivec2(gx, y4 + 4), v[1].r); + image2d_st1(top_blob_2d, ivec2(gx, y4 + 5), v[1].g); + image2d_st1(top_blob_2d, ivec2(gx, y4 + 6), v[1].b); + image2d_st1(top_blob_2d, ivec2(gx, y4 + 7), v[1].a); + } +#endif + } + else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + int gi = gz * psc(cstep) + gy * psc(w) + gx; + + v = buffer_ld8(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); + } +#endif + + if (storage_type_to == 0) + { + ivec4 z4 = ivec4(gz * 8) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = z4 * psc(outcstep) + ivec4(gy * psc(outw) + gx); + ivec4 vv_offset = (z4 + 4) * psc(outcstep) + ivec4(gy * psc(outw) + gx); + + buffer_st1(top_blob_data, v_offset.r, v[0].r); + buffer_st1(top_blob_data, v_offset.g, v[0].g); + buffer_st1(top_blob_data, v_offset.b, v[0].b); + buffer_st1(top_blob_data, v_offset.a, v[0].a); + buffer_st1(top_blob_data, vv_offset.r, v[1].r); + buffer_st1(top_blob_data, vv_offset.g, v[1].g); + buffer_st1(top_blob_data, vv_offset.b, v[1].b); + buffer_st1(top_blob_data, vv_offset.a, 
v[1].a); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int z4 = gz * 8; + + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 0), v[0].r); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 1), v[0].g); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 2), v[0].b); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 3), v[0].a); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 4), v[1].r); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 5), v[1].g); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 6), v[1].b); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 7), v[1].a); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_pack8to1_fp16_to_fp32.comp b/source/device/vulkan/shaders/packing_pack8to1_fp16_to_fp32.comp new file mode 100644 index 000000000..f670c5443 --- /dev/null +++ b/source/device/vulkan/shaders/packing_pack8to1_fp16_to_fp32.comp @@ -0,0 +1,223 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob_fp32 { float top_blob_fp32_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 2) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 2) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 3, r32f) writeonly uniform highp image1D top_blob_1d_fp32; +layout (binding = 3, r32f) writeonly uniform highp image2D top_blob_2d_fp32; +layout (binding = 3, r32f) writeonly uniform highp image3D top_blob_3d_fp32; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int 
cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + + afpvec8 v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + int gi = gx; + + v = buffer_ld8(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image1d_ld8(bottom_blob_1d, gx); + } +#endif + + if (storage_type_to == 0) + { + ivec4 x4 = ivec4(gx * 8) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = x4; + ivec4 vv_offset = x4 + 4; + + top_blob_fp32_data[v_offset.r] = float(v[0].r); + top_blob_fp32_data[v_offset.g] = float(v[0].g); + top_blob_fp32_data[v_offset.b] = float(v[0].b); + top_blob_fp32_data[v_offset.a] = float(v[0].a); + top_blob_fp32_data[vv_offset.r] = float(v[1].r); + top_blob_fp32_data[vv_offset.g] = float(v[1].g); + top_blob_fp32_data[vv_offset.b] = float(v[1].b); + top_blob_fp32_data[vv_offset.a] = float(v[1].a); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int x4 = gx * 8; + + image1d_st1(top_blob_1d_fp32, x4 + 0, v[0].r); + image1d_st1(top_blob_1d_fp32, x4 + 1, v[0].g); + image1d_st1(top_blob_1d_fp32, x4 + 2, v[0].b); + image1d_st1(top_blob_1d_fp32, x4 + 3, v[0].a); + image1d_st1(top_blob_1d_fp32, x4 + 4, v[1].r); + image1d_st1(top_blob_1d_fp32, x4 + 5, v[1].g); + image1d_st1(top_blob_1d_fp32, x4 + 6, v[1].b); + image1d_st1(top_blob_1d_fp32, x4 + 7, v[1].a); + } +#endif + } + else if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + int gi = gy * psc(w) + gx; + + v = buffer_ld8(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image2d_ld8(bottom_blob_2d, ivec2(gx, gy)); + } +#endif + + if (storage_type_to == 0) + { + ivec4 y4 = ivec4(gy * 8) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = y4 * psc(outw) + gx; + ivec4 vv_offset = (y4 + 4) * psc(outw) + gx; + + top_blob_fp32_data[v_offset.r] = float(v[0].r); + top_blob_fp32_data[v_offset.g] = float(v[0].g); + top_blob_fp32_data[v_offset.b] = float(v[0].b); + top_blob_fp32_data[v_offset.a] = float(v[0].a); + top_blob_fp32_data[vv_offset.r] = float(v[1].r); + top_blob_fp32_data[vv_offset.g] = float(v[1].g); + top_blob_fp32_data[vv_offset.b] = float(v[1].b); + top_blob_fp32_data[vv_offset.a] = float(v[1].a); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int y4 = gy * 8; + + image2d_st1(top_blob_2d_fp32, ivec2(gx, y4 + 0), v[0].r); + image2d_st1(top_blob_2d_fp32, ivec2(gx, y4 + 1), v[0].g); + image2d_st1(top_blob_2d_fp32, ivec2(gx, y4 + 2), v[0].b); + image2d_st1(top_blob_2d_fp32, ivec2(gx, y4 + 3), v[0].a); + image2d_st1(top_blob_2d_fp32, ivec2(gx, y4 + 4), v[1].r); + image2d_st1(top_blob_2d_fp32, ivec2(gx, y4 + 5), v[1].g); + image2d_st1(top_blob_2d_fp32, ivec2(gx, y4 + 6), v[1].b); + image2d_st1(top_blob_2d_fp32, ivec2(gx, y4 + 7), v[1].a); + } +#endif + } + else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + int gi = gz * psc(cstep) + gy * psc(w) + gx; + + v = buffer_ld8(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); + } +#endif + + if (storage_type_to == 0) + { + ivec4 z4 = ivec4(gz * 8) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = z4 * psc(outcstep) + ivec4(gy * psc(outw) + gx); + ivec4 vv_offset = (z4 + 4) * psc(outcstep) + ivec4(gy * psc(outw) + gx); + + top_blob_fp32_data[v_offset.r] = float(v[0].r); + 
top_blob_fp32_data[v_offset.g] = float(v[0].g); + top_blob_fp32_data[v_offset.b] = float(v[0].b); + top_blob_fp32_data[v_offset.a] = float(v[0].a); + top_blob_fp32_data[vv_offset.r] = float(v[1].r); + top_blob_fp32_data[vv_offset.g] = float(v[1].g); + top_blob_fp32_data[vv_offset.b] = float(v[1].b); + top_blob_fp32_data[vv_offset.a] = float(v[1].a); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int z4 = gz * 8; + + image3d_st1(top_blob_3d_fp32, ivec3(gx, gy, z4 + 0), v[0].r); + image3d_st1(top_blob_3d_fp32, ivec3(gx, gy, z4 + 1), v[0].g); + image3d_st1(top_blob_3d_fp32, ivec3(gx, gy, z4 + 2), v[0].b); + image3d_st1(top_blob_3d_fp32, ivec3(gx, gy, z4 + 3), v[0].a); + image3d_st1(top_blob_3d_fp32, ivec3(gx, gy, z4 + 4), v[1].r); + image3d_st1(top_blob_3d_fp32, ivec3(gx, gy, z4 + 5), v[1].g); + image3d_st1(top_blob_3d_fp32, ivec3(gx, gy, z4 + 6), v[1].b); + image3d_st1(top_blob_3d_fp32, ivec3(gx, gy, z4 + 7), v[1].a); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_pack8to1_fp32_to_fp16.comp b/source/device/vulkan/shaders/packing_pack8to1_fp32_to_fp16.comp new file mode 100644 index 000000000..8c162f0f3 --- /dev/null +++ b/source/device/vulkan/shaders/packing_pack8to1_fp32_to_fp16.comp @@ -0,0 +1,223 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
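+
+// packing_pack8to1_fp32_to_fp16: unpack each fp32 pack8 element (mat2x4) into
+// eight consecutive scalar elements. For the buffer path, v_offset / vv_offset
+// hold the eight destination indices along the unpacked axis; the image path
+// writes the eight lanes to eight separate texels.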
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob_fp32 { mat2x4 bottom_blob_fp32_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform highp sampler1D bottom_blob_1d_fp32; +layout (binding = 2) uniform highp sampler2D bottom_blob_2d_fp32; +layout (binding = 2) uniform highp sampler3D bottom_blob_3d_fp32; +layout (binding = 3, imfmtc1) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 3, imfmtc1) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 3, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + + afpvec8 v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + int gi = gx; + + v = afpvec8(bottom_blob_fp32_data[gi]); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = afpvec8(texelFetch(bottom_blob_1d_fp32, gx * 2, 0), texelFetch(bottom_blob_1d_fp32, gx * 2 + 1, 0)); + } +#endif + + if (storage_type_to == 0) + { + ivec4 x4 = ivec4(gx * 8) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = x4; + ivec4 vv_offset = x4 + 4; + + buffer_st1(top_blob_data, v_offset.r, v[0].r); + buffer_st1(top_blob_data, v_offset.g, v[0].g); + buffer_st1(top_blob_data, v_offset.b, v[0].b); + buffer_st1(top_blob_data, v_offset.a, v[0].a); + buffer_st1(top_blob_data, vv_offset.r, v[1].r); + buffer_st1(top_blob_data, vv_offset.g, v[1].g); + buffer_st1(top_blob_data, vv_offset.b, v[1].b); + buffer_st1(top_blob_data, vv_offset.a, v[1].a); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int x4 = gx * 8; + + image1d_st1(top_blob_1d, x4 + 0, v[0].r); + image1d_st1(top_blob_1d, x4 + 1, v[0].g); + image1d_st1(top_blob_1d, x4 + 2, v[0].b); + image1d_st1(top_blob_1d, x4 + 3, v[0].a); + image1d_st1(top_blob_1d, x4 + 4, v[1].r); + image1d_st1(top_blob_1d, x4 + 5, v[1].g); + image1d_st1(top_blob_1d, x4 + 6, v[1].b); + image1d_st1(top_blob_1d, x4 + 7, v[1].a); + } +#endif + } + else 
if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + int gi = gy * psc(w) + gx; + + v = afpvec8(bottom_blob_fp32_data[gi]); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = afpvec8(texelFetch(bottom_blob_2d_fp32, ivec2(gx * 2, gy), 0), texelFetch(bottom_blob_2d_fp32, ivec2(gx * 2 + 1, gy), 0)); + } +#endif + + if (storage_type_to == 0) + { + ivec4 y4 = ivec4(gy * 8) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = y4 * psc(outw) + gx; + ivec4 vv_offset = (y4 + 4) * psc(outw) + gx; + + buffer_st1(top_blob_data, v_offset.r, v[0].r); + buffer_st1(top_blob_data, v_offset.g, v[0].g); + buffer_st1(top_blob_data, v_offset.b, v[0].b); + buffer_st1(top_blob_data, v_offset.a, v[0].a); + buffer_st1(top_blob_data, vv_offset.r, v[1].r); + buffer_st1(top_blob_data, vv_offset.g, v[1].g); + buffer_st1(top_blob_data, vv_offset.b, v[1].b); + buffer_st1(top_blob_data, vv_offset.a, v[1].a); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int y4 = gy * 8; + + image2d_st1(top_blob_2d, ivec2(gx, y4 + 0), v[0].r); + image2d_st1(top_blob_2d, ivec2(gx, y4 + 1), v[0].g); + image2d_st1(top_blob_2d, ivec2(gx, y4 + 2), v[0].b); + image2d_st1(top_blob_2d, ivec2(gx, y4 + 3), v[0].a); + image2d_st1(top_blob_2d, ivec2(gx, y4 + 4), v[1].r); + image2d_st1(top_blob_2d, ivec2(gx, y4 + 5), v[1].g); + image2d_st1(top_blob_2d, ivec2(gx, y4 + 6), v[1].b); + image2d_st1(top_blob_2d, ivec2(gx, y4 + 7), v[1].a); + } +#endif + } + else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + int gi = gz * psc(cstep) + gy * psc(w) + gx; + + v = afpvec8(bottom_blob_fp32_data[gi]); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = afpvec8(texelFetch(bottom_blob_3d_fp32, ivec3(gx * 2, gy, gz), 0), texelFetch(bottom_blob_3d_fp32, ivec3(gx * 2 + 1, gy, gz), 0)); + } +#endif + + if (storage_type_to == 0) + { + ivec4 z4 = ivec4(gz * 8) + ivec4(0, 1, 2, 3); + + ivec4 v_offset = z4 * psc(outcstep) + ivec4(gy * psc(outw) + gx); + ivec4 vv_offset = (z4 + 4) * psc(outcstep) + ivec4(gy * psc(outw) + gx); + + buffer_st1(top_blob_data, v_offset.r, v[0].r); + buffer_st1(top_blob_data, v_offset.g, v[0].g); + buffer_st1(top_blob_data, v_offset.b, v[0].b); + buffer_st1(top_blob_data, v_offset.a, v[0].a); + buffer_st1(top_blob_data, vv_offset.r, v[1].r); + buffer_st1(top_blob_data, vv_offset.g, v[1].g); + buffer_st1(top_blob_data, vv_offset.b, v[1].b); + buffer_st1(top_blob_data, vv_offset.a, v[1].a); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int z4 = gz * 8; + + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 0), v[0].r); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 1), v[0].g); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 2), v[0].b); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 3), v[0].a); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 4), v[1].r); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 5), v[1].g); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 6), v[1].b); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4 + 7), v[1].a); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_pack8to4.comp b/source/device/vulkan/shaders/packing_pack8to4.comp new file mode 100644 index 000000000..4a61fb77e --- /dev/null +++ b/source/device/vulkan/shaders/packing_pack8to4.comp @@ -0,0 +1,184 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. 
+// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 2) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 2) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + + afpvec8 v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + int gi = gx; + + v = buffer_ld8(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image1d_ld8(bottom_blob_1d, gx); + } +#endif + + if (storage_type_to == 0) + { + ivec2 x2 = ivec2(gx * 2) + ivec2(0, 1); + + ivec2 v_offset = x2; + + buffer_st4(top_blob_data, v_offset.r, v[0]); + buffer_st4(top_blob_data, v_offset.g, v[1]); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int x2 = gx * 2; + + image1d_st4(top_blob_1d, x2 + 0, v[0]); + image1d_st4(top_blob_1d, x2 + 1, v[1]); + } +#endif + } + else if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + int gi = gy * psc(w) + gx; + + v = buffer_ld8(bottom_blob_data, gi); + } +#if 
NCNN_image_shader + if (storage_type_from == 1) + { + v = image2d_ld8(bottom_blob_2d, ivec2(gx, gy)); + } +#endif + + if (storage_type_to == 0) + { + ivec2 y2 = ivec2(gy * 2) + ivec2(0, 1); + + ivec2 v_offset = y2 * psc(outw) + gx; + + buffer_st4(top_blob_data, v_offset.r, v[0]); + buffer_st4(top_blob_data, v_offset.g, v[1]); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int y2 = gy * 2; + + image2d_st4(top_blob_2d, ivec2(gx, y2 + 0), v[0]); + image2d_st4(top_blob_2d, ivec2(gx, y2 + 1), v[1]); + } +#endif + } + else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + int gi = gz * psc(cstep) + gy * psc(w) + gx; + + v = buffer_ld8(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); + } +#endif + + if (storage_type_to == 0) + { + ivec2 z2 = ivec2(gz * 2) + ivec2(0, 1); + + ivec2 v_offset = z2 * psc(outcstep) + ivec2(gy * psc(outw) + gx); + + buffer_st4(top_blob_data, v_offset.r, v[0]); + buffer_st4(top_blob_data, v_offset.g, v[1]); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int z2 = gz * 2; + + image3d_st4(top_blob_3d, ivec3(gx, gy, z2 + 0), v[0]); + image3d_st4(top_blob_3d, ivec3(gx, gy, z2 + 1), v[1]); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_pack8to4_fp16_to_fp32.comp b/source/device/vulkan/shaders/packing_pack8to4_fp16_to_fp32.comp new file mode 100644 index 000000000..564356caa --- /dev/null +++ b/source/device/vulkan/shaders/packing_pack8to4_fp16_to_fp32.comp @@ -0,0 +1,184 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
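+
+// packing_pack8to4_fp16_to_fp32: split each fp16 pack8 element into two fp32
+// pack4 elements. Each invocation loads one sfpvec8 and stores two vec4 values
+// at adjacent pack4 positions along the packed axis, widening to fp32.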
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob_fp32 { vec4 top_blob_fp32_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 2) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 2) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 3, rgba32f) writeonly uniform highp image1D top_blob_1d_fp32; +layout (binding = 3, rgba32f) writeonly uniform highp image2D top_blob_2d_fp32; +layout (binding = 3, rgba32f) writeonly uniform highp image3D top_blob_3d_fp32; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + + afpvec8 v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + int gi = gx; + + v = buffer_ld8(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image1d_ld8(bottom_blob_1d, gx); + } +#endif + + if (storage_type_to == 0) + { + ivec2 x2 = ivec2(gx * 2) + ivec2(0, 1); + + ivec2 v_offset = x2; + + top_blob_fp32_data[v_offset.r] = vec4(v[0]); + top_blob_fp32_data[v_offset.g] = vec4(v[1]); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int x2 = gx * 2; + + imageStore(top_blob_1d_fp32, x2 + 0, v[0]); + imageStore(top_blob_1d_fp32, x2 + 1, v[1]); + } +#endif + } + else if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + int gi = gy * psc(w) + gx; + + v = buffer_ld8(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image2d_ld8(bottom_blob_2d, ivec2(gx, gy)); + } +#endif + + if (storage_type_to == 0) + { + ivec2 y2 = ivec2(gy * 2) + ivec2(0, 1); + + ivec2 v_offset = y2 * psc(outw) + gx; + + top_blob_fp32_data[v_offset.r] = vec4(v[0]); + top_blob_fp32_data[v_offset.g] = vec4(v[1]); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int y2 = gy * 2; + + imageStore(top_blob_2d_fp32, ivec2(gx, y2 + 0), v[0]); + imageStore(top_blob_2d_fp32, ivec2(gx, y2 + 1), v[1]); + } 
+#endif + } + else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + int gi = gz * psc(cstep) + gy * psc(w) + gx; + + v = buffer_ld8(bottom_blob_data, gi); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); + } +#endif + + if (storage_type_to == 0) + { + ivec2 z2 = ivec2(gz * 2) + ivec2(0, 1); + + ivec2 v_offset = z2 * psc(outcstep) + ivec2(gy * psc(outw) + gx); + + top_blob_fp32_data[v_offset.r] = vec4(v[0]); + top_blob_fp32_data[v_offset.g] = vec4(v[1]); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int z2 = gz * 2; + + imageStore(top_blob_3d_fp32, ivec3(gx, gy, z2 + 0), v[0]); + imageStore(top_blob_3d_fp32, ivec3(gx, gy, z2 + 1), v[1]); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/packing_pack8to4_fp32_to_fp16.comp b/source/device/vulkan/shaders/packing_pack8to4_fp32_to_fp16.comp new file mode 100644 index 000000000..762977406 --- /dev/null +++ b/source/device/vulkan/shaders/packing_pack8to4_fp32_to_fp16.comp @@ -0,0 +1,184 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int storage_type_from = 0; +layout (constant_id = 1) const int storage_type_to = 0; + +#define shape_constant_id_offset 2 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob_fp32 { mat2x4 bottom_blob_fp32_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#if NCNN_image_shader +layout (binding = 2) uniform highp sampler1D bottom_blob_1d_fp32; +layout (binding = 2) uniform highp sampler2D bottom_blob_2d_fp32; +layout (binding = 2) uniform highp sampler3D bottom_blob_3d_fp32; +layout (binding = 3, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image2D 
top_blob_2d; +layout (binding = 3, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + + afpvec8 v; + + if (psc(dims) == 1) + { + if (storage_type_from == 0) + { + int gi = gx; + + v = afpvec8(bottom_blob_fp32_data[gi]); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = afpvec8(texelFetch(bottom_blob_1d_fp32, gx * 2, 0), texelFetch(bottom_blob_1d_fp32, gx * 2 + 1, 0)); + } +#endif + + if (storage_type_to == 0) + { + ivec2 x2 = ivec2(gx * 2) + ivec2(0, 1); + + ivec2 v_offset = x2; + + buffer_st4(top_blob_data, v_offset.r, v[0]); + buffer_st4(top_blob_data, v_offset.g, v[1]); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int x2 = gx * 2; + + image1d_st4(top_blob_1d, x2 + 0, v[0]); + image1d_st4(top_blob_1d, x2 + 1, v[1]); + } +#endif + } + else if (psc(dims) == 2) + { + if (storage_type_from == 0) + { + int gi = gy * psc(w) + gx; + + v = afpvec8(bottom_blob_fp32_data[gi]); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = afpvec8(texelFetch(bottom_blob_2d_fp32, ivec2(gx * 2, gy), 0), texelFetch(bottom_blob_2d_fp32, ivec2(gx * 2 + 1, gy), 0)); + } +#endif + + if (storage_type_to == 0) + { + ivec2 y2 = ivec2(gy * 2) + ivec2(0, 1); + + ivec2 v_offset = y2 * psc(outw) + gx; + + buffer_st4(top_blob_data, v_offset.r, v[0]); + buffer_st4(top_blob_data, v_offset.g, v[1]); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int y2 = gy * 2; + + image2d_st4(top_blob_2d, ivec2(gx, y2 + 0), v[0]); + image2d_st4(top_blob_2d, ivec2(gx, y2 + 1), v[1]); + } +#endif + } + else // if (psc(dims) == 3) + { + if (storage_type_from == 0) + { + int gi = gz * psc(cstep) + gy * psc(w) + gx; + + v = afpvec8(bottom_blob_fp32_data[gi]); + } +#if NCNN_image_shader + if (storage_type_from == 1) + { + v = afpvec8(texelFetch(bottom_blob_3d_fp32, ivec3(gx * 2, gy, gz), 0), texelFetch(bottom_blob_3d_fp32, ivec3(gx * 2 + 1, gy, gz), 0)); + } +#endif + + if (storage_type_to == 0) + { + ivec2 z2 = ivec2(gz * 2) + ivec2(0, 1); + + ivec2 v_offset = z2 * psc(outcstep) + ivec2(gy * psc(outw) + gx); + + buffer_st4(top_blob_data, v_offset.r, v[0]); + buffer_st4(top_blob_data, v_offset.g, v[1]); + } +#if NCNN_image_shader + if (storage_type_to == 1) + { + int z2 = gz * 2; + + image3d_st4(top_blob_3d, ivec3(gx, gy, z2 + 0), v[0]); + image3d_st4(top_blob_3d, ivec3(gx, gy, z2 + 1), v[1]); + } +#endif + } +} diff --git a/source/device/vulkan/shaders/padding.comp b/source/device/vulkan/shaders/padding.comp new file mode 100644 index 000000000..9a5dd1c8b --- /dev/null +++ b/source/device/vulkan/shaders/padding.comp @@ -0,0 +1,145 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int type = 1; +layout (constant_id = 1) const float value = 0; +layout (constant_id = 2) const int per_channel_pad = 0; + +#define shape_constant_id_offset 3 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +//layout (binding = 2) uniform unfp sampler1D per_channel_pad_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +//layout (binding = 2) readonly buffer per_channel_pad_blob { sfp per_channel_pad_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; + + int left; + int top; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + int x = gx - p.left; + int y = gy - p.top; + + if (type == 0) + { + if (x >= 0 && x < psc(w) && y >= 0 && y < psc(h)) + { +#if NCNN_image_shader + image3d_cp1(top_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(x, y, gz)); +#else + int v_offset = gz * psc(cstep) + y * psc(w) + x; + buffer_cp1(top_blob_data, gi, bottom_blob_data, v_offset); +#endif + } +/* + else if (per_channel_pad == 1) + { +#if NCNN_image_shader + afp v = image1d_ld1(per_channel_pad_blob, gz); + image3d_st1(top_blob, ivec3(gx, gy, gz), v); +#else + buffer_cp1(top_blob_data, gi, per_channel_pad_blob_data, gz); +#endif + } +*/ + else + { + afp v = afp(value); +#if NCNN_image_shader + image3d_st1(top_blob, ivec3(gx, gy, gz), v); +#else + buffer_st1(top_blob_data, gi, v); +#endif + } + } + if (type == 1) + { + x = clamp(x, 0, psc(w) - 1); + y = clamp(y, 0, psc(h) - 1); + +#if NCNN_image_shader + image3d_cp1(top_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(x, 
y, gz)); +#else + int v_offset = gz * psc(cstep) + y * psc(w) + x; + buffer_cp1(top_blob_data, gi, bottom_blob_data, v_offset); +#endif + } + if (type == 2) + { + x = abs(x); + y = abs(y); + x = (psc(w) - 1) - abs(x - (psc(w) - 1)); + y = (psc(h) - 1) - abs(y - (psc(h) - 1)); + +#if NCNN_image_shader + image3d_cp1(top_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(x, y, gz)); +#else + int v_offset = gz * psc(cstep) + y * psc(w) + x; + buffer_cp1(top_blob_data, gi, bottom_blob_data, v_offset); +#endif + } +} diff --git a/source/device/vulkan/shaders/padding_pack4.comp b/source/device/vulkan/shaders/padding_pack4.comp new file mode 100644 index 000000000..9f8cf99af --- /dev/null +++ b/source/device/vulkan/shaders/padding_pack4.comp @@ -0,0 +1,144 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int type = 1; +layout (constant_id = 1) const float value = 0; +layout (constant_id = 2) const int per_channel_pad = 0; + +#define shape_constant_id_offset 3 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +// layout (binding = 2) uniform unfp sampler1D per_channel_pad_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +// layout (binding = 2) readonly buffer per_channel_pad_blob { sfpvec4 per_channel_pad_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; + + int left; + int top; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = 
int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + int x = gx - p.left; + int y = gy - p.top; + + if (type == 0) + { + if (x >= 0 && x < psc(w) && y >= 0 && y < psc(h)) + { +#if NCNN_image_shader + image3d_cp4(top_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(x, y, gz)); +#else + int v_offset = gz * psc(cstep) + y * psc(w) + x; + buffer_cp4(top_blob_data, gi, bottom_blob_data, v_offset); +#endif + } +// else if (per_channel_pad == 1) +// { +// #if NCNN_image_shader +// afpvec4 v = image1d_ld4(per_channel_pad_blob, gz); +// image3d_st4(top_blob, ivec3(gx, gy, gz), v); +// #else +// buffer_cp4(top_blob_data, gi, per_channel_pad_blob_data, gz); +// #endif +// } + else + { + afpvec4 v = afpvec4(value); +#if NCNN_image_shader + image3d_st4(top_blob, ivec3(gx, gy, gz), v); +#else + buffer_st4(top_blob_data, gi, v); +#endif + } + + } + if (type == 1) + { + x = clamp(x, 0, psc(w) - 1); + y = clamp(y, 0, psc(h) - 1); + +#if NCNN_image_shader + image3d_cp4(top_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(x, y, gz)); +#else + int v_offset = gz * psc(cstep) + y * psc(w) + x; + buffer_cp4(top_blob_data, gi, bottom_blob_data, v_offset); +#endif + } + if (type == 2) + { + x = abs(x); + y = abs(y); + x = (psc(w) - 1) - abs(x - (psc(w) - 1)); + y = (psc(h) - 1) - abs(y - (psc(h) - 1)); + +#if NCNN_image_shader + image3d_cp4(top_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(x, y, gz)); +#else + int v_offset = gz * psc(cstep) + y * psc(w) + x; + buffer_cp4(top_blob_data, gi, bottom_blob_data, v_offset); +#endif + } +} diff --git a/source/device/vulkan/shaders/padding_pack8.comp b/source/device/vulkan/shaders/padding_pack8.comp new file mode 100644 index 000000000..b1d84887f --- /dev/null +++ b/source/device/vulkan/shaders/padding_pack8.comp @@ -0,0 +1,144 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
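+
+// Pads a pack8 blob; p.left / p.top give the offset of the source region inside the output.
+// type 0 copies the source where it overlaps and fills the border with the constant `value`
+// (the per_channel_pad path is commented out in this port), type 1 replicates the edge
+// (clamp), type 2 reflects across the edge.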
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int type = 1; +layout (constant_id = 1) const float value = 0; +layout (constant_id = 2) const int per_channel_pad = 0; + +#define shape_constant_id_offset 3 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +// layout (binding = 2) uniform unfp sampler1D per_channel_pad_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +// layout (binding = 2) readonly buffer per_channel_pad_blob { sfpvec8 per_channel_pad_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; + + int left; + int top; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + int x = gx - p.left; + int y = gy - p.top; + + if (type == 0) + { + if (x >= 0 && x < psc(w) && y >= 0 && y < psc(h)) + { +#if NCNN_image_shader + image3d_cp8(top_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(x, y, gz)); +#else + int v_offset = gz * psc(cstep) + y * psc(w) + x; + buffer_cp8(top_blob_data, gi, bottom_blob_data, v_offset); +#endif + } +// else if (per_channel_pad == 1) +// { +// #if NCNN_image_shader +// afpvec8 v = image1d_ld8(per_channel_pad_blob, gz); +// image3d_st8(top_blob, ivec3(gx, gy, gz), v); +// #else +// buffer_cp8(top_blob_data, gi, per_channel_pad_blob_data, gz); +// #endif +// } + else + { + afpvec8 v = afpvec8(afpvec4(value), afpvec4(value)); +#if NCNN_image_shader + image3d_st8(top_blob, ivec3(gx, gy, gz), v); +#else + buffer_st8(top_blob_data, gi, v); +#endif + } + } + if (type == 1) + { + x = clamp(x, 0, psc(w) - 1); + y = clamp(y, 0, psc(h) - 1); + +#if NCNN_image_shader + image3d_cp8(top_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(x, y, gz)); +#else + int v_offset = gz * psc(cstep) + y * psc(w) + x; + buffer_cp8(top_blob_data, gi, bottom_blob_data, v_offset); +#endif + } + if (type == 2) + { + x = abs(x); + y = abs(y); + x = (psc(w) - 1) - abs(x - (psc(w) - 1)); + y = (psc(h) - 1) - abs(y - (psc(h) - 1)); + +#if NCNN_image_shader + 
image3d_cp8(top_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(x, y, gz)); +#else + int v_offset = gz * psc(cstep) + y * psc(w) + x; + buffer_cp8(top_blob_data, gi, bottom_blob_data, v_offset); +#endif + } +} diff --git a/source/device/vulkan/shaders/permute.comp b/source/device/vulkan/shaders/permute.comp new file mode 100644 index 000000000..613734a68 --- /dev/null +++ b/source/device/vulkan/shaders/permute.comp @@ -0,0 +1,186 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int order_type = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + +#if NCNN_image_shader + if (psc(dims) == 2) + { + // order_type + // 0 = w h + // 1 = h w + + if (order_type == 0) + { + image2d_cp1(top_blob_2d, ivec2(gx, gy), bottom_blob_2d, ivec2(gx, gy)); + } + if (order_type == 1) + { + image2d_cp1(top_blob_2d, ivec2(gx, gy), bottom_blob_2d, ivec2(gy, gx)); + } + } + else // if (psc(dims) == 3) + { + // order_type + // 0 = w h c + // 1 = h w c + // 2 = w c h + // 3 
= c w h + // 4 = h c w + // 5 = c h w + + if (order_type == 0) + { + image3d_cp1(top_blob_3d, ivec3(gx, gy, gz), bottom_blob_3d, ivec3(gx, gy, gz)); + } + if (order_type == 1) + { + image3d_cp1(top_blob_3d, ivec3(gx, gy, gz), bottom_blob_3d, ivec3(gy, gx, gz)); + } + if (order_type == 2) + { + image3d_cp1(top_blob_3d, ivec3(gx, gy, gz), bottom_blob_3d, ivec3(gx, gz, gy)); + } + if (order_type == 3) + { + image3d_cp1(top_blob_3d, ivec3(gx, gy, gz), bottom_blob_3d, ivec3(gy, gz, gx)); + } + if (order_type == 4) + { + image3d_cp1(top_blob_3d, ivec3(gx, gy, gz), bottom_blob_3d, ivec3(gz, gx, gy)); + } + if (order_type == 5) + { + image3d_cp1(top_blob_3d, ivec3(gx, gy, gz), bottom_blob_3d, ivec3(gz, gy, gx)); + } + } +#else + int v_offset; + + if (psc(dims) == 2) + { + // order_type + // 0 = w h + // 1 = h w + + if (order_type == 0) + { + v_offset = gy * psc(w) + gx; + } + if (order_type == 1) + { + v_offset = gx * psc(w) + gy; + } + } + else // if (psc(dims) == 3) + { + // order_type + // 0 = w h c + // 1 = h w c + // 2 = w c h + // 3 = c w h + // 4 = h c w + // 5 = c h w + + if (order_type == 0) + { + v_offset = gz * psc(cstep) + gy * psc(w) + gx; + } + if (order_type == 1) + { + v_offset = gz * psc(cstep) + gx * psc(w) + gy; + } + if (order_type == 2) + { + v_offset = gy * psc(cstep) + gz * psc(w) + gx; + } + if (order_type == 3) + { + v_offset = gx * psc(cstep) + gz * psc(w) + gy; + } + if (order_type == 4) + { + v_offset = gy * psc(cstep) + gx * psc(w) + gz; + } + if (order_type == 5) + { + v_offset = gx * psc(cstep) + gy * psc(w) + gz; + } + } + + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_cp1(top_blob_data, gi, bottom_blob_data, v_offset); +#endif +} diff --git a/source/device/vulkan/shaders/permute_pack1to4.comp b/source/device/vulkan/shaders/permute_pack1to4.comp new file mode 100644 index 000000000..d1ad932ff --- /dev/null +++ b/source/device/vulkan/shaders/permute_pack1to4.comp @@ -0,0 +1,234 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
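+
+// Permutes a pack1 blob into a pack4 blob: order_type (0..5, see the table below) selects the
+// axis order, and each invocation gathers four scalars from the permuted source positions,
+// one per output lane, into a single pack4 element.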
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int order_type = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + +#if NCNN_image_shader + if (psc(dims) == 2) + { + // order_type + // 0 = w h + // 1 = h w + + afpvec4 v; + + if (order_type == 0) + { + ivec4 y4 = gy * 4 + ivec4(0, 1, 2, 3); + + v.r = image2d_ld1(bottom_blob_2d, ivec2(gx, y4.r)); + v.g = image2d_ld1(bottom_blob_2d, ivec2(gx, y4.g)); + v.b = image2d_ld1(bottom_blob_2d, ivec2(gx, y4.b)); + v.a = image2d_ld1(bottom_blob_2d, ivec2(gx, y4.a)); + } + if (order_type == 1) + { + ivec4 x4 = gy * 4 + ivec4(0, 1, 2, 3); + + v.r = image2d_ld1(bottom_blob_2d, ivec2(x4.r, gx)); + v.g = image2d_ld1(bottom_blob_2d, ivec2(x4.g, gx)); + v.b = image2d_ld1(bottom_blob_2d, ivec2(x4.b, gx)); + v.a = image2d_ld1(bottom_blob_2d, ivec2(x4.a, gx)); + } + + image2d_st4(top_blob_2d, ivec2(gx, gy), v); + } + else // if (psc(dims) == 3) + { + // order_type + // 0 = w h c + // 1 = h w c + // 2 = w c h + // 3 = c w h + // 4 = h c w + // 5 = c h w + + afpvec4 v; + + if (order_type == 0) + { + ivec4 z4 = gz * 4 + ivec4(0, 1, 2, 3); + + v.r = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4.r)); + v.g = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4.g)); + v.b = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4.b)); + v.a = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4.a)); + } + if (order_type == 1) + { + ivec4 z4 = gz * 4 + ivec4(0, 1, 2, 3); + + v.r = image3d_ld1(bottom_blob_3d, ivec3(gy, gx, z4.r)); + v.g = image3d_ld1(bottom_blob_3d, ivec3(gy, gx, z4.g)); + v.b = image3d_ld1(bottom_blob_3d, ivec3(gy, gx, z4.b)); + v.a = image3d_ld1(bottom_blob_3d, ivec3(gy, gx, z4.a)); + } + if (order_type == 2) + { + ivec4 y4 = gz * 4 + 
ivec4(0, 1, 2, 3); + + v.r = image3d_ld1(bottom_blob_3d, ivec3(gx, y4.r, gy)); + v.g = image3d_ld1(bottom_blob_3d, ivec3(gx, y4.g, gy)); + v.b = image3d_ld1(bottom_blob_3d, ivec3(gx, y4.b, gy)); + v.a = image3d_ld1(bottom_blob_3d, ivec3(gx, y4.a, gy)); + } + if (order_type == 3) + { + ivec4 y4 = gz * 4 + ivec4(0, 1, 2, 3); + + v.r = image3d_ld1(bottom_blob_3d, ivec3(gy, y4.r, gx)); + v.g = image3d_ld1(bottom_blob_3d, ivec3(gy, y4.g, gx)); + v.b = image3d_ld1(bottom_blob_3d, ivec3(gy, y4.b, gx)); + v.a = image3d_ld1(bottom_blob_3d, ivec3(gy, y4.a, gx)); + } + if (order_type == 4) + { + ivec4 x4 = gz * 4 + ivec4(0, 1, 2, 3); + + v.r = image3d_ld1(bottom_blob_3d, ivec3(x4.r, gx, gy)); + v.g = image3d_ld1(bottom_blob_3d, ivec3(x4.g, gx, gy)); + v.b = image3d_ld1(bottom_blob_3d, ivec3(x4.b, gx, gy)); + v.a = image3d_ld1(bottom_blob_3d, ivec3(x4.a, gx, gy)); + } + if (order_type == 5) + { + ivec4 x4 = gz * 4 + ivec4(0, 1, 2, 3); + + v.r = image3d_ld1(bottom_blob_3d, ivec3(x4.r, gy, gx)); + v.g = image3d_ld1(bottom_blob_3d, ivec3(x4.g, gy, gx)); + v.b = image3d_ld1(bottom_blob_3d, ivec3(x4.b, gy, gx)); + v.a = image3d_ld1(bottom_blob_3d, ivec3(x4.a, gy, gx)); + } + + image3d_st4(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else + ivec4 v_offset; + + if (psc(dims) == 2) + { + // order_type + // 0 = w h + // 1 = h w + + if (order_type == 0) + { + v_offset = (gy * 4 + ivec4(0, 1, 2, 3)) * psc(w) + gx; + } + if (order_type == 1) + { + v_offset = gx * psc(w) + (gy * 4 + ivec4(0, 1, 2, 3)); + } + } + else // if (psc(dims) == 3) + { + // order_type + // 0 = w h c + // 1 = h w c + // 2 = w c h + // 3 = c w h + // 4 = h c w + // 5 = c h w + + if (order_type == 0) + { + v_offset = (gz * 4 + ivec4(0, 1, 2, 3)) * psc(cstep) + gy * psc(w) + gx; + } + if (order_type == 1) + { + v_offset = (gz * 4 + ivec4(0, 1, 2, 3)) * psc(cstep) + gx * psc(w) + gy; + } + if (order_type == 2) + { + v_offset = gy * psc(cstep) + (gz * 4 + ivec4(0, 1, 2, 3)) * psc(w) + gx; + } + if (order_type == 3) + { + v_offset = gx * psc(cstep) + (gz * 4 + ivec4(0, 1, 2, 3)) * psc(w) + gy; + } + if (order_type == 4) + { + v_offset = gy * psc(cstep) + gx * psc(w) + (gz * 4 + ivec4(0, 1, 2, 3)); + } + if (order_type == 5) + { + v_offset = gx * psc(cstep) + gy * psc(w) + (gz * 4 + ivec4(0, 1, 2, 3)); + } + } + + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_cp1to4(top_blob_data, gi, bottom_blob_data, v_offset); +#endif +} diff --git a/source/device/vulkan/shaders/permute_pack1to8.comp b/source/device/vulkan/shaders/permute_pack1to8.comp new file mode 100644 index 000000000..816a94268 --- /dev/null +++ b/source/device/vulkan/shaders/permute_pack1to8.comp @@ -0,0 +1,284 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
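+
+// Permutes a pack1 blob into a pack8 blob: same axis reordering as the pack1-to-4 shader,
+// but each invocation gathers eight scalars (two groups of four offsets) from the permuted
+// source positions into a single pack8 element.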
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int order_type = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + +#if NCNN_image_shader + if (psc(dims) == 2) + { + // order_type + // 0 = w h + // 1 = h w + + afpvec8 v; + + if (order_type == 0) + { + ivec4 y4 = gy * 8 + ivec4(0, 1, 2, 3); + ivec4 yy4 = y4 + 4; + + v[0].r = image2d_ld1(bottom_blob_2d, ivec2(gx, y4.r)); + v[0].g = image2d_ld1(bottom_blob_2d, ivec2(gx, y4.g)); + v[0].b = image2d_ld1(bottom_blob_2d, ivec2(gx, y4.b)); + v[0].a = image2d_ld1(bottom_blob_2d, ivec2(gx, y4.a)); + v[1].r = image2d_ld1(bottom_blob_2d, ivec2(gx, yy4.r)); + v[1].g = image2d_ld1(bottom_blob_2d, ivec2(gx, yy4.g)); + v[1].b = image2d_ld1(bottom_blob_2d, ivec2(gx, yy4.b)); + v[1].a = image2d_ld1(bottom_blob_2d, ivec2(gx, yy4.a)); + } + if (order_type == 1) + { + ivec4 x4 = gy * 8 + ivec4(0, 1, 2, 3); + ivec4 xx4 = x4 + 4; + + v[0].r = image2d_ld1(bottom_blob_2d, ivec2(x4.r, gx)); + v[0].g = image2d_ld1(bottom_blob_2d, ivec2(x4.g, gx)); + v[0].b = image2d_ld1(bottom_blob_2d, ivec2(x4.b, gx)); + v[0].a = image2d_ld1(bottom_blob_2d, ivec2(x4.a, gx)); + v[1].r = image2d_ld1(bottom_blob_2d, ivec2(xx4.r, gx)); + v[1].g = image2d_ld1(bottom_blob_2d, ivec2(xx4.g, gx)); + v[1].b = image2d_ld1(bottom_blob_2d, ivec2(xx4.b, gx)); + v[1].a = image2d_ld1(bottom_blob_2d, ivec2(xx4.a, gx)); + } + + image2d_st8(top_blob_2d, ivec2(gx, gy), v); + } + else // if (psc(dims) == 3) + { + // order_type + // 0 = w h c + // 1 = h w c + // 2 = w c h + // 3 = c w h + // 4 = h c w + // 5 = c h w + + afpvec8 v; + + if (order_type == 0) + { + ivec4 z4 = gz * 8 + ivec4(0, 1, 2, 3); + ivec4 zz4 = 
z4 + 4; + + v[0].r = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4.r)); + v[0].g = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4.g)); + v[0].b = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4.b)); + v[0].a = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, z4.a)); + v[1].r = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, zz4.r)); + v[1].g = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, zz4.g)); + v[1].b = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, zz4.b)); + v[1].a = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, zz4.a)); + } + if (order_type == 1) + { + ivec4 z4 = gz * 8 + ivec4(0, 1, 2, 3); + ivec4 zz4 = z4 + 4; + + v[0].r = image3d_ld1(bottom_blob_3d, ivec3(gy, gx, z4.r)); + v[0].g = image3d_ld1(bottom_blob_3d, ivec3(gy, gx, z4.g)); + v[0].b = image3d_ld1(bottom_blob_3d, ivec3(gy, gx, z4.b)); + v[0].a = image3d_ld1(bottom_blob_3d, ivec3(gy, gx, z4.a)); + v[1].r = image3d_ld1(bottom_blob_3d, ivec3(gy, gx, zz4.r)); + v[1].g = image3d_ld1(bottom_blob_3d, ivec3(gy, gx, zz4.g)); + v[1].b = image3d_ld1(bottom_blob_3d, ivec3(gy, gx, zz4.b)); + v[1].a = image3d_ld1(bottom_blob_3d, ivec3(gy, gx, zz4.a)); + } + if (order_type == 2) + { + ivec4 y4 = gz * 8 + ivec4(0, 1, 2, 3); + ivec4 yy4 = y4 + 4; + + v[0].r = image3d_ld1(bottom_blob_3d, ivec3(gx, y4.r, gy)); + v[0].g = image3d_ld1(bottom_blob_3d, ivec3(gx, y4.g, gy)); + v[0].b = image3d_ld1(bottom_blob_3d, ivec3(gx, y4.b, gy)); + v[0].a = image3d_ld1(bottom_blob_3d, ivec3(gx, y4.a, gy)); + v[1].r = image3d_ld1(bottom_blob_3d, ivec3(gx, yy4.r, gy)); + v[1].g = image3d_ld1(bottom_blob_3d, ivec3(gx, yy4.g, gy)); + v[1].b = image3d_ld1(bottom_blob_3d, ivec3(gx, yy4.b, gy)); + v[1].a = image3d_ld1(bottom_blob_3d, ivec3(gx, yy4.a, gy)); + } + if (order_type == 3) + { + ivec4 y4 = gz * 8 + ivec4(0, 1, 2, 3); + ivec4 yy4 = y4 + 4; + + v[0].r = image3d_ld1(bottom_blob_3d, ivec3(gy, y4.r, gx)); + v[0].g = image3d_ld1(bottom_blob_3d, ivec3(gy, y4.g, gx)); + v[0].b = image3d_ld1(bottom_blob_3d, ivec3(gy, y4.b, gx)); + v[0].a = image3d_ld1(bottom_blob_3d, ivec3(gy, y4.a, gx)); + v[1].r = image3d_ld1(bottom_blob_3d, ivec3(gy, yy4.r, gx)); + v[1].g = image3d_ld1(bottom_blob_3d, ivec3(gy, yy4.g, gx)); + v[1].b = image3d_ld1(bottom_blob_3d, ivec3(gy, yy4.b, gx)); + v[1].a = image3d_ld1(bottom_blob_3d, ivec3(gy, yy4.a, gx)); + } + if (order_type == 4) + { + ivec4 x4 = gz * 8 + ivec4(0, 1, 2, 3); + ivec4 xx4 = x4 + 4; + + v[0].r = image3d_ld1(bottom_blob_3d, ivec3(x4.r, gx, gy)); + v[0].g = image3d_ld1(bottom_blob_3d, ivec3(x4.g, gx, gy)); + v[0].b = image3d_ld1(bottom_blob_3d, ivec3(x4.b, gx, gy)); + v[0].a = image3d_ld1(bottom_blob_3d, ivec3(x4.a, gx, gy)); + v[1].r = image3d_ld1(bottom_blob_3d, ivec3(xx4.r, gx, gy)); + v[1].g = image3d_ld1(bottom_blob_3d, ivec3(xx4.g, gx, gy)); + v[1].b = image3d_ld1(bottom_blob_3d, ivec3(xx4.b, gx, gy)); + v[1].a = image3d_ld1(bottom_blob_3d, ivec3(xx4.a, gx, gy)); + } + if (order_type == 5) + { + ivec4 x4 = gz * 8 + ivec4(0, 1, 2, 3); + ivec4 xx4 = x4 + 4; + + v[0].r = image3d_ld1(bottom_blob_3d, ivec3(x4.r, gy, gx)); + v[0].g = image3d_ld1(bottom_blob_3d, ivec3(x4.g, gy, gx)); + v[0].b = image3d_ld1(bottom_blob_3d, ivec3(x4.b, gy, gx)); + v[0].a = image3d_ld1(bottom_blob_3d, ivec3(x4.a, gy, gx)); + v[1].r = image3d_ld1(bottom_blob_3d, ivec3(xx4.r, gy, gx)); + v[1].g = image3d_ld1(bottom_blob_3d, ivec3(xx4.g, gy, gx)); + v[1].b = image3d_ld1(bottom_blob_3d, ivec3(xx4.b, gy, gx)); + v[1].a = image3d_ld1(bottom_blob_3d, ivec3(xx4.a, gy, gx)); + } + + image3d_st8(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else + ivec4 v_offset; + ivec4 vv_offset; + + if 
(psc(dims) == 2) + { + // order_type + // 0 = w h + // 1 = h w + + if (order_type == 0) + { + v_offset = (gy * 8 + ivec4(0, 1, 2, 3)) * psc(w) + gx; + vv_offset = v_offset + 4 * psc(w); + } + if (order_type == 1) + { + v_offset = gx * psc(w) + (gy * 8 + ivec4(0, 1, 2, 3)); + vv_offset = v_offset + 4; + } + } + else // if (psc(dims) == 3) + { + // order_type + // 0 = w h c + // 1 = h w c + // 2 = w c h + // 3 = c w h + // 4 = h c w + // 5 = c h w + + if (order_type == 0) + { + v_offset = (gz * 8 + ivec4(0, 1, 2, 3)) * psc(cstep) + gy * psc(w) + gx; + vv_offset = v_offset + 4 * psc(cstep); + } + if (order_type == 1) + { + v_offset = (gz * 8 + ivec4(0, 1, 2, 3)) * psc(cstep) + gx * psc(w) + gy; + vv_offset = v_offset + 4 * psc(cstep); + } + if (order_type == 2) + { + v_offset = gy * psc(cstep) + (gz * 8 + ivec4(0, 1, 2, 3)) * psc(w) + gx; + vv_offset = v_offset + 4 * psc(w); + } + if (order_type == 3) + { + v_offset = gx * psc(cstep) + (gz * 8 + ivec4(0, 1, 2, 3)) * psc(w) + gy; + vv_offset = v_offset + 4 * psc(w); + } + if (order_type == 4) + { + v_offset = gy * psc(cstep) + gx * psc(w) + (gz * 8 + ivec4(0, 1, 2, 3)); + vv_offset = v_offset + 4; + } + if (order_type == 5) + { + v_offset = gx * psc(cstep) + gy * psc(w) + (gz * 8 + ivec4(0, 1, 2, 3)); + vv_offset = v_offset + 4; + } + } + + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_cp1to8(top_blob_data, gi, bottom_blob_data, v_offset, vv_offset); +#endif +} diff --git a/source/device/vulkan/shaders/permute_pack4.comp b/source/device/vulkan/shaders/permute_pack4.comp new file mode 100644 index 000000000..3e1ff6ef8 --- /dev/null +++ b/source/device/vulkan/shaders/permute_pack4.comp @@ -0,0 +1,281 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
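+
+// Permutes a pack4 blob while keeping the pack4 layout: for each output element it computes
+// a flat source index per lane (i4), converts each index back into a pack4 element offset and
+// lane (or a pack2 offset and lane when NCNN_fp16_packed is enabled), and gathers the four
+// values into the output element.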
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int order_type = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else +#if NCNN_fp16_packed +layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; }; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +#endif +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + +#if NCNN_image_shader + if (psc(dims) == 2) + { + // order_type + // 0 = w h + // 1 = h w + + ivec4 i4; + + if (order_type == 0) + { + i4 = ivec4((gy * 4) * psc(w) + gx) + ivec4(0, 1, 2, 3) * psc(w); + } + if (order_type == 1) + { + i4 = ivec4(gx * psc(w) + gy * 4) + ivec4(0, 1, 2, 3); + } + + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + + afpvec4 vr = image2d_ld4(bottom_blob_2d, ivec2(x4.r, y4.r / 4)); + afpvec4 vg = image2d_ld4(bottom_blob_2d, ivec2(x4.g, y4.g / 4)); + afpvec4 vb = image2d_ld4(bottom_blob_2d, ivec2(x4.b, y4.b / 4)); + afpvec4 va = image2d_ld4(bottom_blob_2d, ivec2(x4.a, y4.a / 4)); + + ivec4 lane4 = y4 % 4; + + afpvec4 v = afpvec4(vr[lane4.r], vg[lane4.g], vb[lane4.b], va[lane4.a]); + + image2d_st4(top_blob_2d, ivec2(gx, gy), v); + } + else // if (psc(dims) == 3) + { + // order_type + // 0 = w h c + // 1 = h w c + // 2 = w c h + // 3 = c w h + // 4 = h c w + // 5 = c h w + + int size = psc(w) * psc(h); + + ivec4 i4; + + if (order_type == 0) + { + i4 = ivec4((gz * 4) * size + gy * psc(w) + gx) + ivec4(0, 1, 2, 3) * size; + } + if (order_type == 1) + { + i4 = ivec4((gz * 4) * size + gx * psc(w) + gy) + ivec4(0, 1, 2, 3) * size; + } + if (order_type == 2) + { + i4 = ivec4(gy * size + (gz * 4) * psc(w) + gx) + ivec4(0, 1, 2, 3) * psc(w); + } + if (order_type == 3) + { + i4 = ivec4(gx * size+ (gz * 4) * psc(w) + gy) + ivec4(0, 1, 2, 3) * psc(w); + } + if (order_type == 4) + { + i4 = ivec4(gy * 
size + gx * psc(w) + gz * 4) + ivec4(0, 1, 2, 3); + } + if (order_type == 5) + { + i4 = ivec4(gx * size + gy * psc(w) + gz * 4) + ivec4(0, 1, 2, 3); + } + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + + afpvec4 vr = image3d_ld4(bottom_blob_3d, ivec3(x4.r, y4.r, z4.r / 4)); + afpvec4 vg = image3d_ld4(bottom_blob_3d, ivec3(x4.g, y4.g, z4.g / 4)); + afpvec4 vb = image3d_ld4(bottom_blob_3d, ivec3(x4.b, y4.b, z4.b / 4)); + afpvec4 va = image3d_ld4(bottom_blob_3d, ivec3(x4.a, y4.a, z4.a / 4)); + + ivec4 lane4 = z4 % 4; + + afpvec4 v = afpvec4(vr[lane4.r], vg[lane4.g], vb[lane4.b], va[lane4.a]); + + image3d_st4(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else + ivec4 i4; + + if (psc(dims) == 2) + { + // order_type + // 0 = w h + // 1 = h w + + if (order_type == 0) + { + i4 = ivec4((gy * 4) * psc(w) + gx) + ivec4(0, 1, 2, 3) * psc(w); + } + if (order_type == 1) + { + i4 = ivec4(gx * psc(w) + gy * 4) + ivec4(0, 1, 2, 3); + } + } + else // if (psc(dims) == 3) + { + // order_type + // 0 = w h c + // 1 = h w c + // 2 = w c h + // 3 = c w h + // 4 = h c w + // 5 = c h w + + if (order_type == 0) + { + i4 = ivec4((gz * 4) * psc(cstep) + gy * psc(w) + gx) + ivec4(0, 1, 2, 3) * psc(cstep); + } + if (order_type == 1) + { + i4 = ivec4((gz * 4) * psc(cstep) + gx * psc(w) + gy) + ivec4(0, 1, 2, 3) * psc(cstep); + } + if (order_type == 2) + { + i4 = ivec4(gy * psc(cstep) + (gz * 4) * psc(w) + gx) + ivec4(0, 1, 2, 3) * psc(w); + } + if (order_type == 3) + { + i4 = ivec4(gx * psc(cstep) + (gz * 4) * psc(w) + gy) + ivec4(0, 1, 2, 3) * psc(w); + } + if (order_type == 4) + { + i4 = ivec4(gy * psc(cstep) + gx * psc(w) + gz * 4) + ivec4(0, 1, 2, 3); + } + if (order_type == 5) + { + i4 = ivec4(gx * psc(cstep) + gy * psc(w) + gz * 4) + ivec4(0, 1, 2, 3); + } + } + +#if NCNN_fp16_packed + ivec4 v_offset; + ivec4 lane2; + + if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + + v_offset = ((y4 / 4) * psc(w) + x4) * 2 + (y4 % 4) / 2; + lane2 = y4 % 2; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + + v_offset = ((z4 / 4) * psc(cstep) + y4 * psc(w) + x4) * 2 + (z4 % 4) / 2; + lane2 = z4 % 2; + } + + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r); + afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g); + afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b); + afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a); + + afpvec4 v = afpvec4(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a]); + + buffer_st4(top_blob_data, gi, v); +#else + ivec4 v_offset; + + if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + + v_offset = ((y4 / 4) * psc(w) + x4) * 4 + y4 % 4; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + + v_offset = ((z4 / 4) * psc(cstep) + y4 * psc(w) + x4) * 4 + z4 % 4; + } + + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_cp1to4(top_blob_data, gi, bottom_blob_data, v_offset); +#endif +#endif +} diff --git a/source/device/vulkan/shaders/permute_pack4to1.comp b/source/device/vulkan/shaders/permute_pack4to1.comp new file mode 100644 index 000000000..5d33904d0 --- /dev/null +++ b/source/device/vulkan/shaders/permute_pack4to1.comp @@ -0,0 +1,230 @@ +// Tencent is pleased to support the open source community by making ncnn 
available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int order_type = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + if (psc(dims) == 2) + { + // order_type + // 0 = w h + // 1 = h w + + afpvec4 v = image2d_ld4(bottom_blob_2d, ivec2(gx, gy)); + + if (order_type == 0) + { + ivec4 y4 = gy * 4 + ivec4(0, 1, 2, 3); + + image2d_st1(top_blob_2d, ivec2(gx, y4.r), v.r); + image2d_st1(top_blob_2d, ivec2(gx, y4.g), v.g); + image2d_st1(top_blob_2d, ivec2(gx, y4.b), v.b); + image2d_st1(top_blob_2d, ivec2(gx, y4.a), v.a); + } + if (order_type == 1) + { + ivec4 x4 = gy * 4 + ivec4(0, 1, 2, 3); + + image2d_st1(top_blob_2d, ivec2(x4.r, gx), v.r); + image2d_st1(top_blob_2d, ivec2(x4.g, gx), v.g); + image2d_st1(top_blob_2d, ivec2(x4.b, gx), v.b); + image2d_st1(top_blob_2d, ivec2(x4.a, gx), v.a); + } + } + else // if (psc(dims) == 3) + { + // order_type + // 0 = w h c + // 1 = h w c + // 2 = w c h + // 3 = c w h + // 4 = h c w + // 5 = c h w + + afpvec4 v = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, gz)); + + if 
(order_type == 0) + { + ivec4 z4 = gz * 4 + ivec4(0, 1, 2, 3); + + image3d_st1(top_blob_3d, ivec3(gx, gy, z4.r), v.r); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4.g), v.g); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4.b), v.b); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4.a), v.a); + } + if (order_type == 1) + { + ivec4 z4 = gz * 4 + ivec4(0, 1, 2, 3); + + image3d_st1(top_blob_3d, ivec3(gy, gx, z4.r), v.r); + image3d_st1(top_blob_3d, ivec3(gy, gx, z4.g), v.g); + image3d_st1(top_blob_3d, ivec3(gy, gx, z4.b), v.b); + image3d_st1(top_blob_3d, ivec3(gy, gx, z4.a), v.a); + } + if (order_type == 2) + { + ivec4 y4 = gz * 4 + ivec4(0, 1, 2, 3); + + image3d_st1(top_blob_3d, ivec3(gx, y4.r, gy), v.r); + image3d_st1(top_blob_3d, ivec3(gx, y4.g, gy), v.g); + image3d_st1(top_blob_3d, ivec3(gx, y4.b, gy), v.b); + image3d_st1(top_blob_3d, ivec3(gx, y4.a, gy), v.a); + } + if (order_type == 3) + { + ivec4 x4 = gz * 4 + ivec4(0, 1, 2, 3); + + image3d_st1(top_blob_3d, ivec3(x4.r, gx, gy), v.r); + image3d_st1(top_blob_3d, ivec3(x4.g, gx, gy), v.g); + image3d_st1(top_blob_3d, ivec3(x4.b, gx, gy), v.b); + image3d_st1(top_blob_3d, ivec3(x4.a, gx, gy), v.a); + } + if (order_type == 4) + { + ivec4 y4 = gz * 4 + ivec4(0, 1, 2, 3); + + image3d_st1(top_blob_3d, ivec3(gy, y4.r, gx), v.r); + image3d_st1(top_blob_3d, ivec3(gy, y4.g, gx), v.g); + image3d_st1(top_blob_3d, ivec3(gy, y4.b, gx), v.b); + image3d_st1(top_blob_3d, ivec3(gy, y4.a, gx), v.a); + } + if (order_type == 5) + { + ivec4 x4 = gz * 4 + ivec4(0, 1, 2, 3); + + image3d_st1(top_blob_3d, ivec3(x4.r, gy, gx), v.r); + image3d_st1(top_blob_3d, ivec3(x4.g, gy, gx), v.g); + image3d_st1(top_blob_3d, ivec3(x4.b, gy, gx), v.b); + image3d_st1(top_blob_3d, ivec3(x4.a, gy, gx), v.a); + } + } +#else + ivec4 v_offset; + + if (psc(dims) == 2) + { + // order_type + // 0 = w h + // 1 = h w + + if (order_type == 0) + { + v_offset = ivec4((gy * 4) * psc(outw) + gx) + ivec4(0, 1, 2, 3) * psc(outw); + } + if (order_type == 1) + { + v_offset = ivec4(gx * psc(outw) + gy * 4) + ivec4(0, 1, 2, 3); + } + } + else // if (psc(dims) == 3) + { + // order_type + // 0 = w h c + // 1 = h w c + // 2 = w c h + // 3 = c w h + // 4 = h c w + // 5 = c h w + + if (order_type == 0) + { + v_offset = ivec4((gz * 4) * psc(outcstep) + gy * psc(outw) + gx) + ivec4(0, 1, 2, 3) * psc(outcstep); + } + if (order_type == 1) + { + v_offset = ivec4((gz * 4) * psc(outcstep) + gx * psc(outw) + gy) + ivec4(0, 1, 2, 3) * psc(outcstep); + } + if (order_type == 2) + { + v_offset = ivec4(gy * psc(outcstep) + (gz * 4) * psc(outw) + gx) + ivec4(0, 1, 2, 3) * psc(outw); + } + if (order_type == 3) + { + v_offset = ivec4(gy * psc(outcstep) + gx * psc(outw) + gz * 4) + ivec4(0, 1, 2, 3); + } + if (order_type == 4) + { + v_offset = ivec4(gx * psc(outcstep) + (gz * 4) * psc(outw) + gy) + ivec4(0, 1, 2, 3) * psc(outw); + } + if (order_type == 5) + { + v_offset = ivec4(gx * psc(outcstep) + gy * psc(outw) + gz * 4) + ivec4(0, 1, 2, 3); + } + } + + int gi = gz * psc(cstep) + gy * psc(w) + gx; + + buffer_cp4to1(top_blob_data, v_offset, bottom_blob_data, gi); +#endif +} diff --git a/source/device/vulkan/shaders/permute_pack4to8.comp b/source/device/vulkan/shaders/permute_pack4to8.comp new file mode 100644 index 000000000..f35abd828 --- /dev/null +++ b/source/device/vulkan/shaders/permute_pack4to8.comp @@ -0,0 +1,350 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. 
+// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int order_type = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else +#if NCNN_fp16_packed +layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; }; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +#endif +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + +#if NCNN_image_shader + if (psc(dims) == 2) + { + // order_type + // 0 = w h + // 1 = h w + + ivec4 i4; + ivec4 ii4; + + if (order_type == 0) + { + i4 = ivec4((gy * 8) * psc(w) + gx) + ivec4(0, 1, 2, 3) * psc(w); + ii4 = i4 + 4 * psc(w); + } + if (order_type == 1) + { + i4 = ivec4(gx * psc(w) + gy * 8) + ivec4(0, 1, 2, 3); + ii4 = i4 + 4; + } + + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + ivec4 yy4 = ii4 / psc(w); + ivec4 xx4 = ii4 % psc(w); + + afpvec4 v0 = image2d_ld4(bottom_blob_2d, ivec2(x4.r, y4.r / 4)); + afpvec4 v1 = image2d_ld4(bottom_blob_2d, ivec2(x4.g, y4.g / 4)); + afpvec4 v2 = image2d_ld4(bottom_blob_2d, ivec2(x4.b, y4.b / 4)); + afpvec4 v3 = image2d_ld4(bottom_blob_2d, ivec2(x4.a, y4.a / 4)); + afpvec4 v4 = image2d_ld4(bottom_blob_2d, ivec2(xx4.r, yy4.r / 4)); + afpvec4 v5 = image2d_ld4(bottom_blob_2d, 
ivec2(xx4.g, yy4.g / 4)); + afpvec4 v6 = image2d_ld4(bottom_blob_2d, ivec2(xx4.b, yy4.b / 4)); + afpvec4 v7 = image2d_ld4(bottom_blob_2d, ivec2(xx4.a, yy4.a / 4)); + + afpvec8 v; + v[0].r = v0[y4.r % 4]; + v[0].g = v1[y4.g % 4]; + v[0].b = v2[y4.b % 4]; + v[0].a = v3[y4.a % 4]; + v[1].r = v4[yy4.r % 4]; + v[1].g = v5[yy4.g % 4]; + v[1].b = v6[yy4.b % 4]; + v[1].a = v7[yy4.a % 4]; + + image2d_st8(top_blob_2d, ivec2(gx, gy), v); + } + else // if (psc(dims) == 3) + { + // order_type + // 0 = w h c + // 1 = h w c + // 2 = w c h + // 3 = c w h + // 4 = h c w + // 5 = c h w + + int size = psc(w) * psc(h); + + ivec4 i4; + ivec4 ii4; + + if (order_type == 0) + { + i4 = ivec4((gz * 8) * size + gy * psc(w) + gx) + ivec4(0, 1, 2, 3) * size; + ii4 = i4 + 4 * size; + } + if (order_type == 1) + { + i4 = ivec4((gz * 8) * size + gx * psc(w) + gy) + ivec4(0, 1, 2, 3) * size; + ii4 = i4 + 4 * size; + } + if (order_type == 2) + { + i4 = ivec4(gy * size + (gz * 8) * psc(w) + gx) + ivec4(0, 1, 2, 3) * psc(w); + ii4 = i4 + 4 * psc(w); + } + if (order_type == 3) + { + i4 = ivec4(gx * size+ (gz * 8) * psc(w) + gy) + ivec4(0, 1, 2, 3) * psc(w); + ii4 = i4 + 4 * psc(w); + } + if (order_type == 4) + { + i4 = ivec4(gy * size + gx * psc(w) + gz * 8) + ivec4(0, 1, 2, 3); + ii4 = i4 + 4; + } + if (order_type == 5) + { + i4 = ivec4(gx * size + gy * psc(w) + gz * 8) + ivec4(0, 1, 2, 3); + ii4 = i4 + 4; + } + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + ivec4 zz4 = ii4 / size; + ivec4 yy4 = ii4 % size / psc(w); + ivec4 xx4 = ii4 % size % psc(w); + + afpvec4 v0 = image3d_ld4(bottom_blob_3d, ivec3(x4.r, y4.r, z4.r / 4)); + afpvec4 v1 = image3d_ld4(bottom_blob_3d, ivec3(x4.g, y4.g, z4.g / 4)); + afpvec4 v2 = image3d_ld4(bottom_blob_3d, ivec3(x4.b, y4.b, z4.b / 4)); + afpvec4 v3 = image3d_ld4(bottom_blob_3d, ivec3(x4.a, y4.a, z4.a / 4)); + afpvec4 v4 = image3d_ld4(bottom_blob_3d, ivec3(xx4.r, yy4.r, zz4.r / 4)); + afpvec4 v5 = image3d_ld4(bottom_blob_3d, ivec3(xx4.g, yy4.g, zz4.g / 4)); + afpvec4 v6 = image3d_ld4(bottom_blob_3d, ivec3(xx4.b, yy4.b, zz4.b / 4)); + afpvec4 v7 = image3d_ld4(bottom_blob_3d, ivec3(xx4.a, yy4.a, zz4.a / 4)); + + afpvec8 v; + v[0].r = v0[z4.r % 4]; + v[0].g = v1[z4.g % 4]; + v[0].b = v2[z4.b % 4]; + v[0].a = v3[z4.a % 4]; + v[1].r = v4[zz4.r % 4]; + v[1].g = v5[zz4.g % 4]; + v[1].b = v6[zz4.b % 4]; + v[1].a = v7[zz4.a % 4]; + + image3d_st8(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else + ivec4 i4; + ivec4 ii4; + + if (psc(dims) == 2) + { + // order_type + // 0 = w h + // 1 = h w + + if (order_type == 0) + { + i4 = (gy * 8 + ivec4(0, 1, 2, 3)) * psc(w) + gx; + ii4 = i4 + 4 * psc(w); + } + if (order_type == 1) + { + i4 = gx * psc(w) + (gy * 8 + ivec4(0, 1, 2, 3)); + ii4 = i4 + 4; + } + } + else // if (psc(dims) == 3) + { + // order_type + // 0 = w h c + // 1 = h w c + // 2 = w c h + // 3 = c w h + // 4 = h c w + // 5 = c h w + + if (order_type == 0) + { + i4 = (gz * 8 + ivec4(0, 1, 2, 3)) * psc(cstep) + gy * psc(w) + gx; + ii4 = i4 + 4 * psc(cstep); + } + if (order_type == 1) + { + i4 = (gz * 8 + ivec4(0, 1, 2, 3)) * psc(cstep) + gx * psc(w) + gy; + ii4 = i4 + 4 * psc(cstep); + } + if (order_type == 2) + { + i4 = gy * psc(cstep) + (gz * 8 + ivec4(0, 1, 2, 3)) * psc(w) + gx; + ii4 = i4 + 4 * psc(w); + } + if (order_type == 3) + { + i4 = gx * psc(cstep) + (gz * 8 + ivec4(0, 1, 2, 3)) * psc(w) + gy; + ii4 = i4 + 4 * psc(w); + } + if (order_type == 4) + { + i4 = gy * psc(cstep) + gx * psc(w) + (gz * 8 + ivec4(0, 1, 2, 3)); + ii4 = i4 + 4; + } + if (order_type 
== 5) + { + i4 = gx * psc(cstep) + gy * psc(w) + (gz * 8 + ivec4(0, 1, 2, 3)); + ii4 = i4 + 4; + } + } + +#if NCNN_fp16_packed + ivec4 v_offset; + ivec4 vv_offset; + ivec4 lane2; + ivec4 lane4; + + if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + ivec4 yy4 = ii4 / psc(w); + ivec4 xx4 = ii4 % psc(w); + + v_offset = ((y4 / 4) * psc(w) + x4) * 2 + (y4 % 4) / 2; + lane2 = y4 % 2; + vv_offset = ((yy4 / 4) * psc(w) + xx4) * 2 + (yy4 % 4) / 2; + lane4 = yy4 % 2; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + ivec4 zz4 = ii4 / size; + ivec4 yy4 = ii4 % size / psc(w); + ivec4 xx4 = ii4 % size % psc(w); + + v_offset = ((z4 / 4) * psc(cstep) + y4 * psc(w) + x4) * 2 + (z4 % 4) / 2; + lane2 = z4 % 2; + vv_offset = ((zz4 / 4) * psc(cstep) + yy4 * psc(w) + xx4) * 2 + (zz4 % 4) / 2; + lane4 = zz4 % 2; + } + + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r); + afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g); + afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b); + afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a); + + afpvec2 vvr = buffer_ld2(bottom_blob_data, vv_offset.r); + afpvec2 vvg = buffer_ld2(bottom_blob_data, vv_offset.g); + afpvec2 vvb = buffer_ld2(bottom_blob_data, vv_offset.b); + afpvec2 vva = buffer_ld2(bottom_blob_data, vv_offset.a); + + afpvec8 v = afpvec8(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a], vvr[lane4.r], vvg[lane4.g], vvb[lane4.b], vva[lane4.a]); + + buffer_st8(top_blob_data, gi, v); +#else + ivec4 v_offset; + ivec4 vv_offset; + + if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + ivec4 yy4 = ii4 / psc(w); + ivec4 xx4 = ii4 % psc(w); + + v_offset = ((y4 / 4) * psc(w) + x4) * 4 + y4 % 4; + vv_offset = ((yy4 / 4) * psc(w) + xx4) * 4 + yy4 % 4; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + ivec4 zz4 = ii4 / size; + ivec4 yy4 = ii4 % size / psc(w); + ivec4 xx4 = ii4 % size % psc(w); + + v_offset = ((z4 / 4) * psc(cstep) + y4 * psc(w) + x4) * 4 + z4 % 4; + vv_offset = ((zz4 / 4) * psc(cstep) + yy4 * psc(w) + xx4) * 4 + zz4 % 4; + } + + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_cp1to8(top_blob_data, gi, bottom_blob_data, v_offset, vv_offset); +#endif +#endif +} diff --git a/source/device/vulkan/shaders/permute_pack8.comp b/source/device/vulkan/shaders/permute_pack8.comp new file mode 100644 index 000000000..5fa215538 --- /dev/null +++ b/source/device/vulkan/shaders/permute_pack8.comp @@ -0,0 +1,350 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
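+// permute_pack8: reorders a pack-8 blob according to order_type (0..5, see the
+// per-branch comments below). Each invocation computes eight flattened source
+// indices as two ivec4 groups (i4, and ii4 = i4 offset by 4 along the permuted
+// axis), gathers the matching scalars from the packed source layout, and stores
+// one pack-8 output element at (gx, gy, gz).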
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int order_type = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else +#if NCNN_fp16_packed +layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; }; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +#endif +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + +#if NCNN_image_shader + if (psc(dims) == 2) + { + // order_type + // 0 = w h + // 1 = h w + + ivec4 i4; + ivec4 ii4; + + if (order_type == 0) + { + i4 = ivec4((gy * 8) * psc(w) + gx) + ivec4(0, 1, 2, 3) * psc(w); + ii4 = i4 + 4 * psc(w); + } + if (order_type == 1) + { + i4 = ivec4(gx * psc(w) + gy * 8) + ivec4(0, 1, 2, 3); + ii4 = i4 + 4; + } + + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + ivec4 yy4 = ii4 / psc(w); + ivec4 xx4 = ii4 % psc(w); + + afpvec8 v0 = image2d_ld8(bottom_blob_2d, ivec2(x4.r, y4.r / 8)); + afpvec8 v1 = image2d_ld8(bottom_blob_2d, ivec2(x4.g, y4.g / 8)); + afpvec8 v2 = image2d_ld8(bottom_blob_2d, ivec2(x4.b, y4.b / 8)); + afpvec8 v3 = image2d_ld8(bottom_blob_2d, ivec2(x4.a, y4.a / 8)); + afpvec8 v4 = image2d_ld8(bottom_blob_2d, ivec2(xx4.r, yy4.r / 8)); + afpvec8 v5 = image2d_ld8(bottom_blob_2d, ivec2(xx4.g, yy4.g / 8)); + afpvec8 v6 = image2d_ld8(bottom_blob_2d, ivec2(xx4.b, yy4.b / 8)); + afpvec8 v7 = image2d_ld8(bottom_blob_2d, ivec2(xx4.a, yy4.a / 8)); + + afpvec8 v; + v[0].r = v0[(y4.r % 8) / 4][y4.r % 4]; + v[0].g = v1[(y4.g % 8) / 4][y4.g % 4]; + v[0].b = v2[(y4.b % 8) / 4][y4.b % 4]; + v[0].a = v3[(y4.a % 8) / 4][y4.a % 4]; + v[1].r = v4[(yy4.r % 8) / 4][yy4.r % 4]; + v[1].g = v5[(yy4.g % 8) / 4][yy4.g % 4]; + v[1].b = v6[(yy4.b % 8) / 4][yy4.b % 4]; + v[1].a = v7[(yy4.a % 8) / 4][yy4.a % 4]; + + image2d_st8(top_blob_2d, ivec2(gx, 
gy), v); + } + else // if (psc(dims) == 3) + { + // order_type + // 0 = w h c + // 1 = h w c + // 2 = w c h + // 3 = c w h + // 4 = h c w + // 5 = c h w + + int size = psc(w) * psc(h); + + ivec4 i4; + ivec4 ii4; + + if (order_type == 0) + { + i4 = ivec4((gz * 8) * size + gy * psc(w) + gx) + ivec4(0, 1, 2, 3) * size; + ii4 = i4 + 4 * size; + } + if (order_type == 1) + { + i4 = ivec4((gz * 8) * size + gx * psc(w) + gy) + ivec4(0, 1, 2, 3) * size; + ii4 = i4 + 4 * size; + } + if (order_type == 2) + { + i4 = ivec4(gy * size + (gz * 8) * psc(w) + gx) + ivec4(0, 1, 2, 3) * psc(w); + ii4 = i4 + 4 * psc(w); + } + if (order_type == 3) + { + i4 = ivec4(gx * size+ (gz * 8) * psc(w) + gy) + ivec4(0, 1, 2, 3) * psc(w); + ii4 = i4 + 4 * psc(w); + } + if (order_type == 4) + { + i4 = ivec4(gy * size + gx * psc(w) + gz * 8) + ivec4(0, 1, 2, 3); + ii4 = i4 + 4; + } + if (order_type == 5) + { + i4 = ivec4(gx * size + gy * psc(w) + gz * 8) + ivec4(0, 1, 2, 3); + ii4 = i4 + 4; + } + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + ivec4 zz4 = ii4 / size; + ivec4 yy4 = ii4 % size / psc(w); + ivec4 xx4 = ii4 % size % psc(w); + + afpvec8 v0 = image3d_ld8(bottom_blob_3d, ivec3(x4.r, y4.r, z4.r / 8)); + afpvec8 v1 = image3d_ld8(bottom_blob_3d, ivec3(x4.g, y4.g, z4.g / 8)); + afpvec8 v2 = image3d_ld8(bottom_blob_3d, ivec3(x4.b, y4.b, z4.b / 8)); + afpvec8 v3 = image3d_ld8(bottom_blob_3d, ivec3(x4.a, y4.a, z4.a / 8)); + afpvec8 v4 = image3d_ld8(bottom_blob_3d, ivec3(xx4.r, yy4.r, zz4.r / 8)); + afpvec8 v5 = image3d_ld8(bottom_blob_3d, ivec3(xx4.g, yy4.g, zz4.g / 8)); + afpvec8 v6 = image3d_ld8(bottom_blob_3d, ivec3(xx4.b, yy4.b, zz4.b / 8)); + afpvec8 v7 = image3d_ld8(bottom_blob_3d, ivec3(xx4.a, yy4.a, zz4.a / 8)); + + afpvec8 v; + v[0].r = v0[(z4.r % 8) / 4][z4.r % 4]; + v[0].g = v1[(z4.g % 8) / 4][z4.g % 4]; + v[0].b = v2[(z4.b % 8) / 4][z4.b % 4]; + v[0].a = v3[(z4.a % 8) / 4][z4.a % 4]; + v[1].r = v4[(zz4.r % 8) / 4][zz4.r % 4]; + v[1].g = v5[(zz4.g % 8) / 4][zz4.g % 4]; + v[1].b = v6[(zz4.b % 8) / 4][zz4.b % 4]; + v[1].a = v7[(zz4.a % 8) / 4][zz4.a % 4]; + + image3d_st8(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else + ivec4 i4; + ivec4 ii4; + + if (psc(dims) == 2) + { + // order_type + // 0 = w h + // 1 = h w + + if (order_type == 0) + { + i4 = (gy * 8 + ivec4(0, 1, 2, 3)) * psc(w) + gx; + ii4 = i4 + 4 * psc(w); + } + if (order_type == 1) + { + i4 = gx * psc(w) + (gy * 8 + ivec4(0, 1, 2, 3)); + ii4 = i4 + 4; + } + } + else // if (psc(dims) == 3) + { + // order_type + // 0 = w h c + // 1 = h w c + // 2 = w c h + // 3 = c w h + // 4 = h c w + // 5 = c h w + + if (order_type == 0) + { + i4 = (gz * 8 + ivec4(0, 1, 2, 3)) * psc(cstep) + gy * psc(w) + gx; + ii4 = i4 + 4 * psc(cstep); + } + if (order_type == 1) + { + i4 = (gz * 8 + ivec4(0, 1, 2, 3)) * psc(cstep) + gx * psc(w) + gy; + ii4 = i4 + 4 * psc(cstep); + } + if (order_type == 2) + { + i4 = gy * psc(cstep) + (gz * 8 + ivec4(0, 1, 2, 3)) * psc(w) + gx; + ii4 = i4 + 4 * psc(w); + } + if (order_type == 3) + { + i4 = gx * psc(cstep) + (gz * 8 + ivec4(0, 1, 2, 3)) * psc(w) + gy; + ii4 = i4 + 4 * psc(w); + } + if (order_type == 4) + { + i4 = gy * psc(cstep) + gx * psc(w) + (gz * 8 + ivec4(0, 1, 2, 3)); + ii4 = i4 + 4; + } + if (order_type == 5) + { + i4 = gx * psc(cstep) + gy * psc(w) + (gz * 8 + ivec4(0, 1, 2, 3)); + ii4 = i4 + 4; + } + } + +#if NCNN_fp16_packed + ivec4 v_offset; + ivec4 vv_offset; + ivec4 lane2; + ivec4 lane4; + + if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + ivec4 yy4 = 
ii4 / psc(w); + ivec4 xx4 = ii4 % psc(w); + + v_offset = ((y4 / 8) * psc(w) + x4) * 4 + (y4 % 8) / 2; + lane2 = y4 % 2; + vv_offset = ((yy4 / 8) * psc(w) + xx4) * 4 + (yy4 % 8) / 2; + lane4 = yy4 % 2; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + ivec4 zz4 = ii4 / size; + ivec4 yy4 = ii4 % size / psc(w); + ivec4 xx4 = ii4 % size % psc(w); + + v_offset = ((z4 / 8) * psc(cstep) + y4 * psc(w) + x4) * 4 + (z4 % 8) / 2; + lane2 = z4 % 2; + vv_offset = ((zz4 / 8) * psc(cstep) + yy4 * psc(w) + xx4) * 4 + (zz4 % 8) / 2; + lane4 = zz4 % 2; + } + + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r); + afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g); + afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b); + afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a); + + afpvec2 vvr = buffer_ld2(bottom_blob_data, vv_offset.r); + afpvec2 vvg = buffer_ld2(bottom_blob_data, vv_offset.g); + afpvec2 vvb = buffer_ld2(bottom_blob_data, vv_offset.b); + afpvec2 vva = buffer_ld2(bottom_blob_data, vv_offset.a); + + afpvec8 v = afpvec8(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a], vvr[lane4.r], vvg[lane4.g], vvb[lane4.b], vva[lane4.a]); + + buffer_st8(top_blob_data, gi, v); +#else + ivec4 v_offset; + ivec4 vv_offset; + + if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + ivec4 yy4 = ii4 / psc(w); + ivec4 xx4 = ii4 % psc(w); + + v_offset = ((y4 / 8) * psc(w) + x4) * 8 + y4 % 8; + vv_offset = ((yy4 / 8) * psc(w) + xx4) * 8 + yy4 % 8; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + ivec4 zz4 = ii4 / size; + ivec4 yy4 = ii4 % size / psc(w); + ivec4 xx4 = ii4 % size % psc(w); + + v_offset = ((z4 / 8) * psc(cstep) + y4 * psc(w) + x4) * 8 + z4 % 8; + vv_offset = ((zz4 / 8) * psc(cstep) + yy4 * psc(w) + xx4) * 8 + zz4 % 8; + } + + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_cp1to8(top_blob_data, gi, bottom_blob_data, v_offset, vv_offset); +#endif +#endif +} diff --git a/source/device/vulkan/shaders/permute_pack8to1.comp b/source/device/vulkan/shaders/permute_pack8to1.comp new file mode 100644 index 000000000..3d152f68b --- /dev/null +++ b/source/device/vulkan/shaders/permute_pack8to1.comp @@ -0,0 +1,280 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
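+// permute_pack8to1: unpacks one pack-8 element per invocation and scatters its
+// eight scalars to their permuted destinations; the image path issues eight
+// st1 stores, while the buffer path computes two ivec4 offset groups
+// (v_offset and vv_offset) and writes them with buffer_cp8to1.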
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int order_type = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + if (psc(dims) == 2) + { + // order_type + // 0 = w h + // 1 = h w + + afpvec8 v = image2d_ld8(bottom_blob_2d, ivec2(gx, gy)); + + if (order_type == 0) + { + ivec4 y4 = gy * 8 + ivec4(0, 1, 2, 3); + ivec4 yy4 = y4 + 4; + + image2d_st1(top_blob_2d, ivec2(gx, y4.r), v[0].r); + image2d_st1(top_blob_2d, ivec2(gx, y4.g), v[0].g); + image2d_st1(top_blob_2d, ivec2(gx, y4.b), v[0].b); + image2d_st1(top_blob_2d, ivec2(gx, y4.a), v[0].a); + image2d_st1(top_blob_2d, ivec2(gx, yy4.r), v[1].r); + image2d_st1(top_blob_2d, ivec2(gx, yy4.g), v[1].g); + image2d_st1(top_blob_2d, ivec2(gx, yy4.b), v[1].b); + image2d_st1(top_blob_2d, ivec2(gx, yy4.a), v[1].a); + } + if (order_type == 1) + { + ivec4 x4 = gy * 8 + ivec4(0, 1, 2, 3); + ivec4 xx4 = x4 + 4; + + image2d_st1(top_blob_2d, ivec2(x4.r, gx), v[0].r); + image2d_st1(top_blob_2d, ivec2(x4.g, gx), v[0].g); + image2d_st1(top_blob_2d, ivec2(x4.b, gx), v[0].b); + image2d_st1(top_blob_2d, ivec2(x4.a, gx), v[0].a); + image2d_st1(top_blob_2d, ivec2(xx4.r, gx), v[1].r); + image2d_st1(top_blob_2d, ivec2(xx4.g, gx), v[1].g); + image2d_st1(top_blob_2d, ivec2(xx4.b, gx), v[1].b); + image2d_st1(top_blob_2d, ivec2(xx4.a, gx), v[1].a); + } + } + else // if (psc(dims) == 3) + { + // order_type + // 0 = w h c + // 1 = h w c + // 2 = w c h + // 3 = c w h + // 4 = h c w + // 5 = c h w + + afpvec8 v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); + + if (order_type == 0) + { + ivec4 z4 = gz * 8 + ivec4(0, 1, 2, 3); + ivec4 zz4 = z4 + 4; + + 
image3d_st1(top_blob_3d, ivec3(gx, gy, z4.r), v[0].r); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4.g), v[0].g); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4.b), v[0].b); + image3d_st1(top_blob_3d, ivec3(gx, gy, z4.a), v[0].a); + image3d_st1(top_blob_3d, ivec3(gx, gy, zz4.r), v[1].r); + image3d_st1(top_blob_3d, ivec3(gx, gy, zz4.g), v[1].g); + image3d_st1(top_blob_3d, ivec3(gx, gy, zz4.b), v[1].b); + image3d_st1(top_blob_3d, ivec3(gx, gy, zz4.a), v[1].a); + } + if (order_type == 1) + { + ivec4 z4 = gz * 8 + ivec4(0, 1, 2, 3); + ivec4 zz4 = z4 + 4; + + image3d_st1(top_blob_3d, ivec3(gy, gx, z4.r), v[0].r); + image3d_st1(top_blob_3d, ivec3(gy, gx, z4.g), v[0].g); + image3d_st1(top_blob_3d, ivec3(gy, gx, z4.b), v[0].b); + image3d_st1(top_blob_3d, ivec3(gy, gx, z4.a), v[0].a); + image3d_st1(top_blob_3d, ivec3(gy, gx, zz4.r), v[1].r); + image3d_st1(top_blob_3d, ivec3(gy, gx, zz4.g), v[1].g); + image3d_st1(top_blob_3d, ivec3(gy, gx, zz4.b), v[1].b); + image3d_st1(top_blob_3d, ivec3(gy, gx, zz4.a), v[1].a); + } + if (order_type == 2) + { + ivec4 y4 = gz * 8 + ivec4(0, 1, 2, 3); + ivec4 yy4 = y4 + 4; + + image3d_st1(top_blob_3d, ivec3(gx, y4.r, gy), v[0].r); + image3d_st1(top_blob_3d, ivec3(gx, y4.g, gy), v[0].g); + image3d_st1(top_blob_3d, ivec3(gx, y4.b, gy), v[0].b); + image3d_st1(top_blob_3d, ivec3(gx, y4.a, gy), v[0].a); + image3d_st1(top_blob_3d, ivec3(gx, yy4.r, gy), v[1].r); + image3d_st1(top_blob_3d, ivec3(gx, yy4.g, gy), v[1].g); + image3d_st1(top_blob_3d, ivec3(gx, yy4.b, gy), v[1].b); + image3d_st1(top_blob_3d, ivec3(gx, yy4.a, gy), v[1].a); + } + if (order_type == 3) + { + ivec4 x4 = gz * 8 + ivec4(0, 1, 2, 3); + ivec4 xx4 = x4 + 4; + + image3d_st1(top_blob_3d, ivec3(x4.r, gx, gy), v[0].r); + image3d_st1(top_blob_3d, ivec3(x4.g, gx, gy), v[0].g); + image3d_st1(top_blob_3d, ivec3(x4.b, gx, gy), v[0].b); + image3d_st1(top_blob_3d, ivec3(x4.a, gx, gy), v[0].a); + image3d_st1(top_blob_3d, ivec3(xx4.r, gx, gy), v[1].r); + image3d_st1(top_blob_3d, ivec3(xx4.g, gx, gy), v[1].g); + image3d_st1(top_blob_3d, ivec3(xx4.b, gx, gy), v[1].b); + image3d_st1(top_blob_3d, ivec3(xx4.a, gx, gy), v[1].a); + } + if (order_type == 4) + { + ivec4 y4 = gz * 8 + ivec4(0, 1, 2, 3); + ivec4 yy4 = y4 + 4; + + image3d_st1(top_blob_3d, ivec3(gy, y4.r, gx), v[0].r); + image3d_st1(top_blob_3d, ivec3(gy, y4.g, gx), v[0].g); + image3d_st1(top_blob_3d, ivec3(gy, y4.b, gx), v[0].b); + image3d_st1(top_blob_3d, ivec3(gy, y4.a, gx), v[0].a); + image3d_st1(top_blob_3d, ivec3(gy, yy4.r, gx), v[1].r); + image3d_st1(top_blob_3d, ivec3(gy, yy4.g, gx), v[1].g); + image3d_st1(top_blob_3d, ivec3(gy, yy4.b, gx), v[1].b); + image3d_st1(top_blob_3d, ivec3(gy, yy4.a, gx), v[1].a); + } + if (order_type == 5) + { + ivec4 x4 = gz * 8 + ivec4(0, 1, 2, 3); + ivec4 xx4 = x4 + 4; + + image3d_st1(top_blob_3d, ivec3(x4.r, gy, gx), v[0].r); + image3d_st1(top_blob_3d, ivec3(x4.g, gy, gx), v[0].g); + image3d_st1(top_blob_3d, ivec3(x4.b, gy, gx), v[0].b); + image3d_st1(top_blob_3d, ivec3(x4.a, gy, gx), v[0].a); + image3d_st1(top_blob_3d, ivec3(xx4.r, gy, gx), v[1].r); + image3d_st1(top_blob_3d, ivec3(xx4.g, gy, gx), v[1].g); + image3d_st1(top_blob_3d, ivec3(xx4.b, gy, gx), v[1].b); + image3d_st1(top_blob_3d, ivec3(xx4.a, gy, gx), v[1].a); + } + } +#else + ivec4 v_offset; + ivec4 vv_offset; + + if (psc(dims) == 2) + { + // order_type + // 0 = w h + // 1 = h w + + if (order_type == 0) + { + v_offset = ivec4((gy * 8) * psc(outw) + gx) + ivec4(0, 1, 2, 3) * psc(outw); + vv_offset = v_offset + 4 * psc(outw); + } + if (order_type == 1) + { + v_offset = 
ivec4(gx * psc(outw) + gy * 8) + ivec4(0, 1, 2, 3); + vv_offset = v_offset + 4; + } + } + else // if (psc(dims) == 3) + { + // order_type + // 0 = w h c + // 1 = h w c + // 2 = w c h + // 3 = c w h + // 4 = h c w + // 5 = c h w + + if (order_type == 0) + { + v_offset = ivec4((gz * 8) * psc(outcstep) + gy * psc(outw) + gx) + ivec4(0, 1, 2, 3) * psc(outcstep); + vv_offset = v_offset + 4 * psc(outcstep); + } + if (order_type == 1) + { + v_offset = ivec4((gz * 8) * psc(outcstep) + gx * psc(outw) + gy) + ivec4(0, 1, 2, 3) * psc(outcstep); + vv_offset = v_offset + 4 * psc(outcstep); + } + if (order_type == 2) + { + v_offset = ivec4(gy * psc(outcstep) + (gz * 8) * psc(outw) + gx) + ivec4(0, 1, 2, 3) * psc(outw); + vv_offset = v_offset + 4 * psc(outw); + } + if (order_type == 3) + { + v_offset = ivec4(gy * psc(outcstep) + gx * psc(outw) + gz * 8) + ivec4(0, 1, 2, 3); + vv_offset = v_offset + 4; + } + if (order_type == 4) + { + v_offset = ivec4(gx * psc(outcstep) + (gz * 8) * psc(outw) + gy) + ivec4(0, 1, 2, 3) * psc(outw); + vv_offset = v_offset + 4 * psc(outw); + } + if (order_type == 5) + { + v_offset = ivec4(gx * psc(outcstep) + gy * psc(outw) + gz * 8) + ivec4(0, 1, 2, 3); + vv_offset = v_offset + 4; + } + } + + int gi = gz * psc(cstep) + gy * psc(w) + gx; + + buffer_cp8to1(top_blob_data, v_offset, vv_offset, bottom_blob_data, gi); +#endif +} diff --git a/source/device/vulkan/shaders/permute_pack8to4.comp b/source/device/vulkan/shaders/permute_pack8to4.comp new file mode 100644 index 000000000..86e01e6fd --- /dev/null +++ b/source/device/vulkan/shaders/permute_pack8to4.comp @@ -0,0 +1,285 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
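+// permute_pack8to4: for each pack-4 output element, computes the four flattened
+// source indices (i4), reads the pack-8 elements that contain them, selects the
+// lane via (index % 8) / 4 and index % 4 (or half-pair lanes in the
+// fp16_packed buffer path), and stores a pack-4 result at (gx, gy, gz).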
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int order_type = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else +#if NCNN_fp16_packed +layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; }; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +#endif +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + +#if NCNN_image_shader + if (psc(dims) == 2) + { + // order_type + // 0 = w h + // 1 = h w + + ivec4 i4; + + if (order_type == 0) + { + i4 = ivec4((gy * 4) * psc(w) + gx) + ivec4(0, 1, 2, 3) * psc(w); + } + if (order_type == 1) + { + i4 = ivec4(gx * psc(w) + gy * 4) + ivec4(0, 1, 2, 3); + } + + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + + afpvec8 v0 = image2d_ld8(bottom_blob_2d, ivec2(x4.r, y4.r / 8)); + afpvec8 v1 = image2d_ld8(bottom_blob_2d, ivec2(x4.g, y4.g / 8)); + afpvec8 v2 = image2d_ld8(bottom_blob_2d, ivec2(x4.b, y4.b / 8)); + afpvec8 v3 = image2d_ld8(bottom_blob_2d, ivec2(x4.a, y4.a / 8)); + + afpvec4 v; + v.r = v0[(y4.r % 8) / 4][y4.r % 4]; + v.g = v1[(y4.g % 8) / 4][y4.g % 4]; + v.b = v2[(y4.b % 8) / 4][y4.b % 4]; + v.a = v3[(y4.a % 8) / 4][y4.a % 4]; + + image2d_st4(top_blob_2d, ivec2(gx, gy), v); + } + else // if (psc(dims) == 3) + { + // order_type + // 0 = w h c + // 1 = h w c + // 2 = w c h + // 3 = c w h + // 4 = h c w + // 5 = c h w + + int size = psc(w) * psc(h); + + ivec4 i4; + + if (order_type == 0) + { + i4 = ivec4((gz * 4) * size + gy * psc(w) + gx) + ivec4(0, 1, 2, 3) * size; + } + if (order_type == 1) + { + i4 = ivec4((gz * 4) * size + gx * psc(w) + gy) + ivec4(0, 1, 2, 3) * size; + } + if (order_type == 2) + { + i4 = ivec4(gy * size + (gz * 4) * psc(w) + gx) + ivec4(0, 1, 2, 3) * psc(w); + } + if (order_type == 3) + { + i4 = ivec4(gx * size+ (gz * 4) * psc(w) + gy) + ivec4(0, 1, 2, 
3) * psc(w); + } + if (order_type == 4) + { + i4 = ivec4(gy * size + gx * psc(w) + gz * 4) + ivec4(0, 1, 2, 3); + } + if (order_type == 5) + { + i4 = ivec4(gx * size + gy * psc(w) + gz * 4) + ivec4(0, 1, 2, 3); + } + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + + afpvec8 v0 = image3d_ld8(bottom_blob_3d, ivec3(x4.r, y4.r, z4.r / 8)); + afpvec8 v1 = image3d_ld8(bottom_blob_3d, ivec3(x4.g, y4.g, z4.g / 8)); + afpvec8 v2 = image3d_ld8(bottom_blob_3d, ivec3(x4.b, y4.b, z4.b / 8)); + afpvec8 v3 = image3d_ld8(bottom_blob_3d, ivec3(x4.a, y4.a, z4.a / 8)); + + afpvec4 v; + v.r = v0[(z4.r % 8) / 4][z4.r % 4]; + v.g = v1[(z4.g % 8) / 4][z4.g % 4]; + v.b = v2[(z4.b % 8) / 4][z4.b % 4]; + v.a = v3[(z4.a % 8) / 4][z4.a % 4]; + + image3d_st4(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else + ivec4 i4; + + if (psc(dims) == 2) + { + // order_type + // 0 = w h + // 1 = h w + + if (order_type == 0) + { + i4 = ivec4((gy * 4) * psc(w) + gx) + ivec4(0, 1, 2, 3) * psc(w); + } + if (order_type == 1) + { + i4 = ivec4(gx * psc(w) + gy * 4) + ivec4(0, 1, 2, 3); + } + } + else // if (psc(dims) == 3) + { + // order_type + // 0 = w h c + // 1 = h w c + // 2 = w c h + // 3 = c w h + // 4 = h c w + // 5 = c h w + + if (order_type == 0) + { + i4 = ivec4((gz * 4) * psc(cstep) + gy * psc(w) + gx) + ivec4(0, 1, 2, 3) * psc(cstep); + } + if (order_type == 1) + { + i4 = ivec4((gz * 4) * psc(cstep) + gx * psc(w) + gy) + ivec4(0, 1, 2, 3) * psc(cstep); + } + if (order_type == 2) + { + i4 = ivec4(gy * psc(cstep) + (gz * 4) * psc(w) + gx) + ivec4(0, 1, 2, 3) * psc(w); + } + if (order_type == 3) + { + i4 = ivec4(gx * psc(cstep) + (gz * 4) * psc(w) + gy) + ivec4(0, 1, 2, 3) * psc(w); + } + if (order_type == 4) + { + i4 = ivec4(gy * psc(cstep) + gx * psc(w) + gz * 4) + ivec4(0, 1, 2, 3); + } + if (order_type == 5) + { + i4 = ivec4(gx * psc(cstep) + gy * psc(w) + gz * 4) + ivec4(0, 1, 2, 3); + } + } + +#if NCNN_fp16_packed + ivec4 v_offset; + ivec4 lane2; + + if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + + v_offset = ((y4 / 8) * psc(w) + x4) * 4 + (y4 % 8) / 2; + lane2 = y4 % 2; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + + v_offset = ((z4 / 8) * psc(cstep) + y4 * psc(w) + x4) * 4 + (z4 % 8) / 2; + lane2 = z4 % 2; + } + + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r); + afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g); + afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b); + afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a); + + afpvec4 v = afpvec4(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a]); + + buffer_st4(top_blob_data, gi, v); +#else + ivec4 v_offset; + + if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + + v_offset = ((y4 / 8) * psc(w) + x4) * 8 + y4 % 8; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + + v_offset = ((z4 / 8) * psc(cstep) + y4 * psc(w) + x4) * 8 + z4 % 8; + } + + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_cp1to4(top_blob_data, gi, bottom_blob_data, v_offset); +#endif +#endif +} diff --git a/source/device/vulkan/shaders/pooling.comp b/source/device/vulkan/shaders/pooling.comp new file mode 100644 index 000000000..5a647430f --- /dev/null +++ b/source/device/vulkan/shaders/pooling.comp @@ 
-0,0 +1,226 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#define FLT_MAX 3.402823466e+38 + +layout (constant_id = 0) const int pooling_type = 0; +layout (constant_id = 1) const int kernel_w = 1; +layout (constant_id = 2) const int kernel_h = 1; +layout (constant_id = 3) const int stride_w = 1; +layout (constant_id = 4) const int stride_h = 1; +layout (constant_id = 5) const int pad_left = 0; +layout (constant_id = 6) const int pad_right = 0; +layout (constant_id = 7) const int pad_top = 0; +layout (constant_id = 8) const int pad_bottom = 0; +layout (constant_id = 9) const int global_pooling = 0; +layout (constant_id = 10) const int pad_mode = 0; +layout (constant_id = 11) const int avgpool_count_include_pad = 0; + +#define shape_constant_id_offset 12 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; + + int wtailpad; + int htailpad; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afp res; + + if (pooling_type == 0) + { + res = afp(-FLT_MAX); + +#if NCNN_image_shader + int sx = gx * stride_w; + int sy = gy * stride_h; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + afp v = image3d_ld1(bottom_blob, ivec3(sx + x, sy + y, gz)); + res = max(res, v); + } + } 
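+ // buffer path below mirrors this max-pooling window walk using flat offsets:
+ // v_offset starts at the window's top-left element and advances by one input
+ // row (psc(w)) per kernel row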
+#else + int v_offset = gz * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + afp v = buffer_ld1(bottom_blob_data, v_offset + x); + res = max(res, v); + } + + v_offset += psc(w); + } +#endif + } + if (pooling_type == 1 && avgpool_count_include_pad == 0) + { + res = afp(0.f); + int area = 0; + + int sx = gx * stride_w; + int sy = gy * stride_h; + +#if NCNN_image_shader + for (int y = 0; y < kernel_h; y++) + { + if (sy + y < pad_top) + continue; + + if (sy + y >= psc(h) - pad_bottom - p.htailpad) + break; + + for (int x = 0; x < kernel_w; x++) + { + if (sx + x < pad_left) + continue; + + if (sx + x >= psc(w) - pad_right - p.wtailpad) + break; + + res += image3d_ld1(bottom_blob, ivec3(sx + x, sy + y, gz)); + area += 1; + } + } +#else + int v_offset = gz * psc(cstep) + sy * psc(w) + sx; + + for (int y = 0; y < kernel_h; y++) + { + if (sy + y < pad_top) + { + v_offset += psc(w); + continue; + } + + if (sy + y >= psc(h) - pad_bottom - p.htailpad) + break; + + for (int x = 0; x < kernel_w; x++) + { + if (sx + x < pad_left) + { + continue; + } + + if (sx + x >= psc(w) - pad_right - p.wtailpad) + break; + + res += buffer_ld1(bottom_blob_data, v_offset + x); + area += 1; + } + + v_offset += psc(w); + } +#endif + + res /= afp(area); + } + if (pooling_type == 1 && avgpool_count_include_pad == 1) + { + res = afp(0.f); + +#if NCNN_image_shader + int sx = gx * stride_w; + int sy = gy * stride_h; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + res += image3d_ld1(bottom_blob, ivec3(sx + x, sy + y, gz)); + } + } +#else + int v_offset = gz * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + res += buffer_ld1(bottom_blob_data, v_offset + x); + } + + v_offset += psc(w); + } +#endif + + res /= afp(kernel_w * kernel_h); + } + +#if NCNN_image_shader + image3d_st1(top_blob, ivec3(gx, gy, gz), res); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st1(top_blob_data, gi, res); +#endif +} diff --git a/source/device/vulkan/shaders/pooling_global.comp b/source/device/vulkan/shaders/pooling_global.comp new file mode 100644 index 000000000..8947a3d7a --- /dev/null +++ b/source/device/vulkan/shaders/pooling_global.comp @@ -0,0 +1,130 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
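+// pooling_global: one invocation per channel (gx); reduces the whole w*h plane
+// of that channel to a single value using max (pooling_type == 0) or mean
+// (pooling_type == 1) and stores it at index gx of the 1-D output blob.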
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#define FLT_MAX 3.402823466e+38 + +layout (constant_id = 0) const int pooling_type = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= 1 || gz >= 1) + return; + + int size = psc(w) * psc(h); + int v_offset = gx * psc(cstep); + + afp res; + + if (pooling_type == 0) + { + res = afp(-FLT_MAX); + +#if NCNN_image_shader + for (int y = 0; y < psc(h); y++) + { + for (int x = 0; x < psc(w); x++) + { + afp v = image3d_ld1(bottom_blob, ivec3(x, y, gx)); + res = max(res, v); + } + } +#else + for (int i = 0; i < size; i++) + { + afp v = buffer_ld1(bottom_blob_data, v_offset + i); + res = max(res, v); + } +#endif + } + if (pooling_type == 1) + { + res = afp(0.f); + +#if NCNN_image_shader + for (int y = 0; y < psc(h); y++) + { + for (int x = 0; x < psc(w); x++) + { + res += image3d_ld1(bottom_blob, ivec3(x, y, gx)); + } + } +#else + for (int i = 0; i < size; i++) + { + res += buffer_ld1(bottom_blob_data, v_offset + i); + } +#endif + + res /= afp(size); + } + +#if NCNN_image_shader + image1d_st1(top_blob, gx, res); +#else + buffer_st1(top_blob_data, gx, res); +#endif +} diff --git a/source/device/vulkan/shaders/pooling_global_pack4.comp b/source/device/vulkan/shaders/pooling_global_pack4.comp new file mode 100644 index 000000000..a8634cce8 --- /dev/null +++ b/source/device/vulkan/shaders/pooling_global_pack4.comp @@ -0,0 +1,130 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#define FLT_MAX 3.402823466e+38 + +layout (constant_id = 0) const int pooling_type = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= 1 || gz >= 1) + return; + + int size = psc(w) * psc(h); + int v_offset = gx * psc(cstep); + + afpvec4 res; + + if (pooling_type == 0) + { + res = afpvec4(-FLT_MAX); + +#if NCNN_image_shader + for (int y = 0; y < psc(h); y++) + { + for (int x = 0; x < psc(w); x++) + { + afpvec4 v = image3d_ld4(bottom_blob, ivec3(x, y, gx)); + res = max(res, v); + } + } +#else + for (int i = 0; i < size; i++) + { + afpvec4 v = buffer_ld4(bottom_blob_data, v_offset + i); + res = max(res, v); + } +#endif + } + if (pooling_type == 1) + { + res = afpvec4(0.f); + +#if NCNN_image_shader + for (int y = 0; y < psc(h); y++) + { + for (int x = 0; x < psc(w); x++) + { + res += image3d_ld4(bottom_blob, ivec3(x, y, gx)); + } + } +#else + for (int i = 0; i < size; i++) + { + res += buffer_ld4(bottom_blob_data, v_offset + i); + } +#endif + + res /= afp(size); + } + +#if NCNN_image_shader + image1d_st4(top_blob, gx, res); +#else + buffer_st4(top_blob_data, gx, res); +#endif +} diff --git a/source/device/vulkan/shaders/pooling_global_pack8.comp b/source/device/vulkan/shaders/pooling_global_pack8.comp new file mode 100644 index 000000000..3b9f43069 --- /dev/null +++ b/source/device/vulkan/shaders/pooling_global_pack8.comp @@ -0,0 +1,139 @@ +// Tencent is pleased to support the open source community by 
making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#define FLT_MAX 3.402823466e+38 + +layout (constant_id = 0) const int pooling_type = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= 1 || gz >= 1) + return; + + int size = psc(w) * psc(h); + int v_offset = gx * psc(cstep); + + afpvec8 res; + + if (pooling_type == 0) + { + res = afpvec8(afpvec4(-FLT_MAX), afpvec4(-FLT_MAX)); + +#if NCNN_image_shader + for (int y = 0; y < psc(h); y++) + { + for (int x = 0; x < psc(w); x++) + { + afpvec8 v = image3d_ld8(bottom_blob, ivec3(x, y, gx)); + res[0] = max(res[0], v[0]); + res[1] = max(res[1], v[1]); + } + } +#else + for (int i = 0; i < size; i++) + { + afpvec8 v = buffer_ld8(bottom_blob_data, v_offset + i); + res[0] = max(res[0], v[0]); + res[1] = max(res[1], v[1]); + } +#endif + } + if (pooling_type == 1) + { + res = afpvec8(afpvec4(0.f), afpvec4(0.f)); + +#if NCNN_image_shader + for (int y = 0; y < psc(h); y++) + { + for (int x = 0; x < psc(w); x++) + { + afpvec8 v = image3d_ld8(bottom_blob, ivec3(x, y, gx)); + res[0] += v[0]; + res[1] += v[1]; + } + } +#else + for (int i = 0; i < size; i++) + { + afpvec8 v = buffer_ld8(bottom_blob_data, v_offset + i); + res[0] += 
v[0]; + res[1] += v[1]; + } +#endif + + afp area = afp(size); + res[0] /= area; + res[1] /= area; + } + +#if NCNN_image_shader + image1d_st8(top_blob, gx, res); +#else + buffer_st8(top_blob_data, gx, res); +#endif +} diff --git a/source/device/vulkan/shaders/pooling_pack4.comp b/source/device/vulkan/shaders/pooling_pack4.comp new file mode 100644 index 000000000..4b574ac4d --- /dev/null +++ b/source/device/vulkan/shaders/pooling_pack4.comp @@ -0,0 +1,226 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#define FLT_MAX 3.402823466e+38 + +layout (constant_id = 0) const int pooling_type = 0; +layout (constant_id = 1) const int kernel_w = 1; +layout (constant_id = 2) const int kernel_h = 1; +layout (constant_id = 3) const int stride_w = 1; +layout (constant_id = 4) const int stride_h = 1; +layout (constant_id = 5) const int pad_left = 0; +layout (constant_id = 6) const int pad_right = 0; +layout (constant_id = 7) const int pad_top = 0; +layout (constant_id = 8) const int pad_bottom = 0; +layout (constant_id = 9) const int global_pooling = 0; +layout (constant_id = 10) const int pad_mode = 0; +layout (constant_id = 11) const int avgpool_count_include_pad = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#define shape_constant_id_offset 12 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; + + int wtailpad; + int htailpad; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = 
int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec4 res; + + if (pooling_type == 0) + { + res = afpvec4(-FLT_MAX); + +#if NCNN_image_shader + int sx = gx * stride_w; + int sy = gy * stride_h; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v = image3d_ld4(bottom_blob, ivec3(sx + x, sy + y, gz)); + res = max(res, v); + } + } +#else + int v_offset = gz * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v = buffer_ld4(bottom_blob_data, v_offset + x); + res = max(res, v); + } + + v_offset += psc(w); + } +#endif + } + else if (pooling_type == 1 && avgpool_count_include_pad == 0) + { + res = afpvec4(0.f); + int area = 0; + + int sx = gx * stride_w; + int sy = gy * stride_h; + +#if NCNN_image_shader + for (int y = 0; y < kernel_h; y++) + { + if (sy + y < pad_top) + continue; + + if (sy + y >= psc(h) - pad_bottom - p.htailpad) + break; + + for (int x = 0; x < kernel_w; x++) + { + if (sx + x < pad_left) + continue; + + if (sx + x >= psc(w) - pad_right - p.wtailpad) + break; + + res += image3d_ld4(bottom_blob, ivec3(sx + x, sy + y, gz)); + area += 1; + } + } +#else + int v_offset = gz * psc(cstep) + sy * psc(w) + sx; + + for (int y = 0; y < kernel_h; y++) + { + if (sy + y < pad_top) + { + v_offset += psc(w); + continue; + } + + if (sy + y >= psc(h) - pad_bottom - p.htailpad) + break; + + for (int x = 0; x < kernel_w; x++) + { + if (sx + x < pad_left) + { + continue; + } + + if (sx + x >= psc(w) - pad_right - p.wtailpad) + break; + + res += buffer_ld4(bottom_blob_data, v_offset + x); + area += 1; + } + + v_offset += psc(w); + } +#endif + + res /= afp(area); + } + else if (pooling_type == 1 && avgpool_count_include_pad == 1) + { + res = afpvec4(0.f); + +#if NCNN_image_shader + int sx = gx * stride_w; + int sy = gy * stride_h; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + res += image3d_ld4(bottom_blob, ivec3(sx + x, sy + y, gz)); + } + } +#else + int v_offset = gz * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + res += buffer_ld4(bottom_blob_data, v_offset + x); + } + + v_offset += psc(w); + } +#endif + + res /= afp(kernel_w * kernel_h); + } + +#if NCNN_image_shader + image3d_st4(top_blob, ivec3(gx, gy, gz), res); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st4(top_blob_data, gi, res); +#endif +} diff --git a/source/device/vulkan/shaders/pooling_pack8.comp b/source/device/vulkan/shaders/pooling_pack8.comp new file mode 100644 index 000000000..4ff7ac902 --- /dev/null +++ b/source/device/vulkan/shaders/pooling_pack8.comp @@ -0,0 +1,242 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#define FLT_MAX 3.402823466e+38 + +layout (constant_id = 0) const int pooling_type = 0; +layout (constant_id = 1) const int kernel_w = 1; +layout (constant_id = 2) const int kernel_h = 1; +layout (constant_id = 3) const int stride_w = 1; +layout (constant_id = 4) const int stride_h = 1; +layout (constant_id = 5) const int pad_left = 0; +layout (constant_id = 6) const int pad_right = 0; +layout (constant_id = 7) const int pad_top = 0; +layout (constant_id = 8) const int pad_bottom = 0; +layout (constant_id = 9) const int global_pooling = 0; +layout (constant_id = 10) const int pad_mode = 0; +layout (constant_id = 11) const int avgpool_count_include_pad = 0; + +#define shape_constant_id_offset 12 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; + + int wtailpad; + int htailpad; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec8 res; + + if (pooling_type == 0) + { + res = afpvec8(afpvec4(-FLT_MAX), afpvec4(-FLT_MAX)); + +#if NCNN_image_shader + int sx = gx * stride_w; + int sy = gy * stride_h; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx + x, sy + y, gz)); + res[0] = max(res[0], v[0]); + res[1] = max(res[1], v[1]); + } + } +#else + int v_offset = gz * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v = buffer_ld8(bottom_blob_data, v_offset + x); + res[0] = max(res[0], v[0]); + res[1] = max(res[1], v[1]); + } + + v_offset += psc(w); + } +#endif + } + else if (pooling_type == 1 && avgpool_count_include_pad == 0) + { + res = afpvec8(afpvec4(0.f), afpvec4(0.f)); + int area = 0; + + int sx = gx * stride_w; + int sy = gy * 
stride_h; + +#if NCNN_image_shader + for (int y = 0; y < kernel_h; y++) + { + if (sy + y < pad_top) + continue; + + if (sy + y >= psc(h) - pad_bottom - p.htailpad) + break; + + for (int x = 0; x < kernel_w; x++) + { + if (sx + x < pad_left) + continue; + + if (sx + x >= psc(w) - pad_right - p.wtailpad) + break; + + afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx + x, sy + y, gz)); + res[0] += v[0]; + res[1] += v[1]; + area += 1; + } + } +#else + int v_offset = gz * psc(cstep) + sy * psc(w) + sx; + + for (int y = 0; y < kernel_h; y++) + { + if (sy + y < pad_top) + { + v_offset += psc(w); + continue; + } + + if (sy + y >= psc(h) - pad_bottom - p.htailpad) + break; + + for (int x = 0; x < kernel_w; x++) + { + if (sx + x < pad_left) + { + continue; + } + + if (sx + x >= psc(w) - pad_right - p.wtailpad) + break; + + afpvec8 v = buffer_ld8(bottom_blob_data, v_offset + x); + res[0] += v[0]; + res[1] += v[1]; + area += 1; + } + + v_offset += psc(w); + } +#endif + + res[0] /= afp(area); + res[1] /= afp(area); + } + else if (pooling_type == 1 && avgpool_count_include_pad == 1) + { + res = afpvec8(afpvec4(0.f), afpvec4(0.f)); + +#if NCNN_image_shader + int sx = gx * stride_w; + int sy = gy * stride_h; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx + x, sy + y, gz)); + res[0] += v[0]; + res[1] += v[1]; + } + } +#else + int v_offset = gz * psc(cstep) + gy * stride_h * psc(w) + gx * stride_w; + + for (int y = 0; y < kernel_h; y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v = buffer_ld8(bottom_blob_data, v_offset + x); + res[0] += v[0]; + res[1] += v[1]; + } + + v_offset += psc(w); + } +#endif + + afp area = afp(kernel_w * kernel_h); + res[0] /= area; + res[1] /= area; + } + +#if NCNN_image_shader + image3d_st8(top_blob, ivec3(gx, gy, gz), res); +#else + const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + // res = afpvec8(afpvec4(1.0f), afpvec4(1.0f)); + + buffer_st8(top_blob_data, gi, res); +#endif +} diff --git a/source/device/vulkan/shaders/priorbox.comp b/source/device/vulkan/shaders/priorbox.comp new file mode 100644 index 000000000..1503b8866 --- /dev/null +++ b/source/device/vulkan/shaders/priorbox.comp @@ -0,0 +1,170 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
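For orientation, a minimal host-side sketch in plain C of the per-cell box geometry that the priorbox.comp shader below evaluates: a centre is placed at (index + offset) * step, half-extents come from min_size, sqrt(min_size * max_size) and the aspect ratios, and each box is normalised by the image size and optionally clipped. Every numeric value here is an illustrative placeholder, not a value taken from this patch.

#include <math.h>
#include <stdio.h>

/* editorial sketch, not part of the patch: one SSD-style prior box */
static void make_prior(float cx, float cy, float bw, float bh,
                       float img_w, float img_h, int clip, float box[4])
{
    box[0] = (cx - bw * 0.5f) / img_w;   /* x_min */
    box[1] = (cy - bh * 0.5f) / img_h;   /* y_min */
    box[2] = (cx + bw * 0.5f) / img_w;   /* x_max */
    box[3] = (cy + bh * 0.5f) / img_h;   /* y_max */
    if (clip)
        for (int i = 0; i < 4; i++)
            box[i] = fminf(fmaxf(box[i], 0.f), 1.f);
}

int main(void)
{
    /* one feature-map cell at (3, 2), step 16, offset 0.5 -- made-up values */
    const float cx = (3 + 0.5f) * 16.f;
    const float cy = (2 + 0.5f) * 16.f;
    const float min_size = 32.f, max_size = 64.f, ar = 2.f;
    float b[4];

    make_prior(cx, cy, min_size, min_size, 300.f, 300.f, 1, b);          /* min-size box */
    make_prior(cx, cy, sqrtf(min_size * max_size),
               sqrtf(min_size * max_size), 300.f, 300.f, 1, b);          /* max-size box */
    make_prior(cx, cy, min_size * sqrtf(ar), min_size / sqrtf(ar),
               300.f, 300.f, 1, b);                                      /* aspect-ratio box */
    printf("aspect-ratio box: %.3f %.3f %.3f %.3f\n", b[0], b[1], b[2], b[3]);
    return 0;
}

The shader additionally writes the four variance values next to each box, in the second half of the output blob.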
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int flip = 0; +layout (constant_id = 1) const int clip = 0; +layout (constant_id = 2) const float offset = 0; +layout (constant_id = 3) const float variances_0 = 0; +layout (constant_id = 4) const float variances_1 = 0; +layout (constant_id = 5) const float variances_2 = 0; +layout (constant_id = 6) const float variances_3 = 0; +layout (constant_id = 7) const int num_min_size = 0; +layout (constant_id = 8) const int num_max_size = 0; +layout (constant_id = 9) const int num_aspect_ratio = 0; +layout (constant_id = 10) const int num_prior = 0; + +#define shape_constant_id_offset 11 +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_fp16_packed +layout (binding = 0) writeonly buffer top_blob { vec4 top_blob_data[]; }; +#else +layout (binding = 0) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif +layout (binding = 1) readonly buffer min_sizes { sfp min_sizes_data[]; }; +layout (binding = 2) readonly buffer max_sizes { sfp max_sizes_data[]; }; +layout (binding = 3) readonly buffer aspect_ratios { sfp aspect_ratios_data[]; }; + +layout (push_constant) uniform parameter +{ + int w; + int h; + + float image_w; + float image_h; + float step_w; + float step_h; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= num_min_size || gy >= psc(w) || gz >= psc(h)) + return; + + // anchor and variance + int v_offset = (gz * psc(w) + gy) * num_prior + gx; + int var_offset = psc(w) * psc(h) * num_prior + v_offset; + + afp center_x = (afp(gy) + afp(offset)) * afp(p.step_w); + afp center_y = (afp(gz) + afp(offset)) * afp(p.step_h); + afpvec4 center = afpvec4(center_x, center_y, center_x, center_y); + + afpvec4 image_norm = afp(1.f) / afpvec4(p.image_w, p.image_h, p.image_w, p.image_h); + + afpvec4 box; + + afp box_w; + afp box_h; + + afp min_size = buffer_ld1(min_sizes_data, gx); + + afpvec4 variances = afpvec4(variances_0, variances_1, variances_2, variances_3); + + // min size box + box_w = box_h = min_size; + + box = (center + afpvec4(-box_w, -box_h, box_w, box_h) * afp(0.5f)) * image_norm; + box = clip == 1 ? clamp(box, afp(0.f), afp(1.f)) : box; + +#if NCNN_fp16_packed + top_blob_data[v_offset] = vec4(box); + top_blob_data[var_offset] = vec4(variances); +#else + buffer_st4(top_blob_data, v_offset, box); + buffer_st4(top_blob_data, var_offset, variances); +#endif + + v_offset += 1; + var_offset += 1; + + if (num_max_size > 0) + { + afp max_size = buffer_ld1(max_sizes_data, gx); + + // max size box + box_w = box_h = sqrt(min_size * max_size); + + box = (center + afpvec4(-box_w, -box_h, box_w, box_h) * afp(0.5f)) * image_norm; + box = clip == 1 ? 
clamp(box, afp(0.f), afp(1.f)) : box; + +#if NCNN_fp16_packed + top_blob_data[v_offset] = vec4(box); + top_blob_data[var_offset] = vec4(variances); +#else + buffer_st4(top_blob_data, v_offset, box); + buffer_st4(top_blob_data, var_offset, variances); +#endif + + v_offset += 1; + var_offset += 1; + } + + // all aspect_ratios + for (int pi = 0; pi < num_aspect_ratio; pi++) + { + afp ar = buffer_ld1(aspect_ratios_data, pi); + + box_w = min_size * sqrt(ar); + box_h = min_size / sqrt(ar); + + box = (center + afpvec4(-box_w, -box_h, box_w, box_h) * afp(0.5f)) * image_norm; + box = clip == 1 ? clamp(box, afp(0.f), afp(1.f)) : box; + +#if NCNN_fp16_packed + top_blob_data[v_offset] = vec4(box); + top_blob_data[var_offset] = vec4(variances); +#else + buffer_st4(top_blob_data, v_offset, box); + buffer_st4(top_blob_data, var_offset, variances); +#endif + + v_offset += 1; + var_offset += 1; + + if (flip == 1) + { + box = (center + afpvec4(-box_h, -box_w, box_h, box_w) * afp(0.5f)) * image_norm; + box = clip == 1 ? clamp(box, afp(0.f), afp(1.f)) : box; + +#if NCNN_fp16_packed + top_blob_data[v_offset] = vec4(box); + top_blob_data[var_offset] = vec4(variances); +#else + buffer_st4(top_blob_data, v_offset, box); + buffer_st4(top_blob_data, var_offset, variances); +#endif + + v_offset += 1; + var_offset += 1; + } + } +} diff --git a/source/device/vulkan/shaders/priorbox_mxnet.comp b/source/device/vulkan/shaders/priorbox_mxnet.comp new file mode 100644 index 000000000..bec66fde9 --- /dev/null +++ b/source/device/vulkan/shaders/priorbox_mxnet.comp @@ -0,0 +1,92 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
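priorbox_mxnet.comp below follows the mxnet _contrib_MultiBoxPrior convention instead: sizes are already fractions of the image, the ratio-1 box gets an h/w aspect correction on its half-width, and only the gx == 0 invocation appends the extra ratio-only boxes. A tiny C sketch of the half-extent arithmetic, with made-up numbers:

#include <math.h>
#include <stdio.h>

/* editorial sketch, not part of the patch: half-extents used by the
 * mxnet-style prior box kernel; all numbers are made up */
int main(void)
{
    const float w = 19.f, h = 19.f;   /* feature-map width and height */
    const float size = 0.2f;          /* fractional box size          */
    const float ratio = 2.f;

    float cw = size * 0.5f * h / w;   /* ratio == 1 half-width  */
    float ch = size * 0.5f;           /* ratio == 1 half-height */

    float cwr = cw * sqrtf(ratio);    /* extra ratio box, emitted by gx == 0 only */
    float chr = ch / sqrtf(ratio);

    printf("cw=%.4f ch=%.4f  cwr=%.4f chr=%.4f\n", cw, ch, cwr, chr);
    return 0;
}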
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int clip = 0; +layout (constant_id = 1) const float offset = 0; +layout (constant_id = 2) const int num_sizes = 0; +layout (constant_id = 3) const int num_ratios = 0; +layout (constant_id = 4) const int num_prior = 0; + +#define shape_constant_id_offset 5 +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +layout (binding = 1) readonly buffer min_sizes { sfp min_sizes_data[]; }; +layout (binding = 2) readonly buffer aspect_ratios { sfp aspect_ratios_data[]; }; + +layout (push_constant) uniform parameter +{ + int w; + int h; + + float step_w; + float step_h; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= num_sizes || gy >= psc(w) || gz >= psc(h)) + return; + + // mxnet style _contrib_MultiBoxPrior + int v_offset = (gz * psc(w) + gy) * num_prior + gx; + + afp center_x = (afp(gy) + afp(offset)) * afp(p.step_w); + afp center_y = (afp(gz) + afp(offset)) * afp(p.step_h); + afpvec4 center = afpvec4(center_x, center_y, center_x, center_y); + + // ratio = 1, various sizes + afp size = buffer_ld1(min_sizes_data, gx); + afp cw = size * afp(0.5f) * afp(psc(h)) / afp(psc(w)); + afp ch = size * afp(0.5f); + + afpvec4 box = center + afpvec4(-cw, -ch, cw, ch); + box = clip == 1 ? clamp(box, afp(0.f), afp(1.f)) : box; + + buffer_st4(top_blob_data, v_offset, box); + + if (gx == 0) + { + // various ratios, size = min_size = size[0] + for (int pi = 1; pi < num_ratios; pi++) + { + afp v = buffer_ld1(aspect_ratios_data, pi); + afp cwr = cw * sqrt(v); + afp chr = ch / sqrt(v); + + afpvec4 box = center + afpvec4(-cwr, -chr, cwr, chr); + box = clip == 1 ? clamp(box, afp(0.f), afp(1.f)) : box; + + buffer_st4(top_blob_data, v_offset + num_sizes - 1 + pi, box); + } + } +} diff --git a/source/device/vulkan/shaders/relu.comp b/source/device/vulkan/shaders/relu.comp new file mode 100644 index 000000000..cb08948d3 --- /dev/null +++ b/source/device/vulkan/shaders/relu.comp @@ -0,0 +1,107 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
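relu.comp below handles both plain and leaky ReLU in one kernel, selected by the slope specialization constant. A scalar C sketch of the same rule, for orientation:

#include <stdio.h>

/* editorial sketch, not part of the patch: the element-wise rule of relu.comp */
static float relu_apply(float v, float slope)
{
    if (slope == 0.f)
        return v > 0.f ? v : 0.f;     /* plain ReLU */
    return v < 0.f ? v * slope : v;   /* leaky ReLU */
}

int main(void)
{
    printf("%.2f %.2f\n", relu_apply(-2.f, 0.f), relu_apply(-2.f, 0.1f)); /* 0.00 -0.20 */
    return 0;
}

The pack4 and pack8 variants that follow apply the same rule per lane, using mix() with a lessThan() mask instead of the scalar ternary.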
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const float slope = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) buffer bottom_top_blob { sfp bottom_top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + afp v; + if (psc(dims) == 1) + { + v = image1d_ld1(bottom_blob_1d, gx); + } + else if (psc(dims) == 2) + { + v = image2d_ld1(bottom_blob_2d, ivec2(gx, gy)); + } + else // if (psc(dims) == 3) + { + v = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, gz)); + } +#else + const int gi = gz * psc(cstep) + gy * psc(w) + gx; + + afp v = buffer_ld1(bottom_top_blob_data, gi); +#endif + + if (slope == 0) + v = max(v, afp(0.f)); + else + v = v < afp(0.f) ? v * afp(slope) : v; + +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_st1(top_blob_1d, gx, v); + } + else if (psc(dims) == 2) + { + image2d_st1(top_blob_2d, ivec2(gx, gy), v); + } + else // if (psc(dims) == 3) + { + image3d_st1(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else + buffer_st1(bottom_top_blob_data, gi, v); +#endif +} diff --git a/source/device/vulkan/shaders/relu_pack4.comp b/source/device/vulkan/shaders/relu_pack4.comp new file mode 100644 index 000000000..cc02824cb --- /dev/null +++ b/source/device/vulkan/shaders/relu_pack4.comp @@ -0,0 +1,107 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const float slope = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) buffer bottom_top_blob { sfpvec4 bottom_top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + afpvec4 v; + if (psc(dims) == 1) + { + v = image1d_ld4(bottom_blob_1d, gx); + } + else if (psc(dims) == 2) + { + v = image2d_ld4(bottom_blob_2d, ivec2(gx, gy)); + } + else // if (psc(dims) == 3) + { + v = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, gz)); + } +#else + const int gi = gz * psc(cstep) + gy * psc(w) + gx; + + afpvec4 v = buffer_ld4(bottom_top_blob_data, gi); +#endif + + if (slope == 0) + v = max(v, afp(0.f)); + else + v = mix(v, v * afp(slope), lessThan(v, afpvec4(0.f))); + +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_st4(top_blob_1d, gx, v); + } + else if (psc(dims) == 2) + { + image2d_st4(top_blob_2d, ivec2(gx, gy), v); + } + else // if (psc(dims) == 3) + { + image3d_st4(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else + buffer_st4(bottom_top_blob_data, gi, v); +#endif +} diff --git a/source/device/vulkan/shaders/relu_pack8.comp b/source/device/vulkan/shaders/relu_pack8.comp new file mode 100644 index 000000000..25862cde3 --- /dev/null +++ b/source/device/vulkan/shaders/relu_pack8.comp @@ -0,0 +1,114 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const float slope = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) buffer bottom_top_blob { sfpvec8 bottom_top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + afpvec8 v; + if (psc(dims) == 1) + { + v = image1d_ld8(bottom_blob_1d, gx); + } + else if (psc(dims) == 2) + { + v = image2d_ld8(bottom_blob_2d, ivec2(gx, gy)); + } + else // if (psc(dims) == 3) + { + v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); + } +#else + const int gi = gz * psc(cstep) + gy * psc(w) + gx; + + afpvec8 v = buffer_ld8(bottom_top_blob_data, gi); +#endif + + if (slope == 0) + { + v[0] = max(v[0], afp(0.f)); + v[1] = max(v[1], afp(0.f)); + } + else + { + v[0] = mix(v[0], v[0] * afp(slope), lessThan(v[0], afpvec4(0.f))); + v[1] = mix(v[1], v[1] * afp(slope), lessThan(v[1], afpvec4(0.f))); + } + +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_st8(top_blob_1d, gx, v); + } + else if (psc(dims) == 2) + { + image2d_st8(top_blob_2d, ivec2(gx, gy), v); + } + else // if (psc(dims) == 3) + { + image3d_st8(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else + buffer_st8(bottom_top_blob_data, gi, v); +#endif +} diff --git a/source/device/vulkan/shaders/reshape.comp b/source/device/vulkan/shaders/reshape.comp new file mode 100644 index 000000000..3b2109789 --- /dev/null +++ b/source/device/vulkan/shaders/reshape.comp @@ -0,0 +1,138 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int ndim = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + int i; + + if (ndim == 1) i = gx; + if (ndim == 2) i = gy * psc(outw) + gx; + if (ndim == 3) i = gz * psc(outh) * psc(outw) + gy * psc(outw) + gx; + + int size = psc(w) * psc(h); + + int z = i / size; + int y = i % size / psc(w); + int x = i % size % psc(w); + +#if NCNN_image_shader + afp v; + + if (psc(dims) == 1) + { + v = image1d_ld1(bottom_blob_1d, x); + } + else if (psc(dims) == 2) + { + v = image2d_ld1(bottom_blob_2d, ivec2(x, y)); + } + else // if (psc(dims) == 3) + { + v = image3d_ld1(bottom_blob_3d, ivec3(x, y, z)); + } + + if (ndim == 1) + { + image1d_st1(top_blob_1d, gx, v); + } + if (ndim == 2) + { + image2d_st1(top_blob_2d, ivec2(gx, gy), v); + } + if (ndim == 3) + { + image3d_st1(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else + int v_offset = z * psc(cstep) + y * psc(w) + x; + + int gi; + if (ndim == 1) + { + gi = gx; + } + if (ndim == 2) + { + gi = gy * psc(outw) + gx; + } + if (ndim == 3) + { + gi = gz * psc(outcstep) + gy * psc(outw) + gx; + } + + buffer_cp1(top_blob_data, gi, bottom_blob_data, v_offset); +#endif +} diff --git a/source/device/vulkan/shaders/reshape_pack1to4.comp b/source/device/vulkan/shaders/reshape_pack1to4.comp new file mode 100644 index 000000000..9a33d7908 --- /dev/null +++ b/source/device/vulkan/shaders/reshape_pack1to4.comp @@ -0,0 +1,147 @@ +// Tencent is pleased to 
support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int ndim = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + ivec4 i4; + + if (ndim == 1) i4 = gx * 4 + ivec4(0, 1, 2, 3); + if (ndim == 2) i4 = (gy * 4) * psc(outw) + gx + ivec4(0, 1, 2, 3) * psc(outw); + if (ndim == 3) i4 = (gz * 4) * psc(outh) * psc(outw) + gy * psc(outw) + gx + ivec4(0, 1, 2, 3) * psc(outh) * psc(outw); + + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + +#if NCNN_image_shader + afpvec4 v; + + if (psc(dims) == 1) + { + v.r = image1d_ld1(bottom_blob_1d, x4.r); + v.g = image1d_ld1(bottom_blob_1d, x4.g); + v.b = image1d_ld1(bottom_blob_1d, x4.b); + v.a = image1d_ld1(bottom_blob_1d, x4.a); + } + else if (psc(dims) == 2) + { + v.r = image2d_ld1(bottom_blob_2d, ivec2(x4.r, y4.r)); + v.g = 
image2d_ld1(bottom_blob_2d, ivec2(x4.g, y4.g)); + v.b = image2d_ld1(bottom_blob_2d, ivec2(x4.b, y4.b)); + v.a = image2d_ld1(bottom_blob_2d, ivec2(x4.a, y4.a)); + } + else // if (psc(dims) == 3) + { + v.r = image3d_ld1(bottom_blob_3d, ivec3(x4.r, y4.r, z4.r)); + v.g = image3d_ld1(bottom_blob_3d, ivec3(x4.g, y4.g, z4.g)); + v.b = image3d_ld1(bottom_blob_3d, ivec3(x4.b, y4.b, z4.b)); + v.a = image3d_ld1(bottom_blob_3d, ivec3(x4.a, y4.a, z4.a)); + } + + if (ndim == 1) + { + image1d_st4(top_blob_1d, gx, v); + } + if (ndim == 2) + { + image2d_st4(top_blob_2d, ivec2(gx, gy), v); + } + if (ndim == 3) + { + image3d_st4(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else + ivec4 v_offset = z4 * psc(cstep) + y4 * psc(w) + x4; + + int gi; + if (ndim == 1) + { + gi = gx; + } + if (ndim == 2) + { + gi = gy * psc(outw) + gx; + } + if (ndim == 3) + { + gi = gz * psc(outcstep) + gy * psc(outw) + gx; + } + + buffer_cp1to4(top_blob_data, gi, bottom_blob_data, v_offset); +#endif +} diff --git a/source/device/vulkan/shaders/reshape_pack1to8.comp b/source/device/vulkan/shaders/reshape_pack1to8.comp new file mode 100644 index 000000000..93e096a51 --- /dev/null +++ b/source/device/vulkan/shaders/reshape_pack1to8.comp @@ -0,0 +1,177 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
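reshape_pack1to8.comp below gathers eight scalars from an unpacked source into one pack-8 output element. The index arithmetic is easiest to follow on the CPU; this C sketch reproduces the ndim == 3 case with illustrative shapes (cstep may carry row padding, so it is kept separate from w * h):

#include <stdio.h>

/* editorial sketch, not part of the patch: gather indices built by
 * reshape_pack1to8.comp for one output element, ndim == 3 case */
int main(void)
{
    const int w = 16, h = 8, cstep = 136;   /* unpacked source, padded cstep */
    const int outw = 4, outh = 8;           /* reshaped destination plane    */

    const int gx = 1, gy = 2, gz = 0;       /* one pack-8 output element     */

    for (int k = 0; k < 8; k++)
    {
        int i = (gz * 8 + k) * outh * outw + gy * outw + gx;  /* flat element index */

        int z = i / (w * h);                 /* source coordinate */
        int y = i % (w * h) / w;
        int x = i % (w * h) % w;

        printf("lane %d: i=%d -> src offset %d\n", k, i, z * cstep + y * w + x);
    }
    return 0;
}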
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int ndim = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + ivec4 i4; + ivec4 ii4; + + if (ndim == 1) + { + i4 = gx * 8 + ivec4(0, 1, 2, 3); + ii4 = i4 + 4; + } + if (ndim == 2) + { + i4 = (gy * 8) * psc(outw) + gx + ivec4(0, 1, 2, 3) * psc(outw); + ii4 = i4 + 4 * psc(outw); + } + if (ndim == 3) + { + i4 = (gz * 8) * psc(outh) * psc(outw) + gy * psc(outw) + gx + ivec4(0, 1, 2, 3) * psc(outh) * psc(outw); + ii4 = i4 + 4 * psc(outh) * psc(outw); + } + + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + ivec4 zz4 = ii4 / size; + ivec4 yy4 = ii4 % size / psc(w); + ivec4 xx4 = ii4 % size % psc(w); + +#if NCNN_image_shader + afpvec8 v; + + if (psc(dims) == 1) + { + v[0].r = image1d_ld1(bottom_blob_1d, x4.r); + v[0].g = image1d_ld1(bottom_blob_1d, x4.g); + v[0].b = image1d_ld1(bottom_blob_1d, x4.b); + v[0].a = image1d_ld1(bottom_blob_1d, x4.a); + v[1].r = image1d_ld1(bottom_blob_1d, xx4.r); + v[1].g = image1d_ld1(bottom_blob_1d, xx4.g); + v[1].b = image1d_ld1(bottom_blob_1d, xx4.b); + v[1].a = image1d_ld1(bottom_blob_1d, xx4.a); + } + else if (psc(dims) == 2) + { + v[0].r = image2d_ld1(bottom_blob_2d, ivec2(x4.r, y4.r)); + v[0].g = image2d_ld1(bottom_blob_2d, ivec2(x4.g, y4.g)); + v[0].b = image2d_ld1(bottom_blob_2d, ivec2(x4.b, y4.b)); + v[0].a = image2d_ld1(bottom_blob_2d, ivec2(x4.a, y4.a)); + v[1].r = image2d_ld1(bottom_blob_2d, ivec2(xx4.r, 
yy4.r)); + v[1].g = image2d_ld1(bottom_blob_2d, ivec2(xx4.g, yy4.g)); + v[1].b = image2d_ld1(bottom_blob_2d, ivec2(xx4.b, yy4.b)); + v[1].a = image2d_ld1(bottom_blob_2d, ivec2(xx4.a, yy4.a)); + } + else // if (psc(dims) == 3) + { + v[0].r = image3d_ld1(bottom_blob_3d, ivec3(x4.r, y4.r, z4.r)); + v[0].g = image3d_ld1(bottom_blob_3d, ivec3(x4.g, y4.g, z4.g)); + v[0].b = image3d_ld1(bottom_blob_3d, ivec3(x4.b, y4.b, z4.b)); + v[0].a = image3d_ld1(bottom_blob_3d, ivec3(x4.a, y4.a, z4.a)); + v[1].r = image3d_ld1(bottom_blob_3d, ivec3(xx4.r, yy4.r, zz4.r)); + v[1].g = image3d_ld1(bottom_blob_3d, ivec3(xx4.g, yy4.g, zz4.g)); + v[1].b = image3d_ld1(bottom_blob_3d, ivec3(xx4.b, yy4.b, zz4.b)); + v[1].a = image3d_ld1(bottom_blob_3d, ivec3(xx4.a, yy4.a, zz4.a)); + } + + if (ndim == 1) + { + image1d_st8(top_blob_1d, gx, v); + } + if (ndim == 2) + { + image2d_st8(top_blob_2d, ivec2(gx, gy), v); + } + if (ndim == 3) + { + image3d_st8(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else + ivec4 v_offset = z4 * psc(cstep) + y4 * psc(w) + x4; + ivec4 vv_offset = zz4 * psc(cstep) + yy4 * psc(w) + xx4; + + int gi; + if (ndim == 1) + { + gi = gx; + } + if (ndim == 2) + { + gi = gy * psc(outw) + gx; + } + if (ndim == 3) + { + gi = gz * psc(outcstep) + gy * psc(outw) + gx; + } + + buffer_cp1to8(top_blob_data, gi, bottom_blob_data, v_offset, vv_offset); +#endif +} diff --git a/source/device/vulkan/shaders/reshape_pack4.comp b/source/device/vulkan/shaders/reshape_pack4.comp new file mode 100644 index 000000000..6f85d9779 --- /dev/null +++ b/source/device/vulkan/shaders/reshape_pack4.comp @@ -0,0 +1,228 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
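reshape_pack4.comp below reshapes between two pack-4 layouts, so each output lane may come from a different input pack-4 slot; the kernel therefore loads four pack-4 values and picks one lane from each. Its plain buffer path relies on the addressing shown in this C sketch, where channel z sits in block z / 4, lane z % 4 (shapes are illustrative):

#include <stdio.h>

/* editorial sketch, not part of the patch: pack-4 scalar addressing assumed
 * by the buffer path of reshape_pack4.comp */
static int pack4_scalar_offset(int x, int y, int z, int w, int cstep)
{
    return ((z / 4) * cstep + y * w + x) * 4 + z % 4;
}

int main(void)
{
    const int w = 8, h = 4, cstep = w * h;
    printf("%d\n", pack4_scalar_offset(3, 1, 6, w, cstep)); /* block 1, lane 2 -> 174 */
    return 0;
}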
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int ndim = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else +#if NCNN_fp16_packed +layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; }; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +#endif +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + ivec4 i4; + + if (ndim == 1) i4 = gx * 4 + ivec4(0, 1, 2, 3); + if (ndim == 2) i4 = (gy * 4) * psc(outw) + gx + ivec4(0, 1, 2, 3) * psc(outw); + if (ndim == 3) i4 = (gz * 4) * psc(outh) * psc(outw) + gy * psc(outw) + gx + ivec4(0, 1, 2, 3) * psc(outh) * psc(outw); + +#if NCNN_image_shader + afpvec4 vr; + afpvec4 vg; + afpvec4 vb; + afpvec4 va; + + ivec4 lane4; + + if (psc(dims) == 1) + { + ivec4 x4 = i4; + + vr = image1d_ld4(bottom_blob_1d, x4.r / 4); + vg = image1d_ld4(bottom_blob_1d, x4.g / 4); + vb = image1d_ld4(bottom_blob_1d, x4.b / 4); + va = image1d_ld4(bottom_blob_1d, x4.a / 4); + + lane4 = x4 % 4; + } + else if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + + vr = image2d_ld4(bottom_blob_2d, ivec2(x4.r, y4.r / 4)); + vg = image2d_ld4(bottom_blob_2d, ivec2(x4.g, y4.g / 4)); + vb = image2d_ld4(bottom_blob_2d, ivec2(x4.b, y4.b / 4)); + va = image2d_ld4(bottom_blob_2d, ivec2(x4.a, y4.a / 4)); + + lane4 = y4 % 4; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + + vr = image3d_ld4(bottom_blob_3d, ivec3(x4.r, y4.r, z4.r / 4)); + vg = image3d_ld4(bottom_blob_3d, ivec3(x4.g, y4.g, z4.g / 4)); + vb = image3d_ld4(bottom_blob_3d, ivec3(x4.b, y4.b, z4.b 
/ 4)); + va = image3d_ld4(bottom_blob_3d, ivec3(x4.a, y4.a, z4.a / 4)); + + lane4 = z4 % 4; + } + + afpvec4 v = afpvec4(vr[lane4.r], vg[lane4.g], vb[lane4.b], va[lane4.a]); + + if (ndim == 1) + { + image1d_st4(top_blob_1d, gx, v); + } + if (ndim == 2) + { + image2d_st4(top_blob_2d, ivec2(gx, gy), v); + } + if (ndim == 3) + { + image3d_st4(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else +#if NCNN_fp16_packed + ivec4 v_offset; + ivec4 lane2; + + if (psc(dims) == 1) + { + v_offset = i4 / 2; + lane2 = i4 % 2; + } + else if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + + v_offset = ((y4 / 4) * psc(w) + x4) * 2 + (y4 % 4) / 2; + lane2 = y4 % 2; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + + v_offset = ((z4 / 4) * psc(cstep) + y4 * psc(w) + x4) * 2 + (z4 % 4) / 2; + lane2 = z4 % 2; + } + + int gi; + + if (ndim == 1) gi = gx; + if (ndim == 2) gi = gy * psc(outw) + gx; + if (ndim == 3) gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r); + afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g); + afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b); + afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a); + + afpvec4 v = afpvec4(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a]); + + buffer_st4(top_blob_data, gi, v); +#else + ivec4 v_offset; + + if (psc(dims) == 1) + { + v_offset = i4; + } + else if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + + v_offset = ((y4 / 4) * psc(w) + x4) * 4 + y4 % 4; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + + v_offset = ((z4 / 4) * psc(cstep) + y4 * psc(w) + x4) * 4 + z4 % 4; + } + + int gi; + + if (ndim == 1) gi = gx; + if (ndim == 2) gi = gy * psc(outw) + gx; + if (ndim == 3) gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_cp1to4(top_blob_data, gi, bottom_blob_data, v_offset); +#endif +#endif +} diff --git a/source/device/vulkan/shaders/reshape_pack4to1.comp b/source/device/vulkan/shaders/reshape_pack4to1.comp new file mode 100644 index 000000000..abf9331e7 --- /dev/null +++ b/source/device/vulkan/shaders/reshape_pack4to1.comp @@ -0,0 +1,166 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
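reshape_pack4to1.comp below moves data in the opposite direction from the pack1to4 kernel: it reads one pack-4 element, whose four lanes are consecutive along the source's packed axis (channels when dims == 3), and scatters them to four flat positions of the unpacked output. A C sketch of those destination indices for the 3-D case, with made-up shapes:

#include <stdio.h>

/* editorial sketch, not part of the patch: destination indices of one pack-4
 * element scattered by reshape_pack4to1.comp, 3-D case */
int main(void)
{
    const int w = 8, h = 4;               /* packed source plane          */
    const int gx = 2, gy = 1, gz = 3;     /* one pack-4 source element    */

    int z0 = gz * 4;                      /* first unpacked channel       */
    int stride = h * w;                   /* distance between the 4 lanes */
    int i0 = z0 * h * w + gy * w + gx;

    for (int k = 0; k < 4; k++)
        printf("lane %d -> flat index %d\n", k, i0 + k * stride);
    return 0;
}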
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int ndim = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + + ivec3 gxyz = ivec3(gx, gy, gz); + + gxyz[psc(dims) - 1] *= 4; + + int i4_0 = gxyz.z * psc(h) * psc(w) + gxyz.y * psc(w) + gxyz.x; + + ivec3 gxyz4 = ivec3(1, psc(w), psc(h) * psc(w)); + + ivec4 i4 = i4_0 + ivec4(0, 1, 2, 3) * gxyz4[psc(dims) - 1]; + +#if NCNN_image_shader + afpvec4 v; + + if (psc(dims) == 1) + { + v = image1d_ld4(bottom_blob_1d, gx); + } + else if (psc(dims) == 2) + { + v = image2d_ld4(bottom_blob_2d, ivec2(gx, gy)); + } + else // if (psc(dims) == 3) + { + v = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, gz)); + } + + if (ndim == 1) + { + ivec4 x4 = i4; + + image1d_st1(top_blob_1d, x4.r, v.r); + image1d_st1(top_blob_1d, x4.g, v.g); + image1d_st1(top_blob_1d, x4.b, v.b); + image1d_st1(top_blob_1d, x4.a, v.a); + } + if (ndim == 2) + { + ivec4 y4 = i4 / psc(outw); + ivec4 x4 = i4 % psc(outw); + + image2d_st1(top_blob_2d, ivec2(x4.r, y4.r), v.r); + image2d_st1(top_blob_2d, ivec2(x4.g, y4.g), v.g); + image2d_st1(top_blob_2d, ivec2(x4.b, y4.b), v.b); + image2d_st1(top_blob_2d, ivec2(x4.a, y4.a), v.a); + } + if (ndim == 3) + { + int size = psc(outw) * psc(outh); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(outw); + ivec4 x4 = i4 % size % psc(outw); + + image3d_st1(top_blob_3d, ivec3(x4.r, y4.r, z4.r), v.r); + image3d_st1(top_blob_3d, ivec3(x4.g, y4.g, z4.g), v.g); + image3d_st1(top_blob_3d, ivec3(x4.b, y4.b, z4.b), v.b); + image3d_st1(top_blob_3d, ivec3(x4.a, y4.a, z4.a), v.a); + } +#else + ivec4 
v_offset; + + if (ndim == 1) + { + v_offset = i4; + } + if (ndim == 2) + { + ivec4 y4 = i4 / psc(outw); + ivec4 x4 = i4 % psc(outw); + + v_offset = y4 * psc(outw) + x4; + } + if (ndim == 3) + { + int size = psc(outw) * psc(outh); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(outw); + ivec4 x4 = i4 % size % psc(outw); + + v_offset = z4 * psc(outcstep) + y4 * psc(outw) + x4; + } + + int gi = gz * psc(cstep) + gy * psc(w) + gx; + + buffer_cp4to1(top_blob_data, v_offset, bottom_blob_data, gi); +#endif +} diff --git a/source/device/vulkan/shaders/reshape_pack4to8.comp b/source/device/vulkan/shaders/reshape_pack4to8.comp new file mode 100644 index 000000000..c3950a1aa --- /dev/null +++ b/source/device/vulkan/shaders/reshape_pack4to8.comp @@ -0,0 +1,301 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int ndim = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else +#if NCNN_fp16_packed +layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; }; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +#endif +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int 
outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + ivec4 i4; + ivec4 ii4; + + if (ndim == 1) + { + i4 = gx * 8 + ivec4(0, 1, 2, 3); + ii4 = i4 + 4; + } + if (ndim == 2) + { + i4 = (gy * 8) * psc(outw) + gx + ivec4(0, 1, 2, 3) * psc(outw); + ii4 = i4 + 4 * psc(outw); + } + if (ndim == 3) + { + i4 = (gz * 8) * psc(outh) * psc(outw) + gy * psc(outw) + gx + ivec4(0, 1, 2, 3) * psc(outh) * psc(outw); + ii4 = i4 + 4 * psc(outh) * psc(outw); + } + +#if NCNN_image_shader + afpvec8 v; + + if (psc(dims) == 1) + { + ivec4 x4 = i4; + ivec4 xx4 = ii4; + + afpvec4 v0 = image1d_ld4(bottom_blob_1d, x4.r / 4); + afpvec4 v1 = image1d_ld4(bottom_blob_1d, x4.g / 4); + afpvec4 v2 = image1d_ld4(bottom_blob_1d, x4.b / 4); + afpvec4 v3 = image1d_ld4(bottom_blob_1d, x4.a / 4); + afpvec4 v4 = image1d_ld4(bottom_blob_1d, xx4.r / 4); + afpvec4 v5 = image1d_ld4(bottom_blob_1d, xx4.g / 4); + afpvec4 v6 = image1d_ld4(bottom_blob_1d, xx4.b / 4); + afpvec4 v7 = image1d_ld4(bottom_blob_1d, xx4.a / 4); + + v[0].r = v0[x4.r % 4]; + v[0].g = v1[x4.g % 4]; + v[0].b = v2[x4.b % 4]; + v[0].a = v3[x4.a % 4]; + v[1].r = v4[xx4.r % 4]; + v[1].g = v5[xx4.g % 4]; + v[1].b = v6[xx4.b % 4]; + v[1].a = v7[xx4.a % 4]; + } + else if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + ivec4 yy4 = ii4 / psc(w); + ivec4 xx4 = ii4 % psc(w); + + afpvec4 v0 = image2d_ld4(bottom_blob_2d, ivec2(x4.r, y4.r / 4)); + afpvec4 v1 = image2d_ld4(bottom_blob_2d, ivec2(x4.g, y4.g / 4)); + afpvec4 v2 = image2d_ld4(bottom_blob_2d, ivec2(x4.b, y4.b / 4)); + afpvec4 v3 = image2d_ld4(bottom_blob_2d, ivec2(x4.a, y4.a / 4)); + afpvec4 v4 = image2d_ld4(bottom_blob_2d, ivec2(xx4.r, yy4.r / 4)); + afpvec4 v5 = image2d_ld4(bottom_blob_2d, ivec2(xx4.g, yy4.g / 4)); + afpvec4 v6 = image2d_ld4(bottom_blob_2d, ivec2(xx4.b, yy4.b / 4)); + afpvec4 v7 = image2d_ld4(bottom_blob_2d, ivec2(xx4.a, yy4.a / 4)); + + v[0].r = v0[y4.r % 4]; + v[0].g = v1[y4.g % 4]; + v[0].b = v2[y4.b % 4]; + v[0].a = v3[y4.a % 4]; + v[1].r = v4[yy4.r % 4]; + v[1].g = v5[yy4.g % 4]; + v[1].b = v6[yy4.b % 4]; + v[1].a = v7[yy4.a % 4]; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + ivec4 zz4 = ii4 / size; + ivec4 yy4 = ii4 % size / psc(w); + ivec4 xx4 = ii4 % size % psc(w); + + afpvec4 v0 = image3d_ld4(bottom_blob_3d, ivec3(x4.r, y4.r, z4.r / 4)); + afpvec4 v1 = image3d_ld4(bottom_blob_3d, ivec3(x4.g, y4.g, z4.g / 4)); + afpvec4 v2 = image3d_ld4(bottom_blob_3d, ivec3(x4.b, y4.b, z4.b / 4)); + afpvec4 v3 = image3d_ld4(bottom_blob_3d, ivec3(x4.a, y4.a, z4.a / 4)); + afpvec4 v4 = image3d_ld4(bottom_blob_3d, ivec3(xx4.r, yy4.r, zz4.r / 4)); + afpvec4 v5 = image3d_ld4(bottom_blob_3d, ivec3(xx4.g, yy4.g, zz4.g / 4)); + afpvec4 v6 = image3d_ld4(bottom_blob_3d, ivec3(xx4.b, yy4.b, zz4.b / 4)); + afpvec4 v7 = image3d_ld4(bottom_blob_3d, ivec3(xx4.a, yy4.a, zz4.a / 4)); + + v[0].r = v0[z4.r % 4]; + v[0].g = v1[z4.g % 4]; + v[0].b = v2[z4.b % 4]; + v[0].a = v3[z4.a % 4]; + v[1].r = v4[zz4.r % 4]; + v[1].g = v5[zz4.g % 4]; + v[1].b = v6[zz4.b % 4]; + v[1].a = v7[zz4.a % 4]; + } + + if (ndim == 1) + { + image1d_st8(top_blob_1d, gx, v); + } + if (ndim == 2) + { + image2d_st8(top_blob_2d, ivec2(gx, gy), v); + } + if (ndim == 3) + { + image3d_st8(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else +#if 
NCNN_fp16_packed + ivec4 v_offset; + ivec4 vv_offset; + ivec4 lane2; + ivec4 lane4; + + if (psc(dims) == 1) + { + v_offset = i4 / 2; + lane2 = i4 % 2; + vv_offset = ii4 / 2; + lane4 = ii4 % 2; + } + else if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + ivec4 yy4 = ii4 / psc(w); + ivec4 xx4 = ii4 % psc(w); + + v_offset = ((y4 / 4) * psc(w) + x4) * 2 + (y4 % 4) / 2; + lane2 = y4 % 2; + vv_offset = ((yy4 / 4) * psc(w) + xx4) * 2 + (yy4 % 4) / 2; + lane4 = yy4 % 2; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + ivec4 zz4 = ii4 / size; + ivec4 yy4 = ii4 % size / psc(w); + ivec4 xx4 = ii4 % size % psc(w); + + v_offset = ((z4 / 4) * psc(cstep) + y4 * psc(w) + x4) * 2 + (z4 % 4) / 2; + lane2 = z4 % 2; + vv_offset = ((zz4 / 4) * psc(cstep) + yy4 * psc(w) + xx4) * 2 + (zz4 % 4) / 2; + lane4 = zz4 % 2; + } + + int gi; + + if (ndim == 1) gi = gx; + if (ndim == 2) gi = gy * psc(outw) + gx; + if (ndim == 3) gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r); + afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g); + afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b); + afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a); + + afpvec2 vvr = buffer_ld2(bottom_blob_data, vv_offset.r); + afpvec2 vvg = buffer_ld2(bottom_blob_data, vv_offset.g); + afpvec2 vvb = buffer_ld2(bottom_blob_data, vv_offset.b); + afpvec2 vva = buffer_ld2(bottom_blob_data, vv_offset.a); + + afpvec8 v = afpvec8(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a], vvr[lane4.r], vvg[lane4.g], vvb[lane4.b], vva[lane4.a]); + + buffer_st8(top_blob_data, gi, v); +#else + ivec4 v_offset; + ivec4 vv_offset; + + if (psc(dims) == 1) + { + v_offset = i4; + vv_offset = ii4; + } + else if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + ivec4 yy4 = ii4 / psc(w); + ivec4 xx4 = ii4 % psc(w); + + v_offset = ((y4 / 4) * psc(w) + x4) * 4 + y4 % 4; + vv_offset = ((yy4 / 4) * psc(w) + xx4) * 4 + yy4 % 4; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + ivec4 zz4 = ii4 / size; + ivec4 yy4 = ii4 % size / psc(w); + ivec4 xx4 = ii4 % size % psc(w); + + v_offset = ((z4 / 4) * psc(cstep) + y4 * psc(w) + x4) * 4 + z4 % 4; + vv_offset = ((zz4 / 4) * psc(cstep) + yy4 * psc(w) + xx4) * 4 + zz4 % 4; + } + + int gi; + + if (ndim == 1) gi = gx; + if (ndim == 2) gi = gy * psc(outw) + gx; + if (ndim == 3) gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_cp1to8(top_blob_data, gi, bottom_blob_data, v_offset, vv_offset); +#endif +#endif +} diff --git a/source/device/vulkan/shaders/reshape_pack8.comp b/source/device/vulkan/shaders/reshape_pack8.comp new file mode 100644 index 000000000..23ee23acb --- /dev/null +++ b/source/device/vulkan/shaders/reshape_pack8.comp @@ -0,0 +1,301 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int ndim = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else +#if NCNN_fp16_packed +layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; }; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +#endif +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + ivec4 i4; + ivec4 ii4; + + if (ndim == 1) + { + i4 = gx * 8 + ivec4(0, 1, 2, 3); + ii4 = i4 + 4; + } + if (ndim == 2) + { + i4 = (gy * 8) * psc(outw) + gx + ivec4(0, 1, 2, 3) * psc(outw); + ii4 = i4 + 4 * psc(outw); + } + if (ndim == 3) + { + i4 = (gz * 8) * psc(outh) * psc(outw) + gy * psc(outw) + gx + ivec4(0, 1, 2, 3) * psc(outh) * psc(outw); + ii4 = i4 + 4 * psc(outh) * psc(outw); + } + +#if NCNN_image_shader + afpvec8 v; + + if (psc(dims) == 1) + { + ivec4 x4 = i4; + ivec4 xx4 = ii4; + + afpvec8 v0 = image1d_ld8(bottom_blob_1d, x4.r / 8); + afpvec8 v1 = image1d_ld8(bottom_blob_1d, x4.g / 8); + afpvec8 v2 = image1d_ld8(bottom_blob_1d, x4.b / 8); + afpvec8 v3 = image1d_ld8(bottom_blob_1d, x4.a / 8); + afpvec8 v4 = image1d_ld8(bottom_blob_1d, xx4.r / 8); + afpvec8 v5 = image1d_ld8(bottom_blob_1d, xx4.g / 8); + afpvec8 v6 = 
image1d_ld8(bottom_blob_1d, xx4.b / 8); + afpvec8 v7 = image1d_ld8(bottom_blob_1d, xx4.a / 8); + + v[0].r = v0[(x4.r % 8) / 4][x4.r % 4]; + v[0].g = v1[(x4.g % 8) / 4][x4.g % 4]; + v[0].b = v2[(x4.b % 8) / 4][x4.b % 4]; + v[0].a = v3[(x4.a % 8) / 4][x4.a % 4]; + v[1].r = v4[(xx4.r % 8) / 4][xx4.r % 4]; + v[1].g = v5[(xx4.g % 8) / 4][xx4.g % 4]; + v[1].b = v6[(xx4.b % 8) / 4][xx4.b % 4]; + v[1].a = v7[(xx4.a % 8) / 4][xx4.a % 4]; + } + else if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + ivec4 yy4 = ii4 / psc(w); + ivec4 xx4 = ii4 % psc(w); + + afpvec8 v0 = image2d_ld8(bottom_blob_2d, ivec2(x4.r, y4.r / 8)); + afpvec8 v1 = image2d_ld8(bottom_blob_2d, ivec2(x4.g, y4.g / 8)); + afpvec8 v2 = image2d_ld8(bottom_blob_2d, ivec2(x4.b, y4.b / 8)); + afpvec8 v3 = image2d_ld8(bottom_blob_2d, ivec2(x4.a, y4.a / 8)); + afpvec8 v4 = image2d_ld8(bottom_blob_2d, ivec2(xx4.r, yy4.r / 8)); + afpvec8 v5 = image2d_ld8(bottom_blob_2d, ivec2(xx4.g, yy4.g / 8)); + afpvec8 v6 = image2d_ld8(bottom_blob_2d, ivec2(xx4.b, yy4.b / 8)); + afpvec8 v7 = image2d_ld8(bottom_blob_2d, ivec2(xx4.a, yy4.a / 8)); + + v[0].r = v0[(y4.r % 8) / 4][y4.r % 4]; + v[0].g = v1[(y4.g % 8) / 4][y4.g % 4]; + v[0].b = v2[(y4.b % 8) / 4][y4.b % 4]; + v[0].a = v3[(y4.a % 8) / 4][y4.a % 4]; + v[1].r = v4[(yy4.r % 8) / 4][yy4.r % 4]; + v[1].g = v5[(yy4.g % 8) / 4][yy4.g % 4]; + v[1].b = v6[(yy4.b % 8) / 4][yy4.b % 4]; + v[1].a = v7[(yy4.a % 8) / 4][yy4.a % 4]; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + ivec4 zz4 = ii4 / size; + ivec4 yy4 = ii4 % size / psc(w); + ivec4 xx4 = ii4 % size % psc(w); + + afpvec8 v0 = image3d_ld8(bottom_blob_3d, ivec3(x4.r, y4.r, z4.r / 8)); + afpvec8 v1 = image3d_ld8(bottom_blob_3d, ivec3(x4.g, y4.g, z4.g / 8)); + afpvec8 v2 = image3d_ld8(bottom_blob_3d, ivec3(x4.b, y4.b, z4.b / 8)); + afpvec8 v3 = image3d_ld8(bottom_blob_3d, ivec3(x4.a, y4.a, z4.a / 8)); + afpvec8 v4 = image3d_ld8(bottom_blob_3d, ivec3(xx4.r, yy4.r, zz4.r / 8)); + afpvec8 v5 = image3d_ld8(bottom_blob_3d, ivec3(xx4.g, yy4.g, zz4.g / 8)); + afpvec8 v6 = image3d_ld8(bottom_blob_3d, ivec3(xx4.b, yy4.b, zz4.b / 8)); + afpvec8 v7 = image3d_ld8(bottom_blob_3d, ivec3(xx4.a, yy4.a, zz4.a / 8)); + + v[0].r = v0[(z4.r % 8) / 4][z4.r % 4]; + v[0].g = v1[(z4.g % 8) / 4][z4.g % 4]; + v[0].b = v2[(z4.b % 8) / 4][z4.b % 4]; + v[0].a = v3[(z4.a % 8) / 4][z4.a % 4]; + v[1].r = v4[(zz4.r % 8) / 4][zz4.r % 4]; + v[1].g = v5[(zz4.g % 8) / 4][zz4.g % 4]; + v[1].b = v6[(zz4.b % 8) / 4][zz4.b % 4]; + v[1].a = v7[(zz4.a % 8) / 4][zz4.a % 4]; + } + + if (ndim == 1) + { + image1d_st8(top_blob_1d, gx, v); + } + if (ndim == 2) + { + image2d_st8(top_blob_2d, ivec2(gx, gy), v); + } + if (ndim == 3) + { + image3d_st8(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else +#if NCNN_fp16_packed + ivec4 v_offset; + ivec4 vv_offset; + ivec4 lane2; + ivec4 lane4; + + if (psc(dims) == 1) + { + v_offset = i4 / 2; + lane2 = i4 % 2; + vv_offset = ii4 / 2; + lane4 = ii4 % 2; + } + else if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + ivec4 yy4 = ii4 / psc(w); + ivec4 xx4 = ii4 % psc(w); + + v_offset = ((y4 / 8) * psc(w) + x4) * 4 + (y4 % 8) / 2; + lane2 = y4 % 2; + vv_offset = ((yy4 / 8) * psc(w) + xx4) * 4 + (yy4 % 8) / 2; + lane4 = yy4 % 2; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + ivec4 zz4 = ii4 / 
size; + ivec4 yy4 = ii4 % size / psc(w); + ivec4 xx4 = ii4 % size % psc(w); + + v_offset = ((z4 / 8) * psc(cstep) + y4 * psc(w) + x4) * 4 + (z4 % 8) / 2; + lane2 = z4 % 2; + vv_offset = ((zz4 / 8) * psc(cstep) + yy4 * psc(w) + xx4) * 4 + (zz4 % 8) / 2; + lane4 = zz4 % 2; + } + + int gi; + + if (ndim == 1) gi = gx; + if (ndim == 2) gi = gy * psc(outw) + gx; + if (ndim == 3) gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r); + afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g); + afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b); + afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a); + + afpvec2 vvr = buffer_ld2(bottom_blob_data, vv_offset.r); + afpvec2 vvg = buffer_ld2(bottom_blob_data, vv_offset.g); + afpvec2 vvb = buffer_ld2(bottom_blob_data, vv_offset.b); + afpvec2 vva = buffer_ld2(bottom_blob_data, vv_offset.a); + + afpvec8 v = afpvec8(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a], vvr[lane4.r], vvg[lane4.g], vvb[lane4.b], vva[lane4.a]); + + buffer_st8(top_blob_data, gi, v); +#else + ivec4 v_offset; + ivec4 vv_offset; + + if (psc(dims) == 1) + { + v_offset = i4; + vv_offset = ii4; + } + else if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + ivec4 yy4 = ii4 / psc(w); + ivec4 xx4 = ii4 % psc(w); + + v_offset = ((y4 / 8) * psc(w) + x4) * 8 + y4 % 8; + vv_offset = ((yy4 / 8) * psc(w) + xx4) * 8 + yy4 % 8; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + ivec4 zz4 = ii4 / size; + ivec4 yy4 = ii4 % size / psc(w); + ivec4 xx4 = ii4 % size % psc(w); + + v_offset = ((z4 / 8) * psc(cstep) + y4 * psc(w) + x4) * 8 + z4 % 8; + vv_offset = ((zz4 / 8) * psc(cstep) + yy4 * psc(w) + xx4) * 8 + zz4 % 8; + } + + int gi; + + if (ndim == 1) gi = gx; + if (ndim == 2) gi = gy * psc(outw) + gx; + if (ndim == 3) gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_cp1to8(top_blob_data, gi, bottom_blob_data, v_offset, vv_offset); +#endif +#endif +} diff --git a/source/device/vulkan/shaders/reshape_pack8to1.comp b/source/device/vulkan/shaders/reshape_pack8to1.comp new file mode 100644 index 000000000..05cd2f869 --- /dev/null +++ b/source/device/vulkan/shaders/reshape_pack8to1.comp @@ -0,0 +1,195 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int ndim = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + + ivec3 gxyz = ivec3(gx, gy, gz); + + gxyz[psc(dims) - 1] *= 8; + + int i4_0 = gxyz.z * psc(h) * psc(w) + gxyz.y * psc(w) + gxyz.x; + + ivec3 gxyz4 = ivec3(1, psc(w), psc(h) * psc(w)); + + ivec4 i4 = i4_0 + ivec4(0, 1, 2, 3) * gxyz4[psc(dims) - 1]; + ivec4 ii4 = i4 + 4 * gxyz4[psc(dims) - 1]; + +#if NCNN_image_shader + afpvec8 v; + + if (psc(dims) == 1) + { + v = image1d_ld8(bottom_blob_1d, gx); + } + else if (psc(dims) == 2) + { + v = image2d_ld8(bottom_blob_2d, ivec2(gx, gy)); + } + else // if (psc(dims) == 3) + { + v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); + } + + if (ndim == 1) + { + ivec4 x4 = i4; + ivec4 xx4 = ii4; + + image1d_st1(top_blob_1d, x4.r, v[0].r); + image1d_st1(top_blob_1d, x4.g, v[0].g); + image1d_st1(top_blob_1d, x4.b, v[0].b); + image1d_st1(top_blob_1d, x4.a, v[0].a); + image1d_st1(top_blob_1d, xx4.r, v[1].r); + image1d_st1(top_blob_1d, xx4.g, v[1].g); + image1d_st1(top_blob_1d, xx4.b, v[1].b); + image1d_st1(top_blob_1d, xx4.a, v[1].a); + } + if (ndim == 2) + { + ivec4 y4 = i4 / psc(outw); + ivec4 x4 = i4 % psc(outw); + ivec4 yy4 = ii4 / psc(outw); + ivec4 xx4 = ii4 % psc(outw); + + image2d_st1(top_blob_2d, ivec2(x4.r, y4.r), v[0].r); + image2d_st1(top_blob_2d, ivec2(x4.g, y4.g), v[0].g); + image2d_st1(top_blob_2d, ivec2(x4.b, y4.b), v[0].b); + image2d_st1(top_blob_2d, ivec2(x4.a, y4.a), v[0].a); + image2d_st1(top_blob_2d, ivec2(xx4.r, 
yy4.r), v[1].r); + image2d_st1(top_blob_2d, ivec2(xx4.g, yy4.g), v[1].g); + image2d_st1(top_blob_2d, ivec2(xx4.b, yy4.b), v[1].b); + image2d_st1(top_blob_2d, ivec2(xx4.a, yy4.a), v[1].a); + } + if (ndim == 3) + { + int size = psc(outw) * psc(outh); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(outw); + ivec4 x4 = i4 % size % psc(outw); + ivec4 zz4 = ii4 / size; + ivec4 yy4 = ii4 % size / psc(outw); + ivec4 xx4 = ii4 % size % psc(outw); + + image3d_st1(top_blob_3d, ivec3(x4.r, y4.r, z4.r), v[0].r); + image3d_st1(top_blob_3d, ivec3(x4.g, y4.g, z4.g), v[0].g); + image3d_st1(top_blob_3d, ivec3(x4.b, y4.b, z4.b), v[0].b); + image3d_st1(top_blob_3d, ivec3(x4.a, y4.a, z4.a), v[0].a); + image3d_st1(top_blob_3d, ivec3(xx4.r, yy4.r, zz4.r), v[1].r); + image3d_st1(top_blob_3d, ivec3(xx4.g, yy4.g, zz4.g), v[1].g); + image3d_st1(top_blob_3d, ivec3(xx4.b, yy4.b, zz4.b), v[1].b); + image3d_st1(top_blob_3d, ivec3(xx4.a, yy4.a, zz4.a), v[1].a); + } +#else + ivec4 v_offset; + ivec4 vv_offset; + + if (ndim == 1) + { + v_offset = i4; + vv_offset = ii4; + } + if (ndim == 2) + { + ivec4 y4 = i4 / psc(outw); + ivec4 x4 = i4 % psc(outw); + ivec4 yy4 = ii4 / psc(outw); + ivec4 xx4 = ii4 % psc(outw); + + v_offset = y4 * psc(outw) + x4; + vv_offset = yy4 * psc(outw) + xx4; + } + if (ndim == 3) + { + int size = psc(outw) * psc(outh); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(outw); + ivec4 x4 = i4 % size % psc(outw); + ivec4 zz4 = ii4 / size; + ivec4 yy4 = ii4 % size / psc(outw); + ivec4 xx4 = ii4 % size % psc(outw); + + v_offset = z4 * psc(outcstep) + y4 * psc(outw) + x4; + vv_offset = zz4 * psc(outcstep) + yy4 * psc(outw) + xx4; + } + + int gi = gz * psc(cstep) + gy * psc(w) + gx; + + buffer_cp8to1(top_blob_data, v_offset, vv_offset, bottom_blob_data, gi); +#endif +} diff --git a/source/device/vulkan/shaders/reshape_pack8to4.comp b/source/device/vulkan/shaders/reshape_pack8to4.comp new file mode 100644 index 000000000..558b07170 --- /dev/null +++ b/source/device/vulkan/shaders/reshape_pack8to4.comp @@ -0,0 +1,231 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int ndim = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +#else +#if NCNN_fp16_packed +layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; }; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +#endif +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + ivec4 i4; + + if (ndim == 1) i4 = gx * 4 + ivec4(0, 1, 2, 3); + if (ndim == 2) i4 = (gy * 4) * psc(outw) + gx + ivec4(0, 1, 2, 3) * psc(outw); + if (ndim == 3) i4 = (gz * 4) * psc(outh) * psc(outw) + gy * psc(outw) + gx + ivec4(0, 1, 2, 3) * psc(outh) * psc(outw); + +#if NCNN_image_shader + afpvec4 v; + + if (psc(dims) == 1) + { + ivec4 x4 = i4; + + afpvec8 v0 = image1d_ld8(bottom_blob_1d, x4.r / 8); + afpvec8 v1 = image1d_ld8(bottom_blob_1d, x4.g / 8); + afpvec8 v2 = image1d_ld8(bottom_blob_1d, x4.b / 8); + afpvec8 v3 = image1d_ld8(bottom_blob_1d, x4.a / 8); + + v.r = v0[(x4.r % 8) / 4][x4.r % 4]; + v.g = v1[(x4.g % 8) / 4][x4.g % 4]; + v.b = v2[(x4.b % 8) / 4][x4.b % 4]; + v.a = v3[(x4.a % 8) / 4][x4.a % 4]; + } + else if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + + afpvec8 v0 = image2d_ld8(bottom_blob_2d, ivec2(x4.r, y4.r / 8)); + afpvec8 v1 = image2d_ld8(bottom_blob_2d, ivec2(x4.g, y4.g / 8)); + afpvec8 v2 = image2d_ld8(bottom_blob_2d, ivec2(x4.b, y4.b / 8)); + afpvec8 v3 = image2d_ld8(bottom_blob_2d, ivec2(x4.a, y4.a / 8)); + + v.r = v0[(y4.r % 8) / 4][y4.r % 4]; + v.g = v1[(y4.g % 8) / 4][y4.g % 4]; + v.b = v2[(y4.b % 8) / 4][y4.b % 4]; + v.a = v3[(y4.a % 8) / 4][y4.a % 4]; + } + else // if (psc(dims) 
== 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + + afpvec8 v0 = image3d_ld8(bottom_blob_3d, ivec3(x4.r, y4.r, z4.r / 8)); + afpvec8 v1 = image3d_ld8(bottom_blob_3d, ivec3(x4.g, y4.g, z4.g / 8)); + afpvec8 v2 = image3d_ld8(bottom_blob_3d, ivec3(x4.b, y4.b, z4.b / 8)); + afpvec8 v3 = image3d_ld8(bottom_blob_3d, ivec3(x4.a, y4.a, z4.a / 8)); + + v.r = v0[(z4.r % 8) / 4][z4.r % 4]; + v.g = v1[(z4.g % 8) / 4][z4.g % 4]; + v.b = v2[(z4.b % 8) / 4][z4.b % 4]; + v.a = v3[(z4.a % 8) / 4][z4.a % 4]; + } + + if (ndim == 1) + { + image1d_st4(top_blob_1d, gx, v); + } + if (ndim == 2) + { + image2d_st4(top_blob_2d, ivec2(gx, gy), v); + } + if (ndim == 3) + { + image3d_st4(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else +#if NCNN_fp16_packed + ivec4 v_offset; + ivec4 lane2; + + if (psc(dims) == 1) + { + v_offset = i4 / 2; + lane2 = i4 % 2; + } + else if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + + v_offset = ((y4 / 8) * psc(w) + x4) * 4 + (y4 % 8) / 2; + lane2 = y4 % 2; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + + v_offset = ((z4 / 8) * psc(cstep) + y4 * psc(w) + x4) * 4 + (z4 % 8) / 2; + lane2 = z4 % 2; + } + + int gi; + + if (ndim == 1) gi = gx; + if (ndim == 2) gi = gy * psc(outw) + gx; + if (ndim == 3) gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r); + afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g); + afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b); + afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a); + + afpvec4 v = afpvec4(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a]); + + buffer_st4(top_blob_data, gi, v); +#else + ivec4 v_offset; + + if (psc(dims) == 1) + { + v_offset = i4; + } + else if (psc(dims) == 2) + { + ivec4 y4 = i4 / psc(w); + ivec4 x4 = i4 % psc(w); + + v_offset = ((y4 / 8) * psc(w) + x4) * 8 + y4 % 8; + } + else // if (psc(dims) == 3) + { + int size = psc(w) * psc(h); + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / psc(w); + ivec4 x4 = i4 % size % psc(w); + + v_offset = ((z4 / 8) * psc(cstep) + y4 * psc(w) + x4) * 8 + z4 % 8; + } + + int gi; + + if (ndim == 1) gi = gx; + if (ndim == 2) gi = gy * psc(outw) + gx; + if (ndim == 3) gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_cp1to4(top_blob_data, gi, bottom_blob_data, v_offset); +#endif +#endif +} diff --git a/source/device/vulkan/shaders/softmax_div_sum.comp b/source/device/vulkan/shaders/softmax_div_sum.comp new file mode 100644 index 000000000..5db4bd4a3 --- /dev/null +++ b/source/device/vulkan/shaders/softmax_div_sum.comp @@ -0,0 +1,166 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int axis = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +layout (binding = 2) uniform unfp sampler1D sum_workspace_1d; +layout (binding = 2) uniform unfp sampler2D sum_workspace_2d; +#else +layout (binding = 0) buffer bottom_top_blob { sfp bottom_top_blob_data[]; }; +layout (binding = 1) readonly buffer sum_workspace { sfp sum_workspace_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + afp v; + afp sum; + + if (psc(dims) == 1) // axis == 0 + { + v = image1d_ld1(bottom_blob_1d, gx); + sum = image1d_ld1(sum_workspace_1d, 0); + } + else if (psc(dims) == 2 && axis == 0) + { + v = image2d_ld1(bottom_blob_2d, ivec2(gx, gy)); + sum = image1d_ld1(sum_workspace_1d, gx); + } + else if (psc(dims) == 2 && axis == 1) + { + v = image2d_ld1(bottom_blob_2d, ivec2(gx, gy)); + sum = image1d_ld1(sum_workspace_1d, gy); + } + else if (psc(dims) == 3 && axis == 0) + { + v = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, gz)); + sum = image2d_ld1(sum_workspace_2d, ivec2(gx, gy)); + } + else if (psc(dims) == 3 && axis == 1) + { + v = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, gz)); + sum = image2d_ld1(sum_workspace_2d, ivec2(gx, gz)); + } + else if (psc(dims) == 3 && axis == 2) + { + v = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, gz)); + sum = image2d_ld1(sum_workspace_2d, ivec2(gy, gz)); + } +#else + const int gi = gz * psc(cstep) + gy * psc(w) + gx; + + afp v = buffer_ld1(bottom_top_blob_data, gi); + + afp sum; + + if (psc(dims) == 1) // axis == 0 + { + sum = buffer_ld1(sum_workspace_data, 0); + } + else if (psc(dims) == 2 && axis == 0) + { + sum = buffer_ld1(sum_workspace_data, gx); + } + else if (psc(dims) == 2 && axis == 1) + { + sum = buffer_ld1(sum_workspace_data, gy); 
+ } + else if (psc(dims) == 3 && axis == 0) + { + sum = buffer_ld1(sum_workspace_data, gy * psc(w) + gx); + } + else if (psc(dims) == 3 && axis == 1) + { + sum = buffer_ld1(sum_workspace_data, gz * psc(w) + gx); + } + else if (psc(dims) == 3 && axis == 2) + { + sum = buffer_ld1(sum_workspace_data, gz * psc(h) + gy); + } +#endif + + v /= sum; + +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_st1(top_blob_1d, gx, v); + } + else if (psc(dims) == 2) + { + image2d_st1(top_blob_2d, ivec2(gx, gy), v); + } + else // if (psc(dims) == 3) + { + image3d_st1(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else + buffer_st1(bottom_top_blob_data, gi, v); +#endif +} diff --git a/source/device/vulkan/shaders/softmax_div_sum_pack4.comp b/source/device/vulkan/shaders/softmax_div_sum_pack4.comp new file mode 100644 index 000000000..27b28bc9e --- /dev/null +++ b/source/device/vulkan/shaders/softmax_div_sum_pack4.comp @@ -0,0 +1,175 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int axis = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +layout (binding = 2) uniform unfp sampler1D sum_workspace_1d; +layout (binding = 2) uniform unfp sampler2D sum_workspace_2d; +#else +layout (binding = 0) buffer bottom_top_blob { sfpvec4 bottom_top_blob_data[]; }; +layout (binding = 1) readonly buffer sum_workspace { sfpvec4 sum_workspace_data[]; }; +#endif + +layout (push_constant) 
uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + afpvec4 v; + afpvec4 sum; + + if (psc(dims) == 1) // axis == 0 + { + v = image1d_ld4(bottom_blob_1d, gx); + sum = image1d_ld4(sum_workspace_1d, 0); + } + else if (psc(dims) == 2 && axis == 0) + { + v = image2d_ld4(bottom_blob_2d, ivec2(gx, gy)); + sum = image1d_ld4(sum_workspace_1d, gx); + } + else if (psc(dims) == 2 && axis == 1) + { + v = image2d_ld4(bottom_blob_2d, ivec2(gx, gy)); + sum = image1d_ld4(sum_workspace_1d, gy); + } + else if (psc(dims) == 3 && axis == 0) + { + v = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, gz)); + sum = image2d_ld4(sum_workspace_2d, ivec2(gx, gy)); + } + else if (psc(dims) == 3 && axis == 1) + { + v = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, gz)); + sum = image2d_ld4(sum_workspace_2d, ivec2(gx, gz)); + } + else if (psc(dims) == 3 && axis == 2) + { + v = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, gz)); + sum = image2d_ld4(sum_workspace_2d, ivec2(gy, gz)); + } +#else + const int gi = gz * psc(cstep) + gy * psc(w) + gx; + + afpvec4 v = buffer_ld4(bottom_top_blob_data, gi); + + afpvec4 sum; + + if (psc(dims) == 1) // axis == 0 + { + sum = buffer_ld4(sum_workspace_data, 0); + } + else if (psc(dims) == 2 && axis == 0) + { + sum = buffer_ld4(sum_workspace_data, gx); + } + else if (psc(dims) == 2 && axis == 1) + { + sum = buffer_ld4(sum_workspace_data, gy); + } + else if (psc(dims) == 3 && axis == 0) + { + sum = buffer_ld4(sum_workspace_data, gy * psc(w) + gx); + } + else if (psc(dims) == 3 && axis == 1) + { + sum = buffer_ld4(sum_workspace_data, gz * psc(w) + gx); + } + else if (psc(dims) == 3 && axis == 2) + { + sum = buffer_ld4(sum_workspace_data, gz * psc(h) + gy); + } + +#if NCNN_fp16_packed || NCNN_fp16_storage + // NOTE reduce max may produce (X, undef, X, undef) on nvidia fp16p/fp16s + // TODO only enable this workaround for some nvidia driver + if (axis == 0) + { + sum = afpvec4(sum.r); + } +#endif +#endif + + v /= sum; + +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_st4(top_blob_1d, gx, v); + } + else if (psc(dims) == 2) + { + image2d_st4(top_blob_2d, ivec2(gx, gy), v); + } + else // if (psc(dims) == 3) + { + image3d_st4(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else + buffer_st4(bottom_top_blob_data, gi, v); +#endif +} diff --git a/source/device/vulkan/shaders/softmax_div_sum_pack8.comp b/source/device/vulkan/shaders/softmax_div_sum_pack8.comp new file mode 100644 index 000000000..a329d3f93 --- /dev/null +++ b/source/device/vulkan/shaders/softmax_div_sum_pack8.comp @@ -0,0 +1,177 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int axis = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +layout (binding = 2) uniform unfp sampler1D sum_workspace_1d; +layout (binding = 2) uniform unfp sampler2D sum_workspace_2d; +#else +layout (binding = 0) buffer bottom_top_blob { sfpvec8 bottom_top_blob_data[]; }; +layout (binding = 1) readonly buffer sum_workspace { sfpvec8 sum_workspace_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + afpvec8 v; + afpvec8 sum; + + if (psc(dims) == 1) // axis == 0 + { + v = image1d_ld8(bottom_blob_1d, gx); + sum = image1d_ld8(sum_workspace_1d, 0); + } + else if (psc(dims) == 2 && axis == 0) + { + v = image2d_ld8(bottom_blob_2d, ivec2(gx, gy)); + sum = image1d_ld8(sum_workspace_1d, gx); + } + else if (psc(dims) == 2 && axis == 1) + { + v = image2d_ld8(bottom_blob_2d, ivec2(gx, gy)); + sum = image1d_ld8(sum_workspace_1d, gy); + } + else if (psc(dims) == 3 && axis == 0) + { + v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); + sum = image2d_ld8(sum_workspace_2d, ivec2(gx, gy)); + } + else if (psc(dims) == 3 && axis == 1) + { + v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); + sum = image2d_ld8(sum_workspace_2d, ivec2(gx, gz)); + } + else if (psc(dims) == 3 && axis == 2) + { + v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); + sum = image2d_ld8(sum_workspace_2d, ivec2(gy, gz)); + } +#else + const int gi = gz * psc(cstep) + gy * psc(w) + gx; + + afpvec8 v = buffer_ld8(bottom_top_blob_data, gi); + + afpvec8 sum; + + if (psc(dims) == 1) // axis == 0 + { + sum = buffer_ld8(sum_workspace_data, 0); + } + 
else if (psc(dims) == 2 && axis == 0) + { + sum = buffer_ld8(sum_workspace_data, gx); + } + else if (psc(dims) == 2 && axis == 1) + { + sum = buffer_ld8(sum_workspace_data, gy); + } + else if (psc(dims) == 3 && axis == 0) + { + sum = buffer_ld8(sum_workspace_data, gy * psc(w) + gx); + } + else if (psc(dims) == 3 && axis == 1) + { + sum = buffer_ld8(sum_workspace_data, gz * psc(w) + gx); + } + else if (psc(dims) == 3 && axis == 2) + { + sum = buffer_ld8(sum_workspace_data, gz * psc(h) + gy); + } + +#if NCNN_fp16_packed || NCNN_fp16_storage + // NOTE reduce max may produce (X, undef, X, undef) on nvidia fp16p/fp16s + // TODO only enable this workaround for some nvidia driver + if (axis == 0) + { + sum = afpvec8(afpvec4(sum[0].r), afpvec4(sum[0].r)); + } +#endif +#endif + + v[0] /= sum[0]; + v[1] /= sum[1]; + +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_st8(top_blob_1d, gx, v); + } + else if (psc(dims) == 2) + { + image2d_st8(top_blob_2d, ivec2(gx, gy), v); + } + else // if (psc(dims) == 3) + { + image3d_st8(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else + buffer_st8(bottom_top_blob_data, gi, v); +#endif +} diff --git a/source/device/vulkan/shaders/softmax_exp_sub_max.comp b/source/device/vulkan/shaders/softmax_exp_sub_max.comp new file mode 100644 index 000000000..210284df4 --- /dev/null +++ b/source/device/vulkan/shaders/softmax_exp_sub_max.comp @@ -0,0 +1,166 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int axis = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob_3d; +layout (binding = 2) uniform unfp sampler1D max_workspace_1d; +layout (binding = 2) uniform unfp sampler2D max_workspace_2d; +#else +layout (binding = 0) buffer bottom_top_blob { sfp bottom_top_blob_data[]; }; +layout (binding = 1) readonly buffer max_workspace { sfp max_workspace_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + afp v; + afp max_value; + + if (psc(dims) == 1) // axis == 0 + { + v = image1d_ld1(bottom_blob_1d, gx); + max_value = image1d_ld1(max_workspace_1d, 0); + } + else if (psc(dims) == 2 && axis == 0) + { + v = image2d_ld1(bottom_blob_2d, ivec2(gx, gy)); + max_value = image1d_ld1(max_workspace_1d, gx); + } + else if (psc(dims) == 2 && axis == 1) + { + v = image2d_ld1(bottom_blob_2d, ivec2(gx, gy)); + max_value = image1d_ld1(max_workspace_1d, gy); + } + else if (psc(dims) == 3 && axis == 0) + { + v = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, gz)); + max_value = image2d_ld1(max_workspace_2d, ivec2(gx, gy)); + } + else if (psc(dims) == 3 && axis == 1) + { + v = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, gz)); + max_value = image2d_ld1(max_workspace_2d, ivec2(gx, gz)); + } + else if (psc(dims) == 3 && axis == 2) + { + v = image3d_ld1(bottom_blob_3d, ivec3(gx, gy, gz)); + max_value = image2d_ld1(max_workspace_2d, ivec2(gy, gz)); + } +#else + const int gi = gz * psc(cstep) + gy * psc(w) + gx; + + afp v = buffer_ld1(bottom_top_blob_data, gi); + + afp max_value; + + if (psc(dims) == 1) // axis == 0 + { + max_value = buffer_ld1(max_workspace_data, 0); + } + else if (psc(dims) == 2 && axis == 0) + { + max_value = buffer_ld1(max_workspace_data, gx); + } + else if (psc(dims) == 2 && 
axis == 1) + { + max_value = buffer_ld1(max_workspace_data, gy); + } + else if (psc(dims) == 3 && axis == 0) + { + max_value = buffer_ld1(max_workspace_data, gy * psc(w) + gx); + } + else if (psc(dims) == 3 && axis == 1) + { + max_value = buffer_ld1(max_workspace_data, gz * psc(w) + gx); + } + else if (psc(dims) == 3 && axis == 2) + { + max_value = buffer_ld1(max_workspace_data, gz * psc(h) + gy); + } +#endif + + v = exp(v - max_value); + +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_st1(top_blob_1d, gx, v); + } + else if (psc(dims) == 2) + { + image2d_st1(top_blob_2d, ivec2(gx, gy), v); + } + else // if (psc(dims) == 3) + { + image3d_st1(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else + buffer_st1(bottom_top_blob_data, gi, v); +#endif +} diff --git a/source/device/vulkan/shaders/softmax_exp_sub_max_pack4.comp b/source/device/vulkan/shaders/softmax_exp_sub_max_pack4.comp new file mode 100644 index 000000000..2aba5894f --- /dev/null +++ b/source/device/vulkan/shaders/softmax_exp_sub_max_pack4.comp @@ -0,0 +1,175 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int axis = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +layout (binding = 2) uniform unfp sampler1D max_workspace_1d; +layout (binding = 2) uniform unfp sampler2D max_workspace_2d; +#else +layout (binding = 0) buffer bottom_top_blob { sfpvec4 bottom_top_blob_data[]; }; +layout 
(binding = 1) readonly buffer max_workspace { sfpvec4 max_workspace_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + afpvec4 v; + afpvec4 max_value; + + if (psc(dims) == 1) // axis == 0 + { + v = image1d_ld4(bottom_blob_1d, gx); + max_value = image1d_ld4(max_workspace_1d, 0); + } + else if (psc(dims) == 2 && axis == 0) + { + v = image2d_ld4(bottom_blob_2d, ivec2(gx, gy)); + max_value = image1d_ld4(max_workspace_1d, gx); + } + else if (psc(dims) == 2 && axis == 1) + { + v = image2d_ld4(bottom_blob_2d, ivec2(gx, gy)); + max_value = image1d_ld4(max_workspace_1d, gy); + } + else if (psc(dims) == 3 && axis == 0) + { + v = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, gz)); + max_value = image2d_ld4(max_workspace_2d, ivec2(gx, gy)); + } + else if (psc(dims) == 3 && axis == 1) + { + v = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, gz)); + max_value = image2d_ld4(max_workspace_2d, ivec2(gx, gz)); + } + else if (psc(dims) == 3 && axis == 2) + { + v = image3d_ld4(bottom_blob_3d, ivec3(gx, gy, gz)); + max_value = image2d_ld4(max_workspace_2d, ivec2(gy, gz)); + } +#else + const int gi = gz * psc(cstep) + gy * psc(w) + gx; + + afpvec4 v = buffer_ld4(bottom_top_blob_data, gi); + + afpvec4 max_value; + + if (psc(dims) == 1) // axis == 0 + { + max_value = buffer_ld4(max_workspace_data, 0); + } + else if (psc(dims) == 2 && axis == 0) + { + max_value = buffer_ld4(max_workspace_data, gx); + } + else if (psc(dims) == 2 && axis == 1) + { + max_value = buffer_ld4(max_workspace_data, gy); + } + else if (psc(dims) == 3 && axis == 0) + { + max_value = buffer_ld4(max_workspace_data, gy * psc(w) + gx); + } + else if (psc(dims) == 3 && axis == 1) + { + max_value = buffer_ld4(max_workspace_data, gz * psc(w) + gx); + } + else if (psc(dims) == 3 && axis == 2) + { + max_value = buffer_ld4(max_workspace_data, gz * psc(h) + gy); + } + +#if NCNN_fp16_packed || NCNN_fp16_storage + // NOTE reduce max may produce (X, undef, X, undef) on nvidia fp16p/fp16s + // TODO only enable this workaround for some nvidia driver + if (axis == 0) + { + max_value = afpvec4(max_value.r); + } +#endif +#endif + + v = exp(v - max_value); + +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_st4(top_blob_1d, gx, v); + } + else if (psc(dims) == 2) + { + image2d_st4(top_blob_2d, ivec2(gx, gy), v); + } + else // if (psc(dims) == 3) + { + image3d_st4(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else + buffer_st4(bottom_top_blob_data, gi, v); +#endif +} diff --git a/source/device/vulkan/shaders/softmax_exp_sub_max_pack8.comp b/source/device/vulkan/shaders/softmax_exp_sub_max_pack8.comp new file mode 100644 index 000000000..374c5d927 --- /dev/null +++ b/source/device/vulkan/shaders/softmax_exp_sub_max_pack8.comp @@ -0,0 +1,177 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int axis = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D top_blob_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D top_blob_2d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob_3d; +layout (binding = 2) uniform unfp sampler1D max_workspace_1d; +layout (binding = 2) uniform unfp sampler2D max_workspace_2d; +#else +layout (binding = 0) buffer bottom_top_blob { sfpvec8 bottom_top_blob_data[]; }; +layout (binding = 1) readonly buffer max_workspace { sfpvec8 max_workspace_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(w) || gy >= psc(h) || gz >= psc(c)) + return; + +#if NCNN_image_shader + afpvec8 v; + afpvec8 max_value; + + if (psc(dims) == 1) // axis == 0 + { + v = image1d_ld8(bottom_blob_1d, gx); + max_value = image1d_ld8(max_workspace_1d, 0); + } + else if (psc(dims) == 2 && axis == 0) + { + v = image2d_ld8(bottom_blob_2d, ivec2(gx, gy)); + max_value = image1d_ld8(max_workspace_1d, gx); + } + else if (psc(dims) == 2 && axis == 1) + { + v = image2d_ld8(bottom_blob_2d, ivec2(gx, gy)); + max_value = image1d_ld8(max_workspace_1d, gy); + } + else if (psc(dims) == 3 && axis == 0) + { + v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); + max_value = image2d_ld8(max_workspace_2d, ivec2(gx, gy)); + } + else if (psc(dims) == 3 && axis == 1) + { + v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); + max_value = image2d_ld8(max_workspace_2d, ivec2(gx, gz)); + } + else if (psc(dims) == 3 
&& axis == 2) + { + v = image3d_ld8(bottom_blob_3d, ivec3(gx, gy, gz)); + max_value = image2d_ld8(max_workspace_2d, ivec2(gy, gz)); + } +#else + const int gi = gz * psc(cstep) + gy * psc(w) + gx; + + afpvec8 v = buffer_ld8(bottom_top_blob_data, gi); + + afpvec8 max_value; + + if (psc(dims) == 1) // axis == 0 + { + max_value = buffer_ld8(max_workspace_data, 0); + } + else if (psc(dims) == 2 && axis == 0) + { + max_value = buffer_ld8(max_workspace_data, gx); + } + else if (psc(dims) == 2 && axis == 1) + { + max_value = buffer_ld8(max_workspace_data, gy); + } + else if (psc(dims) == 3 && axis == 0) + { + max_value = buffer_ld8(max_workspace_data, gy * psc(w) + gx); + } + else if (psc(dims) == 3 && axis == 1) + { + max_value = buffer_ld8(max_workspace_data, gz * psc(w) + gx); + } + else if (psc(dims) == 3 && axis == 2) + { + max_value = buffer_ld8(max_workspace_data, gz * psc(h) + gy); + } + +#if NCNN_fp16_packed || NCNN_fp16_storage + // NOTE reduce max may produce (X, undef, X, undef) on nvidia fp16p/fp16s + // TODO only enable this workaround for some nvidia driver + if (axis == 0) + { + max_value = afpvec8(afpvec4(max_value[0].r), afpvec4(max_value[0].r)); + } +#endif +#endif + + v[0] = exp(v[0] - max_value[0]); + v[1] = exp(v[1] - max_value[1]); + +#if NCNN_image_shader + if (psc(dims) == 1) + { + image1d_st8(top_blob_1d, gx, v); + } + else if (psc(dims) == 2) + { + image2d_st8(top_blob_2d, ivec2(gx, gy), v); + } + else // if (psc(dims) == 3) + { + image3d_st8(top_blob_3d, ivec3(gx, gy, gz), v); + } +#else + buffer_st8(bottom_top_blob_data, gi, v); +#endif +} diff --git a/source/device/vulkan/shaders/softmax_reduce_max.comp b/source/device/vulkan/shaders/softmax_reduce_max.comp new file mode 100644 index 000000000..42271ccb5 --- /dev/null +++ b/source/device/vulkan/shaders/softmax_reduce_max.comp @@ -0,0 +1,198 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
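The exp-sub-max shaders above read a per-axis maximum out of max_workspace; the softmax_reduce_max.comp pass that follows is what fills that workspace. For reference, a minimal CPU sketch of the full decomposition these shaders implement, written for a single row of length w (the function name and the final normalization step are illustrative; the normalization pass itself is not part of this hunk):

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // Reference softmax over one row, in the same pass order as the shaders:
    // 1) reduce_max  2) exp(x - max)  3) reduce_sum  4) divide by the sum.
    static void softmax_row_reference(std::vector<float>& x)
    {
        float max_value = *std::max_element(x.begin(), x.end()); // reduce_max
        float sum_value = 0.f;
        for (float& v : x)
        {
            v = std::exp(v - max_value); // exp_sub_max, in place
            sum_value += v;              // reduce_sum
        }
        for (float& v : x)
            v /= sum_value;              // final normalization
    }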
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int axis = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_top_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_top_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_top_blob_3d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image1D max_workspace_1d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image2D max_workspace_2d; +#else +layout (binding = 0) readonly buffer bottom_top_blob { sfp bottom_top_blob_data[]; }; +layout (binding = 1) writeonly buffer max_workspace { sfp max_workspace_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afp max_value = afp(-99999999.f); + + if (psc(dims) == 1) // axis == 0 + { + for (int i = 0; i < psc(w); i++) + { +#if NCNN_image_shader + afp v = image1d_ld1(bottom_top_blob_1d, i); +#else + afp v = buffer_ld1(bottom_top_blob_data, i); +#endif + max_value = max(max_value, v); + } +#if NCNN_image_shader + image1d_st1(max_workspace_1d, 0, max_value); +#else + buffer_st1(max_workspace_data, 0, max_value); +#endif + return; + } + + if (psc(dims) == 2 && axis == 0) + { + for (int i = 0; i < psc(h); i++) + { +#if NCNN_image_shader + afp v = image2d_ld1(bottom_top_blob_2d, ivec2(gx, i)); +#else + int v_offset = i * psc(w) + gx; + afp v = buffer_ld1(bottom_top_blob_data, v_offset); +#endif + max_value = max(max_value, v); + } +#if NCNN_image_shader + image1d_st1(max_workspace_1d, gx, max_value); +#else + buffer_st1(max_workspace_data, gx, max_value); +#endif + return; + } + + if (psc(dims) == 2 && axis == 1) + { + for (int i = 0; i < psc(w); i++) + { +#if NCNN_image_shader + afp v = image2d_ld1(bottom_top_blob_2d, ivec2(i, gx)); +#else + int v_offset = gx * psc(w) + i; + afp v = buffer_ld1(bottom_top_blob_data, v_offset); +#endif + max_value = max(max_value, v); + } +#if NCNN_image_shader + image1d_st1(max_workspace_1d, gx, max_value); +#else + buffer_st1(max_workspace_data, gx, max_value); +#endif + return; + } + + if (psc(dims) == 3 && axis == 0) + { + for (int i = 0; i < psc(c); i++) + { +#if NCNN_image_shader + afp v = 
image3d_ld1(bottom_top_blob_3d, ivec3(gx, gy, i)); +#else + int v_offset = i * psc(cstep) + gy * psc(w) + gx; + afp v = buffer_ld1(bottom_top_blob_data, v_offset); +#endif + max_value = max(max_value, v); + } +#if NCNN_image_shader + image2d_st1(max_workspace_2d, ivec2(gx, gy), max_value); +#else + buffer_st1(max_workspace_data, gy * psc(w) + gx, max_value); +#endif + return; + } + + if (psc(dims) == 3 && axis == 1) + { + for (int i = 0; i < psc(h); i++) + { +#if NCNN_image_shader + afp v = image3d_ld1(bottom_top_blob_3d, ivec3(gx, i, gy)); +#else + int v_offset = gy * psc(cstep) + i * psc(w) + gx; + afp v = buffer_ld1(bottom_top_blob_data, v_offset); +#endif + max_value = max(max_value, v); + } +#if NCNN_image_shader + image2d_st1(max_workspace_2d, ivec2(gx, gy), max_value); +#else + buffer_st1(max_workspace_data, gy * psc(w) + gx, max_value); +#endif + return; + } + + if (psc(dims) == 3 && axis == 2) + { + for (int i = 0; i < psc(w); i++) + { +#if NCNN_image_shader + afp v = image3d_ld1(bottom_top_blob_3d, ivec3(i, gx, gy)); +#else + int v_offset = gy * psc(cstep) + gx * psc(w) + i; + afp v = buffer_ld1(bottom_top_blob_data, v_offset); +#endif + max_value = max(max_value, v); + } +#if NCNN_image_shader + image2d_st1(max_workspace_2d, ivec2(gx, gy), max_value); +#else + buffer_st1(max_workspace_data, gy * psc(h) + gx, max_value); +#endif + return; + } +} diff --git a/source/device/vulkan/shaders/softmax_reduce_max_pack4.comp b/source/device/vulkan/shaders/softmax_reduce_max_pack4.comp new file mode 100644 index 000000000..6de110db9 --- /dev/null +++ b/source/device/vulkan/shaders/softmax_reduce_max_pack4.comp @@ -0,0 +1,204 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
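The scalar reduce-max pass above walks the reduced axis with plain offset arithmetic over the flat layout gi = z * cstep + y * w + x used throughout these shaders. Before the pack4 variant below, a CPU sketch of the same index math for its dims == 3 branches (names are illustrative):

    #include <algorithm>

    // Max over one axis of a w*h*c blob stored flat as gi = z*cstep + y*w + x,
    // mirroring the v_offset arithmetic in softmax_reduce_max.comp (dims == 3).
    static float reduce_max_3d(const float* data, int w, int h, int c, int cstep,
                               int axis, int gx, int gy)
    {
        float max_value = -99999999.f;
        if (axis == 0)      // over channels, one result per (x = gx, y = gy)
            for (int i = 0; i < c; i++) max_value = std::max(max_value, data[i * cstep + gy * w + gx]);
        else if (axis == 1) // over rows, one result per (x = gx, z = gy)
            for (int i = 0; i < h; i++) max_value = std::max(max_value, data[gy * cstep + i * w + gx]);
        else                // axis == 2: over columns, one result per (y = gx, z = gy)
            for (int i = 0; i < w; i++) max_value = std::max(max_value, data[gy * cstep + gx * w + i]);
        return max_value;
    }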
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int axis = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_top_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_top_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_top_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D max_workspace_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D max_workspace_2d; +#else +layout (binding = 0) readonly buffer bottom_top_blob { sfpvec4 bottom_top_blob_data[]; }; +layout (binding = 1) writeonly buffer max_workspace { sfpvec4 max_workspace_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec4 max_value = afpvec4(-99999999.f); + + if (psc(dims) == 1) // axis == 0 + { + for (int i = 0; i < psc(w); i++) + { +#if NCNN_image_shader + afpvec4 v = image1d_ld4(bottom_top_blob_1d, i); +#else + afpvec4 v = buffer_ld4(bottom_top_blob_data, i); +#endif + max_value = max(max_value, v); + } + afpvec2 max2 = max(max_value.rg, max_value.ba); + max_value = afpvec4(max(max2.r, max2.g)); +#if NCNN_image_shader + image1d_st4(max_workspace_1d, 0, max_value); +#else + buffer_st4(max_workspace_data, 0, max_value); +#endif + return; + } + + if (psc(dims) == 2 && axis == 0) + { + for (int i = 0; i < psc(h); i++) + { +#if NCNN_image_shader + afpvec4 v = image2d_ld4(bottom_top_blob_2d, ivec2(gx, i)); +#else + int v_offset = i * psc(w) + gx; + afpvec4 v = buffer_ld4(bottom_top_blob_data, v_offset); +#endif + max_value = max(max_value, v); + } + afpvec2 max2 = max(max_value.rg, max_value.ba); + max_value = afpvec4(max(max2.r, max2.g)); +#if NCNN_image_shader + image1d_st4(max_workspace_1d, gx, max_value); +#else + buffer_st4(max_workspace_data, gx, max_value); +#endif + return; + } + + if (psc(dims) == 2 && axis == 1) + { + for (int i = 0; i < psc(w); i++) + { +#if NCNN_image_shader + afpvec4 v = image2d_ld4(bottom_top_blob_2d, ivec2(i, gx)); +#else + int v_offset = gx * psc(w) + i; + afpvec4 v = buffer_ld4(bottom_top_blob_data, v_offset); +#endif + max_value = max(max_value, v); + } +#if NCNN_image_shader + image1d_st4(max_workspace_1d, gx, 
max_value); +#else + buffer_st4(max_workspace_data, gx, max_value); +#endif + return; + } + + if (psc(dims) == 3 && axis == 0) + { + for (int i = 0; i < psc(c); i++) + { +#if NCNN_image_shader + afpvec4 v = image3d_ld4(bottom_top_blob_3d, ivec3(gx, gy, i)); +#else + int v_offset = i * psc(cstep) + gy * psc(w) + gx; + afpvec4 v = buffer_ld4(bottom_top_blob_data, v_offset); +#endif + max_value = max(max_value, v); + } + afpvec2 max2 = max(max_value.rg, max_value.ba); + max_value = afpvec4(max(max2.r, max2.g)); +#if NCNN_image_shader + image2d_st4(max_workspace_2d, ivec2(gx, gy), max_value); +#else + buffer_st4(max_workspace_data, gy * psc(w) + gx, max_value); +#endif + return; + } + + if (psc(dims) == 3 && axis == 1) + { + for (int i = 0; i < psc(h); i++) + { +#if NCNN_image_shader + afpvec4 v = image3d_ld4(bottom_top_blob_3d, ivec3(gx, i, gy)); +#else + int v_offset = gy * psc(cstep) + i * psc(w) + gx; + afpvec4 v = buffer_ld4(bottom_top_blob_data, v_offset); +#endif + max_value = max(max_value, v); + } +#if NCNN_image_shader + image2d_st4(max_workspace_2d, ivec2(gx, gy), max_value); +#else + buffer_st4(max_workspace_data, gy * psc(w) + gx, max_value); +#endif + return; + } + + if (psc(dims) == 3 && axis == 2) + { + for (int i = 0; i < psc(w); i++) + { +#if NCNN_image_shader + afpvec4 v = image3d_ld4(bottom_top_blob_3d, ivec3(i, gx, gy)); +#else + int v_offset = gy * psc(cstep) + gx * psc(w) + i; + afpvec4 v = buffer_ld4(bottom_top_blob_data, v_offset); +#endif + max_value = max(max_value, v); + } +#if NCNN_image_shader + image2d_st4(max_workspace_2d, ivec2(gx, gy), max_value); +#else + buffer_st4(max_workspace_data, gy * psc(h) + gx, max_value); +#endif + return; + } +} diff --git a/source/device/vulkan/shaders/softmax_reduce_max_pack8.comp b/source/device/vulkan/shaders/softmax_reduce_max_pack8.comp new file mode 100644 index 000000000..66073dad9 --- /dev/null +++ b/source/device/vulkan/shaders/softmax_reduce_max_pack8.comp @@ -0,0 +1,217 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
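In the pack4 reduce-max above, every element carries four channels, so whenever the reduction runs along the packed axis (the axis == 0 branches) the shader finishes with a cross-lane fold, max(max_value.rg, max_value.ba) followed by a scalar max, and broadcasts the result to all four lanes. The pack8 variant below stores each element as two vec4 halves (the sfpvec8 struct) and applies the same fold to both halves. A CPU sketch of that fold, with an illustrative stand-in type:

    #include <algorithm>

    struct vec8 { float v[8]; }; // stand-in for the shader's two-vec4 sfpvec8

    // Horizontal max over the 8 packed lanes, broadcast back to every lane,
    // like the max4 -> max2 -> max1 fold in softmax_reduce_max_pack8.comp.
    static vec8 fold_max_lanes(vec8 x)
    {
        float m = x.v[0];
        for (int i = 1; i < 8; i++)
            m = std::max(m, x.v[i]);
        vec8 out;
        for (int i = 0; i < 8; i++)
            out.v[i] = m;
        return out;
    }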
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int axis = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_top_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_top_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_top_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D max_workspace_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D max_workspace_2d; +#else +layout (binding = 0) readonly buffer bottom_top_blob { sfpvec8 bottom_top_blob_data[]; }; +layout (binding = 1) writeonly buffer max_workspace { sfpvec8 max_workspace_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec8 max_value = afpvec8(afpvec4(-99999999.f), afpvec4(-99999999.f)); + + if (psc(dims) == 1) // axis == 0 + { + for (int i = 0; i < psc(w); i++) + { +#if NCNN_image_shader + afpvec8 v = image1d_ld8(bottom_top_blob_1d, i); +#else + afpvec8 v = buffer_ld8(bottom_top_blob_data, i); +#endif + max_value[0] = max(max_value[0], v[0]); + max_value[1] = max(max_value[1], v[1]); + } + afpvec4 max4 = max(max_value[0], max_value[1]); + afpvec2 max2 = max(max4.rg, max4.ba); + afp max1 = max(max2.r, max2.g); + max_value = afpvec8(afpvec4(max1), afpvec4(max1)); +#if NCNN_image_shader + image1d_st8(max_workspace_1d, 0, max_value); +#else + buffer_st8(max_workspace_data, 0, max_value); +#endif + return; + } + + if (psc(dims) == 2 && axis == 0) + { + for (int i = 0; i < psc(h); i++) + { +#if NCNN_image_shader + afpvec8 v = image2d_ld8(bottom_top_blob_2d, ivec2(gx, i)); +#else + int v_offset = i * psc(w) + gx; + afpvec8 v = buffer_ld8(bottom_top_blob_data, v_offset); +#endif + max_value[0] = max(max_value[0], v[0]); + max_value[1] = max(max_value[1], v[1]); + } + afpvec4 max4 = max(max_value[0], max_value[1]); + afpvec2 max2 = max(max4.rg, max4.ba); + afp max1 = max(max2.r, max2.g); + max_value = afpvec8(afpvec4(max1), afpvec4(max1)); +#if NCNN_image_shader + image1d_st8(max_workspace_1d, gx, max_value); +#else + buffer_st8(max_workspace_data, gx, max_value); +#endif + return; + } + + if (psc(dims) == 2 
&& axis == 1) + { + for (int i = 0; i < psc(w); i++) + { +#if NCNN_image_shader + afpvec8 v = image2d_ld8(bottom_top_blob_2d, ivec2(i, gx)); +#else + int v_offset = gx * psc(w) + i; + afpvec8 v = buffer_ld8(bottom_top_blob_data, v_offset); +#endif + max_value[0] = max(max_value[0], v[0]); + max_value[1] = max(max_value[1], v[1]); + } +#if NCNN_image_shader + image1d_st8(max_workspace_1d, gx, max_value); +#else + buffer_st8(max_workspace_data, gx, max_value); +#endif + return; + } + + if (psc(dims) == 3 && axis == 0) + { + for (int i = 0; i < psc(c); i++) + { +#if NCNN_image_shader + afpvec8 v = image3d_ld8(bottom_top_blob_3d, ivec3(gx, gy, i)); +#else + int v_offset = i * psc(cstep) + gy * psc(w) + gx; + afpvec8 v = buffer_ld8(bottom_top_blob_data, v_offset); +#endif + max_value[0] = max(max_value[0], v[0]); + max_value[1] = max(max_value[1], v[1]); + } + afpvec4 max4 = max(max_value[0], max_value[1]); + afpvec2 max2 = max(max4.rg, max4.ba); + afp max1 = max(max2.r, max2.g); + max_value = afpvec8(afpvec4(max1), afpvec4(max1)); +#if NCNN_image_shader + image2d_st8(max_workspace_2d, ivec2(gx, gy), max_value); +#else + buffer_st8(max_workspace_data, gy * psc(w) + gx, max_value); +#endif + return; + } + + if (psc(dims) == 3 && axis == 1) + { + for (int i = 0; i < psc(h); i++) + { +#if NCNN_image_shader + afpvec8 v = image3d_ld8(bottom_top_blob_3d, ivec3(gx, i, gy)); +#else + int v_offset = gy * psc(cstep) + i * psc(w) + gx; + afpvec8 v = buffer_ld8(bottom_top_blob_data, v_offset); +#endif + max_value[0] = max(max_value[0], v[0]); + max_value[1] = max(max_value[1], v[1]); + } +#if NCNN_image_shader + image2d_st8(max_workspace_2d, ivec2(gx, gy), max_value); +#else + buffer_st8(max_workspace_data, gy * psc(w) + gx, max_value); +#endif + return; + } + + if (psc(dims) == 3 && axis == 2) + { + for (int i = 0; i < psc(w); i++) + { +#if NCNN_image_shader + afpvec8 v = image3d_ld8(bottom_top_blob_3d, ivec3(i, gx, gy)); +#else + int v_offset = gy * psc(cstep) + gx * psc(w) + i; + afpvec8 v = buffer_ld8(bottom_top_blob_data, v_offset); +#endif + max_value[0] = max(max_value[0], v[0]); + max_value[1] = max(max_value[1], v[1]); + } +#if NCNN_image_shader + image2d_st8(max_workspace_2d, ivec2(gx, gy), max_value); +#else + buffer_st8(max_workspace_data, gy * psc(h) + gx, max_value); +#endif + return; + } +} diff --git a/source/device/vulkan/shaders/softmax_reduce_sum.comp b/source/device/vulkan/shaders/softmax_reduce_sum.comp new file mode 100644 index 000000000..b38d16454 --- /dev/null +++ b/source/device/vulkan/shaders/softmax_reduce_sum.comp @@ -0,0 +1,198 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
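The reduce-sum pass below mirrors reduce-max with addition in place of max, and its workspace, like max_workspace, has the input shape with the reduced axis dropped; that is what the outw/outh/outc constants and the store indices encode. A small sketch of that shape mapping for dims == 3, assuming the same blob layout (names illustrative):

    // Workspace extents for the reduce passes: the input shape with the
    // reduced axis removed (dims == 3 shown; dims == 1 and 2 reduce to a
    // single value or a single row respectively).
    static void reduce_workspace_shape(int w, int h, int c, int axis,
                                       int& outw, int& outh)
    {
        if (axis == 0)      { outw = w; outh = h; } // reduce over channels
        else if (axis == 1) { outw = w; outh = c; } // reduce over rows
        else                { outw = h; outh = c; } // reduce over columns
    }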
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int axis = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_top_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_top_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_top_blob_3d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image1D sum_workspace_1d; +layout (binding = 1, imfmtc1) writeonly uniform unfp image2D sum_workspace_2d; +#else +layout (binding = 0) readonly buffer bottom_top_blob { sfp bottom_top_blob_data[]; }; +layout (binding = 1) writeonly buffer sum_workspace { sfp sum_workspace_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afp sum_value = afp(0.f); + + if (psc(dims) == 1) // axis == 0 + { + for (int i = 0; i < psc(w); i++) + { +#if NCNN_image_shader + afp v = image1d_ld1(bottom_top_blob_1d, i); +#else + afp v = buffer_ld1(bottom_top_blob_data, i); +#endif + sum_value += v; + } +#if NCNN_image_shader + image1d_st1(sum_workspace_1d, 0, sum_value); +#else + buffer_st1(sum_workspace_data, 0, sum_value); +#endif + return; + } + + if (psc(dims) == 2 && axis == 0) + { + for (int i = 0; i < psc(h); i++) + { +#if NCNN_image_shader + afp v = image2d_ld1(bottom_top_blob_2d, ivec2(gx, i)); +#else + int v_offset = i * psc(w) + gx; + afp v = buffer_ld1(bottom_top_blob_data, v_offset); +#endif + sum_value += v; + } +#if NCNN_image_shader + image1d_st1(sum_workspace_1d, gx, sum_value); +#else + buffer_st1(sum_workspace_data, gx, sum_value); +#endif + return; + } + + if (psc(dims) == 2 && axis == 1) + { + for (int i = 0; i < psc(w); i++) + { +#if NCNN_image_shader + afp v = image2d_ld1(bottom_top_blob_2d, ivec2(i, gx)); +#else + int v_offset = gx * psc(w) + i; + afp v = buffer_ld1(bottom_top_blob_data, v_offset); +#endif + sum_value += v; + } +#if NCNN_image_shader + image1d_st1(sum_workspace_1d, gx, sum_value); +#else + buffer_st1(sum_workspace_data, gx, sum_value); +#endif + return; + } + + if (psc(dims) == 3 && axis == 0) + { + for (int i = 0; i < psc(c); i++) + { +#if NCNN_image_shader + afp v = image3d_ld1(bottom_top_blob_3d, ivec3(gx, gy, i)); +#else + int v_offset = i * 
psc(cstep) + gy * psc(w) + gx; + afp v = buffer_ld1(bottom_top_blob_data, v_offset); +#endif + sum_value += v; + } +#if NCNN_image_shader + image2d_st1(sum_workspace_2d, ivec2(gx, gy), sum_value); +#else + buffer_st1(sum_workspace_data, gy * psc(w) + gx, sum_value); +#endif + return; + } + + if (psc(dims) == 3 && axis == 1) + { + for (int i = 0; i < psc(h); i++) + { +#if NCNN_image_shader + afp v = image3d_ld1(bottom_top_blob_3d, ivec3(gx, i, gy)); +#else + int v_offset = gy * psc(cstep) + i * psc(w) + gx; + afp v = buffer_ld1(bottom_top_blob_data, v_offset); +#endif + sum_value += v; + } +#if NCNN_image_shader + image2d_st1(sum_workspace_2d, ivec2(gx, gy), sum_value); +#else + buffer_st1(sum_workspace_data, gy * psc(w) + gx, sum_value); +#endif + return; + } + + if (psc(dims) == 3 && axis == 2) + { + for (int i = 0; i < psc(w); i++) + { +#if NCNN_image_shader + afp v = image3d_ld1(bottom_top_blob_3d, ivec3(i, gx, gy)); +#else + int v_offset = gy * psc(cstep) + gx * psc(w) + i; + afp v = buffer_ld1(bottom_top_blob_data, v_offset); +#endif + sum_value += v; + } +#if NCNN_image_shader + image2d_st1(sum_workspace_2d, ivec2(gx, gy), sum_value); +#else + buffer_st1(sum_workspace_data, gy * psc(h) + gx, sum_value); +#endif + return; + } +} diff --git a/source/device/vulkan/shaders/softmax_reduce_sum_pack4.comp b/source/device/vulkan/shaders/softmax_reduce_sum_pack4.comp new file mode 100644 index 000000000..40b035ac3 --- /dev/null +++ b/source/device/vulkan/shaders/softmax_reduce_sum_pack4.comp @@ -0,0 +1,204 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
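All of these kernels bound-check and index through psc(...). In the ncnn shader convention this code is ported from, psc(x) is expected to resolve to the specialization constant x when the blob shape was baked in at pipeline creation, falling back to the push-constant field p.x otherwise; the macro itself lives in the generated shader preamble rather than in this diff, so the definition below is the assumed form, not part of the patch:

    // Assumed ncnn-style helper: use the baked specialization constant when it
    // is non-zero, otherwise read the value from the push-constant block p.
    #define psc(x) (x == 0 ? p.x : x)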
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int axis = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_top_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_top_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_top_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D sum_workspace_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D sum_workspace_2d; +#else +layout (binding = 0) readonly buffer bottom_top_blob { sfpvec4 bottom_top_blob_data[]; }; +layout (binding = 1) writeonly buffer sum_workspace { sfpvec4 sum_workspace_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec4 sum_value = afpvec4(0.f); + + if (psc(dims) == 1) // axis == 0 + { + for (int i = 0; i < psc(w); i++) + { +#if NCNN_image_shader + afpvec4 v = image1d_ld4(bottom_top_blob_1d, i); +#else + afpvec4 v = buffer_ld4(bottom_top_blob_data, i); +#endif + sum_value += v; + } + afpvec2 sum2 = sum_value.rg + sum_value.ba; + sum_value = afpvec4(sum2.r + sum2.g); +#if NCNN_image_shader + image1d_st4(sum_workspace_1d, 0, sum_value); +#else + buffer_st4(sum_workspace_data, 0, sum_value); +#endif + return; + } + + if (psc(dims) == 2 && axis == 0) + { + for (int i = 0; i < psc(h); i++) + { +#if NCNN_image_shader + afpvec4 v = image2d_ld4(bottom_top_blob_2d, ivec2(gx, i)); +#else + int v_offset = i * psc(w) + gx; + afpvec4 v = buffer_ld4(bottom_top_blob_data, v_offset); +#endif + sum_value += v; + } + afpvec2 sum2 = sum_value.rg + sum_value.ba; + sum_value = afpvec4(sum2.r + sum2.g); +#if NCNN_image_shader + image1d_st4(sum_workspace_1d, gx, sum_value); +#else + buffer_st4(sum_workspace_data, gx, sum_value); +#endif + return; + } + + if (psc(dims) == 2 && axis == 1) + { + for (int i = 0; i < psc(w); i++) + { +#if NCNN_image_shader + afpvec4 v = image2d_ld4(bottom_top_blob_2d, ivec2(i, gx)); +#else + int v_offset = gx * psc(w) + i; + afpvec4 v = buffer_ld4(bottom_top_blob_data, v_offset); +#endif + sum_value += v; + } +#if NCNN_image_shader + image1d_st4(sum_workspace_1d, gx, sum_value); +#else + buffer_st4(sum_workspace_data, gx, sum_value); 
+#endif + return; + } + + if (psc(dims) == 3 && axis == 0) + { + for (int i = 0; i < psc(c); i++) + { +#if NCNN_image_shader + afpvec4 v = image3d_ld4(bottom_top_blob_3d, ivec3(gx, gy, i)); +#else + int v_offset = i * psc(cstep) + gy * psc(w) + gx; + afpvec4 v = buffer_ld4(bottom_top_blob_data, v_offset); +#endif + sum_value += v; + } + afpvec2 sum2 = sum_value.rg + sum_value.ba; + sum_value = afpvec4(sum2.r + sum2.g); +#if NCNN_image_shader + image2d_st4(sum_workspace_2d, ivec2(gx, gy), sum_value); +#else + buffer_st4(sum_workspace_data, gy * psc(w) + gx, sum_value); +#endif + return; + } + + if (psc(dims) == 3 && axis == 1) + { + for (int i = 0; i < psc(h); i++) + { +#if NCNN_image_shader + afpvec4 v = image3d_ld4(bottom_top_blob_3d, ivec3(gx, i, gy)); +#else + int v_offset = gy * psc(cstep) + i * psc(w) + gx; + afpvec4 v = buffer_ld4(bottom_top_blob_data, v_offset); +#endif + sum_value += v; + } +#if NCNN_image_shader + image2d_st4(sum_workspace_2d, ivec2(gx, gy), sum_value); +#else + buffer_st4(sum_workspace_data, gy * psc(w) + gx, sum_value); +#endif + return; + } + + if (psc(dims) == 3 && axis == 2) + { + for (int i = 0; i < psc(w); i++) + { +#if NCNN_image_shader + afpvec4 v = image3d_ld4(bottom_top_blob_3d, ivec3(i, gx, gy)); +#else + int v_offset = gy * psc(cstep) + gx * psc(w) + i; + afpvec4 v = buffer_ld4(bottom_top_blob_data, v_offset); +#endif + sum_value += v; + } +#if NCNN_image_shader + image2d_st4(sum_workspace_2d, ivec2(gx, gy), sum_value); +#else + buffer_st4(sum_workspace_data, gy * psc(h) + gx, sum_value); +#endif + return; + } +} diff --git a/source/device/vulkan/shaders/softmax_reduce_sum_pack8.comp b/source/device/vulkan/shaders/softmax_reduce_sum_pack8.comp new file mode 100644 index 000000000..a4a88024b --- /dev/null +++ b/source/device/vulkan/shaders/softmax_reduce_sum_pack8.comp @@ -0,0 +1,211 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
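The reduce-sum pack8 shader below completes the set of softmax reduction kernels. The reason the maximum is subtracted before exponentiation at all is numerical: exp() overflows 32-bit float for large logits, while exp(x - max) never exceeds 1. A small self-contained check (the values are chosen purely for illustration):

    #include <cmath>
    #include <cstdio>

    // Both columns are the same softmax mathematically; only the
    // max-subtracted form stays finite in 32-bit float.
    int main()
    {
        float x[2] = { 1000.f, 1001.f };
        float m = std::fmax(x[0], x[1]);
        float naive[2], stable[2];
        for (int i = 0; i < 2; i++)
        {
            naive[i] = std::exp(x[i]);      // overflows to inf
            stable[i] = std::exp(x[i] - m); // stays in (0, 1]
        }
        float naive_sum = naive[0] + naive[1];
        float stable_sum = stable[0] + stable[1];
        for (int i = 0; i < 2; i++)
            std::printf("naive %f   stable %f\n", naive[i] / naive_sum, stable[i] / stable_sum);
        return 0;
    }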
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int axis = 0; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler1D bottom_top_blob_1d; +layout (binding = 0) uniform unfp sampler2D bottom_top_blob_2d; +layout (binding = 0) uniform unfp sampler3D bottom_top_blob_3d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image1D sum_workspace_1d; +layout (binding = 1, imfmtc4) writeonly uniform unfp image2D sum_workspace_2d; +#else +layout (binding = 0) readonly buffer bottom_top_blob { sfpvec8 bottom_top_blob_data[]; }; +layout (binding = 1) writeonly buffer sum_workspace { sfpvec8 sum_workspace_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + return; + + afpvec8 sum_value = afpvec8(afpvec4(0.f), afpvec4(0.f)); + + if (psc(dims) == 1) // axis == 0 + { + for (int i = 0; i < psc(w); i++) + { +#if NCNN_image_shader + afpvec8 v = image1d_ld8(bottom_top_blob_1d, i); +#else + afpvec8 v = buffer_ld8(bottom_top_blob_data, i); +#endif + sum_value += v; + } + afpvec4 sum4 = sum_value[0] + sum_value[1]; + afpvec2 sum2 = sum4.rg + sum4.ba; + afp sum1 = sum2.r + sum2.g; + sum_value = afpvec8(afpvec4(sum1), afpvec4(sum1)); +#if NCNN_image_shader + image1d_st8(sum_workspace_1d, 0, sum_value); +#else + buffer_st8(sum_workspace_data, 0, sum_value); +#endif + return; + } + + if (psc(dims) == 2 && axis == 0) + { + for (int i = 0; i < psc(h); i++) + { +#if NCNN_image_shader + afpvec8 v = image2d_ld8(bottom_top_blob_2d, ivec2(gx, i)); +#else + int v_offset = i * psc(w) + gx; + afpvec8 v = buffer_ld8(bottom_top_blob_data, v_offset); +#endif + sum_value += v; + } + afpvec4 sum4 = sum_value[0] + sum_value[1]; + afpvec2 sum2 = sum4.rg + sum4.ba; + afp sum1 = sum2.r + sum2.g; + sum_value = afpvec8(afpvec4(sum1), afpvec4(sum1)); +#if NCNN_image_shader + image1d_st8(sum_workspace_1d, gx, sum_value); +#else + buffer_st8(sum_workspace_data, gx, sum_value); +#endif + return; + } + + if (psc(dims) == 2 && axis == 1) + { + for (int i = 0; i < psc(w); i++) + { +#if NCNN_image_shader + afpvec8 v = image2d_ld8(bottom_top_blob_2d, ivec2(i, gx)); +#else + int v_offset = gx * 
psc(w) + i; + afpvec8 v = buffer_ld8(bottom_top_blob_data, v_offset); +#endif + sum_value += v; + } +#if NCNN_image_shader + image1d_st8(sum_workspace_1d, gx, sum_value); +#else + buffer_st8(sum_workspace_data, gx, sum_value); +#endif + return; + } + + if (psc(dims) == 3 && axis == 0) + { + for (int i = 0; i < psc(c); i++) + { +#if NCNN_image_shader + afpvec8 v = image3d_ld8(bottom_top_blob_3d, ivec3(gx, gy, i)); +#else + int v_offset = i * psc(cstep) + gy * psc(w) + gx; + afpvec8 v = buffer_ld8(bottom_top_blob_data, v_offset); +#endif + sum_value += v; + } + afpvec4 sum4 = sum_value[0] + sum_value[1]; + afpvec2 sum2 = sum4.rg + sum4.ba; + afp sum1 = sum2.r + sum2.g; + sum_value = afpvec8(afpvec4(sum1), afpvec4(sum1)); +#if NCNN_image_shader + image2d_st8(sum_workspace_2d, ivec2(gx, gy), sum_value); +#else + buffer_st8(sum_workspace_data, gy * psc(w) + gx, sum_value); +#endif + return; + } + + if (psc(dims) == 3 && axis == 1) + { + for (int i = 0; i < psc(h); i++) + { +#if NCNN_image_shader + afpvec8 v = image3d_ld8(bottom_top_blob_3d, ivec3(gx, i, gy)); +#else + int v_offset = gy * psc(cstep) + i * psc(w) + gx; + afpvec8 v = buffer_ld8(bottom_top_blob_data, v_offset); +#endif + sum_value += v; + } +#if NCNN_image_shader + image2d_st8(sum_workspace_2d, ivec2(gx, gy), sum_value); +#else + buffer_st8(sum_workspace_data, gy * psc(w) + gx, sum_value); +#endif + return; + } + + if (psc(dims) == 3 && axis == 2) + { + for (int i = 0; i < psc(w); i++) + { +#if NCNN_image_shader + afpvec8 v = image3d_ld8(bottom_top_blob_3d, ivec3(i, gx, gy)); +#else + int v_offset = gy * psc(cstep) + gx * psc(w) + i; + afpvec8 v = buffer_ld8(bottom_top_blob_data, v_offset); +#endif + sum_value += v; + } +#if NCNN_image_shader + image2d_st8(sum_workspace_2d, ivec2(gx, gy), sum_value); +#else + buffer_st8(sum_workspace_data, gy * psc(h) + gx, sum_value); +#endif + return; + } +} diff --git a/source/device/vulkan/vulkan_allocator.cpp b/source/device/vulkan/vulkan_allocator.cpp new file mode 100644 index 000000000..c5483ca4f --- /dev/null +++ b/source/device/vulkan/vulkan_allocator.cpp @@ -0,0 +1,1474 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. 
You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#include +#include "vulkan_allocator.hpp" +#include "vulkan_gpu.hpp" +#include "vulkan_pipeline.hpp" + +#include + +namespace TEngine { + +Allocator::~Allocator() +{ + +} + +VkAllocator::VkAllocator(const GPUDevice* _vkdev) : vkdev(_vkdev) +{ + buffer_memory_type_index = (uint32_t)-1; + image_memory_type_index = (uint32_t)-1; + mappable = false; + coherent = false; +} + +static inline size_t round_up(size_t n, size_t multiple) +{ + return (n + multiple - 1) / multiple * multiple; +} + +static inline size_t round_down(size_t n, size_t multiple) +{ + return n / multiple * multiple; +} + +static inline size_t least_common_multiple(size_t a, size_t b) +{ + if (a == b) + return a; + + if (a > b) + return least_common_multiple(b, a); + + size_t lcm = b; + while (lcm % a != 0) + { + lcm += b; + } + + return lcm; +} + +int VkAllocator::flush(VkBufferMemory* ptr) +{ + if (coherent) + return 0; + + VkMappedMemoryRange mappedMemoryRange; + mappedMemoryRange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE; + mappedMemoryRange.pNext = 0; + mappedMemoryRange.memory = ptr->memory; + mappedMemoryRange.offset = round_down(ptr->offset, vkdev->info.non_coherent_atom_size); + mappedMemoryRange.size = round_up(ptr->offset + ptr->capacity, vkdev->info.non_coherent_atom_size) - mappedMemoryRange.offset; + + VkResult ret = vkFlushMappedMemoryRanges(vkdev->vkdevice(), 1, &mappedMemoryRange); + if (ret != VK_SUCCESS) + { + printf("vkFlushMappedMemoryRanges failed %d", ret); + return -1; + } + + return 0; +} + +int VkAllocator::invalidate(VkBufferMemory* ptr) +{ + if (coherent) + return 0; + + VkMappedMemoryRange mappedMemoryRange; + mappedMemoryRange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE; + mappedMemoryRange.pNext = 0; + mappedMemoryRange.memory = ptr->memory; + mappedMemoryRange.offset = round_down(ptr->offset, vkdev->info.non_coherent_atom_size); + mappedMemoryRange.size = round_up(ptr->offset + ptr->capacity, vkdev->info.non_coherent_atom_size) - mappedMemoryRange.offset; + + VkResult ret = vkInvalidateMappedMemoryRanges(vkdev->vkdevice(), 1, &mappedMemoryRange); + if (ret != VK_SUCCESS) + { + printf("vkInvalidateMappedMemoryRanges failed %d", ret); + return -1; + } + return 0; +} + +VkBuffer VkAllocator::create_buffer(size_t size, VkBufferUsageFlags usage) +{ + VkBufferCreateInfo bufferCreateInfo; + bufferCreateInfo.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO; + bufferCreateInfo.pNext = 0; + bufferCreateInfo.flags = 0; + bufferCreateInfo.size = size; + bufferCreateInfo.usage = usage; + bufferCreateInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; + bufferCreateInfo.queueFamilyIndexCount = 0; + bufferCreateInfo.pQueueFamilyIndices = 0; + + VkBuffer buffer = 0; + VkResult ret = vkCreateBuffer(vkdev->vkdevice(), &bufferCreateInfo, 0, &buffer); + if (ret != VK_SUCCESS) + { + printf("vkCreateBuffer failed %d", ret); + return 0; + } + + return buffer; +} + +VkDeviceMemory VkAllocator::allocate_memory(size_t size, uint32_t memory_type_index) +{ + VkMemoryAllocateInfo memoryAllocateInfo; + memoryAllocateInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; + memoryAllocateInfo.pNext = 0; + memoryAllocateInfo.allocationSize = size; + memoryAllocateInfo.memoryTypeIndex = memory_type_index; + + VkDeviceMemory memory = 0; + VkResult ret = vkAllocateMemory(vkdev->vkdevice(), &memoryAllocateInfo, 0, &memory); + if (ret != VK_SUCCESS) + { + 
printf("vkAllocateMemory failed %d", ret); + return 0; + } + return memory; +} + +VkDeviceMemory VkAllocator::allocate_dedicated_memory(size_t size, uint32_t memory_type_index, VkImage image, VkBuffer buffer) +{ + VkMemoryAllocateInfo memoryAllocateInfo; + memoryAllocateInfo.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO; + memoryAllocateInfo.pNext = 0; + memoryAllocateInfo.allocationSize = size; + memoryAllocateInfo.memoryTypeIndex = memory_type_index; + + VkMemoryDedicatedAllocateInfoKHR memoryDedicatedAllocateInfo; + memoryDedicatedAllocateInfo.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO_KHR; + memoryDedicatedAllocateInfo.pNext = 0; + memoryDedicatedAllocateInfo.image = image; + memoryDedicatedAllocateInfo.buffer = buffer; + memoryAllocateInfo.pNext = &memoryDedicatedAllocateInfo; + + VkDeviceMemory memory = 0; + VkResult ret = vkAllocateMemory(vkdev->vkdevice(), &memoryAllocateInfo, 0, &memory); + if (ret != VK_SUCCESS) + { + printf("vkAllocateMemory failed %d", ret); + return 0; + } + + return memory; +} + +VkImage VkAllocator::create_image(VkImageType type, int width, int height, int depth, VkFormat format, VkImageTiling tiling, VkImageUsageFlags usage) +{ + VkImageCreateInfo imageCreateInfo; + imageCreateInfo.sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, + imageCreateInfo.pNext = 0; + imageCreateInfo.flags = 0; + imageCreateInfo.imageType = type; + imageCreateInfo.format = format; + imageCreateInfo.extent.width = width; + imageCreateInfo.extent.height = height; + imageCreateInfo.extent.depth = depth; + imageCreateInfo.mipLevels = 1; + imageCreateInfo.arrayLayers = 1; + imageCreateInfo.samples = VK_SAMPLE_COUNT_1_BIT; + imageCreateInfo.tiling = tiling; + imageCreateInfo.usage = usage; + imageCreateInfo.sharingMode = VK_SHARING_MODE_EXCLUSIVE; + imageCreateInfo.queueFamilyIndexCount = 0; + imageCreateInfo.pQueueFamilyIndices = 0; + imageCreateInfo.initialLayout = VK_IMAGE_LAYOUT_UNDEFINED; + + VkImage image; + VkResult ret = vkCreateImage(vkdev->vkdevice(), &imageCreateInfo, 0, &image); + if (ret != VK_SUCCESS) + { + printf("vkCreateImage failed %d %d %d %d %d %d %d %d", ret, type, width, height, depth, format, tiling, usage); + return 0; + } + + return image; +} + +VkImageView VkAllocator::create_imageview(VkImageViewType type, VkImage image, VkFormat format) +{ + VkImageViewCreateInfo imageViewCreateInfo; + imageViewCreateInfo.sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO; + imageViewCreateInfo.pNext = 0; + imageViewCreateInfo.flags = 0; + imageViewCreateInfo.image = image; + imageViewCreateInfo.viewType = type; + imageViewCreateInfo.format = format; + imageViewCreateInfo.components.r = VK_COMPONENT_SWIZZLE_IDENTITY; + imageViewCreateInfo.components.g = VK_COMPONENT_SWIZZLE_IDENTITY; + imageViewCreateInfo.components.b = VK_COMPONENT_SWIZZLE_IDENTITY; + imageViewCreateInfo.components.a = VK_COMPONENT_SWIZZLE_IDENTITY; + imageViewCreateInfo.subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + imageViewCreateInfo.subresourceRange.baseMipLevel = 0; + imageViewCreateInfo.subresourceRange.levelCount = 1; + imageViewCreateInfo.subresourceRange.baseArrayLayer = 0; + imageViewCreateInfo.subresourceRange.layerCount = 1; + + VkImageView imageview; + VkResult ret = vkCreateImageView(vkdev->vkdevice(), &imageViewCreateInfo, 0, &imageview); + if (ret != VK_SUCCESS) + { + printf("vkCreateImageView failed %d", ret); + return 0; + } + + return imageview; +} + +VkBlobAllocator::VkBlobAllocator(const GPUDevice* _vkdev) : VkAllocator(_vkdev) +{ + buffer_offset_alignment = 
vkdev->info.buffer_offset_alignment; + bind_memory_offset_alignment = vkdev->info.buffer_image_granularity; + + if (vkdev->info.type == 1) + { + // on integrated gpu, there may be device local only memory too, eg. AMD APU + // assuming larger alignment always keeps us safe :) + + // least common multiple for memory_map_alignment and buffer_offset_alignment and non_coherent_atom_size + buffer_offset_alignment = least_common_multiple(buffer_offset_alignment, vkdev->info.memory_map_alignment); + buffer_offset_alignment = least_common_multiple(buffer_offset_alignment, vkdev->info.non_coherent_atom_size); + } + + block_size = alignSize(16 * 1024 * 1024, buffer_offset_alignment);// 16M +} + +VkBlobAllocator::~VkBlobAllocator() +{ + clear(); +} + +// TODO +void VkBlobAllocator::clear() +{ +// TLOG_INFO("VkBlobAllocator %lu", buffer_blocks.size()); + + for (size_t i=0; i<buffer_blocks.size(); i++) + { + VkBufferMemory* ptr = buffer_blocks[i]; + +// std::list< std::pair<size_t, size_t> >::iterator it = buffer_budgets[i].begin(); +// while (it != buffer_budgets[i].end()) +// { +// TLOG_INFO("VkBlobAllocator budget %p %lu %lu", ptr->buffer, it->first, it->second); +// it++; +// } + + if (mappable) + vkUnmapMemory(vkdev->vkdevice(), ptr->memory); + + vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0); + vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0); + + delete ptr; + } + buffer_blocks.clear(); + + buffer_budgets.clear(); + + for (size_t i=0; i<image_memory_blocks.size(); i++) + { + VkDeviceMemory memory = image_memory_blocks[i]; + +// std::list< std::pair<size_t, size_t> >::iterator it = image_memory_budgets[i].begin(); +// while (it != image_memory_budgets[i].end()) +// { +// TLOG_INFO("VkBlobAllocator budget %p %lu %lu", memory, it->first, it->second); +// it++; +// } + + vkFreeMemory(vkdev->vkdevice(), memory, 0); + } + image_memory_blocks.clear(); + + image_memory_budgets.clear(); +} + +VkBufferMemory* VkBlobAllocator::fastMalloc(size_t size) +{ + size_t aligned_size = alignSize(size, buffer_offset_alignment); + + const int buffer_block_count = buffer_blocks.size(); + + // find first spare space in buffer_blocks + for (int i=0; i<buffer_block_count; i++) + { + std::list< std::pair<size_t, size_t> >::iterator it = buffer_budgets[i].begin(); + while (it != buffer_budgets[i].end()) + { + size_t budget_size = it->second; + if (budget_size < aligned_size) + { + it++; + continue; + } + + // return sub buffer + VkBufferMemory* ptr = new VkBufferMemory; + + ptr->buffer = buffer_blocks[i]->buffer; + ptr->offset = it->first; + ptr->memory = buffer_blocks[i]->memory; + ptr->capacity = aligned_size; + ptr->mapped_ptr = buffer_blocks[i]->mapped_ptr; + ptr->access_flags = 0; + ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + + // adjust buffer_budgets + if (budget_size == aligned_size) + { + buffer_budgets[i].erase(it); + } + else + { + it->first += aligned_size; + it->second -= aligned_size; + } + + // printf("VkBlobAllocator M %p +%lu %lu\n", ptr->buffer, ptr->offset, ptr->capacity); + + return ptr; + } + } + + size_t new_block_size = std::max(block_size, aligned_size); + + // create new block + VkBufferMemory* block = new VkBufferMemory; + + block->buffer = create_buffer(new_block_size, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT); + block->offset = 0; + + // TODO respect VK_KHR_dedicated_allocation ?
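+ // Note: when the first-fit scan above finds no spare budget, a fresh block is created below: + // at least block_size (16M) bytes of device-local memory (host-visible preferred on integrated GPUs), + // bound at offset 0 and persistently mapped when the chosen memory type is mappable; the caller + // gets the first aligned_size bytes and the unused tail is recorded as a new budget entry.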
+ + VkMemoryRequirements memoryRequirements; + vkGetBufferMemoryRequirements(vkdev->vkdevice(), block->buffer, &memoryRequirements); + + // setup memory type and alignment + if (buffer_memory_type_index == (uint32_t)-1) + { + if (vkdev->info.type == 1) + { + // integrated gpu, prefer unified memory + buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0); + } + else + { + // discrete gpu, device local + buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); + } + + mappable = vkdev->is_mappable(buffer_memory_type_index); + coherent = vkdev->is_coherent(buffer_memory_type_index); + } + + block->memory = allocate_memory(memoryRequirements.size, buffer_memory_type_index); + + // ignore memoryRequirements.alignment as we always bind at zero offset + vkBindBufferMemory(vkdev->vkdevice(), block->buffer, block->memory, 0); + + block->mapped_ptr = 0; + if (mappable) + { + vkMapMemory(vkdev->vkdevice(), block->memory, 0, new_block_size, 0, &block->mapped_ptr); + } + + buffer_blocks.push_back(block); + + // return sub buffer + VkBufferMemory* ptr = new VkBufferMemory; + + ptr->buffer = block->buffer; + ptr->offset = 0; + ptr->memory = block->memory; + ptr->capacity = aligned_size; + ptr->mapped_ptr = block->mapped_ptr; + ptr->access_flags = 0; + ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + + // adjust buffer_budgets + std::list< std::pair > budget; + if (new_block_size > aligned_size) + { + budget.push_back(std::make_pair(aligned_size, new_block_size - aligned_size)); + } + buffer_budgets.push_back(budget); + + // TLOG_INFO("VkBlobAllocator M %p +%lu %lu", ptr->buffer, ptr->offset, ptr->capacity); + + return ptr; + +} + +VkImageMemory* VkBlobAllocator::fastMalloc(int dims, int w, int h, int c, size_t elemsize, int elempack) +{ + if (elempack != 1 && elempack != 4 && elempack != 8) + { + printf("elempack must be 1 4 8"); + return 0; + } + + // resolve format + VkFormat format = VK_FORMAT_UNDEFINED; + + if (elemsize / elempack == 4) + { + // fp32 + if (elempack == 1) format = VK_FORMAT_R32_SFLOAT; + if (elempack == 4) format = VK_FORMAT_R32G32B32A32_SFLOAT; + if (elempack == 8) format = VK_FORMAT_R32G32B32A32_SFLOAT; + } + if (elemsize / elempack == 2) + { + // fp16 + if (elempack == 1) format = VK_FORMAT_R16_SFLOAT; + if (elempack == 4) format = VK_FORMAT_R16G16B16A16_SFLOAT; + if (elempack == 8) format = VK_FORMAT_R16G16B16A16_SFLOAT; + } + + // resolve image width height depth + int width = w; + int height = h; + int depth = c; + + // large elempack spills on image w + if (elempack == 8) width *= 2; + + VkImageType image_type; + VkImageViewType imageview_type; + if (dims == 1) + { + image_type = VK_IMAGE_TYPE_1D; + imageview_type = VK_IMAGE_VIEW_TYPE_1D; + + if (width > (int)vkdev->info.max_image_dimension_1d) + { + printf("image dimension too large %d > %d", width, (int)vkdev->info.max_image_dimension_1d); + return 0; + } + } + else if (dims == 2) + { + image_type = VK_IMAGE_TYPE_2D; + imageview_type = VK_IMAGE_VIEW_TYPE_2D; + + if (width > (int)vkdev->info.max_image_dimension_2d || height > (int)vkdev->info.max_image_dimension_2d) + { + printf("image dimension too large %d %d > %d", width, height, (int)vkdev->info.max_image_dimension_2d); + return 0; + } + } + else // if (dims == 3) + { + image_type = VK_IMAGE_TYPE_3D; + imageview_type = VK_IMAGE_VIEW_TYPE_3D; + + if 
(width > (int)vkdev->info.max_image_dimension_3d || height > (int)vkdev->info.max_image_dimension_3d || depth > (int)vkdev->info.max_image_dimension_3d) + { + printf("image dimension too large %d %d %d > %d", width, height, depth, (int)vkdev->info.max_image_dimension_3d); + return 0; + } + } + + VkImageMemory* ptr = new VkImageMemory; + + ptr->image = create_image(image_type, width, height, depth, format, VK_IMAGE_TILING_OPTIMAL, VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_STORAGE_BIT | VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT); + + ptr->image_type = image_type; + ptr->imageview_type = imageview_type; + ptr->width = width; + ptr->height = height; + ptr->depth = depth; + ptr->format = format; + + // TODO respect VK_KHR_dedicated_allocation ? + VkMemoryRequirements memoryRequirements; + vkGetImageMemoryRequirements(vkdev->vkdevice(), ptr->image, &memoryRequirements); + + const size_t size = memoryRequirements.size; + const size_t alignment = std::max((size_t)memoryRequirements.alignment, bind_memory_offset_alignment); + + size_t aligned_size = alignSize(size, alignment); + + const int image_memory_block_count = image_memory_blocks.size(); + + // find first spare space in image_memory_blocks + for (int i=0; i >::iterator it = image_memory_budgets[i].begin(); + while (it != image_memory_budgets[i].end()) + { + // we cannot use it->first directly for base offset alignment + size_t bind_base_offset = it->first; + size_t bind_offset = alignSize(bind_base_offset, alignment); + size_t budget_size = it->second; + if (budget_size < aligned_size + (bind_offset - bind_base_offset)) + { + it++; + continue; + } + // bind at memory offset + ptr->memory = image_memory_blocks[i]; + ptr->bind_offset = bind_offset; + ptr->bind_capacity = aligned_size; + + vkBindImageMemory(vkdev->vkdevice(), ptr->image, ptr->memory, ptr->bind_offset); + + // do not allow host access to optimal tiling image + ptr->mapped_ptr = 0; + + ptr->imageview = create_imageview(imageview_type, ptr->image, format); + + ptr->access_flags = 0; + ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED; + ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + ptr->command_refcount = 0; + + if (bind_base_offset != bind_offset) + { + // NOTE there is small offset inside bind_base_offset and bind_offset + // adjust ptr->bind_offset and ptr->bind_capacity after vkBindImageMemory + // so that memory management could be easier + aligned_size += (bind_offset - bind_base_offset); + + ptr->bind_offset = bind_base_offset; + ptr->bind_capacity = aligned_size; + } + + // adjust image_memory_budgets + if (budget_size == aligned_size) + { + image_memory_budgets[i].erase(it); + } + else + { + it->first += aligned_size; + it->second -= aligned_size; + } + +// TLOG_INFO("VkBlobAllocator M %p +%lu %lu", ptr->memory, ptr->bind_offset, ptr->bind_capacity); + + return ptr; + } + } + + // setup memory type and alignment + if (image_memory_type_index == (uint32_t)-1) + { + if (vkdev->info.type == 1) + { + // integrated gpu, prefer unified memory + image_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0); + } + else + { + // discrete gpu, device local + image_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); + } + mappable = vkdev->is_mappable(image_memory_type_index); + coherent = vkdev->is_coherent(image_memory_type_index); + } + + // 
create new block + size_t new_block_size = std::max(block_size, aligned_size); + + // bind at memory offset + ptr->memory = allocate_memory(new_block_size, image_memory_type_index); + ptr->bind_offset = 0; + ptr->bind_capacity = aligned_size; + + // ignore memoryRequirements2.memoryRequirements.alignment as we always bind at zero offset + vkBindImageMemory(vkdev->vkdevice(), ptr->image, ptr->memory, ptr->bind_offset); + + // do not allow host access to optimal tiling image + ptr->mapped_ptr = 0; + + ptr->imageview = create_imageview(imageview_type, ptr->image, format); + + ptr->access_flags = 0; + ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED; + ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + ptr->command_refcount = 0; + + // adjust image_memory_budgets + image_memory_blocks.push_back(ptr->memory); + + std::list< std::pair > budget; + if (new_block_size > aligned_size) + { + budget.push_back(std::make_pair(aligned_size, new_block_size - aligned_size)); + } + image_memory_budgets.push_back(budget); + +// TLOG_INFO("VkBlobAllocator M %p +%lu %lu", ptr->memory, ptr->bind_offset, ptr->bind_capacity); + + return ptr; +} + + +void VkBlobAllocator::fastFree(VkBufferMemory* ptr) +{ +// TLOG_INFO("VkBlobAllocator F %p +%lu %lu", ptr->buffer, ptr->offset, ptr->capacity); + + const int buffer_block_count = buffer_blocks.size(); + + int block_index = -1; + for (int i=0; ibuffer == ptr->buffer && buffer_blocks[i]->memory == ptr->memory) + { + block_index = i; + break; + } + } + + if (block_index == -1) + { + printf("FATAL ERROR! unlocked VkBlobAllocator get wild %p", ptr->buffer); + + delete ptr; + + return; + } + + // merge + std::list< std::pair >::iterator it_merge_left = buffer_budgets[block_index].end(); + std::list< std::pair >::iterator it_merge_right = buffer_budgets[block_index].end(); + std::list< std::pair >::iterator it = buffer_budgets[block_index].begin(); + for ( ; it != buffer_budgets[block_index].end(); it++) + { + if (it->first + it->second == ptr->offset) + { + it_merge_left = it; + } + else if (ptr->offset + ptr->capacity == it->first) + { + it_merge_right = it; + } + } + + if (it_merge_left != buffer_budgets[block_index].end() && it_merge_right != buffer_budgets[block_index].end()) + { + it_merge_left->second = it_merge_right->first + it_merge_right->second - it_merge_left->first; + buffer_budgets[block_index].erase(it_merge_right); + } + else if (it_merge_left != buffer_budgets[block_index].end()) + { + it_merge_left->second = ptr->offset + ptr->capacity - it_merge_left->first; + } + else if (it_merge_right != buffer_budgets[block_index].end()) + { + it_merge_right->second = it_merge_right->first + it_merge_right->second - ptr->offset; + it_merge_right->first = ptr->offset; + } + else + { + if (ptr->offset == 0) + { + // chain leading block + buffer_budgets[block_index].push_front(std::make_pair(ptr->offset, ptr->capacity)); + } + else + { + buffer_budgets[block_index].push_back(std::make_pair(ptr->offset, ptr->capacity)); + } + } + + delete ptr; +} + +void VkBlobAllocator::fastFree(VkImageMemory* ptr) +{ +// TLOG_INFO("VkBlobAllocator F %p +%lu %lu", ptr->memory, ptr->bind_offset, ptr->bind_capacity); + + const int image_memory_block_count = image_memory_blocks.size(); + + int block_index = -1; + for (int i=0; imemory) + { + block_index = i; + break; + } + } + + if (block_index == -1) + { + printf("FATAL ERROR! 
unlocked VkBlobAllocator get wild %p\n", ptr->memory); + + if (!ptr->command_refcount) + { + vkDestroyImageView(vkdev->vkdevice(), ptr->imageview, 0); + vkDestroyImage(vkdev->vkdevice(), ptr->image, 0); + + delete ptr; + } + + return; + } + + // merge + std::list< std::pair >::iterator it_merge_left = image_memory_budgets[block_index].end(); + std::list< std::pair >::iterator it_merge_right = image_memory_budgets[block_index].end(); + std::list< std::pair >::iterator it = image_memory_budgets[block_index].begin(); + for ( ; it != image_memory_budgets[block_index].end(); it++) + { + if (it->first + it->second == ptr->bind_offset) + { + it_merge_left = it; + } + else if (ptr->bind_offset + ptr->bind_capacity == it->first) + { + it_merge_right = it; + } + } + + if (it_merge_left != image_memory_budgets[block_index].end() && it_merge_right != image_memory_budgets[block_index].end()) + { + it_merge_left->second = it_merge_right->first + it_merge_right->second - it_merge_left->first; + image_memory_budgets[block_index].erase(it_merge_right); + } + else if (it_merge_left != image_memory_budgets[block_index].end()) + { + it_merge_left->second = ptr->bind_offset + ptr->bind_capacity - it_merge_left->first; + } + else if (it_merge_right != image_memory_budgets[block_index].end()) + { + it_merge_right->second = it_merge_right->first + it_merge_right->second - ptr->bind_offset; + it_merge_right->first = ptr->bind_offset; + } + else + { + if (ptr->bind_offset == 0) + { + // chain leading block + image_memory_budgets[block_index].push_front(std::make_pair(ptr->bind_offset, ptr->bind_capacity)); + } + else + { + image_memory_budgets[block_index].push_back(std::make_pair(ptr->bind_offset, ptr->bind_capacity)); + } + } + + if (!ptr->command_refcount) + { + vkDestroyImageView(vkdev->vkdevice(), ptr->imageview, 0); + vkDestroyImage(vkdev->vkdevice(), ptr->image, 0); + + delete ptr; + } +} + +VkWeightAllocator::VkWeightAllocator(const GPUDevice* _vkdev) : VkAllocator(_vkdev) +{ + buffer_offset_alignment = vkdev->info.buffer_offset_alignment; + bind_memory_offset_alignment = vkdev->info.buffer_image_granularity; + + if (vkdev->info.type == 1) + { + // on integrated gpu, there may be device local only memory too, eg. 
AMD APU + // assuming larger alignment always keeps us safe :) + + // least common multiple for memory_map_alignment and buffer_offset_alignment and non_coherent_atom_size + buffer_offset_alignment = least_common_multiple(buffer_offset_alignment, vkdev->info.memory_map_alignment); + buffer_offset_alignment = least_common_multiple(buffer_offset_alignment, vkdev->info.non_coherent_atom_size); + } + + block_size = alignSize(8 * 1024 * 1024, buffer_offset_alignment);// 8M +} + +VkWeightAllocator::~VkWeightAllocator() +{ + //clear(); + printf("run VkWeightAllocator descontruction function\n"); +} + + +void VkWeightAllocator::clear() +{ + printf("run VkWeightAllocator clear function\n"); +} + +VkBufferMemory* VkWeightAllocator::fastMalloc(size_t size) +{ + // printf("VkWeightAllocator fastMalloc %lu\n", size); + + size_t aligned_size = alignSize(size, buffer_offset_alignment); + + const int buffer_block_count = buffer_blocks.size(); + + // find first spare space in buffer_blocks + for (int i=0; i= aligned_size) + { + size_t block_offset = block_size - free_size; + // return sub buffer + VkBufferMemory* ptr = new VkBufferMemory; + + ptr->buffer = buffer_blocks[i]->buffer; + ptr->offset = block_offset; + ptr->memory = buffer_blocks[i]->memory; + ptr->capacity = aligned_size; + ptr->mapped_ptr = buffer_blocks[i]->mapped_ptr; + ptr->access_flags = 0; + ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + + buffer_block_free_spaces[i] -= aligned_size; + + return ptr; + } + } + size_t new_block_size = std::max(block_size, aligned_size); + + // create new block + VkBufferMemory* block = new VkBufferMemory; + + block->buffer = create_buffer(new_block_size, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT); + block->offset = 0; + + if (vkdev->info.support_VK_KHR_get_memory_requirements2 && vkdev->info.support_VK_KHR_dedicated_allocation) + { + VkBufferMemoryRequirementsInfo2KHR bufferMemoryRequirementsInfo2; + bufferMemoryRequirementsInfo2.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_REQUIREMENTS_INFO_2_KHR; + bufferMemoryRequirementsInfo2.pNext = 0; + bufferMemoryRequirementsInfo2.buffer = block->buffer; + + VkMemoryRequirements2KHR memoryRequirements2; + memoryRequirements2.sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2_KHR; + memoryRequirements2.pNext = 0; + + VkMemoryDedicatedRequirementsKHR memoryDedicatedRequirements; + memoryDedicatedRequirements.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS_KHR; + memoryDedicatedRequirements.pNext = 0; + memoryRequirements2.pNext = &memoryDedicatedRequirements; + + vkdev->vkGetBufferMemoryRequirements2KHR(vkdev->vkdevice(), &bufferMemoryRequirementsInfo2, &memoryRequirements2); + + bool dedicatedAllocation = memoryDedicatedRequirements.requiresDedicatedAllocation || memoryDedicatedRequirements.prefersDedicatedAllocation; + + if (dedicatedAllocation) + { + // setup memory type and alignment + if (buffer_memory_type_index == (uint32_t)-1) + { + if (vkdev->info.type == 1) + { + // integrated gpu, prefer unified memory + buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0); + } + else + { + // discrete gpu, device local + buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); + } + + mappable = vkdev->is_mappable(buffer_memory_type_index); + coherent = 
vkdev->is_coherent(buffer_memory_type_index); + } + + block->memory = allocate_dedicated_memory(memoryRequirements2.memoryRequirements.size, buffer_memory_type_index, 0, block->buffer); + // ignore memoryRequirements2.memoryRequirements.alignment as we always bind at zero offset + vkBindBufferMemory(vkdev->vkdevice(), block->buffer, block->memory, 0); + + block->mapped_ptr = 0; + if (mappable) + { + vkMapMemory(vkdev->vkdevice(), block->memory, 0, new_block_size, 0, &block->mapped_ptr); + } + + dedicated_buffer_blocks.push_back(block); + + // return sub buffer + VkBufferMemory* ptr = new VkBufferMemory; + + ptr->buffer = block->buffer; + ptr->offset = 0; + ptr->memory = block->memory; + ptr->capacity = new_block_size; + ptr->mapped_ptr = block->mapped_ptr; + ptr->access_flags = 0; + ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + + return ptr; + } + } + + VkMemoryRequirements memoryRequirements; + vkGetBufferMemoryRequirements(vkdev->vkdevice(), block->buffer, &memoryRequirements); + + // setup memory type and alignment + if (buffer_memory_type_index == (uint32_t)-1) + { + if (vkdev->info.type == 1) + { + // integrated gpu, prefer unified memory + buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0); + } + else + { + // discrete gpu, device local + buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); + } + + mappable = vkdev->is_mappable(buffer_memory_type_index); + coherent = vkdev->is_coherent(buffer_memory_type_index); + } + + block->memory = allocate_memory(memoryRequirements.size, buffer_memory_type_index); + + // ignore memoryRequirements.alignment as we always bind at zero offset + vkBindBufferMemory(vkdev->vkdevice(), block->buffer, block->memory, 0); + +// printf("VkWeightAllocator M %p", block->buffer); + block->mapped_ptr = 0; + if (mappable) + { + vkMapMemory(vkdev->vkdevice(), block->memory, 0, new_block_size, 0, &block->mapped_ptr); + } + + buffer_blocks.push_back(block); + + buffer_block_free_spaces.push_back(new_block_size - aligned_size); + + // return sub buffer + VkBufferMemory* ptr = new VkBufferMemory; + + ptr->buffer = block->buffer; + ptr->offset = 0; + ptr->memory = block->memory; + ptr->capacity = aligned_size; + ptr->mapped_ptr = block->mapped_ptr; + ptr->access_flags = 0; + ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + + return ptr; +} + +VkImageMemory* VkWeightAllocator::fastMalloc(int dims, int w, int h, int c, size_t elemsize, int elempack) +{ + if (elempack != 1 && elempack != 4 && elempack != 8 && elempack != 16 && elempack != 32 && elempack != 64) + { + printf("elempack must be 1 4 8 16 32 64\n"); + return 0; + } + + // resolve format + VkFormat format = VK_FORMAT_UNDEFINED; + + if (elemsize / elempack == 4) + { + // fp32 + if (elempack == 1) format = VK_FORMAT_R32_SFLOAT; + if (elempack == 4) format = VK_FORMAT_R32G32B32A32_SFLOAT; + if (elempack == 8) format = VK_FORMAT_R32G32B32A32_SFLOAT; + if (elempack == 16) format = VK_FORMAT_R32G32B32A32_SFLOAT; + if (elempack == 32) format = VK_FORMAT_R32G32B32A32_SFLOAT; + if (elempack == 64) format = VK_FORMAT_R32G32B32A32_SFLOAT; + } + if (elemsize / elempack == 2) + { + // fp16 + if (elempack == 1) format = VK_FORMAT_R16_SFLOAT; + if (elempack == 4) format = VK_FORMAT_R16G16B16A16_SFLOAT; + if (elempack == 8) format = VK_FORMAT_R16G16B16A16_SFLOAT; + if (elempack == 16) 
format = VK_FORMAT_R16G16B16A16_SFLOAT; + if (elempack == 32) format = VK_FORMAT_R16G16B16A16_SFLOAT; + if (elempack == 64) format = VK_FORMAT_R16G16B16A16_SFLOAT; + } + + // resolve image width height depth + int width = w; + int height = h; + int depth = c; + + // large elempack spills on image w + if (elempack == 8) width *= 2; + if (elempack == 16) width *= 4; + if (elempack == 32) width *= 8; + if (elempack == 64) width *= 16; + + VkImageType image_type; + VkImageViewType imageview_type; + if (dims == 1) + { + image_type = VK_IMAGE_TYPE_1D; + imageview_type = VK_IMAGE_VIEW_TYPE_1D; + + if (width > (int)vkdev->info.max_image_dimension_1d) + { + printf("image dimension too large %d > %d\n", width, (int)vkdev->info.max_image_dimension_1d); + return 0; + } + } + else if (dims == 2) + { + image_type = VK_IMAGE_TYPE_2D; + imageview_type = VK_IMAGE_VIEW_TYPE_2D; + + if (width > (int)vkdev->info.max_image_dimension_2d || height > (int)vkdev->info.max_image_dimension_2d) + { + printf("image dimension too large %d %d > %d \n", width, height, (int)vkdev->info.max_image_dimension_2d); + return 0; + } + } + else // if (dims == 3) + { + image_type = VK_IMAGE_TYPE_3D; + imageview_type = VK_IMAGE_VIEW_TYPE_3D; + + if (width > (int)vkdev->info.max_image_dimension_3d || height > (int)vkdev->info.max_image_dimension_3d || depth > (int)vkdev->info.max_image_dimension_3d) + { + printf("image dimension too large %d %d %d > %d \n", width, height, depth, (int)vkdev->info.max_image_dimension_3d); + return 0; + } + } + + VkImageMemory* ptr = new VkImageMemory; + + ptr->image = create_image(image_type, width, height, depth, format, VK_IMAGE_TILING_OPTIMAL, VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT); + + ptr->image_type = image_type; + ptr->imageview_type = imageview_type; + ptr->width = width; + ptr->height = height; + ptr->depth = depth; + ptr->format = format; + + if (vkdev->info.support_VK_KHR_get_memory_requirements2 && vkdev->info.support_VK_KHR_dedicated_allocation) + { + VkImageMemoryRequirementsInfo2KHR imageMemoryRequirementsInfo2; + imageMemoryRequirementsInfo2.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2_KHR; + imageMemoryRequirementsInfo2.pNext = 0; + imageMemoryRequirementsInfo2.image = ptr->image; + + VkMemoryRequirements2KHR memoryRequirements2; + memoryRequirements2.sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2_KHR; + memoryRequirements2.pNext = 0; + + VkMemoryDedicatedRequirementsKHR memoryDedicatedRequirements; + memoryDedicatedRequirements.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS_KHR; + memoryDedicatedRequirements.pNext = 0; + memoryRequirements2.pNext = &memoryDedicatedRequirements; + + vkdev->vkGetImageMemoryRequirements2KHR(vkdev->vkdevice(), &imageMemoryRequirementsInfo2, &memoryRequirements2); + + bool dedicatedAllocation = memoryDedicatedRequirements.requiresDedicatedAllocation || memoryDedicatedRequirements.prefersDedicatedAllocation; + + if (dedicatedAllocation) + { + // setup memory type and alignment + if (image_memory_type_index == (uint32_t)-1) + { + if (vkdev->info.type == 1) + { + // integrated gpu, prefer unified memory + image_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0); + } + else + { + // discrete gpu, device local + image_memory_type_index = vkdev->find_memory_index(memoryRequirements2.memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, 
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); + } + + mappable = vkdev->is_mappable(image_memory_type_index); + coherent = vkdev->is_coherent(image_memory_type_index); + } + + // bind memory + ptr->memory = allocate_dedicated_memory(memoryRequirements2.memoryRequirements.size, image_memory_type_index, ptr->image, 0); + ptr->bind_offset = 0; + ptr->bind_capacity = memoryRequirements2.memoryRequirements.size; + + // ignore memoryRequirements2.memoryRequirements.alignment as we always bind at zero offset + vkBindImageMemory(vkdev->vkdevice(), ptr->image, ptr->memory, ptr->bind_offset); + + // do not allow host access to optimal tiling image + ptr->mapped_ptr = 0; + + ptr->imageview = create_imageview(imageview_type, ptr->image, format); + + ptr->access_flags = 0; + ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED; + ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + ptr->command_refcount = 0; + + dedicated_image_memory_blocks.push_back(ptr->memory); + + return ptr; + } + } + + VkMemoryRequirements memoryRequirements; + vkGetImageMemoryRequirements(vkdev->vkdevice(), ptr->image, &memoryRequirements); + + const size_t size = memoryRequirements.size; + const size_t alignment = std::max((size_t)memoryRequirements.alignment, bind_memory_offset_alignment); + + size_t aligned_size = alignSize(size, alignment); + + const int image_memory_block_count = image_memory_blocks.size(); + + // find first spare space in buffer_blocks + for (int i=0; i= aligned_size + (bind_offset - bind_base_offset)) + { + // bind at memory offset + ptr->memory = image_memory_blocks[i]; + ptr->bind_offset = bind_offset; + ptr->bind_capacity = aligned_size; + + vkBindImageMemory(vkdev->vkdevice(), ptr->image, ptr->memory, ptr->bind_offset); + + // do not allow host access to optimal tiling image + ptr->mapped_ptr = 0; + + ptr->imageview = create_imageview(imageview_type, ptr->image, format); + + ptr->access_flags = 0; + ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED; + ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + ptr->command_refcount = 0; + + if (bind_base_offset != bind_offset) + { + // NOTE there is small offset inside bind_base_offset and bind_offset + // adjust ptr->bind_offset and ptr->bind_capacity after vkBindImageMemory + // so that memory management could be easier + aligned_size += (bind_offset - bind_base_offset); + + ptr->bind_offset = bind_base_offset; + ptr->bind_capacity = aligned_size; + } + + image_memory_block_free_spaces[i] -= aligned_size; + + return ptr; + } + } + + // setup memory type and alignment + if (image_memory_type_index == (uint32_t)-1) + { + if (vkdev->info.type == 1) + { + // integrated gpu, prefer unified memory + image_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT, 0); + } + else + { + // discrete gpu, device local + image_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, 0, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT); + } + + mappable = vkdev->is_mappable(image_memory_type_index); + coherent = vkdev->is_coherent(image_memory_type_index); + } + + // create new block + size_t new_block_size = std::max(block_size, aligned_size); + + // bind at memory offset + ptr->memory = allocate_memory(new_block_size, image_memory_type_index); + ptr->bind_offset = 0; + ptr->bind_capacity = aligned_size; + + // ignore memoryRequirements2.memoryRequirements.alignment as we always bind at zero offset + vkBindImageMemory(vkdev->vkdevice(), 
ptr->image, ptr->memory, ptr->bind_offset); + + // do not allow host access to optimal tiling image + ptr->mapped_ptr = 0; + + ptr->imageview = create_imageview(imageview_type, ptr->image, format); + + ptr->access_flags = 0; + ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED; + ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + ptr->command_refcount = 0; + + image_memory_blocks.push_back(ptr->memory); + image_memory_block_free_spaces.push_back(new_block_size - aligned_size); + + return ptr; +} + + +void VkWeightAllocator::fastFree(VkBufferMemory* ptr) +{ +// TLOG_INFO("VkWeightAllocator F %p", ptr->buffer); + + delete ptr; +} + +void VkWeightAllocator::fastFree(VkImageMemory* ptr) +{ +// TLOG_INFO("VkWeightAllocator F %p", ptr->memory); + + if (!ptr->command_refcount) + { + vkDestroyImageView(vkdev->vkdevice(), ptr->imageview, 0); + vkDestroyImage(vkdev->vkdevice(), ptr->image, 0); + + delete ptr; + } +} + +VkStagingAllocator::VkStagingAllocator(const GPUDevice* _vkdev) : VkAllocator(_vkdev) +{ + mappable = true; + coherent = true; + + size_compare_ratio = 192;// 0.75f * 256 +} + +VkStagingAllocator::~VkStagingAllocator() +{ + clear(); +} + +void VkStagingAllocator::clear() +{ +// TLOG_INFO("VkStagingAllocator %lu", buffer_budgets.size()); + + for (std::list::iterator it = buffer_budgets.begin(); it != buffer_budgets.end(); it++) + { + VkBufferMemory* ptr = *it; + +// TLOG_INFO("VkStagingAllocator F %p", ptr->buffer); + + vkUnmapMemory(vkdev->vkdevice(), ptr->memory); + vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0); + vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0); + + delete ptr; + } + buffer_budgets.clear(); +} + +VkBufferMemory* VkStagingAllocator::fastMalloc(size_t size) +{ + // printf("VkStagingAllocator fastMalloc %lu\n", size); + // find free budget + std::list::iterator it = buffer_budgets.begin(); + for (; it != buffer_budgets.end(); it++) + { + VkBufferMemory* ptr = *it; + + size_t capacity = ptr->capacity; + + // size_compare_ratio ~ 100% + if (capacity >= size && ((capacity * size_compare_ratio) >> 8) <= size) + { + buffer_budgets.erase(it); + +// TLOG_INFO("VkStagingAllocator M %p %lu reused %lu", ptr->buffer, size, capacity); + + return ptr; + } + } + + VkBufferMemory* ptr = new VkBufferMemory; + + ptr->buffer = create_buffer(size, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT); + ptr->offset = 0; + + VkMemoryRequirements memoryRequirements; + vkGetBufferMemoryRequirements(vkdev->vkdevice(), ptr->buffer, &memoryRequirements); + + // setup memory type + if (buffer_memory_type_index == (uint32_t)-1) + { + buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_MEMORY_PROPERTY_HOST_CACHED_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT); + } + + ptr->memory = allocate_memory(memoryRequirements.size, buffer_memory_type_index); + + // ignore memoryRequirements.alignment as we always bind at zero offset + vkBindBufferMemory(vkdev->vkdevice(), ptr->buffer, ptr->memory, 0); + + ptr->capacity = size; + + vkMapMemory(vkdev->vkdevice(), ptr->memory, 0, size, 0, &ptr->mapped_ptr); + + ptr->access_flags = 0; + ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + +// TLOG_INFO("VkStagingAllocator M %p %lu", ptr->buffer, size); + + return ptr; +} + +VkImageMemory* VkStagingAllocator::fastMalloc(int dims, int w, int h, int c, size_t elemsize, int elempack) +{ + // staging image is mainly used for storing small piece of 
dynamic parameters + // we allocate host memory as a fake image, it's simple and good + + const size_t size = w * h * c * elemsize; + + VkImageType image_type; + VkImageViewType imageview_type; + if (dims == 1) + { + image_type = VK_IMAGE_TYPE_1D; + imageview_type = VK_IMAGE_VIEW_TYPE_1D; + } + else if (dims == 2) + { + image_type = VK_IMAGE_TYPE_2D; + imageview_type = VK_IMAGE_VIEW_TYPE_2D; + } + else // if (dims == 3) + { + image_type = VK_IMAGE_TYPE_3D; + imageview_type = VK_IMAGE_VIEW_TYPE_3D; + } + + VkImageMemory* ptr = new VkImageMemory; + + ptr->image = 0; + ptr->image_type = image_type; + ptr->imageview_type = imageview_type; + ptr->width = w; + ptr->height = h; + ptr->depth = c; + ptr->format = VK_FORMAT_UNDEFINED; + ptr->memory = 0; + ptr->bind_offset = 0; + ptr->bind_capacity = size; + + ptr->mapped_ptr = malloc(size); + + ptr->imageview = 0; + + ptr->access_flags = 0; + ptr->image_layout = VK_IMAGE_LAYOUT_UNDEFINED; + ptr->stage_flags = VK_PIPELINE_STAGE_HOST_BIT; + ptr->command_refcount = 0; + +// TLOG_INFO("VkStagingAllocator M %p %d %d %d %d %d", ptr->image, dims, width, height, depth, format); + + return ptr; +} + +void VkStagingAllocator::fastFree(VkBufferMemory* ptr) +{ +// TLOG_INFO("VkStagingAllocator F %p", ptr->buffer); + + // return to buffer_budgets + buffer_budgets.push_back(ptr); +} + +void VkStagingAllocator::fastFree(VkImageMemory* ptr) +{ +// TLOG_INFO("VkStagingAllocator F %p", ptr->image); + + free(ptr->mapped_ptr); + + delete ptr; +} + +VkWeightStagingAllocator::VkWeightStagingAllocator(const GPUDevice* _vkdev) : VkAllocator(_vkdev) +{ + mappable = true; + coherent = true; +} + +VkWeightStagingAllocator::~VkWeightStagingAllocator() +{ +} + +VkBufferMemory* VkWeightStagingAllocator::fastMalloc(size_t size) +{ + printf("VkWeightStagingAllocator fastMalloc %lu\n", size); + VkBufferMemory* ptr = new VkBufferMemory; + + ptr->buffer = create_buffer(size, VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT); + ptr->offset = 0; + + VkMemoryRequirements memoryRequirements; + vkGetBufferMemoryRequirements(vkdev->vkdevice(), ptr->buffer, &memoryRequirements); + + // setup memory type + if (buffer_memory_type_index == (uint32_t)-1) + { + buffer_memory_type_index = vkdev->find_memory_index(memoryRequirements.memoryTypeBits, VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT, VK_MEMORY_PROPERTY_HOST_CACHED_BIT, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT); + } + + ptr->memory = allocate_memory(memoryRequirements.size, buffer_memory_type_index); + + // ignore memoryRequirements.alignment as we always bind at zero offset + vkBindBufferMemory(vkdev->vkdevice(), ptr->buffer, ptr->memory, 0); + + ptr->capacity = size; + + vkMapMemory(vkdev->vkdevice(), ptr->memory, 0, size, 0, &ptr->mapped_ptr); + + ptr->access_flags = 0; + ptr->stage_flags = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + +// printf("VkWeightStagingAllocator M %p %lu", ptr->buffer, size); + + return ptr; +} + +void VkWeightStagingAllocator::fastFree(VkBufferMemory* ptr) +{ +// TLOG_INFO("VkWeightStagingAllocator F %p", ptr->buffer); + + vkUnmapMemory(vkdev->vkdevice(), ptr->memory); + vkDestroyBuffer(vkdev->vkdevice(), ptr->buffer, 0); + vkFreeMemory(vkdev->vkdevice(), ptr->memory, 0); + + delete ptr; +} + +} // namespace TEngine diff --git a/source/device/vulkan/vulkan_allocator.hpp b/source/device/vulkan/vulkan_allocator.hpp new file mode 100644 index 000000000..4a8f7e1c3 --- /dev/null +++ b/source/device/vulkan/vulkan_allocator.hpp @@ -0,0 +1,284 @@ +#ifndef VULKAN_ALLOCATOR_HPP 
+#define VULKAN_ALLOCATOR_HPP + +#include +#include +#include +#include +#include +#include +#include "vulkan_platform.hpp" + +namespace TEngine { + +#define MALLOC_ALIGN 16 + +template static inline _Tp* alignPtr(_Tp* ptr, int n=(int)sizeof(_Tp)) +{ + return (_Tp*)(((size_t)ptr + n-1) & -n); +} + +static inline size_t alignSize(size_t sz, int n) +{ + return (sz + n-1) & -n; +} + +static inline void* fastMalloc(size_t size) +{ + unsigned char* udata = (unsigned char*)malloc(size + sizeof(void*) + MALLOC_ALIGN); + if (!udata) + return 0; + unsigned char** adata = alignPtr((unsigned char**)udata + 1, MALLOC_ALIGN); + adata[-1] = udata; + return adata; +} + +static inline void fastFree(void* ptr) +{ + if (ptr) + { + unsigned char* udata = ((unsigned char**)ptr)[-1]; + free(udata); + } +} + +static inline int TENGINE_XADD(int* addr, int delta) { int tmp = *addr; *addr += delta; return tmp; } + + +class Allocator +{ +public: + virtual ~Allocator(); + virtual void* fastMalloc(size_t size) = 0; + virtual void fastFree(void* ptr) = 0; +}; + +// class PoolAllocator : public Allocator +// { +// public: +// PoolAllocator(); +// ~PoolAllocator(); + +// // ratio range 0 ~ 1 +// // default cr = 0.75 +// void set_size_compare_ratio(float scr); + +// // release all budgets immediately +// void clear(); + +// virtual void* fastMalloc(size_t size); +// virtual void fastFree(void* ptr); + +// private: +// Mutex budgets_lock; +// Mutex payouts_lock; +// unsigned int size_compare_ratio;// 0~256 +// std::list< std::pair > budgets; +// std::list< std::pair > payouts; +// }; + +// class UnlockedPoolAllocator : public Allocator +// { +// public: +// UnlockedPoolAllocator(); +// ~UnlockedPoolAllocator(); + +// // ratio range 0 ~ 1 +// // default cr = 0.75 +// void set_size_compare_ratio(float scr); + +// // release all budgets immediately +// void clear(); + +// virtual void* fastMalloc(size_t size); +// virtual void fastFree(void* ptr); + +// private: +// unsigned int size_compare_ratio;// 0~256 +// std::list< std::pair > budgets; +// std::list< std::pair > payouts; +// }; + +class GPUDevice; + +class VkBufferMemory +{ +public: + VkBuffer buffer; + + // the base offset assigned by allocator + size_t offset; + size_t capacity; + + VkDeviceMemory memory; + void* mapped_ptr; + + // buffer state, modified by command functions internally + mutable VkAccessFlags access_flags; + mutable VkPipelineStageFlags stage_flags; + + // initialize and modified by mat + int refcount; +}; + +class VkImageMemory +{ +public: + VkImage image; + VkImageView imageview; + + // underlying info assigned by allocator + VkImageType image_type; + VkImageViewType imageview_type; + int width; + int height; + int depth; + VkFormat format; + + VkDeviceMemory memory; + void* mapped_ptr; + + // the base offset assigned by allocator + size_t bind_offset; + size_t bind_capacity; + + // image state, modified by command functions internally + mutable VkAccessFlags access_flags; + mutable VkImageLayout image_layout; + mutable VkPipelineStageFlags stage_flags; + + // in-execution state, modified by command functions internally + mutable int command_refcount; + + // initialize and modified by mat + int refcount; +}; + +class VkAllocator +{ +public: + VkAllocator(const GPUDevice* _vkdev); + virtual ~VkAllocator() { clear(); } + virtual void clear() {} + + virtual VkBufferMemory* fastMalloc(size_t size) = 0; + virtual void fastFree(VkBufferMemory* ptr) = 0; + virtual int flush(VkBufferMemory* ptr); + virtual int invalidate(VkBufferMemory* ptr); + + virtual 
VkImageMemory* fastMalloc(int dims, int w, int h, int c, size_t elemsize, int elempack) = 0; + virtual void fastFree(VkImageMemory* ptr) = 0; + +public: + const GPUDevice* vkdev; + uint32_t buffer_memory_type_index; + uint32_t image_memory_type_index; + bool mappable; + bool coherent; + +protected: + VkBuffer create_buffer(size_t size, VkBufferUsageFlags usage); + VkDeviceMemory allocate_memory(size_t size, uint32_t memory_type_index); + VkDeviceMemory allocate_dedicated_memory(size_t size, uint32_t memory_type_index, VkImage image, VkBuffer buffer); + + VkImage create_image(VkImageType type, int width, int height, int depth, VkFormat format, VkImageTiling tiling, VkImageUsageFlags usage); + VkImageView create_imageview(VkImageViewType type, VkImage image, VkFormat format); +}; + +class VkBlobAllocator : public VkAllocator +{ +public: + VkBlobAllocator(const GPUDevice* vkdev); + virtual ~VkBlobAllocator(); + +public: + // release all budgets immediately + virtual void clear(); + + virtual VkBufferMemory* fastMalloc(size_t size); + virtual void fastFree(VkBufferMemory* ptr); + + virtual VkImageMemory* fastMalloc(int dims, int w, int h, int c, size_t elemsize, int elempack);//{ return 0; } + virtual void fastFree(VkImageMemory* ptr); + +protected: + size_t block_size; + size_t buffer_offset_alignment; + size_t bind_memory_offset_alignment; + std::vector< std::list< std::pair > > buffer_budgets; + std::vector buffer_blocks; + std::vector< std::list< std::pair > > image_memory_budgets; + std::vector image_memory_blocks; +}; + +class VkWeightAllocator : public VkAllocator +{ +public: + VkWeightAllocator(const GPUDevice* vkdev); + virtual ~VkWeightAllocator(); + +public: + // release all blocks immediately + virtual void clear(); + +public: + virtual VkBufferMemory* fastMalloc(size_t size); + virtual void fastFree(VkBufferMemory* ptr); + virtual VkImageMemory* fastMalloc(int dims, int w, int h, int c, size_t elemsize, int elempack);//{ return 0; } + virtual void fastFree(VkImageMemory* ptr); + +protected: + size_t block_size; + size_t buffer_offset_alignment; + size_t bind_memory_offset_alignment; + std::vector buffer_block_free_spaces; + std::vector buffer_blocks; + std::vector dedicated_buffer_blocks; + std::vector image_memory_block_free_spaces; + std::vector image_memory_blocks; + std::vector dedicated_image_memory_blocks; +}; + + +class VkStagingAllocator : public VkAllocator +{ +public: + VkStagingAllocator(const GPUDevice* vkdev); + virtual ~VkStagingAllocator(); + +public: + // ratio range 0 ~ 1 + // default cr = 0.75 + void set_size_compare_ratio(float scr); + + // release all budgets immediately + virtual void clear(); + + virtual VkBufferMemory* fastMalloc(size_t size); + virtual void fastFree(VkBufferMemory* ptr); + virtual VkImageMemory* fastMalloc(int dims, int w, int h, int c, size_t elemsize, int elempack);//{ return 0; } + virtual void fastFree(VkImageMemory* ptr); + +protected: + unsigned int size_compare_ratio;// 0~256 + std::list buffer_budgets; +}; + + +class VkWeightStagingAllocator : public VkAllocator +{ +public: + VkWeightStagingAllocator(const GPUDevice* vkdev); + virtual ~VkWeightStagingAllocator(); + +public: + virtual VkBufferMemory* fastMalloc(size_t size); + virtual void fastFree(VkBufferMemory* ptr); + virtual VkImageMemory* fastMalloc(int /*dims*/, int /*w*/, int /*h*/, int /*c*/, size_t /*elemsize*/, int /*elempack*/) { return 0; } + virtual void fastFree(VkImageMemory* /*ptr*/) {} + +protected: +}; + +} +#endif diff --git 
a/source/device/vulkan/vulkan_command.cpp b/source/device/vulkan/vulkan_command.cpp new file mode 100644 index 000000000..b5545fe6b --- /dev/null +++ b/source/device/vulkan/vulkan_command.cpp @@ -0,0 +1,1782 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2020, OPEN AI LAB + * Author: ddzhao@openailab.com + */ + +#include "vulkan_command.hpp" + +#include +#include "vulkan_option.hpp" +#include "vulkan_pipeline.hpp" +#include "vulkan_tensor.hpp" + +namespace TEngine { + +VkCompute::VkCompute(const GPUDevice* _vkdev) : vkdev(_vkdev) +{ + compute_command_pool = 0; + compute_command_buffer = 0; + compute_command_fence = 0; + + init(); +} + + +VkCompute::~VkCompute() +{ + for (size_t i=0; icommand_refcount, -1); + if (ptr->refcount == 0 && old_command_refcount == 1) + { + // no userspace reference and we are the last command reference + vkDestroyImageView(vkdev->vkdevice(), ptr->imageview, 0); + vkDestroyImage(vkdev->vkdevice(), ptr->image, 0); + + delete ptr; + } + else + { + // reference exists in user code or other command + } + } + image_blocks_to_destroy.clear(); + + if (!vkdev->info.support_VK_KHR_push_descriptor) + { + for (size_t i=0; ivkdevice(), descriptor_pools[i], 1, &descriptorsets[i]); + vkDestroyDescriptorPool(vkdev->vkdevice(), descriptor_pools[i], 0); + } + } + + vkDestroyFence(vkdev->vkdevice(), compute_command_fence, 0); + + vkFreeCommandBuffers(vkdev->vkdevice(), compute_command_pool, 1, &compute_command_buffer); + vkDestroyCommandPool(vkdev->vkdevice(), compute_command_pool, 0); +} + +void VkCompute::record_upload(tensor* src, VkTensor& dst, const Option& opt) +{ + Tensor src_tensor = Tensor(src); + record_upload(src_tensor, dst, opt); +// // const ir_tensor* src_fp16; +// // if (src.elemsize == src.elempack * 4u) +// if(src->elem_size == opt.elempack * 4u) +// { +// // cpu cast to fp16 (discrete gpu) +// if (vkdev->info.type == 0 && (opt.use_fp16_storage || (opt.use_fp16_packed && opt.elempack % 4 == 0))) +// { +// // ncnn::cast_float32_to_float16(src, src_fp16, opt); +// printf("need to add cast_float32_to_float16 here, fix me!\n"); +// } +// else +// { +// // src_fp16 = src; +// } +// } +// else +// { +// // src_fp16 = src; +// } + +// // upload +// VkTensor dst_staging; +// if (opt.blob_vkallocator->mappable) +// { +// // dst_staging.create_like(src_fp16, opt.blob_vkallocator); +// dst_staging.create_like(src, opt.blob_vkallocator); +// } +// else +// { +// // dst_staging.create_like(src_fp16, opt.staging_vkallocator); +// dst_staging.create_like(src, opt.staging_vkallocator); +// } +// if (dst_staging.empty()) +// return; + +// // stash staging +// upload_staging_buffers.push_back(dst_staging); + +// // TLOG_INFO("upload_staging_buffer %p -> %p +%d ~%d", src_fp16.data, 
dst_staging.buffer(), dst_staging.buffer_offset(), dst_staging.buffer_capacity()); + +// // memcpy src to device +// // memcpy(dst_staging.mapped_ptr(), src_fp16->data, src_fp16->elem_size * src_fp16->elem_num); +// memcpy(dst_staging.mapped_ptr(), src->data, src->elem_size * src->elem_num); +// dst_staging.allocator->flush(dst_staging.data); + +// // mark device host-write @ null +// dst_staging.data->access_flags = VK_ACCESS_HOST_WRITE_BIT; +// dst_staging.data->stage_flags = VK_PIPELINE_STAGE_HOST_BIT; + +// // TODO +// // not use pack for now------------------------ +// // // resolve dst_elempack +// int dims = src->dim_num; +// int elemcount = 0; +// // src dims[0-3] n c h w +// // if (dims == 1) elemcount = opt.elempack * src_fp16.w; +// // if (dims == 2) elemcount = opt.elempack * src_fp16.h; +// // if (dims == 3) elemcount = opt.elempack * src_fp16.c; +// if(dims == 4) +// elemcount = opt.elempack * src->dims[1]; +// else +// elemcount = opt.elempack * src->dims[0]; + +// int dst_elempack = 1; +// if (opt.use_shader_pack8) +// dst_elempack = elemcount % 8 == 0 ? 8 : elemcount % 4 == 0 ? 4 : 1; +// else +// dst_elempack = elemcount % 4 == 0 ? 4 : 1; + +// vkdev->convert_packing(dst_staging, dst, dst_elempack, *this, opt); +} + +void VkCompute::record_upload(const Tensor& src, VkTensor& dst, const Option& opt) +{ + // TLOG_INFO("record_upload buffer"); + + Tensor src_fp16; + if (src.elemsize == src.elempack * 4u) + { + // cpu cast to fp16 (discrete gpu) + if (vkdev->info.type == 0 && (opt.use_fp16_storage || (opt.use_fp16_packed && src.elempack % 4 == 0))) + { + // printf("do nothing for VkCompute record_upload cast_float32_to_float16, fix me\n"); + TEngine::cast_float32_to_float16(src, src_fp16, opt); + } + else + { + src_fp16 = src; + } + } + else + { + src_fp16 = src; + } + + // upload + VkTensor dst_staging; + if (opt.blob_vkallocator->mappable) + { + dst_staging.create_like(src_fp16, opt.blob_vkallocator); + } + else + { + dst_staging.create_like(src_fp16, opt.staging_vkallocator); + } + if (dst_staging.empty()) + return; + + // stash staging + upload_staging_buffers.push_back(dst_staging); + +// TLOG_INFO("upload_staging_buffer %p -> %p +%d ~%d", src_fp16.data, dst_staging.buffer(), dst_staging.buffer_offset(), dst_staging.buffer_capacity()); + + // memcpy src to device + memcpy(dst_staging.mapped_ptr(), src_fp16.data, src_fp16.total() * src_fp16.elemsize); + dst_staging.allocator->flush(dst_staging.data); + + // mark device host-write @ null + dst_staging.data->access_flags = VK_ACCESS_HOST_WRITE_BIT; + dst_staging.data->stage_flags = VK_PIPELINE_STAGE_HOST_BIT; + + // resolve dst_elempack + int dims = src_fp16.dims; + int elemcount = 0; + if (dims == 1) elemcount = src_fp16.elempack * src_fp16.w; + if (dims == 2) elemcount = src_fp16.elempack * src_fp16.h; + if (dims == 3) elemcount = src_fp16.elempack * src_fp16.c; + + int dst_elempack = 1; + if (opt.use_shader_pack8) + dst_elempack = elemcount % 8 == 0 ? 8 : elemcount % 4 == 0 ? 4 : 1; + else + dst_elempack = elemcount % 4 == 0 ? 
4 : 1; + + // gpu cast to fp16 on the fly (integrated gpu) + vkdev->convert_packing(dst_staging, dst, dst_elempack, *this, opt); +} + +void VkCompute::record_download(const VkTensor& src, tensor* dst, const Option& opt) +{ + Tensor dst_tensor; + record_download(src, dst_tensor, opt); + dst->data = dst_tensor.data; + + // Tensor feat; + // if (opt.use_packing_layout) + // { + // Tensor bottom_blob_unpacked; + // convert_packing(dst_tensor, bottom_blob_unpacked, 1, opt); + // feat = bottom_blob_unpacked; + // } + + // if (opt.use_bf16_storage) + // { + // if (feat.elemsize / feat.elempack == 2u) + // { + // Tensor feat_fp32; + // cast_bfloat16_to_float32(feat, feat_fp32, opt); + // feat = feat_fp32; + // } + // } + + // dst->data = feat.data; +} + +void VkCompute::record_download(const VkTensor& src, Tensor& dst, const Option& opt) +{ + int dims = src.dims; + int elemcount = 0; + if (dims == 1) elemcount = src.elempack * src.w; + if (dims == 2) elemcount = src.elempack * src.h; + if (dims == 3) elemcount = src.elempack * src.c; + + int dst_elempack = 1; + if (opt.use_packing_layout) + dst_elempack = elemcount % 4 == 0 ? 4 : 1; + else + dst_elempack = 1; + + // gpu cast to fp32 on the fly (integrated gpu) + Option opt_staging = opt; + if (vkdev->info.type != 0) + { + opt_staging.use_fp16_packed = false; + opt_staging.use_fp16_storage = false; + } + + VkTensor dst_staging; + if (opt_staging.blob_vkallocator->mappable) + { + vkdev->convert_packing(src, dst_staging, dst_elempack, *this, opt); + } + else + { + opt_staging.blob_vkallocator = opt.staging_vkallocator; + vkdev->convert_packing(src, dst_staging, dst_elempack, *this, opt_staging); + } + + // barrier device any @ compute to host-read @ compute + if (dst_staging.data->access_flags & VK_ACCESS_HOST_WRITE_BIT || dst_staging.data->stage_flags != VK_PIPELINE_STAGE_HOST_BIT) + { + VkBufferMemoryBarrier* barriers = new VkBufferMemoryBarrier[1]; + barriers[0].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + barriers[0].pNext = 0; + barriers[0].srcAccessMask = dst_staging.data->access_flags; + barriers[0].dstAccessMask = VK_ACCESS_HOST_READ_BIT; + barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].buffer = dst_staging.buffer(); + barriers[0].offset = dst_staging.buffer_offset(); + barriers[0].size = dst_staging.buffer_capacity(); + + VkPipelineStageFlags src_stage = dst_staging.data->stage_flags; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_HOST_BIT; + + if (vkdev->info.support_VK_KHR_push_descriptor) + { + vkCmdPipelineBarrier(compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, barriers, 0, 0); + delete[] barriers; + } + else + { + record r; + r.type = record::TYPE_buffer_barrers; + r.command_buffer = compute_command_buffer; + r.buffer_barrers.src_stage = src_stage; + r.buffer_barrers.dst_stage = dst_stage; + r.buffer_barrers.barrier_count = 1; + r.buffer_barrers.barriers = barriers; + delayed_records.push_back(r); + } + + // mark device host-read @ any + dst_staging.data->access_flags = VK_ACCESS_HOST_READ_BIT; + dst_staging.data->stage_flags = VK_PIPELINE_STAGE_HOST_BIT; + } + + // create dst + Tensor dst_fp16; + dst_fp16.create_like(dst_staging, opt.blob_allocator); + if (dst_fp16.empty()) + return; + + // download + download_post_buffers.push_back(dst_staging); + download_post_tensors_fp16.push_back(dst_fp16); + + // post memcpy device to dst + { + record r; + r.type = record::TYPE_post_download; + r.command_buffer = 0; + 
r.post_download.download_post_buffer_mat_offset = download_post_buffers.size() - 1; + r.post_download.download_post_mat_fp16_offset = download_post_tensors_fp16.size() - 1; + delayed_records.push_back(r); + } + + // cast to fp32 (discrete gpu) + if (dst_fp16.elemsize == dst_fp16.elempack * 2u) + { + if (vkdev->info.type == 0 && (opt.use_fp16_storage || (opt.use_fp16_packed && dst_fp16.elempack % 4 == 0))) + { + int dims = dst_fp16.dims; + if (dims == 1) + dst.create(dst_fp16.w, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator); + if (dims == 2) + dst.create(dst_fp16.w, dst_fp16.h, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator); + if (dims == 3) + dst.create(dst_fp16.w, dst_fp16.h, dst_fp16.c, (size_t)(dst_fp16.elempack * 4u), dst_fp16.elempack, opt.blob_allocator); + + download_post_tensors_fp16.push_back(dst_fp16); + download_post_tensors.push_back(dst); + + record r; + r.type = record::TYPE_post_cast_float16_to_float32; + r.command_buffer = 0; + r.post_cast_float16_to_float32.download_post_mat_fp16_offset = download_post_tensors_fp16.size() - 1; + r.post_cast_float16_to_float32.download_post_mat_offset = download_post_tensors.size() - 1; + delayed_records.push_back(r); + } + else + { + dst = dst_fp16; + } + } + else + { + dst = dst_fp16; + } +} + +int VkCompute::submit_and_wait() +{ + // printf("VkCompute submit_and_wait\n"); + if (!vkdev->info.support_VK_KHR_push_descriptor) + { + // printf("start to run begin command buffer\n"); + begin_command_buffer(); + const size_t record_count = delayed_records.size(); + // printf("delayed_records count:%d\n", record_count); + + // handle delayed records + for (size_t i=0; iacquire_queue(vkdev->info.compute_queue_family_index); + if (compute_queue == 0) + { + printf("out of compute queue\n"); + return -1; + } + + // submit compute + { + VkSubmitInfo submitInfo; + submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + submitInfo.pNext = 0; + submitInfo.waitSemaphoreCount = 0; + submitInfo.pWaitSemaphores = 0; + submitInfo.pWaitDstStageMask = 0; + submitInfo.commandBufferCount = 1; + submitInfo.pCommandBuffers = &compute_command_buffer; + submitInfo.signalSemaphoreCount = 0; + submitInfo.pSignalSemaphores = 0; + + VkResult ret = vkQueueSubmit(compute_queue, 1, &submitInfo, compute_command_fence); + if (ret != VK_SUCCESS) + { + printf("vkQueueSubmit failed %d", ret); + vkdev->reclaim_queue(vkdev->info.compute_queue_family_index, compute_queue); + return -1; + } + } + + vkdev->reclaim_queue(vkdev->info.compute_queue_family_index, compute_queue); + + // wait + { + VkResult ret = vkWaitForFences(vkdev->vkdevice(), 1, &compute_command_fence, VK_TRUE, UINT64_MAX); + if (ret != VK_SUCCESS) + { + printf("vkWaitForFences failed %d", ret); + return -1; + } + } + + // handle delayed post records + for (size_t i=0; i %p", src.buffer(), src.buffer_offset(), src.buffer_capacity(), dst.data); + + src.allocator->invalidate(src.data); + // memcpy(dst.data, src.mapped_ptr(), dst.elem_size * dst.elem_num); + memcpy(dst.data, src.mapped_ptr(), dst.total() * dst.elemsize); + break; + } + case record::TYPE_post_cast_float16_to_float32: + { + // TODO + printf("submit delayed_records TYPE_post_cast_float16_to_float32, Do nothing, fix me\n"); + break; + } + default: + break; + } + } + + delayed_records.clear(); + + return 0; +} + + +int VkCompute::init() +{ + // compute_command_pool + { + VkCommandPoolCreateInfo commandPoolCreateInfo; + commandPoolCreateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; + 
commandPoolCreateInfo.pNext = 0; + commandPoolCreateInfo.flags = VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT; + commandPoolCreateInfo.queueFamilyIndex = vkdev->info.compute_queue_family_index; + VkResult ret = vkCreateCommandPool(vkdev->vkdevice(), &commandPoolCreateInfo, 0, &compute_command_pool); + if (ret != VK_SUCCESS) + { + printf("vkCreateCommandPool failed %d", ret); + return -1; + } + } + // compute_command_buffer + { + VkCommandBufferAllocateInfo commandBufferAllocateInfo; + commandBufferAllocateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; + commandBufferAllocateInfo.pNext = 0; + commandBufferAllocateInfo.commandPool = compute_command_pool; + commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; + commandBufferAllocateInfo.commandBufferCount = 1; + + VkResult ret = vkAllocateCommandBuffers(vkdev->vkdevice(), &commandBufferAllocateInfo, &compute_command_buffer); + if (ret != VK_SUCCESS) + { + printf("vkAllocateCommandBuffers failed %d", ret); + return -1; + } + } + + // compute_command_fence + { + VkFenceCreateInfo fenceCreateInfo; + fenceCreateInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO; + fenceCreateInfo.pNext = 0; + fenceCreateInfo.flags = 0; + + VkResult ret = vkCreateFence(vkdev->vkdevice(), &fenceCreateInfo, 0, &compute_command_fence); + if (ret != VK_SUCCESS) + { + printf("vkCreateFence failed %d", ret); + return -1; + } + } + + if (vkdev->info.support_VK_KHR_push_descriptor) + { + begin_command_buffer(); + } + + return 0; +} + +int VkCompute::begin_command_buffer() +{ + VkCommandBufferBeginInfo commandBufferBeginInfo; + commandBufferBeginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; + commandBufferBeginInfo.pNext = 0; + commandBufferBeginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; + commandBufferBeginInfo.pInheritanceInfo = 0; + + VkResult ret = vkBeginCommandBuffer(compute_command_buffer, &commandBufferBeginInfo); + if (ret != VK_SUCCESS) + { + printf("vkBeginCommandBuffer failed %d", ret); + return -1; + } + return 0; +} + +int VkCompute::end_command_buffer() +{ + VkResult ret = vkEndCommandBuffer(compute_command_buffer); + if (ret != VK_SUCCESS) + { + printf("vkEndCommandBuffer failed %d", ret); + return -1; + } + + return 0; +} + +void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vector& bindings, const std::vector& constants, const VkTensor& dispatcher) +{ + record_pipeline(pipeline, bindings, std::vector(), constants, dispatcher); +} + +void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vector& bindings, const std::vector& constants, const VkImageTensor& dispatcher) +{ + record_pipeline(pipeline, std::vector(), bindings, constants, dispatcher); +} + +void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vector& buffer_bindings, const std::vector& image_bindings, const std::vector& constants, const VkTensor& dispatcher) +{ + // Mat dispatcher_mat(dispatcher.w, dispatcher.h, dispatcher.c, (void*)0); + + record_pipeline(pipeline, buffer_bindings, image_bindings, constants, dispatcher.w, dispatcher.h, dispatcher.c); +} + +void VkCompute::record_pipeline(const Pipeline* pipeline, const std::vector& buffer_bindings, const std::vector& image_bindings, const std::vector& constants, const VkImageTensor& dispatcher) +{ + // VkTensor dispatcher_VkTensor(dispatcher.w, dispatcher.h, dispatcher.c, (void*)0); + + record_pipeline(pipeline, buffer_bindings, image_bindings, constants, dispatcher.w, dispatcher.h, dispatcher.c); +} + +void VkCompute::record_pipeline(const 
Pipeline* pipeline, const std::vector& buffer_bindings, const std::vector& image_bindings, const std::vector& constants, int dispatcher_w, int dispatcher_h, int dispatcher_c) +{ + const int buffer_binding_count = (int)buffer_bindings.size(); + const int image_binding_count = (int)image_bindings.size(); + const int constant_count = (int)constants.size(); + + const int binding_count = buffer_binding_count + image_binding_count; + + if (binding_count != pipeline->shader_info.binding_count) + { + printf("binding_count not match, expect %d but got %d + %d", pipeline->shader_info.binding_count, buffer_binding_count, image_binding_count); + } + + if (constant_count != pipeline->shader_info.push_constant_count) + { + printf("push_constant_count not match, expect %d but got %d", pipeline->shader_info.push_constant_count, constant_count); + } + + int buffer_index = 0; + int image_index = 0; + for (int i=0; ishader_info.binding_types[i]; + + if (binding_type == 1) + { + const VkTensor& binding = buffer_bindings[buffer_index].empty() ? vkdev->get_dummy_buffer() : buffer_bindings[buffer_index]; + buffer_index++; + +// TLOG_INFO("binding #%d buffer = %d %d %d %d @ %lu %d = %p +%ld ~%ld", i, binding.dims, binding.w, binding.h, binding.c, binding.elemsize, binding.elempack, binding.buffer(), binding.buffer_offset(), binding.buffer_capacity()); + + if (binding.data->access_flags & VK_ACCESS_SHADER_WRITE_BIT || binding.data->stage_flags != VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT) + { + // barrier device any @ compute/null to shader-readwrite @ compute + VkBufferMemoryBarrier* barriers = new VkBufferMemoryBarrier[1]; + barriers[0].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + barriers[0].pNext = 0; + barriers[0].srcAccessMask = binding.data->access_flags; + barriers[0].dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT; + barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].buffer = binding.buffer(); + barriers[0].offset = binding.buffer_offset(); + barriers[0].size = binding.buffer_capacity(); + + VkPipelineStageFlags src_stage = binding.data->stage_flags; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + + if (vkdev->info.support_VK_KHR_push_descriptor) + { + vkCmdPipelineBarrier(compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, barriers, 0, 0); + delete[] barriers; + } + else + { + record r; + r.type = record::TYPE_buffer_barrers; + r.command_buffer = compute_command_buffer; + r.buffer_barrers.src_stage = src_stage; + r.buffer_barrers.dst_stage = dst_stage; + r.buffer_barrers.barrier_count = 1; + r.buffer_barrers.barriers = barriers; + delayed_records.push_back(r); + } + + // mark device shader-readwrite @ compute + binding.data->access_flags = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT; + binding.data->stage_flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + } + } + else if (binding_type == 2) + { + const VkImageTensor& binding = image_bindings[image_index].empty() ? 
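+            // Storage-image bindings follow the same hazard-tracking idea as the buffer
+            // case above, but with a VkImageMemoryBarrier that also transitions the image
+            // to VK_IMAGE_LAYOUT_GENERAL (the layout storage-image access requires); the
+            // tensor's cached access/layout/stage flags are then updated so an already
+            // shader-ready image does not get a redundant barrier next time.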
vkdev->get_dummy_image() : image_bindings[image_index]; + image_index++; + +// TLOG_INFO("binding #%d image = %d %d %d %d @ %lu %d = %p +%ld ~%ld %p", i, binding.dims, binding.w, binding.h, binding.c, binding.elemsize, binding.elempack, binding.image(), binding.data->bind_offset, binding.data->bind_capacity, binding.imageview()); + + if (binding.data->access_flags & VK_ACCESS_SHADER_WRITE_BIT || binding.data->image_layout != VK_IMAGE_LAYOUT_GENERAL || binding.data->stage_flags != VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT) + { + // image layout transform any @ any to shader-write @ compute + VkImageMemoryBarrier* barriers = new VkImageMemoryBarrier[1]; + barriers[0].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; + barriers[0].pNext = 0; + barriers[0].srcAccessMask = binding.data->access_flags; + barriers[0].dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT; + barriers[0].oldLayout = binding.data->image_layout; + barriers[0].newLayout = VK_IMAGE_LAYOUT_GENERAL; + barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].image = binding.image(); + barriers[0].subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + barriers[0].subresourceRange.baseMipLevel = 0; + barriers[0].subresourceRange.levelCount = 1; + barriers[0].subresourceRange.baseArrayLayer = 0; + barriers[0].subresourceRange.layerCount = 1; + + VkPipelineStageFlags src_stage = binding.data->stage_flags; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + + if (vkdev->info.support_VK_KHR_push_descriptor) + { + vkCmdPipelineBarrier(compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, barriers); + delete[] barriers; + } + else + { + record r; + r.type = record::TYPE_image_barrers; + r.command_buffer = compute_command_buffer; + r.image_barrers.src_stage = src_stage; + r.image_barrers.dst_stage = dst_stage; + r.image_barrers.barrier_count = 1; + r.image_barrers.barriers = barriers; + delayed_records.push_back(r); + } + + // mark image shader-write @ compute + binding.data->access_flags = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT; + binding.data->image_layout = VK_IMAGE_LAYOUT_GENERAL; + binding.data->stage_flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + } + + // image and imageview can not be destroyed until command execution ends + TENGINE_XADD(&binding.data->command_refcount, 1); + image_blocks_to_destroy.push_back(binding.data); + } + else // if (binding_type == 3) + { + const VkImageTensor& binding = image_bindings[image_index].empty() ? 
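+            // Combined-image-sampler bindings are transitioned to
+            // VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL instead, unless the very same image
+            // is also bound as a storage image in this dispatch (the loop below is meant to
+            // detect that case and leave the image in GENERAL). command_refcount is bumped
+            // so the image and its view outlive command execution.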
vkdev->get_dummy_image() : image_bindings[image_index]; + image_index++; + +// TLOG_INFO("binding #%d sampler = %d %d %d %d @ %lu %d = %p +%ld ~%ld %p", i, binding.dims, binding.w, binding.h, binding.c, binding.elemsize, binding.elempack, binding.image(), binding.data->bind_offset, binding.data->bind_capacity, binding.imageview()); + + // if the same image used for both storage image and combined image sampler + // only apply image layout transition to general + for (int j=0; jshader_info.binding_types[j] == 2 && binding.data == image_bindings[j].data) + { + // the same image is used as storage image, skip it + continue; + } + } + + if (binding.data->access_flags & VK_ACCESS_SHADER_WRITE_BIT || binding.data->image_layout != VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL || binding.data->stage_flags != VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT) + { + // image layout transform any @ any to shader-readonly-optimal @ compute + VkImageMemoryBarrier* barriers = new VkImageMemoryBarrier[1]; + barriers[0].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; + barriers[0].pNext = 0; + barriers[0].srcAccessMask = binding.data->access_flags; + barriers[0].dstAccessMask = VK_ACCESS_SHADER_READ_BIT; + barriers[0].oldLayout = binding.data->image_layout; + barriers[0].newLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].image = binding.image(); + barriers[0].subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + barriers[0].subresourceRange.baseMipLevel = 0; + barriers[0].subresourceRange.levelCount = 1; + barriers[0].subresourceRange.baseArrayLayer = 0; + barriers[0].subresourceRange.layerCount = 1; + + VkPipelineStageFlags src_stage = binding.data->stage_flags; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + + if (vkdev->info.support_VK_KHR_push_descriptor) + { + vkCmdPipelineBarrier(compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, barriers); + delete[] barriers; + } + else + { + record r; + r.type = record::TYPE_image_barrers; + r.command_buffer = compute_command_buffer; + r.image_barrers.src_stage = src_stage; + r.image_barrers.dst_stage = dst_stage; + r.image_barrers.barrier_count = 1; + r.image_barrers.barriers = barriers; + delayed_records.push_back(r); + } + + // mark image shader-readonly-optimal @ compute + binding.data->access_flags = VK_ACCESS_SHADER_READ_BIT; + binding.data->image_layout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; + binding.data->stage_flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + } + + // image and imageview can not be destroyed until command execution ends + TENGINE_XADD(&binding.data->command_refcount, 1); + image_blocks_to_destroy.push_back(binding.data); + } + } + // record bind pipeline + { + if (vkdev->info.support_VK_KHR_push_descriptor) + { + vkCmdBindPipeline(compute_command_buffer, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline->pipeline); + } + else + { + record r; + r.type = record::TYPE_bind_pipeline; + r.command_buffer = compute_command_buffer; + r.bind_pipeline.bind_point = VK_PIPELINE_BIND_POINT_COMPUTE; + r.bind_pipeline.pipeline = pipeline->pipeline; + delayed_records.push_back(r); + } + } + + // record update bindings + if (binding_count > 0) + { + std::vector descriptorInfos; + { + descriptorInfos.resize(sizeof(VkDescriptorBufferInfo) * buffer_binding_count + sizeof(VkDescriptorImageInfo) * image_binding_count); + + unsigned char* p_descriptorInfos = descriptorInfos.data(); + int 
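+            // p_descriptorInfos walks a raw byte blob that mirrors the descriptor update
+            // template: one VkDescriptorBufferInfo per storage-buffer binding and one
+            // VkDescriptorImageInfo per image/sampler binding, laid out in binding order.
+            // Illustrative layout for a 3-binding shader (buffer, image, sampler):
+            //   [ VkDescriptorBufferInfo | VkDescriptorImageInfo | VkDescriptorImageInfo ]
+            // This flat format is what vkCmdPushDescriptorSetWithTemplateKHR and
+            // vkUpdateDescriptorSetWithTemplateKHR consume below.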
descriptorBufferInfo_index = 0; + int descriptorImageInfo_index = 0; + for (int i=0; ishader_info.binding_types[i]; + + if (binding_type == 1) + { + const VkTensor& binding = buffer_bindings[descriptorBufferInfo_index].empty() ? vkdev->get_dummy_buffer() : buffer_bindings[descriptorBufferInfo_index]; + descriptorBufferInfo_index++; + + VkDescriptorBufferInfo descriptorBufferInfo; + descriptorBufferInfo.buffer = binding.buffer(); + descriptorBufferInfo.offset = binding.buffer_offset(); + descriptorBufferInfo.range = binding.total() * binding.elemsize; + + memcpy(p_descriptorInfos, &descriptorBufferInfo, sizeof(VkDescriptorBufferInfo)); + p_descriptorInfos += sizeof(VkDescriptorBufferInfo); + } + else //if (binding_type == 2 || binding_type == 3) + { + const VkImageTensor& binding = image_bindings[descriptorImageInfo_index].empty() ? vkdev->get_dummy_image() : image_bindings[descriptorImageInfo_index]; + descriptorImageInfo_index++; + + // we always use immutable nearest sampler set in descroptor layout during pipeline creation + VkDescriptorImageInfo descriptorImageInfo; + descriptorImageInfo.sampler = 0; + descriptorImageInfo.imageView = binding.imageview(); + descriptorImageInfo.imageLayout = binding.data->image_layout; + + memcpy(p_descriptorInfos, &descriptorImageInfo, sizeof(VkDescriptorImageInfo)); + p_descriptorInfos += sizeof(VkDescriptorImageInfo); + } + } + } + + if (vkdev->info.support_VK_KHR_push_descriptor) + { + vkdev->vkCmdPushDescriptorSetWithTemplateKHR(compute_command_buffer, pipeline->descriptor_update_template, pipeline->pipeline_layout, 0, descriptorInfos.data()); + } + else + { + // create new descriptor_pool and descriptorset + VkDescriptorPool descriptor_pool; + { + int image_binding_count = 0; + int sampler_binding_count = 0; + for (int i=0; ishader_info.binding_types[i]; + + if (binding_type == 2) + image_binding_count++; + else // if (binding_type == 3) + sampler_binding_count++; + } + + VkDescriptorPoolSize poolSizes[3]; + poolSizes[0].type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + poolSizes[0].descriptorCount = buffer_binding_count; + poolSizes[1].type = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; + poolSizes[1].descriptorCount = image_binding_count; + poolSizes[2].type = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; + poolSizes[2].descriptorCount = sampler_binding_count; + + VkDescriptorPoolCreateInfo descriptorPoolCreateInfo; + descriptorPoolCreateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO; + descriptorPoolCreateInfo.pNext = 0; + descriptorPoolCreateInfo.flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT; + descriptorPoolCreateInfo.maxSets = 1; + descriptorPoolCreateInfo.poolSizeCount = 3; + descriptorPoolCreateInfo.pPoolSizes = poolSizes; + + VkResult ret = vkCreateDescriptorPool(vkdev->vkdevice(), &descriptorPoolCreateInfo, 0, &descriptor_pool); + if (ret != VK_SUCCESS) + { + printf("vkCreateDescriptorPool failed %d", ret); + return; + } + } + descriptor_pools.push_back(descriptor_pool); + + VkDescriptorSet descriptorset; + { + VkDescriptorSetAllocateInfo descriptorSetAllocateInfo; + descriptorSetAllocateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO; + descriptorSetAllocateInfo.pNext = 0; + descriptorSetAllocateInfo.descriptorPool = descriptor_pool; + descriptorSetAllocateInfo.descriptorSetCount = 1; + descriptorSetAllocateInfo.pSetLayouts = &pipeline->descriptorset_layout; + + VkResult ret = vkAllocateDescriptorSets(vkdev->vkdevice(), &descriptorSetAllocateInfo, &descriptorset); + if (ret != VK_SUCCESS) + { + 
printf("vkAllocateDescriptorSets failed %d", ret); + return; + } + } + descriptorsets.push_back(descriptorset); + + if (vkdev->info.support_VK_KHR_descriptor_update_template) + { + vkdev->vkUpdateDescriptorSetWithTemplateKHR(vkdev->vkdevice(), descriptorset, pipeline->descriptor_update_template, descriptorInfos.data()); + } + else + { + std::vector writeDescriptorSets(binding_count); + { + const unsigned char* p_descriptorInfos = descriptorInfos.data(); + for (int i=0; ishader_info.binding_types[i]; + + writeDescriptorSets[i].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + writeDescriptorSets[i].pNext = 0; + writeDescriptorSets[i].dstSet = descriptorset; + writeDescriptorSets[i].dstBinding = i; + writeDescriptorSets[i].dstArrayElement = 0; + writeDescriptorSets[i].descriptorCount = 1; + writeDescriptorSets[i].pTexelBufferView = 0; + + if (binding_type == 1) + { + writeDescriptorSets[i].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER; + writeDescriptorSets[i].pImageInfo = 0; + writeDescriptorSets[i].pBufferInfo = (const VkDescriptorBufferInfo*)p_descriptorInfos; + + p_descriptorInfos += sizeof(VkDescriptorBufferInfo); + } + else if (binding_type == 2) + { + writeDescriptorSets[i].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; + writeDescriptorSets[i].pImageInfo = (const VkDescriptorImageInfo*)p_descriptorInfos; + writeDescriptorSets[i].pBufferInfo = 0; + + p_descriptorInfos += sizeof(VkDescriptorImageInfo); + } + else // if (binding_type == 3) + { + writeDescriptorSets[i].descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER; + writeDescriptorSets[i].pImageInfo = (const VkDescriptorImageInfo*)p_descriptorInfos; + writeDescriptorSets[i].pBufferInfo = 0; + + p_descriptorInfos += sizeof(VkDescriptorImageInfo); + } + } + } + + vkUpdateDescriptorSets(vkdev->vkdevice(), binding_count, writeDescriptorSets.data(), 0, 0); + } + + record r; + r.type = record::TYPE_bind_descriptorsets; + r.command_buffer = compute_command_buffer; + r.bind_descriptorsets.bind_point = VK_PIPELINE_BIND_POINT_COMPUTE; + r.bind_descriptorsets.pipeline_layout = pipeline->pipeline_layout; + r.bind_descriptorsets.descriptorset_count = 1; + r.bind_descriptorsets.descriptorset_offset = descriptorsets.size() - 1; + delayed_records.push_back(r); + } + } + + // record push constants + if (constant_count > 0) + { + if (vkdev->info.support_VK_KHR_push_descriptor) + { + vkCmdPushConstants(compute_command_buffer, pipeline->pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, constant_count * sizeof(vk_constant_type), constants.data()); + } + else + { + uint32_t size = constant_count * sizeof(vk_constant_type); + unsigned char* constant_values = new unsigned char[size]; + memcpy(constant_values, constants.data(), size); + + record r; + r.type = record::TYPE_push_constants; + r.command_buffer = compute_command_buffer; + r.push_constants.pipeline_layout = pipeline->pipeline_layout; + r.push_constants.stage_flags = VK_SHADER_STAGE_COMPUTE_BIT; + r.push_constants.size = size; + r.push_constants.values = constant_values; + delayed_records.push_back(r); + } + } + + // record dispatch + { + uint32_t group_count_x = (dispatcher_w + pipeline->local_size_x - 1) / pipeline->local_size_x; + uint32_t group_count_y = (dispatcher_h + pipeline->local_size_y - 1) / pipeline->local_size_y; + uint32_t group_count_z = (dispatcher_c + pipeline->local_size_z - 1) / pipeline->local_size_z; + + if (vkdev->info.support_VK_KHR_push_descriptor) + { + vkCmdDispatch(compute_command_buffer, group_count_x, group_count_y, group_count_z); + } + else + { + 
record r; + r.type = record::TYPE_dispatch; + r.command_buffer = compute_command_buffer; + r.dispatch.group_count_x = group_count_x; + r.dispatch.group_count_y = group_count_y; + r.dispatch.group_count_z = group_count_z; + delayed_records.push_back(r); + } + } +} + +VkTransfer::VkTransfer(const GPUDevice* _vkdev) : vkdev(_vkdev) +{ + compute_command_pool = 0; + transfer_command_pool = 0; + + upload_command_buffer = 0; + compute_command_buffer = 0; + + upload_compute_semaphore = 0; + + upload_command_fence = 0; + compute_command_fence = 0; + + init(); +} + +VkTransfer::~VkTransfer() +{ + vkDestroyFence(vkdev->vkdevice(), compute_command_fence, 0); + + vkFreeCommandBuffers(vkdev->vkdevice(), compute_command_pool, 1, &compute_command_buffer); + vkDestroyCommandPool(vkdev->vkdevice(), compute_command_pool, 0); + + if (!vkdev->info.unified_compute_transfer_queue) + { + vkDestroyFence(vkdev->vkdevice(), upload_command_fence, 0); + + vkDestroySemaphore(vkdev->vkdevice(), upload_compute_semaphore, 0); + + vkFreeCommandBuffers(vkdev->vkdevice(), transfer_command_pool, 1, &upload_command_buffer); + vkDestroyCommandPool(vkdev->vkdevice(), transfer_command_pool, 0); + } +} + +int VkTransfer::init() +{ + // compute_command_pool + { + VkCommandPoolCreateInfo commandPoolCreateInfo; + commandPoolCreateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; + commandPoolCreateInfo.pNext = 0; + commandPoolCreateInfo.flags = 0; + commandPoolCreateInfo.queueFamilyIndex = vkdev->info.compute_queue_family_index; + + VkResult ret = vkCreateCommandPool(vkdev->vkdevice(), &commandPoolCreateInfo, 0, &compute_command_pool); + if (ret != VK_SUCCESS) + { + printf("vkCreateCommandPool failed %d", ret); + return -1; + } + } + + // compute_command_buffer + { + VkCommandBufferAllocateInfo commandBufferAllocateInfo; + commandBufferAllocateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; + commandBufferAllocateInfo.pNext = 0; + commandBufferAllocateInfo.commandPool = compute_command_pool; + commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; + commandBufferAllocateInfo.commandBufferCount = 1; + + VkResult ret = vkAllocateCommandBuffers(vkdev->vkdevice(), &commandBufferAllocateInfo, &compute_command_buffer); + if (ret != VK_SUCCESS) + { + printf("vkAllocateCommandBuffers failed %d", ret); + return -1; + } + } + + // compute_command_fence + { + VkFenceCreateInfo fenceCreateInfo; + fenceCreateInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO; + fenceCreateInfo.pNext = 0; + fenceCreateInfo.flags = 0; + + VkResult ret = vkCreateFence(vkdev->vkdevice(), &fenceCreateInfo, 0, &compute_command_fence); + if (ret != VK_SUCCESS) + { + printf("vkCreateFence failed %d", ret); + return -1; + } + } + + if (!vkdev->info.unified_compute_transfer_queue) + { + // transfer_command_pool + { + VkCommandPoolCreateInfo commandPoolCreateInfo; + commandPoolCreateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; + commandPoolCreateInfo.pNext = 0; + commandPoolCreateInfo.flags = 0; + commandPoolCreateInfo.queueFamilyIndex = vkdev->info.transfer_queue_family_index; + + VkResult ret = vkCreateCommandPool(vkdev->vkdevice(), &commandPoolCreateInfo, 0, &transfer_command_pool); + if (ret != VK_SUCCESS) + { + printf("vkCreateCommandPool failed %d", ret); + return -1; + } + } + + // upload_command_buffer + { + VkCommandBufferAllocateInfo commandBufferAllocateInfo; + commandBufferAllocateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO; + commandBufferAllocateInfo.pNext = 0; + commandBufferAllocateInfo.commandPool = 
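+        // On devices without a unified compute+transfer queue, VkTransfer keeps a second
+        // pool/command buffer on the dedicated transfer queue family: uploads are recorded
+        // into upload_command_buffer and chained to the compute queue in submit_and_wait()
+        // through upload_compute_semaphore plus a queue-family ownership release/acquire
+        // pair (see record_upload below).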
transfer_command_pool; + commandBufferAllocateInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY; + commandBufferAllocateInfo.commandBufferCount = 1; + + VkResult ret = vkAllocateCommandBuffers(vkdev->vkdevice(), &commandBufferAllocateInfo, &upload_command_buffer); + if (ret != VK_SUCCESS) + { + printf("vkAllocateCommandBuffers failed %d", ret); + return -1; + } + } + + // upload_compute_semaphore + { + VkSemaphoreCreateInfo semaphoreCreateInfo; + semaphoreCreateInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO; + semaphoreCreateInfo.pNext = 0; + semaphoreCreateInfo.flags = 0; + + VkResult ret = vkCreateSemaphore(vkdev->vkdevice(), &semaphoreCreateInfo, 0, &upload_compute_semaphore); + + if (ret != VK_SUCCESS) + { + printf("vkCreateSemaphore failed %d", ret); + return -1; + } + } + + // upload_command_fence + { + VkFenceCreateInfo fenceCreateInfo; + fenceCreateInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO; + fenceCreateInfo.pNext = 0; + fenceCreateInfo.flags = 0; + + VkResult ret = vkCreateFence(vkdev->vkdevice(), &fenceCreateInfo, 0, &upload_command_fence); + + if (ret != VK_SUCCESS) + { + printf("vkCreateFence failed %d", ret); + return -1; + } + } + } + + begin_command_buffer(); + + return 0; +} + +int VkTransfer::begin_command_buffer() +{ + { + VkCommandBufferBeginInfo commandBufferBeginInfo; + commandBufferBeginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; + commandBufferBeginInfo.pNext = 0; + commandBufferBeginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; + commandBufferBeginInfo.pInheritanceInfo = 0; + + VkResult ret = vkBeginCommandBuffer(compute_command_buffer, &commandBufferBeginInfo); + if (ret != VK_SUCCESS) + { + printf("vkBeginCommandBuffer failed %d", ret); + return -1; + } + } + + if (!vkdev->info.unified_compute_transfer_queue) + { + { + VkCommandBufferBeginInfo commandBufferBeginInfo; + commandBufferBeginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO; + commandBufferBeginInfo.pNext = 0; + commandBufferBeginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT; + commandBufferBeginInfo.pInheritanceInfo = 0; + + VkResult ret = vkBeginCommandBuffer(upload_command_buffer, &commandBufferBeginInfo); + if (ret != VK_SUCCESS) + { + printf("vkBeginCommandBuffer failed %d", ret); + return -1; + } + } + } + return 0; +} + + +int VkTransfer::end_command_buffer() +{ + { + VkResult ret = vkEndCommandBuffer(compute_command_buffer); + if (ret != VK_SUCCESS) + { + printf("vkEndCommandBuffer failed %d", ret); + return -1; + } + } + + if (!vkdev->info.unified_compute_transfer_queue) + { + { + VkResult ret = vkEndCommandBuffer(upload_command_buffer); + if (ret != VK_SUCCESS) + { + printf("vkEndCommandBuffer failed %d", ret); + return -1; + } + } + } + return 0; +} + +int VkTransfer::submit_and_wait() +{ + // end command buffer + { + end_command_buffer(); + } + + VkQueue compute_queue = vkdev->acquire_queue(vkdev->info.compute_queue_family_index); + if (compute_queue == 0) + { + printf("out of compute queue"); + return -1; + } + + if (vkdev->info.unified_compute_transfer_queue) + { + // submit compute + { + VkSubmitInfo submitInfo; + submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + submitInfo.pNext = 0; + submitInfo.waitSemaphoreCount = 0; + submitInfo.pWaitSemaphores = 0; + submitInfo.pWaitDstStageMask = 0; + submitInfo.commandBufferCount = 1; + submitInfo.pCommandBuffers = &compute_command_buffer; + submitInfo.signalSemaphoreCount = 0; + submitInfo.pSignalSemaphores = 0; + + VkResult ret = vkQueueSubmit(compute_queue, 1, &submitInfo, 
compute_command_fence); + if (ret != VK_SUCCESS) + { + printf("vkQueueSubmit failed %d", ret); + vkdev->reclaim_queue(vkdev->info.compute_queue_family_index, compute_queue); + return -1; + } + } + } + else + { + VkQueue transfer_queue = vkdev->acquire_queue(vkdev->info.transfer_queue_family_index); + if (transfer_queue == 0) + { + printf("out of transfer queue"); + vkdev->reclaim_queue(vkdev->info.compute_queue_family_index, compute_queue); + return -1; + } + + // submit upload compute + { + VkSubmitInfo submitInfo; + submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + submitInfo.pNext = 0; + submitInfo.waitSemaphoreCount = 0; + submitInfo.pWaitSemaphores = 0; + submitInfo.pWaitDstStageMask = 0; + submitInfo.commandBufferCount = 1; + submitInfo.pCommandBuffers = &upload_command_buffer; + submitInfo.signalSemaphoreCount = 1; + submitInfo.pSignalSemaphores = &upload_compute_semaphore; + + VkResult ret = vkQueueSubmit(transfer_queue, 1, &submitInfo, upload_command_fence); + if (ret != VK_SUCCESS) + { + printf("vkQueueSubmit failed %d", ret); + vkdev->reclaim_queue(vkdev->info.transfer_queue_family_index, transfer_queue); + vkdev->reclaim_queue(vkdev->info.compute_queue_family_index, compute_queue); + return -1; + } + } + + { + VkPipelineStageFlags wait_dst_stage = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT;// FIXME + VkSubmitInfo submitInfo; + submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; + submitInfo.pNext = 0; + submitInfo.waitSemaphoreCount = 1; + submitInfo.pWaitSemaphores = &upload_compute_semaphore; + submitInfo.pWaitDstStageMask = &wait_dst_stage; + submitInfo.commandBufferCount = 1; + submitInfo.pCommandBuffers = &compute_command_buffer; + submitInfo.signalSemaphoreCount = 0; + submitInfo.pSignalSemaphores = 0; + + VkResult ret = vkQueueSubmit(compute_queue, 1, &submitInfo, compute_command_fence); + + if (ret != VK_SUCCESS) + { + printf("vkQueueSubmit failed %d", ret); + vkdev->reclaim_queue(vkdev->info.transfer_queue_family_index, transfer_queue); + vkdev->reclaim_queue(vkdev->info.compute_queue_family_index, compute_queue); + return -1; + } + } + + vkdev->reclaim_queue(vkdev->info.transfer_queue_family_index, transfer_queue); + } + vkdev->reclaim_queue(vkdev->info.compute_queue_family_index, compute_queue); + + // wait + if (vkdev->info.unified_compute_transfer_queue) + { + VkResult ret = vkWaitForFences(vkdev->vkdevice(), 1, &compute_command_fence, VK_TRUE, UINT64_MAX); + if (ret != VK_SUCCESS) + { + printf("vkWaitForFences failed %d", ret); + return -1; + } + } + else + { + VkFence fences[2] = { upload_command_fence, compute_command_fence }; + + VkResult ret = vkWaitForFences(vkdev->vkdevice(), 2, fences, VK_TRUE, UINT64_MAX); + if (ret != VK_SUCCESS) + { + printf("vkWaitForFences failed %d", ret); + return -1; + } + } + return 0; +} + +void VkTransfer::record_upload(const Tensor& src, VkTensor& dst, const Option& opt) +{ +// TLOG_INFO("record_upload src = %d | %d %d %d @ %d", src.dims, src.w, src.h, src.c, src.elempack); + + // NOTE keep the hack here ? 
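+    // The "hack": when fp16 storage (or fp16 packing on elempack % 4 == 0 tensors) is
+    // enabled, the fp32 source is converted on the host with
+    // TEngine::cast_float32_to_float16 and record_upload() re-enters with the fp16 copy,
+    // roughly halving upload traffic and device memory.
+    // A hypothetical call site (names illustrative, not part of this patch):
+    //
+    //   VkTransfer cmd(vkdev);
+    //   VkTensor d_weight;
+    //   cmd.record_upload(weight, d_weight, opt);   // may re-enter with an fp16 copy
+    //   cmd.submit_and_wait();                      // blocks until the copy has landed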
+ if (src.elemsize == src.elempack * 4u) + { + if (opt.use_fp16_storage || (opt.use_fp16_packed && src.elempack % 4 == 0)) + { + // printf("VkTransfer record_upload, cast fp32 to fp16, need to be done, fix me\n"); + Tensor src_fp16; + TEngine::cast_float32_to_float16(src, src_fp16); + record_upload(src_fp16, dst, opt); + + return; + } + } + + Tensor src_flattened = src.reshape(src.w * src.h * src.c); + + // create dst + dst.create_like(src_flattened, opt.blob_vkallocator); + + if (dst.empty()) + { + return; + } + + if (dst.allocator->mappable) + { + // memcpy src_flattened to device + memcpy(dst.mapped_ptr(), src_flattened.data, src_flattened.total() * src_flattened.elemsize); + dst.allocator->flush(dst.data); + + // barrier device host-write @ null to shader-read @ compute + { + VkBufferMemoryBarrier barrier; + barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + barrier.pNext = 0; + barrier.srcAccessMask = VK_ACCESS_HOST_WRITE_BIT; + barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; + barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.buffer = dst.buffer(); + barrier.offset = dst.buffer_offset(); + barrier.size = dst.buffer_capacity(); + + VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_HOST_BIT; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + + vkCmdPipelineBarrier(compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, &barrier, 0, 0); + } + + // mark device shader-readwrite @ compute + dst.data->access_flags = VK_ACCESS_SHADER_READ_BIT; + dst.data->stage_flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + + return; + } + + // create staging + VkTensor dst_staging; + dst_staging.create_like(src_flattened, opt.staging_vkallocator); + + // memcpy src_flattened to staging + memcpy(dst_staging.mapped_ptr(), src_flattened.data, src_flattened.total() * src_flattened.elemsize); + dst_staging.allocator->flush(dst_staging.data); + + VkCommandBuffer command_buffer; + if (vkdev->info.unified_compute_transfer_queue) + { + command_buffer = compute_command_buffer; + } + else + { + command_buffer = upload_command_buffer; + } + + // barrier staging host-write @ null to transfer-read @ queue + { + VkBufferMemoryBarrier barrier; + barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + barrier.pNext = 0; + barrier.srcAccessMask = VK_ACCESS_HOST_WRITE_BIT; + barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT; + barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.buffer = dst_staging.buffer(); + barrier.offset = dst_staging.buffer_offset(); + barrier.size = dst_staging.buffer_capacity(); + + VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_HOST_BIT; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; + + vkCmdPipelineBarrier(command_buffer, src_stage, dst_stage, 0, 0, 0, 1, &barrier, 0, 0); + } + + // record staging to device + { + VkBufferCopy region; + region.srcOffset = dst_staging.buffer_offset(); + region.dstOffset = dst.buffer_offset(); + region.size = std::min(dst_staging.buffer_capacity(), dst.buffer_capacity()); + + vkCmdCopyBuffer(command_buffer, dst_staging.buffer(), dst.buffer(), 1, ®ion); + } + + if (vkdev->info.unified_compute_transfer_queue) + { + // barrier device transfer-write @ compute to shader-read @ compute + { + VkBufferMemoryBarrier barrier; + barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + barrier.pNext = 0; + barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + 
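+        // Same-queue case: a single TRANSFER_WRITE -> SHADER_READ barrier makes the staged
+        // copy visible to later compute dispatches. The else branch below handles the
+        // split-queue case instead, releasing the buffer from the transfer queue family and
+        // acquiring it on the compute queue family (both halves of a Vulkan queue-family
+        // ownership transfer are required, one recorded in each command buffer).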
barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; + barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.buffer = dst.buffer(); + barrier.offset = dst.buffer_offset(); + barrier.size = dst.buffer_capacity(); + + VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + + vkCmdPipelineBarrier(command_buffer, src_stage, dst_stage, 0, 0, 0, 1, &barrier, 0, 0); + } + } + else + { + // queue ownership transfer transfer-write @ transfer to shader-read @ compute + + // release + { + VkBufferMemoryBarrier barrier; + barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + barrier.pNext = 0; + barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + barrier.dstAccessMask = 0; + barrier.srcQueueFamilyIndex = vkdev->info.transfer_queue_family_index; + barrier.dstQueueFamilyIndex = vkdev->info.compute_queue_family_index; + barrier.buffer = dst.buffer(); + barrier.offset = dst.buffer_offset(); + barrier.size = dst.buffer_capacity(); + + VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT; + + vkCmdPipelineBarrier(upload_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, &barrier, 0, 0); + } + + // acquire + { + VkBufferMemoryBarrier barrier; + barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + barrier.pNext = 0; + barrier.srcAccessMask = 0; + barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; + barrier.srcQueueFamilyIndex = vkdev->info.transfer_queue_family_index; + barrier.dstQueueFamilyIndex = vkdev->info.compute_queue_family_index; + barrier.buffer = dst.buffer(); + barrier.offset = dst.buffer_offset(); + barrier.size = dst.buffer_capacity(); + + VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + + vkCmdPipelineBarrier(compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, &barrier, 0, 0); + } + } + + // mark device shader-readwrite @ compute + dst.data->access_flags = VK_ACCESS_SHADER_READ_BIT; + dst.data->stage_flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + + // stash staging + upload_staging_buffers.push_back(dst_staging); +} + +void VkTransfer::record_upload(const tensor* src, VkTensor& dst, const Option& opt) +{ +// TLOG_INFO("record_upload src = %d | %d %d %d @ %d", src.dims, src.w, src.h, src.c, src.elempack); + + // NOTE keep the hack here ? 
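+    // Unlike the Tensor overload above, this raw `tensor*` path uploads the data as a flat
+    // elem_num x elem_size buffer with no reshape/packing, and its fp32 -> fp16 cast is
+    // still a stub: it only prints "fix me" and returns early without uploading, so fp16
+    // storage presumably takes effect through the Tensor overload only for now.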
+ // printf("elem size: %d, elempack:%d\n", src.elemsize, src.elempack); + if (src->elem_size == opt.elempack * 4u) + { + if (opt.use_fp16_storage || (opt.use_fp16_packed && opt.elempack % 4 == 0)) + { + printf("VkTransfer record_upload, cast fp32 to fp16, need to be done, fix me\n"); + // Mat src_fp16; + // cast_float32_to_float16(src, src_fp16); + + // record_upload(src_fp16, dst, opt); + + return; + } + } + + // Mat src_flattened = src.reshape(src.w * src.h * src.c); + + // create dst + // dst.create_like(src_flattened, opt.blob_vkallocator); + // int elemnum = src->elem_num; // src->GetTotalSize()/sizeof(float); + dst.create(src->elem_num, src->elem_size, opt.blob_vkallocator); + + if (dst.empty()) + { + return; + } + + if (dst.allocator->mappable) + { + // memcpy src_flattened to device + memcpy(dst.mapped_ptr(), src->data, src->elem_num * src->elem_size); + dst.allocator->flush(dst.data); + + // barrier device host-write @ null to shader-read @ compute + { + VkBufferMemoryBarrier barrier; + barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + barrier.pNext = 0; + barrier.srcAccessMask = VK_ACCESS_HOST_WRITE_BIT; + barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; + barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.buffer = dst.buffer(); + barrier.offset = dst.buffer_offset(); + barrier.size = dst.buffer_capacity(); + + VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_HOST_BIT; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + + vkCmdPipelineBarrier(compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, &barrier, 0, 0); + } + + // mark device shader-readwrite @ compute + dst.data->access_flags = VK_ACCESS_SHADER_READ_BIT; + dst.data->stage_flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + + return; + } + + printf("run create staging\n"); + // create staging + VkTensor dst_staging; + dst_staging.create(src->elem_num, src->elem_size, opt.staging_vkallocator); + // dst_staging.create_like(src_flattened, opt.staging_vkallocator); + + // memcpy src_flattened to staging + memcpy(dst_staging.mapped_ptr(), src->data, src->elem_num * src->elem_size); + dst_staging.allocator->flush(dst_staging.data); + + VkCommandBuffer command_buffer; + if (vkdev->info.unified_compute_transfer_queue) + { + command_buffer = compute_command_buffer; + } + else + { + command_buffer = upload_command_buffer; + } + + // barrier staging host-write @ null to transfer-read @ queue + { + VkBufferMemoryBarrier barrier; + barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + barrier.pNext = 0; + barrier.srcAccessMask = VK_ACCESS_HOST_WRITE_BIT; + barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT; + barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.buffer = dst_staging.buffer(); + barrier.offset = dst_staging.buffer_offset(); + barrier.size = dst_staging.buffer_capacity(); + + VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_HOST_BIT; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; + + vkCmdPipelineBarrier(command_buffer, src_stage, dst_stage, 0, 0, 0, 1, &barrier, 0, 0); + } + + // record staging to device + { + VkBufferCopy region; + region.srcOffset = dst_staging.buffer_offset(); + region.dstOffset = dst.buffer_offset(); + region.size = std::min(dst_staging.buffer_capacity(), dst.buffer_capacity()); + + vkCmdCopyBuffer(command_buffer, dst_staging.buffer(), dst.buffer(), 1, ®ion); + } + + if 
(vkdev->info.unified_compute_transfer_queue) + { + // barrier device transfer-write @ compute to shader-read @ compute + { + VkBufferMemoryBarrier barrier; + barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + barrier.pNext = 0; + barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; + barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barrier.buffer = dst.buffer(); + barrier.offset = dst.buffer_offset(); + barrier.size = dst.buffer_capacity(); + + VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + + vkCmdPipelineBarrier(command_buffer, src_stage, dst_stage, 0, 0, 0, 1, &barrier, 0, 0); + } + } + else + { + // queue ownership transfer transfer-write @ transfer to shader-read @ compute + + // release + { + VkBufferMemoryBarrier barrier; + barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + barrier.pNext = 0; + barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; + barrier.dstAccessMask = 0; + barrier.srcQueueFamilyIndex = vkdev->info.transfer_queue_family_index; + barrier.dstQueueFamilyIndex = vkdev->info.compute_queue_family_index; + barrier.buffer = dst.buffer(); + barrier.offset = dst.buffer_offset(); + barrier.size = dst.buffer_capacity(); + + VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TRANSFER_BIT; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT; + + vkCmdPipelineBarrier(upload_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, &barrier, 0, 0); + } + + // acquire + { + VkBufferMemoryBarrier barrier; + barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; + barrier.pNext = 0; + barrier.srcAccessMask = 0; + barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; + barrier.srcQueueFamilyIndex = vkdev->info.transfer_queue_family_index; + barrier.dstQueueFamilyIndex = vkdev->info.compute_queue_family_index; + barrier.buffer = dst.buffer(); + barrier.offset = dst.buffer_offset(); + barrier.size = dst.buffer_capacity(); + + VkPipelineStageFlags src_stage = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + + vkCmdPipelineBarrier(compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, &barrier, 0, 0); + } + } + + // mark device shader-readwrite @ compute + dst.data->access_flags = VK_ACCESS_SHADER_READ_BIT; + dst.data->stage_flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + + // stash staging + upload_staging_buffers.push_back(dst_staging); +} + +} // namespace TEngine diff --git a/source/device/vulkan/vulkan_command.hpp b/source/device/vulkan/vulkan_command.hpp new file mode 100644 index 000000000..1f5e82e06 --- /dev/null +++ b/source/device/vulkan/vulkan_command.hpp @@ -0,0 +1,168 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2020, OPEN AI LAB + * Author: ddzhao@openailab.com + */ + +#ifndef VULKAN_COMMAND_HPP +#define VULKAN_COMMAND_HPP + +#include +#include +#include "vulkan_allocator.hpp" +#include "vulkan_tensor.hpp" +#include "vulkan_pipeline.hpp" +#include "vulkan_option.hpp" +#include "vulkan_platform.hpp" +// #include "tengine_log.h" + +namespace TEngine { + +class Pipeline; +class VkCompute +{ +public: + VkCompute(const GPUDevice* vkdev); + virtual ~VkCompute(); + +public: + void record_upload(tensor* src, VkTensor& dst, const Option& opt); + void record_upload(const Tensor& src, VkTensor& dst, const Option& opt); + + void record_download(const VkTensor& src, tensor* dst, const Option& opt); + void record_download(const VkTensor& src, Tensor& dst, const Option& opt); + + void record_pipeline(const Pipeline* pipeline, const std::vector& bindings, const std::vector& constants, const VkTensor& dispatcher); + void record_pipeline(const Pipeline* pipeline, const std::vector& bindings, const std::vector& constants, const VkImageTensor& dispatcher); + void record_pipeline(const Pipeline* pipeline, const std::vector& buffer_bindings, const std::vector& image_bindings, const std::vector& constants, const VkTensor& dispatcher); + void record_pipeline(const Pipeline* pipeline, const std::vector& buffer_bindings, const std::vector& image_bindings, const std::vector& constants, const VkImageTensor& dispatcher); + void record_pipeline(const Pipeline* pipeline, const std::vector& buffer_bindings, const std::vector& image_bindings, const std::vector& constants, int dispatcher_w, int dispatcher_h, int dispatcher_c); + + int submit_and_wait(); + + int reset(); + +protected: + int init(); + int begin_command_buffer(); + int end_command_buffer(); + +protected: + const GPUDevice* vkdev; + + VkCommandPool compute_command_pool; + VkCommandBuffer compute_command_buffer; + VkFence compute_command_fence; + + std::vector upload_staging_buffers; + std::vector download_post_buffers; + std::vector download_post_tensors_fp16; + std::vector download_post_tensors; + std::vector image_blocks_to_destroy; + + // the good-old path for device without VK_KHR_push_descriptor + std::vector descriptor_pools; + std::vector descriptorsets; + + struct record + { + enum + { + TYPE_copy_buffer, + TYPE_copy_image, + TYPE_copy_buffer_to_image, + TYPE_copy_image_to_buffer, + TYPE_bind_pipeline, + TYPE_bind_descriptorsets, + TYPE_push_constants, + TYPE_dispatch, + TYPE_memory_barrers, + TYPE_buffer_barrers, + TYPE_image_barrers, + TYPE_post_download, + TYPE_post_cast_float16_to_float32, + }; + + int type; + VkCommandBuffer command_buffer; + + union + { + struct { VkBuffer src; VkBuffer dst; uint32_t region_count; const VkBufferCopy* regions; } copy_buffer; + struct { VkImage src; VkImageLayout src_layout; VkImage dst; VkImageLayout dst_layout; uint32_t region_count; const VkImageCopy* regions; } copy_image; + struct { VkBuffer src; VkImage dst; VkImageLayout layout; uint32_t region_count; const VkBufferImageCopy* regions; } copy_buffer_to_image; + struct { VkImage src; VkImageLayout layout; VkBuffer dst; uint32_t region_count; const VkBufferImageCopy* regions; } copy_image_to_buffer; + + struct { VkPipelineBindPoint bind_point; VkPipeline pipeline; } bind_pipeline; + struct { VkPipelineBindPoint bind_point; VkPipelineLayout pipeline_layout; uint32_t descriptorset_count; uint32_t descriptorset_offset; } 
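+        // Each `record` is a small tagged union: `type` selects which union member is
+        // valid, and every member simply captures the arguments of the Vulkan command it
+        // defers (vkCmdCopyBuffer, vkCmdBindPipeline, vkCmdBindDescriptorSets,
+        // vkCmdPushConstants, vkCmdDispatch, pipeline barriers) or of the two host-side
+        // post-download steps, so submit_and_wait() can replay them verbatim.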
bind_descriptorsets; + struct { VkPipelineLayout pipeline_layout; VkShaderStageFlags stage_flags; uint32_t size; const void* values; } push_constants; + + struct { uint32_t group_count_x; uint32_t group_count_y; uint32_t group_count_z; } dispatch; + + struct { VkPipelineStageFlags src_stage; VkPipelineStageFlags dst_stage; uint32_t barrier_count; const VkMemoryBarrier* barriers; } memory_barrers; + struct { VkPipelineStageFlags src_stage; VkPipelineStageFlags dst_stage; uint32_t barrier_count; const VkBufferMemoryBarrier* barriers; } buffer_barrers; + struct { VkPipelineStageFlags src_stage; VkPipelineStageFlags dst_stage; uint32_t barrier_count; const VkImageMemoryBarrier* barriers; } image_barrers; + + struct { uint32_t download_post_buffer_mat_offset; uint32_t download_post_mat_fp16_offset; } post_download; + struct { uint32_t download_post_mat_fp16_offset; uint32_t download_post_mat_offset; } post_cast_float16_to_float32; + }; + }; + + std::vector delayed_records; +}; + + +class VkTransfer +{ +public: + VkTransfer(const GPUDevice* vkdev); + ~VkTransfer(); +public: + void record_upload(const tensor* src, VkTensor& dst, const Option& opt); + void record_upload(const Tensor& src, VkTensor& dst, const Option& opt); + + int submit_and_wait(); + +protected: + int init(); + int begin_command_buffer(); + int end_command_buffer(); + +protected: + const GPUDevice* vkdev; + + VkCommandPool compute_command_pool; + VkCommandPool transfer_command_pool; + + VkCommandBuffer upload_command_buffer; + VkCommandBuffer compute_command_buffer; + + VkSemaphore upload_compute_semaphore; + + VkFence upload_command_fence; + VkFence compute_command_fence; + + std::vector upload_staging_buffers; +}; + +} // namespace TEngine + +#endif diff --git a/source/device/vulkan/vulkan_define.h b/source/device/vulkan/vulkan_define.h new file mode 100644 index 000000000..e0c68277a --- /dev/null +++ b/source/device/vulkan/vulkan_define.h @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2021, OPEN AI LAB + * Author: hhchen@openailab.com + */ + +#pragma once + +#define VULKAN_DEV_NAME "VK" + + +typedef struct vulkan_option +{ + char* dev_name; + int precision; //!< precision of calculation +} vulkan_opt_t; diff --git a/source/device/vulkan/vulkan_device.cc b/source/device/vulkan/vulkan_device.cc new file mode 100644 index 000000000..57067405b --- /dev/null +++ b/source/device/vulkan/vulkan_device.cc @@ -0,0 +1,234 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2021, OPEN AI LAB + * Author: hhchen@openailab.com + */ + +#include "vulkan_device.hpp" + +#include "vulkan_limit.hpp" +#include "vulkan_graph.hpp" + +extern "C" +{ +#include "api/c_api.h" +#include "device/device.h" +#include "graph/tensor.h" +#include "graph/node.h" +#include "graph/graph.h" +#include "graph/subgraph.h" +#include "executer/executer.h" +#include "optimizer/split.h" +#include "module/module.h" +#include "utility/vector.h" +#include "utility/log.h" +} + +#include + + +int vulkan_describe(struct device* device, struct vector* allowed_ops, struct vector* blocked_ops, struct vector* precision) +{ + (void)device; + + for (int op_type : vulkan_supported_ops) + { + push_vector_data(allowed_ops, &op_type); + } + + for (int i = 0, j = 0; i < OP_BUILTIN_LAST; i++) + { + int op_type = vulkan_supported_ops[j]; + if (op_type != i) + { + push_vector_data(blocked_ops, &i); + } + else + { + if (j < sizeof(vulkan_supported_ops) / sizeof(vulkan_supported_ops[0])) + j++; + } + } + + int precision_var = TENGINE_DT_UINT8; + push_vector_data(precision, &precision_var); + precision_var = TENGINE_DT_FP16; + push_vector_data(precision, &precision_var); + precision_var = TENGINE_DT_FP32; + push_vector_data(precision, &precision_var); + + return 0; +} + + +int vulkan_evaluation(struct device* device, struct subgraph* sub_graph, struct vector* evolution_tensors, struct vector* evolution_nodes) +{ + // nothing to do with vulkan + (void)device; + (void)sub_graph; + (void)evolution_tensors; + (void)evolution_nodes; + + return 0; +} + + +int vulkan_allocate(struct device* device, struct subgraph* sub_graph) +{ + if (nullptr == device) + { + return -1; + } + + /* set the correct input wait count: INPUT tensor is always ready */ + sub_graph->input_wait_count = 0; + + for (int i = 0; i < sub_graph->input_num; i++) + { + struct tensor* tensor = get_ir_graph_tensor(sub_graph->graph, sub_graph->input_tensor_list[i]); + + if (tensor->tensor_type == TENSOR_TYPE_VAR) + sub_graph->input_wait_count++; + } + + return 0; +} + + +int vulkan_release(struct device* device, struct subgraph* sub_graph) +{ + (void)sub_graph; + + return 0; +} + +int vulkan_split_graph(struct graph* ir_graph) +{ + struct device* cur_dev = ir_graph->attribute->context->device; + + if (0 != strcmp(VULKAN_DEV_NAME, cur_dev->name)) + { + return -1; + } + + struct vector* allowed_ops = create_vector(sizeof(int), nullptr); + struct vector* blocked_ops = create_vector(sizeof(int), nullptr); + struct vector* precision = create_vector(sizeof(int), nullptr); + + cur_dev->allocator->describe(cur_dev, allowed_ops, blocked_ops, precision); + + split_graph_node_to_sub_graph(ir_graph, allowed_ops, blocked_ops, precision); + + release_vector(allowed_ops); + release_vector(blocked_ops); + release_vector(precision); + + // + generate_sub_graph_io(ir_graph); + add_sub_graph_to_ir_graph(ir_graph); + + // add node sub graph id + for (int i = 0; i < 
(uint16_t)get_vector_num(ir_graph->subgraph_list); i++) + { + struct subgraph* sub_graph = *(struct subgraph**)get_vector_data(ir_graph->subgraph_list, i); + sub_graph->index = i; + + for (uint16_t j = 0; j < sub_graph->node_num; j++) + { + uint16_t node_id = sub_graph->node_list[j]; + struct node* ir_node = get_ir_graph_node(ir_graph, node_id); + ir_node->subgraph_idx = sub_graph->index; + } + } + + return 0; +} + + +extern "C" +{ +static struct interface vulkan_interface = { + .init = vulkan_dev_init, + .pre_run = vulkan_dev_prerun, + .run = vulkan_dev_run, + .post_run = vulkan_dev_postrun, + .async_run = nullptr, + .async_wait = nullptr, + .release_graph = nullptr, + .release_device = vulkan_dev_release, +}; + + +static struct allocator vulkan_allocator = { + .describe = vulkan_describe, + .evaluation = vulkan_evaluation, + .allocate = vulkan_allocate, + .release = vulkan_release, +}; + + +static struct optimizer vulkan_optimizer = { + .split_graph = vulkan_split_graph, + .optimize_graph = nullptr, +}; + + + +static struct vulkan_device vulkan_dev = { + .base = { + .name = VULKAN_DEV_NAME, + .interface = &vulkan_interface, + .allocator = &vulkan_allocator, + .optimizer = &vulkan_optimizer, + .scheduler = nullptr, + .privacy = nullptr, + }, +}; + + +int register_vulkan_device(void) +{ + int ret = register_device(&vulkan_dev.base); + if (0 != ret) + { + TLOG_INFO("Tengine plugin %s register failed.\n", vulkan_dev.base.name); + return -1; + } + + TLOG_INFO("Tengine plugin device %s is registered.\n", vulkan_dev.base.name); + return 0; +} + + +int unregister_vulkan_device(void) +{ + int ret = unregister_device(&vulkan_dev.base); + if (0 != ret) + { + TLOG_INFO("Tengine plugin %s unregister failed.\n", vulkan_dev.base.name); + return ret; + } + + TLOG_INFO("Tengine plugin device %s is unregistered.\n", vulkan_dev.base.name); + + return 0; +} +} diff --git a/source/device/vulkan/vulkan_device.hpp b/source/device/vulkan/vulkan_device.hpp new file mode 100644 index 000000000..9560261fe --- /dev/null +++ b/source/device/vulkan/vulkan_device.hpp @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2021, OPEN AI LAB + * Author: hhchen@openailab.com + */ + +#pragma once + +#include "vulkan_define.h" + +extern "C" +{ +#include "api/c_api.h" +#include "device/device.h" + +struct vulkan_device +{ + struct device base; +}; + +DLLEXPORT int register_vulkan_device(void); +} \ No newline at end of file diff --git a/source/device/vulkan/vulkan_executor.cc b/source/device/vulkan/vulkan_executor.cc new file mode 100644 index 000000000..ca030e894 --- /dev/null +++ b/source/device/vulkan/vulkan_executor.cc @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2021, OPEN AI LAB + * Author: hhchen@openailab.com + */ + +#include "vulkan_executor.hpp" +#include "vulkan_helper.hpp" +#include "vulkan_gpu.hpp" +#include "vulkan_graph.hpp" + +extern "C" +{ +#include "operator/op.h" +#include "convolution_param.h" +} + +using namespace TEngine; + +bool VULKANEngine::init() +{ + return true; +} + + + +int VULKANEngine::VULKANEnginePreRun(struct subgraph* subgraph) +{ + // TLOG_INFO("==== vulkan prerun start ====\n"); + create_gpu_instance(); + // struct device *vk_dev = (struct device *)dev; + struct graph *orig_graph = subgraph->graph; + // struct vk_dev_priv *priv = (struct vk_dev_priv *)orig_graph->dev_priv; + + // /* todo: set the tensor shape ? */ + + // /* create exec_graph */ + VulkanGraph* vk_exec_graph = new VulkanGraph(subgraph); + + if (vk_exec_graph == nullptr) + { + // set_tengine_errno(EIO); + TLOG_ERR("vulkan exec graph is NULL\n"); + return -1; + } + + vk_exec_graph->upload_model(); + vk_exec_graph->create_pipeline(); + + subgraph->device_graph = vk_exec_graph; + + int node_num = subgraph->node_num; + TLOG_INFO("==== vulkan prerun done ====\n"); + return 0; + +}; + +int VULKANEngine::VULKANEngineRun(struct subgraph* subgraph) +{ + // struct vk_device *vk_dev = (struct vk_device *)dev; + struct graph *orig_graph = subgraph->graph; + // struct vk_dev_priv *priv = (struct vk_dev_priv *)orig_graph->dev_priv; + + VulkanGraph *vk_exec_graph = (VulkanGraph *)subgraph->device_graph; + if (vk_exec_graph == nullptr) + { + // set_tengine_errno(EIO); + TLOG_ERR("vulkan exec graph is NULL\n"); + return -1; + } + + vk_exec_graph->record_graph_pipeline(); + return 0; +} + +void VULKANEngine::VULKANEnginePostRun() +{ + destroy_gpu_instance(); + return; +}; \ No newline at end of file diff --git a/source/device/vulkan/vulkan_executor.hpp b/source/device/vulkan/vulkan_executor.hpp new file mode 100644 index 000000000..28ae46efb --- /dev/null +++ b/source/device/vulkan/vulkan_executor.hpp @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2021, Open AI Lab + * Author: lswang@openailab.com + */ + + +extern "C" +{ +#include "api/c_api.h" +#include "device/device.h" +#include "graph/tensor.h" +#include "graph/node.h" +#include "graph/graph.h" +#include "graph/subgraph.h" +#include "executer/executer.h" +#include "optimizer/split.h" +#include "module/module.h" +#include "utility/vector.h" +#include "utility/log.h" +} + +#include +#include +#include +#include +#include +#include + +// #include + +#include + +// typedef std::map dict_uint2clmem; + +struct VULKANqueue +{ + std::string name; + int dims; + // cl_kernel queue_kernel; + // cl_event enentPoint; + size_t *queue_global_work_size; + size_t *queue_local_work_size; +}; + +class VULKANEngine +{ +public: +// VULKANEngine(); +// ~VULKANEngine() = default; + + int VULKANEnginePreRun(struct subgraph* subgraph); + int VULKANEngineRun(struct subgraph* subgraph); + void VULKANEnginePostRun(); + +private: + bool init(); + +private: + +public: + // dict_uint2clmem vulkan_tensor_map; + std::vector queue_list; + +public: + int bin_num; + +}; + + + diff --git a/source/device/vulkan/vulkan_gpu.cpp b/source/device/vulkan/vulkan_gpu.cpp new file mode 100644 index 000000000..dac4e9486 --- /dev/null +++ b/source/device/vulkan/vulkan_gpu.cpp @@ -0,0 +1,2036 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. 
You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#include "vulkan_gpu.hpp" +#include "vulkan_option.hpp" +#include +#include +#include + +#include +#include +#include + +#include "layer/packing_vulkan.hpp" + +#if __ANDROID__ +#define ENABLE_VALIDATION_LAYER 0 +#else +#define ENABLE_VALIDATION_LAYER 0 +#endif + +namespace TEngine { + +// global +static VkInstance g_instance = 0; +static int g_gpu_count = 0; +static int g_default_gpu_index = -1; + +// experience value +#define MAX_GPU_COUNT 8 +static GpuInfo g_gpu_infos[MAX_GPU_COUNT]; + +// TODO +// default +// static Mutex g_default_vkdev_lock; +static GPUDevice* g_default_vkdev[MAX_GPU_COUNT] = {0}; + +// precompiled spirv +struct layer_shader_registry_entry +{ + const uint32_t* spv_data; + size_t spv_data_size; +}; + +#include "layer_shader_spv_data.h" + +static const layer_shader_registry_entry layer_shader_registry[] = +{ +#include "layer_shader_registry.h" +}; + +static ShaderInfo layer_shader_infos[sizeof(layer_shader_registry) / sizeof(layer_shader_registry_entry)]; + +static const int layer_shader_registry_entry_count = sizeof(layer_shader_registry) / sizeof(layer_shader_registry_entry); + +int support_VK_KHR_external_memory_capabilities = 0; +int support_VK_KHR_get_physical_device_properties2 = 0; +int support_VK_KHR_get_surface_capabilities2 = 0; +int support_VK_KHR_surface = 0; +int support_VK_EXT_debug_utils = 0; + +#if __ANDROID_API__ >= 26 +int support_VK_KHR_android_surface = 0; +#endif // __ANDROID_API__ >= 26 + +// VK_KHR_external_memory_capabilities +PFN_vkGetPhysicalDeviceExternalBufferPropertiesKHR vkGetPhysicalDeviceExternalBufferPropertiesKHR = 0; + +// VK_KHR_get_physical_device_properties2 +PFN_vkGetPhysicalDeviceFeatures2KHR vkGetPhysicalDeviceFeatures2KHR = 0; +PFN_vkGetPhysicalDeviceProperties2KHR vkGetPhysicalDeviceProperties2KHR = 0; +PFN_vkGetPhysicalDeviceFormatProperties2KHR vkGetPhysicalDeviceFormatProperties2KHR = 0; +PFN_vkGetPhysicalDeviceImageFormatProperties2KHR vkGetPhysicalDeviceImageFormatProperties2KHR = 0; +PFN_vkGetPhysicalDeviceQueueFamilyProperties2KHR vkGetPhysicalDeviceQueueFamilyProperties2KHR = 0; +PFN_vkGetPhysicalDeviceMemoryProperties2KHR vkGetPhysicalDeviceMemoryProperties2KHR = 0; +PFN_vkGetPhysicalDeviceSparseImageFormatProperties2KHR vkGetPhysicalDeviceSparseImageFormatProperties2KHR = 0; + +// VK_KHR_get_surface_capabilities2 +PFN_vkGetPhysicalDeviceSurfaceCapabilities2KHR vkGetPhysicalDeviceSurfaceCapabilities2KHR = 0; +PFN_vkGetPhysicalDeviceSurfaceFormats2KHR vkGetPhysicalDeviceSurfaceFormats2KHR = 0; + +// VK_KHR_surface +PFN_vkDestroySurfaceKHR vkDestroySurfaceKHR = 0; +PFN_vkGetPhysicalDeviceSurfaceSupportKHR vkGetPhysicalDeviceSurfaceSupportKHR = 0; +PFN_vkGetPhysicalDeviceSurfaceCapabilitiesKHR vkGetPhysicalDeviceSurfaceCapabilitiesKHR = 0; +PFN_vkGetPhysicalDeviceSurfaceFormatsKHR vkGetPhysicalDeviceSurfaceFormatsKHR = 0; +PFN_vkGetPhysicalDeviceSurfacePresentModesKHR vkGetPhysicalDeviceSurfacePresentModesKHR = 0; + +#if __ANDROID_API__ >= 26 +// VK_KHR_android_surface +PFN_vkCreateAndroidSurfaceKHR vkCreateAndroidSurfaceKHR = 0; +#endif // __ANDROID_API__ >= 26 + +// compile with old vulkan sdk +#if VK_HEADER_VERSION < 80 +#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR (VkStructureType)1000177000 +typedef struct VkPhysicalDevice8BitStorageFeaturesKHR { + VkStructureType sType; + void* pNext; + VkBool32 
storageBuffer8BitAccess; + VkBool32 uniformAndStorageBuffer8BitAccess; + VkBool32 storagePushConstant8; +} VkPhysicalDevice8BitStorageFeaturesKHR; +#endif // VK_HEADER_VERSION < 80 +#if VK_HEADER_VERSION < 95 +#define VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR (VkStructureType)1000082000 +typedef struct VkPhysicalDeviceFloat16Int8FeaturesKHR { + VkStructureType sType; + void* pNext; + VkBool32 shaderFloat16; + VkBool32 shaderInt8; +} VkPhysicalDeviceFloat16Int8FeaturesKHR; +#endif // VK_HEADER_VERSION < 95 + +static int init_instance_extension() +{ + if (support_VK_KHR_external_memory_capabilities) + { + vkGetPhysicalDeviceExternalBufferPropertiesKHR = (PFN_vkGetPhysicalDeviceExternalBufferPropertiesKHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceExternalBufferPropertiesKHR"); + } + + if (support_VK_KHR_get_physical_device_properties2) + { + vkGetPhysicalDeviceFeatures2KHR = (PFN_vkGetPhysicalDeviceFeatures2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceFeatures2KHR"); + vkGetPhysicalDeviceProperties2KHR = (PFN_vkGetPhysicalDeviceProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceProperties2KHR"); + vkGetPhysicalDeviceFormatProperties2KHR = (PFN_vkGetPhysicalDeviceFormatProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceFormatProperties2KHR"); + vkGetPhysicalDeviceImageFormatProperties2KHR = (PFN_vkGetPhysicalDeviceImageFormatProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceImageFormatProperties2KHR"); + vkGetPhysicalDeviceQueueFamilyProperties2KHR = (PFN_vkGetPhysicalDeviceQueueFamilyProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceQueueFamilyProperties2KHR"); + vkGetPhysicalDeviceMemoryProperties2KHR = (PFN_vkGetPhysicalDeviceMemoryProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceMemoryProperties2KHR"); + vkGetPhysicalDeviceSparseImageFormatProperties2KHR = (PFN_vkGetPhysicalDeviceSparseImageFormatProperties2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSparseImageFormatProperties2KHR"); + } + + if (support_VK_KHR_get_surface_capabilities2) + { + vkGetPhysicalDeviceSurfaceCapabilities2KHR = (PFN_vkGetPhysicalDeviceSurfaceCapabilities2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfaceCapabilities2KHR"); + vkGetPhysicalDeviceSurfaceFormats2KHR = (PFN_vkGetPhysicalDeviceSurfaceFormats2KHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfaceFormats2KHR"); + } + + if (support_VK_KHR_surface) + { + vkDestroySurfaceKHR = (PFN_vkDestroySurfaceKHR)vkGetInstanceProcAddr(g_instance, "vkDestroySurfaceKHR"); + vkGetPhysicalDeviceSurfaceSupportKHR = (PFN_vkGetPhysicalDeviceSurfaceSupportKHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfaceSupportKHR"); + vkGetPhysicalDeviceSurfaceCapabilitiesKHR = (PFN_vkGetPhysicalDeviceSurfaceCapabilitiesKHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfaceCapabilitiesKHR"); + vkGetPhysicalDeviceSurfaceFormatsKHR = (PFN_vkGetPhysicalDeviceSurfaceFormatsKHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfaceFormatsKHR"); + vkGetPhysicalDeviceSurfacePresentModesKHR = (PFN_vkGetPhysicalDeviceSurfacePresentModesKHR)vkGetInstanceProcAddr(g_instance, "vkGetPhysicalDeviceSurfacePresentModesKHR"); + } + +#if __ANDROID_API__ >= 26 + if (support_VK_KHR_android_surface) + { + vkCreateAndroidSurfaceKHR = (PFN_vkCreateAndroidSurfaceKHR)vkGetInstanceProcAddr(g_instance, "vkCreateAndroidSurfaceKHR"); + } +#endif // __ANDROID_API__ >= 26 + + return 0; +} + +#if 
ENABLE_VALIDATION_LAYER +static VkDebugUtilsMessengerEXT callback; + +static VKAPI_ATTR VkBool32 VKAPI_CALL debugCallback( + VkDebugUtilsMessageSeverityFlagBitsEXT /*messageSeverity*/, + VkDebugUtilsMessageTypeFlagsEXT /*messageType*/, + const VkDebugUtilsMessengerCallbackDataEXT* pCallbackData, + void* /*pUserData*/) +{ + TLOG_INFO("validation layer: %s\n", pCallbackData->pMessage); + + return VK_FALSE; +} + +VkResult CreateDebugUtilsMessengerEXT(VkInstance instance, const VkDebugUtilsMessengerCreateInfoEXT* pCreateInfo, const VkAllocationCallbacks* pAllocator, VkDebugUtilsMessengerEXT* pCallback) +{ + PFN_vkCreateDebugUtilsMessengerEXT func = (PFN_vkCreateDebugUtilsMessengerEXT)vkGetInstanceProcAddr(instance, "vkCreateDebugUtilsMessengerEXT"); + if (func) + return func(instance, pCreateInfo, pAllocator, pCallback); + + return VK_ERROR_EXTENSION_NOT_PRESENT; +} + +void DestroyDebugUtilsMessengerEXT(VkInstance instance, VkDebugUtilsMessengerEXT callback, const VkAllocationCallbacks* pAllocator) +{ + PFN_vkDestroyDebugUtilsMessengerEXT func = (PFN_vkDestroyDebugUtilsMessengerEXT)vkGetInstanceProcAddr(instance, "vkDestroyDebugUtilsMessengerEXT"); + if (func) + func(instance, callback, pAllocator); +} +#endif // ENABLE_VALIDATION_LAYER + +static uint32_t find_device_compute_queue(const std::vector& queueFamilyProperties) +{ + // first try, compute only queue + for (uint32_t i=0; i& queueFamilyProperties) +{ + // first try, graphics only queue + for (uint32_t i=0; i& queueFamilyProperties) +{ + // first try, transfer only queue + for (uint32_t i=0; i 0) + return 0; + + TLOG_INFO("no vulkan device\n"); + return -1; +} + +int create_gpu_instance() +{ + VkResult ret; + + std::vector enabledLayers; + +#if ENABLE_VALIDATION_LAYER + uint32_t instanceLayerPropertyCount; + ret = vkEnumerateInstanceLayerProperties(&instanceLayerPropertyCount, NULL); + if (ret != VK_SUCCESS) + { + TLOG_INFO("vkEnumerateInstanceLayerProperties failed %d\n", ret); + return -1; + } + + std::vector instanceLayerProperties(instanceLayerPropertyCount); + ret = vkEnumerateInstanceLayerProperties(&instanceLayerPropertyCount, instanceLayerProperties.data()); + if (ret != VK_SUCCESS) + { + TLOG_INFO("vkEnumerateInstanceLayerProperties failed %d\n", ret); + return -1; + } + + for (uint32_t i=0; i enabledExtensions; + + uint32_t instanceExtensionPropertyCount; + ret = vkEnumerateInstanceExtensionProperties(NULL, &instanceExtensionPropertyCount, NULL); + if (ret != VK_SUCCESS) + { + TLOG_INFO("vkEnumerateInstanceExtensionProperties failed %d\n", ret); + return -1; + } + + std::vector instanceExtensionProperties(instanceExtensionPropertyCount); + ret = vkEnumerateInstanceExtensionProperties(NULL, &instanceExtensionPropertyCount, instanceExtensionProperties.data()); + if (ret != VK_SUCCESS) + { + TLOG_INFO("vkEnumerateInstanceExtensionProperties failed %d\n", ret); + return -1; + } + + support_VK_KHR_get_physical_device_properties2 = 0; + support_VK_KHR_get_surface_capabilities2 = 0; + support_VK_KHR_surface = 0; + support_VK_EXT_debug_utils = 0; +#if __ANDROID_API__ >= 26 + support_VK_KHR_android_surface = 0; +#endif // __ANDROID_API__ >= 26 + for (uint32_t j=0; j= 26 + else if (strcmp(exp.extensionName, "VK_KHR_android_surface") == 0) + support_VK_KHR_android_surface = exp.specVersion; +#endif // __ANDROID_API__ >= 26 + } + + if (support_VK_KHR_external_memory_capabilities) + enabledExtensions.push_back("VK_KHR_external_memory_capabilities"); + if (support_VK_KHR_get_physical_device_properties2) + 
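+        // only extensions reported by vkEnumerateInstanceExtensionProperties above are
+        // requested; the collected names feed VkInstanceCreateInfo::ppEnabledExtensionNames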
enabledExtensions.push_back("VK_KHR_get_physical_device_properties2"); + if (support_VK_KHR_get_surface_capabilities2) + enabledExtensions.push_back("VK_KHR_get_surface_capabilities2"); + if (support_VK_KHR_surface) + enabledExtensions.push_back("VK_KHR_surface"); +#if ENABLE_VALIDATION_LAYER + if (support_VK_EXT_debug_utils) + enabledExtensions.push_back("VK_EXT_debug_utils"); +#endif // ENABLE_VALIDATION_LAYER +#if __ANDROID_API__ >= 26 + if (support_VK_KHR_android_surface) + enabledExtensions.push_back("VK_KHR_android_surface"); +#endif // __ANDROID_API__ >= 26 + + VkApplicationInfo applicationInfo; + applicationInfo.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO; + applicationInfo.pNext = 0; + applicationInfo.pApplicationName = "tengine"; + applicationInfo.applicationVersion = 0; + applicationInfo.pEngineName = "tengine"; + applicationInfo.engineVersion = 20200530; + applicationInfo.apiVersion = VK_MAKE_VERSION(1, 0, 0); + + VkInstanceCreateInfo instanceCreateInfo; + instanceCreateInfo.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO; + instanceCreateInfo.pNext = 0; + instanceCreateInfo.flags = 0; + instanceCreateInfo.pApplicationInfo = &applicationInfo; + instanceCreateInfo.enabledLayerCount = enabledLayers.size(); + instanceCreateInfo.ppEnabledLayerNames = enabledLayers.data(); + instanceCreateInfo.enabledExtensionCount = enabledExtensions.size(); + instanceCreateInfo.ppEnabledExtensionNames = enabledExtensions.data(); + + ret = vkCreateInstance(&instanceCreateInfo, 0, &g_instance); + if (ret != VK_SUCCESS) + { + TLOG_INFO("vkCreateInstance failed %d\n", ret); + return -1; + } + +#if ENABLE_VALIDATION_LAYER + if (support_VK_EXT_debug_utils) + { + VkDebugUtilsMessengerCreateInfoEXT createInfo = {}; + createInfo.sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT; + createInfo.messageSeverity = VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT; + createInfo.messageType = VK_DEBUG_UTILS_MESSAGE_TYPE_GENERAL_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT; + createInfo.pfnUserCallback = debugCallback; + createInfo.pUserData = 0; + ret = CreateDebugUtilsMessengerEXT(g_instance, &createInfo, NULL, &callback); + if (ret != VK_SUCCESS) + { + TLOG_INFO("CreateDebugUtilsMessengerEXT failed %d\n", ret); + return -1; + } + } +#endif // ENABLE_VALIDATION_LAYER + + init_instance_extension(); + + uint32_t physicalDeviceCount = 0; + ret = vkEnumeratePhysicalDevices(g_instance, &physicalDeviceCount, 0); + if (ret != VK_SUCCESS) + { + TLOG_INFO("01vkEnumeratePhysicalDevices failed %d\n", ret); + return -1; + } + + if (physicalDeviceCount > MAX_GPU_COUNT) + physicalDeviceCount = MAX_GPU_COUNT; + + std::vector physicalDevices(physicalDeviceCount); + + ret = vkEnumeratePhysicalDevices(g_instance, &physicalDeviceCount, physicalDevices.data()); + if (ret != VK_SUCCESS) + { + TLOG_INFO("vkEnumeratePhysicalDevices failed %d\n", ret); + return -1; + } + + // find proper device and queue + int gpu_info_index = 0; + for (uint32_t i=0; i queueFamilyProperties(queueFamilyPropertiesCount); + vkGetPhysicalDeviceQueueFamilyProperties(physicalDevice, &queueFamilyPropertiesCount, queueFamilyProperties.data()); + + gpu_info.compute_queue_family_index = find_device_compute_queue(queueFamilyProperties); + gpu_info.graphics_queue_family_index = find_device_graphics_queue(queueFamilyProperties); + gpu_info.transfer_queue_family_index = 
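+        // the three selected queue families may coincide on some GPUs; GPUDevice later
+        // creates one VkDeviceQueueCreateInfo per distinct family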
find_device_transfer_queue(queueFamilyProperties); + + gpu_info.compute_queue_count = queueFamilyProperties[gpu_info.compute_queue_family_index].queueCount; + gpu_info.graphics_queue_count = queueFamilyProperties[gpu_info.graphics_queue_family_index].queueCount; + gpu_info.transfer_queue_count = queueFamilyProperties[gpu_info.transfer_queue_family_index].queueCount; + + // cache memory properties + vkGetPhysicalDeviceMemoryProperties(physicalDevice, &gpu_info.physicalDeviceMemoryProperties); + + // get device extension + uint32_t deviceExtensionPropertyCount = 0; + ret = vkEnumerateDeviceExtensionProperties(physicalDevice, NULL, &deviceExtensionPropertyCount, NULL); + if (ret != VK_SUCCESS) + { + TLOG_INFO("vkEnumerateDeviceExtensionProperties failed %d\n", ret); + return -1; + } + + std::vector deviceExtensionProperties(deviceExtensionPropertyCount); + ret = vkEnumerateDeviceExtensionProperties(physicalDevice, NULL, &deviceExtensionPropertyCount, deviceExtensionProperties.data()); + if (ret != VK_SUCCESS) + { + TLOG_INFO("vkEnumerateDeviceExtensionProperties failed %d\n", ret); + return -1; + } + + // extension capability + gpu_info.support_VK_KHR_8bit_storage = 0; + gpu_info.support_VK_KHR_16bit_storage = 0; + gpu_info.support_VK_KHR_bind_memory2 = 0; + gpu_info.support_VK_KHR_dedicated_allocation = 0; + gpu_info.support_VK_KHR_descriptor_update_template = 0; + gpu_info.support_VK_KHR_external_memory = 0; + gpu_info.support_VK_KHR_get_memory_requirements2 = 0; + gpu_info.support_VK_KHR_maintenance1 = 0; + gpu_info.support_VK_KHR_push_descriptor = 0; + gpu_info.support_VK_KHR_sampler_ycbcr_conversion = 0; + gpu_info.support_VK_KHR_shader_float16_int8 = 0; + gpu_info.support_VK_KHR_shader_float_controls = 0; + gpu_info.support_VK_KHR_storage_buffer_storage_class = 0; + gpu_info.support_VK_KHR_swapchain = 0; + gpu_info.support_VK_EXT_queue_family_foreign = 0; +#if __ANDROID_API__ >= 26 + gpu_info.support_VK_ANDROID_external_memory_android_hardware_buffer = 0; +#endif // __ANDROID_API__ >= 26 + for (uint32_t j=0; j= 26 + else if (strcmp(exp.extensionName, "VK_ANDROID_external_memory_android_hardware_buffer") == 0) + gpu_info.support_VK_ANDROID_external_memory_android_hardware_buffer = exp.specVersion; +#endif // __ANDROID_API__ >= 26 + } + + // check features + gpu_info.support_fp16_packed = true; + gpu_info.support_fp16_storage = false; + gpu_info.support_fp16_arithmetic = false; + gpu_info.support_int8_storage = false; + gpu_info.support_int8_arithmetic = false; + gpu_info.support_ycbcr_conversion = false; + if (support_VK_KHR_get_physical_device_properties2) + { + void* queryExtensionFeatures = 0; + + // query int8 storage + VkPhysicalDevice8BitStorageFeaturesKHR query8BitStorageFeatures; + query8BitStorageFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR; + query8BitStorageFeatures.pNext = 0; + if (gpu_info.support_VK_KHR_8bit_storage) + { + query8BitStorageFeatures.pNext = queryExtensionFeatures; + queryExtensionFeatures = &query8BitStorageFeatures; + } + + // query fp16/int16 storage + VkPhysicalDevice16BitStorageFeaturesKHR query16BitStorageFeatures; + query16BitStorageFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES_KHR; + query16BitStorageFeatures.pNext = 0; + if (gpu_info.support_VK_KHR_16bit_storage) + { + query16BitStorageFeatures.pNext = queryExtensionFeatures; + queryExtensionFeatures = &query16BitStorageFeatures; + } + + // query fp16/int8 arithmetic + VkPhysicalDeviceFloat16Int8FeaturesKHR queryFloat16Int8Features; + 
queryFloat16Int8Features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR; + queryFloat16Int8Features.pNext = 0; + if (gpu_info.support_VK_KHR_shader_float16_int8) + { + queryFloat16Int8Features.pNext = queryExtensionFeatures; + queryExtensionFeatures = &queryFloat16Int8Features; + } + + // query ycbcr_conversion + VkPhysicalDeviceSamplerYcbcrConversionFeaturesKHR querySamplerYcbcrConversionFeatures; + querySamplerYcbcrConversionFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLER_YCBCR_CONVERSION_FEATURES_KHR; + querySamplerYcbcrConversionFeatures.pNext = 0; + if (gpu_info.support_VK_KHR_sampler_ycbcr_conversion) + { + querySamplerYcbcrConversionFeatures.pNext = queryExtensionFeatures; + queryExtensionFeatures = &querySamplerYcbcrConversionFeatures; + } + + VkPhysicalDeviceFeatures2KHR queryFeatures; + queryFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2_KHR, + queryFeatures.pNext = queryExtensionFeatures; + + vkGetPhysicalDeviceFeatures2KHR(physicalDevice, &queryFeatures); + + if (gpu_info.support_VK_KHR_8bit_storage) + { + gpu_info.support_int8_storage = query8BitStorageFeatures.storageBuffer8BitAccess && query8BitStorageFeatures.uniformAndStorageBuffer8BitAccess; + } + if (gpu_info.support_VK_KHR_16bit_storage) + { + gpu_info.support_fp16_storage = query16BitStorageFeatures.storageBuffer16BitAccess && query16BitStorageFeatures.uniformAndStorageBuffer16BitAccess; + } + if (gpu_info.support_VK_KHR_shader_float16_int8) + { + gpu_info.support_fp16_arithmetic = queryFloat16Int8Features.shaderFloat16; + gpu_info.support_int8_arithmetic = queryFloat16Int8Features.shaderInt8; + } + if (gpu_info.support_VK_KHR_sampler_ycbcr_conversion) + { + gpu_info.support_ycbcr_conversion = querySamplerYcbcrConversionFeatures.samplerYcbcrConversion; + } + } + else + { +// // TODO +// VkPhysicalDeviceFeatures features; +// vkGetPhysicalDeviceFeatures(physicalDevice, &features); + } + + if (physicalDeviceProperties.vendorID == 0x13b5) + { + // the 16bit_storage implementation of arm mali driver is buggy :[ + gpu_info.support_fp16_storage = false; + } + + if (physicalDeviceProperties.vendorID == 0x10002 && physicalDeviceProperties.deviceID == 0x70006214 && physicalDeviceProperties.apiVersion == VK_MAKE_VERSION(1, 1, 82)) + { + // the 16bit_storage implementation of vivante gc1700 driver is buggy :[ + gpu_info.support_fp16_storage = false; + } + + if (gpu_info.bug_implicit_fp16_arithmetic) + { + // force capability on as long as the driver accept spirv with fp16 arithmetic :D + gpu_info.support_fp16_arithmetic = true; + } + + TLOG_INFO("[%u %s] queueC=%u[%u] queueG=%u[%u] queueT=%u[%u]\n", i, physicalDeviceProperties.deviceName, + gpu_info.compute_queue_family_index, gpu_info.compute_queue_count, + gpu_info.graphics_queue_family_index, gpu_info.graphics_queue_count, + gpu_info.transfer_queue_family_index, gpu_info.transfer_queue_count); + + TLOG_INFO("[%u %s] buglssc=%d bugihfa=%d\n", i, physicalDeviceProperties.deviceName, + gpu_info.bug_local_size_spec_const, gpu_info.bug_implicit_fp16_arithmetic); + + TLOG_INFO("[%u %s] fp16p=%d fp16s=%d fp16a=%d int8s=%d int8a=%d\n", i, physicalDeviceProperties.deviceName, + gpu_info.support_fp16_packed, gpu_info.support_fp16_storage, gpu_info.support_fp16_arithmetic, + gpu_info.support_int8_storage, gpu_info.support_int8_arithmetic); + + gpu_info_index++; + } + + g_gpu_count = gpu_info_index; + + // the default gpu device + g_default_gpu_index = find_default_vulkan_device_index(); + + // resolve shader info + // TLOG_INFO("run create 
gpu instance resolve shader info, layer_shader_registry_entry_count:%d\n", layer_shader_registry_entry_count); + for (int i=0; i enabledExtensions; + if (info.support_VK_KHR_8bit_storage) + enabledExtensions.push_back("VK_KHR_8bit_storage"); + if (info.support_VK_KHR_16bit_storage) + enabledExtensions.push_back("VK_KHR_16bit_storage"); + if (info.support_VK_KHR_bind_memory2) + enabledExtensions.push_back("VK_KHR_bind_memory2"); + if (info.support_VK_KHR_dedicated_allocation) + enabledExtensions.push_back("VK_KHR_dedicated_allocation"); + if (info.support_VK_KHR_descriptor_update_template) + enabledExtensions.push_back("VK_KHR_descriptor_update_template"); + if (info.support_VK_KHR_external_memory) + enabledExtensions.push_back("VK_KHR_external_memory"); + if (info.support_VK_KHR_get_memory_requirements2) + enabledExtensions.push_back("VK_KHR_get_memory_requirements2"); + if (info.support_VK_KHR_maintenance1) + enabledExtensions.push_back("VK_KHR_maintenance1"); + if (info.support_VK_KHR_push_descriptor) + enabledExtensions.push_back("VK_KHR_push_descriptor"); + if (info.support_VK_KHR_sampler_ycbcr_conversion) + enabledExtensions.push_back("VK_KHR_sampler_ycbcr_conversion"); + if (info.support_VK_KHR_shader_float16_int8) + enabledExtensions.push_back("VK_KHR_shader_float16_int8"); + if (info.support_VK_KHR_shader_float_controls) + enabledExtensions.push_back("VK_KHR_shader_float_controls"); + if (info.support_VK_KHR_storage_buffer_storage_class) + enabledExtensions.push_back("VK_KHR_storage_buffer_storage_class"); + if (info.support_VK_KHR_swapchain) + enabledExtensions.push_back("VK_KHR_swapchain"); + if (info.support_VK_EXT_queue_family_foreign) + enabledExtensions.push_back("VK_EXT_queue_family_foreign"); +#if __ANDROID_API__ >= 26 + if (info.support_VK_ANDROID_external_memory_android_hardware_buffer) + enabledExtensions.push_back("VK_ANDROID_external_memory_android_hardware_buffer"); +#endif // __ANDROID_API__ >= 26 + + void* enabledExtensionFeatures = 0; + + // enable int8 storage + VkPhysicalDevice8BitStorageFeaturesKHR enabled8BitStorageFeatures; + enabled8BitStorageFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_8BIT_STORAGE_FEATURES_KHR; + enabled8BitStorageFeatures.pNext = 0; + enabled8BitStorageFeatures.storageBuffer8BitAccess = info.support_int8_storage; + enabled8BitStorageFeatures.uniformAndStorageBuffer8BitAccess = info.support_int8_storage; + enabled8BitStorageFeatures.storagePushConstant8 = VK_FALSE; + if (support_VK_KHR_get_physical_device_properties2 && info.support_VK_KHR_8bit_storage) + { + enabled8BitStorageFeatures.pNext = enabledExtensionFeatures; + enabledExtensionFeatures = &enabled8BitStorageFeatures; + } + + // enable fp16/int16 storage + VkPhysicalDevice16BitStorageFeaturesKHR enabled16BitStorageFeatures; + enabled16BitStorageFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_16BIT_STORAGE_FEATURES_KHR; + enabled16BitStorageFeatures.pNext = 0; + enabled16BitStorageFeatures.storageBuffer16BitAccess = info.support_fp16_storage; + enabled16BitStorageFeatures.uniformAndStorageBuffer16BitAccess = info.support_fp16_storage; + enabled16BitStorageFeatures.storagePushConstant16 = VK_FALSE; + enabled16BitStorageFeatures.storageInputOutput16 = VK_FALSE; + if (support_VK_KHR_get_physical_device_properties2 && info.support_VK_KHR_16bit_storage) + { + enabled16BitStorageFeatures.pNext = enabledExtensionFeatures; + enabledExtensionFeatures = &enabled16BitStorageFeatures; + } + + // enable fp16/int8 arithmetic + VkPhysicalDeviceFloat16Int8FeaturesKHR 
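+    // shaderFloat16/shaderInt8 are requested only if the capability query in
+    // create_gpu_instance() reported them; the struct is chained into VkDeviceCreateInfo::pNext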
enabledFloat16Int8Features; + enabledFloat16Int8Features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR; + enabledFloat16Int8Features.pNext = 0; + enabledFloat16Int8Features.shaderFloat16 = info.support_fp16_arithmetic; + enabledFloat16Int8Features.shaderInt8 = info.support_int8_arithmetic; + if (support_VK_KHR_get_physical_device_properties2 && info.support_VK_KHR_shader_float16_int8) + { + enabledFloat16Int8Features.pNext = enabledExtensionFeatures; + enabledExtensionFeatures = &enabledFloat16Int8Features; + } + + // enable ycbcr conversion + VkPhysicalDeviceSamplerYcbcrConversionFeaturesKHR querySamplerYcbcrConversionFeatures; + querySamplerYcbcrConversionFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLER_YCBCR_CONVERSION_FEATURES_KHR; + querySamplerYcbcrConversionFeatures.pNext = 0; + querySamplerYcbcrConversionFeatures.samplerYcbcrConversion = info.support_ycbcr_conversion; + if (support_VK_KHR_get_physical_device_properties2 && info.support_ycbcr_conversion) + { + querySamplerYcbcrConversionFeatures.pNext = enabledExtensionFeatures; + enabledExtensionFeatures = &querySamplerYcbcrConversionFeatures; + } + std::vector compute_queue_priorities(info.compute_queue_count, 1.f);// 0.f ~ 1.f + std::vector graphics_queue_priorities(info.graphics_queue_count, 1.f);// 0.f ~ 1.f + std::vector transfer_queue_priorities(info.transfer_queue_count, 1.f);// 0.f ~ 1.f + + VkDeviceQueueCreateInfo deviceQueueCreateInfos[3]; + VkDeviceQueueCreateInfo deviceComputeQueueCreateInfo; + deviceComputeQueueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO; + deviceComputeQueueCreateInfo.pNext = 0; + deviceComputeQueueCreateInfo.flags = 0; + deviceComputeQueueCreateInfo.queueFamilyIndex = info.compute_queue_family_index; + deviceComputeQueueCreateInfo.queueCount = info.compute_queue_count; + deviceComputeQueueCreateInfo.pQueuePriorities = compute_queue_priorities.data(); + + VkDeviceQueueCreateInfo deviceGraphicsQueueCreateInfo; + deviceGraphicsQueueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO; + deviceGraphicsQueueCreateInfo.pNext = 0; + deviceGraphicsQueueCreateInfo.flags = 0; + deviceGraphicsQueueCreateInfo.queueFamilyIndex = info.graphics_queue_family_index; + deviceGraphicsQueueCreateInfo.queueCount = info.graphics_queue_count; + deviceGraphicsQueueCreateInfo.pQueuePriorities = graphics_queue_priorities.data(); + + VkDeviceQueueCreateInfo deviceTransferQueueCreateInfo; + deviceTransferQueueCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO; + deviceTransferQueueCreateInfo.pNext = 0; + deviceTransferQueueCreateInfo.flags = 0; + deviceTransferQueueCreateInfo.queueFamilyIndex = info.transfer_queue_family_index; + deviceTransferQueueCreateInfo.queueCount = info.transfer_queue_count; + deviceTransferQueueCreateInfo.pQueuePriorities = transfer_queue_priorities.data(); + + VkDeviceCreateInfo deviceCreateInfo; + deviceCreateInfo.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO; + deviceCreateInfo.pNext = enabledExtensionFeatures; + deviceCreateInfo.flags = 0; + if (info.compute_queue_family_index == info.graphics_queue_family_index && info.compute_queue_family_index == info.transfer_queue_family_index) + { + deviceQueueCreateInfos[0] = deviceComputeQueueCreateInfo; + deviceCreateInfo.queueCreateInfoCount = 1; + } + else if (info.compute_queue_family_index == info.graphics_queue_family_index && info.compute_queue_family_index != info.transfer_queue_family_index) + { + deviceQueueCreateInfos[0] = deviceComputeQueueCreateInfo; + deviceQueueCreateInfos[1] 
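+        // compute and graphics share a queue family in this branch, so only the transfer
+        // queue needs a second VkDeviceQueueCreateInfo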
= deviceTransferQueueCreateInfo; + deviceCreateInfo.queueCreateInfoCount = 2; + } + else if (info.compute_queue_family_index != info.graphics_queue_family_index && info.graphics_queue_family_index == info.transfer_queue_family_index) + { + deviceQueueCreateInfos[0] = deviceComputeQueueCreateInfo; + deviceQueueCreateInfos[1] = deviceGraphicsQueueCreateInfo; + deviceCreateInfo.queueCreateInfoCount = 2; + } + else // if (info.compute_queue_family_index != info.graphics_queue_family_index && info.graphics_queue_family_index != info.transfer_queue_family_index) + { + deviceQueueCreateInfos[0] = deviceComputeQueueCreateInfo; + deviceQueueCreateInfos[1] = deviceGraphicsQueueCreateInfo; + deviceQueueCreateInfos[2] = deviceTransferQueueCreateInfo; + deviceCreateInfo.queueCreateInfoCount = 3; + } + deviceCreateInfo.pQueueCreateInfos = deviceQueueCreateInfos; + deviceCreateInfo.enabledLayerCount = 0; + deviceCreateInfo.ppEnabledLayerNames = 0; + deviceCreateInfo.enabledExtensionCount = enabledExtensions.size(); + deviceCreateInfo.ppEnabledExtensionNames = enabledExtensions.data(); + deviceCreateInfo.pEnabledFeatures = 0;// VkPhysicalDeviceFeatures pointer + + VkResult ret = vkCreateDevice(info.physical_device, &deviceCreateInfo, 0, &device); + if (ret != VK_SUCCESS) + { + TLOG_INFO("vkCreateDevice failed %d\n", ret); + } + + init_device_extension(); + + create_shader_module(); + + compute_queues.resize(info.compute_queue_count); + blob_allocators.resize(info.compute_queue_count); + staging_allocators.resize(info.compute_queue_count); + for (uint32_t i = 0; i < info.compute_queue_count; i++) + { + vkGetDeviceQueue(device, info.compute_queue_family_index, i, &compute_queues[i]); + + blob_allocators[i] = new VkBlobAllocator(this); + staging_allocators[i] = new VkStagingAllocator(this); + } + if (info.compute_queue_family_index != info.graphics_queue_family_index) + { + graphics_queues.resize(info.graphics_queue_count); + for (uint32_t i = 0; i < info.graphics_queue_count; i++) + { + vkGetDeviceQueue(device, info.graphics_queue_family_index, i, &graphics_queues[i]); + } + } + if (info.compute_queue_family_index != info.transfer_queue_family_index && info.graphics_queue_family_index != info.transfer_queue_family_index) + { + transfer_queues.resize(info.transfer_queue_count); + for (uint32_t i = 0; i < info.transfer_queue_count; i++) + { + vkGetDeviceQueue(device, info.transfer_queue_family_index, i, &transfer_queues[i]); + } + } + + create_dummy_buffer_image(); + + create_utility_operator(); +} + +GPUDevice::~GPUDevice() +{ + destroy_utility_operator(); + + destroy_dummy_buffer_image(); + + for (uint32_t i = 0; i < info.compute_queue_count; i++) + { + delete blob_allocators[i]; + delete staging_allocators[i]; + } + blob_allocators.clear(); + staging_allocators.clear(); + + destroy_shader_module(); + + vkDestroyDevice(device, 0); +} + +VkShaderModule GPUDevice::get_shader_module(int shader_type_index) const +{ + if (shader_type_index < 0 || shader_type_index >= layer_shader_registry_entry_count) + { + TLOG_INFO("no such shader module %d\n", shader_type_index); + return 0; + } + + return shader_modules[shader_type_index]; +} + +VkShaderModule GPUDevice::create_shader_module(int shader_type_index, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z) const +{ + if (shader_type_index < 0 || shader_type_index >= layer_shader_registry_entry_count) + { + TLOG_INFO("no such shader module %d\n", shader_type_index); + return 0; + } + + const uint32_t* spv_data = 
layer_shader_registry[shader_type_index].spv_data; + size_t spv_data_size = layer_shader_registry[shader_type_index].spv_data_size; + + return compile_shader_module(spv_data, spv_data_size, local_size_x, local_size_y, local_size_z); +} + +VkShaderModule GPUDevice::compile_shader_module(const uint32_t* spv_data, size_t spv_data_size) const +{ + VkShaderModuleCreateInfo shaderModuleCreateInfo; + shaderModuleCreateInfo.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO; + shaderModuleCreateInfo.pNext = 0; + shaderModuleCreateInfo.flags = 0; + shaderModuleCreateInfo.codeSize = spv_data_size; + shaderModuleCreateInfo.pCode = spv_data; + + VkShaderModule shader_module; + VkResult ret = vkCreateShaderModule(device, &shaderModuleCreateInfo, 0, &shader_module); + if (ret != VK_SUCCESS) + { + TLOG_INFO("vkCreateShaderModule failed %d\n", ret); + return 0; + } + + return shader_module; +} + +// TODO +static void inject_local_size_xyz(const uint32_t* code, size_t size, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z, uint32_t* dstcode, size_t* dstsize) +{ + uint32_t local_size_x_id = -1; + uint32_t local_size_y_id = -1; + uint32_t local_size_z_id = -1; + uint32_t gl_WorkGroupSize_id = -1; + + const uint32_t* p = code; + uint32_t* dp = dstcode; + + // skip magic version generator bound schema + memcpy(dp, p, 5 * sizeof(uint32_t)); + p += 5; + dp += 5; + + // foreach op + while ((const unsigned char*)p < (const unsigned char*)code + size) + { + uint32_t opcode = p[0]; + + uint16_t wordcount = opcode >> 16; + uint16_t op = opcode & 0xffff; + if (op == 16) // OpExecutionMode + { + uint32_t mode = p[2]; + if (mode == 17) // LocalSize + { + memcpy(dp, p, wordcount * sizeof(uint32_t)); + + // set local_size_xyz + dp[3] = local_size_x; + dp[4] = local_size_y; + dp[5] = local_size_z; + + p += wordcount; + dp += wordcount; + continue; + } + } + else if (op == 50) // OpSpecConstant + { + uint32_t id = p[2]; + if (id == local_size_x_id || id == local_size_y_id || id == local_size_z_id) + { + p += wordcount; + continue; + } + } + else if (op == 51) // OpSpecConstantComposite + { + uint32_t id = p[2]; + if (id == gl_WorkGroupSize_id) + { + if (wordcount == 6 && (p[3] == local_size_x_id || p[4] == local_size_y_id || p[5] == local_size_z_id)) + { + p += wordcount; + continue; + } + } + } + else if (op == 71) // OpDecorate + { + uint32_t id = p[1]; + uint32_t decoration = p[2]; + if (decoration == 1) // SpecId + { + uint32_t specid = p[3]; + if (specid == 233) local_size_x_id = id; + if (specid == 234) local_size_y_id = id; + if (specid == 235) local_size_z_id = id; + if (specid == 233 || specid == 234 || specid == 235) + { + p += wordcount; + continue; + } + } + else if (decoration == 11) // BuiltIn + { + uint32_t builtin = p[3]; + if (builtin == 25) // WorkgroupSize + { + gl_WorkGroupSize_id = id; + p += wordcount; + continue; + } + } + } + + memcpy(dp, p, wordcount * sizeof(uint32_t)); + p += wordcount; + dp += wordcount; + } + *dstsize = (unsigned char*)dp - (unsigned char*)dstcode; +} + +VkShaderModule GPUDevice::compile_shader_module(const uint32_t* spv_data, size_t spv_data_size, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z) const +{ + uint32_t* spv_data_modified = (uint32_t*)malloc(spv_data_size); + size_t spv_data_size_modified = spv_data_size; + inject_local_size_xyz(spv_data, spv_data_size, local_size_x, local_size_y, local_size_z, spv_data_modified, &spv_data_size_modified); + + VkShaderModule shader_module = compile_shader_module(spv_data_modified, 
spv_data_size_modified); + + free(spv_data_modified); + + return shader_module; +} + + + + +uint32_t GPUDevice::find_memory_index(uint32_t memory_type_bits, VkFlags required, VkFlags preferred, VkFlags preferred_not) const +{ + // first try, find required and with preferred and without preferred_not + for (uint32_t i=0; i& queues = queue_family_index == info.compute_queue_family_index ? compute_queues + : queue_family_index == info.graphics_queue_family_index ? graphics_queues : transfer_queues; + for (int i=0; i<(int)queues.size(); i++) + { + VkQueue queue = queues[i]; + if (queue) + { + queues[i] = 0; + return queue; + } + } + + // out of hardware queue + return 0; +} + +// TODO +void GPUDevice::reclaim_queue(uint32_t queue_family_index, VkQueue queue) const +{ + if (queue_family_index != info.compute_queue_family_index + && queue_family_index != info.graphics_queue_family_index + && queue_family_index != info.transfer_queue_family_index) + { + TLOG_INFO("invalid queue_family_index %u", queue_family_index); + return; + } + + // TODO + MutexLockGuard lock(queue_lock); + + std::vector& queues = queue_family_index == info.compute_queue_family_index ? compute_queues + : queue_family_index == info.graphics_queue_family_index ? graphics_queues : transfer_queues; + for (int i=0; i<(int)queues.size(); i++) + { + if (!queues[i]) + { + queues[i] = queue; + return; + } + } + + TLOG_INFO("FATAL ERROR! reclaim_queue get wild queue %u %p", queue_family_index, queue); +} + +VkAllocator* GPUDevice::acquire_blob_allocator() const +{ + MutexLockGuard lock(blob_allocator_lock); + + for (int i=0; i<(int)blob_allocators.size(); i++) + { + VkAllocator* allocator = blob_allocators[i]; + if (allocator) + { + blob_allocators[i] = 0; + return allocator; + } + } + + // out of blob allocator + return 0; +} + +void GPUDevice::reclaim_blob_allocator(VkAllocator* allocator) const +{ + MutexLockGuard lock(blob_allocator_lock); + + for (int i=0; i<(int)blob_allocators.size(); i++) + { + if (!blob_allocators[i]) + { + blob_allocators[i] = allocator; + return; + } + } + + TLOG_INFO("FATAL ERROR! reclaim_blob_allocator get wild allocator %p", allocator); +} + + +VkAllocator* GPUDevice::acquire_staging_allocator() const +{ + MutexLockGuard lock(staging_allocator_lock); + + for (int i=0; i<(int)staging_allocators.size(); i++) + { + VkAllocator* allocator = staging_allocators[i]; + if (allocator) + { + staging_allocators[i] = 0; + return allocator; + } + } + + // out of staging allocator + return 0; +} + + +void GPUDevice::reclaim_staging_allocator(VkAllocator* allocator) const +{ + MutexLockGuard lock(staging_allocator_lock); + + for (int i=0; i<(int)staging_allocators.size(); i++) + { + if (!staging_allocators[i]) + { + staging_allocators[i] = allocator; + return; + } + } + + TLOG_INFO("FATAL ERROR! 
reclaim_staging_allocator get wild allocator %p", allocator); +} + +int GPUDevice::create_shader_module() +{ + if (info.bug_local_size_spec_const) + { + // do not cache shader module + return 0; + } + + shader_modules.resize(layer_shader_registry_entry_count, VK_NULL_HANDLE); + for (int i=0; i= 26 + if (info.support_VK_ANDROID_external_memory_android_hardware_buffer) + { + vkGetAndroidHardwareBufferPropertiesANDROID = (PFN_vkGetAndroidHardwareBufferPropertiesANDROID)vkGetDeviceProcAddr(device, "vkGetAndroidHardwareBufferPropertiesANDROID"); + vkGetMemoryAndroidHardwareBufferANDROID = (PFN_vkGetMemoryAndroidHardwareBufferANDROID)vkGetDeviceProcAddr(device, "vkGetMemoryAndroidHardwareBufferANDROID"); + } +#endif // __ANDROID_API__ >= 26 + + return 0; +} + +GPUDevice* get_gpu_device(int device_index) +{ + if (device_index < 0 || device_index >= g_gpu_count) + return 0; + + // MutexLockGuard lock(g_default_vkdev_lock); + + if (!g_default_vkdev[device_index]) + g_default_vkdev[device_index] = new GPUDevice(device_index); + + return g_default_vkdev[device_index]; +} + +const ShaderInfo& get_shader_info(int shader_type_index) +{ + if (shader_type_index < 0 || shader_type_index >= layer_shader_registry_entry_count) + { + TLOG_INFO("no such shader module %d\n", shader_type_index); + return layer_shader_infos[0]; + } + + return layer_shader_infos[shader_type_index]; +} + +// TODO +int resolve_shader_info(const uint32_t* spv_data, size_t spv_data_size, ShaderInfo& shader_info) +{ + shader_info.specialization_count = 0; + shader_info.binding_count = 0; + shader_info.push_constant_count = 0; + + uint32_t parameter_id = -233; + + int specialization_count = 0; + int binding_count = 0; + int push_constant_count = 0; + + // id -> binding_type + std::vector id_types; + + // binding_id -> binding_type + std::vector binding_types; + + const uint32_t* p = spv_data; + + int bound = p[3]; + + id_types.resize(bound); + + // skip magic version generator bound schema + p += 5; + + // foreach op + while ((const unsigned char*)p < (const unsigned char*)spv_data + spv_data_size) + { + uint32_t opcode = p[0]; + + uint16_t wordcount = opcode >> 16; + uint16_t op = opcode & 0xffff; + + if (op == 5) // OpName + { + uint32_t id = p[1]; + const char* name = (const char*)&p[2]; + if (strcmp(name, "parameter") == 0) + { + parameter_id = id; + } + } + else if (op == 6) // OpMemberName + { + uint32_t id = p[1]; + if (id == parameter_id) + { + push_constant_count++; + } + } + else if (op == 25) // OpTypeImage + { + uint32_t id = p[1]; + id_types[id] = 2; + } + else if (op == 27) // OpTypeSampledImage + { + uint32_t id = p[1]; + id_types[id] = 3; + } + else if (op == 32) // OpTypePointer + { + uint32_t id = p[1]; + uint32_t storage_class = p[2]; + uint32_t type = p[3]; + if (storage_class == 0) // UniformConstant + { + id_types[id] = id_types[type]; + } + if (storage_class == 2) // Uniform + { + id_types[id] = id_types[type]; + } + } + else if (op == 59) // OpVariable + { + uint32_t id = p[1]; + uint32_t var_id = p[2]; + uint32_t storage_class = p[3]; + if (storage_class == 0) // UniformConstant + { + id_types[var_id] = id_types[id]; + } + if (storage_class == 2) // Uniform + { + id_types[var_id] = id_types[id]; + } + } + else if (op == 71) // OpDecorate + { + uint32_t id = p[1]; + uint32_t decoration = p[2]; + uint32_t binding_id = p[3]; + if (decoration == 1) // SpecId + { + specialization_count++; + } + if (decoration == 3) // BufferBlock + { + id_types[id] = 1; + } + else if (decoration == 33) // Binding + { + binding_count = 
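+            // Binding decorations may appear in any order, so grow the table to the
+            // largest binding id seen so far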
std::max(binding_count, (int)binding_id + 1); + + binding_types.resize(binding_count); + binding_types[binding_id] = id; + } + } + + p += wordcount; + } + + if (binding_count > 16) + { + TLOG_INFO("too many binding %d", binding_count); + return -1; + } + + shader_info.specialization_count = specialization_count; + shader_info.binding_count = binding_count; + shader_info.push_constant_count = push_constant_count; + + // resolve binding_types + for (int i=0; iaccess_flags; + barriers[0].dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT; + barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].buffer = buffer.buffer(); + barriers[0].offset = buffer.buffer_offset(); + barriers[0].size = buffer.buffer_capacity(); + + VkPipelineStageFlags src_stage = buffer.data->stage_flags; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + + if (vkdev->info.support_VK_KHR_push_descriptor) + { + vkCmdPipelineBarrier(compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 1, barriers, 0, 0); + delete[] barriers; + } + else + { + record r; + r.type = record::TYPE_buffer_barrers; + r.command_buffer = compute_command_buffer; + r.buffer_barrers.src_stage = src_stage; + r.buffer_barrers.dst_stage = dst_stage; + r.buffer_barrers.barrier_count = 1; + r.buffer_barrers.barriers = barriers; + delayed_records.push_back(r); + } + + // mark device shader-readwrite @ compute + buffer.data->access_flags = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT; + buffer.data->stage_flags = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + } + + void record_dummy(const VkImageTensor& image) + { +// TLOG_INFO("xxx barrier image %p +%d ~%d %p", image.image(), image.data->bind_offset, image.data->bind_capacity, image.imageview()); + + // image layout transform any @ any to shader-write @ compute + VkImageMemoryBarrier* barriers = new VkImageMemoryBarrier[1]; + barriers[0].sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; + barriers[0].pNext = 0; + barriers[0].srcAccessMask = image.data->access_flags; + barriers[0].dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT; + barriers[0].oldLayout = image.data->image_layout; + barriers[0].newLayout = VK_IMAGE_LAYOUT_GENERAL; + barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; + barriers[0].image = image.image(); + barriers[0].subresourceRange.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT; + barriers[0].subresourceRange.baseMipLevel = 0; + barriers[0].subresourceRange.levelCount = 1; + barriers[0].subresourceRange.baseArrayLayer = 0; + barriers[0].subresourceRange.layerCount = 1; + + VkPipelineStageFlags src_stage = image.data->stage_flags; + VkPipelineStageFlags dst_stage = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + + if (vkdev->info.support_VK_KHR_push_descriptor) + { + vkCmdPipelineBarrier(compute_command_buffer, src_stage, dst_stage, 0, 0, 0, 0, 0, 1, barriers); + delete[] barriers; + } + else + { + record r; + r.type = record::TYPE_image_barrers; + r.command_buffer = compute_command_buffer; + r.image_barrers.src_stage = src_stage; + r.image_barrers.dst_stage = dst_stage; + r.image_barrers.barrier_count = 1; + r.image_barrers.barriers = barriers; + delayed_records.push_back(r); + } + + // mark image shader-write @ compute + image.data->access_flags = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT; + image.data->image_layout = VK_IMAGE_LAYOUT_GENERAL; + image.data->stage_flags = 
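+        // after this barrier the dummy image stays in VK_IMAGE_LAYOUT_GENERAL, marked
+        // shader read/write at the compute stage for later dispatches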
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; + } + +}; + +int GPUDevice::create_dummy_buffer_image() +{ + dummy_allocator = new VkDummyAllocator(this); + + dummy_buffer.create(1, 4u, dummy_allocator); + dummy_image.create(1, 4u, dummy_allocator); + + VkDummyCompute cmd(this); + + cmd.record_dummy(dummy_buffer); + cmd.record_dummy(dummy_image); + + cmd.submit_and_wait(); + + return 0; +} + +void GPUDevice::destroy_dummy_buffer_image() +{ + dummy_buffer.release(); + dummy_image.release(); + + delete dummy_allocator; +} + +int GPUDevice::create_utility_operator() +{ + TLOG_INFO("run create utility operator\n"); + memset(uop_packing, 0, sizeof(uop_packing)); + + Option opt; + + // from buffer | image + // to buffer | image + for (int i0=0; i0<2; i0++) + { + for (int i1=0; i1<2; i1++) + { + opt.use_image_storage = (i0 == 1 || i1 == 1); +// #if __APPLE__ +// if (opt.use_image_storage) +// continue; +// #endif + + // from fp32-b/i | fp16p-b/i | fp16s-b/i + // to fp32-b/i | fp16p-b/i | fp16s-b/i + for (int j0=0; j0<3; j0++) + { + for (int j1=0; j1<3; j1++) + { + opt.use_fp16_packed = (j0 == 1 || j1 == 1); + opt.use_fp16_storage = (j0 == 2 || j1 == 2); + + if (!info.support_fp16_packed && opt.use_fp16_packed) + continue; + + if (!info.support_fp16_storage && opt.use_fp16_storage) + continue; + + // from pack1 | pack4 | pack8 + for (int k=0; k<3; k++) + { + // enable pack8 for pack8to1/pack8to4 + opt.use_shader_pack8 = true; + + { // create packing layer + TEngine::Packing_vulkan* uop = new Packing_vulkan(); + uop->vkdev = this; + + uop->out_elempack = k == 0 ? 1 : k == 1 ? 4 : 8; + uop->cast_type_from = j0 + 1; + uop->cast_type_to = j1 + 1; + uop->storage_type_from = i0; + uop->storage_type_to = i1; + // TLOG_INFO("out_elempack:%d %d %d %d %d\n", uop->out_elempack, uop->cast_type_from, uop->cast_type_to, uop->storage_type_from, uop->storage_type_to); + + uop->create_pipeline(opt); + + uop_packing[i0][i1][j0][j1][k] = uop; + } + } + } + } + } + } + + return 0; +} + +void GPUDevice::destroy_utility_operator() +{ + Option opt; + + // from buffer | image + // to buffer | image + for (int i0=0; i0<2; i0++) + { + for (int i1=0; i1<2; i1++) + { + opt.use_image_storage = (i0 == 1 || i1 == 1); +#if __APPLE__ + if (opt.use_image_storage) + continue; +#endif + + // from fp32-b/i | fp16p-b/i | fp16s-b/i + // to fp32-b/i | fp16p-b/i | fp16s-b/i + for (int j0=0; j0<3; j0++) + { + for (int j1=0; j1<3; j1++) + { + opt.use_fp16_packed = (j0 == 1 || j1 == 1); + opt.use_fp16_storage = (j0 == 2 || j1 == 2); + + if (!info.support_fp16_packed && opt.use_fp16_packed) + continue; + + if (!info.support_fp16_storage && opt.use_fp16_storage) + continue; + + // from pack1 | pack4 | pack8 + for (int k=0; k<3; k++) + { + opt.use_shader_pack8 = (k == 2 || k == 2); + + TEngine::Layer* uop = uop_packing[i0][i1][j0][j1][k]; + + uop->destroy_pipeline(opt); + + delete uop; + + uop_packing[i0][i1][j0][j1][k] = 0; + } + } + } + } + } +} + +void GPUDevice::convert_packing(const VkTensor& src, VkTensor& dst, int dst_elempack, VkCompute& cmd, const Option& _opt) const +{ + // buffer2buffer uop is created with use_image_storage disabled + Option opt = _opt; + opt.use_image_storage = false; + + int cast_type_from_index = src.elemsize == src.elempack * 4u ? 0 : opt.use_fp16_storage ? 2 : 1; + int cast_type_to_index = opt.use_fp16_storage ? 2 : opt.use_fp16_packed && dst_elempack % 4 == 0 ? 1 : 0; + int packing_type_to_index = dst_elempack == 1 ? 0 : dst_elempack == 4 ? 
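+    // packing_type_to_index: 0 = pack1, 1 = pack4, 2 = pack8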
1 : 2; + + // TLOG_INFO("convert_packing b2b %d %d %d\n", cast_type_from_index, cast_type_to_index, packing_type_to_index); + + const TEngine::Packing_vulkan* uop = uop_packing[0][0][cast_type_from_index][cast_type_to_index][packing_type_to_index]; + + uop->record_pipeline(src, dst, cmd, opt); +} + +} // namespace TEngine diff --git a/source/device/vulkan/vulkan_gpu.hpp b/source/device/vulkan/vulkan_gpu.hpp new file mode 100644 index 000000000..b0a6466a1 --- /dev/null +++ b/source/device/vulkan/vulkan_gpu.hpp @@ -0,0 +1,349 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. 
You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#ifndef VULKAN_GPU_HPP +#define VULKAN_GPU_HPP + +#include + +#include "vulkan_platform.hpp" +#include +#include "vulkan_tensor.hpp" + +// #include "tengine_log.h" + +namespace TEngine { + +// instance +int create_gpu_instance(); +void destroy_gpu_instance(); + +// instance extension capability +extern int support_VK_KHR_external_memory_capabilities; +extern int support_VK_KHR_get_physical_device_properties2; +extern int support_VK_KHR_get_surface_capabilities2; +extern int support_VK_KHR_surface; +extern int support_VK_EXT_debug_utils; +#if __ANDROID_API__ >= 26 +extern int support_VK_KHR_android_surface; +#endif // __ANDROID_API__ >= 26 + +// VK_KHR_external_memory_capabilities +extern PFN_vkGetPhysicalDeviceExternalBufferPropertiesKHR vkGetPhysicalDeviceExternalBufferPropertiesKHR; + +// VK_KHR_get_physical_device_properties2 +extern PFN_vkGetPhysicalDeviceFeatures2KHR vkGetPhysicalDeviceFeatures2KHR; +extern PFN_vkGetPhysicalDeviceProperties2KHR vkGetPhysicalDeviceProperties2KHR; +extern PFN_vkGetPhysicalDeviceFormatProperties2KHR vkGetPhysicalDeviceFormatProperties2KHR; +extern PFN_vkGetPhysicalDeviceImageFormatProperties2KHR vkGetPhysicalDeviceImageFormatProperties2KHR; +extern PFN_vkGetPhysicalDeviceQueueFamilyProperties2KHR vkGetPhysicalDeviceQueueFamilyProperties2KHR; +extern PFN_vkGetPhysicalDeviceMemoryProperties2KHR vkGetPhysicalDeviceMemoryProperties2KHR; +extern PFN_vkGetPhysicalDeviceSparseImageFormatProperties2KHR vkGetPhysicalDeviceSparseImageFormatProperties2KHR; + +// VK_KHR_get_surface_capabilities2 +extern PFN_vkGetPhysicalDeviceSurfaceCapabilities2KHR vkGetPhysicalDeviceSurfaceCapabilities2KHR; +extern PFN_vkGetPhysicalDeviceSurfaceFormats2KHR vkGetPhysicalDeviceSurfaceFormats2KHR; + +// VK_KHR_surface +extern PFN_vkDestroySurfaceKHR vkDestroySurfaceKHR; +extern PFN_vkGetPhysicalDeviceSurfaceSupportKHR vkGetPhysicalDeviceSurfaceSupportKHR; +extern PFN_vkGetPhysicalDeviceSurfaceCapabilitiesKHR vkGetPhysicalDeviceSurfaceCapabilitiesKHR; +extern PFN_vkGetPhysicalDeviceSurfaceFormatsKHR vkGetPhysicalDeviceSurfaceFormatsKHR; +extern PFN_vkGetPhysicalDeviceSurfacePresentModesKHR vkGetPhysicalDeviceSurfacePresentModesKHR; + +#if __ANDROID_API__ >= 26 +// VK_KHR_android_surface +extern PFN_vkCreateAndroidSurfaceKHR vkCreateAndroidSurfaceKHR; +#endif // __ANDROID_API__ >= 26 + +// get info +int get_gpu_count(); +int get_default_gpu_index(); + +class GpuInfo +{ +public: + // vulkan physical device + VkPhysicalDevice physical_device; + + // memory properties + VkPhysicalDeviceMemoryProperties physicalDeviceMemoryProperties; + + // info + uint32_t api_version; + uint32_t driver_version; + uint32_t vendor_id; + uint32_t device_id; + uint8_t pipeline_cache_uuid[VK_UUID_SIZE]; + + // 0 = discrete gpu + // 1 = integrated gpu + // 2 = virtual gpu + // 3 = cpu + int type; + + // hardware limit + uint32_t max_shared_memory_size; + uint32_t max_workgroup_count[3]; + uint32_t max_workgroup_invocations; + uint32_t max_workgroup_size[3]; + size_t memory_map_alignment; + size_t buffer_offset_alignment; + size_t non_coherent_atom_size; + size_t buffer_image_granularity; + uint32_t max_image_dimension_1d; + uint32_t max_image_dimension_2d; + uint32_t max_image_dimension_3d; + float timestamp_period; + + // runtime + uint32_t compute_queue_family_index; + uint32_t graphics_queue_family_index; + uint32_t 
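+    // queue family indices selected by create_gpu_instance(); they may all refer to the
+    // same family on devices that expose only one queue family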
transfer_queue_family_index; + + uint32_t compute_queue_count; + uint32_t graphics_queue_count; + uint32_t transfer_queue_count; + + // property + bool unified_compute_transfer_queue; + + // bug is not feature + bool bug_local_size_spec_const; + + // but sometimes bug is a feature + bool bug_implicit_fp16_arithmetic; + + // fp16 and int8 feature + bool support_fp16_packed; + bool support_fp16_storage; + bool support_fp16_arithmetic; + bool support_int8_storage; + bool support_int8_arithmetic; + + // ycbcr conversion feature + bool support_ycbcr_conversion; + + // extension capability + int support_VK_KHR_8bit_storage; + int support_VK_KHR_16bit_storage; + int support_VK_KHR_bind_memory2; + int support_VK_KHR_dedicated_allocation; + int support_VK_KHR_descriptor_update_template; + int support_VK_KHR_external_memory; + int support_VK_KHR_get_memory_requirements2; + int support_VK_KHR_maintenance1; + int support_VK_KHR_push_descriptor; + int support_VK_KHR_sampler_ycbcr_conversion; + int support_VK_KHR_shader_float16_int8; + int support_VK_KHR_shader_float_controls; + int support_VK_KHR_storage_buffer_storage_class; + int support_VK_KHR_swapchain; + int support_VK_EXT_queue_family_foreign; +#if __ANDROID_API__ >= 26 + int support_VK_ANDROID_external_memory_android_hardware_buffer; +#endif // __ANDROID_API__ >= 26 +}; + +const GpuInfo& get_gpu_info(int device_index = get_default_gpu_index()); + +class VkAllocator; +class VkCompute; +class Layer; +class Packing_vulkan; +class Option; +class GPUDevice +{ +public: + GPUDevice(int device_index = get_default_gpu_index()); + ~GPUDevice(); + + const GpuInfo& info; + + VkDevice vkdevice() const { return device; } + + VkShaderModule get_shader_module(int shader_type_index) const; + + // with fixed workgroup size + VkShaderModule create_shader_module(int shader_type_index, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z) const; + + VkShaderModule compile_shader_module(const uint32_t* spv_data, size_t spv_data_size) const; + + // with fixed workgroup size + VkShaderModule compile_shader_module(const uint32_t* spv_data, size_t spv_data_size, uint32_t local_size_x, uint32_t local_size_y, uint32_t local_size_z) const; + + uint32_t find_memory_index(uint32_t memory_type_bits, VkFlags required, VkFlags preferred, VkFlags preferred_not) const; + bool is_mappable(uint32_t memory_type_index) const; + bool is_coherent(uint32_t memory_type_index) const; + + VkQueue acquire_queue(uint32_t queue_family_index) const; + void reclaim_queue(uint32_t queue_family_index, VkQueue queue) const; + + // allocator on this device + VkAllocator* acquire_blob_allocator() const; + void reclaim_blob_allocator(VkAllocator* allocator) const; + + VkAllocator* acquire_staging_allocator() const; + void reclaim_staging_allocator(VkAllocator* allocator) const; + + // dummy buffer image + VkTensor get_dummy_buffer() const; + VkImageTensor get_dummy_image() const; + + // utility operator + void convert_packing(const VkTensor& src, VkTensor& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const; + // void convert_packing(const VkImageMat& src, VkImageMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const; + // void convert_packing(const VkMat& src, VkImageMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const; + // void convert_packing(const VkImageMat& src, VkMat& dst, int dst_elempack, VkCompute& cmd, const Option& opt) const; + + // VK_KHR_bind_memory2 + PFN_vkBindBufferMemory2KHR vkBindBufferMemory2KHR; + PFN_vkBindImageMemory2KHR 
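+    // the extension entry points below are resolved with vkGetDeviceProcAddr in
+    // init_device_extension() when the matching extension is reported by the device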
vkBindImageMemory2KHR; + + // VK_KHR_descriptor_update_template + PFN_vkCreateDescriptorUpdateTemplateKHR vkCreateDescriptorUpdateTemplateKHR; + PFN_vkDestroyDescriptorUpdateTemplateKHR vkDestroyDescriptorUpdateTemplateKHR; + PFN_vkUpdateDescriptorSetWithTemplateKHR vkUpdateDescriptorSetWithTemplateKHR; + + // VK_KHR_get_memory_requirements2 + PFN_vkGetImageMemoryRequirements2KHR vkGetImageMemoryRequirements2KHR; + PFN_vkGetBufferMemoryRequirements2KHR vkGetBufferMemoryRequirements2KHR; + PFN_vkGetImageSparseMemoryRequirements2KHR vkGetImageSparseMemoryRequirements2KHR; + + // VK_KHR_maintenance1 + PFN_vkTrimCommandPoolKHR vkTrimCommandPoolKHR; + + // VK_KHR_push_descriptor + PFN_vkCmdPushDescriptorSetWithTemplateKHR vkCmdPushDescriptorSetWithTemplateKHR; + PFN_vkCmdPushDescriptorSetKHR vkCmdPushDescriptorSetKHR; + + // VK_KHR_sampler_ycbcr_conversion + PFN_vkCreateSamplerYcbcrConversionKHR vkCreateSamplerYcbcrConversionKHR; + PFN_vkDestroySamplerYcbcrConversionKHR vkDestroySamplerYcbcrConversionKHR; + + // VK_KHR_swapchain + PFN_vkCreateSwapchainKHR vkCreateSwapchainKHR; + PFN_vkDestroySwapchainKHR vkDestroySwapchainKHR; + PFN_vkGetSwapchainImagesKHR vkGetSwapchainImagesKHR; + PFN_vkAcquireNextImageKHR vkAcquireNextImageKHR; + PFN_vkQueuePresentKHR vkQueuePresentKHR; + +#if __ANDROID_API__ >= 26 + // VK_ANDROID_external_memory_android_hardware_buffer + PFN_vkGetAndroidHardwareBufferPropertiesANDROID vkGetAndroidHardwareBufferPropertiesANDROID; + PFN_vkGetMemoryAndroidHardwareBufferANDROID vkGetMemoryAndroidHardwareBufferANDROID; +#endif // __ANDROID_API__ >= 26 + +protected: + // shader management + int create_shader_module(); + void destroy_shader_module(); + + // device extension + int init_device_extension(); + + // dummy buffer and image + int create_dummy_buffer_image(); + void destroy_dummy_buffer_image(); + + // utility operator + int create_utility_operator(); + void destroy_utility_operator(); + +private: + VkDevice device; + std::vector shader_modules; + + // hardware queue + mutable std::vector compute_queues; + mutable std::vector graphics_queues; + mutable std::vector transfer_queues; + + mutable Mutex queue_lock; + + // default blob allocator for each queue + mutable std::vector blob_allocators; + + mutable Mutex blob_allocator_lock; + + // default staging allocator for each queue + mutable std::vector staging_allocators; + + mutable Mutex staging_allocator_lock; + + // dummy buffer and image + VkAllocator* dummy_allocator; + VkTensor dummy_buffer; + VkImageTensor dummy_image; + + // utility operator + // from buffer | image + // to buffer | image + // from fp32-b/i | fp16p-b/i | fp16s-b/i + // to fp32-b/i | fp16p-b/i | fp16s-b/i + // to pack1 | pack4 | pack8 + TEngine::Packing_vulkan* uop_packing[2][2][3][3][3]; +}; + +GPUDevice* get_gpu_device(int device_index = get_default_gpu_index()); + +// info from spirv +class ShaderInfo +{ +public: + int specialization_count; + int binding_count; + int push_constant_count; + + // 0 = null + // 1 = storage buffer + // 2 = storage image + // 3 = combined image sampler + int binding_types[16];// 16 is large enough(maybe) +}; + +const ShaderInfo& get_shader_info(int shader_type_index); +int resolve_shader_info(const uint32_t* spv_data, size_t spv_data_size, ShaderInfo& shader_info); + +union vk_specialization_type { int i; float f; uint32_t u32; }; +union vk_constant_type { int i; float f; }; + +} + +#endif // VULKAN_GPU_HPP diff --git a/source/device/vulkan/vulkan_graph.cc b/source/device/vulkan/vulkan_graph.cc new file mode 100644 
index 000000000..222477f80 --- /dev/null +++ b/source/device/vulkan/vulkan_graph.cc @@ -0,0 +1,545 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2021, OPEN AI LAB + * Author: hhchen@openailab.com + */ + +#include "vulkan_graph.hpp" +#include "vulkan_executor.hpp" + +#include +#include "vulkan_graph.hpp" +#include "vulkan_pipeline.hpp" +#include "vulkan_gpu.hpp" +#include "vulkan_command.hpp" +#include "vulkan_allocator.hpp" +#include "vulkan_tensor.hpp" +#include "vulkan_layer.hpp" + +#include "layer/convolution_vulkan.hpp" +#include "layer/pooling_vulkan.hpp" +#include "layer/convolutiondepthwise_vulkan.hpp" +#include "layer/innerproduct_vulkan.hpp" +#include "layer/flatten_vulkan.hpp" +#include "layer/softmax_vulkan.hpp" +#include "layer/relu_vulkan.hpp" +#include "layer/dropout_vulkan.hpp" +#include "layer/eltwise_vulkan.hpp" +#include "layer/priorbox_vulkan.hpp" +#include "layer/permute_vulkan.hpp" +#include "layer/concat_vulkan.hpp" +#include "layer/reshape_vulkan.hpp" +#include "layer/interp_vulkan.hpp" +#include "layer/crop_vulkan.hpp" + +#include + +extern "C" +{ +#include "graph/tensor.h" +#include "graph/node.h" +#include "graph/graph.h" +#include "graph/subgraph.h" +} + + +int vulkan_dev_init(struct device* dev) +{ + (void)dev; + return 0; +} + + +int vulkan_dev_prerun(struct device* dev, struct subgraph* subgraph, void* options) +{ + subgraph->device_graph = new VULKANEngine; + auto engine = (VULKANEngine*)subgraph->device_graph; + + return engine->VULKANEnginePreRun(subgraph); +} + + +int vulkan_dev_run(struct device* dev, struct subgraph* subgraph) +{ + auto engine = (VULKANEngine*)subgraph->device_graph; + return engine->VULKANEngineRun(subgraph); +} + + +int vulkan_dev_postrun(struct device* dev, struct subgraph* subgraph) +{ + auto engine = (VULKANEngine*)subgraph->device_graph; + engine->VULKANEnginePostRun(); + // delete engine; + + return 0; +} + + +int vulkan_dev_release(struct device* dev) +{ + (void)dev; + return 0; +} + + + +namespace TEngine { + +static double get_cur_time(void) +{ + struct timeval tv; + + gettimeofday(&tv, NULL); + + return tv.tv_sec * 1000.0 + (tv.tv_usec / 1000.0); +} + + +VulkanGraph::VulkanGraph(struct subgraph* graph) +{ + vkdev = get_gpu_device(); + weight_vkallocator = 0; + weight_staging_vkallocator = 0; + + // set graph options + if (!vkdev->info.support_fp16_packed || !vkdev->info.support_fp16_storage) + opt.use_fp16_packed = false; + if (!vkdev->info.support_fp16_storage) + { + opt.use_fp16_storage = false; + opt.use_shader_pack8 = false; + } + + if (!vkdev->info.support_fp16_arithmetic) + opt.use_fp16_arithmetic = false; + + TLOG_INFO("use_fp16_packed %d\n", opt.use_fp16_packed); + TLOG_INFO("use_fp16_storage %d\n", opt.use_fp16_storage); + 
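// options that require fp16 storage/arithmetic or pack8 shaders are switched off above
+ // when the device does not report the matching capability, and the effective values
+ // are logged here before the per-node vulkan layers are created. +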
TLOG_INFO("use_shader_pack8 %d\n", opt.use_shader_pack8); + TLOG_INFO("use_fp16_arithmetic %d\n", opt.use_fp16_arithmetic); + + struct subgraph *subgraph = (struct subgraph *)graph; + struct graph *ir_graph = subgraph->graph; + int node_num = subgraph->node_num; + + sgraph = graph; + for(int i = 0; i < node_num; i++) + { + struct node *ir_node = get_ir_graph_node(ir_graph, subgraph->node_list[i]); + + if (ir_node->op.type == OP_CONST || ir_node->op.type == OP_INPUT) + continue; + else if (ir_node->op.type == OP_CLIP) + ir_node->op.type = OP_RELU6; + + if(ir_node->op.type == OP_CONV) + { + struct conv_param *conv_param = (struct conv_param *)ir_node->op.param_mem; + + if (conv_param->group == conv_param->output_channel && conv_param->group != 1 && ir_graph->graph_layout == TENGINE_LAYOUT_NCHW) // DW + { + Layer* layer = new ConvolutionDepthWise_vulkan(ir_graph, ir_node); + layer->vkdev = vkdev; + layer->name = "ConvolutionDepthWise"; + layers.push_back(layer); + } + else + { + Layer* layer = new Convolution_vulkan(ir_graph, ir_node); + layer->vkdev = vkdev; + layer->name = "Convolution"; + layers.push_back(layer); + } + } + + if(ir_node->op.type == OP_POOL) + { + Layer* layer = new Pooling_vulkan(ir_graph, ir_node); + layer->vkdev = vkdev; + layer->name = "Pooling"; + layers.push_back(layer); + } + + if(ir_node->op.type == OP_FC) + { + Layer* layer = new InnerProduct_vulkan(ir_graph, ir_node); + layer->vkdev = vkdev; + layer->name = "InnerProduct"; + layers.push_back(layer); + } + + if(ir_node->op.type == OP_FLATTEN) + { + Layer* layer = new Flatten_vulkan(ir_graph, ir_node); + layer->vkdev = vkdev; + layer->name = "Flatten"; + layers.push_back(layer); + } + + if(ir_node->op.type == OP_SOFTMAX) + { + Layer* layer = new Softmax_vulkan(ir_graph, ir_node); + layer->vkdev = vkdev; + layer->name = "Softmax"; + layers.push_back(layer); + } + + if(ir_node->op.type == OP_RELU) + { + Layer* layer = new ReLU_vulkan(ir_graph, ir_node); + layer->vkdev = vkdev; + layer->name = "ReLU"; + layers.push_back(layer); + } + + if(ir_node->op.type == OP_DROPOUT) + { + Layer* layer = new Dropout_vulkan(ir_graph, ir_node); + layer->vkdev = vkdev; + layer->name = "Dropout"; + layers.push_back(layer); + } + + if(ir_node->op.type == OP_ELTWISE) + { + Layer* layer = new Eltwise_vulkan(ir_graph, ir_node); + layer->vkdev = vkdev; + layer->name = "Eltwise"; + layers.push_back(layer); + } + + if(ir_node->op.type == OP_PRIORBOX) + { + Layer* layer = new PriorBox_vulkan(ir_graph, ir_node); + layer->vkdev = vkdev; + layer->name = "PriorBox"; + layers.push_back(layer); + } + + if(ir_node->op.type == OP_PERMUTE) + { + Layer* layer = new Permute_vulkan(ir_graph, ir_node); + layer->vkdev = vkdev; + layer->name = "Permute"; + layers.push_back(layer); + } + + if(ir_node->op.type == OP_CONCAT) + { + Layer* layer = new Concat_vulkan(ir_graph, ir_node); + layer->vkdev = vkdev; + layer->name = "Concat"; + layers.push_back(layer); + } + + if(ir_node->op.type == OP_RESHAPE) + { + Layer* layer = new Reshape_vulkan(ir_graph, ir_node); + layer->vkdev = vkdev; + layer->name = "Reshape"; + layers.push_back(layer); + } + + if(ir_node->op.type == OP_INTERP || ir_node->op.type == OP_UPSAMPLE) + { + Layer* layer = new Interp_vulkan(ir_graph, ir_node); + layer->vkdev = vkdev; + layer->name = "Interp"; + layers.push_back(layer); + } + + if(ir_node->op.type == OP_CROP) + { + Layer* layer = new Crop_vulkan(ir_graph, ir_node); + layer->vkdev = vkdev; + layer->name = "Crop"; + layers.push_back(layer); + } + + struct tensor *input = 
get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); + std::string name = input->name; + tensor_map_[name] = input; + tensor_map[name] = Tensor(input); + + VkTensor vktensor; + vktensor_map_[name] = vktensor; + + struct tensor *output = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); + name = output->name; + tensor_map_[name] = output; + tensor_map[name] = Tensor(output); + } +} + +VulkanGraph::~VulkanGraph() +{ + for(auto& ptr: mem_buf_vector_) + std::free(ptr); +} + +int VulkanGraph::upload_model() +{ + +// printf("run upload_model\n"); + TEngine::VkTransfer cmd(vkdev); + if (!weight_vkallocator) + { + weight_vkallocator = new VkWeightAllocator(vkdev); + } + if (!weight_staging_vkallocator) + { + weight_staging_vkallocator = new VkWeightStagingAllocator(vkdev); + } + + Option opt_upload = opt; + opt_upload.blob_vkallocator = weight_vkallocator; + opt_upload.workspace_vkallocator = weight_vkallocator; + opt_upload.staging_vkallocator = weight_staging_vkallocator; + + int layer_size = layers.size(); + for(int i = 0; i < layer_size; i++) + { + layers[i]->upload_model(cmd, opt_upload); + } + + cmd.submit_and_wait(); +// printf("run upload_model done\n"); + return 0; +} + +int VulkanGraph::create_pipeline() +{ + // printf("start to run create pipeline\n"); + for (size_t i=0; iname.c_str()); + int cret = layer->create_pipeline(opt1); + if (cret != 0) + { + printf("layer create_pipeline %d failed", (int)i); + return -1; + } + } +// printf("run create_pipeline done\n"); + return 0; +} + +int VulkanGraph::record_graph_pipeline() +{ + // printf("start to run record pipeline, layer size:%d\n", layers.size()); + + TEngine::VkCompute cmd(vkdev); + + if (!opt.blob_vkallocator) + { + local_blob_vkallocator = vkdev->acquire_blob_allocator(); + opt.blob_vkallocator = local_blob_vkallocator; + } + if (!opt.workspace_vkallocator) + { + opt.workspace_vkallocator = opt.blob_vkallocator; + } + if (!opt.staging_vkallocator) + { + local_staging_vkallocator = vkdev->acquire_staging_allocator(); + opt.staging_vkallocator = local_staging_vkallocator; + } + std::string name; + + Tensor input; + Tensor output; + + // printf("tensor_map size:%d ---------------------\n", tensor_map.size()); + + for (size_t i=0; iname.c_str()); + + std::string in_name = layer->bottoms[0]; + std::string out_name = layer->tops[0]; + name = out_name; + + // upload Tensor data to VkTensor + if((i==0) && vktensor_map_[in_name].dims == 0) + { + cmd.record_upload(tensor_map_[in_name], vktensor_map_[in_name], opt); + // cmd.record_download(vktensor_map_[in_name], tensor_map[in_name], opt); + } + + int cret; + if(layer->name == "ReLU" || layer->name == "Dropout" || layer->name == "Softmax") // inplace + { + VkTensor bottom_tensor = vktensor_map_[in_name]; + cret = layer->record_pipeline(bottom_tensor, cmd, opt); + vktensor_map_[out_name] = bottom_tensor; + } + else if(layer->name == "Eltwise" || layer->name == "Concat" || layer->name == "PriorBox" || layer->name == "Crop") // multi-in, one-out + { + std::vector bottom_blobs; + for(int i = 0; i < layer->bottoms.size(); i++) + { + bottom_blobs.push_back(vktensor_map_[layer->bottoms[i]]); + } + + VkTensor top_tensor; + std::vector top_blobs; + top_blobs.push_back(top_tensor); + cret = layer->record_pipeline(bottom_blobs, top_blobs, cmd, opt); + vktensor_map_[out_name] = top_blobs[0]; + } + else // original one-in one-out + { + VkTensor bottom_tensor = vktensor_map_[in_name]; + VkTensor top_tensor; + cret = layer->record_pipeline(bottom_tensor, top_tensor, cmd, opt); + 
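// cache the newly produced top blob under the IR tensor name so that downstream
+ // layers can resolve their inputs from vktensor_map_ by name. +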
vktensor_map_[out_name] = top_tensor; + } + + // download all nodes data + { + // Tensor tmp_tensor; + // cmd.record_download(vktensor_map_[out_name], tmp_tensor, opt); + // tensor_map[out_name] = tmp_tensor; + } + + if (cret != 0) + { + printf("layer record_pipeline %d failed", (int)i); + return -1; + } + } + + cmd.record_download(vktensor_map_[name], output, opt); + + // // download output + // int byte_size=tensor_map_[name]->elem_size * tensor_map_[name]->elem_num; + // void* mem=std::malloc(byte_size); + // tensor_map_[name]->data = mem; + // cmd.record_download(vktensor_map_[name], tensor_map_[name], opt); + +// double total_time, min_time, max_time; +// min_time = 999999999; +// max_time = 0; +// total_time = 0; +// double start_time = get_cur_time(); + + cmd.submit_and_wait(); + +// double end_time = get_cur_time(); +// double cur_time = end_time - start_time; +// total_time += cur_time; +// if (cur_time > max_time) +// max_time = cur_time; +// if (cur_time < min_time) +// min_time = cur_time; +// printf("vulkan Repeat [1] min %.3f ms, max %.3f ms, avg %.3f ms\n", min_time, max_time, total_time / 1); + + Tensor tmp_fp32; + if(output.elemsize == output.elempack * 2) + { + TEngine::cast_float16_to_float32(output, tmp_fp32, opt); + } + else + { + tmp_fp32 = output; + } + + Tensor blob_unpacked; + if (opt.use_packing_layout) + { + convert_packing(tmp_fp32, blob_unpacked, 1, opt); + } + else + { + blob_unpacked = tmp_fp32; + } + + tensor_map_[name]->data = blob_unpacked.data; + + +// #define DEBUG_OUTPUT +#ifdef DEBUG_OUTPUT + printf("run save tensor data\n"); + for (size_t j=0; jtops[0]; + // std::string in_name = layer->bottoms[0]; + printf("%s\n", in_name.c_str()); + + std::string fname = std::to_string(j)+".data"; + FILE* fp = fopen(fname.c_str(), "w"); + + // float * data = (float*)get_tensor_buffer(tensor_map_[name]); + // float* data = (float*)vktensor_map_[in_name].mapped_ptr(); + // float* data = (float*)tensor_map_[in_name]->data; + // float* data = (float*)tensor_map[in_name].data; + Tensor tmp_fp16 = tensor_map[in_name]; + Tensor tmp_fp32; + if(tmp_fp16.elemsize == tmp_fp16.elempack * 2) + TEngine::cast_float16_to_float32(tmp_fp16, tmp_fp32, opt); + else + tmp_fp32 = tmp_fp16; + + Tensor blob_unpacked; + if (opt.use_packing_layout) + convert_packing(tmp_fp32, blob_unpacked, 1, opt); + else + blob_unpacked = tmp_fp32; + + int byte_size=tensor_map_[in_name]->elem_size * tensor_map_[name]->elem_num; + void* mem=std::malloc(byte_size); + memcpy(mem, blob_unpacked.data, byte_size); + tensor_map_[in_name]->data = mem; + // tensor_map_[in_name]->data = blob_unpacked.data; + + // float* data = (float*)tmp_fp32.data; + float* data = (float*)blob_unpacked.data; + printf("tensor shape:%d %d %d %d\n", tensor_map_[in_name]->dims[0], tensor_map_[in_name]->dims[1], tensor_map_[in_name]->dims[2], tensor_map_[in_name]->dims[3]); + byte_size=tensor_map_[in_name]->elem_size * tensor_map_[in_name]->elem_num; + for(int i = 0; i < byte_size/sizeof(float); i++) + { + if(i % 16 == 0) + { + fprintf(fp, "\n%d:", i); + } + fprintf(fp, " %.6f", data[i]); + } + fprintf(fp, "\n"); + + fclose(fp); + } +#endif + + return 0; +} + +int VulkanGraph::destory_pipeline() +{ + return 0; +} + +} diff --git a/source/device/vulkan/vulkan_graph.hpp b/source/device/vulkan/vulkan_graph.hpp new file mode 100644 index 000000000..8218f271c --- /dev/null +++ b/source/device/vulkan/vulkan_graph.hpp @@ -0,0 +1,139 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license 
agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2021, OPEN AI LAB + * Author: hhchen@openailab.com + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "vulkan_gpu.hpp" +#include "vulkan_pipeline.hpp" +#include "vulkan_command.hpp" +#include "vulkan_option.hpp" +#include "vulkan_layer.hpp" + +extern "C" +{ +// #include "device/device.h" +// #include "graph/subgraph.h" + +#include "api/c_api.h" +#include "device/device.h" +#include "graph/tensor.h" +#include "graph/node.h" +#include "graph/graph.h" +#include "graph/subgraph.h" +#include "executer/executer.h" +#include "optimizer/split.h" +#include "module/module.h" +#include "utility/vector.h" +#include "utility/log.h" + + +#include "convolution_param.h" + +namespace TEngine { + +class VulkanDevice; + +class VulkanGraph { + +friend VulkanDevice; + +public: + const std::string& GetName(void) const {return name_;} + + VulkanGraph(const std::string& name); + VulkanGraph(struct subgraph* graph); + ~VulkanGraph(); + + int record_convolution(VkCompute& cmd, ir_node_t* node); + + int UploadConvolutionWeight(VkTransfer& cmd, const Option& opt, ir_node_t* node); + + bool CreateConvolutionPipeline(ir_node_t* node); + + bool CreatePoolingPipeline(ir_node_t* node); + + std::unordered_map tensor_map_; // tengine lite cpu tensor list + std::unordered_map tensor_map; // vulkan cpu tensor list + std::unordered_map vktensor_map_; // vulkan gpu tensor list + + bool OpSupported(const std::string& name); + + Option opt; + Pipeline* pipeline_convolution; + + int record_graph_pipeline(); + + int upload_model(); + + int create_pipeline(); + + int destory_pipeline(); + +protected: + subgraph* sgraph; + std::vector layers; + + const GPUDevice* vkdev; + + VkAllocator* weight_vkallocator; + VkAllocator* weight_staging_vkallocator; + +private: + + VkAllocator* local_blob_vkallocator; + VkAllocator* local_staging_vkallocator; + + std::string name_; + + std::vector gpu_mem_vector_; + std::vector mem_buf_vector_; + + std::map iotensor_map_; +}; + +} //namespace TEngine + + +int vulkan_dev_init(struct device* dev); +int vulkan_dev_prerun(struct device* dev, struct subgraph* subgraph, void* options); +int vulkan_dev_run(struct device* dev, struct subgraph* subgraph); +int vulkan_dev_postrun(struct device* dev, struct subgraph* subgraph); +int vulkan_dev_release(struct device* dev); +} + + +/* + + + + +*/ \ No newline at end of file diff --git a/source/device/vulkan/vulkan_helper.cc b/source/device/vulkan/vulkan_helper.cc new file mode 100644 index 000000000..4668f8bfe --- /dev/null +++ b/source/device/vulkan/vulkan_helper.cc @@ -0,0 +1,311 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2021, OPEN AI LAB + * Author: hhchen@openailab.com + */ + +#include "vulkan_helper.hpp" + +// bool CHECK_ENQUEUE_BUFFER_STATUS(cl_int status) +// { +// if (status != CL_SUCCESS) +// { +// TLOG_INFO("Log: clEnqueue****Buffer status %d\n",status); +// if (status == CL_INVALID_COMMAND_QUEUE ) +// TLOG_INFO("Log: CL_INVALID_COMMAND_QUEUE \n"); +// else if (status == CL_INVALID_CONTEXT ) +// TLOG_INFO("Log: CL_INVALID_CONTEXT \n"); +// else if (status == CL_INVALID_MEM_OBJECT ) +// TLOG_INFO("Log: CL_INVALID_MEM_OBJECT \n"); +// else if (status == CL_INVALID_VALUE ) +// TLOG_INFO("Log: CL_INVALID_VALUE \n"); +// else if (status == CL_INVALID_EVENT_WAIT_LIST ) +// TLOG_INFO("Log: CL_INVALID_EVENT_WAIT_LIST \n"); +// else if (status == CL_MISALIGNED_SUB_BUFFER_OFFSET ) +// TLOG_INFO("Log: CL_MISALIGNED_SUB_BUFFER_OFFSET \n"); +// else if (status == CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST ) +// TLOG_INFO("Log: CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST \n"); +// else if (status == CL_MEM_OBJECT_ALLOCATION_FAILURE ) +// TLOG_INFO("Log: CL_MEM_OBJECT_ALLOCATION_FAILURE \n"); +// else if (status == CL_INVALID_OPERATION ) +// TLOG_INFO("Log: CL_INVALID_OPERATION \n"); +// else if (status == CL_OUT_OF_RESOURCES ) +// TLOG_INFO("Log: CL_OUT_OF_RESOURCES \n"); +// else if (status == CL_OUT_OF_HOST_MEMORY ) +// TLOG_INFO("Log: CL_OUT_OF_HOST_MEMORY \n"); +// return false; +// } +// // else +// // TLOG_INFO("Log: clEnqueue****Buffer SUCCESS\n"); +// return true; +// } + +// bool CHECK_ENQUEUE_KERNEL_STATUS(cl_int status) +// { +// if (status != CL_SUCCESS) +// { +// TLOG_INFO("Log: clEnqueueNDRangeKernel status %d\n",status); +// if (status == CL_INVALID_PROGRAM_EXECUTABLE ) +// TLOG_INFO("Log: CL_INVALID_PROGRAM_EXECUTABLE \n"); +// else if (status == CL_INVALID_COMMAND_QUEUE ) +// TLOG_INFO("Log: CL_INVALID_COMMAND_QUEUE \n"); +// else if (status == CL_INVALID_KERNEL ) +// TLOG_INFO("Log: CL_INVALID_KERNEL \n"); +// else if (status == CL_INVALID_CONTEXT ) +// TLOG_INFO("Log: CL_INVALID_CONTEXT \n"); +// else if (status == CL_INVALID_KERNEL_ARGS ) +// TLOG_INFO("Log: CL_INVALID_KERNEL_ARGS \n"); +// else if (status == CL_INVALID_WORK_DIMENSION ) +// TLOG_INFO("Log: CL_INVALID_WORK_DIMENSION \n"); +// else if (status == CL_INVALID_GLOBAL_WORK_SIZE ) +// TLOG_INFO("Log: CL_INVALID_GLOBAL_WORK_SIZE \n"); +// else if (status == CL_INVALID_GLOBAL_OFFSET ) +// TLOG_INFO("Log: CL_INVALID_GLOBAL_OFFSET \n"); +// else if (status == CL_INVALID_WORK_GROUP_SIZE ) +// TLOG_INFO("Log: CL_INVALID_WORK_GROUP_SIZE \n"); +// else if (status == CL_INVALID_WORK_ITEM_SIZE ) +// TLOG_INFO("Log: CL_INVALID_WORK_ITEM_SIZE \n"); +// else if (status == CL_MISALIGNED_SUB_BUFFER_OFFSET ) +// TLOG_INFO("Log: CL_MISALIGNED_SUB_BUFFER_OFFSET \n"); +// else if (status == CL_INVALID_IMAGE_SIZE ) +// TLOG_INFO("Log: 
CL_INVALID_IMAGE_SIZE \n"); +// else if (status == CL_OUT_OF_RESOURCES ) +// TLOG_INFO("Log: CL_OUT_OF_RESOURCES \n"); +// else if (status == CL_MEM_OBJECT_ALLOCATION_FAILURE ) +// TLOG_INFO("Log: CL_MEM_OBJECT_ALLOCATION_FAILURE \n"); +// else if (status == CL_INVALID_EVENT_WAIT_LIST ) +// TLOG_INFO("Log: CL_INVALID_EVENT_WAIT_LIST \n"); +// else if (status == CL_OUT_OF_RESOURCES ) +// TLOG_INFO("Log: CL_OUT_OF_RESOURCES \n"); +// else if (status == CL_OUT_OF_HOST_MEMORY ) +// TLOG_INFO("Log: CL_OUT_OF_HOST_MEMORY \n"); +// return false; +// } +// // else +// // TLOG_INFO("Log: clEnqueueNDRangeKernel SUCCESS\n"); +// return true; +// } + +// bool CHECK_SET_KERNEL_STATUS(cl_int status) +// { +// if (status != CL_SUCCESS) +// { +// TLOG_INFO("Log: clSetKernelArg status %d\n",status); +// if (status == CL_INVALID_KERNEL ) +// TLOG_INFO("Log: CL_INVALID_KERNEL \n"); +// else if (status == CL_INVALID_ARG_INDEX ) +// TLOG_INFO("Log: CL_INVALID_ARG_INDEX \n"); +// else if (status == CL_INVALID_ARG_VALUE ) +// TLOG_INFO("Log: CL_INVALID_ARG_VALUE \n"); +// else if (status == CL_INVALID_MEM_OBJECT ) +// TLOG_INFO("Log: CL_INVALID_MEM_OBJECT \n"); +// else if (status == CL_INVALID_SAMPLER ) +// TLOG_INFO("Log: CL_INVALID_SAMPLER \n"); +// else if (status == CL_INVALID_ARG_SIZE ) +// TLOG_INFO("Log: CL_INVALID_ARG_SIZE \n"); +// else if (status == CL_INVALID_ARG_VALUE ) +// TLOG_INFO("Log: CL_INVALID_ARG_VALUE \n"); +// else if (status == CL_OUT_OF_RESOURCES ) +// TLOG_INFO("Log: CL_OUT_OF_RESOURCES \n"); +// else if (status == CL_OUT_OF_HOST_MEMORY ) +// TLOG_INFO("Log: CL_OUT_OF_HOST_MEMORY \n"); +// return false; +// } +// // else +// // { +// // TLOG_INFO("Log: clSetKernelArg SUCCESS \n"); +// // } +// return true; +// } + +/** convert the kernel file into a string */ +int convertToString(const char *filename, std::string& s) +{ + size_t size; + char* str; + std::fstream f(filename, (std::fstream::in | std::fstream::binary)); + + if(f.is_open()) + { + size_t fileSize; + f.seekg(0, std::fstream::end); + size = fileSize = (size_t)f.tellg(); + f.seekg(0, std::fstream::beg); + str = new char[size+1]; + if(!str) + { + f.close(); + return 0; + } + + f.read(str, fileSize); + f.close(); + str[size] = '\0'; + s = str; + delete[] str; + return 0; + } + std::cout<<"Error: failed to open file\n"< 0) + // { + // cl_platform_id* platforms = + // (cl_platform_id* )malloc(numPlatforms* sizeof(cl_platform_id)); + // status = clGetPlatformIDs(numPlatforms, platforms, NULL); + // platform = platforms[0]; + // free(platforms); + // } + // else + // return -1; + + // return 0; +// } + +/**Step 2:Query the platform and choose the first GPU device if has one.*/ +// cl_device_id *getCl_device_id(cl_platform_id &platform) +// { +// cl_uint numDevices = 0; +// cl_device_id *devices=NULL; +// cl_int status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); +// if (numDevices > 0) //GPU available. 
+// { +// devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id)); +// status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL); +// } +// return devices; +// } + +void get_device_message() +{ + // /* Host/device data structures */ + // cl_platform_id *platforms; + // cl_device_id *devices; + // cl_uint num_platforms; + // cl_uint num_devices, addr_data; + // cl_int i, err; + + // /* Extension data */ + // char name_data[48000], ext_data[409600]; + + // err = clGetPlatformIDs(5, NULL, &num_platforms); + // if(err < 0) { + // perror("Couldn't find any platforms."); + // exit(1); + // } + + // /* 选取所有的platforms*/ + // platforms = (cl_platform_id*) + // malloc(sizeof(cl_platform_id) * num_platforms); + // err = clGetPlatformIDs(num_platforms, platforms, NULL); + // if(err < 0) { + // perror("Couldn't find any platforms"); + // exit(1); + // } + + // //循环查看所有platforms的devices信息,一般intel和AMD的都可以有两个devices:CPU和显卡 + // //如果是nvidia的就一般只有一个显卡device了。 + // printf("\nnum_platforms %d\n", num_platforms); + // for (int j = 0; j < (int)num_platforms; j++) + // { + // printf("\nplatform %d\n", j+1); + // /* 步骤和platforms的一样 */ + // err = clGetDeviceIDs(platforms[j], CL_DEVICE_TYPE_ALL, 1, NULL, &num_devices); + // if(err < 0) { + // perror("Couldn't find any devices!!!"); + // exit(1); + // } + + // /* Access connected devices */ + // devices = (cl_device_id*) + // malloc(sizeof(cl_device_id) * num_devices); + // clGetDeviceIDs(platforms[j], CL_DEVICE_TYPE_ALL, + // num_devices, devices, NULL); + + // /*循环显示platform的所有device(CPU和显卡)信息。*/ + // for(i=0; i<(int)num_devices; i++) { + + // err = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, + // sizeof(name_data), name_data, NULL); + // if(err < 0) { + // perror("Couldn't read extension data"); + // exit(1); + // } + // clGetDeviceInfo(devices[i], CL_DEVICE_ADDRESS_BITS, + // sizeof(ext_data), &addr_data, NULL); + + // clGetDeviceInfo(devices[i], CL_DEVICE_EXTENSIONS, + // sizeof(ext_data), ext_data, NULL); + + // printf("NAME: %s\nADDRESS_WIDTH: %u\nEXTENSIONS: %s\n\n", + // name_data, addr_data, ext_data); + // } + // } + + // free(platforms); + // free(devices); + // printf("\n"); +} + +void dump_sub_graph(struct subgraph* sub_graph) +{ + // TLOG_INFO("Sub graph[%d]: {%8s } has %d nodes, %d input tensors, %d output tensors.\n", sub_graph->index, sub_graph->device->name, sub_graph->node_num, sub_graph->input_num, sub_graph->output_num); + // TLOG_INFO("\tSub nodes: [ "); + + // for (int j = 0; j < sub_graph->node_num - 1; j++) + // { + // int node_id = sub_graph->node_list[j]; + // TLOG_INFO("%d, ", node_id); + // } + // TLOG_INFO("%d ].\n", sub_graph->node_list[sub_graph->node_num - 1]); + + // TLOG_INFO("\tSub input tensors: [ "); + // for (int j = 0; j < sub_graph->input_num - 1; j++) + // { + // int tensor_id = sub_graph->input_tensor_list[j]; + // TLOG_INFO("%d, ", tensor_id); + // } + // TLOG_INFO("%d ].\n", sub_graph->input_tensor_list[sub_graph->input_num - 1]); + + // TLOG_INFO("\tSub output tensors: [ "); + // for (int j = 0; j < sub_graph->output_num - 1; j++) + // { + // int tensor_id = sub_graph->output_tensor_list[j]; + // TLOG_INFO("%d, ", tensor_id); + // } + // TLOG_INFO("%d ].\n", sub_graph->output_tensor_list[sub_graph->output_num - 1]); +} + diff --git a/source/device/vulkan/vulkan_helper.hpp b/source/device/vulkan/vulkan_helper.hpp new file mode 100644 index 000000000..3955be7bb --- /dev/null +++ b/source/device/vulkan/vulkan_helper.hpp @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation 
(ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Copyright (c) 2021, OPEN AI LAB + * Author: hhchen@openailab.com + */ + +#pragma once + +// #include +#include +#include +#include +#include +#include +#include + +extern "C" +{ +#include "api/c_api.h" +#include "graph/tensor.h" +#include "graph/node.h" +#include "graph/graph.h" +#include "graph/subgraph.h" +#include "device/device.h" +#include "utility/sys_port.h" +#include "utility/log.h" +} + +// bool CHECK_SET_KERNEL_STATUS(cl_int status); +// bool CHECK_ENQUEUE_KERNEL_STATUS(cl_int status); +// bool CHECK_ENQUEUE_BUFFER_STATUS(cl_int status); + +/** convert the kernel file into a string */ +int convertToString(const char *filename, std::string& s); + +/**Getting platforms and choose an available one.*/ +// int getPlatform(cl_platform_id &platform); + +/**Step 2:Query the platform and choose the first GPU device if has one.*/ +// cl_device_id *getCl_device_id(cl_platform_id &platform); + +void get_device_message(); + +void dump_sub_graph(struct subgraph* sub_graph); + diff --git a/source/device/vulkan/vulkan_layer.cpp b/source/device/vulkan/vulkan_layer.cpp new file mode 100644 index 000000000..a4c7e4dab --- /dev/null +++ b/source/device/vulkan/vulkan_layer.cpp @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. 
You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#include "vulkan_layer.hpp" + +namespace TEngine { + +Layer::Layer() +{ + support_vulkan = false; +} + +Layer::~Layer() +{ +} + +int Layer::create_pipeline(const Option& /*opt*/) +{ + return 0; +} + +int Layer::destroy_pipeline(const Option& /*opt*/) +{ + return 0; +} + +int Layer::upload_model(VkTransfer& cmd, const Option& opt) +{ + return 0; +} + +int Layer::record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const +{ + return 0; +} + +int Layer::record_pipeline(VkTensor& bottom_top_blob, VkCompute& cmd, const Option& opt) const +{ + return 0; +} + +int Layer::record_pipeline(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const +{ + printf("run layer record_pipeline VkTensors\n"); + return 0; +} + +} // TEngine \ No newline at end of file diff --git a/source/device/vulkan/vulkan_layer.hpp b/source/device/vulkan/vulkan_layer.hpp new file mode 100644 index 000000000..526ca148b --- /dev/null +++ b/source/device/vulkan/vulkan_layer.hpp @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. 
You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#ifndef VULKAN_LAYER_HPP +#define VULKAN_LAYER_HPP + +#include +#include "vulkan_command.hpp" +#include "vulkan_pipeline.hpp" + +extern "C" +{ +#include "api/c_api.h" +#include "device/device.h" +#include "graph/tensor.h" +#include "graph/node.h" +#include "graph/graph.h" +#include "graph/subgraph.h" +#include "executer/executer.h" +#include "optimizer/split.h" +#include "module/module.h" +#include "utility/vector.h" +#include "utility/log.h" +} + +namespace TEngine { + +class Layer +{ +public: + // empty + Layer(); + // virtual destructor + virtual ~Layer(); + + // layer implementation specific setup + // return 0 if success + virtual int create_pipeline(const Option& opt); + + // layer implementation specific clean + // return 0 if success + virtual int destroy_pipeline(const Option& opt); + + // upload weight blob from host to device + virtual int upload_model(VkTransfer& cmd, const Option& opt); + + // virtual int record_pipeline(VkCompute& cmd, const Option& opt) const; + virtual int record_pipeline(VkTensor& bottom_top_blob, VkCompute& cmd, const Option& opt) const; + virtual int record_pipeline(const VkTensor& bottom_blob, VkTensor& top_blob, VkCompute& cmd, const Option& opt) const; + + virtual int record_pipeline(const std::vector& bottom_blobs, std::vector& top_blobs, VkCompute& cmd, const Option& opt) const; + +public: + // support vulkan compute + bool support_vulkan; + + // accept input blob with packed storage + bool support_packing; + + // accept bf16 + bool support_bf16_storage; + + // shader image storage + bool support_image_storage; + +public: + const GPUDevice* vkdev; + std::vector bottoms; + std::vector tops; + +public: + // layer name + std::string name; + // Node* node; + ir_graph_t* graph; + ir_node_t* node; +}; + +Layer* create_layer(std::string type); + +} // TEngine + +#endif // VULKAN_LAYER_HPP diff --git a/source/device/vulkan/vulkan_limit.hpp b/source/device/vulkan/vulkan_limit.hpp new file mode 100644 index 000000000..741786fae --- /dev/null +++ b/source/device/vulkan/vulkan_limit.hpp @@ -0,0 +1,160 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Copyright (c) 2021, OPEN AI LAB + * Author: hhchen@openailab.com + */ + + +#pragma once + +extern "C" +{ +#include "operator/op.h" +} + + +const int vulkan_supported_ops[] = { + + OP_CLIP, + OP_CONCAT, + OP_CONST, + OP_CONV, + OP_DROPOUT, + OP_ELTWISE, + OP_FC, + OP_FLATTEN, + OP_INPUT, +//// OP_PERMUTE, + OP_POOL, + OP_RELU, + OP_RESHAPE, + OP_SLICE, +//// OP_SOFTMAX + + +// OP_BIAS, + +//// OP_ABSVAL, +//// OP_ADD_N, +//// OP_ARGMAX, +//// OP_ARGMIN, +//// OP_BATCHNORM, +//// OP_BATCHTOSPACEND, +//// OP_BIAS, +//// OP_BROADMUL, +// +//// OP_CAST, +//// OP_CEIL, +//// OP_CLIP, +//// OP_COMPARISON, +//// OP_CONCAT, +// OP_CONST, +// OP_CONV, +//// OP_CROP, +//// OP_DECONV, +//// OP_DEPTHTOSPACE, +//// OP_DETECTION_OUTPUT, +//// OP_DETECTION_POSTPROCESS, +// +//// OP_DROPOUT, +//// OP_ELTWISE, +//// OP_ELU, +//// OP_EMBEDDING, +//// OP_EXPANDDIMS, +//// OP_FC, +//// OP_FLATTEN, +//// OP_GATHER, +//// OP_GEMM, +//// OP_GRU, +//// OP_HARDSIGMOID, +//// OP_HARDSWISH, +// OP_INPUT, +//// OP_INSTANCENORM, +//// OP_INTERP, +//// OP_LOGICAL, +//// OP_LOGISTIC, +//// OP_LRN, +//// OP_LSTM, +//// OP_MATMUL, +//// OP_MAXIMUM, +//// OP_MEAN, +//// OP_MINIMUM, +//// OP_MVN, +//// OP_NOOP, +//// OP_NORMALIZE, +// +//// OP_PAD, +//// OP_PERMUTE, +// OP_POOL, +//// OP_PRELU, +//// OP_PRIORBOX, +//// OP_PSROIPOOLING, +//// OP_REDUCEL2, +//// OP_REDUCTION, +//// OP_REGION, +// OP_RELU, +// +//// OP_RELU6, +//// OP_REORG, +//// OP_RESHAPE, +//// OP_RESIZE, +//// OP_REVERSE, +//// OP_RNN, +//// OP_ROIALIGN, +//// OP_ROIPOOLING, +//// OP_ROUND, +//// OP_RPN, +//// OP_SCALE, +//// OP_SELU, +//// OP_SHUFFLECHANNEL, +//// OP_SIGMOID, +// +//// OP_SLICE, +//// OP_SOFTMAX, +//// OP_SPACETOBATCHND, +//// OP_SPACETODEPTH, +//// OP_SPARSETODENSE, +//// OP_SPLIT, +//// OP_SQUAREDDIFFERENCE, +//// OP_SQUEEZE, +//// OP_STRIDED_SLICE, +//// OP_SWAP_AXIS, +//// OP_TANH, +//// OP_THRESHOLD, +//// OP_TOPKV2, +//// OP_TRANSPOSE, +//// OP_UNARY, +//// OP_UNSQUEEZE, +//// OP_UPSAMPLE, +//// OP_ZEROSLIKE, +//// OP_MISH, +//// OP_LOGSOFTMAX, +//// OP_RELU1, +//// OP_L2NORMALIZATION, +//// OP_L2POOL, +//// OP_TILE, +//// OP_SHAPE, +//// OP_SCATTER, +//// OP_WHERE, +//// OP_BUILTIN_LAST + + +}; diff --git a/source/device/vulkan/vulkan_option.cpp b/source/device/vulkan/vulkan_option.cpp new file mode 100644 index 000000000..d57440411 --- /dev/null +++ b/source/device/vulkan/vulkan_option.cpp @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. 
+ * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#include "vulkan_option.hpp" + +namespace TEngine { + +Option::Option() +{ + lightmode = true; + num_threads = 1; + elempack = 1; + blob_allocator = 0; + workspace_allocator = 0; + + blob_vkallocator = 0; + workspace_vkallocator = 0; + staging_vkallocator = 0; + + use_winograd_convolution = true; + use_sgemm_convolution = true; + use_int8_inference = true; + use_vulkan_compute = true; + + use_fp16_packed = true; + use_fp16_storage = true; + use_fp16_arithmetic = false; + use_int8_storage = false; + use_int8_arithmetic = false; + + use_packing_layout = true; + use_shader_pack8 = false; + use_image_storage = false; + use_bf16_storage = false; +} + +} // namespace TEngine diff --git a/source/device/vulkan/vulkan_option.hpp b/source/device/vulkan/vulkan_option.hpp new file mode 100644 index 000000000..ee026e1a2 --- /dev/null +++ b/source/device/vulkan/vulkan_option.hpp @@ -0,0 +1,128 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. 
You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#ifndef VULKAN_OPTION_HPP +#define VULKAN_OPTION_HPP + +namespace TEngine { + +class VkAllocator; + +class Allocator; +class Option +{ +public: + // default option + Option(); + +public: + // light mode + // intermediate blob will be recycled when enabled + // enabled by default + bool lightmode; + + // thread count + // default value is the one returned by get_cpu_count() + int num_threads; + + // Pack Layout 1/4/8 + int elempack; + + // blob memory allocator + Allocator* blob_allocator; + + // workspace memory allocator + Allocator* workspace_allocator; + + // blob memory allocator + VkAllocator* blob_vkallocator; + + // workspace memory allocator + VkAllocator* workspace_vkallocator; + + // staging memory allocator + VkAllocator* staging_vkallocator; + + // enable winograd convolution optimization + // improve convolution 3x3 stride1 performace, may consume more memory + // changes should be applied before loading network structure and weight + // enabled by default + bool use_winograd_convolution; + + // enable sgemm convolution optimization + // improve convolution 1x1 stride1 performace, may consume more memory + // changes should be applied before loading network structure and weight + // enabled by default + bool use_sgemm_convolution; + + // enable quantized int8 inference + // use low-precision int8 path for quantized model + // changes should be applied before loading network structure and weight + // enabled by default + bool use_int8_inference; + + // enable vulkan compute + bool use_vulkan_compute; + + // enable options for gpu inference + bool use_fp16_packed; + bool use_fp16_storage; + bool use_fp16_arithmetic; + bool use_int8_storage; + bool use_int8_arithmetic; + + // enable simd-friendly packed memory layout + // improve all operator performace on all arm devices, will consume more memory + // changes should be applied before loading network structure and weight + // enabled by default + bool use_packing_layout; + + bool use_shader_pack8; + + // turn on for adreno + bool use_image_storage; + + // enable bf16 data type for storage + // improve most operator performace on all arm devices, may consume more memory + bool use_bf16_storage; +}; + +} // namespace TEngine + +#endif // VULKAN_OPTION_HPP diff --git a/source/device/vulkan/vulkan_pipeline.cpp b/source/device/vulkan/vulkan_pipeline.cpp new file mode 100644 index 000000000..6935c76b5 --- /dev/null +++ b/source/device/vulkan/vulkan_pipeline.cpp @@ -0,0 +1,568 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#include "vulkan_pipeline.hpp" +#include "vulkan_gpu.hpp" + +#include "stdio.h" +#include +#include + +namespace TEngine { + +Pipeline::Pipeline(const GPUDevice* _vkdev) : vkdev(_vkdev) +{ + local_shader_module = 0; + + descriptorset_layout = 0; + pipeline_layout = 0; + pipeline = 0; + descriptor_update_template = 0; + + local_size_x = 1; + local_size_y = 1; + local_size_z = 1; +} + +Pipeline::~Pipeline() +{ + destroy(); +} + +int Pipeline::create(const uint32_t* spv_data, size_t spv_data_size, const std::vector& specializations) +{ + ShaderInfo si; + int ret = resolve_shader_info(spv_data, spv_data_size, si); + if (ret != 0) + { + printf("resolve_shader_info failed %d", ret); + return -1; + } + + // -3 for local_size_xyz + int specialization_count_expected = si.specialization_count - 3; + if ((int)specializations.size() != specialization_count_expected) + { + printf("pipeline specialization count mismatch, expect %d but got %d", specialization_count_expected, (int)specializations.size()); + return -1; + } + + if (vkdev->info.bug_local_size_spec_const) + { + local_shader_module = vkdev->compile_shader_module(spv_data, spv_data_size, local_size_x, local_size_y, local_size_z); + } + else + { + local_shader_module = vkdev->compile_shader_module(spv_data, spv_data_size); + } + +// TLOG_INFO("local_shader_module %p created", local_shader_module); + + return create(local_shader_module, si, specializations); +} + +int Pipeline::create(int shader_type_index, const Option& opt, const std::vector& specializations) +{ + // printf("run pipeline create, shader_type_index:%d, specialization size:%d\n", shader_type_index, specializations.size()); + // ncnn_add_shader cmake macro + // 0 = fp32 + // 1 = fp16p + // 2 = fp16pa + // 3 = fp16s + // 4 = fp16sa + // 5 = image + // 6 = image_fp16p + // 7 = image_fp16pa + // 8 = image_fp16s + // 9 = image_fp16sa + + if (opt.use_image_storage && vkdev->info.support_fp16_storage && opt.use_fp16_storage && vkdev->info.support_fp16_arithmetic && opt.use_fp16_arithmetic) + { + shader_type_index += 9; + } + else if (opt.use_image_storage && vkdev->info.support_fp16_packed && opt.use_fp16_packed && vkdev->info.support_fp16_arithmetic && opt.use_fp16_arithmetic) + { + shader_type_index += 7; + } + else if (opt.use_image_storage && vkdev->info.support_fp16_storage && opt.use_fp16_storage) + { + shader_type_index += 8; + } + else if (opt.use_image_storage && vkdev->info.support_fp16_packed && opt.use_fp16_packed) + { + shader_type_index += 6; + } + else if (opt.use_image_storage) + { + shader_type_index += 5; + } + else if (vkdev->info.support_fp16_storage && opt.use_fp16_storage && vkdev->info.support_fp16_arithmetic && opt.use_fp16_arithmetic) + { + shader_type_index += 4; + } + else if (vkdev->info.support_fp16_packed && opt.use_fp16_packed && vkdev->info.support_fp16_arithmetic && opt.use_fp16_arithmetic) + { + shader_type_index += 2; + } + else if 
(vkdev->info.support_fp16_storage && opt.use_fp16_storage) + { + shader_type_index += 3; + } + else if (vkdev->info.support_fp16_packed && opt.use_fp16_packed) + { + shader_type_index += 1; + } + + const ShaderInfo& si = get_shader_info(shader_type_index); + + // -3 for local_size_xyz + int specialization_count_expected = si.specialization_count - 3; + // int specialization_count_expected = si.specialization_count; + if ((int)specializations.size() != specialization_count_expected) + { + printf("pipeline %d specialization count mismatch, expect %d but got %d\n", shader_type_index, specialization_count_expected, (int)specializations.size()); + return -1; + } + + if (vkdev->info.bug_local_size_spec_const) + { + local_shader_module = vkdev->create_shader_module(shader_type_index, local_size_x, local_size_y, local_size_z); + + return create(local_shader_module, si, specializations); + } + + VkShaderModule shader_module = vkdev->get_shader_module(shader_type_index); + + return create(shader_module, si, specializations); +} + +int Pipeline::create(VkShaderModule shader_module, const ShaderInfo& _shader_info, const std::vector& specializations) +{ + shader_info = _shader_info; + + create_descriptorset_layout(); + + create_pipeline_layout(); + + create_pipeline(shader_module, specializations); + + if (vkdev->info.support_VK_KHR_descriptor_update_template) + { + create_descriptor_update_template(); + } + + return 0; +} + +void Pipeline::destroy() +{ + if (vkdev->info.support_VK_KHR_descriptor_update_template) + { + if (descriptor_update_template) + { + vkdev->vkDestroyDescriptorUpdateTemplateKHR(vkdev->vkdevice(), descriptor_update_template, 0); + descriptor_update_template = 0; + } + } + + if (pipeline) + { + vkDestroyPipeline(vkdev->vkdevice(), pipeline, 0); + pipeline = 0; + } + + if (pipeline_layout) + { + vkDestroyPipelineLayout(vkdev->vkdevice(), pipeline_layout, 0); + pipeline_layout = 0; + } + + if (descriptorset_layout) + { + vkDestroyDescriptorSetLayout(vkdev->vkdevice(), descriptorset_layout, 0); + descriptorset_layout = 0; + } + + if (local_shader_module) + { + vkDestroyShaderModule(vkdev->vkdevice(), local_shader_module, 0); + local_shader_module = 0; + } +} + +void Pipeline::set_optimal_local_size_xyz(int w, int h, int c) +{ + set_optimal_local_size_xyz(Tensor(w, h, c, (void*)0)); +} + +void Pipeline::set_optimal_local_size_xyz(const VkTensor& local_size_xyz) +{ + int w = local_size_xyz.w; + int h = local_size_xyz.h; + int c = local_size_xyz.c; + + if (w == 0 && h == 0 && c == 0) + { + // fallback to the common and safe 4x4x4 + w = 4; + h = 4; + c = 4; + } + + w = std::min(w, (int)vkdev->info.max_workgroup_size[0]); + h = std::min(h, (int)vkdev->info.max_workgroup_size[1]); + c = std::min(c, (int)vkdev->info.max_workgroup_size[2]); + + if (w * h * c <= (int)vkdev->info.max_workgroup_invocations) + { + return set_local_size_xyz(w, h, c); + } + + int max_local_size_xy = (int)vkdev->info.max_workgroup_invocations / c; + + int wh_max = std::max(1, (int)sqrt(max_local_size_xy)); + while (w * h >= wh_max) + { + w = std::max(1, w / 2); + h = std::max(1, h / 2); + } + + set_local_size_xyz(w, h, c); +} + +void Pipeline::set_optimal_local_size_xyz(const Tensor& local_size_xyz) +{ + int w = local_size_xyz.w; + int h = local_size_xyz.h; + int c = local_size_xyz.c; + + if (w == 0 && h == 0 && c == 0) + { + // fallback to the common and safe 4x4x4 + w = 4; + h = 4; + c = 4; + } + + w = std::min(w, (int)vkdev->info.max_workgroup_size[0]); + h = std::min(h, (int)vkdev->info.max_workgroup_size[1]); + 
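// c is clamped to the per-dimension device limit next; if w*h*c still exceeds
+ // max_workgroup_invocations, w and h are halved below until the product fits. +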
c = std::min(c, (int)vkdev->info.max_workgroup_size[2]); + + if (w * h * c <= (int)vkdev->info.max_workgroup_invocations) + { + return set_local_size_xyz(w, h, c); + } + + int max_local_size_xy = (int)vkdev->info.max_workgroup_invocations / c; + + int wh_max = std::max(1, (int)sqrt(max_local_size_xy)); + while (w * h >= wh_max) + { + w = std::max(1, w / 2); + h = std::max(1, h / 2); + } + + set_local_size_xyz(w, h, c); +} + +void Pipeline::set_local_size_xyz(int w, int h, int c) +{ + local_size_x = w; + local_size_y = h; + local_size_z = c; + +// TLOG_INFO("local size = %d %d %d", local_size_x, local_size_y, local_size_z); +} + +int Pipeline::create_descriptorset_layout() +{ + const int binding_count = shader_info.binding_count; + + if (binding_count == 0) + { + descriptorset_layout = 0; + return 0; + } + + std::vector descriptorSetLayoutBindings(binding_count); + for (int i=0; iimmutable_texelfetch_sampler();// we always use texelfetch + // } + } + + VkDescriptorSetLayoutCreateInfo descriptorSetLayoutCreateInfo; + descriptorSetLayoutCreateInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO; + descriptorSetLayoutCreateInfo.pNext = 0; + descriptorSetLayoutCreateInfo.flags = 0; + descriptorSetLayoutCreateInfo.bindingCount = binding_count; + descriptorSetLayoutCreateInfo.pBindings = descriptorSetLayoutBindings.data(); + + if (vkdev->info.support_VK_KHR_push_descriptor) + { + descriptorSetLayoutCreateInfo.flags |= VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR; + } + + VkResult ret = vkCreateDescriptorSetLayout(vkdev->vkdevice(), &descriptorSetLayoutCreateInfo, 0, &descriptorset_layout); + if (ret != VK_SUCCESS) + { + printf("vkCreateDescriptorSetLayout failed %d", ret); + return -1; + } + + return 0; +} + +int Pipeline::create_pipeline_layout() +{ + const int push_constant_count = shader_info.push_constant_count; + + VkPushConstantRange pushConstantRange; + pushConstantRange.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; + pushConstantRange.offset = 0; + pushConstantRange.size = sizeof(vk_constant_type) * push_constant_count; + + VkPipelineLayoutCreateInfo pipelineLayoutCreateInfo; + pipelineLayoutCreateInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; + pipelineLayoutCreateInfo.pNext = 0; + pipelineLayoutCreateInfo.flags = 0; + + if (descriptorset_layout) + { + pipelineLayoutCreateInfo.setLayoutCount = 1; + pipelineLayoutCreateInfo.pSetLayouts = &descriptorset_layout; + } + else + { + pipelineLayoutCreateInfo.setLayoutCount = 0; + pipelineLayoutCreateInfo.pSetLayouts = 0; + } + + if (push_constant_count > 0) + { + pipelineLayoutCreateInfo.pushConstantRangeCount = 1; + pipelineLayoutCreateInfo.pPushConstantRanges = &pushConstantRange; + } + else + { + pipelineLayoutCreateInfo.pushConstantRangeCount = 0; + pipelineLayoutCreateInfo.pPushConstantRanges = 0; + } + + VkResult ret = vkCreatePipelineLayout(vkdev->vkdevice(), &pipelineLayoutCreateInfo, 0, &pipeline_layout); + if (ret != VK_SUCCESS) + { + printf("vkCreatePipelineLayout failed %d", ret); + return -1; + } + + return 0; +} + + +int Pipeline::create_pipeline(VkShaderModule shader_module, const std::vector& specializations) +{ + const int specialization_count = specializations.size(); + + // +3 for local_size_xyz + std::vector specializationMapEntries; + specializationMapEntries.resize(specialization_count + 3); + + for (int i=0; i specialization_data = specializations; + + // append local_size_xyz specialization + if (!vkdev->info.bug_local_size_spec_const) + { + VkSpecializationMapEntry* local_size_xyz_entries 
= specializationMapEntries.data() + specialization_count; + + local_size_xyz_entries[0].constantID = 233; + local_size_xyz_entries[0].offset = (specialization_count+0) * sizeof(vk_specialization_type); + local_size_xyz_entries[0].size = sizeof(vk_specialization_type); + + local_size_xyz_entries[1].constantID = 234; + local_size_xyz_entries[1].offset = (specialization_count+1) * sizeof(vk_specialization_type); + local_size_xyz_entries[1].size = sizeof(vk_specialization_type); + + local_size_xyz_entries[2].constantID = 235; + local_size_xyz_entries[2].offset = (specialization_count+2) * sizeof(vk_specialization_type); + local_size_xyz_entries[2].size = sizeof(vk_specialization_type); + + specialization_data.resize(specialization_count + 3); + specialization_data[ specialization_count+0 ].u32 = local_size_x; + specialization_data[ specialization_count+1 ].u32 = local_size_y; + specialization_data[ specialization_count+2 ].u32 = local_size_z; + } + + VkSpecializationInfo specializationInfo; + specializationInfo.mapEntryCount = specializationMapEntries.size(); + specializationInfo.pMapEntries = specializationMapEntries.data(); + specializationInfo.dataSize = specialization_data.size() * sizeof(vk_specialization_type); + specializationInfo.pData = specialization_data.data(); + + VkPipelineShaderStageCreateInfo pipelineShaderStageCreateInfo; + pipelineShaderStageCreateInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO; + pipelineShaderStageCreateInfo.pNext = 0; + pipelineShaderStageCreateInfo.flags = 0; + pipelineShaderStageCreateInfo.stage = VK_SHADER_STAGE_COMPUTE_BIT; + pipelineShaderStageCreateInfo.module = shader_module; + pipelineShaderStageCreateInfo.pName = "main"; + pipelineShaderStageCreateInfo.pSpecializationInfo = &specializationInfo; + + VkComputePipelineCreateInfo computePipelineCreateInfo; + computePipelineCreateInfo.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO; + computePipelineCreateInfo.pNext = 0; + computePipelineCreateInfo.flags = 0; + computePipelineCreateInfo.stage = pipelineShaderStageCreateInfo; + computePipelineCreateInfo.layout = pipeline_layout; + computePipelineCreateInfo.basePipelineHandle = 0; + computePipelineCreateInfo.basePipelineIndex = 0; + + VkResult ret = vkCreateComputePipelines(vkdev->vkdevice(), 0, 1, &computePipelineCreateInfo, 0, &pipeline); + if (ret != VK_SUCCESS) + { + printf("vkCreateComputePipelines failed %d", ret); + return -1; + } + + return 0; +} + +int Pipeline::create_descriptor_update_template() +{ + const int binding_count = shader_info.binding_count; + + if (binding_count == 0) + { + descriptor_update_template = 0; + return 0; + } + + std::vector descriptorUpdateTemplateEntries(binding_count); + size_t offset = 0; + for (int i=0; iinfo.support_VK_KHR_push_descriptor) + { + descriptorUpdateTemplateCreateInfo.templateType = VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_PUSH_DESCRIPTORS_KHR; + } + else + { + descriptorUpdateTemplateCreateInfo.templateType = VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_DESCRIPTOR_SET_KHR; + } + // descriptorSetLayout should be ignored if VK_DESCRIPTOR_UPDATE_TEMPLATE_TYPE_PUSH_DESCRIPTORS_KHR + // FIXME HACK WARNING TODO NOTE but crash on radv if set NULL :( + descriptorUpdateTemplateCreateInfo.descriptorSetLayout = descriptorset_layout; + descriptorUpdateTemplateCreateInfo.pipelineBindPoint = VK_PIPELINE_BIND_POINT_COMPUTE; + descriptorUpdateTemplateCreateInfo.pipelineLayout = pipeline_layout; + descriptorUpdateTemplateCreateInfo.set = 0; + + VkResult ret = 
vkdev->vkCreateDescriptorUpdateTemplateKHR(vkdev->vkdevice(), &descriptorUpdateTemplateCreateInfo, 0, &descriptor_update_template); + if (ret != VK_SUCCESS) + { + printf("vkCreateDescriptorUpdateTemplateKHR failed %d", ret); + return -1; + } + + return 0; +} + + +} // namespace TEngine diff --git a/source/device/vulkan/vulkan_pipeline.hpp b/source/device/vulkan/vulkan_pipeline.hpp new file mode 100644 index 000000000..9980d2e43 --- /dev/null +++ b/source/device/vulkan/vulkan_pipeline.hpp @@ -0,0 +1,130 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#ifndef VULKAN_PIPELINE_HPP +#define VULKAN_PIPELINE_HPP + +#include +#include "vulkan_gpu.hpp" +#include "vulkan_tensor.hpp" +#include "vulkan_platform.hpp" +#include "vulkan_option.hpp" + +namespace TEngine { + +class Option; +class Pipeline +{ +public: + Pipeline(const GPUDevice* vkdev); + virtual ~Pipeline(); + +public: + void set_optimal_local_size_xyz(int w = 4, int h = 4, int c = 4); + + void set_optimal_local_size_xyz(const VkTensor& local_size_xyz); + void set_optimal_local_size_xyz(const Tensor& local_size_xyz); + void set_local_size_xyz(int w, int h, int c); + + int create(const uint32_t* spv_data, size_t spv_data_size, const std::vector& specializations); + + int create(int shader_type_index, const Option& opt, const std::vector& specializations); + + int create(VkShaderModule shader_module, const ShaderInfo& si, const std::vector& specializations); + + void destroy(); + +protected: + int create_descriptorset_layout(); + int create_pipeline_layout(); + int create_pipeline(VkShaderModule shader_module, const std::vector& specializations); + int create_descriptor_update_template(); + +public: + const GPUDevice* vkdev; + + // local shader module + VkShaderModule local_shader_module; + + VkDescriptorSetLayout descriptorset_layout; + VkPipelineLayout pipeline_layout; + + // op forward TODO use pipeline cache ? 
+ VkPipeline pipeline; + + VkDescriptorUpdateTemplateKHR descriptor_update_template; + + ShaderInfo shader_info; + + uint32_t local_size_x; + uint32_t local_size_y; + uint32_t local_size_z; +}; + +#if __ANDROID_API__ >= 26 +class VkCompute; +class ImportAndroidHardwareBufferPipeline : private Pipeline +{ +public: + ImportAndroidHardwareBufferPipeline(const GPUDevice* vkdev); + ~ImportAndroidHardwareBufferPipeline(); + + int create(VkAndroidHardwareBufferImageAllocator* ahb_im_allocator, int type_to, int rotate_from, const Option& opt); + int create(VkAndroidHardwareBufferImageAllocator* ahb_im_allocator, int type_to, int rotate_from, int target_width, int target_height, const Option& opt); + void destroy(); + + friend class VkCompute; + +protected: + int create_sampler(VkAndroidHardwareBufferImageAllocator* ahb_im_allocator); + int create_descriptorset_layout(); + int create_descriptor_update_template(); + +public: + int type_to; + int rotate_from; + bool need_resize; + + VkSampler sampler; +}; +#endif // __ANDROID_API__ >= 26 + +} // namespace TEngine + +#endif // VULKAN_PIPELINE_HPP diff --git a/source/device/vulkan/vulkan_platform.hpp b/source/device/vulkan/vulkan_platform.hpp new file mode 100644 index 000000000..cc03681a7 --- /dev/null +++ b/source/device/vulkan/vulkan_platform.hpp @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. 
You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#ifndef VULKAN_PLATFORM_HPP +#define VULKAN_PLATFORM_HPP + +#include + +namespace TEngine { + +class Mutex +{ +public: + Mutex() { pthread_mutex_init(&mutex, 0); } + ~Mutex() { pthread_mutex_destroy(&mutex); } + void lock() { pthread_mutex_lock(&mutex); } + void unlock() { pthread_mutex_unlock(&mutex); } +private: + friend class ConditionVariable; + pthread_mutex_t mutex; +}; + +class MutexLockGuard +{ +public: + MutexLockGuard(Mutex& _mutex) : mutex(_mutex) { mutex.lock(); } + ~MutexLockGuard() { mutex.unlock(); } +private: + Mutex& mutex; +}; + +class ConditionVariable +{ +public: + ConditionVariable() { pthread_cond_init(&cond, 0); } + ~ConditionVariable() { pthread_cond_destroy(&cond); } + void wait(Mutex& mutex) { pthread_cond_wait(&cond, &mutex.mutex); } + void broadcast() { pthread_cond_broadcast(&cond); } + void signal() { pthread_cond_signal(&cond); } +private: + pthread_cond_t cond; +}; + +class Thread +{ +public: + Thread(void* (*start)(void*), void* args = 0) { pthread_create(&t, 0, start, args); } + ~Thread() {} + void join() { pthread_join(t, 0); } +private: + pthread_t t; +}; + +} // namespace TEngine + +#endif \ No newline at end of file diff --git a/source/device/vulkan/vulkan_tensor.cpp b/source/device/vulkan/vulkan_tensor.cpp new file mode 100644 index 000000000..38f588502 --- /dev/null +++ b/source/device/vulkan/vulkan_tensor.cpp @@ -0,0 +1,374 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. 
You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#include "vulkan_tensor.hpp" + +namespace TEngine { + +void convert_packing(tensor* src, Tensor& dst, int elempack, const Option& opt) +{ + const Tensor _src = Tensor(src); + // printf("convert packing ir_tensor to Tensor : %d %d %d %d %d\n", _src.c, _src.h, _src.w, _src.elempack, _src.elemsize); +} + +void convert_packing(const Tensor& src, Tensor& dst, int _elempack, const Option& opt) +{ + int elempack = src.elempack; + int out_elempack = _elempack; + + if (elempack == out_elempack) + { + dst = src; + return; + } + + int w = src.w; + int h = src.h; + int channels = src.c; + int dims = src.dims; + size_t elemsize = src.elemsize; + + if (dims == 1) + { + if (out_elempack == 1) + { + dst = src; + dst.w = w * elempack; + dst.cstep = w * elempack; + dst.elemsize = elemsize / elempack; + dst.elempack = out_elempack; + return; + } + + int outw = (w * elempack + out_elempack - 1) / out_elempack; + size_t out_elemsize = elemsize / elempack * out_elempack; + + dst.create(outw, out_elemsize, out_elempack, opt.blob_allocator); + if (dst.empty()) + return; + + memcpy(dst.data, src.data, w * elemsize); + + return; + } + + if (dims == 2) + { + int outh = (h * elempack + out_elempack - 1) / out_elempack; + size_t out_elemsize = elemsize / elempack * out_elempack; + size_t lane_size = out_elemsize / out_elempack; + + dst.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator); + if (dst.empty()) + return; + + #pragma omp parallel for + for (int i = 0; i < outh; i++) + { + unsigned char* outptr = (unsigned char*)dst + i * w * out_elemsize; + + for (int j = 0; j < w; j++) + { + unsigned char* out_elem_ptr = outptr + j * out_elemsize; + + for (int k = 0; k < out_elempack; k++) + { + int srcy = (i * out_elempack + k) / elempack; + if (srcy >= h) + break; + + int srck = (i * out_elempack + k) % elempack; + + const unsigned char* ptr = (const unsigned char*)src + srcy * w * elemsize; + const unsigned char* elem_ptr = ptr + j * elemsize; + memcpy(out_elem_ptr + k * lane_size, elem_ptr + srck * lane_size, lane_size); + } + } + } + + return; + } + + if (dims == 3) + { + int outc = (channels * elempack + out_elempack - 1) / out_elempack; + size_t out_elemsize = elemsize / elempack * out_elempack; + size_t lane_size = out_elemsize / out_elempack; + + dst.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator); + if (dst.empty()) + return; + + #pragma omp parallel for + for (int q = 0; q < outc; q++) + { + Tensor out = dst.channel(q); + + for (int i = 0; i < h; i++) + { + unsigned char* outptr = (unsigned char*)out + i * w * out_elemsize; + + for (int j = 0; j < w; j++) + { + unsigned char* out_elem_ptr = outptr + j * out_elemsize; + + for (int k = 0; k < out_elempack; k++) + { + int srcq = (q * out_elempack + k) / elempack; + if (srcq >= channels) + break; + + int srck = (q * out_elempack + k) % elempack; + + const Tensor m = src.channel(srcq); + const unsigned char* ptr = (const unsigned char*)m + i * w * elemsize; + const unsigned char* elem_ptr = ptr + j * elemsize; + memcpy(out_elem_ptr + k * lane_size, elem_ptr + srck * lane_size, lane_size); + } + } + } + } + + return; + } +} + +unsigned short float32_to_float16(float value) +{ + // 1 : 8 : 23 + union + { + unsigned int u; + float f; + } tmp; + + tmp.f = value; + + // 1 : 8 : 23 + unsigned short sign = (tmp.u & 0x80000000) >> 31; + unsigned short exponent = 
(tmp.u & 0x7F800000) >> 23; + unsigned int significand = tmp.u & 0x7FFFFF; + + // TLOG_INFO("%d %d %d", sign, exponent, significand); + + // 1 : 5 : 10 + unsigned short fp16; + if (exponent == 0) + { + // zero or denormal, always underflow + fp16 = (sign << 15) | (0x00 << 10) | 0x00; + } + else if (exponent == 0xFF) + { + // infinity or NaN + fp16 = (sign << 15) | (0x1F << 10) | (significand ? 0x200 : 0x00); + } + else + { + // normalized + short newexp = exponent + (-127 + 15); + if (newexp >= 31) + { + // overflow, return infinity + fp16 = (sign << 15) | (0x1F << 10) | 0x00; + } + else if (newexp <= 0) + { + // underflow + if (newexp >= -10) + { + // denormal half-precision + unsigned short sig = (significand | 0x800000) >> (14 - newexp); + fp16 = (sign << 15) | (0x00 << 10) | sig; + } + else + { + // underflow + fp16 = (sign << 15) | (0x00 << 10) | 0x00; + } + } + else + { + fp16 = (sign << 15) | (newexp << 10) | (significand >> 13); + } + } + + return fp16; +} + +float float16_to_float32(unsigned short value) +{ + // 1 : 5 : 10 + unsigned short sign = (value & 0x8000) >> 15; + unsigned short exponent = (value & 0x7c00) >> 10; + unsigned short significand = value & 0x03FF; + + // TLOG_INFO("%d %d %d", sign, exponent, significand); + + // 1 : 8 : 23 + union + { + unsigned int u; + float f; + } tmp; + if (exponent == 0) + { + if (significand == 0) + { + // zero + tmp.u = (sign << 31); + } + else + { + // denormal + exponent = 0; + // find non-zero bit + while ((significand & 0x200) == 0) + { + significand <<= 1; + exponent++; + } + significand <<= 1; + significand &= 0x3FF; + tmp.u = (sign << 31) | ((-exponent + (-15 + 127)) << 23) | (significand << 13); + } + } + else if (exponent == 0x1F) + { + // infinity or NaN + tmp.u = (sign << 31) | (0xFF << 23) | (significand << 13); + } + else + { + // normalized + tmp.u = (sign << 31) | ((exponent + (-15 + 127)) << 23) | (significand << 13); + } + + return tmp.f; +} + +void cast_float32_to_float16(const Tensor& src, Tensor& dst, const Option& opt) +{ + // printf("function cast_float32_to_float16 not done, fix me\n!!!!!"); + + int w = src.w; + int h = src.h; + int channels = src.c; + int dims = src.dims; + size_t elemsize = src.elemsize; + int elempack = src.elempack; + + size_t out_elemsize = 2 * elempack; + + if (dims == 1) + { + dst.create(w, out_elemsize, elempack, opt.blob_allocator); + } + else if (dims == 2) + { + dst.create(w, h, out_elemsize, elempack, opt.blob_allocator); + } + else if (dims == 3) + { + dst.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator); + } + if (dst.empty()) + return ; + + int size = w * h * elempack; + + #pragma omp parallel for + for (int q = 0; q < channels; q++) + { + const float* ptr = src.channel(q); + unsigned short* outptr = dst.channel(q); + + for (int i = 0; i < size; i++) + { + outptr[i] = float32_to_float16(ptr[i]); + } + } + +} + +void cast_float16_to_float32(const Tensor& src, Tensor& dst, const Option& opt) +{ + // printf("function cast_float16_to_float32 not done, fix me\n!!!!!"); + + int w = src.w; + int h = src.h; + int channels = src.c; + int dims = src.dims; + size_t elemsize = src.elemsize; + int elempack = src.elempack; + + size_t out_elemsize = 4 * elempack; + + if (dims == 1) + { + dst.create(w, out_elemsize, elempack, opt.blob_allocator); + } + else if (dims == 2) + { + dst.create(w, h, out_elemsize, elempack, opt.blob_allocator); + } + else if (dims == 3) + { + dst.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator); + } + if (dst.empty()) + return ; + + int 
size = w * h * elempack; + + #pragma omp parallel for + for (int q = 0; q < channels; q++) + { + const unsigned short* ptr = src.channel(q); + float* outptr = dst.channel(q); + + for (int i = 0; i < size; i++) + { + outptr[i] = float16_to_float32(ptr[i]); + } + } + +} + +} // namespace TEngine diff --git a/source/device/vulkan/vulkan_tensor.hpp b/source/device/vulkan/vulkan_tensor.hpp new file mode 100644 index 000000000..a0ef5a9bd --- /dev/null +++ b/source/device/vulkan/vulkan_tensor.hpp @@ -0,0 +1,1817 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/* + * Parts of the following code in this file refs to + * https://github.com/Tencent/ncnn/tree/master/src/layer/vulkan/ + * Tencent is pleased to support the open source community by making ncnn + * available. + * + * Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the BSD 3-Clause License (the "License"); you may not use this + * file except in compliance with the License. You may obtain a copy of the + * License at + * + * https://opensource.org/licenses/BSD-3-Clause + */ + +/* + * Copyright (c) 2020, Open AI Lab + * Author: ddzhao@openailab.com + */ + +#ifndef VULKAN_TENSOR_HPP +#define VULKAN_TENSOR_HPP + +#include +#include +// #include "tengine_ir.h" + +extern "C" +{ +#include "graph/tensor.h" +#include "graph/node.h" +#include "graph/graph.h" +#include "graph/subgraph.h" +} + +#include +#include "vulkan_allocator.hpp" +#include "vulkan_option.hpp" + +namespace TEngine { + +class VkTensor; +class VkImageTensor; + +class Tshape +{ +public: + Tshape() + { + w = 0; + h = 0; + c = 0; + dims = 0; + } + Tshape(int _w, int _h, int _c) + { + w = _w; + h = _h; + c = _c; + dims = 3; + } + + int dims; + int w; + int h; + int c; + + size_t cstep; +}; + +class Tensor +{ +public: + // empty + Tensor(); + // vec + Tensor(int w, size_t elemsize = 4u, Allocator* allocator = 0); + // image + Tensor(int w, int h, size_t elemsize = 4u, Allocator* allocator = 0); + // dim + Tensor(int w, int h, int c, size_t elemsize = 4u, Allocator* allocator = 0); + // packed vec + Tensor(int w, size_t elemsize, int elempack, Allocator* allocator = 0); + // packed image + Tensor(int w, int h, size_t elemsize, int elempack, Allocator* allocator = 0); + // packed dim + Tensor(int w, int h, int c, size_t elemsize, int elempack, Allocator* allocator = 0); + // copy + Tensor(const Tensor& m); + // copy from ir_tensor + Tensor(struct tensor* m); + // external vec + Tensor(int w, void* data, size_t elemsize = 4u, Allocator* allocator = 0); + // external image + Tensor(int w, int h, void* data, size_t elemsize = 4u, Allocator* allocator = 0); + // external dim + Tensor(int w, int h, int c, void* data, size_t elemsize = 4u, Allocator* allocator = 0); + // external 
packed vec + Tensor(int w, void* data, size_t elemsize, int elempack, Allocator* allocator = 0); + // external packed image + Tensor(int w, int h, void* data, size_t elemsize, int elempack, Allocator* allocator = 0); + // external packed dim + Tensor(int w, int h, int c, void* data, size_t elemsize, int elempack, Allocator* allocator = 0); + // release + ~Tensor(); + // assign + Tensor& operator=(const Tensor& m); + + // reshape vec + Tensor reshape(int w, Allocator* allocator = 0) const; + // reshape image + Tensor reshape(int w, int h, Allocator* allocator = 0) const; + // reshape dim + Tensor reshape(int w, int h, int c, Allocator* allocator = 0) const; + // allocate vec + void create(int w, size_t elemsize = 4u, Allocator* allocator = 0); + // allocate image + void create(int w, int h, size_t elemsize = 4u, Allocator* allocator = 0); + // allocate dim + void create(int w, int h, int c, size_t elemsize = 4u, Allocator* allocator = 0); + // allocate packed vec + void create(int w, size_t elemsize, int elempack, Allocator* allocator = 0); + // allocate packed image + void create(int w, int h, size_t elemsize, int elempack, Allocator* allocator = 0); + // allocate packed dim + void create(int w, int h, int c, size_t elemsize, int elempack, Allocator* allocator = 0); + // allocate like + void create_like(const tensor* m, Allocator* allocator = 0); + // allocate like + void create_like(const Tensor& m, Allocator* allocator = 0); + // allocate like + void create_like(const VkTensor& m, Allocator* allocator = 0); + // allocate like + void create_like(const VkImageTensor& im, Allocator* allocator = 0); + // refcount++ + void addref(); + // refcount-- + void release(); + + bool empty() const; + size_t total() const; + + // shape only + Tensor shape() const; + + // data reference + Tensor channel(int c); + const Tensor channel(int c) const; + float* row(int y); + const float* row(int y) const; + + // access raw data + template operator T*(); + template operator const T*() const; + + // pointer to the data + void* data; + + // pointer to the reference counter + // when points to user-allocated data, the pointer is NULL + int* refcount; + + // element size in bytes + // 4 = float32/int32 + // 2 = float16 + // 1 = int8/uint8 + // 0 = empty + size_t elemsize; + + // packed count inside element + // c/1-h-w-1 h/1-w-1 w/1-1 scalar + // c/4-h-w-4 h/4-w-4 w/4-4 sse/neon + // c/8-h-w-8 h/8-w-8 w/8-8 avx/fp16 + int elempack; + + // the allocator + Allocator* allocator; + + // the dimension rank + int dims; + + int w; + int h; + int c; + + size_t cstep; +}; + + + +class VkTensor +{ +public: + // empty + VkTensor(); + // vec + VkTensor(int w, size_t elemsize, VkAllocator* allocator); + // image + VkTensor(int w, int h, size_t elemsize, VkAllocator* allocator); + // dim + VkTensor(int w, int h, int c, size_t elemsize, VkAllocator* allocator); + // packed vec + VkTensor(int w, size_t elemsize, int elempack, VkAllocator* allocator); + // packed image + VkTensor(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator); + // packed dim + VkTensor(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator); + // copy + VkTensor(const VkTensor& m); + // external vec + VkTensor(int w, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator); + // external image + VkTensor(int w, int h, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator); + // external dim + VkTensor(int w, int h, int c, VkBufferMemory* data, size_t elemsize, VkAllocator* allocator); + // external packed 
vec + VkTensor(int w, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator); + // external packed image + VkTensor(int w, int h, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator); + // external packed dim + VkTensor(int w, int h, int c, VkBufferMemory* data, size_t elemsize, int elempack, VkAllocator* allocator); + // release + ~VkTensor(); + // assign + VkTensor& operator=(const VkTensor& m); + // reshape vec + VkTensor reshape(int w, Allocator* allocator = 0) const; + // reshape image + VkTensor reshape(int w, int h, Allocator* allocator = 0) const; + // reshape dim + VkTensor reshape(int w, int h, int c, Allocator* allocator = 0) const; + // allocate vec + void create(int w, size_t elemsize, VkAllocator* allocator); + // allocate image + void create(int w, int h, size_t elemsize, VkAllocator* allocator); + // allocate dim + void create(int w, int h, int c, size_t elemsize, VkAllocator* allocator); + // allocate packed vec + void create(int w, size_t elemsize, int elempack, VkAllocator* allocator); + // allocate packed image + void create(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator); + // allocate packed dim + void create(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator); + // allocate like + void create_like(const Tensor& m, VkAllocator* allocator); + void create_like(const tensor* m, VkAllocator* allocator); + // allocate like + void create_like(const VkTensor& m, VkAllocator* allocator); + + // allocate vec + void create(struct tensor* tensor, VkAllocator* allocator); + + // staging buffer + void prepare_staging_buffer(); + void discard_staging_buffer(); + + // copy + // void upload(const Tensor& m); + // void download(Tensor& m) const; + + // mapped + void* mapped_ptr() const; + + // refcount++ + void addref(); + // refcount-- + void release(); + + bool empty() const; + size_t total() const; + + // shape only + // Mat shape() const; + + // low-level reference + VkBuffer buffer() const; + size_t buffer_offset() const; + size_t buffer_capacity() const; + + // device buffer + VkBufferMemory* data; + + // staging buffer + VkBufferMemory* staging_data; + + // pointer to the reference counter + // when points to user-allocated data, the pointer is NULL + int* refcount; + int* staging_refcount; + + // element size in bytes + // 4 = float32/int32 + // 2 = float16 + // 1 = int8/uint8 + // 0 = empty + size_t elemsize; + + // packed count inside element + // c/1-h-w-1 h/1-w-1 w/1-1 scalar + // c/4-h-w-4 h/4-w-4 w/4-4 sse/neon + // c/8-h-w-8 h/8-w-8 w/8-8 avx/fp16 + int elempack; + + // the allocator + VkAllocator* allocator; + VkAllocator* staging_allocator; + + // the dimension rank + int dims; + + int w; + int h; + int c; + + size_t cstep; +}; + +class VkImageTensor +{ +public: + // empty + VkImageTensor(); + // vec + VkImageTensor(int w, size_t elemsize, VkAllocator* allocator); + // image + VkImageTensor(int w, int h, size_t elemsize, VkAllocator* allocator); + // dim + VkImageTensor(int w, int h, int c, size_t elemsize, VkAllocator* allocator); + // packed vec + VkImageTensor(int w, size_t elemsize, int elempack, VkAllocator* allocator); + // packed image + VkImageTensor(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator); + // packed dim + VkImageTensor(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator); + // copy + VkImageTensor(const VkImageTensor& m); + // external vec + VkImageTensor(int w, VkImageMemory* data, size_t elemsize, VkAllocator* 
allocator); + // external image + VkImageTensor(int w, int h, VkImageMemory* data, size_t elemsize, VkAllocator* allocator); + // external dim + VkImageTensor(int w, int h, int c, VkImageMemory* data, size_t elemsize, VkAllocator* allocator); + // external packed vec + VkImageTensor(int w, VkImageMemory* data, size_t elemsize, int elempack, VkAllocator* allocator); + // external packed image + VkImageTensor(int w, int h, VkImageMemory* data, size_t elemsize, int elempack, VkAllocator* allocator); + // external packed dim + VkImageTensor(int w, int h, int c, VkImageMemory* data, size_t elemsize, int elempack, VkAllocator* allocator); + // release + ~VkImageTensor(); + // assign + VkImageTensor& operator=(const VkImageTensor& m); + // allocate vec + void create(int w, size_t elemsize, VkAllocator* allocator); + // allocate image + void create(int w, int h, size_t elemsize, VkAllocator* allocator); + // allocate dim + void create(int w, int h, int c, size_t elemsize, VkAllocator* allocator); + // allocate packed vec + void create(int w, size_t elemsize, int elempack, VkAllocator* allocator); + // allocate packed image + void create(int w, int h, size_t elemsize, int elempack, VkAllocator* allocator); + // allocate packed dim + void create(int w, int h, int c, size_t elemsize, int elempack, VkAllocator* allocator); + // allocate like + void create_like(const tensor* m, VkAllocator* allocator); + // allocate like + void create_like(const VkTensor& m, VkAllocator* allocator); + // allocate like + void create_like(const VkImageTensor& im, VkAllocator* allocator); + + + // mapped + ///Mat mapped() const; + void* mapped_ptr() const; + + // refcount++ + void addref(); + // refcount-- + void release(); + + bool empty() const; + size_t total() const; + + // shape only + ///Mat shape() const; + + // low-level reference + VkImage image() const; + VkImageView imageview() const; + +#if __ANDROID_API__ >= 26 + // convenient construct from android hardware buffer + static VkImageMat from_android_hardware_buffer(VkAndroidHardwareBufferImageAllocator* allocator); +#endif // __ANDROID_API__ >= 26 + + // device image + VkImageMemory* data; + + // pointer to the reference counter + // when points to user-allocated data, the pointer is NULL + + int* refcount; + + // element size in bytes + // 4 = float32/int32 + // 2 = float16 + // 1 = int8/uint8 + // 0 = empty + size_t elemsize; + + // packed count inside element + // c/1-h-w-1 h/1-w-1 w/1-1 scalar + // c/4-h-w-4 h/4-w-4 w/4-4 sse/neon + // c/8-h-w-8 h/8-w-8 w/8-8 avx/fp16 + int elempack; + + // the allocator + VkAllocator* allocator; + + // the dimension rank + int dims; + + int w; + int h; + int c; +}; + +inline VkTensor::VkTensor() + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0) +{ +} + +inline VkTensor::VkTensor(int _w, size_t _elemsize, VkAllocator* _allocator) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0) +{ + create(_w, _elemsize, _allocator); +} + +inline VkTensor::VkTensor(int _w, int _h, size_t _elemsize, VkAllocator* _allocator) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0) +{ + create(_w, _h, _elemsize, _allocator); +} + +inline VkTensor::VkTensor(int _w, int _h, int _c, size_t _elemsize, VkAllocator* _allocator) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0) +{ + create(_w, _h, _c, _elemsize, _allocator); +} + +inline 
VkTensor::VkTensor(int _w, size_t _elemsize, int _elempack, VkAllocator* _allocator) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0) +{ + create(_w, _elemsize, _elempack, _allocator); +} + +inline VkTensor::VkTensor(int _w, int _h, size_t _elemsize, int _elempack, VkAllocator* _allocator) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0) +{ + create(_w, _h, _elemsize, _elempack, _allocator); +} + +inline VkTensor::VkTensor(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0) +{ + create(_w, _h, _c, _elemsize, _elempack, _allocator); +} + +inline VkTensor::VkTensor(const VkTensor& m) + : data(m.data), refcount(m.refcount), elemsize(m.elemsize), elempack(m.elempack), allocator(m.allocator), dims(m.dims), w(m.w), h(m.h), c(m.c) +{ + if (refcount) + TENGINE_XADD(refcount, 1); + + cstep = m.cstep; +} + +inline VkTensor::VkTensor(int _w, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator) + : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(1), w(_w), h(1), c(1) +{ + cstep = w; +} + +inline VkTensor::VkTensor(int _w, int _h, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator) + : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(2), w(_w), h(_h), c(1) +{ + cstep = w * h; +} + +inline VkTensor::VkTensor(int _w, int _h, int _c, VkBufferMemory* _data, size_t _elemsize, VkAllocator* _allocator) + : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(3), w(_w), h(_h), c(_c) +{ + cstep = alignSize(w * h * elemsize, 16) / elemsize; +} + +inline VkTensor::VkTensor(int _w, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator) + : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(1), w(_w), h(1), c(1) +{ + cstep = w; +} + +inline VkTensor::VkTensor(int _w, int _h, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator) + : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(2), w(_w), h(_h), c(1) +{ + cstep = w * h; +} + +inline VkTensor::VkTensor(int _w, int _h, int _c, VkBufferMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator) + : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(3), w(_w), h(_h), c(_c) +{ + cstep = alignSize(w * h * elemsize, 16) / elemsize; +} + +inline VkTensor::~VkTensor() +{ + release(); +} + +inline VkTensor& VkTensor::operator=(const VkTensor& m) +{ + if (this == &m) + return *this; + + if (m.refcount) + TENGINE_XADD(m.refcount, 1); + + release(); + + data = m.data; + refcount = m.refcount; + elemsize = m.elemsize; + elempack = m.elempack; + allocator = m.allocator; + + dims = m.dims; + w = m.w; + h = m.h; + c = m.c; + + cstep = m.cstep; + + return *this; +} + +inline void VkTensor::create(int _w, size_t _elemsize, VkAllocator* _allocator) +{ + if (dims == 1 && w == _w && elemsize == _elemsize && elempack == 1 && allocator == _allocator) + return; + + release(); + + elemsize = _elemsize; + elempack = 1; + allocator = _allocator; + + dims = 1; + w = _w; + h = 1; + c = 1; + + cstep = w; + + if (total() > 0) + { + size_t totalsize = alignSize(total() * elemsize, 4); + + data = 
allocator->fastMalloc(totalsize); + + refcount = (int*)((unsigned char*)data + offsetof(VkBufferMemory, refcount)); + *refcount = 1; + } +} + +inline void VkTensor::create(int _w, int _h, size_t _elemsize, VkAllocator* _allocator) +{ + if (dims == 2 && w == _w && h == _h && elemsize == _elemsize && elempack == 1 && allocator == _allocator) + return; + + release(); + + elemsize = _elemsize; + elempack = 1; + allocator = _allocator; + + dims = 2; + w = _w; + h = _h; + c = 1; + + cstep = w * h; + + if (total() > 0) + { + size_t totalsize = alignSize(total() * elemsize, 4); + + data = allocator->fastMalloc(totalsize); + + refcount = (int*)((unsigned char*)data + offsetof(VkBufferMemory, refcount)); + *refcount = 1; + } +} + +inline void VkTensor::create(int _w, int _h, int _c, size_t _elemsize, VkAllocator* _allocator) +{ + if (dims == 3 && w == _w && h == _h && c == _c && elemsize == _elemsize && elempack == 1 && allocator == _allocator) + return; + + release(); + + elemsize = _elemsize; + elempack = 1; + allocator = _allocator; + + dims = 3; + w = _w; + h = _h; + c = _c; + + cstep = alignSize(w * h * elemsize, 16) / elemsize; + + if (total() > 0) + { + size_t totalsize = alignSize(total() * elemsize, 4); + + data = allocator->fastMalloc(totalsize); + + refcount = (int*)((unsigned char*)data + offsetof(VkBufferMemory, refcount)); + *refcount = 1; + } +} + +inline void VkTensor::create(int _w, size_t _elemsize, int _elempack, VkAllocator* _allocator) +{ + if (dims == 1 && w == _w && elemsize == _elemsize && elempack == _elempack && allocator == _allocator) + return; + + release(); + + elemsize = _elemsize; + elempack = _elempack; + allocator = _allocator; + + dims = 1; + w = _w; + h = 1; + c = 1; + + cstep = w; + + if (total() > 0) + { + size_t totalsize = alignSize(total() * elemsize, 4); + + data = allocator->fastMalloc(totalsize); + + refcount = (int*)((unsigned char*)data + offsetof(VkBufferMemory, refcount)); + *refcount = 1; + } +} + +inline void VkTensor::create(int _w, int _h, size_t _elemsize, int _elempack, VkAllocator* _allocator) +{ + if (dims == 2 && w == _w && h == _h && elemsize == _elemsize && elempack == _elempack && allocator == _allocator) + return; + + release(); + + elemsize = _elemsize; + elempack = _elempack; + allocator = _allocator; + + dims = 2; + w = _w; + h = _h; + c = 1; + + cstep = w * h; + + if (total() > 0) + { + size_t totalsize = alignSize(total() * elemsize, 4); + + data = allocator->fastMalloc(totalsize); + + refcount = (int*)((unsigned char*)data + offsetof(VkBufferMemory, refcount)); + *refcount = 1; + } +} + +inline void VkTensor::create(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator) +{ + if (dims == 3 && w == _w && h == _h && c == _c && elemsize == _elemsize && elempack == _elempack && allocator == _allocator) + return; + + release(); + + elemsize = _elemsize; + elempack = _elempack; + allocator = _allocator; + + dims = 3; + w = _w; + h = _h; + c = _c; + + // cstep = alignSize(w * h * elemsize, 16) / elemsize; + cstep = w * h; + + if (total() > 0) + { + size_t totalsize = alignSize(total() * elemsize, 4); + + data = allocator->fastMalloc(totalsize); + + refcount = (int*)((unsigned char*)data + offsetof(VkBufferMemory, refcount)); + *refcount = 1; + } +} + +inline void VkTensor::create_like(const tensor* m, VkAllocator* _allocator) +{ + int _c = m->dims[1]; + int _h = m->dims[2]; + int _w = m->dims[3]; + size_t _elemsize = m->data_type == 0 ? 
4 : 1; + int _elempack = 1; + + if (_c == 0 && _h == 0 && _w != 0) + create(_w, _elemsize, _elempack, _allocator); + if (_c == 0 && _h != 0 && _w != 0) + create(_w, _h, _elemsize, _elempack, _allocator); + if (_c != 0 && _h != 0 && _w != 0) + create(_w, _h, _c, _elemsize, _elempack, _allocator); +} + +inline void VkTensor::create_like(const Tensor& m, VkAllocator* _allocator) +{ + int _dims = m.dims; + if (_dims == 1) + create(m.w, m.elemsize, m.elempack, _allocator); + if (_dims == 2) + create(m.w, m.h, m.elemsize, m.elempack, _allocator); + if (_dims == 3) + create(m.w, m.h, m.c, m.elemsize, m.elempack, _allocator); +} + +inline void VkTensor::create_like(const VkTensor& m, VkAllocator* _allocator) +{ + int _dims = m.dims; + if (_dims == 1) + create(m.w, m.elemsize, m.elempack, _allocator); + if (_dims == 2) + create(m.w, m.h, m.elemsize, m.elempack, _allocator); + if (_dims == 3) + create(m.w, m.h, m.c, m.elemsize, m.elempack, _allocator); +} + +inline void* VkTensor::mapped_ptr() const +{ + if (!allocator->mappable) + return 0; + + return (unsigned char*)data->mapped_ptr + data->offset; +} + +inline void VkTensor::addref() +{ + if (refcount) + TENGINE_XADD(refcount, 1); +} + +inline void VkTensor::release() +{ + if (refcount && TENGINE_XADD(refcount, -1) == 1) + { + if (allocator && data) + { + allocator->fastFree(data); + } + } + + data = 0; + + elemsize = 0; + elempack = 0; + + dims = 0; + w = 0; + h = 0; + c = 0; + + cstep = 0; + + refcount = 0; +} + +inline bool VkTensor::empty() const +{ + return data == 0 || total() == 0; +} + +inline size_t VkTensor::total() const +{ + return cstep * c; +} + +// TODO +// inline Mat VkTensor::shape() const +// { +// if (dims == 1) +// return Mat(w * elempack, (void*)0); +// if (dims == 2) +// return Mat(w, h * elempack, (void*)0); +// if (dims == 3) +// return Mat(w, h, c * elempack, (void*)0); + +// return Mat(); +// } + +inline VkBuffer VkTensor::buffer() const +{ + return data->buffer; +} + +inline size_t VkTensor::buffer_offset() const +{ + return data->offset; +} + +inline size_t VkTensor::buffer_capacity() const +{ + return data->capacity; +} + +// VkImageTensor +inline VkImageTensor::VkImageTensor() + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0) +{ +} + +inline VkImageTensor::VkImageTensor(int _w, size_t _elemsize, VkAllocator* _allocator) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0) +{ + create(_w, _elemsize, _allocator); +} + +inline VkImageTensor::VkImageTensor(int _w, int _h, size_t _elemsize, VkAllocator* _allocator) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0) +{ + create(_w, _h, _elemsize, _allocator); +} + +inline VkImageTensor::VkImageTensor(int _w, int _h, int _c, size_t _elemsize, VkAllocator* _allocator) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0) +{ + create(_w, _h, _c, _elemsize, _allocator); +} + +inline VkImageTensor::VkImageTensor(int _w, size_t _elemsize, int _elempack, VkAllocator* _allocator) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0) +{ + create(_w, _elemsize, _elempack, _allocator); +} + +inline VkImageTensor::VkImageTensor(int _w, int _h, size_t _elemsize, int _elempack, VkAllocator* _allocator) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0) +{ + create(_w, _h, _elemsize, _elempack, _allocator); +} + +inline VkImageTensor::VkImageTensor(int 
_w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0) +{ + create(_w, _h, _c, _elemsize, _elempack, _allocator); +} + +inline VkImageTensor::VkImageTensor(const VkImageTensor& m) + : data(m.data), refcount(m.refcount), elemsize(m.elemsize), elempack(m.elempack), allocator(m.allocator), dims(m.dims), w(m.w), h(m.h), c(m.c) +{ + if (refcount) + TENGINE_XADD(refcount, 1); +} + +inline VkImageTensor::VkImageTensor(int _w, VkImageMemory* _data, size_t _elemsize, VkAllocator* _allocator) + : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(1), w(_w), h(1), c(1) +{ +} + +inline VkImageTensor::VkImageTensor(int _w, int _h, VkImageMemory* _data, size_t _elemsize, VkAllocator* _allocator) + : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(2), w(_w), h(_h), c(1) +{ +} + +inline VkImageTensor::VkImageTensor(int _w, int _h, int _c, VkImageMemory* _data, size_t _elemsize, VkAllocator* _allocator) + : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(3), w(_w), h(_h), c(_c) +{ +} + +inline VkImageTensor::VkImageTensor(int _w, VkImageMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator) + : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(1), w(_w), h(1), c(1) +{ +} + +inline VkImageTensor::VkImageTensor(int _w, int _h, VkImageMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator) + : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(2), w(_w), h(_h), c(1) +{ +} + +inline VkImageTensor::VkImageTensor(int _w, int _h, int _c, VkImageMemory* _data, size_t _elemsize, int _elempack, VkAllocator* _allocator) + : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(3), w(_w), h(_h), c(_c) +{ +} + +inline VkImageTensor::~VkImageTensor() +{ + release(); +} + +inline VkImageTensor& VkImageTensor::operator=(const VkImageTensor& m) +{ + if (this == &m) + return *this; + + if (m.refcount) + TENGINE_XADD(m.refcount, 1); + + release(); + + data = m.data; + refcount = m.refcount; + elemsize = m.elemsize; + elempack = m.elempack; + allocator = m.allocator; + + dims = m.dims; + w = m.w; + h = m.h; + c = m.c; + + return *this; +} + +inline void VkImageTensor::create(int _w, size_t _elemsize, VkAllocator* _allocator) +{ + if (dims == 1 && w == _w && elemsize == _elemsize && elempack == 1 && allocator == _allocator) + return; + + release(); + + elemsize = _elemsize; + elempack = 1; + allocator = _allocator; + + dims = 1; + w = _w; + h = 1; + c = 1; + + if (total() > 0) + { + data = allocator->fastMalloc(dims, w, h, c, elemsize, elempack); + if (!data) + return; + + refcount = (int*)((unsigned char*)data + offsetof(VkImageMemory, refcount)); + *refcount = 1; + } +} + +inline void VkImageTensor::create(int _w, int _h, size_t _elemsize, VkAllocator* _allocator) +{ + if (dims == 2 && w == _w && h == _h && elemsize == _elemsize && elempack == 1 && allocator == _allocator) + return; + + release(); + + elemsize = _elemsize; + elempack = 1; + allocator = _allocator; + + dims = 2; + w = _w; + h = _h; + c = 1; + + if (total() > 0) + { + data = allocator->fastMalloc(dims, w, h, c, elemsize, elempack); + if (!data) + return; + + refcount = (int*)((unsigned char*)data + offsetof(VkImageMemory, refcount)); + *refcount = 1; + } +} + +inline 
void VkImageTensor::create(int _w, int _h, int _c, size_t _elemsize, VkAllocator* _allocator) +{ + if (dims == 3 && w == _w && h == _h && c == _c && elemsize == _elemsize && elempack == 1 && allocator == _allocator) + return; + + release(); + + elemsize = _elemsize; + elempack = 1; + allocator = _allocator; + + dims = 3; + w = _w; + h = _h; + c = _c; + + if (total() > 0) + { + data = allocator->fastMalloc(dims, w, h, c, elemsize, elempack); + if (!data) + return; + + refcount = (int*)((unsigned char*)data + offsetof(VkImageMemory, refcount)); + *refcount = 1; + } +} + +inline void VkImageTensor::create(int _w, size_t _elemsize, int _elempack, VkAllocator* _allocator) +{ + if (dims == 1 && w == _w && elemsize == _elemsize && elempack == _elempack && allocator == _allocator) + return; + + release(); + + elemsize = _elemsize; + elempack = _elempack; + allocator = _allocator; + + dims = 1; + w = _w; + h = 1; + c = 1; + + if (total() > 0) + { + data = allocator->fastMalloc(dims, w, h, c, elemsize, elempack); + if (!data) + return; + + refcount = (int*)((unsigned char*)data + offsetof(VkImageMemory, refcount)); + *refcount = 1; + } +} + +inline void VkImageTensor::create(int _w, int _h, size_t _elemsize, int _elempack, VkAllocator* _allocator) +{ + if (dims == 2 && w == _w && h == _h && elemsize == _elemsize && elempack == _elempack && allocator == _allocator) + return; + + release(); + + elemsize = _elemsize; + elempack = _elempack; + allocator = _allocator; + + dims = 2; + w = _w; + h = _h; + c = 1; + + if (total() > 0) + { + data = allocator->fastMalloc(dims, w, h, c, elemsize, elempack); + if (!data) + return; + + refcount = (int*)((unsigned char*)data + offsetof(VkImageMemory, refcount)); + *refcount = 1; + } +} + +inline void VkImageTensor::create(int _w, int _h, int _c, size_t _elemsize, int _elempack, VkAllocator* _allocator) +{ + if (dims == 3 && w == _w && h == _h && c == _c && elemsize == _elemsize && elempack == _elempack && allocator == _allocator) + return; + + release(); + + elemsize = _elemsize; + elempack = _elempack; + allocator = _allocator; + + dims = 3; + w = _w; + h = _h; + c = _c; + + if (total() > 0) + { + data = allocator->fastMalloc(dims, w, h, c, elemsize, elempack); + if (!data) + return; + + refcount = (int*)((unsigned char*)data + offsetof(VkImageMemory, refcount)); + *refcount = 1; + } +} + +inline void VkImageTensor::create_like(const tensor* m, VkAllocator* _allocator) +{ + int _c = m->dims[1]; + int _h = m->dims[2]; + int _w = m->dims[3]; + size_t _elemsize = m->data_type == 0 ? 
4 : 1; + int _elempack = 1; + int _dims = m->dim_num; + + if (_dims == 1) + create(_w, _elemsize, _elempack, _allocator); + if (_dims == 2) + create(_w, _h, _elemsize, _elempack, _allocator); + if (_dims == 3) + create(_w, _h, _c, _elemsize, _elempack, _allocator); +} + + +inline void VkImageTensor::create_like(const VkTensor& m, VkAllocator* _allocator) +{ + int _dims = m.dims; + if (_dims == 1) + create(m.w, m.elemsize, m.elempack, _allocator); + if (_dims == 2) + create(m.w, m.h, m.elemsize, m.elempack, _allocator); + if (_dims == 3) + create(m.w, m.h, m.c, m.elemsize, m.elempack, _allocator); +} + +inline void VkImageTensor::create_like(const VkImageTensor& im, VkAllocator* _allocator) +{ + int _dims = im.dims; + if (_dims == 1) + create(im.w, im.elemsize, im.elempack, _allocator); + if (_dims == 2) + create(im.w, im.h, im.elemsize, im.elempack, _allocator); + if (_dims == 3) + create(im.w, im.h, im.c, im.elemsize, im.elempack, _allocator); +} + +// inline Mat VkImageMat::mapped() const +// { +// if (!allocator->mappable || !data->mapped_ptr) +// return Mat(); + +// if (dims == 1) +// return Mat(w, mapped_ptr(), elemsize, elempack, 0); + +// if (dims == 2) +// return Mat(w, h, mapped_ptr(), elemsize, elempack, 0); + +// if (dims == 3) +// return Mat(w, h, c, mapped_ptr(), elemsize, elempack, 0); + +// return Mat(); +// } + +inline void* VkImageTensor::mapped_ptr() const +{ + if (!allocator->mappable || !data->mapped_ptr) + return 0; + + return (unsigned char*)data->mapped_ptr + data->bind_offset; +} + +inline void VkImageTensor::addref() +{ + if (refcount) + TENGINE_XADD(refcount, 1); +} + +inline void VkImageTensor::release() +{ + if (refcount && TENGINE_XADD(refcount, -1) == 1) + { + if (allocator && data) + { + allocator->fastFree(data); + } + } + + data = 0; + + elemsize = 0; + elempack = 0; + + dims = 0; + w = 0; + h = 0; + c = 0; + + refcount = 0; +} + +inline bool VkImageTensor::empty() const +{ + return data == 0 || total() == 0; +} + +inline size_t VkImageTensor::total() const +{ + return w * h * c; +} + +// inline Mat VkImageTensor::shape() const +// { +// if (dims == 1) +// return Mat(w * elempack, (void*)0); +// if (dims == 2) +// return Mat(w, h * elempack, (void*)0); +// if (dims == 3) +// return Mat(w, h, c * elempack, (void*)0); + +// return Mat(); +// } + +inline VkImage VkImageTensor::image() const +{ + return data->image; +} + +inline VkImageView VkImageTensor::imageview() const +{ + return data->imageview; +} + + +///////////////////////////////////////////////////////////////////////////////////////////////////////////// +//Tensor defination + +inline Tensor::Tensor() + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0) +{ +} + +inline Tensor::Tensor(int _w, size_t _elemsize, Allocator* _allocator) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0) +{ + create(_w, _elemsize, _allocator); +} + +inline Tensor::Tensor(int _w, int _h, size_t _elemsize, Allocator* _allocator) : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0){ + create(_w, _h, _elemsize, _allocator);} +inline Tensor::Tensor(int _w, int _h, int _c, size_t _elemsize, Allocator* _allocator) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0) +{ + create(_w, _h, _c, _elemsize, _allocator); +} + +inline Tensor::Tensor(int _w, size_t _elemsize, int _elempack, Allocator* _allocator) + : data(0), refcount(0), 
elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0) +{ + create(_w, _elemsize, _elempack, _allocator); +} + +inline Tensor::Tensor(int _w, int _h, size_t _elemsize, int _elempack, Allocator* _allocator) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0) +{ + create(_w, _h, _elemsize, _elempack, _allocator); +} + +inline Tensor::Tensor(int _w, int _h, int _c, size_t _elemsize, int _elempack, Allocator* _allocator) + : data(0), refcount(0), elemsize(0), elempack(0), allocator(0), dims(0), w(0), h(0), c(0), cstep(0) +{ + create(_w, _h, _c, _elemsize, _elempack, _allocator); +} + +inline Tensor::Tensor(const Tensor& m) + : data(m.data), refcount(m.refcount), elemsize(m.elemsize), elempack(m.elempack), allocator(m.allocator), dims(m.dims), w(m.w), h(m.h), c(m.c), cstep(m.cstep) +{ + if (refcount) + TENGINE_XADD(refcount, 1); +} + +inline Tensor::Tensor(struct tensor* m) + : data(m->data), refcount(0), elemsize(0), elempack(1), allocator(0), dims(0), w(0), h(0), c(0) +{ + if(m->layout == 0) + { + c = m->dims[1]; + h = m->dims[2]; + w = m->dims[3]; + elemsize = m->elem_size; + elempack = 1; + dims = 3; + cstep = w * h; + } + else + { + c = m->dims[3]; + h = m->dims[2]; + w = m->dims[1]; + elemsize = m->elem_size; + elempack = 1; + dims = 3; + cstep = w * h; + } +} +inline Tensor::Tensor(int _w, void* _data, size_t _elemsize, Allocator* _allocator) + : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(1), w(_w), h(1), c(1) +{ + cstep = w; +} + +inline Tensor::Tensor(int _w, int _h, void* _data, size_t _elemsize, Allocator* _allocator) + : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(2), w(_w), h(_h), c(1) +{ + cstep = w * h; +} + +inline Tensor::Tensor(int _w, int _h, int _c, void* _data, size_t _elemsize, Allocator* _allocator) + : data(_data), refcount(0), elemsize(_elemsize), elempack(1), allocator(_allocator), dims(3), w(_w), h(_h), c(_c) +{ + cstep = alignSize(w * h * elemsize, 16) / elemsize; +} + +inline Tensor::Tensor(int _w, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator) + : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(1), w(_w), h(1), c(1) +{ + cstep = w; +} + +inline Tensor::Tensor(int _w, int _h, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator) + : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(2), w(_w), h(_h), c(1) +{ + cstep = w * h; +} + +inline Tensor::Tensor(int _w, int _h, int _c, void* _data, size_t _elemsize, int _elempack, Allocator* _allocator) + : data(_data), refcount(0), elemsize(_elemsize), elempack(_elempack), allocator(_allocator), dims(3), w(_w), h(_h), c(_c) +{ + cstep = alignSize(w * h * elemsize, 16) / elemsize; +} + +inline Tensor::~Tensor() +{ + release(); +} + +inline Tensor& Tensor::operator=(const Tensor& m) +{ + if (this == &m) + return *this; + + if (m.refcount) + TENGINE_XADD(m.refcount, 1); + + release(); + + data = m.data; + refcount = m.refcount; + elemsize = m.elemsize; + elempack = m.elempack; + allocator = m.allocator; + + dims = m.dims; + w = m.w; + h = m.h; + c = m.c; + + cstep = m.cstep; + + return *this; +} + +inline Tensor Tensor::reshape(int _w, Allocator* _allocator) const +{ + if (w * h * c != _w) + return Tensor(); + + if (dims == 3 && cstep != (size_t)w * h) + { + Tensor m; + m.create(_w, elemsize, elempack, _allocator); + + // flatten + for (int 
+        for (int i = 0; i < c; i++)
+        {
+            const void* ptr = (unsigned char*)data + i * cstep * elemsize;
+            void* mptr = (unsigned char*)m.data + i * w * h * elemsize;
+            memcpy(mptr, ptr, w * h * elemsize);
+        }
+
+        return m;
+    }
+
+    Tensor m = *this;
+
+    m.dims = 1;
+    m.w = _w;
+    m.h = 1;
+    m.c = 1;
+
+    m.cstep = _w;
+
+    return m;
+}
+
+inline void Tensor::create(int _w, size_t _elemsize, Allocator* _allocator)
+{
+    if (dims == 1 && w == _w && elemsize == _elemsize && elempack == 1 && allocator == _allocator)
+        return;
+
+    release();
+
+    elemsize = _elemsize;
+    elempack = 1;
+    allocator = _allocator;
+
+    dims = 1;
+    w = _w;
+    h = 1;
+    c = 1;
+
+    cstep = w;
+
+    if (total() > 0)
+    {
+        size_t totalsize = alignSize(total() * elemsize, 4);
+        if (allocator)
+            data = allocator->fastMalloc(totalsize + (int)sizeof(*refcount));
+        else
+            data = fastMalloc(totalsize + (int)sizeof(*refcount));
+        refcount = (int*)(((unsigned char*)data) + totalsize);
+        *refcount = 1;
+    }
+}
+
+inline void Tensor::create(int _w, int _h, size_t _elemsize, Allocator* _allocator)
+{
+    if (dims == 2 && w == _w && h == _h && elemsize == _elemsize && elempack == 1 && allocator == _allocator)
+        return;
+
+    release();
+
+    elemsize = _elemsize;
+    elempack = 1;
+    allocator = _allocator;
+
+    dims = 2;
+    w = _w;
+    h = _h;
+    c = 1;
+
+    cstep = w * h;
+
+    if (total() > 0)
+    {
+        size_t totalsize = alignSize(total() * elemsize, 4);
+        if (allocator)
+            data = allocator->fastMalloc(totalsize + (int)sizeof(*refcount));
+        else
+            data = fastMalloc(totalsize + (int)sizeof(*refcount));
+        refcount = (int*)(((unsigned char*)data) + totalsize);
+        *refcount = 1;
+    }
+}
+
+inline void Tensor::create(int _w, int _h, int _c, size_t _elemsize, Allocator* _allocator)
+{
+    if (dims == 3 && w == _w && h == _h && c == _c && elemsize == _elemsize && elempack == 1 && allocator == _allocator)
+        return;
+
+    release();
+
+    elemsize = _elemsize;
+    elempack = 1;
+    allocator = _allocator;
+
+    dims = 3;
+    w = _w;
+    h = _h;
+    c = _c;
+
+    cstep = alignSize(w * h * elemsize, 16) / elemsize;
+
+    if (total() > 0)
+    {
+        size_t totalsize = alignSize(total() * elemsize, 4);
+        if (allocator)
+            data = allocator->fastMalloc(totalsize + (int)sizeof(*refcount));
+        else
+            data = fastMalloc(totalsize + (int)sizeof(*refcount));
+        refcount = (int*)(((unsigned char*)data) + totalsize);
+        *refcount = 1;
+    }
+}
+
+inline void Tensor::create(int _w, size_t _elemsize, int _elempack, Allocator* _allocator)
+{
+    if (dims == 1 && w == _w && elemsize == _elemsize && elempack == _elempack && allocator == _allocator)
+        return;
+
+    release();
+
+    elemsize = _elemsize;
+    elempack = _elempack;
+    allocator = _allocator;
+
+    dims = 1;
+    w = _w;
+    h = 1;
+    c = 1;
+
+    cstep = w;
+
+    if (total() > 0)
+    {
+        size_t totalsize = alignSize(total() * elemsize, 4);
+        if (allocator)
+            data = allocator->fastMalloc(totalsize + (int)sizeof(*refcount));
+        else
+            data = fastMalloc(totalsize + (int)sizeof(*refcount));
+        refcount = (int*)(((unsigned char*)data) + totalsize);
+        *refcount = 1;
+    }
+}
+
+inline void Tensor::create(int _w, int _h, size_t _elemsize, int _elempack, Allocator* _allocator)
+{
+    if (dims == 2 && w == _w && h == _h && elemsize == _elemsize && elempack == _elempack && allocator == _allocator)
+        return;
+
+    release();
+
+    elemsize = _elemsize;
+    elempack = _elempack;
+    allocator = _allocator;
+
+    dims = 2;
+    w = _w;
+    h = _h;
+    c = 1;
+
+    cstep = w * h;
+
+    if (total() > 0)
+    {
+        size_t totalsize = alignSize(total() * elemsize, 4);
+        if (allocator)
+            data = allocator->fastMalloc(totalsize + (int)sizeof(*refcount));
+        else
+            data = fastMalloc(totalsize + (int)sizeof(*refcount));
+        refcount = (int*)(((unsigned char*)data) + totalsize);
+        *refcount = 1;
+    }
+}
+
+inline void Tensor::create(int _w, int _h, int _c, size_t _elemsize, int _elempack, Allocator* _allocator)
+{
+    if (dims == 3 && w == _w && h == _h && c == _c && elemsize == _elemsize && elempack == _elempack && allocator == _allocator)
+        return;
+
+    release();
+
+    elemsize = _elemsize;
+    elempack = _elempack;
+    allocator = _allocator;
+
+    dims = 3;
+    w = _w;
+    h = _h;
+    c = _c;
+
+    cstep = w * h; //alignSize(w * h * elemsize, 16) / elemsize;
+
+    if (total() > 0)
+    {
+        size_t totalsize = alignSize(total() * elemsize, 4);
+        if (allocator)
+            data = allocator->fastMalloc(totalsize + (int)sizeof(*refcount));
+        else
+            data = fastMalloc(totalsize + (int)sizeof(*refcount));
+        refcount = (int*)(((unsigned char*)data) + totalsize);
+        *refcount = 1;
+    }
+}
+
+// inline void Tensor::create_like(const tensor* m, Allocator* _allocator)
+// {
+//     int _dims = m.dims;
+//     if (_dims == 1)
+//         create(m.w, m.elemsize, m.elempack, _allocator);
+//     if (_dims == 2)
+//         create(m.w, m.h, m.elemsize, m.elempack, _allocator);
+//     if (_dims == 3)
+//         create(m.w, m.h, m.c, m.elemsize, m.elempack, _allocator);
+// }
+
+inline void Tensor::create_like(const Tensor& m, Allocator* _allocator)
+{
+    int _dims = m.dims;
+    if (_dims == 1)
+        create(m.w, m.elemsize, m.elempack, _allocator);
+    if (_dims == 2)
+        create(m.w, m.h, m.elemsize, m.elempack, _allocator);
+    if (_dims == 3)
+        create(m.w, m.h, m.c, m.elemsize, m.elempack, _allocator);
+}
+
+inline void Tensor::create_like(const VkTensor& m, Allocator* _allocator)
+{
+    int _dims = m.dims;
+    if (_dims == 1)
+        create(m.w, m.elemsize, m.elempack, _allocator);
+    if (_dims == 2)
+        create(m.w, m.h, m.elemsize, m.elempack, _allocator);
+    if (_dims == 3)
+        create(m.w, m.h, m.c, m.elemsize, m.elempack, _allocator);
+}
+
+inline void Tensor::create_like(const VkImageTensor& im, Allocator* _allocator)
+{
+    int _dims = im.dims;
+    if (_dims == 1)
+        create(im.w, im.elemsize, im.elempack, _allocator);
+    if (_dims == 2)
+        create(im.w, im.h, im.elemsize, im.elempack, _allocator);
+    if (_dims == 3)
+        create(im.w, im.h, im.c, im.elemsize, im.elempack, _allocator);
+}
+
+inline void Tensor::addref()
+{
+    if (refcount)
+        TENGINE_XADD(refcount, 1);
+}
+
+inline void Tensor::release()
+{
+    if (refcount && TENGINE_XADD(refcount, -1) == 1)
+    {
+        if (allocator)
+            allocator->fastFree(data);
+        else
+            fastFree(data);
+    }
+
+    data = 0;
+
+    elemsize = 0;
+    elempack = 0;
+
+    dims = 0;
+    w = 0;
+    h = 0;
+    c = 0;
+
+    cstep = 0;
+
+    refcount = 0;
+}
+
+inline bool Tensor::empty() const
+{
+    return data == 0 || total() == 0;
+}
+
+inline size_t Tensor::total() const
+{
+    return cstep * c;
+}
+
+inline Tensor Tensor::shape() const
+{
+    if (dims == 1)
+        return Tensor(w * elempack, (void*)0);
+    if (dims == 2)
+        return Tensor(w, h * elempack, (void*)0);
+    if (dims == 3)
+        return Tensor(w, h, c * elempack, (void*)0);
+
+    return Tensor();
+}
+
+inline Tensor Tensor::channel(int _c)
+{
+    return Tensor(w, h, (unsigned char*)data + cstep * _c * elemsize, elemsize, elempack, allocator);
+}
+
+inline const Tensor Tensor::channel(int _c) const
+{
+    return Tensor(w, h, (unsigned char*)data + cstep * _c * elemsize, elemsize, elempack, allocator);
+}
+
+inline float* Tensor::row(int y)
+{
+    return (float*)((unsigned char*)data + w * y * elemsize);
+}
+
+inline const float* Tensor::row(int y) const
+{
+    return (const float*)((unsigned char*)data + w * y * elemsize);
+}
+
+template<typename T>
+inline Tensor::operator T*()
+{
+    return (T*)data;
+}
+
+template<typename T>
+inline Tensor::operator const T*() const
+{
+    return (const T*)data;
+}
+
+void convert_packing(const Tensor& src, Tensor& dst, int elempack, const Option& opt = Option());
+void convert_packing(tensor* src, Tensor& dst, int elempack, const Option& opt = Option());
+void cast_float32_to_float16(const Tensor& src, Tensor& dst, const Option& opt = Option());
+void cast_float16_to_float32(const Tensor& src, Tensor& dst, const Option& opt = Option());
+
+
+} // namespace TEngine
+
+
+#endif // VULKAN_TENSOR_HPP
+
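A minimal host-side usage sketch of the Tensor API defined above, for reviewers; this note and snippet are not part of the patch. The header name is assumed from the include guard, and it presumes the header's own dependencies (Vulkan headers, allocator and option definitions) are on the include path.

    // Illustrative sketch only: create a float tensor, fill one channel through the
    // channel()/row() accessors, and let the intrusive refcount manage the buffer.
    // Passing a null Allocator makes create() fall back to fastMalloc()/fastFree(),
    // as in the code above.
    #include "vulkan_tensor.hpp" // assumed file name, matching VULKAN_TENSOR_HPP

    using namespace TEngine;

    int main()
    {
        Tensor t(4, 4, 3, sizeof(float), (Allocator*)0); // w=4, h=4, c=3, elemsize=4

        Tensor ch = t.channel(1);        // non-owning view into channel 1
        for (int y = 0; y < t.h; y++)
        {
            float* p = ch.row(y);        // row y = w consecutive floats
            for (int x = 0; x < t.w; x++)
                p[x] = 1.f;
        }

        Tensor t2 = t;                   // shallow copy; refcount becomes 2
        t2.release();                    // buffer survives, t still owns it

        return 0;                        // ~Tensor() releases the last reference
    }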