From 7086c360d9603a5fb9643ced11bc620e8cd17aa9 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Tue, 21 Jan 2025 22:18:38 +1100 Subject: [PATCH 01/20] Generic NLT code is added. Testing is needed. --- src/core/codestream/ojph_tile.cpp | 96 ++++++++++------ src/core/common/ojph_arch.h | 12 ++ src/core/common/ojph_mem.h | 4 +- src/core/transform/ojph_colour.cpp | 145 ++++++++++++++++++++++++- src/core/transform/ojph_colour.h | 10 ++ src/core/transform/ojph_colour_local.h | 10 ++ 6 files changed, 239 insertions(+), 38 deletions(-) diff --git a/src/core/codestream/ojph_tile.cpp b/src/core/codestream/ojph_tile.cpp index 9cec729c..67fed0bd 100644 --- a/src/core/codestream/ojph_tile.cpp +++ b/src/core/codestream/ojph_tile.cpp @@ -273,13 +273,18 @@ namespace ojph { } else { - float mul = 1.0f / (float)(1<i32 + line_offsets[comp_num]; - float *dp = tc->f32; - if (is_signed[comp_num]) - cnvrt_si32_to_float(sp, dp, mul, comp_width); - else - cnvrt_si32_to_float_shftd(sp, dp, mul, comp_width); + if (nlt_type3[comp_num] == type3) + irv_convert_to_float_nlt_type3(line, line_offsets[comp_num], + tc, num_bits[comp_num], is_signed[comp_num], comp_width); + else { + float mul = 1.0f / (float)(1<i32 + line_offsets[comp_num]; + float *dp = tc->f32; + if (is_signed[comp_num]) + cnvrt_si32_to_float(sp, dp, mul, comp_width); + else + cnvrt_si32_to_float_shftd(sp, dp, mul, comp_width); + } } comps[comp_num].push_line(); } @@ -311,13 +316,19 @@ namespace ojph { } else { - float mul = 1.0f / (float)(1<i32 + line_offsets[comp_num]; - float *dp = lines[comp_num].f32; - if (is_signed[comp_num]) - cnvrt_si32_to_float(sp, dp, mul, comp_width); - else - cnvrt_si32_to_float_shftd(sp, dp, mul, comp_width); + if (nlt_type3[comp_num] == type3) + irv_convert_to_float_nlt_type3(line, line_offsets[comp_num], + lines + comp_num, num_bits[comp_num], is_signed[comp_num], + comp_width); + else { + float mul = 1.0f / (float)(1<i32 + line_offsets[comp_num]; + float *dp = lines[comp_num].f32; + if (is_signed[comp_num]) + cnvrt_si32_to_float(sp, dp, mul, comp_width); + else + cnvrt_si32_to_float_shftd(sp, dp, mul, comp_width); + } if (comp_num == 2) { // irreversible color transform ict_forward(lines[0].f32, lines[1].f32, lines[2].f32, @@ -364,13 +375,21 @@ namespace ojph { } else { - float mul = (float)(1 << num_bits[comp_num]); - const float *sp = src_line->f32; - si32 *dp = tgt_line->i32 + line_offsets[comp_num]; - if (is_signed[comp_num]) - cnvrt_float_to_si32(sp, dp, mul, comp_width); - else - cnvrt_float_to_si32_shftd(sp, dp, mul, comp_width); + if (nlt_type3[comp_num] == type3) + { + irv_convert_to_integer_nlt_type3(src_line, tgt_line, + line_offsets[comp_num], num_bits[comp_num], + is_signed[comp_num], comp_width); + } + else { + float mul = (float)(1 << num_bits[comp_num]); + const float *sp = src_line->f32; + si32 *dp = tgt_line->i32 + line_offsets[comp_num]; + if (is_signed[comp_num]) + cnvrt_float_to_si32(sp, dp, mul, comp_width); + else + cnvrt_float_to_si32_shftd(sp, dp, mul, comp_width); + } } } else @@ -407,17 +426,30 @@ namespace ojph { } else { - float mul = (float)(1 << num_bits[comp_num]); - const float *sp; - if (comp_num < 3) - sp = lines[comp_num].f32; - else - sp = comps[comp_num].pull_line()->f32; - si32 *dp = tgt_line->i32 + line_offsets[comp_num]; - if (is_signed[comp_num]) - cnvrt_float_to_si32(sp, dp, mul, comp_width); - else - cnvrt_float_to_si32_shftd(sp, dp, mul, comp_width); + if (nlt_type3[comp_num] == type3) + { + line_buf* lbp; + if (comp_num < 3) + lbp = lines + comp_num; + else + lbp = comps[comp_num].pull_line(); + irv_convert_to_integer_nlt_type3(lbp, tgt_line, + line_offsets[comp_num], num_bits[comp_num], + is_signed[comp_num], comp_width); + } + else { + float mul = (float)(1 << num_bits[comp_num]); + const float *sp; + if (comp_num < 3) + sp = lines[comp_num].f32; + else + sp = comps[comp_num].pull_line()->f32; + si32 *dp = tgt_line->i32 + line_offsets[comp_num]; + if (is_signed[comp_num]) + cnvrt_float_to_si32(sp, dp, mul, comp_width); + else + cnvrt_float_to_si32_shftd(sp, dp, mul, comp_width); + } } } diff --git a/src/core/common/ojph_arch.h b/src/core/common/ojph_arch.h index 29ab7a57..33e434a0 100644 --- a/src/core/common/ojph_arch.h +++ b/src/core/common/ojph_arch.h @@ -271,6 +271,18 @@ namespace ojph { #endif } + //////////////////////////////////////////////////////////////////////////// + static inline si64 ojph_round64(float val) + { + #ifdef OJPH_COMPILER_MSVC + return (si64)(val + (val >= 0.0f ? 0.5f : -0.5f)); + #elif (defined OJPH_COMPILER_GNUC) + return (si64)(val + (val >= 0.0f ? 0.5f : -0.5f)); + #else + return (si64)round(val); + #endif + } + //////////////////////////////////////////////////////////////////////////// static inline si32 ojph_trunc(float val) { diff --git a/src/core/common/ojph_mem.h b/src/core/common/ojph_mem.h index 99897f32..b910e120 100644 --- a/src/core/common/ojph_mem.h +++ b/src/core/common/ojph_mem.h @@ -138,8 +138,8 @@ namespace ojph { enum : ui32 { LFT_UNDEFINED = 0x00, // Type is undefined/uninitialized // These flags reflects data size in bytes - LFT_BYTE = 0x01, // Set when data is 1 byte - LFT_16BIT = 0x02, // Set when data is 2 bytes + LFT_BYTE = 0x01, // Set when data is 1 byte (not used) + LFT_16BIT = 0x02, // Set when data is 2 bytes (not used) LFT_32BIT = 0x04, // Set when data is 4 bytes LFT_64BIT = 0x08, // Set when data is 8 bytes LFT_REVERSIBLE = 0x10, // Set when data is used for reversible coding diff --git a/src/core/transform/ojph_colour.cpp b/src/core/transform/ojph_colour.cpp index a98b477b..3c6ab026 100644 --- a/src/core/transform/ojph_colour.cpp +++ b/src/core/transform/ojph_colour.cpp @@ -78,6 +78,16 @@ namespace ojph { void (*cnvrt_float_to_si32) (const float *sp, si32 *dp, float mul, ui32 width) = NULL; + ////////////////////////////////////////////////////////////////////////// + void (*irv_convert_to_float_nlt_type3) ( + const line_buf *src_line, ui32 src_line_offset, + line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width) = NULL; + + ////////////////////////////////////////////////////////////////////////// + void (*irv_convert_to_integer_nlt_type3) ( + const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset, + ui32 bit_depth, bool is_signed, ui32 width) = NULL; + ////////////////////////////////////////////////////////////////////////// void (*rct_forward) (const line_buf* r, const line_buf* g, const line_buf* b, @@ -115,6 +125,8 @@ namespace ojph { cnvrt_si32_to_float = gen_cnvrt_si32_to_float; cnvrt_float_to_si32_shftd = gen_cnvrt_float_to_si32_shftd; cnvrt_float_to_si32 = gen_cnvrt_float_to_si32; + irv_convert_to_float_nlt_type3 = gen_irv_convert_to_float_nlt_type3; + irv_convert_to_integer_nlt_type3 = gen_irv_convert_to_integer_nlt_type3; rct_forward = gen_rct_forward; rct_backward = gen_rct_backward; ict_forward = gen_ict_forward; @@ -237,8 +249,8 @@ namespace ojph { } else { - assert(src_line->flags | line_buf::LFT_64BIT); - assert(dst_line->flags | line_buf::LFT_32BIT); + assert(src_line->flags & line_buf::LFT_64BIT); + assert(dst_line->flags & line_buf::LFT_32BIT); const si64 *sp = src_line->i64 + src_line_offset; si32 *dp = dst_line->i32 + dst_line_offset; for (ui32 i = width; i > 0; --i) @@ -276,8 +288,8 @@ namespace ojph { } else { - assert(src_line->flags | line_buf::LFT_64BIT); - assert(dst_line->flags | line_buf::LFT_32BIT); + assert(src_line->flags & line_buf::LFT_64BIT); + assert(dst_line->flags & line_buf::LFT_32BIT); const si64 *sp = src_line->i64 + src_line_offset; si32 *dp = dst_line->i32 + dst_line_offset; for (ui32 i = width; i > 0; --i) { @@ -319,6 +331,131 @@ namespace ojph { *dp++ = ojph_round(*sp++ * mul); } + ////////////////////////////////////////////////////////////////////////// + void gen_irv_convert_to_float_nlt_type3(const line_buf *src_line, + ui32 src_line_offset, line_buf *dst_line, + ui32 bit_depth, bool is_signed, ui32 width) + { + assert((src_line->flags & line_buf::LFT_32BIT) && + (src_line->flags & line_buf::LFT_REVERSIBLE) == 0 && + (dst_line->flags & line_buf::LFT_32BIT) && + (dst_line->flags & line_buf::LFT_REVERSIBLE) == 0); + + float mul; + if (bit_depth < 32) + mul = 1.0f / (float)(1 << bit_depth); + else + mul = (float)(1.0 / 65536.0 / 65536.0); + + const si32* sp = src_line->i32 + src_line_offset; + float* dp = dst_line->f32; + if (is_signed) + { + si32 shift = (1 << (bit_depth - 1)) + 1; + for (ui32 i = width; i > 0; --i) { + si32 v = *sp++; + v = (v >= 0) ? v : (- v - shift); + *dp++ = (float)v * mul; + } + } + else + { + for (ui32 i = width; i > 0; --i) + *dp++ = (float)*sp++ * mul - 0.5f; + } + } + + ////////////////////////////////////////////////////////////////////////// + void gen_irv_convert_to_integer_nlt_type3(const line_buf *src_line, + line_buf *dst_line, ui32 dst_line_offset, + ui32 bit_depth, bool is_signed, ui32 width) + { + assert((src_line->flags & line_buf::LFT_32BIT) && + (src_line->flags & line_buf::LFT_REVERSIBLE) == 0 && + (dst_line->flags & line_buf::LFT_32BIT) && + (dst_line->flags & line_buf::LFT_REVERSIBLE) == 0); + + float mul; + if (bit_depth < 32) + mul = 1.0f / (float)(1 << bit_depth); + else + mul = (float)(1.0 / 65536.0 / 65536.0); + + const float* sp = src_line->f32; + si32* dp = dst_line->i32 + dst_line_offset; + if (bit_depth <= 30) + { + // We are leaving two bit overhead -- here, we are assuming that after + // multiplications, the resulting number can still be represented + // using 32 bit integer + const si32 half = (1 << (bit_depth - 1)); + const si32 shift = half + 1; + const si32 upper_limit = 0x7FFFFFFF >> (32 - bit_depth); + const si32 lower_limit = 0x80000000 >> (32 - bit_depth); + + if (is_signed) + { + for (ui32 i = width; i > 0; --i) { + si32 v = ojph_round(*sp++ * mul); + v = ojph_max(v, lower_limit); + v = ojph_min(v, upper_limit); + v = (v >= 0) ? v : (- v - shift); + *dp++ = v; + } + } + else + { + for (ui32 i = width; i > 0; --i) { + si32 v = ojph_round(*sp++ * mul); + v = ojph_max(v, lower_limit); + v = ojph_min(v, upper_limit); + v = (v >= 0) ? v : (- v - shift); + *dp++ = v + half; + } + } + } + else + { + // There is the possibility that converting to integer will + // exceed the dynamic range of 32bit integer; therefore, we need + // to use 64 bit. One may think, why not limit the floats to the + // range of [-0.5f, 0.5f)? + // Notice the half closed range -- we need a value just below 0.5f. + // While getting this number is possible, after multiplication, the + // resulting number will not be exactly the maximum that the integer + // can achieve. All this is academic, because here are talking + // about a number which has all the exponent bits set, meaning + // it is either infinity, -infinity, qNan or sNan. + const si32 half = (1 << (bit_depth - 1)); + const si32 shift = half + 1; + const si64 upper_limit = 0x7FFFFFFFFFFFFFFFLL >> (64 - bit_depth); + const si64 lower_limit = 0x8000000000000000LL >> (64 - bit_depth); + + if (is_signed) + { + for (ui32 i = width; i > 0; --i) { + si64 t = ojph_round64(*sp++ * mul); + t = ojph_max(t, lower_limit); + t = ojph_min(t, upper_limit); + si32 v = (si32)t; + v = (v >= 0) ? v : (- v - shift); + *dp++ = v; + } + } + else + { + for (ui32 i = width; i > 0; --i) { + si64 t = ojph_round64(*sp++ * mul); + t = ojph_max(t, lower_limit); + t = ojph_min(t, upper_limit); + si32 v = (si32)t; + v = (v >= 0) ? v : (- v - shift); + *dp++ = v + half; + } + } + } + } + ////////////////////////////////////////////////////////////////////////// void gen_rct_forward( const line_buf *r, const line_buf *g, const line_buf *b, diff --git a/src/core/transform/ojph_colour.h b/src/core/transform/ojph_colour.h index cc42aaa5..d5375a97 100644 --- a/src/core/transform/ojph_colour.h +++ b/src/core/transform/ojph_colour.h @@ -77,6 +77,16 @@ namespace ojph { extern void (*cnvrt_float_to_si32) (const float *sp, si32 *dp, float mul, ui32 width); + //////////////////////////////////////////////////////////////////////////// + extern void (*irv_convert_to_float_nlt_type3) ( + const line_buf *src_line, ui32 src_line_offset, + line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width); + + //////////////////////////////////////////////////////////////////////////// + extern void (*irv_convert_to_integer_nlt_type3) ( + const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset, + ui32 bit_depth, bool is_signed, ui32 width); + //////////////////////////////////////////////////////////////////////////// extern void (*rct_forward) (const line_buf *r, const line_buf *g, const line_buf *b, diff --git a/src/core/transform/ojph_colour_local.h b/src/core/transform/ojph_colour_local.h index 5eb8b746..acd0f944 100644 --- a/src/core/transform/ojph_colour_local.h +++ b/src/core/transform/ojph_colour_local.h @@ -92,6 +92,16 @@ namespace ojph { void gen_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul, ui32 width); + ////////////////////////////////////////////////////////////////////////// + void gen_irv_convert_to_float_nlt_type3( + const line_buf *src_line, ui32 src_line_offset, + line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width); + + ////////////////////////////////////////////////////////////////////////// + void gen_irv_convert_to_integer_nlt_type3( + const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset, + ui32 bit_depth, bool is_signed, ui32 width); + ////////////////////////////////////////////////////////////////////////// void gen_rct_forward( const line_buf *r, const line_buf *g, const line_buf *b, From d065c767e227b7f539b1fcdfddd9c4029380671c Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Tue, 21 Jan 2025 22:30:33 +1100 Subject: [PATCH 02/20] Fixes warning on Mac. --- src/core/transform/ojph_colour.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/core/transform/ojph_colour.cpp b/src/core/transform/ojph_colour.cpp index 3c6ab026..63438948 100644 --- a/src/core/transform/ojph_colour.cpp +++ b/src/core/transform/ojph_colour.cpp @@ -390,8 +390,8 @@ namespace ojph { // using 32 bit integer const si32 half = (1 << (bit_depth - 1)); const si32 shift = half + 1; - const si32 upper_limit = 0x7FFFFFFF >> (32 - bit_depth); - const si32 lower_limit = 0x80000000 >> (32 - bit_depth); + const si32 upper_limit = INT_MAX >> (32 - bit_depth); + const si32 lower_limit = INT_MIN >> (32 - bit_depth); if (is_signed) { @@ -428,8 +428,8 @@ namespace ojph { // it is either infinity, -infinity, qNan or sNan. const si32 half = (1 << (bit_depth - 1)); const si32 shift = half + 1; - const si64 upper_limit = 0x7FFFFFFFFFFFFFFFLL >> (64 - bit_depth); - const si64 lower_limit = 0x8000000000000000LL >> (64 - bit_depth); + const si64 upper_limit = LLONG_MAX >> (64 - bit_depth); + const si64 lower_limit = LLONG_MIN >> (64 - bit_depth); if (is_signed) { From bf48100b12c8c6dd43d29ff772ddd9d39650673b Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Tue, 21 Jan 2025 22:31:59 +1100 Subject: [PATCH 03/20] Fixes compilation. --- src/core/transform/ojph_colour.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/core/transform/ojph_colour.cpp b/src/core/transform/ojph_colour.cpp index 63438948..c29ff207 100644 --- a/src/core/transform/ojph_colour.cpp +++ b/src/core/transform/ojph_colour.cpp @@ -36,6 +36,7 @@ //***************************************************************************/ #include +#include #include "ojph_defs.h" #include "ojph_arch.h" From e60473c28f1ad9862493c30a4f1a714f6965cdf4 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Wed, 22 Jan 2025 12:40:22 +1100 Subject: [PATCH 04/20] Bug fixes. Improvements. --- src/core/codestream/ojph_resolution.cpp | 68 +++++++++++++++------- src/core/codestream/ojph_subband.cpp | 24 ++++++-- src/core/codestream/ojph_tile.cpp | 21 +++++-- src/core/common/ojph_mem.h | 16 +---- src/core/others/ojph_mem.cpp | 28 +-------- src/core/transform/ojph_colour.cpp | 77 +++++++++++-------------- src/core/transform/ojph_colour_avx2.cpp | 24 ++++---- src/core/transform/ojph_colour_sse2.cpp | 24 ++++---- 8 files changed, 145 insertions(+), 137 deletions(-) diff --git a/src/core/codestream/ojph_resolution.cpp b/src/core/codestream/ojph_resolution.cpp index 8ee5b79d..343615f8 100644 --- a/src/core/codestream/ojph_resolution.cpp +++ b/src/core/codestream/ojph_resolution.cpp @@ -207,20 +207,31 @@ namespace ojph { const param_qcd* qp = codestream->access_qcd()->get_qcc(comp_num); ui32 precision = qp->propose_precision(cdp); + const param_atk* atk = cdp->access_atk(); + bool reversible = atk->is_reversible(); ui32 width = res_rect.siz.w + 1; - if (precision <= 32) { - for (ui32 i = 0; i < num_steps; ++i) + if (reversible) + { + if (precision <= 32) { + for (ui32 i = 0; i < num_steps; ++i) + allocator->pre_alloc_data(width, 1); allocator->pre_alloc_data(width, 1); - allocator->pre_alloc_data(width, 1); - allocator->pre_alloc_data(width, 1); + allocator->pre_alloc_data(width, 1); + } + else + { + for (ui32 i = 0; i < num_steps; ++i) + allocator->pre_alloc_data(width, 1); + allocator->pre_alloc_data(width, 1); + allocator->pre_alloc_data(width, 1); + } } - else - { + else { for (ui32 i = 0; i < num_steps; ++i) - allocator->pre_alloc_data(width, 1); - allocator->pre_alloc_data(width, 1); - allocator->pre_alloc_data(width, 1); + allocator->pre_alloc_data(width, 1); + allocator->pre_alloc_data(width, 1); + allocator->pre_alloc_data(width, 1); } } } @@ -474,21 +485,38 @@ namespace ojph { // initiate storage of line_buf ui32 width = res_rect.siz.w + 1; - if (precision <= 32) + if (this->reversible) { - for (ui32 i = 0; i < num_steps; ++i) - ssp[i].line->wrap( + if (precision <= 32) + { + for (ui32 i = 0; i < num_steps; ++i) + ssp[i].line->wrap( + allocator->post_alloc_data(width, 1), width, 1); + sig->line->wrap( allocator->post_alloc_data(width, 1), width, 1); - sig->line->wrap(allocator->post_alloc_data(width, 1), width, 1); - aug->line->wrap(allocator->post_alloc_data(width, 1), width, 1); + aug->line->wrap( + allocator->post_alloc_data(width, 1), width, 1); + } + else + { + for (ui32 i = 0; i < num_steps; ++i) + ssp[i].line->wrap( + allocator->post_alloc_data(width, 1), width, 1); + sig->line->wrap( + allocator->post_alloc_data(width, 1), width, 1); + aug->line->wrap( + allocator->post_alloc_data(width, 1), width, 1); + } } - else + else { - for (ui32 i = 0; i < num_steps; ++i) - ssp[i].line->wrap( - allocator->post_alloc_data(width, 1), width, 1); - sig->line->wrap(allocator->post_alloc_data(width, 1), width, 1); - aug->line->wrap(allocator->post_alloc_data(width, 1), width, 1); + for (ui32 i = 0; i < num_steps; ++i) + ssp[i].line->wrap( + allocator->post_alloc_data(width, 1), width, 1); + sig->line->wrap( + allocator->post_alloc_data(width, 1), width, 1); + aug->line->wrap( + allocator->post_alloc_data(width, 1), width, 1); } cur_line = 0; diff --git a/src/core/codestream/ojph_subband.cpp b/src/core/codestream/ojph_subband.cpp index 4830895f..655a2b8b 100644 --- a/src/core/codestream/ojph_subband.cpp +++ b/src/core/codestream/ojph_subband.cpp @@ -92,6 +92,8 @@ namespace ojph { const param_qcd* qp = codestream->access_qcd()->get_qcc(comp_num); ui32 precision = qp->propose_precision(cdp); + const param_atk* atk = cdp->access_atk(); + bool reversible = atk->is_reversible(); for (ui32 i = 0; i < num_blocks.w; ++i) codeblock::pre_alloc(codestream, nominal, precision); @@ -100,10 +102,15 @@ namespace ojph { allocator->pre_alloc_obj(1); //allocate line_buf ui32 width = band_rect.siz.w + 1; - if (precision <= 32) - allocator->pre_alloc_data(width, 1); + if (reversible) + { + if (precision <= 32) + allocator->pre_alloc_data(width, 1); + else + allocator->pre_alloc_data(width, 1); + } else - allocator->pre_alloc_data(width, 1); + allocator->pre_alloc_data(width, 1); } ////////////////////////////////////////////////////////////////////////// @@ -201,10 +208,15 @@ namespace ojph { lines = allocator->post_alloc_obj(1); //allocate line_buf ui32 width = band_rect.siz.w + 1; - if (precision <= 32) - lines->wrap(allocator->post_alloc_data(width, 1), width, 1); + if (reversible) + { + if (precision <= 32) + lines->wrap(allocator->post_alloc_data(width, 1), width, 1); + else + lines->wrap(allocator->post_alloc_data(width, 1), width, 1); + } else - lines->wrap(allocator->post_alloc_data(width, 1), width, 1); + lines->wrap(allocator->post_alloc_data(width, 1), width, 1); } ////////////////////////////////////////////////////////////////////////// diff --git a/src/core/codestream/ojph_tile.cpp b/src/core/codestream/ojph_tile.cpp index 67fed0bd..ae78b06c 100644 --- a/src/core/codestream/ojph_tile.cpp +++ b/src/core/codestream/ojph_tile.cpp @@ -122,11 +122,16 @@ namespace ojph { } //allocate lines - if (codestream->get_cod()->is_employing_color_transform()) + const param_cod* cdp = codestream->get_cod(); + if (cdp->is_employing_color_transform()) { allocator->pre_alloc_obj(3); - for (int i = 0; i < 3; ++i) - allocator->pre_alloc_data(width, 0); + if (cdp->access_atk()->is_reversible()) + for (int i = 0; i < 3; ++i) + allocator->pre_alloc_data(width, 0); + else + for (int i = 0; i < 3; ++i) + allocator->pre_alloc_data(width, 0); } } @@ -230,8 +235,14 @@ namespace ojph { { num_lines = 3; lines = allocator->post_alloc_obj(num_lines); - for (int i = 0; i < 3; ++i) - lines[i].wrap(allocator->post_alloc_data(width, 0), width, 0); + if (reversible) + for (int i = 0; i < 3; ++i) + lines[i].wrap( + allocator->post_alloc_data(width, 0), width, 0); + else + for (int i = 0; i < 3; ++i) + lines[i].wrap( + allocator->post_alloc_data(width, 0), width, 0); } else { diff --git a/src/core/common/ojph_mem.h b/src/core/common/ojph_mem.h index b910e120..b5a91759 100644 --- a/src/core/common/ojph_mem.h +++ b/src/core/common/ojph_mem.h @@ -142,26 +142,14 @@ namespace ojph { LFT_16BIT = 0x02, // Set when data is 2 bytes (not used) LFT_32BIT = 0x04, // Set when data is 4 bytes LFT_64BIT = 0x08, // Set when data is 8 bytes - LFT_REVERSIBLE = 0x10, // Set when data is used for reversible coding - // Not all combinations are useful + LFT_INTEGER = 0x10, // Set when data is an integer, in other words + // 32bit integer, not 32bit float LFT_SIZE_MASK = 0x0F, // To extract data size }; public: line_buf() : size(0), pre_size(0), flags(LFT_UNDEFINED), i32(0) {} - template - void pre_alloc(mem_fixed_allocator *p, size_t num_ele, ui32 pre_size) - { - memset(this, 0, sizeof(line_buf)); - p->pre_alloc_data(num_ele, pre_size); - size = num_ele; - this->pre_size = pre_size; - } - - template - void finalize_alloc(mem_fixed_allocator *p); - template void wrap(T *buffer, size_t num_ele, ui32 pre_size); diff --git a/src/core/others/ojph_mem.cpp b/src/core/others/ojph_mem.cpp index 0bb0b5f7..8b1af56f 100644 --- a/src/core/others/ojph_mem.cpp +++ b/src/core/others/ojph_mem.cpp @@ -49,30 +49,6 @@ namespace ojph { // //////////////////////////////////////////////////////////////////////////// - //////////////////////////////////////////////////////////////////////////// - template<> - void line_buf::finalize_alloc(mem_fixed_allocator *p) - { - assert(p != 0 && size != 0); - i32 = p->post_alloc_data(size, pre_size); - } - - //////////////////////////////////////////////////////////////////////////// - template<> - void line_buf::finalize_alloc(mem_fixed_allocator *p) - { - assert(p != 0 && size != 0); - f32 = p->post_alloc_data(size, pre_size); - } - - //////////////////////////////////////////////////////////////////////////// - template<> - void line_buf::finalize_alloc(mem_fixed_allocator *p) - { - assert(p != 0 && size != 0); - i64 = p->post_alloc_data(size, pre_size); - } - //////////////////////////////////////////////////////////////////////////// template<> void line_buf::wrap(si32 *buffer, size_t num_ele, ui32 pre_size) @@ -80,7 +56,7 @@ namespace ojph { this->i32 = buffer; this->size = num_ele; this->pre_size = pre_size; - this->flags = LFT_32BIT | LFT_REVERSIBLE; + this->flags = LFT_32BIT | LFT_INTEGER; } //////////////////////////////////////////////////////////////////////////// @@ -100,7 +76,7 @@ namespace ojph { this->i64 = buffer; this->size = num_ele; this->pre_size = pre_size; - this->flags = LFT_64BIT | LFT_REVERSIBLE; + this->flags = LFT_64BIT | LFT_INTEGER; } //////////////////////////////////////////////////////////////////////////// diff --git a/src/core/transform/ojph_colour.cpp b/src/core/transform/ojph_colour.cpp index c29ff207..8cca554c 100644 --- a/src/core/transform/ojph_colour.cpp +++ b/src/core/transform/ojph_colour.cpp @@ -338,31 +338,30 @@ namespace ojph { ui32 bit_depth, bool is_signed, ui32 width) { assert((src_line->flags & line_buf::LFT_32BIT) && - (src_line->flags & line_buf::LFT_REVERSIBLE) == 0 && + (src_line->flags & line_buf::LFT_INTEGER) && (dst_line->flags & line_buf::LFT_32BIT) && - (dst_line->flags & line_buf::LFT_REVERSIBLE) == 0); + (dst_line->flags & line_buf::LFT_INTEGER) == 0); - float mul; - if (bit_depth < 32) - mul = 1.0f / (float)(1 << bit_depth); - else - mul = (float)(1.0 / 65536.0 / 65536.0); + float mul = (float)(1.0 / 65536.0 / 65536.0); const si32* sp = src_line->i32 + src_line_offset; float* dp = dst_line->f32; + ui32 shift = 32 - bit_depth; if (is_signed) { - si32 shift = (1 << (bit_depth - 1)) + 1; + si32 bias = (si32)((ui32)INT_MIN + 1); for (ui32 i = width; i > 0; --i) { - si32 v = *sp++; - v = (v >= 0) ? v : (- v - shift); + si32 v = *sp++ << shift; + v = (v >= 0) ? v : (- v - bias); *dp++ = (float)v * mul; } } else { - for (ui32 i = width; i > 0; --i) - *dp++ = (float)*sp++ * mul - 0.5f; + for (ui32 i = width; i > 0; --i) { + si32 v = *sp++ << shift; + *dp++ = (float)v * mul - 0.5f; + } } } @@ -372,16 +371,10 @@ namespace ojph { ui32 bit_depth, bool is_signed, ui32 width) { assert((src_line->flags & line_buf::LFT_32BIT) && - (src_line->flags & line_buf::LFT_REVERSIBLE) == 0 && + (src_line->flags & line_buf::LFT_INTEGER) == 0 && (dst_line->flags & line_buf::LFT_32BIT) && - (dst_line->flags & line_buf::LFT_REVERSIBLE) == 0); + (dst_line->flags & line_buf::LFT_INTEGER)); - float mul; - if (bit_depth < 32) - mul = 1.0f / (float)(1 << bit_depth); - else - mul = (float)(1.0 / 65536.0 / 65536.0); - const float* sp = src_line->f32; si32* dp = dst_line->i32 + dst_line_offset; if (bit_depth <= 30) @@ -389,28 +382,28 @@ namespace ojph { // We are leaving two bit overhead -- here, we are assuming that after // multiplications, the resulting number can still be represented // using 32 bit integer - const si32 half = (1 << (bit_depth - 1)); - const si32 shift = half + 1; + float mul = (float)(1 << bit_depth); const si32 upper_limit = INT_MAX >> (32 - bit_depth); const si32 lower_limit = INT_MIN >> (32 - bit_depth); if (is_signed) { + const si32 bias = (1 << (bit_depth - 1)) + 1; for (ui32 i = width; i > 0; --i) { si32 v = ojph_round(*sp++ * mul); v = ojph_max(v, lower_limit); v = ojph_min(v, upper_limit); - v = (v >= 0) ? v : (- v - shift); + v = (v >= 0) ? v : (- v - bias); *dp++ = v; } } else { + const si32 half = (1 << (bit_depth - 1)); for (ui32 i = width; i > 0; --i) { si32 v = ojph_round(*sp++ * mul); v = ojph_max(v, lower_limit); v = ojph_min(v, upper_limit); - v = (v >= 0) ? v : (- v - shift); *dp++ = v + half; } } @@ -427,30 +420,30 @@ namespace ojph { // can achieve. All this is academic, because here are talking // about a number which has all the exponent bits set, meaning // it is either infinity, -infinity, qNan or sNan. - const si32 half = (1 << (bit_depth - 1)); - const si32 shift = half + 1; - const si64 upper_limit = LLONG_MAX >> (64 - bit_depth); - const si64 lower_limit = LLONG_MIN >> (64 - bit_depth); + float mul = (float)(1ull << bit_depth); + const si64 upper_limit = (si64)LLONG_MAX >> (64 - bit_depth); + const si64 lower_limit = (si64)LLONG_MIN >> (64 - bit_depth); if (is_signed) { + const si32 bias = (1 << (bit_depth - 1)) + 1; for (ui32 i = width; i > 0; --i) { si64 t = ojph_round64(*sp++ * mul); t = ojph_max(t, lower_limit); t = ojph_min(t, upper_limit); si32 v = (si32)t; - v = (v >= 0) ? v : (- v - shift); + v = (v >= 0) ? v : (- v - bias); *dp++ = v; } } else { + const si32 half = (1 << (bit_depth - 1)); for (ui32 i = width; i > 0; --i) { si64 t = ojph_round64(*sp++ * mul); t = ojph_max(t, lower_limit); t = ojph_min(t, upper_limit); si32 v = (si32)t; - v = (v >= 0) ? v : (- v - shift); *dp++ = v + half; } } @@ -462,12 +455,12 @@ namespace ojph { const line_buf *r, const line_buf *g, const line_buf *b, line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat) { - assert((y->flags & line_buf::LFT_REVERSIBLE) && - (cb->flags & line_buf::LFT_REVERSIBLE) && - (cr->flags & line_buf::LFT_REVERSIBLE) && - (r->flags & line_buf::LFT_REVERSIBLE) && - (g->flags & line_buf::LFT_REVERSIBLE) && - (b->flags & line_buf::LFT_REVERSIBLE)); + assert((y->flags & line_buf::LFT_INTEGER) && + (cb->flags & line_buf::LFT_INTEGER) && + (cr->flags & line_buf::LFT_INTEGER) && + (r->flags & line_buf::LFT_INTEGER) && + (g->flags & line_buf::LFT_INTEGER) && + (b->flags & line_buf::LFT_INTEGER)); if (y->flags & line_buf::LFT_32BIT) { @@ -512,12 +505,12 @@ namespace ojph { const line_buf *y, const line_buf *cb, const line_buf *cr, line_buf *r, line_buf *g, line_buf *b, ui32 repeat) { - assert((y->flags & line_buf::LFT_REVERSIBLE) && - (cb->flags & line_buf::LFT_REVERSIBLE) && - (cr->flags & line_buf::LFT_REVERSIBLE) && - (r->flags & line_buf::LFT_REVERSIBLE) && - (g->flags & line_buf::LFT_REVERSIBLE) && - (b->flags & line_buf::LFT_REVERSIBLE)); + assert((y->flags & line_buf::LFT_INTEGER) && + (cb->flags & line_buf::LFT_INTEGER) && + (cr->flags & line_buf::LFT_INTEGER) && + (r->flags & line_buf::LFT_INTEGER) && + (g->flags & line_buf::LFT_INTEGER) && + (b->flags & line_buf::LFT_INTEGER)); if (y->flags & line_buf::LFT_32BIT) { diff --git a/src/core/transform/ojph_colour_avx2.cpp b/src/core/transform/ojph_colour_avx2.cpp index 05bff311..bc9a9e9f 100644 --- a/src/core/transform/ojph_colour_avx2.cpp +++ b/src/core/transform/ojph_colour_avx2.cpp @@ -243,12 +243,12 @@ namespace ojph { line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat) { - assert((y->flags & line_buf::LFT_REVERSIBLE) && - (cb->flags & line_buf::LFT_REVERSIBLE) && - (cr->flags & line_buf::LFT_REVERSIBLE) && - (r->flags & line_buf::LFT_REVERSIBLE) && - (g->flags & line_buf::LFT_REVERSIBLE) && - (b->flags & line_buf::LFT_REVERSIBLE)); + assert((y->flags & line_buf::LFT_INTEGER) && + (cb->flags & line_buf::LFT_INTEGER) && + (cr->flags & line_buf::LFT_INTEGER) && + (r->flags & line_buf::LFT_INTEGER) && + (g->flags & line_buf::LFT_INTEGER) && + (b->flags & line_buf::LFT_INTEGER)); if (y->flags & line_buf::LFT_32BIT) { @@ -333,12 +333,12 @@ namespace ojph { line_buf *r, line_buf *g, line_buf *b, ui32 repeat) { - assert((y->flags & line_buf::LFT_REVERSIBLE) && - (cb->flags & line_buf::LFT_REVERSIBLE) && - (cr->flags & line_buf::LFT_REVERSIBLE) && - (r->flags & line_buf::LFT_REVERSIBLE) && - (g->flags & line_buf::LFT_REVERSIBLE) && - (b->flags & line_buf::LFT_REVERSIBLE)); + assert((y->flags & line_buf::LFT_INTEGER) && + (cb->flags & line_buf::LFT_INTEGER) && + (cr->flags & line_buf::LFT_INTEGER) && + (r->flags & line_buf::LFT_INTEGER) && + (g->flags & line_buf::LFT_INTEGER) && + (b->flags & line_buf::LFT_INTEGER)); if (y->flags & line_buf::LFT_32BIT) { diff --git a/src/core/transform/ojph_colour_sse2.cpp b/src/core/transform/ojph_colour_sse2.cpp index a529c66b..37fa1c8a 100644 --- a/src/core/transform/ojph_colour_sse2.cpp +++ b/src/core/transform/ojph_colour_sse2.cpp @@ -290,12 +290,12 @@ namespace ojph { line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat) { - assert((y->flags & line_buf::LFT_REVERSIBLE) && - (cb->flags & line_buf::LFT_REVERSIBLE) && - (cr->flags & line_buf::LFT_REVERSIBLE) && - (r->flags & line_buf::LFT_REVERSIBLE) && - (g->flags & line_buf::LFT_REVERSIBLE) && - (b->flags & line_buf::LFT_REVERSIBLE)); + assert((y->flags & line_buf::LFT_INTEGER) && + (cb->flags & line_buf::LFT_INTEGER) && + (cr->flags & line_buf::LFT_INTEGER) && + (r->flags & line_buf::LFT_INTEGER) && + (g->flags & line_buf::LFT_INTEGER) && + (b->flags & line_buf::LFT_INTEGER)); if (y->flags & line_buf::LFT_32BIT) { @@ -381,12 +381,12 @@ namespace ojph { line_buf *r, line_buf *g, line_buf *b, ui32 repeat) { - assert((y->flags & line_buf::LFT_REVERSIBLE) && - (cb->flags & line_buf::LFT_REVERSIBLE) && - (cr->flags & line_buf::LFT_REVERSIBLE) && - (r->flags & line_buf::LFT_REVERSIBLE) && - (g->flags & line_buf::LFT_REVERSIBLE) && - (b->flags & line_buf::LFT_REVERSIBLE)); + assert((y->flags & line_buf::LFT_INTEGER) && + (cb->flags & line_buf::LFT_INTEGER) && + (cr->flags & line_buf::LFT_INTEGER) && + (r->flags & line_buf::LFT_INTEGER) && + (g->flags & line_buf::LFT_INTEGER) && + (b->flags & line_buf::LFT_INTEGER)); if (y->flags & line_buf::LFT_32BIT) { From e0a3c2bc86d997659452f199f6c51a04c0fa17b5 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Wed, 22 Jan 2025 21:06:19 +1100 Subject: [PATCH 05/20] Added SIMD, except for the 32bit path. Wasm SIMD is missing. Not tested. --- src/core/transform/ojph_colour.cpp | 84 ++++----- src/core/transform/ojph_colour_avx2.cpp | 147 ++++++++++++++++ src/core/transform/ojph_colour_local.h | 30 ++++ src/core/transform/ojph_colour_sse2.cpp | 217 +++++++++++++++++++++--- src/core/transform/ojph_colour_wasm.cpp | 169 ++++++++++++++++-- 5 files changed, 571 insertions(+), 76 deletions(-) diff --git a/src/core/transform/ojph_colour.cpp b/src/core/transform/ojph_colour.cpp index 8cca554c..617fc41f 100644 --- a/src/core/transform/ojph_colour.cpp +++ b/src/core/transform/ojph_colour.cpp @@ -79,16 +79,16 @@ namespace ojph { void (*cnvrt_float_to_si32) (const float *sp, si32 *dp, float mul, ui32 width) = NULL; - ////////////////////////////////////////////////////////////////////////// - void (*irv_convert_to_float_nlt_type3) ( - const line_buf *src_line, ui32 src_line_offset, - line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width) = NULL; - ////////////////////////////////////////////////////////////////////////// void (*irv_convert_to_integer_nlt_type3) ( const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset, ui32 bit_depth, bool is_signed, ui32 width) = NULL; + ////////////////////////////////////////////////////////////////////////// + void (*irv_convert_to_float_nlt_type3) ( + const line_buf *src_line, ui32 src_line_offset, + line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width) = NULL; + ////////////////////////////////////////////////////////////////////////// void (*rct_forward) (const line_buf* r, const line_buf* g, const line_buf* b, @@ -156,6 +156,10 @@ namespace ojph { rev_convert_nlt_type3 = sse2_rev_convert_nlt_type3; cnvrt_float_to_si32_shftd = sse2_cnvrt_float_to_si32_shftd; cnvrt_float_to_si32 = sse2_cnvrt_float_to_si32; + irv_convert_to_integer_nlt_type3 = + sse2_irv_convert_to_integer_nlt_type3; + irv_convert_to_float_nlt_type3 = + sse2_irv_convert_to_float_nlt_type3; rct_forward = sse2_rct_forward; rct_backward = sse2_rct_backward; } @@ -178,6 +182,10 @@ namespace ojph { { rev_convert = avx2_rev_convert; rev_convert_nlt_type3 = avx2_rev_convert_nlt_type3; + irv_convert_to_integer_nlt_type3 = + avx2_irv_convert_to_integer_nlt_type3; + irv_convert_to_float_nlt_type3 = + avx2_irv_convert_to_float_nlt_type3; rct_forward = avx2_rct_forward; rct_backward = avx2_rct_backward; } @@ -332,39 +340,6 @@ namespace ojph { *dp++ = ojph_round(*sp++ * mul); } - ////////////////////////////////////////////////////////////////////////// - void gen_irv_convert_to_float_nlt_type3(const line_buf *src_line, - ui32 src_line_offset, line_buf *dst_line, - ui32 bit_depth, bool is_signed, ui32 width) - { - assert((src_line->flags & line_buf::LFT_32BIT) && - (src_line->flags & line_buf::LFT_INTEGER) && - (dst_line->flags & line_buf::LFT_32BIT) && - (dst_line->flags & line_buf::LFT_INTEGER) == 0); - - float mul = (float)(1.0 / 65536.0 / 65536.0); - - const si32* sp = src_line->i32 + src_line_offset; - float* dp = dst_line->f32; - ui32 shift = 32 - bit_depth; - if (is_signed) - { - si32 bias = (si32)((ui32)INT_MIN + 1); - for (ui32 i = width; i > 0; --i) { - si32 v = *sp++ << shift; - v = (v >= 0) ? v : (- v - bias); - *dp++ = (float)v * mul; - } - } - else - { - for (ui32 i = width; i > 0; --i) { - si32 v = *sp++ << shift; - *dp++ = (float)v * mul - 0.5f; - } - } - } - ////////////////////////////////////////////////////////////////////////// void gen_irv_convert_to_integer_nlt_type3(const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset, @@ -450,6 +425,39 @@ namespace ojph { } } + ////////////////////////////////////////////////////////////////////////// + void gen_irv_convert_to_float_nlt_type3(const line_buf *src_line, + ui32 src_line_offset, line_buf *dst_line, + ui32 bit_depth, bool is_signed, ui32 width) + { + assert((src_line->flags & line_buf::LFT_32BIT) && + (src_line->flags & line_buf::LFT_INTEGER) && + (dst_line->flags & line_buf::LFT_32BIT) && + (dst_line->flags & line_buf::LFT_INTEGER) == 0); + + float mul = (float)(1.0 / 65536.0 / 65536.0); + + const si32* sp = src_line->i32 + src_line_offset; + float* dp = dst_line->f32; + ui32 shift = 32 - bit_depth; + if (is_signed) + { + si32 bias = (si32)((ui32)INT_MIN + 1); + for (ui32 i = width; i > 0; --i) { + si32 v = *sp++ << shift; + v = (v >= 0) ? v : (- v - bias); + *dp++ = (float)v * mul; + } + } + else + { + for (ui32 i = width; i > 0; --i) { + si32 v = *sp++ << shift; + *dp++ = (float)v * mul - 0.5f; + } + } + } + ////////////////////////////////////////////////////////////////////////// void gen_rct_forward( const line_buf *r, const line_buf *g, const line_buf *b, diff --git a/src/core/transform/ojph_colour_avx2.cpp b/src/core/transform/ojph_colour_avx2.cpp index bc9a9e9f..80ef38b4 100644 --- a/src/core/transform/ojph_colour_avx2.cpp +++ b/src/core/transform/ojph_colour_avx2.cpp @@ -236,6 +236,153 @@ namespace ojph { } } + ////////////////////////////////////////////////////////////////////////// + void avx2_irv_convert_to_integer_nlt_type3(const line_buf *src_line, + line_buf *dst_line, ui32 dst_line_offset, + ui32 bit_depth, bool is_signed, ui32 width) + { + assert((src_line->flags & line_buf::LFT_32BIT) && + (src_line->flags & line_buf::LFT_INTEGER) == 0 && + (dst_line->flags & line_buf::LFT_32BIT) && + (dst_line->flags & line_buf::LFT_INTEGER)); + + const float* sp = src_line->f32; + si32* dp = dst_line->i32 + dst_line_offset; + if (bit_depth <= 30) + { + // We are leaving two bit overhead -- here, we are assuming that after + // multiplications, the resulting number can still be represented + // using 32 bit integer + __m256 mul = _mm256_set1_ps((float)(1 << bit_depth)); + __m256i upper_limit = _mm256_set1_epi32(INT_MAX >> (32 - bit_depth)); + __m256i lower_limit = _mm256_set1_epi32(INT_MIN >> (32 - bit_depth)); + + if (is_signed) + { + __m256i zero = _mm256_setzero_si256(); + __m256i bias = _mm256_set1_epi32(-((1 << (bit_depth - 1)) + 1)); + for (ui32 i = width; i > 0; i -= 8, sp += 8, dp += 8) + { + __m256 t = _mm256_loadu_ps(sp); + t = _mm256_mul_ps(t, mul); + t = _mm256_round_ps(t, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + __m256i u = _mm256_cvtps_epi32(t); + u = _mm256_max_epi32(u, lower_limit); + u = _mm256_min_epi32(u, upper_limit); + + __m256i c = _mm256_cmpgt_epi32(zero, u); //0xFFFFFFFF for -ve value + __m256i neg = _mm256_sub_epi32(bias, u); //-bias -value + neg = _mm256_and_si256(c, neg); //keep only - bias - value + __m256i v = _mm256_andnot_si256(c, u); //keep only +ve or 0 + v = _mm256_or_si256(neg, v); //combine + _mm256_storeu_si256((__m256i*)dp, v); + } + } + else + { + __m256i half = _mm256_set1_epi32(-(1 << (bit_depth - 1))); + for (ui32 i = width; i > 0; i -= 8, sp += 8, dp += 8) { + __m256 t = _mm256_loadu_ps(sp); + t = _mm256_mul_ps(t, mul); + t = _mm256_round_ps(t, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + __m256i u = _mm256_cvtps_epi32(t); + u = _mm256_max_epi32(u, lower_limit); + u = _mm256_min_epi32(u, upper_limit); + u = _mm256_add_epi32(u, half); + _mm256_storeu_si256((__m256i*)dp, u); + } + } + } + else + { + // There is the possibility that converting to integer will + // exceed the dynamic range of 32bit integer; therefore, we need + // to use 64 bit. One may think, why not limit the floats to the + // range of [-0.5f, 0.5f)? + // Notice the half closed range -- we need a value just below 0.5f. + // While getting this number is possible, after multiplication, the + // resulting number will not be exactly the maximum that the integer + // can achieve. All this is academic, because here are talking + // about a number which has all the exponent bits set, meaning + // it is either infinity, -infinity, qNan or sNan. + float mul = (float)(1ull << bit_depth); + const si64 upper_limit = (si64)LLONG_MAX >> (64 - bit_depth); + const si64 lower_limit = (si64)LLONG_MIN >> (64 - bit_depth); + + if (is_signed) + { + const si32 bias = (1 << (bit_depth - 1)) + 1; + for (ui32 i = width; i > 0; --i) { + si64 t = ojph_round64(*sp++ * mul); + t = ojph_max(t, lower_limit); + t = ojph_min(t, upper_limit); + si32 v = (si32)t; + v = (v >= 0) ? v : (- v - bias); + *dp++ = v; + } + } + else + { + const si32 half = (1 << (bit_depth - 1)); + for (ui32 i = width; i > 0; --i) { + si64 t = ojph_round64(*sp++ * mul); + t = ojph_max(t, lower_limit); + t = ojph_min(t, upper_limit); + si32 v = (si32)t; + *dp++ = v + half; + } + } + } + } + + ////////////////////////////////////////////////////////////////////////// + void avx2_irv_convert_to_float_nlt_type3(const line_buf *src_line, + ui32 src_line_offset, line_buf *dst_line, + ui32 bit_depth, bool is_signed, ui32 width) + { + assert((src_line->flags & line_buf::LFT_32BIT) && + (src_line->flags & line_buf::LFT_INTEGER) && + (dst_line->flags & line_buf::LFT_32BIT) && + (dst_line->flags & line_buf::LFT_INTEGER) == 0); + + __m256 mul = _mm256_set1_ps((float)(1.0 / 65536.0 / 65536.0)); + + const si32* sp = src_line->i32 + src_line_offset; + float* dp = dst_line->f32; + si32 shift = 32 - (si32)bit_depth; + if (is_signed) + { + __m256i zero = _mm256_setzero_si256(); + __m256i bias = _mm256_set1_epi32(-(si32)((ui32)INT_MIN + 1)); + for (ui32 i = width; i > 0; i -= 8, sp += 8, dp += 8) { + __m256i t = _mm256_loadu_si256((__m256i*)sp); + __m256i u = _mm256_slli_epi32(t, shift); + __m256i c = _mm256_cmpgt_epi32(zero, u); // 0xFFFFFFFF for -ve value + __m256i neg = _mm256_sub_epi32(bias, u); // - bias - value + neg = _mm256_and_si256(c, neg); // keep only - bias - value + t = _mm256_andnot_si256(c, u); // keep only +ve or 0 + u = _mm256_or_si256(neg, t); // combine + __m256 v = _mm256_cvtepi32_ps(u); + v = _mm256_mul_ps(v, mul); + _mm256_storeu_ps(dp, v); + } + } + else + { + __m256 half = _mm256_set1_ps(0.5f); + for (ui32 i = width; i > 0; i -= 8, sp += 8, dp += 8) { + __m256i t = _mm256_loadu_si256((__m256i*)sp); + __m256i u = _mm256_slli_epi32(t, shift); + __m256 v = _mm256_cvtepi32_ps(u); + v = _mm256_mul_ps(v, mul); + v = _mm256_add_ps(v, half); + _mm256_storeu_ps(dp, v); + } + } + } + ////////////////////////////////////////////////////////////////////////// void avx2_rct_forward(const line_buf *r, const line_buf *g, diff --git a/src/core/transform/ojph_colour_local.h b/src/core/transform/ojph_colour_local.h index acd0f944..5f28685a 100644 --- a/src/core/transform/ojph_colour_local.h +++ b/src/core/transform/ojph_colour_local.h @@ -168,6 +168,11 @@ namespace ojph { void sse2_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul, ui32 width); + ////////////////////////////////////////////////////////////////////////// + void sse2_irv_convert_to_integer_nlt_type3( + const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset, + ui32 bit_depth, bool is_signed, ui32 width); + ////////////////////////////////////////////////////////////////////////// // // @@ -188,6 +193,11 @@ namespace ojph { line_buf *dst_line, const ui32 dst_line_offset, si64 shift, ui32 width); + ////////////////////////////////////////////////////////////////////////// + void sse2_irv_convert_to_float_nlt_type3( + const line_buf *src_line, ui32 src_line_offset, + line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width); + ////////////////////////////////////////////////////////////////////////// void sse2_rct_forward( const line_buf *r, const line_buf *g, const line_buf *b, @@ -250,6 +260,16 @@ namespace ojph { line_buf *dst_line, const ui32 dst_line_offset, si64 shift, ui32 width); + ////////////////////////////////////////////////////////////////////////// + void avx2_irv_convert_to_integer_nlt_type3( + const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset, + ui32 bit_depth, bool is_signed, ui32 width); + + ////////////////////////////////////////////////////////////////////////// + void avx2_irv_convert_to_float_nlt_type3( + const line_buf *src_line, ui32 src_line_offset, + line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width); + ////////////////////////////////////////////////////////////////////////// void avx2_rct_forward( const line_buf *r, const line_buf *g, const line_buf *b, @@ -296,6 +316,16 @@ namespace ojph { line_buf *dst_line, const ui32 dst_line_offset, si64 shift, ui32 width); + ////////////////////////////////////////////////////////////////////////// + void wasm_irv_convert_to_integer_nlt_type3( + const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset, + ui32 bit_depth, bool is_signed, ui32 width); + + ////////////////////////////////////////////////////////////////////////// + void wasm_irv_convert_to_float_nlt_type3( + const line_buf *src_line, ui32 src_line_offset, + line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width); + ////////////////////////////////////////////////////////////////////////// void wasm_rct_forward( const line_buf *r, const line_buf *g, const line_buf *b, diff --git a/src/core/transform/ojph_colour_sse2.cpp b/src/core/transform/ojph_colour_sse2.cpp index 37fa1c8a..3c467bd1 100644 --- a/src/core/transform/ojph_colour_sse2.cpp +++ b/src/core/transform/ojph_colour_sse2.cpp @@ -48,6 +48,159 @@ namespace ojph { namespace local { + ////////////////////////////////////////////////////////////////////////// + void sse2_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul, + ui32 width) + { + uint32_t rounding_mode = _MM_GET_ROUNDING_MODE(); + _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + __m128 shift = _mm_set1_ps(0.5f); + __m128 m = _mm_set1_ps(mul); + for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) + { + __m128 t = _mm_loadu_ps(sp); + __m128 s = _mm_add_ps(t, shift); + s = _mm_mul_ps(s, m); + _mm_storeu_si128((__m128i*)dp, _mm_cvtps_epi32(s)); + } + _MM_SET_ROUNDING_MODE(rounding_mode); + } + + ////////////////////////////////////////////////////////////////////////// + void sse2_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul, + ui32 width) + { + uint32_t rounding_mode = _MM_GET_ROUNDING_MODE(); + _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + __m128 m = _mm_set1_ps(mul); + for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) + { + __m128 t = _mm_loadu_ps(sp); + __m128 s = _mm_mul_ps(t, m); + _mm_storeu_si128((__m128i*)dp, _mm_cvtps_epi32(s)); + } + _MM_SET_ROUNDING_MODE(rounding_mode); + } + + ////////////////////////////////////////////////////////////////////////// + // This requires SSE4.1 + __m128i ojph_mm_max_epi32(__m128i a, __m128i b) + { + __m128i c = _mm_cmpgt_epi32(a, b); // 0xFFFFFFFF for a > b + __m128i d = _mm_and_si128(c, a); // keep only a, where a > b + __m128i e = _mm_andnot_si128(c, b); // keep only b, where a <= b + return _mm_or_si128(d, e); // combine + } + + ////////////////////////////////////////////////////////////////////////// + // This requires SSE4.1 + __m128i ojph_mm_min_epi32 (__m128i a, __m128i b) + { + __m128i c = _mm_cmplt_epi32(a, b); // 0xFFFFFFFF for a < b + __m128i d = _mm_and_si128(c, a); // keep only a, where a < b + __m128i e = _mm_andnot_si128(c, b); // keep only b, where a >= b + return _mm_or_si128(d, e); // combine + } + + ////////////////////////////////////////////////////////////////////////// + void sse2_irv_convert_to_integer_nlt_type3(const line_buf *src_line, + line_buf *dst_line, ui32 dst_line_offset, + ui32 bit_depth, bool is_signed, ui32 width) + { + assert((src_line->flags & line_buf::LFT_32BIT) && + (src_line->flags & line_buf::LFT_INTEGER) == 0 && + (dst_line->flags & line_buf::LFT_32BIT) && + (dst_line->flags & line_buf::LFT_INTEGER)); + + const float* sp = src_line->f32; + si32* dp = dst_line->i32 + dst_line_offset; + if (bit_depth <= 30) + { + // We are leaving two bit overhead -- here, we are assuming that after + // multiplications, the resulting number can still be represented + // using 32 bit integer + __m128 mul = _mm_set1_ps((float)(1 << bit_depth)); + __m128i upper_limit = _mm_set1_epi32(INT_MAX >> (32 - bit_depth)); + __m128i lower_limit = _mm_set1_epi32(INT_MIN >> (32 - bit_depth)); + + if (is_signed) + { + __m128i zero = _mm_setzero_si128(); + __m128i bias = _mm_set1_epi32(-((1 << (bit_depth - 1)) + 1)); + for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) + { + __m128 t = _mm_loadu_ps(sp); + t = _mm_mul_ps(t, mul); + t = _mm_round_ps(t, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + __m128i u = _mm_cvtps_epi32(t); + u = ojph_mm_max_epi32(u, lower_limit); + u = ojph_mm_min_epi32(u, upper_limit); + + __m128i c = _mm_cmpgt_epi32(zero, u); //0xFFFFFFFF for -ve value + __m128i neg = _mm_sub_epi32(bias, u); //-bias -value + neg = _mm_and_si128(c, neg); //keep only - bias - value + __m128i v = _mm_andnot_si128(c, u); //keep only +ve or 0 + v = _mm_or_si128(neg, v); //combine + _mm_storeu_si128((__m128i*)dp, v); + } + } + else + { + __m128i half = _mm_set1_epi32(-(1 << (bit_depth - 1))); + for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) { + __m128 t = _mm_loadu_ps(sp); + t = _mm_mul_ps(t, mul); + t = _mm_round_ps(t, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + __m128i u = _mm_cvtps_epi32(t); + u = ojph_mm_max_epi32(u, lower_limit); + u = ojph_mm_min_epi32(u, upper_limit); + u = _mm_add_epi32(u, half); + _mm_storeu_si128((__m128i*)dp, u); + } + } + } + else + { + // There is the possibility that converting to integer will + // exceed the dynamic range of 32bit integer; therefore, we need + // to use 64 bit. One may think, why not limit the floats to the + // range of [-0.5f, 0.5f)? + // Notice the half closed range -- we need a value just below 0.5f. + // While getting this number is possible, after multiplication, the + // resulting number will not be exactly the maximum that the integer + // can achieve. All this is academic, because here are talking + // about a number which has all the exponent bits set, meaning + // it is either infinity, -infinity, qNan or sNan. + float mul = (float)(1ull << bit_depth); + const si64 upper_limit = (si64)LLONG_MAX >> (64 - bit_depth); + const si64 lower_limit = (si64)LLONG_MIN >> (64 - bit_depth); + + if (is_signed) + { + const si32 bias = (1 << (bit_depth - 1)) + 1; + for (ui32 i = width; i > 0; --i) { + si64 t = ojph_round64(*sp++ * mul); + t = ojph_max(t, lower_limit); + t = ojph_min(t, upper_limit); + si32 v = (si32)t; + v = (v >= 0) ? v : (- v - bias); + *dp++ = v; + } + } + else + { + const si32 half = (1 << (bit_depth - 1)); + for (ui32 i = width; i > 0; --i) { + si64 t = ojph_round64(*sp++ * mul); + t = ojph_max(t, lower_limit); + t = ojph_min(t, upper_limit); + si32 v = (si32)t; + *dp++ = v + half; + } + } + } + } + ///////////////////////////////////////////////////////////////////////// // https://github.com/seung-lab/dijkstra3d/blob/master/libdivide.h static inline __m128i sse2_mm_srai_epi64(__m128i a, int amt, __m128i m) @@ -250,37 +403,49 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void sse2_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul, - ui32 width) + void sse2_irv_convert_to_float_nlt_type3(const line_buf *src_line, + ui32 src_line_offset, line_buf *dst_line, + ui32 bit_depth, bool is_signed, ui32 width) { - uint32_t rounding_mode = _MM_GET_ROUNDING_MODE(); - _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); - __m128 shift = _mm_set1_ps(0.5f); - __m128 m = _mm_set1_ps(mul); - for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) + assert((src_line->flags & line_buf::LFT_32BIT) && + (src_line->flags & line_buf::LFT_INTEGER) && + (dst_line->flags & line_buf::LFT_32BIT) && + (dst_line->flags & line_buf::LFT_INTEGER) == 0); + + __m128 mul = _mm_set1_ps((float)(1.0 / 65536.0 / 65536.0)); + + const si32* sp = src_line->i32 + src_line_offset; + float* dp = dst_line->f32; + si32 shift = 32 - (si32)bit_depth; + if (is_signed) { - __m128 t = _mm_loadu_ps(sp); - __m128 s = _mm_add_ps(t, shift); - s = _mm_mul_ps(s, m); - _mm_storeu_si128((__m128i*)dp, _mm_cvtps_epi32(s)); + __m128i zero = _mm_setzero_si128(); + __m128i bias = _mm_set1_epi32(-(si32)((ui32)INT_MIN + 1)); + for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) { + __m128i t = _mm_loadu_si128((__m128i*)sp); + __m128i u = _mm_slli_epi32(t, shift); + __m128i c = _mm_cmplt_epi32(u, zero); // 0xFFFFFFFF for -ve value + __m128i neg = _mm_sub_epi32(bias, u); // - bias - value + neg = _mm_and_si128(c, neg); // keep only - bias - value + t = _mm_andnot_si128(c, u); // keep only +ve or 0 + u = _mm_or_si128(neg, t); // combine + __m128 v = _mm_cvtepi32_ps(u); + v = _mm_mul_ps(v, mul); + _mm_storeu_ps(dp, v); + } } - _MM_SET_ROUNDING_MODE(rounding_mode); - } - - ////////////////////////////////////////////////////////////////////////// - void sse2_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul, - ui32 width) - { - uint32_t rounding_mode = _MM_GET_ROUNDING_MODE(); - _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); - __m128 m = _mm_set1_ps(mul); - for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) + else { - __m128 t = _mm_loadu_ps(sp); - __m128 s = _mm_mul_ps(t, m); - _mm_storeu_si128((__m128i*)dp, _mm_cvtps_epi32(s)); + __m128 half = _mm_set1_ps(0.5f); + for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) { + __m128i t = _mm_loadu_si128((__m128i*)sp); + __m128i u = _mm_slli_epi32(t, shift); + __m128 v = _mm_cvtepi32_ps(u); + v = _mm_mul_ps(v, mul); + v = _mm_add_ps(v, half); + _mm_storeu_ps(dp, v); + } } - _MM_SET_ROUNDING_MODE(rounding_mode); } ////////////////////////////////////////////////////////////////////////// diff --git a/src/core/transform/ojph_colour_wasm.cpp b/src/core/transform/ojph_colour_wasm.cpp index 5bf6ccdd..e0a88e8e 100644 --- a/src/core/transform/ojph_colour_wasm.cpp +++ b/src/core/transform/ojph_colour_wasm.cpp @@ -260,6 +260,151 @@ namespace ojph { } } + ////////////////////////////////////////////////////////////////////////// + void wasm_irv_convert_to_integer_nlt_type3(const line_buf *src_line, + line_buf *dst_line, ui32 dst_line_offset, + ui32 bit_depth, bool is_signed, ui32 width) + { + assert((src_line->flags & line_buf::LFT_32BIT) && + (src_line->flags & line_buf::LFT_INTEGER) == 0 && + (dst_line->flags & line_buf::LFT_32BIT) && + (dst_line->flags & line_buf::LFT_INTEGER)); + + // const float* sp = src_line->f32; + // si32* dp = dst_line->i32 + dst_line_offset; + // if (bit_depth <= 30) + // { + // // We are leaving two bit overhead -- here, we are assuming that after + // // multiplications, the resulting number can still be represented + // // using 32 bit integer + // __m128 mul = _mm_set1_ps((float)(1 << bit_depth)); + // __m128i upper_limit = _mm_set1_epi32(INT_MAX >> (32 - bit_depth)); + // __m128i lower_limit = _mm_set1_epi32(INT_MIN >> (32 - bit_depth)); + + // if (is_signed) + // { + // __m128i zero = _mm_setzero_si128(); + // __m128i bias = _mm_set1_epi32(-((1 << (bit_depth - 1)) + 1)); + // for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) + // { + // __m128 t = _mm_loadu_ps(sp); + // t = _mm_mul_ps(t, mul); + // t = _mm_round_ps(t, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + // __m128i u = _mm_cvtps_epi32(t); + // u = ojph_mm_max_epi32(u, lower_limit); + // u = ojph_mm_min_epi32(u, upper_limit); + + // __m128i c = _mm_cmpgt_epi32(zero, u); //0xFFFFFFFF for -ve value + // __m128i neg = _mm_sub_epi32(bias, u); //-bias -value + // neg = _mm_and_si128(c, neg); //keep only - bias - value + // __m128i v = _mm_andnot_si128(c, u); //keep only +ve or 0 + // v = _mm_or_si128(neg, v); //combine + // _mm_storeu_si128((__m128i*)dp, v); + // } + // } + // else + // { + // __m128i half = _mm_set1_epi32(-(1 << (bit_depth - 1))); + // for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) { + // __m128 t = _mm_loadu_ps(sp); + // t = _mm_mul_ps(t, mul); + // t = _mm_round_ps(t, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + // __m128i u = _mm_cvtps_epi32(t); + // u = ojph_mm_max_epi32(u, lower_limit); + // u = ojph_mm_min_epi32(u, upper_limit); + // u = _mm_add_epi32(u, half); + // _mm_storeu_si128((__m128i*)dp, u); + // } + // } + // } + // else + // { + // // There is the possibility that converting to integer will + // // exceed the dynamic range of 32bit integer; therefore, we need + // // to use 64 bit. One may think, why not limit the floats to the + // // range of [-0.5f, 0.5f)? + // // Notice the half closed range -- we need a value just below 0.5f. + // // While getting this number is possible, after multiplication, the + // // resulting number will not be exactly the maximum that the integer + // // can achieve. All this is academic, because here are talking + // // about a number which has all the exponent bits set, meaning + // // it is either infinity, -infinity, qNan or sNan. + // float mul = (float)(1ull << bit_depth); + // const si64 upper_limit = (si64)LLONG_MAX >> (64 - bit_depth); + // const si64 lower_limit = (si64)LLONG_MIN >> (64 - bit_depth); + + // if (is_signed) + // { + // const si32 bias = (1 << (bit_depth - 1)) + 1; + // for (ui32 i = width; i > 0; --i) { + // si64 t = ojph_round64(*sp++ * mul); + // t = ojph_max(t, lower_limit); + // t = ojph_min(t, upper_limit); + // si32 v = (si32)t; + // v = (v >= 0) ? v : (- v - bias); + // *dp++ = v; + // } + // } + // else + // { + // const si32 half = (1 << (bit_depth - 1)); + // for (ui32 i = width; i > 0; --i) { + // si64 t = ojph_round64(*sp++ * mul); + // t = ojph_max(t, lower_limit); + // t = ojph_min(t, upper_limit); + // si32 v = (si32)t; + // *dp++ = v + half; + // } + // } + // } + } + + ////////////////////////////////////////////////////////////////////////// + void wasm_irv_convert_to_float_nlt_type3(const line_buf *src_line, + ui32 src_line_offset, line_buf *dst_line, + ui32 bit_depth, bool is_signed, ui32 width) + { + assert((src_line->flags & line_buf::LFT_32BIT) && + (src_line->flags & line_buf::LFT_INTEGER) && + (dst_line->flags & line_buf::LFT_32BIT) && + (dst_line->flags & line_buf::LFT_INTEGER) == 0); + + // __m128 mul = _mm_set1_ps((float)(1.0 / 65536.0 / 65536.0)); + + // const si32* sp = src_line->i32 + src_line_offset; + // float* dp = dst_line->f32; + // si32 shift = 32 - (si32)bit_depth; + // if (is_signed) + // { + // __m128i zero = _mm_setzero_si128(); + // __m128i bias = _mm_set1_epi32(-(si32)((ui32)INT_MIN + 1)); + // for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) { + // __m128i t = _mm_loadu_si128((__m128i*)sp); + // __m128i u = _mm_slli_epi32(t, shift); + // __m128i c = _mm_cmplt_epi32(u, zero); // 0xFFFFFFFF for -ve value + // __m128i neg = _mm_sub_epi32(bias, u); // - bias - value + // neg = _mm_and_si128(c, neg); // keep only - bias - value + // t = _mm_andnot_si128(c, u); // keep only +ve or 0 + // u = _mm_or_si128(neg, t); // combine + // __m128 v = _mm_cvtepi32_ps(u); + // v = _mm_mul_ps(v, mul); + // _mm_storeu_ps(dp, v); + // } + // } + // else + // { + // __m128 half = _mm_set1_ps(0.5f); + // for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) { + // __m128i t = _mm_loadu_si128((__m128i*)sp); + // __m128i u = _mm_slli_epi32(t, shift); + // __m128 v = _mm_cvtepi32_ps(u); + // v = _mm_mul_ps(v, mul); + // v = _mm_add_ps(v, half); + // _mm_storeu_ps(dp, v); + // } + // } + } + ////////////////////////////////////////////////////////////////////////// void wasm_rct_forward(const line_buf *r, const line_buf *g, @@ -267,12 +412,12 @@ namespace ojph { line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat) { - assert((y->flags & line_buf::LFT_REVERSIBLE) && - (cb->flags & line_buf::LFT_REVERSIBLE) && - (cr->flags & line_buf::LFT_REVERSIBLE) && - (r->flags & line_buf::LFT_REVERSIBLE) && - (g->flags & line_buf::LFT_REVERSIBLE) && - (b->flags & line_buf::LFT_REVERSIBLE)); + assert((y->flags & line_buf::LFT_INTEGER) && + (cb->flags & line_buf::LFT_INTEGER) && + (cr->flags & line_buf::LFT_INTEGER) && + (r->flags & line_buf::LFT_INTEGER) && + (g->flags & line_buf::LFT_INTEGER) && + (b->flags & line_buf::LFT_INTEGER)); if (y->flags & line_buf::LFT_32BIT) { @@ -357,12 +502,12 @@ namespace ojph { line_buf *r, line_buf *g, line_buf *b, ui32 repeat) { - assert((y->flags & line_buf::LFT_REVERSIBLE) && - (cb->flags & line_buf::LFT_REVERSIBLE) && - (cr->flags & line_buf::LFT_REVERSIBLE) && - (r->flags & line_buf::LFT_REVERSIBLE) && - (g->flags & line_buf::LFT_REVERSIBLE) && - (b->flags & line_buf::LFT_REVERSIBLE)); + assert((y->flags & line_buf::LFT_INTEGER) && + (cb->flags & line_buf::LFT_INTEGER) && + (cr->flags & line_buf::LFT_INTEGER) && + (r->flags & line_buf::LFT_INTEGER) && + (g->flags & line_buf::LFT_INTEGER) && + (b->flags & line_buf::LFT_INTEGER)); if (y->flags & line_buf::LFT_32BIT) { From 66286f307c90e228c3f691b355dd6362145a3d26 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Wed, 22 Jan 2025 21:10:10 +1100 Subject: [PATCH 06/20] Fixes compilation Error. --- src/core/transform/ojph_colour_sse2.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/core/transform/ojph_colour_sse2.cpp b/src/core/transform/ojph_colour_sse2.cpp index 3c467bd1..6f974e5d 100644 --- a/src/core/transform/ojph_colour_sse2.cpp +++ b/src/core/transform/ojph_colour_sse2.cpp @@ -112,6 +112,9 @@ namespace ojph { (dst_line->flags & line_buf::LFT_32BIT) && (dst_line->flags & line_buf::LFT_INTEGER)); + uint32_t rounding_mode = _MM_GET_ROUNDING_MODE(); + _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); + const float* sp = src_line->f32; si32* dp = dst_line->i32 + dst_line_offset; if (bit_depth <= 30) @@ -131,7 +134,6 @@ namespace ojph { { __m128 t = _mm_loadu_ps(sp); t = _mm_mul_ps(t, mul); - t = _mm_round_ps(t, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); __m128i u = _mm_cvtps_epi32(t); u = ojph_mm_max_epi32(u, lower_limit); u = ojph_mm_min_epi32(u, upper_limit); @@ -150,7 +152,6 @@ namespace ojph { for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) { __m128 t = _mm_loadu_ps(sp); t = _mm_mul_ps(t, mul); - t = _mm_round_ps(t, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); __m128i u = _mm_cvtps_epi32(t); u = ojph_mm_max_epi32(u, lower_limit); u = ojph_mm_min_epi32(u, upper_limit); @@ -199,6 +200,8 @@ namespace ojph { } } } + + _MM_SET_ROUNDING_MODE(rounding_mode); } ///////////////////////////////////////////////////////////////////////// From 293eacd911a7a68bf3d5c7765bf32b27ec429760 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Thu, 23 Jan 2025 08:45:35 +1100 Subject: [PATCH 07/20] Added missing path in AVX2. Not tested. --- src/core/transform/ojph_colour_avx2.cpp | 73 +++++++++++++++++------ src/core/transform/ojph_colour_sse2.cpp | 77 ++++++++++++++++++------- 2 files changed, 111 insertions(+), 39 deletions(-) diff --git a/src/core/transform/ojph_colour_avx2.cpp b/src/core/transform/ojph_colour_avx2.cpp index 80ef38b4..1c9da81e 100644 --- a/src/core/transform/ojph_colour_avx2.cpp +++ b/src/core/transform/ojph_colour_avx2.cpp @@ -236,6 +236,32 @@ namespace ojph { } } + ////////////////////////////////////////////////////////////////////////// + static inline + __m256i ojph_mm256_max_ge_epi32(__m256i a, __m256i b, __m256 x, __m256 y) + { + // We must use _CMP_NLT_UQ or _CMP_GE_OQ, _CMP_GE_OS, or _CMP_NLT_US + // It is not clear to me which to use + __m256 ct = _mm256_cmp_ps(x, y, _CMP_NLT_UQ); // 0xFFFFFFFF for x >= y + __m256i c = _mm256_castps_si256(ct); // does not generate any code + __m256i d = _mm256_and_si256(c, a); // keep only a, where x >= y + __m256i e = _mm256_andnot_si256(c, b); // keep only b, where x < y + return _mm256_or_si256(d, e); // combine + } + + ////////////////////////////////////////////////////////////////////////// + static inline + __m256i ojph_mm256_min_lt_epi32(__m256i a, __m256i b, __m256 x, __m256 y) + { + // We must use _CMP_LT_OQ or _CMP_NGE_UQ, _CMP_LT_OS, or _CMP_NGE_US + // It is not clear to me which to use + __m256 ct = _mm256_cmp_ps(x, y, _CMP_NGE_UQ); // 0xFFFFFFFF for x < y + __m256i c = _mm256_castps_si256(ct); // does not generate any code + __m256i d = _mm256_and_si256(c, a); // keep only a, where x < y + __m256i e = _mm256_andnot_si256(c, b); // keep only b, where x >= y + return _mm256_or_si256(d, e); // combine + } + ////////////////////////////////////////////////////////////////////////// void avx2_irv_convert_to_integer_nlt_type3(const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset, @@ -307,31 +333,42 @@ namespace ojph { // can achieve. All this is academic, because here are talking // about a number which has all the exponent bits set, meaning // it is either infinity, -infinity, qNan or sNan. - float mul = (float)(1ull << bit_depth); - const si64 upper_limit = (si64)LLONG_MAX >> (64 - bit_depth); - const si64 lower_limit = (si64)LLONG_MIN >> (64 - bit_depth); + si64 neg_limit = (si64)LLONG_MIN >> (64 - bit_depth); + __m256 mul = _mm256_set1_ps((float)(1 << bit_depth)); + __m256 fl_up_lim = _mm256_set1_ps(-(float)neg_limit); // val < upper + __m256 fl_low_lim = _mm256_set1_ps((float)neg_limit); // val >= lower + __m256i s32_up_lim = _mm256_set1_epi32(INT_MAX >> (32 - bit_depth)); + __m256i s32_low_lim = _mm256_set1_epi32(INT_MIN >> (32 - bit_depth)); if (is_signed) { - const si32 bias = (1 << (bit_depth - 1)) + 1; - for (ui32 i = width; i > 0; --i) { - si64 t = ojph_round64(*sp++ * mul); - t = ojph_max(t, lower_limit); - t = ojph_min(t, upper_limit); - si32 v = (si32)t; - v = (v >= 0) ? v : (- v - bias); - *dp++ = v; + __m256i zero = _mm256_setzero_si256(); + __m256i bias = _mm256_set1_epi32(-((1 << (bit_depth - 1)) + 1)); + for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) { + __m256 t = _mm256_loadu_ps(sp); + t = _mm256_mul_ps(t, mul); + __m256i u = _mm256_cvtps_epi32(t); + u = ojph_mm256_max_ge_epi32(u, s32_low_lim, t, fl_low_lim); + u = ojph_mm256_min_lt_epi32(u, s32_up_lim, t, fl_up_lim); + __m256i c = _mm256_cmpgt_epi32(zero, u); //0xFFFFFFFF for -ve value + __m256i neg = _mm256_sub_epi32(bias, u); //-bias -value + neg = _mm256_and_si256(c, neg); //keep only - bias - value + __m256i v = _mm256_andnot_si256(c, u); //keep only +ve or 0 + v = _mm256_or_si256(neg, v); //combine + _mm256_storeu_si256((__m256i*)dp, v); } } else { - const si32 half = (1 << (bit_depth - 1)); - for (ui32 i = width; i > 0; --i) { - si64 t = ojph_round64(*sp++ * mul); - t = ojph_max(t, lower_limit); - t = ojph_min(t, upper_limit); - si32 v = (si32)t; - *dp++ = v + half; + __m256i half = _mm256_set1_epi32(-(1 << (bit_depth - 1))); + for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) { + __m256 t = _mm256_loadu_ps(sp); + t = _mm256_mul_ps(t, mul); + __m256i u = _mm256_cvtps_epi32(t); + u = ojph_mm256_max_ge_epi32(u, s32_low_lim, t, fl_low_lim); + u = ojph_mm256_min_lt_epi32(u, s32_up_lim, t, fl_up_lim); + u = _mm256_add_epi32(u, half); + _mm256_storeu_si256((__m256i*)dp, u); } } } diff --git a/src/core/transform/ojph_colour_sse2.cpp b/src/core/transform/ojph_colour_sse2.cpp index 6f974e5d..e05ceb5e 100644 --- a/src/core/transform/ojph_colour_sse2.cpp +++ b/src/core/transform/ojph_colour_sse2.cpp @@ -83,7 +83,8 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - // This requires SSE4.1 + // _mm_max_epi32 requires SSE4.1, so here we implement it in SSE2 + static inline __m128i ojph_mm_max_epi32(__m128i a, __m128i b) { __m128i c = _mm_cmpgt_epi32(a, b); // 0xFFFFFFFF for a > b @@ -93,7 +94,8 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - // This requires SSE4.1 + // _mm_min_epi32 requires SSE4.1, so here we implement it in SSE2 + static inline __m128i ojph_mm_min_epi32 (__m128i a, __m128i b) { __m128i c = _mm_cmplt_epi32(a, b); // 0xFFFFFFFF for a < b @@ -102,6 +104,28 @@ namespace ojph { return _mm_or_si128(d, e); // combine } + ////////////////////////////////////////////////////////////////////////// + static inline + __m128i ojph_mm_max_ge_epi32(__m128i a, __m128i b, __m128 x, __m128 y) + { + __m128 ct = _mm_cmpge_ps(x, y); // 0xFFFFFFFF for x >= y + __m128i c = _mm_castps_si128(ct); // does not generate any code + __m128i d = _mm_and_si128(c, a); // keep only a, where x >= y + __m128i e = _mm_andnot_si128(c, b); // keep only b, where x < y + return _mm_or_si128(d, e); // combine + } + + ////////////////////////////////////////////////////////////////////////// + static inline + __m128i ojph_mm_min_lt_epi32(__m128i a, __m128i b, __m128 x, __m128 y) + { + __m128 ct = _mm_cmplt_ps(x, y); // 0xFFFFFFFF for x < y + __m128i c = _mm_castps_si128(ct); // does not generate any code + __m128i d = _mm_and_si128(c, a); // keep only a, where x < y + __m128i e = _mm_andnot_si128(c, b); // keep only b, where x >= y + return _mm_or_si128(d, e); // combine + } + ////////////////////////////////////////////////////////////////////////// void sse2_irv_convert_to_integer_nlt_type3(const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset, @@ -129,7 +153,7 @@ namespace ojph { if (is_signed) { __m128i zero = _mm_setzero_si128(); - __m128i bias = _mm_set1_epi32(-((1 << (bit_depth - 1)) + 1)); + __m128i bias = _mm_set1_epi32(-((1 << (bit_depth - 1)) + 1)); for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) { __m128 t = _mm_loadu_ps(sp); @@ -172,31 +196,42 @@ namespace ojph { // can achieve. All this is academic, because here are talking // about a number which has all the exponent bits set, meaning // it is either infinity, -infinity, qNan or sNan. - float mul = (float)(1ull << bit_depth); - const si64 upper_limit = (si64)LLONG_MAX >> (64 - bit_depth); - const si64 lower_limit = (si64)LLONG_MIN >> (64 - bit_depth); + si64 neg_limit = (si64)LLONG_MIN >> (64 - bit_depth); + __m128 mul = _mm_set1_ps((float)(1 << bit_depth)); + __m128 fl_upper_limit = _mm_set1_ps(-(float)neg_limit); // val < upper + __m128 fl_lower_limit = _mm_set1_ps( (float)neg_limit); // val >= lower + __m128i s32_upper_limit = _mm_set1_epi32(INT_MAX >> (32 - bit_depth)); + __m128i s32_lower_limit = _mm_set1_epi32(INT_MIN >> (32 - bit_depth)); if (is_signed) { - const si32 bias = (1 << (bit_depth - 1)) + 1; - for (ui32 i = width; i > 0; --i) { - si64 t = ojph_round64(*sp++ * mul); - t = ojph_max(t, lower_limit); - t = ojph_min(t, upper_limit); - si32 v = (si32)t; - v = (v >= 0) ? v : (- v - bias); - *dp++ = v; + __m128i zero = _mm_setzero_si128(); + __m128i bias = _mm_set1_epi32(-((1 << (bit_depth - 1)) + 1)); + for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) { + __m128 t = _mm_loadu_ps(sp); + t = _mm_mul_ps(t, mul); + __m128i u = _mm_cvtps_epi32(t); + u = ojph_mm_max_ge_epi32(u, s32_lower_limit, t, fl_lower_limit); + u = ojph_mm_min_lt_epi32(u, s32_upper_limit, t, fl_upper_limit); + __m128i c = _mm_cmpgt_epi32(zero, u); //0xFFFFFFFF for -ve value + __m128i neg = _mm_sub_epi32(bias, u); //-bias -value + neg = _mm_and_si128(c, neg); //keep only - bias - value + __m128i v = _mm_andnot_si128(c, u); //keep only +ve or 0 + v = _mm_or_si128(neg, v); //combine + _mm_storeu_si128((__m128i*)dp, v); } } else { - const si32 half = (1 << (bit_depth - 1)); - for (ui32 i = width; i > 0; --i) { - si64 t = ojph_round64(*sp++ * mul); - t = ojph_max(t, lower_limit); - t = ojph_min(t, upper_limit); - si32 v = (si32)t; - *dp++ = v + half; + __m128i half = _mm_set1_epi32(-(1 << (bit_depth - 1))); + for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) { + __m128 t = _mm_loadu_ps(sp); + t = _mm_mul_ps(t, mul); + __m128i u = _mm_cvtps_epi32(t); + u = ojph_mm_max_ge_epi32(u, s32_lower_limit, t, fl_lower_limit); + u = ojph_mm_min_lt_epi32(u, s32_upper_limit, t, fl_upper_limit); + u = _mm_add_epi32(u, half); + _mm_storeu_si128((__m128i*)dp, u); } } } From 78bade8caf4817f303ad2cf457a9e6a5b369b604 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Thu, 23 Jan 2025 12:05:25 +1100 Subject: [PATCH 08/20] WASM code written. Must test all SIMD now. --- src/core/transform/ojph_colour_wasm.cpp | 384 +++++++++++++----------- 1 file changed, 208 insertions(+), 176 deletions(-) diff --git a/src/core/transform/ojph_colour_wasm.cpp b/src/core/transform/ojph_colour_wasm.cpp index e0a88e8e..c0c4bbea 100644 --- a/src/core/transform/ojph_colour_wasm.cpp +++ b/src/core/transform/ojph_colour_wasm.cpp @@ -2,21 +2,21 @@ // This software is released under the 2-Clause BSD license, included // below. // -// Copyright (c) 2021, Aous Naman +// Copyright (c) 2021, Aous Naman // Copyright (c) 2021, Kakadu Software Pty Ltd, Australia // Copyright (c) 2021, The University of New South Wales, Australia -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: -// +// // 1. Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. -// +// // 2. Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. -// +// // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A @@ -35,6 +35,7 @@ // Date: 9 February 2021 //***************************************************************************/ +#include #include #include @@ -45,16 +46,16 @@ namespace ojph { namespace local { - + ////////////////////////////////////////////////////////////////////////// - void wasm_rev_convert(const line_buf *src_line, + void wasm_rev_convert(const line_buf *src_line, const ui32 src_line_offset, - line_buf *dst_line, - const ui32 dst_line_offset, + line_buf *dst_line, + const ui32 dst_line_offset, si64 shift, ui32 width) { if (src_line->flags & line_buf::LFT_32BIT) - { + { if (dst_line->flags & line_buf::LFT_32BIT) { const si32 *sp = src_line->i32 + src_line_offset; @@ -65,9 +66,9 @@ namespace ojph { v128_t s = wasm_v128_load(sp); s = wasm_i32x4_add(s, sh); wasm_v128_store(dp, s); - } + } } - else + else { const si32 *sp = src_line->i32 + src_line_offset; si64 *dp = dst_line->i64 + dst_line_offset; @@ -76,18 +77,18 @@ namespace ojph { { v128_t s, t; s = wasm_v128_load(sp); - + t = wasm_i64x2_extend_low_i32x4(s); t = wasm_i64x2_add(t, sh); wasm_v128_store(dp, t); - + t = wasm_i64x2_extend_high_i32x4(s); t = wasm_i64x2_add(t, sh); wasm_v128_store(dp + 2, t); - } + } } } - else + else { assert(src_line->flags | line_buf::LFT_64BIT); assert(dst_line->flags | line_buf::LFT_32BIT); @@ -103,19 +104,19 @@ namespace ojph { s1 = wasm_i64x2_add(s1, sh); s0 = wasm_i32x4_shuffle(s0, s1, 0, 2, 4 + 0, 4 + 2); wasm_v128_store(dp, s0); - } + } } } ////////////////////////////////////////////////////////////////////////// - void wasm_rev_convert_nlt_type3(const line_buf *src_line, - const ui32 src_line_offset, - line_buf *dst_line, - const ui32 dst_line_offset, + void wasm_rev_convert_nlt_type3(const line_buf *src_line, + const ui32 src_line_offset, + line_buf *dst_line, + const ui32 dst_line_offset, si64 shift, ui32 width) { if (src_line->flags & line_buf::LFT_32BIT) - { + { if (dst_line->flags & line_buf::LFT_32BIT) { const si32 *sp = src_line->i32 + src_line_offset; @@ -126,14 +127,14 @@ namespace ojph { { v128_t s = wasm_v128_load(sp); v128_t c = wasm_i32x4_lt(s, zero); // 0xFFFFFFFF for -ve value - v128_t v_m_sh = wasm_i32x4_sub(sh, s); // - shift - value + v128_t v_m_sh = wasm_i32x4_sub(sh, s); // - shift - value v_m_sh = wasm_v128_and(c, v_m_sh); // keep only - shift - value s = wasm_v128_andnot(c, s); // keep only +ve or 0 s = wasm_v128_or(s, v_m_sh); // combine wasm_v128_store(dp, s); } } - else + else { const si32 *sp = src_line->i32 + src_line_offset; si64 *dp = dst_line->i64 + dst_line_offset; @@ -146,7 +147,7 @@ namespace ojph { u = wasm_i64x2_extend_low_i32x4(s); c = wasm_i64x2_lt(u, zero); // 64b -1 for -ve value - v_m_sh = wasm_i64x2_sub(sh, u); // - shift - value + v_m_sh = wasm_i64x2_sub(sh, u); // - shift - value v_m_sh = wasm_v128_and(c, v_m_sh); // keep only - shift - value u = wasm_v128_andnot(c, u); // keep only +ve or 0 u = wasm_v128_or(u, v_m_sh); // combine @@ -155,7 +156,7 @@ namespace ojph { u = wasm_i64x2_extend_high_i32x4(s); c = wasm_i64x2_lt(u, zero); // 64b -1 for -ve value - v_m_sh = wasm_i64x2_sub(sh, u); // - shift - value + v_m_sh = wasm_i64x2_sub(sh, u); // - shift - value v_m_sh = wasm_v128_and(c, v_m_sh); // keep only - shift - value u = wasm_v128_andnot(c, u); // keep only +ve or 0 u = wasm_v128_or(u, v_m_sh); // combine @@ -164,7 +165,7 @@ namespace ojph { } } } - else + else { assert(src_line->flags | line_buf::LFT_64BIT); assert(dst_line->flags | line_buf::LFT_32BIT); @@ -261,7 +262,27 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void wasm_irv_convert_to_integer_nlt_type3(const line_buf *src_line, + static inline + v128_t ojph_wasm_i32x4_max_ge(v128_t a, v128_t b, v128_t x, v128_t y) + { + v128_t c = wasm_i32x4_ge(x, y); // 0xFFFFFFFF for x >= y + v128_t d = wasm_v128_and(c, a); // keep only a, where x >= y + v128_t e = wasm_v128_andnot(c, b); // keep only b, where x < y + return wasm_v128_or(d, e); // combine + } + + ////////////////////////////////////////////////////////////////////////// + static inline + v128_t ojph_wasm_i32x4_min_lt(v128_t a, v128_t b, v128_t x, v128_t y) + { + v128_t c = wasm_i32x4_lt(x, y); // 0xFFFFFFFF for x < y + v128_t d = wasm_v128_and(c, a); // keep only a, where x < y + v128_t e = wasm_v128_andnot(c, b); // keep only b, where x >= y + return wasm_v128_or(d, e); // combine + } + + ////////////////////////////////////////////////////////////////////////// + void wasm_irv_convert_to_integer_nlt_type3(const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset, ui32 bit_depth, bool is_signed, ui32 width) { @@ -269,99 +290,110 @@ namespace ojph { (src_line->flags & line_buf::LFT_INTEGER) == 0 && (dst_line->flags & line_buf::LFT_32BIT) && (dst_line->flags & line_buf::LFT_INTEGER)); - - // const float* sp = src_line->f32; - // si32* dp = dst_line->i32 + dst_line_offset; - // if (bit_depth <= 30) - // { - // // We are leaving two bit overhead -- here, we are assuming that after - // // multiplications, the resulting number can still be represented - // // using 32 bit integer - // __m128 mul = _mm_set1_ps((float)(1 << bit_depth)); - // __m128i upper_limit = _mm_set1_epi32(INT_MAX >> (32 - bit_depth)); - // __m128i lower_limit = _mm_set1_epi32(INT_MIN >> (32 - bit_depth)); + + // rounding mode is always set to _MM_ROUND_NEAREST + + const float* sp = src_line->f32; + si32* dp = dst_line->i32 + dst_line_offset; + if (bit_depth <= 30) + { + // We are leaving two bit overhead -- here, we are assuming that after + // multiplications, the resulting number can still be represented + // using 32 bit integer + v128_t mul = wasm_f32x4_splat((float)(1 << bit_depth)); + v128_t upper_limit = wasm_i32x4_splat(INT_MAX >> (32 - bit_depth)); + v128_t lower_limit = wasm_i32x4_splat(INT_MIN >> (32 - bit_depth)); - // if (is_signed) - // { - // __m128i zero = _mm_setzero_si128(); - // __m128i bias = _mm_set1_epi32(-((1 << (bit_depth - 1)) + 1)); - // for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) - // { - // __m128 t = _mm_loadu_ps(sp); - // t = _mm_mul_ps(t, mul); - // t = _mm_round_ps(t, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - // __m128i u = _mm_cvtps_epi32(t); - // u = ojph_mm_max_epi32(u, lower_limit); - // u = ojph_mm_min_epi32(u, upper_limit); - - // __m128i c = _mm_cmpgt_epi32(zero, u); //0xFFFFFFFF for -ve value - // __m128i neg = _mm_sub_epi32(bias, u); //-bias -value - // neg = _mm_and_si128(c, neg); //keep only - bias - value - // __m128i v = _mm_andnot_si128(c, u); //keep only +ve or 0 - // v = _mm_or_si128(neg, v); //combine - // _mm_storeu_si128((__m128i*)dp, v); - // } - // } - // else - // { - // __m128i half = _mm_set1_epi32(-(1 << (bit_depth - 1))); - // for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) { - // __m128 t = _mm_loadu_ps(sp); - // t = _mm_mul_ps(t, mul); - // t = _mm_round_ps(t, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - // __m128i u = _mm_cvtps_epi32(t); - // u = ojph_mm_max_epi32(u, lower_limit); - // u = ojph_mm_min_epi32(u, upper_limit); - // u = _mm_add_epi32(u, half); - // _mm_storeu_si128((__m128i*)dp, u); - // } - // } - // } - // else - // { - // // There is the possibility that converting to integer will - // // exceed the dynamic range of 32bit integer; therefore, we need - // // to use 64 bit. One may think, why not limit the floats to the - // // range of [-0.5f, 0.5f)? - // // Notice the half closed range -- we need a value just below 0.5f. - // // While getting this number is possible, after multiplication, the - // // resulting number will not be exactly the maximum that the integer - // // can achieve. All this is academic, because here are talking - // // about a number which has all the exponent bits set, meaning - // // it is either infinity, -infinity, qNan or sNan. - // float mul = (float)(1ull << bit_depth); - // const si64 upper_limit = (si64)LLONG_MAX >> (64 - bit_depth); - // const si64 lower_limit = (si64)LLONG_MIN >> (64 - bit_depth); - - // if (is_signed) - // { - // const si32 bias = (1 << (bit_depth - 1)) + 1; - // for (ui32 i = width; i > 0; --i) { - // si64 t = ojph_round64(*sp++ * mul); - // t = ojph_max(t, lower_limit); - // t = ojph_min(t, upper_limit); - // si32 v = (si32)t; - // v = (v >= 0) ? v : (- v - bias); - // *dp++ = v; - // } - // } - // else - // { - // const si32 half = (1 << (bit_depth - 1)); - // for (ui32 i = width; i > 0; --i) { - // si64 t = ojph_round64(*sp++ * mul); - // t = ojph_max(t, lower_limit); - // t = ojph_min(t, upper_limit); - // si32 v = (si32)t; - // *dp++ = v + half; - // } - // } - // } + if (is_signed) + { + v128_t zero = wasm_i32x4_splat(0); + v128_t bias = wasm_i32x4_splat(-((1 << (bit_depth - 1)) + 1)); + for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) + { + v128_t t = wasm_v128_load(sp); + t = wasm_f32x4_mul(t, mul); + v128_t u = wasm_i32x4_trunc_sat_f32x4(t); + u = wasm_i32x4_max(u, lower_limit); + u = wasm_i32x4_min(u, upper_limit); + + v128_t c = wasm_i32x4_gt(zero, u); //0xFFFFFFFF for -ve value + v128_t neg = wasm_i32x4_sub(bias, u); //-bias -value + neg = wasm_v128_and(c, neg); //keep only - bias - value + v128_t v = wasm_v128_andnot(c, u); //keep only +ve or 0 + v = wasm_v128_or(neg, v); //combine + wasm_v128_store(dp, v); + } + } + else + { + v128_t half = wasm_i32x4_splat(-(1 << (bit_depth - 1))); + for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) { + v128_t t = wasm_v128_load(sp); + t = wasm_f32x4_mul(t, mul); + v128_t u = wasm_i32x4_trunc_sat_f32x4(t); + u = wasm_i32x4_max(u, lower_limit); + u = wasm_i32x4_min(u, upper_limit); + u = wasm_i32x4_add(u, half); + wasm_v128_store(dp, u); + } + } + } + else + { + // There is the possibility that converting to integer will + // exceed the dynamic range of 32bit integer; therefore, we need + // to use 64 bit. One may think, why not limit the floats to the + // range of [-0.5f, 0.5f)? + // Notice the half closed range -- we need a value just below 0.5f. + // While getting this number is possible, after multiplication, the + // resulting number will not be exactly the maximum that the integer + // can achieve. All this is academic, because here are talking + // about a number which has all the exponent bits set, meaning + // it is either infinity, -infinity, qNan or sNan. + si64 neg_limit = (si64)LLONG_MIN >> (64 - bit_depth); + v128_t mul = wasm_f32x4_splat((float)(1 << bit_depth)); + v128_t fl_upper_limit = wasm_f32x4_splat(-(float)neg_limit); // val< up + v128_t fl_lower_limit = wasm_f32x4_splat( (float)neg_limit); // val>=lo + v128_t s32_upper_limit = wasm_i32x4_splat(INT_MAX >> (32 - bit_depth)); + v128_t s32_lower_limit = wasm_i32x4_splat(INT_MIN >> (32 - bit_depth)); + + if (is_signed) + { + v128_t zero = wasm_i32x4_splat(0); + v128_t bias = wasm_i32x4_splat(-((1 << (bit_depth - 1)) + 1)); + for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) { + v128_t t = wasm_v128_load(sp); + t = wasm_f32x4_mul(t, mul); + v128_t u = wasm_i32x4_trunc_sat_f32x4(t); + u = ojph_wasm_i32x4_max_ge(u, s32_lower_limit, t, fl_lower_limit); + u = ojph_wasm_i32x4_min_lt(u, s32_upper_limit, t, fl_upper_limit); + v128_t c = wasm_i32x4_gt(zero, u); //0xFFFFFFFF for -ve value + v128_t neg = wasm_i32x4_sub(bias, u); //-bias -value + neg = wasm_v128_and(c, neg); //keep only - bias - value + v128_t v = wasm_v128_andnot(c, u); //keep only +ve or 0 + v = wasm_v128_or(neg, v); //combine + wasm_v128_store(dp, v); + } + } + else + { + v128_t half = wasm_i32x4_splat(-(1 << (bit_depth - 1))); + for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) { + v128_t t = wasm_v128_load(sp); + t = wasm_f32x4_mul(t, mul); + v128_t u = wasm_i32x4_trunc_sat_f32x4(t); + u = ojph_wasm_i32x4_max_ge(u, s32_lower_limit, t, fl_lower_limit); + u = ojph_wasm_i32x4_min_lt(u, s32_upper_limit, t, fl_upper_limit); + u = wasm_i32x4_add(u, half); + wasm_v128_store(dp, u); + } + } + } } ////////////////////////////////////////////////////////////////////////// - void wasm_irv_convert_to_float_nlt_type3(const line_buf *src_line, - ui32 src_line_offset, line_buf *dst_line, + void wasm_irv_convert_to_float_nlt_type3(const line_buf *src_line, + ui32 src_line_offset, line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width) { assert((src_line->flags & line_buf::LFT_32BIT) && @@ -369,64 +401,64 @@ namespace ojph { (dst_line->flags & line_buf::LFT_32BIT) && (dst_line->flags & line_buf::LFT_INTEGER) == 0); - // __m128 mul = _mm_set1_ps((float)(1.0 / 65536.0 / 65536.0)); - - // const si32* sp = src_line->i32 + src_line_offset; - // float* dp = dst_line->f32; - // si32 shift = 32 - (si32)bit_depth; - // if (is_signed) - // { - // __m128i zero = _mm_setzero_si128(); - // __m128i bias = _mm_set1_epi32(-(si32)((ui32)INT_MIN + 1)); - // for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) { - // __m128i t = _mm_loadu_si128((__m128i*)sp); - // __m128i u = _mm_slli_epi32(t, shift); - // __m128i c = _mm_cmplt_epi32(u, zero); // 0xFFFFFFFF for -ve value - // __m128i neg = _mm_sub_epi32(bias, u); // - bias - value - // neg = _mm_and_si128(c, neg); // keep only - bias - value - // t = _mm_andnot_si128(c, u); // keep only +ve or 0 - // u = _mm_or_si128(neg, t); // combine - // __m128 v = _mm_cvtepi32_ps(u); - // v = _mm_mul_ps(v, mul); - // _mm_storeu_ps(dp, v); - // } - // } - // else - // { - // __m128 half = _mm_set1_ps(0.5f); - // for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) { - // __m128i t = _mm_loadu_si128((__m128i*)sp); - // __m128i u = _mm_slli_epi32(t, shift); - // __m128 v = _mm_cvtepi32_ps(u); - // v = _mm_mul_ps(v, mul); - // v = _mm_add_ps(v, half); - // _mm_storeu_ps(dp, v); - // } - // } + v128_t mul = wasm_f32x4_splat((float)(1.0 / 65536.0 / 65536.0)); + + const si32* sp = src_line->i32 + src_line_offset; + float* dp = dst_line->f32; + ui32 shift = (ui32)32 - bit_depth; + if (is_signed) + { + v128_t zero = wasm_i32x4_splat(0); + v128_t bias = wasm_i32x4_splat(-(si32)((ui32)INT_MIN + 1)); + for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) { + v128_t t = wasm_v128_load(sp); + v128_t u = wasm_i32x4_shl(t, shift); + v128_t c = wasm_i32x4_lt(u, zero); // 0xFFFFFFFF for -ve value + v128_t neg = wasm_i32x4_sub(bias, u); // - bias - value + neg = wasm_v128_and(c, neg); // keep only - bias - value + t = wasm_v128_andnot(c, u); // keep only +ve or 0 + u = wasm_v128_or(neg, t); // combine + v128_t v = wasm_f32x4_convert_i32x4(u); + v = wasm_f32x4_mul(v, mul); + wasm_v128_store(dp, v); + } + } + else + { + v128_t half = wasm_f32x4_splat(0.5f); + for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) { + v128_t t = wasm_v128_load(sp); + v128_t u = wasm_i32x4_shl(t, shift); + v128_t v = wasm_f32x4_convert_i32x4(u); + v = wasm_f32x4_mul(v, mul); + v = wasm_f32x4_add(v, half); + wasm_v128_store(dp, v); + } + } } ////////////////////////////////////////////////////////////////////////// - void wasm_rct_forward(const line_buf *r, - const line_buf *g, + void wasm_rct_forward(const line_buf *r, + const line_buf *g, const line_buf *b, - line_buf *y, line_buf *cb, line_buf *cr, + line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat) { assert((y->flags & line_buf::LFT_INTEGER) && - (cb->flags & line_buf::LFT_INTEGER) && + (cb->flags & line_buf::LFT_INTEGER) && (cr->flags & line_buf::LFT_INTEGER) && (r->flags & line_buf::LFT_INTEGER) && - (g->flags & line_buf::LFT_INTEGER) && + (g->flags & line_buf::LFT_INTEGER) && (b->flags & line_buf::LFT_INTEGER)); - + if (y->flags & line_buf::LFT_32BIT) { assert((y->flags & line_buf::LFT_32BIT) && - (cb->flags & line_buf::LFT_32BIT) && + (cb->flags & line_buf::LFT_32BIT) && (cr->flags & line_buf::LFT_32BIT) && (r->flags & line_buf::LFT_32BIT) && - (g->flags & line_buf::LFT_32BIT) && - (b->flags & line_buf::LFT_32BIT)); + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); const si32 *rp = r->i32, * gp = g->i32, * bp = b->i32; si32 *yp = y->i32, * cbp = cb->i32, * crp = cr->i32; @@ -447,13 +479,13 @@ namespace ojph { yp += 4; cbp += 4; crp += 4; } } - else + else { assert((y->flags & line_buf::LFT_64BIT) && - (cb->flags & line_buf::LFT_64BIT) && + (cb->flags & line_buf::LFT_64BIT) && (cr->flags & line_buf::LFT_64BIT) && (r->flags & line_buf::LFT_32BIT) && - (g->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && (b->flags & line_buf::LFT_32BIT)); const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64; @@ -466,7 +498,7 @@ namespace ojph { mr = wasm_i64x2_extend_low_i32x4(mr32); mg = wasm_i64x2_extend_low_i32x4(mg32); mb = wasm_i64x2_extend_low_i32x4(mb32); - + t = wasm_i64x2_add(mr, mb); t = wasm_i64x2_add(t, wasm_i64x2_shl(mg, 1)); wasm_v128_store(yp, wasm_i64x2_shr(t, 2)); @@ -480,7 +512,7 @@ namespace ojph { mr = wasm_i64x2_extend_high_i32x4(mr32); mg = wasm_i64x2_extend_high_i32x4(mg32); mb = wasm_i64x2_extend_high_i32x4(mb32); - + t = wasm_i64x2_add(mr, mb); t = wasm_i64x2_add(t, wasm_i64x2_shl(mg, 1)); wasm_v128_store(yp, wasm_i64x2_shr(t, 2)); @@ -496,26 +528,26 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void wasm_rct_backward(const line_buf *y, - const line_buf *cb, + void wasm_rct_backward(const line_buf *y, + const line_buf *cb, const line_buf *cr, - line_buf *r, line_buf *g, line_buf *b, + line_buf *r, line_buf *g, line_buf *b, ui32 repeat) { assert((y->flags & line_buf::LFT_INTEGER) && - (cb->flags & line_buf::LFT_INTEGER) && + (cb->flags & line_buf::LFT_INTEGER) && (cr->flags & line_buf::LFT_INTEGER) && (r->flags & line_buf::LFT_INTEGER) && - (g->flags & line_buf::LFT_INTEGER) && + (g->flags & line_buf::LFT_INTEGER) && (b->flags & line_buf::LFT_INTEGER)); if (y->flags & line_buf::LFT_32BIT) { assert((y->flags & line_buf::LFT_32BIT) && - (cb->flags & line_buf::LFT_32BIT) && + (cb->flags & line_buf::LFT_32BIT) && (cr->flags & line_buf::LFT_32BIT) && (r->flags & line_buf::LFT_32BIT) && - (g->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && (b->flags & line_buf::LFT_32BIT)); const si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32; si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; @@ -540,10 +572,10 @@ namespace ojph { else { assert((y->flags & line_buf::LFT_64BIT) && - (cb->flags & line_buf::LFT_64BIT) && + (cb->flags & line_buf::LFT_64BIT) && (cr->flags & line_buf::LFT_64BIT) && (r->flags & line_buf::LFT_32BIT) && - (g->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && (b->flags & line_buf::LFT_32BIT)); const si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64; si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; @@ -580,7 +612,7 @@ namespace ojph { yp += 2; cbp += 2; crp += 2; rp += 4; gp += 4; bp += 4; - } + } } } @@ -603,7 +635,7 @@ namespace ojph { wasm_v128_store(y, my); wasm_v128_store(cb, wasm_f32x4_mul(beta_cbf, wasm_f32x4_sub(mb, my))); wasm_v128_store(cr, wasm_f32x4_mul(beta_crf, wasm_f32x4_sub(mr, my))); - + r += 4; g += 4; b += 4; y += 4; cb += 4; cr += 4; } From 3527f0abe599c52ab53a3ff2ba1d93c22b1b925c Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Thu, 23 Jan 2025 13:09:39 +1100 Subject: [PATCH 09/20] Fixes function pointer assignment. --- src/core/transform/ojph_colour.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/core/transform/ojph_colour.cpp b/src/core/transform/ojph_colour.cpp index 617fc41f..44028037 100644 --- a/src/core/transform/ojph_colour.cpp +++ b/src/core/transform/ojph_colour.cpp @@ -205,6 +205,8 @@ namespace ojph { cnvrt_si32_to_float = wasm_cnvrt_si32_to_float; cnvrt_float_to_si32_shftd = wasm_cnvrt_float_to_si32_shftd; cnvrt_float_to_si32 = wasm_cnvrt_float_to_si32; + irv_convert_to_integer_nlt_type3 = wasm_irv_convert_to_integer_nlt_type3; + irv_convert_to_float_nlt_type3 = wasm_irv_convert_to_float_nlt_type3; rct_forward = wasm_rct_forward; rct_backward = wasm_rct_backward; ict_forward = wasm_ict_forward; From fd295de0ce5d5ded1a685bbbbc25d71f79cd8e1d Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Thu, 23 Jan 2025 17:15:42 +1100 Subject: [PATCH 10/20] Bug fixes, some historic. --- src/core/codestream/ojph_codeblock_fun.cpp | 2 + src/core/transform/ojph_colour_wasm.cpp | 80 +++++++++++++--------- 2 files changed, 51 insertions(+), 31 deletions(-) diff --git a/src/core/codestream/ojph_codeblock_fun.cpp b/src/core/codestream/ojph_codeblock_fun.cpp index 565744dd..cad2434a 100644 --- a/src/core/codestream/ojph_codeblock_fun.cpp +++ b/src/core/codestream/ojph_codeblock_fun.cpp @@ -276,6 +276,8 @@ namespace ojph { tx_from_cb64 = NULL; } encode_cb64 = ojph_encode_codeblock64; + bool result = initialize_block_encoder_tables(); + assert(result); ojph_unused(result); #endif // !OJPH_ENABLE_WASM_SIMD diff --git a/src/core/transform/ojph_colour_wasm.cpp b/src/core/transform/ojph_colour_wasm.cpp index c0c4bbea..10114b9b 100644 --- a/src/core/transform/ojph_colour_wasm.cpp +++ b/src/core/transform/ojph_colour_wasm.cpp @@ -47,6 +47,20 @@ namespace ojph { namespace local { + ////////////////////////////////////////////////////////////////////////// + static inline + v128_t ojph_convert_float_to_i32(v128_t a, v128_t zero, v128_t half) + { // We implement ojph_round, which is + // val + (val >= 0.0f ? 0.5f : -0.5f), where val is float + v128_t c = wasm_f32x4_ge(a, zero); // greater or equal to zero + v128_t p = wasm_f32x4_add(a, half); // for positive, add half + v128_t n = wasm_f32x4_sub(a, half); // for negative, subtract half + v128_t d = wasm_v128_and(c, p); // keep positive only + v128_t e = wasm_v128_andnot(n, c); // keep negative only + v128_t v = wasm_v128_or(d, e); // combine + return wasm_i32x4_trunc_sat_f32x4(v);// truncate (towards 0) + } + ////////////////////////////////////////////////////////////////////////// void wasm_rev_convert(const line_buf *src_line, const ui32 src_line_offset, @@ -129,7 +143,7 @@ namespace ojph { v128_t c = wasm_i32x4_lt(s, zero); // 0xFFFFFFFF for -ve value v128_t v_m_sh = wasm_i32x4_sub(sh, s); // - shift - value v_m_sh = wasm_v128_and(c, v_m_sh); // keep only - shift - value - s = wasm_v128_andnot(c, s); // keep only +ve or 0 + s = wasm_v128_andnot(s, c); // keep only +ve or 0 s = wasm_v128_or(s, v_m_sh); // combine wasm_v128_store(dp, s); } @@ -149,7 +163,7 @@ namespace ojph { c = wasm_i64x2_lt(u, zero); // 64b -1 for -ve value v_m_sh = wasm_i64x2_sub(sh, u); // - shift - value v_m_sh = wasm_v128_and(c, v_m_sh); // keep only - shift - value - u = wasm_v128_andnot(c, u); // keep only +ve or 0 + u = wasm_v128_andnot(u, c); // keep only +ve or 0 u = wasm_v128_or(u, v_m_sh); // combine wasm_v128_store(dp, u); @@ -158,7 +172,7 @@ namespace ojph { c = wasm_i64x2_lt(u, zero); // 64b -1 for -ve value v_m_sh = wasm_i64x2_sub(sh, u); // - shift - value v_m_sh = wasm_v128_and(c, v_m_sh); // keep only - shift - value - u = wasm_v128_andnot(c, u); // keep only +ve or 0 + u = wasm_v128_andnot(u, c); // keep only +ve or 0 u = wasm_v128_or(u, v_m_sh); // combine wasm_v128_store(dp + 2, u); @@ -182,14 +196,14 @@ namespace ojph { m = wasm_i64x2_lt(s, zero); // 64b -1 for -ve value tm = wasm_i64x2_sub(sh, s); // - shift - value n = wasm_v128_and(m, tm); // -ve - p = wasm_v128_andnot(m, s); // +ve + p = wasm_v128_andnot(s, m); // +ve t0 = wasm_v128_or(n, p); s = wasm_v128_load(sp + 2); m = wasm_i64x2_lt(s, zero); // 64b -1 for -ve value tm = wasm_i64x2_sub(sh, s); // - shift - value n = wasm_v128_and(m, tm); // -ve - p = wasm_v128_andnot(m, s); // +ve + p = wasm_v128_andnot(s, m); // +ve t1 = wasm_v128_or(n, p); t0 = wasm_i32x4_shuffle(t0, t1, 0, 2, 4 + 0, 4 + 2); @@ -232,16 +246,16 @@ namespace ojph { void wasm_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul, ui32 width) { - // rounding mode is always set to _MM_ROUND_NEAREST - v128_t shift = wasm_f32x4_splat(0.5f); + const v128_t zero = wasm_f32x4_splat(0.0f); + const v128_t half = wasm_f32x4_splat(0.5f); v128_t m = wasm_f32x4_splat(mul); for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) { v128_t t = wasm_v128_load(sp); - v128_t s = wasm_f32x4_add(t, shift); + v128_t s = wasm_f32x4_add(t, half); s = wasm_f32x4_mul(s, m); - s = wasm_f32x4_add(s, shift); // + 0.5 and followed by floor next - wasm_v128_store(dp, wasm_i32x4_trunc_sat_f32x4(s)); + s = wasm_f32x4_add(s, half); // + 0.5 and followed by floor next + wasm_v128_store(dp, ojph_convert_float_to_i32(s, zero, half)); } } @@ -249,15 +263,15 @@ namespace ojph { void wasm_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul, ui32 width) { - // rounding mode is always set to _MM_ROUND_NEAREST - v128_t shift = wasm_f32x4_splat(0.5f); + const v128_t zero = wasm_f32x4_splat(0.0f); + const v128_t half = wasm_f32x4_splat(0.5f); v128_t m = wasm_f32x4_splat(mul); for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) { v128_t t = wasm_v128_load(sp); v128_t s = wasm_f32x4_mul(t, m); - s = wasm_f32x4_add(s, shift); // + 0.5 and followed by floor next - wasm_v128_store(dp, wasm_i32x4_trunc_sat_f32x4(s)); + s = wasm_f32x4_add(s, half); // + 0.5 and followed by floor next + wasm_v128_store(dp, ojph_convert_float_to_i32(s, zero, half)); } } @@ -267,7 +281,7 @@ namespace ojph { { v128_t c = wasm_i32x4_ge(x, y); // 0xFFFFFFFF for x >= y v128_t d = wasm_v128_and(c, a); // keep only a, where x >= y - v128_t e = wasm_v128_andnot(c, b); // keep only b, where x < y + v128_t e = wasm_v128_andnot(b, c); // keep only b, where x < y return wasm_v128_or(d, e); // combine } @@ -277,7 +291,7 @@ namespace ojph { { v128_t c = wasm_i32x4_lt(x, y); // 0xFFFFFFFF for x < y v128_t d = wasm_v128_and(c, a); // keep only a, where x < y - v128_t e = wasm_v128_andnot(c, b); // keep only b, where x >= y + v128_t e = wasm_v128_andnot(b, c); // keep only b, where x >= y return wasm_v128_or(d, e); // combine } @@ -291,8 +305,6 @@ namespace ojph { (dst_line->flags & line_buf::LFT_32BIT) && (dst_line->flags & line_buf::LFT_INTEGER)); - // rounding mode is always set to _MM_ROUND_NEAREST - const float* sp = src_line->f32; si32* dp = dst_line->i32 + dst_line_offset; if (bit_depth <= 30) @@ -306,34 +318,37 @@ namespace ojph { if (is_signed) { - v128_t zero = wasm_i32x4_splat(0); + const v128_t zero = wasm_f32x4_splat(0.0f); + const v128_t half = wasm_f32x4_splat(0.5f); v128_t bias = wasm_i32x4_splat(-((1 << (bit_depth - 1)) + 1)); for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) { v128_t t = wasm_v128_load(sp); t = wasm_f32x4_mul(t, mul); - v128_t u = wasm_i32x4_trunc_sat_f32x4(t); + v128_t u = ojph_convert_float_to_i32(t, zero, half); u = wasm_i32x4_max(u, lower_limit); u = wasm_i32x4_min(u, upper_limit); v128_t c = wasm_i32x4_gt(zero, u); //0xFFFFFFFF for -ve value v128_t neg = wasm_i32x4_sub(bias, u); //-bias -value neg = wasm_v128_and(c, neg); //keep only - bias - value - v128_t v = wasm_v128_andnot(c, u); //keep only +ve or 0 + v128_t v = wasm_v128_andnot(u, c); //keep only +ve or 0 v = wasm_v128_or(neg, v); //combine wasm_v128_store(dp, v); } } else { - v128_t half = wasm_i32x4_splat(-(1 << (bit_depth - 1))); + const v128_t zero = wasm_f32x4_splat(0.0f); + const v128_t half = wasm_f32x4_splat(0.5f); + v128_t ihalf = wasm_i32x4_splat(-(1 << (bit_depth - 1))); for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) { v128_t t = wasm_v128_load(sp); t = wasm_f32x4_mul(t, mul); - v128_t u = wasm_i32x4_trunc_sat_f32x4(t); + v128_t u = ojph_convert_float_to_i32(t, zero, half); u = wasm_i32x4_max(u, lower_limit); u = wasm_i32x4_min(u, upper_limit); - u = wasm_i32x4_add(u, half); + u = wasm_i32x4_add(u, ihalf); wasm_v128_store(dp, u); } } @@ -359,32 +374,35 @@ namespace ojph { if (is_signed) { - v128_t zero = wasm_i32x4_splat(0); + const v128_t zero = wasm_f32x4_splat(0.0f); + const v128_t half = wasm_f32x4_splat(0.5f); v128_t bias = wasm_i32x4_splat(-((1 << (bit_depth - 1)) + 1)); for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) { v128_t t = wasm_v128_load(sp); t = wasm_f32x4_mul(t, mul); - v128_t u = wasm_i32x4_trunc_sat_f32x4(t); + v128_t u = ojph_convert_float_to_i32(t, zero, half); u = ojph_wasm_i32x4_max_ge(u, s32_lower_limit, t, fl_lower_limit); u = ojph_wasm_i32x4_min_lt(u, s32_upper_limit, t, fl_upper_limit); v128_t c = wasm_i32x4_gt(zero, u); //0xFFFFFFFF for -ve value v128_t neg = wasm_i32x4_sub(bias, u); //-bias -value neg = wasm_v128_and(c, neg); //keep only - bias - value - v128_t v = wasm_v128_andnot(c, u); //keep only +ve or 0 + v128_t v = wasm_v128_andnot(u, c); //keep only +ve or 0 v = wasm_v128_or(neg, v); //combine wasm_v128_store(dp, v); } } else { - v128_t half = wasm_i32x4_splat(-(1 << (bit_depth - 1))); + const v128_t zero = wasm_f32x4_splat(0.0f); + const v128_t half = wasm_f32x4_splat(0.5f); + v128_t ihalf = wasm_i32x4_splat(-(1 << (bit_depth - 1))); for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) { v128_t t = wasm_v128_load(sp); t = wasm_f32x4_mul(t, mul); - v128_t u = wasm_i32x4_trunc_sat_f32x4(t); + v128_t u = ojph_convert_float_to_i32(t, zero, half); u = ojph_wasm_i32x4_max_ge(u, s32_lower_limit, t, fl_lower_limit); u = ojph_wasm_i32x4_min_lt(u, s32_upper_limit, t, fl_upper_limit); - u = wasm_i32x4_add(u, half); + u = wasm_i32x4_add(u, ihalf); wasm_v128_store(dp, u); } } @@ -416,7 +434,7 @@ namespace ojph { v128_t c = wasm_i32x4_lt(u, zero); // 0xFFFFFFFF for -ve value v128_t neg = wasm_i32x4_sub(bias, u); // - bias - value neg = wasm_v128_and(c, neg); // keep only - bias - value - t = wasm_v128_andnot(c, u); // keep only +ve or 0 + t = wasm_v128_andnot(u, c); // keep only +ve or 0 u = wasm_v128_or(neg, t); // combine v128_t v = wasm_f32x4_convert_i32x4(u); v = wasm_f32x4_mul(v, mul); From 9ec50da9c515c00f2c3eae710f414f267ecaf1d7 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Thu, 23 Jan 2025 17:50:03 +1100 Subject: [PATCH 11/20] A bug fix. --- src/core/transform/ojph_colour_avx2.cpp | 2 +- src/core/transform/ojph_colour_sse2.cpp | 2 +- src/core/transform/ojph_colour_wasm.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/core/transform/ojph_colour_avx2.cpp b/src/core/transform/ojph_colour_avx2.cpp index 1c9da81e..45391370 100644 --- a/src/core/transform/ojph_colour_avx2.cpp +++ b/src/core/transform/ojph_colour_avx2.cpp @@ -414,7 +414,7 @@ namespace ojph { __m256i u = _mm256_slli_epi32(t, shift); __m256 v = _mm256_cvtepi32_ps(u); v = _mm256_mul_ps(v, mul); - v = _mm256_add_ps(v, half); + v = _mm256_sub_ps(v, half); _mm256_storeu_ps(dp, v); } } diff --git a/src/core/transform/ojph_colour_sse2.cpp b/src/core/transform/ojph_colour_sse2.cpp index e05ceb5e..208b9616 100644 --- a/src/core/transform/ojph_colour_sse2.cpp +++ b/src/core/transform/ojph_colour_sse2.cpp @@ -480,7 +480,7 @@ namespace ojph { __m128i u = _mm_slli_epi32(t, shift); __m128 v = _mm_cvtepi32_ps(u); v = _mm_mul_ps(v, mul); - v = _mm_add_ps(v, half); + v = _mm_sub_ps(v, half); _mm_storeu_ps(dp, v); } } diff --git a/src/core/transform/ojph_colour_wasm.cpp b/src/core/transform/ojph_colour_wasm.cpp index 10114b9b..8e354784 100644 --- a/src/core/transform/ojph_colour_wasm.cpp +++ b/src/core/transform/ojph_colour_wasm.cpp @@ -449,7 +449,7 @@ namespace ojph { v128_t u = wasm_i32x4_shl(t, shift); v128_t v = wasm_f32x4_convert_i32x4(u); v = wasm_f32x4_mul(v, mul); - v = wasm_f32x4_add(v, half); + v = wasm_f32x4_sub(v, half); wasm_v128_store(dp, v); } } From ef9f71304ed4161102492a68f86bf5a7eb8e2e3e Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Sat, 25 Jan 2025 10:38:04 +1100 Subject: [PATCH 12/20] Bug Fixes. --- src/core/common/ojph_arch.h | 12 -- src/core/transform/ojph_colour.cpp | 191 +++++++---------- src/core/transform/ojph_colour_avx2.cpp | 263 ++++++++++-------------- src/core/transform/ojph_colour_sse2.cpp | 254 ++++++++++------------- 4 files changed, 288 insertions(+), 432 deletions(-) diff --git a/src/core/common/ojph_arch.h b/src/core/common/ojph_arch.h index 33e434a0..29ab7a57 100644 --- a/src/core/common/ojph_arch.h +++ b/src/core/common/ojph_arch.h @@ -271,18 +271,6 @@ namespace ojph { #endif } - //////////////////////////////////////////////////////////////////////////// - static inline si64 ojph_round64(float val) - { - #ifdef OJPH_COMPILER_MSVC - return (si64)(val + (val >= 0.0f ? 0.5f : -0.5f)); - #elif (defined OJPH_COMPILER_GNUC) - return (si64)(val + (val >= 0.0f ? 0.5f : -0.5f)); - #else - return (si64)round(val); - #endif - } - //////////////////////////////////////////////////////////////////////////// static inline si32 ojph_trunc(float val) { diff --git a/src/core/transform/ojph_colour.cpp b/src/core/transform/ojph_colour.cpp index 44028037..792929b8 100644 --- a/src/core/transform/ojph_colour.cpp +++ b/src/core/transform/ojph_colour.cpp @@ -2,21 +2,21 @@ // This software is released under the 2-Clause BSD license, included // below. // -// Copyright (c) 2019, Aous Naman +// Copyright (c) 2019, Aous Naman // Copyright (c) 2019, Kakadu Software Pty Ltd, Australia // Copyright (c) 2019, The University of New South Wales, Australia -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: -// +// // 1. Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. -// +// // 2. Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. -// +// // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A @@ -53,14 +53,14 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// void (*rev_convert) - (const line_buf *src_line, const ui32 src_line_offset, - line_buf *dst_line, const ui32 dst_line_offset, + (const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, si64 shift, ui32 width) = NULL; ////////////////////////////////////////////////////////////////////////// void (*rev_convert_nlt_type3) - (const line_buf *src_line, const ui32 src_line_offset, - line_buf *dst_line, const ui32 dst_line_offset, + (const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, si64 shift, ui32 width) = NULL; ////////////////////////////////////////////////////////////////////////// @@ -70,7 +70,7 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// void (*cnvrt_si32_to_float) (const si32 *sp, float *dp, float mul, ui32 width) = NULL; - + ////////////////////////////////////////////////////////////////////////// void (*cnvrt_float_to_si32_shftd) (const float *sp, si32 *dp, float mul, ui32 width) = NULL; @@ -81,14 +81,14 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// void (*irv_convert_to_integer_nlt_type3) ( - const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset, + const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset, ui32 bit_depth, bool is_signed, ui32 width) = NULL; ////////////////////////////////////////////////////////////////////////// void (*irv_convert_to_float_nlt_type3) ( const line_buf *src_line, ui32 src_line_offset, line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width) = NULL; - + ////////////////////////////////////////////////////////////////////////// void (*rct_forward) (const line_buf* r, const line_buf* g, const line_buf* b, @@ -192,7 +192,7 @@ namespace ojph { #endif // !OJPH_DISABLE_AVX2 #elif defined(OJPH_ARCH_ARM) - + #endif // !(defined(OJPH_ARCH_X86_64) || defined(OJPH_ARCH_I386)) #endif // !OJPH_DISABLE_SIMD @@ -236,12 +236,12 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// void gen_rev_convert( - const line_buf *src_line, const ui32 src_line_offset, - line_buf *dst_line, const ui32 dst_line_offset, + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, si64 shift, ui32 width) { if (src_line->flags & line_buf::LFT_32BIT) - { + { if (dst_line->flags & line_buf::LFT_32BIT) { const si32 *sp = src_line->i32 + src_line_offset; @@ -250,7 +250,7 @@ namespace ojph { for (ui32 i = width; i > 0; --i) *dp++ = *sp++ + s; } - else + else { const si32 *sp = src_line->i32 + src_line_offset; si64 *dp = dst_line->i64 + dst_line_offset; @@ -258,7 +258,7 @@ namespace ojph { *dp++ = *sp++ + shift; } } - else + else { assert(src_line->flags & line_buf::LFT_64BIT); assert(dst_line->flags & line_buf::LFT_32BIT); @@ -271,12 +271,12 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// void gen_rev_convert_nlt_type3( - const line_buf *src_line, const ui32 src_line_offset, - line_buf *dst_line, const ui32 dst_line_offset, + const line_buf *src_line, const ui32 src_line_offset, + line_buf *dst_line, const ui32 dst_line_offset, si64 shift, ui32 width) { if (src_line->flags & line_buf::LFT_32BIT) - { + { if (dst_line->flags & line_buf::LFT_32BIT) { const si32 *sp = src_line->i32 + src_line_offset; @@ -287,7 +287,7 @@ namespace ojph { *dp++ = v >= 0 ? v : (- v - s); } } - else + else { const si32 *sp = src_line->i32 + src_line_offset; si64 *dp = dst_line->i64 + dst_line_offset; @@ -297,7 +297,7 @@ namespace ojph { } } } - else + else { assert(src_line->flags & line_buf::LFT_64BIT); assert(dst_line->flags & line_buf::LFT_32BIT); @@ -315,7 +315,7 @@ namespace ojph { ui32 width) { for (ui32 i = width; i > 0; --i) - *dp++ = (float)*sp++ * mul - 0.5f; + *dp++ = (float)(ui32)*sp++ * mul - 0.5f; } ////////////////////////////////////////////////////////////////////////// @@ -331,7 +331,7 @@ namespace ojph { ui32 width) { for (ui32 i = width; i > 0; --i) - *dp++ = ojph_round((*sp++ + 0.5f) * mul); + *dp++ = (si32)ojph_round((*sp++ + 0.5f) * mul); } ////////////////////////////////////////////////////////////////////////// @@ -343,7 +343,7 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void gen_irv_convert_to_integer_nlt_type3(const line_buf *src_line, + void gen_irv_convert_to_integer_nlt_type3(const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset, ui32 bit_depth, bool is_signed, ui32 width) { @@ -351,85 +351,51 @@ namespace ojph { (src_line->flags & line_buf::LFT_INTEGER) == 0 && (dst_line->flags & line_buf::LFT_32BIT) && (dst_line->flags & line_buf::LFT_INTEGER)); - + + assert(bit_depth <= 32); const float* sp = src_line->f32; si32* dp = dst_line->i32 + dst_line_offset; - if (bit_depth <= 30) + // There is the possibility that converting to integer will + // exceed the dynamic range of 32bit integer; therefore, care must be + // exercised. + // We look if the floating point number is outside the half-closed + // interval [-0.5f, 0.5f). If so, we limit the resulting integer + // to the maximum/minimum that number supports. + si32 neg_limit = (si32)INT_MIN >> (32 - bit_depth); + float mul = (float)(1ull << bit_depth); + float fl_up_lim = -(float)neg_limit; // val < upper + float fl_low_lim = (float)neg_limit; // val >= lower + si32 s32_up_lim = INT_MAX >> (32 - bit_depth); + si32 s32_low_lim = INT_MIN >> (32 - bit_depth); + + if (is_signed) { - // We are leaving two bit overhead -- here, we are assuming that after - // multiplications, the resulting number can still be represented - // using 32 bit integer - float mul = (float)(1 << bit_depth); - const si32 upper_limit = INT_MAX >> (32 - bit_depth); - const si32 lower_limit = INT_MIN >> (32 - bit_depth); - - if (is_signed) - { - const si32 bias = (1 << (bit_depth - 1)) + 1; - for (ui32 i = width; i > 0; --i) { - si32 v = ojph_round(*sp++ * mul); - v = ojph_max(v, lower_limit); - v = ojph_min(v, upper_limit); - v = (v >= 0) ? v : (- v - bias); - *dp++ = v; - } - } - else - { - const si32 half = (1 << (bit_depth - 1)); - for (ui32 i = width; i > 0; --i) { - si32 v = ojph_round(*sp++ * mul); - v = ojph_max(v, lower_limit); - v = ojph_min(v, upper_limit); - *dp++ = v + half; - } + const si32 bias = (1 << (bit_depth - 1)) + 1; + for (ui32 i = width; i > 0; --i) { + float t = *sp++ * mul; + si32 v = ojph_round(t); + v = t >= fl_low_lim ? v : s32_low_lim; + v = t < fl_up_lim ? v : s32_up_lim; + v = (v >= 0) ? v : (- v - bias); + *dp++ = v; } } else { - // There is the possibility that converting to integer will - // exceed the dynamic range of 32bit integer; therefore, we need - // to use 64 bit. One may think, why not limit the floats to the - // range of [-0.5f, 0.5f)? - // Notice the half closed range -- we need a value just below 0.5f. - // While getting this number is possible, after multiplication, the - // resulting number will not be exactly the maximum that the integer - // can achieve. All this is academic, because here are talking - // about a number which has all the exponent bits set, meaning - // it is either infinity, -infinity, qNan or sNan. - float mul = (float)(1ull << bit_depth); - const si64 upper_limit = (si64)LLONG_MAX >> (64 - bit_depth); - const si64 lower_limit = (si64)LLONG_MIN >> (64 - bit_depth); - - if (is_signed) - { - const si32 bias = (1 << (bit_depth - 1)) + 1; - for (ui32 i = width; i > 0; --i) { - si64 t = ojph_round64(*sp++ * mul); - t = ojph_max(t, lower_limit); - t = ojph_min(t, upper_limit); - si32 v = (si32)t; - v = (v >= 0) ? v : (- v - bias); - *dp++ = v; - } - } - else - { - const si32 half = (1 << (bit_depth - 1)); - for (ui32 i = width; i > 0; --i) { - si64 t = ojph_round64(*sp++ * mul); - t = ojph_max(t, lower_limit); - t = ojph_min(t, upper_limit); - si32 v = (si32)t; - *dp++ = v + half; - } + const si32 half = (1 << (bit_depth - 1)); + for (ui32 i = width; i > 0; --i) { + float t = *sp++ * mul; + si32 v = ojph_round(t); + v = t >= fl_low_lim ? v : s32_low_lim; + v = t < fl_up_lim ? v : s32_up_lim; + *dp++ = v + half; } } } ////////////////////////////////////////////////////////////////////////// - void gen_irv_convert_to_float_nlt_type3(const line_buf *src_line, - ui32 src_line_offset, line_buf *dst_line, + void gen_irv_convert_to_float_nlt_type3(const line_buf *src_line, + ui32 src_line_offset, line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width) { assert((src_line->flags & line_buf::LFT_32BIT) && @@ -453,9 +419,12 @@ namespace ojph { } else { + const si32 half = INT_MIN; for (ui32 i = width; i > 0; --i) { - si32 v = *sp++ << shift; - *dp++ = (float)v * mul - 0.5f; + si32 v = *sp++; + v <<= shift; + v -= half; + *dp++ = (float)v * mul; } } } @@ -466,20 +435,20 @@ namespace ojph { line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat) { assert((y->flags & line_buf::LFT_INTEGER) && - (cb->flags & line_buf::LFT_INTEGER) && + (cb->flags & line_buf::LFT_INTEGER) && (cr->flags & line_buf::LFT_INTEGER) && (r->flags & line_buf::LFT_INTEGER) && - (g->flags & line_buf::LFT_INTEGER) && + (g->flags & line_buf::LFT_INTEGER) && (b->flags & line_buf::LFT_INTEGER)); - + if (y->flags & line_buf::LFT_32BIT) { assert((y->flags & line_buf::LFT_32BIT) && - (cb->flags & line_buf::LFT_32BIT) && + (cb->flags & line_buf::LFT_32BIT) && (cr->flags & line_buf::LFT_32BIT) && (r->flags & line_buf::LFT_32BIT) && - (g->flags & line_buf::LFT_32BIT) && - (b->flags & line_buf::LFT_32BIT)); + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); const si32 *rp = r->i32, * gp = g->i32, * bp = b->i32; si32 *yp = y->i32, * cbp = cb->i32, * crp = cr->i32; for (ui32 i = repeat; i > 0; --i) @@ -490,13 +459,13 @@ namespace ojph { *crp++ = (rr - gg); } } - else + else { assert((y->flags & line_buf::LFT_64BIT) && - (cb->flags & line_buf::LFT_64BIT) && + (cb->flags & line_buf::LFT_64BIT) && (cr->flags & line_buf::LFT_64BIT) && (r->flags & line_buf::LFT_32BIT) && - (g->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && (b->flags & line_buf::LFT_32BIT)); const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64; @@ -516,19 +485,19 @@ namespace ojph { line_buf *r, line_buf *g, line_buf *b, ui32 repeat) { assert((y->flags & line_buf::LFT_INTEGER) && - (cb->flags & line_buf::LFT_INTEGER) && + (cb->flags & line_buf::LFT_INTEGER) && (cr->flags & line_buf::LFT_INTEGER) && (r->flags & line_buf::LFT_INTEGER) && - (g->flags & line_buf::LFT_INTEGER) && + (g->flags & line_buf::LFT_INTEGER) && (b->flags & line_buf::LFT_INTEGER)); if (y->flags & line_buf::LFT_32BIT) { assert((y->flags & line_buf::LFT_32BIT) && - (cb->flags & line_buf::LFT_32BIT) && + (cb->flags & line_buf::LFT_32BIT) && (cr->flags & line_buf::LFT_32BIT) && (r->flags & line_buf::LFT_32BIT) && - (g->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && (b->flags & line_buf::LFT_32BIT)); const si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32; si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; @@ -544,11 +513,11 @@ namespace ojph { else { assert((y->flags & line_buf::LFT_64BIT) && - (cb->flags & line_buf::LFT_64BIT) && + (cb->flags & line_buf::LFT_64BIT) && (cr->flags & line_buf::LFT_64BIT) && (r->flags & line_buf::LFT_32BIT) && - (g->flags & line_buf::LFT_32BIT) && - (b->flags & line_buf::LFT_32BIT)); + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); const si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64; si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; for (ui32 i = repeat; i > 0; --i) diff --git a/src/core/transform/ojph_colour_avx2.cpp b/src/core/transform/ojph_colour_avx2.cpp index 45391370..cb2bf000 100644 --- a/src/core/transform/ojph_colour_avx2.cpp +++ b/src/core/transform/ojph_colour_avx2.cpp @@ -2,21 +2,21 @@ // This software is released under the 2-Clause BSD license, included // below. // -// Copyright (c) 2019, Aous Naman +// Copyright (c) 2019, Aous Naman // Copyright (c) 2019, Kakadu Software Pty Ltd, Australia // Copyright (c) 2019, The University of New South Wales, Australia -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: -// +// // 1. Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. -// +// // 2. Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. -// +// // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A @@ -50,8 +50,8 @@ namespace ojph { ///////////////////////////////////////////////////////////////////////// // https://github.com/seung-lab/dijkstra3d/blob/master/libdivide.h - static inline - __m256i avx2_mm256_srai_epi64(__m256i a, int amt, __m256i m) + static inline + __m256i avx2_mm256_srai_epi64(__m256i a, int amt, __m256i m) { // note than m must be obtained using // __m256i m = _mm256_set1_epi64x(1ULL << (63 - amt)); @@ -62,14 +62,14 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void avx2_rev_convert(const line_buf *src_line, + void avx2_rev_convert(const line_buf *src_line, const ui32 src_line_offset, - line_buf *dst_line, - const ui32 dst_line_offset, + line_buf *dst_line, + const ui32 dst_line_offset, si64 shift, ui32 width) { if (src_line->flags & line_buf::LFT_32BIT) - { + { if (dst_line->flags & line_buf::LFT_32BIT) { const si32 *sp = src_line->i32 + src_line_offset; @@ -80,9 +80,9 @@ namespace ojph { __m256i s = _mm256_loadu_si256((__m256i*)sp); s = _mm256_add_epi32(s, sh); _mm256_storeu_si256((__m256i*)dp, s); - } + } } - else + else { const si32 *sp = src_line->i32 + src_line_offset; si64 *dp = dst_line->i64 + dst_line_offset; @@ -91,18 +91,18 @@ namespace ojph { { __m256i s, t; s = _mm256_loadu_si256((__m256i*)sp); - + t = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(s, 0)); t = _mm256_add_epi64(t, sh); _mm256_storeu_si256((__m256i*)dp, t); - + t = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(s, 1)); t = _mm256_add_epi64(t, sh); _mm256_storeu_si256((__m256i*)dp + 1, t); - } + } } } - else + else { assert(src_line->flags | line_buf::LFT_64BIT); assert(dst_line->flags | line_buf::LFT_32BIT); @@ -125,23 +125,23 @@ namespace ojph { s = _mm256_shuffle_epi32(s, _MM_SHUFFLE(2, 0, 0, 0)); s = _mm256_andnot_si256(low_bits, s); - + t = _mm256_or_si256(s, t); t = _mm256_permute4x64_epi64(t, _MM_SHUFFLE(3, 1, 2, 0)); _mm256_storeu_si256((__m256i*)dp, t); - } + } } } ////////////////////////////////////////////////////////////////////////// - void avx2_rev_convert_nlt_type3(const line_buf *src_line, - const ui32 src_line_offset, - line_buf *dst_line, - const ui32 dst_line_offset, + void avx2_rev_convert_nlt_type3(const line_buf *src_line, + const ui32 src_line_offset, + line_buf *dst_line, + const ui32 dst_line_offset, si64 shift, ui32 width) { if (src_line->flags & line_buf::LFT_32BIT) - { + { if (dst_line->flags & line_buf::LFT_32BIT) { const si32 *sp = src_line->i32 + src_line_offset; @@ -152,14 +152,14 @@ namespace ojph { { __m256i s = _mm256_loadu_si256((__m256i*)sp); __m256i c = _mm256_cmpgt_epi32(zero, s); // 0xFFFFFFFF for -ve val - __m256i v_m_sh = _mm256_sub_epi32(sh, s); // - shift - value + __m256i v_m_sh = _mm256_sub_epi32(sh, s); // - shift - value v_m_sh = _mm256_and_si256(c, v_m_sh); // keep only -shift-val s = _mm256_andnot_si256(c, s); // keep only +ve or 0 s = _mm256_or_si256(s, v_m_sh); // combine _mm256_storeu_si256((__m256i*)dp, s); } } - else + else { const si32 *sp = src_line->i32 + src_line_offset; si64 *dp = dst_line->i64 + dst_line_offset; @@ -174,7 +174,7 @@ namespace ojph { u0 = _mm256_unpacklo_epi32(s, t); // correct 64bit data c = _mm256_unpacklo_epi32(t, t); // 64bit -1 for -ve value - v_m_sh = _mm256_sub_epi64(sh, u0); // - shift - value + v_m_sh = _mm256_sub_epi64(sh, u0); // - shift - value v_m_sh = _mm256_and_si256(c, v_m_sh); // keep only - shift - value u0 = _mm256_andnot_si256(c, u0); // keep only +ve or 0 u0 = _mm256_or_si256(u0, v_m_sh); // combine @@ -182,7 +182,7 @@ namespace ojph { u1 = _mm256_unpackhi_epi32(s, t); // correct 64bit data c = _mm256_unpackhi_epi32(t, t); // 64bit -1 for -ve value - v_m_sh = _mm256_sub_epi64(sh, u1); // - shift - value + v_m_sh = _mm256_sub_epi64(sh, u1); // - shift - value v_m_sh = _mm256_and_si256(c, v_m_sh); // keep only - shift - value u1 = _mm256_andnot_si256(c, u1); // keep only +ve or 0 u1 = _mm256_or_si256(u1, v_m_sh); // combine @@ -195,7 +195,7 @@ namespace ojph { } } } - else + else { assert(src_line->flags | line_buf::LFT_64BIT); assert(dst_line->flags | line_buf::LFT_32BIT); @@ -211,7 +211,7 @@ namespace ojph { // m for mask, and tm for temp __m256i s, t, p, n, m, tm; s = _mm256_loadu_si256((__m256i*)sp); - + m = _mm256_cmpgt_epi64(zero, s); // 64b -1 for -ve value tm = _mm256_sub_epi64(sh, s); // - shift - value n = _mm256_and_si256(m, tm); // -ve @@ -263,7 +263,7 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void avx2_irv_convert_to_integer_nlt_type3(const line_buf *src_line, + void avx2_irv_convert_to_integer_nlt_type3(const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset, ui32 bit_depth, bool is_signed, ui32 width) { @@ -271,112 +271,59 @@ namespace ojph { (src_line->flags & line_buf::LFT_INTEGER) == 0 && (dst_line->flags & line_buf::LFT_32BIT) && (dst_line->flags & line_buf::LFT_INTEGER)); - + + assert(bit_depth <= 32); const float* sp = src_line->f32; si32* dp = dst_line->i32 + dst_line_offset; - if (bit_depth <= 30) + // There is the possibility that converting to integer will + // exceed the dynamic range of 32bit integer; therefore, care must be + // exercised. + // We look if the floating point number is outside the half-closed + // interval [-0.5f, 0.5f). If so, we limit the resulting integer + // to the maximum/minimum that number supports. + si32 neg_limit = (si32)INT_MIN >> (32 - bit_depth); + __m256 mul = _mm256_set1_ps((float)(1ull << bit_depth)); + __m256 fl_up_lim = _mm256_set1_ps(-(float)neg_limit); // val < upper + __m256 fl_low_lim = _mm256_set1_ps((float)neg_limit); // val >= lower + __m256i s32_up_lim = _mm256_set1_epi32(INT_MAX >> (32 - bit_depth)); + __m256i s32_low_lim = _mm256_set1_epi32(INT_MIN >> (32 - bit_depth)); + + if (is_signed) { - // We are leaving two bit overhead -- here, we are assuming that after - // multiplications, the resulting number can still be represented - // using 32 bit integer - __m256 mul = _mm256_set1_ps((float)(1 << bit_depth)); - __m256i upper_limit = _mm256_set1_epi32(INT_MAX >> (32 - bit_depth)); - __m256i lower_limit = _mm256_set1_epi32(INT_MIN >> (32 - bit_depth)); - - if (is_signed) - { - __m256i zero = _mm256_setzero_si256(); - __m256i bias = _mm256_set1_epi32(-((1 << (bit_depth - 1)) + 1)); - for (ui32 i = width; i > 0; i -= 8, sp += 8, dp += 8) - { - __m256 t = _mm256_loadu_ps(sp); - t = _mm256_mul_ps(t, mul); - t = _mm256_round_ps(t, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - __m256i u = _mm256_cvtps_epi32(t); - u = _mm256_max_epi32(u, lower_limit); - u = _mm256_min_epi32(u, upper_limit); - - __m256i c = _mm256_cmpgt_epi32(zero, u); //0xFFFFFFFF for -ve value - __m256i neg = _mm256_sub_epi32(bias, u); //-bias -value - neg = _mm256_and_si256(c, neg); //keep only - bias - value - __m256i v = _mm256_andnot_si256(c, u); //keep only +ve or 0 - v = _mm256_or_si256(neg, v); //combine - _mm256_storeu_si256((__m256i*)dp, v); - } - } - else - { - __m256i half = _mm256_set1_epi32(-(1 << (bit_depth - 1))); - for (ui32 i = width; i > 0; i -= 8, sp += 8, dp += 8) { - __m256 t = _mm256_loadu_ps(sp); - t = _mm256_mul_ps(t, mul); - t = _mm256_round_ps(t, - _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - __m256i u = _mm256_cvtps_epi32(t); - u = _mm256_max_epi32(u, lower_limit); - u = _mm256_min_epi32(u, upper_limit); - u = _mm256_add_epi32(u, half); - _mm256_storeu_si256((__m256i*)dp, u); - } + __m256i zero = _mm256_setzero_si256(); + __m256i bias = _mm256_set1_epi32(-((1 << (bit_depth - 1)) + 1)); + for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) { + __m256 t = _mm256_loadu_ps(sp); + t = _mm256_mul_ps(t, mul); + __m256i u = _mm256_cvtps_epi32(t); + u = ojph_mm256_max_ge_epi32(u, s32_low_lim, t, fl_low_lim); + u = ojph_mm256_min_lt_epi32(u, s32_up_lim, t, fl_up_lim); + __m256i c = _mm256_cmpgt_epi32(zero, u); //0xFFFFFFFF for -ve value + __m256i neg = _mm256_sub_epi32(bias, u); //-bias -value + neg = _mm256_and_si256(c, neg); //keep only - bias - value + __m256i v = _mm256_andnot_si256(c, u); //keep only +ve or 0 + v = _mm256_or_si256(neg, v); //combine + _mm256_storeu_si256((__m256i*)dp, v); } } else { - // There is the possibility that converting to integer will - // exceed the dynamic range of 32bit integer; therefore, we need - // to use 64 bit. One may think, why not limit the floats to the - // range of [-0.5f, 0.5f)? - // Notice the half closed range -- we need a value just below 0.5f. - // While getting this number is possible, after multiplication, the - // resulting number will not be exactly the maximum that the integer - // can achieve. All this is academic, because here are talking - // about a number which has all the exponent bits set, meaning - // it is either infinity, -infinity, qNan or sNan. - si64 neg_limit = (si64)LLONG_MIN >> (64 - bit_depth); - __m256 mul = _mm256_set1_ps((float)(1 << bit_depth)); - __m256 fl_up_lim = _mm256_set1_ps(-(float)neg_limit); // val < upper - __m256 fl_low_lim = _mm256_set1_ps((float)neg_limit); // val >= lower - __m256i s32_up_lim = _mm256_set1_epi32(INT_MAX >> (32 - bit_depth)); - __m256i s32_low_lim = _mm256_set1_epi32(INT_MIN >> (32 - bit_depth)); - - if (is_signed) - { - __m256i zero = _mm256_setzero_si256(); - __m256i bias = _mm256_set1_epi32(-((1 << (bit_depth - 1)) + 1)); - for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) { - __m256 t = _mm256_loadu_ps(sp); - t = _mm256_mul_ps(t, mul); - __m256i u = _mm256_cvtps_epi32(t); - u = ojph_mm256_max_ge_epi32(u, s32_low_lim, t, fl_low_lim); - u = ojph_mm256_min_lt_epi32(u, s32_up_lim, t, fl_up_lim); - __m256i c = _mm256_cmpgt_epi32(zero, u); //0xFFFFFFFF for -ve value - __m256i neg = _mm256_sub_epi32(bias, u); //-bias -value - neg = _mm256_and_si256(c, neg); //keep only - bias - value - __m256i v = _mm256_andnot_si256(c, u); //keep only +ve or 0 - v = _mm256_or_si256(neg, v); //combine - _mm256_storeu_si256((__m256i*)dp, v); - } - } - else - { - __m256i half = _mm256_set1_epi32(-(1 << (bit_depth - 1))); - for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) { - __m256 t = _mm256_loadu_ps(sp); - t = _mm256_mul_ps(t, mul); - __m256i u = _mm256_cvtps_epi32(t); - u = ojph_mm256_max_ge_epi32(u, s32_low_lim, t, fl_low_lim); - u = ojph_mm256_min_lt_epi32(u, s32_up_lim, t, fl_up_lim); - u = _mm256_add_epi32(u, half); - _mm256_storeu_si256((__m256i*)dp, u); - } + __m256i half = _mm256_set1_epi32(-(1 << (bit_depth - 1))); + for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) { + __m256 t = _mm256_loadu_ps(sp); + t = _mm256_mul_ps(t, mul); + __m256i u = _mm256_cvtps_epi32(t); + u = ojph_mm256_max_ge_epi32(u, s32_low_lim, t, fl_low_lim); + u = ojph_mm256_min_lt_epi32(u, s32_up_lim, t, fl_up_lim); + u = _mm256_add_epi32(u, half); + _mm256_storeu_si256((__m256i*)dp, u); } } } ////////////////////////////////////////////////////////////////////////// - void avx2_irv_convert_to_float_nlt_type3(const line_buf *src_line, - ui32 src_line_offset, line_buf *dst_line, + void avx2_irv_convert_to_float_nlt_type3(const line_buf *src_line, + ui32 src_line_offset, line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width) { assert((src_line->flags & line_buf::LFT_32BIT) && @@ -403,45 +350,45 @@ namespace ojph { u = _mm256_or_si256(neg, t); // combine __m256 v = _mm256_cvtepi32_ps(u); v = _mm256_mul_ps(v, mul); - _mm256_storeu_ps(dp, v); + _mm256_storeu_ps(dp, v); } } else { - __m256 half = _mm256_set1_ps(0.5f); + __m256i half = _mm256_set1_epi32(INT_MIN); for (ui32 i = width; i > 0; i -= 8, sp += 8, dp += 8) { __m256i t = _mm256_loadu_si256((__m256i*)sp); - __m256i u = _mm256_slli_epi32(t, shift); - __m256 v = _mm256_cvtepi32_ps(u); + t = _mm256_slli_epi32(t, shift); + t = _mm256_sub_epi32(t, half); + __m256 v = _mm256_cvtepi32_ps(t); v = _mm256_mul_ps(v, mul); - v = _mm256_sub_ps(v, half); _mm256_storeu_ps(dp, v); } } } ////////////////////////////////////////////////////////////////////////// - void avx2_rct_forward(const line_buf *r, - const line_buf *g, + void avx2_rct_forward(const line_buf *r, + const line_buf *g, const line_buf *b, - line_buf *y, line_buf *cb, line_buf *cr, + line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat) { assert((y->flags & line_buf::LFT_INTEGER) && - (cb->flags & line_buf::LFT_INTEGER) && + (cb->flags & line_buf::LFT_INTEGER) && (cr->flags & line_buf::LFT_INTEGER) && (r->flags & line_buf::LFT_INTEGER) && - (g->flags & line_buf::LFT_INTEGER) && + (g->flags & line_buf::LFT_INTEGER) && (b->flags & line_buf::LFT_INTEGER)); - + if (y->flags & line_buf::LFT_32BIT) { assert((y->flags & line_buf::LFT_32BIT) && - (cb->flags & line_buf::LFT_32BIT) && + (cb->flags & line_buf::LFT_32BIT) && (cr->flags & line_buf::LFT_32BIT) && (r->flags & line_buf::LFT_32BIT) && - (g->flags & line_buf::LFT_32BIT) && - (b->flags & line_buf::LFT_32BIT)); + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); const si32 *rp = r->i32, * gp = g->i32, * bp = b->i32; si32 *yp = y->i32, * cbp = cb->i32, * crp = cr->i32; for (int i = (repeat + 7) >> 3; i > 0; --i) @@ -461,13 +408,13 @@ namespace ojph { yp += 8; cbp += 8; crp += 8; } } - else + else { assert((y->flags & line_buf::LFT_64BIT) && - (cb->flags & line_buf::LFT_64BIT) && + (cb->flags & line_buf::LFT_64BIT) && (cr->flags & line_buf::LFT_64BIT) && (r->flags & line_buf::LFT_32BIT) && - (g->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && (b->flags & line_buf::LFT_32BIT)); __m256i v2 = _mm256_set1_epi64x(1ULL << (63 - 2)); const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; @@ -481,7 +428,7 @@ namespace ojph { mr = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mr32, 0)); mg = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mg32, 0)); mb = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mb32, 0)); - + t = _mm256_add_epi64(mr, mb); t = _mm256_add_epi64(t, _mm256_slli_epi64(mg, 1)); _mm256_store_si256((__m256i*)yp, avx2_mm256_srai_epi64(t, 2, v2)); @@ -495,7 +442,7 @@ namespace ojph { mr = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mr32, 1)); mg = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mg32, 1)); mb = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mb32, 1)); - + t = _mm256_add_epi64(mr, mb); t = _mm256_add_epi64(t, _mm256_slli_epi64(mg, 1)); _mm256_store_si256((__m256i*)yp, avx2_mm256_srai_epi64(t, 2, v2)); @@ -508,29 +455,29 @@ namespace ojph { yp += 4; cbp += 4; crp += 4; } } - } + } ////////////////////////////////////////////////////////////////////////// - void avx2_rct_backward(const line_buf *y, - const line_buf *cb, + void avx2_rct_backward(const line_buf *y, + const line_buf *cb, const line_buf *cr, - line_buf *r, line_buf *g, line_buf *b, + line_buf *r, line_buf *g, line_buf *b, ui32 repeat) { assert((y->flags & line_buf::LFT_INTEGER) && - (cb->flags & line_buf::LFT_INTEGER) && + (cb->flags & line_buf::LFT_INTEGER) && (cr->flags & line_buf::LFT_INTEGER) && (r->flags & line_buf::LFT_INTEGER) && - (g->flags & line_buf::LFT_INTEGER) && + (g->flags & line_buf::LFT_INTEGER) && (b->flags & line_buf::LFT_INTEGER)); if (y->flags & line_buf::LFT_32BIT) { assert((y->flags & line_buf::LFT_32BIT) && - (cb->flags & line_buf::LFT_32BIT) && + (cb->flags & line_buf::LFT_32BIT) && (cr->flags & line_buf::LFT_32BIT) && (r->flags & line_buf::LFT_32BIT) && - (g->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && (b->flags & line_buf::LFT_32BIT)); const si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32; si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; @@ -550,24 +497,24 @@ namespace ojph { yp += 8; cbp += 8; crp += 8; rp += 8; gp += 8; bp += 8; - } + } } else { assert((y->flags & line_buf::LFT_64BIT) && - (cb->flags & line_buf::LFT_64BIT) && + (cb->flags & line_buf::LFT_64BIT) && (cr->flags & line_buf::LFT_64BIT) && (r->flags & line_buf::LFT_32BIT) && - (g->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && (b->flags & line_buf::LFT_32BIT)); __m256i v2 = _mm256_set1_epi64x(1ULL << (63 - 2)); - __m256i low_bits = _mm256_set_epi64x(0, (si64)ULLONG_MAX, + __m256i low_bits = _mm256_set_epi64x(0, (si64)ULLONG_MAX, 0, (si64)ULLONG_MAX); const si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64; si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; for (int i = (repeat + 7) >> 3; i > 0; --i) { - __m256i my, mcb, mcr, tr, tg, tb; + __m256i my, mcb, mcr, tr, tg, tb; my = _mm256_load_si256((__m256i*)yp); mcb = _mm256_load_si256((__m256i*)cbp); mcr = _mm256_load_si256((__m256i*)crp); @@ -617,7 +564,7 @@ namespace ojph { yp += 4; cbp += 4; crp += 4; rp += 8; gp += 8; bp += 8; - } + } } } diff --git a/src/core/transform/ojph_colour_sse2.cpp b/src/core/transform/ojph_colour_sse2.cpp index 208b9616..f1a95447 100644 --- a/src/core/transform/ojph_colour_sse2.cpp +++ b/src/core/transform/ojph_colour_sse2.cpp @@ -2,21 +2,21 @@ // This software is released under the 2-Clause BSD license, included // below. // -// Copyright (c) 2019, Aous Naman +// Copyright (c) 2019, Aous Naman // Copyright (c) 2019, Kakadu Software Pty Ltd, Australia // Copyright (c) 2019, The University of New South Wales, Australia -// +// // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: -// +// // 1. Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. -// +// // 2. Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. -// +// // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A @@ -127,7 +127,7 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void sse2_irv_convert_to_integer_nlt_type3(const line_buf *src_line, + void sse2_irv_convert_to_integer_nlt_type3(const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset, ui32 bit_depth, bool is_signed, ui32 width) { @@ -135,104 +135,55 @@ namespace ojph { (src_line->flags & line_buf::LFT_INTEGER) == 0 && (dst_line->flags & line_buf::LFT_32BIT) && (dst_line->flags & line_buf::LFT_INTEGER)); - + + assert(bit_depth <= 32); uint32_t rounding_mode = _MM_GET_ROUNDING_MODE(); _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); const float* sp = src_line->f32; si32* dp = dst_line->i32 + dst_line_offset; - if (bit_depth <= 30) + // There is the possibility that converting to integer will + // exceed the dynamic range of 32bit integer; therefore, care must be + // exercised. + // We look if the floating point number is outside the half-closed + // interval [-0.5f, 0.5f). If so, we limit the resulting integer + // to the maximum/minimum that number supports. + si32 neg_limit = (si32)INT_MIN >> (32 - bit_depth); + __m128 mul = _mm_set1_ps((float)(1ull << bit_depth)); + __m128 fl_up_lim = _mm_set1_ps(-(float)neg_limit); // val < upper + __m128 fl_low_lim = _mm_set1_ps((float)neg_limit); // val >= lower + __m128i s32_up_lim = _mm_set1_epi32(INT_MAX >> (32 - bit_depth)); + __m128i s32_low_lim = _mm_set1_epi32(INT_MIN >> (32 - bit_depth)); + + if (is_signed) { - // We are leaving two bit overhead -- here, we are assuming that after - // multiplications, the resulting number can still be represented - // using 32 bit integer - __m128 mul = _mm_set1_ps((float)(1 << bit_depth)); - __m128i upper_limit = _mm_set1_epi32(INT_MAX >> (32 - bit_depth)); - __m128i lower_limit = _mm_set1_epi32(INT_MIN >> (32 - bit_depth)); - - if (is_signed) - { - __m128i zero = _mm_setzero_si128(); - __m128i bias = _mm_set1_epi32(-((1 << (bit_depth - 1)) + 1)); - for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) - { - __m128 t = _mm_loadu_ps(sp); - t = _mm_mul_ps(t, mul); - __m128i u = _mm_cvtps_epi32(t); - u = ojph_mm_max_epi32(u, lower_limit); - u = ojph_mm_min_epi32(u, upper_limit); - - __m128i c = _mm_cmpgt_epi32(zero, u); //0xFFFFFFFF for -ve value - __m128i neg = _mm_sub_epi32(bias, u); //-bias -value - neg = _mm_and_si128(c, neg); //keep only - bias - value - __m128i v = _mm_andnot_si128(c, u); //keep only +ve or 0 - v = _mm_or_si128(neg, v); //combine - _mm_storeu_si128((__m128i*)dp, v); - } - } - else - { - __m128i half = _mm_set1_epi32(-(1 << (bit_depth - 1))); - for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) { - __m128 t = _mm_loadu_ps(sp); - t = _mm_mul_ps(t, mul); - __m128i u = _mm_cvtps_epi32(t); - u = ojph_mm_max_epi32(u, lower_limit); - u = ojph_mm_min_epi32(u, upper_limit); - u = _mm_add_epi32(u, half); - _mm_storeu_si128((__m128i*)dp, u); - } + __m128i zero = _mm_setzero_si128(); + __m128i bias = _mm_set1_epi32(-((1 << (bit_depth - 1)) + 1)); + for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) { + __m128 t = _mm_loadu_ps(sp); + t = _mm_mul_ps(t, mul); + __m128i u = _mm_cvtps_epi32(t); + u = ojph_mm_max_ge_epi32(u, s32_low_lim, t, fl_low_lim); + u = ojph_mm_min_lt_epi32(u, s32_up_lim, t, fl_up_lim); + __m128i c = _mm_cmpgt_epi32(zero, u); //0xFFFFFFFF for -ve value + __m128i neg = _mm_sub_epi32(bias, u); //-bias -value + neg = _mm_and_si128(c, neg); //keep only - bias - value + __m128i v = _mm_andnot_si128(c, u); //keep only +ve or 0 + v = _mm_or_si128(neg, v); //combine + _mm_storeu_si128((__m128i*)dp, v); } } else { - // There is the possibility that converting to integer will - // exceed the dynamic range of 32bit integer; therefore, we need - // to use 64 bit. One may think, why not limit the floats to the - // range of [-0.5f, 0.5f)? - // Notice the half closed range -- we need a value just below 0.5f. - // While getting this number is possible, after multiplication, the - // resulting number will not be exactly the maximum that the integer - // can achieve. All this is academic, because here are talking - // about a number which has all the exponent bits set, meaning - // it is either infinity, -infinity, qNan or sNan. - si64 neg_limit = (si64)LLONG_MIN >> (64 - bit_depth); - __m128 mul = _mm_set1_ps((float)(1 << bit_depth)); - __m128 fl_upper_limit = _mm_set1_ps(-(float)neg_limit); // val < upper - __m128 fl_lower_limit = _mm_set1_ps( (float)neg_limit); // val >= lower - __m128i s32_upper_limit = _mm_set1_epi32(INT_MAX >> (32 - bit_depth)); - __m128i s32_lower_limit = _mm_set1_epi32(INT_MIN >> (32 - bit_depth)); - - if (is_signed) - { - __m128i zero = _mm_setzero_si128(); - __m128i bias = _mm_set1_epi32(-((1 << (bit_depth - 1)) + 1)); - for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) { - __m128 t = _mm_loadu_ps(sp); - t = _mm_mul_ps(t, mul); - __m128i u = _mm_cvtps_epi32(t); - u = ojph_mm_max_ge_epi32(u, s32_lower_limit, t, fl_lower_limit); - u = ojph_mm_min_lt_epi32(u, s32_upper_limit, t, fl_upper_limit); - __m128i c = _mm_cmpgt_epi32(zero, u); //0xFFFFFFFF for -ve value - __m128i neg = _mm_sub_epi32(bias, u); //-bias -value - neg = _mm_and_si128(c, neg); //keep only - bias - value - __m128i v = _mm_andnot_si128(c, u); //keep only +ve or 0 - v = _mm_or_si128(neg, v); //combine - _mm_storeu_si128((__m128i*)dp, v); - } - } - else - { - __m128i half = _mm_set1_epi32(-(1 << (bit_depth - 1))); - for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) { - __m128 t = _mm_loadu_ps(sp); - t = _mm_mul_ps(t, mul); - __m128i u = _mm_cvtps_epi32(t); - u = ojph_mm_max_ge_epi32(u, s32_lower_limit, t, fl_lower_limit); - u = ojph_mm_min_lt_epi32(u, s32_upper_limit, t, fl_upper_limit); - u = _mm_add_epi32(u, half); - _mm_storeu_si128((__m128i*)dp, u); - } + __m128i half = _mm_set1_epi32(-(1 << (bit_depth - 1))); + for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) { + __m128 t = _mm_loadu_ps(sp); + t = _mm_mul_ps(t, mul); + __m128i u = _mm_cvtps_epi32(t); + u = ojph_mm_max_ge_epi32(u, s32_low_lim, t, fl_low_lim); + u = ojph_mm_min_lt_epi32(u, s32_up_lim, t, fl_up_lim); + u = _mm_add_epi32(u, half); + _mm_storeu_si128((__m128i*)dp, u); } } @@ -241,7 +192,7 @@ namespace ojph { ///////////////////////////////////////////////////////////////////////// // https://github.com/seung-lab/dijkstra3d/blob/master/libdivide.h - static inline __m128i sse2_mm_srai_epi64(__m128i a, int amt, __m128i m) + static inline __m128i sse2_mm_srai_epi64(__m128i a, int amt, __m128i m) { // note than m must be obtained using // __m128i m = _mm_set1_epi64x(1ULL << (63 - amt)); @@ -270,14 +221,14 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void sse2_rev_convert(const line_buf *src_line, + void sse2_rev_convert(const line_buf *src_line, const ui32 src_line_offset, - line_buf *dst_line, - const ui32 dst_line_offset, + line_buf *dst_line, + const ui32 dst_line_offset, si64 shift, ui32 width) { if (src_line->flags & line_buf::LFT_32BIT) - { + { if (dst_line->flags & line_buf::LFT_32BIT) { const si32 *sp = src_line->i32 + src_line_offset; @@ -288,9 +239,9 @@ namespace ojph { __m128i s = _mm_loadu_si128((__m128i*)sp); s = _mm_add_epi32(s, sh); _mm_storeu_si128((__m128i*)dp, s); - } + } } - else + else { const si32 *sp = src_line->i32 + src_line_offset; si64 *dp = dst_line->i64 + dst_line_offset; @@ -300,18 +251,18 @@ namespace ojph { { __m128i s, t; s = _mm_loadu_si128((__m128i*)sp); - + t = sse2_cvtlo_epi32_epi64(s, zero); t = _mm_add_epi64(t, sh); _mm_storeu_si128((__m128i*)dp, t); - + t = sse2_cvthi_epi32_epi64(s, zero); t = _mm_add_epi64(t, sh); _mm_storeu_si128((__m128i*)dp + 1, t); - } + } } } - else + else { assert(src_line->flags | line_buf::LFT_64BIT); assert(dst_line->flags | line_buf::LFT_32BIT); @@ -333,22 +284,22 @@ namespace ojph { s = _mm_shuffle_epi32(s, _MM_SHUFFLE(2, 0, 0, 0)); s = _mm_andnot_si128(low_bits, s); - + t = _mm_or_si128(s, t); _mm_storeu_si128((__m128i*)dp, t); - } + } } } ////////////////////////////////////////////////////////////////////////// - void sse2_rev_convert_nlt_type3(const line_buf *src_line, - const ui32 src_line_offset, - line_buf *dst_line, - const ui32 dst_line_offset, + void sse2_rev_convert_nlt_type3(const line_buf *src_line, + const ui32 src_line_offset, + line_buf *dst_line, + const ui32 dst_line_offset, si64 shift, ui32 width) { if (src_line->flags & line_buf::LFT_32BIT) - { + { if (dst_line->flags & line_buf::LFT_32BIT) { const si32 *sp = src_line->i32 + src_line_offset; @@ -359,14 +310,14 @@ namespace ojph { { __m128i s = _mm_loadu_si128((__m128i*)sp); __m128i c = _mm_cmplt_epi32(s, zero); // 0xFFFFFFFF for -ve value - __m128i v_m_sh = _mm_sub_epi32(sh, s); // - shift - value + __m128i v_m_sh = _mm_sub_epi32(sh, s); // - shift - value v_m_sh = _mm_and_si128(c, v_m_sh); // keep only - shift - value s = _mm_andnot_si128(c, s); // keep only +ve or 0 s = _mm_or_si128(s, v_m_sh); // combine _mm_storeu_si128((__m128i*)dp, s); } } - else + else { const si32 *sp = src_line->i32 + src_line_offset; si64 *dp = dst_line->i64 + dst_line_offset; @@ -381,7 +332,7 @@ namespace ojph { u = _mm_unpacklo_epi32(s, t); // correct 64bit data c = _mm_unpacklo_epi32(t, t); // 64bit -1 for -ve value - v_m_sh = _mm_sub_epi64(sh, u); // - shift - value + v_m_sh = _mm_sub_epi64(sh, u); // - shift - value v_m_sh = _mm_and_si128(c, v_m_sh); // keep only - shift - value u = _mm_andnot_si128(c, u); // keep only +ve or 0 u = _mm_or_si128(u, v_m_sh); // combine @@ -390,7 +341,7 @@ namespace ojph { u = _mm_unpackhi_epi32(s, t); // correct 64bit data c = _mm_unpackhi_epi32(t, t); // 64bit -1 for -ve value - v_m_sh = _mm_sub_epi64(sh, u); // - shift - value + v_m_sh = _mm_sub_epi64(sh, u); // - shift - value v_m_sh = _mm_and_si128(c, v_m_sh); // keep only - shift - value u = _mm_andnot_si128(c, u); // keep only +ve or 0 u = _mm_or_si128(u, v_m_sh); // combine @@ -399,7 +350,7 @@ namespace ojph { } } } - else + else { assert(src_line->flags | line_buf::LFT_64BIT); assert(dst_line->flags | line_buf::LFT_32BIT); @@ -414,7 +365,7 @@ namespace ojph { // m for mask, and tm for temp __m128i s, t, p, n, m, tm; s = _mm_loadu_si128((__m128i*)sp); - + tm = _mm_cmplt_epi32(s, zero); // 32b -1 for -ve value m = _mm_shuffle_epi32(tm, _MM_SHUFFLE(3, 3, 1, 1)); // expand to 64b tm = _mm_sub_epi64(sh, s); // - shift - value @@ -441,8 +392,8 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void sse2_irv_convert_to_float_nlt_type3(const line_buf *src_line, - ui32 src_line_offset, line_buf *dst_line, + void sse2_irv_convert_to_float_nlt_type3(const line_buf *src_line, + ui32 src_line_offset, line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width) { assert((src_line->flags & line_buf::LFT_32BIT) && @@ -451,6 +402,7 @@ namespace ojph { (dst_line->flags & line_buf::LFT_INTEGER) == 0); __m128 mul = _mm_set1_ps((float)(1.0 / 65536.0 / 65536.0)); + float mulf = (float)(1.0 / 65536.0 / 65536.0); const si32* sp = src_line->i32 + src_line_offset; float* dp = dst_line->f32; @@ -469,45 +421,45 @@ namespace ojph { u = _mm_or_si128(neg, t); // combine __m128 v = _mm_cvtepi32_ps(u); v = _mm_mul_ps(v, mul); - _mm_storeu_ps(dp, v); + _mm_storeu_ps(dp, v); } } else { - __m128 half = _mm_set1_ps(0.5f); + __m128i half = _mm_set1_epi32(INT_MIN); for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) { __m128i t = _mm_loadu_si128((__m128i*)sp); - __m128i u = _mm_slli_epi32(t, shift); - __m128 v = _mm_cvtepi32_ps(u); + t = _mm_slli_epi32(t, shift); + t = _mm_sub_epi32(t, half); + __m128 v = _mm_cvtepi32_ps(t); v = _mm_mul_ps(v, mul); - v = _mm_sub_ps(v, half); _mm_storeu_ps(dp, v); } } } ////////////////////////////////////////////////////////////////////////// - void sse2_rct_forward(const line_buf *r, - const line_buf *g, + void sse2_rct_forward(const line_buf *r, + const line_buf *g, const line_buf *b, - line_buf *y, line_buf *cb, line_buf *cr, + line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat) { assert((y->flags & line_buf::LFT_INTEGER) && - (cb->flags & line_buf::LFT_INTEGER) && + (cb->flags & line_buf::LFT_INTEGER) && (cr->flags & line_buf::LFT_INTEGER) && (r->flags & line_buf::LFT_INTEGER) && - (g->flags & line_buf::LFT_INTEGER) && + (g->flags & line_buf::LFT_INTEGER) && (b->flags & line_buf::LFT_INTEGER)); - + if (y->flags & line_buf::LFT_32BIT) { assert((y->flags & line_buf::LFT_32BIT) && - (cb->flags & line_buf::LFT_32BIT) && + (cb->flags & line_buf::LFT_32BIT) && (cr->flags & line_buf::LFT_32BIT) && (r->flags & line_buf::LFT_32BIT) && - (g->flags & line_buf::LFT_32BIT) && - (b->flags & line_buf::LFT_32BIT)); + (g->flags & line_buf::LFT_32BIT) && + (b->flags & line_buf::LFT_32BIT)); const si32 *rp = r->i32, * gp = g->i32, * bp = b->i32; si32 *yp = y->i32, * cbp = cb->i32, * crp = cr->i32; for (int i = (repeat + 3) >> 2; i > 0; --i) @@ -527,13 +479,13 @@ namespace ojph { yp += 4; cbp += 4; crp += 4; } } - else + else { assert((y->flags & line_buf::LFT_64BIT) && - (cb->flags & line_buf::LFT_64BIT) && + (cb->flags & line_buf::LFT_64BIT) && (cr->flags & line_buf::LFT_64BIT) && (r->flags & line_buf::LFT_32BIT) && - (g->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && (b->flags & line_buf::LFT_32BIT)); __m128i zero = _mm_setzero_si128(); __m128i v2 = _mm_set1_epi64x(1ULL << (63 - 2)); @@ -548,7 +500,7 @@ namespace ojph { mr = sse2_cvtlo_epi32_epi64(mr32, zero); mg = sse2_cvtlo_epi32_epi64(mg32, zero); mb = sse2_cvtlo_epi32_epi64(mb32, zero); - + t = _mm_add_epi64(mr, mb); t = _mm_add_epi64(t, _mm_slli_epi64(mg, 1)); _mm_store_si128((__m128i*)yp, sse2_mm_srai_epi64(t, 2, v2)); @@ -562,7 +514,7 @@ namespace ojph { mr = sse2_cvthi_epi32_epi64(mr32, zero); mg = sse2_cvthi_epi32_epi64(mg32, zero); mb = sse2_cvthi_epi32_epi64(mb32, zero); - + t = _mm_add_epi64(mr, mb); t = _mm_add_epi64(t, _mm_slli_epi64(mg, 1)); _mm_store_si128((__m128i*)yp, sse2_mm_srai_epi64(t, 2, v2)); @@ -578,26 +530,26 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void sse2_rct_backward(const line_buf *y, - const line_buf *cb, + void sse2_rct_backward(const line_buf *y, + const line_buf *cb, const line_buf *cr, - line_buf *r, line_buf *g, line_buf *b, + line_buf *r, line_buf *g, line_buf *b, ui32 repeat) { assert((y->flags & line_buf::LFT_INTEGER) && - (cb->flags & line_buf::LFT_INTEGER) && + (cb->flags & line_buf::LFT_INTEGER) && (cr->flags & line_buf::LFT_INTEGER) && (r->flags & line_buf::LFT_INTEGER) && - (g->flags & line_buf::LFT_INTEGER) && + (g->flags & line_buf::LFT_INTEGER) && (b->flags & line_buf::LFT_INTEGER)); if (y->flags & line_buf::LFT_32BIT) { assert((y->flags & line_buf::LFT_32BIT) && - (cb->flags & line_buf::LFT_32BIT) && + (cb->flags & line_buf::LFT_32BIT) && (cr->flags & line_buf::LFT_32BIT) && (r->flags & line_buf::LFT_32BIT) && - (g->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && (b->flags & line_buf::LFT_32BIT)); const si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32; si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; @@ -617,15 +569,15 @@ namespace ojph { yp += 4; cbp += 4; crp += 4; rp += 4; gp += 4; bp += 4; - } + } } else { assert((y->flags & line_buf::LFT_64BIT) && - (cb->flags & line_buf::LFT_64BIT) && + (cb->flags & line_buf::LFT_64BIT) && (cr->flags & line_buf::LFT_64BIT) && (r->flags & line_buf::LFT_32BIT) && - (g->flags & line_buf::LFT_32BIT) && + (g->flags & line_buf::LFT_32BIT) && (b->flags & line_buf::LFT_32BIT)); __m128i v2 = _mm_set1_epi64x(1ULL << (63 - 2)); __m128i low_bits = _mm_set_epi64x(0, (si64)ULLONG_MAX); @@ -633,7 +585,7 @@ namespace ojph { si32 *rp = r->i32, *gp = g->i32, *bp = b->i32; for (int i = (repeat + 3) >> 2; i > 0; --i) { - __m128i my, mcb, mcr, tr, tg, tb; + __m128i my, mcb, mcr, tr, tg, tb; my = _mm_load_si128((__m128i*)yp); mcb = _mm_load_si128((__m128i*)cbp); mcr = _mm_load_si128((__m128i*)crp); @@ -678,7 +630,7 @@ namespace ojph { yp += 2; cbp += 2; crp += 2; rp += 4; gp += 4; bp += 4; - } + } } } } From 2ea19eb46e04f1783a599f42dc7e600193c30891 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Sat, 25 Jan 2025 11:55:47 +1100 Subject: [PATCH 13/20] Updated WASM SIMD --- src/core/transform/ojph_colour_wasm.cpp | 140 ++++++++---------------- 1 file changed, 44 insertions(+), 96 deletions(-) diff --git a/src/core/transform/ojph_colour_wasm.cpp b/src/core/transform/ojph_colour_wasm.cpp index 8e354784..fd08f324 100644 --- a/src/core/transform/ojph_colour_wasm.cpp +++ b/src/core/transform/ojph_colour_wasm.cpp @@ -50,7 +50,7 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// static inline v128_t ojph_convert_float_to_i32(v128_t a, v128_t zero, v128_t half) - { // We implement ojph_round, which is + { // We implement ojph_round, which is // val + (val >= 0.0f ? 0.5f : -0.5f), where val is float v128_t c = wasm_f32x4_ge(a, zero); // greater or equal to zero v128_t p = wasm_f32x4_add(a, half); // for positive, add half @@ -279,7 +279,7 @@ namespace ojph { static inline v128_t ojph_wasm_i32x4_max_ge(v128_t a, v128_t b, v128_t x, v128_t y) { - v128_t c = wasm_i32x4_ge(x, y); // 0xFFFFFFFF for x >= y + v128_t c = wasm_f32x4_ge(x, y); // 0xFFFFFFFF for x >= y v128_t d = wasm_v128_and(c, a); // keep only a, where x >= y v128_t e = wasm_v128_andnot(b, c); // keep only b, where x < y return wasm_v128_or(d, e); // combine @@ -289,7 +289,7 @@ namespace ojph { static inline v128_t ojph_wasm_i32x4_min_lt(v128_t a, v128_t b, v128_t x, v128_t y) { - v128_t c = wasm_i32x4_lt(x, y); // 0xFFFFFFFF for x < y + v128_t c = wasm_f32x4_lt(x, y); // 0xFFFFFFFF for x < y v128_t d = wasm_v128_and(c, a); // keep only a, where x < y v128_t e = wasm_v128_andnot(b, c); // keep only b, where x >= y return wasm_v128_or(d, e); // combine @@ -305,106 +305,54 @@ namespace ojph { (dst_line->flags & line_buf::LFT_32BIT) && (dst_line->flags & line_buf::LFT_INTEGER)); + assert(bit_depth <= 32); const float* sp = src_line->f32; si32* dp = dst_line->i32 + dst_line_offset; - if (bit_depth <= 30) + // There is the possibility that converting to integer will + // exceed the dynamic range of 32bit integer; therefore, care must be + // exercised. + // We look if the floating point number is outside the half-closed + // interval [-0.5f, 0.5f). If so, we limit the resulting integer + // to the maximum/minimum that number supports. + si32 neg_limit = (si32)INT_MIN >> (32 - bit_depth); + v128_t mul = wasm_f32x4_splat((float)(1ull << bit_depth)); + v128_t fl_up_lim = wasm_f32x4_splat(-(float)neg_limit); // val < upper + v128_t fl_low_lim = wasm_f32x4_splat((float)neg_limit); // val >= lower + v128_t s32_up_lim = wasm_i32x4_splat(INT_MAX >> (32 - bit_depth)); + v128_t s32_low_lim = wasm_i32x4_splat(INT_MIN >> (32 - bit_depth)); + + if (is_signed) { - // We are leaving two bit overhead -- here, we are assuming that after - // multiplications, the resulting number can still be represented - // using 32 bit integer - v128_t mul = wasm_f32x4_splat((float)(1 << bit_depth)); - v128_t upper_limit = wasm_i32x4_splat(INT_MAX >> (32 - bit_depth)); - v128_t lower_limit = wasm_i32x4_splat(INT_MIN >> (32 - bit_depth)); - - if (is_signed) - { - const v128_t zero = wasm_f32x4_splat(0.0f); - const v128_t half = wasm_f32x4_splat(0.5f); - v128_t bias = wasm_i32x4_splat(-((1 << (bit_depth - 1)) + 1)); - for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) - { - v128_t t = wasm_v128_load(sp); - t = wasm_f32x4_mul(t, mul); - v128_t u = ojph_convert_float_to_i32(t, zero, half); - u = wasm_i32x4_max(u, lower_limit); - u = wasm_i32x4_min(u, upper_limit); - - v128_t c = wasm_i32x4_gt(zero, u); //0xFFFFFFFF for -ve value - v128_t neg = wasm_i32x4_sub(bias, u); //-bias -value - neg = wasm_v128_and(c, neg); //keep only - bias - value - v128_t v = wasm_v128_andnot(u, c); //keep only +ve or 0 - v = wasm_v128_or(neg, v); //combine - wasm_v128_store(dp, v); - } - } - else - { - const v128_t zero = wasm_f32x4_splat(0.0f); - const v128_t half = wasm_f32x4_splat(0.5f); - v128_t ihalf = wasm_i32x4_splat(-(1 << (bit_depth - 1))); - for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) { - v128_t t = wasm_v128_load(sp); - t = wasm_f32x4_mul(t, mul); - v128_t u = ojph_convert_float_to_i32(t, zero, half); - u = wasm_i32x4_max(u, lower_limit); - u = wasm_i32x4_min(u, upper_limit); - u = wasm_i32x4_add(u, ihalf); - wasm_v128_store(dp, u); - } + const v128_t zero = wasm_f32x4_splat(0.0f); + const v128_t half = wasm_f32x4_splat(0.5f); + v128_t bias = wasm_i32x4_splat(-((1 << (bit_depth - 1)) + 1)); + for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) { + v128_t t = wasm_v128_load(sp); + t = wasm_f32x4_mul(t, mul); + v128_t u = ojph_convert_float_to_i32(t, zero, half); + u = ojph_wasm_i32x4_max_ge(u, s32_low_lim, t, fl_low_lim); + u = ojph_wasm_i32x4_min_lt(u, s32_up_lim, t, fl_up_lim); + v128_t c = wasm_i32x4_gt(zero, u); // 0xFFFFFFFF for -ve value + v128_t neg = wasm_i32x4_sub(bias, u); // -bias -value + neg = wasm_v128_and(c, neg); // keep only - bias - value + v128_t v = wasm_v128_andnot(u, c); // keep only +ve or 0 + v = wasm_v128_or(neg, v); // combine + wasm_v128_store(dp, v); } } else { - // There is the possibility that converting to integer will - // exceed the dynamic range of 32bit integer; therefore, we need - // to use 64 bit. One may think, why not limit the floats to the - // range of [-0.5f, 0.5f)? - // Notice the half closed range -- we need a value just below 0.5f. - // While getting this number is possible, after multiplication, the - // resulting number will not be exactly the maximum that the integer - // can achieve. All this is academic, because here are talking - // about a number which has all the exponent bits set, meaning - // it is either infinity, -infinity, qNan or sNan. - si64 neg_limit = (si64)LLONG_MIN >> (64 - bit_depth); - v128_t mul = wasm_f32x4_splat((float)(1 << bit_depth)); - v128_t fl_upper_limit = wasm_f32x4_splat(-(float)neg_limit); // val< up - v128_t fl_lower_limit = wasm_f32x4_splat( (float)neg_limit); // val>=lo - v128_t s32_upper_limit = wasm_i32x4_splat(INT_MAX >> (32 - bit_depth)); - v128_t s32_lower_limit = wasm_i32x4_splat(INT_MIN >> (32 - bit_depth)); - - if (is_signed) - { - const v128_t zero = wasm_f32x4_splat(0.0f); - const v128_t half = wasm_f32x4_splat(0.5f); - v128_t bias = wasm_i32x4_splat(-((1 << (bit_depth - 1)) + 1)); - for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) { - v128_t t = wasm_v128_load(sp); - t = wasm_f32x4_mul(t, mul); - v128_t u = ojph_convert_float_to_i32(t, zero, half); - u = ojph_wasm_i32x4_max_ge(u, s32_lower_limit, t, fl_lower_limit); - u = ojph_wasm_i32x4_min_lt(u, s32_upper_limit, t, fl_upper_limit); - v128_t c = wasm_i32x4_gt(zero, u); //0xFFFFFFFF for -ve value - v128_t neg = wasm_i32x4_sub(bias, u); //-bias -value - neg = wasm_v128_and(c, neg); //keep only - bias - value - v128_t v = wasm_v128_andnot(u, c); //keep only +ve or 0 - v = wasm_v128_or(neg, v); //combine - wasm_v128_store(dp, v); - } - } - else - { - const v128_t zero = wasm_f32x4_splat(0.0f); - const v128_t half = wasm_f32x4_splat(0.5f); - v128_t ihalf = wasm_i32x4_splat(-(1 << (bit_depth - 1))); - for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) { - v128_t t = wasm_v128_load(sp); - t = wasm_f32x4_mul(t, mul); - v128_t u = ojph_convert_float_to_i32(t, zero, half); - u = ojph_wasm_i32x4_max_ge(u, s32_lower_limit, t, fl_lower_limit); - u = ojph_wasm_i32x4_min_lt(u, s32_upper_limit, t, fl_upper_limit); - u = wasm_i32x4_add(u, ihalf); - wasm_v128_store(dp, u); - } + const v128_t zero = wasm_f32x4_splat(0.0f); + const v128_t half = wasm_f32x4_splat(0.5f); + v128_t ihalf = wasm_i32x4_splat(-(1 << (bit_depth - 1))); + for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) { + v128_t t = wasm_v128_load(sp); + t = wasm_f32x4_mul(t, mul); + v128_t u = ojph_convert_float_to_i32(t, zero, half); + u = ojph_wasm_i32x4_max_ge(u, s32_low_lim, t, fl_low_lim); + u = ojph_wasm_i32x4_min_lt(u, s32_up_lim, t, fl_up_lim); + u = wasm_i32x4_add(u, ihalf); + wasm_v128_store(dp, u); } } } From 9c22320b8a6cc52e6ef8043a5369104ecbdcd9e8 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Sat, 25 Jan 2025 12:25:01 +1100 Subject: [PATCH 14/20] Bug fixes. --- src/core/transform/ojph_colour.cpp | 6 +++--- src/core/transform/ojph_colour_wasm.cpp | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/core/transform/ojph_colour.cpp b/src/core/transform/ojph_colour.cpp index 792929b8..49107772 100644 --- a/src/core/transform/ojph_colour.cpp +++ b/src/core/transform/ojph_colour.cpp @@ -419,12 +419,12 @@ namespace ojph { } else { - const si32 half = INT_MIN; + const ui32 half = (ui32)INT_MIN; for (ui32 i = width; i > 0; --i) { - si32 v = *sp++; + ui32 v = (ui32)*sp++; v <<= shift; v -= half; - *dp++ = (float)v * mul; + *dp++ = (float)(si32)v * mul; } } } diff --git a/src/core/transform/ojph_colour_wasm.cpp b/src/core/transform/ojph_colour_wasm.cpp index fd08f324..05a06a26 100644 --- a/src/core/transform/ojph_colour_wasm.cpp +++ b/src/core/transform/ojph_colour_wasm.cpp @@ -391,13 +391,13 @@ namespace ojph { } else { - v128_t half = wasm_f32x4_splat(0.5f); + v128_t half = wasm_i32x4_splat(INT_MIN); for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) { v128_t t = wasm_v128_load(sp); v128_t u = wasm_i32x4_shl(t, shift); + u = wasm_i32x4_sub(u, half); v128_t v = wasm_f32x4_convert_i32x4(u); v = wasm_f32x4_mul(v, mul); - v = wasm_f32x4_sub(v, half); wasm_v128_store(dp, v); } } From 99e33f9a54191e88b43378579507ee5cfa736c70 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Sun, 26 Jan 2025 11:30:34 +1100 Subject: [PATCH 15/20] Adopting the NLT type3 routine design for other routines. Major change. --- src/core/codestream/ojph_tile.cpp | 69 ++++-------- src/core/transform/ojph_colour.cpp | 136 ++++++++++++------------ src/core/transform/ojph_colour.h | 23 ++-- src/core/transform/ojph_colour_avx.cpp | 60 ----------- src/core/transform/ojph_colour_avx2.cpp | 83 +++++++++++---- src/core/transform/ojph_colour_local.h | 92 ++++++---------- src/core/transform/ojph_colour_sse.cpp | 82 -------------- src/core/transform/ojph_colour_sse2.cpp | 82 ++++++++++---- src/core/transform/ojph_colour_wasm.cpp | 82 ++++++++++---- 9 files changed, 318 insertions(+), 391 deletions(-) diff --git a/src/core/codestream/ojph_tile.cpp b/src/core/codestream/ojph_tile.cpp index ae78b06c..7ac60444 100644 --- a/src/core/codestream/ojph_tile.cpp +++ b/src/core/codestream/ojph_tile.cpp @@ -287,15 +287,9 @@ namespace ojph { if (nlt_type3[comp_num] == type3) irv_convert_to_float_nlt_type3(line, line_offsets[comp_num], tc, num_bits[comp_num], is_signed[comp_num], comp_width); - else { - float mul = 1.0f / (float)(1<i32 + line_offsets[comp_num]; - float *dp = tc->f32; - if (is_signed[comp_num]) - cnvrt_si32_to_float(sp, dp, mul, comp_width); - else - cnvrt_si32_to_float_shftd(sp, dp, mul, comp_width); - } + else + irv_convert_to_float(line, line_offsets[comp_num], + tc, num_bits[comp_num], is_signed[comp_num], comp_width); } comps[comp_num].push_line(); } @@ -331,15 +325,10 @@ namespace ojph { irv_convert_to_float_nlt_type3(line, line_offsets[comp_num], lines + comp_num, num_bits[comp_num], is_signed[comp_num], comp_width); - else { - float mul = 1.0f / (float)(1<i32 + line_offsets[comp_num]; - float *dp = lines[comp_num].f32; - if (is_signed[comp_num]) - cnvrt_si32_to_float(sp, dp, mul, comp_width); - else - cnvrt_si32_to_float_shftd(sp, dp, mul, comp_width); - } + else + irv_convert_to_float(line, line_offsets[comp_num], + lines + comp_num, num_bits[comp_num], is_signed[comp_num], + comp_width); if (comp_num == 2) { // irreversible color transform ict_forward(lines[0].f32, lines[1].f32, lines[2].f32, @@ -387,20 +376,13 @@ namespace ojph { else { if (nlt_type3[comp_num] == type3) - { irv_convert_to_integer_nlt_type3(src_line, tgt_line, line_offsets[comp_num], num_bits[comp_num], is_signed[comp_num], comp_width); - } - else { - float mul = (float)(1 << num_bits[comp_num]); - const float *sp = src_line->f32; - si32 *dp = tgt_line->i32 + line_offsets[comp_num]; - if (is_signed[comp_num]) - cnvrt_float_to_si32(sp, dp, mul, comp_width); - else - cnvrt_float_to_si32_shftd(sp, dp, mul, comp_width); - } + else + irv_convert_to_integer(src_line, tgt_line, + line_offsets[comp_num], num_bits[comp_num], + is_signed[comp_num], comp_width); } } else @@ -437,30 +419,19 @@ namespace ojph { } else { + line_buf* lbp; + if (comp_num < 3) + lbp = lines + comp_num; + else + lbp = comps[comp_num].pull_line(); if (nlt_type3[comp_num] == type3) - { - line_buf* lbp; - if (comp_num < 3) - lbp = lines + comp_num; - else - lbp = comps[comp_num].pull_line(); irv_convert_to_integer_nlt_type3(lbp, tgt_line, line_offsets[comp_num], num_bits[comp_num], is_signed[comp_num], comp_width); - } - else { - float mul = (float)(1 << num_bits[comp_num]); - const float *sp; - if (comp_num < 3) - sp = lines[comp_num].f32; - else - sp = comps[comp_num].pull_line()->f32; - si32 *dp = tgt_line->i32 + line_offsets[comp_num]; - if (is_signed[comp_num]) - cnvrt_float_to_si32(sp, dp, mul, comp_width); - else - cnvrt_float_to_si32_shftd(sp, dp, mul, comp_width); - } + else + irv_convert_to_integer(lbp, tgt_line, + line_offsets[comp_num], num_bits[comp_num], + is_signed[comp_num], comp_width); } } diff --git a/src/core/transform/ojph_colour.cpp b/src/core/transform/ojph_colour.cpp index 49107772..2c559ced 100644 --- a/src/core/transform/ojph_colour.cpp +++ b/src/core/transform/ojph_colour.cpp @@ -63,21 +63,16 @@ namespace ojph { line_buf *dst_line, const ui32 dst_line_offset, si64 shift, ui32 width) = NULL; - ////////////////////////////////////////////////////////////////////////// - void (*cnvrt_si32_to_float_shftd) - (const si32 *sp, float *dp, float mul, ui32 width) = NULL; - - ////////////////////////////////////////////////////////////////////////// - void (*cnvrt_si32_to_float) - (const si32 *sp, float *dp, float mul, ui32 width) = NULL; ////////////////////////////////////////////////////////////////////////// - void (*cnvrt_float_to_si32_shftd) - (const float *sp, si32 *dp, float mul, ui32 width) = NULL; + void (*irv_convert_to_integer) ( + const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset, + ui32 bit_depth, bool is_signed, ui32 width) = NULL; ////////////////////////////////////////////////////////////////////////// - void (*cnvrt_float_to_si32) - (const float *sp, si32 *dp, float mul, ui32 width) = NULL; + void (*irv_convert_to_float) ( + const line_buf *src_line, ui32 src_line_offset, + line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width) = NULL; ////////////////////////////////////////////////////////////////////////// void (*irv_convert_to_integer_nlt_type3) ( @@ -122,12 +117,10 @@ namespace ojph { rev_convert = gen_rev_convert; rev_convert_nlt_type3 = gen_rev_convert_nlt_type3; - cnvrt_si32_to_float_shftd = gen_cnvrt_si32_to_float_shftd; - cnvrt_si32_to_float = gen_cnvrt_si32_to_float; - cnvrt_float_to_si32_shftd = gen_cnvrt_float_to_si32_shftd; - cnvrt_float_to_si32 = gen_cnvrt_float_to_si32; - irv_convert_to_float_nlt_type3 = gen_irv_convert_to_float_nlt_type3; + irv_convert_to_integer = gen_irv_convert_to_integer; + irv_convert_to_float = gen_irv_convert_to_float; irv_convert_to_integer_nlt_type3 = gen_irv_convert_to_integer_nlt_type3; + irv_convert_to_float_nlt_type3 = gen_irv_convert_to_float_nlt_type3; rct_forward = gen_rct_forward; rct_backward = gen_rct_backward; ict_forward = gen_ict_forward; @@ -140,10 +133,6 @@ namespace ojph { #ifndef OJPH_DISABLE_SSE if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_SSE) { - cnvrt_si32_to_float_shftd = sse_cnvrt_si32_to_float_shftd; - cnvrt_si32_to_float = sse_cnvrt_si32_to_float; - cnvrt_float_to_si32_shftd = sse_cnvrt_float_to_si32_shftd; - cnvrt_float_to_si32 = sse_cnvrt_float_to_si32; ict_forward = sse_ict_forward; ict_backward = sse_ict_backward; } @@ -154,8 +143,8 @@ namespace ojph { { rev_convert = sse2_rev_convert; rev_convert_nlt_type3 = sse2_rev_convert_nlt_type3; - cnvrt_float_to_si32_shftd = sse2_cnvrt_float_to_si32_shftd; - cnvrt_float_to_si32 = sse2_cnvrt_float_to_si32; + irv_convert_to_integer = sse2_irv_convert_to_integer; + irv_convert_to_float = sse2_irv_convert_to_float; irv_convert_to_integer_nlt_type3 = sse2_irv_convert_to_integer_nlt_type3; irv_convert_to_float_nlt_type3 = @@ -168,10 +157,6 @@ namespace ojph { #ifndef OJPH_DISABLE_AVX if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX) { - cnvrt_si32_to_float_shftd = avx_cnvrt_si32_to_float_shftd; - cnvrt_si32_to_float = avx_cnvrt_si32_to_float; - cnvrt_float_to_si32_shftd = avx_cnvrt_float_to_si32_shftd; - cnvrt_float_to_si32 = avx_cnvrt_float_to_si32; ict_forward = avx_ict_forward; ict_backward = avx_ict_backward; } @@ -182,6 +167,8 @@ namespace ojph { { rev_convert = avx2_rev_convert; rev_convert_nlt_type3 = avx2_rev_convert_nlt_type3; + irv_convert_to_integer = avx2_irv_convert_to_integer; + irv_convert_to_float = avx2_irv_convert_to_float; irv_convert_to_integer_nlt_type3 = avx2_irv_convert_to_integer_nlt_type3; irv_convert_to_float_nlt_type3 = @@ -201,10 +188,8 @@ namespace ojph { rev_convert = wasm_rev_convert; rev_convert_nlt_type3 = wasm_rev_convert_nlt_type3; - cnvrt_si32_to_float_shftd = wasm_cnvrt_si32_to_float_shftd; - cnvrt_si32_to_float = wasm_cnvrt_si32_to_float; - cnvrt_float_to_si32_shftd = wasm_cnvrt_float_to_si32_shftd; - cnvrt_float_to_si32 = wasm_cnvrt_float_to_si32; + irv_convert_to_integer = wasm_irv_convert_to_integer; + irv_convert_to_float = wasm_irv_convert_to_float; irv_convert_to_integer_nlt_type3 = wasm_irv_convert_to_integer_nlt_type3; irv_convert_to_float_nlt_type3 = wasm_irv_convert_to_float_nlt_type3; rct_forward = wasm_rct_forward; @@ -310,40 +295,11 @@ namespace ojph { } } - ////////////////////////////////////////////////////////////////////////// - void gen_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul, - ui32 width) - { - for (ui32 i = width; i > 0; --i) - *dp++ = (float)(ui32)*sp++ * mul - 0.5f; - } - - ////////////////////////////////////////////////////////////////////////// - void gen_cnvrt_si32_to_float(const si32 *sp, float *dp, float mul, - ui32 width) - { - for (ui32 i = width; i > 0; --i) - *dp++ = (float)*sp++ * mul; - } - - ////////////////////////////////////////////////////////////////////////// - void gen_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul, - ui32 width) - { - for (ui32 i = width; i > 0; --i) - *dp++ = (si32)ojph_round((*sp++ + 0.5f) * mul); - } - - ////////////////////////////////////////////////////////////////////////// - void gen_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul, - ui32 width) - { - for (ui32 i = width; i > 0; --i) - *dp++ = ojph_round(*sp++ * mul); - } ////////////////////////////////////////////////////////////////////////// - void gen_irv_convert_to_integer_nlt_type3(const line_buf *src_line, + template + static inline + void local_gen_irv_convert_to_integer(const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset, ui32 bit_depth, bool is_signed, ui32 width) { @@ -371,19 +327,20 @@ namespace ojph { if (is_signed) { const si32 bias = (1 << (bit_depth - 1)) + 1; - for (ui32 i = width; i > 0; --i) { + for (int i = width; i > 0; --i) { float t = *sp++ * mul; si32 v = ojph_round(t); v = t >= fl_low_lim ? v : s32_low_lim; v = t < fl_up_lim ? v : s32_up_lim; - v = (v >= 0) ? v : (- v - bias); + if (NLT_TYPE3) + v = (v >= 0) ? v : (- v - bias); *dp++ = v; } } else { - const si32 half = (1 << (bit_depth - 1)); - for (ui32 i = width; i > 0; --i) { + const si32 half = 1 << (bit_depth - 1); + for (int i = width; i > 0; --i) { float t = *sp++ * mul; si32 v = ojph_round(t); v = t >= fl_low_lim ? v : s32_low_lim; @@ -394,7 +351,27 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void gen_irv_convert_to_float_nlt_type3(const line_buf *src_line, + void gen_irv_convert_to_integer(const line_buf *src_line, + line_buf *dst_line, ui32 dst_line_offset, + ui32 bit_depth, bool is_signed, ui32 width) + { + local_gen_irv_convert_to_integer(src_line, dst_line, + dst_line_offset, bit_depth, is_signed, width); + } + + ////////////////////////////////////////////////////////////////////////// + void gen_irv_convert_to_integer_nlt_type3(const line_buf *src_line, + line_buf *dst_line, ui32 dst_line_offset, + ui32 bit_depth, bool is_signed, ui32 width) + { + local_gen_irv_convert_to_integer(src_line, dst_line, + dst_line_offset, bit_depth, is_signed, width); + } + + ////////////////////////////////////////////////////////////////////////// + template + static inline + void local_gen_irv_convert_to_float(const line_buf *src_line, ui32 src_line_offset, line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width) { @@ -411,16 +388,17 @@ namespace ojph { if (is_signed) { si32 bias = (si32)((ui32)INT_MIN + 1); - for (ui32 i = width; i > 0; --i) { + for (int i = width; i > 0; --i) { si32 v = *sp++ << shift; - v = (v >= 0) ? v : (- v - bias); + if (NLT_TYPE3) + v = (v >= 0) ? v : (- v - bias); *dp++ = (float)v * mul; } } else { const ui32 half = (ui32)INT_MIN; - for (ui32 i = width; i > 0; --i) { + for (int i = width; i > 0; --i) { ui32 v = (ui32)*sp++; v <<= shift; v -= half; @@ -429,6 +407,24 @@ namespace ojph { } } + ////////////////////////////////////////////////////////////////////////// + void gen_irv_convert_to_float(const line_buf *src_line, + ui32 src_line_offset, line_buf *dst_line, + ui32 bit_depth, bool is_signed, ui32 width) + { + local_gen_irv_convert_to_float(src_line, src_line_offset, + dst_line, bit_depth, is_signed, width); + } + + ////////////////////////////////////////////////////////////////////////// + void gen_irv_convert_to_float_nlt_type3(const line_buf *src_line, + ui32 src_line_offset, line_buf *dst_line, + ui32 bit_depth, bool is_signed, ui32 width) + { + local_gen_irv_convert_to_float(src_line, src_line_offset, + dst_line, bit_depth, is_signed, width); + } + ////////////////////////////////////////////////////////////////////////// void gen_rct_forward( const line_buf *r, const line_buf *g, const line_buf *b, diff --git a/src/core/transform/ojph_colour.h b/src/core/transform/ojph_colour.h index d5375a97..b0b5da61 100644 --- a/src/core/transform/ojph_colour.h +++ b/src/core/transform/ojph_colour.h @@ -61,24 +61,14 @@ namespace ojph { line_buf *dst_line, const ui32 dst_line_offset, si64 shift, ui32 width); - //////////////////////////////////////////////////////////////////////////// - extern void (*cnvrt_si32_to_float_shftd) - (const si32 *sp, float *dp, float mul, ui32 width); - - //////////////////////////////////////////////////////////////////////////// - extern void (*cnvrt_si32_to_float) - (const si32 *sp, float *dp, float mul, ui32 width); //////////////////////////////////////////////////////////////////////////// - extern void (*cnvrt_float_to_si32_shftd) - (const float *sp, si32 *dp, float mul, ui32 width); - - //////////////////////////////////////////////////////////////////////////// - extern void (*cnvrt_float_to_si32) - (const float *sp, si32 *dp, float mul, ui32 width); + extern void (*irv_convert_to_integer) ( + const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset, + ui32 bit_depth, bool is_signed, ui32 width); //////////////////////////////////////////////////////////////////////////// - extern void (*irv_convert_to_float_nlt_type3) ( + extern void (*irv_convert_to_float) ( const line_buf *src_line, ui32 src_line_offset, line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width); @@ -87,6 +77,11 @@ namespace ojph { const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset, ui32 bit_depth, bool is_signed, ui32 width); + //////////////////////////////////////////////////////////////////////////// + extern void (*irv_convert_to_float_nlt_type3) ( + const line_buf *src_line, ui32 src_line_offset, + line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width); + //////////////////////////////////////////////////////////////////////////// extern void (*rct_forward) (const line_buf *r, const line_buf *g, const line_buf *b, diff --git a/src/core/transform/ojph_colour_avx.cpp b/src/core/transform/ojph_colour_avx.cpp index 27e78e5c..f6a714d2 100644 --- a/src/core/transform/ojph_colour_avx.cpp +++ b/src/core/transform/ojph_colour_avx.cpp @@ -47,66 +47,6 @@ namespace ojph { namespace local { - ////////////////////////////////////////////////////////////////////////// - void avx_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul, - ui32 width) - { - __m256 shift = _mm256_set1_ps(0.5f); - __m256 m = _mm256_set1_ps(mul); - for (int i = (width + 7) >> 3; i > 0; --i, sp+=8, dp+=8) - { - __m256i t = _mm256_loadu_si256((__m256i*)sp); - __m256 s = _mm256_cvtepi32_ps(t); - s = _mm256_mul_ps(s, m); - s = _mm256_sub_ps(s, shift); - _mm256_store_ps(dp, s); - } - } - - ////////////////////////////////////////////////////////////////////////// - void avx_cnvrt_si32_to_float(const si32 *sp, float *dp, float mul, - ui32 width) - { - __m256 m = _mm256_set1_ps(mul); - for (int i = (width + 7) >> 3; i > 0; --i, sp+=8, dp+=8) - { - __m256i t = _mm256_loadu_si256((__m256i*)sp); - __m256 s = _mm256_cvtepi32_ps(t); - s = _mm256_mul_ps(s, m); - _mm256_store_ps(dp, s); - } - } - - ////////////////////////////////////////////////////////////////////////// - void avx_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul, - ui32 width) - { - __m256 shift = _mm256_set1_ps(0.5f); - __m256 m = _mm256_set1_ps(mul); - for (int i = (width + 7) >> 3; i > 0; --i, sp+=8, dp+=8) - { - __m256 t = _mm256_load_ps(sp); - __m256 s = _mm256_add_ps(t, shift); - s = _mm256_mul_ps(s, m); - s = _mm256_round_ps(s, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - _mm256_storeu_si256((__m256i*)dp, _mm256_cvtps_epi32(s)); - } - } - - ////////////////////////////////////////////////////////////////////////// - void avx_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul, - ui32 width) - { - __m256 m = _mm256_set1_ps(mul); - for (int i = (width + 7) >> 3; i > 0; --i, sp+=8, dp+=8) - { - __m256 t = _mm256_load_ps(sp); - __m256 s = _mm256_mul_ps(t, m); - s = _mm256_round_ps(s, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); - _mm256_storeu_si256((__m256i*)dp, _mm256_cvtps_epi32(s)); - } - } - ////////////////////////////////////////////////////////////////////////// void avx_ict_forward(const float *r, const float *g, const float *b, float *y, float *cb, float *cr, ui32 repeat) diff --git a/src/core/transform/ojph_colour_avx2.cpp b/src/core/transform/ojph_colour_avx2.cpp index cb2bf000..33969f1f 100644 --- a/src/core/transform/ojph_colour_avx2.cpp +++ b/src/core/transform/ojph_colour_avx2.cpp @@ -263,7 +263,9 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void avx2_irv_convert_to_integer_nlt_type3(const line_buf *src_line, + template + static inline + void local_avx2_irv_convert_to_integer(const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset, ui32 bit_depth, bool is_signed, ui32 width) { @@ -292,24 +294,27 @@ namespace ojph { { __m256i zero = _mm256_setzero_si256(); __m256i bias = _mm256_set1_epi32(-((1 << (bit_depth - 1)) + 1)); - for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) { + for (int i = width; i > 0; i -= 8, sp += 8, dp += 8) { __m256 t = _mm256_loadu_ps(sp); t = _mm256_mul_ps(t, mul); __m256i u = _mm256_cvtps_epi32(t); u = ojph_mm256_max_ge_epi32(u, s32_low_lim, t, fl_low_lim); u = ojph_mm256_min_lt_epi32(u, s32_up_lim, t, fl_up_lim); - __m256i c = _mm256_cmpgt_epi32(zero, u); //0xFFFFFFFF for -ve value - __m256i neg = _mm256_sub_epi32(bias, u); //-bias -value - neg = _mm256_and_si256(c, neg); //keep only - bias - value - __m256i v = _mm256_andnot_si256(c, u); //keep only +ve or 0 - v = _mm256_or_si256(neg, v); //combine - _mm256_storeu_si256((__m256i*)dp, v); + if (NLT_TYPE3) + { + __m256i c = _mm256_cmpgt_epi32(zero, u); // 0xFFFFFFFF for -ve val + __m256i neg = _mm256_sub_epi32(bias, u); // -bias -value + neg = _mm256_and_si256(c, neg); // keep only - bias - val + u = _mm256_andnot_si256(c, u); // keep only +ve or 0 + u = _mm256_or_si256(neg, u); // combine + } + _mm256_storeu_si256((__m256i*)dp, u); } } else { - __m256i half = _mm256_set1_epi32(-(1 << (bit_depth - 1))); - for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) { + __m256i half = _mm256_set1_epi32(1 << (bit_depth - 1)); + for (int i = width; i > 0; i -= 8, sp += 8, dp += 8) { __m256 t = _mm256_loadu_ps(sp); t = _mm256_mul_ps(t, mul); __m256i u = _mm256_cvtps_epi32(t); @@ -322,7 +327,27 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void avx2_irv_convert_to_float_nlt_type3(const line_buf *src_line, + void avx2_irv_convert_to_integer(const line_buf *src_line, + line_buf *dst_line, ui32 dst_line_offset, + ui32 bit_depth, bool is_signed, ui32 width) + { + local_avx2_irv_convert_to_integer(src_line, dst_line, + dst_line_offset, bit_depth, is_signed, width); + } + + ////////////////////////////////////////////////////////////////////////// + void avx2_irv_convert_to_integer_nlt_type3(const line_buf *src_line, + line_buf *dst_line, ui32 dst_line_offset, + ui32 bit_depth, bool is_signed, ui32 width) + { + local_avx2_irv_convert_to_integer(src_line, dst_line, + dst_line_offset, bit_depth, is_signed, width); + } + + ////////////////////////////////////////////////////////////////////////// + template + static inline + void local_avx2_irv_convert_to_float(const line_buf *src_line, ui32 src_line_offset, line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width) { @@ -340,14 +365,17 @@ namespace ojph { { __m256i zero = _mm256_setzero_si256(); __m256i bias = _mm256_set1_epi32(-(si32)((ui32)INT_MIN + 1)); - for (ui32 i = width; i > 0; i -= 8, sp += 8, dp += 8) { + for (int i = width; i > 0; i -= 8, sp += 8, dp += 8) { __m256i t = _mm256_loadu_si256((__m256i*)sp); __m256i u = _mm256_slli_epi32(t, shift); - __m256i c = _mm256_cmpgt_epi32(zero, u); // 0xFFFFFFFF for -ve value - __m256i neg = _mm256_sub_epi32(bias, u); // - bias - value - neg = _mm256_and_si256(c, neg); // keep only - bias - value - t = _mm256_andnot_si256(c, u); // keep only +ve or 0 - u = _mm256_or_si256(neg, t); // combine + if (NLT_TYPE3) + { + __m256i c = _mm256_cmpgt_epi32(zero, u); // 0xFFFFFFFF for -ve val + __m256i neg = _mm256_sub_epi32(bias, u); // - bias - value + neg = _mm256_and_si256(c, neg); // keep only - bias - val + t = _mm256_andnot_si256(c, u); // keep only +ve or 0 + u = _mm256_or_si256(neg, t); // combine + } __m256 v = _mm256_cvtepi32_ps(u); v = _mm256_mul_ps(v, mul); _mm256_storeu_ps(dp, v); @@ -356,7 +384,7 @@ namespace ojph { else { __m256i half = _mm256_set1_epi32(INT_MIN); - for (ui32 i = width; i > 0; i -= 8, sp += 8, dp += 8) { + for (int i = width; i > 0; i -= 8, sp += 8, dp += 8) { __m256i t = _mm256_loadu_si256((__m256i*)sp); t = _mm256_slli_epi32(t, shift); t = _mm256_sub_epi32(t, half); @@ -367,6 +395,25 @@ namespace ojph { } } + ////////////////////////////////////////////////////////////////////////// + void avx2_irv_convert_to_float(const line_buf *src_line, + ui32 src_line_offset, line_buf *dst_line, + ui32 bit_depth, bool is_signed, ui32 width) + { + local_avx2_irv_convert_to_float(src_line, src_line_offset, + dst_line, bit_depth, is_signed, width); + } + + ////////////////////////////////////////////////////////////////////////// + void avx2_irv_convert_to_float_nlt_type3(const line_buf *src_line, + ui32 src_line_offset, line_buf *dst_line, + ui32 bit_depth, bool is_signed, ui32 width) + { + local_avx2_irv_convert_to_float(src_line, src_line_offset, + dst_line, bit_depth, is_signed, width); + } + + ////////////////////////////////////////////////////////////////////////// void avx2_rct_forward(const line_buf *r, const line_buf *g, diff --git a/src/core/transform/ojph_colour_local.h b/src/core/transform/ojph_colour_local.h index 5f28685a..a85bf8bd 100644 --- a/src/core/transform/ojph_colour_local.h +++ b/src/core/transform/ojph_colour_local.h @@ -77,20 +77,14 @@ namespace ojph { si64 shift, ui32 width); ////////////////////////////////////////////////////////////////////////// - void gen_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul, - ui32 width); - - ////////////////////////////////////////////////////////////////////////// - void gen_cnvrt_si32_to_float(const si32 *sp, float *dp, float mul, - ui32 width); - - ////////////////////////////////////////////////////////////////////////// - void gen_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul, - ui32 width); + void gen_irv_convert_to_float( + const line_buf *src_line, ui32 src_line_offset, + line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width); ////////////////////////////////////////////////////////////////////////// - void gen_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul, - ui32 width); + void gen_irv_convert_to_integer( + const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset, + ui32 bit_depth, bool is_signed, ui32 width); ////////////////////////////////////////////////////////////////////////// void gen_irv_convert_to_float_nlt_type3( @@ -128,22 +122,6 @@ namespace ojph { // ////////////////////////////////////////////////////////////////////////// - ////////////////////////////////////////////////////////////////////////// - void sse_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul, - ui32 width); - - ////////////////////////////////////////////////////////////////////////// - void sse_cnvrt_si32_to_float(const si32 *sp, float *dp, float mul, - ui32 width); - - ////////////////////////////////////////////////////////////////////////// - void sse_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul, - ui32 width); - - ////////////////////////////////////////////////////////////////////////// - void sse_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul, - ui32 width); - ////////////////////////////////////////////////////////////////////////// void sse_ict_forward(const float *r, const float *g, const float *b, float *y, float *cb, float *cr, ui32 repeat); @@ -161,12 +139,9 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////// - void sse2_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul, - ui32 width); - - ////////////////////////////////////////////////////////////////////////// - void sse2_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul, - ui32 width); + void sse2_irv_convert_to_integer( + const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset, + ui32 bit_depth, bool is_signed, ui32 width); ////////////////////////////////////////////////////////////////////////// void sse2_irv_convert_to_integer_nlt_type3( @@ -193,6 +168,11 @@ namespace ojph { line_buf *dst_line, const ui32 dst_line_offset, si64 shift, ui32 width); + ////////////////////////////////////////////////////////////////////////// + void sse2_irv_convert_to_float( + const line_buf *src_line, ui32 src_line_offset, + line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width); + ////////////////////////////////////////////////////////////////////////// void sse2_irv_convert_to_float_nlt_type3( const line_buf *src_line, ui32 src_line_offset, @@ -216,22 +196,6 @@ namespace ojph { // ////////////////////////////////////////////////////////////////////////// - ////////////////////////////////////////////////////////////////////////// - void avx_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul, - ui32 width); - - ////////////////////////////////////////////////////////////////////////// - void avx_cnvrt_si32_to_float(const si32 *sp, float *dp, float mul, - ui32 width); - - ////////////////////////////////////////////////////////////////////////// - void avx_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul, - ui32 width); - - ////////////////////////////////////////////////////////////////////////// - void avx_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul, - ui32 width); - ////////////////////////////////////////////////////////////////////////// void avx_ict_forward(const float *r, const float *g, const float *b, float *y, float *cb, float *cr, ui32 repeat); @@ -260,6 +224,16 @@ namespace ojph { line_buf *dst_line, const ui32 dst_line_offset, si64 shift, ui32 width); + ////////////////////////////////////////////////////////////////////////// + void avx2_irv_convert_to_integer( + const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset, + ui32 bit_depth, bool is_signed, ui32 width); + + ////////////////////////////////////////////////////////////////////////// + void avx2_irv_convert_to_float( + const line_buf *src_line, ui32 src_line_offset, + line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width); + ////////////////////////////////////////////////////////////////////////// void avx2_irv_convert_to_integer_nlt_type3( const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset, @@ -289,20 +263,14 @@ namespace ojph { ////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////// - void wasm_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul, - ui32 width); - - ////////////////////////////////////////////////////////////////////////// - void wasm_cnvrt_si32_to_float(const si32 *sp, float *dp, float mul, - ui32 width); - - ////////////////////////////////////////////////////////////////////////// - void wasm_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul, - ui32 width); + void wasm_irv_convert_to_integer( + const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset, + ui32 bit_depth, bool is_signed, ui32 width); ////////////////////////////////////////////////////////////////////////// - void wasm_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul, - ui32 width); + void wasm_irv_convert_to_float( + const line_buf *src_line, ui32 src_line_offset, + line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width); ////////////////////////////////////////////////////////////////////////// void wasm_rev_convert( diff --git a/src/core/transform/ojph_colour_sse.cpp b/src/core/transform/ojph_colour_sse.cpp index edd1eaf2..ce61bd89 100644 --- a/src/core/transform/ojph_colour_sse.cpp +++ b/src/core/transform/ojph_colour_sse.cpp @@ -47,88 +47,6 @@ namespace ojph { namespace local { - ////////////////////////////////////////////////////////////////////////// - void sse_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul, - ui32 width) - { - __m128 shift = _mm_set1_ps(0.5f); - __m128 m = _mm_set1_ps(mul); - for (ui32 i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) - { - __m128i t = _mm_castps_si128(_mm_loadu_ps((float*)sp)); - __m128 s = _mm_cvtepi32_ps(t); - s = _mm_mul_ps(s, m); - s = _mm_sub_ps(s, shift); - _mm_store_ps(dp, s); - } - } - - ////////////////////////////////////////////////////////////////////////// - void sse_cnvrt_si32_to_float(const si32 *sp, float *dp, float mul, - ui32 width) - { - __m128 m = _mm_set1_ps(mul); - for (ui32 i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4) - { - __m128i t = _mm_castps_si128(_mm_loadu_ps((float*)sp)); - __m128 s = _mm_cvtepi32_ps(t); - s = _mm_mul_ps(s, m); - _mm_store_ps(dp, s); - } - } - - ////////////////////////////////////////////////////////////////////////// - void sse_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul, - ui32 width) - { - uint32_t rounding_mode = _MM_GET_ROUNDING_MODE(); - _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); - __m128 shift = _mm_set1_ps(0.5f); - __m128 m = _mm_set1_ps(mul); - for (ui32 i = (width + 3) >> 2; i > 0; --i, sp+=4) - { - __m128 t = _mm_load_ps(sp); - __m128 s = _mm_add_ps(t, shift); - s = _mm_mul_ps(s, m); - // the following is a poorly designed code, but it is the only - // code that I am aware of that compiles on VS 32 and 64 modes - t = s; - *dp++ = _mm_cvtss_si32(t); - t = _mm_shuffle_ps(s, s, 1); - *dp++ = _mm_cvtss_si32(t); - t = _mm_shuffle_ps(s, s, 2); - *dp++ = _mm_cvtss_si32(t); - t = _mm_shuffle_ps(s, s, 3); - *dp++ = _mm_cvtss_si32(t); - } - _MM_SET_ROUNDING_MODE(rounding_mode); - } - - ////////////////////////////////////////////////////////////////////////// - void sse_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul, - ui32 width) - { - uint32_t rounding_mode = _MM_GET_ROUNDING_MODE(); - _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); - __m128 m = _mm_set1_ps(mul); - for (ui32 i = (width + 3) >> 2; i > 0; --i, sp+=4) - { - __m128 t = _mm_load_ps(sp); - __m128 s = _mm_mul_ps(t, m); - // the following is a poorly designed code, but it is the only - // code that I am aware of that compiles on VS 32 and 64 modes - t = s; - *dp++ = _mm_cvtss_si32(t); - t = _mm_shuffle_ps(s, s, 1); - *dp++ = _mm_cvtss_si32(t); - t = _mm_shuffle_ps(s, s, 2); - *dp++ = _mm_cvtss_si32(t); - t = _mm_shuffle_ps(s, s, 3); - *dp++ = _mm_cvtss_si32(t); - } - _MM_SET_ROUNDING_MODE(rounding_mode); - } - ////////////////////////////////////////////////////////////////////////// void sse_ict_forward(const float *r, const float *g, const float *b, float *y, float *cb, float *cr, ui32 repeat) diff --git a/src/core/transform/ojph_colour_sse2.cpp b/src/core/transform/ojph_colour_sse2.cpp index f1a95447..96eeb34f 100644 --- a/src/core/transform/ojph_colour_sse2.cpp +++ b/src/core/transform/ojph_colour_sse2.cpp @@ -127,7 +127,9 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void sse2_irv_convert_to_integer_nlt_type3(const line_buf *src_line, + template + static inline + void local_sse2_irv_convert_to_integer(const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset, ui32 bit_depth, bool is_signed, ui32 width) { @@ -159,24 +161,27 @@ namespace ojph { { __m128i zero = _mm_setzero_si128(); __m128i bias = _mm_set1_epi32(-((1 << (bit_depth - 1)) + 1)); - for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) { + for (int i = width; i > 0; i -= 4, sp += 4, dp += 4) { __m128 t = _mm_loadu_ps(sp); t = _mm_mul_ps(t, mul); __m128i u = _mm_cvtps_epi32(t); u = ojph_mm_max_ge_epi32(u, s32_low_lim, t, fl_low_lim); u = ojph_mm_min_lt_epi32(u, s32_up_lim, t, fl_up_lim); - __m128i c = _mm_cmpgt_epi32(zero, u); //0xFFFFFFFF for -ve value - __m128i neg = _mm_sub_epi32(bias, u); //-bias -value - neg = _mm_and_si128(c, neg); //keep only - bias - value - __m128i v = _mm_andnot_si128(c, u); //keep only +ve or 0 - v = _mm_or_si128(neg, v); //combine - _mm_storeu_si128((__m128i*)dp, v); + if (NLT_TYPE3) + { + __m128i c = _mm_cmpgt_epi32(zero, u); //0xFFFFFFFF for -ve value + __m128i neg = _mm_sub_epi32(bias, u); //-bias -value + neg = _mm_and_si128(c, neg); //keep only - bias - value + u = _mm_andnot_si128(c, u); //keep only +ve or 0 + u = _mm_or_si128(neg, u); //combine + } + _mm_storeu_si128((__m128i*)dp, u); } } else { - __m128i half = _mm_set1_epi32(-(1 << (bit_depth - 1))); - for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) { + __m128i half = _mm_set1_epi32(1 << (bit_depth - 1)); + for (int i = width; i > 0; i -= 4, sp += 4, dp += 4) { __m128 t = _mm_loadu_ps(sp); t = _mm_mul_ps(t, mul); __m128i u = _mm_cvtps_epi32(t); @@ -190,6 +195,24 @@ namespace ojph { _MM_SET_ROUNDING_MODE(rounding_mode); } + ////////////////////////////////////////////////////////////////////////// + void sse2_irv_convert_to_integer(const line_buf *src_line, + line_buf *dst_line, ui32 dst_line_offset, + ui32 bit_depth, bool is_signed, ui32 width) + { + local_sse2_irv_convert_to_integer(src_line, dst_line, + dst_line_offset, bit_depth, is_signed, width); + } + + ////////////////////////////////////////////////////////////////////////// + void sse2_irv_convert_to_integer_nlt_type3(const line_buf *src_line, + line_buf *dst_line, ui32 dst_line_offset, + ui32 bit_depth, bool is_signed, ui32 width) + { + local_sse2_irv_convert_to_integer(src_line, dst_line, + dst_line_offset, bit_depth, is_signed, width); + } + ///////////////////////////////////////////////////////////////////////// // https://github.com/seung-lab/dijkstra3d/blob/master/libdivide.h static inline __m128i sse2_mm_srai_epi64(__m128i a, int amt, __m128i m) @@ -392,7 +415,9 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void sse2_irv_convert_to_float_nlt_type3(const line_buf *src_line, + template + static inline + void local_sse2_irv_convert_to_float(const line_buf *src_line, ui32 src_line_offset, line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width) { @@ -411,14 +436,17 @@ namespace ojph { { __m128i zero = _mm_setzero_si128(); __m128i bias = _mm_set1_epi32(-(si32)((ui32)INT_MIN + 1)); - for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) { + for (int i = width; i > 0; i -= 4, sp += 4, dp += 4) { __m128i t = _mm_loadu_si128((__m128i*)sp); __m128i u = _mm_slli_epi32(t, shift); - __m128i c = _mm_cmplt_epi32(u, zero); // 0xFFFFFFFF for -ve value - __m128i neg = _mm_sub_epi32(bias, u); // - bias - value - neg = _mm_and_si128(c, neg); // keep only - bias - value - t = _mm_andnot_si128(c, u); // keep only +ve or 0 - u = _mm_or_si128(neg, t); // combine + if (NLT_TYPE3) + { + __m128i c = _mm_cmplt_epi32(u, zero); // 0xFFFFFFFF for -ve value + __m128i neg = _mm_sub_epi32(bias, u); // - bias - value + neg = _mm_and_si128(c, neg); // keep only - bias - value + t = _mm_andnot_si128(c, u); // keep only +ve or 0 + u = _mm_or_si128(neg, t); // combine + } __m128 v = _mm_cvtepi32_ps(u); v = _mm_mul_ps(v, mul); _mm_storeu_ps(dp, v); @@ -427,7 +455,7 @@ namespace ojph { else { __m128i half = _mm_set1_epi32(INT_MIN); - for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) { + for (int i = width; i > 0; i -= 4, sp += 4, dp += 4) { __m128i t = _mm_loadu_si128((__m128i*)sp); t = _mm_slli_epi32(t, shift); t = _mm_sub_epi32(t, half); @@ -438,6 +466,24 @@ namespace ojph { } } + ////////////////////////////////////////////////////////////////////////// + void sse2_irv_convert_to_float(const line_buf *src_line, + ui32 src_line_offset, line_buf *dst_line, + ui32 bit_depth, bool is_signed, ui32 width) + { + local_sse2_irv_convert_to_float(src_line, src_line_offset, + dst_line, bit_depth, is_signed, width); + } + + ////////////////////////////////////////////////////////////////////////// + void sse2_irv_convert_to_float_nlt_type3(const line_buf *src_line, + ui32 src_line_offset, line_buf *dst_line, + ui32 bit_depth, bool is_signed, ui32 width) + { + local_sse2_irv_convert_to_float(src_line, src_line_offset, + dst_line, bit_depth, is_signed, width); + } + ////////////////////////////////////////////////////////////////////////// void sse2_rct_forward(const line_buf *r, const line_buf *g, diff --git a/src/core/transform/ojph_colour_wasm.cpp b/src/core/transform/ojph_colour_wasm.cpp index 05a06a26..548a2042 100644 --- a/src/core/transform/ojph_colour_wasm.cpp +++ b/src/core/transform/ojph_colour_wasm.cpp @@ -296,7 +296,9 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void wasm_irv_convert_to_integer_nlt_type3(const line_buf *src_line, + template + static inline + void local_wasm_irv_convert_to_integer(const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset, ui32 bit_depth, bool is_signed, ui32 width) { @@ -326,26 +328,29 @@ namespace ojph { const v128_t zero = wasm_f32x4_splat(0.0f); const v128_t half = wasm_f32x4_splat(0.5f); v128_t bias = wasm_i32x4_splat(-((1 << (bit_depth - 1)) + 1)); - for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) { + for (int i = width; i > 0; i -= 4, sp += 4, dp += 4) { v128_t t = wasm_v128_load(sp); t = wasm_f32x4_mul(t, mul); v128_t u = ojph_convert_float_to_i32(t, zero, half); u = ojph_wasm_i32x4_max_ge(u, s32_low_lim, t, fl_low_lim); u = ojph_wasm_i32x4_min_lt(u, s32_up_lim, t, fl_up_lim); - v128_t c = wasm_i32x4_gt(zero, u); // 0xFFFFFFFF for -ve value - v128_t neg = wasm_i32x4_sub(bias, u); // -bias -value - neg = wasm_v128_and(c, neg); // keep only - bias - value - v128_t v = wasm_v128_andnot(u, c); // keep only +ve or 0 - v = wasm_v128_or(neg, v); // combine - wasm_v128_store(dp, v); + if (NLT_TYPE3) + { + v128_t c = wasm_i32x4_gt(zero, u); // 0xFFFFFFFF for -ve value + v128_t neg = wasm_i32x4_sub(bias, u); // -bias -value + neg = wasm_v128_and(c, neg); // keep only - bias - value + u = wasm_v128_andnot(u, c); // keep only +ve or 0 + u = wasm_v128_or(neg, u); // combine + } + wasm_v128_store(dp, u); } } else { const v128_t zero = wasm_f32x4_splat(0.0f); const v128_t half = wasm_f32x4_splat(0.5f); - v128_t ihalf = wasm_i32x4_splat(-(1 << (bit_depth - 1))); - for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) { + v128_t ihalf = wasm_i32x4_splat(1 << (bit_depth - 1)); + for (int i = width; i > 0; i -= 4, sp += 4, dp += 4) { v128_t t = wasm_v128_load(sp); t = wasm_f32x4_mul(t, mul); v128_t u = ojph_convert_float_to_i32(t, zero, half); @@ -358,7 +363,27 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void wasm_irv_convert_to_float_nlt_type3(const line_buf *src_line, + void wasm_irv_convert_to_integer(const line_buf *src_line, + line_buf *dst_line, ui32 dst_line_offset, + ui32 bit_depth, bool is_signed, ui32 width) + { + local_wasm_irv_convert_to_integer(src_line, dst_line, + dst_line_offset, bit_depth, is_signed, width); + } + + ////////////////////////////////////////////////////////////////////////// + void sse2_irv_convert_to_integer_nlt_type3(const line_buf *src_line, + line_buf *dst_line, ui32 dst_line_offset, + ui32 bit_depth, bool is_signed, ui32 width) + { + local_wasm_irv_convert_to_integer(src_line, dst_line, + dst_line_offset, bit_depth, is_signed, width); + } + + ////////////////////////////////////////////////////////////////////////// + template + static inline + void local_wasm_irv_convert_to_float(const line_buf *src_line, ui32 src_line_offset, line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width) { @@ -376,14 +401,17 @@ namespace ojph { { v128_t zero = wasm_i32x4_splat(0); v128_t bias = wasm_i32x4_splat(-(si32)((ui32)INT_MIN + 1)); - for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) { + for (int i = width; i > 0; i -= 4, sp += 4, dp += 4) { v128_t t = wasm_v128_load(sp); v128_t u = wasm_i32x4_shl(t, shift); - v128_t c = wasm_i32x4_lt(u, zero); // 0xFFFFFFFF for -ve value - v128_t neg = wasm_i32x4_sub(bias, u); // - bias - value - neg = wasm_v128_and(c, neg); // keep only - bias - value - t = wasm_v128_andnot(u, c); // keep only +ve or 0 - u = wasm_v128_or(neg, t); // combine + if (NLT_TYPE3) + { + v128_t c = wasm_i32x4_lt(u, zero); // 0xFFFFFFFF for -ve value + v128_t neg = wasm_i32x4_sub(bias, u); // - bias - value + neg = wasm_v128_and(c, neg); // keep only - bias - value + t = wasm_v128_andnot(u, c); // keep only +ve or 0 + u = wasm_v128_or(neg, t); // combine + } v128_t v = wasm_f32x4_convert_i32x4(u); v = wasm_f32x4_mul(v, mul); wasm_v128_store(dp, v); @@ -392,7 +420,7 @@ namespace ojph { else { v128_t half = wasm_i32x4_splat(INT_MIN); - for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) { + for (int i = width; i > 0; i -= 4, sp += 4, dp += 4) { v128_t t = wasm_v128_load(sp); v128_t u = wasm_i32x4_shl(t, shift); u = wasm_i32x4_sub(u, half); @@ -403,6 +431,24 @@ namespace ojph { } } + ////////////////////////////////////////////////////////////////////////// + void wasm_irv_convert_to_float(const line_buf *src_line, + ui32 src_line_offset, line_buf *dst_line, + ui32 bit_depth, bool is_signed, ui32 width) + { + local_wasm_irv_convert_to_float(src_line, src_line_offset, + dst_line, bit_depth, is_signed, width); + } + + ////////////////////////////////////////////////////////////////////////// + void wasm_irv_convert_to_float_nlt_type3(const line_buf *src_line, + ui32 src_line_offset, line_buf *dst_line, + ui32 bit_depth, bool is_signed, ui32 width) + { + local_wasm_irv_convert_to_float(src_line, src_line_offset, + dst_line, bit_depth, is_signed, width); + } + ////////////////////////////////////////////////////////////////////////// void wasm_rct_forward(const line_buf *r, const line_buf *g, From e24c2cacc9d83fe54eeb6b270812758b55cf6cfb Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Sun, 26 Jan 2025 11:36:23 +1100 Subject: [PATCH 16/20] Addressing Warnings. --- src/core/transform/ojph_colour.cpp | 8 ++++---- src/core/transform/ojph_colour_avx2.cpp | 8 ++++---- src/core/transform/ojph_colour_sse2.cpp | 9 ++++----- src/core/transform/ojph_colour_wasm.cpp | 8 ++++---- 4 files changed, 16 insertions(+), 17 deletions(-) diff --git a/src/core/transform/ojph_colour.cpp b/src/core/transform/ojph_colour.cpp index 2c559ced..3e6307e7 100644 --- a/src/core/transform/ojph_colour.cpp +++ b/src/core/transform/ojph_colour.cpp @@ -327,7 +327,7 @@ namespace ojph { if (is_signed) { const si32 bias = (1 << (bit_depth - 1)) + 1; - for (int i = width; i > 0; --i) { + for (int i = (int)width; i > 0; --i) { float t = *sp++ * mul; si32 v = ojph_round(t); v = t >= fl_low_lim ? v : s32_low_lim; @@ -340,7 +340,7 @@ namespace ojph { else { const si32 half = 1 << (bit_depth - 1); - for (int i = width; i > 0; --i) { + for (int i = (int)width; i > 0; --i) { float t = *sp++ * mul; si32 v = ojph_round(t); v = t >= fl_low_lim ? v : s32_low_lim; @@ -388,7 +388,7 @@ namespace ojph { if (is_signed) { si32 bias = (si32)((ui32)INT_MIN + 1); - for (int i = width; i > 0; --i) { + for (int i = (int)width; i > 0; --i) { si32 v = *sp++ << shift; if (NLT_TYPE3) v = (v >= 0) ? v : (- v - bias); @@ -398,7 +398,7 @@ namespace ojph { else { const ui32 half = (ui32)INT_MIN; - for (int i = width; i > 0; --i) { + for (int i = (int)width; i > 0; --i) { ui32 v = (ui32)*sp++; v <<= shift; v -= half; diff --git a/src/core/transform/ojph_colour_avx2.cpp b/src/core/transform/ojph_colour_avx2.cpp index 33969f1f..25b0858e 100644 --- a/src/core/transform/ojph_colour_avx2.cpp +++ b/src/core/transform/ojph_colour_avx2.cpp @@ -294,7 +294,7 @@ namespace ojph { { __m256i zero = _mm256_setzero_si256(); __m256i bias = _mm256_set1_epi32(-((1 << (bit_depth - 1)) + 1)); - for (int i = width; i > 0; i -= 8, sp += 8, dp += 8) { + for (int i = (int)width; i > 0; i -= 8, sp += 8, dp += 8) { __m256 t = _mm256_loadu_ps(sp); t = _mm256_mul_ps(t, mul); __m256i u = _mm256_cvtps_epi32(t); @@ -314,7 +314,7 @@ namespace ojph { else { __m256i half = _mm256_set1_epi32(1 << (bit_depth - 1)); - for (int i = width; i > 0; i -= 8, sp += 8, dp += 8) { + for (int i = (int)width; i > 0; i -= 8, sp += 8, dp += 8) { __m256 t = _mm256_loadu_ps(sp); t = _mm256_mul_ps(t, mul); __m256i u = _mm256_cvtps_epi32(t); @@ -365,7 +365,7 @@ namespace ojph { { __m256i zero = _mm256_setzero_si256(); __m256i bias = _mm256_set1_epi32(-(si32)((ui32)INT_MIN + 1)); - for (int i = width; i > 0; i -= 8, sp += 8, dp += 8) { + for (int i = (int)width; i > 0; i -= 8, sp += 8, dp += 8) { __m256i t = _mm256_loadu_si256((__m256i*)sp); __m256i u = _mm256_slli_epi32(t, shift); if (NLT_TYPE3) @@ -384,7 +384,7 @@ namespace ojph { else { __m256i half = _mm256_set1_epi32(INT_MIN); - for (int i = width; i > 0; i -= 8, sp += 8, dp += 8) { + for (int i = (int)width; i > 0; i -= 8, sp += 8, dp += 8) { __m256i t = _mm256_loadu_si256((__m256i*)sp); t = _mm256_slli_epi32(t, shift); t = _mm256_sub_epi32(t, half); diff --git a/src/core/transform/ojph_colour_sse2.cpp b/src/core/transform/ojph_colour_sse2.cpp index 96eeb34f..bb440978 100644 --- a/src/core/transform/ojph_colour_sse2.cpp +++ b/src/core/transform/ojph_colour_sse2.cpp @@ -161,7 +161,7 @@ namespace ojph { { __m128i zero = _mm_setzero_si128(); __m128i bias = _mm_set1_epi32(-((1 << (bit_depth - 1)) + 1)); - for (int i = width; i > 0; i -= 4, sp += 4, dp += 4) { + for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) { __m128 t = _mm_loadu_ps(sp); t = _mm_mul_ps(t, mul); __m128i u = _mm_cvtps_epi32(t); @@ -181,7 +181,7 @@ namespace ojph { else { __m128i half = _mm_set1_epi32(1 << (bit_depth - 1)); - for (int i = width; i > 0; i -= 4, sp += 4, dp += 4) { + for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) { __m128 t = _mm_loadu_ps(sp); t = _mm_mul_ps(t, mul); __m128i u = _mm_cvtps_epi32(t); @@ -427,7 +427,6 @@ namespace ojph { (dst_line->flags & line_buf::LFT_INTEGER) == 0); __m128 mul = _mm_set1_ps((float)(1.0 / 65536.0 / 65536.0)); - float mulf = (float)(1.0 / 65536.0 / 65536.0); const si32* sp = src_line->i32 + src_line_offset; float* dp = dst_line->f32; @@ -436,7 +435,7 @@ namespace ojph { { __m128i zero = _mm_setzero_si128(); __m128i bias = _mm_set1_epi32(-(si32)((ui32)INT_MIN + 1)); - for (int i = width; i > 0; i -= 4, sp += 4, dp += 4) { + for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) { __m128i t = _mm_loadu_si128((__m128i*)sp); __m128i u = _mm_slli_epi32(t, shift); if (NLT_TYPE3) @@ -455,7 +454,7 @@ namespace ojph { else { __m128i half = _mm_set1_epi32(INT_MIN); - for (int i = width; i > 0; i -= 4, sp += 4, dp += 4) { + for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) { __m128i t = _mm_loadu_si128((__m128i*)sp); t = _mm_slli_epi32(t, shift); t = _mm_sub_epi32(t, half); diff --git a/src/core/transform/ojph_colour_wasm.cpp b/src/core/transform/ojph_colour_wasm.cpp index 548a2042..c7118347 100644 --- a/src/core/transform/ojph_colour_wasm.cpp +++ b/src/core/transform/ojph_colour_wasm.cpp @@ -328,7 +328,7 @@ namespace ojph { const v128_t zero = wasm_f32x4_splat(0.0f); const v128_t half = wasm_f32x4_splat(0.5f); v128_t bias = wasm_i32x4_splat(-((1 << (bit_depth - 1)) + 1)); - for (int i = width; i > 0; i -= 4, sp += 4, dp += 4) { + for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) { v128_t t = wasm_v128_load(sp); t = wasm_f32x4_mul(t, mul); v128_t u = ojph_convert_float_to_i32(t, zero, half); @@ -350,7 +350,7 @@ namespace ojph { const v128_t zero = wasm_f32x4_splat(0.0f); const v128_t half = wasm_f32x4_splat(0.5f); v128_t ihalf = wasm_i32x4_splat(1 << (bit_depth - 1)); - for (int i = width; i > 0; i -= 4, sp += 4, dp += 4) { + for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) { v128_t t = wasm_v128_load(sp); t = wasm_f32x4_mul(t, mul); v128_t u = ojph_convert_float_to_i32(t, zero, half); @@ -401,7 +401,7 @@ namespace ojph { { v128_t zero = wasm_i32x4_splat(0); v128_t bias = wasm_i32x4_splat(-(si32)((ui32)INT_MIN + 1)); - for (int i = width; i > 0; i -= 4, sp += 4, dp += 4) { + for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) { v128_t t = wasm_v128_load(sp); v128_t u = wasm_i32x4_shl(t, shift); if (NLT_TYPE3) @@ -420,7 +420,7 @@ namespace ojph { else { v128_t half = wasm_i32x4_splat(INT_MIN); - for (int i = width; i > 0; i -= 4, sp += 4, dp += 4) { + for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) { v128_t t = wasm_v128_load(sp); v128_t u = wasm_i32x4_shl(t, shift); u = wasm_i32x4_sub(u, half); From 42b2efdbad0fe043d374840a5062935c482a2726 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Sun, 26 Jan 2025 12:19:41 +1100 Subject: [PATCH 17/20] WASM compilation fix. --- src/core/transform/ojph_colour_wasm.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/transform/ojph_colour_wasm.cpp b/src/core/transform/ojph_colour_wasm.cpp index c7118347..d56ec95e 100644 --- a/src/core/transform/ojph_colour_wasm.cpp +++ b/src/core/transform/ojph_colour_wasm.cpp @@ -372,7 +372,7 @@ namespace ojph { } ////////////////////////////////////////////////////////////////////////// - void sse2_irv_convert_to_integer_nlt_type3(const line_buf *src_line, + void wasm_irv_convert_to_integer_nlt_type3(const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset, ui32 bit_depth, bool is_signed, ui32 width) { From e21bfd04a77f1aecee7642b3245e4f7c110e50e5 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Mon, 27 Jan 2025 18:24:43 +1100 Subject: [PATCH 18/20] Improvement and bug fixes. --- src/core/transform/ojph_colour.cpp | 19 +- src/core/transform/ojph_colour_avx2.cpp | 26 +- src/core/transform/ojph_colour_sse2.cpp | 24 +- src/core/transform/ojph_colour_wasm.cpp | 22 +- tests/mse_pae.cpp | 378 ++++++++++++++++++++++-- 5 files changed, 394 insertions(+), 75 deletions(-) diff --git a/src/core/transform/ojph_colour.cpp b/src/core/transform/ojph_colour.cpp index 3e6307e7..b3c05aea 100644 --- a/src/core/transform/ojph_colour.cpp +++ b/src/core/transform/ojph_colour.cpp @@ -326,7 +326,7 @@ namespace ojph { if (is_signed) { - const si32 bias = (1 << (bit_depth - 1)) + 1; + const si32 bias = (si32)((1ULL << (bit_depth - 1)) + 1); for (int i = (int)width; i > 0; --i) { float t = *sp++ * mul; si32 v = ojph_round(t); @@ -339,7 +339,7 @@ namespace ojph { } else { - const si32 half = 1 << (bit_depth - 1); + const si32 half = (si32)(1ULL << (bit_depth - 1)); for (int i = (int)width; i > 0; --i) { float t = *sp++ * mul; si32 v = ojph_round(t); @@ -380,16 +380,16 @@ namespace ojph { (dst_line->flags & line_buf::LFT_32BIT) && (dst_line->flags & line_buf::LFT_INTEGER) == 0); - float mul = (float)(1.0 / 65536.0 / 65536.0); + assert(bit_depth <= 32); + float mul = (float)(1.0 / (double)(1ULL << bit_depth)); const si32* sp = src_line->i32 + src_line_offset; float* dp = dst_line->f32; - ui32 shift = 32 - bit_depth; if (is_signed) { - si32 bias = (si32)((ui32)INT_MIN + 1); + const si32 bias = (si32)((1ULL << (bit_depth - 1)) + 1); for (int i = (int)width; i > 0; --i) { - si32 v = *sp++ << shift; + si32 v = *sp++; if (NLT_TYPE3) v = (v >= 0) ? v : (- v - bias); *dp++ = (float)v * mul; @@ -397,12 +397,11 @@ namespace ojph { } else { - const ui32 half = (ui32)INT_MIN; + const si32 half = (si32)(1ULL << (bit_depth - 1)); for (int i = (int)width; i > 0; --i) { - ui32 v = (ui32)*sp++; - v <<= shift; + si32 v = *sp++; v -= half; - *dp++ = (float)(si32)v * mul; + *dp++ = (float)v * mul; } } } diff --git a/src/core/transform/ojph_colour_avx2.cpp b/src/core/transform/ojph_colour_avx2.cpp index 25b0858e..2283be57 100644 --- a/src/core/transform/ojph_colour_avx2.cpp +++ b/src/core/transform/ojph_colour_avx2.cpp @@ -293,7 +293,8 @@ namespace ojph { if (is_signed) { __m256i zero = _mm256_setzero_si256(); - __m256i bias = _mm256_set1_epi32(-((1 << (bit_depth - 1)) + 1)); + __m256i bias = + _mm256_set1_epi32(-(si32)((1ULL << (bit_depth - 1)) + 1)); for (int i = (int)width; i > 0; i -= 8, sp += 8, dp += 8) { __m256 t = _mm256_loadu_ps(sp); t = _mm256_mul_ps(t, mul); @@ -313,7 +314,7 @@ namespace ojph { } else { - __m256i half = _mm256_set1_epi32(1 << (bit_depth - 1)); + __m256i half = _mm256_set1_epi32((si32)(1ULL << (bit_depth - 1))); for (int i = (int)width; i > 0; i -= 8, sp += 8, dp += 8) { __m256 t = _mm256_loadu_ps(sp); t = _mm256_mul_ps(t, mul); @@ -356,37 +357,36 @@ namespace ojph { (dst_line->flags & line_buf::LFT_32BIT) && (dst_line->flags & line_buf::LFT_INTEGER) == 0); - __m256 mul = _mm256_set1_ps((float)(1.0 / 65536.0 / 65536.0)); + assert(bit_depth <= 32); + __m256 mul = _mm256_set1_ps((float)(1.0 / (double)(1ULL << bit_depth))); const si32* sp = src_line->i32 + src_line_offset; float* dp = dst_line->f32; - si32 shift = 32 - (si32)bit_depth; if (is_signed) { __m256i zero = _mm256_setzero_si256(); - __m256i bias = _mm256_set1_epi32(-(si32)((ui32)INT_MIN + 1)); + __m256i bias = + _mm256_set1_epi32(-(si32)((1ULL << (bit_depth - 1)) + 1)); for (int i = (int)width; i > 0; i -= 8, sp += 8, dp += 8) { __m256i t = _mm256_loadu_si256((__m256i*)sp); - __m256i u = _mm256_slli_epi32(t, shift); if (NLT_TYPE3) { - __m256i c = _mm256_cmpgt_epi32(zero, u); // 0xFFFFFFFF for -ve val - __m256i neg = _mm256_sub_epi32(bias, u); // - bias - value + __m256i c = _mm256_cmpgt_epi32(zero, t); // 0xFFFFFFFF for -ve val + __m256i neg = _mm256_sub_epi32(bias, t); // - bias - value neg = _mm256_and_si256(c, neg); // keep only - bias - val - t = _mm256_andnot_si256(c, u); // keep only +ve or 0 - u = _mm256_or_si256(neg, t); // combine + c = _mm256_andnot_si256(c, t); // keep only +ve or 0 + t = _mm256_or_si256(neg, c); // combine } - __m256 v = _mm256_cvtepi32_ps(u); + __m256 v = _mm256_cvtepi32_ps(t); v = _mm256_mul_ps(v, mul); _mm256_storeu_ps(dp, v); } } else { - __m256i half = _mm256_set1_epi32(INT_MIN); + __m256i half = _mm256_set1_epi32((si32)(1ULL << (bit_depth - 1))); for (int i = (int)width; i > 0; i -= 8, sp += 8, dp += 8) { __m256i t = _mm256_loadu_si256((__m256i*)sp); - t = _mm256_slli_epi32(t, shift); t = _mm256_sub_epi32(t, half); __m256 v = _mm256_cvtepi32_ps(t); v = _mm256_mul_ps(v, mul); diff --git a/src/core/transform/ojph_colour_sse2.cpp b/src/core/transform/ojph_colour_sse2.cpp index bb440978..63401169 100644 --- a/src/core/transform/ojph_colour_sse2.cpp +++ b/src/core/transform/ojph_colour_sse2.cpp @@ -160,7 +160,7 @@ namespace ojph { if (is_signed) { __m128i zero = _mm_setzero_si128(); - __m128i bias = _mm_set1_epi32(-((1 << (bit_depth - 1)) + 1)); + __m128i bias = _mm_set1_epi32(-(si32)((1ULL << (bit_depth - 1)) + 1)); for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) { __m128 t = _mm_loadu_ps(sp); t = _mm_mul_ps(t, mul); @@ -180,7 +180,7 @@ namespace ojph { } else { - __m128i half = _mm_set1_epi32(1 << (bit_depth - 1)); + __m128i half = _mm_set1_epi32((si32)(1ULL << (bit_depth - 1))); for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) { __m128 t = _mm_loadu_ps(sp); t = _mm_mul_ps(t, mul); @@ -426,37 +426,35 @@ namespace ojph { (dst_line->flags & line_buf::LFT_32BIT) && (dst_line->flags & line_buf::LFT_INTEGER) == 0); - __m128 mul = _mm_set1_ps((float)(1.0 / 65536.0 / 65536.0)); + assert(bit_depth <= 32); + __m128 mul = _mm_set1_ps((float)(1.0 / (double)(1ULL << bit_depth))); const si32* sp = src_line->i32 + src_line_offset; float* dp = dst_line->f32; - si32 shift = 32 - (si32)bit_depth; if (is_signed) { __m128i zero = _mm_setzero_si128(); - __m128i bias = _mm_set1_epi32(-(si32)((ui32)INT_MIN + 1)); + __m128i bias = _mm_set1_epi32(-(si32)((1ULL << (bit_depth - 1)) + 1)); for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) { __m128i t = _mm_loadu_si128((__m128i*)sp); - __m128i u = _mm_slli_epi32(t, shift); if (NLT_TYPE3) { - __m128i c = _mm_cmplt_epi32(u, zero); // 0xFFFFFFFF for -ve value - __m128i neg = _mm_sub_epi32(bias, u); // - bias - value + __m128i c = _mm_cmplt_epi32(t, zero); // 0xFFFFFFFF for -ve value + __m128i neg = _mm_sub_epi32(bias, t); // - bias - value neg = _mm_and_si128(c, neg); // keep only - bias - value - t = _mm_andnot_si128(c, u); // keep only +ve or 0 - u = _mm_or_si128(neg, t); // combine + c = _mm_andnot_si128(c, t); // keep only +ve or 0 + t = _mm_or_si128(neg, c); // combine } - __m128 v = _mm_cvtepi32_ps(u); + __m128 v = _mm_cvtepi32_ps(t); v = _mm_mul_ps(v, mul); _mm_storeu_ps(dp, v); } } else { - __m128i half = _mm_set1_epi32(INT_MIN); + __m128i half = _mm_set1_epi32((si32)(1ULL << (bit_depth - 1))); for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) { __m128i t = _mm_loadu_si128((__m128i*)sp); - t = _mm_slli_epi32(t, shift); t = _mm_sub_epi32(t, half); __m128 v = _mm_cvtepi32_ps(t); v = _mm_mul_ps(v, mul); diff --git a/src/core/transform/ojph_colour_wasm.cpp b/src/core/transform/ojph_colour_wasm.cpp index d56ec95e..8f307b13 100644 --- a/src/core/transform/ojph_colour_wasm.cpp +++ b/src/core/transform/ojph_colour_wasm.cpp @@ -327,7 +327,7 @@ namespace ojph { { const v128_t zero = wasm_f32x4_splat(0.0f); const v128_t half = wasm_f32x4_splat(0.5f); - v128_t bias = wasm_i32x4_splat(-((1 << (bit_depth - 1)) + 1)); + v128_t bias = wasm_i32x4_splat(-(si32)((1ULL << (bit_depth - 1)) + 1)); for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) { v128_t t = wasm_v128_load(sp); t = wasm_f32x4_mul(t, mul); @@ -349,7 +349,7 @@ namespace ojph { { const v128_t zero = wasm_f32x4_splat(0.0f); const v128_t half = wasm_f32x4_splat(0.5f); - v128_t ihalf = wasm_i32x4_splat(1 << (bit_depth - 1)); + v128_t ihalf = wasm_i32x4_splat((si32)(1ULL << (bit_depth - 1))); for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) { v128_t t = wasm_v128_load(sp); t = wasm_f32x4_mul(t, mul); @@ -392,25 +392,24 @@ namespace ojph { (dst_line->flags & line_buf::LFT_32BIT) && (dst_line->flags & line_buf::LFT_INTEGER) == 0); - v128_t mul = wasm_f32x4_splat((float)(1.0 / 65536.0 / 65536.0)); + assert(bit_depth <= 32); + v128_t mul = wasm_f32x4_splat((float)(1.0 / (double)(1ULL << bit_depth))); const si32* sp = src_line->i32 + src_line_offset; float* dp = dst_line->f32; - ui32 shift = (ui32)32 - bit_depth; if (is_signed) { v128_t zero = wasm_i32x4_splat(0); - v128_t bias = wasm_i32x4_splat(-(si32)((ui32)INT_MIN + 1)); + v128_t bias = wasm_i32x4_splat(-(si32)((1ULL << (bit_depth - 1)) + 1)); for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) { v128_t t = wasm_v128_load(sp); - v128_t u = wasm_i32x4_shl(t, shift); if (NLT_TYPE3) { - v128_t c = wasm_i32x4_lt(u, zero); // 0xFFFFFFFF for -ve value - v128_t neg = wasm_i32x4_sub(bias, u); // - bias - value + v128_t c = wasm_i32x4_lt(t, zero); // 0xFFFFFFFF for -ve value + v128_t neg = wasm_i32x4_sub(bias, t); // - bias - value neg = wasm_v128_and(c, neg); // keep only - bias - value - t = wasm_v128_andnot(u, c); // keep only +ve or 0 - u = wasm_v128_or(neg, t); // combine + c = wasm_v128_andnot(t, c); // keep only +ve or 0 + t = wasm_v128_or(neg, c); // combine } v128_t v = wasm_f32x4_convert_i32x4(u); v = wasm_f32x4_mul(v, mul); @@ -419,10 +418,9 @@ namespace ojph { } else { - v128_t half = wasm_i32x4_splat(INT_MIN); + v128_t half = wasm_i32x4_splat((si32)(1ULL << (bit_depth - 1))); for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) { v128_t t = wasm_v128_load(sp); - v128_t u = wasm_i32x4_shl(t, shift); u = wasm_i32x4_sub(u, half); v128_t v = wasm_f32x4_convert_i32x4(u); v = wasm_f32x4_mul(v, mul); diff --git a/tests/mse_pae.cpp b/tests/mse_pae.cpp index 84653399..8239b851 100644 --- a/tests/mse_pae.cpp +++ b/tests/mse_pae.cpp @@ -60,7 +60,8 @@ struct img_info { width = height = 0; comps[0] = comps[1] = comps[2] = 0; format = UNDEFINED; - max_val = 0; + bit_depth = 0; + is_signed = false; } ~img_info() { for (ui32 i = 0; i < num_comps; ++i) @@ -70,15 +71,16 @@ struct img_info { } } - void init(ui32 num_comps, size_t width, size_t height, ui32 max_val, - ui32 format=FORMAT444) + void init(ui32 num_comps, size_t width, size_t height, ui32 bit_depth, + bool is_signed, ui32 format=FORMAT444) { assert(num_comps <= 3 && comps[0] == NULL); this->num_comps = num_comps; this->width = width; this->height = height; this->format = format; - this->max_val = max_val; + this->bit_depth = bit_depth; + this->is_signed = is_signed; for (ui32 i = 0; i < num_comps; ++i) switch (format) { @@ -114,7 +116,8 @@ struct img_info { point downsampling[3]; si32 *comps[3]; ui32 format; - ui32 max_val; + ui32 bit_depth; + bool is_signed; }; bool is_pnm(const char *filename) @@ -137,7 +140,7 @@ void load_ppm(const char *filename, img_info& img) ui32 num_comps = ppm.get_num_components(); size_t width = ppm.get_width(); size_t height = ppm.get_height(); - img.init(num_comps, width, height, ppm.get_max_val()); + img.init(num_comps, width, height, ppm.get_bit_depth(0), false); width = calc_aligned_size(width); si32 *buffer = new si32[width]; @@ -259,7 +262,7 @@ void load_yuv(const char *filename, img_info& img) yuv.set_img_props(s, num_comps, num_comps, downsampling); yuv.open(name_buf); - img.init(num_comps, s.w, s.h, (1 << bit_depth) - 1, format); + img.init(num_comps, s.w, s.h, bit_depth, false, format); size_t w = calc_aligned_size(s.w); si32 *buffer = new si32[w]; @@ -281,12 +284,238 @@ void load_yuv(const char *filename, img_info& img) delete[] buffer; } +bool is_rawl(const char *filename) +{ + const char *p = strchr(filename, ':'); // p is either NULL or pointing to ':' + if (p != NULL && p - filename >= 5 && p[-5] == '.' && + toupper(p[-4]) == 'R' && toupper(p[-3])== 'A' && + toupper(p[-2]) == 'W' && toupper(p[-1]) == 'L') + return true; + return false; +} + +void load_rawl(const char *filename, img_info& img) +{ + const char *p = strchr(filename, ':'); // p is either NULL or pointing to ':' + const char *name_end = p; + if (p == NULL) { + printf("A .rawl that does not have the expected format, which is\n"); + printf(".rawl:widthxheightxbitdepthxsignedxnum_comp\n"); + exit(-1); + } + ojph::size s; + ++p; + s.w = (ui32)atoi(p); + p = strchr(p, 'x'); // p is either NULL or pointing to ':' + if (p == NULL) { + printf("Expecting image height.\n"); + printf("A .rawl that does not have the expected format, which is\n"); + printf(".rawl:widthxheightxbitdepthxsignedxnum_comp\n"); + exit(-1); + } + ++p; + s.h = (ui32)atoi(p); + p = strchr(p, 'x'); // p is either NULL or pointing to ':' + if (p == NULL) { + printf("Expecting image bitdepth.\n"); + printf("A .rawl that does not have the expected format, which is\n"); + printf(".rawl:widthxheightxbitdepthxsignedxnum_comp\n"); + exit(-1); + } + ++p; + ui32 bit_depth = (ui32)atoi(p); + p = strchr(p, 'x'); // p is either NULL or pointing to ':' + if (p == NULL) { + printf("Expecting signedness information (either 0 or 1).\n"); + printf("A .rawl that does not have the expected format, which is\n"); + printf(".rawl:widthxheightxbitdepthxsignedxnum_comp, where num_comp is\n"); + printf("either 1 or 3\n"); + exit(-1); + } + ++p; + bool is_signed = *p != '0'; + p = strchr(p, 'x'); // p is either NULL or pointing to ':' + if (p == NULL) { + printf("Expecting number of components.\n"); + printf("A .rawl that does not have the expected format, which is\n"); + printf(".rawl:widthxheightxbitdepthxsignedxnum_comp, where num_comp is\n"); + printf("either 1 or 3\n"); + exit(-1); + } + ++p; + ui32 num_comps = (ui32)atoi(p); + if (num_comps != 1 && num_comps != 3) + { + printf("num_comp must be either 1 or 3, %s was supplied.\n", p); + printf("A .rawl that does not have the expected format, which is\n"); + printf(".rawl:widthxheightxbitdepthxsignedxnum_comp, where format is\n"); + printf("either 1 or 3\n"); + exit(-1); + } + + char name_buf[2048]; + ptrdiff_t cpy_len = name_end - filename > 2047 ? 2047 : name_end - filename; + strncpy(name_buf, filename, (size_t)cpy_len); + name_buf[cpy_len] = 0; + + size_t w = calc_aligned_size(s.w); + if (num_comps == 3) + img.init(num_comps, s.w, s.h, bit_depth, is_signed, FORMAT444); + else + img.init(num_comps, s.w, s.h, bit_depth, is_signed, FORMAT400); + + if (is_signed) + { + if (bit_depth <= 8) + { + si8 *buffer = new si8[s.w * s.h]; + FILE *f = fopen(name_buf, "rb"); + if (f == NULL) { + printf("Error opening file %s\n", name_buf); + exit(-1); + } + + for (ui32 i = 0; i < num_comps; ++i) + { + si8 *sp = buffer; + si32 *dp = img.comps[i]; + if (fread(buffer, 1, s.w * s.h, f) != s.w * s.h) { + printf("Error reading from file %s\n", name_buf); + exit(-1); + } + for (ui32 j = s.w * s.h; j > 0; --j) + *dp++ = *sp++; + } + fclose(f); + delete[] buffer; + } + else if (bit_depth <= 16) + { + si16 *buffer = new si16[s.w * s.h]; + FILE *f = fopen(name_buf, "rb"); + if (f == NULL) { + printf("Error opening file %s\n", name_buf); + exit(-1); + } + + for (ui32 i = 0; i < num_comps; ++i) + { + si16 *sp = buffer; + si32 *dp = img.comps[i]; + if (fread(buffer, 2, s.w * s.h, f) != s.w * s.h) { + printf("Error reading from file %s\n", name_buf); + exit(-1); + } + for (ui32 j = s.w * s.h; j > 0; --j) + *dp++ = *sp++; + } + fclose(f); + delete[] buffer; + } + else + { + si32 *buffer = new si32[s.w * s.h]; + FILE *f = fopen(name_buf, "rb"); + if (f == NULL) { + printf("Error opening file %s\n", name_buf); + exit(-1); + } + + for (ui32 i = 0; i < num_comps; ++i) + { + si32 *sp = buffer; + si32 *dp = img.comps[i]; + if (fread(buffer, 4, s.w * s.h, f) != s.w * s.h) { + printf("Error reading from file %s\n", name_buf); + exit(-1); + } + for (ui32 j = s.w * s.h; j > 0; --j) + *dp++ = *sp++; + } + fclose(f); + delete[] buffer; + } + } + else + { + if (bit_depth <= 8) + { + ui8 *buffer = new ui8[s.w * s.h]; + FILE *f = fopen(name_buf, "rb"); + if (f == NULL) { + printf("Error opening file %s\n", name_buf); + exit(-1); + } + + for (ui32 i = 0; i < num_comps; ++i) + { + ui8 *sp = buffer; + si32 *dp = img.comps[i]; + if (fread(buffer, 1, s.w * s.h, f) != s.w * s.h) { + printf("Error reading from file %s\n", name_buf); + exit(-1); + } + for (ui32 j = s.w * s.h; j > 0; --j) + *dp++ = *sp++; + } + fclose(f); + delete[] buffer; + } + else if (bit_depth <= 16) + { + ui16 *buffer = new ui16[s.w * s.h]; + FILE *f = fopen(name_buf, "rb"); + if (f == NULL) { + printf("Error opening file %s\n", name_buf); + exit(-1); + } + + for (ui32 i = 0; i < num_comps; ++i) + { + ui16 *sp = buffer; + si32 *dp = img.comps[i]; + if (fread(buffer, 2, s.w * s.h, f) != s.w * s.h) { + printf("Error reading from file %s\n", name_buf); + exit(-1); + } + for (ui32 j = s.w * s.h; j > 0; --j) + *dp++ = *sp++; + } + fclose(f); + delete[] buffer; + } + else + { + ui32 *buffer = new ui32[s.w * s.h]; + FILE *f = fopen(name_buf, "rb"); + if (f == NULL) { + printf("Error opening file %s\n", name_buf); + exit(-1); + } + + for (ui32 i = 0; i < num_comps; ++i) + { + ui32 *sp = buffer; + si32 *dp = img.comps[i]; + if (fread(buffer, 4, s.w * s.h, f) != s.w * s.h) { + printf("Error reading from file %s\n", name_buf); + exit(-1); + } + for (ui32 j = s.w * s.h; j > 0; --j) + *dp++ = (si32)*sp++; + } + fclose(f); + delete[] buffer; + } + } +} + void find_mse_pae(const img_info& img1, const img_info& img2, float mse[3], ui32 pae[3]) { if (img1.num_comps != img2.num_comps || img1.format != img2.format || img1.width != img2.width || img1.height != img2.height || - img1.max_val != img2.max_val) + img1.bit_depth != img2.bit_depth || img1.is_signed != img2.is_signed) { printf("Error: mismatching images\n"); exit(-1); @@ -298,26 +527,99 @@ void find_mse_pae(const img_info& img1, const img_info& img2, h = (img1.height + img1.downsampling[c].x - 1) / img1.downsampling[c].x; double se = 0; ui32 lpae = 0; - for (ui32 v = 0; v < h; ++v) - { - si32 *p0 = img1.comps[c] + w * v; - si32 *p1 = img2.comps[c] + w * v; - for (ui32 s = 0; s < w; ++s) + if (img1.is_signed) + for (ui32 v = 0; v < h; ++v) { - si32 err = *p0++ - *p1++; - ui32 ae = (ui32)(err > 0 ? err : -err); - lpae = ae > lpae ? ae : lpae; - se += (double)err * (double)err; + si32 *p0 = img1.comps[c] + w * v; + si32 *p1 = img2.comps[c] + w * v; + for (ui32 s = 0; s < w; ++s) + { + si32 err = *p0++ - *p1++; + ui32 ae = (ui32)(err > 0 ? err : -err); + lpae = ae > lpae ? ae : lpae; + se += (double)err * (double)err; + } + } + else + for (ui32 v = 0; v < h; ++v) + { + ui32 *p0 = (ui32*)img1.comps[c] + w * v; + ui32 *p1 = (ui32*)img2.comps[c] + w * v; + for (ui32 s = 0; s < w; ++s) + { + ui32 a = *p0++; + ui32 b = *p1++; + ui32 err = a > b ? a - b : b - a; + lpae = err > lpae ? err : lpae; + se += (double)err * (double)err; + } } - } mse[c] = (float)se / (float)(w * h); pae[c] = lpae; } - // float t = 0; - // for (ui32 c = 0; c < img1.num_comps; ++c) - // t += (float)mse[c]; - // t /= (float)num_pixels; - // psnr = 10.0f * log10f((float)img1.max_val * (float)img1.max_val / t); +} + +void find_nlt_mse_pae(const img_info& img1, const img_info& img2, + float mse[3], ui32 pae[3]) +{ + if (img1.num_comps != img2.num_comps || img1.format != img2.format || + img1.width != img2.width || img1.height != img2.height || + img1.bit_depth != img2.bit_depth || img1.is_signed != img2.is_signed) + { + printf("Error: mismatching images\n"); + exit(-1); + } + if (img1.is_signed) + for (ui32 c = 0; c < img1.num_comps; ++c) + { + size_t w, h; + w = (img1.width + img1.downsampling[c].x - 1) / img1.downsampling[c].x; + h = (img1.height + img1.downsampling[c].x - 1) / img1.downsampling[c].x; + double se = 0; + ui32 lpae = 0; + si32 bias = (si32)((1ULL << (img1.bit_depth - 1)) + 1); + for (ui32 v = 0; v < h; ++v) + { + si32 *p0 = img1.comps[c] + w * v; + si32 *p1 = img2.comps[c] + w * v; + for (ui32 s = 0; s < w; ++s) + { + si32 a = *p0++; + si32 b = *p1++; + a = (a >= 0) ? a : (- a - bias); + b = (b >= 0) ? b : (- b - bias); + ui32 err = a > b ? a - b : b - a; + lpae = err > lpae ? err : lpae; + se += (double)err * (double)err; + } + } + mse[c] = (float)se / (float)(w * h); + pae[c] = lpae; + } + else + for (ui32 c = 0; c < img1.num_comps; ++c) + { + size_t w, h; + w = (img1.width + img1.downsampling[c].x - 1) / img1.downsampling[c].x; + h = (img1.height + img1.downsampling[c].x - 1) / img1.downsampling[c].x; + double se = 0; + ui32 lpae = 0; + for (ui32 v = 0; v < h; ++v) + { + ui32 *p0 = (ui32*)img1.comps[c] + w * v; + ui32 *p1 = (ui32*)img2.comps[c] + w * v; + for (ui32 s = 0; s < w; ++s) + { + ui32 a = *p0++; + ui32 b = *p1++; + ui32 err = a > b ? a - b : b - a; + lpae = err > lpae ? err : lpae; + se += (double)err * (double)err; + } + } + mse[c] = (float)se / (float)(w * h); + pae[c] = lpae; + } } int main(int argc, char *argv[]) @@ -325,20 +627,36 @@ int main(int argc, char *argv[]) if (argc < 3) { printf("mse_pae expects two arguments \n"); + printf("A third optional argment is \"-nlt\".\n"); exit(-1); } - + + bool nlt = false; + if (argc == 4) + { + if (strcmp("-nlt", argv[3]) == 0) + nlt = true; + else { + printf("unknown 4th parameter %s\n", argv[3]); + exit(-1); + } + } + + img_info img1, img2; try { if (is_pnm(argv[1])) load_ppm(argv[1], img1); else if (is_yuv(argv[1])) load_yuv(argv[1], img1); + else if (is_rawl(argv[1])) + load_rawl(argv[1], img1); else { printf("mse_pae does not know file format of %s\n", argv[1]); printf("or a .yuv that does not have the expected format, which is\n"); printf(".yuv:widthxheightxbitdepthxformat, where format is\n"); - printf("either 444, 422, or 420\n"); + printf("either 444, 422, or 420, or wrongly format .rawl, which has\n"); + printf(".rawl:widthxheightxbitdepthxsignedxnum_comp format.\n"); exit(-1); } } @@ -355,11 +673,14 @@ int main(int argc, char *argv[]) load_ppm(argv[2], img2); else if (is_yuv(argv[2])) load_yuv(argv[2], img2); + else if (is_rawl(argv[2])) + load_rawl(argv[2], img2); else { printf("mse_pae does not know file format of %s\n", argv[2]); printf("or a .yuv that does not have the expected format, which is\n"); printf(".yuv:widthxheightxbitdepthxformat, where format is\n"); - printf("either 444, 422, or 420\n"); + printf("either 444, 422, or 420, or wrongly format .rawl, which has\n"); + printf(".rawl:widthxheightxbitdepthxsignedxnum_comp format.\n"); exit(-1); } } @@ -372,7 +693,10 @@ int main(int argc, char *argv[]) } float mse[3]; ui32 pae[3]; - find_mse_pae(img1, img2, mse, pae); + if (!nlt) + find_mse_pae(img1, img2, mse, pae); + else + find_nlt_mse_pae(img1, img2, mse, pae); for (ui32 c = 0; c < img1.num_comps; ++c) printf("%f %d\n", mse[c], pae[c]); From 9921864ac5b6b08f9b2e69d97f79c9344dda0634 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Mon, 27 Jan 2025 19:00:19 +1100 Subject: [PATCH 19/20] A bug fix. --- src/core/transform/ojph_colour_wasm.cpp | 6 +++--- tests/mse_pae.cpp | 3 +-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/core/transform/ojph_colour_wasm.cpp b/src/core/transform/ojph_colour_wasm.cpp index 8f307b13..aa9a79eb 100644 --- a/src/core/transform/ojph_colour_wasm.cpp +++ b/src/core/transform/ojph_colour_wasm.cpp @@ -411,7 +411,7 @@ namespace ojph { c = wasm_v128_andnot(t, c); // keep only +ve or 0 t = wasm_v128_or(neg, c); // combine } - v128_t v = wasm_f32x4_convert_i32x4(u); + v128_t v = wasm_f32x4_convert_i32x4(t); v = wasm_f32x4_mul(v, mul); wasm_v128_store(dp, v); } @@ -421,8 +421,8 @@ namespace ojph { v128_t half = wasm_i32x4_splat((si32)(1ULL << (bit_depth - 1))); for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) { v128_t t = wasm_v128_load(sp); - u = wasm_i32x4_sub(u, half); - v128_t v = wasm_f32x4_convert_i32x4(u); + t = wasm_i32x4_sub(t, half); + v128_t v = wasm_f32x4_convert_i32x4(t); v = wasm_f32x4_mul(v, mul); wasm_v128_store(dp, v); } diff --git a/tests/mse_pae.cpp b/tests/mse_pae.cpp index 8239b851..f1b84e64 100644 --- a/tests/mse_pae.cpp +++ b/tests/mse_pae.cpp @@ -358,7 +358,6 @@ void load_rawl(const char *filename, img_info& img) strncpy(name_buf, filename, (size_t)cpy_len); name_buf[cpy_len] = 0; - size_t w = calc_aligned_size(s.w); if (num_comps == 3) img.init(num_comps, s.w, s.h, bit_depth, is_signed, FORMAT444); else @@ -588,7 +587,7 @@ void find_nlt_mse_pae(const img_info& img1, const img_info& img2, si32 b = *p1++; a = (a >= 0) ? a : (- a - bias); b = (b >= 0) ? b : (- b - bias); - ui32 err = a > b ? a - b : b - a; + ui32 err = (ui32)(a > b ? a - b : b - a); lpae = err > lpae ? err : lpae; se += (double)err * (double)err; } From d84633859d59eb41abb13fee960988ff6898f7c7 Mon Sep 17 00:00:00 2001 From: Aous Naman Date: Tue, 28 Jan 2025 09:11:13 +1100 Subject: [PATCH 20/20] A version bump. --- src/core/common/ojph_version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/common/ojph_version.h b/src/core/common/ojph_version.h index 4c05b02b..4a61fc99 100644 --- a/src/core/common/ojph_version.h +++ b/src/core/common/ojph_version.h @@ -35,4 +35,4 @@ #define OPENJPH_VERSION_MAJOR 0 #define OPENJPH_VERSION_MINOR 19 -#define OPENJPH_VERSION_PATCH 0 +#define OPENJPH_VERSION_PATCH 1