From 7086c360d9603a5fb9643ced11bc620e8cd17aa9 Mon Sep 17 00:00:00 2001
From: Aous Naman <aous72@yahoo.com>
Date: Tue, 21 Jan 2025 22:18:38 +1100
Subject: [PATCH 01/20] Generic NLT code is added.  Testing is needed.

---
 src/core/codestream/ojph_tile.cpp      |  96 ++++++++++------
 src/core/common/ojph_arch.h            |  12 ++
 src/core/common/ojph_mem.h             |   4 +-
 src/core/transform/ojph_colour.cpp     | 145 ++++++++++++++++++++++++-
 src/core/transform/ojph_colour.h       |  10 ++
 src/core/transform/ojph_colour_local.h |  10 ++
 6 files changed, 239 insertions(+), 38 deletions(-)

diff --git a/src/core/codestream/ojph_tile.cpp b/src/core/codestream/ojph_tile.cpp
index 9cec729c..67fed0bd 100644
--- a/src/core/codestream/ojph_tile.cpp
+++ b/src/core/codestream/ojph_tile.cpp
@@ -273,13 +273,18 @@ namespace ojph {
         }
         else
         {
-          float mul = 1.0f / (float)(1<<num_bits[comp_num]);
-          const si32 *sp = line->i32 + line_offsets[comp_num];
-          float *dp = tc->f32;
-          if (is_signed[comp_num])
-            cnvrt_si32_to_float(sp, dp, mul, comp_width);
-          else
-            cnvrt_si32_to_float_shftd(sp, dp, mul, comp_width);
+          if (nlt_type3[comp_num] == type3)
+            irv_convert_to_float_nlt_type3(line, line_offsets[comp_num],
+              tc, num_bits[comp_num], is_signed[comp_num], comp_width);
+          else {
+            float mul = 1.0f / (float)(1<<num_bits[comp_num]);
+            const si32 *sp = line->i32 + line_offsets[comp_num];
+            float *dp = tc->f32;
+            if (is_signed[comp_num])
+              cnvrt_si32_to_float(sp, dp, mul, comp_width);
+            else
+              cnvrt_si32_to_float_shftd(sp, dp, mul, comp_width);
+          }
         }
         comps[comp_num].push_line();
       }
@@ -311,13 +316,19 @@ namespace ojph {
         }
         else
         {
-          float mul = 1.0f / (float)(1<<num_bits[comp_num]);
-          const si32 *sp = line->i32 + line_offsets[comp_num];
-          float *dp = lines[comp_num].f32;
-          if (is_signed[comp_num])
-            cnvrt_si32_to_float(sp, dp, mul, comp_width);
-          else
-            cnvrt_si32_to_float_shftd(sp, dp, mul, comp_width);
+          if (nlt_type3[comp_num] == type3)
+            irv_convert_to_float_nlt_type3(line, line_offsets[comp_num],
+              lines + comp_num, num_bits[comp_num], is_signed[comp_num], 
+              comp_width);
+          else {
+            float mul = 1.0f / (float)(1<<num_bits[comp_num]);
+            const si32 *sp = line->i32 + line_offsets[comp_num];
+            float *dp = lines[comp_num].f32;
+            if (is_signed[comp_num])
+              cnvrt_si32_to_float(sp, dp, mul, comp_width);
+            else
+              cnvrt_si32_to_float_shftd(sp, dp, mul, comp_width);
+          }
           if (comp_num == 2)
           { // irreversible color transform
             ict_forward(lines[0].f32, lines[1].f32, lines[2].f32,
@@ -364,13 +375,21 @@ namespace ojph {
         }
         else
         {
-          float mul = (float)(1 << num_bits[comp_num]);
-          const float *sp = src_line->f32;
-          si32 *dp = tgt_line->i32 + line_offsets[comp_num];
-          if (is_signed[comp_num])
-            cnvrt_float_to_si32(sp, dp, mul, comp_width);
-          else
-            cnvrt_float_to_si32_shftd(sp, dp, mul, comp_width);
+          if (nlt_type3[comp_num] == type3)
+          {
+            irv_convert_to_integer_nlt_type3(src_line, tgt_line, 
+              line_offsets[comp_num], num_bits[comp_num], 
+              is_signed[comp_num], comp_width);
+          }
+          else {
+            float mul = (float)(1 << num_bits[comp_num]);
+            const float *sp = src_line->f32;
+            si32 *dp = tgt_line->i32 + line_offsets[comp_num];
+            if (is_signed[comp_num])
+              cnvrt_float_to_si32(sp, dp, mul, comp_width);
+            else
+              cnvrt_float_to_si32_shftd(sp, dp, mul, comp_width);
+          }
         }
       }
       else
@@ -407,17 +426,30 @@ namespace ojph {
         }
         else
         {
-          float mul = (float)(1 << num_bits[comp_num]);
-          const float *sp;
-          if (comp_num < 3)
-            sp = lines[comp_num].f32;
-          else
-            sp = comps[comp_num].pull_line()->f32;
-          si32 *dp = tgt_line->i32 + line_offsets[comp_num];
-          if (is_signed[comp_num])
-            cnvrt_float_to_si32(sp, dp, mul, comp_width);
-          else
-            cnvrt_float_to_si32_shftd(sp, dp, mul, comp_width);
+          if (nlt_type3[comp_num] == type3)
+          {
+            line_buf* lbp;
+            if (comp_num < 3)
+              lbp = lines + comp_num;
+            else
+              lbp = comps[comp_num].pull_line();            
+            irv_convert_to_integer_nlt_type3(lbp, tgt_line, 
+              line_offsets[comp_num], num_bits[comp_num], 
+              is_signed[comp_num], comp_width);
+          }
+          else {
+            float mul = (float)(1 << num_bits[comp_num]);
+            const float *sp;
+            if (comp_num < 3)
+              sp = lines[comp_num].f32;
+            else
+              sp = comps[comp_num].pull_line()->f32;
+            si32 *dp = tgt_line->i32 + line_offsets[comp_num];
+            if (is_signed[comp_num])
+              cnvrt_float_to_si32(sp, dp, mul, comp_width);
+            else
+              cnvrt_float_to_si32_shftd(sp, dp, mul, comp_width);
+          }
         }
       }
 
diff --git a/src/core/common/ojph_arch.h b/src/core/common/ojph_arch.h
index 29ab7a57..33e434a0 100644
--- a/src/core/common/ojph_arch.h
+++ b/src/core/common/ojph_arch.h
@@ -271,6 +271,18 @@ namespace ojph {
   #endif
   }
 
+  ////////////////////////////////////////////////////////////////////////////
+  static inline si64 ojph_round64(float val)
+  {
+  #ifdef OJPH_COMPILER_MSVC
+    return (si64)(val + (val >= 0.0f ? 0.5f : -0.5f));
+  #elif (defined OJPH_COMPILER_GNUC)
+    return (si64)(val + (val >= 0.0f ? 0.5f : -0.5f));
+  #else
+    return (si64)round(val);
+  #endif
+  }
+
   ////////////////////////////////////////////////////////////////////////////
   static inline si32 ojph_trunc(float val)
   {
diff --git a/src/core/common/ojph_mem.h b/src/core/common/ojph_mem.h
index 99897f32..b910e120 100644
--- a/src/core/common/ojph_mem.h
+++ b/src/core/common/ojph_mem.h
@@ -138,8 +138,8 @@ namespace ojph {
     enum : ui32 {
       LFT_UNDEFINED  = 0x00, // Type is undefined/uninitialized
                              // These flags reflects data size in bytes
-      LFT_BYTE       = 0x01, // Set when data is 1 byte
-      LFT_16BIT      = 0x02, // Set when data is 2 bytes
+      LFT_BYTE       = 0x01, // Set when data is 1 byte  (not used)
+      LFT_16BIT      = 0x02, // Set when data is 2 bytes (not used)
       LFT_32BIT      = 0x04, // Set when data is 4 bytes
       LFT_64BIT      = 0x08, // Set when data is 8 bytes
       LFT_REVERSIBLE = 0x10, // Set when data is used for reversible coding
diff --git a/src/core/transform/ojph_colour.cpp b/src/core/transform/ojph_colour.cpp
index a98b477b..3c6ab026 100644
--- a/src/core/transform/ojph_colour.cpp
+++ b/src/core/transform/ojph_colour.cpp
@@ -78,6 +78,16 @@ namespace ojph {
     void (*cnvrt_float_to_si32)
       (const float *sp, si32 *dp, float mul, ui32 width) = NULL;
 
+    //////////////////////////////////////////////////////////////////////////
+    void (*irv_convert_to_float_nlt_type3) (
+      const line_buf *src_line, ui32 src_line_offset,
+      line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width) = NULL;
+      
+    //////////////////////////////////////////////////////////////////////////
+    void (*irv_convert_to_integer_nlt_type3) (
+      const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset, 
+      ui32 bit_depth, bool is_signed, ui32 width) = NULL;
+
     //////////////////////////////////////////////////////////////////////////
     void (*rct_forward)
       (const line_buf* r, const line_buf* g, const line_buf* b,
@@ -115,6 +125,8 @@ namespace ojph {
       cnvrt_si32_to_float = gen_cnvrt_si32_to_float;
       cnvrt_float_to_si32_shftd = gen_cnvrt_float_to_si32_shftd;
       cnvrt_float_to_si32 = gen_cnvrt_float_to_si32;
+      irv_convert_to_float_nlt_type3 = gen_irv_convert_to_float_nlt_type3;
+      irv_convert_to_integer_nlt_type3 = gen_irv_convert_to_integer_nlt_type3;
       rct_forward = gen_rct_forward;
       rct_backward = gen_rct_backward;
       ict_forward = gen_ict_forward;
@@ -237,8 +249,8 @@ namespace ojph {
       }
       else 
       {
-        assert(src_line->flags | line_buf::LFT_64BIT);
-        assert(dst_line->flags | line_buf::LFT_32BIT);
+        assert(src_line->flags & line_buf::LFT_64BIT);
+        assert(dst_line->flags & line_buf::LFT_32BIT);
         const si64 *sp = src_line->i64 + src_line_offset;
         si32 *dp = dst_line->i32 + dst_line_offset;
         for (ui32 i = width; i > 0; --i)
@@ -276,8 +288,8 @@ namespace ojph {
       }
       else 
       {
-        assert(src_line->flags | line_buf::LFT_64BIT);
-        assert(dst_line->flags | line_buf::LFT_32BIT);
+        assert(src_line->flags & line_buf::LFT_64BIT);
+        assert(dst_line->flags & line_buf::LFT_32BIT);
         const si64 *sp = src_line->i64 + src_line_offset;
         si32 *dp = dst_line->i32 + dst_line_offset;
         for (ui32 i = width; i > 0; --i) {
@@ -319,6 +331,131 @@ namespace ojph {
         *dp++ = ojph_round(*sp++ * mul);
     }
 
+    //////////////////////////////////////////////////////////////////////////
+    void gen_irv_convert_to_float_nlt_type3(const line_buf *src_line, 
+      ui32 src_line_offset, line_buf *dst_line, 
+      ui32 bit_depth, bool is_signed, ui32 width)
+    {
+      assert((src_line->flags & line_buf::LFT_32BIT) &&
+             (src_line->flags & line_buf::LFT_REVERSIBLE) == 0 &&
+             (dst_line->flags & line_buf::LFT_32BIT) &&
+             (dst_line->flags & line_buf::LFT_REVERSIBLE) == 0);
+
+      float mul;
+      if (bit_depth < 32)
+        mul = 1.0f / (float)(1 << bit_depth);
+      else
+        mul = (float)(1.0 / 65536.0 / 65536.0);
+
+      const si32* sp = src_line->i32 + src_line_offset;
+      float* dp = dst_line->f32;
+      if (is_signed)
+      {
+        si32 shift = (1 << (bit_depth - 1)) + 1;
+        for (ui32 i = width; i > 0; --i) {
+          si32 v = *sp++;
+          v = (v >= 0) ? v : (- v - shift);
+          *dp++ = (float)v * mul;
+        }
+      }
+      else
+      {
+        for (ui32 i = width; i > 0; --i)
+          *dp++ = (float)*sp++ * mul - 0.5f;
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void gen_irv_convert_to_integer_nlt_type3(const line_buf *src_line, 
+      line_buf *dst_line, ui32 dst_line_offset,
+      ui32 bit_depth, bool is_signed, ui32 width)
+    {
+      assert((src_line->flags & line_buf::LFT_32BIT) &&
+             (src_line->flags & line_buf::LFT_REVERSIBLE) == 0 &&
+             (dst_line->flags & line_buf::LFT_32BIT) &&
+             (dst_line->flags & line_buf::LFT_REVERSIBLE) == 0);
+      
+      float mul;
+      if (bit_depth < 32)
+        mul = 1.0f / (float)(1 << bit_depth);
+      else
+        mul = (float)(1.0 / 65536.0 / 65536.0);
+
+      const float* sp = src_line->f32;
+      si32* dp = dst_line->i32 + dst_line_offset;
+      if (bit_depth <= 30) 
+      {
+        // We are leaving two bit overhead -- here, we are assuming that after
+        // multiplications, the resulting number can still be represented
+        // using 32 bit integer
+        const si32 half = (1 << (bit_depth - 1));
+        const si32 shift = half + 1;
+        const si32 upper_limit = 0x7FFFFFFF >> (32 - bit_depth);
+        const si32 lower_limit = 0x80000000 >> (32 - bit_depth);
+
+        if (is_signed)
+        {
+          for (ui32 i = width; i > 0; --i) {
+            si32 v = ojph_round(*sp++ * mul);
+            v = ojph_max(v, lower_limit);
+            v = ojph_min(v, upper_limit);
+            v = (v >= 0) ? v : (- v - shift);
+            *dp++ = v;
+          }
+        }
+        else
+        {
+          for (ui32 i = width; i > 0; --i) {
+            si32 v = ojph_round(*sp++ * mul);
+            v = ojph_max(v, lower_limit);
+            v = ojph_min(v, upper_limit);
+            v = (v >= 0) ? v : (- v - shift);
+            *dp++ = v + half;
+          }
+        }
+      }
+      else
+      {
+        // There is the possibility that converting to integer will
+        // exceed the dynamic range of 32bit integer; therefore, we need
+        // to use 64 bit.  One may think, why not limit the floats to the
+        // range of [-0.5f, 0.5f)? 
+        // Notice the half closed range -- we need a value just below 0.5f.
+        // While getting this number is possible, after multiplication, the
+        // resulting number will not be exactly the maximum that the integer 
+        // can achieve.  All this is academic, because here are talking
+        // about a number which has all the exponent bits set, meaning 
+        // it is either infinity, -infinity, qNan or sNan.
+        const si32 half = (1 << (bit_depth - 1));
+        const si32 shift = half + 1;
+        const si64 upper_limit = 0x7FFFFFFFFFFFFFFFLL >> (64 - bit_depth);
+        const si64 lower_limit = 0x8000000000000000LL >> (64 - bit_depth);
+
+        if (is_signed)
+        {
+          for (ui32 i = width; i > 0; --i) {
+            si64 t = ojph_round64(*sp++ * mul);
+            t = ojph_max(t, lower_limit);
+            t = ojph_min(t, upper_limit);
+            si32 v = (si32)t;
+            v = (v >= 0) ? v : (- v - shift);
+            *dp++ = v;
+          }
+        }
+        else
+        {
+          for (ui32 i = width; i > 0; --i) {
+            si64 t = ojph_round64(*sp++ * mul);
+            t = ojph_max(t, lower_limit);
+            t = ojph_min(t, upper_limit);
+            si32 v = (si32)t;
+            v = (v >= 0) ? v : (- v - shift);
+            *dp++ = v + half;
+          }
+        }
+      }
+    }
+
     //////////////////////////////////////////////////////////////////////////
     void gen_rct_forward(
       const line_buf *r, const line_buf *g, const line_buf *b,
diff --git a/src/core/transform/ojph_colour.h b/src/core/transform/ojph_colour.h
index cc42aaa5..d5375a97 100644
--- a/src/core/transform/ojph_colour.h
+++ b/src/core/transform/ojph_colour.h
@@ -77,6 +77,16 @@ namespace ojph {
   extern void (*cnvrt_float_to_si32)
     (const float *sp, si32 *dp, float mul, ui32 width);
 
+  ////////////////////////////////////////////////////////////////////////////
+  extern void (*irv_convert_to_float_nlt_type3) (
+    const line_buf *src_line, ui32 src_line_offset,
+    line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width);
+
+  ////////////////////////////////////////////////////////////////////////////
+  extern void (*irv_convert_to_integer_nlt_type3) (
+    const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset,
+    ui32 bit_depth, bool is_signed, ui32 width);
+
   ////////////////////////////////////////////////////////////////////////////
   extern void (*rct_forward)
     (const line_buf *r, const line_buf *g, const line_buf *b,
diff --git a/src/core/transform/ojph_colour_local.h b/src/core/transform/ojph_colour_local.h
index 5eb8b746..acd0f944 100644
--- a/src/core/transform/ojph_colour_local.h
+++ b/src/core/transform/ojph_colour_local.h
@@ -92,6 +92,16 @@ namespace ojph {
     void gen_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul,
                                  ui32 width);
 
+    //////////////////////////////////////////////////////////////////////////
+    void gen_irv_convert_to_float_nlt_type3(
+      const line_buf *src_line, ui32 src_line_offset,
+      line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width);
+
+    //////////////////////////////////////////////////////////////////////////
+    void gen_irv_convert_to_integer_nlt_type3(
+      const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset,
+      ui32 bit_depth, bool is_signed, ui32 width);
+
     //////////////////////////////////////////////////////////////////////////
     void gen_rct_forward(
       const line_buf *r, const line_buf *g, const line_buf *b,

From d065c767e227b7f539b1fcdfddd9c4029380671c Mon Sep 17 00:00:00 2001
From: Aous Naman <aous72@yahoo.com>
Date: Tue, 21 Jan 2025 22:30:33 +1100
Subject: [PATCH 02/20] Fixes warning on Mac.

---
 src/core/transform/ojph_colour.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/core/transform/ojph_colour.cpp b/src/core/transform/ojph_colour.cpp
index 3c6ab026..63438948 100644
--- a/src/core/transform/ojph_colour.cpp
+++ b/src/core/transform/ojph_colour.cpp
@@ -390,8 +390,8 @@ namespace ojph {
         // using 32 bit integer
         const si32 half = (1 << (bit_depth - 1));
         const si32 shift = half + 1;
-        const si32 upper_limit = 0x7FFFFFFF >> (32 - bit_depth);
-        const si32 lower_limit = 0x80000000 >> (32 - bit_depth);
+        const si32 upper_limit = INT_MAX >> (32 - bit_depth);
+        const si32 lower_limit = INT_MIN >> (32 - bit_depth);
 
         if (is_signed)
         {
@@ -428,8 +428,8 @@ namespace ojph {
         // it is either infinity, -infinity, qNan or sNan.
         const si32 half = (1 << (bit_depth - 1));
         const si32 shift = half + 1;
-        const si64 upper_limit = 0x7FFFFFFFFFFFFFFFLL >> (64 - bit_depth);
-        const si64 lower_limit = 0x8000000000000000LL >> (64 - bit_depth);
+        const si64 upper_limit = LLONG_MAX >> (64 - bit_depth);
+        const si64 lower_limit = LLONG_MIN >> (64 - bit_depth);
 
         if (is_signed)
         {

From bf48100b12c8c6dd43d29ff772ddd9d39650673b Mon Sep 17 00:00:00 2001
From: Aous Naman <aous72@yahoo.com>
Date: Tue, 21 Jan 2025 22:31:59 +1100
Subject: [PATCH 03/20] Fixes compilation.

---
 src/core/transform/ojph_colour.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/core/transform/ojph_colour.cpp b/src/core/transform/ojph_colour.cpp
index 63438948..c29ff207 100644
--- a/src/core/transform/ojph_colour.cpp
+++ b/src/core/transform/ojph_colour.cpp
@@ -36,6 +36,7 @@
 //***************************************************************************/
 
 #include <cmath>
+#include <climits>
 
 #include "ojph_defs.h"
 #include "ojph_arch.h"

From e60473c28f1ad9862493c30a4f1a714f6965cdf4 Mon Sep 17 00:00:00 2001
From: Aous Naman <aous@unsw.edu.au>
Date: Wed, 22 Jan 2025 12:40:22 +1100
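Subject: [PATCH 04/20] Bug fixes. Improvements.

Allocate float line buffers on irreversible paths, rename
LFT_REVERSIBLE to LFT_INTEGER, remove the line_buf
pre_alloc/finalize_alloc helpers, and fix the NLT type 3 scaling.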

---
 src/core/codestream/ojph_resolution.cpp | 68 +++++++++++++++-------
 src/core/codestream/ojph_subband.cpp    | 24 ++++++--
 src/core/codestream/ojph_tile.cpp       | 21 +++++--
 src/core/common/ojph_mem.h              | 16 +----
 src/core/others/ojph_mem.cpp            | 28 +--------
 src/core/transform/ojph_colour.cpp      | 77 +++++++++++--------------
 src/core/transform/ojph_colour_avx2.cpp | 24 ++++----
 src/core/transform/ojph_colour_sse2.cpp | 24 ++++----
 8 files changed, 145 insertions(+), 137 deletions(-)

diff --git a/src/core/codestream/ojph_resolution.cpp b/src/core/codestream/ojph_resolution.cpp
index 8ee5b79d..343615f8 100644
--- a/src/core/codestream/ojph_resolution.cpp
+++ b/src/core/codestream/ojph_resolution.cpp
@@ -207,20 +207,31 @@ namespace ojph {
 
         const param_qcd* qp = codestream->access_qcd()->get_qcc(comp_num);
         ui32 precision = qp->propose_precision(cdp);
+        const param_atk* atk = cdp->access_atk();
+        bool reversible = atk->is_reversible();
 
         ui32 width = res_rect.siz.w + 1;
-        if (precision <= 32) {
-          for (ui32 i = 0; i < num_steps; ++i)
+        if (reversible)
+        {
+          if (precision <= 32) {
+            for (ui32 i = 0; i < num_steps; ++i)
+              allocator->pre_alloc_data<si32>(width, 1);
             allocator->pre_alloc_data<si32>(width, 1);
-          allocator->pre_alloc_data<si32>(width, 1);
-          allocator->pre_alloc_data<si32>(width, 1);
+            allocator->pre_alloc_data<si32>(width, 1);
+          }
+          else 
+          {
+            for (ui32 i = 0; i < num_steps; ++i)
+              allocator->pre_alloc_data<si64>(width, 1);
+            allocator->pre_alloc_data<si64>(width, 1);
+            allocator->pre_alloc_data<si64>(width, 1);
+          }
         }
-        else 
-        {
+        else {
           for (ui32 i = 0; i < num_steps; ++i)
-            allocator->pre_alloc_data<si64>(width, 1);
-          allocator->pre_alloc_data<si64>(width, 1);
-          allocator->pre_alloc_data<si64>(width, 1);
+            allocator->pre_alloc_data<float>(width, 1);
+          allocator->pre_alloc_data<float>(width, 1);
+          allocator->pre_alloc_data<float>(width, 1);
         }
       }
     }
@@ -474,21 +485,38 @@ namespace ojph {
 
         // initiate storage of line_buf
         ui32 width = res_rect.siz.w + 1;
-        if (precision <= 32)
+        if (this->reversible)
         {
-          for (ui32 i = 0; i < num_steps; ++i)
-            ssp[i].line->wrap(
+          if (precision <= 32)
+          {
+            for (ui32 i = 0; i < num_steps; ++i)
+              ssp[i].line->wrap(
+                allocator->post_alloc_data<si32>(width, 1), width, 1);
+            sig->line->wrap(
               allocator->post_alloc_data<si32>(width, 1), width, 1);
-          sig->line->wrap(allocator->post_alloc_data<si32>(width, 1), width, 1);
-          aug->line->wrap(allocator->post_alloc_data<si32>(width, 1), width, 1);
+            aug->line->wrap(
+              allocator->post_alloc_data<si32>(width, 1), width, 1);
+          }
+          else
+          {
+            for (ui32 i = 0; i < num_steps; ++i)
+              ssp[i].line->wrap(
+                allocator->post_alloc_data<si64>(width, 1), width, 1);
+            sig->line->wrap(
+              allocator->post_alloc_data<si64>(width, 1), width, 1);
+            aug->line->wrap(
+              allocator->post_alloc_data<si64>(width, 1), width, 1);
+          }
         }
-        else
+        else 
         {
-          for (ui32 i = 0; i < num_steps; ++i)
-            ssp[i].line->wrap(
-              allocator->post_alloc_data<si64>(width, 1), width, 1);
-          sig->line->wrap(allocator->post_alloc_data<si64>(width, 1), width, 1);
-          aug->line->wrap(allocator->post_alloc_data<si64>(width, 1), width, 1);
+            for (ui32 i = 0; i < num_steps; ++i)
+              ssp[i].line->wrap(
+                allocator->post_alloc_data<float>(width, 1), width, 1);
+            sig->line->wrap(
+              allocator->post_alloc_data<float>(width, 1), width, 1);
+            aug->line->wrap(
+              allocator->post_alloc_data<float>(width, 1), width, 1);
         }
 
         cur_line = 0;
diff --git a/src/core/codestream/ojph_subband.cpp b/src/core/codestream/ojph_subband.cpp
index 4830895f..655a2b8b 100644
--- a/src/core/codestream/ojph_subband.cpp
+++ b/src/core/codestream/ojph_subband.cpp
@@ -92,6 +92,8 @@ namespace ojph {
 
       const param_qcd* qp = codestream->access_qcd()->get_qcc(comp_num);
       ui32 precision = qp->propose_precision(cdp);
+      const param_atk* atk = cdp->access_atk();
+      bool reversible = atk->is_reversible();
 
       for (ui32 i = 0; i < num_blocks.w; ++i)
         codeblock::pre_alloc(codestream, nominal, precision);
@@ -100,10 +102,15 @@ namespace ojph {
       allocator->pre_alloc_obj<line_buf>(1);
       //allocate line_buf
       ui32 width = band_rect.siz.w + 1;
-      if (precision <= 32)      
-        allocator->pre_alloc_data<si32>(width, 1);
+      if (reversible)
+      {
+        if (precision <= 32)
+          allocator->pre_alloc_data<si32>(width, 1);
+        else
+          allocator->pre_alloc_data<si64>(width, 1);
+      }
       else
-        allocator->pre_alloc_data<si64>(width, 1);
+        allocator->pre_alloc_data<float>(width, 1);
     }
 
     //////////////////////////////////////////////////////////////////////////
@@ -201,10 +208,15 @@ namespace ojph {
       lines = allocator->post_alloc_obj<line_buf>(1);
       //allocate line_buf
       ui32 width = band_rect.siz.w + 1;
-      if (precision <= 32)      
-        lines->wrap(allocator->post_alloc_data<si32>(width, 1), width, 1);
+      if (reversible)
+      {
+        if (precision <= 32)      
+          lines->wrap(allocator->post_alloc_data<si32>(width, 1), width, 1);
+        else
+          lines->wrap(allocator->post_alloc_data<si64>(width, 1), width, 1);
+      }
       else
-        lines->wrap(allocator->post_alloc_data<si64>(width, 1), width, 1);
+        lines->wrap(allocator->post_alloc_data<float>(width, 1), width, 1);
     }
 
     //////////////////////////////////////////////////////////////////////////
diff --git a/src/core/codestream/ojph_tile.cpp b/src/core/codestream/ojph_tile.cpp
index 67fed0bd..ae78b06c 100644
--- a/src/core/codestream/ojph_tile.cpp
+++ b/src/core/codestream/ojph_tile.cpp
@@ -122,11 +122,16 @@ namespace ojph {
       }
 
       //allocate lines
-      if (codestream->get_cod()->is_employing_color_transform())
+      const param_cod* cdp = codestream->get_cod();
+      if (cdp->is_employing_color_transform())
       {
         allocator->pre_alloc_obj<line_buf>(3);
-        for (int i = 0; i < 3; ++i)
-          allocator->pre_alloc_data<si32>(width, 0);
+        if (cdp->access_atk()->is_reversible())
+          for (int i = 0; i < 3; ++i)
+            allocator->pre_alloc_data<si32>(width, 0);
+        else
+          for (int i = 0; i < 3; ++i)
+            allocator->pre_alloc_data<float>(width, 0);
       }
     }
 
@@ -230,8 +235,14 @@ namespace ojph {
       {
         num_lines = 3;
         lines = allocator->post_alloc_obj<line_buf>(num_lines);
-        for (int i = 0; i < 3; ++i)
-          lines[i].wrap(allocator->post_alloc_data<si32>(width, 0), width, 0);
+        if (reversible)
+          for (int i = 0; i < 3; ++i)
+            lines[i].wrap(
+              allocator->post_alloc_data<si32>(width, 0), width, 0);
+        else
+          for (int i = 0; i < 3; ++i)
+            lines[i].wrap(
+              allocator->post_alloc_data<float>(width, 0), width, 0);
       }
       else
       {
diff --git a/src/core/common/ojph_mem.h b/src/core/common/ojph_mem.h
index b910e120..b5a91759 100644
--- a/src/core/common/ojph_mem.h
+++ b/src/core/common/ojph_mem.h
@@ -142,26 +142,14 @@ namespace ojph {
       LFT_16BIT      = 0x02, // Set when data is 2 bytes (not used)
       LFT_32BIT      = 0x04, // Set when data is 4 bytes
       LFT_64BIT      = 0x08, // Set when data is 8 bytes
-      LFT_REVERSIBLE = 0x10, // Set when data is used for reversible coding
-                             // Not all combinations are useful
+      LFT_INTEGER    = 0x10, // Set when data is an integer, in other words
+                             // 32bit integer, not 32bit float
       LFT_SIZE_MASK  = 0x0F, // To extract data size
     };
 
   public:
     line_buf() : size(0), pre_size(0), flags(LFT_UNDEFINED), i32(0) {}
 
-    template<typename T>
-    void pre_alloc(mem_fixed_allocator *p, size_t num_ele, ui32 pre_size)
-    {
-      memset(this, 0, sizeof(line_buf));
-      p->pre_alloc_data<T>(num_ele, pre_size);
-      size = num_ele;
-      this->pre_size = pre_size;
-    }
-    
-    template<typename T>
-    void finalize_alloc(mem_fixed_allocator *p);
-
     template<typename T>
     void wrap(T *buffer, size_t num_ele, ui32 pre_size);
 
diff --git a/src/core/others/ojph_mem.cpp b/src/core/others/ojph_mem.cpp
index 0bb0b5f7..8b1af56f 100644
--- a/src/core/others/ojph_mem.cpp
+++ b/src/core/others/ojph_mem.cpp
@@ -49,30 +49,6 @@ namespace ojph {
   //
   ////////////////////////////////////////////////////////////////////////////
 
-  ////////////////////////////////////////////////////////////////////////////
-  template<>
-  void line_buf::finalize_alloc<si32>(mem_fixed_allocator *p)
-  {
-    assert(p != 0 && size != 0);
-    i32 = p->post_alloc_data<si32>(size, pre_size);
-  }
-
-  ////////////////////////////////////////////////////////////////////////////
-  template<>
-  void line_buf::finalize_alloc<float>(mem_fixed_allocator *p)
-  {
-    assert(p != 0 && size != 0);
-    f32 = p->post_alloc_data<float>(size, pre_size);
-  }
-
-  ////////////////////////////////////////////////////////////////////////////
-  template<>
-  void line_buf::finalize_alloc<si64>(mem_fixed_allocator *p)
-  {
-    assert(p != 0 && size != 0);
-    i64 = p->post_alloc_data<si64>(size, pre_size);
-  }
-
   ////////////////////////////////////////////////////////////////////////////
   template<>
   void line_buf::wrap(si32 *buffer, size_t num_ele, ui32 pre_size)
@@ -80,7 +56,7 @@ namespace ojph {
     this->i32 = buffer;
     this->size = num_ele;
     this->pre_size = pre_size;
-    this->flags = LFT_32BIT | LFT_REVERSIBLE;
+    this->flags = LFT_32BIT | LFT_INTEGER;
   }
 
   ////////////////////////////////////////////////////////////////////////////
@@ -100,7 +76,7 @@ namespace ojph {
     this->i64 = buffer;
     this->size = num_ele;
     this->pre_size = pre_size;
-    this->flags = LFT_64BIT | LFT_REVERSIBLE;
+    this->flags = LFT_64BIT | LFT_INTEGER;
   }
 
   ////////////////////////////////////////////////////////////////////////////
diff --git a/src/core/transform/ojph_colour.cpp b/src/core/transform/ojph_colour.cpp
index c29ff207..8cca554c 100644
--- a/src/core/transform/ojph_colour.cpp
+++ b/src/core/transform/ojph_colour.cpp
@@ -338,31 +338,30 @@ namespace ojph {
       ui32 bit_depth, bool is_signed, ui32 width)
     {
       assert((src_line->flags & line_buf::LFT_32BIT) &&
-             (src_line->flags & line_buf::LFT_REVERSIBLE) == 0 &&
+             (src_line->flags & line_buf::LFT_INTEGER) &&
              (dst_line->flags & line_buf::LFT_32BIT) &&
-             (dst_line->flags & line_buf::LFT_REVERSIBLE) == 0);
+             (dst_line->flags & line_buf::LFT_INTEGER) == 0);
 
-      float mul;
-      if (bit_depth < 32)
-        mul = 1.0f / (float)(1 << bit_depth);
-      else
-        mul = (float)(1.0 / 65536.0 / 65536.0);
+      float mul = (float)(1.0 / 65536.0 / 65536.0);
 
       const si32* sp = src_line->i32 + src_line_offset;
       float* dp = dst_line->f32;
+      ui32 shift = 32 - bit_depth;
       if (is_signed)
       {
-        si32 shift = (1 << (bit_depth - 1)) + 1;
+        si32 bias = (si32)((ui32)INT_MIN + 1);
         for (ui32 i = width; i > 0; --i) {
-          si32 v = *sp++;
-          v = (v >= 0) ? v : (- v - shift);
+          si32 v = *sp++ << shift;
+          v = (v >= 0) ? v : (- v - bias);
           *dp++ = (float)v * mul;
         }
       }
       else
       {
-        for (ui32 i = width; i > 0; --i)
-          *dp++ = (float)*sp++ * mul - 0.5f;
+        for (ui32 i = width; i > 0; --i) {
+          si32 v = *sp++ << shift;
+          *dp++ = (float)v * mul - 0.5f;
+        }
       }
     }
 
@@ -372,16 +371,10 @@ namespace ojph {
       ui32 bit_depth, bool is_signed, ui32 width)
     {
       assert((src_line->flags & line_buf::LFT_32BIT) &&
-             (src_line->flags & line_buf::LFT_REVERSIBLE) == 0 &&
+             (src_line->flags & line_buf::LFT_INTEGER) == 0 &&
              (dst_line->flags & line_buf::LFT_32BIT) &&
-             (dst_line->flags & line_buf::LFT_REVERSIBLE) == 0);
+             (dst_line->flags & line_buf::LFT_INTEGER));
       
-      float mul;
-      if (bit_depth < 32)
-        mul = 1.0f / (float)(1 << bit_depth);
-      else
-        mul = (float)(1.0 / 65536.0 / 65536.0);
-
       const float* sp = src_line->f32;
       si32* dp = dst_line->i32 + dst_line_offset;
       if (bit_depth <= 30) 
@@ -389,28 +382,28 @@ namespace ojph {
         // We are leaving two bit overhead -- here, we are assuming that after
         // multiplications, the resulting number can still be represented
         // using 32 bit integer
-        const si32 half = (1 << (bit_depth - 1));
-        const si32 shift = half + 1;
+        float mul = (float)(1 << bit_depth);
         const si32 upper_limit = INT_MAX >> (32 - bit_depth);
         const si32 lower_limit = INT_MIN >> (32 - bit_depth);
 
         if (is_signed)
         {
+          const si32 bias = (1 << (bit_depth - 1)) + 1;
           for (ui32 i = width; i > 0; --i) {
             si32 v = ojph_round(*sp++ * mul);
             v = ojph_max(v, lower_limit);
             v = ojph_min(v, upper_limit);
-            v = (v >= 0) ? v : (- v - shift);
+            v = (v >= 0) ? v : (- v - bias);
             *dp++ = v;
           }
         }
         else
         {
+          const si32 half = (1 << (bit_depth - 1));
           for (ui32 i = width; i > 0; --i) {
             si32 v = ojph_round(*sp++ * mul);
             v = ojph_max(v, lower_limit);
             v = ojph_min(v, upper_limit);
-            v = (v >= 0) ? v : (- v - shift);
             *dp++ = v + half;
           }
         }
@@ -427,30 +420,30 @@ namespace ojph {
         // can achieve.  All this is academic, because here are talking
         // about a number which has all the exponent bits set, meaning 
         // it is either infinity, -infinity, qNan or sNan.
-        const si32 half = (1 << (bit_depth - 1));
-        const si32 shift = half + 1;
-        const si64 upper_limit = LLONG_MAX >> (64 - bit_depth);
-        const si64 lower_limit = LLONG_MIN >> (64 - bit_depth);
+        float mul = (float)(1ull << bit_depth);
+        const si64 upper_limit = (si64)LLONG_MAX >> (64 - bit_depth);
+        const si64 lower_limit = (si64)LLONG_MIN >> (64 - bit_depth);
 
         if (is_signed)
         {
+          const si32 bias = (1 << (bit_depth - 1)) + 1;
           for (ui32 i = width; i > 0; --i) {
             si64 t = ojph_round64(*sp++ * mul);
             t = ojph_max(t, lower_limit);
             t = ojph_min(t, upper_limit);
             si32 v = (si32)t;
-            v = (v >= 0) ? v : (- v - shift);
+            v = (v >= 0) ? v : (- v - bias);
             *dp++ = v;
           }
         }
         else
         {
+          const si32 half = (1 << (bit_depth - 1));
           for (ui32 i = width; i > 0; --i) {
             si64 t = ojph_round64(*sp++ * mul);
             t = ojph_max(t, lower_limit);
             t = ojph_min(t, upper_limit);
             si32 v = (si32)t;
-            v = (v >= 0) ? v : (- v - shift);
             *dp++ = v + half;
           }
         }
@@ -462,12 +455,12 @@ namespace ojph {
       const line_buf *r, const line_buf *g, const line_buf *b,
       line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat)
     {
-      assert((y->flags  & line_buf::LFT_REVERSIBLE) &&
-             (cb->flags & line_buf::LFT_REVERSIBLE) && 
-             (cr->flags & line_buf::LFT_REVERSIBLE) &&
-             (r->flags  & line_buf::LFT_REVERSIBLE) &&
-             (g->flags  & line_buf::LFT_REVERSIBLE) && 
-             (b->flags  & line_buf::LFT_REVERSIBLE));
+      assert((y->flags  & line_buf::LFT_INTEGER) &&
+             (cb->flags & line_buf::LFT_INTEGER) && 
+             (cr->flags & line_buf::LFT_INTEGER) &&
+             (r->flags  & line_buf::LFT_INTEGER) &&
+             (g->flags  & line_buf::LFT_INTEGER) && 
+             (b->flags  & line_buf::LFT_INTEGER));
       
       if  (y->flags & line_buf::LFT_32BIT)
       {
@@ -512,12 +505,12 @@ namespace ojph {
       const line_buf *y, const line_buf *cb, const line_buf *cr,
       line_buf *r, line_buf *g, line_buf *b, ui32 repeat)
     {
-      assert((y->flags  & line_buf::LFT_REVERSIBLE) &&
-             (cb->flags & line_buf::LFT_REVERSIBLE) && 
-             (cr->flags & line_buf::LFT_REVERSIBLE) &&
-             (r->flags  & line_buf::LFT_REVERSIBLE) &&
-             (g->flags  & line_buf::LFT_REVERSIBLE) && 
-             (b->flags  & line_buf::LFT_REVERSIBLE));
+      assert((y->flags  & line_buf::LFT_INTEGER) &&
+             (cb->flags & line_buf::LFT_INTEGER) && 
+             (cr->flags & line_buf::LFT_INTEGER) &&
+             (r->flags  & line_buf::LFT_INTEGER) &&
+             (g->flags  & line_buf::LFT_INTEGER) && 
+             (b->flags  & line_buf::LFT_INTEGER));
 
       if (y->flags & line_buf::LFT_32BIT)
       {
diff --git a/src/core/transform/ojph_colour_avx2.cpp b/src/core/transform/ojph_colour_avx2.cpp
index 05bff311..bc9a9e9f 100644
--- a/src/core/transform/ojph_colour_avx2.cpp
+++ b/src/core/transform/ojph_colour_avx2.cpp
@@ -243,12 +243,12 @@ namespace ojph {
                           line_buf *y, line_buf *cb, line_buf *cr, 
                           ui32 repeat)
     {
-      assert((y->flags  & line_buf::LFT_REVERSIBLE) &&
-             (cb->flags & line_buf::LFT_REVERSIBLE) && 
-             (cr->flags & line_buf::LFT_REVERSIBLE) &&
-             (r->flags  & line_buf::LFT_REVERSIBLE) &&
-             (g->flags  & line_buf::LFT_REVERSIBLE) && 
-             (b->flags  & line_buf::LFT_REVERSIBLE));
+      assert((y->flags  & line_buf::LFT_INTEGER) &&
+             (cb->flags & line_buf::LFT_INTEGER) && 
+             (cr->flags & line_buf::LFT_INTEGER) &&
+             (r->flags  & line_buf::LFT_INTEGER) &&
+             (g->flags  & line_buf::LFT_INTEGER) && 
+             (b->flags  & line_buf::LFT_INTEGER));
       
       if  (y->flags & line_buf::LFT_32BIT)
       {
@@ -333,12 +333,12 @@ namespace ojph {
                            line_buf *r, line_buf *g, line_buf *b, 
                            ui32 repeat)
     {
-      assert((y->flags  & line_buf::LFT_REVERSIBLE) &&
-             (cb->flags & line_buf::LFT_REVERSIBLE) && 
-             (cr->flags & line_buf::LFT_REVERSIBLE) &&
-             (r->flags  & line_buf::LFT_REVERSIBLE) &&
-             (g->flags  & line_buf::LFT_REVERSIBLE) && 
-             (b->flags  & line_buf::LFT_REVERSIBLE));
+      assert((y->flags  & line_buf::LFT_INTEGER) &&
+             (cb->flags & line_buf::LFT_INTEGER) && 
+             (cr->flags & line_buf::LFT_INTEGER) &&
+             (r->flags  & line_buf::LFT_INTEGER) &&
+             (g->flags  & line_buf::LFT_INTEGER) && 
+             (b->flags  & line_buf::LFT_INTEGER));
 
       if (y->flags & line_buf::LFT_32BIT)
       {
diff --git a/src/core/transform/ojph_colour_sse2.cpp b/src/core/transform/ojph_colour_sse2.cpp
index a529c66b..37fa1c8a 100644
--- a/src/core/transform/ojph_colour_sse2.cpp
+++ b/src/core/transform/ojph_colour_sse2.cpp
@@ -290,12 +290,12 @@ namespace ojph {
                           line_buf *y, line_buf *cb, line_buf *cr, 
                           ui32 repeat)
     {
-      assert((y->flags  & line_buf::LFT_REVERSIBLE) &&
-             (cb->flags & line_buf::LFT_REVERSIBLE) && 
-             (cr->flags & line_buf::LFT_REVERSIBLE) &&
-             (r->flags  & line_buf::LFT_REVERSIBLE) &&
-             (g->flags  & line_buf::LFT_REVERSIBLE) && 
-             (b->flags  & line_buf::LFT_REVERSIBLE));
+      assert((y->flags  & line_buf::LFT_INTEGER) &&
+             (cb->flags & line_buf::LFT_INTEGER) && 
+             (cr->flags & line_buf::LFT_INTEGER) &&
+             (r->flags  & line_buf::LFT_INTEGER) &&
+             (g->flags  & line_buf::LFT_INTEGER) && 
+             (b->flags  & line_buf::LFT_INTEGER));
       
       if  (y->flags & line_buf::LFT_32BIT)
       {
@@ -381,12 +381,12 @@ namespace ojph {
                            line_buf *r, line_buf *g, line_buf *b, 
                            ui32 repeat)
     {
-      assert((y->flags  & line_buf::LFT_REVERSIBLE) &&
-             (cb->flags & line_buf::LFT_REVERSIBLE) && 
-             (cr->flags & line_buf::LFT_REVERSIBLE) &&
-             (r->flags  & line_buf::LFT_REVERSIBLE) &&
-             (g->flags  & line_buf::LFT_REVERSIBLE) && 
-             (b->flags  & line_buf::LFT_REVERSIBLE));
+      assert((y->flags  & line_buf::LFT_INTEGER) &&
+             (cb->flags & line_buf::LFT_INTEGER) && 
+             (cr->flags & line_buf::LFT_INTEGER) &&
+             (r->flags  & line_buf::LFT_INTEGER) &&
+             (g->flags  & line_buf::LFT_INTEGER) && 
+             (b->flags  & line_buf::LFT_INTEGER));
 
       if (y->flags & line_buf::LFT_32BIT)
       {

From e0a3c2bc86d997659452f199f6c51a04c0fa17b5 Mon Sep 17 00:00:00 2001
From: Aous Naman <aous@unsw.edu.au>
Date: Wed, 22 Jan 2025 21:06:19 +1100
Subject: [PATCH 05/20] Added SIMD, except for the 32bit path. Wasm SIMD is
 missing.  Not tested.

---
 src/core/transform/ojph_colour.cpp      |  84 ++++-----
 src/core/transform/ojph_colour_avx2.cpp | 147 ++++++++++++++++
 src/core/transform/ojph_colour_local.h  |  30 ++++
 src/core/transform/ojph_colour_sse2.cpp | 217 +++++++++++++++++++++---
 src/core/transform/ojph_colour_wasm.cpp | 169 ++++++++++++++++--
 5 files changed, 571 insertions(+), 76 deletions(-)

diff --git a/src/core/transform/ojph_colour.cpp b/src/core/transform/ojph_colour.cpp
index 8cca554c..617fc41f 100644
--- a/src/core/transform/ojph_colour.cpp
+++ b/src/core/transform/ojph_colour.cpp
@@ -79,16 +79,16 @@ namespace ojph {
     void (*cnvrt_float_to_si32)
       (const float *sp, si32 *dp, float mul, ui32 width) = NULL;
 
-    //////////////////////////////////////////////////////////////////////////
-    void (*irv_convert_to_float_nlt_type3) (
-      const line_buf *src_line, ui32 src_line_offset,
-      line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width) = NULL;
-      
     //////////////////////////////////////////////////////////////////////////
     void (*irv_convert_to_integer_nlt_type3) (
       const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset, 
       ui32 bit_depth, bool is_signed, ui32 width) = NULL;
 
+    //////////////////////////////////////////////////////////////////////////
+    void (*irv_convert_to_float_nlt_type3) (
+      const line_buf *src_line, ui32 src_line_offset,
+      line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width) = NULL;
+      
     //////////////////////////////////////////////////////////////////////////
     void (*rct_forward)
       (const line_buf* r, const line_buf* g, const line_buf* b,
@@ -156,6 +156,10 @@ namespace ojph {
           rev_convert_nlt_type3 = sse2_rev_convert_nlt_type3;
           cnvrt_float_to_si32_shftd = sse2_cnvrt_float_to_si32_shftd;
           cnvrt_float_to_si32 = sse2_cnvrt_float_to_si32;
+          irv_convert_to_integer_nlt_type3 =
+            sse2_irv_convert_to_integer_nlt_type3;
+          irv_convert_to_float_nlt_type3 =
+            sse2_irv_convert_to_float_nlt_type3;
           rct_forward = sse2_rct_forward;
           rct_backward = sse2_rct_backward;
         }
@@ -178,6 +182,10 @@ namespace ojph {
         {
           rev_convert = avx2_rev_convert;
           rev_convert_nlt_type3 = avx2_rev_convert_nlt_type3;
+          irv_convert_to_integer_nlt_type3 =
+            avx2_irv_convert_to_integer_nlt_type3;
+          irv_convert_to_float_nlt_type3 =
+            avx2_irv_convert_to_float_nlt_type3;
           rct_forward = avx2_rct_forward;
           rct_backward = avx2_rct_backward;
         }
@@ -332,39 +340,6 @@ namespace ojph {
         *dp++ = ojph_round(*sp++ * mul);
     }
 
-    //////////////////////////////////////////////////////////////////////////
-    void gen_irv_convert_to_float_nlt_type3(const line_buf *src_line, 
-      ui32 src_line_offset, line_buf *dst_line, 
-      ui32 bit_depth, bool is_signed, ui32 width)
-    {
-      assert((src_line->flags & line_buf::LFT_32BIT) &&
-             (src_line->flags & line_buf::LFT_INTEGER) &&
-             (dst_line->flags & line_buf::LFT_32BIT) &&
-             (dst_line->flags & line_buf::LFT_INTEGER) == 0);
-
-      float mul = (float)(1.0 / 65536.0 / 65536.0);
-
-      const si32* sp = src_line->i32 + src_line_offset;
-      float* dp = dst_line->f32;
-      ui32 shift = 32 - bit_depth;
-      if (is_signed)
-      {
-        si32 bias = (si32)((ui32)INT_MIN + 1);
-        for (ui32 i = width; i > 0; --i) {
-          si32 v = *sp++ << shift;
-          v = (v >= 0) ? v : (- v - bias);
-          *dp++ = (float)v * mul;
-        }
-      }
-      else
-      {
-        for (ui32 i = width; i > 0; --i) {
-          si32 v = *sp++ << shift;
-          *dp++ = (float)v * mul - 0.5f;
-        }
-      }
-    }
-
     //////////////////////////////////////////////////////////////////////////
     void gen_irv_convert_to_integer_nlt_type3(const line_buf *src_line, 
       line_buf *dst_line, ui32 dst_line_offset,
@@ -450,6 +425,39 @@ namespace ojph {
       }
     }
 
+    //////////////////////////////////////////////////////////////////////////
+    void gen_irv_convert_to_float_nlt_type3(const line_buf *src_line, 
+      ui32 src_line_offset, line_buf *dst_line, 
+      ui32 bit_depth, bool is_signed, ui32 width)
+    {
+      assert((src_line->flags & line_buf::LFT_32BIT) &&
+             (src_line->flags & line_buf::LFT_INTEGER) &&
+             (dst_line->flags & line_buf::LFT_32BIT) &&
+             (dst_line->flags & line_buf::LFT_INTEGER) == 0);
+      // samples are left-aligned to 32 bits; 2^-32 scales them to [-0.5, 0.5)
+      float mul = (float)(1.0 / 65536.0 / 65536.0);
+      const si32* sp = src_line->i32 + src_line_offset;
+      float* dp = dst_line->f32;
+      ui32 shift = 32 - bit_depth;
+      if (is_signed)
+      {
+        // bias 2^(bit_depth-1) + 1, left-aligned like the samples (mod 2^32)
+        si32 bias = (si32)(0x80000000u + (1u << shift));
+        for (ui32 i = width; i > 0; --i) {
+          si32 v = (si32)((ui32)*sp++ << shift); // unsigned shift, no UB
+          v = (v >= 0) ? v : (si32)(0u - (ui32)v - (ui32)bias);
+          *dp++ = (float)v * mul;
+        }
+      }
+      else
+      {
+        for (ui32 i = width; i > 0; --i) {
+          si32 v = (si32)(((ui32)*sp++ << shift) ^ 0x80000000u); // minus 2^31
+          *dp++ = (float)v * mul; // equals x / 2^bit_depth - 0.5
+        }
+      }
+    }
+
     //////////////////////////////////////////////////////////////////////////
     void gen_rct_forward(
       const line_buf *r, const line_buf *g, const line_buf *b,
diff --git a/src/core/transform/ojph_colour_avx2.cpp b/src/core/transform/ojph_colour_avx2.cpp
index bc9a9e9f..80ef38b4 100644
--- a/src/core/transform/ojph_colour_avx2.cpp
+++ b/src/core/transform/ojph_colour_avx2.cpp
@@ -236,6 +236,153 @@ namespace ojph {
       }
     }
 
+    //////////////////////////////////////////////////////////////////////////
+    void avx2_irv_convert_to_integer_nlt_type3(const line_buf *src_line, 
+      line_buf *dst_line, ui32 dst_line_offset,
+      ui32 bit_depth, bool is_signed, ui32 width)
+    {
+      assert((src_line->flags & line_buf::LFT_32BIT) &&
+             (src_line->flags & line_buf::LFT_INTEGER) == 0 &&
+             (dst_line->flags & line_buf::LFT_32BIT) &&
+             (dst_line->flags & line_buf::LFT_INTEGER));
+      // float -> integer: scale by 2^bit_depth, round, clamp, then remap
+      const float* sp = src_line->f32;
+      si32* dp = dst_line->i32 + dst_line_offset;
+      if (bit_depth <= 30) 
+      {
+        // We are leaving two bit overhead -- here, we are assuming that after
+        // multiplications, the resulting number can still be represented
+        // using 32 bit integer
+        __m256 mul = _mm256_set1_ps((float)(1 << bit_depth));
+        __m256i upper_limit = _mm256_set1_epi32(INT_MAX >> (32 - bit_depth));
+        __m256i lower_limit = _mm256_set1_epi32(INT_MIN >> (32 - bit_depth));
+
+        if (is_signed)
+        {
+          __m256i zero = _mm256_setzero_si256();
+          __m256i bias = _mm256_set1_epi32(-((1 << (bit_depth - 1)) + 1));
+          // signed counter: an unsigned one wraps when width % 8 != 0
+          for (int i = (int)width; i > 0; i -= 8, sp += 8, dp += 8) {
+            __m256 t = _mm256_loadu_ps(sp);
+            t = _mm256_mul_ps(t, mul);
+            t = _mm256_round_ps(t, 
+              _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+            __m256i u = _mm256_cvtps_epi32(t);
+            u = _mm256_max_epi32(u, lower_limit);
+            u = _mm256_min_epi32(u, upper_limit);
+
+            __m256i c = _mm256_cmpgt_epi32(zero, u); //0xFFFFFFFF for -ve value
+            __m256i neg = _mm256_sub_epi32(bias, u); //-bias -value
+            neg = _mm256_and_si256(c, neg);          //keep only - bias - value
+            __m256i v = _mm256_andnot_si256(c, u);   //keep only +ve or 0
+            v = _mm256_or_si256(neg, v);             //combine
+            _mm256_storeu_si256((__m256i*)dp, v);
+          }
+        }
+        else
+        {
+          __m256i half = _mm256_set1_epi32(1 << (bit_depth - 1)); // +2^(B-1)
+          for (int i = (int)width; i > 0; i -= 8, sp += 8, dp += 8) {
+            __m256 t = _mm256_loadu_ps(sp);
+            t = _mm256_mul_ps(t, mul);
+            t = _mm256_round_ps(t, 
+              _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+            __m256i u = _mm256_cvtps_epi32(t);
+            u = _mm256_max_epi32(u, lower_limit);
+            u = _mm256_min_epi32(u, upper_limit);
+            u = _mm256_add_epi32(u, half);
+            _mm256_storeu_si256((__m256i*)dp, u);
+          }
+        }
+      }
+      else
+      {
+        // There is the possibility that converting to integer will
+        // exceed the dynamic range of 32bit integer; therefore, we need
+        // to use 64 bit.  One may think, why not limit the floats to the
+        // range of [-0.5f, 0.5f)? 
+        // Notice the half closed range -- we need a value just below 0.5f.
+        // While getting this number is possible, after multiplication, the
+        // resulting number will not be exactly the maximum that the integer 
+        // can achieve.  All this is academic, because here we are talking
+        // about a number which has all the exponent bits set, meaning 
+        // it is either infinity, -infinity, qNan or sNan.
+        float mul = (float)(1ull << bit_depth);
+        const si64 upper_limit = (si64)LLONG_MAX >> (64 - bit_depth);
+        const si64 lower_limit = (si64)LLONG_MIN >> (64 - bit_depth);
+
+        if (is_signed)
+        {
+          const si32 bias = (1 << (bit_depth - 1)) + 1;
+          for (ui32 i = width; i > 0; --i) {
+            si64 t = ojph_round64(*sp++ * mul);
+            t = ojph_max(t, lower_limit);
+            t = ojph_min(t, upper_limit);
+            si32 v = (si32)t;
+            v = (v >= 0) ? v : (- v - bias);
+            *dp++ = v;
+          }
+        }
+        else
+        {
+          const si32 half = (1 << (bit_depth - 1));
+          for (ui32 i = width; i > 0; --i) {
+            si64 t = ojph_round64(*sp++ * mul);
+            t = ojph_max(t, lower_limit);
+            t = ojph_min(t, upper_limit);
+            si32 v = (si32)t;
+            *dp++ = v + half;
+          }
+        }
+      }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void avx2_irv_convert_to_float_nlt_type3(const line_buf *src_line, 
+      ui32 src_line_offset, line_buf *dst_line, 
+      ui32 bit_depth, bool is_signed, ui32 width)
+    {
+      assert((src_line->flags & line_buf::LFT_32BIT) &&
+             (src_line->flags & line_buf::LFT_INTEGER) &&
+             (dst_line->flags & line_buf::LFT_32BIT) &&
+             (dst_line->flags & line_buf::LFT_INTEGER) == 0);
+      // samples are left-aligned to 32 bits; 2^-32 scales them to [-0.5, 0.5)
+      __m256 mul = _mm256_set1_ps((float)(1.0 / 65536.0 / 65536.0));
+      const si32* sp = src_line->i32 + src_line_offset;
+      float* dp = dst_line->f32;
+      si32 shift = 32 - (si32)bit_depth;
+      if (is_signed)
+      {
+        // bias is -(2^(bit_depth-1) + 1), left-aligned like the samples
+        __m256i zero = _mm256_setzero_si256();
+        __m256i bias = _mm256_set1_epi32(-(si32)(0x80000000u + (1u << shift)));
+        for (int i = (int)width; i > 0; i -= 8, sp += 8, dp += 8) {
+          __m256i t = _mm256_loadu_si256((__m256i*)sp);
+          __m256i u = _mm256_slli_epi32(t, shift);
+          __m256i c = _mm256_cmpgt_epi32(zero, u); // 0xFFFFFFFF for -ve value
+          __m256i neg = _mm256_sub_epi32(bias, u); // - bias - value
+          neg = _mm256_and_si256(c, neg);          // keep only - bias - value
+          t = _mm256_andnot_si256(c, u);           // keep only +ve or 0
+          u = _mm256_or_si256(neg, t);             // combine
+          __m256 v = _mm256_cvtepi32_ps(u);
+          v = _mm256_mul_ps(v, mul);
+          _mm256_storeu_ps(dp, v);        
+        }
+      }
+      else
+      {
+        __m256i msb = _mm256_set1_epi32(INT_MIN);
+        for (int i = (int)width; i > 0; i -= 8, sp += 8, dp += 8) {
+          __m256i t = _mm256_loadu_si256((__m256i*)sp);
+          __m256i u = _mm256_slli_epi32(t, shift);
+          u = _mm256_xor_si256(u, msb);   // unsigned -> signed: minus 2^31
+          __m256 v = _mm256_cvtepi32_ps(u);
+          v = _mm256_mul_ps(v, mul);      // equals x / 2^bit_depth - 0.5
+          _mm256_storeu_ps(dp, v);
+        }
+      }
+    }
+
     //////////////////////////////////////////////////////////////////////////
     void avx2_rct_forward(const line_buf *r, 
                           const line_buf *g, 
diff --git a/src/core/transform/ojph_colour_local.h b/src/core/transform/ojph_colour_local.h
index acd0f944..5f28685a 100644
--- a/src/core/transform/ojph_colour_local.h
+++ b/src/core/transform/ojph_colour_local.h
@@ -168,6 +168,11 @@ namespace ojph {
     void sse2_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul,
                                   ui32 width);
 
+    //////////////////////////////////////////////////////////////////////////
+    void sse2_irv_convert_to_integer_nlt_type3(
+      const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset,
+      ui32 bit_depth, bool is_signed, ui32 width);
+
     //////////////////////////////////////////////////////////////////////////
     //
     //
@@ -188,6 +193,11 @@ namespace ojph {
       line_buf *dst_line, const ui32 dst_line_offset, 
       si64 shift, ui32 width);
 
+    //////////////////////////////////////////////////////////////////////////
+    void sse2_irv_convert_to_float_nlt_type3(
+      const line_buf *src_line, ui32 src_line_offset,
+      line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width);
+
     //////////////////////////////////////////////////////////////////////////
     void sse2_rct_forward(
       const line_buf *r, const line_buf *g, const line_buf *b,
@@ -250,6 +260,16 @@ namespace ojph {
       line_buf *dst_line, const ui32 dst_line_offset, 
       si64 shift, ui32 width);
 
+    //////////////////////////////////////////////////////////////////////////
+    void avx2_irv_convert_to_integer_nlt_type3(
+      const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset,
+      ui32 bit_depth, bool is_signed, ui32 width);
+
+    //////////////////////////////////////////////////////////////////////////
+    void avx2_irv_convert_to_float_nlt_type3(
+      const line_buf *src_line, ui32 src_line_offset,
+      line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width);
+
     //////////////////////////////////////////////////////////////////////////
     void avx2_rct_forward(
       const line_buf *r, const line_buf *g, const line_buf *b,
@@ -296,6 +316,16 @@ namespace ojph {
       line_buf *dst_line, const ui32 dst_line_offset, 
       si64 shift, ui32 width);
 
+    //////////////////////////////////////////////////////////////////////////
+    void wasm_irv_convert_to_integer_nlt_type3(
+      const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset,
+      ui32 bit_depth, bool is_signed, ui32 width);
+
+    //////////////////////////////////////////////////////////////////////////
+    void wasm_irv_convert_to_float_nlt_type3(
+      const line_buf *src_line, ui32 src_line_offset,
+      line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width);
+
     //////////////////////////////////////////////////////////////////////////
     void wasm_rct_forward(
       const line_buf *r, const line_buf *g, const line_buf *b,
diff --git a/src/core/transform/ojph_colour_sse2.cpp b/src/core/transform/ojph_colour_sse2.cpp
index 37fa1c8a..3c467bd1 100644
--- a/src/core/transform/ojph_colour_sse2.cpp
+++ b/src/core/transform/ojph_colour_sse2.cpp
@@ -48,6 +48,159 @@
 namespace ojph {
   namespace local {
 
+    //////////////////////////////////////////////////////////////////////////
+    void sse2_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul,
+                                       ui32 width)
+    {
+      uint32_t rounding_mode = _MM_GET_ROUNDING_MODE(); // restored on exit
+      _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); // used by _mm_cvtps_epi32
+      __m128 shift = _mm_set1_ps(0.5f);
+      __m128 m = _mm_set1_ps(mul);
+      for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
+      { // 4 samples per pass; the last pass may run past width
+        __m128 t = _mm_loadu_ps(sp);
+        __m128 s = _mm_add_ps(t, shift);
+        s = _mm_mul_ps(s, m); // dp = round((sp + 0.5) * mul)
+        _mm_storeu_si128((__m128i*)dp, _mm_cvtps_epi32(s));
+      }
+      _MM_SET_ROUNDING_MODE(rounding_mode);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void sse2_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul,
+                                  ui32 width)
+    {
+      uint32_t rounding_mode = _MM_GET_ROUNDING_MODE(); // restored on exit
+      _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); // used by _mm_cvtps_epi32
+      __m128 m = _mm_set1_ps(mul);
+      for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
+      { // 4 samples per pass; the last pass may run past width
+        __m128 t = _mm_loadu_ps(sp);
+        __m128 s = _mm_mul_ps(t, m); // dp = round(sp * mul)
+        _mm_storeu_si128((__m128i*)dp, _mm_cvtps_epi32(s));
+      }
+      _MM_SET_ROUNDING_MODE(rounding_mode);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    // SSE2 stand-in for _mm_max_epi32, which itself requires SSE4.1
+    static inline __m128i ojph_mm_max_epi32(__m128i a, __m128i b)
+    {
+      __m128i c = _mm_cmpgt_epi32(a, b);  // 0xFFFFFFFF for a > b
+      __m128i d = _mm_and_si128(c, a);    // keep only a, where a > b
+      __m128i e = _mm_andnot_si128(c, b); // keep only b, where a <= b
+      return _mm_or_si128(d, e);          // combine
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    // SSE2 stand-in for _mm_min_epi32, which itself requires SSE4.1
+    static inline __m128i ojph_mm_min_epi32(__m128i a, __m128i b)
+    {
+      __m128i c = _mm_cmplt_epi32(a, b);  // 0xFFFFFFFF for a < b
+      __m128i d = _mm_and_si128(c, a);    // keep only a, where a < b
+      __m128i e = _mm_andnot_si128(c, b); // keep only b, where a >= b
+      return _mm_or_si128(d, e);          // combine
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void sse2_irv_convert_to_integer_nlt_type3(const line_buf *src_line, 
+      line_buf *dst_line, ui32 dst_line_offset,
+      ui32 bit_depth, bool is_signed, ui32 width)
+    {
+      assert((src_line->flags & line_buf::LFT_32BIT) &&
+             (src_line->flags & line_buf::LFT_INTEGER) == 0 &&
+             (dst_line->flags & line_buf::LFT_32BIT) &&
+             (dst_line->flags & line_buf::LFT_INTEGER));
+      // float -> integer: scale by 2^bit_depth, round, clamp, then remap
+      const float* sp = src_line->f32;
+      si32* dp = dst_line->i32 + dst_line_offset;
+      if (bit_depth <= 30) 
+      {
+        // We are leaving two bit overhead -- here, we are assuming that after
+        // multiplications, the resulting number can still be represented
+        // using 32 bit integer
+        __m128 mul = _mm_set1_ps((float)(1 << bit_depth));
+        __m128i upper_limit = _mm_set1_epi32(INT_MAX >> (32 - bit_depth));
+        __m128i lower_limit = _mm_set1_epi32(INT_MIN >> (32 - bit_depth));
+        // _mm_cvtps_epi32 rounds per MXCSR; _mm_round_ps would need SSE4.1
+        uint32_t rounding_mode = _MM_GET_ROUNDING_MODE();
+        _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
+        if (is_signed)
+        {
+          __m128i zero = _mm_setzero_si128();
+          __m128i bias = _mm_set1_epi32(-((1 << (bit_depth - 1)) + 1));
+          for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) 
+          { // signed counter: an unsigned one wraps when width % 4 != 0
+            __m128 t = _mm_loadu_ps(sp);
+            t = _mm_mul_ps(t, mul);
+            __m128i u = _mm_cvtps_epi32(t);
+            u = ojph_mm_max_epi32(u, lower_limit);
+            u = ojph_mm_min_epi32(u, upper_limit);
+            __m128i c = _mm_cmpgt_epi32(zero, u); //0xFFFFFFFF for -ve value
+            __m128i neg = _mm_sub_epi32(bias, u); //-bias -value
+            neg = _mm_and_si128(c, neg);          //keep only - bias - value
+            __m128i v = _mm_andnot_si128(c, u);   //keep only +ve or 0
+            v = _mm_or_si128(neg, v);             //combine
+            _mm_storeu_si128((__m128i*)dp, v);
+          }
+        }
+        else
+        {
+          __m128i half = _mm_set1_epi32(1 << (bit_depth - 1)); // +2^(B-1)
+          for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
+            __m128 t = _mm_loadu_ps(sp);
+            t = _mm_mul_ps(t, mul);
+            __m128i u = _mm_cvtps_epi32(t);
+            u = ojph_mm_max_epi32(u, lower_limit);
+            u = ojph_mm_min_epi32(u, upper_limit);
+            u = _mm_add_epi32(u, half);
+            _mm_storeu_si128((__m128i*)dp, u);
+          }
+        }
+        _MM_SET_ROUNDING_MODE(rounding_mode);
+      }
+      else
+      {
+        // There is the possibility that converting to integer will
+        // exceed the dynamic range of 32bit integer; therefore, we need
+        // to use 64 bit.  One may think, why not limit the floats to the
+        // range of [-0.5f, 0.5f)? 
+        // Notice the half closed range -- we need a value just below 0.5f.
+        // While getting this number is possible, after multiplication, the
+        // resulting number will not be exactly the maximum that the integer 
+        // can achieve.  All this is academic, because here we are talking
+        // about a number which has all the exponent bits set, meaning 
+        // it is either infinity, -infinity, qNan or sNan.
+        float mul = (float)(1ull << bit_depth);
+        const si64 upper_limit = (si64)LLONG_MAX >> (64 - bit_depth);
+        const si64 lower_limit = (si64)LLONG_MIN >> (64 - bit_depth);
+
+        if (is_signed)
+        {
+          const si32 bias = (1 << (bit_depth - 1)) + 1;
+          for (ui32 i = width; i > 0; --i) {
+            si64 t = ojph_round64(*sp++ * mul);
+            t = ojph_max(t, lower_limit);
+            t = ojph_min(t, upper_limit);
+            si32 v = (si32)t;
+            v = (v >= 0) ? v : (- v - bias);
+            *dp++ = v;
+          }
+        }
+        else
+        {
+          const si32 half = (1 << (bit_depth - 1));
+          for (ui32 i = width; i > 0; --i) {
+            si64 t = ojph_round64(*sp++ * mul);
+            t = ojph_max(t, lower_limit);
+            t = ojph_min(t, upper_limit);
+            si32 v = (si32)t;
+            *dp++ = v + half;
+          }
+        }
+      }
+    }
+
     /////////////////////////////////////////////////////////////////////////
     // https://github.com/seung-lab/dijkstra3d/blob/master/libdivide.h
     static inline __m128i sse2_mm_srai_epi64(__m128i a, int amt, __m128i m) 
@@ -250,37 +403,49 @@ namespace ojph {
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void sse2_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul,
-                                       ui32 width)
+    void sse2_irv_convert_to_float_nlt_type3(const line_buf *src_line, 
+      ui32 src_line_offset, line_buf *dst_line, 
+      ui32 bit_depth, bool is_signed, ui32 width)
     {
-      uint32_t rounding_mode = _MM_GET_ROUNDING_MODE();
-      _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
-      __m128 shift = _mm_set1_ps(0.5f);
-      __m128 m = _mm_set1_ps(mul);
-      for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
+      assert((src_line->flags & line_buf::LFT_32BIT) &&
+             (src_line->flags & line_buf::LFT_INTEGER) &&
+             (dst_line->flags & line_buf::LFT_32BIT) &&
+             (dst_line->flags & line_buf::LFT_INTEGER) == 0);
+      // Each sample is left-aligned in 32 bits, then scaled by 2^-32 to float
+      __m128 mul = _mm_set1_ps((float)(1.0 / 65536.0 / 65536.0)); // 2^-32
+      // 4 samples per pass; the last pass may touch up to 3 lanes past width
+      const si32* sp = src_line->i32 + src_line_offset;
+      float* dp = dst_line->f32;
+      si32 shift = 32 - (si32)bit_depth; // moves the sample's MSB to bit 31
+      if (is_signed)
       {
-        __m128 t = _mm_loadu_ps(sp);
-        __m128 s = _mm_add_ps(t, shift);
-        s = _mm_mul_ps(s, m);
-        _mm_storeu_si128((__m128i*)dp, _mm_cvtps_epi32(s));
+        __m128i zero = _mm_setzero_si128();
+        __m128i bias = _mm_set1_epi32(-(si32)((ui32)INT_MIN + 1));
+        // signed counter: an unsigned one wraps when width % 4 != 0
+        for (si32 i = (si32)width; i > 0; i -= 4, sp += 4, dp += 4) {
+          __m128i u = _mm_slli_epi32(_mm_loadu_si128((__m128i*)sp), shift);
+          __m128i c = _mm_cmplt_epi32(u, zero); // 0xFFFFFFFF for -ve value
+          __m128i neg = _mm_sub_epi32(bias, u); // - bias - value
+          neg = _mm_and_si128(c, neg);          // keep only - bias - value
+          __m128i t = _mm_andnot_si128(c, u);   // keep only +ve or 0
+          u = _mm_or_si128(neg, t);             // combine
+          __m128 v = _mm_cvtepi32_ps(u);
+          v = _mm_mul_ps(v, mul);
+          _mm_storeu_ps(dp, v);
+        }
       }
-      _MM_SET_ROUNDING_MODE(rounding_mode);
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    void sse2_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul,
-                                 ui32 width)
-    {
-      uint32_t rounding_mode = _MM_GET_ROUNDING_MODE();
-      _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
-      __m128 m = _mm_set1_ps(mul);
-      for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
+      else
       {
-        __m128 t = _mm_loadu_ps(sp);
-        __m128 s = _mm_mul_ps(t, m);
-        _mm_storeu_si128((__m128i*)dp, _mm_cvtps_epi32(s));
+        __m128 half = _mm_set1_ps(0.5f);
+        // signed counter: an unsigned one wraps when width % 4 != 0
+        for (si32 i = (si32)width; i > 0; i -= 4, sp += 4, dp += 4) {
+          __m128i u = _mm_slli_epi32(_mm_loadu_si128((__m128i*)sp), shift);
+          __m128 v = _mm_cvtepi32_ps(u);
+          v = _mm_mul_ps(v, mul);
+          v = _mm_add_ps(v, half);
+          _mm_storeu_ps(dp, v);
+        }
       }
-      _MM_SET_ROUNDING_MODE(rounding_mode);
     }
 
     //////////////////////////////////////////////////////////////////////////
diff --git a/src/core/transform/ojph_colour_wasm.cpp b/src/core/transform/ojph_colour_wasm.cpp
index 5bf6ccdd..e0a88e8e 100644
--- a/src/core/transform/ojph_colour_wasm.cpp
+++ b/src/core/transform/ojph_colour_wasm.cpp
@@ -260,6 +260,151 @@ namespace ojph {
       }
     }
 
+    //////////////////////////////////////////////////////////////////////////
+    void wasm_irv_convert_to_integer_nlt_type3(const line_buf *src_line, 
+      line_buf *dst_line, ui32 dst_line_offset,
+      ui32 bit_depth, bool is_signed, ui32 width)
+    {
+      assert((src_line->flags & line_buf::LFT_32BIT) &&
+             (src_line->flags & line_buf::LFT_INTEGER) == 0 &&
+             (dst_line->flags & line_buf::LFT_32BIT) &&
+             (dst_line->flags & line_buf::LFT_INTEGER));
+      
+      // const float* sp = src_line->f32;
+      // si32* dp = dst_line->i32 + dst_line_offset;
+      // if (bit_depth <= 30) 
+      // {
+      //   // We are leaving two bit overhead -- here, we are assuming that after
+      //   // multiplications, the resulting number can still be represented
+      //   // using 32 bit integer
+      //   __m128 mul = _mm_set1_ps((float)(1 << bit_depth));
+      //   __m128i upper_limit = _mm_set1_epi32(INT_MAX >> (32 - bit_depth));
+      //   __m128i lower_limit = _mm_set1_epi32(INT_MIN >> (32 - bit_depth));
+        
+      //   if (is_signed)
+      //   {
+      //     __m128i zero = _mm_setzero_si128();
+      //     __m128i bias = _mm_set1_epi32(-((1 << (bit_depth - 1)) + 1));          
+      //     for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) 
+      //     {
+      //       __m128 t = _mm_loadu_ps(sp);
+      //       t = _mm_mul_ps(t, mul);
+      //       t = _mm_round_ps(t, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+      //       __m128i u = _mm_cvtps_epi32(t);
+      //       u = ojph_mm_max_epi32(u, lower_limit);
+      //       u = ojph_mm_min_epi32(u, upper_limit);
+
+      //       __m128i c = _mm_cmpgt_epi32(zero, u); //0xFFFFFFFF for -ve value
+      //       __m128i neg = _mm_sub_epi32(bias, u); //-bias -value
+      //       neg = _mm_and_si128(c, neg);          //keep only - bias - value
+      //       __m128i v = _mm_andnot_si128(c, u);   //keep only +ve or 0
+      //       v = _mm_or_si128(neg, v);             //combine
+      //       _mm_storeu_si128((__m128i*)dp, v);
+      //     }
+      //   }
+      //   else
+      //   {
+      //     __m128i half = _mm_set1_epi32(-(1 << (bit_depth - 1)));
+      //     for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) {
+      //       __m128 t = _mm_loadu_ps(sp);
+      //       t = _mm_mul_ps(t, mul);
+      //       t = _mm_round_ps(t, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+      //       __m128i u = _mm_cvtps_epi32(t);
+      //       u = ojph_mm_max_epi32(u, lower_limit);
+      //       u = ojph_mm_min_epi32(u, upper_limit);
+      //       u = _mm_add_epi32(u, half);
+      //       _mm_storeu_si128((__m128i*)dp, u);
+      //     }
+      //   }
+      // }
+      // else
+      // {
+      //   // There is the possibility that converting to integer will
+      //   // exceed the dynamic range of 32bit integer; therefore, we need
+      //   // to use 64 bit.  One may think, why not limit the floats to the
+      //   // range of [-0.5f, 0.5f)? 
+      //   // Notice the half closed range -- we need a value just below 0.5f.
+      //   // While getting this number is possible, after multiplication, the
+      //   // resulting number will not be exactly the maximum that the integer 
+      //   // can achieve.  All this is academic, because here are talking
+      //   // about a number which has all the exponent bits set, meaning 
+      //   // it is either infinity, -infinity, qNan or sNan.
+      //   float mul = (float)(1ull << bit_depth);
+      //   const si64 upper_limit = (si64)LLONG_MAX >> (64 - bit_depth);
+      //   const si64 lower_limit = (si64)LLONG_MIN >> (64 - bit_depth);
+
+      //   if (is_signed)
+      //   {
+      //     const si32 bias = (1 << (bit_depth - 1)) + 1;
+      //     for (ui32 i = width; i > 0; --i) {
+      //       si64 t = ojph_round64(*sp++ * mul);
+      //       t = ojph_max(t, lower_limit);
+      //       t = ojph_min(t, upper_limit);
+      //       si32 v = (si32)t;
+      //       v = (v >= 0) ? v : (- v - bias);
+      //       *dp++ = v;
+      //     }
+      //   }
+      //   else
+      //   {
+      //     const si32 half = (1 << (bit_depth - 1));
+      //     for (ui32 i = width; i > 0; --i) {
+      //       si64 t = ojph_round64(*sp++ * mul);
+      //       t = ojph_max(t, lower_limit);
+      //       t = ojph_min(t, upper_limit);
+      //       si32 v = (si32)t;
+      //       *dp++ = v + half;
+      //     }
+      //   }
+      // }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void wasm_irv_convert_to_float_nlt_type3(const line_buf *src_line, 
+      ui32 src_line_offset, line_buf *dst_line, 
+      ui32 bit_depth, bool is_signed, ui32 width)
+    {
+      assert((src_line->flags & line_buf::LFT_32BIT) &&
+             (src_line->flags & line_buf::LFT_INTEGER) &&
+             (dst_line->flags & line_buf::LFT_32BIT) &&
+             (dst_line->flags & line_buf::LFT_INTEGER) == 0);
+
+      // __m128 mul = _mm_set1_ps((float)(1.0 / 65536.0 / 65536.0));
+
+      // const si32* sp = src_line->i32 + src_line_offset;
+      // float* dp = dst_line->f32;
+      // si32 shift = 32 - (si32)bit_depth;
+      // if (is_signed)
+      // {
+      //   __m128i zero = _mm_setzero_si128();
+      //   __m128i bias = _mm_set1_epi32(-(si32)((ui32)INT_MIN + 1));
+      //   for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) {
+      //     __m128i t = _mm_loadu_si128((__m128i*)sp);
+      //     __m128i u = _mm_slli_epi32(t, shift);
+      //     __m128i c = _mm_cmplt_epi32(u, zero); // 0xFFFFFFFF for -ve value
+      //     __m128i neg = _mm_sub_epi32(bias, u); // - bias - value
+      //     neg = _mm_and_si128(c, neg);          // keep only - bias - value
+      //     t = _mm_andnot_si128(c, u);           // keep only +ve or 0
+      //     u = _mm_or_si128(neg, t);             // combine
+      //     __m128 v = _mm_cvtepi32_ps(u);
+      //     v = _mm_mul_ps(v, mul);
+      //     _mm_storeu_ps(dp, v);        
+      //   }
+      // }
+      // else
+      // {
+      //   __m128 half = _mm_set1_ps(0.5f);
+      //   for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) {
+      //     __m128i t = _mm_loadu_si128((__m128i*)sp);
+      //     __m128i u = _mm_slli_epi32(t, shift);
+      //     __m128 v = _mm_cvtepi32_ps(u);
+      //     v = _mm_mul_ps(v, mul);
+      //     v = _mm_add_ps(v, half);
+      //     _mm_storeu_ps(dp, v);
+      //   }
+      // }
+    }
+
     //////////////////////////////////////////////////////////////////////////
     void wasm_rct_forward(const line_buf *r, 
                           const line_buf *g, 
@@ -267,12 +412,12 @@ namespace ojph {
                           line_buf *y, line_buf *cb, line_buf *cr, 
                           ui32 repeat)
     {
-      assert((y->flags  & line_buf::LFT_REVERSIBLE) &&
-             (cb->flags & line_buf::LFT_REVERSIBLE) && 
-             (cr->flags & line_buf::LFT_REVERSIBLE) &&
-             (r->flags  & line_buf::LFT_REVERSIBLE) &&
-             (g->flags  & line_buf::LFT_REVERSIBLE) && 
-             (b->flags  & line_buf::LFT_REVERSIBLE));
+      assert((y->flags  & line_buf::LFT_INTEGER) &&
+             (cb->flags & line_buf::LFT_INTEGER) && 
+             (cr->flags & line_buf::LFT_INTEGER) &&
+             (r->flags  & line_buf::LFT_INTEGER) &&
+             (g->flags  & line_buf::LFT_INTEGER) && 
+             (b->flags  & line_buf::LFT_INTEGER));
       
       if  (y->flags & line_buf::LFT_32BIT)
       {
@@ -357,12 +502,12 @@ namespace ojph {
                            line_buf *r, line_buf *g, line_buf *b, 
                            ui32 repeat)
     {
-      assert((y->flags  & line_buf::LFT_REVERSIBLE) &&
-             (cb->flags & line_buf::LFT_REVERSIBLE) && 
-             (cr->flags & line_buf::LFT_REVERSIBLE) &&
-             (r->flags  & line_buf::LFT_REVERSIBLE) &&
-             (g->flags  & line_buf::LFT_REVERSIBLE) && 
-             (b->flags  & line_buf::LFT_REVERSIBLE));
+      assert((y->flags  & line_buf::LFT_INTEGER) &&
+             (cb->flags & line_buf::LFT_INTEGER) && 
+             (cr->flags & line_buf::LFT_INTEGER) &&
+             (r->flags  & line_buf::LFT_INTEGER) &&
+             (g->flags  & line_buf::LFT_INTEGER) && 
+             (b->flags  & line_buf::LFT_INTEGER));
 
       if (y->flags & line_buf::LFT_32BIT)
       {

From 66286f307c90e228c3f691b355dd6362145a3d26 Mon Sep 17 00:00:00 2001
From: Aous Naman <aous@unsw.edu.au>
Date: Wed, 22 Jan 2025 21:10:10 +1100
Subject: [PATCH 06/20] Fixes compilation Error.

---
 src/core/transform/ojph_colour_sse2.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/core/transform/ojph_colour_sse2.cpp b/src/core/transform/ojph_colour_sse2.cpp
index 3c467bd1..6f974e5d 100644
--- a/src/core/transform/ojph_colour_sse2.cpp
+++ b/src/core/transform/ojph_colour_sse2.cpp
@@ -112,6 +112,9 @@ namespace ojph {
              (dst_line->flags & line_buf::LFT_32BIT) &&
              (dst_line->flags & line_buf::LFT_INTEGER));
       
+      uint32_t rounding_mode = _MM_GET_ROUNDING_MODE();
+      _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
+
       const float* sp = src_line->f32;
       si32* dp = dst_line->i32 + dst_line_offset;
       if (bit_depth <= 30) 
@@ -131,7 +134,6 @@ namespace ojph {
           {
             __m128 t = _mm_loadu_ps(sp);
             t = _mm_mul_ps(t, mul);
-            t = _mm_round_ps(t, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
             __m128i u = _mm_cvtps_epi32(t);
             u = ojph_mm_max_epi32(u, lower_limit);
             u = ojph_mm_min_epi32(u, upper_limit);
@@ -150,7 +152,6 @@ namespace ojph {
           for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) {
             __m128 t = _mm_loadu_ps(sp);
             t = _mm_mul_ps(t, mul);
-            t = _mm_round_ps(t, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
             __m128i u = _mm_cvtps_epi32(t);
             u = ojph_mm_max_epi32(u, lower_limit);
             u = ojph_mm_min_epi32(u, upper_limit);
@@ -199,6 +200,8 @@ namespace ojph {
           }
         }
       }
+
+      _MM_SET_ROUNDING_MODE(rounding_mode);
     }
 
     /////////////////////////////////////////////////////////////////////////

From 293eacd911a7a68bf3d5c7765bf32b27ec429760 Mon Sep 17 00:00:00 2001
From: Aous Naman <aous@unsw.edu.au>
Date: Thu, 23 Jan 2025 08:45:35 +1100
Subject: [PATCH 07/20] Added missing path in AVX2. Not tested.

---
 src/core/transform/ojph_colour_avx2.cpp | 73 +++++++++++++++++------
 src/core/transform/ojph_colour_sse2.cpp | 77 ++++++++++++++++++-------
 2 files changed, 111 insertions(+), 39 deletions(-)

diff --git a/src/core/transform/ojph_colour_avx2.cpp b/src/core/transform/ojph_colour_avx2.cpp
index 80ef38b4..1c9da81e 100644
--- a/src/core/transform/ojph_colour_avx2.cpp
+++ b/src/core/transform/ojph_colour_avx2.cpp
@@ -236,6 +236,32 @@ namespace ojph {
       }
     }
 
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    __m256i ojph_mm256_max_ge_epi32(__m256i a, __m256i b, __m256 x, __m256 y)
+    {
+      // Per 32-bit lane: returns a where x >= y, and b otherwise.  The ordered
+      // predicate is false for NaN, so a NaN in x or y selects b
+      __m256 ct = _mm256_cmp_ps(x, y, _CMP_GE_OQ); // 0xFFFFFFFF for x >= y
+      __m256i c = _mm256_castps_si256(ct);   // does not generate any code
+      __m256i d = _mm256_and_si256(c, a);    // keep only a, where x >= y
+      __m256i e = _mm256_andnot_si256(c, b); // keep only b, where !(x >= y)
+      return _mm256_or_si256(d, e);          // combine
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    __m256i ojph_mm256_min_lt_epi32(__m256i a, __m256i b, __m256 x, __m256 y)
+    {
+      // Per 32-bit lane: returns a where x < y, and b otherwise.  The ordered
+      // predicate is false for NaN, so a NaN in x or y selects b
+      __m256 ct = _mm256_cmp_ps(x, y, _CMP_LT_OQ); // 0xFFFFFFFF for x < y
+      __m256i c = _mm256_castps_si256(ct);   // does not generate any code
+      __m256i d = _mm256_and_si256(c, a);    // keep only a, where x <  y
+      __m256i e = _mm256_andnot_si256(c, b); // keep only b, where !(x < y)
+      return _mm256_or_si256(d, e);          // combine
+    }
+
     //////////////////////////////////////////////////////////////////////////
     void avx2_irv_convert_to_integer_nlt_type3(const line_buf *src_line, 
       line_buf *dst_line, ui32 dst_line_offset,
@@ -307,31 +333,42 @@ namespace ojph {
         // can achieve.  All this is academic, because here are talking
         // about a number which has all the exponent bits set, meaning 
         // it is either infinity, -infinity, qNan or sNan.
-        float mul = (float)(1ull << bit_depth);
-        const si64 upper_limit = (si64)LLONG_MAX >> (64 - bit_depth);
-        const si64 lower_limit = (si64)LLONG_MIN >> (64 - bit_depth);
+        si64 neg_limit = (si64)LLONG_MIN >> (64 - bit_depth);
+        __m256 mul = _mm256_set1_ps((float)(1ull << bit_depth)); // no int UB
+        __m256 fl_up_lim = _mm256_set1_ps(-(float)neg_limit);  // val < upper
+        __m256 fl_low_lim = _mm256_set1_ps((float)neg_limit);  // val >= lower
+        __m256i s32_up_lim = _mm256_set1_epi32(INT_MAX >> (32 - bit_depth));
+        __m256i s32_low_lim = _mm256_set1_epi32(INT_MIN >> (32 - bit_depth));
 
         if (is_signed)
         {
-          const si32 bias = (1 << (bit_depth - 1)) + 1;
-          for (ui32 i = width; i > 0; --i) {
-            si64 t = ojph_round64(*sp++ * mul);
-            t = ojph_max(t, lower_limit);
-            t = ojph_min(t, upper_limit);
-            si32 v = (si32)t;
-            v = (v >= 0) ? v : (- v - bias);
-            *dp++ = v;
+          __m256i zero = _mm256_setzero_si256();
+          si32 bias_val = (si32)((1ULL << (bit_depth - 1)) + 1); // no int UB
+          __m256i bias = _mm256_set1_epi32(-bias_val);
+          for (si32 i = (si32)width; i > 0; i -= 8, sp += 8, dp += 8) {
+            __m256 t = _mm256_mul_ps(_mm256_loadu_ps(sp), mul); // 8 lanes
+            __m256i u = _mm256_cvtps_epi32(t);
+            u = ojph_mm256_max_ge_epi32(u, s32_low_lim, t, fl_low_lim);
+            u = ojph_mm256_min_lt_epi32(u,  s32_up_lim, t,  fl_up_lim);
+            __m256i c = _mm256_cmpgt_epi32(zero, u); //0xFFFFFFFF for -ve value
+            __m256i neg = _mm256_sub_epi32(bias, u); //-bias -value
+            neg = _mm256_and_si256(c, neg);          //keep only - bias - value
+            __m256i v = _mm256_andnot_si256(c, u);   //keep only +ve or 0
+            v = _mm256_or_si256(neg, v);             //combine
+            _mm256_storeu_si256((__m256i*)dp, v);
           }
         }
         else
         {
-          const si32 half = (1 << (bit_depth - 1));
-          for (ui32 i = width; i > 0; --i) {
-            si64 t = ojph_round64(*sp++ * mul);
-            t = ojph_max(t, lower_limit);
-            t = ojph_min(t, upper_limit);
-            si32 v = (si32)t;
-            *dp++ = v + half;
+          // unsigned output: shift [-2^(depth-1), 2^(depth-1)) up to start at 0
+          __m256i half = _mm256_set1_epi32((si32)(1ULL << (bit_depth - 1)));
+          for (si32 i = (si32)width; i > 0; i -= 8, sp += 8, dp += 8) {
+            __m256 t = _mm256_mul_ps(_mm256_loadu_ps(sp), mul); // 8 lanes
+            __m256i u = _mm256_cvtps_epi32(t);
+            u = ojph_mm256_max_ge_epi32(u, s32_low_lim, t, fl_low_lim);
+            u = ojph_mm256_min_lt_epi32(u,  s32_up_lim, t,  fl_up_lim);
+            u = _mm256_add_epi32(u, half);
+            _mm256_storeu_si256((__m256i*)dp, u);
           }
         }
       }
diff --git a/src/core/transform/ojph_colour_sse2.cpp b/src/core/transform/ojph_colour_sse2.cpp
index 6f974e5d..e05ceb5e 100644
--- a/src/core/transform/ojph_colour_sse2.cpp
+++ b/src/core/transform/ojph_colour_sse2.cpp
@@ -83,7 +83,8 @@ namespace ojph {
     }
 
     //////////////////////////////////////////////////////////////////////////
-    // This requires SSE4.1
+    // _mm_max_epi32 requires SSE4.1, so here we implement it in SSE2
+    static inline
     __m128i ojph_mm_max_epi32(__m128i a, __m128i b)
     {
       __m128i c = _mm_cmpgt_epi32(a, b);  // 0xFFFFFFFF for a > b
@@ -93,7 +94,8 @@ namespace ojph {
     }
 
     //////////////////////////////////////////////////////////////////////////
-    // This requires SSE4.1
+    // _mm_min_epi32 requires SSE4.1, so here we implement it in SSE2
+    static inline
     __m128i ojph_mm_min_epi32 (__m128i a, __m128i b)
     {
       __m128i c = _mm_cmplt_epi32(a, b);  // 0xFFFFFFFF for a < b
@@ -102,6 +104,28 @@ namespace ojph {
       return _mm_or_si128(d, e);          // combine
     }
 
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    __m128i ojph_mm_max_ge_epi32(__m128i a, __m128i b, __m128 x, __m128 y)
+    {
+      // per-lane select: the lane of a where x >= y, the lane of b otherwise
+      __m128i mask = _mm_castps_si128(_mm_cmpge_ps(x, y)); // all ones: x >= y
+      __m128i from_a = _mm_and_si128(mask, a);
+      __m128i from_b = _mm_andnot_si128(mask, b);          // (~mask) & b
+      return _mm_or_si128(from_a, from_b);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    __m128i ojph_mm_min_lt_epi32(__m128i a, __m128i b, __m128 x, __m128 y)
+    {
+      // per-lane select: the lane of a where x < y, the lane of b otherwise
+      __m128i mask = _mm_castps_si128(_mm_cmplt_ps(x, y)); // all ones: x < y
+      __m128i from_a = _mm_and_si128(mask, a);
+      __m128i from_b = _mm_andnot_si128(mask, b);          // (~mask) & b
+      return _mm_or_si128(from_a, from_b);
+    }
+
     //////////////////////////////////////////////////////////////////////////
     void sse2_irv_convert_to_integer_nlt_type3(const line_buf *src_line, 
       line_buf *dst_line, ui32 dst_line_offset,
@@ -129,7 +153,7 @@ namespace ojph {
         if (is_signed)
         {
           __m128i zero = _mm_setzero_si128();
-          __m128i bias = _mm_set1_epi32(-((1 << (bit_depth - 1)) + 1));          
+          __m128i bias = _mm_set1_epi32(-((1 << (bit_depth - 1)) + 1));
           for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) 
           {
             __m128 t = _mm_loadu_ps(sp);
@@ -172,31 +196,42 @@ namespace ojph {
         // can achieve.  All this is academic, because here are talking
         // about a number which has all the exponent bits set, meaning 
         // it is either infinity, -infinity, qNan or sNan.
-        float mul = (float)(1ull << bit_depth);
-        const si64 upper_limit = (si64)LLONG_MAX >> (64 - bit_depth);
-        const si64 lower_limit = (si64)LLONG_MIN >> (64 - bit_depth);
+        si64 neg_limit = (si64)LLONG_MIN >> (64 - bit_depth);
+        __m128 mul = _mm_set1_ps((float)(1ull << bit_depth)); // no int UB
+        __m128 fl_upper_limit = _mm_set1_ps(-(float)neg_limit); // val < upper
+        __m128 fl_lower_limit = _mm_set1_ps( (float)neg_limit); // val >= lower
+        __m128i s32_upper_limit = _mm_set1_epi32(INT_MAX >> (32 - bit_depth));
+        __m128i s32_lower_limit = _mm_set1_epi32(INT_MIN >> (32 - bit_depth));
 
         if (is_signed)
         {
-          const si32 bias = (1 << (bit_depth - 1)) + 1;
-          for (ui32 i = width; i > 0; --i) {
-            si64 t = ojph_round64(*sp++ * mul);
-            t = ojph_max(t, lower_limit);
-            t = ojph_min(t, upper_limit);
-            si32 v = (si32)t;
-            v = (v >= 0) ? v : (- v - bias);
-            *dp++ = v;
+          __m128i zero = _mm_setzero_si128();
+          si32 bias_val = (si32)((1ULL << (bit_depth - 1)) + 1); // no int UB
+          __m128i bias = _mm_set1_epi32(-bias_val);
+          for (si32 i = (si32)width; i > 0; i -= 4, sp += 4, dp += 4) {
+            __m128 t = _mm_mul_ps(_mm_loadu_ps(sp), mul);
+            __m128i u = _mm_cvtps_epi32(t);
+            u = ojph_mm_max_ge_epi32(u, s32_lower_limit, t, fl_lower_limit);
+            u = ojph_mm_min_lt_epi32(u, s32_upper_limit, t, fl_upper_limit);
+            __m128i c = _mm_cmpgt_epi32(zero, u); //0xFFFFFFFF for -ve value
+            __m128i neg = _mm_sub_epi32(bias, u); //-bias -value
+            neg = _mm_and_si128(c, neg);          //keep only - bias - value
+            __m128i v = _mm_andnot_si128(c, u);   //keep only +ve or 0
+            v = _mm_or_si128(neg, v);             //combine
+            _mm_storeu_si128((__m128i*)dp, v);
           }
         }
         else
         {
-          const si32 half = (1 << (bit_depth - 1));
-          for (ui32 i = width; i > 0; --i) {
-            si64 t = ojph_round64(*sp++ * mul);
-            t = ojph_max(t, lower_limit);
-            t = ojph_min(t, upper_limit);
-            si32 v = (si32)t;
-            *dp++ = v + half;
+          // unsigned output: shift [-2^(depth-1), 2^(depth-1)) up to start at 0
+          __m128i half = _mm_set1_epi32((si32)(1ULL << (bit_depth - 1)));
+          for (si32 i = (si32)width; i > 0; i -= 4, sp += 4, dp += 4) {
+            __m128 t = _mm_mul_ps(_mm_loadu_ps(sp), mul);
+            __m128i u = _mm_cvtps_epi32(t);
+            u = ojph_mm_max_ge_epi32(u, s32_lower_limit, t, fl_lower_limit);
+            u = ojph_mm_min_lt_epi32(u, s32_upper_limit, t, fl_upper_limit);
+            u = _mm_add_epi32(u, half);
+            _mm_storeu_si128((__m128i*)dp, u);
           }
         }
       }

From 78bade8caf4817f303ad2cf457a9e6a5b369b604 Mon Sep 17 00:00:00 2001
From: Aous Naman <aous72@yahoo.com>
Date: Thu, 23 Jan 2025 12:05:25 +1100
Subject: [PATCH 08/20] WASM code written.  Must test all SIMD now.

---
 src/core/transform/ojph_colour_wasm.cpp | 384 +++++++++++++-----------
 1 file changed, 208 insertions(+), 176 deletions(-)

diff --git a/src/core/transform/ojph_colour_wasm.cpp b/src/core/transform/ojph_colour_wasm.cpp
index e0a88e8e..c0c4bbea 100644
--- a/src/core/transform/ojph_colour_wasm.cpp
+++ b/src/core/transform/ojph_colour_wasm.cpp
@@ -2,21 +2,21 @@
 // This software is released under the 2-Clause BSD license, included
 // below.
 //
-// Copyright (c) 2021, Aous Naman 
+// Copyright (c) 2021, Aous Naman
 // Copyright (c) 2021, Kakadu Software Pty Ltd, Australia
 // Copyright (c) 2021, The University of New South Wales, Australia
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
-// 
+//
 // 1. Redistributions of source code must retain the above copyright
 // notice, this list of conditions and the following disclaimer.
-// 
+//
 // 2. Redistributions in binary form must reproduce the above copyright
 // notice, this list of conditions and the following disclaimer in the
 // documentation and/or other materials provided with the distribution.
-// 
+//
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
 // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
@@ -35,6 +35,7 @@
 // Date: 9 February 2021
 //***************************************************************************/
 
+#include <climits>
 #include <cmath>
 #include <wasm_simd128.h>
 
@@ -45,16 +46,16 @@
 
 namespace ojph {
   namespace local {
-    
+
     //////////////////////////////////////////////////////////////////////////
-    void wasm_rev_convert(const line_buf *src_line, 
+    void wasm_rev_convert(const line_buf *src_line,
                           const ui32 src_line_offset,
-                          line_buf *dst_line, 
-                          const ui32 dst_line_offset, 
+                          line_buf *dst_line,
+                          const ui32 dst_line_offset,
                           si64 shift, ui32 width)
     {
       if (src_line->flags & line_buf::LFT_32BIT)
-      { 
+      {
         if (dst_line->flags & line_buf::LFT_32BIT)
         {
           const si32 *sp = src_line->i32 + src_line_offset;
@@ -65,9 +66,9 @@ namespace ojph {
             v128_t s = wasm_v128_load(sp);
             s = wasm_i32x4_add(s, sh);
             wasm_v128_store(dp, s);
-          }            
+          }
         }
-        else 
+        else
         {
           const si32 *sp = src_line->i32 + src_line_offset;
           si64 *dp = dst_line->i64 + dst_line_offset;
@@ -76,18 +77,18 @@ namespace ojph {
           {
             v128_t s, t;
             s = wasm_v128_load(sp);
-            
+
             t = wasm_i64x2_extend_low_i32x4(s);
             t = wasm_i64x2_add(t, sh);
             wasm_v128_store(dp, t);
-            
+
             t = wasm_i64x2_extend_high_i32x4(s);
             t = wasm_i64x2_add(t, sh);
             wasm_v128_store(dp + 2, t);
-          }            
+          }
         }
       }
-      else 
+      else
       {
         assert(src_line->flags | line_buf::LFT_64BIT);
         assert(dst_line->flags | line_buf::LFT_32BIT);
@@ -103,19 +104,19 @@ namespace ojph {
           s1 = wasm_i64x2_add(s1, sh);
           s0 = wasm_i32x4_shuffle(s0, s1, 0, 2, 4 + 0, 4 + 2);
           wasm_v128_store(dp, s0);
-        }            
+        }
       }
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void wasm_rev_convert_nlt_type3(const line_buf *src_line, 
-                                    const ui32 src_line_offset, 
-                                    line_buf *dst_line, 
-                                    const ui32 dst_line_offset, 
+    void wasm_rev_convert_nlt_type3(const line_buf *src_line,
+                                    const ui32 src_line_offset,
+                                    line_buf *dst_line,
+                                    const ui32 dst_line_offset,
                                     si64 shift, ui32 width)
     {
       if (src_line->flags & line_buf::LFT_32BIT)
-      { 
+      {
         if (dst_line->flags & line_buf::LFT_32BIT)
         {
           const si32 *sp = src_line->i32 + src_line_offset;
@@ -126,14 +127,14 @@ namespace ojph {
           {
             v128_t s = wasm_v128_load(sp);
             v128_t c = wasm_i32x4_lt(s, zero);     // 0xFFFFFFFF for -ve value
-            v128_t v_m_sh = wasm_i32x4_sub(sh, s); // - shift - value 
+            v128_t v_m_sh = wasm_i32x4_sub(sh, s); // - shift - value
             v_m_sh = wasm_v128_and(c, v_m_sh);     // keep only - shift - value
             s = wasm_v128_andnot(c, s);            // keep only +ve or 0
             s = wasm_v128_or(s, v_m_sh);           // combine
             wasm_v128_store(dp, s);
           }
         }
-        else 
+        else
         {
           const si32 *sp = src_line->i32 + src_line_offset;
           si64 *dp = dst_line->i64 + dst_line_offset;
@@ -146,7 +147,7 @@ namespace ojph {
 
             u = wasm_i64x2_extend_low_i32x4(s);
             c = wasm_i64x2_lt(u, zero);        // 64b -1 for -ve value
-            v_m_sh = wasm_i64x2_sub(sh, u);    // - shift - value 
+            v_m_sh = wasm_i64x2_sub(sh, u);    // - shift - value
             v_m_sh = wasm_v128_and(c, v_m_sh); // keep only - shift - value
             u = wasm_v128_andnot(c, u);        // keep only +ve or 0
             u = wasm_v128_or(u, v_m_sh);       // combine
@@ -155,7 +156,7 @@ namespace ojph {
 
             u = wasm_i64x2_extend_high_i32x4(s);
             c = wasm_i64x2_lt(u, zero);        // 64b -1 for -ve value
-            v_m_sh = wasm_i64x2_sub(sh, u);    // - shift - value 
+            v_m_sh = wasm_i64x2_sub(sh, u);    // - shift - value
             v_m_sh = wasm_v128_and(c, v_m_sh); // keep only - shift - value
             u = wasm_v128_andnot(c, u);        // keep only +ve or 0
             u = wasm_v128_or(u, v_m_sh);       // combine
@@ -164,7 +165,7 @@ namespace ojph {
           }
         }
       }
-      else 
+      else
       {
         assert(src_line->flags | line_buf::LFT_64BIT);
         assert(dst_line->flags | line_buf::LFT_32BIT);
@@ -261,7 +262,27 @@ namespace ojph {
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void wasm_irv_convert_to_integer_nlt_type3(const line_buf *src_line, 
+    static inline
+    v128_t ojph_wasm_i32x4_max_ge(v128_t a, v128_t b, v128_t x, v128_t y)
+    {
+      v128_t c = wasm_i32x4_ge(x, y);    // 0xFFFFFFFF for x >= y
+      v128_t d = wasm_v128_and(c, a);    // keep only a, where x >= y
+      v128_t e = wasm_v128_andnot(c, b); // keep only b, where x <  y
+      return wasm_v128_or(d, e);         // combine
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    v128_t ojph_wasm_i32x4_min_lt(v128_t a, v128_t b, v128_t x, v128_t y)
+    {
+      v128_t c = wasm_i32x4_lt(x, y);    // 0xFFFFFFFF for x < y
+      v128_t d = wasm_v128_and(c, a);    // keep only a, where x <  y
+      v128_t e = wasm_v128_andnot(c, b); // keep only b, where x >= y
+      return wasm_v128_or(d, e);         // combine
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void wasm_irv_convert_to_integer_nlt_type3(const line_buf *src_line,
       line_buf *dst_line, ui32 dst_line_offset,
       ui32 bit_depth, bool is_signed, ui32 width)
     {
@@ -269,99 +290,110 @@ namespace ojph {
              (src_line->flags & line_buf::LFT_INTEGER) == 0 &&
              (dst_line->flags & line_buf::LFT_32BIT) &&
              (dst_line->flags & line_buf::LFT_INTEGER));
-      
-      // const float* sp = src_line->f32;
-      // si32* dp = dst_line->i32 + dst_line_offset;
-      // if (bit_depth <= 30) 
-      // {
-      //   // We are leaving two bit overhead -- here, we are assuming that after
-      //   // multiplications, the resulting number can still be represented
-      //   // using 32 bit integer
-      //   __m128 mul = _mm_set1_ps((float)(1 << bit_depth));
-      //   __m128i upper_limit = _mm_set1_epi32(INT_MAX >> (32 - bit_depth));
-      //   __m128i lower_limit = _mm_set1_epi32(INT_MIN >> (32 - bit_depth));
+
+      // rounding mode is always set to _MM_ROUND_NEAREST
+
+      const float* sp = src_line->f32;
+      si32* dp = dst_line->i32 + dst_line_offset;
+      if (bit_depth <= 30) 
+      {
+        // We are leaving two bit overhead -- here, we are assuming that after
+        // multiplications, the resulting number can still be represented
+        // using 32 bit integer
+        v128_t mul = wasm_f32x4_splat((float)(1 << bit_depth));
+        v128_t upper_limit = wasm_i32x4_splat(INT_MAX >> (32 - bit_depth));
+        v128_t lower_limit = wasm_i32x4_splat(INT_MIN >> (32 - bit_depth));
         
-      //   if (is_signed)
-      //   {
-      //     __m128i zero = _mm_setzero_si128();
-      //     __m128i bias = _mm_set1_epi32(-((1 << (bit_depth - 1)) + 1));          
-      //     for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) 
-      //     {
-      //       __m128 t = _mm_loadu_ps(sp);
-      //       t = _mm_mul_ps(t, mul);
-      //       t = _mm_round_ps(t, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
-      //       __m128i u = _mm_cvtps_epi32(t);
-      //       u = ojph_mm_max_epi32(u, lower_limit);
-      //       u = ojph_mm_min_epi32(u, upper_limit);
-
-      //       __m128i c = _mm_cmpgt_epi32(zero, u); //0xFFFFFFFF for -ve value
-      //       __m128i neg = _mm_sub_epi32(bias, u); //-bias -value
-      //       neg = _mm_and_si128(c, neg);          //keep only - bias - value
-      //       __m128i v = _mm_andnot_si128(c, u);   //keep only +ve or 0
-      //       v = _mm_or_si128(neg, v);             //combine
-      //       _mm_storeu_si128((__m128i*)dp, v);
-      //     }
-      //   }
-      //   else
-      //   {
-      //     __m128i half = _mm_set1_epi32(-(1 << (bit_depth - 1)));
-      //     for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) {
-      //       __m128 t = _mm_loadu_ps(sp);
-      //       t = _mm_mul_ps(t, mul);
-      //       t = _mm_round_ps(t, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
-      //       __m128i u = _mm_cvtps_epi32(t);
-      //       u = ojph_mm_max_epi32(u, lower_limit);
-      //       u = ojph_mm_min_epi32(u, upper_limit);
-      //       u = _mm_add_epi32(u, half);
-      //       _mm_storeu_si128((__m128i*)dp, u);
-      //     }
-      //   }
-      // }
-      // else
-      // {
-      //   // There is the possibility that converting to integer will
-      //   // exceed the dynamic range of 32bit integer; therefore, we need
-      //   // to use 64 bit.  One may think, why not limit the floats to the
-      //   // range of [-0.5f, 0.5f)? 
-      //   // Notice the half closed range -- we need a value just below 0.5f.
-      //   // While getting this number is possible, after multiplication, the
-      //   // resulting number will not be exactly the maximum that the integer 
-      //   // can achieve.  All this is academic, because here are talking
-      //   // about a number which has all the exponent bits set, meaning 
-      //   // it is either infinity, -infinity, qNan or sNan.
-      //   float mul = (float)(1ull << bit_depth);
-      //   const si64 upper_limit = (si64)LLONG_MAX >> (64 - bit_depth);
-      //   const si64 lower_limit = (si64)LLONG_MIN >> (64 - bit_depth);
-
-      //   if (is_signed)
-      //   {
-      //     const si32 bias = (1 << (bit_depth - 1)) + 1;
-      //     for (ui32 i = width; i > 0; --i) {
-      //       si64 t = ojph_round64(*sp++ * mul);
-      //       t = ojph_max(t, lower_limit);
-      //       t = ojph_min(t, upper_limit);
-      //       si32 v = (si32)t;
-      //       v = (v >= 0) ? v : (- v - bias);
-      //       *dp++ = v;
-      //     }
-      //   }
-      //   else
-      //   {
-      //     const si32 half = (1 << (bit_depth - 1));
-      //     for (ui32 i = width; i > 0; --i) {
-      //       si64 t = ojph_round64(*sp++ * mul);
-      //       t = ojph_max(t, lower_limit);
-      //       t = ojph_min(t, upper_limit);
-      //       si32 v = (si32)t;
-      //       *dp++ = v + half;
-      //     }
-      //   }
-      // }
+        if (is_signed)
+        {
+          v128_t zero = wasm_i32x4_splat(0);
+          v128_t bias = wasm_i32x4_splat(-((1 << (bit_depth - 1)) + 1));
+          for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) 
+          {
+            v128_t t = wasm_v128_load(sp);
+            t = wasm_f32x4_mul(t, mul);
+            v128_t u = wasm_i32x4_trunc_sat_f32x4(t);
+            u = wasm_i32x4_max(u, lower_limit);
+            u = wasm_i32x4_min(u, upper_limit);
+
+            v128_t c = wasm_i32x4_gt(zero, u);    //0xFFFFFFFF for -ve value
+            v128_t neg = wasm_i32x4_sub(bias, u); //-bias -value
+            neg = wasm_v128_and(c, neg);          //keep only - bias - value
+            v128_t v = wasm_v128_andnot(c, u);    //keep only +ve or 0
+            v = wasm_v128_or(neg, v);             //combine
+            wasm_v128_store(dp, v);
+          }
+        }
+        else
+        {
+          v128_t half = wasm_i32x4_splat(-(1 << (bit_depth - 1)));
+          for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) {
+            v128_t t = wasm_v128_load(sp);
+            t = wasm_f32x4_mul(t, mul);
+            v128_t u = wasm_i32x4_trunc_sat_f32x4(t);
+            u = wasm_i32x4_max(u, lower_limit);
+            u = wasm_i32x4_min(u, upper_limit);
+            u = wasm_i32x4_add(u, half);
+            wasm_v128_store(dp, u);
+          }
+        }
+      }
+      else
+      {
+        // There is the possibility that converting to integer will
+        // exceed the dynamic range of 32bit integer; therefore, we need
+        // to use 64 bit.  One may think, why not limit the floats to the
+        // range of [-0.5f, 0.5f)? 
+        // Notice the half closed range -- we need a value just below 0.5f.
+        // While getting this number is possible, after multiplication, the
+        // resulting number will not be exactly the maximum that the integer 
+        // can achieve.  All this is academic, because here are talking
+        // about a number which has all the exponent bits set, meaning 
+        // it is either infinity, -infinity, qNan or sNan.
+        si64 neg_limit = (si64)LLONG_MIN >> (64 - bit_depth);
+        v128_t mul = wasm_f32x4_splat((float)(1 << bit_depth));
+        v128_t fl_upper_limit = wasm_f32x4_splat(-(float)neg_limit); // val< up
+        v128_t fl_lower_limit = wasm_f32x4_splat( (float)neg_limit); // val>=lo
+        v128_t s32_upper_limit = wasm_i32x4_splat(INT_MAX >> (32 - bit_depth));
+        v128_t s32_lower_limit = wasm_i32x4_splat(INT_MIN >> (32 - bit_depth));
+
+        if (is_signed)
+        {
+          v128_t zero = wasm_i32x4_splat(0);
+          v128_t bias = wasm_i32x4_splat(-((1 << (bit_depth - 1)) + 1));                   
+          for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) {
+            v128_t t = wasm_v128_load(sp);
+            t = wasm_f32x4_mul(t, mul);
+            v128_t u = wasm_i32x4_trunc_sat_f32x4(t);
+            u = ojph_wasm_i32x4_max_ge(u, s32_lower_limit, t, fl_lower_limit);
+            u = ojph_wasm_i32x4_min_lt(u, s32_upper_limit, t, fl_upper_limit);
+            v128_t c = wasm_i32x4_gt(zero, u);    //0xFFFFFFFF for -ve value
+            v128_t neg = wasm_i32x4_sub(bias, u); //-bias -value
+            neg = wasm_v128_and(c, neg);          //keep only - bias - value
+            v128_t v = wasm_v128_andnot(c, u);    //keep only +ve or 0
+            v = wasm_v128_or(neg, v);             //combine
+            wasm_v128_store(dp, v);
+          }
+        }
+        else
+        {
+          v128_t half = wasm_i32x4_splat(-(1 << (bit_depth - 1)));
+          for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) {
+            v128_t t = wasm_v128_load(sp);
+            t = wasm_f32x4_mul(t, mul);
+            v128_t u = wasm_i32x4_trunc_sat_f32x4(t);
+            u = ojph_wasm_i32x4_max_ge(u, s32_lower_limit, t, fl_lower_limit);
+            u = ojph_wasm_i32x4_min_lt(u, s32_upper_limit, t, fl_upper_limit);
+            u = wasm_i32x4_add(u, half);
+            wasm_v128_store(dp, u);
+          }
+        }
+      }
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void wasm_irv_convert_to_float_nlt_type3(const line_buf *src_line, 
-      ui32 src_line_offset, line_buf *dst_line, 
+    void wasm_irv_convert_to_float_nlt_type3(const line_buf *src_line,
+      ui32 src_line_offset, line_buf *dst_line,
       ui32 bit_depth, bool is_signed, ui32 width)
     {
       assert((src_line->flags & line_buf::LFT_32BIT) &&
@@ -369,64 +401,64 @@ namespace ojph {
              (dst_line->flags & line_buf::LFT_32BIT) &&
              (dst_line->flags & line_buf::LFT_INTEGER) == 0);
 
-      // __m128 mul = _mm_set1_ps((float)(1.0 / 65536.0 / 65536.0));
-
-      // const si32* sp = src_line->i32 + src_line_offset;
-      // float* dp = dst_line->f32;
-      // si32 shift = 32 - (si32)bit_depth;
-      // if (is_signed)
-      // {
-      //   __m128i zero = _mm_setzero_si128();
-      //   __m128i bias = _mm_set1_epi32(-(si32)((ui32)INT_MIN + 1));
-      //   for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) {
-      //     __m128i t = _mm_loadu_si128((__m128i*)sp);
-      //     __m128i u = _mm_slli_epi32(t, shift);
-      //     __m128i c = _mm_cmplt_epi32(u, zero); // 0xFFFFFFFF for -ve value
-      //     __m128i neg = _mm_sub_epi32(bias, u); // - bias - value
-      //     neg = _mm_and_si128(c, neg);          // keep only - bias - value
-      //     t = _mm_andnot_si128(c, u);           // keep only +ve or 0
-      //     u = _mm_or_si128(neg, t);             // combine
-      //     __m128 v = _mm_cvtepi32_ps(u);
-      //     v = _mm_mul_ps(v, mul);
-      //     _mm_storeu_ps(dp, v);        
-      //   }
-      // }
-      // else
-      // {
-      //   __m128 half = _mm_set1_ps(0.5f);
-      //   for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) {
-      //     __m128i t = _mm_loadu_si128((__m128i*)sp);
-      //     __m128i u = _mm_slli_epi32(t, shift);
-      //     __m128 v = _mm_cvtepi32_ps(u);
-      //     v = _mm_mul_ps(v, mul);
-      //     v = _mm_add_ps(v, half);
-      //     _mm_storeu_ps(dp, v);
-      //   }
-      // }
+      v128_t mul = wasm_f32x4_splat((float)(1.0 / 65536.0 / 65536.0));
+
+      const si32* sp = src_line->i32 + src_line_offset;
+      float* dp = dst_line->f32;
+      ui32 shift = (ui32)32 - bit_depth;
+      if (is_signed)
+      {
+        v128_t zero = wasm_i32x4_splat(0);
+        v128_t bias = wasm_i32x4_splat(-(si32)((ui32)INT_MIN + 1));
+        for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) {
+          v128_t t = wasm_v128_load(sp);
+          v128_t u = wasm_i32x4_shl(t, shift);
+          v128_t c = wasm_i32x4_lt(u, zero);    // 0xFFFFFFFF for -ve value
+          v128_t neg = wasm_i32x4_sub(bias, u); // - bias - value
+          neg = wasm_v128_and(c, neg);          // keep only - bias - value
+          t = wasm_v128_andnot(c, u);           // keep only +ve or 0
+          u = wasm_v128_or(neg, t);             // combine
+          v128_t v = wasm_f32x4_convert_i32x4(u);
+          v = wasm_f32x4_mul(v, mul);
+          wasm_v128_store(dp, v);
+        }
+      }
+      else
+      {
+        v128_t half = wasm_f32x4_splat(0.5f);
+        for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) {
+          v128_t t = wasm_v128_load(sp);
+          v128_t u = wasm_i32x4_shl(t, shift);
+          v128_t v = wasm_f32x4_convert_i32x4(u);
+          v = wasm_f32x4_mul(v, mul);
+          v = wasm_f32x4_add(v, half);
+          wasm_v128_store(dp, v);
+        }
+      }
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void wasm_rct_forward(const line_buf *r, 
-                          const line_buf *g, 
+    void wasm_rct_forward(const line_buf *r,
+                          const line_buf *g,
                           const line_buf *b,
-                          line_buf *y, line_buf *cb, line_buf *cr, 
+                          line_buf *y, line_buf *cb, line_buf *cr,
                           ui32 repeat)
     {
       assert((y->flags  & line_buf::LFT_INTEGER) &&
-             (cb->flags & line_buf::LFT_INTEGER) && 
+             (cb->flags & line_buf::LFT_INTEGER) &&
              (cr->flags & line_buf::LFT_INTEGER) &&
              (r->flags  & line_buf::LFT_INTEGER) &&
-             (g->flags  & line_buf::LFT_INTEGER) && 
+             (g->flags  & line_buf::LFT_INTEGER) &&
              (b->flags  & line_buf::LFT_INTEGER));
-      
+
       if  (y->flags & line_buf::LFT_32BIT)
       {
         assert((y->flags  & line_buf::LFT_32BIT) &&
-               (cb->flags & line_buf::LFT_32BIT) && 
+               (cb->flags & line_buf::LFT_32BIT) &&
                (cr->flags & line_buf::LFT_32BIT) &&
                (r->flags  & line_buf::LFT_32BIT) &&
-               (g->flags  & line_buf::LFT_32BIT) && 
-               (b->flags  & line_buf::LFT_32BIT));        
+               (g->flags  & line_buf::LFT_32BIT) &&
+               (b->flags  & line_buf::LFT_32BIT));
         const si32 *rp = r->i32, * gp = g->i32, * bp = b->i32;
         si32 *yp = y->i32, * cbp = cb->i32, * crp = cr->i32;
 
@@ -447,13 +479,13 @@ namespace ojph {
             yp += 4; cbp += 4; crp += 4;
         }
       }
-      else 
+      else
       {
         assert((y->flags  & line_buf::LFT_64BIT) &&
-               (cb->flags & line_buf::LFT_64BIT) && 
+               (cb->flags & line_buf::LFT_64BIT) &&
                (cr->flags & line_buf::LFT_64BIT) &&
                (r->flags  & line_buf::LFT_32BIT) &&
-               (g->flags  & line_buf::LFT_32BIT) && 
+               (g->flags  & line_buf::LFT_32BIT) &&
                (b->flags  & line_buf::LFT_32BIT));
         const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
         si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64;
@@ -466,7 +498,7 @@ namespace ojph {
           mr = wasm_i64x2_extend_low_i32x4(mr32);
           mg = wasm_i64x2_extend_low_i32x4(mg32);
           mb = wasm_i64x2_extend_low_i32x4(mb32);
-          
+
           t = wasm_i64x2_add(mr, mb);
           t = wasm_i64x2_add(t, wasm_i64x2_shl(mg, 1));
           wasm_v128_store(yp, wasm_i64x2_shr(t, 2));
@@ -480,7 +512,7 @@ namespace ojph {
           mr = wasm_i64x2_extend_high_i32x4(mr32);
           mg = wasm_i64x2_extend_high_i32x4(mg32);
           mb = wasm_i64x2_extend_high_i32x4(mb32);
-          
+
           t = wasm_i64x2_add(mr, mb);
           t = wasm_i64x2_add(t, wasm_i64x2_shl(mg, 1));
           wasm_v128_store(yp, wasm_i64x2_shr(t, 2));
@@ -496,26 +528,26 @@ namespace ojph {
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void wasm_rct_backward(const line_buf *y, 
-                           const line_buf *cb, 
+    void wasm_rct_backward(const line_buf *y,
+                           const line_buf *cb,
                            const line_buf *cr,
-                           line_buf *r, line_buf *g, line_buf *b, 
+                           line_buf *r, line_buf *g, line_buf *b,
                            ui32 repeat)
     {
       assert((y->flags  & line_buf::LFT_INTEGER) &&
-             (cb->flags & line_buf::LFT_INTEGER) && 
+             (cb->flags & line_buf::LFT_INTEGER) &&
              (cr->flags & line_buf::LFT_INTEGER) &&
              (r->flags  & line_buf::LFT_INTEGER) &&
-             (g->flags  & line_buf::LFT_INTEGER) && 
+             (g->flags  & line_buf::LFT_INTEGER) &&
              (b->flags  & line_buf::LFT_INTEGER));
 
       if (y->flags & line_buf::LFT_32BIT)
       {
         assert((y->flags  & line_buf::LFT_32BIT) &&
-               (cb->flags & line_buf::LFT_32BIT) && 
+               (cb->flags & line_buf::LFT_32BIT) &&
                (cr->flags & line_buf::LFT_32BIT) &&
                (r->flags  & line_buf::LFT_32BIT) &&
-               (g->flags  & line_buf::LFT_32BIT) && 
+               (g->flags  & line_buf::LFT_32BIT) &&
                (b->flags  & line_buf::LFT_32BIT));
         const si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32;
         si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
@@ -540,10 +572,10 @@ namespace ojph {
       else
       {
         assert((y->flags  & line_buf::LFT_64BIT) &&
-               (cb->flags & line_buf::LFT_64BIT) && 
+               (cb->flags & line_buf::LFT_64BIT) &&
                (cr->flags & line_buf::LFT_64BIT) &&
                (r->flags  & line_buf::LFT_32BIT) &&
-               (g->flags  & line_buf::LFT_32BIT) && 
+               (g->flags  & line_buf::LFT_32BIT) &&
                (b->flags  & line_buf::LFT_32BIT));
         const si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64;
         si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
@@ -580,7 +612,7 @@ namespace ojph {
 
           yp += 2; cbp += 2; crp += 2;
           rp += 4; gp += 4; bp += 4;
-        }        
+        }
       }
     }
 
@@ -603,7 +635,7 @@ namespace ojph {
         wasm_v128_store(y, my);
         wasm_v128_store(cb, wasm_f32x4_mul(beta_cbf, wasm_f32x4_sub(mb, my)));
         wasm_v128_store(cr, wasm_f32x4_mul(beta_crf, wasm_f32x4_sub(mr, my)));
-        
+
         r += 4; g += 4; b += 4;
         y += 4; cb += 4; cr += 4;
       }

From 3527f0abe599c52ab53a3ff2ba1d93c22b1b925c Mon Sep 17 00:00:00 2001
From: Aous Naman <aous72@yahoo.com>
Date: Thu, 23 Jan 2025 13:09:39 +1100
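Subject: [PATCH 09/20] Fixes function pointer assignment.

Points irv_convert_to_integer_nlt_type3 and irv_convert_to_float_nlt_type3
at their wasm_ implementations, next to the other WASM function pointer
assignments in ojph_colour.cpp.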
---
 src/core/transform/ojph_colour.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/core/transform/ojph_colour.cpp b/src/core/transform/ojph_colour.cpp
index 617fc41f..44028037 100644
--- a/src/core/transform/ojph_colour.cpp
+++ b/src/core/transform/ojph_colour.cpp
@@ -205,6 +205,8 @@ namespace ojph {
       cnvrt_si32_to_float = wasm_cnvrt_si32_to_float;
       cnvrt_float_to_si32_shftd = wasm_cnvrt_float_to_si32_shftd;
       cnvrt_float_to_si32 = wasm_cnvrt_float_to_si32;
+      irv_convert_to_integer_nlt_type3 = wasm_irv_convert_to_integer_nlt_type3;
+      irv_convert_to_float_nlt_type3 = wasm_irv_convert_to_float_nlt_type3;
       rct_forward = wasm_rct_forward;
       rct_backward = wasm_rct_backward;
       ict_forward = wasm_ict_forward;

From fd295de0ce5d5ded1a685bbbbc25d71f79cd8e1d Mon Sep 17 00:00:00 2001
From: Aous Naman <aous72@yahoo.com>
Date: Thu, 23 Jan 2025 17:15:42 +1100
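Subject: [PATCH 10/20] Bug fixes, some historic.

ojph_colour_wasm.cpp:
 - Swap the operands of the wasm_v128_andnot calls; the intrinsic
   computes a & ~b, so the comparison mask has to be the second argument
   for the "keep only +ve or 0" selections to work as commented.
 - Add ojph_convert_float_to_i32, which adds 0.5 to non-negative lanes and
   subtracts 0.5 from negative lanes before truncating, and use it in place
   of a bare wasm_i32x4_trunc_sat_f32x4 in the float-to-integer conversions.
 - Rename the integer offset vector in the unsigned NLT type3 paths to
   ihalf so it does not clash with the new float half constant.
 - Drop the comments about the _MM_ROUND_NEAREST rounding mode.

ojph_codeblock_fun.cpp:
 - Call initialize_block_encoder_tables() after encode_cb64 is assigned,
   ahead of the "#endif // !OJPH_ENABLE_WASM_SIMD" line.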
---
 src/core/codestream/ojph_codeblock_fun.cpp |  2 +
 src/core/transform/ojph_colour_wasm.cpp    | 80 +++++++++++++---------
 2 files changed, 51 insertions(+), 31 deletions(-)

diff --git a/src/core/codestream/ojph_codeblock_fun.cpp b/src/core/codestream/ojph_codeblock_fun.cpp
index 565744dd..cad2434a 100644
--- a/src/core/codestream/ojph_codeblock_fun.cpp
+++ b/src/core/codestream/ojph_codeblock_fun.cpp
@@ -276,6 +276,8 @@ namespace ojph {
         tx_from_cb64 = NULL;
       }
       encode_cb64 = ojph_encode_codeblock64;
+      bool result = initialize_block_encoder_tables();
+      assert(result); ojph_unused(result);      
 
 #endif // !OJPH_ENABLE_WASM_SIMD
 
diff --git a/src/core/transform/ojph_colour_wasm.cpp b/src/core/transform/ojph_colour_wasm.cpp
index c0c4bbea..10114b9b 100644
--- a/src/core/transform/ojph_colour_wasm.cpp
+++ b/src/core/transform/ojph_colour_wasm.cpp
@@ -47,6 +47,20 @@
 namespace ojph {
   namespace local {
 
+    //////////////////////////////////////////////////////////////////////////
+    static inline
+    v128_t ojph_convert_float_to_i32(v128_t a, v128_t zero, v128_t half)
+    { // We implement ojph_round, which is 
+      // val + (val >= 0.0f ? 0.5f : -0.5f), where val is float
+      v128_t c = wasm_f32x4_ge(a, zero);   // greater or equal to zero
+      v128_t p = wasm_f32x4_add(a, half);  // for positive, add half
+      v128_t n = wasm_f32x4_sub(a, half);  // for negative, subtract half
+      v128_t d = wasm_v128_and(c, p);      // keep positive only
+      v128_t e = wasm_v128_andnot(n, c);   // keep negative only
+      v128_t v = wasm_v128_or(d, e);       // combine
+      return wasm_i32x4_trunc_sat_f32x4(v);// truncate (towards 0)
+    }
+
     //////////////////////////////////////////////////////////////////////////
     void wasm_rev_convert(const line_buf *src_line,
                           const ui32 src_line_offset,
@@ -129,7 +143,7 @@ namespace ojph {
             v128_t c = wasm_i32x4_lt(s, zero);     // 0xFFFFFFFF for -ve value
             v128_t v_m_sh = wasm_i32x4_sub(sh, s); // - shift - value
             v_m_sh = wasm_v128_and(c, v_m_sh);     // keep only - shift - value
-            s = wasm_v128_andnot(c, s);            // keep only +ve or 0
+            s = wasm_v128_andnot(s, c);            // keep only +ve or 0
             s = wasm_v128_or(s, v_m_sh);           // combine
             wasm_v128_store(dp, s);
           }
@@ -149,7 +163,7 @@ namespace ojph {
             c = wasm_i64x2_lt(u, zero);        // 64b -1 for -ve value
             v_m_sh = wasm_i64x2_sub(sh, u);    // - shift - value
             v_m_sh = wasm_v128_and(c, v_m_sh); // keep only - shift - value
-            u = wasm_v128_andnot(c, u);        // keep only +ve or 0
+            u = wasm_v128_andnot(u, c);        // keep only +ve or 0
             u = wasm_v128_or(u, v_m_sh);       // combine
 
             wasm_v128_store(dp, u);
@@ -158,7 +172,7 @@ namespace ojph {
             c = wasm_i64x2_lt(u, zero);        // 64b -1 for -ve value
             v_m_sh = wasm_i64x2_sub(sh, u);    // - shift - value
             v_m_sh = wasm_v128_and(c, v_m_sh); // keep only - shift - value
-            u = wasm_v128_andnot(c, u);        // keep only +ve or 0
+            u = wasm_v128_andnot(u, c);        // keep only +ve or 0
             u = wasm_v128_or(u, v_m_sh);       // combine
 
             wasm_v128_store(dp + 2, u);
@@ -182,14 +196,14 @@ namespace ojph {
           m = wasm_i64x2_lt(s, zero);   // 64b -1 for -ve value
           tm = wasm_i64x2_sub(sh, s);   // - shift - value
           n = wasm_v128_and(m, tm);     // -ve
-          p = wasm_v128_andnot(m, s);   // +ve
+          p = wasm_v128_andnot(s, m);   // +ve
           t0 = wasm_v128_or(n, p);
 
           s = wasm_v128_load(sp + 2);
           m = wasm_i64x2_lt(s, zero);   // 64b -1 for -ve value
           tm = wasm_i64x2_sub(sh, s);   // - shift - value
           n = wasm_v128_and(m, tm);     // -ve
-          p = wasm_v128_andnot(m, s);   // +ve
+          p = wasm_v128_andnot(s, m);   // +ve
           t1 = wasm_v128_or(n, p);
 
           t0 = wasm_i32x4_shuffle(t0, t1, 0, 2, 4 + 0, 4 + 2);
@@ -232,16 +246,16 @@ namespace ojph {
     void wasm_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul,
                                         ui32 width)
     {
-      // rounding mode is always set to _MM_ROUND_NEAREST
-      v128_t shift = wasm_f32x4_splat(0.5f);
+      const v128_t zero = wasm_f32x4_splat(0.0f);
+      const v128_t half = wasm_f32x4_splat(0.5f);
       v128_t m = wasm_f32x4_splat(mul);
       for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
       {
         v128_t t = wasm_v128_load(sp);
-        v128_t s = wasm_f32x4_add(t, shift);
+        v128_t s = wasm_f32x4_add(t, half);
         s = wasm_f32x4_mul(s, m);
-        s = wasm_f32x4_add(s, shift); // + 0.5 and followed by floor next
-        wasm_v128_store(dp, wasm_i32x4_trunc_sat_f32x4(s));
+        s = wasm_f32x4_add(s, half); // + 0.5 and followed by floor next
+        wasm_v128_store(dp, ojph_convert_float_to_i32(s, zero, half));
       }
     }
 
@@ -249,15 +263,15 @@ namespace ojph {
     void wasm_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul,
                                   ui32 width)
     {
-      // rounding mode is always set to _MM_ROUND_NEAREST
-      v128_t shift = wasm_f32x4_splat(0.5f);
+      const v128_t zero = wasm_f32x4_splat(0.0f);
+      const v128_t half = wasm_f32x4_splat(0.5f);
       v128_t m = wasm_f32x4_splat(mul);
       for (int i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
       {
         v128_t t = wasm_v128_load(sp);
         v128_t s = wasm_f32x4_mul(t, m);
-        s = wasm_f32x4_add(s, shift); // + 0.5 and followed by floor next
-        wasm_v128_store(dp, wasm_i32x4_trunc_sat_f32x4(s));
+        s = wasm_f32x4_add(s, half); // + 0.5 and followed by floor next
+        wasm_v128_store(dp, ojph_convert_float_to_i32(s, zero, half));
       }
     }
 
@@ -267,7 +281,7 @@ namespace ojph {
     {
       v128_t c = wasm_i32x4_ge(x, y);    // 0xFFFFFFFF for x >= y
       v128_t d = wasm_v128_and(c, a);    // keep only a, where x >= y
-      v128_t e = wasm_v128_andnot(c, b); // keep only b, where x <  y
+      v128_t e = wasm_v128_andnot(b, c); // keep only b, where x <  y
       return wasm_v128_or(d, e);         // combine
     }
 
@@ -277,7 +291,7 @@ namespace ojph {
     {
       v128_t c = wasm_i32x4_lt(x, y);    // 0xFFFFFFFF for x < y
       v128_t d = wasm_v128_and(c, a);    // keep only a, where x <  y
-      v128_t e = wasm_v128_andnot(c, b); // keep only b, where x >= y
+      v128_t e = wasm_v128_andnot(b, c); // keep only b, where x >= y
       return wasm_v128_or(d, e);         // combine
     }
 
@@ -291,8 +305,6 @@ namespace ojph {
              (dst_line->flags & line_buf::LFT_32BIT) &&
              (dst_line->flags & line_buf::LFT_INTEGER));
 
-      // rounding mode is always set to _MM_ROUND_NEAREST
-
       const float* sp = src_line->f32;
       si32* dp = dst_line->i32 + dst_line_offset;
       if (bit_depth <= 30) 
@@ -306,34 +318,37 @@ namespace ojph {
         
         if (is_signed)
         {
-          v128_t zero = wasm_i32x4_splat(0);
+          const v128_t zero = wasm_f32x4_splat(0.0f);
+          const v128_t half = wasm_f32x4_splat(0.5f);
           v128_t bias = wasm_i32x4_splat(-((1 << (bit_depth - 1)) + 1));
           for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) 
           {
             v128_t t = wasm_v128_load(sp);
             t = wasm_f32x4_mul(t, mul);
-            v128_t u = wasm_i32x4_trunc_sat_f32x4(t);
+            v128_t u = ojph_convert_float_to_i32(t, zero, half);
             u = wasm_i32x4_max(u, lower_limit);
             u = wasm_i32x4_min(u, upper_limit);
 
             v128_t c = wasm_i32x4_gt(zero, u);    //0xFFFFFFFF for -ve value
             v128_t neg = wasm_i32x4_sub(bias, u); //-bias -value
             neg = wasm_v128_and(c, neg);          //keep only - bias - value
-            v128_t v = wasm_v128_andnot(c, u);    //keep only +ve or 0
+            v128_t v = wasm_v128_andnot(u, c);    //keep only +ve or 0
             v = wasm_v128_or(neg, v);             //combine
             wasm_v128_store(dp, v);
           }
         }
         else
         {
-          v128_t half = wasm_i32x4_splat(-(1 << (bit_depth - 1)));
+          const v128_t zero = wasm_f32x4_splat(0.0f);
+          const v128_t half = wasm_f32x4_splat(0.5f);
+          v128_t ihalf = wasm_i32x4_splat(-(1 << (bit_depth - 1)));
           for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) {
             v128_t t = wasm_v128_load(sp);
             t = wasm_f32x4_mul(t, mul);
-            v128_t u = wasm_i32x4_trunc_sat_f32x4(t);
+            v128_t u = ojph_convert_float_to_i32(t, zero, half);
             u = wasm_i32x4_max(u, lower_limit);
             u = wasm_i32x4_min(u, upper_limit);
-            u = wasm_i32x4_add(u, half);
+            u = wasm_i32x4_add(u, ihalf);
             wasm_v128_store(dp, u);
           }
         }
@@ -359,32 +374,35 @@ namespace ojph {
 
         if (is_signed)
         {
-          v128_t zero = wasm_i32x4_splat(0);
+          const v128_t zero = wasm_f32x4_splat(0.0f);
+          const v128_t half = wasm_f32x4_splat(0.5f);
           v128_t bias = wasm_i32x4_splat(-((1 << (bit_depth - 1)) + 1));                   
           for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) {
             v128_t t = wasm_v128_load(sp);
             t = wasm_f32x4_mul(t, mul);
-            v128_t u = wasm_i32x4_trunc_sat_f32x4(t);
+            v128_t u = ojph_convert_float_to_i32(t, zero, half);
             u = ojph_wasm_i32x4_max_ge(u, s32_lower_limit, t, fl_lower_limit);
             u = ojph_wasm_i32x4_min_lt(u, s32_upper_limit, t, fl_upper_limit);
             v128_t c = wasm_i32x4_gt(zero, u);    //0xFFFFFFFF for -ve value
             v128_t neg = wasm_i32x4_sub(bias, u); //-bias -value
             neg = wasm_v128_and(c, neg);          //keep only - bias - value
-            v128_t v = wasm_v128_andnot(c, u);    //keep only +ve or 0
+            v128_t v = wasm_v128_andnot(u, c);    //keep only +ve or 0
             v = wasm_v128_or(neg, v);             //combine
             wasm_v128_store(dp, v);
           }
         }
         else
         {
-          v128_t half = wasm_i32x4_splat(-(1 << (bit_depth - 1)));
+          const v128_t zero = wasm_f32x4_splat(0.0f);
+          const v128_t half = wasm_f32x4_splat(0.5f);
+          v128_t ihalf = wasm_i32x4_splat(-(1 << (bit_depth - 1)));
           for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) {
             v128_t t = wasm_v128_load(sp);
             t = wasm_f32x4_mul(t, mul);
-            v128_t u = wasm_i32x4_trunc_sat_f32x4(t);
+            v128_t u = ojph_convert_float_to_i32(t, zero, half);
             u = ojph_wasm_i32x4_max_ge(u, s32_lower_limit, t, fl_lower_limit);
             u = ojph_wasm_i32x4_min_lt(u, s32_upper_limit, t, fl_upper_limit);
-            u = wasm_i32x4_add(u, half);
+            u = wasm_i32x4_add(u, ihalf);
             wasm_v128_store(dp, u);
           }
         }
@@ -416,7 +434,7 @@ namespace ojph {
           v128_t c = wasm_i32x4_lt(u, zero);    // 0xFFFFFFFF for -ve value
           v128_t neg = wasm_i32x4_sub(bias, u); // - bias - value
           neg = wasm_v128_and(c, neg);          // keep only - bias - value
-          t = wasm_v128_andnot(c, u);           // keep only +ve or 0
+          t = wasm_v128_andnot(u, c);           // keep only +ve or 0
           u = wasm_v128_or(neg, t);             // combine
           v128_t v = wasm_f32x4_convert_i32x4(u);
           v = wasm_f32x4_mul(v, mul);

From 9ec50da9c515c00f2c3eae710f414f267ecaf1d7 Mon Sep 17 00:00:00 2001
From: Aous Naman <aous72@yahoo.com>
Date: Thu, 23 Jan 2025 17:50:03 +1100
Subject: [PATCH 11/20] A bug fix.

---
 src/core/transform/ojph_colour_avx2.cpp | 2 +-
 src/core/transform/ojph_colour_sse2.cpp | 2 +-
 src/core/transform/ojph_colour_wasm.cpp | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/core/transform/ojph_colour_avx2.cpp b/src/core/transform/ojph_colour_avx2.cpp
index 1c9da81e..45391370 100644
--- a/src/core/transform/ojph_colour_avx2.cpp
+++ b/src/core/transform/ojph_colour_avx2.cpp
@@ -414,7 +414,7 @@ namespace ojph {
           __m256i u = _mm256_slli_epi32(t, shift);
           __m256 v = _mm256_cvtepi32_ps(u);
           v = _mm256_mul_ps(v, mul);
-          v = _mm256_add_ps(v, half);
+          v = _mm256_sub_ps(v, half);
           _mm256_storeu_ps(dp, v);
         }
       }
diff --git a/src/core/transform/ojph_colour_sse2.cpp b/src/core/transform/ojph_colour_sse2.cpp
index e05ceb5e..208b9616 100644
--- a/src/core/transform/ojph_colour_sse2.cpp
+++ b/src/core/transform/ojph_colour_sse2.cpp
@@ -480,7 +480,7 @@ namespace ojph {
           __m128i u = _mm_slli_epi32(t, shift);
           __m128 v = _mm_cvtepi32_ps(u);
           v = _mm_mul_ps(v, mul);
-          v = _mm_add_ps(v, half);
+          v = _mm_sub_ps(v, half);
           _mm_storeu_ps(dp, v);
         }
       }
diff --git a/src/core/transform/ojph_colour_wasm.cpp b/src/core/transform/ojph_colour_wasm.cpp
index 10114b9b..8e354784 100644
--- a/src/core/transform/ojph_colour_wasm.cpp
+++ b/src/core/transform/ojph_colour_wasm.cpp
@@ -449,7 +449,7 @@ namespace ojph {
           v128_t u = wasm_i32x4_shl(t, shift);
           v128_t v = wasm_f32x4_convert_i32x4(u);
           v = wasm_f32x4_mul(v, mul);
-          v = wasm_f32x4_add(v, half);
+          v = wasm_f32x4_sub(v, half);
           wasm_v128_store(dp, v);
         }
       }

From ef9f71304ed4161102492a68f86bf5a7eb8e2e3e Mon Sep 17 00:00:00 2001
From: Aous Naman <aous72@yahoo.com>
Date: Sat, 25 Jan 2025 10:38:04 +1100
Subject: [PATCH 12/20] Bug Fixes.

---
 src/core/common/ojph_arch.h             |  12 --
 src/core/transform/ojph_colour.cpp      | 191 +++++++----------
 src/core/transform/ojph_colour_avx2.cpp | 263 ++++++++++--------------
 src/core/transform/ojph_colour_sse2.cpp | 254 ++++++++++-------------
 4 files changed, 288 insertions(+), 432 deletions(-)

diff --git a/src/core/common/ojph_arch.h b/src/core/common/ojph_arch.h
index 33e434a0..29ab7a57 100644
--- a/src/core/common/ojph_arch.h
+++ b/src/core/common/ojph_arch.h
@@ -271,18 +271,6 @@ namespace ojph {
   #endif
   }
 
-  ////////////////////////////////////////////////////////////////////////////
-  static inline si64 ojph_round64(float val)
-  {
-  #ifdef OJPH_COMPILER_MSVC
-    return (si64)(val + (val >= 0.0f ? 0.5f : -0.5f));
-  #elif (defined OJPH_COMPILER_GNUC)
-    return (si64)(val + (val >= 0.0f ? 0.5f : -0.5f));
-  #else
-    return (si64)round(val);
-  #endif
-  }
-
   ////////////////////////////////////////////////////////////////////////////
   static inline si32 ojph_trunc(float val)
   {
diff --git a/src/core/transform/ojph_colour.cpp b/src/core/transform/ojph_colour.cpp
index 44028037..792929b8 100644
--- a/src/core/transform/ojph_colour.cpp
+++ b/src/core/transform/ojph_colour.cpp
@@ -2,21 +2,21 @@
 // This software is released under the 2-Clause BSD license, included
 // below.
 //
-// Copyright (c) 2019, Aous Naman 
+// Copyright (c) 2019, Aous Naman
 // Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
 // Copyright (c) 2019, The University of New South Wales, Australia
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
-// 
+//
 // 1. Redistributions of source code must retain the above copyright
 // notice, this list of conditions and the following disclaimer.
-// 
+//
 // 2. Redistributions in binary form must reproduce the above copyright
 // notice, this list of conditions and the following disclaimer in the
 // documentation and/or other materials provided with the distribution.
-// 
+//
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
 // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
@@ -53,14 +53,14 @@ namespace ojph {
 
     //////////////////////////////////////////////////////////////////////////
     void (*rev_convert)
-      (const line_buf *src_line, const ui32 src_line_offset, 
-       line_buf *dst_line, const ui32 dst_line_offset, 
+      (const line_buf *src_line, const ui32 src_line_offset,
+       line_buf *dst_line, const ui32 dst_line_offset,
        si64 shift, ui32 width) = NULL;
 
     //////////////////////////////////////////////////////////////////////////
     void (*rev_convert_nlt_type3)
-      (const line_buf *src_line, const ui32 src_line_offset, 
-       line_buf *dst_line, const ui32 dst_line_offset, 
+      (const line_buf *src_line, const ui32 src_line_offset,
+       line_buf *dst_line, const ui32 dst_line_offset,
        si64 shift, ui32 width) = NULL;
 
     //////////////////////////////////////////////////////////////////////////
@@ -70,7 +70,7 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
     void (*cnvrt_si32_to_float)
       (const si32 *sp, float *dp, float mul, ui32 width) = NULL;
-      
+
     //////////////////////////////////////////////////////////////////////////
     void (*cnvrt_float_to_si32_shftd)
       (const float *sp, si32 *dp, float mul, ui32 width) = NULL;
@@ -81,14 +81,14 @@ namespace ojph {
 
     //////////////////////////////////////////////////////////////////////////
     void (*irv_convert_to_integer_nlt_type3) (
-      const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset, 
+      const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset,
       ui32 bit_depth, bool is_signed, ui32 width) = NULL;
 
     //////////////////////////////////////////////////////////////////////////
     void (*irv_convert_to_float_nlt_type3) (
       const line_buf *src_line, ui32 src_line_offset,
       line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width) = NULL;
-      
+
     //////////////////////////////////////////////////////////////////////////
     void (*rct_forward)
       (const line_buf* r, const line_buf* g, const line_buf* b,
@@ -192,7 +192,7 @@ namespace ojph {
       #endif // !OJPH_DISABLE_AVX2
 
     #elif defined(OJPH_ARCH_ARM)
-    
+
     #endif // !(defined(OJPH_ARCH_X86_64) || defined(OJPH_ARCH_I386))
 
   #endif // !OJPH_DISABLE_SIMD
@@ -236,12 +236,12 @@ namespace ojph {
 
     //////////////////////////////////////////////////////////////////////////
     void gen_rev_convert(
-      const line_buf *src_line, const ui32 src_line_offset, 
-      line_buf *dst_line, const ui32 dst_line_offset, 
+      const line_buf *src_line, const ui32 src_line_offset,
+      line_buf *dst_line, const ui32 dst_line_offset,
       si64 shift, ui32 width)
     {
       if (src_line->flags & line_buf::LFT_32BIT)
-      { 
+      {
         if (dst_line->flags & line_buf::LFT_32BIT)
         {
           const si32 *sp = src_line->i32 + src_line_offset;
@@ -250,7 +250,7 @@ namespace ojph {
           for (ui32 i = width; i > 0; --i)
             *dp++ = *sp++ + s;
         }
-        else 
+        else
         {
           const si32 *sp = src_line->i32 + src_line_offset;
           si64 *dp = dst_line->i64 + dst_line_offset;
@@ -258,7 +258,7 @@ namespace ojph {
             *dp++ = *sp++ + shift;
         }
       }
-      else 
+      else
       {
         assert(src_line->flags & line_buf::LFT_64BIT);
         assert(dst_line->flags & line_buf::LFT_32BIT);
@@ -271,12 +271,12 @@ namespace ojph {
 
     //////////////////////////////////////////////////////////////////////////
     void gen_rev_convert_nlt_type3(
-      const line_buf *src_line, const ui32 src_line_offset, 
-      line_buf *dst_line, const ui32 dst_line_offset, 
+      const line_buf *src_line, const ui32 src_line_offset,
+      line_buf *dst_line, const ui32 dst_line_offset,
       si64 shift, ui32 width)
     {
       if (src_line->flags & line_buf::LFT_32BIT)
-      { 
+      {
         if (dst_line->flags & line_buf::LFT_32BIT)
         {
           const si32 *sp = src_line->i32 + src_line_offset;
@@ -287,7 +287,7 @@ namespace ojph {
             *dp++ = v >= 0 ? v : (- v - s);
           }
         }
-        else 
+        else
         {
           const si32 *sp = src_line->i32 + src_line_offset;
           si64 *dp = dst_line->i64 + dst_line_offset;
@@ -297,7 +297,7 @@ namespace ojph {
           }
         }
       }
-      else 
+      else
       {
         assert(src_line->flags & line_buf::LFT_64BIT);
         assert(dst_line->flags & line_buf::LFT_32BIT);
@@ -315,7 +315,7 @@ namespace ojph {
                                        ui32 width)
     {
       for (ui32 i = width; i > 0; --i)
-        *dp++ = (float)*sp++ * mul - 0.5f;
+        *dp++ = (float)(ui32)*sp++ * mul - 0.5f;
     }
 
     //////////////////////////////////////////////////////////////////////////
@@ -331,7 +331,7 @@ namespace ojph {
                                        ui32 width)
     {
       for (ui32 i = width; i > 0; --i)
-        *dp++ = ojph_round((*sp++ + 0.5f) * mul);
+        *dp++ = (si32)ojph_round((*sp++ + 0.5f) * mul);
     }
 
     //////////////////////////////////////////////////////////////////////////
@@ -343,7 +343,7 @@ namespace ojph {
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void gen_irv_convert_to_integer_nlt_type3(const line_buf *src_line, 
+    void gen_irv_convert_to_integer_nlt_type3(const line_buf *src_line,
       line_buf *dst_line, ui32 dst_line_offset,
       ui32 bit_depth, bool is_signed, ui32 width)
     {
@@ -351,85 +351,51 @@ namespace ojph {
              (src_line->flags & line_buf::LFT_INTEGER) == 0 &&
              (dst_line->flags & line_buf::LFT_32BIT) &&
              (dst_line->flags & line_buf::LFT_INTEGER));
-      
+
+      assert(bit_depth <= 32);
       const float* sp = src_line->f32;
       si32* dp = dst_line->i32 + dst_line_offset;
-      if (bit_depth <= 30) 
+      // Converting the scaled value to an integer can exceed the dynamic
+      // range of a 32-bit integer; therefore, care must be exercised.
+      // We check whether the floating-point sample lies outside the
+      // half-closed interval [-0.5f, 0.5f). If so, we clamp the resulting
+      // integer to the minimum/maximum value representable at the given
+      // bit depth.
+      si32 neg_limit = (si32)INT_MIN >> (32 - bit_depth);
+      float mul = (float)(1ull << bit_depth);
+      float fl_up_lim = -(float)neg_limit; // val < upper
+      float fl_low_lim = (float)neg_limit; // val >= lower
+      si32 s32_up_lim = INT_MAX >> (32 - bit_depth);
+      si32 s32_low_lim = INT_MIN >> (32 - bit_depth);
+
+      if (is_signed)
       {
-        // We are leaving two bit overhead -- here, we are assuming that after
-        // multiplications, the resulting number can still be represented
-        // using 32 bit integer
-        float mul = (float)(1 << bit_depth);
-        const si32 upper_limit = INT_MAX >> (32 - bit_depth);
-        const si32 lower_limit = INT_MIN >> (32 - bit_depth);
-
-        if (is_signed)
-        {
-          const si32 bias = (1 << (bit_depth - 1)) + 1;
-          for (ui32 i = width; i > 0; --i) {
-            si32 v = ojph_round(*sp++ * mul);
-            v = ojph_max(v, lower_limit);
-            v = ojph_min(v, upper_limit);
-            v = (v >= 0) ? v : (- v - bias);
-            *dp++ = v;
-          }
-        }
-        else
-        {
-          const si32 half = (1 << (bit_depth - 1));
-          for (ui32 i = width; i > 0; --i) {
-            si32 v = ojph_round(*sp++ * mul);
-            v = ojph_max(v, lower_limit);
-            v = ojph_min(v, upper_limit);
-            *dp++ = v + half;
-          }
+        const si32 bias = (1 << (bit_depth - 1)) + 1;
+        for (ui32 i = width; i > 0; --i) {
+          float t = *sp++ * mul;
+          si32 v = ojph_round(t);
+          v = t >= fl_low_lim ? v : s32_low_lim;
+          v = t <  fl_up_lim  ? v : s32_up_lim;
+          v = (v >= 0) ? v : (- v - bias);
+          *dp++ = v;
         }
       }
       else
       {
-        // There is the possibility that converting to integer will
-        // exceed the dynamic range of 32bit integer; therefore, we need
-        // to use 64 bit.  One may think, why not limit the floats to the
-        // range of [-0.5f, 0.5f)? 
-        // Notice the half closed range -- we need a value just below 0.5f.
-        // While getting this number is possible, after multiplication, the
-        // resulting number will not be exactly the maximum that the integer 
-        // can achieve.  All this is academic, because here are talking
-        // about a number which has all the exponent bits set, meaning 
-        // it is either infinity, -infinity, qNan or sNan.
-        float mul = (float)(1ull << bit_depth);
-        const si64 upper_limit = (si64)LLONG_MAX >> (64 - bit_depth);
-        const si64 lower_limit = (si64)LLONG_MIN >> (64 - bit_depth);
-
-        if (is_signed)
-        {
-          const si32 bias = (1 << (bit_depth - 1)) + 1;
-          for (ui32 i = width; i > 0; --i) {
-            si64 t = ojph_round64(*sp++ * mul);
-            t = ojph_max(t, lower_limit);
-            t = ojph_min(t, upper_limit);
-            si32 v = (si32)t;
-            v = (v >= 0) ? v : (- v - bias);
-            *dp++ = v;
-          }
-        }
-        else
-        {
-          const si32 half = (1 << (bit_depth - 1));
-          for (ui32 i = width; i > 0; --i) {
-            si64 t = ojph_round64(*sp++ * mul);
-            t = ojph_max(t, lower_limit);
-            t = ojph_min(t, upper_limit);
-            si32 v = (si32)t;
-            *dp++ = v + half;
-          }
+        const si32 half = (1 << (bit_depth - 1));
+        for (ui32 i = width; i > 0; --i) {
+          float t = *sp++ * mul;
+          si32 v = ojph_round(t);
+          v = t >= fl_low_lim ? v : s32_low_lim;
+          v = t <  fl_up_lim  ? v : s32_up_lim;
+          *dp++ = v + half;
         }
       }
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void gen_irv_convert_to_float_nlt_type3(const line_buf *src_line, 
-      ui32 src_line_offset, line_buf *dst_line, 
+    void gen_irv_convert_to_float_nlt_type3(const line_buf *src_line,
+      ui32 src_line_offset, line_buf *dst_line,
       ui32 bit_depth, bool is_signed, ui32 width)
     {
       assert((src_line->flags & line_buf::LFT_32BIT) &&
@@ -453,9 +419,12 @@ namespace ojph {
       }
       else
       {
+        const si32 half = INT_MIN;
         for (ui32 i = width; i > 0; --i) {
-          si32 v = *sp++ << shift;
-          *dp++ = (float)v * mul - 0.5f;
+          ui32 v = (ui32)*sp++;          // unsigned math wraps; no signed UB
+          v <<= shift;                   // move sample to the top bits
+          v -= (ui32)half;               // subtract 2^31 modulo 2^32
+          *dp++ = (float)(si32)v * mul;  // reinterpret as signed, then scale
         }
       }
     }
@@ -466,20 +435,20 @@ namespace ojph {
       line_buf *y, line_buf *cb, line_buf *cr, ui32 repeat)
     {
       assert((y->flags  & line_buf::LFT_INTEGER) &&
-             (cb->flags & line_buf::LFT_INTEGER) && 
+             (cb->flags & line_buf::LFT_INTEGER) &&
              (cr->flags & line_buf::LFT_INTEGER) &&
              (r->flags  & line_buf::LFT_INTEGER) &&
-             (g->flags  & line_buf::LFT_INTEGER) && 
+             (g->flags  & line_buf::LFT_INTEGER) &&
              (b->flags  & line_buf::LFT_INTEGER));
-      
+
       if  (y->flags & line_buf::LFT_32BIT)
       {
         assert((y->flags  & line_buf::LFT_32BIT) &&
-               (cb->flags & line_buf::LFT_32BIT) && 
+               (cb->flags & line_buf::LFT_32BIT) &&
                (cr->flags & line_buf::LFT_32BIT) &&
                (r->flags  & line_buf::LFT_32BIT) &&
-               (g->flags  & line_buf::LFT_32BIT) && 
-               (b->flags  & line_buf::LFT_32BIT));        
+               (g->flags  & line_buf::LFT_32BIT) &&
+               (b->flags  & line_buf::LFT_32BIT));
         const si32 *rp = r->i32, * gp = g->i32, * bp = b->i32;
         si32 *yp = y->i32, * cbp = cb->i32, * crp = cr->i32;
         for (ui32 i = repeat; i > 0; --i)
@@ -490,13 +459,13 @@ namespace ojph {
           *crp++ = (rr - gg);
         }
       }
-      else 
+      else
       {
         assert((y->flags  & line_buf::LFT_64BIT) &&
-               (cb->flags & line_buf::LFT_64BIT) && 
+               (cb->flags & line_buf::LFT_64BIT) &&
                (cr->flags & line_buf::LFT_64BIT) &&
                (r->flags  & line_buf::LFT_32BIT) &&
-               (g->flags  & line_buf::LFT_32BIT) && 
+               (g->flags  & line_buf::LFT_32BIT) &&
                (b->flags  & line_buf::LFT_32BIT));
         const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
         si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64;
@@ -516,19 +485,19 @@ namespace ojph {
       line_buf *r, line_buf *g, line_buf *b, ui32 repeat)
     {
       assert((y->flags  & line_buf::LFT_INTEGER) &&
-             (cb->flags & line_buf::LFT_INTEGER) && 
+             (cb->flags & line_buf::LFT_INTEGER) &&
              (cr->flags & line_buf::LFT_INTEGER) &&
              (r->flags  & line_buf::LFT_INTEGER) &&
-             (g->flags  & line_buf::LFT_INTEGER) && 
+             (g->flags  & line_buf::LFT_INTEGER) &&
              (b->flags  & line_buf::LFT_INTEGER));
 
       if (y->flags & line_buf::LFT_32BIT)
       {
         assert((y->flags  & line_buf::LFT_32BIT) &&
-               (cb->flags & line_buf::LFT_32BIT) && 
+               (cb->flags & line_buf::LFT_32BIT) &&
                (cr->flags & line_buf::LFT_32BIT) &&
                (r->flags  & line_buf::LFT_32BIT) &&
-               (g->flags  & line_buf::LFT_32BIT) && 
+               (g->flags  & line_buf::LFT_32BIT) &&
                (b->flags  & line_buf::LFT_32BIT));
         const si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32;
         si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
@@ -544,11 +513,11 @@ namespace ojph {
       else
       {
         assert((y->flags  & line_buf::LFT_64BIT) &&
-               (cb->flags & line_buf::LFT_64BIT) && 
+               (cb->flags & line_buf::LFT_64BIT) &&
                (cr->flags & line_buf::LFT_64BIT) &&
                (r->flags  & line_buf::LFT_32BIT) &&
-               (g->flags  & line_buf::LFT_32BIT) && 
-               (b->flags  & line_buf::LFT_32BIT));   
+               (g->flags  & line_buf::LFT_32BIT) &&
+               (b->flags  & line_buf::LFT_32BIT));
         const si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64;
         si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
         for (ui32 i = repeat; i > 0; --i)
diff --git a/src/core/transform/ojph_colour_avx2.cpp b/src/core/transform/ojph_colour_avx2.cpp
index 45391370..cb2bf000 100644
--- a/src/core/transform/ojph_colour_avx2.cpp
+++ b/src/core/transform/ojph_colour_avx2.cpp
@@ -2,21 +2,21 @@
 // This software is released under the 2-Clause BSD license, included
 // below.
 //
-// Copyright (c) 2019, Aous Naman 
+// Copyright (c) 2019, Aous Naman
 // Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
 // Copyright (c) 2019, The University of New South Wales, Australia
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
-// 
+//
 // 1. Redistributions of source code must retain the above copyright
 // notice, this list of conditions and the following disclaimer.
-// 
+//
 // 2. Redistributions in binary form must reproduce the above copyright
 // notice, this list of conditions and the following disclaimer in the
 // documentation and/or other materials provided with the distribution.
-// 
+//
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
 // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
@@ -50,8 +50,8 @@ namespace ojph {
 
     /////////////////////////////////////////////////////////////////////////
     // https://github.com/seung-lab/dijkstra3d/blob/master/libdivide.h
-    static inline 
-    __m256i avx2_mm256_srai_epi64(__m256i a, int amt, __m256i m) 
+    static inline
+    __m256i avx2_mm256_srai_epi64(__m256i a, int amt, __m256i m)
     {
       // note than m must be obtained using
       // __m256i m = _mm256_set1_epi64x(1ULL << (63 - amt));
@@ -62,14 +62,14 @@ namespace ojph {
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void avx2_rev_convert(const line_buf *src_line, 
+    void avx2_rev_convert(const line_buf *src_line,
                           const ui32 src_line_offset,
-                          line_buf *dst_line, 
-                          const ui32 dst_line_offset, 
+                          line_buf *dst_line,
+                          const ui32 dst_line_offset,
                           si64 shift, ui32 width)
     {
       if (src_line->flags & line_buf::LFT_32BIT)
-      { 
+      {
         if (dst_line->flags & line_buf::LFT_32BIT)
         {
           const si32 *sp = src_line->i32 + src_line_offset;
@@ -80,9 +80,9 @@ namespace ojph {
             __m256i s = _mm256_loadu_si256((__m256i*)sp);
             s = _mm256_add_epi32(s, sh);
             _mm256_storeu_si256((__m256i*)dp, s);
-          }            
+          }
         }
-        else 
+        else
         {
           const si32 *sp = src_line->i32 + src_line_offset;
           si64 *dp = dst_line->i64 + dst_line_offset;
@@ -91,18 +91,18 @@ namespace ojph {
           {
             __m256i s, t;
             s = _mm256_loadu_si256((__m256i*)sp);
-            
+
             t = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(s, 0));
             t = _mm256_add_epi64(t, sh);
             _mm256_storeu_si256((__m256i*)dp, t);
-            
+
             t = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(s, 1));
             t = _mm256_add_epi64(t, sh);
             _mm256_storeu_si256((__m256i*)dp + 1, t);
-          }            
+          }
         }
       }
-      else 
+      else
       {
         assert(src_line->flags | line_buf::LFT_64BIT);
         assert(dst_line->flags | line_buf::LFT_32BIT);
@@ -125,23 +125,23 @@ namespace ojph {
 
           s = _mm256_shuffle_epi32(s, _MM_SHUFFLE(2, 0, 0, 0));
           s = _mm256_andnot_si256(low_bits, s);
-          
+
           t = _mm256_or_si256(s, t);
           t = _mm256_permute4x64_epi64(t, _MM_SHUFFLE(3, 1, 2, 0));
           _mm256_storeu_si256((__m256i*)dp, t);
-        }            
+        }
       }
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void avx2_rev_convert_nlt_type3(const line_buf *src_line, 
-                                    const ui32 src_line_offset, 
-                                    line_buf *dst_line, 
-                                    const ui32 dst_line_offset, 
+    void avx2_rev_convert_nlt_type3(const line_buf *src_line,
+                                    const ui32 src_line_offset,
+                                    line_buf *dst_line,
+                                    const ui32 dst_line_offset,
                                     si64 shift, ui32 width)
     {
       if (src_line->flags & line_buf::LFT_32BIT)
-      { 
+      {
         if (dst_line->flags & line_buf::LFT_32BIT)
         {
           const si32 *sp = src_line->i32 + src_line_offset;
@@ -152,14 +152,14 @@ namespace ojph {
           {
             __m256i s = _mm256_loadu_si256((__m256i*)sp);
             __m256i c = _mm256_cmpgt_epi32(zero, s);  // 0xFFFFFFFF for -ve val
-            __m256i v_m_sh = _mm256_sub_epi32(sh, s); // - shift - value 
+            __m256i v_m_sh = _mm256_sub_epi32(sh, s); // - shift - value
             v_m_sh = _mm256_and_si256(c, v_m_sh);     // keep only -shift-val
             s = _mm256_andnot_si256(c, s);            // keep only +ve or 0
             s = _mm256_or_si256(s, v_m_sh);           // combine
             _mm256_storeu_si256((__m256i*)dp, s);
           }
         }
-        else 
+        else
         {
           const si32 *sp = src_line->i32 + src_line_offset;
           si64 *dp = dst_line->i64 + dst_line_offset;
@@ -174,7 +174,7 @@ namespace ojph {
             u0 = _mm256_unpacklo_epi32(s, t);     // correct 64bit data
             c = _mm256_unpacklo_epi32(t, t);      // 64bit -1 for -ve value
 
-            v_m_sh = _mm256_sub_epi64(sh, u0);    // - shift - value 
+            v_m_sh = _mm256_sub_epi64(sh, u0);    // - shift - value
             v_m_sh = _mm256_and_si256(c, v_m_sh); // keep only - shift - value
             u0 = _mm256_andnot_si256(c, u0);      // keep only +ve or 0
             u0 = _mm256_or_si256(u0, v_m_sh);     // combine
@@ -182,7 +182,7 @@ namespace ojph {
             u1 = _mm256_unpackhi_epi32(s, t);     // correct 64bit data
             c = _mm256_unpackhi_epi32(t, t);      // 64bit -1 for -ve value
 
-            v_m_sh = _mm256_sub_epi64(sh, u1);    // - shift - value 
+            v_m_sh = _mm256_sub_epi64(sh, u1);    // - shift - value
             v_m_sh = _mm256_and_si256(c, v_m_sh); // keep only - shift - value
             u1 = _mm256_andnot_si256(c, u1);      // keep only +ve or 0
             u1 = _mm256_or_si256(u1, v_m_sh);     // combine
@@ -195,7 +195,7 @@ namespace ojph {
           }
         }
       }
-      else 
+      else
       {
         assert(src_line->flags | line_buf::LFT_64BIT);
         assert(dst_line->flags | line_buf::LFT_32BIT);
@@ -211,7 +211,7 @@ namespace ojph {
           // m for mask, and tm for temp
           __m256i s, t, p, n, m, tm;
           s = _mm256_loadu_si256((__m256i*)sp);
-          
+
           m = _mm256_cmpgt_epi64(zero, s);    // 64b -1 for -ve value
           tm = _mm256_sub_epi64(sh, s);       // - shift - value
           n = _mm256_and_si256(m, tm);        // -ve
@@ -263,7 +263,7 @@ namespace ojph {
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void avx2_irv_convert_to_integer_nlt_type3(const line_buf *src_line, 
+    void avx2_irv_convert_to_integer_nlt_type3(const line_buf *src_line,
       line_buf *dst_line, ui32 dst_line_offset,
       ui32 bit_depth, bool is_signed, ui32 width)
     {
@@ -271,112 +271,59 @@ namespace ojph {
              (src_line->flags & line_buf::LFT_INTEGER) == 0 &&
              (dst_line->flags & line_buf::LFT_32BIT) &&
              (dst_line->flags & line_buf::LFT_INTEGER));
-      
+
+      assert(bit_depth <= 32);
       const float* sp = src_line->f32;
       si32* dp = dst_line->i32 + dst_line_offset;
-      if (bit_depth <= 30) 
+      // Converting the scaled value to an integer can exceed the dynamic
+      // range of a 32-bit integer; therefore, care must be exercised.
+      // We check whether the floating-point sample lies outside the
+      // half-closed interval [-0.5f, 0.5f). If so, we clamp the resulting
+      // integer to the minimum/maximum value representable at the given
+      // bit depth.
+      si32 neg_limit = (si32)INT_MIN >> (32 - bit_depth);
+      __m256 mul = _mm256_set1_ps((float)(1ull << bit_depth));
+      __m256 fl_up_lim = _mm256_set1_ps(-(float)neg_limit);  // val < upper
+      __m256 fl_low_lim = _mm256_set1_ps((float)neg_limit);  // val >= lower
+      __m256i s32_up_lim = _mm256_set1_epi32(INT_MAX >> (32 - bit_depth));
+      __m256i s32_low_lim = _mm256_set1_epi32(INT_MIN >> (32 - bit_depth));
+
+      if (is_signed)
       {
-        // We are leaving two bit overhead -- here, we are assuming that after
-        // multiplications, the resulting number can still be represented
-        // using 32 bit integer
-        __m256 mul = _mm256_set1_ps((float)(1 << bit_depth));
-        __m256i upper_limit = _mm256_set1_epi32(INT_MAX >> (32 - bit_depth));
-        __m256i lower_limit = _mm256_set1_epi32(INT_MIN >> (32 - bit_depth));
-
-        if (is_signed)
-        {
-          __m256i zero = _mm256_setzero_si256();
-          __m256i bias = _mm256_set1_epi32(-((1 << (bit_depth - 1)) + 1));
-          for (ui32 i = width; i > 0; i -= 8, sp += 8, dp += 8) 
-          {
-            __m256 t = _mm256_loadu_ps(sp);
-            t = _mm256_mul_ps(t, mul);
-            t = _mm256_round_ps(t, 
-              _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
-            __m256i u = _mm256_cvtps_epi32(t);
-            u = _mm256_max_epi32(u, lower_limit);
-            u = _mm256_min_epi32(u, upper_limit);
-
-            __m256i c = _mm256_cmpgt_epi32(zero, u); //0xFFFFFFFF for -ve value
-            __m256i neg = _mm256_sub_epi32(bias, u); //-bias -value
-            neg = _mm256_and_si256(c, neg);          //keep only - bias - value
-            __m256i v = _mm256_andnot_si256(c, u);   //keep only +ve or 0
-            v = _mm256_or_si256(neg, v);             //combine
-            _mm256_storeu_si256((__m256i*)dp, v);
-          }
-        }
-        else
-        {
-          __m256i half = _mm256_set1_epi32(-(1 << (bit_depth - 1)));
-          for (ui32 i = width; i > 0; i -= 8, sp += 8, dp += 8) {
-            __m256 t = _mm256_loadu_ps(sp);
-            t = _mm256_mul_ps(t, mul);
-            t = _mm256_round_ps(t, 
-              _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
-            __m256i u = _mm256_cvtps_epi32(t);
-            u = _mm256_max_epi32(u, lower_limit);
-            u = _mm256_min_epi32(u, upper_limit);
-            u = _mm256_add_epi32(u, half);
-            _mm256_storeu_si256((__m256i*)dp, u);
-          }
+        __m256i zero = _mm256_setzero_si256();
+        __m256i bias = _mm256_set1_epi32(-((1 << (bit_depth - 1)) + 1));
+        for (si32 i = (si32)width; i > 0; i -= 8, sp += 8, dp += 8) {
+          __m256 t = _mm256_loadu_ps(sp);          //8 floats per iteration
+          t = _mm256_mul_ps(t, mul);
+          __m256i u = _mm256_cvtps_epi32(t);
+          u = ojph_mm256_max_ge_epi32(u, s32_low_lim, t, fl_low_lim);
+          u = ojph_mm256_min_lt_epi32(u,  s32_up_lim, t,  fl_up_lim);
+          __m256i c = _mm256_cmpgt_epi32(zero, u); //0xFFFFFFFF for -ve value
+          __m256i neg = _mm256_sub_epi32(bias, u); //-bias -value
+          neg = _mm256_and_si256(c, neg);          //keep only - bias - value
+          __m256i v = _mm256_andnot_si256(c, u);   //keep only +ve or 0
+          v = _mm256_or_si256(neg, v);             //combine
+          _mm256_storeu_si256((__m256i*)dp, v);
         }
       }
       else
       {
-        // There is the possibility that converting to integer will
-        // exceed the dynamic range of 32bit integer; therefore, we need
-        // to use 64 bit.  One may think, why not limit the floats to the
-        // range of [-0.5f, 0.5f)? 
-        // Notice the half closed range -- we need a value just below 0.5f.
-        // While getting this number is possible, after multiplication, the
-        // resulting number will not be exactly the maximum that the integer 
-        // can achieve.  All this is academic, because here are talking
-        // about a number which has all the exponent bits set, meaning 
-        // it is either infinity, -infinity, qNan or sNan.
-        si64 neg_limit = (si64)LLONG_MIN >> (64 - bit_depth);
-        __m256 mul = _mm256_set1_ps((float)(1 << bit_depth));
-        __m256 fl_up_lim = _mm256_set1_ps(-(float)neg_limit);  // val < upper
-        __m256 fl_low_lim = _mm256_set1_ps((float)neg_limit);  // val >= lower
-        __m256i s32_up_lim = _mm256_set1_epi32(INT_MAX >> (32 - bit_depth));
-        __m256i s32_low_lim = _mm256_set1_epi32(INT_MIN >> (32 - bit_depth));
-
-        if (is_signed)
-        {
-          __m256i zero = _mm256_setzero_si256();
-          __m256i bias = _mm256_set1_epi32(-((1 << (bit_depth - 1)) + 1));                   
-          for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) {
-            __m256 t = _mm256_loadu_ps(sp);
-            t = _mm256_mul_ps(t, mul);
-            __m256i u = _mm256_cvtps_epi32(t);
-            u = ojph_mm256_max_ge_epi32(u, s32_low_lim, t, fl_low_lim);
-            u = ojph_mm256_min_lt_epi32(u,  s32_up_lim, t,  fl_up_lim);
-            __m256i c = _mm256_cmpgt_epi32(zero, u); //0xFFFFFFFF for -ve value
-            __m256i neg = _mm256_sub_epi32(bias, u); //-bias -value
-            neg = _mm256_and_si256(c, neg);          //keep only - bias - value
-            __m256i v = _mm256_andnot_si256(c, u);   //keep only +ve or 0
-            v = _mm256_or_si256(neg, v);             //combine
-            _mm256_storeu_si256((__m256i*)dp, v);
-          }
-        }
-        else
-        {
-          __m256i half = _mm256_set1_epi32(-(1 << (bit_depth - 1)));
-          for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) {
-            __m256 t = _mm256_loadu_ps(sp);
-            t = _mm256_mul_ps(t, mul);
-            __m256i u = _mm256_cvtps_epi32(t);
-            u = ojph_mm256_max_ge_epi32(u, s32_low_lim, t, fl_low_lim);
-            u = ojph_mm256_min_lt_epi32(u,  s32_up_lim, t,  fl_up_lim);
-            u = _mm256_add_epi32(u, half);
-            _mm256_storeu_si256((__m256i*)dp, u);
-          }
+        __m256i half = _mm256_set1_epi32((si32)(1u << (bit_depth - 1)));
+        for (si32 i = (si32)width; i > 0; i -= 8, sp += 8, dp += 8) {
+          __m256 t = _mm256_loadu_ps(sp);          //8 floats per iteration
+          t = _mm256_mul_ps(t, mul);
+          __m256i u = _mm256_cvtps_epi32(t);
+          u = ojph_mm256_max_ge_epi32(u, s32_low_lim, t, fl_low_lim);
+          u = ojph_mm256_min_lt_epi32(u,  s32_up_lim, t,  fl_up_lim);
+          u = _mm256_add_epi32(u, half);           //shift to [0, 2^bit_depth)
+          _mm256_storeu_si256((__m256i*)dp, u);
         }
       }
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void avx2_irv_convert_to_float_nlt_type3(const line_buf *src_line, 
-      ui32 src_line_offset, line_buf *dst_line, 
+    void avx2_irv_convert_to_float_nlt_type3(const line_buf *src_line,
+      ui32 src_line_offset, line_buf *dst_line,
       ui32 bit_depth, bool is_signed, ui32 width)
     {
       assert((src_line->flags & line_buf::LFT_32BIT) &&
@@ -403,45 +350,45 @@ namespace ojph {
           u = _mm256_or_si256(neg, t);             // combine
           __m256 v = _mm256_cvtepi32_ps(u);
           v = _mm256_mul_ps(v, mul);
-          _mm256_storeu_ps(dp, v);        
+          _mm256_storeu_ps(dp, v);
         }
       }
       else
       {
-        __m256 half = _mm256_set1_ps(0.5f);
+        __m256i half = _mm256_set1_epi32(INT_MIN);
         for (ui32 i = width; i > 0; i -= 8, sp += 8, dp += 8) {
           __m256i t = _mm256_loadu_si256((__m256i*)sp);
-          __m256i u = _mm256_slli_epi32(t, shift);
-          __m256 v = _mm256_cvtepi32_ps(u);
+          t = _mm256_slli_epi32(t, shift);
+          t = _mm256_sub_epi32(t, half);
+          __m256 v = _mm256_cvtepi32_ps(t);
           v = _mm256_mul_ps(v, mul);
-          v = _mm256_sub_ps(v, half);
           _mm256_storeu_ps(dp, v);
         }
       }
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void avx2_rct_forward(const line_buf *r, 
-                          const line_buf *g, 
+    void avx2_rct_forward(const line_buf *r,
+                          const line_buf *g,
                           const line_buf *b,
-                          line_buf *y, line_buf *cb, line_buf *cr, 
+                          line_buf *y, line_buf *cb, line_buf *cr,
                           ui32 repeat)
     {
       assert((y->flags  & line_buf::LFT_INTEGER) &&
-             (cb->flags & line_buf::LFT_INTEGER) && 
+             (cb->flags & line_buf::LFT_INTEGER) &&
              (cr->flags & line_buf::LFT_INTEGER) &&
              (r->flags  & line_buf::LFT_INTEGER) &&
-             (g->flags  & line_buf::LFT_INTEGER) && 
+             (g->flags  & line_buf::LFT_INTEGER) &&
              (b->flags  & line_buf::LFT_INTEGER));
-      
+
       if  (y->flags & line_buf::LFT_32BIT)
       {
         assert((y->flags  & line_buf::LFT_32BIT) &&
-               (cb->flags & line_buf::LFT_32BIT) && 
+               (cb->flags & line_buf::LFT_32BIT) &&
                (cr->flags & line_buf::LFT_32BIT) &&
                (r->flags  & line_buf::LFT_32BIT) &&
-               (g->flags  & line_buf::LFT_32BIT) && 
-               (b->flags  & line_buf::LFT_32BIT));        
+               (g->flags  & line_buf::LFT_32BIT) &&
+               (b->flags  & line_buf::LFT_32BIT));
         const si32 *rp = r->i32, * gp = g->i32, * bp = b->i32;
         si32 *yp = y->i32, * cbp = cb->i32, * crp = cr->i32;
         for (int i = (repeat + 7) >> 3; i > 0; --i)
@@ -461,13 +408,13 @@ namespace ojph {
           yp += 8; cbp += 8; crp += 8;
         }
       }
-      else 
+      else
       {
         assert((y->flags  & line_buf::LFT_64BIT) &&
-               (cb->flags & line_buf::LFT_64BIT) && 
+               (cb->flags & line_buf::LFT_64BIT) &&
                (cr->flags & line_buf::LFT_64BIT) &&
                (r->flags  & line_buf::LFT_32BIT) &&
-               (g->flags  & line_buf::LFT_32BIT) && 
+               (g->flags  & line_buf::LFT_32BIT) &&
                (b->flags  & line_buf::LFT_32BIT));
         __m256i v2 = _mm256_set1_epi64x(1ULL << (63 - 2));
         const si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
@@ -481,7 +428,7 @@ namespace ojph {
           mr = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mr32, 0));
           mg = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mg32, 0));
           mb = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mb32, 0));
-          
+
           t = _mm256_add_epi64(mr, mb);
           t = _mm256_add_epi64(t, _mm256_slli_epi64(mg, 1));
           _mm256_store_si256((__m256i*)yp, avx2_mm256_srai_epi64(t, 2, v2));
@@ -495,7 +442,7 @@ namespace ojph {
           mr = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mr32, 1));
           mg = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mg32, 1));
           mb = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(mb32, 1));
-          
+
           t = _mm256_add_epi64(mr, mb);
           t = _mm256_add_epi64(t, _mm256_slli_epi64(mg, 1));
           _mm256_store_si256((__m256i*)yp, avx2_mm256_srai_epi64(t, 2, v2));
@@ -508,29 +455,29 @@ namespace ojph {
           yp += 4; cbp += 4; crp += 4;
         }
       }
-    }    
+    }
 
     //////////////////////////////////////////////////////////////////////////
-    void avx2_rct_backward(const line_buf *y, 
-                           const line_buf *cb, 
+    void avx2_rct_backward(const line_buf *y,
+                           const line_buf *cb,
                            const line_buf *cr,
-                           line_buf *r, line_buf *g, line_buf *b, 
+                           line_buf *r, line_buf *g, line_buf *b,
                            ui32 repeat)
     {
       assert((y->flags  & line_buf::LFT_INTEGER) &&
-             (cb->flags & line_buf::LFT_INTEGER) && 
+             (cb->flags & line_buf::LFT_INTEGER) &&
              (cr->flags & line_buf::LFT_INTEGER) &&
              (r->flags  & line_buf::LFT_INTEGER) &&
-             (g->flags  & line_buf::LFT_INTEGER) && 
+             (g->flags  & line_buf::LFT_INTEGER) &&
              (b->flags  & line_buf::LFT_INTEGER));
 
       if (y->flags & line_buf::LFT_32BIT)
       {
         assert((y->flags  & line_buf::LFT_32BIT) &&
-               (cb->flags & line_buf::LFT_32BIT) && 
+               (cb->flags & line_buf::LFT_32BIT) &&
                (cr->flags & line_buf::LFT_32BIT) &&
                (r->flags  & line_buf::LFT_32BIT) &&
-               (g->flags  & line_buf::LFT_32BIT) && 
+               (g->flags  & line_buf::LFT_32BIT) &&
                (b->flags  & line_buf::LFT_32BIT));
         const si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32;
         si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
@@ -550,24 +497,24 @@ namespace ojph {
 
           yp += 8; cbp += 8; crp += 8;
           rp += 8; gp += 8; bp += 8;
-        }        
+        }
       }
       else
       {
         assert((y->flags  & line_buf::LFT_64BIT) &&
-               (cb->flags & line_buf::LFT_64BIT) && 
+               (cb->flags & line_buf::LFT_64BIT) &&
                (cr->flags & line_buf::LFT_64BIT) &&
                (r->flags  & line_buf::LFT_32BIT) &&
-               (g->flags  & line_buf::LFT_32BIT) && 
+               (g->flags  & line_buf::LFT_32BIT) &&
                (b->flags  & line_buf::LFT_32BIT));
         __m256i v2 = _mm256_set1_epi64x(1ULL << (63 - 2));
-        __m256i low_bits = _mm256_set_epi64x(0, (si64)ULLONG_MAX, 
+        __m256i low_bits = _mm256_set_epi64x(0, (si64)ULLONG_MAX,
                                              0, (si64)ULLONG_MAX);
         const si64 *yp = y->i64, *cbp = cb->i64, *crp = cr->i64;
         si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
         for (int i = (repeat + 7) >> 3; i > 0; --i)
         {
-          __m256i my, mcb, mcr, tr, tg, tb;          
+          __m256i my, mcb, mcr, tr, tg, tb;
           my  = _mm256_load_si256((__m256i*)yp);
           mcb = _mm256_load_si256((__m256i*)cbp);
           mcr = _mm256_load_si256((__m256i*)crp);
@@ -617,7 +564,7 @@ namespace ojph {
 
           yp += 4; cbp += 4; crp += 4;
           rp += 8; gp += 8; bp += 8;
-        }        
+        }
       }
     }
 
diff --git a/src/core/transform/ojph_colour_sse2.cpp b/src/core/transform/ojph_colour_sse2.cpp
index 208b9616..f1a95447 100644
--- a/src/core/transform/ojph_colour_sse2.cpp
+++ b/src/core/transform/ojph_colour_sse2.cpp
@@ -2,21 +2,21 @@
 // This software is released under the 2-Clause BSD license, included
 // below.
 //
-// Copyright (c) 2019, Aous Naman 
+// Copyright (c) 2019, Aous Naman
 // Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
 // Copyright (c) 2019, The University of New South Wales, Australia
-// 
+//
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions are
 // met:
-// 
+//
 // 1. Redistributions of source code must retain the above copyright
 // notice, this list of conditions and the following disclaimer.
-// 
+//
 // 2. Redistributions in binary form must reproduce the above copyright
 // notice, this list of conditions and the following disclaimer in the
 // documentation and/or other materials provided with the distribution.
-// 
+//
 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
 // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
@@ -127,7 +127,7 @@ namespace ojph {
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void sse2_irv_convert_to_integer_nlt_type3(const line_buf *src_line, 
+    void sse2_irv_convert_to_integer_nlt_type3(const line_buf *src_line,
       line_buf *dst_line, ui32 dst_line_offset,
       ui32 bit_depth, bool is_signed, ui32 width)
     {
@@ -135,104 +135,55 @@ namespace ojph {
              (src_line->flags & line_buf::LFT_INTEGER) == 0 &&
              (dst_line->flags & line_buf::LFT_32BIT) &&
              (dst_line->flags & line_buf::LFT_INTEGER));
-      
+
+      assert(bit_depth <= 32);
       uint32_t rounding_mode = _MM_GET_ROUNDING_MODE();
       _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
 
       const float* sp = src_line->f32;
       si32* dp = dst_line->i32 + dst_line_offset;
-      if (bit_depth <= 30) 
+      // There is the possibility that converting to integer will
+      // exceed the dynamic range of a 32-bit integer; therefore, care must
+      // be exercised.
+      // We check whether the floating-point number lies outside the
+      // half-closed interval [-0.5f, 0.5f). If so, we clamp the resulting
+      // integer to the maximum/minimum signed value that bit_depth bits allow.
+      si32 neg_limit = (si32)INT_MIN >> (32 - bit_depth);
+      __m128 mul = _mm_set1_ps((float)(1ull << bit_depth));
+      __m128 fl_up_lim = _mm_set1_ps(-(float)neg_limit); // val < upper
+      __m128 fl_low_lim = _mm_set1_ps((float)neg_limit); // val >= lower
+      __m128i s32_up_lim = _mm_set1_epi32(INT_MAX >> (32 - bit_depth));
+      __m128i s32_low_lim = _mm_set1_epi32(INT_MIN >> (32 - bit_depth));
+
+      if (is_signed)
       {
-        // We are leaving two bit overhead -- here, we are assuming that after
-        // multiplications, the resulting number can still be represented
-        // using 32 bit integer
-        __m128 mul = _mm_set1_ps((float)(1 << bit_depth));
-        __m128i upper_limit = _mm_set1_epi32(INT_MAX >> (32 - bit_depth));
-        __m128i lower_limit = _mm_set1_epi32(INT_MIN >> (32 - bit_depth));
-        
-        if (is_signed)
-        {
-          __m128i zero = _mm_setzero_si128();
-          __m128i bias = _mm_set1_epi32(-((1 << (bit_depth - 1)) + 1));
-          for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) 
-          {
-            __m128 t = _mm_loadu_ps(sp);
-            t = _mm_mul_ps(t, mul);
-            __m128i u = _mm_cvtps_epi32(t);
-            u = ojph_mm_max_epi32(u, lower_limit);
-            u = ojph_mm_min_epi32(u, upper_limit);
-
-            __m128i c = _mm_cmpgt_epi32(zero, u); //0xFFFFFFFF for -ve value
-            __m128i neg = _mm_sub_epi32(bias, u); //-bias -value
-            neg = _mm_and_si128(c, neg);          //keep only - bias - value
-            __m128i v = _mm_andnot_si128(c, u);   //keep only +ve or 0
-            v = _mm_or_si128(neg, v);             //combine
-            _mm_storeu_si128((__m128i*)dp, v);
-          }
-        }
-        else
-        {
-          __m128i half = _mm_set1_epi32(-(1 << (bit_depth - 1)));
-          for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) {
-            __m128 t = _mm_loadu_ps(sp);
-            t = _mm_mul_ps(t, mul);
-            __m128i u = _mm_cvtps_epi32(t);
-            u = ojph_mm_max_epi32(u, lower_limit);
-            u = ojph_mm_min_epi32(u, upper_limit);
-            u = _mm_add_epi32(u, half);
-            _mm_storeu_si128((__m128i*)dp, u);
-          }
+        __m128i zero = _mm_setzero_si128();
+        __m128i bias = _mm_set1_epi32(-((1 << (bit_depth - 1)) + 1));
+        for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) {
+          __m128 t = _mm_loadu_ps(sp);
+          t = _mm_mul_ps(t, mul);
+          __m128i u = _mm_cvtps_epi32(t);
+          u = ojph_mm_max_ge_epi32(u, s32_low_lim, t, fl_low_lim);
+          u = ojph_mm_min_lt_epi32(u, s32_up_lim, t, fl_up_lim);
+          __m128i c = _mm_cmpgt_epi32(zero, u); //0xFFFFFFFF for -ve value
+          __m128i neg = _mm_sub_epi32(bias, u); //-bias -value
+          neg = _mm_and_si128(c, neg);          //keep only - bias - value
+          __m128i v = _mm_andnot_si128(c, u);   //keep only +ve or 0
+          v = _mm_or_si128(neg, v);             //combine
+          _mm_storeu_si128((__m128i*)dp, v);
         }
       }
       else
       {
-        // There is the possibility that converting to integer will
-        // exceed the dynamic range of 32bit integer; therefore, we need
-        // to use 64 bit.  One may think, why not limit the floats to the
-        // range of [-0.5f, 0.5f)? 
-        // Notice the half closed range -- we need a value just below 0.5f.
-        // While getting this number is possible, after multiplication, the
-        // resulting number will not be exactly the maximum that the integer 
-        // can achieve.  All this is academic, because here are talking
-        // about a number which has all the exponent bits set, meaning 
-        // it is either infinity, -infinity, qNan or sNan.
-        si64 neg_limit = (si64)LLONG_MIN >> (64 - bit_depth);
-        __m128 mul = _mm_set1_ps((float)(1 << bit_depth));
-        __m128 fl_upper_limit = _mm_set1_ps(-(float)neg_limit); // val < upper
-        __m128 fl_lower_limit = _mm_set1_ps( (float)neg_limit); // val >= lower
-        __m128i s32_upper_limit = _mm_set1_epi32(INT_MAX >> (32 - bit_depth));
-        __m128i s32_lower_limit = _mm_set1_epi32(INT_MIN >> (32 - bit_depth));
-
-        if (is_signed)
-        {
-          __m128i zero = _mm_setzero_si128();
-          __m128i bias = _mm_set1_epi32(-((1 << (bit_depth - 1)) + 1));                   
-          for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) {
-            __m128 t = _mm_loadu_ps(sp);
-            t = _mm_mul_ps(t, mul);
-            __m128i u = _mm_cvtps_epi32(t);
-            u = ojph_mm_max_ge_epi32(u, s32_lower_limit, t, fl_lower_limit);
-            u = ojph_mm_min_lt_epi32(u, s32_upper_limit, t, fl_upper_limit);
-            __m128i c = _mm_cmpgt_epi32(zero, u); //0xFFFFFFFF for -ve value
-            __m128i neg = _mm_sub_epi32(bias, u); //-bias -value
-            neg = _mm_and_si128(c, neg);          //keep only - bias - value
-            __m128i v = _mm_andnot_si128(c, u);   //keep only +ve or 0
-            v = _mm_or_si128(neg, v);             //combine
-            _mm_storeu_si128((__m128i*)dp, v);
-          }
-        }
-        else
-        {
-          __m128i half = _mm_set1_epi32(-(1 << (bit_depth - 1)));
-          for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) {
-            __m128 t = _mm_loadu_ps(sp);
-            t = _mm_mul_ps(t, mul);
-            __m128i u = _mm_cvtps_epi32(t);
-            u = ojph_mm_max_ge_epi32(u, s32_lower_limit, t, fl_lower_limit);
-            u = ojph_mm_min_lt_epi32(u, s32_upper_limit, t, fl_upper_limit);
-            u = _mm_add_epi32(u, half);
-            _mm_storeu_si128((__m128i*)dp, u);
-          }
+        __m128i half = _mm_set1_epi32(-(1 << (bit_depth - 1)));
+        for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) {
+          __m128 t = _mm_loadu_ps(sp);
+          t = _mm_mul_ps(t, mul);
+          __m128i u = _mm_cvtps_epi32(t);
+          u = ojph_mm_max_ge_epi32(u, s32_low_lim, t, fl_low_lim);
+          u = ojph_mm_min_lt_epi32(u, s32_up_lim, t, fl_up_lim);
+          u = _mm_add_epi32(u, half);
+          _mm_storeu_si128((__m128i*)dp, u);
         }
       }
 
@@ -241,7 +192,7 @@ namespace ojph {
 
     /////////////////////////////////////////////////////////////////////////
     // https://github.com/seung-lab/dijkstra3d/blob/master/libdivide.h
-    static inline __m128i sse2_mm_srai_epi64(__m128i a, int amt, __m128i m) 
+    static inline __m128i sse2_mm_srai_epi64(__m128i a, int amt, __m128i m)
     {
       // note than m must be obtained using
       // __m128i m = _mm_set1_epi64x(1ULL << (63 - amt));
@@ -270,14 +221,14 @@ namespace ojph {
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void sse2_rev_convert(const line_buf *src_line, 
+    void sse2_rev_convert(const line_buf *src_line,
                           const ui32 src_line_offset,
-                          line_buf *dst_line, 
-                          const ui32 dst_line_offset, 
+                          line_buf *dst_line,
+                          const ui32 dst_line_offset,
                           si64 shift, ui32 width)
     {
       if (src_line->flags & line_buf::LFT_32BIT)
-      { 
+      {
         if (dst_line->flags & line_buf::LFT_32BIT)
         {
           const si32 *sp = src_line->i32 + src_line_offset;
@@ -288,9 +239,9 @@ namespace ojph {
             __m128i s = _mm_loadu_si128((__m128i*)sp);
             s = _mm_add_epi32(s, sh);
             _mm_storeu_si128((__m128i*)dp, s);
-          }            
+          }
         }
-        else 
+        else
         {
           const si32 *sp = src_line->i32 + src_line_offset;
           si64 *dp = dst_line->i64 + dst_line_offset;
@@ -300,18 +251,18 @@ namespace ojph {
           {
             __m128i s, t;
             s = _mm_loadu_si128((__m128i*)sp);
-            
+
             t = sse2_cvtlo_epi32_epi64(s, zero);
             t = _mm_add_epi64(t, sh);
             _mm_storeu_si128((__m128i*)dp, t);
-            
+
             t = sse2_cvthi_epi32_epi64(s, zero);
             t = _mm_add_epi64(t, sh);
             _mm_storeu_si128((__m128i*)dp + 1, t);
-          }            
+          }
         }
       }
-      else 
+      else
       {
         assert(src_line->flags | line_buf::LFT_64BIT);
         assert(dst_line->flags | line_buf::LFT_32BIT);
@@ -333,22 +284,22 @@ namespace ojph {
 
           s = _mm_shuffle_epi32(s, _MM_SHUFFLE(2, 0, 0, 0));
           s = _mm_andnot_si128(low_bits, s);
-          
+
           t = _mm_or_si128(s, t);
           _mm_storeu_si128((__m128i*)dp, t);
-        }            
+        }
       }
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void sse2_rev_convert_nlt_type3(const line_buf *src_line, 
-                                    const ui32 src_line_offset, 
-                                    line_buf *dst_line, 
-                                    const ui32 dst_line_offset, 
+    void sse2_rev_convert_nlt_type3(const line_buf *src_line,
+                                    const ui32 src_line_offset,
+                                    line_buf *dst_line,
+                                    const ui32 dst_line_offset,
                                     si64 shift, ui32 width)
     {
       if (src_line->flags & line_buf::LFT_32BIT)
-      { 
+      {
         if (dst_line->flags & line_buf::LFT_32BIT)
         {
           const si32 *sp = src_line->i32 + src_line_offset;
@@ -359,14 +310,14 @@ namespace ojph {
           {
             __m128i s = _mm_loadu_si128((__m128i*)sp);
             __m128i c = _mm_cmplt_epi32(s, zero);  // 0xFFFFFFFF for -ve value
-            __m128i v_m_sh = _mm_sub_epi32(sh, s); // - shift - value 
+            __m128i v_m_sh = _mm_sub_epi32(sh, s); // - shift - value
             v_m_sh = _mm_and_si128(c, v_m_sh);     // keep only - shift - value
             s = _mm_andnot_si128(c, s);            // keep only +ve or 0
             s = _mm_or_si128(s, v_m_sh);           // combine
             _mm_storeu_si128((__m128i*)dp, s);
           }
         }
-        else 
+        else
         {
           const si32 *sp = src_line->i32 + src_line_offset;
           si64 *dp = dst_line->i64 + dst_line_offset;
@@ -381,7 +332,7 @@ namespace ojph {
             u = _mm_unpacklo_epi32(s, t);      // correct 64bit data
             c = _mm_unpacklo_epi32(t, t);      // 64bit -1 for -ve value
 
-            v_m_sh = _mm_sub_epi64(sh, u);     // - shift - value 
+            v_m_sh = _mm_sub_epi64(sh, u);     // - shift - value
             v_m_sh = _mm_and_si128(c, v_m_sh); // keep only - shift - value
             u = _mm_andnot_si128(c, u);        // keep only +ve or 0
             u = _mm_or_si128(u, v_m_sh);       // combine
@@ -390,7 +341,7 @@ namespace ojph {
             u = _mm_unpackhi_epi32(s, t);      // correct 64bit data
             c = _mm_unpackhi_epi32(t, t);      // 64bit -1 for -ve value
 
-            v_m_sh = _mm_sub_epi64(sh, u);     // - shift - value 
+            v_m_sh = _mm_sub_epi64(sh, u);     // - shift - value
             v_m_sh = _mm_and_si128(c, v_m_sh); // keep only - shift - value
             u = _mm_andnot_si128(c, u);        // keep only +ve or 0
             u = _mm_or_si128(u, v_m_sh);       // combine
@@ -399,7 +350,7 @@ namespace ojph {
           }
         }
       }
-      else 
+      else
       {
         assert(src_line->flags | line_buf::LFT_64BIT);
         assert(dst_line->flags | line_buf::LFT_32BIT);
@@ -414,7 +365,7 @@ namespace ojph {
           // m for mask, and tm for temp
           __m128i s, t, p, n, m, tm;
           s = _mm_loadu_si128((__m128i*)sp);
-          
+
           tm = _mm_cmplt_epi32(s, zero);   // 32b -1 for -ve value
           m = _mm_shuffle_epi32(tm, _MM_SHUFFLE(3, 3, 1, 1)); // expand to 64b
           tm = _mm_sub_epi64(sh, s);       // - shift - value
@@ -441,8 +392,8 @@ namespace ojph {
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void sse2_irv_convert_to_float_nlt_type3(const line_buf *src_line, 
-      ui32 src_line_offset, line_buf *dst_line, 
+    void sse2_irv_convert_to_float_nlt_type3(const line_buf *src_line,
+      ui32 src_line_offset, line_buf *dst_line,
       ui32 bit_depth, bool is_signed, ui32 width)
     {
       assert((src_line->flags & line_buf::LFT_32BIT) &&
@@ -451,6 +402,7 @@ namespace ojph {
              (dst_line->flags & line_buf::LFT_INTEGER) == 0);
 
       __m128 mul = _mm_set1_ps((float)(1.0 / 65536.0 / 65536.0));
+      float mulf = (float)(1.0 / 65536.0 / 65536.0);
 
       const si32* sp = src_line->i32 + src_line_offset;
       float* dp = dst_line->f32;
@@ -469,45 +421,45 @@ namespace ojph {
           u = _mm_or_si128(neg, t);             // combine
           __m128 v = _mm_cvtepi32_ps(u);
           v = _mm_mul_ps(v, mul);
-          _mm_storeu_ps(dp, v);        
+          _mm_storeu_ps(dp, v);
         }
       }
       else
       {
-        __m128 half = _mm_set1_ps(0.5f);
+        __m128i half = _mm_set1_epi32(INT_MIN);
         for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) {
           __m128i t = _mm_loadu_si128((__m128i*)sp);
-          __m128i u = _mm_slli_epi32(t, shift);
-          __m128 v = _mm_cvtepi32_ps(u);
+          t = _mm_slli_epi32(t, shift);
+          t = _mm_sub_epi32(t, half);
+          __m128 v = _mm_cvtepi32_ps(t);
           v = _mm_mul_ps(v, mul);
-          v = _mm_sub_ps(v, half);
           _mm_storeu_ps(dp, v);
         }
       }
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void sse2_rct_forward(const line_buf *r, 
-                          const line_buf *g, 
+    void sse2_rct_forward(const line_buf *r,
+                          const line_buf *g,
                           const line_buf *b,
-                          line_buf *y, line_buf *cb, line_buf *cr, 
+                          line_buf *y, line_buf *cb, line_buf *cr,
                           ui32 repeat)
     {
       assert((y->flags  & line_buf::LFT_INTEGER) &&
-             (cb->flags & line_buf::LFT_INTEGER) && 
+             (cb->flags & line_buf::LFT_INTEGER) &&
              (cr->flags & line_buf::LFT_INTEGER) &&
              (r->flags  & line_buf::LFT_INTEGER) &&
-             (g->flags  & line_buf::LFT_INTEGER) && 
+             (g->flags  & line_buf::LFT_INTEGER) &&
              (b->flags  & line_buf::LFT_INTEGER));
-      
+
       if  (y->flags & line_buf::LFT_32BIT)
       {
         assert((y->flags  & line_buf::LFT_32BIT) &&
-               (cb->flags & line_buf::LFT_32BIT) && 
+               (cb->flags & line_buf::LFT_32BIT) &&
                (cr->flags & line_buf::LFT_32BIT) &&
                (r->flags  & line_buf::LFT_32BIT) &&
-               (g->flags  & line_buf::LFT_32BIT) && 
-               (b->flags  & line_buf::LFT_32BIT));        
+               (g->flags  & line_buf::LFT_32BIT) &&
+               (b->flags  & line_buf::LFT_32BIT));
         const si32 *rp = r->i32, * gp = g->i32, * bp = b->i32;
         si32 *yp = y->i32, * cbp = cb->i32, * crp = cr->i32;
         for (int i = (repeat + 3) >> 2; i > 0; --i)
@@ -527,13 +479,13 @@ namespace ojph {
           yp += 4; cbp += 4; crp += 4;
         }
       }
-      else 
+      else
       {
         assert((y->flags  & line_buf::LFT_64BIT) &&
-               (cb->flags & line_buf::LFT_64BIT) && 
+               (cb->flags & line_buf::LFT_64BIT) &&
                (cr->flags & line_buf::LFT_64BIT) &&
                (r->flags  & line_buf::LFT_32BIT) &&
-               (g->flags  & line_buf::LFT_32BIT) && 
+               (g->flags  & line_buf::LFT_32BIT) &&
                (b->flags  & line_buf::LFT_32BIT));
         __m128i zero = _mm_setzero_si128();
         __m128i v2 = _mm_set1_epi64x(1ULL << (63 - 2));
@@ -548,7 +500,7 @@ namespace ojph {
           mr = sse2_cvtlo_epi32_epi64(mr32, zero);
           mg = sse2_cvtlo_epi32_epi64(mg32, zero);
           mb = sse2_cvtlo_epi32_epi64(mb32, zero);
-          
+
           t = _mm_add_epi64(mr, mb);
           t = _mm_add_epi64(t, _mm_slli_epi64(mg, 1));
           _mm_store_si128((__m128i*)yp, sse2_mm_srai_epi64(t, 2, v2));
@@ -562,7 +514,7 @@ namespace ojph {
           mr = sse2_cvthi_epi32_epi64(mr32, zero);
           mg = sse2_cvthi_epi32_epi64(mg32, zero);
           mb = sse2_cvthi_epi32_epi64(mb32, zero);
-          
+
           t = _mm_add_epi64(mr, mb);
           t = _mm_add_epi64(t, _mm_slli_epi64(mg, 1));
           _mm_store_si128((__m128i*)yp, sse2_mm_srai_epi64(t, 2, v2));
@@ -578,26 +530,26 @@ namespace ojph {
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void sse2_rct_backward(const line_buf *y, 
-                           const line_buf *cb, 
+    void sse2_rct_backward(const line_buf *y,
+                           const line_buf *cb,
                            const line_buf *cr,
-                           line_buf *r, line_buf *g, line_buf *b, 
+                           line_buf *r, line_buf *g, line_buf *b,
                            ui32 repeat)
     {
       assert((y->flags  & line_buf::LFT_INTEGER) &&
-             (cb->flags & line_buf::LFT_INTEGER) && 
+             (cb->flags & line_buf::LFT_INTEGER) &&
              (cr->flags & line_buf::LFT_INTEGER) &&
              (r->flags  & line_buf::LFT_INTEGER) &&
-             (g->flags  & line_buf::LFT_INTEGER) && 
+             (g->flags  & line_buf::LFT_INTEGER) &&
              (b->flags  & line_buf::LFT_INTEGER));
 
       if (y->flags & line_buf::LFT_32BIT)
       {
         assert((y->flags  & line_buf::LFT_32BIT) &&
-               (cb->flags & line_buf::LFT_32BIT) && 
+               (cb->flags & line_buf::LFT_32BIT) &&
                (cr->flags & line_buf::LFT_32BIT) &&
                (r->flags  & line_buf::LFT_32BIT) &&
-               (g->flags  & line_buf::LFT_32BIT) && 
+               (g->flags  & line_buf::LFT_32BIT) &&
                (b->flags  & line_buf::LFT_32BIT));
         const si32 *yp = y->i32, *cbp = cb->i32, *crp = cr->i32;
         si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
@@ -617,15 +569,15 @@ namespace ojph {
 
           yp += 4; cbp += 4; crp += 4;
           rp += 4; gp += 4; bp += 4;
-        }        
+        }
       }
       else
       {
         assert((y->flags  & line_buf::LFT_64BIT) &&
-               (cb->flags & line_buf::LFT_64BIT) && 
+               (cb->flags & line_buf::LFT_64BIT) &&
                (cr->flags & line_buf::LFT_64BIT) &&
                (r->flags  & line_buf::LFT_32BIT) &&
-               (g->flags  & line_buf::LFT_32BIT) && 
+               (g->flags  & line_buf::LFT_32BIT) &&
                (b->flags  & line_buf::LFT_32BIT));
         __m128i v2 = _mm_set1_epi64x(1ULL << (63 - 2));
         __m128i low_bits = _mm_set_epi64x(0, (si64)ULLONG_MAX);
@@ -633,7 +585,7 @@ namespace ojph {
         si32 *rp = r->i32, *gp = g->i32, *bp = b->i32;
         for (int i = (repeat + 3) >> 2; i > 0; --i)
         {
-          __m128i my, mcb, mcr, tr, tg, tb;          
+          __m128i my, mcb, mcr, tr, tg, tb;
           my  = _mm_load_si128((__m128i*)yp);
           mcb = _mm_load_si128((__m128i*)cbp);
           mcr = _mm_load_si128((__m128i*)crp);
@@ -678,7 +630,7 @@ namespace ojph {
 
           yp += 2; cbp += 2; crp += 2;
           rp += 4; gp += 4; bp += 4;
-        }        
+        }
       }
     }
   }

From 2ea19eb46e04f1783a599f42dc7e600193c30891 Mon Sep 17 00:00:00 2001
From: Aous Naman <aous72@yahoo.com>
Date: Sat, 25 Jan 2025 11:55:47 +1100
Subject: [PATCH 13/20] Updated WASM SIMD

---
 src/core/transform/ojph_colour_wasm.cpp | 140 ++++++++----------------
 1 file changed, 44 insertions(+), 96 deletions(-)

diff --git a/src/core/transform/ojph_colour_wasm.cpp b/src/core/transform/ojph_colour_wasm.cpp
index 8e354784..fd08f324 100644
--- a/src/core/transform/ojph_colour_wasm.cpp
+++ b/src/core/transform/ojph_colour_wasm.cpp
@@ -50,7 +50,7 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
     static inline
     v128_t ojph_convert_float_to_i32(v128_t a, v128_t zero, v128_t half)
-    { // We implement ojph_round, which is 
+    { // We implement ojph_round, which is
       // val + (val >= 0.0f ? 0.5f : -0.5f), where val is float
       v128_t c = wasm_f32x4_ge(a, zero);   // greater or equal to zero
       v128_t p = wasm_f32x4_add(a, half);  // for positive, add half
@@ -279,7 +279,7 @@ namespace ojph {
     static inline
     v128_t ojph_wasm_i32x4_max_ge(v128_t a, v128_t b, v128_t x, v128_t y)
     {
-      v128_t c = wasm_i32x4_ge(x, y);    // 0xFFFFFFFF for x >= y
+      v128_t c = wasm_f32x4_ge(x, y);    // 0xFFFFFFFF for x >= y
       v128_t d = wasm_v128_and(c, a);    // keep only a, where x >= y
       v128_t e = wasm_v128_andnot(b, c); // keep only b, where x <  y
       return wasm_v128_or(d, e);         // combine
@@ -289,7 +289,7 @@ namespace ojph {
     static inline
     v128_t ojph_wasm_i32x4_min_lt(v128_t a, v128_t b, v128_t x, v128_t y)
     {
-      v128_t c = wasm_i32x4_lt(x, y);    // 0xFFFFFFFF for x < y
+      v128_t c = wasm_f32x4_lt(x, y);    // 0xFFFFFFFF for x < y
       v128_t d = wasm_v128_and(c, a);    // keep only a, where x <  y
       v128_t e = wasm_v128_andnot(b, c); // keep only b, where x >= y
       return wasm_v128_or(d, e);         // combine
@@ -305,106 +305,54 @@ namespace ojph {
              (dst_line->flags & line_buf::LFT_32BIT) &&
              (dst_line->flags & line_buf::LFT_INTEGER));
 
+      assert(bit_depth <= 32);
       const float* sp = src_line->f32;
       si32* dp = dst_line->i32 + dst_line_offset;
-      if (bit_depth <= 30) 
+      // There is the possibility that converting to integer will
+      // exceed the dynamic range of 32bit integer; therefore, care must be
+      // exercised.
+      // We check whether the floating point number lies outside the
+      // half-closed interval [-0.5f, 0.5f). If so, we clamp the resulting
+      // integer to the largest/smallest signed value bit_depth bits hold.
+      si32 neg_limit = (si32)INT_MIN >> (32 - bit_depth);
+      v128_t mul = wasm_f32x4_splat((float)(1ull << bit_depth));
+      v128_t fl_up_lim = wasm_f32x4_splat(-(float)neg_limit); // val < upper
+      v128_t fl_low_lim = wasm_f32x4_splat((float)neg_limit); // val >= lower
+      v128_t s32_up_lim = wasm_i32x4_splat(INT_MAX >> (32 - bit_depth));
+      v128_t s32_low_lim = wasm_i32x4_splat(INT_MIN >> (32 - bit_depth));
+
+      if (is_signed)
       {
-        // We are leaving two bit overhead -- here, we are assuming that after
-        // multiplications, the resulting number can still be represented
-        // using 32 bit integer
-        v128_t mul = wasm_f32x4_splat((float)(1 << bit_depth));
-        v128_t upper_limit = wasm_i32x4_splat(INT_MAX >> (32 - bit_depth));
-        v128_t lower_limit = wasm_i32x4_splat(INT_MIN >> (32 - bit_depth));
-        
-        if (is_signed)
-        {
-          const v128_t zero = wasm_f32x4_splat(0.0f);
-          const v128_t half = wasm_f32x4_splat(0.5f);
-          v128_t bias = wasm_i32x4_splat(-((1 << (bit_depth - 1)) + 1));
-          for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) 
-          {
-            v128_t t = wasm_v128_load(sp);
-            t = wasm_f32x4_mul(t, mul);
-            v128_t u = ojph_convert_float_to_i32(t, zero, half);
-            u = wasm_i32x4_max(u, lower_limit);
-            u = wasm_i32x4_min(u, upper_limit);
-
-            v128_t c = wasm_i32x4_gt(zero, u);    //0xFFFFFFFF for -ve value
-            v128_t neg = wasm_i32x4_sub(bias, u); //-bias -value
-            neg = wasm_v128_and(c, neg);          //keep only - bias - value
-            v128_t v = wasm_v128_andnot(u, c);    //keep only +ve or 0
-            v = wasm_v128_or(neg, v);             //combine
-            wasm_v128_store(dp, v);
-          }
-        }
-        else
-        {
-          const v128_t zero = wasm_f32x4_splat(0.0f);
-          const v128_t half = wasm_f32x4_splat(0.5f);
-          v128_t ihalf = wasm_i32x4_splat(-(1 << (bit_depth - 1)));
-          for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) {
-            v128_t t = wasm_v128_load(sp);
-            t = wasm_f32x4_mul(t, mul);
-            v128_t u = ojph_convert_float_to_i32(t, zero, half);
-            u = wasm_i32x4_max(u, lower_limit);
-            u = wasm_i32x4_min(u, upper_limit);
-            u = wasm_i32x4_add(u, ihalf);
-            wasm_v128_store(dp, u);
-          }
+        const v128_t zero = wasm_f32x4_splat(0.0f);
+        const v128_t half = wasm_f32x4_splat(0.5f);
+        v128_t bias = wasm_i32x4_splat(-((1 << (bit_depth - 1)) + 1));
+        for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) {
+          v128_t t = wasm_v128_load(sp);
+          t = wasm_f32x4_mul(t, mul);
+          v128_t u = ojph_convert_float_to_i32(t, zero, half);
+          u = ojph_wasm_i32x4_max_ge(u, s32_low_lim, t, fl_low_lim);
+          u = ojph_wasm_i32x4_min_lt(u, s32_up_lim, t, fl_up_lim);
+          v128_t c = wasm_i32x4_gt(zero, u);    // 0xFFFFFFFF for -ve value
+          v128_t neg = wasm_i32x4_sub(bias, u); // -bias -value
+          neg = wasm_v128_and(c, neg);          // keep only - bias - value
+          v128_t v = wasm_v128_andnot(u, c);    // keep only +ve or 0
+          v = wasm_v128_or(neg, v);             // combine
+          wasm_v128_store(dp, v);
         }
       }
       else
       {
-        // There is the possibility that converting to integer will
-        // exceed the dynamic range of 32bit integer; therefore, we need
-        // to use 64 bit.  One may think, why not limit the floats to the
-        // range of [-0.5f, 0.5f)? 
-        // Notice the half closed range -- we need a value just below 0.5f.
-        // While getting this number is possible, after multiplication, the
-        // resulting number will not be exactly the maximum that the integer 
-        // can achieve.  All this is academic, because here are talking
-        // about a number which has all the exponent bits set, meaning 
-        // it is either infinity, -infinity, qNan or sNan.
-        si64 neg_limit = (si64)LLONG_MIN >> (64 - bit_depth);
-        v128_t mul = wasm_f32x4_splat((float)(1 << bit_depth));
-        v128_t fl_upper_limit = wasm_f32x4_splat(-(float)neg_limit); // val< up
-        v128_t fl_lower_limit = wasm_f32x4_splat( (float)neg_limit); // val>=lo
-        v128_t s32_upper_limit = wasm_i32x4_splat(INT_MAX >> (32 - bit_depth));
-        v128_t s32_lower_limit = wasm_i32x4_splat(INT_MIN >> (32 - bit_depth));
-
-        if (is_signed)
-        {
-          const v128_t zero = wasm_f32x4_splat(0.0f);
-          const v128_t half = wasm_f32x4_splat(0.5f);
-          v128_t bias = wasm_i32x4_splat(-((1 << (bit_depth - 1)) + 1));                   
-          for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) {
-            v128_t t = wasm_v128_load(sp);
-            t = wasm_f32x4_mul(t, mul);
-            v128_t u = ojph_convert_float_to_i32(t, zero, half);
-            u = ojph_wasm_i32x4_max_ge(u, s32_lower_limit, t, fl_lower_limit);
-            u = ojph_wasm_i32x4_min_lt(u, s32_upper_limit, t, fl_upper_limit);
-            v128_t c = wasm_i32x4_gt(zero, u);    //0xFFFFFFFF for -ve value
-            v128_t neg = wasm_i32x4_sub(bias, u); //-bias -value
-            neg = wasm_v128_and(c, neg);          //keep only - bias - value
-            v128_t v = wasm_v128_andnot(u, c);    //keep only +ve or 0
-            v = wasm_v128_or(neg, v);             //combine
-            wasm_v128_store(dp, v);
-          }
-        }
-        else
-        {
-          const v128_t zero = wasm_f32x4_splat(0.0f);
-          const v128_t half = wasm_f32x4_splat(0.5f);
-          v128_t ihalf = wasm_i32x4_splat(-(1 << (bit_depth - 1)));
-          for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) {
-            v128_t t = wasm_v128_load(sp);
-            t = wasm_f32x4_mul(t, mul);
-            v128_t u = ojph_convert_float_to_i32(t, zero, half);
-            u = ojph_wasm_i32x4_max_ge(u, s32_lower_limit, t, fl_lower_limit);
-            u = ojph_wasm_i32x4_min_lt(u, s32_upper_limit, t, fl_upper_limit);
-            u = wasm_i32x4_add(u, ihalf);
-            wasm_v128_store(dp, u);
-          }
+        const v128_t zero = wasm_f32x4_splat(0.0f);
+        const v128_t half = wasm_f32x4_splat(0.5f);
+        v128_t ihalf = wasm_i32x4_splat(-(1 << (bit_depth - 1)));
+        for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) {
+          v128_t t = wasm_v128_load(sp);
+          t = wasm_f32x4_mul(t, mul);
+          v128_t u = ojph_convert_float_to_i32(t, zero, half);
+          u = ojph_wasm_i32x4_max_ge(u, s32_low_lim, t, fl_low_lim);
+          u = ojph_wasm_i32x4_min_lt(u, s32_up_lim, t, fl_up_lim);
+          u = wasm_i32x4_add(u, ihalf);
+          wasm_v128_store(dp, u);
         }
       }
     }

From 9c22320b8a6cc52e6ef8043a5369104ecbdcd9e8 Mon Sep 17 00:00:00 2001
From: Aous Naman <aous72@yahoo.com>
Date: Sat, 25 Jan 2025 12:25:01 +1100
Subject: [PATCH 14/20] Bug fixes.

---
 src/core/transform/ojph_colour.cpp      | 6 +++---
 src/core/transform/ojph_colour_wasm.cpp | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/core/transform/ojph_colour.cpp b/src/core/transform/ojph_colour.cpp
index 792929b8..49107772 100644
--- a/src/core/transform/ojph_colour.cpp
+++ b/src/core/transform/ojph_colour.cpp
@@ -419,12 +419,12 @@ namespace ojph {
       }
       else
       {
-        const si32 half = INT_MIN;
+        const ui32 half = (ui32)INT_MIN;
         for (ui32 i = width; i > 0; --i) {
-          si32 v = *sp++;
+          ui32 v = (ui32)*sp++;
           v <<= shift;
           v -= half;
-          *dp++ = (float)v * mul;
+          *dp++ = (float)(si32)v * mul;
         }
       }
     }
diff --git a/src/core/transform/ojph_colour_wasm.cpp b/src/core/transform/ojph_colour_wasm.cpp
index fd08f324..05a06a26 100644
--- a/src/core/transform/ojph_colour_wasm.cpp
+++ b/src/core/transform/ojph_colour_wasm.cpp
@@ -391,13 +391,13 @@ namespace ojph {
       }
       else
       {
-        v128_t half = wasm_f32x4_splat(0.5f);
+        v128_t half = wasm_i32x4_splat(INT_MIN);
         for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) {
           v128_t t = wasm_v128_load(sp);
           v128_t u = wasm_i32x4_shl(t, shift);
+          u = wasm_i32x4_sub(u, half);
           v128_t v = wasm_f32x4_convert_i32x4(u);
           v = wasm_f32x4_mul(v, mul);
-          v = wasm_f32x4_sub(v, half);
           wasm_v128_store(dp, v);
         }
       }

From 99e33f9a54191e88b43378579507ee5cfa736c70 Mon Sep 17 00:00:00 2001
From: Aous Naman <aous@unsw.edu.au>
Date: Sun, 26 Jan 2025 11:30:34 +1100
Subject: [PATCH 15/20] Adopting the NLT type3 routine design for other
 routines. Major change.

---
 src/core/codestream/ojph_tile.cpp       |  69 ++++--------
 src/core/transform/ojph_colour.cpp      | 136 ++++++++++++------------
 src/core/transform/ojph_colour.h        |  23 ++--
 src/core/transform/ojph_colour_avx.cpp  |  60 -----------
 src/core/transform/ojph_colour_avx2.cpp |  83 +++++++++++----
 src/core/transform/ojph_colour_local.h  |  92 ++++++----------
 src/core/transform/ojph_colour_sse.cpp  |  82 --------------
 src/core/transform/ojph_colour_sse2.cpp |  82 ++++++++++----
 src/core/transform/ojph_colour_wasm.cpp |  82 ++++++++++----
 9 files changed, 318 insertions(+), 391 deletions(-)

diff --git a/src/core/codestream/ojph_tile.cpp b/src/core/codestream/ojph_tile.cpp
index ae78b06c..7ac60444 100644
--- a/src/core/codestream/ojph_tile.cpp
+++ b/src/core/codestream/ojph_tile.cpp
@@ -287,15 +287,9 @@ namespace ojph {
           if (nlt_type3[comp_num] == type3)
             irv_convert_to_float_nlt_type3(line, line_offsets[comp_num],
               tc, num_bits[comp_num], is_signed[comp_num], comp_width);
-          else {
-            float mul = 1.0f / (float)(1<<num_bits[comp_num]);
-            const si32 *sp = line->i32 + line_offsets[comp_num];
-            float *dp = tc->f32;
-            if (is_signed[comp_num])
-              cnvrt_si32_to_float(sp, dp, mul, comp_width);
-            else
-              cnvrt_si32_to_float_shftd(sp, dp, mul, comp_width);
-          }
+          else
+            irv_convert_to_float(line, line_offsets[comp_num],
+              tc, num_bits[comp_num], is_signed[comp_num], comp_width);
         }
         comps[comp_num].push_line();
       }
@@ -331,15 +325,10 @@ namespace ojph {
             irv_convert_to_float_nlt_type3(line, line_offsets[comp_num],
               lines + comp_num, num_bits[comp_num], is_signed[comp_num], 
               comp_width);
-          else {
-            float mul = 1.0f / (float)(1<<num_bits[comp_num]);
-            const si32 *sp = line->i32 + line_offsets[comp_num];
-            float *dp = lines[comp_num].f32;
-            if (is_signed[comp_num])
-              cnvrt_si32_to_float(sp, dp, mul, comp_width);
-            else
-              cnvrt_si32_to_float_shftd(sp, dp, mul, comp_width);
-          }
+          else
+            irv_convert_to_float(line, line_offsets[comp_num],
+              lines + comp_num, num_bits[comp_num], is_signed[comp_num], 
+              comp_width);
           if (comp_num == 2)
           { // irreversible color transform
             ict_forward(lines[0].f32, lines[1].f32, lines[2].f32,
@@ -387,20 +376,13 @@ namespace ojph {
         else
         {
           if (nlt_type3[comp_num] == type3)
-          {
             irv_convert_to_integer_nlt_type3(src_line, tgt_line, 
               line_offsets[comp_num], num_bits[comp_num], 
               is_signed[comp_num], comp_width);
-          }
-          else {
-            float mul = (float)(1 << num_bits[comp_num]);
-            const float *sp = src_line->f32;
-            si32 *dp = tgt_line->i32 + line_offsets[comp_num];
-            if (is_signed[comp_num])
-              cnvrt_float_to_si32(sp, dp, mul, comp_width);
-            else
-              cnvrt_float_to_si32_shftd(sp, dp, mul, comp_width);
-          }
+          else
+            irv_convert_to_integer(src_line, tgt_line, 
+              line_offsets[comp_num], num_bits[comp_num], 
+              is_signed[comp_num], comp_width);
         }
       }
       else
@@ -437,30 +419,19 @@ namespace ojph {
         }
         else
         {
+          line_buf* lbp;
+          if (comp_num < 3)
+            lbp = lines + comp_num;
+          else
+            lbp = comps[comp_num].pull_line();            
           if (nlt_type3[comp_num] == type3)
-          {
-            line_buf* lbp;
-            if (comp_num < 3)
-              lbp = lines + comp_num;
-            else
-              lbp = comps[comp_num].pull_line();            
             irv_convert_to_integer_nlt_type3(lbp, tgt_line, 
               line_offsets[comp_num], num_bits[comp_num], 
               is_signed[comp_num], comp_width);
-          }
-          else {
-            float mul = (float)(1 << num_bits[comp_num]);
-            const float *sp;
-            if (comp_num < 3)
-              sp = lines[comp_num].f32;
-            else
-              sp = comps[comp_num].pull_line()->f32;
-            si32 *dp = tgt_line->i32 + line_offsets[comp_num];
-            if (is_signed[comp_num])
-              cnvrt_float_to_si32(sp, dp, mul, comp_width);
-            else
-              cnvrt_float_to_si32_shftd(sp, dp, mul, comp_width);
-          }
+          else
+            irv_convert_to_integer(lbp, tgt_line, 
+              line_offsets[comp_num], num_bits[comp_num], 
+              is_signed[comp_num], comp_width);
         }
       }
 
diff --git a/src/core/transform/ojph_colour.cpp b/src/core/transform/ojph_colour.cpp
index 49107772..2c559ced 100644
--- a/src/core/transform/ojph_colour.cpp
+++ b/src/core/transform/ojph_colour.cpp
@@ -63,21 +63,16 @@ namespace ojph {
        line_buf *dst_line, const ui32 dst_line_offset,
        si64 shift, ui32 width) = NULL;
 
-    //////////////////////////////////////////////////////////////////////////
-    void (*cnvrt_si32_to_float_shftd)
-      (const si32 *sp, float *dp, float mul, ui32 width) = NULL;
-
-    //////////////////////////////////////////////////////////////////////////
-    void (*cnvrt_si32_to_float)
-      (const si32 *sp, float *dp, float mul, ui32 width) = NULL;
 
     //////////////////////////////////////////////////////////////////////////
-    void (*cnvrt_float_to_si32_shftd)
-      (const float *sp, si32 *dp, float mul, ui32 width) = NULL;
+    void (*irv_convert_to_integer) (
+      const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset,
+      ui32 bit_depth, bool is_signed, ui32 width) = NULL;
 
     //////////////////////////////////////////////////////////////////////////
-    void (*cnvrt_float_to_si32)
-      (const float *sp, si32 *dp, float mul, ui32 width) = NULL;
+    void (*irv_convert_to_float) (
+      const line_buf *src_line, ui32 src_line_offset,
+      line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width) = NULL;
 
     //////////////////////////////////////////////////////////////////////////
     void (*irv_convert_to_integer_nlt_type3) (
@@ -122,12 +117,10 @@ namespace ojph {
 
       rev_convert = gen_rev_convert;
       rev_convert_nlt_type3 = gen_rev_convert_nlt_type3;
-      cnvrt_si32_to_float_shftd = gen_cnvrt_si32_to_float_shftd;
-      cnvrt_si32_to_float = gen_cnvrt_si32_to_float;
-      cnvrt_float_to_si32_shftd = gen_cnvrt_float_to_si32_shftd;
-      cnvrt_float_to_si32 = gen_cnvrt_float_to_si32;
-      irv_convert_to_float_nlt_type3 = gen_irv_convert_to_float_nlt_type3;
+      irv_convert_to_integer = gen_irv_convert_to_integer;
+      irv_convert_to_float = gen_irv_convert_to_float;
       irv_convert_to_integer_nlt_type3 = gen_irv_convert_to_integer_nlt_type3;
+      irv_convert_to_float_nlt_type3 = gen_irv_convert_to_float_nlt_type3;
       rct_forward = gen_rct_forward;
       rct_backward = gen_rct_backward;
       ict_forward = gen_ict_forward;
@@ -140,10 +133,6 @@ namespace ojph {
       #ifndef OJPH_DISABLE_SSE
         if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_SSE)
         {
-          cnvrt_si32_to_float_shftd = sse_cnvrt_si32_to_float_shftd;
-          cnvrt_si32_to_float = sse_cnvrt_si32_to_float;
-          cnvrt_float_to_si32_shftd = sse_cnvrt_float_to_si32_shftd;
-          cnvrt_float_to_si32 = sse_cnvrt_float_to_si32;
           ict_forward = sse_ict_forward;
           ict_backward = sse_ict_backward;
         }
@@ -154,8 +143,8 @@ namespace ojph {
         {
           rev_convert = sse2_rev_convert;
           rev_convert_nlt_type3 = sse2_rev_convert_nlt_type3;
-          cnvrt_float_to_si32_shftd = sse2_cnvrt_float_to_si32_shftd;
-          cnvrt_float_to_si32 = sse2_cnvrt_float_to_si32;
+          irv_convert_to_integer = sse2_irv_convert_to_integer;
+          irv_convert_to_float = sse2_irv_convert_to_float;
           irv_convert_to_integer_nlt_type3 =
             sse2_irv_convert_to_integer_nlt_type3;
           irv_convert_to_float_nlt_type3 =
@@ -168,10 +157,6 @@ namespace ojph {
       #ifndef OJPH_DISABLE_AVX
         if (get_cpu_ext_level() >= X86_CPU_EXT_LEVEL_AVX)
         {
-          cnvrt_si32_to_float_shftd = avx_cnvrt_si32_to_float_shftd;
-          cnvrt_si32_to_float = avx_cnvrt_si32_to_float;
-          cnvrt_float_to_si32_shftd = avx_cnvrt_float_to_si32_shftd;
-          cnvrt_float_to_si32 = avx_cnvrt_float_to_si32;
           ict_forward = avx_ict_forward;
           ict_backward = avx_ict_backward;
         }
@@ -182,6 +167,8 @@ namespace ojph {
         {
           rev_convert = avx2_rev_convert;
           rev_convert_nlt_type3 = avx2_rev_convert_nlt_type3;
+          irv_convert_to_integer = avx2_irv_convert_to_integer;
+          irv_convert_to_float = avx2_irv_convert_to_float;
           irv_convert_to_integer_nlt_type3 =
             avx2_irv_convert_to_integer_nlt_type3;
           irv_convert_to_float_nlt_type3 =
@@ -201,10 +188,8 @@ namespace ojph {
 
       rev_convert = wasm_rev_convert;
       rev_convert_nlt_type3 = wasm_rev_convert_nlt_type3;
-      cnvrt_si32_to_float_shftd = wasm_cnvrt_si32_to_float_shftd;
-      cnvrt_si32_to_float = wasm_cnvrt_si32_to_float;
-      cnvrt_float_to_si32_shftd = wasm_cnvrt_float_to_si32_shftd;
-      cnvrt_float_to_si32 = wasm_cnvrt_float_to_si32;
+      irv_convert_to_integer = wasm_irv_convert_to_integer;
+      irv_convert_to_float = wasm_irv_convert_to_float;
       irv_convert_to_integer_nlt_type3 = wasm_irv_convert_to_integer_nlt_type3;
       irv_convert_to_float_nlt_type3 = wasm_irv_convert_to_float_nlt_type3;
       rct_forward = wasm_rct_forward;
@@ -310,40 +295,11 @@ namespace ojph {
       }
     }
 
-    //////////////////////////////////////////////////////////////////////////
-    void gen_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul,
-                                       ui32 width)
-    {
-      for (ui32 i = width; i > 0; --i)
-        *dp++ = (float)(ui32)*sp++ * mul - 0.5f;
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    void gen_cnvrt_si32_to_float(const si32 *sp, float *dp, float mul,
-                                 ui32 width)
-    {
-      for (ui32 i = width; i > 0; --i)
-        *dp++ = (float)*sp++ * mul;
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    void gen_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul,
-                                       ui32 width)
-    {
-      for (ui32 i = width; i > 0; --i)
-        *dp++ = (si32)ojph_round((*sp++ + 0.5f) * mul);
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    void gen_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul,
-                                 ui32 width)
-    {
-      for (ui32 i = width; i > 0; --i)
-        *dp++ = ojph_round(*sp++ * mul);
-    }
 
     //////////////////////////////////////////////////////////////////////////
-    void gen_irv_convert_to_integer_nlt_type3(const line_buf *src_line,
+    template<bool NLT_TYPE3>
+    static inline
+    void local_gen_irv_convert_to_integer(const line_buf *src_line,
       line_buf *dst_line, ui32 dst_line_offset,
       ui32 bit_depth, bool is_signed, ui32 width)
     {
@@ -371,19 +327,20 @@ namespace ojph {
       if (is_signed)
       {
         const si32 bias = (1 << (bit_depth - 1)) + 1;
-        for (ui32 i = width; i > 0; --i) {
+        for (int i = width; i > 0; --i) {
           float t = *sp++ * mul;
           si32 v = ojph_round(t);
           v = t >= fl_low_lim ? v : s32_low_lim;
           v = t <  fl_up_lim  ? v : s32_up_lim;
-          v = (v >= 0) ? v : (- v - bias);
+          if (NLT_TYPE3)
+            v = (v >= 0) ? v : (- v - bias);
           *dp++ = v;
         }
       }
       else
       {
-        const si32 half = (1 << (bit_depth - 1));
-        for (ui32 i = width; i > 0; --i) {
+        const si32 half = 1 << (bit_depth - 1);
+        for (int i = width; i > 0; --i) {
           float t = *sp++ * mul;
           si32 v = ojph_round(t);
           v = t >= fl_low_lim ? v : s32_low_lim;
@@ -394,7 +351,27 @@ namespace ojph {
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void gen_irv_convert_to_float_nlt_type3(const line_buf *src_line,
+    void gen_irv_convert_to_integer(const line_buf *src_line,
+      line_buf *dst_line, ui32 dst_line_offset,
+      ui32 bit_depth, bool is_signed, ui32 width)
+    {
+      local_gen_irv_convert_to_integer<false>(src_line, dst_line,
+        dst_line_offset, bit_depth, is_signed, width);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void gen_irv_convert_to_integer_nlt_type3(const line_buf *src_line,
+      line_buf *dst_line, ui32 dst_line_offset,
+      ui32 bit_depth, bool is_signed, ui32 width)
+    {
+      local_gen_irv_convert_to_integer<true>(src_line, dst_line,
+        dst_line_offset, bit_depth, is_signed, width);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    template<bool NLT_TYPE3>
+    static inline
+    void local_gen_irv_convert_to_float(const line_buf *src_line,
       ui32 src_line_offset, line_buf *dst_line,
       ui32 bit_depth, bool is_signed, ui32 width)
     {
@@ -411,16 +388,17 @@ namespace ojph {
       if (is_signed)
       {
         si32 bias = (si32)((ui32)INT_MIN + 1);
-        for (ui32 i = width; i > 0; --i) {
+        for (int i = width; i > 0; --i) {
           si32 v = *sp++ << shift;
-          v = (v >= 0) ? v : (- v - bias);
+          if (NLT_TYPE3)
+            v = (v >= 0) ? v : (- v - bias);
           *dp++ = (float)v * mul;
         }
       }
       else
       {
         const ui32 half = (ui32)INT_MIN;
-        for (ui32 i = width; i > 0; --i) {
+        for (int i = width; i > 0; --i) {
           ui32 v = (ui32)*sp++;
           v <<= shift;
           v -= half;
@@ -429,6 +407,24 @@ namespace ojph {
       }
     }
 
+    //////////////////////////////////////////////////////////////////////////
+    void gen_irv_convert_to_float(const line_buf *src_line,
+      ui32 src_line_offset, line_buf *dst_line,
+      ui32 bit_depth, bool is_signed, ui32 width)
+    {
+      local_gen_irv_convert_to_float<false>(src_line, src_line_offset,
+        dst_line, bit_depth, is_signed, width);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void gen_irv_convert_to_float_nlt_type3(const line_buf *src_line,
+      ui32 src_line_offset, line_buf *dst_line,
+      ui32 bit_depth, bool is_signed, ui32 width)
+    {
+      local_gen_irv_convert_to_float<true>(src_line, src_line_offset,
+        dst_line, bit_depth, is_signed, width);
+    }
+
     //////////////////////////////////////////////////////////////////////////
     void gen_rct_forward(
       const line_buf *r, const line_buf *g, const line_buf *b,
diff --git a/src/core/transform/ojph_colour.h b/src/core/transform/ojph_colour.h
index d5375a97..b0b5da61 100644
--- a/src/core/transform/ojph_colour.h
+++ b/src/core/transform/ojph_colour.h
@@ -61,24 +61,14 @@ namespace ojph {
      line_buf *dst_line, const ui32 dst_line_offset, 
      si64 shift, ui32 width);
 
-  ////////////////////////////////////////////////////////////////////////////
-  extern void (*cnvrt_si32_to_float_shftd)
-    (const si32 *sp, float *dp, float mul, ui32 width);
-
-  ////////////////////////////////////////////////////////////////////////////
-  extern void (*cnvrt_si32_to_float)
-    (const si32 *sp, float *dp, float mul, ui32 width);
 
   ////////////////////////////////////////////////////////////////////////////
-  extern void (*cnvrt_float_to_si32_shftd)
-    (const float *sp, si32 *dp, float mul, ui32 width);
-
-  ////////////////////////////////////////////////////////////////////////////
-  extern void (*cnvrt_float_to_si32)
-    (const float *sp, si32 *dp, float mul, ui32 width);
+  extern void (*irv_convert_to_integer) (
+    const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset,
+    ui32 bit_depth, bool is_signed, ui32 width);
 
   ////////////////////////////////////////////////////////////////////////////
-  extern void (*irv_convert_to_float_nlt_type3) (
+  extern void (*irv_convert_to_float) (
     const line_buf *src_line, ui32 src_line_offset,
     line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width);
 
@@ -87,6 +77,11 @@ namespace ojph {
     const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset,
     ui32 bit_depth, bool is_signed, ui32 width);
 
+  ////////////////////////////////////////////////////////////////////////////
+  extern void (*irv_convert_to_float_nlt_type3) (
+    const line_buf *src_line, ui32 src_line_offset,
+    line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width);
+
   ////////////////////////////////////////////////////////////////////////////
   extern void (*rct_forward)
     (const line_buf *r, const line_buf *g, const line_buf *b,
diff --git a/src/core/transform/ojph_colour_avx.cpp b/src/core/transform/ojph_colour_avx.cpp
index 27e78e5c..f6a714d2 100644
--- a/src/core/transform/ojph_colour_avx.cpp
+++ b/src/core/transform/ojph_colour_avx.cpp
@@ -47,66 +47,6 @@
 namespace ojph {
   namespace local {
 
-    //////////////////////////////////////////////////////////////////////////
-    void avx_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul,
-                                       ui32 width)
-    {
-      __m256 shift = _mm256_set1_ps(0.5f);
-      __m256 m = _mm256_set1_ps(mul);
-      for (int i = (width + 7) >> 3; i > 0; --i, sp+=8, dp+=8)
-      {
-        __m256i t = _mm256_loadu_si256((__m256i*)sp);
-        __m256 s = _mm256_cvtepi32_ps(t);
-        s = _mm256_mul_ps(s, m);
-        s = _mm256_sub_ps(s, shift);
-        _mm256_store_ps(dp, s);
-      }
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    void avx_cnvrt_si32_to_float(const si32 *sp, float *dp, float mul,
-                                 ui32 width)
-    {
-      __m256 m = _mm256_set1_ps(mul);
-      for (int i = (width + 7) >> 3; i > 0; --i, sp+=8, dp+=8)
-      {
-        __m256i t = _mm256_loadu_si256((__m256i*)sp);
-        __m256 s = _mm256_cvtepi32_ps(t);
-        s = _mm256_mul_ps(s, m);
-        _mm256_store_ps(dp, s);
-      }
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    void avx_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul,
-                                       ui32 width)
-    {
-      __m256 shift = _mm256_set1_ps(0.5f);
-      __m256 m = _mm256_set1_ps(mul);
-      for (int i = (width + 7) >> 3; i > 0; --i, sp+=8, dp+=8)
-      {
-        __m256 t = _mm256_load_ps(sp);
-        __m256 s = _mm256_add_ps(t, shift);
-        s = _mm256_mul_ps(s, m);
-        s = _mm256_round_ps(s, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
-        _mm256_storeu_si256((__m256i*)dp, _mm256_cvtps_epi32(s));
-      }
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    void avx_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul,
-                                 ui32 width)
-    {
-      __m256 m = _mm256_set1_ps(mul);
-      for (int i = (width + 7) >> 3; i > 0; --i, sp+=8, dp+=8)
-      {
-        __m256 t = _mm256_load_ps(sp);
-        __m256 s = _mm256_mul_ps(t, m);
-        s = _mm256_round_ps(s, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
-        _mm256_storeu_si256((__m256i*)dp, _mm256_cvtps_epi32(s));
-      }
-    }
-
     //////////////////////////////////////////////////////////////////////////
     void avx_ict_forward(const float *r, const float *g, const float *b,
                          float *y, float *cb, float *cr, ui32 repeat)
diff --git a/src/core/transform/ojph_colour_avx2.cpp b/src/core/transform/ojph_colour_avx2.cpp
index cb2bf000..33969f1f 100644
--- a/src/core/transform/ojph_colour_avx2.cpp
+++ b/src/core/transform/ojph_colour_avx2.cpp
@@ -263,7 +263,9 @@ namespace ojph {
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void avx2_irv_convert_to_integer_nlt_type3(const line_buf *src_line,
+    template<bool NLT_TYPE3>
+    static inline
+    void local_avx2_irv_convert_to_integer(const line_buf *src_line,
       line_buf *dst_line, ui32 dst_line_offset,
       ui32 bit_depth, bool is_signed, ui32 width)
     {
@@ -292,24 +294,27 @@ namespace ojph {
       {
         __m256i zero = _mm256_setzero_si256();
         __m256i bias = _mm256_set1_epi32(-((1 << (bit_depth - 1)) + 1));
-        for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) {
+        for (int i = width; i > 0; i -= 8, sp += 8, dp += 8) {
           __m256 t = _mm256_loadu_ps(sp);
           t = _mm256_mul_ps(t, mul);
           __m256i u = _mm256_cvtps_epi32(t);
           u = ojph_mm256_max_ge_epi32(u, s32_low_lim, t, fl_low_lim);
           u = ojph_mm256_min_lt_epi32(u,  s32_up_lim, t,  fl_up_lim);
-          __m256i c = _mm256_cmpgt_epi32(zero, u); //0xFFFFFFFF for -ve value
-          __m256i neg = _mm256_sub_epi32(bias, u); //-bias -value
-          neg = _mm256_and_si256(c, neg);          //keep only - bias - value
-          __m256i v = _mm256_andnot_si256(c, u);   //keep only +ve or 0
-          v = _mm256_or_si256(neg, v);             //combine
-          _mm256_storeu_si256((__m256i*)dp, v);
+          if (NLT_TYPE3)
+          {
+            __m256i c = _mm256_cmpgt_epi32(zero, u); // 0xFFFFFFFF for -ve val
+            __m256i neg = _mm256_sub_epi32(bias, u); // -bias -value
+            neg = _mm256_and_si256(c, neg);          // keep only - bias - val
+            u = _mm256_andnot_si256(c, u);           // keep only +ve or 0
+            u = _mm256_or_si256(neg, u);             // combine
+          }
+          _mm256_storeu_si256((__m256i*)dp, u);
         }
       }
       else
       {
-        __m256i half = _mm256_set1_epi32(-(1 << (bit_depth - 1)));
-        for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) {
+        __m256i half = _mm256_set1_epi32(1 << (bit_depth - 1));
+        for (int i = width; i > 0; i -= 8, sp += 8, dp += 8) {
           __m256 t = _mm256_loadu_ps(sp);
           t = _mm256_mul_ps(t, mul);
           __m256i u = _mm256_cvtps_epi32(t);
@@ -322,7 +327,27 @@ namespace ojph {
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void avx2_irv_convert_to_float_nlt_type3(const line_buf *src_line,
+    void avx2_irv_convert_to_integer(const line_buf *src_line,
+      line_buf *dst_line, ui32 dst_line_offset,
+      ui32 bit_depth, bool is_signed, ui32 width)
+    { // NLT_TYPE3 = false: the signed-path negative-value remap is skipped
+      local_avx2_irv_convert_to_integer<false>(src_line, dst_line, 
+        dst_line_offset, bit_depth, is_signed, width);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void avx2_irv_convert_to_integer_nlt_type3(const line_buf *src_line,
+      line_buf *dst_line, ui32 dst_line_offset,
+      ui32 bit_depth, bool is_signed, ui32 width)
+    { // NLT_TYPE3 = true: the signed-path negative-value remap is applied
+      local_avx2_irv_convert_to_integer<true>(src_line, dst_line, 
+        dst_line_offset, bit_depth, is_signed, width);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    template<bool NLT_TYPE3>
+    static inline    
+    void local_avx2_irv_convert_to_float(const line_buf *src_line,
       ui32 src_line_offset, line_buf *dst_line,
       ui32 bit_depth, bool is_signed, ui32 width)
     {
@@ -340,14 +365,17 @@ namespace ojph {
       {
         __m256i zero = _mm256_setzero_si256();
         __m256i bias = _mm256_set1_epi32(-(si32)((ui32)INT_MIN + 1));
-        for (ui32 i = width; i > 0; i -= 8, sp += 8, dp += 8) {
+        for (int i = width; i > 0; i -= 8, sp += 8, dp += 8) {
           __m256i t = _mm256_loadu_si256((__m256i*)sp);
           __m256i u = _mm256_slli_epi32(t, shift);
-          __m256i c = _mm256_cmpgt_epi32(zero, u); // 0xFFFFFFFF for -ve value
-          __m256i neg = _mm256_sub_epi32(bias, u); // - bias - value
-          neg = _mm256_and_si256(c, neg);          // keep only - bias - value
-          t = _mm256_andnot_si256(c, u);           // keep only +ve or 0
-          u = _mm256_or_si256(neg, t);             // combine
+          if (NLT_TYPE3)
+          {          
+            __m256i c = _mm256_cmpgt_epi32(zero, u); // 0xFFFFFFFF for -ve val
+            __m256i neg = _mm256_sub_epi32(bias, u); // - bias - value
+            neg = _mm256_and_si256(c, neg);          // keep only - bias - val
+            t = _mm256_andnot_si256(c, u);           // keep only +ve or 0
+            u = _mm256_or_si256(neg, t);             // combine
+          }
           __m256 v = _mm256_cvtepi32_ps(u);
           v = _mm256_mul_ps(v, mul);
           _mm256_storeu_ps(dp, v);
@@ -356,7 +384,7 @@ namespace ojph {
       else
       {
         __m256i half = _mm256_set1_epi32(INT_MIN);
-        for (ui32 i = width; i > 0; i -= 8, sp += 8, dp += 8) {
+        for (int i = width; i > 0; i -= 8, sp += 8, dp += 8) {
           __m256i t = _mm256_loadu_si256((__m256i*)sp);
           t = _mm256_slli_epi32(t, shift);
           t = _mm256_sub_epi32(t, half);
@@ -367,6 +395,25 @@ namespace ojph {
       }
     }
 
+    //////////////////////////////////////////////////////////////////////////
+    void avx2_irv_convert_to_float(const line_buf *src_line,
+      ui32 src_line_offset, line_buf *dst_line,
+      ui32 bit_depth, bool is_signed, ui32 width)
+    { // NLT_TYPE3 = false: the signed-path negative-value remap is skipped
+      local_avx2_irv_convert_to_float<false>(src_line, src_line_offset,
+        dst_line, bit_depth, is_signed, width);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void avx2_irv_convert_to_float_nlt_type3(const line_buf *src_line,
+      ui32 src_line_offset, line_buf *dst_line,
+      ui32 bit_depth, bool is_signed, ui32 width)
+    { // NLT_TYPE3 = true: the signed-path negative-value remap is applied
+      local_avx2_irv_convert_to_float<true>(src_line, src_line_offset,
+        dst_line, bit_depth, is_signed, width);
+    }
+
+
     //////////////////////////////////////////////////////////////////////////
     void avx2_rct_forward(const line_buf *r,
                           const line_buf *g,
diff --git a/src/core/transform/ojph_colour_local.h b/src/core/transform/ojph_colour_local.h
index 5f28685a..a85bf8bd 100644
--- a/src/core/transform/ojph_colour_local.h
+++ b/src/core/transform/ojph_colour_local.h
@@ -77,20 +77,14 @@ namespace ojph {
       si64 shift, ui32 width);
 
     //////////////////////////////////////////////////////////////////////////
-    void gen_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul,
-                                       ui32 width);
-
-    //////////////////////////////////////////////////////////////////////////
-    void gen_cnvrt_si32_to_float(const si32 *sp, float *dp, float mul,
-                                 ui32 width);
-
-    //////////////////////////////////////////////////////////////////////////
-    void gen_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul,
-                                       ui32 width);
+    void gen_irv_convert_to_float(
+      const line_buf *src_line, ui32 src_line_offset,
+      line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width);
 
     //////////////////////////////////////////////////////////////////////////
-    void gen_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul,
-                                 ui32 width);
+    void gen_irv_convert_to_integer(
+      const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset,
+      ui32 bit_depth, bool is_signed, ui32 width);
 
     //////////////////////////////////////////////////////////////////////////
     void gen_irv_convert_to_float_nlt_type3(
@@ -128,22 +122,6 @@ namespace ojph {
     //
     //////////////////////////////////////////////////////////////////////////
 
-    //////////////////////////////////////////////////////////////////////////
-    void sse_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul,
-                                       ui32 width);
-
-    //////////////////////////////////////////////////////////////////////////
-    void sse_cnvrt_si32_to_float(const si32 *sp, float *dp, float mul,
-                                 ui32 width);
-
-    //////////////////////////////////////////////////////////////////////////
-    void sse_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul,
-                                       ui32 width);
-
-    //////////////////////////////////////////////////////////////////////////
-    void sse_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul,
-                                 ui32 width);
-
     //////////////////////////////////////////////////////////////////////////
     void sse_ict_forward(const float *r, const float *g, const float *b,
                          float *y, float *cb, float *cr, ui32 repeat);
@@ -161,12 +139,9 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
 
     //////////////////////////////////////////////////////////////////////////
-    void sse2_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul,
-                                        ui32 width);
-
-    //////////////////////////////////////////////////////////////////////////
-    void sse2_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul,
-                                  ui32 width);
+    void sse2_irv_convert_to_integer(
+      const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset,
+      ui32 bit_depth, bool is_signed, ui32 width);
 
     //////////////////////////////////////////////////////////////////////////
     void sse2_irv_convert_to_integer_nlt_type3(
@@ -193,6 +168,11 @@ namespace ojph {
       line_buf *dst_line, const ui32 dst_line_offset, 
       si64 shift, ui32 width);
 
+    //////////////////////////////////////////////////////////////////////////
+    void sse2_irv_convert_to_float(
+      const line_buf *src_line, ui32 src_line_offset,
+      line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width);
+
     //////////////////////////////////////////////////////////////////////////
     void sse2_irv_convert_to_float_nlt_type3(
       const line_buf *src_line, ui32 src_line_offset,
@@ -216,22 +196,6 @@ namespace ojph {
     //
     //////////////////////////////////////////////////////////////////////////
 
-    //////////////////////////////////////////////////////////////////////////
-    void avx_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul,
-                                       ui32 width);
-
-    //////////////////////////////////////////////////////////////////////////
-    void avx_cnvrt_si32_to_float(const si32 *sp, float *dp, float mul,
-                                 ui32 width);
-
-    //////////////////////////////////////////////////////////////////////////
-    void avx_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul,
-                                       ui32 width);
-
-    //////////////////////////////////////////////////////////////////////////
-    void avx_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul,
-                                 ui32 width);
-
     //////////////////////////////////////////////////////////////////////////
     void avx_ict_forward(const float *r, const float *g, const float *b,
                          float *y, float *cb, float *cr, ui32 repeat);
@@ -260,6 +224,16 @@ namespace ojph {
       line_buf *dst_line, const ui32 dst_line_offset, 
       si64 shift, ui32 width);
 
+    //////////////////////////////////////////////////////////////////////////
+    void avx2_irv_convert_to_integer(
+      const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset,
+      ui32 bit_depth, bool is_signed, ui32 width);
+
+    //////////////////////////////////////////////////////////////////////////
+    void avx2_irv_convert_to_float(
+      const line_buf *src_line, ui32 src_line_offset,
+      line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width);
+
     //////////////////////////////////////////////////////////////////////////
     void avx2_irv_convert_to_integer_nlt_type3(
       const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset,
@@ -289,20 +263,14 @@ namespace ojph {
     //////////////////////////////////////////////////////////////////////////
 
     //////////////////////////////////////////////////////////////////////////
-    void wasm_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul,
-                                        ui32 width);
-
-    //////////////////////////////////////////////////////////////////////////
-    void wasm_cnvrt_si32_to_float(const si32 *sp, float *dp, float mul,
-                                  ui32 width);
-
-    //////////////////////////////////////////////////////////////////////////
-    void wasm_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul,
-                                        ui32 width);
+    void wasm_irv_convert_to_integer(
+      const line_buf *src_line, line_buf *dst_line, ui32 dst_line_offset,
+      ui32 bit_depth, bool is_signed, ui32 width);
 
     //////////////////////////////////////////////////////////////////////////
-    void wasm_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul,
-                                  ui32 width);
+    void wasm_irv_convert_to_float(
+      const line_buf *src_line, ui32 src_line_offset,
+      line_buf *dst_line, ui32 bit_depth, bool is_signed, ui32 width);
 
     //////////////////////////////////////////////////////////////////////////
     void wasm_rev_convert(
diff --git a/src/core/transform/ojph_colour_sse.cpp b/src/core/transform/ojph_colour_sse.cpp
index edd1eaf2..ce61bd89 100644
--- a/src/core/transform/ojph_colour_sse.cpp
+++ b/src/core/transform/ojph_colour_sse.cpp
@@ -47,88 +47,6 @@
 namespace ojph {
   namespace local {
 
-    //////////////////////////////////////////////////////////////////////////
-    void sse_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul,
-                                       ui32 width)
-    {
-      __m128 shift = _mm_set1_ps(0.5f);
-      __m128 m = _mm_set1_ps(mul);
-      for (ui32 i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
-      {
-        __m128i t = _mm_castps_si128(_mm_loadu_ps((float*)sp));
-        __m128 s = _mm_cvtepi32_ps(t);
-        s = _mm_mul_ps(s, m);
-        s = _mm_sub_ps(s, shift);
-        _mm_store_ps(dp, s);
-      }
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    void sse_cnvrt_si32_to_float(const si32 *sp, float *dp, float mul,
-                                 ui32 width)
-    {
-      __m128 m = _mm_set1_ps(mul);
-      for (ui32 i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
-      {
-        __m128i t = _mm_castps_si128(_mm_loadu_ps((float*)sp));
-        __m128 s = _mm_cvtepi32_ps(t);
-        s = _mm_mul_ps(s, m);
-        _mm_store_ps(dp, s);
-      }
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    void sse_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul,
-                                       ui32 width)
-    {
-      uint32_t rounding_mode = _MM_GET_ROUNDING_MODE();
-      _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
-      __m128 shift = _mm_set1_ps(0.5f);
-      __m128 m = _mm_set1_ps(mul);
-      for (ui32 i = (width + 3) >> 2; i > 0; --i, sp+=4)
-      {
-        __m128 t = _mm_load_ps(sp);
-        __m128 s = _mm_add_ps(t, shift);
-        s = _mm_mul_ps(s, m);
-        // the following is a poorly designed code, but it is the only
-        // code that I am aware of that compiles on VS 32 and 64 modes
-        t = s;
-        *dp++ = _mm_cvtss_si32(t); 
-        t = _mm_shuffle_ps(s, s, 1);
-        *dp++ = _mm_cvtss_si32(t); 
-        t = _mm_shuffle_ps(s, s, 2);
-        *dp++ = _mm_cvtss_si32(t); 
-        t = _mm_shuffle_ps(s, s, 3);
-        *dp++ = _mm_cvtss_si32(t);
-      }
-      _MM_SET_ROUNDING_MODE(rounding_mode);
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    void sse_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul,
-                                 ui32 width)
-    {
-      uint32_t rounding_mode = _MM_GET_ROUNDING_MODE();
-      _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
-      __m128 m = _mm_set1_ps(mul);
-      for (ui32 i = (width + 3) >> 2; i > 0; --i, sp+=4)
-      {
-        __m128 t = _mm_load_ps(sp);
-        __m128 s = _mm_mul_ps(t, m);
-        // the following is a poorly designed code, but it is the only
-        // code that I am aware of that compiles on VS 32 and 64 modes
-        t = s;
-        *dp++ = _mm_cvtss_si32(t);
-        t = _mm_shuffle_ps(s, s, 1);
-        *dp++ = _mm_cvtss_si32(t);
-        t = _mm_shuffle_ps(s, s, 2);
-        *dp++ = _mm_cvtss_si32(t);
-        t = _mm_shuffle_ps(s, s, 3);
-        *dp++ = _mm_cvtss_si32(t);
-      }
-      _MM_SET_ROUNDING_MODE(rounding_mode);
-    }
-
     //////////////////////////////////////////////////////////////////////////
     void sse_ict_forward(const float *r, const float *g, const float *b,
                          float *y, float *cb, float *cr, ui32 repeat)
diff --git a/src/core/transform/ojph_colour_sse2.cpp b/src/core/transform/ojph_colour_sse2.cpp
index f1a95447..96eeb34f 100644
--- a/src/core/transform/ojph_colour_sse2.cpp
+++ b/src/core/transform/ojph_colour_sse2.cpp
@@ -127,7 +127,9 @@ namespace ojph {
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void sse2_irv_convert_to_integer_nlt_type3(const line_buf *src_line,
+    template <bool NLT_TYPE3>
+    static inline
+    void local_sse2_irv_convert_to_integer(const line_buf *src_line,
       line_buf *dst_line, ui32 dst_line_offset,
       ui32 bit_depth, bool is_signed, ui32 width)
     {
@@ -159,24 +161,27 @@ namespace ojph {
       {
         __m128i zero = _mm_setzero_si128();
         __m128i bias = _mm_set1_epi32(-((1 << (bit_depth - 1)) + 1));
-        for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) {
+        for (int i = width; i > 0; i -= 4, sp += 4, dp += 4) {
           __m128 t = _mm_loadu_ps(sp);
           t = _mm_mul_ps(t, mul);
           __m128i u = _mm_cvtps_epi32(t);
           u = ojph_mm_max_ge_epi32(u, s32_low_lim, t, fl_low_lim);
           u = ojph_mm_min_lt_epi32(u, s32_up_lim, t, fl_up_lim);
-          __m128i c = _mm_cmpgt_epi32(zero, u); //0xFFFFFFFF for -ve value
-          __m128i neg = _mm_sub_epi32(bias, u); //-bias -value
-          neg = _mm_and_si128(c, neg);          //keep only - bias - value
-          __m128i v = _mm_andnot_si128(c, u);   //keep only +ve or 0
-          v = _mm_or_si128(neg, v);             //combine
-          _mm_storeu_si128((__m128i*)dp, v);
+          if (NLT_TYPE3)
+          {
+            __m128i c = _mm_cmpgt_epi32(zero, u); //0xFFFFFFFF for -ve value
+            __m128i neg = _mm_sub_epi32(bias, u); //-bias -value
+            neg = _mm_and_si128(c, neg);          //keep only - bias - value
+            u = _mm_andnot_si128(c, u);           //keep only +ve or 0
+            u = _mm_or_si128(neg, u);             //combine
+          }
+          _mm_storeu_si128((__m128i*)dp, u);
         }
       }
       else
       {
-        __m128i half = _mm_set1_epi32(-(1 << (bit_depth - 1)));
-        for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) {
+        __m128i half = _mm_set1_epi32(1 << (bit_depth - 1));
+        for (int i = width; i > 0; i -= 4, sp += 4, dp += 4) {
           __m128 t = _mm_loadu_ps(sp);
           t = _mm_mul_ps(t, mul);
           __m128i u = _mm_cvtps_epi32(t);
@@ -190,6 +195,24 @@ namespace ojph {
       _MM_SET_ROUNDING_MODE(rounding_mode);
     }
 
+    //////////////////////////////////////////////////////////////////////////
+    void sse2_irv_convert_to_integer(const line_buf *src_line,
+      line_buf *dst_line, ui32 dst_line_offset,
+      ui32 bit_depth, bool is_signed, ui32 width)
+    { // NLT_TYPE3 = false: the signed-path negative-value remap is skipped
+      local_sse2_irv_convert_to_integer<false>(src_line, dst_line, 
+        dst_line_offset, bit_depth, is_signed, width);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void sse2_irv_convert_to_integer_nlt_type3(const line_buf *src_line,
+      line_buf *dst_line, ui32 dst_line_offset,
+      ui32 bit_depth, bool is_signed, ui32 width)
+    { // NLT_TYPE3 = true: the signed-path negative-value remap is applied
+      local_sse2_irv_convert_to_integer<true>(src_line, dst_line, 
+        dst_line_offset, bit_depth, is_signed, width);
+    }
+
     /////////////////////////////////////////////////////////////////////////
     // https://github.com/seung-lab/dijkstra3d/blob/master/libdivide.h
     static inline __m128i sse2_mm_srai_epi64(__m128i a, int amt, __m128i m)
@@ -392,7 +415,9 @@ namespace ojph {
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void sse2_irv_convert_to_float_nlt_type3(const line_buf *src_line,
+    template<bool NLT_TYPE3>
+    static inline
+    void local_sse2_irv_convert_to_float(const line_buf *src_line,
       ui32 src_line_offset, line_buf *dst_line,
       ui32 bit_depth, bool is_signed, ui32 width)
     {
@@ -411,14 +436,17 @@ namespace ojph {
       {
         __m128i zero = _mm_setzero_si128();
         __m128i bias = _mm_set1_epi32(-(si32)((ui32)INT_MIN + 1));
-        for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) {
+        for (int i = width; i > 0; i -= 4, sp += 4, dp += 4) {
           __m128i t = _mm_loadu_si128((__m128i*)sp);
           __m128i u = _mm_slli_epi32(t, shift);
-          __m128i c = _mm_cmplt_epi32(u, zero); // 0xFFFFFFFF for -ve value
-          __m128i neg = _mm_sub_epi32(bias, u); // - bias - value
-          neg = _mm_and_si128(c, neg);          // keep only - bias - value
-          t = _mm_andnot_si128(c, u);           // keep only +ve or 0
-          u = _mm_or_si128(neg, t);             // combine
+          if (NLT_TYPE3)
+          {
+            __m128i c = _mm_cmplt_epi32(u, zero); // 0xFFFFFFFF for -ve value
+            __m128i neg = _mm_sub_epi32(bias, u); // - bias - value
+            neg = _mm_and_si128(c, neg);          // keep only - bias - value
+            t = _mm_andnot_si128(c, u);           // keep only +ve or 0
+            u = _mm_or_si128(neg, t);             // combine
+          }
           __m128 v = _mm_cvtepi32_ps(u);
           v = _mm_mul_ps(v, mul);
           _mm_storeu_ps(dp, v);
@@ -427,7 +455,7 @@ namespace ojph {
       else
       {
         __m128i half = _mm_set1_epi32(INT_MIN);
-        for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) {
+        for (int i = width; i > 0; i -= 4, sp += 4, dp += 4) {
           __m128i t = _mm_loadu_si128((__m128i*)sp);
           t = _mm_slli_epi32(t, shift);
           t = _mm_sub_epi32(t, half);
@@ -438,6 +466,24 @@ namespace ojph {
       }
     }
 
+    //////////////////////////////////////////////////////////////////////////
+    void sse2_irv_convert_to_float(const line_buf *src_line,
+      ui32 src_line_offset, line_buf *dst_line,
+      ui32 bit_depth, bool is_signed, ui32 width)
+    { // NLT_TYPE3 = false: the signed-path negative-value remap is skipped
+      local_sse2_irv_convert_to_float<false>(src_line, src_line_offset,
+        dst_line, bit_depth, is_signed, width);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void sse2_irv_convert_to_float_nlt_type3(const line_buf *src_line,
+      ui32 src_line_offset, line_buf *dst_line,
+      ui32 bit_depth, bool is_signed, ui32 width)
+    { // NLT_TYPE3 = true: the signed-path negative-value remap is applied
+      local_sse2_irv_convert_to_float<true>(src_line, src_line_offset,
+        dst_line, bit_depth, is_signed, width);
+    }
+
     //////////////////////////////////////////////////////////////////////////
     void sse2_rct_forward(const line_buf *r,
                           const line_buf *g,
diff --git a/src/core/transform/ojph_colour_wasm.cpp b/src/core/transform/ojph_colour_wasm.cpp
index 05a06a26..548a2042 100644
--- a/src/core/transform/ojph_colour_wasm.cpp
+++ b/src/core/transform/ojph_colour_wasm.cpp
@@ -296,7 +296,9 @@ namespace ojph {
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void wasm_irv_convert_to_integer_nlt_type3(const line_buf *src_line,
+    template <bool NLT_TYPE3>
+    static inline
+    void local_wasm_irv_convert_to_integer(const line_buf *src_line,
       line_buf *dst_line, ui32 dst_line_offset,
       ui32 bit_depth, bool is_signed, ui32 width)
     {
@@ -326,26 +328,29 @@ namespace ojph {
         const v128_t zero = wasm_f32x4_splat(0.0f);
         const v128_t half = wasm_f32x4_splat(0.5f);
         v128_t bias = wasm_i32x4_splat(-((1 << (bit_depth - 1)) + 1));
-        for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) {
+        for (int i = width; i > 0; i -= 4, sp += 4, dp += 4) {
           v128_t t = wasm_v128_load(sp);
           t = wasm_f32x4_mul(t, mul);
           v128_t u = ojph_convert_float_to_i32(t, zero, half);
           u = ojph_wasm_i32x4_max_ge(u, s32_low_lim, t, fl_low_lim);
           u = ojph_wasm_i32x4_min_lt(u, s32_up_lim, t, fl_up_lim);
-          v128_t c = wasm_i32x4_gt(zero, u);    // 0xFFFFFFFF for -ve value
-          v128_t neg = wasm_i32x4_sub(bias, u); // -bias -value
-          neg = wasm_v128_and(c, neg);          // keep only - bias - value
-          v128_t v = wasm_v128_andnot(u, c);    // keep only +ve or 0
-          v = wasm_v128_or(neg, v);             // combine
-          wasm_v128_store(dp, v);
+          if (NLT_TYPE3)
+          {
+            v128_t c = wasm_i32x4_gt(zero, u);    // 0xFFFFFFFF for -ve value
+            v128_t neg = wasm_i32x4_sub(bias, u); // -bias -value
+            neg = wasm_v128_and(c, neg);          // keep only - bias - value
+            u = wasm_v128_andnot(u, c);           // keep only +ve or 0
+            u = wasm_v128_or(neg, u);             // combine
+          }
+          wasm_v128_store(dp, u);
         }
       }
       else
       {
         const v128_t zero = wasm_f32x4_splat(0.0f);
         const v128_t half = wasm_f32x4_splat(0.5f);
-        v128_t ihalf = wasm_i32x4_splat(-(1 << (bit_depth - 1)));
-        for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) {
+        v128_t ihalf = wasm_i32x4_splat(1 << (bit_depth - 1));
+        for (int i = width; i > 0; i -= 4, sp += 4, dp += 4) {
           v128_t t = wasm_v128_load(sp);
           t = wasm_f32x4_mul(t, mul);
           v128_t u = ojph_convert_float_to_i32(t, zero, half);
@@ -358,7 +363,27 @@ namespace ojph {
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void wasm_irv_convert_to_float_nlt_type3(const line_buf *src_line,
+    void wasm_irv_convert_to_integer(const line_buf *src_line,
+      line_buf *dst_line, ui32 dst_line_offset,
+      ui32 bit_depth, bool is_signed, ui32 width)
+    { // NLT_TYPE3 = false: the signed-path negative-value remap is skipped
+      local_wasm_irv_convert_to_integer<false>(src_line, dst_line, 
+        dst_line_offset, bit_depth, is_signed, width);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void wasm_irv_convert_to_integer_nlt_type3(const line_buf *src_line,
+      line_buf *dst_line, ui32 dst_line_offset,
+      ui32 bit_depth, bool is_signed, ui32 width)
+    { // NLT_TYPE3 = true: the signed-path negative-value remap is applied
+      local_wasm_irv_convert_to_integer<true>(src_line, dst_line, 
+        dst_line_offset, bit_depth, is_signed, width);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    template <bool NLT_TYPE3>
+    static inline
+    void local_wasm_irv_convert_to_float(const line_buf *src_line,
       ui32 src_line_offset, line_buf *dst_line,
       ui32 bit_depth, bool is_signed, ui32 width)
     {
@@ -376,14 +401,17 @@ namespace ojph {
       {
         v128_t zero = wasm_i32x4_splat(0);
         v128_t bias = wasm_i32x4_splat(-(si32)((ui32)INT_MIN + 1));
-        for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) {
+        for (int i = width; i > 0; i -= 4, sp += 4, dp += 4) {
           v128_t t = wasm_v128_load(sp);
           v128_t u = wasm_i32x4_shl(t, shift);
-          v128_t c = wasm_i32x4_lt(u, zero);    // 0xFFFFFFFF for -ve value
-          v128_t neg = wasm_i32x4_sub(bias, u); // - bias - value
-          neg = wasm_v128_and(c, neg);          // keep only - bias - value
-          t = wasm_v128_andnot(u, c);           // keep only +ve or 0
-          u = wasm_v128_or(neg, t);             // combine
+          if (NLT_TYPE3)
+          {
+            v128_t c = wasm_i32x4_lt(u, zero);    // 0xFFFFFFFF for -ve value
+            v128_t neg = wasm_i32x4_sub(bias, u); // - bias - value
+            neg = wasm_v128_and(c, neg);          // keep only - bias - value
+            t = wasm_v128_andnot(u, c);           // keep only +ve or 0
+            u = wasm_v128_or(neg, t);             // combine
+          }
           v128_t v = wasm_f32x4_convert_i32x4(u);
           v = wasm_f32x4_mul(v, mul);
           wasm_v128_store(dp, v);
@@ -392,7 +420,7 @@ namespace ojph {
       else
       {
         v128_t half = wasm_i32x4_splat(INT_MIN);
-        for (ui32 i = width; i > 0; i -= 4, sp += 4, dp += 4) {
+        for (int i = width; i > 0; i -= 4, sp += 4, dp += 4) {
           v128_t t = wasm_v128_load(sp);
           v128_t u = wasm_i32x4_shl(t, shift);
           u = wasm_i32x4_sub(u, half);
@@ -403,6 +431,24 @@ namespace ojph {
       }
     }
 
+    //////////////////////////////////////////////////////////////////////////
+    void wasm_irv_convert_to_float(const line_buf *src_line,
+      ui32 src_line_offset, line_buf *dst_line,
+      ui32 bit_depth, bool is_signed, ui32 width)
+    { // NLT_TYPE3 = false: the signed-path negative-value remap is skipped
+      local_wasm_irv_convert_to_float<false>(src_line, src_line_offset,
+        dst_line, bit_depth, is_signed, width);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    void wasm_irv_convert_to_float_nlt_type3(const line_buf *src_line,
+      ui32 src_line_offset, line_buf *dst_line,
+      ui32 bit_depth, bool is_signed, ui32 width)
+    { // NLT_TYPE3 = true: the signed-path negative-value remap is applied
+      local_wasm_irv_convert_to_float<true>(src_line, src_line_offset,
+        dst_line, bit_depth, is_signed, width);
+    }
+
     //////////////////////////////////////////////////////////////////////////
     void wasm_rct_forward(const line_buf *r,
                           const line_buf *g,

From e24c2cacc9d83fe54eeb6b270812758b55cf6cfb Mon Sep 17 00:00:00 2001
From: Aous Naman <aous@unsw.edu.au>
Date: Sun, 26 Jan 2025 11:36:23 +1100
Subject: [PATCH 16/20] Addressing Warnings.

---
 src/core/transform/ojph_colour.cpp      | 8 ++++----
 src/core/transform/ojph_colour_avx2.cpp | 8 ++++----
 src/core/transform/ojph_colour_sse2.cpp | 9 ++++-----
 src/core/transform/ojph_colour_wasm.cpp | 8 ++++----
 4 files changed, 16 insertions(+), 17 deletions(-)

diff --git a/src/core/transform/ojph_colour.cpp b/src/core/transform/ojph_colour.cpp
index 2c559ced..3e6307e7 100644
--- a/src/core/transform/ojph_colour.cpp
+++ b/src/core/transform/ojph_colour.cpp
@@ -327,7 +327,7 @@ namespace ojph {
       if (is_signed)
       {
         const si32 bias = (1 << (bit_depth - 1)) + 1;
-        for (int i = width; i > 0; --i) {
+        for (int i = (int)width; i > 0; --i) {
           float t = *sp++ * mul;
           si32 v = ojph_round(t);
           v = t >= fl_low_lim ? v : s32_low_lim;
@@ -340,7 +340,7 @@ namespace ojph {
       else
       {
         const si32 half = 1 << (bit_depth - 1);
-        for (int i = width; i > 0; --i) {
+        for (int i = (int)width; i > 0; --i) {
           float t = *sp++ * mul;
           si32 v = ojph_round(t);
           v = t >= fl_low_lim ? v : s32_low_lim;
@@ -388,7 +388,7 @@ namespace ojph {
       if (is_signed)
       {
         si32 bias = (si32)((ui32)INT_MIN + 1);
-        for (int i = width; i > 0; --i) {
+        for (int i = (int)width; i > 0; --i) {
           si32 v = *sp++ << shift;
           if (NLT_TYPE3)
             v = (v >= 0) ? v : (- v - bias);
@@ -398,7 +398,7 @@ namespace ojph {
       else
       {
         const ui32 half = (ui32)INT_MIN;
-        for (int i = width; i > 0; --i) {
+        for (int i = (int)width; i > 0; --i) {
           ui32 v = (ui32)*sp++;
           v <<= shift;
           v -= half;
diff --git a/src/core/transform/ojph_colour_avx2.cpp b/src/core/transform/ojph_colour_avx2.cpp
index 33969f1f..25b0858e 100644
--- a/src/core/transform/ojph_colour_avx2.cpp
+++ b/src/core/transform/ojph_colour_avx2.cpp
@@ -294,7 +294,7 @@ namespace ojph {
       {
         __m256i zero = _mm256_setzero_si256();
         __m256i bias = _mm256_set1_epi32(-((1 << (bit_depth - 1)) + 1));
-        for (int i = width; i > 0; i -= 8, sp += 8, dp += 8) {
+        for (int i = (int)width; i > 0; i -= 8, sp += 8, dp += 8) {
           __m256 t = _mm256_loadu_ps(sp);
           t = _mm256_mul_ps(t, mul);
           __m256i u = _mm256_cvtps_epi32(t);
@@ -314,7 +314,7 @@ namespace ojph {
       else
       {
         __m256i half = _mm256_set1_epi32(1 << (bit_depth - 1));
-        for (int i = width; i > 0; i -= 8, sp += 8, dp += 8) {
+        for (int i = (int)width; i > 0; i -= 8, sp += 8, dp += 8) {
           __m256 t = _mm256_loadu_ps(sp);
           t = _mm256_mul_ps(t, mul);
           __m256i u = _mm256_cvtps_epi32(t);
@@ -365,7 +365,7 @@ namespace ojph {
       {
         __m256i zero = _mm256_setzero_si256();
         __m256i bias = _mm256_set1_epi32(-(si32)((ui32)INT_MIN + 1));
-        for (int i = width; i > 0; i -= 8, sp += 8, dp += 8) {
+        for (int i = (int)width; i > 0; i -= 8, sp += 8, dp += 8) {
           __m256i t = _mm256_loadu_si256((__m256i*)sp);
           __m256i u = _mm256_slli_epi32(t, shift);
           if (NLT_TYPE3)
@@ -384,7 +384,7 @@ namespace ojph {
       else
       {
         __m256i half = _mm256_set1_epi32(INT_MIN);
-        for (int i = width; i > 0; i -= 8, sp += 8, dp += 8) {
+        for (int i = (int)width; i > 0; i -= 8, sp += 8, dp += 8) {
           __m256i t = _mm256_loadu_si256((__m256i*)sp);
           t = _mm256_slli_epi32(t, shift);
           t = _mm256_sub_epi32(t, half);
diff --git a/src/core/transform/ojph_colour_sse2.cpp b/src/core/transform/ojph_colour_sse2.cpp
index 96eeb34f..bb440978 100644
--- a/src/core/transform/ojph_colour_sse2.cpp
+++ b/src/core/transform/ojph_colour_sse2.cpp
@@ -161,7 +161,7 @@ namespace ojph {
       {
         __m128i zero = _mm_setzero_si128();
         __m128i bias = _mm_set1_epi32(-((1 << (bit_depth - 1)) + 1));
-        for (int i = width; i > 0; i -= 4, sp += 4, dp += 4) {
+        for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
           __m128 t = _mm_loadu_ps(sp);
           t = _mm_mul_ps(t, mul);
           __m128i u = _mm_cvtps_epi32(t);
@@ -181,7 +181,7 @@ namespace ojph {
       else
       {
         __m128i half = _mm_set1_epi32(1 << (bit_depth - 1));
-        for (int i = width; i > 0; i -= 4, sp += 4, dp += 4) {
+        for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
           __m128 t = _mm_loadu_ps(sp);
           t = _mm_mul_ps(t, mul);
           __m128i u = _mm_cvtps_epi32(t);
@@ -427,7 +427,6 @@ namespace ojph {
              (dst_line->flags & line_buf::LFT_INTEGER) == 0);
 
       __m128 mul = _mm_set1_ps((float)(1.0 / 65536.0 / 65536.0));
-      float mulf = (float)(1.0 / 65536.0 / 65536.0);
 
       const si32* sp = src_line->i32 + src_line_offset;
       float* dp = dst_line->f32;
@@ -436,7 +435,7 @@ namespace ojph {
       {
         __m128i zero = _mm_setzero_si128();
         __m128i bias = _mm_set1_epi32(-(si32)((ui32)INT_MIN + 1));
-        for (int i = width; i > 0; i -= 4, sp += 4, dp += 4) {
+        for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
           __m128i t = _mm_loadu_si128((__m128i*)sp);
           __m128i u = _mm_slli_epi32(t, shift);
           if (NLT_TYPE3)
@@ -455,7 +454,7 @@ namespace ojph {
       else
       {
         __m128i half = _mm_set1_epi32(INT_MIN);
-        for (int i = width; i > 0; i -= 4, sp += 4, dp += 4) {
+        for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
           __m128i t = _mm_loadu_si128((__m128i*)sp);
           t = _mm_slli_epi32(t, shift);
           t = _mm_sub_epi32(t, half);
diff --git a/src/core/transform/ojph_colour_wasm.cpp b/src/core/transform/ojph_colour_wasm.cpp
index 548a2042..c7118347 100644
--- a/src/core/transform/ojph_colour_wasm.cpp
+++ b/src/core/transform/ojph_colour_wasm.cpp
@@ -328,7 +328,7 @@ namespace ojph {
         const v128_t zero = wasm_f32x4_splat(0.0f);
         const v128_t half = wasm_f32x4_splat(0.5f);
         v128_t bias = wasm_i32x4_splat(-((1 << (bit_depth - 1)) + 1));
-        for (int i = width; i > 0; i -= 4, sp += 4, dp += 4) {
+        for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
           v128_t t = wasm_v128_load(sp);
           t = wasm_f32x4_mul(t, mul);
           v128_t u = ojph_convert_float_to_i32(t, zero, half);
@@ -350,7 +350,7 @@ namespace ojph {
         const v128_t zero = wasm_f32x4_splat(0.0f);
         const v128_t half = wasm_f32x4_splat(0.5f);
         v128_t ihalf = wasm_i32x4_splat(1 << (bit_depth - 1));
-        for (int i = width; i > 0; i -= 4, sp += 4, dp += 4) {
+        for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
           v128_t t = wasm_v128_load(sp);
           t = wasm_f32x4_mul(t, mul);
           v128_t u = ojph_convert_float_to_i32(t, zero, half);
@@ -401,7 +401,7 @@ namespace ojph {
       {
         v128_t zero = wasm_i32x4_splat(0);
         v128_t bias = wasm_i32x4_splat(-(si32)((ui32)INT_MIN + 1));
-        for (int i = width; i > 0; i -= 4, sp += 4, dp += 4) {
+        for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
           v128_t t = wasm_v128_load(sp);
           v128_t u = wasm_i32x4_shl(t, shift);
           if (NLT_TYPE3)
@@ -420,7 +420,7 @@ namespace ojph {
       else
       {
         v128_t half = wasm_i32x4_splat(INT_MIN);
-        for (int i = width; i > 0; i -= 4, sp += 4, dp += 4) {
+        for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
           v128_t t = wasm_v128_load(sp);
           v128_t u = wasm_i32x4_shl(t, shift);
           u = wasm_i32x4_sub(u, half);

From 42b2efdbad0fe043d374840a5062935c482a2726 Mon Sep 17 00:00:00 2001
From: Aous Naman <aous72@yahoo.com>
Date: Sun, 26 Jan 2025 12:19:41 +1100
Subject: [PATCH 17/20] WASM compilation fix.

---
 src/core/transform/ojph_colour_wasm.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/core/transform/ojph_colour_wasm.cpp b/src/core/transform/ojph_colour_wasm.cpp
index c7118347..d56ec95e 100644
--- a/src/core/transform/ojph_colour_wasm.cpp
+++ b/src/core/transform/ojph_colour_wasm.cpp
@@ -372,7 +372,7 @@ namespace ojph {
     }
 
     //////////////////////////////////////////////////////////////////////////
-    void sse2_irv_convert_to_integer_nlt_type3(const line_buf *src_line,
+    void wasm_irv_convert_to_integer_nlt_type3(const line_buf *src_line,
       line_buf *dst_line, ui32 dst_line_offset,
       ui32 bit_depth, bool is_signed, ui32 width)
     {

From e21bfd04a77f1aecee7642b3245e4f7c110e50e5 Mon Sep 17 00:00:00 2001
From: Aous Naman <aous72@yahoo.com>
Date: Mon, 27 Jan 2025 18:24:43 +1100
Subject: [PATCH 18/20] Improvement and bug fixes.

---
 src/core/transform/ojph_colour.cpp      |  19 +-
 src/core/transform/ojph_colour_avx2.cpp |  26 +-
 src/core/transform/ojph_colour_sse2.cpp |  24 +-
 src/core/transform/ojph_colour_wasm.cpp |  22 +-
 tests/mse_pae.cpp                       | 378 ++++++++++++++++++++++--
 5 files changed, 394 insertions(+), 75 deletions(-)

diff --git a/src/core/transform/ojph_colour.cpp b/src/core/transform/ojph_colour.cpp
index 3e6307e7..b3c05aea 100644
--- a/src/core/transform/ojph_colour.cpp
+++ b/src/core/transform/ojph_colour.cpp
@@ -326,7 +326,7 @@ namespace ojph {
 
       if (is_signed)
       {
-        const si32 bias = (1 << (bit_depth - 1)) + 1;
+        const si32 bias = (si32)((1ULL << (bit_depth - 1)) + 1);
         for (int i = (int)width; i > 0; --i) {
           float t = *sp++ * mul;
           si32 v = ojph_round(t);
@@ -339,7 +339,7 @@ namespace ojph {
       }
       else
       {
-        const si32 half = 1 << (bit_depth - 1);
+        const si32 half = (si32)(1ULL << (bit_depth - 1));
         for (int i = (int)width; i > 0; --i) {
           float t = *sp++ * mul;
           si32 v = ojph_round(t);
@@ -380,16 +380,16 @@ namespace ojph {
              (dst_line->flags & line_buf::LFT_32BIT) &&
              (dst_line->flags & line_buf::LFT_INTEGER) == 0);
 
-      float mul = (float)(1.0 / 65536.0 / 65536.0);
+      assert(bit_depth <= 32);
+      float mul = (float)(1.0 / (double)(1ULL << bit_depth));
 
       const si32* sp = src_line->i32 + src_line_offset;
       float* dp = dst_line->f32;
-      ui32 shift = 32 - bit_depth;
       if (is_signed)
       {
-        si32 bias = (si32)((ui32)INT_MIN + 1);
+        const si32 bias = (si32)((1ULL << (bit_depth - 1)) + 1);
         for (int i = (int)width; i > 0; --i) {
-          si32 v = *sp++ << shift;
+          si32 v = *sp++;
           if (NLT_TYPE3)
             v = (v >= 0) ? v : (- v - bias);
           *dp++ = (float)v * mul;
@@ -397,12 +397,11 @@ namespace ojph {
       }
       else
       {
-        const ui32 half = (ui32)INT_MIN;
+        const si32 half = (si32)(1ULL << (bit_depth - 1));
         for (int i = (int)width; i > 0; --i) {
-          ui32 v = (ui32)*sp++;
-          v <<= shift;
+          si32 v = *sp++;
           v -= half;
-          *dp++ = (float)(si32)v * mul;
+          *dp++ = (float)v * mul;
         }
       }
     }
diff --git a/src/core/transform/ojph_colour_avx2.cpp b/src/core/transform/ojph_colour_avx2.cpp
index 25b0858e..2283be57 100644
--- a/src/core/transform/ojph_colour_avx2.cpp
+++ b/src/core/transform/ojph_colour_avx2.cpp
@@ -293,7 +293,8 @@ namespace ojph {
       if (is_signed)
       {
         __m256i zero = _mm256_setzero_si256();
-        __m256i bias = _mm256_set1_epi32(-((1 << (bit_depth - 1)) + 1));
+        __m256i bias = 
+          _mm256_set1_epi32(-(si32)((1ULL << (bit_depth - 1)) + 1));
         for (int i = (int)width; i > 0; i -= 8, sp += 8, dp += 8) {
           __m256 t = _mm256_loadu_ps(sp);
           t = _mm256_mul_ps(t, mul);
@@ -313,7 +314,7 @@ namespace ojph {
       }
       else
       {
-        __m256i half = _mm256_set1_epi32(1 << (bit_depth - 1));
+        __m256i half = _mm256_set1_epi32((si32)(1ULL << (bit_depth - 1)));
         for (int i = (int)width; i > 0; i -= 8, sp += 8, dp += 8) {
           __m256 t = _mm256_loadu_ps(sp);
           t = _mm256_mul_ps(t, mul);
@@ -356,37 +357,36 @@ namespace ojph {
              (dst_line->flags & line_buf::LFT_32BIT) &&
              (dst_line->flags & line_buf::LFT_INTEGER) == 0);
 
-      __m256 mul = _mm256_set1_ps((float)(1.0 / 65536.0 / 65536.0));
+      assert(bit_depth <= 32);
+      __m256 mul = _mm256_set1_ps((float)(1.0 / (double)(1ULL << bit_depth)));
 
       const si32* sp = src_line->i32 + src_line_offset;
       float* dp = dst_line->f32;
-      si32 shift = 32 - (si32)bit_depth;
       if (is_signed)
       {
         __m256i zero = _mm256_setzero_si256();
-        __m256i bias = _mm256_set1_epi32(-(si32)((ui32)INT_MIN + 1));
+        __m256i bias = 
+          _mm256_set1_epi32(-(si32)((1ULL << (bit_depth - 1)) + 1));
         for (int i = (int)width; i > 0; i -= 8, sp += 8, dp += 8) {
           __m256i t = _mm256_loadu_si256((__m256i*)sp);
-          __m256i u = _mm256_slli_epi32(t, shift);
           if (NLT_TYPE3)
           {          
-            __m256i c = _mm256_cmpgt_epi32(zero, u); // 0xFFFFFFFF for -ve val
-            __m256i neg = _mm256_sub_epi32(bias, u); // - bias - value
+            __m256i c = _mm256_cmpgt_epi32(zero, t); // 0xFFFFFFFF for -ve val
+            __m256i neg = _mm256_sub_epi32(bias, t); // - bias - value
             neg = _mm256_and_si256(c, neg);          // keep only - bias - val
-            t = _mm256_andnot_si256(c, u);           // keep only +ve or 0
-            u = _mm256_or_si256(neg, t);             // combine
+            c = _mm256_andnot_si256(c, t);           // keep only +ve or 0
+            t = _mm256_or_si256(neg, c);             // combine
           }
-          __m256 v = _mm256_cvtepi32_ps(u);
+          __m256 v = _mm256_cvtepi32_ps(t);
           v = _mm256_mul_ps(v, mul);
           _mm256_storeu_ps(dp, v);
         }
       }
       else
       {
-        __m256i half = _mm256_set1_epi32(INT_MIN);
+        __m256i half = _mm256_set1_epi32((si32)(1ULL << (bit_depth - 1)));
         for (int i = (int)width; i > 0; i -= 8, sp += 8, dp += 8) {
           __m256i t = _mm256_loadu_si256((__m256i*)sp);
-          t = _mm256_slli_epi32(t, shift);
           t = _mm256_sub_epi32(t, half);
           __m256 v = _mm256_cvtepi32_ps(t);
           v = _mm256_mul_ps(v, mul);
diff --git a/src/core/transform/ojph_colour_sse2.cpp b/src/core/transform/ojph_colour_sse2.cpp
index bb440978..63401169 100644
--- a/src/core/transform/ojph_colour_sse2.cpp
+++ b/src/core/transform/ojph_colour_sse2.cpp
@@ -160,7 +160,7 @@ namespace ojph {
       if (is_signed)
       {
         __m128i zero = _mm_setzero_si128();
-        __m128i bias = _mm_set1_epi32(-((1 << (bit_depth - 1)) + 1));
+        __m128i bias = _mm_set1_epi32(-(si32)((1ULL << (bit_depth - 1)) + 1));
         for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
           __m128 t = _mm_loadu_ps(sp);
           t = _mm_mul_ps(t, mul);
@@ -180,7 +180,7 @@ namespace ojph {
       }
       else
       {
-        __m128i half = _mm_set1_epi32(1 << (bit_depth - 1));
+        __m128i half = _mm_set1_epi32((si32)(1ULL << (bit_depth - 1)));
         for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
           __m128 t = _mm_loadu_ps(sp);
           t = _mm_mul_ps(t, mul);
@@ -426,37 +426,35 @@ namespace ojph {
              (dst_line->flags & line_buf::LFT_32BIT) &&
              (dst_line->flags & line_buf::LFT_INTEGER) == 0);
 
-      __m128 mul = _mm_set1_ps((float)(1.0 / 65536.0 / 65536.0));
+      assert(bit_depth <= 32);
+      __m128 mul = _mm_set1_ps((float)(1.0 / (double)(1ULL << bit_depth)));
 
       const si32* sp = src_line->i32 + src_line_offset;
       float* dp = dst_line->f32;
-      si32 shift = 32 - (si32)bit_depth;
       if (is_signed)
       {
         __m128i zero = _mm_setzero_si128();
-        __m128i bias = _mm_set1_epi32(-(si32)((ui32)INT_MIN + 1));
+        __m128i bias = _mm_set1_epi32(-(si32)((1ULL << (bit_depth - 1)) + 1));
         for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
           __m128i t = _mm_loadu_si128((__m128i*)sp);
-          __m128i u = _mm_slli_epi32(t, shift);
           if (NLT_TYPE3)
           {
-            __m128i c = _mm_cmplt_epi32(u, zero); // 0xFFFFFFFF for -ve value
-            __m128i neg = _mm_sub_epi32(bias, u); // - bias - value
+            __m128i c = _mm_cmplt_epi32(t, zero); // 0xFFFFFFFF for -ve value
+            __m128i neg = _mm_sub_epi32(bias, t); // - bias - value
             neg = _mm_and_si128(c, neg);          // keep only - bias - value
-            t = _mm_andnot_si128(c, u);           // keep only +ve or 0
-            u = _mm_or_si128(neg, t);             // combine
+            c = _mm_andnot_si128(c, t);           // keep only +ve or 0
+            t = _mm_or_si128(neg, c);             // combine
           }
-          __m128 v = _mm_cvtepi32_ps(u);
+          __m128 v = _mm_cvtepi32_ps(t);
           v = _mm_mul_ps(v, mul);
           _mm_storeu_ps(dp, v);
         }
       }
       else
       {
-        __m128i half = _mm_set1_epi32(INT_MIN);
+        __m128i half = _mm_set1_epi32((si32)(1ULL << (bit_depth - 1)));
         for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
           __m128i t = _mm_loadu_si128((__m128i*)sp);
-          t = _mm_slli_epi32(t, shift);
           t = _mm_sub_epi32(t, half);
           __m128 v = _mm_cvtepi32_ps(t);
           v = _mm_mul_ps(v, mul);
diff --git a/src/core/transform/ojph_colour_wasm.cpp b/src/core/transform/ojph_colour_wasm.cpp
index d56ec95e..8f307b13 100644
--- a/src/core/transform/ojph_colour_wasm.cpp
+++ b/src/core/transform/ojph_colour_wasm.cpp
@@ -327,7 +327,7 @@ namespace ojph {
       {
         const v128_t zero = wasm_f32x4_splat(0.0f);
         const v128_t half = wasm_f32x4_splat(0.5f);
-        v128_t bias = wasm_i32x4_splat(-((1 << (bit_depth - 1)) + 1));
+        v128_t bias = wasm_i32x4_splat(-(si32)((1ULL << (bit_depth - 1)) + 1));
         for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
           v128_t t = wasm_v128_load(sp);
           t = wasm_f32x4_mul(t, mul);
@@ -349,7 +349,7 @@ namespace ojph {
       {
         const v128_t zero = wasm_f32x4_splat(0.0f);
         const v128_t half = wasm_f32x4_splat(0.5f);
-        v128_t ihalf = wasm_i32x4_splat(1 << (bit_depth - 1));
+        v128_t ihalf = wasm_i32x4_splat((si32)(1ULL << (bit_depth - 1)));
         for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
           v128_t t = wasm_v128_load(sp);
           t = wasm_f32x4_mul(t, mul);
@@ -392,25 +392,24 @@ namespace ojph {
              (dst_line->flags & line_buf::LFT_32BIT) &&
              (dst_line->flags & line_buf::LFT_INTEGER) == 0);
 
-      v128_t mul = wasm_f32x4_splat((float)(1.0 / 65536.0 / 65536.0));
+      assert(bit_depth <= 32);
+      v128_t mul = wasm_f32x4_splat((float)(1.0 / (double)(1ULL << bit_depth)));
 
       const si32* sp = src_line->i32 + src_line_offset;
       float* dp = dst_line->f32;
-      ui32 shift = (ui32)32 - bit_depth;
       if (is_signed)
       {
         v128_t zero = wasm_i32x4_splat(0);
-        v128_t bias = wasm_i32x4_splat(-(si32)((ui32)INT_MIN + 1));
+        v128_t bias = wasm_i32x4_splat(-(si32)((1ULL << (bit_depth - 1)) + 1));
         for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
           v128_t t = wasm_v128_load(sp);
-          v128_t u = wasm_i32x4_shl(t, shift);
           if (NLT_TYPE3)
           {
-            v128_t c = wasm_i32x4_lt(u, zero);    // 0xFFFFFFFF for -ve value
-            v128_t neg = wasm_i32x4_sub(bias, u); // - bias - value
+            v128_t c = wasm_i32x4_lt(t, zero);    // 0xFFFFFFFF for -ve value
+            v128_t neg = wasm_i32x4_sub(bias, t); // - bias - value
             neg = wasm_v128_and(c, neg);          // keep only - bias - value
-            t = wasm_v128_andnot(u, c);           // keep only +ve or 0
-            u = wasm_v128_or(neg, t);             // combine
+            c = wasm_v128_andnot(t, c);           // keep only +ve or 0
+            t = wasm_v128_or(neg, c);             // combine
           }
           v128_t v = wasm_f32x4_convert_i32x4(u);
           v = wasm_f32x4_mul(v, mul);
@@ -419,10 +418,9 @@ namespace ojph {
       }
       else
       {
-        v128_t half = wasm_i32x4_splat(INT_MIN);
+        v128_t half = wasm_i32x4_splat((si32)(1ULL << (bit_depth - 1)));
         for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
           v128_t t = wasm_v128_load(sp);
-          v128_t u = wasm_i32x4_shl(t, shift);
           u = wasm_i32x4_sub(u, half);
           v128_t v = wasm_f32x4_convert_i32x4(u);
           v = wasm_f32x4_mul(v, mul);
diff --git a/tests/mse_pae.cpp b/tests/mse_pae.cpp
index 84653399..8239b851 100644
--- a/tests/mse_pae.cpp
+++ b/tests/mse_pae.cpp
@@ -60,7 +60,8 @@ struct img_info {
     width = height = 0;
     comps[0] = comps[1] = comps[2] = 0;
     format = UNDEFINED;
-    max_val = 0;
+    bit_depth = 0;
+    is_signed = false;
   }
   ~img_info() {
     for (ui32 i = 0; i < num_comps; ++i)
@@ -70,15 +71,16 @@ struct img_info {
     }
   }
   
-  void init(ui32 num_comps, size_t width, size_t height, ui32 max_val,
-            ui32 format=FORMAT444)
+  void init(ui32 num_comps, size_t width, size_t height, ui32 bit_depth,
+            bool is_signed, ui32 format=FORMAT444)
   {
     assert(num_comps <= 3 && comps[0] == NULL);
     this->num_comps = num_comps;
     this->width = width;
     this->height = height;
     this->format = format;
-    this->max_val = max_val;
+    this->bit_depth = bit_depth;
+    this->is_signed = is_signed;
     for (ui32 i = 0; i < num_comps; ++i)
       switch (format)
       {
@@ -114,7 +116,8 @@ struct img_info {
   point downsampling[3];
   si32 *comps[3];
   ui32 format;
-  ui32 max_val;
+  ui32 bit_depth;
+  bool is_signed;
 };
 
 bool is_pnm(const char *filename)
@@ -137,7 +140,7 @@ void load_ppm(const char *filename, img_info& img)
   ui32 num_comps = ppm.get_num_components();
   size_t width = ppm.get_width();
   size_t height = ppm.get_height();
-  img.init(num_comps, width, height, ppm.get_max_val());
+  img.init(num_comps, width, height, ppm.get_bit_depth(0), false);
   
   width = calc_aligned_size<si32, byte_alignment>(width);
   si32 *buffer = new si32[width];
@@ -259,7 +262,7 @@ void load_yuv(const char *filename, img_info& img)
   yuv.set_img_props(s, num_comps, num_comps, downsampling);  
   yuv.open(name_buf);
   
-  img.init(num_comps, s.w, s.h, (1 << bit_depth) - 1, format);
+  img.init(num_comps, s.w, s.h, bit_depth, false, format);
   
   size_t w = calc_aligned_size<si32, byte_alignment>(s.w);
   si32 *buffer = new si32[w];
@@ -281,12 +284,238 @@ void load_yuv(const char *filename, img_info& img)
   delete[] buffer;
 }
 
+bool is_rawl(const char *filename)
+{ // true only if the 5 chars before the first ':' spell ".rawl" (any case)
+  const char *p = strchr(filename, ':'); // p is either NULL or pointing to ':'
+  if (p != NULL && p - filename >= 5 && p[-5] == '.' && 
+      toupper(p[-4]) == 'R' && toupper(p[-3])== 'A' && 
+      toupper(p[-2]) == 'W' && toupper(p[-1]) == 'L')
+    return true;
+  return false; // no ':' at all, or the text before it is not ".rawl"
+}
+
+void load_rawl(const char *filename, img_info& img)
+{  // parses name.rawl:WxHxBitsxSignedxComps, then loads the planes into img
+  const char *p = strchr(filename, ':'); // p is either NULL or pointing to ':'
+  const char *name_end = p;
+  if (p == NULL) {
+    printf("A .rawl that does not have the expected format, which is\n");
+    printf(".rawl:widthxheightxbitdepthxsignedxnum_comp\n");
+    exit(-1);
+  }
+  ojph::size s;
+  ++p;
+  s.w = (ui32)atoi(p);
+  p = strchr(p, 'x'); // p is either NULL or pointing to 'x'
+  if (p == NULL) {
+    printf("Expecting image height.\n");
+    printf("A .rawl that does not have the expected format, which is\n");
+    printf(".rawl:widthxheightxbitdepthxsignedxnum_comp\n");
+    exit(-1);
+  }
+  ++p;
+  s.h = (ui32)atoi(p);
+  p = strchr(p, 'x'); // p is either NULL or pointing to 'x'
+  if (p == NULL) {
+    printf("Expecting image bitdepth.\n");
+    printf("A .rawl that does not have the expected format, which is\n");
+    printf(".rawl:widthxheightxbitdepthxsignedxnum_comp\n");
+    exit(-1);
+  }
+  ++p;
+  ui32 bit_depth = (ui32)atoi(p);
+  p = strchr(p, 'x'); // p is either NULL or pointing to 'x'
+  if (p == NULL) {
+    printf("Expecting signedness information (either 0 or 1).\n");
+    printf("A .rawl that does not have the expected format, which is\n");
+    printf(".rawl:widthxheightxbitdepthxsignedxnum_comp, where num_comp is\n");
+    printf("either 1 or 3\n");
+    exit(-1);
+  }
+  ++p;
+  bool is_signed = *p != '0'; // anything but '0' is taken as signed
+  p = strchr(p, 'x'); // p is either NULL or pointing to 'x'
+  if (p == NULL) {
+    printf("Expecting number of components.\n");
+    printf("A .rawl that does not have the expected format, which is\n");
+    printf(".rawl:widthxheightxbitdepthxsignedxnum_comp, where num_comp is\n");
+    printf("either 1 or 3\n");
+    exit(-1);
+  }
+  ++p;
+  ui32 num_comps = (ui32)atoi(p);
+  if (num_comps != 1 && num_comps != 3)
+  {
+    printf("num_comp must be either 1 or 3, %s was supplied.\n", p);
+    printf("A .rawl that does not have the expected format, which is\n");
+    printf(".rawl:widthxheightxbitdepthxsignedxnum_comp, where format is\n");
+    printf("either 1 or 3\n");
+    exit(-1);
+  }
+
+  char name_buf[2048];
+  ptrdiff_t cpy_len = name_end - filename > 2047 ? 2047 : name_end - filename;
+  strncpy(name_buf, filename, (size_t)cpy_len);
+  name_buf[cpy_len] = 0;
+
+  size_t w = calc_aligned_size<si32, byte_alignment>(s.w);
+  if (num_comps == 3)
+    img.init(num_comps, s.w, s.h, bit_depth, is_signed, FORMAT444);
+  else
+    img.init(num_comps, s.w, s.h, bit_depth, is_signed, FORMAT400);
+
+  if (is_signed)
+  {
+    if (bit_depth <= 8) // one byte per sample
+    {
+      si8 *buffer = new si8[s.w *  s.h];      
+      FILE *f = fopen(name_buf, "rb");
+      if (f == NULL) {
+        printf("Error opening file %s\n", name_buf);
+        exit(-1);
+      }
+
+      for (ui32 i = 0; i < num_comps; ++i)
+      {
+        si8 *sp = buffer;
+        si32 *dp = img.comps[i];
+        if (fread(buffer, 1, s.w * s.h, f) != s.w * s.h) {
+          printf("Error reading from file %s\n", name_buf);
+          exit(-1);
+        }
+        for (ui32 j = s.w * s.h; j > 0; --j)
+          *dp++ = *sp++; // widened to si32 with sign extension
+      }
+      fclose(f);
+      delete[] buffer;
+    }
+    else if (bit_depth <= 16) // two bytes per sample
+    {
+      si16 *buffer = new si16[s.w *  s.h];      
+      FILE *f = fopen(name_buf, "rb");
+      if (f == NULL) {
+        printf("Error opening file %s\n", name_buf);
+        exit(-1);
+      }
+
+      for (ui32 i = 0; i < num_comps; ++i)
+      {
+        si16 *sp = buffer;
+        si32 *dp = img.comps[i];
+        if (fread(buffer, 2, s.w * s.h, f) != s.w * s.h) {
+          printf("Error reading from file %s\n", name_buf);
+          exit(-1);
+        }
+        for (ui32 j = s.w * s.h; j > 0; --j)
+          *dp++ = *sp++; // widened to si32 with sign extension
+      }
+      fclose(f);
+      delete[] buffer;
+    }
+    else // four bytes per sample
+    {
+      si32 *buffer = new si32[s.w *  s.h];      
+      FILE *f = fopen(name_buf, "rb");
+      if (f == NULL) {
+        printf("Error opening file %s\n", name_buf);
+        exit(-1);
+      }
+
+      for (ui32 i = 0; i < num_comps; ++i)
+      {
+        si32 *sp = buffer;
+        si32 *dp = img.comps[i];
+        if (fread(buffer, 4, s.w * s.h, f) != s.w * s.h) {
+          printf("Error reading from file %s\n", name_buf);
+          exit(-1);
+        }
+        for (ui32 j = s.w * s.h; j > 0; --j)
+          *dp++ = *sp++;
+      }
+      fclose(f);
+      delete[] buffer;
+    }
+  }
+  else
+  {
+    if (bit_depth <= 8) // one byte per sample
+    {
+      ui8 *buffer = new ui8[s.w *  s.h];      
+      FILE *f = fopen(name_buf, "rb");
+      if (f == NULL) {
+        printf("Error opening file %s\n", name_buf);
+        exit(-1);
+      }
+
+      for (ui32 i = 0; i < num_comps; ++i)
+      {
+        ui8 *sp = buffer;
+        si32 *dp = img.comps[i];
+        if (fread(buffer, 1, s.w * s.h, f) != s.w * s.h) {
+          printf("Error reading from file %s\n", name_buf);
+          exit(-1);
+        }
+        for (ui32 j = s.w * s.h; j > 0; --j)
+          *dp++ = *sp++; // widened to si32 without sign extension
+      }
+      fclose(f);
+      delete[] buffer;
+    }
+    else if (bit_depth <= 16) // two bytes per sample
+    {
+      ui16 *buffer = new ui16[s.w *  s.h];      
+      FILE *f = fopen(name_buf, "rb");
+      if (f == NULL) {
+        printf("Error opening file %s\n", name_buf);
+        exit(-1);
+      }
+
+      for (ui32 i = 0; i < num_comps; ++i)
+      {
+        ui16 *sp = buffer;
+        si32 *dp = img.comps[i];
+        if (fread(buffer, 2, s.w * s.h, f) != s.w * s.h) {
+          printf("Error reading from file %s\n", name_buf);
+          exit(-1);
+        }
+        for (ui32 j = s.w * s.h; j > 0; --j)
+          *dp++ = *sp++; // widened to si32 without sign extension
+      }
+      fclose(f);
+      delete[] buffer;
+    }
+    else // four bytes per sample
+    {
+      ui32 *buffer = new ui32[s.w *  s.h];      
+      FILE *f = fopen(name_buf, "rb");
+      if (f == NULL) {
+        printf("Error opening file %s\n", name_buf);
+        exit(-1);
+      }
+
+      for (ui32 i = 0; i < num_comps; ++i)
+      {
+        ui32 *sp = buffer;
+        si32 *dp = img.comps[i];
+        if (fread(buffer, 4, s.w * s.h, f) != s.w * s.h) {
+          printf("Error reading from file %s\n", name_buf);
+          exit(-1);
+        }
+        for (ui32 j = s.w * s.h; j > 0; --j)
+          *dp++ = (si32)*sp++; // stored as si32; same 32 bits, reinterpreted
+      }
+      fclose(f);
+      delete[] buffer;
+    }
+  }
+}
+
 void find_mse_pae(const img_info& img1, const img_info& img2, 
                   float mse[3], ui32 pae[3])
 {
   if (img1.num_comps != img2.num_comps || img1.format != img2.format ||
       img1.width != img2.width || img1.height != img2.height ||
-      img1.max_val != img2.max_val)
+      img1.bit_depth != img2.bit_depth || img1.is_signed != img2.is_signed)
   {
     printf("Error: mismatching images\n");
     exit(-1);
@@ -298,26 +527,99 @@ void find_mse_pae(const img_info& img1, const img_info& img2,
     h = (img1.height + img1.downsampling[c].x - 1) / img1.downsampling[c].x;
     double se = 0;
     ui32 lpae = 0;
-    for (ui32 v = 0; v < h; ++v)
-    {
-      si32 *p0 = img1.comps[c] + w * v;
-      si32 *p1 = img2.comps[c] + w * v;
-      for (ui32 s = 0; s < w; ++s)
+    if (img1.is_signed)
+      for (ui32 v = 0; v < h; ++v)
       {
-        si32 err = *p0++ - *p1++;
-        ui32 ae = (ui32)(err > 0 ? err : -err);
-        lpae = ae > lpae ? ae : lpae;
-        se += (double)err * (double)err;
+        si32 *p0 = img1.comps[c] + w * v;
+        si32 *p1 = img2.comps[c] + w * v;
+        for (ui32 s = 0; s < w; ++s)
+        {
+          si32 err = *p0++ - *p1++;
+          ui32 ae = (ui32)(err > 0 ? err : -err);
+          lpae = ae > lpae ? ae : lpae;
+          se += (double)err * (double)err;
+        }
+      }
+    else
+      for (ui32 v = 0; v < h; ++v)
+      {
+        ui32 *p0 = (ui32*)img1.comps[c] + w * v;
+        ui32 *p1 = (ui32*)img2.comps[c] + w * v;
+        for (ui32 s = 0; s < w; ++s)
+        {
+          ui32 a = *p0++;
+          ui32 b = *p1++;
+          ui32 err = a > b ? a - b : b - a;
+          lpae = err > lpae ? err : lpae;
+          se += (double)err * (double)err;
+        }
       }
-    }
     mse[c] = (float)se / (float)(w * h);
     pae[c] = lpae;
   }
-  // float t = 0;
-  // for (ui32 c = 0; c < img1.num_comps; ++c)
-  //   t += (float)mse[c];
-  // t /= (float)num_pixels;
-  // psnr = 10.0f * log10f((float)img1.max_val * (float)img1.max_val / t);
+}
+
+void find_nlt_mse_pae(const img_info& img1, const img_info& img2, 
+                      float mse[3], ui32 pae[3])
+{ // per-component MSE and peak abs error; signed samples are folded first
+  if (img1.num_comps != img2.num_comps || img1.format != img2.format ||
+      img1.width != img2.width || img1.height != img2.height ||
+      img1.bit_depth != img2.bit_depth || img1.is_signed != img2.is_signed)
+  {
+    printf("Error: mismatching images\n");
+    exit(-1);
+  }
+  if (img1.is_signed)
+    for (ui32 c = 0; c < img1.num_comps; ++c)
+    {
+      size_t w, h; // component size: ceil(image size / downsampling factor)
+      w = (img1.width + img1.downsampling[c].x - 1) / img1.downsampling[c].x;
+      h = (img1.height + img1.downsampling[c].y - 1) / img1.downsampling[c].y;
+      double se = 0; // running sum of squared errors
+      ui32 lpae = 0; // largest absolute error seen so far
+      si32 bias = (si32)((1ULL << (img1.bit_depth - 1)) + 1);
+      for (ui32 v = 0; v < h; ++v)
+      {
+        si32 *p0 = img1.comps[c] + w * v;
+        si32 *p1 = img2.comps[c] + w * v;
+        for (ui32 s = 0; s < w; ++s)
+        {
+          si32 a = *p0++;
+          si32 b = *p1++;
+          a = (a >= 0) ? a : (- a - bias); // fold negatives before comparing
+          b = (b >= 0) ? b : (- b - bias);
+          ui32 err = a > b ? a - b : b - a;
+          lpae = err > lpae ? err : lpae;
+          se += (double)err * (double)err;
+        }
+      }
+      mse[c] = (float)se / (float)(w * h);
+      pae[c] = lpae;
+    }
+  else
+    for (ui32 c = 0; c < img1.num_comps; ++c)
+    {
+      size_t w, h; // component size: ceil(image size / downsampling factor)
+      w = (img1.width + img1.downsampling[c].x - 1) / img1.downsampling[c].x;
+      h = (img1.height + img1.downsampling[c].y - 1) / img1.downsampling[c].y;
+      double se = 0; // running sum of squared errors
+      ui32 lpae = 0; // largest absolute error seen so far
+      for (ui32 v = 0; v < h; ++v)
+      {
+        ui32 *p0 = (ui32*)img1.comps[c] + w * v; // unsigned: compare as ui32
+        ui32 *p1 = (ui32*)img2.comps[c] + w * v;
+        for (ui32 s = 0; s < w; ++s)
+        {
+          ui32 a = *p0++;
+          ui32 b = *p1++;
+          ui32 err = a > b ? a - b : b - a;
+          lpae = err > lpae ? err : lpae;
+          se += (double)err * (double)err;
+        }
+      }
+      mse[c] = (float)se / (float)(w * h);
+      pae[c] = lpae;
+    }
+}
 
 int main(int argc, char *argv[])
@@ -325,20 +627,36 @@ int main(int argc, char *argv[])
   if (argc < 3)
   {
     printf("mse_pae expects two arguments <filename1, filename2>\n");
+    printf("A third optional argment is \"-nlt\".\n");
     exit(-1);
   }
-    
+
+  bool nlt = false;
+  if (argc == 4)
+  {
+    if (strcmp("-nlt", argv[3]) == 0)
+      nlt = true;
+    else {
+      printf("unknown 4th parameter %s\n", argv[3]);
+      exit(-1);      
+    }
+  }
+
+
   img_info img1, img2;
   try {
     if (is_pnm(argv[1]))
       load_ppm(argv[1], img1);
     else if (is_yuv(argv[1]))
       load_yuv(argv[1], img1);
+    else if (is_rawl(argv[1]))
+      load_rawl(argv[1], img1);
     else {
       printf("mse_pae does not know file format of %s\n", argv[1]);
       printf("or a .yuv that does not have the expected format, which is\n");
       printf(".yuv:widthxheightxbitdepthxformat, where format is\n");
-      printf("either 444, 422, or 420\n");
+      printf("either 444, 422, or 420, or wrongly format .rawl, which has\n");
+      printf(".rawl:widthxheightxbitdepthxsignedxnum_comp format.\n");
       exit(-1);  
     }
   }
@@ -355,11 +673,14 @@ int main(int argc, char *argv[])
       load_ppm(argv[2], img2);
     else if (is_yuv(argv[2]))
       load_yuv(argv[2], img2);
+    else if (is_rawl(argv[2]))
+      load_rawl(argv[2], img2);
     else {
       printf("mse_pae does not know file format of %s\n", argv[2]);
       printf("or a .yuv that does not have the expected format, which is\n");
       printf(".yuv:widthxheightxbitdepthxformat, where format is\n");
-      printf("either 444, 422, or 420\n");
+      printf("either 444, 422, or 420, or wrongly format .rawl, which has\n");
+      printf(".rawl:widthxheightxbitdepthxsignedxnum_comp format.\n");
       exit(-1);  
     }
   }
@@ -372,7 +693,10 @@ int main(int argc, char *argv[])
   }  
   
   float mse[3]; ui32 pae[3];
-  find_mse_pae(img1, img2, mse, pae);
+  if (!nlt)
+    find_mse_pae(img1, img2, mse, pae);
+  else
+    find_nlt_mse_pae(img1, img2, mse, pae);
   
   for (ui32 c = 0; c < img1.num_comps; ++c)
     printf("%f %d\n", mse[c], pae[c]);

From 9921864ac5b6b08f9b2e69d97f79c9344dda0634 Mon Sep 17 00:00:00 2001
From: Aous Naman <aous72@yahoo.com>
Date: Mon, 27 Jan 2025 19:00:19 +1100
Subject: [PATCH 19/20] Fix wrong variable in WASM float conversion; tidy mse_pae.

---
 src/core/transform/ojph_colour_wasm.cpp | 6 +++---
 tests/mse_pae.cpp                       | 3 +--
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/core/transform/ojph_colour_wasm.cpp b/src/core/transform/ojph_colour_wasm.cpp
index 8f307b13..aa9a79eb 100644
--- a/src/core/transform/ojph_colour_wasm.cpp
+++ b/src/core/transform/ojph_colour_wasm.cpp
@@ -411,7 +411,7 @@ namespace ojph {
             c = wasm_v128_andnot(t, c);           // keep only +ve or 0
             t = wasm_v128_or(neg, c);             // combine
           }
-          v128_t v = wasm_f32x4_convert_i32x4(u);
+          v128_t v = wasm_f32x4_convert_i32x4(t);
           v = wasm_f32x4_mul(v, mul);
           wasm_v128_store(dp, v);
         }
@@ -421,8 +421,8 @@ namespace ojph {
         v128_t half = wasm_i32x4_splat((si32)(1ULL << (bit_depth - 1)));
         for (int i = (int)width; i > 0; i -= 4, sp += 4, dp += 4) {
           v128_t t = wasm_v128_load(sp);
-          u = wasm_i32x4_sub(u, half);
-          v128_t v = wasm_f32x4_convert_i32x4(u);
+          t = wasm_i32x4_sub(t, half);
+          v128_t v = wasm_f32x4_convert_i32x4(t);
           v = wasm_f32x4_mul(v, mul);
           wasm_v128_store(dp, v);
         }
diff --git a/tests/mse_pae.cpp b/tests/mse_pae.cpp
index 8239b851..f1b84e64 100644
--- a/tests/mse_pae.cpp
+++ b/tests/mse_pae.cpp
@@ -358,7 +358,6 @@ void load_rawl(const char *filename, img_info& img)
   strncpy(name_buf, filename, (size_t)cpy_len);
   name_buf[cpy_len] = 0;
 
-  size_t w = calc_aligned_size<si32, byte_alignment>(s.w);
   if (num_comps == 3)
     img.init(num_comps, s.w, s.h, bit_depth, is_signed, FORMAT444);
   else
@@ -588,7 +587,7 @@ void find_nlt_mse_pae(const img_info& img1, const img_info& img2,
           si32 b = *p1++;
           a = (a >= 0) ? a : (- a - bias);
           b = (b >= 0) ? b : (- b - bias);
-          ui32 err = a > b ? a - b : b - a;
+          ui32 err = (ui32)(a > b ? a - b : b - a);
           lpae = err > lpae ? err : lpae;
           se += (double)err * (double)err;
         }

From d84633859d59eb41abb13fee960988ff6898f7c7 Mon Sep 17 00:00:00 2001
From: Aous Naman <aous@unsw.edu.au>
Date: Tue, 28 Jan 2025 09:11:13 +1100
Subject: [PATCH 20/20] A version bump.

---
 src/core/common/ojph_version.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/core/common/ojph_version.h b/src/core/common/ojph_version.h
index 4c05b02b..4a61fc99 100644
--- a/src/core/common/ojph_version.h
+++ b/src/core/common/ojph_version.h
@@ -35,4 +35,4 @@
 
 #define OPENJPH_VERSION_MAJOR 0
 #define OPENJPH_VERSION_MINOR 19
-#define OPENJPH_VERSION_PATCH 0
+#define OPENJPH_VERSION_PATCH 1