From 2a8928e67a4d6111e2f6092bb476ff0b2b272016 Mon Sep 17 00:00:00 2001 From: "yuzhuohuang@qq.com" Date: Thu, 19 Jul 2012 14:33:15 +0800 Subject: [PATCH] Add CPU capability check. [PART 3] --- src/subtitles/Rasterizer.cpp | 513 ++++++++++++++++++++--------------- src/subtitles/Rasterizer.h | 1 + 2 files changed, 300 insertions(+), 214 deletions(-) diff --git a/src/subtitles/Rasterizer.cpp b/src/subtitles/Rasterizer.cpp index 89191f501..9251862c6 100644 --- a/src/subtitles/Rasterizer.cpp +++ b/src/subtitles/Rasterizer.cpp @@ -1562,82 +1562,314 @@ void Rasterizer::FillSolidRect(SubPicDesc& spd, int x, int y, int nWidth, int nH void Overlay::_DoFillAlphaMash(byte* outputAlphaMask, const byte* pBody, const byte* pBorder, int x, int y, int w, int h, const byte* pAlphaMask, int pitch, DWORD color_alpha ) { - pBody = pBody!=NULL ? pBody + y*mOverlayPitch + x: NULL; - pBorder = pBorder!=NULL ? pBorder + y*mOverlayPitch + x: NULL; - byte* dst = outputAlphaMask + y*mOverlayPitch + x; + if (g_cpuid.m_flags & CCpuID::sse2) + { + pBody = pBody!=NULL ? pBody + y*mOverlayPitch + x: NULL; + pBorder = pBorder!=NULL ? pBorder + y*mOverlayPitch + x: NULL; + byte* dst = outputAlphaMask + y*mOverlayPitch + x; - const int x0 = ((reinterpret_cast(dst)+3)&~3) - reinterpret_cast(dst) < w ? - ((reinterpret_cast(dst)+3)&~3) - reinterpret_cast(dst) : w; //IMPORTANT! Should not exceed w. - const int x00 = ((reinterpret_cast(dst)+15)&~15) - reinterpret_cast(dst) < w ? - ((reinterpret_cast(dst)+15)&~15) - reinterpret_cast(dst) : w;//IMPORTANT! Should not exceed w. - const int x_end00 = ((reinterpret_cast(dst)+w)&~15) - reinterpret_cast(dst); - const int x_end0 = ((reinterpret_cast(dst)+w)&~3) - reinterpret_cast(dst); - const int x_end = w; + const int x0 = ((reinterpret_cast(dst)+3)&~3) - reinterpret_cast(dst) < w ? + ((reinterpret_cast(dst)+3)&~3) - reinterpret_cast(dst) : w; //IMPORTANT! Should not exceed w. + const int x00 = ((reinterpret_cast(dst)+15)&~15) - reinterpret_cast(dst) < w ? + ((reinterpret_cast(dst)+15)&~15) - reinterpret_cast(dst) : w;//IMPORTANT! Should not exceed w. + const int x_end00 = ((reinterpret_cast(dst)+w)&~15) - reinterpret_cast(dst); + const int x_end0 = ((reinterpret_cast(dst)+w)&~3) - reinterpret_cast(dst); + const int x_end = w; - __m64 color_alpha_64 = _mm_set1_pi16(color_alpha); - __m128i color_alpha_128 = _mm_set1_epi16(color_alpha); + __m64 color_alpha_64 = _mm_set1_pi16(color_alpha); + __m128i color_alpha_128 = _mm_set1_epi16(color_alpha); - if(pAlphaMask==NULL && pBody!=NULL && pBorder!=NULL) - { - /* - __asm + if(pAlphaMask==NULL && pBody!=NULL && pBorder!=NULL) { - mov eax, color_alpha - movd XMM3, eax - punpcklwd XMM3, XMM3 - pshufd XMM3, XMM3, 0 - } - */ - while(h--) + while(h--) + { + int j=0; + for( ; j>6; + } + for( ;j(pBorder+j)); + __m64 body = _mm_cvtsi32_si64(*reinterpret_cast(pBody+j)); + border = _mm_subs_pu8(border, body); + __m64 zero = _mm_setzero_si64(); + border = _mm_unpacklo_pi8(border, zero); + border = _mm_mullo_pi16(border, color_alpha_64); + border = _mm_srli_pi16(border, 6); + border = _mm_packs_pu16(border,border); + *reinterpret_cast(dst+j) = _mm_cvtsi64_si32(border); + } + __m128i zero = _mm_setzero_si128(); + for( ;j(pBorder+j)); + __m128i body = _mm_loadu_si128(reinterpret_cast(pBody+j)); + border = _mm_subs_epu8(border,body); + __m128i srchi = border; + border = _mm_unpacklo_epi8(border, zero); + srchi = _mm_unpackhi_epi8(srchi, zero); + border = _mm_mullo_epi16(border, color_alpha_128); + srchi = _mm_mullo_epi16(srchi, color_alpha_128); + border = _mm_srli_epi16(border, 6); + srchi = _mm_srli_epi16(srchi, 6); + border = _mm_packus_epi16(border, srchi); + _mm_storeu_si128(reinterpret_cast<__m128i*>(dst+j), border); + } + for( ;j(pBorder+j)); + __m64 body = _mm_cvtsi32_si64(*reinterpret_cast(pBody+j)); + border = _mm_subs_pu8(border, body); + __m64 zero = _mm_setzero_si64(); + border = _mm_unpacklo_pi8(border, zero); + border = _mm_mullo_pi16(border, color_alpha_64); + border = _mm_srli_pi16(border, 6); + border = _mm_packs_pu16(border,border); + *reinterpret_cast(dst+j) = _mm_cvtsi64_si32(border); + } + for( ;j>6; + } + pBody += mOverlayPitch; + pBorder += mOverlayPitch; + //pAlphaMask += pitch; + dst += mOverlayPitch; + } + } + else if( ((pBody==NULL) + (pBorder==NULL))==1 && pAlphaMask==NULL) { - int j=0; - for( ; j>6; + int j=0; + for( ; j>6; + } + for( ;j(src1+j)); + __m64 zero = _mm_setzero_si64(); + src = _mm_unpacklo_pi8(src, zero); + src = _mm_mullo_pi16(src, color_alpha_64); + src = _mm_srli_pi16(src, 6); + src = _mm_packs_pu16(src,src); + *reinterpret_cast(dst+j) = _mm_cvtsi64_si32(src); + } + __m128i zero = _mm_setzero_si128(); + for( ;j(src1+j)); + __m128i srchi = src; + src = _mm_unpacklo_epi8(src, zero); + srchi = _mm_unpackhi_epi8(srchi, zero); + src = _mm_mullo_epi16(src, color_alpha_128); + srchi = _mm_mullo_epi16(srchi, color_alpha_128); + src = _mm_srli_epi16(src, 6); + srchi = _mm_srli_epi16(srchi, 6); + src = _mm_packus_epi16(src, srchi); + _mm_storeu_si128(reinterpret_cast<__m128i*>(dst+j), src); + } + for( ;j(src1+j)); + __m64 zero = _mm_setzero_si64(); + src = _mm_unpacklo_pi8(src, zero); + src = _mm_mullo_pi16(src, color_alpha_64); + src = _mm_srli_pi16(src, 6); + src = _mm_packs_pu16(src,src); + *reinterpret_cast(dst+j) = _mm_cvtsi64_si32(src); + } + for( ;j>6; + } + src1 += mOverlayPitch; + //pAlphaMask += pitch; + dst += mOverlayPitch; } - for( ;j(pBorder+j)); - __m64 body = _mm_cvtsi32_si64(*reinterpret_cast(pBody+j)); - border = _mm_subs_pu8(border, body); - __m64 zero = _mm_setzero_si64(); - border = _mm_unpacklo_pi8(border, zero); - border = _mm_mullo_pi16(border, color_alpha_64); - border = _mm_srli_pi16(border, 6); - border = _mm_packs_pu16(border,border); - *reinterpret_cast(dst+j) = _mm_cvtsi64_si32(border); + int j=0; + for( ; j>12; + } + for( ;j(src1+j)); + __m64 mask = _mm_cvtsi32_si64(*reinterpret_cast(pAlphaMask+j)); + __m64 zero = _mm_setzero_si64(); + src = _mm_unpacklo_pi8(src, zero); + src = _mm_mullo_pi16(src, color_alpha_64); + mask = _mm_unpacklo_pi8(zero, mask); //important! + src = _mm_mulhi_pi16(src, mask); //important! + src = _mm_srli_pi16(src, 12+8-16); //important! + src = _mm_packs_pu16(src,src); + *reinterpret_cast(dst+j) = _mm_cvtsi64_si32(src); + } + __m128i zero = _mm_setzero_si128(); + for( ;j(src1+j)); + __m128i mask = _mm_loadu_si128(reinterpret_cast(pAlphaMask+j)); + __m128i srchi = src; + __m128i maskhi = mask; + src = _mm_unpacklo_epi8(src, zero); + srchi = _mm_unpackhi_epi8(srchi, zero); + mask = _mm_unpacklo_epi8(zero, mask); //important! + maskhi = _mm_unpackhi_epi8(zero, maskhi); + src = _mm_mullo_epi16(src, color_alpha_128); + srchi = _mm_mullo_epi16(srchi, color_alpha_128); + src = _mm_mulhi_epu16(src, mask); //important! + srchi = _mm_mulhi_epu16(srchi, maskhi); + src = _mm_srli_epi16(src, 12+8-16); //important! + srchi = _mm_srli_epi16(srchi, 12+8-16); + src = _mm_packus_epi16(src, srchi); + _mm_storeu_si128(reinterpret_cast<__m128i*>(dst+j), src); + } + for( ;j(src1+j)); + __m64 mask = _mm_cvtsi32_si64(*reinterpret_cast(pAlphaMask+j)); + __m64 zero = _mm_setzero_si64(); + src = _mm_unpacklo_pi8(src, zero); + src = _mm_mullo_pi16(src, color_alpha_64); + mask = _mm_unpacklo_pi8(zero, mask); //important! + src = _mm_mulhi_pi16(src, mask); //important! + src = _mm_srli_pi16(src, 12+8-16); //important! + src = _mm_packs_pu16(src,src); + *reinterpret_cast(dst+j) = _mm_cvtsi64_si32(src); + } + for( ;j>12; + } + src1 += mOverlayPitch; + pAlphaMask += pitch; + dst += mOverlayPitch; } - __m128i zero = _mm_setzero_si128(); - for( ;j(pBorder+j)); - __m128i body = _mm_loadu_si128(reinterpret_cast(pBody+j)); - border = _mm_subs_epu8(border,body); - __m128i srchi = border; - border = _mm_unpacklo_epi8(border, zero); - srchi = _mm_unpackhi_epi8(srchi, zero); - border = _mm_mullo_epi16(border, color_alpha_128); - srchi = _mm_mullo_epi16(srchi, color_alpha_128); - border = _mm_srli_epi16(border, 6); - srchi = _mm_srli_epi16(srchi, 6); - border = _mm_packus_epi16(border, srchi); - _mm_storeu_si128(reinterpret_cast<__m128i*>(dst+j), border); + int j=0; + for( ; j>12; + } + for( ;j(pBorder+j)); + __m64 body = _mm_cvtsi32_si64(*reinterpret_cast(pBody+j)); + border = _mm_subs_pu8(border, body); + __m64 mask = _mm_cvtsi32_si64(*reinterpret_cast(pAlphaMask+j)); + __m64 zero = _mm_setzero_si64(); + border = _mm_unpacklo_pi8(border, zero); + border = _mm_mullo_pi16(border, color_alpha_64); + mask = _mm_unpacklo_pi8(zero, mask); //important! + border = _mm_mulhi_pi16(border, mask); //important! + border = _mm_srli_pi16(border, 12+8-16); //important! + border = _mm_packs_pu16(border,border); + *reinterpret_cast(dst+j) = _mm_cvtsi64_si32(border); + } + __m128i zero = _mm_setzero_si128(); + for( ;j(pBorder+j)); + __m128i body = _mm_loadu_si128(reinterpret_cast(pBody+j)); + border = _mm_subs_epu8(border,body); + + __m128i mask = _mm_loadu_si128(reinterpret_cast(pAlphaMask+j)); + __m128i srchi = border; + __m128i maskhi = mask; + border = _mm_unpacklo_epi8(border, zero); + srchi = _mm_unpackhi_epi8(srchi, zero); + mask = _mm_unpacklo_epi8(zero, mask); //important! + maskhi = _mm_unpackhi_epi8(zero, maskhi); + border = _mm_mullo_epi16(border, color_alpha_128); + srchi = _mm_mullo_epi16(srchi, color_alpha_128); + border = _mm_mulhi_epu16(border, mask); //important! + srchi = _mm_mulhi_epu16(srchi, maskhi); + border = _mm_srli_epi16(border, 12+8-16); //important! + srchi = _mm_srli_epi16(srchi, 12+8-16); + border = _mm_packus_epi16(border, srchi); + _mm_storeu_si128(reinterpret_cast<__m128i*>(dst+j), border); + } + for( ;j(pBorder+j)); + __m64 body = _mm_cvtsi32_si64(*reinterpret_cast(pBody+j)); + border = _mm_subs_pu8(border, body); + __m64 mask = _mm_cvtsi32_si64(*reinterpret_cast(pAlphaMask+j)); + __m64 zero = _mm_setzero_si64(); + border = _mm_unpacklo_pi8(border, zero); + border = _mm_mullo_pi16(border, color_alpha_64); + mask = _mm_unpacklo_pi8(zero, mask); //important! + border = _mm_mulhi_pi16(border, mask); //important! + border = _mm_srli_pi16(border, 12+8-16); //important! + border = _mm_packs_pu16(border,border); + *reinterpret_cast(dst+j) = _mm_cvtsi64_si32(border); + } + for( ;j>12; + } + pBody += mOverlayPitch; + pBorder += mOverlayPitch; + pAlphaMask += pitch; + dst += mOverlayPitch; } - for( ;j(pBorder+j)); - __m64 body = _mm_cvtsi32_si64(*reinterpret_cast(pBody+j)); - border = _mm_subs_pu8(border, body); - __m64 zero = _mm_setzero_si64(); - border = _mm_unpacklo_pi8(border, zero); - border = _mm_mullo_pi16(border, color_alpha_64); - border = _mm_srli_pi16(border, 6); - border = _mm_packs_pu16(border,border); - *reinterpret_cast(dst+j) = _mm_cvtsi64_si32(border); + for(int j=0;j>6; - } - for( ;j(src1+j)); - __m64 zero = _mm_setzero_si64(); - src = _mm_unpacklo_pi8(src, zero); - src = _mm_mullo_pi16(src, color_alpha_64); - src = _mm_srli_pi16(src, 6); - src = _mm_packs_pu16(src,src); - *reinterpret_cast(dst+j) = _mm_cvtsi64_si32(src); - } - __m128i zero = _mm_setzero_si128(); - for( ;j(src1+j)); - __m128i srchi = src; - src = _mm_unpacklo_epi8(src, zero); - srchi = _mm_unpackhi_epi8(srchi, zero); - src = _mm_mullo_epi16(src, color_alpha_128); - srchi = _mm_mullo_epi16(srchi, color_alpha_128); - src = _mm_srli_epi16(src, 6); - srchi = _mm_srli_epi16(srchi, 6); - src = _mm_packus_epi16(src, srchi); - _mm_storeu_si128(reinterpret_cast<__m128i*>(dst+j), src); - } - for( ;j(src1+j)); - __m64 zero = _mm_setzero_si64(); - src = _mm_unpacklo_pi8(src, zero); - src = _mm_mullo_pi16(src, color_alpha_64); - src = _mm_srli_pi16(src, 6); - src = _mm_packs_pu16(src,src); - *reinterpret_cast(dst+j) = _mm_cvtsi64_si32(src); - } - for( ;j>6; } @@ -1708,57 +1902,7 @@ void Overlay::_DoFillAlphaMash(byte* outputAlphaMask, const byte* pBody, const b while(h--) { int j=0; - for( ; j>12; - } - for( ;j(src1+j)); - __m64 mask = _mm_cvtsi32_si64(*reinterpret_cast(pAlphaMask+j)); - __m64 zero = _mm_setzero_si64(); - src = _mm_unpacklo_pi8(src, zero); - src = _mm_mullo_pi16(src, color_alpha_64); - mask = _mm_unpacklo_pi8(zero, mask); //important! - src = _mm_mulhi_pi16(src, mask); //important! - src = _mm_srli_pi16(src, 12+8-16); //important! - src = _mm_packs_pu16(src,src); - *reinterpret_cast(dst+j) = _mm_cvtsi64_si32(src); - } - __m128i zero = _mm_setzero_si128(); - for( ;j(src1+j)); - __m128i mask = _mm_loadu_si128(reinterpret_cast(pAlphaMask+j)); - __m128i srchi = src; - __m128i maskhi = mask; - src = _mm_unpacklo_epi8(src, zero); - srchi = _mm_unpackhi_epi8(srchi, zero); - mask = _mm_unpacklo_epi8(zero, mask); //important! - maskhi = _mm_unpackhi_epi8(zero, maskhi); - src = _mm_mullo_epi16(src, color_alpha_128); - srchi = _mm_mullo_epi16(srchi, color_alpha_128); - src = _mm_mulhi_epu16(src, mask); //important! - srchi = _mm_mulhi_epu16(srchi, maskhi); - src = _mm_srli_epi16(src, 12+8-16); //important! - srchi = _mm_srli_epi16(srchi, 12+8-16); - src = _mm_packus_epi16(src, srchi); - _mm_storeu_si128(reinterpret_cast<__m128i*>(dst+j), src); - } - for( ;j(src1+j)); - __m64 mask = _mm_cvtsi32_si64(*reinterpret_cast(pAlphaMask+j)); - __m64 zero = _mm_setzero_si64(); - src = _mm_unpacklo_pi8(src, zero); - src = _mm_mullo_pi16(src, color_alpha_64); - mask = _mm_unpacklo_pi8(zero, mask); //important! - src = _mm_mulhi_pi16(src, mask); //important! - src = _mm_srli_pi16(src, 12+8-16); //important! - src = _mm_packs_pu16(src,src); - *reinterpret_cast(dst+j) = _mm_cvtsi64_si32(src); - } - for( ;j>12; } @@ -1772,66 +1916,7 @@ void Overlay::_DoFillAlphaMash(byte* outputAlphaMask, const byte* pBody, const b while(h--) { int j=0; - for( ; j>12; - } - for( ;j(pBorder+j)); - __m64 body = _mm_cvtsi32_si64(*reinterpret_cast(pBody+j)); - border = _mm_subs_pu8(border, body); - __m64 mask = _mm_cvtsi32_si64(*reinterpret_cast(pAlphaMask+j)); - __m64 zero = _mm_setzero_si64(); - border = _mm_unpacklo_pi8(border, zero); - border = _mm_mullo_pi16(border, color_alpha_64); - mask = _mm_unpacklo_pi8(zero, mask); //important! - border = _mm_mulhi_pi16(border, mask); //important! - border = _mm_srli_pi16(border, 12+8-16); //important! - border = _mm_packs_pu16(border,border); - *reinterpret_cast(dst+j) = _mm_cvtsi64_si32(border); - } - __m128i zero = _mm_setzero_si128(); - for( ;j(pBorder+j)); - __m128i body = _mm_loadu_si128(reinterpret_cast(pBody+j)); - border = _mm_subs_epu8(border,body); - - __m128i mask = _mm_loadu_si128(reinterpret_cast(pAlphaMask+j)); - __m128i srchi = border; - __m128i maskhi = mask; - border = _mm_unpacklo_epi8(border, zero); - srchi = _mm_unpackhi_epi8(srchi, zero); - mask = _mm_unpacklo_epi8(zero, mask); //important! - maskhi = _mm_unpackhi_epi8(zero, maskhi); - border = _mm_mullo_epi16(border, color_alpha_128); - srchi = _mm_mullo_epi16(srchi, color_alpha_128); - border = _mm_mulhi_epu16(border, mask); //important! - srchi = _mm_mulhi_epu16(srchi, maskhi); - border = _mm_srli_epi16(border, 12+8-16); //important! - srchi = _mm_srli_epi16(srchi, 12+8-16); - border = _mm_packus_epi16(border, srchi); - _mm_storeu_si128(reinterpret_cast<__m128i*>(dst+j), border); - } - for( ;j(pBorder+j)); - __m64 body = _mm_cvtsi32_si64(*reinterpret_cast(pBody+j)); - border = _mm_subs_pu8(border, body); - __m64 mask = _mm_cvtsi32_si64(*reinterpret_cast(pAlphaMask+j)); - __m64 zero = _mm_setzero_si64(); - border = _mm_unpacklo_pi8(border, zero); - border = _mm_mullo_pi16(border, color_alpha_64); - mask = _mm_unpacklo_pi8(zero, mask); //important! - border = _mm_mulhi_pi16(border, mask); //important! - border = _mm_srli_pi16(border, 12+8-16); //important! - border = _mm_packs_pu16(border,border); - *reinterpret_cast(dst+j) = _mm_cvtsi64_si32(border); - } - for( ;j SharedPtrOverlay;