Merge pull request #11358 from akallabeth/yuv-segv-fix

Yuv 420 fix
This commit is contained in:
akallabeth 2025-03-17 08:28:57 +01:00 committed by GitHub
commit 13cacd5010
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
24 changed files with 107 additions and 264 deletions

View File

@ -756,7 +756,10 @@ void h264_context_free(H264_CONTEXT* h264)
if (h264)
{
if (h264->subsystem)
{
WINPR_ASSERT(h264->subsystem->Uninit);
h264->subsystem->Uninit(h264);
}
for (size_t x = 0; x < 3; x++)
{

View File

@ -76,6 +76,7 @@ void primitives_init_YCoCg(primitives_t* WINPR_RESTRICT prims)
/* Set up the YCoCg primitives: run the base initializer first, then the
 * SSSE3 and NEON initializers, in that order.
 * NOTE(review): presumably the later calls replace entries installed by the
 * base initializer when the CPU supports them - keep this call order. */
void primitives_init_YCoCg_opt(primitives_t* WINPR_RESTRICT prims)
{
primitives_init_YCoCg(prims);
primitives_init_YCoCg_ssse3(prims);
primitives_init_YCoCg_neon(prims);
}

View File

@ -32,8 +32,6 @@
FREERDP_LOCAL void primitives_init_YCoCg_ssse3_int(primitives_t* WINPR_RESTRICT prims);
static inline void primitives_init_YCoCg_ssse3(primitives_t* WINPR_RESTRICT prims)
{
primitives_init_YCoCg(prims);
if (!IsProcessorFeaturePresentEx(PF_EX_SSSE3) ||
!IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) ||
!IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
@ -45,8 +43,6 @@ static inline void primitives_init_YCoCg_ssse3(primitives_t* WINPR_RESTRICT prim
FREERDP_LOCAL void primitives_init_YCoCg_neon_int(primitives_t* WINPR_RESTRICT prims);
static inline void primitives_init_YCoCg_neon(primitives_t* WINPR_RESTRICT prims)
{
primitives_init_YCoCg(prims);
if (!IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
return;
primitives_init_YCoCg_neon_int(prims);

View File

@ -341,7 +341,6 @@ static inline void general_YUV444ToRGB_DOUBLE_ROW(BYTE* WINPR_RESTRICT pRGB[2],
const BYTE* WINPR_RESTRICT pU[2],
const BYTE* WINPR_RESTRICT pV[2], size_t nWidth)
{
const DWORD formatSize = FreeRDPGetBytesPerPixel(DstFormat);
fkt_writePixel writePixel = getPixelWriteFunction(DstFormat, FALSE);
WINPR_ASSERT(nWidth % 2 == 0);
@ -364,10 +363,7 @@ static inline void general_YUV444ToRGB_DOUBLE_ROW(BYTE* WINPR_RESTRICT pRGB[2],
const INT32 avgV = ((4 * v) - subV);
v = CONDITIONAL_CLIP(avgV, WINPR_ASSERTING_INT_CAST(BYTE, v));
}
const BYTE r = YUV2R(y, u, v);
const BYTE g = YUV2G(y, u, v);
const BYTE b = YUV2B(y, u, v);
pRGB[i] = writePixel(pRGB[i], formatSize, DstFormat, r, g, b, 0);
pRGB[i] = writeYUVPixel(pRGB[i], DstFormat, y, u, v, writePixel);
}
}
}
@ -379,17 +375,16 @@ static inline void general_YUV444ToRGB_SINGLE_ROW(BYTE* WINPR_RESTRICT pRGB, UIN
const BYTE* WINPR_RESTRICT pV, size_t nWidth)
{
fkt_writePixel writePixel = getPixelWriteFunction(DstFormat, FALSE);
const DWORD formatSize = FreeRDPGetBytesPerPixel(DstFormat);
WINPR_ASSERT(nWidth % 2 == 0);
for (size_t x = 0; x < nWidth; x += 2)
{
for (size_t j = 0; j < 2; j++)
{
const BYTE r = YUV2R(pY[x + j], pU[x + j], pV[x + j]);
const BYTE g = YUV2G(pY[x + j], pU[x + j], pV[x + j]);
const BYTE b = YUV2B(pY[x + j], pU[x + j], pV[x + j]);
pRGB = writePixel(pRGB, formatSize, DstFormat, r, g, b, 0);
const BYTE y = pY[x + j];
const BYTE u = pU[x + j];
const BYTE v = pV[x + j];
pRGB = writeYUVPixel(pRGB, DstFormat, y, u, v, writePixel);
}
}
}
@ -438,8 +433,6 @@ static inline void general_YUV444ToBGRX_DOUBLE_ROW(BYTE* WINPR_RESTRICT pRGB[2],
const BYTE* WINPR_RESTRICT pU[2],
const BYTE* WINPR_RESTRICT pV[2], size_t nWidth)
{
const DWORD formatSize = FreeRDPGetBytesPerPixel(DstFormat);
WINPR_ASSERT(nWidth % 2 == 0);
for (size_t x = 0; x < nWidth; x += 2)
{
@ -457,10 +450,10 @@ static inline void general_YUV444ToBGRX_DOUBLE_ROW(BYTE* WINPR_RESTRICT pRGB[2],
{
for (size_t j = 0; j < 2; j++)
{
const BYTE r = YUV2R(pY[i][x + j], U[i][j], V[i][j]);
const BYTE g = YUV2G(pY[i][x + j], U[i][j], V[i][j]);
const BYTE b = YUV2B(pY[i][x + j], U[i][j], V[i][j]);
pRGB[i] = writePixelBGRX(pRGB[i], formatSize, DstFormat, r, g, b, 0);
const BYTE y = pY[i][x + j];
const BYTE u = U[i][j];
const BYTE v = V[i][j];
pRGB[i] = writeYUVPixel(pRGB[i], DstFormat, y, u, v, writePixelBGRX);
}
}
}
@ -471,17 +464,15 @@ static inline void general_YUV444ToBGRX_SINGLE_ROW(BYTE* WINPR_RESTRICT pRGB, UI
const BYTE* WINPR_RESTRICT pU,
const BYTE* WINPR_RESTRICT pV, size_t nWidth)
{
const DWORD formatSize = FreeRDPGetBytesPerPixel(DstFormat);
WINPR_ASSERT(nWidth % 2 == 0);
for (size_t x = 0; x < nWidth; x += 2)
{
for (size_t j = 0; j < 2; j++)
{
const BYTE r = YUV2R(pY[x + j], pU[x + j], pV[x + j]);
const BYTE g = YUV2G(pY[x + j], pU[x + j], pV[x + j]);
const BYTE b = YUV2B(pY[x + j], pU[x + j], pV[x + j]);
pRGB = writePixelBGRX(pRGB, formatSize, DstFormat, r, g, b, 0);
const BYTE Y = pY[x + j];
const BYTE U = pU[x + j];
const BYTE V = pV[x + j];
pRGB = writeYUVPixel(pRGB, DstFormat, Y, U, V, writePixelBGRX);
}
}
}
@ -548,124 +539,90 @@ static pstatus_t general_YUV420ToRGB_8u_P3AC4R(const BYTE* WINPR_RESTRICT pSrc[3
UINT32 dstStep, UINT32 DstFormat,
const prim_size_t* WINPR_RESTRICT roi)
{
UINT32 dstPad = 0;
UINT32 srcPad[3];
BYTE Y = 0;
BYTE U = 0;
BYTE V = 0;
UINT32 halfWidth = 0;
UINT32 halfHeight = 0;
const BYTE* pY = NULL;
const BYTE* pU = NULL;
const BYTE* pV = NULL;
BYTE* pRGB = pDst;
UINT32 nWidth = 0;
UINT32 nHeight = 0;
UINT32 lastRow = 0;
UINT32 lastCol = 0;
WINPR_ASSERT(roi);
const DWORD formatSize = FreeRDPGetBytesPerPixel(DstFormat);
fkt_writePixel writePixel = getPixelWriteFunction(DstFormat, FALSE);
pY = pSrc[0];
pU = pSrc[1];
pV = pSrc[2];
lastCol = roi->width & 0x01;
lastRow = roi->height & 0x01;
nWidth = (roi->width + 1) & (uint32_t)~0x0001;
nHeight = (roi->height + 1) & (uint32_t)~0x0001;
halfWidth = nWidth / 2;
halfHeight = nHeight / 2;
srcPad[0] = (srcStep[0] - nWidth);
srcPad[1] = (srcStep[1] - halfWidth);
srcPad[2] = (srcStep[2] - halfWidth);
dstPad = (dstStep - (nWidth * 4));
UINT32 lastCol = roi->width & 0x01;
UINT32 lastRow = roi->height & 0x01;
const UINT32 nWidth = (roi->width + 1) & (uint32_t)~0x0001;
const UINT32 nHeight = (roi->height + 1) & (uint32_t)~0x0001;
const UINT32 halfWidth = nWidth / 2;
const UINT32 halfHeight = nHeight / 2;
for (UINT32 y = 0; y < halfHeight;)
for (UINT32 y = 0; y < halfHeight; y++)
{
if (++y == halfHeight)
const BYTE* pY = &pSrc[0][2ULL * srcStep[0] * y];
if (y + 1 == halfHeight)
lastRow <<= 1;
for (UINT32 x = 0; x < halfWidth;)
{
BYTE r = 0;
BYTE g = 0;
BYTE b = 0;
if (++x == halfWidth)
lastCol <<= 1;
U = *pU++;
V = *pV++;
/* 1st pixel */
Y = *pY++;
r = YUV2R(Y, U, V);
g = YUV2G(Y, U, V);
b = YUV2B(Y, U, V);
pRGB = writePixel(pRGB, formatSize, DstFormat, r, g, b, 0);
/* 2nd pixel */
if (!(lastCol & 0x02))
const BYTE* pU = &pSrc[1][1ULL * srcStep[1] * y];
const BYTE* pV = &pSrc[2][1ULL * srcStep[2] * y];
BYTE* pRGBeven = &pDst[2ULL * y * dstStep];
for (UINT32 x = 0; x < halfWidth;)
{
Y = *pY++;
r = YUV2R(Y, U, V);
g = YUV2G(Y, U, V);
b = YUV2B(Y, U, V);
pRGB = writePixel(pRGB, formatSize, DstFormat, r, g, b, 0);
}
else
{
pY++;
pRGB += formatSize;
lastCol >>= 1;
if (++x == halfWidth)
lastCol <<= 1;
const BYTE U = *pU++;
const BYTE V = *pV++;
/* 1st pixel */
{
const BYTE Y = *pY++;
pRGBeven = writeYUVPixel(pRGBeven, DstFormat, Y, U, V, writePixel);
}
/* 2nd pixel */
if (!(lastCol & 0x02))
{
const BYTE Y1 = *pY++;
pRGBeven = writeYUVPixel(pRGBeven, DstFormat, Y1, U, V, writePixel);
}
else
{
pY++;
pRGBeven += formatSize;
lastCol >>= 1;
}
}
}
pY += srcPad[0];
pU -= halfWidth;
pV -= halfWidth;
pRGB += dstPad;
if (lastRow & 0x02)
break;
for (UINT32 x = 0; x < halfWidth;)
{
BYTE r = 0;
BYTE g = 0;
BYTE b = 0;
const BYTE* pU = &pSrc[1][1ULL * srcStep[1] * y];
const BYTE* pV = &pSrc[2][1ULL * srcStep[2] * y];
BYTE* pRGBodd = &pDst[(2ULL * y + 1ULL) * dstStep];
if (++x == halfWidth)
lastCol <<= 1;
U = *pU++;
V = *pV++;
/* 3rd pixel */
Y = *pY++;
r = YUV2R(Y, U, V);
g = YUV2G(Y, U, V);
b = YUV2B(Y, U, V);
pRGB = writePixel(pRGB, formatSize, DstFormat, r, g, b, 0);
/* 4th pixel */
if (!(lastCol & 0x02))
for (UINT32 x = 0; x < halfWidth;)
{
Y = *pY++;
r = YUV2R(Y, U, V);
g = YUV2G(Y, U, V);
b = YUV2B(Y, U, V);
pRGB = writePixel(pRGB, formatSize, DstFormat, r, g, b, 0);
}
else
{
pY++;
pRGB += formatSize;
lastCol >>= 1;
if (++x == halfWidth)
lastCol <<= 1;
const BYTE U = *pU++;
const BYTE V = *pV++;
/* 3rd pixel */
{
const BYTE Y = *pY++;
pRGBodd = writeYUVPixel(pRGBodd, DstFormat, Y, U, V, writePixel);
}
/* 4th pixel */
if (!(lastCol & 0x02))
{
const BYTE Y1 = *pY++;
pRGBodd = writeYUVPixel(pRGBodd, DstFormat, Y1, U, V, writePixel);
}
else
{
pY++;
pRGBodd += formatSize;
lastCol >>= 1;
}
}
}
pY += srcPad[0];
pU += srcPad[1];
pV += srcPad[2];
pRGB += dstPad;
}
return PRIMITIVES_SUCCESS;
@ -2351,6 +2308,7 @@ void primitives_init_YUV(primitives_t* WINPR_RESTRICT prims)
/* Set up the YUV primitives: run the base initializer first, then the
 * SSE4.1 and NEON initializers, in that order.
 * NOTE(review): presumably the later calls replace entries installed by the
 * base initializer when the CPU supports them - keep this call order. */
void primitives_init_YUV_opt(primitives_t* WINPR_RESTRICT prims)
{
primitives_init_YUV(prims);
primitives_init_YUV_sse41(prims);
primitives_init_YUV_neon(prims);
}

View File

@ -32,8 +32,6 @@
FREERDP_LOCAL void primitives_init_YUV_sse41_int(primitives_t* WINPR_RESTRICT prims);
static inline void primitives_init_YUV_sse41(primitives_t* WINPR_RESTRICT prims)
{
primitives_init_YUV(prims);
if (!IsProcessorFeaturePresentEx(PF_EX_SSE41) ||
!IsProcessorFeaturePresent(PF_SSE4_1_INSTRUCTIONS_AVAILABLE))
return;
@ -44,8 +42,6 @@ static inline void primitives_init_YUV_sse41(primitives_t* WINPR_RESTRICT prims)
FREERDP_LOCAL void primitives_init_YUV_neon_int(primitives_t* WINPR_RESTRICT prims);
static inline void primitives_init_YUV_neon(primitives_t* WINPR_RESTRICT prims)
{
primitives_init_YUV(prims);
if (!IsProcessorFeaturePresentEx(PF_EX_SSE41))
return;

View File

@ -78,5 +78,6 @@ void primitives_init_add(primitives_t* WINPR_RESTRICT prims)
/* Set up the add primitives: run the base initializer first, then the
 * SSE3 initializer.
 * NOTE(review): presumably the SSE3 call replaces entries installed by the
 * base initializer when the CPU supports it - keep this call order. */
void primitives_init_add_opt(primitives_t* WINPR_RESTRICT prims)
{
primitives_init_add(prims);
primitives_init_add_sse3(prims);
}

View File

@ -32,8 +32,6 @@
FREERDP_LOCAL void primitives_init_add_sse3_int(primitives_t* WINPR_RESTRICT prims);
static inline void primitives_init_add_sse3(primitives_t* WINPR_RESTRICT prims)
{
primitives_init_add(prims);
if (!IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) ||
!IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) /* for LDDQU */
return;

View File

@ -93,5 +93,6 @@ void primitives_init_alphaComp(primitives_t* WINPR_RESTRICT prims)
/* Set up the alpha-composition primitives: run the base initializer first,
 * then the SSE3 initializer.
 * NOTE(review): presumably the SSE3 call replaces entries installed by the
 * base initializer when the CPU supports it - keep this call order. */
void primitives_init_alphaComp_opt(primitives_t* WINPR_RESTRICT prims)
{
primitives_init_alphaComp(prims);
primitives_init_alphaComp_sse3(prims);
}

View File

@ -32,8 +32,6 @@
FREERDP_LOCAL void primitives_init_alphaComp_sse3_int(primitives_t* WINPR_RESTRICT prims);
static inline void primitives_init_alphaComp_sse3(primitives_t* WINPR_RESTRICT prims)
{
primitives_init_alphaComp(prims);
if (!IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) ||
!IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) /* for LDDQU */
return;

View File

@ -61,5 +61,6 @@ void primitives_init_andor(primitives_t* WINPR_RESTRICT prims)
/* Set up the and/or primitives: run the base initializer first, then the
 * SSE3 initializer.
 * NOTE(review): presumably the SSE3 call replaces entries installed by the
 * base initializer when the CPU supports it - keep this call order. */
void primitives_init_andor_opt(primitives_t* WINPR_RESTRICT prims)
{
primitives_init_andor(prims);
primitives_init_andor_sse3(prims);
}

View File

@ -32,8 +32,6 @@
FREERDP_LOCAL void primitives_init_andor_sse3_int(primitives_t* WINPR_RESTRICT prims);
static inline void primitives_init_andor_sse3(primitives_t* WINPR_RESTRICT prims)
{
primitives_init_andor(prims);
if (!IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) ||
!IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
return;

View File

@ -570,6 +570,7 @@ void primitives_init_colors(primitives_t* WINPR_RESTRICT prims)
/* ------------------------------------------------------------------------- */
/* Set up the color-conversion primitives: run the base initializer first,
 * then the SSE2 and NEON initializers, in that order.
 * NOTE(review): presumably the later calls replace entries installed by the
 * base initializer when the CPU supports them - keep this call order. */
void primitives_init_colors_opt(primitives_t* WINPR_RESTRICT prims)
{
primitives_init_colors(prims);
primitives_init_colors_sse2(prims);
primitives_init_colors_neon(prims);
}

View File

@ -32,8 +32,6 @@
FREERDP_LOCAL void primitives_init_colors_sse2_int(primitives_t* WINPR_RESTRICT prims);
static inline void primitives_init_colors_sse2(primitives_t* WINPR_RESTRICT prims)
{
primitives_init_colors(prims);
if (!IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) ||
!IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
return;
@ -44,7 +42,6 @@ static inline void primitives_init_colors_sse2(primitives_t* WINPR_RESTRICT prim
FREERDP_LOCAL void primitives_init_colors_neon_int(primitives_t* WINPR_RESTRICT prims);
static inline void primitives_init_colors_neon(primitives_t* WINPR_RESTRICT prims)
{
primitives_init_colors(prims);
if (!IsProcessorFeaturePresent(PF_ARM_NEON_INSTRUCTIONS_AVAILABLE))
return;

View File

@ -424,6 +424,7 @@ void primitives_init_copy(primitives_t* WINPR_RESTRICT prims)
void primitives_init_copy_opt(primitives_t* WINPR_RESTRICT prims)
{
primitives_init_copy(prims);
primitives_init_copy_sse41(prims);
#if defined(WITH_AVX2)
primitives_init_copy_avx2(prims);

View File

@ -286,6 +286,17 @@ static INLINE BYTE RGB2V(INT32 R, INT32 G, INT32 B)
return WINPR_ASSERTING_INT_CAST(BYTE, val);
}
/**
 * Convert one YUV sample to RGB and emit it through a pixel writer.
 *
 * @param dst       destination position handed to the writer
 * @param DstFormat destination pixel format; also used to look up the
 *                  bytes-per-pixel value passed to the writer
 * @param y         first component fed to YUV2R/YUV2G/YUV2B
 * @param u         second component fed to YUV2R/YUV2G/YUV2B
 * @param v         third component fed to YUV2R/YUV2G/YUV2B
 * @param fkt       pixel writer callback, asserted to be non-NULL
 *
 * @return whatever the writer callback returns
 */
static inline BYTE* writeYUVPixel(BYTE* dst, UINT32 DstFormat, INT32 y, INT32 u, INT32 v,
                                  fkt_writePixel fkt)
{
    WINPR_ASSERT(fkt);

    const BYTE red = YUV2R(y, u, v);
    const BYTE green = YUV2G(y, u, v);
    const BYTE blue = YUV2B(y, u, v);
    const DWORD bytesPerPixel = FreeRDPGetBytesPerPixel(DstFormat);

    /* The trailing writer argument (presumably alpha) is always 0 here. */
    return fkt(dst, bytesPerPixel, DstFormat, red, green, blue, 0);
}
FREERDP_LOCAL void general_RGBToAVC444YUV_BGRX_DOUBLE_ROW(
size_t offset, const BYTE* WINPR_RESTRICT srcEven, const BYTE* WINPR_RESTRICT srcOdd,
BYTE* WINPR_RESTRICT b1Even, BYTE* WINPR_RESTRICT b1Odd, BYTE* WINPR_RESTRICT b2,

View File

@ -128,5 +128,6 @@ void primitives_init_set(primitives_t* WINPR_RESTRICT prims)
/* Set up the set primitives: run the base initializer first, then the
 * SSE2 initializer.
 * NOTE(review): presumably the SSE2 call replaces entries installed by the
 * base initializer when the CPU supports it - keep this call order. */
void primitives_init_set_opt(primitives_t* WINPR_RESTRICT prims)
{
primitives_init_set(prims);
primitives_init_set_sse2(prims);
}

View File

@ -32,7 +32,6 @@
FREERDP_LOCAL void primitives_init_set_sse2_int(primitives_t* WINPR_RESTRICT prims);
static inline void primitives_init_set_sse2(primitives_t* WINPR_RESTRICT prims)
{
primitives_init_set(prims);
if (!IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) ||
!IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
return;

View File

@ -145,5 +145,6 @@ void primitives_init_shift(primitives_t* WINPR_RESTRICT prims)
/* Set up the shift primitives: run the base initializer first, then the
 * SSE3 initializer.
 * NOTE(review): presumably the SSE3 call replaces entries installed by the
 * base initializer when the CPU supports it - keep this call order. */
void primitives_init_shift_opt(primitives_t* WINPR_RESTRICT prims)
{
primitives_init_shift(prims);
primitives_init_shift_sse3(prims);
}

View File

@ -31,8 +31,6 @@
FREERDP_LOCAL void primitives_init_shift_sse3_int(primitives_t* WINPR_RESTRICT prims);
static inline void primitives_init_shift_sse3(primitives_t* WINPR_RESTRICT prims)
{
primitives_init_shift(prims);
if (!IsProcessorFeaturePresent(PF_SSE2_INSTRUCTIONS_AVAILABLE) ||
!IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
return;

View File

@ -45,5 +45,6 @@ void primitives_init_sign(primitives_t* WINPR_RESTRICT prims)
/* Set up the sign primitives: run the base initializer first, then the
 * SSSE3 initializer.
 * NOTE(review): presumably the SSSE3 call replaces entries installed by the
 * base initializer when the CPU supports it - keep this call order. */
void primitives_init_sign_opt(primitives_t* WINPR_RESTRICT prims)
{
primitives_init_sign(prims);
primitives_init_sign_ssse3(prims);
}

View File

@ -32,7 +32,6 @@
FREERDP_LOCAL void primitives_init_sign_ssse3_int(primitives_t* WINPR_RESTRICT prims);
static inline void primitives_init_sign_ssse3(primitives_t* WINPR_RESTRICT prims)
{
primitives_init_sign(prims);
if (!IsProcessorFeaturePresentEx(PF_EX_SSSE3) ||
!IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE))
return;

View File

@ -160,10 +160,7 @@ static inline pstatus_t sse41_YUV420ToRGB_BGRX(const BYTE* WINPR_RESTRICT pSrc[]
const BYTE Y = *YData++;
const BYTE U = *UData;
const BYTE V = *VData;
const BYTE r = YUV2R(Y, U, V);
const BYTE g = YUV2G(Y, U, V);
const BYTE b = YUV2B(Y, U, V);
dst = (__m128i*)writePixelBGRX((BYTE*)dst, 4, PIXEL_FORMAT_BGRX32, r, g, b, 0);
dst = (__m128i*)writeYUVPixel((BYTE*)dst, PIXEL_FORMAT_BGRX32, Y, U, V, writePixelBGRX);
if (x % 2)
{
@ -221,10 +218,7 @@ static inline void BGRX_fillRGB(size_t offset, BYTE* WINPR_RESTRICT pRGB[2],
V = CONDITIONAL_CLIP(avgV, pV[0][offset]);
}
const BYTE r = YUV2R(Y, U, V);
const BYTE g = YUV2G(Y, U, V);
const BYTE b = YUV2B(Y, U, V);
writePixelBGRX(&pRGB[i][(j + offset) * bpp], bpp, DstFormat, r, g, b, 0);
writeYUVPixel(&pRGB[i][(j + offset) * bpp], DstFormat, Y, U, V, writePixelBGRX);
}
}
}
@ -455,7 +449,6 @@ static inline void BGRX_fillRGB_single(size_t offset, BYTE* WINPR_RESTRICT pRGB,
WINPR_ASSERT(pU);
WINPR_ASSERT(pV);
const UINT32 DstFormat = PIXEL_FORMAT_BGRX32;
const UINT32 bpp = 4;
for (size_t j = 0; j < 2; j++)
@ -464,10 +457,7 @@ static inline void BGRX_fillRGB_single(size_t offset, BYTE* WINPR_RESTRICT pRGB,
BYTE U = pU[offset + j];
BYTE V = pV[offset + j];
const BYTE r = YUV2R(Y, U, V);
const BYTE g = YUV2G(Y, U, V);
const BYTE b = YUV2B(Y, U, V);
writePixelBGRX(&pRGB[(j + offset) * bpp], bpp, DstFormat, r, g, b, 0);
writeYUVPixel(&pRGB[(j + offset) * bpp], PIXEL_FORMAT_BGRX32, Y, U, V, writePixelBGRX);
}
}

View File

@ -97,110 +97,6 @@ static inline void mm_prefetch_buffer(const void* WINPR_RESTRICT buffer, size_t
}
}
/*---------------------------------------------------------------------------*/
/**
 * SSE2 conversion of three planar 16-bit source planes (read as Y, Cb, Cr)
 * into three planar 16-bit destination planes (written as R, G, B).
 *
 * @param pSrc    source planes; pSrc[0] is used as Y, pSrc[1] as Cb, pSrc[2] as Cr
 * @param srcStep source row stride in bytes (divided by sizeof(__m128i) to step rows)
 * @param pDst    destination planes; pDst[0] receives R, pDst[1] G, pDst[2] B
 * @param dstStep destination row stride in bytes
 * @param roi     region of interest; its width and height are read
 *
 * @return PRIMITIVES_SUCCESS from the SIMD path; when the alignment
 *         preconditions are not met, the result of the generic implementation.
 */
static pstatus_t
sse2_yCbCrToRGB_16s16s_P3P3(const INT16* WINPR_RESTRICT pSrc[3], int srcStep,
INT16* WINPR_RESTRICT pDst[3], int dstStep,
const prim_size_t* WINPR_RESTRICT roi) /* region of interest */
{
/* Take the SIMD path only if all six plane pointers are 16-byte aligned,
 * the width is a multiple of 8 and both steps are multiples of 128. */
if (((ULONG_PTR)(pSrc[0]) & 0x0f) || ((ULONG_PTR)(pSrc[1]) & 0x0f) ||
((ULONG_PTR)(pSrc[2]) & 0x0f) || ((ULONG_PTR)(pDst[0]) & 0x0f) ||
((ULONG_PTR)(pDst[1]) & 0x0f) || ((ULONG_PTR)(pDst[2]) & 0x0f) || (roi->width & 0x07) ||
(srcStep & 127) || (dstStep & 127))
{
/* We can't maintain 16-byte alignment. */
return generic->yCbCrToRGB_16s16s_P3P3(pSrc, srcStep, pDst, dstStep, roi);
}
/* Clamp bounds applied to every output channel. */
const __m128i zero = _mm_setzero_si128();
const __m128i max = _mm_set1_epi16(255);
const __m128i* y_buf = (const __m128i*)(pSrc[0]);
const __m128i* cb_buf = (const __m128i*)(pSrc[1]);
const __m128i* cr_buf = (const __m128i*)(pSrc[2]);
__m128i* r_buf = (__m128i*)(pDst[0]);
__m128i* g_buf = (__m128i*)(pDst[1]);
__m128i* b_buf = (__m128i*)(pDst[2]);
/* Fixed-point conversion factors taken from row 14 of ycbcr_table. */
__m128i r_cr =
_mm_set1_epi16(WINPR_ASSERTING_INT_CAST(int16_t, ycbcr_table[14][0])); /* 1.403 << 14 */
__m128i g_cb =
_mm_set1_epi16(WINPR_ASSERTING_INT_CAST(int16_t, ycbcr_table[14][1])); /* -0.344 << 14 */
__m128i g_cr =
_mm_set1_epi16(WINPR_ASSERTING_INT_CAST(int16_t, ycbcr_table[14][2])); /* -0.714 << 14 */
__m128i b_cb =
_mm_set1_epi16(WINPR_ASSERTING_INT_CAST(int16_t, ycbcr_table[14][3])); /* 1.770 << 14 */
__m128i c4096 = _mm_set1_epi16(4096);
/* Row advance expressed in __m128i units rather than bytes. */
const size_t srcbump = WINPR_ASSERTING_INT_CAST(size_t, srcStep) / sizeof(__m128i);
const size_t dstbump = WINPR_ASSERTING_INT_CAST(size_t, dstStep) / sizeof(__m128i);
mm_prefetch_buffer(y_buf, roi->width, (size_t)srcStep, roi->height);
mm_prefetch_buffer(cb_buf, roi->width, (size_t)srcStep, roi->height);
mm_prefetch_buffer(cr_buf, roi->width, (size_t)srcStep, roi->height);
/* Number of 128-bit vectors per row (each holds 8 INT16 samples). */
const size_t imax = roi->width * sizeof(INT16) / sizeof(__m128i);
for (UINT32 yp = 0; yp < roi->height; ++yp)
{
for (size_t i = 0; i < imax; i++)
{
/* In order to use SSE2 signed 16-bit integer multiplication
* we need to convert the floating point factors to signed int
* without losing information.
* The result of this multiplication is 32 bit and we have two
* SSE instructions that return either the hi or lo word.
* Thus we will multiply the factors by the highest possible 2^n,
* take the upper 16 bits of the signed 32-bit result
* (_mm_mulhi_epi16) and correct this result by multiplying
* it by 2^(16-n).
*
* For the given factors in the conversion matrix the best
* possible n is 14.
*
* Example for calculating r:
* r = (y>>5) + 128 + (cr*1.403)>>5 // our base formula
* r = (y>>5) + 128 + (HIWORD(cr*(1.403<<14)<<2))>>5 // see above
* r = (y+4096)>>5 + (HIWORD(cr*22986)<<2)>>5 // simplification
* r = ((y+4096)>>2 + HIWORD(cr*22986)) >> 3
*/
/* y = (y_r_buf[i] + 4096) >> 2 */
__m128i y = LOAD_SI128(y_buf + i);
y = _mm_add_epi16(y, c4096);
y = _mm_srai_epi16(y, 2);
/* cb = cb_g_buf[i]; */
__m128i cb = LOAD_SI128(cb_buf + i);
/* cr = cr_b_buf[i]; */
__m128i cr = LOAD_SI128(cr_buf + i);
/* (y + HIWORD(cr*22986)) >> 3 */
__m128i r = _mm_add_epi16(y, _mm_mulhi_epi16(cr, r_cr));
r = _mm_srai_epi16(r, 3);
/* r_buf[i] = CLIP(r); */
/* NOTE(review): mm_between_epi16 is presumably a macro that clamps its
 * first argument in place (its result is not assigned) - confirm. */
mm_between_epi16(r, zero, max);
STORE_SI128(r_buf + i, r);
/* (y + HIWORD(cb*-5636) + HIWORD(cr*-11698)) >> 3 */
__m128i g = _mm_add_epi16(y, _mm_mulhi_epi16(cb, g_cb));
g = _mm_add_epi16(g, _mm_mulhi_epi16(cr, g_cr));
g = _mm_srai_epi16(g, 3);
/* g_buf[i] = CLIP(g); */
mm_between_epi16(g, zero, max);
STORE_SI128(g_buf + i, g);
/* (y + HIWORD(cb*28999)) >> 3 */
__m128i b = _mm_add_epi16(y, _mm_mulhi_epi16(cb, b_cb));
b = _mm_srai_epi16(b, 3);
/* b_buf[i] = CLIP(b); */
mm_between_epi16(b, zero, max);
STORE_SI128(b_buf + i, b);
}
/* Advance every plane pointer to the start of the next row. */
y_buf += srcbump;
cb_buf += srcbump;
cr_buf += srcbump;
r_buf += dstbump;
g_buf += dstbump;
b_buf += dstbump;
}
return PRIMITIVES_SUCCESS;
}
/*---------------------------------------------------------------------------*/
static pstatus_t
sse2_yCbCrToRGB_16s8u_P3AC4R_BGRX(const INT16* WINPR_RESTRICT pSrc[3],
@ -1150,7 +1046,6 @@ void primitives_init_colors_sse2_int(primitives_t* WINPR_RESTRICT prims)
WLog_VRB(PRIM_TAG, "SSE2 optimizations");
prims->RGBToRGB_16s8u_P3AC4R = sse2_RGBToRGB_16s8u_P3AC4R;
prims->yCbCrToRGB_16s16s_P3P3 = sse2_yCbCrToRGB_16s16s_P3P3;
prims->yCbCrToRGB_16s8u_P3AC4R = sse2_yCbCrToRGB_16s8u_P3AC4R;
prims->RGBToYCbCr_16s16s_P3P3 = sse2_RGBToYCbCr_16s16s_P3P3;

View File

@ -1001,10 +1001,8 @@ static BOOL yuv444_to_rgb(BYTE* rgb, size_t stride, const BYTE* yuv[3], const UI
const BYTE Y = yline[0][x];
const BYTE U = yline[1][x];
const BYTE V = yline[2][x];
const BYTE r = YUV2R(Y, U, V);
const BYTE g = YUV2G(Y, U, V);
const BYTE b = YUV2B(Y, U, V);
writePixelBGRX(&line[x * 4], 4, PIXEL_FORMAT_BGRX32, r, g, b, 0xFF);
writeYUVPixel(&line[x * 4], PIXEL_FORMAT_BGRX32, Y, U, V, writePixelBGRX);
}
}
}