cogl: Use SSE2 when possible for premultiplying
This adds a fast path for premultiplying an RGBA image using SSE2
instructions. SSE registers are 128-bit and we need at least 16 bits
per component for the intermediate result of the multiplication, so we
can do two pixels in parallel with one register. The function
interleaves two SSE registers to multiply four pixels in one function
call, in the hope that this will pipeline better.

http://bugzilla.openedhand.com/show_bug.cgi?id=1939

Signed-off-by: Emmanuele Bassi <ebassi@linux.intel.com>
parent bbb058df40
commit 1b2ff7eff7
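[Editor's note] The arithmetic being vectorized here is the usual
premultiply-with-rounding: each of R, G and B is multiplied by A and
divided by 255, with the divide done exactly using two shifts. A
minimal scalar sketch of the per-pixel operation the patch speeds up
(illustrative only; the helper name is made up — the real scalar path
in this file is _cogl_premult_alpha_last):

#include <stdint.h>

/* Premultiply one RGBA pixel in place, alpha stored last.
   Scalar equivalent of what the SSE2 path below does four
   pixels at a time. */
static void
premult_alpha_last_scalar (uint8_t *p)
{
  const uint16_t a = p[3];
  int i;

  for (i = 0; i < 3; i++)
    {
      /* Exact round(c * a / 255) with shifts only:
         v = c * a + 128; result = (v + (v >> 8)) >> 8.
         v fits in 16 bits: 255 * 255 + 128 = 65153 < 65536 */
      uint16_t v = p[i] * a + 128;
      p[i] = (uint8_t) ((v + (v >> 8)) >> 8);
    }
}

The asm below performs exactly this sequence (pmullw, paddw 128,
psrlw 8, paddw, psrlw 8) on eight 16-bit components at once.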
@@ -215,6 +215,91 @@ _cogl_premult_alpha_first (guchar *dst)
 
 #undef MULT
 
+/* Use the SSE optimized version to premult four pixels at once when
+   it is available. The same assembler code works for x86 and x86-64
+   because it doesn't refer to any non-SSE registers directly */
+#if defined(__SSE2__) && defined(__GNUC__) \
+  && (defined(__x86_64) || defined(__i386))
+#define COGL_USE_PREMULT_SSE2
+#endif
+
+#ifdef COGL_USE_PREMULT_SSE2
+
+inline static void
+_cogl_premult_alpha_last_four_pixels_sse2 (const guint8 *p)
+{
+  /* 8 copies of 128 used below */
+  static const gint16 eight_halves[8] __attribute__ ((aligned (16))) =
+    { 128, 128, 128, 128, 128, 128, 128, 128 };
+  /* Mask of the rgb components of the four pixels */
+  static const gint8 just_rgb[16] __attribute__ ((aligned (16))) =
+    { 0xff, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff, 0x00,
+      0xff, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff, 0x00 };
+
+  /* Each SSE register only holds two pixels because we need to work
+     with 16-bit intermediate values. We still do four pixels by
+     interleaving two registers in the hope that it will pipeline
+     better */
+  asm (/* Load eight_halves into xmm5 for later */
+       "movdqa (%1), %%xmm5\n"
+       /* Clear xmm3 */
+       "pxor %%xmm3, %%xmm3\n"
+       /* Load two pixels from p into the low half of xmm0 */
+       "movlps (%0), %%xmm0\n"
+       /* Load the next set of two pixels from p into the low half of xmm1 */
+       "movlps 8(%0), %%xmm1\n"
+       /* Unpack 8 bytes from the low quad-words in each register to 8
+          16-bit values */
+       "punpcklbw %%xmm3, %%xmm0\n"
+       "punpcklbw %%xmm3, %%xmm1\n"
+       /* Copy alpha values of the first pixel in xmm0 to all
+          components of the first pixel in xmm2 */
+       "pshuflw $255, %%xmm0, %%xmm2\n"
+       /* same for xmm1 and xmm3 */
+       "pshuflw $255, %%xmm1, %%xmm3\n"
+       /* The above also copies the second pixel directly so we now
+          want to replace the RGB components with copies of the alpha
+          components */
+       "pshufhw $255, %%xmm2, %%xmm2\n"
+       "pshufhw $255, %%xmm3, %%xmm3\n"
+       /* Multiply the rgb components by the alpha */
+       "pmullw %%xmm2, %%xmm0\n"
+       "pmullw %%xmm3, %%xmm1\n"
+       /* Add 128 to each component */
+       "paddw %%xmm5, %%xmm0\n"
+       "paddw %%xmm5, %%xmm1\n"
+       /* Copy the results to temporary registers xmm4 and xmm5 */
+       "movdqa %%xmm0, %%xmm4\n"
+       "movdqa %%xmm1, %%xmm5\n"
+       /* Divide the results by 256 */
+       "psrlw $8, %%xmm0\n"
+       "psrlw $8, %%xmm1\n"
+       /* Add the temporaries back in */
+       "paddw %%xmm4, %%xmm0\n"
+       "paddw %%xmm5, %%xmm1\n"
+       /* Divide again */
+       "psrlw $8, %%xmm0\n"
+       "psrlw $8, %%xmm1\n"
+       /* Pack the results back as bytes */
+       "packuswb %%xmm1, %%xmm0\n"
+       /* Load just_rgb into xmm3 for later */
+       "movdqa (%2), %%xmm3\n"
+       /* Reload all four pixels into xmm2 */
+       "movups (%0), %%xmm2\n"
+       /* Mask out the alpha from the results */
+       "andps %%xmm3, %%xmm0\n"
+       /* Mask out the RGB from the original four pixels */
+       "andnps %%xmm2, %%xmm3\n"
+       /* Combine the two to get the right alpha values */
+       "orps %%xmm3, %%xmm0\n"
+       /* Write to memory */
+       "movdqu %%xmm0, (%0)\n"
+       : /* no outputs */
+       : "r" (p), "r" (eight_halves), "r" (just_rgb)
+       : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
+#endif /* COGL_USE_PREMULT_SSE2 */
+
 gboolean
 _cogl_bitmap_fallback_can_convert (CoglPixelFormat src, CoglPixelFormat dst)
 {
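[Editor's note] For readers who prefer intrinsics to inline assembler,
the same four-pixel operation can be sketched with the SSE2 intrinsics
from <emmintrin.h>. This is an illustrative equivalent, not what the
commit ships (the function name is invented, and unaligned loads and
stores are assumed); the commit uses inline asm so that the identical
code serves both x86 and x86-64:

#include <emmintrin.h> /* SSE2 */
#include <stdint.h>

static void
premult_four_pixels_sse2 (uint8_t *p)
{
  const __m128i zero = _mm_setzero_si128 ();
  const __m128i half = _mm_set1_epi16 (128);
  /* ff ff ff 00 per pixel in memory: the rgb bytes of RGBA,
     same mask as just_rgb above */
  const __m128i rgb_mask = _mm_set1_epi32 (0x00ffffff);

  /* Load four RGBA pixels (16 bytes) */
  __m128i px = _mm_loadu_si128 ((const __m128i *) p);

  /* Widen each pair of pixels to 16 bits per component */
  __m128i lo = _mm_unpacklo_epi8 (px, zero);
  __m128i hi = _mm_unpackhi_epi8 (px, zero);

  /* Broadcast each pixel's alpha word to all four of its
     components (the $255 pshuflw/pshufhw selectors in the asm) */
  __m128i alo = _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (lo, 0xff), 0xff);
  __m128i ahi = _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (hi, 0xff), 0xff);

  /* v = c * a + 128, then (v + (v >> 8)) >> 8 for an exact /255 */
  lo = _mm_add_epi16 (_mm_mullo_epi16 (lo, alo), half);
  hi = _mm_add_epi16 (_mm_mullo_epi16 (hi, ahi), half);
  lo = _mm_srli_epi16 (_mm_add_epi16 (lo, _mm_srli_epi16 (lo, 8)), 8);
  hi = _mm_srli_epi16 (_mm_add_epi16 (hi, _mm_srli_epi16 (hi, 8)), 8);

  /* Repack to bytes; keep rgb from the result, alpha from the source */
  __m128i res = _mm_packus_epi16 (lo, hi);
  res = _mm_or_si128 (_mm_and_si128 (res, rgb_mask),
                      _mm_andnot_si128 (rgb_mask, px));

  _mm_storeu_si128 ((__m128i *) p, res);
}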
@@ -408,7 +493,24 @@ _cogl_bitmap_fallback_premult (CoglBitmap *bmp)
     }
   else
     {
-      for (x = 0; x < bmp->width; x++)
+      x = bmp->width;
+
+#ifdef COGL_USE_PREMULT_SSE2
+
+      /* Process 4 pixels at a time */
+      while (x >= 4)
+        {
+          _cogl_premult_alpha_last_four_pixels_sse2 (p);
+          p += 4 * 4;
+          x -= 4;
+        }
+
+      /* If there are any pixels left we will fall through and
+         handle them below */
+
+#endif /* COGL_USE_PREMULT_SSE2 */
+
+      while (x-- > 0)
         {
           _cogl_premult_alpha_last (p);
           p += 4;