cogl: Use SSE2 when possible for premultiplying
This adds a fast path for premultiplying an RGBA image using SSE2
instructions. SSE registers are 128-bit and we need at least 16 bits
per component for the intermediate result of the multiplication, so we
can do two pixels in parallel with one register. The function
interleaves two SSE registers to multiply four pixels in one function
call, in the hope that this will pipeline better.

http://bugzilla.openedhand.com/show_bug.cgi?id=1939

Signed-off-by: Emmanuele Bassi <ebassi@linux.intel.com>
parent bbb058df40
commit 1b2ff7eff7
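[Editor's note] The arithmetic being vectorized here is the usual
premultiply-with-rounding: each of R, G and B is multiplied by A and
divided by 255, with the divide done exactly using two shifts. A
minimal scalar sketch of the per-pixel operation the patch speeds up
(illustrative only; the helper name is made up — the real scalar path
in this file is _cogl_premult_alpha_last):

#include <stdint.h>

/* Premultiply one RGBA pixel in place, alpha stored last.
   Scalar equivalent of what the SSE2 path below does four
   pixels at a time. */
static void
premult_alpha_last_scalar (uint8_t *p)
{
  const uint16_t a = p[3];
  int i;

  for (i = 0; i < 3; i++)
    {
      /* Exact round(c * a / 255) with shifts only:
         v = c * a + 128; result = (v + (v >> 8)) >> 8.
         v fits in 16 bits: 255 * 255 + 128 = 65153 < 65536 */
      uint16_t v = p[i] * a + 128;
      p[i] = (uint8_t) ((v + (v >> 8)) >> 8);
    }
}

The asm below performs exactly this sequence (pmullw, paddw 128,
psrlw 8, paddw, psrlw 8) on eight 16-bit components at once.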
@@ -215,6 +215,91 @@ _cogl_premult_alpha_first (guchar *dst)
 
 #undef MULT
 
+/* Use the SSE optimized version to premult four pixels at once when
+   it is available. The same assembler code works for x86 and x86-64
+   because it doesn't refer to any non-SSE registers directly */
+#if defined(__SSE2__) && defined(__GNUC__) \
+  && (defined(__x86_64) || defined(__i386))
+#define COGL_USE_PREMULT_SSE2
+#endif
+
+#ifdef COGL_USE_PREMULT_SSE2
+
+inline static void
+_cogl_premult_alpha_last_four_pixels_sse2 (const guint8 *p)
+{
+  /* 8 copies of 128 used below */
+  static const gint16 eight_halves[8] __attribute__ ((aligned (16))) =
+    { 128, 128, 128, 128, 128, 128, 128, 128 };
+  /* Mask of the rgb components of the four pixels */
+  static const gint8 just_rgb[16] __attribute__ ((aligned (16))) =
+    { 0xff, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff, 0x00,
+      0xff, 0xff, 0xff, 0x00, 0xff, 0xff, 0xff, 0x00 };
+
+  /* Each SSE register only holds two pixels because we need to work
+     with 16-bit intermediate values. We still do four pixels by
+     interleaving two registers in the hope that it will pipeline
+     better */
+  asm (/* Load eight_halves into xmm5 for later */
+       "movdqa (%1), %%xmm5\n"
+       /* Clear xmm3 */
+       "pxor %%xmm3, %%xmm3\n"
+       /* Load two pixels from p into the low half of xmm0 */
+       "movlps (%0), %%xmm0\n"
+       /* Load the next set of two pixels from p into the low half of xmm1 */
+       "movlps 8(%0), %%xmm1\n"
+       /* Unpack 8 bytes from the low quad-words in each register to 8
+          16-bit values */
+       "punpcklbw %%xmm3, %%xmm0\n"
+       "punpcklbw %%xmm3, %%xmm1\n"
+       /* Copy alpha values of the first pixel in xmm0 to all
+          components of the first pixel in xmm2 */
+       "pshuflw $255, %%xmm0, %%xmm2\n"
+       /* same for xmm1 and xmm3 */
+       "pshuflw $255, %%xmm1, %%xmm3\n"
+       /* The above also copies the second pixel directly so we now
+          want to replace the RGB components with copies of the alpha
+          components */
+       "pshufhw $255, %%xmm2, %%xmm2\n"
+       "pshufhw $255, %%xmm3, %%xmm3\n"
+       /* Multiply the rgb components by the alpha */
+       "pmullw %%xmm2, %%xmm0\n"
+       "pmullw %%xmm3, %%xmm1\n"
+       /* Add 128 to each component */
+       "paddw %%xmm5, %%xmm0\n"
+       "paddw %%xmm5, %%xmm1\n"
+       /* Copy the results to temporary registers xmm4 and xmm5 */
+       "movdqa %%xmm0, %%xmm4\n"
+       "movdqa %%xmm1, %%xmm5\n"
+       /* Divide the results by 256 */
+       "psrlw $8, %%xmm0\n"
+       "psrlw $8, %%xmm1\n"
+       /* Add the temporaries back in */
+       "paddw %%xmm4, %%xmm0\n"
+       "paddw %%xmm5, %%xmm1\n"
+       /* Divide again */
+       "psrlw $8, %%xmm0\n"
+       "psrlw $8, %%xmm1\n"
+       /* Pack the results back as bytes */
+       "packuswb %%xmm1, %%xmm0\n"
+       /* Load just_rgb into xmm3 for later */
+       "movdqa (%2), %%xmm3\n"
+       /* Reload all four pixels into xmm2 */
+       "movups (%0), %%xmm2\n"
+       /* Mask out the alpha from the results */
+       "andps %%xmm3, %%xmm0\n"
+       /* Mask out the RGB from the original four pixels */
+       "andnps %%xmm2, %%xmm3\n"
+       /* Combine the two to get the right alpha values */
+       "orps %%xmm3, %%xmm0\n"
+       /* Write to memory */
+       "movdqu %%xmm0, (%0)\n"
+       : /* no outputs */
+       : "r" (p), "r" (eight_halves), "r" (just_rgb)
+       : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
+#endif /* COGL_USE_PREMULT_SSE2 */
+
 gboolean
 _cogl_bitmap_fallback_can_convert (CoglPixelFormat src, CoglPixelFormat dst)
 {
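[Editor's note] For readers who prefer intrinsics to inline assembler,
the same four-pixel operation can be sketched with the SSE2 intrinsics
from <emmintrin.h>. This is an illustrative equivalent, not what the
commit ships (the function name is invented, and unaligned loads and
stores are assumed); the commit uses inline asm so that the identical
code serves both x86 and x86-64:

#include <emmintrin.h> /* SSE2 */
#include <stdint.h>

static void
premult_four_pixels_sse2 (uint8_t *p)
{
  const __m128i zero = _mm_setzero_si128 ();
  const __m128i half = _mm_set1_epi16 (128);
  /* ff ff ff 00 per pixel in memory: the rgb bytes of RGBA,
     same mask as just_rgb above */
  const __m128i rgb_mask = _mm_set1_epi32 (0x00ffffff);

  /* Load four RGBA pixels (16 bytes) */
  __m128i px = _mm_loadu_si128 ((const __m128i *) p);

  /* Widen each pair of pixels to 16 bits per component */
  __m128i lo = _mm_unpacklo_epi8 (px, zero);
  __m128i hi = _mm_unpackhi_epi8 (px, zero);

  /* Broadcast each pixel's alpha word to all four of its
     components (the $255 pshuflw/pshufhw selectors in the asm) */
  __m128i alo = _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (lo, 0xff), 0xff);
  __m128i ahi = _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (hi, 0xff), 0xff);

  /* v = c * a + 128, then (v + (v >> 8)) >> 8 for an exact /255 */
  lo = _mm_add_epi16 (_mm_mullo_epi16 (lo, alo), half);
  hi = _mm_add_epi16 (_mm_mullo_epi16 (hi, ahi), half);
  lo = _mm_srli_epi16 (_mm_add_epi16 (lo, _mm_srli_epi16 (lo, 8)), 8);
  hi = _mm_srli_epi16 (_mm_add_epi16 (hi, _mm_srli_epi16 (hi, 8)), 8);

  /* Repack to bytes; keep rgb from the result, alpha from the source */
  __m128i res = _mm_packus_epi16 (lo, hi);
  res = _mm_or_si128 (_mm_and_si128 (res, rgb_mask),
                      _mm_andnot_si128 (rgb_mask, px));

  _mm_storeu_si128 ((__m128i *) p, res);
}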
@@ -408,7 +493,24 @@ _cogl_bitmap_fallback_premult (CoglBitmap *bmp)
     }
   else
     {
-      for (x = 0; x < bmp->width; x++)
+      x = bmp->width;
+
+#ifdef COGL_USE_PREMULT_SSE2
+
+      /* Process 4 pixels at a time */
+      while (x >= 4)
+        {
+          _cogl_premult_alpha_last_four_pixels_sse2 (p);
+          p += 4 * 4;
+          x -= 4;
+        }
+
+      /* If there are any pixels left we will fall through and
+         handle them below */
+
+#endif /* COGL_USE_PREMULT_SSE2 */
+
+      while (x-- > 0)
         {
           _cogl_premult_alpha_last (p);
           p += 4;