From 62c893feee386223aebbab781171028db547d7f9 Mon Sep 17 00:00:00 2001
From: Neil Roberts <neil@linux.intel.com>
Date: Tue, 9 Nov 2010 19:18:37 +0000
Subject: [PATCH] cogl-journal: Attempt to clip manually to avoid breaking up
 batches

Before flushing the journal there is now a separate iteration that
will try to determine if the matrix of the clip stack and the matrix
of the rectangle in each entry are on the same plane. If they are it
can completely avoid the clip stack and instead manually modify the
vertex and texture coordinates to implement the clip. This has the
advantage that it won't break up batching if a single clipped
rectangle is used in a scene.

The software clip is only used if there is no user program and no
texture matrices. There is a threshold to the size of the batch where
it is assumed that it is worth the cost to break up a batch and
program the GPU to do the clipping. Currently this is set to 8
although this figure is plucked out of thin air.

To check whether the two matrices are on the same plane it tries to
determine if one of the matrices is just a simple translation of the
other. In the process of this it also works out what the translation
would be. These values can be used to translate the clip rectangle
into the coordinate space of the rectangle to be logged. Then we can
do the clip directly in the rectangle's coordinate space.
---
 clutter/cogl/cogl/cogl-context.c         |   3 +
 clutter/cogl/cogl/cogl-context.h         |   1 +
 clutter/cogl/cogl/cogl-journal-private.h |   2 +
 clutter/cogl/cogl/cogl-journal.c         | 363 ++++++++++++++++++++++-
 4 files changed, 363 insertions(+), 6 deletions(-)

diff --git a/clutter/cogl/cogl/cogl-context.c b/clutter/cogl/cogl/cogl-context.c
index fa8a4b1bd..4b5b15e36 100644
--- a/clutter/cogl/cogl/cogl-context.c
+++ b/clutter/cogl/cogl/cogl-context.c
@@ -167,6 +167,7 @@ cogl_create_context (void)
   _context->logged_vertices = g_array_new (FALSE, FALSE, sizeof (GLfloat));
   _context->journal_flush_attributes_array =
     g_array_new (TRUE, FALSE, sizeof (CoglVertexAttribute *));
+  _context->journal_clip_bounds = NULL;
 
   _context->polygon_vertices = g_array_new (FALSE, FALSE, sizeof (float));
 
@@ -314,6 +315,8 @@ _cogl_destroy_context (void)
     g_array_free (_context->logged_vertices, TRUE);
   if (_context->journal_flush_attributes_array)
     g_array_free (_context->journal_flush_attributes_array, TRUE);
+  if (_context->journal_clip_bounds)
+    g_array_free (_context->journal_clip_bounds, TRUE);
 
   if (_context->polygon_vertices)
     g_array_free (_context->polygon_vertices, TRUE);
diff --git a/clutter/cogl/cogl/cogl-context.h b/clutter/cogl/cogl/cogl-context.h
index a3263558e..5511ed835 100644
--- a/clutter/cogl/cogl/cogl-context.h
+++ b/clutter/cogl/cogl/cogl-context.h
@@ -103,6 +103,7 @@ typedef struct
   GArray           *logged_vertices;
   GArray           *journal_flush_attributes_array;
   size_t            journal_needed_vbo_len;
+  GArray           *journal_clip_bounds;
 
   GArray           *polygon_vertices;
 
diff --git a/clutter/cogl/cogl/cogl-journal-private.h b/clutter/cogl/cogl/cogl-journal-private.h
index 0da86ff9e..a34a5263e 100644
--- a/clutter/cogl/cogl/cogl-journal-private.h
+++ b/clutter/cogl/cogl/cogl-journal-private.h
@@ -36,6 +36,8 @@ typedef struct _CoglJournalEntry
   int                      n_layers;
   CoglMatrix               model_view;
   CoglClipStack           *clip_stack;
+  /* Offset into ctx->logged_vertices */
+  size_t                   array_offset;
   /* XXX: These entries are pretty big now considering the padding in
    * CoglPipelineFlushOptions and CoglMatrix, so we might need to optimize this
    * later. */
diff --git a/clutter/cogl/cogl/cogl-journal.c b/clutter/cogl/cogl/cogl-journal.c
index ad12a4219..210043b25 100644
--- a/clutter/cogl/cogl/cogl-journal.c
+++ b/clutter/cogl/cogl/cogl-journal.c
@@ -85,6 +85,11 @@
   (POS_STRIDE + COLOR_STRIDE + \
    TEX_STRIDE * (N_LAYERS < MIN_LAYER_PADING ? MIN_LAYER_PADING : N_LAYERS))
 
+/* If a batch is longer than this threshold then we'll assume it's not
+   worth doing software clipping and it's cheaper to program the GPU
+   to do the clip */
+#define COGL_JOURNAL_HARDWARE_CLIP_THRESHOLD 8
+
 typedef struct _CoglJournalFlushState
 {
   CoglVertexArray     *vertex_array;
@@ -685,6 +690,337 @@ _cogl_journal_flush_clip_stacks_and_entries (CoglJournalEntry *batch_start,
                    time_flush_clip_stack_pipeline_entries);
 }
 
+static gboolean
+calculate_translation (const CoglMatrix *a,
+                       const CoglMatrix *b,
+                       float *tx_p,
+                       float *ty_p)
+{
+  float tx, ty;
+  int x, y;
+  /* Returns TRUE and fills tx_p/ty_p if b is an x/y translation of a */
+  /* Assuming we had the original matrix in this form:
+   *
+   *      [ a₁₁, a₁₂, a₁₃, a₁₄ ]
+   *      [ a₂₁, a₂₂, a₂₃, a₂₄ ]
+   *  a = [ a₃₁, a₃₂, a₃₃, a₃₄ ]
+   *      [ a₄₁, a₄₂, a₄₃, a₄₄ ]
+   *
+   * then a translation of that matrix would be a multiplication by a
+   * matrix of this form:
+   *
+   *      [ 1, 0, 0, x ]
+   *      [ 0, 1, 0, y ]
+   *  t = [ 0, 0, 1, 0 ]
+   *      [ 0, 0, 0, 1 ]
+   *
+   * That would give us a matrix of this form.
+   *
+   *              [ a₁₁, a₁₂, a₁₃, a₁₁ x + a₁₂ y + a₁₄ ]
+   *              [ a₂₁, a₂₂, a₂₃, a₂₁ x + a₂₂ y + a₂₄ ]
+   *  b = a ⋅ t = [ a₃₁, a₃₂, a₃₃, a₃₁ x + a₃₂ y + a₃₄ ]
+   *              [ a₄₁, a₄₂, a₄₃, a₄₁ x + a₄₂ y + a₄₄ ]
+   *
+   * We can use the two equations from the top of the last column to
+   * work out the x and y translation given the two matrices:
+   *
+   *  b₁₄ = a₁₁x + a₁₂y + a₁₄
+   *  b₂₄ = a₂₁x + a₂₂y + a₂₄
+   *
+   * Rearranging gives us:
+   *
+   *        a₁₂ b₂₄ - a₂₄ a₁₂
+   *        -----------------  +  a₁₄ - b₁₄
+   *              a₂₂
+   *  x =  ---------------------------------
+   *                a₁₂ a₂₁
+   *                -------  -  a₁₁
+   *                  a₂₂
+   *
+   *      b₂₄ - a₂₁x - a₂₄
+   *  y = ----------------
+   *            a₂₂
+   *
+   * Once we've worked out what x and y would be if this was a valid
+   * translation then we can simply verify that the rest of the matrix
+   * matches up.
+   */
+
+  /* The leftmost three columns of the matrix aren't changed by a
+     translation so we can just compare them directly */
+  for (y = 0; y < 4; y++)
+    for (x = 0; x < 3; x++)
+      if ((&a->xx)[x * 4 + y] != (&b->xx)[x * 4 + y])
+        return FALSE;
+
+  tx = (((a->xy * b->yw - a->yw * a->xy) / a->yy + a->xw - b->xw) /
+        ((a->xy * a->yx) / a->yy - a->xx));
+  ty = (b->yw - a->yx * tx - a->yw) / a->yy;
+  /* NOTE: a zero divisor gives inf/NaN (IEEE floats), failing the checks below */
+#define APPROX_EQUAL(a, b) (fabsf ((a) - (b)) < 1e-6f)
+
+  /* Check whether the 4th column of the matrices match up to the
+     calculation */
+  if (!APPROX_EQUAL (b->xw, a->xx * tx + a->xy * ty + a->xw) ||
+      !APPROX_EQUAL (b->yw, a->yx * tx + a->yy * ty + a->yw) ||
+      !APPROX_EQUAL (b->zw, a->zx * tx + a->zy * ty + a->zw) ||
+      !APPROX_EQUAL (b->ww, a->wx * tx + a->wy * ty + a->ww))
+    return FALSE;
+
+#undef APPROX_EQUAL
+
+  *tx_p = tx;
+  *ty_p = ty;
+
+  return TRUE;
+}
+
+typedef struct
+{
+  float x_1, y_1; /* minimum corner of the accumulated clip region */
+  float x_2, y_2; /* maximum corner of the accumulated clip region */
+} ClipBounds;
+
+static void
+check_software_clip_for_batch (CoglJournalEntry      *batch_start,
+                               int                    batch_len,
+                               CoglJournalFlushState *state)
+{
+  CoglClipStack *clip_stack, *clip_entry;
+  int entry_num;
+
+  _COGL_GET_CONTEXT (ctx, NO_RETVAL);
+
+  /* This tries to find cases where the entry is logged with a clip
+     but it would be faster to modify the vertex and texture
+     coordinates rather than flush the clip so that it can batch
+     better */
+
+  /* If the batch is reasonably long then it's worthwhile programming
+     the GPU to do the clip */
+  if (batch_len >= COGL_JOURNAL_HARDWARE_CLIP_THRESHOLD)
+    return;
+
+  clip_stack = batch_start->clip_stack;
+
+  if (clip_stack == NULL)
+    return;
+
+  /* Verify that all of the clip stack entries are a simple rectangle
+     clip */
+  for (clip_entry = clip_stack; clip_entry; clip_entry = clip_entry->parent)
+    if (clip_entry->type != COGL_CLIP_STACK_RECT)
+      return;
+
+  /* This scratch buffer is used to store the clip bounds for each
+     entry in the batch. We store them in a separate buffer because
+     they're expensive to calculate but at this point we still don't know
+     whether we can clip all of the entries so we don't want to do the
+     rest of the dependent calculations until we're sure we can. */
+  if (ctx->journal_clip_bounds == NULL)
+    ctx->journal_clip_bounds = g_array_new (FALSE, FALSE, sizeof (ClipBounds));
+  g_array_set_size (ctx->journal_clip_bounds, batch_len);
+
+  for (entry_num = 0; entry_num < batch_len; entry_num++)
+    {
+      CoglJournalEntry *journal_entry = batch_start + entry_num;
+      CoglPipeline *pipeline = journal_entry->pipeline;
+      ClipBounds *clip_bounds = &g_array_index (ctx->journal_clip_bounds,
+                                                ClipBounds, entry_num);
+      int layer_num;
+
+      clip_bounds->x_1 = -G_MAXFLOAT;
+      clip_bounds->y_1 = -G_MAXFLOAT;
+      clip_bounds->x_2 = G_MAXFLOAT;
+      clip_bounds->y_2 = G_MAXFLOAT;
+
+      /* Check the pipeline is usable. We can short-cut here for
+         entries using the same pipeline as the previous entry */
+      if (entry_num == 0 || pipeline != batch_start[entry_num - 1].pipeline)
+        {
+          /* If the pipeline has a user program then we can't reliably modify
+             the texture coordinates */
+          if (cogl_pipeline_get_user_program (pipeline))
+            return;
+
+          /* If any of the pipeline layers have a texture matrix then we can't
+             reliably modify the texture coordinates */
+          for (layer_num = cogl_pipeline_get_n_layers (pipeline) - 1;
+               layer_num >= 0;
+               layer_num--)
+            if (_cogl_pipeline_layer_has_user_matrix (pipeline, layer_num))
+              return;
+        }
+
+      /* Now we need to verify that each clip entry's matrix is just a
+         translation of the journal entry's modelview matrix. We can
+         also work out the bounds of the clip in modelview space using
+         this translation */
+      for (clip_entry = clip_stack; clip_entry; clip_entry = clip_entry->parent)
+        {
+          float rect_x1, rect_y1, rect_x2, rect_y2;
+          CoglClipStackRect *clip_rect;
+          float tx, ty;
+
+          clip_rect = (CoglClipStackRect *) clip_entry;
+
+          if (!calculate_translation (&clip_rect->matrix,
+                                      &journal_entry->model_view,
+                                      &tx, &ty))
+            return;
+
+          if (clip_rect->x0 < clip_rect->x1)
+            {
+              rect_x1 = clip_rect->x0;
+              rect_x2 = clip_rect->x1;
+            }
+          else
+            {
+              rect_x1 = clip_rect->x1;
+              rect_x2 = clip_rect->x0;
+            }
+          if (clip_rect->y0 < clip_rect->y1)
+            {
+              rect_y1 = clip_rect->y0;
+              rect_y2 = clip_rect->y1;
+            }
+          else
+            {
+              rect_y1 = clip_rect->y1;
+              rect_y2 = clip_rect->y0;
+            }
+
+          clip_bounds->x_1 = MAX (clip_bounds->x_1, rect_x1 - tx);
+          clip_bounds->y_1 = MAX (clip_bounds->y_1, rect_y1 - ty);
+          clip_bounds->x_2 = MIN (clip_bounds->x_2, rect_x2 - tx);
+          clip_bounds->y_2 = MIN (clip_bounds->y_2, rect_y2 - ty);
+        }
+    }
+
+  /* If we make it here then we know we can software clip the entire batch */
+
+  for (entry_num = 0; entry_num < batch_len; entry_num++)
+    {
+      CoglJournalEntry *journal_entry = batch_start + entry_num;
+      float *verts = &g_array_index (ctx->logged_vertices, float,
+                                     journal_entry->array_offset + 1);
+      ClipBounds *clip_bounds = &g_array_index (ctx->journal_clip_bounds,
+                                                ClipBounds, entry_num);
+
+      size_t stride =
+        GET_JOURNAL_ARRAY_STRIDE_FOR_N_LAYERS (journal_entry->n_layers);
+      float rx1, ry1, rx2, ry2;
+      float vx1, vy1, vx2, vy2;
+      int layer_num;
+
+      /* Remove the clip on the entry */
+      _cogl_clip_stack_unref (journal_entry->clip_stack);
+      journal_entry->clip_stack = NULL;
+
+      vx1 = verts[0];
+      vy1 = verts[1];
+      vx2 = verts[stride];
+      vy2 = verts[stride + 1];
+
+      if (vx1 < vx2)
+        {
+          rx1 = vx1;
+          rx2 = vx2;
+        }
+      else
+        {
+          rx1 = vx2;
+          rx2 = vx1;
+        }
+      if (vy1 < vy2)
+        {
+          ry1 = vy1;
+          ry2 = vy2;
+        }
+      else
+        {
+          ry1 = vy2;
+          ry2 = vy1;
+        }
+
+      rx1 = CLAMP (rx1, clip_bounds->x_1, clip_bounds->x_2);
+      ry1 = CLAMP (ry1, clip_bounds->y_1, clip_bounds->y_2);
+      rx2 = CLAMP (rx2, clip_bounds->x_1, clip_bounds->x_2);
+      ry2 = CLAMP (ry2, clip_bounds->y_1, clip_bounds->y_2);
+      /* Disjoint clip rectangles leave x_1 > x_2 (or y_1 > y_2) and CLAMP
+         then yields r?1 >= r?2, so treat that like a zero-area result */
+      if (rx1 >= rx2 || ry1 >= ry2)
+        /* Will set all of the vertex data to 0 in the hope that this
+           will create a degenerate rectangle and the GL driver will
+           be able to clip it quickly */
+        memset (verts, 0, sizeof (float) * stride * 2);
+      else
+        {
+          if (vx1 > vx2)
+            {
+              float t = rx1;
+              rx1 = rx2;
+              rx2 = t;
+            }
+          if (vy1 > vy2)
+            {
+              float t = ry1;
+              ry1 = ry2;
+              ry2 = t;
+            }
+
+          verts[0] = rx1;
+          verts[1] = ry1;
+          verts[stride] = rx2;
+          verts[stride + 1] = ry2;
+
+          /* Convert the rectangle coordinates to a fraction of the original
+             rectangle */
+          rx1 = (rx1 - vx1) / (vx2 - vx1);
+          ry1 = (ry1 - vy1) / (vy2 - vy1);
+          rx2 = (rx2 - vx1) / (vx2 - vx1);
+          ry2 = (ry2 - vy1) / (vy2 - vy1);
+
+          for (layer_num = 0; layer_num < journal_entry->n_layers; layer_num++)
+            {
+              float *t = verts + 2 + 2 * layer_num;
+              float tx1 = t[0], ty1 = t[1];
+              float tx2 = t[stride], ty2 = t[stride + 1];
+              t[0] = rx1 * (tx2 - tx1) + tx1;
+              t[1] = ry1 * (ty2 - ty1) + ty1;
+              t[stride] = rx2 * (tx2 - tx1) + tx1;
+              t[stride + 1] = ry2 * (ty2 - ty1) + ty1;
+            }
+        }
+    }
+
+  return;
+}
+
+static void
+_cogl_journal_check_software_clip (CoglJournalEntry *batch_start,
+                                   int               batch_len,
+                                   void             *data)
+{
+  CoglJournalFlushState *state = data;
+  /* batch_and_call() callback: times check_software_clip_for_batch() */
+  COGL_STATIC_TIMER (time_check_software_clip,
+                     "Journal Flush", /* parent */
+                     "flush: check software clip",
+                     "Time spent checking for software clip",
+                     0 /* no application private data */);
+
+  _COGL_GET_CONTEXT (ctx, NO_RETVAL);
+
+  COGL_TIMER_START (_cogl_uprof_context,
+                    time_check_software_clip);
+
+  check_software_clip_for_batch (batch_start, batch_len, state);
+
+  COGL_TIMER_STOP (_cogl_uprof_context,
+                   time_check_software_clip);
+}
+
 static gboolean
 compare_entry_clip_stacks (CoglJournalEntry *entry0, CoglJournalEntry *entry1)
 {
@@ -812,19 +1148,33 @@ _cogl_journal_flush (void)
   if (G_UNLIKELY (cogl_debug_flags & COGL_DEBUG_BATCHING))
     g_print ("BATCHING: journal len = %d\n", ctx->journal->len);
 
-  state.vertex_array = upload_vertices (&g_array_index (ctx->journal,
-                                                        CoglJournalEntry, 0),
-                                        ctx->journal->len,
-                                        ctx->journal_needed_vbo_len,
-                                        ctx->logged_vertices);
   state.attributes = ctx->journal_flush_attributes_array;
-  state.array_offset = 0;
 
   framebuffer = _cogl_get_framebuffer ();
   modelview_stack = _cogl_framebuffer_get_modelview_stack (framebuffer);
   state.modelview_stack = modelview_stack;
   state.projection_stack = _cogl_framebuffer_get_projection_stack (framebuffer);
 
+  /* We do an initial walk of the journal to analyse the clip stack
+     batches to see if we can do software clipping. We do this as a
+     separate walk of the journal because we can modify entries and
+     this may end up joining together clip stack batches in the next
+     iteration. */
+  batch_and_call ((CoglJournalEntry *)ctx->journal->data, /* first entry */
+                  ctx->journal->len, /* max number of entries to consider */
+                  compare_entry_clip_stacks,
+                  _cogl_journal_check_software_clip, /* callback */
+                  &state); /* data */
+
+  /* We upload the vertices after the clip stack pass in case it
+     modifies the entries */
+  state.vertex_array = upload_vertices (&g_array_index (ctx->journal,
+                                                        CoglJournalEntry, 0),
+                                        ctx->journal->len,
+                                        ctx->journal_needed_vbo_len,
+                                        ctx->logged_vertices);
+  state.array_offset = 0;
+
   /* batch_and_call() batches a list of journal entries according to some
    * given criteria and calls a callback once for each determined batch.
    *
@@ -975,6 +1325,7 @@ _cogl_journal_log_quad (const float  *position,
   entry = &g_array_index (ctx->journal, CoglJournalEntry, next_entry);
 
   entry->n_layers = n_layers;
+  entry->array_offset = next_vert;
 
   source = pipeline;