cogl-journal: Attempt to clip manually to avoid breaking up batches

Before flushing the journal there is now a separate iteration that will try to determine if the matrix of the clip stack and the matrix of the rectangle in each entry are on the same plane. If they are it can completely avoid the clip stack and instead manually modify the vertex and texture coordinates to implement the clip. The has the advantage that it won't break up batching if a single clipped rectangle is used in a scene. The software clip is only used if there is no user program and no texture matrices. There is a threshold to the size of the batch where it is assumed that it is worth the cost to break up a batch and program the GPU to do the clipping. Currently this is set to 8 although this figure is plucked out of thin air. To check whether the two matrices are on the same plane it tries to determine if one of the matrices is just a simple translation of the other. In the process of this it also works out what the translation would be. These values can be used to translate the clip rectangle into the coordinate space of the rectangle to be logged. Then we can do the clip directly in the rectangle's coordinate space.
2024-11-22 16:10:41 -05:00 · 2010-11-09 19:18:37 +00:00 · 2010-11-09 19:18:37 +00:00 · 1d88e6c8ac
commit 1d88e6c8ac
parent 7294e5552c
4 changed files with 363 additions and 6 deletions
--- a/cogl/cogl-context.c
+++ b/cogl/cogl-context.c
@ -167,6 +167,7 @@ cogl_create_context (void)
  _context->logged_vertices = g_array_new (FALSE, FALSE, sizeof (GLfloat));
  _context->journal_flush_attributes_array =
    g_array_new (TRUE, FALSE, sizeof (CoglVertexAttribute *));
  _context->journal_clip_bounds = NULL;
  _context->polygon_vertices = g_array_new (FALSE, FALSE, sizeof (float));
@ -314,6 +315,8 @@ _cogl_destroy_context (void)
    g_array_free (_context->logged_vertices, TRUE);
  if (_context->journal_flush_attributes_array)
    g_array_free (_context->journal_flush_attributes_array, TRUE);
  if (_context->journal_clip_bounds)
    g_array_free (_context->journal_clip_bounds, TRUE);
  if (_context->polygon_vertices)
    g_array_free (_context->polygon_vertices, TRUE);
--- a/cogl/cogl-context.h
+++ b/cogl/cogl-context.h
@ -103,6 +103,7 @@ typedef struct
  GArray           *logged_vertices;
  GArray           *journal_flush_attributes_array;
  size_t            journal_needed_vbo_len;
  GArray           *journal_clip_bounds;
  GArray           *polygon_vertices;
--- a/cogl/cogl-journal-private.h
+++ b/cogl/cogl-journal-private.h
@ -36,6 +36,8 @@ typedef struct _CoglJournalEntry
  int                      n_layers;
  CoglMatrix               model_view;
  CoglClipStack           *clip_stack;
  /* Offset into ctx->logged_vertices */
  size_t                   array_offset;
  /* XXX: These entries are pretty big now considering the padding in
   * CoglPipelineFlushOptions and CoglMatrix, so we might need to optimize this
   * later. */
--- a/cogl/cogl-journal.c
+++ b/cogl/cogl-journal.c
@ -85,6 +85,11 @@
  (POS_STRIDE + COLOR_STRIDE + \
   TEX_STRIDE * (N_LAYERS < MIN_LAYER_PADING ? MIN_LAYER_PADING : N_LAYERS))
 /* If a batch is longer than this threshold then we'll assume it's not
   worth doing software clipping and it's cheaper to program the GPU
   to do the clip */
 #define COGL_JOURNAL_HARDWARE_CLIP_THRESHOLD 8
 typedef struct _CoglJournalFlushState
 {
  CoglVertexArray     *vertex_array;
@ -685,6 +690,337 @@ _cogl_journal_flush_clip_stacks_and_entries (CoglJournalEntry *batch_start,
                   time_flush_clip_stack_pipeline_entries);
 }
 static gboolean
 calculate_translation (const CoglMatrix *a,
                       const CoglMatrix *b,
                       float *tx_p,
                       float *ty_p)
 {
  float tx, ty;
  int x, y;
  /* Assuming we had the original matrix in this form:
   *
   *      [ a₁₁, a₁₂, a₁₃, a₁₄ ]
   *      [ a₂₁, a₂₂, a₂₃, a₂₄ ]
   *  a = [ a₃₁, a₃₂, a₃₃, a₃₄ ]
   *      [ a₄₁, a₄₂, a₄₃, a₄₄ ]
   *
   * then a translation of that matrix would be a multiplication by a
   * matrix of this form:
   *
   *      [ 1, 0, 0, x ]
   *      [ 0, 1, 0, y ]
   *  t = [ 0, 0, 1, 0 ]
   *      [ 0, 0, 0, 1 ]
   *
   * That would give us a matrix of this form.
   *
   *              [ a₁₁, a₁₂, a₁₃, a₁₁ x + a₁₂ y + a₁₄ ]
   *              [ a₂₁, a₂₂, a₂₃, a₂₁ x + a₂₂ y + a₂₄ ]
   *  b = a ⋅ t = [ a₃₁, a₃₂, a₃₃, a₃₁ x + a₃₂ y + a₃₄ ]
   *              [ a₄₁, a₄₂, a₄₃, a₄₁ x + a₄₂ y + a₄₄ ]
   *
   * We can use the two equations from the top left of the matrix to
   * work out the x and y translation given the two matrices:
   *
   *  b₁₄ = a₁₁x + a₁₂y + a₁₄
   *  b₂₄ = a₂₁x + a₂₂y + a₂₄
   *
   * Rearranging gives us:
   *
   *        a₁₂ b₂₄ - a₂₄ a₁₂
   *        -----------------  +  a₁₄ - b₁₄
   *              a₂₂
   *  x =  ---------------------------------
   *                a₁₂ a₂₁
   *                -------  -  a₁₁
   *                  a₂₂
   *
   *      b₂₄ - a₂₁x - a₂₄
   *  y = ----------------
   *            a₂₂
   *
   * Once we've worked out what x and y would be if this was a valid
   * translation then we can simply verify that the rest of the matrix
   * matches up.
   */
  /* The leftmost 3x4 part of the matrix shouldn't change by a
     translation so we can just compare it directly */
  for (y = 0; y < 4; y++)
    for (x = 0; x < 3; x++)
      if ((&a->xx)[x * 4 + y] != (&b->xx)[x * 4 + y])
        return FALSE;
  tx = (((a->xy * b->yw - a->yw * a->xy) / a->yy + a->xw - b->xw) /
        ((a->xy * a->yx) / a->yy - a->xx));
  ty = (b->yw - a->yx * tx - a->yw) / a->yy;
 #define APPROX_EQUAL(a, b) (fabsf ((a) - (b)) < 1e-6f)
  /* Check whether the 4th column of the matrices match up to the
     calculation */
  if (!APPROX_EQUAL (b->xw, a->xx * tx + a->xy * ty + a->xw) ||
      !APPROX_EQUAL (b->yw, a->yx * tx + a->yy * ty + a->yw) ||
      !APPROX_EQUAL (b->zw, a->zx * tx + a->zy * ty + a->zw) ||
      !APPROX_EQUAL (b->ww, a->wx * tx + a->wy * ty + a->ww))
    return FALSE;
 #undef APPROX_EQUAL
  *tx_p = tx;
  *ty_p = ty;
  return TRUE;
 }
 typedef struct
 {
  float x_1, y_1;
  float x_2, y_2;
 } ClipBounds;
 static void
 check_software_clip_for_batch (CoglJournalEntry      *batch_start,
                               int                    batch_len,
                               CoglJournalFlushState *state)
 {
  CoglClipStack *clip_stack, *clip_entry;
  int entry_num;
  _COGL_GET_CONTEXT (ctx, NO_RETVAL);
  /* This tries to find cases where the entry is logged with a clip
     but it would be faster to modify the vertex and texture
     coordinates rather than flush the clip so that it can batch
     better */
  /* If the batch is reasonably long then it's worthwhile programming
     the GPU to do the clip */
  if (batch_len >= COGL_JOURNAL_HARDWARE_CLIP_THRESHOLD)
    return;
  clip_stack = batch_start->clip_stack;
  if (clip_stack == NULL)
    return;
  /* Verify that all of the clip stack entries are a simple rectangle
     clip */
  for (clip_entry = clip_stack; clip_entry; clip_entry = clip_entry->parent)
    if (clip_entry->type != COGL_CLIP_STACK_RECT)
      return;
  /* This scratch buffer is used to store the translation for each
     entry in the journal. We store it in a separate buffer because
     it's expensive to calculate but at this point we still don't know
     whether we can clip all of the entries so we don't want to do the
     rest of the dependant calculations until we're sure we can. */
  if (ctx->journal_clip_bounds == NULL)
    ctx->journal_clip_bounds = g_array_new (FALSE, FALSE, sizeof (ClipBounds));
  g_array_set_size (ctx->journal_clip_bounds, batch_len);
  for (entry_num = 0; entry_num < batch_len; entry_num++)
    {
      CoglJournalEntry *journal_entry = batch_start + entry_num;
      CoglPipeline *pipeline = journal_entry->pipeline;
      ClipBounds *clip_bounds = &g_array_index (ctx->journal_clip_bounds,
                                                ClipBounds, entry_num);
      int layer_num;
      clip_bounds->x_1 = -G_MAXFLOAT;
      clip_bounds->y_1 = -G_MAXFLOAT;
      clip_bounds->x_2 = G_MAXFLOAT;
      clip_bounds->y_2 = G_MAXFLOAT;
      /* Check the pipeline is usable. We can short-cut here for
         entries using the same pipeline as the previous entry */
      if (entry_num == 0 || pipeline != batch_start[entry_num - 1].pipeline)
        {
          /* If the pipeline has a user program then we can't reliably modify
             the texture coordinates */
          if (cogl_pipeline_get_user_program (pipeline))
            return;
          /* If any of the pipeline layers have a texture matrix then we can't
             reliably modify the texture coordinates */
          for (layer_num = cogl_pipeline_get_n_layers (pipeline) - 1;
               layer_num >= 0;
               layer_num--)
            if (_cogl_pipeline_layer_has_user_matrix (pipeline, layer_num))
              return;
        }
      /* Now we need to verify that each clip entry's matrix is just a
         translation of the journal entry's modelview matrix. We can
         also work out the bounds of the clip in modelview space using
         this translation */
      for (clip_entry = clip_stack; clip_entry; clip_entry = clip_entry->parent)
        {
          float rect_x1, rect_y1, rect_x2, rect_y2;
          CoglClipStackRect *clip_rect;
          float tx, ty;
          clip_rect = (CoglClipStackRect *) clip_entry;
          if (!calculate_translation (&clip_rect->matrix,
                                      &journal_entry->model_view,
                                      &tx, &ty))
            return;
          if (clip_rect->x0 < clip_rect->x1)
            {
              rect_x1 = clip_rect->x0;
              rect_x2 = clip_rect->x1;
            }
          else
            {
              rect_x1 = clip_rect->x1;
              rect_x2 = clip_rect->x0;
            }
          if (clip_rect->y0 < clip_rect->y1)
            {
              rect_y1 = clip_rect->y0;
              rect_y2 = clip_rect->y1;
            }
          else
            {
              rect_y1 = clip_rect->y1;
              rect_y2 = clip_rect->y0;
            }
          clip_bounds->x_1 = MAX (clip_bounds->x_1, rect_x1 - tx);
          clip_bounds->y_1 = MAX (clip_bounds->y_1, rect_y1 - ty);
          clip_bounds->x_2 = MIN (clip_bounds->x_2, rect_x2 - tx);
          clip_bounds->y_2 = MIN (clip_bounds->y_2, rect_y2 - ty);
        }
    }
  /* If we make it here then we know we can software clip the entire batch */
  for (entry_num = 0; entry_num < batch_len; entry_num++)
    {
      CoglJournalEntry *journal_entry = batch_start + entry_num;
      float *verts = &g_array_index (ctx->logged_vertices, float,
                                     journal_entry->array_offset + 1);
      ClipBounds *clip_bounds = &g_array_index (ctx->journal_clip_bounds,
                                                ClipBounds, entry_num);
      size_t stride =
        GET_JOURNAL_ARRAY_STRIDE_FOR_N_LAYERS (journal_entry->n_layers);
      float rx1, ry1, rx2, ry2;
      float vx1, vy1, vx2, vy2;
      int layer_num;
      /* Remove the clip on the entry */
      _cogl_clip_stack_unref (journal_entry->clip_stack);
      journal_entry->clip_stack = NULL;
      vx1 = verts[0];
      vy1 = verts[1];
      vx2 = verts[stride];
      vy2 = verts[stride + 1];
      if (vx1 < vx2)
        {
          rx1 = vx1;
          rx2 = vx2;
        }
      else
        {
          rx1 = vx2;
          rx2 = vx1;
        }
      if (vy1 < vy2)
        {
          ry1 = vy1;
          ry2 = vy2;
        }
      else
        {
          ry1 = vy2;
          ry2 = vy1;
        }
      rx1 = CLAMP (rx1, clip_bounds->x_1, clip_bounds->x_2);
      ry1 = CLAMP (ry1, clip_bounds->y_1, clip_bounds->y_2);
      rx2 = CLAMP (rx2, clip_bounds->x_1, clip_bounds->x_2);
      ry2 = CLAMP (ry2, clip_bounds->y_1, clip_bounds->y_2);
      /* Check if the rectangle intersects the clip at all */
      if (rx1 == rx2 || ry1 == ry2)
        /* Will set all of the vertex data to 0 in the hope that this
           will create a degenerate rectangle and the GL driver will
           be able to clip it quickly */
        memset (verts, 0, sizeof (float) * stride * 2);
      else
        {
          if (vx1 > vx2)
            {
              float t = rx1;
              rx1 = rx2;
              rx2 = t;
            }
          if (vy1 > vy2)
            {
              float t = ry1;
              ry1 = ry2;
              ry2 = t;
            }
          verts[0] = rx1;
          verts[1] = ry1;
          verts[stride] = rx2;
          verts[stride + 1] = ry2;
          /* Convert the rectangle coordinates to a fraction of the original
             rectangle */
          rx1 = (rx1 - vx1) / (vx2 - vx1);
          ry1 = (ry1 - vy1) / (vy2 - vy1);
          rx2 = (rx2 - vx1) / (vx2 - vx1);
          ry2 = (ry2 - vy1) / (vy2 - vy1);
          for (layer_num = 0; layer_num < journal_entry->n_layers; layer_num++)
            {
              float *t = verts + 2 + 2 * layer_num;
              float tx1 = t[0], ty1 = t[1];
              float tx2 = t[stride], ty2 = t[stride + 1];
              t[0] = rx1 * (tx2 - tx1) + tx1;
              t[1] = ry1 * (ty2 - ty1) + ty1;
              t[stride] = rx2 * (tx2 - tx1) + tx1;
              t[stride + 1] = ry2 * (ty2 - ty1) + ty1;
            }
        }
    }
  return;
 }
 static void
 _cogl_journal_check_software_clip (CoglJournalEntry *batch_start,
                                   int               batch_len,
                                   void             *data)
 {
  CoglJournalFlushState *state = data;
  COGL_STATIC_TIMER (time_check_software_clip,
                     "Journal Flush", /* parent */
                     "flush: check software clip",
                     "Time spent checking for software clip",
                     0 /* no application private data */);
  _COGL_GET_CONTEXT (ctx, NO_RETVAL);
  COGL_TIMER_START (_cogl_uprof_context,
                    time_check_software_clip);
  check_software_clip_for_batch (batch_start, batch_len, state);
  COGL_TIMER_STOP (_cogl_uprof_context,
                   time_check_software_clip);
 }
 static gboolean
 compare_entry_clip_stacks (CoglJournalEntry *entry0, CoglJournalEntry *entry1)
 {
@ -812,19 +1148,33 @@ _cogl_journal_flush (void)
  if (G_UNLIKELY (cogl_debug_flags & COGL_DEBUG_BATCHING))
    g_print ("BATCHING: journal len = %d\n", ctx->journal->len);
  state.vertex_array = upload_vertices (&g_array_index (ctx->journal,
                                                        CoglJournalEntry, 0),
                                        ctx->journal->len,
                                        ctx->journal_needed_vbo_len,
                                        ctx->logged_vertices);
  state.attributes = ctx->journal_flush_attributes_array;
  state.array_offset = 0;
  framebuffer = _cogl_get_framebuffer ();
  modelview_stack = _cogl_framebuffer_get_modelview_stack (framebuffer);
  state.modelview_stack = modelview_stack;
  state.projection_stack = _cogl_framebuffer_get_projection_stack (framebuffer);
  /* We do an initial walk of the journal to analyse the clip stack
     batches to see if we can do software clipping. We do this as a
     separate walk of the journal because we can modify entries and
     this may end up joining together clip stack batches in the next
     iteration. */
  batch_and_call ((CoglJournalEntry *)ctx->journal->data, /* first entry */
                  ctx->journal->len, /* max number of entries to consider */
                  compare_entry_clip_stacks,
                  _cogl_journal_check_software_clip, /* callback */
                  &state); /* data */
  /* We upload the vertices after the clip stack pass in case it
     modifies the entries */
  state.vertex_array = upload_vertices (&g_array_index (ctx->journal,
                                                        CoglJournalEntry, 0),
                                        ctx->journal->len,
                                        ctx->journal_needed_vbo_len,
                                        ctx->logged_vertices);
  state.array_offset = 0;
  /* batch_and_call() batches a list of journal entries according to some
   * given criteria and calls a callback once for each determined batch.
   *
@ -975,6 +1325,7 @@ _cogl_journal_log_quad (const float  *position,
  entry = &g_array_index (ctx->journal, CoglJournalEntry, next_entry);
  entry->n_layers = n_layers;
  entry->array_offset = next_vert;
  source = pipeline;