diff --git a/cogl/cogl-context.c b/cogl/cogl-context.c
index fa8a4b1bd..4b5b15e36 100644
--- a/cogl/cogl-context.c
+++ b/cogl/cogl-context.c
@@ -167,6 +167,7 @@ cogl_create_context (void)
   _context->logged_vertices = g_array_new (FALSE, FALSE, sizeof (GLfloat));
   _context->journal_flush_attributes_array =
     g_array_new (TRUE, FALSE, sizeof (CoglVertexAttribute *));
+  _context->journal_clip_bounds = NULL;
 
   _context->polygon_vertices = g_array_new (FALSE, FALSE, sizeof (float));
 
@@ -314,6 +315,8 @@ _cogl_destroy_context (void)
     g_array_free (_context->logged_vertices, TRUE);
   if (_context->journal_flush_attributes_array)
     g_array_free (_context->journal_flush_attributes_array, TRUE);
+  if (_context->journal_clip_bounds)
+    g_array_free (_context->journal_clip_bounds, TRUE);
 
   if (_context->polygon_vertices)
     g_array_free (_context->polygon_vertices, TRUE);
diff --git a/cogl/cogl-context.h b/cogl/cogl-context.h
index a3263558e..5511ed835 100644
--- a/cogl/cogl-context.h
+++ b/cogl/cogl-context.h
@@ -103,6 +103,7 @@ typedef struct
   GArray           *logged_vertices;
   GArray           *journal_flush_attributes_array;
   size_t            journal_needed_vbo_len;
+  GArray           *journal_clip_bounds;
 
   GArray           *polygon_vertices;
 
diff --git a/cogl/cogl-journal-private.h b/cogl/cogl-journal-private.h
index 0da86ff9e..a34a5263e 100644
--- a/cogl/cogl-journal-private.h
+++ b/cogl/cogl-journal-private.h
@@ -36,6 +36,8 @@ typedef struct _CoglJournalEntry
   int                      n_layers;
   CoglMatrix               model_view;
   CoglClipStack           *clip_stack;
+  /* Offset into ctx->logged_vertices */
+  size_t                   array_offset;
   /* XXX: These entries are pretty big now considering the padding in
    * CoglPipelineFlushOptions and CoglMatrix, so we might need to optimize this
    * later. */
diff --git a/cogl/cogl-journal.c b/cogl/cogl-journal.c
index ad12a4219..210043b25 100644
--- a/cogl/cogl-journal.c
+++ b/cogl/cogl-journal.c
@@ -85,6 +85,11 @@
   (POS_STRIDE + COLOR_STRIDE + \
    TEX_STRIDE * (N_LAYERS < MIN_LAYER_PADING ? MIN_LAYER_PADING : N_LAYERS))
 
+/* If a batch has at least this many entries then we'll assume it's
+   not worth doing software clipping and it's cheaper to program the
+   GPU to do the clip */
+#define COGL_JOURNAL_HARDWARE_CLIP_THRESHOLD 8
+
 typedef struct _CoglJournalFlushState
 {
   CoglVertexArray     *vertex_array;
@@ -685,6 +690,337 @@ _cogl_journal_flush_clip_stacks_and_entries (CoglJournalEntry *batch_start,
                    time_flush_clip_stack_pipeline_entries);
 }
 
+static gboolean
+calculate_translation (const CoglMatrix *a,
+                       const CoglMatrix *b,
+                       float *tx_p,
+                       float *ty_p)
+{
+  float tx, ty;
+  int x, y;
+
+  /* Checks whether b is just a translated by (x, y), i.e. b = a ⋅ t.
+   * If so, the translation is stored in *tx_p and *ty_p and TRUE is
+   * returned. Otherwise FALSE is returned and *tx_p and *ty_p are
+   * left untouched.
+   *
+   * Assuming we had the original matrix in this form:
+   *      [ a₁₁, a₁₂, a₁₃, a₁₄ ]
+   *      [ a₂₁, a₂₂, a₂₃, a₂₄ ]
+   *  a = [ a₃₁, a₃₂, a₃₃, a₃₄ ]
+   *      [ a₄₁, a₄₂, a₄₃, a₄₄ ]
+   *
+   * then a translation of that matrix would be a multiplication by a
+   * matrix of this form:
+   *      [ 1, 0, 0, x ]
+   *      [ 0, 1, 0, y ]
+   *  t = [ 0, 0, 1, 0 ]
+   *      [ 0, 0, 0, 1 ]
+   *
+   * That would give us a matrix of this form:
+   *              [ a₁₁, a₁₂, a₁₃, a₁₁ x + a₁₂ y + a₁₄ ]
+   *              [ a₂₁, a₂₂, a₂₃, a₂₁ x + a₂₂ y + a₂₄ ]
+   *  b = a ⋅ t = [ a₃₁, a₃₂, a₃₃, a₃₁ x + a₃₂ y + a₃₄ ]
+   *              [ a₄₁, a₄₂, a₄₃, a₄₁ x + a₄₂ y + a₄₄ ]
+   *
+   * We can use the two equations from the top right of the matrix to
+   * work out the x and y translation given the two matrices:
+   *  b₁₄ = a₁₁x + a₁₂y + a₁₄
+   *  b₂₄ = a₂₁x + a₂₂y + a₂₄
+   *
+   * Rearranging gives us:
+   *        a₁₂ b₂₄ - a₂₄ a₁₂
+   *        -----------------  +  a₁₄ - b₁₄
+   *              a₂₂
+   *  x =  ---------------------------------
+   *                a₁₂ a₂₁
+   *                -------  -  a₁₁
+   *                  a₂₂
+   *
+   *      b₂₄ - a₂₁x - a₂₄
+   *  y = ----------------
+   *            a₂₂
+   *
+   * Once we've worked out what x and y would be if this was a valid
+   * translation then we can simply verify that the rest of the matrix
+   * matches up.
+   */
+
+  /* The leftmost three columns of the matrix aren't changed by a
+     translation so we can just compare them directly */
+  for (y = 0; y < 4; y++)
+    for (x = 0; x < 3; x++)
+      if ((&a->xx)[x * 4 + y] != (&b->xx)[x * 4 + y])
+        return FALSE;
+
+  tx = (((a->xy * b->yw - a->yw * a->xy) / a->yy + a->xw - b->xw) /
+        ((a->xy * a->yx) / a->yy - a->xx));
+  ty = (b->yw - a->yx * tx - a->yw) / a->yy;
+
+#define APPROX_EQUAL(a, b) (fabsf ((a) - (b)) < 1e-6f)
+
+  /* Check whether the 4th column of the matrices match up to the
+     calculation */
+  if (!APPROX_EQUAL (b->xw, a->xx * tx + a->xy * ty + a->xw) ||
+      !APPROX_EQUAL (b->yw, a->yx * tx + a->yy * ty + a->yw) ||
+      !APPROX_EQUAL (b->zw, a->zx * tx + a->zy * ty + a->zw) ||
+      !APPROX_EQUAL (b->ww, a->wx * tx + a->wy * ty + a->ww))
+    return FALSE;
+
+#undef APPROX_EQUAL
+
+  *tx_p = tx;
+  *ty_p = ty;
+
+  return TRUE;
+}
+
+typedef struct
+{
+  float x_1, y_1; /* lower x/y bounds of the clip region */
+  float x_2, y_2; /* upper x/y bounds of the clip region */
+} ClipBounds;
+
+/* Tries to clip a batch of @batch_len entries starting at @batch_start
+   in software. This finds cases where the entries are logged with a
+   clip but it would be faster to modify the vertex and texture
+   coordinates rather than flush the clip so that they can batch
+   better. If any entry can't be handled this returns before
+   modifying any of them and the batch keeps its clip stack.
+   @state is currently unused. */
+static void
+check_software_clip_for_batch (CoglJournalEntry      *batch_start,
+                               int                    batch_len,
+                               CoglJournalFlushState *state)
+{
+  CoglClipStack *clip_stack, *clip_entry;
+  int entry_num;
+
+  _COGL_GET_CONTEXT (ctx, NO_RETVAL);
+
+  /* If the batch is reasonably long then it's worthwhile programming
+     the GPU to do the clip */
+  if (batch_len >= COGL_JOURNAL_HARDWARE_CLIP_THRESHOLD)
+    return;
+
+  clip_stack = batch_start->clip_stack;
+
+  if (clip_stack == NULL)
+    return;
+
+  /* Verify that all of the clip stack entries are a simple rectangle
+     clip */
+  for (clip_entry = clip_stack; clip_entry; clip_entry = clip_entry->parent)
+    if (clip_entry->type != COGL_CLIP_STACK_RECT)
+      return;
+
+  /* This scratch buffer is used to store the clip bounds for each
+     entry in the journal. We store them in a separate buffer because
+     they're expensive to calculate but at this point we still don't
+     know whether we can clip all of the entries so we don't want to do
+     the rest of the dependent calculations until we're sure we can. */
+  if (ctx->journal_clip_bounds == NULL)
+    ctx->journal_clip_bounds = g_array_new (FALSE, FALSE, sizeof (ClipBounds));
+  g_array_set_size (ctx->journal_clip_bounds, batch_len);
+
+  for (entry_num = 0; entry_num < batch_len; entry_num++)
+    {
+      CoglJournalEntry *journal_entry = batch_start + entry_num;
+      CoglPipeline *pipeline = journal_entry->pipeline;
+      ClipBounds *clip_bounds = &g_array_index (ctx->journal_clip_bounds,
+                                                ClipBounds, entry_num);
+      int layer_num;
+
+      clip_bounds->x_1 = -G_MAXFLOAT;
+      clip_bounds->y_1 = -G_MAXFLOAT;
+      clip_bounds->x_2 = G_MAXFLOAT;
+      clip_bounds->y_2 = G_MAXFLOAT;
+
+      /* Check the pipeline is usable. We can short-cut here for
+         entries using the same pipeline as the previous entry */
+      if (entry_num == 0 || pipeline != batch_start[entry_num - 1].pipeline)
+        {
+          /* If the pipeline has a user program then we can't reliably modify
+             the texture coordinates */
+          if (cogl_pipeline_get_user_program (pipeline))
+            return;
+
+          /* If any of the pipeline layers have a texture matrix then we can't
+             reliably modify the texture coordinates */
+          for (layer_num = cogl_pipeline_get_n_layers (pipeline) - 1;
+               layer_num >= 0;
+               layer_num--)
+            if (_cogl_pipeline_layer_has_user_matrix (pipeline, layer_num))
+              return;
+        }
+
+      /* Now we need to verify that each clip entry's matrix is just a
+         translation of the journal entry's modelview matrix. We can
+         also work out the bounds of the clip in modelview space using
+         this translation */
+      for (clip_entry = clip_stack; clip_entry; clip_entry = clip_entry->parent)
+        {
+          float rect_x1, rect_y1, rect_x2, rect_y2;
+          CoglClipStackRect *clip_rect;
+          float tx, ty;
+
+          clip_rect = (CoglClipStackRect *) clip_entry;
+
+          if (!calculate_translation (&clip_rect->matrix,
+                                      &journal_entry->model_view,
+                                      &tx, &ty))
+            return;
+
+          if (clip_rect->x0 < clip_rect->x1)
+            {
+              rect_x1 = clip_rect->x0;
+              rect_x2 = clip_rect->x1;
+            }
+          else
+            {
+              rect_x1 = clip_rect->x1;
+              rect_x2 = clip_rect->x0;
+            }
+          if (clip_rect->y0 < clip_rect->y1)
+            {
+              rect_y1 = clip_rect->y0;
+              rect_y2 = clip_rect->y1;
+            }
+          else
+            {
+              rect_y1 = clip_rect->y1;
+              rect_y2 = clip_rect->y0;
+            }
+
+          clip_bounds->x_1 = MAX (clip_bounds->x_1, rect_x1 - tx);
+          clip_bounds->y_1 = MAX (clip_bounds->y_1, rect_y1 - ty);
+          clip_bounds->x_2 = MIN (clip_bounds->x_2, rect_x2 - tx);
+          clip_bounds->y_2 = MIN (clip_bounds->y_2, rect_y2 - ty);
+        }
+    }
+
+  /* If we make it here then we know we can software clip the entire batch */
+
+  for (entry_num = 0; entry_num < batch_len; entry_num++)
+    {
+      CoglJournalEntry *journal_entry = batch_start + entry_num;
+      float *verts = &g_array_index (ctx->logged_vertices, float,
+                                     journal_entry->array_offset + 1);
+      ClipBounds *clip_bounds = &g_array_index (ctx->journal_clip_bounds,
+                                                ClipBounds, entry_num);
+
+      size_t stride =
+        GET_JOURNAL_ARRAY_STRIDE_FOR_N_LAYERS (journal_entry->n_layers);
+      float rx1, ry1, rx2, ry2;
+      float vx1, vy1, vx2, vy2;
+      int layer_num;
+
+      /* Remove the clip on the entry */
+      _cogl_clip_stack_unref (journal_entry->clip_stack);
+      journal_entry->clip_stack = NULL;
+
+      vx1 = verts[0];
+      vy1 = verts[1];
+      vx2 = verts[stride];
+      vy2 = verts[stride + 1];
+
+      if (vx1 < vx2)
+        {
+          rx1 = vx1;
+          rx2 = vx2;
+        }
+      else
+        {
+          rx1 = vx2;
+          rx2 = vx1;
+        }
+      if (vy1 < vy2)
+        {
+          ry1 = vy1;
+          ry2 = vy2;
+        }
+      else
+        {
+          ry1 = vy2;
+          ry2 = vy1;
+        }
+
+      rx1 = CLAMP (rx1, clip_bounds->x_1, clip_bounds->x_2);
+      ry1 = CLAMP (ry1, clip_bounds->y_1, clip_bounds->y_2);
+      rx2 = CLAMP (rx2, clip_bounds->x_1, clip_bounds->x_2);
+      ry2 = CLAMP (ry2, clip_bounds->y_1, clip_bounds->y_2);
+
+      /* Check if the rectangle intersects the clip at all. Empty clip
+         bounds (x_1 > x_2) make CLAMP return inverted coordinates */
+      if (rx1 >= rx2 || ry1 >= ry2)
+        /* Zero the vertex data in the hope that this creates a
+           degenerate rectangle which the GL driver can clip quickly */
+        memset (verts, 0, sizeof (float) * stride * 2);
+      else
+        {
+          if (vx1 > vx2)
+            {
+              float t = rx1;
+              rx1 = rx2;
+              rx2 = t;
+            }
+          if (vy1 > vy2)
+            {
+              float t = ry1;
+              ry1 = ry2;
+              ry2 = t;
+            }
+
+          verts[0] = rx1;
+          verts[1] = ry1;
+          verts[stride] = rx2;
+          verts[stride + 1] = ry2;
+
+          /* Convert the rectangle coordinates to a fraction of the original
+             rectangle */
+          rx1 = (rx1 - vx1) / (vx2 - vx1);
+          ry1 = (ry1 - vy1) / (vy2 - vy1);
+          rx2 = (rx2 - vx1) / (vx2 - vx1);
+          ry2 = (ry2 - vy1) / (vy2 - vy1);
+
+          for (layer_num = 0; layer_num < journal_entry->n_layers; layer_num++)
+            {
+              float *t = verts + 2 + 2 * layer_num;
+              float tx1 = t[0], ty1 = t[1];
+              float tx2 = t[stride], ty2 = t[stride + 1];
+              t[0] = rx1 * (tx2 - tx1) + tx1;
+              t[1] = ry1 * (ty2 - ty1) + ty1;
+              t[stride] = rx2 * (tx2 - tx1) + tx1;
+              t[stride + 1] = ry2 * (ty2 - ty1) + ty1;
+            }
+        }
+    }
+}
+/* batch_and_call() callback that times check_software_clip_for_batch */
+static void
+_cogl_journal_check_software_clip (CoglJournalEntry *batch_start,
+                                   int               batch_len,
+                                   void             *data)
+{
+  CoglJournalFlushState *state = data;
+
+  COGL_STATIC_TIMER (time_check_software_clip,
+                     "Journal Flush", /* parent */
+                     "flush: check software clip",
+                     "Time spent checking for software clip",
+                     0 /* no application private data */);
+
+  _COGL_GET_CONTEXT (ctx, NO_RETVAL);
+
+  COGL_TIMER_START (_cogl_uprof_context,
+                    time_check_software_clip);
+
+  check_software_clip_for_batch (batch_start, batch_len, state);
+
+  COGL_TIMER_STOP (_cogl_uprof_context,
+                   time_check_software_clip);
+}
+
 static gboolean
 compare_entry_clip_stacks (CoglJournalEntry *entry0, CoglJournalEntry *entry1)
 {
@@ -812,19 +1148,33 @@ _cogl_journal_flush (void)
   if (G_UNLIKELY (cogl_debug_flags & COGL_DEBUG_BATCHING))
     g_print ("BATCHING: journal len = %d\n", ctx->journal->len);
 
-  state.vertex_array = upload_vertices (&g_array_index (ctx->journal,
-                                                        CoglJournalEntry, 0),
-                                        ctx->journal->len,
-                                        ctx->journal_needed_vbo_len,
-                                        ctx->logged_vertices);
   state.attributes = ctx->journal_flush_attributes_array;
-  state.array_offset = 0;
 
   framebuffer = _cogl_get_framebuffer ();
   modelview_stack = _cogl_framebuffer_get_modelview_stack (framebuffer);
   state.modelview_stack = modelview_stack;
   state.projection_stack = _cogl_framebuffer_get_projection_stack (framebuffer);
 
+  /* We do an initial walk of the journal to analyse the clip stack
+     batches to see if we can do software clipping. We do this as a
+     separate walk of the journal because we can modify entries and
+     this may end up joining together clip stack batches in the next
+     iteration. */
+  batch_and_call ((CoglJournalEntry *)ctx->journal->data, /* first entry */
+                  ctx->journal->len, /* max number of entries to consider */
+                  compare_entry_clip_stacks,
+                  _cogl_journal_check_software_clip, /* callback */
+                  &state); /* data */
+
+  /* We upload the vertices after the clip stack pass in case it
+     modifies the entries */
+  state.vertex_array = upload_vertices (&g_array_index (ctx->journal,
+                                                        CoglJournalEntry, 0),
+                                        ctx->journal->len,
+                                        ctx->journal_needed_vbo_len,
+                                        ctx->logged_vertices);
+  state.array_offset = 0;
+
   /* batch_and_call() batches a list of journal entries according to some
    * given criteria and calls a callback once for each determined batch.
    *
@@ -975,6 +1325,7 @@ _cogl_journal_log_quad (const float  *position,
   entry = &g_array_index (ctx->journal, CoglJournalEntry, next_entry);
 
   entry->n_layers = n_layers;
+  entry->array_offset = next_vert;
 
   source = pipeline;