diff --git a/cogl/cogl-framebuffer-private.h b/cogl/cogl-framebuffer-private.h
index 3d9cba364..6f69d0699 100644
--- a/cogl/cogl-framebuffer-private.h
+++ b/cogl/cogl-framebuffer-private.h
@@ -67,6 +67,22 @@ struct _CoglFramebuffer
   /* The scene of a given framebuffer may depend on images in other
    * framebuffers... */
   GList              *deps;
+
+  /* As part of an optimization for reading back single pixels from a
+   * framebuffer in some simple cases where the geometry is still
+   * available in the journal, we need to track the bounds of the last
+   * region cleared and its color, and we need to track when something
+   * does in fact draw to that region so it is no longer clear.
+   */
+  float               clear_color_red;
+  float               clear_color_green;
+  float               clear_color_blue;
+  float               clear_color_alpha;
+  int                 clear_clip_x0;
+  int                 clear_clip_y0;
+  int                 clear_clip_x1;
+  int                 clear_clip_y1;
+  gboolean            clear_clip_dirty;
 };
 
 #define COGL_FRAMEBUFFER(X) ((CoglFramebuffer *)(X))
@@ -117,6 +133,9 @@ _cogl_framebuffer_clear4f (CoglFramebuffer *framebuffer,
                            float blue,
                            float alpha);
 
+void
+_cogl_framebuffer_dirty (CoglFramebuffer *framebuffer);
+
 int
 _cogl_framebuffer_get_width (CoglFramebuffer *framebuffer);
 
@@ -196,6 +215,14 @@ _cogl_framebuffer_flush_dependency_journals (CoglFramebuffer *framebuffer);
 void
 _cogl_framebuffer_swap_notify (CoglFramebuffer *framebuffer);
 
+gboolean
+_cogl_framebuffer_try_fast_read_pixel (CoglFramebuffer *framebuffer,
+                                       int x,
+                                       int y,
+                                       CoglReadPixelsFlags source,
+                                       CoglPixelFormat format,
+                                       guint8 *pixel);
+
 typedef enum _CoglFramebufferFlushFlags
 {
   /* XXX: When using this, that imples you are going to manually load the
diff --git a/cogl/cogl-framebuffer.c b/cogl/cogl-framebuffer.c
index a0c5b944d..1519d58f3 100644
--- a/cogl/cogl-framebuffer.c
+++ b/cogl/cogl-framebuffer.c
@@ -160,6 +160,13 @@ _cogl_framebuffer_init (CoglFramebuffer *framebuffer,
 
   framebuffer->journal = _cogl_journal_new ();
 
+  /* Ensure we know the framebuffer->clear_color* members can't be
+   * referenced for our fast-path read-pixel optimization (see
+   * _cogl_journal_try_read_pixel()) until some region of the
+   * framebuffer is initialized.
+   */
+  framebuffer->clear_clip_dirty = TRUE;
+
   /* XXX: We have to maintain a central list of all framebuffers
    * because at times we need to be able to flush all known journals.
    *
@@ -248,6 +255,12 @@ _cogl_clear4f (unsigned long buffers,
   GE (glClear (gl_buffers));
 }
 
+void
+_cogl_framebuffer_dirty (CoglFramebuffer *framebuffer)
+{
+  framebuffer->clear_clip_dirty = TRUE; /* recorded clear color/bounds no longer trusted */
+}
+
 void
 _cogl_framebuffer_clear4f (CoglFramebuffer *framebuffer,
                            unsigned long buffers,
@@ -256,11 +269,96 @@ _cogl_framebuffer_clear4f (CoglFramebuffer *framebuffer,
                            float blue,
                            float alpha)
 {
+  CoglClipStack *clip_stack = _cogl_framebuffer_get_clip_stack (framebuffer);
+  int scissor_x0;
+  int scissor_y0;
+  int scissor_x1;
+  int scissor_y1;
+
+  _cogl_clip_stack_get_bounds (clip_stack,
+                               &scissor_x0, &scissor_y0,
+                               &scissor_x1, &scissor_y1);
+
+  /* NB: the previous clear could have had an arbitrary clip.
+   * NB: everything for the last frame might still be in the journal
+   *     but we can't assume anything about how each entry was
+   *     clipped.
+   * NB: Clutter will scissor its pick renders which would mean all
+   *     journal entries have a common ClipStack entry, but without
+   *     a layering violation Cogl has to explicitly walk the journal
+   *     entries to determine if this is the case.
+   * NB: We have a software only read-pixel optimization in the
+   *     journal that determines the color at a given framebuffer
+   *     coordinate for simple scenes without rendering with the GPU.
+   *     When Clutter is hitting this fast-path we can expect to
+   *     receive calls to clear the framebuffer with an un-flushed
+   *     journal.
+   * NB: To fully support software based picking for Clutter we
+   *     need to be able to reliably detect when the contents of a
+   *     journal can be discarded and when we can skip the call to
+   *     glClear because it matches the previous clear request.
+   */
+
+  /* Note: we don't check for the stencil buffer being cleared here
+   * since there isn't any public cogl api to manipulate the stencil
+   * buffer.
+   *
+   * Note: we check for an exact clip match here because
+   * 1) a smaller clip could mean existing journal entries may
+   *    need to contribute to regions outside the new clear-clip
+   * 2) a larger clip would mean we need to issue a real
+   *    glClear and we only care about cases avoiding a
+   *    glClear.
+   *
+   * Note: Comparing without an epsilon is considered
+   * appropriate here.
+   */
+  if (buffers & COGL_BUFFER_BIT_COLOR &&
+      buffers & COGL_BUFFER_BIT_DEPTH &&
+      !framebuffer->clear_clip_dirty &&
+      framebuffer->clear_color_red == red &&
+      framebuffer->clear_color_green == green &&
+      framebuffer->clear_color_blue == blue &&
+      framebuffer->clear_color_alpha == alpha &&
+      scissor_x0 == framebuffer->clear_clip_x0 &&
+      scissor_y0 == framebuffer->clear_clip_y0 &&
+      scissor_x1 == framebuffer->clear_clip_x1 &&
+      scissor_y1 == framebuffer->clear_clip_y1)
+    {
+      /* NB: We only have to consider the clip state of journal
+       * entries if the current clear is clipped since otherwise we
+       * know every pixel of the framebuffer is affected by the clear
+       * and so all journal entries become redundant and can simply be
+       * discarded.
+       */
+      if (clip_stack)
+        {
+          /*
+           * Note: the function for checking the journal entries is
+           * quite strict. It avoids detailed checking of all entry
+           * clip_stacks by only checking the details of the first
+           * entry and then it only verifies that the remaining
+           * entries share the same clip_stack ancestry. This means
+           * it's possible for some false negatives here but that will
+           * just result in us falling back to a real clear.
+           */
+          if (_cogl_journal_all_entries_within_bounds (framebuffer->journal,
+                                                       scissor_x0, scissor_y0,
+                                                       scissor_x1, scissor_y1))
+            {
+              _cogl_journal_discard (framebuffer->journal);
+              goto cleared;
+            }
+        }
+      else
+        {
+          _cogl_journal_discard (framebuffer->journal);
+          goto cleared;
+        }
+    }
+
   COGL_NOTE (DRAW, "Clear begin");
 
-  /* XXX: in the case where it's the color buffer being cleared and
-   * the current clip-stack is empty we could instead discard the
-   * journal here instead of flushing it. */
   _cogl_framebuffer_flush_journal (framebuffer);
 
   /* NB: _cogl_framebuffer_flush_state may disrupt various state (such
@@ -282,6 +380,38 @@ _cogl_framebuffer_clear4f (CoglFramebuffer *framebuffer,
     }
 
   COGL_NOTE (DRAW, "Clear end");
+
+cleared:
+
+  if (buffers & COGL_BUFFER_BIT_COLOR && buffers & COGL_BUFFER_BIT_DEPTH)
+    {
+      /* For our fast-path for reading back a single pixel of simple
+       * scenes where the whole frame is in the journal we need to
+       * track the cleared color of the framebuffer in case the point
+       * read doesn't intersect any of the journal rectangles. */
+      framebuffer->clear_clip_dirty = FALSE;
+      framebuffer->clear_color_red = red;
+      framebuffer->clear_color_green = green;
+      framebuffer->clear_color_blue = blue;
+      framebuffer->clear_color_alpha = alpha;
+
+      /* NB: A clear may be scissored so we need to track the extents
+       * that the clear is applicable to... */
+      if (clip_stack)
+        {
+          _cogl_clip_stack_get_bounds (clip_stack,
+                                       &framebuffer->clear_clip_x0,
+                                       &framebuffer->clear_clip_y0,
+                                       &framebuffer->clear_clip_x1,
+                                       &framebuffer->clear_clip_y1);
+        }
+      else
+        {
+          /* FIXME: set degenerate clip (clear_clip_* keep their old values here) */
+        }
+    }
+  else
+    _cogl_framebuffer_dirty (framebuffer);
 }
 
 /* XXX: We'll need to consider if this API is a good approach for the
@@ -1097,3 +1227,63 @@ _cogl_framebuffer_get_alpha_bits (CoglFramebuffer *framebuffer)
   return framebuffer->alpha_bits;
 }
 
+/* Tries to answer a single-pixel read of the color buffer without
+ * the GPU: first from the geometry still logged in the journal and,
+ * when the point hits no journal entry, from the last recorded clear
+ * color. Only RGBA_8888 and RGBA_8888_PRE reads are handled. Returns
+ * FALSE whenever the fast path cannot provide the pixel. */
+gboolean
+_cogl_framebuffer_try_fast_read_pixel (CoglFramebuffer *framebuffer,
+                                       int x,
+                                       int y,
+                                       CoglReadPixelsFlags source,
+                                       CoglPixelFormat format,
+                                       guint8 *pixel)
+{
+  gboolean found_intersection;
+
+  if (source != COGL_READ_PIXELS_COLOR_BUFFER)
+    return FALSE;
+
+  if (format != COGL_PIXEL_FORMAT_RGBA_8888_PRE &&
+      format != COGL_PIXEL_FORMAT_RGBA_8888)
+    return FALSE;
+
+  if (!_cogl_journal_try_read_pixel (framebuffer->journal,
+                                     x, y, format, pixel,
+                                     &found_intersection))
+    return FALSE;
+
+  /* The journal lookup succeeded. If the point intersected a journal
+   * entry then the answer came from the journal (presumably written
+   * to pixel - confirm) and the clear color must not be used. */
+  if (found_intersection)
+    return TRUE;
+
+  /* If the framebuffer has been rendered to since it was last
+   * cleared then we can't return the last known clear color. */
+  if (framebuffer->clear_clip_dirty)
+    return FALSE;
+
+  if (x >= framebuffer->clear_clip_x0 &&
+      x < framebuffer->clear_clip_x1 &&
+      y >= framebuffer->clear_clip_y0 &&
+      y < framebuffer->clear_clip_y1)
+    {
+
+      /* we currently only care about cases where the premultiplied or
+       * unpremultiplied colors are equivalent... */
+      if (framebuffer->clear_color_alpha != 1.0)
+        return FALSE;
+
+      pixel[0] = framebuffer->clear_color_red * 255.0; /* NOTE(review): truncates, unclamped */
+      pixel[1] = framebuffer->clear_color_green * 255.0;
+      pixel[2] = framebuffer->clear_color_blue * 255.0;
+      pixel[3] = framebuffer->clear_color_alpha * 255.0;
+
+      return TRUE;
+    }
+
+  return FALSE;
+}
+
diff --git a/cogl/cogl-journal-private.h b/cogl/cogl-journal-private.h
index 754a2ecd7..44a4af855 100644
--- a/cogl/cogl-journal-private.h
+++ b/cogl/cogl-journal-private.h
@@ -35,6 +35,8 @@ typedef struct _CoglJournal
   GArray *vertices;
   size_t needed_vbo_len;
 
+  int fast_read_pixel_count;
+
 } CoglJournal;
 
 /* To improve batching of geometry when submitting vertices to OpenGL we
@@ -69,4 +71,22 @@ void
 _cogl_journal_flush (CoglJournal *journal,
                      CoglFramebuffer *framebuffer);
 
+void
+_cogl_journal_discard (CoglJournal *journal);
+
+gboolean
+_cogl_journal_all_entries_within_bounds (CoglJournal *journal,
+                                         float clip_x0,
+                                         float clip_y0,
+                                         float clip_x1,
+                                         float clip_y1);
+
+gboolean
+_cogl_journal_try_read_pixel (CoglJournal *journal,
+                              int x,
+                              int y,
+                              CoglPixelFormat format,
+                              guint8 *pixel,
+                              gboolean *found_intersection);
+
 #endif /* __COGL_JOURNAL_PRIVATE_H */
diff --git a/cogl/cogl-journal.c b/cogl/cogl-journal.c
index dc81e588a..f0e8f3896 100644
--- a/cogl/cogl-journal.c
+++ b/cogl/cogl-journal.c
@@ -37,6 +37,7 @@
 #include "cogl-framebuffer-private.h"
 #include "cogl-profile.h"
 #include "cogl-vertex-attribute-private.h"
+#include "cogl-point-in-poly-private.h"
 
 #include <string.h>
 #include <gmodule.h>
@@ -820,10 +821,182 @@ typedef struct
   float x_2, y_2;
 } ClipBounds;
 
+/* Decides whether journal_entry can be clipped on the CPU against
+ * clip_stack; on success clip_bounds_out holds the intersection of
+ * the clip rectangles translated into the entry's modelview space. */
+static gboolean
+can_software_clip_entry (CoglJournalEntry *journal_entry,
+                         CoglJournalEntry *prev_journal_entry,
+                         CoglClipStack *clip_stack,
+                         ClipBounds *clip_bounds_out)
+{
+  CoglPipeline *pipeline = journal_entry->pipeline;
+  CoglClipStack *clip_entry;
+  int layer_num;
+
+  clip_bounds_out->x_1 = -G_MAXFLOAT;
+  clip_bounds_out->y_1 = -G_MAXFLOAT;
+  clip_bounds_out->x_2 = G_MAXFLOAT;
+  clip_bounds_out->y_2 = G_MAXFLOAT;
+
+  /* Check the pipeline is usable. We can short-cut here for
+     entries using the same pipeline as the previous entry */
+  if (prev_journal_entry == NULL || pipeline != prev_journal_entry->pipeline)
+    {
+      /* A user program means we can't reliably modify the tex coords */
+      if (cogl_pipeline_get_user_program (pipeline))
+        return FALSE;
+
+      /* Likewise if any pipeline layer has a user texture matrix */
+      for (layer_num = cogl_pipeline_get_n_layers (pipeline) - 1;
+           layer_num >= 0;
+           layer_num--)
+        if (_cogl_pipeline_layer_has_user_matrix (pipeline, layer_num))
+          return FALSE;
+    }
+
+  /* Every clip entry's matrix must be just a translation of the
+     journal entry's modelview matrix; that translation also gives
+     us the bounds of the clip in modelview space */
+  for (clip_entry = clip_stack; clip_entry; clip_entry = clip_entry->parent)
+    {
+      float rect_x1, rect_y1, rect_x2, rect_y2;
+      CoglClipStackRect *clip_rect;
+      float tx, ty;
+
+      clip_rect = (CoglClipStackRect *) clip_entry; /* NOTE(review): assumes a rect entry */
+
+      if (!calculate_translation (&clip_rect->matrix,
+                                  &journal_entry->model_view,
+                                  &tx, &ty))
+        return FALSE;
+
+      if (clip_rect->x0 < clip_rect->x1)
+        {
+          rect_x1 = clip_rect->x0;
+          rect_x2 = clip_rect->x1;
+        }
+      else
+        {
+          rect_x1 = clip_rect->x1;
+          rect_x2 = clip_rect->x0;
+        }
+      if (clip_rect->y0 < clip_rect->y1)
+        {
+          rect_y1 = clip_rect->y0;
+          rect_y2 = clip_rect->y1;
+        }
+      else
+        {
+          rect_y1 = clip_rect->y1;
+          rect_y2 = clip_rect->y0;
+        }
+
+      clip_bounds_out->x_1 = MAX (clip_bounds_out->x_1, rect_x1 - tx);
+      clip_bounds_out->y_1 = MAX (clip_bounds_out->y_1, rect_y1 - ty);
+      clip_bounds_out->x_2 = MIN (clip_bounds_out->x_2, rect_x2 - tx);
+      clip_bounds_out->y_2 = MIN (clip_bounds_out->y_2, rect_y2 - ty);
+    }
+
+  return TRUE;
+}
+
+/* Clamps the entry's rectangle in verts to clip_bounds, rescaling the
+ * texture coordinates to match, and drops the entry's clip stack. */
 static void
-check_software_clip_for_batch (CoglJournalEntry      *batch_start,
-                               int                    batch_len,
-                               CoglJournalFlushState *state)
+software_clip_entry (CoglJournalEntry *journal_entry,
+                     float *verts,
+                     ClipBounds *clip_bounds)
+{
+  size_t stride =
+    GET_JOURNAL_ARRAY_STRIDE_FOR_N_LAYERS (journal_entry->n_layers);
+  float rx1, ry1, rx2, ry2;
+  float vx1, vy1, vx2, vy2;
+  int layer_num;
+
+  /* Remove the clip on the entry */
+  _cogl_clip_stack_unref (journal_entry->clip_stack);
+  journal_entry->clip_stack = NULL;
+
+  vx1 = verts[0];
+  vy1 = verts[1];
+  vx2 = verts[stride];
+  vy2 = verts[stride + 1];
+
+  if (vx1 < vx2)
+    {
+      rx1 = vx1;
+      rx2 = vx2;
+    }
+  else
+    {
+      rx1 = vx2;
+      rx2 = vx1;
+    }
+  if (vy1 < vy2)
+    {
+      ry1 = vy1;
+      ry2 = vy2;
+    }
+  else
+    {
+      ry1 = vy2;
+      ry2 = vy1;
+    }
+
+  rx1 = CLAMP (rx1, clip_bounds->x_1, clip_bounds->x_2);
+  ry1 = CLAMP (ry1, clip_bounds->y_1, clip_bounds->y_2);
+  rx2 = CLAMP (rx2, clip_bounds->x_1, clip_bounds->x_2);
+  ry2 = CLAMP (ry2, clip_bounds->y_1, clip_bounds->y_2);
+
+  /* Check if the rectangle intersects the clip at all */
+  if (rx1 == rx2 || ry1 == ry2)
+    /* Zero all of the vertex data in the hope that the GL driver can
+       quickly reject the resulting degenerate rectangle */
+    memset (verts, 0, sizeof (float) * stride * 2);
+  else
+    {
+      if (vx1 > vx2)
+        {
+          float t = rx1;
+          rx1 = rx2;
+          rx2 = t;
+        }
+      if (vy1 > vy2)
+        {
+          float t = ry1;
+          ry1 = ry2;
+          ry2 = t;
+        }
+
+      verts[0] = rx1;
+      verts[1] = ry1;
+      verts[stride] = rx2;
+      verts[stride + 1] = ry2;
+
+      /* Express the clamped corners as fractions of the original rect */
+      rx1 = (rx1 - vx1) / (vx2 - vx1);
+      ry1 = (ry1 - vy1) / (vy2 - vy1);
+      rx2 = (rx2 - vx1) / (vx2 - vx1);
+      ry2 = (ry2 - vy1) / (vy2 - vy1);
+
+      for (layer_num = 0; layer_num < journal_entry->n_layers; layer_num++)
+        {
+          float *t = verts + 2 + 2 * layer_num;
+          float tx1 = t[0], ty1 = t[1];
+          float tx2 = t[stride], ty2 = t[stride + 1];
+          t[0] = rx1 * (tx2 - tx1) + tx1;
+          t[1] = ry1 * (ty2 - ty1) + ty1;
+          t[stride] = rx2 * (tx2 - tx1) + tx1;
+          t[stride + 1] = ry2 * (ty2 - ty1) + ty1;
+        }
+    }
+}
+
+static void
+maybe_software_clip_entries (CoglJournalEntry      *batch_start,
+                             int                    batch_len,
+                             CoglJournalFlushState *state)
 {
   CoglJournal *journal = state->journal;
   CoglClipStack *clip_stack, *clip_entry;
@@ -864,77 +1037,15 @@ check_software_clip_for_batch (CoglJournalEntry      *batch_start,
   for (entry_num = 0; entry_num < batch_len; entry_num++)
     {
       CoglJournalEntry *journal_entry = batch_start + entry_num;
-      CoglPipeline *pipeline = journal_entry->pipeline;
+      CoglJournalEntry *prev_journal_entry =
+        entry_num ? batch_start + (entry_num - 1) : NULL;
       ClipBounds *clip_bounds = &g_array_index (ctx->journal_clip_bounds,
                                                 ClipBounds, entry_num);
-      int layer_num;
 
-      clip_bounds->x_1 = -G_MAXFLOAT;
-      clip_bounds->y_1 = -G_MAXFLOAT;
-      clip_bounds->x_2 = G_MAXFLOAT;
-      clip_bounds->y_2 = G_MAXFLOAT;
-
-      /* Check the pipeline is usable. We can short-cut here for
-         entries using the same pipeline as the previous entry */
-      if (entry_num == 0 || pipeline != batch_start[entry_num - 1].pipeline)
-        {
-          /* If the pipeline has a user program then we can't reliably modify
-             the texture coordinates */
-          if (cogl_pipeline_get_user_program (pipeline))
-            return;
-
-          /* If any of the pipeline layers have a texture matrix then we can't
-             reliably modify the texture coordinates */
-          for (layer_num = cogl_pipeline_get_n_layers (pipeline) - 1;
-               layer_num >= 0;
-               layer_num--)
-            if (_cogl_pipeline_layer_has_user_matrix (pipeline, layer_num))
-              return;
-        }
-
-      /* Now we need to verify that each clip entry's matrix is just a
-         translation of the journal entry's modelview matrix. We can
-         also work out the bounds of the clip in modelview space using
-         this translation */
-      for (clip_entry = clip_stack; clip_entry; clip_entry = clip_entry->parent)
-        {
-          float rect_x1, rect_y1, rect_x2, rect_y2;
-          CoglClipStackRect *clip_rect;
-          float tx, ty;
-
-          clip_rect = (CoglClipStackRect *) clip_entry;
-
-          if (!calculate_translation (&clip_rect->matrix,
-                                      &journal_entry->model_view,
-                                      &tx, &ty))
-            return;
-
-          if (clip_rect->x0 < clip_rect->x1)
-            {
-              rect_x1 = clip_rect->x0;
-              rect_x2 = clip_rect->x1;
-            }
-          else
-            {
-              rect_x1 = clip_rect->x1;
-              rect_x2 = clip_rect->x0;
-            }
-          if (clip_rect->y0 < clip_rect->y1)
-            {
-              rect_y1 = clip_rect->y0;
-              rect_y2 = clip_rect->y1;
-            }
-          else
-            {
-              rect_y1 = clip_rect->y1;
-              rect_y2 = clip_rect->y0;
-            }
-
-          clip_bounds->x_1 = MAX (clip_bounds->x_1, rect_x1 - tx);
-          clip_bounds->y_1 = MAX (clip_bounds->y_1, rect_y1 - ty);
-          clip_bounds->x_2 = MIN (clip_bounds->x_2, rect_x2 - tx);
-          clip_bounds->y_2 = MIN (clip_bounds->y_2, rect_y2 - ty);
-        }
+      if (!can_software_clip_entry (journal_entry, prev_journal_entry,
+                                    clip_stack,
+                                    clip_bounds))
+        return;
     }
 
   /* If we make it here then we know we can software clip the entire batch */
@@ -947,107 +1058,23 @@ check_software_clip_for_batch (CoglJournalEntry      *batch_start,
       ClipBounds *clip_bounds = &g_array_index (ctx->journal_clip_bounds,
                                                 ClipBounds, entry_num);
 
-      size_t stride =
-        GET_JOURNAL_ARRAY_STRIDE_FOR_N_LAYERS (journal_entry->n_layers);
-      float rx1, ry1, rx2, ry2;
-      float vx1, vy1, vx2, vy2;
-      int layer_num;
-
-      /* Remove the clip on the entry */
-      _cogl_clip_stack_unref (journal_entry->clip_stack);
-      journal_entry->clip_stack = NULL;
-
-      vx1 = verts[0];
-      vy1 = verts[1];
-      vx2 = verts[stride];
-      vy2 = verts[stride + 1];
-
-      if (vx1 < vx2)
-        {
-          rx1 = vx1;
-          rx2 = vx2;
-        }
-      else
-        {
-          rx1 = vx2;
-          rx2 = vx1;
-        }
-      if (vy1 < vy2)
-        {
-          ry1 = vy1;
-          ry2 = vy2;
-        }
-      else
-        {
-          ry1 = vy2;
-          ry2 = vy1;
-        }
-
-      rx1 = CLAMP (rx1, clip_bounds->x_1, clip_bounds->x_2);
-      ry1 = CLAMP (ry1, clip_bounds->y_1, clip_bounds->y_2);
-      rx2 = CLAMP (rx2, clip_bounds->x_1, clip_bounds->x_2);
-      ry2 = CLAMP (ry2, clip_bounds->y_1, clip_bounds->y_2);
-
-      /* Check if the rectangle intersects the clip at all */
-      if (rx1 == rx2 || ry1 == ry2)
-        /* Will set all of the vertex data to 0 in the hope that this
-           will create a degenerate rectangle and the GL driver will
-           be able to clip it quickly */
-        memset (verts, 0, sizeof (float) * stride * 2);
-      else
-        {
-          if (vx1 > vx2)
-            {
-              float t = rx1;
-              rx1 = rx2;
-              rx2 = t;
-            }
-          if (vy1 > vy2)
-            {
-              float t = ry1;
-              ry1 = ry2;
-              ry2 = t;
-            }
-
-          verts[0] = rx1;
-          verts[1] = ry1;
-          verts[stride] = rx2;
-          verts[stride + 1] = ry2;
-
-          /* Convert the rectangle coordinates to a fraction of the original
-             rectangle */
-          rx1 = (rx1 - vx1) / (vx2 - vx1);
-          ry1 = (ry1 - vy1) / (vy2 - vy1);
-          rx2 = (rx2 - vx1) / (vx2 - vx1);
-          ry2 = (ry2 - vy1) / (vy2 - vy1);
-
-          for (layer_num = 0; layer_num < journal_entry->n_layers; layer_num++)
-            {
-              float *t = verts + 2 + 2 * layer_num;
-              float tx1 = t[0], ty1 = t[1];
-              float tx2 = t[stride], ty2 = t[stride + 1];
-              t[0] = rx1 * (tx2 - tx1) + tx1;
-              t[1] = ry1 * (ty2 - ty1) + ty1;
-              t[stride] = rx2 * (tx2 - tx1) + tx1;
-              t[stride + 1] = ry2 * (ty2 - ty1) + ty1;
-            }
-        }
+      software_clip_entry (journal_entry, verts, clip_bounds);
     }
 
   return;
 }
 
 static void
-_cogl_journal_check_software_clip (CoglJournalEntry *batch_start,
-                                   int               batch_len,
-                                   void             *data)
+_cogl_journal_maybe_software_clip_entries (CoglJournalEntry *batch_start,
+                                           int               batch_len,
+                                           void             *data)
 {
   CoglJournalFlushState *state = data;
 
   COGL_STATIC_TIMER (time_check_software_clip,
                      "Journal Flush", /* parent */
-                     "flush: check software clip",
-                     "Time spent checking for software clip",
+                     "flush: software clipping",
+                     "Time spent software clipping",
                      0 /* no application private data */);
 
   _COGL_GET_CONTEXT (ctx, NO_RETVAL);
@@ -1055,7 +1082,7 @@ _cogl_journal_check_software_clip (CoglJournalEntry *batch_start,
   COGL_TIMER_START (_cogl_uprof_context,
                     time_check_software_clip);
 
-  check_software_clip_for_batch (batch_start, batch_len, state);
+  maybe_software_clip_entries (batch_start, batch_len, state);
 
   COGL_TIMER_STOP (_cogl_uprof_context,
                    time_check_software_clip);
@@ -1160,6 +1187,93 @@ upload_vertices (const CoglJournalEntry *entries,
   return array;
 }
 
+void
+_cogl_journal_discard (CoglJournal *journal)
+{
+  int i;
+  /* Release the pipeline and clip-stack references held by each entry */
+  for (i = 0; i < journal->entries->len; i++)
+    {
+      CoglJournalEntry *entry =
+        &g_array_index (journal->entries, CoglJournalEntry, i);
+      _cogl_pipeline_journal_unref (entry->pipeline);
+      _cogl_clip_stack_unref (entry->clip_stack);
+    }
+  /* ...then empty the journal and reset its bookkeeping counters */
+  g_array_set_size (journal->entries, 0);
+  g_array_set_size (journal->vertices, 0);
+  journal->needed_vbo_len = 0;
+  journal->fast_read_pixel_count = 0;
+}
+
+/* Reports whether all journal entries are known to be clipped within
+ * the given bounds (trivially TRUE for an empty journal). Only the
+ * first entry's clip stack is measured; the rest must share an
+ * ancestor with it. Note: FALSE means 'unknown' rather than 'no'. */
+gboolean
+_cogl_journal_all_entries_within_bounds (CoglJournal *journal,
+                                         float clip_x0,
+                                         float clip_y0,
+                                         float clip_x1,
+                                         float clip_y1)
+{
+  CoglJournalEntry *entry = (CoglJournalEntry *)journal->entries->data;
+  CoglClipStack *clip_entry;
+  CoglClipStack *reference = NULL;
+  int bounds_x0;
+  int bounds_y0;
+  int bounds_x1;
+  int bounds_y1;
+  int i;
+
+  if (journal->entries->len == 0)
+    return TRUE;
+
+  /* Walk up the first entry's clip stack to find the shortest
+   * clip_stack ancestry that leaves us in the required bounds */
+  for (clip_entry = entry->clip_stack;
+       clip_entry;
+       clip_entry = clip_entry->parent)
+    {
+      _cogl_clip_stack_get_bounds (clip_entry,
+                                   &bounds_x0, &bounds_y0,
+                                   &bounds_x1, &bounds_y1);
+
+      if (bounds_x0 >= clip_x0 && bounds_y0 >= clip_y0 &&
+          bounds_x1 <= clip_x1 && bounds_y1 <= clip_y1)
+        reference = clip_entry;
+      else
+        break;
+    }
+
+  if (!reference)
+    return FALSE;
+
+  /* The remaining entries only need 'reference' somewhere in their
+   * clip stack ancestry to be known to lie within the bounds. */
+  for (i = 1; i < journal->entries->len; i++)
+    {
+      gboolean found_reference = FALSE;
+      entry = &g_array_index (journal->entries, CoglJournalEntry, i);
+
+      for (clip_entry = entry->clip_stack;
+           clip_entry;
+           clip_entry = clip_entry->parent)
+        {
+          if (clip_entry == reference)
+            {
+              found_reference = TRUE;
+              break;
+            }
+        }
+
+      if (!found_reference)
+        return FALSE;
+    }
+
+  return TRUE;
+}
+
 /* XXX NB: When _cogl_journal_flush() returns all state relating
  * to pipelines, all glEnable flags and current matrix state
  * is undefined.
@@ -1219,7 +1333,7 @@ _cogl_journal_flush (CoglJournal *journal,
       batch_and_call ((CoglJournalEntry *)journal->entries->data, /* first entry */
                       journal->entries->len, /* max number of entries to consider */
                       compare_entry_clip_stacks,
-                      _cogl_journal_check_software_clip, /* callback */
+                      _cogl_journal_maybe_software_clip_entries, /* callback */
                       &state); /* data */
     }
 
@@ -1268,17 +1382,7 @@ _cogl_journal_flush (CoglJournal *journal,
 
   cogl_object_unref (state.vertex_array);
 
-  for (i = 0; i < journal->entries->len; i++)
-    {
-      CoglJournalEntry *entry =
-        &g_array_index (journal->entries, CoglJournalEntry, i);
-      _cogl_pipeline_journal_unref (entry->pipeline);
-      _cogl_clip_stack_unref (entry->clip_stack);
-    }
-
-  g_array_set_size (journal->entries, 0);
-  g_array_set_size (journal->vertices, 0);
-  journal->needed_vbo_len = 0;
+  _cogl_journal_discard (journal);
 
   cogl_pop_framebuffer ();
 
@@ -1438,3 +1542,261 @@ _cogl_journal_log_quad (CoglJournal  *journal,
   COGL_TIMER_STOP (_cogl_uprof_context, log_timer);
 }
 
+/* Builds the four point polygon covered by the rectangle logged for
+ * 'entry', positioned in window coordinates.
+ *
+ * 'vertices' points at the first of the entry's two logged corners;
+ * the second one is GET_JOURNAL_ARRAY_STRIDE_FOR_N_LAYERS() floats
+ * further on. 'poly' receives four points of four floats each (16
+ * floats in total), of which x and y end up in window coordinates.
+ */
+static void
+entry_to_screen_polygon (const CoglJournalEntry *entry,
+                         float *vertices,
+                         float *poly)
+{
+  size_t second_corner =
+    GET_JOURNAL_ARRAY_STRIDE_FOR_N_LAYERS (entry->n_layers);
+  size_t point_stride = sizeof (float) * 4;
+  float x0 = vertices[0];
+  float y0 = vertices[1];
+  float x1 = vertices[second_corner];
+  float y1 = vertices[second_corner + 1];
+  CoglMatrixStack *proj_stack;
+  CoglMatrix proj;
+  int vp[4];
+  int n;
+
+  /* Walk the corners in the order (x0,y0), (x0,y1), (x1,y1), (x1,y0) */
+  poly[0] = x0;
+  poly[1] = y0;
+
+  poly[4] = x0;
+  poly[5] = y1;
+
+  poly[8] = x1;
+  poly[9] = y1;
+
+  poly[12] = x1;
+  poly[13] = y0;
+
+  /* Every corner starts out with z = 0 and w = 1 */
+  for (n = 0; n < 4; n++)
+    {
+      poly[4 * n + 2] = 0;
+      poly[4 * n + 3] = 1;
+    }
+
+  /* TODO: perhaps split the following out into a more generalized
+   * _cogl_transform_points utility...
+   */
+
+  /* Apply the entry's model-view matrix, in place */
+  cogl_matrix_transform_points (&entry->model_view,
+                                2, /* n_components */
+                                point_stride, /* stride_in */
+                                poly, /* points_in */
+                                point_stride, /* stride_out */
+                                poly, /* points_out */
+                                4 /* n_points */);
+
+  /* Then the current framebuffer's projection matrix, also in place */
+  proj_stack =
+    _cogl_framebuffer_get_projection_stack (_cogl_get_framebuffer ());
+  _cogl_matrix_stack_get (proj_stack, &proj);
+
+  cogl_matrix_project_points (&proj,
+                              3, /* n_components */
+                              point_stride, /* stride_in */
+                              poly, /* points_in */
+                              point_stride, /* stride_out */
+                              poly, /* points_out */
+                              4 /* n_points */);
+
+  _cogl_framebuffer_get_viewport4fv (_cogl_get_framebuffer (), vp);
+
+  for (n = 0; n < 4; n++)
+    {
+      float *point = poly + 4 * n;
+      float w = point[3];
+
+      /* Perspective division, giving normalized device coordinates */
+      point[0] /= w;
+      point[1] /= w;
+
+      /* Viewport transform, giving Cogl window/framebuffer
+       * coordinates (ranging from 0 to buffer-size) with (0,0) being
+       * top left. Note: for Y that means flipping the coordinate
+       * around the X axis while still in normalized device
+       * coordinates. */
+      point[0] = ((point[0] + 1.0) * (vp[2] / 2.0)) + vp[0];
+      point[1] = (((-point[1]) + 1.0) * (vp[3] / 2.0)) + vp[1];
+    }
+}
+
+/* Tests whether the point (x, y) survives the clip stack of 'entry'.
+ *
+ * Returns FALSE when that can't be determined here; otherwise returns
+ * TRUE with *hit saying whether the point is still covered. 'entry' and
+ * 'vertices' are passed to software_clip_entry() if that path is taken. */
+static gboolean
+try_checking_point_hits_entry_after_clipping (CoglJournalEntry *entry,
+                                              float *vertices,
+                                              float x,
+                                              float y,
+                                              gboolean *hit)
+{
+  gboolean can_software_clip = TRUE;
+  gboolean needs_software_clip = FALSE;
+  CoglClipStack *clip_entry;
+
+  *hit = TRUE;
+
+  /* Reject the point as soon as it lies outside the bounds of any clip
+   * and verify that all of the clip stack entries are rectangle clips */
+  for (clip_entry = entry->clip_stack;
+       clip_entry;
+       clip_entry = clip_entry->parent)
+    {
+      if (x < clip_entry->bounds_x0 || x >= clip_entry->bounds_x1 ||
+          y < clip_entry->bounds_y0 || y >= clip_entry->bounds_y1)
+        {
+          *hit = FALSE;
+          return TRUE;
+        }
+
+      if (clip_entry->type == COGL_CLIP_STACK_WINDOW_RECT)
+        {
+          /* XXX: technically we could still software clip here since
+           * we now know this clip can be ignored, but
+           * [can_]software_clip_entry() doesn't know that and bails out. */
+          can_software_clip = FALSE;
+        }
+      else if (clip_entry->type == COGL_CLIP_STACK_RECT)
+        {
+          CoglClipStackRect *rect_entry = (CoglClipStackRect *)clip_entry;
+
+          if (rect_entry->can_be_scissor == FALSE)
+            needs_software_clip = TRUE;
+          /* Otherwise the clip is screen aligned and the bounds test
+           * above has already determined we are inside it. */
+        }
+      else
+        return FALSE;
+    }
+
+  if (needs_software_clip)
+    {
+      ClipBounds clip_bounds;
+      float poly[16];
+
+      if (!can_software_clip_entry (entry, NULL,
+                                    entry->clip_stack, &clip_bounds))
+        return FALSE;
+
+      software_clip_entry (entry, vertices, &clip_bounds);
+      entry_to_screen_polygon (entry, vertices, poly);
+
+      *hit = _cogl_util_point_in_poly (x, y, poly, sizeof (float) * 4, 4);
+    }
+
+  return TRUE;
+}
+
+/* Tries to work out the colour at point (x, y) from the entries still
+ * logged in 'journal'. Returns TRUE with *found_intersection saying
+ * whether an entry covers the point (only then is 'pixel' written).
+ * Returns FALSE, leaving 'pixel' untouched, when the result can't be
+ * determined here; *found_intersection is then not meaningful. */
+gboolean
+_cogl_journal_try_read_pixel (CoglJournal *journal,
+                              int x,
+                              int y,
+                              CoglPixelFormat format,
+                              guint8 *pixel,
+                              gboolean *found_intersection)
+{
+  int i;
+
+  _COGL_GET_CONTEXT (ctx, FALSE);
+
+  /* XXX: this number has been plucked out of thin air. The idea is
+   * that after this many reads from the same unchanged journal it is
+   * more efficient to fail here so that the journal gets flushed and
+   * rendered: the flush adds some lag, but further single pixel reads
+   * can then read directly from the framebuffer. */
+  if (journal->fast_read_pixel_count > 50)
+    return FALSE;
+
+  if (format != COGL_PIXEL_FORMAT_RGBA_8888_PRE &&
+      format != COGL_PIXEL_FORMAT_RGBA_8888)
+    return FALSE;
+
+  *found_intersection = FALSE;
+
+  /* NB: The most recently added journal entry is the last entry. For
+   * a simple scene of opaque coloured rectangles with no special
+   * pipelines involved (e.g. enabling depth testing) we can assume
+   * painter's algorithm, so we just walk backwards through the
+   * entries intersecting each one with the point of interest. */
+  for (i = journal->entries->len - 1; i >= 0; i--)
+    {
+      CoglJournalEntry *entry =
+        &g_array_index (journal->entries, CoglJournalEntry, i);
+      guint8 *color = (guint8 *)&g_array_index (journal->vertices, float,
+                                                entry->array_offset);
+      float *vertices = (float *)color + 1; /* positions follow the colour */
+      float poly[16];
+
+      entry_to_screen_polygon (entry, vertices, poly);
+
+      if (!_cogl_util_point_in_poly (x, y, poly, sizeof (float) * 4, 4))
+        continue;
+
+      /* FIXME: the journal should have a back pointer to the
+       * associated framebuffer, because it should be possible to read
+       * a pixel from arbitrary framebuffers without needing to
+       * internally call _cogl_push/pop_framebuffer.
+       */
+      if (entry->clip_stack)
+        {
+          gboolean hit;
+
+          if (!try_checking_point_hits_entry_after_clipping (entry, vertices,
+                                                             x, y, &hit))
+            return FALSE; /* hit couldn't be determined */
+
+          if (!hit)
+            continue;
+        }
+
+      *found_intersection = TRUE;
+
+      /* If we find that the rectangle the point of interest
+       * intersects has any state more complex than a constant opaque
+       * color then we bail out. */
+      if (!_cogl_pipeline_equal (ctx->opaque_color_pipeline, entry->pipeline,
+                                 (COGL_PIPELINE_STATE_ALL &
+                                  ~COGL_PIPELINE_STATE_COLOR),
+                                 COGL_PIPELINE_LAYER_STATE_ALL,
+                                 0))
+        return FALSE;
+
+
+      /* we currently only care about cases where the premultiplied or
+       * unpremultiplied colors are equivalent... */
+      if (color[3] != 0xff)
+        return FALSE;
+
+      pixel[0] = color[0];
+      pixel[1] = color[1];
+      pixel[2] = color[2];
+      pixel[3] = color[3];
+
+      goto success;
+    }
+
+success:
+  journal->fast_read_pixel_count++;
+  return TRUE;
+}
diff --git a/cogl/cogl-vertex-attribute.c b/cogl/cogl-vertex-attribute.c
index d7821a13d..37ff0a1f9 100644
--- a/cogl/cogl-vertex-attribute.c
+++ b/cogl/cogl-vertex-attribute.c
@@ -480,6 +480,7 @@ enable_gl_state (CoglDrawFlags flags,
                  CoglVertexAttribute **attributes,
                  ValidateLayerState *state)
 {
+  CoglFramebuffer *framebuffer = _cogl_get_framebuffer ();
   int i;
 #ifdef MAY_HAVE_PROGRAMABLE_GL
   GLuint generic_index = 0;
@@ -492,6 +493,12 @@ enable_gl_state (CoglDrawFlags flags,
 
   _COGL_GET_CONTEXT (ctx, COGL_INVALID_HANDLE);
 
+  /* In cogl_read_pixels we have a fast-path when reading a single
+   * pixel and the scene is just comprised of simple rectangles still
+   * in the journal. For this optimization to work we need to track
+   * when the framebuffer really does get drawn to. */
+  _cogl_framebuffer_dirty (framebuffer);
+
   source = cogl_get_source ();
 
   /* Iterate the attributes to work out whether blending needs to be
diff --git a/cogl/cogl.c b/cogl/cogl.c
index b6c3fa535..d8b0827c9 100644
--- a/cogl/cogl.c
+++ b/cogl/cogl.c
@@ -497,7 +497,7 @@ _cogl_read_pixels_with_rowstride (int x,
                                   guint8 *pixels,
                                   int rowstride)
 {
-  CoglFramebuffer *framebuffer;
+  CoglFramebuffer *framebuffer = _cogl_get_framebuffer ();
   int              framebuffer_height;
   int              bpp;
   CoglBitmap      *bmp;
@@ -510,6 +510,22 @@ _cogl_read_pixels_with_rowstride (int x,
 
   g_return_if_fail (source == COGL_READ_PIXELS_COLOR_BUFFER);
 
+  if (width == 1 && height == 1 && !framebuffer->clear_clip_dirty)
+    {
+      /* If everything drawn so far for this frame is still in the
+       * Journal then if all of the rectangles only have a flat
+       * opaque color we have a fast-path for reading a single pixel
+       * that avoids the relatively high cost of flushing primitives
+       * to be drawn on the GPU (considering how simple the geometry
+       * is in this case) and then blocking on the long GPU pipelines
+       * for the result.
+       */
+      if (_cogl_framebuffer_try_fast_read_pixel (framebuffer,
+                                                 x, y, source, format,
+                                                 pixels))
+        return;
+    }
+
   /* make sure any batched primitives get emitted to the GL driver
    * before issuing our read pixels...
    *
@@ -521,8 +537,6 @@ _cogl_read_pixels_with_rowstride (int x,
    */
   cogl_flush ();
 
-  framebuffer = _cogl_get_framebuffer ();
-
   _cogl_framebuffer_flush_state (framebuffer, 0);
 
   framebuffer_height = _cogl_framebuffer_get_height (framebuffer);