From efbf483d8cdd448b20ddcfedc518509f907b2ac6 Mon Sep 17 00:00:00 2001 From: Robert Bragg Date: Wed, 17 Jun 2009 01:31:36 +0100 Subject: [PATCH] [cogl journal] Perform software modelview transform on logged quads. Since most Clutter actors aren't much more than textured quads; flushing the journal typically involves lots of 'change modelview; draw quad' sequences. The amount of overhead involved in uploading a new modelview and queuing that primitive is huge in comparison to simply transforming 4 vertices by the current modelview when logging quads. (Note if your GPU supports HW vertex transform, then it still does the projective and viewport transforms) At the same time a --cogl-debug=disable-software-transform option has been added for comparison and debugging. This change allows typical pick scenes to be batched into a single draw call and I'm seeing test-pick run over 200% faster with this. (i965 + Mesa 7.6-devel) --- clutter/cogl/cogl-debug.h | 3 +- clutter/cogl/common/cogl-debug.c | 3 +- clutter/cogl/common/cogl-primitives.c | 158 +++++++++++++++++++------- 3 files changed, 120 insertions(+), 44 deletions(-) diff --git a/clutter/cogl/cogl-debug.h b/clutter/cogl/cogl-debug.h index c8c1f566e..0c602909a 100644 --- a/clutter/cogl/cogl-debug.h +++ b/clutter/cogl/cogl-debug.h @@ -43,7 +43,8 @@ typedef enum { COGL_DEBUG_FORCE_CLIENT_SIDE_MATRICES = 1 << 11, COGL_DEBUG_DISABLE_VBOS = 1 << 12, COGL_DEBUG_JOURNAL = 1 << 13, - COGL_DEBUG_BATCHING = 1 << 14 + COGL_DEBUG_BATCHING = 1 << 14, + COGL_DEBUG_DISABLE_SOFTWARE_TRANSFORM = 1 << 15 } CoglDebugFlags; #ifdef COGL_ENABLE_DEBUG diff --git a/clutter/cogl/common/cogl-debug.c b/clutter/cogl/common/cogl-debug.c index e093a344c..28c475295 100644 --- a/clutter/cogl/common/cogl-debug.c +++ b/clutter/cogl/common/cogl-debug.c @@ -45,7 +45,8 @@ static const GDebugKey cogl_debug_keys[] = { { "client-side-matrices", COGL_DEBUG_FORCE_CLIENT_SIDE_MATRICES }, { "disable-vbos", COGL_DEBUG_DISABLE_VBOS }, { "journal", 
COGL_DEBUG_JOURNAL }, - { "batching", COGL_DEBUG_BATCHING } + { "batching", COGL_DEBUG_BATCHING }, + { "disable-software-transform", COGL_DEBUG_DISABLE_SOFTWARE_TRANSFORM } }; static const gint n_cogl_debug_keys = G_N_ELEMENTS (cogl_debug_keys); diff --git a/clutter/cogl/common/cogl-primitives.c b/clutter/cogl/common/cogl-primitives.c index 9dec92132..2bf358778 100644 --- a/clutter/cogl/common/cogl-primitives.c +++ b/clutter/cogl/common/cogl-primitives.c @@ -57,7 +57,7 @@ /* XXX NB: * Our journal's vertex data is arranged as follows: * 4 vertices per quad: - * 2 or GLfloats per position + * 2 or 3 GLfloats per position (3 when doing software transforms) * 4 RGBA GLubytes, * 2 GLfloats per tex coord * n_layers * @@ -66,11 +66,21 @@ * To avoid frequent changes in the stride of our vertex data we always pad * n_layers to be >= 2 * + * When we are transforming quads in software we need to also track the z + * coordinate of transformed vertices. + * * So for a given number of layers this gets the stride in 32bit words: */ -#define MIN_LAYER_PADING 2 +#define SW_TRANSFORM (!(cogl_debug_flags & \ + COGL_DEBUG_DISABLE_SOFTWARE_TRANSFORM)) +#define POS_STRIDE (SW_TRANSFORM ? 3 : 2) /* number of 32bit words */ +#define N_POS_COMPONENTS POS_STRIDE +#define COLOR_STRIDE 1 /* number of 32bit words */ +#define TEX_STRIDE 2 /* number of 32bit words */ +#define MIN_LAYER_PADING 2 #define GET_JOURNAL_VB_STRIDE_FOR_N_LAYERS(N_LAYERS) \ - (2 + 1 + 2 * (N_LAYERS < MIN_LAYER_PADING ? MIN_LAYER_PADING : N_LAYERS)) + (POS_STRIDE + COLOR_STRIDE + \ + TEX_STRIDE * (N_LAYERS < MIN_LAYER_PADING ? MIN_LAYER_PADING : N_LAYERS)) typedef void (*CoglJournalBatchCallback) (CoglJournalEntry *start, @@ -85,8 +95,8 @@ typedef struct _CoglJournalFlushState /* Note: this is a pointer to handle fallbacks. It normally holds a VBO * offset, but when the driver doesn't support VBOs then this points into * our GArray of logged vertices. 
*/ - char * vbo_offset; - GLuint vertex_offset; + char * vbo_offset; + GLuint vertex_offset; #ifndef HAVE_COGL_GL CoglJournalIndices *indices; size_t indices_type_size; @@ -108,19 +118,26 @@ _cogl_journal_dump_quad_vertices (guint8 *data, int n_layers) _COGL_GET_CONTEXT (ctx, NO_RETVAL); - g_print ("stride = %d (%d bytes)\n", (int)stride, (int)stride * 4); + g_print ("n_layers = %d; stride = %d; pos stride = %d; color stride = %d; " + "tex stride = %d; stride in bytes = %d\n", + n_layers, (int)stride, POS_STRIDE, COLOR_STRIDE, + TEX_STRIDE, (int)stride * 4); for (i = 0; i < 4; i++) { float *v = (float *)data + (i * stride); - guint8 *c = data + 8 + (i * stride * 4); + guint8 *c = data + (POS_STRIDE * 4) + (i * stride * 4); int j; - g_print ("v%d: x = %f, y = %f, rgba=0x%02X%02X%02X%02X", - i, v[0], v[1], c[0], c[1], c[2], c[3]); + if (cogl_debug_flags & COGL_DEBUG_DISABLE_SOFTWARE_TRANSFORM) + g_print ("v%d: x = %f, y = %f, rgba=0x%02X%02X%02X%02X", + i, v[0], v[1], c[0], c[1], c[2], c[3]); + else + g_print ("v%d: x = %f, y = %f, z = %f, rgba=0x%02X%02X%02X%02X", + i, v[0], v[1], v[2], c[0], c[1], c[2], c[3]); for (j = 0; j < n_layers; j++) { - float *t = v + 3 + 2 * j; + float *t = v + POS_STRIDE + COLOR_STRIDE + TEX_STRIDE * j; g_print (", tx%d = %f, ty%d = %f", j, t[0], j, t[1]); } g_print ("\n"); @@ -181,7 +198,8 @@ _cogl_journal_flush_modelview_and_entries (CoglJournalEntry *batch_start, if (G_UNLIKELY (cogl_debug_flags & COGL_DEBUG_BATCHING)) g_print ("BATCHING: modelview batch len = %d\n", batch_len); - GE (glLoadMatrixf ((GLfloat *)&batch_start->model_view)); + if (G_UNLIKELY (cogl_debug_flags & COGL_DEBUG_DISABLE_SOFTWARE_TRANSFORM)) + GE (glLoadMatrixf ((GLfloat *)&batch_start->model_view)); #ifdef HAVE_COGL_GL @@ -248,6 +266,7 @@ compare_entry_modelviews (CoglJournalEntry *entry0, * if the age has increased, and if so record the change as a * boolean in the journal. 
*/ + if (memcmp (&entry0->model_view, &entry1->model_view, sizeof (GLfloat) * 16) == 0) return TRUE; @@ -283,12 +302,18 @@ _cogl_journal_flush_material_and_entries (CoglJournalEntry *batch_start, enable_flags |= COGL_ENABLE_COLOR_ARRAY; cogl_enable (enable_flags); - batch_and_call (batch_start, - batch_len, - compare_entry_modelviews, - _cogl_journal_flush_modelview_and_entries, - data); - + /* If we haven't transformed the quads in software then we need to also break + * up batches according to changes in the modelview matrix... */ + if (cogl_debug_flags & COGL_DEBUG_DISABLE_SOFTWARE_TRANSFORM) + { + batch_and_call (batch_start, + batch_len, + compare_entry_modelviews, + _cogl_journal_flush_modelview_and_entries, + data); + } + else + _cogl_journal_flush_modelview_and_entries (batch_start, batch_len, data); } static gboolean @@ -332,14 +357,16 @@ _cogl_journal_flush_texcoord_vbo_offsets_and_entries ( /* XXX NB: * Our journal's vertex data is arranged as follows: * 4 vertices per quad: - * 2 GLfloats per position + * 2 or 3 GLfloats per position (3 when doing software transforms) * 4 RGBA GLubytes, * 2 GLfloats per tex coord * n_layers * (though n_layers may be padded; see definition of * GET_JOURNAL_VB_STRIDE_FOR_N_LAYERS for details) */ GE (glTexCoordPointer (2, GL_FLOAT, state->stride, - (void *)(state->vbo_offset + 12 + 8 * i))); + (void *)(state->vbo_offset + + (POS_STRIDE + COLOR_STRIDE) * 4 + + TEX_STRIDE * 4 * i))); } prev_n_texcoord_arrays_enabled = ctx->n_texcoord_arrays_enabled; @@ -389,7 +416,7 @@ _cogl_journal_flush_vbo_offsets_and_entries (CoglJournalEntry *batch_start, /* XXX NB: * Our journal's vertex data is arranged as follows: * 4 vertices per quad: - * 2 GLfloats per position + * 2 or 3 GLfloats per position (3 when doing software transforms) * 4 RGBA GLubytes, * 2 GLfloats per tex coord * n_layers * (though n_layers may be padded; see definition of @@ -399,9 +426,10 @@ _cogl_journal_flush_vbo_offsets_and_entries (CoglJournalEntry *batch_start, 
stride *= sizeof (GLfloat); state->stride = stride; - GE (glVertexPointer (2, GL_FLOAT, stride, (void *)state->vbo_offset)); + GE (glVertexPointer (N_POS_COMPONENTS, GL_FLOAT, stride, + (void *)state->vbo_offset)); GE (glColorPointer (4, GL_UNSIGNED_BYTE, stride, - (void *)(state->vbo_offset + 8))); + (void *)(state->vbo_offset + (POS_STRIDE * 4)))); #ifndef HAVE_COGL_GL indices_handle = cogl_vertex_buffer_indices_get_for_quads (needed_indices); @@ -540,19 +568,36 @@ _cogl_journal_flush (void) * we need to dirty our client side matrix stack cache... */ _cogl_current_matrix_state_dirty (); + /* If we have transformed all our quads at log time then the whole journal + * is already transformed, so we ensure no further model transform is + * applied by loading the identity matrix here... */ + if (!(cogl_debug_flags & COGL_DEBUG_DISABLE_SOFTWARE_TRANSFORM)) + { + GE (glMatrixMode (GL_MODELVIEW)); + glLoadIdentity (); + } + /* batch_and_call() batches a list of journal entries according to some * given criteria and calls a callback once for each determined batch. * - * The process of flushing the journal is done by splitting the entries - * by three broad criteria: - * 1) We split the entries according the number of material layers. - * Each time the number of material layers changes, then the stride - * changes, so we need to call gl{Vertex,Color,Texture}Pointer to - * inform GL of new VO offsets. - * 2) We then split according to compatible Cogl materials. + * The process of flushing the journal is staggered to reduce the amount + * of driver/GPU state changes necessary: + * 1) We split the entries according to the stride of the vertices: + * Each time the stride of our vertex data changes we need to call + * gl{Vertex,Color}Pointer to inform GL of new VBO offsets. + * Currently the only thing that affects the stride of our vertex data + * is the number of material layers. 
+ * 2) We split the entries explicitly by the number of material layers: + * We pad our vertex data when the number of layers is < 2 so that we + * can minimize changes in stride. Each time the number of layers + * changes we need to call glTexCoordPointer to inform GL of new VBO + * offsets. + * 3) We then split according to compatible Cogl materials: * This is where we flush material state - * 3) Finally we split according to modelview matrix changes. + * 4) Finally we split according to modelview matrix changes: * This is when we finally tell GL to draw something. + * Note: Splitting by modelview changes is skipped when we are doing the + * vertex transformation in software at log time. */ batch_and_call ((CoglJournalEntry *)ctx->journal->data, /* first entry */ ctx->journal->len, /* max number of entries to consider */ @@ -612,7 +657,7 @@ _cogl_journal_log_quad (float x_1, next_vert = ctx->logged_vertices->len; g_array_set_size (ctx->logged_vertices, next_vert + 4 * stride); v = &g_array_index (ctx->logged_vertices, GLfloat, next_vert); - c = (GLubyte *)(v + 2); + c = (GLubyte *)(v + POS_STRIDE); /* XXX: All the jumping around to fill in this strided buffer doesn't * seem ideal. 
*/ @@ -630,19 +675,47 @@ _cogl_journal_log_quad (float x_1, memcpy (c, src_c, 4); } - v[0] = x_1; v[1] = y_1; - v += stride; - v[0] = x_1; v[1] = y_2; - v += stride; - v[0] = x_2; v[1] = y_2; - v += stride; - v[0] = x_2; v[1] = y_1; + if (G_UNLIKELY (cogl_debug_flags & COGL_DEBUG_DISABLE_SOFTWARE_TRANSFORM)) + { + v[0] = x_1; v[1] = y_1; + v += stride; + v[0] = x_1; v[1] = y_2; + v += stride; + v[0] = x_2; v[1] = y_2; + v += stride; + v[0] = x_2; v[1] = y_1; + } + else + { + CoglMatrix mv; + float x, y, z, w; + + cogl_get_modelview_matrix (&mv); + + x = x_1, y = y_1, z = 0; w = 1; + cogl_matrix_transform_point (&mv, &x, &y, &z, &w); + v[0] = x; v[1] = y; v[2] = z; + v += stride; + x = x_1, y = y_2, z = 0; w = 1; + cogl_matrix_transform_point (&mv, &x, &y, &z, &w); + v[0] = x; v[1] = y; v[2] = z; + v += stride; + x = x_2, y = y_2, z = 0; w = 1; + cogl_matrix_transform_point (&mv, &x, &y, &z, &w); + v[0] = x; v[1] = y; v[2] = z; + v += stride; + x = x_2, y = y_1, z = 0; w = 1; + cogl_matrix_transform_point (&mv, &x, &y, &z, &w); + v[0] = x; v[1] = y; v[2] = z; + } for (i = 0; i < n_layers; i++) { - /* NB: See note at top about vertex buffer layout: */ - GLfloat *t = &g_array_index (ctx->logged_vertices, - GLfloat, next_vert + 3 + 2 * i); + /* XXX: See definition of GET_JOURNAL_VB_STRIDE_FOR_N_LAYERS for details + * about how we pack our vertex data */ + GLfloat *t = &g_array_index (ctx->logged_vertices, GLfloat, + next_vert + POS_STRIDE + + COLOR_STRIDE + TEX_STRIDE * i); t[0] = tex_coords[0]; t[1] = tex_coords[1]; t += stride; @@ -677,7 +750,8 @@ _cogl_journal_log_quad (float x_1, entry->flush_options.fallback_layers = fallback_layers; entry->flush_options.disable_layers = disable_layers; entry->flush_options.layer0_override_texture = layer0_override_texture; - cogl_get_modelview_matrix (&entry->model_view); + if (G_UNLIKELY (cogl_debug_flags & COGL_DEBUG_DISABLE_SOFTWARE_TRANSFORM)) + cogl_get_modelview_matrix (&entry->model_view); if (G_UNLIKELY (cogl_debug_flags 
& COGL_DEBUG_DISABLE_BATCHING || cogl_debug_flags & COGL_DEBUG_RECTANGLES))