Unrolled the quad strip drawing loop to process up to three quads per iteration, sharing transformed vertices between them.

Not actually a performance improvement and I'm not sure why. Sidelining this for now.
3 years ago · 2344e8ba42
1 changed files with 127 additions and 40 deletions
--- a/draw.h
+++ b/draw.h
@@ -209,66 +209,153 @@ static INLINE void draw_triangle_textured(STVECTOR *verts, u_short tpage, u_long

 static INLINE void draw_quadstrip_textured(const ps1bsp_vertex_t *vertices, const ps1bsp_polyvertex_t *polyVerts, u_char numVerts, u_short tpage, u_long *ot)
 {
-    const ps1bsp_polyvertex_t *v0, *v1, *v2, *v3;
-    u_char i0, i1, i2, i3;
+    POLY_GT4 *p0, *p1, *p2;
+    const ps1bsp_polyvertex_t *v0, *v1, *v2;
    u_char head = 0;
    u_char tail = numVerts;

-    // Initialize the first two vertices
-    i2 = --tail;
-    i3 = head++;
-   
+    // Initialize the first three vertices, alternating between the head and the tail of the vertex list
+    v0 = &polyVerts[head++];
+    v1 = &polyVerts[--tail];
+    v2 = &polyVerts[head++];
+
+    // Transform the first three vertices (NOTE(review): this runs before the numQuads check below, so polyVerts is indexed even for very short strips - confirm callers always pass enough vertices)
+    gte_ldv3(&vertices[v0->index], &vertices[v1->index], &vertices[v2->index]);
+    gte_rtpt();     // Rotation, translation, perspective projection
+
    // Normally a quad strip would have (N-2)/2 quads, but we might end up with a sole triangle at the end which will be drawn as a collapsed quad
    // NOTE: testing has shown that the PS1 is faster just rendering quads and accepting the odd collapsed quad, rather than being clever with pointer comparisons and drawing a single triangle at the end.
    u_char numQuads = (numVerts - 1) >> 1;
-    for (u_char quadIdx = 0; quadIdx < numQuads; ++quadIdx)
+    u_char quadIdx = 0;
+    for (;;)
    {
-        i0 = i2;
-        i1 = i3;
-        i2 = --tail;
-        i3 = head++;
-
-        v0 = &polyVerts[i0];
-        v1 = &polyVerts[i1];
-        v2 = &polyVerts[i2];
-        v3 = &polyVerts[i3];
-
-        // Transform the first three vertices
-        gte_ldv3(&vertices[v0->index], &vertices[v1->index], &vertices[v2->index]);
-        gte_rtpt();     // Rotation, translation, perspective projection
+        if (quadIdx++ == numQuads)     // All quads have been drawn
+            return;

        // Draw a gouraud shaded textured quad
-        POLY_GT4 *poly = (POLY_GT4*)mem_prim(sizeof(POLY_GT4));
+        p0 = (POLY_GT4*)mem_prim(sizeof(POLY_GT4));

        // Fill out the quad's data fields in struct order, to optimize data access
        // First vertex and texture CLUT
-        setColorFast(&poly->r0, &v0->r);
-        setUVFast(&poly->u0, &v0->u);
-        gte_stsxy0(&poly->x0);
-        poly->clut = quake_clut;
+        setColorFast(&p0->r0, &v0->r);
+        setUVFast(&p0->u0, &v0->u);
+        gte_stsxy0(&p0->x0);
+        p0->clut = quake_clut;

        // Second vertex and texture page
-        setColorFast(&poly->r1, &v1->r);
-        gte_stsxy1(&poly->x1);
-        setUVFast(&poly->u1, &v1->u);
-        poly->tpage = tpage;
+        setColorFast(&p0->r1, &v1->r);
+        gte_stsxy1(&p0->x1);
+        setUVFast(&p0->u1, &v1->u);
+        p0->tpage = tpage;

        // Third vertex
-        setColorFast(&poly->r2, &v2->r);
-        gte_stsxy2(&poly->x2);
-        setUVFast(&poly->u2, &v2->u);
+        setColorFast(&p0->r2, &v2->r);
+        gte_stsxy2(&p0->x2);
+        setUVFast(&p0->u2, &v2->u);

-        // Transform the fourth vertex to complete the quad
-        gte_ldv0(&vertices[v3->index]);
-        gte_rtps();
+        if (quadIdx++ == numQuads)     // This is the last quad: only its fourth vertex still needs transforming
+        {
+            v0 = &polyVerts[--tail];
+
+            // Transform the fourth vertex to complete the quad
+            gte_ldv0(&vertices[v0->index]);
+            gte_rtps();
+
+            // Fourth vertex
+            setColorFast(&p0->r3, &v0->r);
+            setUVFast(&p0->u3, &v0->u);
+            gte_stsxy(&p0->x3);
+
+            // Finalize the quad
+            setPolyGT4(p0);
+            addPrim(ot, p0);
+            ++polyCount;
+            return;
+        }
+
+        // Start the second quad, sharing two vertices with the first quad
+        p1 = (POLY_GT4*)mem_prim(sizeof(POLY_GT4));
+        setColorFast(&p1->r0, &v2->r);
+        gte_stsxy2(&p1->x0);
+        setUVFast(&p1->u0, &v2->u);
+        p1->clut = quake_clut;
+
+        // Transform the next three vertices
+        v0 = &polyVerts[--tail];
+        v1 = &polyVerts[head++];
+        v2 = &polyVerts[--tail];
+
+        gte_ldv3(&vertices[v0->index], &vertices[v1->index], &vertices[v2->index]);
+        gte_rtpt();
+
+        // Fourth vertex of the first quad (the first of the newly transformed vertices)
+        setColorFast(&p0->r3, &v0->r);
+        setUVFast(&p0->u3, &v0->u);
+        gte_stsxy0(&p0->x3);
+
+        // Finalize the first quad
+        setPolyGT4(p0);
+        addPrim(ot, p0);
+        ++polyCount;
+
+        // Second vertex of the second quad (the same vertex again) and texture page
+        setColorFast(&p1->r1, &v0->r);
+        setUVFast(&p1->u1, &v0->u);
+        gte_stsxy0(&p1->x1);
+        p1->tpage = tpage;
+
+        // Third vertex
+        setColorFast(&p1->r2, &v1->r);
+        gte_stsxy1(&p1->x2);
+        setUVFast(&p1->u2, &v1->u);

        // Fourth vertex
-        setColorFast(&poly->r3, &v3->r);
-        setUVFast(&poly->u3, &v3->u);
-        gte_stsxy(&poly->x3);
+        setColorFast(&p1->r3, &v2->r);
+        gte_stsxy2(&p1->x3);
+        setUVFast(&p1->u3, &v2->u);

-        setPolyGT4(poly);
-        addPrim(ot, poly);
+        // Finalize the second quad
+        setPolyGT4(p1);
+        addPrim(ot, p1);
+        ++polyCount;
+
+        if (quadIdx++ == numQuads)
+            return;
+
+        // Start the third quad, sharing two vertices with the second quad and two with the next quad
+        p2 = (POLY_GT4*)mem_prim(sizeof(POLY_GT4));
+        setColorFast(&p2->r0, &v1->r);
+        gte_stsxy1(&p2->x0);
+        setUVFast(&p2->u0, &v1->u);
+        p2->clut = quake_clut;
+
+        // Second vertex and texture page
+        setColorFast(&p2->r1, &v2->r);
+        setUVFast(&p2->u1, &v2->u);
+        gte_stsxy2(&p2->x1);
+        p2->tpage = tpage;
+
+        // Transform the next three vertices
+        v0 = &polyVerts[head++];
+        v1 = &polyVerts[--tail];
+        v2 = &polyVerts[head++];
+
+        gte_ldv3(&vertices[v0->index], &vertices[v1->index], &vertices[v2->index]);
+        gte_rtpt();
+
+        // Third vertex
+        setColorFast(&p2->r2, &v0->r);
+        gte_stsxy0(&p2->x2);
+        setUVFast(&p2->u2, &v0->u);
+
+        // Fourth vertex
+        setColorFast(&p2->r3, &v1->r);
+        gte_stsxy1(&p2->x3);
+        setUVFast(&p2->u3, &v1->u);
+
+        // Finalize the third quad
+        setPolyGT4(p2);
+        addPrim(ot, p2);
        ++polyCount;
    }
 }