Intersperse GTE instructions with vertex lerping and quad building code to mask load/store delays.

Makes a noticeable difference in drawing performance.
3 years ago · 4f886bd40c
1 changed file with 14 additions and 15 deletions
--- a/draw.h
+++ b/draw.h
@@ -404,34 +404,24 @@ static INLINE void draw_quadstrip_tess2(const ps1bsp_vertex_t *vertices, const p
        copyVertFast(&tmp[6], pv2, v2);
        copyVertFast(&tmp[8], pv3, v3);
        // TODO Optimization: start loading vertices into GTE as soon as we're done with them
        // Interpolate vertices and start loading them into GTE as soon as we're done with them
        // This way we avoid waiting for the GTE load/store delays
        lerpVert(&tmp[1], &tmp[0], &tmp[2]);
        lerpVert(&tmp[5], &tmp[2], &tmp[8]);    // After this, 0 1 2 are ready for GTE
        lerpVert(&tmp[3], &tmp[0], &tmp[6]);
        lerpVert(&tmp[4], &tmp[3], &tmp[5]);    // After this, 3 4 5 are ready for GTE
        lerpVert(&tmp[7], &tmp[6], &tmp[8]);
        // Transform the vertices in groups of three
        gte_ldv3(&tmp[0], &tmp[1], &tmp[2]);
        gte_rtpt();
        lerpVert(&tmp[3], &tmp[0], &tmp[6]);
        gte_stsxy0(&tmp[0].vx);
        gte_stsxy1(&tmp[1].vx);
        gte_stsxy2(&tmp[2].vx);
        lerpVert(&tmp[4], &tmp[3], &tmp[5]);    // After this, 3 4 5 are ready for GTE
        gte_ldv3(&tmp[3], &tmp[4], &tmp[5]);
        gte_rtpt();
        lerpVert(&tmp[7], &tmp[6], &tmp[8]);
        gte_stsxy0(&tmp[3].vx);
        gte_stsxy1(&tmp[4].vx);
        gte_stsxy2(&tmp[5].vx);
        gte_ldv3(&tmp[6], &tmp[7], &tmp[8]);
        gte_rtpt();
        gte_stsxy0(&tmp[6].vx);
        gte_stsxy1(&tmp[7].vx);
        gte_stsxy2(&tmp[8].vx);
        // TODO Optimization: intersperse quad building instructions while waiting on GTE store/load delays
        // Draw the first quad
        p0 = (POLY_GT4*)mem_prim(sizeof(POLY_GT4));
        blitVert(p0, 0, tmp[0]);
@@ -442,6 +432,10 @@ static INLINE void draw_quadstrip_tess2(const ps1bsp_vertex_t *vertices, const p
        blitVert(p0, 3, tmp[4]);
        setPolyGT4(p0);
        addPrim(ot, p0);
        // Transform the final three vertices, needed for the final two quads
        gte_ldv3(&tmp[6], &tmp[7], &tmp[8]);
        gte_rtpt();
        // Second quad
        p1 = (POLY_GT4*)mem_prim(sizeof(POLY_GT4));
@@ -454,6 +448,11 @@ static INLINE void draw_quadstrip_tess2(const ps1bsp_vertex_t *vertices, const p
        setPolyGT4(p1);
        addPrim(ot, p1);
        // Make sure the transformed vertices are loaded from GTE before we need them
        gte_stsxy0(&tmp[6].vx);
        gte_stsxy1(&tmp[7].vx);
        gte_stsxy2(&tmp[8].vx);
        // Third quad
        p2 = (POLY_GT4*)mem_prim(sizeof(POLY_GT4));
        blitVert(p2, 0, tmp[3]);