diff --git a/draw.h b/draw.h index adb2c2a..d224b86 100644 --- a/draw.h +++ b/draw.h @@ -404,34 +404,24 @@ static INLINE void draw_quadstrip_tess2(const ps1bsp_vertex_t *vertices, const p copyVertFast(&tmp[6], pv2, v2); copyVertFast(&tmp[8], pv3, v3); - // TODO Optimization: start loading vertices into GTE as soon as we're done with them + // Interpolate vertices and start loading them into GTE as soon as we're done with them + // This way we avoid waiting for the GTE load/store delays lerpVert(&tmp[1], &tmp[0], &tmp[2]); lerpVert(&tmp[5], &tmp[2], &tmp[8]); // After this, 0 1 2 are ready for GTE - lerpVert(&tmp[3], &tmp[0], &tmp[6]); - lerpVert(&tmp[4], &tmp[3], &tmp[5]); // After this, 3 4 5 are ready for GTE - lerpVert(&tmp[7], &tmp[6], &tmp[8]); - - // Transform the vertices in groups of three gte_ldv3(&tmp[0], &tmp[1], &tmp[2]); gte_rtpt(); + lerpVert(&tmp[3], &tmp[0], &tmp[6]); gte_stsxy0(&tmp[0].vx); gte_stsxy1(&tmp[1].vx); gte_stsxy2(&tmp[2].vx); - + lerpVert(&tmp[4], &tmp[3], &tmp[5]); // After this, 3 4 5 are ready for GTE gte_ldv3(&tmp[3], &tmp[4], &tmp[5]); gte_rtpt(); + lerpVert(&tmp[7], &tmp[6], &tmp[8]); gte_stsxy0(&tmp[3].vx); gte_stsxy1(&tmp[4].vx); gte_stsxy2(&tmp[5].vx); - gte_ldv3(&tmp[6], &tmp[7], &tmp[8]); - gte_rtpt(); - gte_stsxy0(&tmp[6].vx); - gte_stsxy1(&tmp[7].vx); - gte_stsxy2(&tmp[8].vx); - - // TODO Optimization: interperse quad building instructions while waiting on GTE store/load delays - // Draw the first quad p0 = (POLY_GT4*)mem_prim(sizeof(POLY_GT4)); blitVert(p0, 0, tmp[0]); @@ -442,6 +432,10 @@ static INLINE void draw_quadstrip_tess2(const ps1bsp_vertex_t *vertices, const p blitVert(p0, 3, tmp[4]); setPolyGT4(p0); addPrim(ot, p0); + + // Transform the final three vertices, needed for the final two quads + gte_ldv3(&tmp[6], &tmp[7], &tmp[8]); + gte_rtpt(); // Second quad p1 = (POLY_GT4*)mem_prim(sizeof(POLY_GT4)); @@ -454,6 +448,11 @@ static INLINE void draw_quadstrip_tess2(const ps1bsp_vertex_t *vertices, const p setPolyGT4(p1); addPrim(ot, p1); + // Make sure the transformed vertices are loaded from GTE before we need them + gte_stsxy0(&tmp[6].vx); + gte_stsxy1(&tmp[7].vx); + gte_stsxy2(&tmp[8].vx); + // Third quad p2 = (POLY_GT4*)mem_prim(sizeof(POLY_GT4)); blitVert(p2, 0, tmp[3]);