diff --git a/draw.h b/draw.h
index 27744ef..35c8674 100644
--- a/draw.h
+++ b/draw.h
@@ -27,6 +27,23 @@
 	: "r"( r0 ) \
 	: "$12", "$13", "$14" )
 
+// Store three GTE (cop2) data registers as bytes into the 5-byte region at r0:
+// register 9 -> byte 0, register 10 -> byte 1, register 11 -> bytes 2, 3 and 4.
+// Only the low byte of each register is written (sb). Clobbers $12-$14 and memory.
+// NOTE(review): presumably used to write u, v and a grey r/g/b into a TMPVERT - confirm against lerpVert.
+#define gte_stcuv( r0 ) __asm__ volatile ( \
+	"mfc2 $12, $9;" \
+	"mfc2 $13, $10;" \
+	"mfc2 $14, $11;" \
+	"sb $12, 0( %0 );" \
+	"sb $13, 1( %0 );" \
+	"sb $14, 2( %0 );" \
+	"sb $14, 3( %0 );" \
+	"sb $14, 4( %0 );" \
+	: \
+	: "r"( r0 ) \
+	: "$12", "$13", "$14", "memory" )
+
 // Macros for quickly blitting RGB and UV values with a single copy
 // This is faster than copying each value individually
 #define setColorFast(pr, r) *((u_int*)(pr)) = *((u_int*)(r))
@@ -360,16 +377,20 @@ static INLINE void draw_quadstrip_textured(const ps1bsp_vertex_t *vertices, cons
 typedef struct _TMPVERT
 {
 	int16_t vx, vy, vz, vpad;
-	uint8_t r, g, b, cpad;
+	// u, v, r, g, b are kept contiguous so a single 5-byte store starting at u (see gte_stcuv) can fill them.
+	// pad goes first so that r sits at offset 12: with u, v directly after vpad, r would be at offset 10 and the 32-bit copy into r in copyVertFast would be misaligned. NOTE(review): assumes blit32 is a plain u_int copy like setColorFast - confirm.
+	uint16_t pad;
 	uint8_t u, v;
-	uint16_t pad;
+	uint8_t r, g, b, cpad;
 } TMPVERT;
 
+// Copy r/g/b/cpad and u/v from pv, and vx/vy plus vz/vpad from v, into dst.
+// dst is a TMPVERT lvalue (e.g. tmp[0]), not a pointer; pv and v are pointers. Expands to four statements, so do not use it as an unbraced if/else body.
 #define copyVertFast(dst, pv, v) \
-	blit32(&(dst)->r, &(pv)->r); \
-	blit32(&(dst)->vx, &(v)->vx); \
-	blit32(&(dst)->vz, &(v)->vz); \
-	blit16(&(dst)->u, &(pv)->u);
+	blit32(&(dst).r, &(pv)->r); \
+	blit32(&(dst).vx, &(v)->vx); \
+	blit32(&(dst).vz, &(v)->vz); \
+	blit16(&(dst).u, &(pv)->u);
 
 #define lerpVert(dst, src0, src1) \
 	gte_ld_intpol_sv1(&(src0).vx); \
@@ -404,8 +425,10 @@ static INLINE void draw_quadstrip_tess2(const ps1bsp_vertex_t *vertices, const p
 	i3 = head++;
 
 	TMPVERT *tmp = (TMPVERT*)(scratchpad);
-	const short half = ONE >> 1;
 
+	// Interpolation factor will always be 0.5 so we need to set this register only once
+	gte_lddp(ONE >> 1);
+
 	// Normally a quad strip would have (N-2)/2 quads, but we might end up with a sole triangle at the end which will be drawn as a collapsed quad
 	// NOTE: testing has shown that the PS1 is faster just rendering quads and accepting the odd collapsed quad, rather than being clever with pointer comparisons and drawing a single triangle at the end.
 	u_char numQuads = (numVerts - 1) >> 1;
@@ -426,13 +449,10 @@ static INLINE void draw_quadstrip_tess2(const ps1bsp_vertex_t *vertices, const p
 		v2 = &vertices[pv2->index];
 		v3 = &vertices[pv3->index];
 
-		copyVertFast(&tmp[0], pv0, v0);
-		copyVertFast(&tmp[2], pv1, v1);
-		copyVertFast(&tmp[6], pv2, v2);
-		copyVertFast(&tmp[8], pv3, v3);
-
-		// Interpolation factor will always be 0.5 so we need to set this register only once
-		gte_lddp(half);
+		copyVertFast(tmp[0], pv0, v0);
+		copyVertFast(tmp[2], pv1, v1);
+		copyVertFast(tmp[6], pv2, v2);
+		copyVertFast(tmp[8], pv3, v3);
 
 		// TODO Optimization: start loading vertices into GTE as soon as we're done with them
 		lerpVert(tmp[1], tmp[0], tmp[2]);