Some rearrangement that will hopefully allow single light values to be lerped right alongside UVs, saving on one lerp per vertex.

3 years ago · 909b32f4c0
1 changed file with 25 additions and 13 deletions
--- a/draw.h
+++ b/draw.h
@@ -27,6 +27,19 @@
 	: "r"( r0 )				\
 	: "$12", "$13", "$14" )

+#define gte_stcuv( r0 ) __asm__ volatile ( /* Reads coprocessor-2 (GTE) registers 9, 10 and 11 and stores their low bytes to r0[0..4]; the third value is written three times. */ \
+	"mfc2	$12, $9;"		/* $12 <- cop2 register 9 */ \
+	"mfc2	$13, $10;"		/* $13 <- cop2 register 10 */ \
+	"mfc2	$14, $11;"		/* $14 <- cop2 register 11 */ \
+	"sb	$12, 0( %0 );"		/* byte 0 <- first value */ \
+	"sb	$13, 1( %0 );"		/* byte 1 <- second value */ \
+	"sb	$14, 2( %0 );"		/* byte 2 <- third value */ \
+    "sb	$14, 3( %0 );"		/* byte 3 <- third value again */ \
+    "sb	$14, 4( %0 );"		/* byte 4 <- third value again; presumably one light value replicated into r, g, b when r0 points at TMPVERT.u - confirm with the caller */ \
+	:						/* no output operands */ \
+	: "r"( r0 )				/* r0: destination address, must be writable for 5 bytes */ \
+	: "$12", "$13", "$14", "memory" ) /* scratch CPU registers are clobbered; "memory" because the asm writes through r0 */
+
 // Macros for quickly blitting RGB and UV values with a single copy
 // This is faster than copying each value individually
 #define setColorFast(pr, r)         *((u_int*)(pr)) = *((u_int*)(r))
@@ -360,16 +373,16 @@ static INLINE void draw_quadstrip_textured(const ps1bsp_vertex_t *vertices, cons
 typedef struct _TMPVERT
 {
    int16_t vx, vy, vz, vpad;
-    uint8_t r, g, b, cpad;
    uint8_t u, v;
+    uint8_t r, g, b, cpad; /* Moved after u, v so that u, v, r, g, b are five contiguous bytes. r is now at byte offset 10 (8 bytes of int16_t + 2 bytes of UV): 2-byte aligned but not 4-byte aligned within the struct. */
    uint16_t pad;
 } TMPVERT;

 #define copyVertFast(dst, pv, v)  \
-    blit32(&(dst)->r, &(pv)->r); \
-    blit32(&(dst)->vx, &(v)->vx); \
-    blit32(&(dst)->vz, &(v)->vz); \
-    blit16(&(dst)->u, &(pv)->u);
+    blit32(&(dst).r, &(pv)->r); /* dst is now a TMPVERT lvalue (e.g. tmp[0]) rather than a pointer. NOTE(review): TMPVERT.r is at byte offset 10 after the field reorder; if blit32 is a 32-bit word copy (its definition is not visible here) this store is not word-aligned - confirm it is safe on the target. */ \
+    blit32(&(dst).vx, &(v)->vx); /* presumably copies vx and vy together (two adjacent int16_t) - verify against blit32 */ \
+    blit32(&(dst).vz, &(v)->vz); /* presumably copies vz and vpad together - verify against blit32 */ \
+    blit16(&(dst).u, &(pv)->u); /* presumably copies u and v together - verify against blit16. NOTE(review): the macro expands to four separate statements ending in ';', so it must not be used as the body of an unbraced if/else. */

 #define lerpVert(dst, src0, src1) \
    gte_ld_intpol_sv1(&(src0).vx); \
@@ -404,8 +417,10 @@ static INLINE void draw_quadstrip_tess2(const ps1bsp_vertex_t *vertices, const p
    i3 = head++;

    TMPVERT *tmp = (TMPVERT*)(scratchpad);
-    const short half = ONE >> 1;
   
+    // Interpolation factor will always be 0.5 so we need to set this register only once
+    gte_lddp(ONE >> 1);
+
    // Normally a quad strip would have (N-2)/2 quads, but we might end up with a sole triangle at the end which will be drawn as a collapsed quad
    // NOTE: testing has shown that the PS1 is faster just rendering quads and accepting the odd collapsed quad, rather than being clever with pointer comparisons and drawing a single triangle at the end.
    u_char numQuads = (numVerts - 1) >> 1;
@@ -426,13 +441,10 @@ static INLINE void draw_quadstrip_tess2(const ps1bsp_vertex_t *vertices, const p
        v2 = &vertices[pv2->index];
        v3 = &vertices[pv3->index];

-        copyVertFast(&tmp[0], pv0, v0);
-        copyVertFast(&tmp[2], pv1, v1);
-        copyVertFast(&tmp[6], pv2, v2);
-        copyVertFast(&tmp[8], pv3, v3);
-
-        // Interpolation factor will always be 0.5 so we need to set this register only once
-        gte_lddp(half);
+        copyVertFast(tmp[0], pv0, v0);
+        copyVertFast(tmp[2], pv1, v1);
+        copyVertFast(tmp[6], pv2, v2);
+        copyVertFast(tmp[8], pv3, v3);

        // TODO Optimization: start loading vertices into GTE as soon as we're done with them
        lerpVert(tmp[1], tmp[0], tmp[2]);