|
|
@ -384,6 +384,8 @@ typedef struct _TMPVERT |
|
|
blit32(&(dst).vz, &(v)->vz); \ |
|
|
blit32(&(dst).vz, &(v)->vz); \ |
|
|
blit16(&(dst).u, &(pv)->u); |
|
|
blit16(&(dst).u, &(pv)->u); |
|
|
|
|
|
|
|
|
|
|
|
// Vertex interpolation using GTE macros |
|
|
|
|
|
// Interpolates XYZ, RGB and UV separately |
|
|
#define lerpVert(dst, src0, src1) \ |
|
|
#define lerpVert(dst, src0, src1) \ |
|
|
gte_ld_intpol_sv1(&(src0).vx); \ |
|
|
gte_ld_intpol_sv1(&(src0).vx); \ |
|
|
gte_ld_intpol_sv0(&(src1).vx); \ |
|
|
gte_ld_intpol_sv0(&(src1).vx); \ |
|
|
@ -398,6 +400,9 @@ typedef struct _TMPVERT |
|
|
gte_intpl(); \ |
|
|
gte_intpl(); \ |
|
|
gte_stbv(&(dst).u); |
|
|
gte_stbv(&(dst).u); |
|
|
|
|
|
|
|
|
|
|
|
// Vertex interpolation using pure assembly |
|
|
|
|
|
// Combines UV and R into a single vector, thus requiring only two interpolates |
|
|
|
|
|
// Outputs the interpolated light value directly to the R, G and B fields of dst |
|
|
#define lerpVert2(dst, src0, src1) __asm__ volatile ( \ |
|
|
#define lerpVert2(dst, src0, src1) __asm__ volatile ( \ |
|
|
"lh $12, 0( %1 );" \ |
|
|
"lh $12, 0( %1 );" \ |
|
|
"lh $13, 2( %1 );" \ |
|
|
"lh $13, 2( %1 );" \ |
|
|
@ -467,7 +472,6 @@ static INLINE void draw_quadstrip_tess2(const ps1bsp_vertex_t *vertices, const p |
|
|
gte_lddp(ONE >> 1); |
|
|
gte_lddp(ONE >> 1); |
|
|
|
|
|
|
|
|
// Normally a quad strip would have (N-2)/2 quads, but we might end up with a sole triangle at the end which will be drawn as a collapsed quad |
|
|
// Normally a quad strip would have (N-2)/2 quads, but we might end up with a sole triangle at the end which will be drawn as a collapsed quad |
|
|
// NOTE: testing has shown that the PS1 is faster just rendering quads and accepting the odd collapsed quad, rather than being clever with pointer comparisons and drawing a single triangle at the end. |
|
|
|
|
|
u_char numQuads = (numVerts - 1) >> 1; |
|
|
u_char numQuads = (numVerts - 1) >> 1; |
|
|
for (u_char quadIdx = 0; quadIdx < numQuads; ++quadIdx) |
|
|
for (u_char quadIdx = 0; quadIdx < numQuads; ++quadIdx) |
|
|
{ |
|
|
{ |
|
|
@ -492,6 +496,8 @@ static INLINE void draw_quadstrip_tess2(const ps1bsp_vertex_t *vertices, const p |
|
|
copyVertFast(tmp[8], pv3, v3); |
|
|
copyVertFast(tmp[8], pv3, v3); |
|
|
|
|
|
|
|
|
// TODO Optimization: start loading vertices into GTE as soon as we're done with them |
|
|
// TODO Optimization: start loading vertices into GTE as soon as we're done with them |
|
|
|
|
|
// TODO Even better: make lerpVert also push the resulting XYZ straight to the C2_VXYZn data registers |
|
|
|
|
|
// NOTE: could separate XYZ lerping from UVL lerping, as XYZ needs to be transformed whereas UVL doesn't. UVL lerping might also be faster on CPU. |
|
|
lerpVert2(&tmp[1], &tmp[0], &tmp[2]); |
|
|
lerpVert2(&tmp[1], &tmp[0], &tmp[2]); |
|
|
lerpVert2(&tmp[5], &tmp[2], &tmp[8]); // After this, 0 1 2 are ready for GTE |
|
|
lerpVert2(&tmp[5], &tmp[2], &tmp[8]); // After this, 0 1 2 are ready for GTE |
|
|
lerpVert2(&tmp[3], &tmp[0], &tmp[6]); |
|
|
lerpVert2(&tmp[3], &tmp[0], &tmp[6]); |
|
|
@ -517,7 +523,7 @@ static INLINE void draw_quadstrip_tess2(const ps1bsp_vertex_t *vertices, const p |
|
|
gte_stsxy1(&tmp[7].vx); |
|
|
gte_stsxy1(&tmp[7].vx); |
|
|
gte_stsxy2(&tmp[8].vx); |
|
|
gte_stsxy2(&tmp[8].vx); |
|
|
|
|
|
|
|
|
// TODO Optimization: interperse quad building instructions while waiting on GTE store/load delays |
|
|
|
|
|
|
|
|
// TODO Optimization: intersperse quad building instructions while waiting on GTE store/load delays |
|
|
|
|
|
|
|
|
// Draw the first quad |
|
|
// Draw the first quad |
|
|
p0 = (POLY_GT4*)mem_prim(sizeof(POLY_GT4)); |
|
|
p0 = (POLY_GT4*)mem_prim(sizeof(POLY_GT4)); |
|
|
|