diff --git a/draw.h b/draw.h index 6e4bbf3..fc96e23 100644 --- a/draw.h +++ b/draw.h @@ -384,6 +384,8 @@ typedef struct _TMPVERT blit32(&(dst).vz, &(v)->vz); \ blit16(&(dst).u, &(pv)->u); +// Vertex interpolation using GTE macros +// Interpolates XYZ, RGB and UV separately #define lerpVert(dst, src0, src1) \ gte_ld_intpol_sv1(&(src0).vx); \ gte_ld_intpol_sv0(&(src1).vx); \ @@ -398,6 +400,9 @@ typedef struct _TMPVERT gte_intpl(); \ gte_stbv(&(dst).u); +// Vertex interpolation using pure assembly +// Combines UV and R into a single vector, thus requiring only two interpolates +// Outputs the interpolated light value directly to the R, G and B fields of dst #define lerpVert2(dst, src0, src1) __asm__ volatile ( \ "lh $12, 0( %1 );" \ "lh $13, 2( %1 );" \ @@ -467,7 +472,6 @@ static INLINE void draw_quadstrip_tess2(const ps1bsp_vertex_t *vertices, const p gte_lddp(ONE >> 1); // Normally a quad strip would have (N-2)/2 quads, but we might end up with a sole triangle at the end which will be drawn as a collapsed quad - // NOTE: testing has shown that the PS1 is faster just rendering quads and accepting the odd collapsed quad, rather than being clever with pointer comparisons and drawing a single triangle at the end. u_char numQuads = (numVerts - 1) >> 1; for (u_char quadIdx = 0; quadIdx < numQuads; ++quadIdx) { @@ -492,6 +496,8 @@ static INLINE void draw_quadstrip_tess2(const ps1bsp_vertex_t *vertices, const p copyVertFast(tmp[8], pv3, v3); // TODO Optimization: start loading vertices into GTE as soon as we're done with them + // TODO Even better: make lerpVert also push the resulting XYZ straight to the C2_VXYZn data registers + // NOTE: could separate XYZ lerping from UVL lerping, as XYZ needs to be transformed whereas UVL doesn't. UVL lerping might also be faster on CPU. 
lerpVert2(&tmp[1], &tmp[0], &tmp[2]); lerpVert2(&tmp[5], &tmp[2], &tmp[8]); // After this, 0 1 2 are ready for GTE lerpVert2(&tmp[3], &tmp[0], &tmp[6]); @@ -517,7 +523,7 @@ static INLINE void draw_quadstrip_tess2(const ps1bsp_vertex_t *vertices, const p gte_stsxy1(&tmp[7].vx); gte_stsxy2(&tmp[8].vx); - // TODO Optimization: interperse quad building instructions while waiting on GTE store/load delays + // TODO Optimization: intersperse quad building instructions while waiting on GTE store/load delays // Draw the first quad p0 = (POLY_GT4*)mem_prim(sizeof(POLY_GT4)); diff --git a/world.c b/world.c index 0b75cf9..3bc1da8 100644 --- a/world.c +++ b/world.c @@ -82,6 +82,7 @@ static INLINE short world_cull_backface(const world_t *world, const ps1bsp_face_ // Check if the face is behind the camera // NOTE: disabling the behind-the-camera check does *not* actually solve the problem of polygons disappearing when too close to the camera! // This means that the GPU probably already clips polygons that stretch too far outside the drawing area. Tessellation should solve this. + // UPDATE: tessellation does solve this, to an extent. However we must also make sure not to discard faces partially inside the frustum, and force tessellation on those. int camDot = m_dot12(&cam_vec, &cam_dir); if (camDot < 0) return 0; @@ -322,6 +323,12 @@ static void world_sortModels(const world_t *world) ps1bsp_model_t* model = (ps1bsp_model_t*)&world->models[modelIdx]; ps1bsp_leaf_t* leaf = (ps1bsp_leaf_t*)model->currentLeaf; + // TODO: skip clip brush-only models (clip and trigger textures) + // TODO: frustum culling (may be more expensive than it's worth) + // TODO: instead of leaf, refer to a node, i.e. the first one that splits the model's bounding box/sphere (R_FindTopnode in Q2 code) + // Q2 distinguishes between models that fall entirely in one leaf (i.e. 
small ones) and ones split by a node; the latter are clipped by R_DrawSolidClippedSubmodelPolygons + // Could probably traverse BSP tree from top node and divide up model faces per leaf, and insert them as the leaf faces are sorted + // Update the model's current leaf. This only needs to be done when the model moves. if (leaf == NULL) {