mirror of
https://github.com/stenzek/duckstation.git
synced 2025-06-07 12:05:52 +00:00
GPU/HW: Vectorize flipped sprite handling
This commit is contained in:
parent
1a211e0a21
commit
b7832e609f
@ -2101,30 +2101,33 @@ ALWAYS_INLINE_RELEASE void GPU_HW::DrawBatchVertices(BatchRenderMode render_mode
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Computes the screen-space partial derivatives of the texture coordinates for the triangle formed by
// vertices[0..2], together with the triangle's screen-space and texture-space areas.
//
// Only the first three vertices are read. The four derivative outputs are NOT divided by the area; the
// callers visible in this change divide (or multiply by the reciprocal of) *xy_area themselves, after
// checking that it is non-zero.
//
//   dudx / dudy - unnormalized change in U along X / Y.
//   dvdx / dvdy - unnormalized change in V along X / Y.
//   xy_area     - cross product of two screen-space edges (twice the signed triangle area); zero for a
//                 degenerate triangle.
//   uv_area     - the same cross product in texture space, computed with integer arithmetic; zero when the
//                 triangle covers no texture area.
ALWAYS_INLINE_RELEASE void GPU_HW::ComputeUVPartialDerivatives(const BatchVertex* vertices, float* dudx, float* dudy,
|
||||||
|
float* dvdx, float* dvdy, float* xy_area, s32* uv_area)
|
||||||
|
{
|
||||||
|
// Screen-space edge vectors: 0->1, 1->2, and (despite the "v23" name) 2->0.
const float v01x = vertices[1].x - vertices[0].x;
|
||||||
|
const float v01y = vertices[1].y - vertices[0].y;
|
||||||
|
const float v12x = vertices[2].x - vertices[1].x;
|
||||||
|
const float v12y = vertices[2].y - vertices[1].y;
|
||||||
|
const float v23x = vertices[0].x - vertices[2].x;
|
||||||
|
const float v23y = vertices[0].y - vertices[2].y;
|
||||||
|
// Texture coordinates converted to float so the derivative math below stays in floating point.
const float v0u = static_cast<float>(vertices[0].u);
|
||||||
|
const float v0v = static_cast<float>(vertices[0].v);
|
||||||
|
const float v1u = static_cast<float>(vertices[1].u);
|
||||||
|
const float v1v = static_cast<float>(vertices[1].v);
|
||||||
|
const float v2u = static_cast<float>(vertices[2].u);
|
||||||
|
const float v2v = static_cast<float>(vertices[2].v);
|
||||||
|
// Derivative numerators: each edge is weighted by the texture coordinate of the vertex opposite to it.
*dudx = -v01y * v2u - v12y * v0u - v23y * v1u;
|
||||||
|
*dvdx = -v01y * v2v - v12y * v0v - v23y * v1v;
|
||||||
|
*dudy = v01x * v2u + v12x * v0u + v23x * v1u;
|
||||||
|
*dvdy = v01x * v2v + v12x * v0v + v23x * v1v;
|
||||||
|
// Common denominator for the four values above; left to the caller to apply.
*xy_area = v12x * v23y - v12y * v23x;
|
||||||
|
// Texture-space cross product of the edges leaving vertex 0, evaluated on the raw (non-float) u/v values.
*uv_area = (vertices[1].u - vertices[0].u) * (vertices[2].v - vertices[0].v) -
|
||||||
|
(vertices[2].u - vertices[0].u) * (vertices[1].v - vertices[0].v);
|
||||||
|
}
|
||||||
|
|
||||||
ALWAYS_INLINE_RELEASE void GPU_HW::HandleFlippedQuadTextureCoordinates(const GPUBackendDrawCommand* cmd,
|
ALWAYS_INLINE_RELEASE void GPU_HW::HandleFlippedQuadTextureCoordinates(const GPUBackendDrawCommand* cmd,
|
||||||
BatchVertex* vertices)
|
BatchVertex* vertices)
|
||||||
{
|
{
|
||||||
// Taken from beetle-psx gpu_polygon.cpp
|
|
||||||
// For X/Y flipped 2D sprites, PSX games rely on a very specific rasterization behavior. If U or V is decreasing in X
|
|
||||||
// or Y, and we use the provided U/V as is, we will sample the wrong texel as interpolation covers an entire pixel,
|
|
||||||
// while PSX samples its interpolation essentially in the top-left corner and splats that interpolant across the
|
|
||||||
// entire pixel. While we could emulate this reasonably well in native resolution by shifting our vertex coords by
|
|
||||||
// 0.5, this breaks in upscaling scenarios, because we have several samples per native sample and we need NN rules to
|
|
||||||
// hit the same UV every time. One approach here is to use interpolate at offset or similar tricks to generalize the
|
|
||||||
// PSX interpolation patterns, but the problem is that vertices sharing an edge will no longer see the same UV (due to
|
|
||||||
// different plane derivatives), we end up sampling outside the intended boundary and artifacts are inevitable, so the
|
|
||||||
// only case where we can apply this fixup is for "sprites" or similar which should not share edges, which leads to
|
|
||||||
// this unfortunate code below.
|
|
||||||
|
|
||||||
// It might be faster to do more direct checking here, but the code below handles primitives in any order and
|
|
||||||
// orientation, and is far more SIMD-friendly if needed.
|
|
||||||
const float abx = vertices[1].x - vertices[0].x;
|
|
||||||
const float aby = vertices[1].y - vertices[0].y;
|
|
||||||
const float bcx = vertices[2].x - vertices[1].x;
|
|
||||||
const float bcy = vertices[2].y - vertices[1].y;
|
|
||||||
const float cax = vertices[0].x - vertices[2].x;
|
|
||||||
const float cay = vertices[0].y - vertices[2].y;
|
|
||||||
|
|
||||||
// Hack for Wild Arms 2: The player sprite is drawn one line at a time with a quad, but the bottom V coordinates
|
// Hack for Wild Arms 2: The player sprite is drawn one line at a time with a quad, but the bottom V coordinates
|
||||||
// are set to a large distance from the top V coordinate. When upscaling, this means that the coordinate is
|
// are set to a large distance from the top V coordinate. When upscaling, this means that the coordinate is
|
||||||
// interpolated between these two values, resulting in out-of-bounds sampling. At native, it's fine, because at the
|
// interpolated between these two values, resulting in out-of-bounds sampling. At native, it's fine, because at the
|
||||||
@ -2143,63 +2146,47 @@ ALWAYS_INLINE_RELEASE void GPU_HW::HandleFlippedQuadTextureCoordinates(const GPU
|
|||||||
vertices[3].v = vertices[0].v;
|
vertices[3].v = vertices[0].v;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Compute static derivatives, just assume W is uniform across the primitive and that the plane equation remains the
|
// Handle interpolation differences between PC GPUs and the PSX GPU. The first pixel on each span/scanline is given
|
||||||
// same across the quad. (which it is, there is no Z.. yet).
|
// the initial U/V coordinate without any further interpolation on the PSX GPU, in contrast to PC GPUs. This results
|
||||||
const float dudx = -aby * static_cast<float>(vertices[2].u) - bcy * static_cast<float>(vertices[0].u) -
|
// in oversampling on the right edge, so compensate by offsetting the left (right in texture space) UV.
|
||||||
cay * static_cast<float>(vertices[1].u);
|
alignas(VECTOR_ALIGNMENT) float pd[4];
|
||||||
const float dvdx = -aby * static_cast<float>(vertices[2].v) - bcy * static_cast<float>(vertices[0].v) -
|
float xy_area;
|
||||||
cay * static_cast<float>(vertices[1].v);
|
s32 uv_area;
|
||||||
const float dudy = +abx * static_cast<float>(vertices[2].u) + bcx * static_cast<float>(vertices[0].u) +
|
ComputeUVPartialDerivatives(vertices, &pd[0], &pd[1], &pd[2], &pd[3], &xy_area, &uv_area);
|
||||||
cax * static_cast<float>(vertices[1].u);
|
if (xy_area == 0.0f || uv_area == 0)
|
||||||
const float dvdy = +abx * static_cast<float>(vertices[2].v) + bcx * static_cast<float>(vertices[0].v) +
|
|
||||||
cax * static_cast<float>(vertices[1].v);
|
|
||||||
const float area = bcx * cay - bcy * cax;
|
|
||||||
|
|
||||||
// Detect and reject any triangles with 0 size texture area
|
|
||||||
const s32 texArea = (vertices[1].u - vertices[0].u) * (vertices[2].v - vertices[0].v) -
|
|
||||||
(vertices[2].u - vertices[0].u) * (vertices[1].v - vertices[0].v);
|
|
||||||
|
|
||||||
// Shouldn't matter as degenerate primitives will be culled anyways.
|
|
||||||
if (area == 0.0f || texArea == 0)
|
|
||||||
return;
|
return;
|
||||||
|
|
||||||
// Use floats here as it'll be faster than integer divides.
|
const GSVector4 pd_area = GSVector4::load<true>(pd) / GSVector4(xy_area);
|
||||||
const float rcp_area = 1.0f / area;
|
const GSVector4 neg_pd = (pd_area < GSVector4::zero());
|
||||||
const float dudx_area = dudx * rcp_area;
|
const GSVector4 zero_pd = (pd_area == GSVector4::zero());
|
||||||
const float dudy_area = dudy * rcp_area;
|
const int mask = (neg_pd.mask() | (zero_pd.mask() << 4));
|
||||||
const float dvdx_area = dvdx * rcp_area;
|
|
||||||
const float dvdy_area = dvdy * rcp_area;
|
|
||||||
const bool neg_dudx = dudx_area < 0.0f;
|
|
||||||
const bool neg_dudy = dudy_area < 0.0f;
|
|
||||||
const bool neg_dvdx = dvdx_area < 0.0f;
|
|
||||||
const bool neg_dvdy = dvdy_area < 0.0f;
|
|
||||||
const bool zero_dudx = dudx_area == 0.0f;
|
|
||||||
const bool zero_dudy = dudy_area == 0.0f;
|
|
||||||
const bool zero_dvdx = dvdx_area == 0.0f;
|
|
||||||
const bool zero_dvdy = dvdy_area == 0.0f;
|
|
||||||
|
|
||||||
// If we have negative dU or dV in any direction, increment the U or V to work properly with nearest-neighbor in
|
// Addressing the 8-bit status code above.
|
||||||
// this impl. If we don't have 1:1 pixel correspondence, this creates a slight "shift" in the sprite, but we
|
static constexpr int NEG_DUDX = 0x1;
|
||||||
// guarantee that we don't sample garbage at least. Overall, this is kinda hacky because there can be legitimate,
|
static constexpr int NEG_DUDY = 0x2;
|
||||||
// rare cases where 3D meshes hit this scenario, and a single texel offset can pop in, but this is way better than
|
static constexpr int NEG_DVDX = 0x4;
|
||||||
// having borked 2D overall.
|
static constexpr int NEG_DVDY = 0x8;
|
||||||
//
|
static constexpr int ZERO_DUDX = 0x10;
|
||||||
// TODO: If perf becomes an issue, we can probably SIMD the 8 comparisons above,
|
static constexpr int ZERO_DUDY = 0x20;
|
||||||
// create an 8-bit code, and use a LUT to get the offsets.
|
static constexpr int ZERO_DVDX = 0x40;
|
||||||
// Case 1: U is decreasing in X, but no change in Y.
|
static constexpr int ZERO_DVDY = 0x80;
|
||||||
// Case 2: U is decreasing in Y, but no change in X.
|
|
||||||
// Case 3: V is decreasing in X, but no change in Y.
|
// Flipped horizontal sprites: negative dudx+zero dudy or negative dudy+zero dudx.
|
||||||
// Case 4: V is decreasing in Y, but no change in X.
|
if ((mask & (NEG_DUDX | ZERO_DUDY)) == (NEG_DUDX | ZERO_DUDY) ||
|
||||||
if ((neg_dudx && zero_dudy) || (neg_dudy && zero_dudx))
|
(mask & (NEG_DUDY | ZERO_DUDX)) == (NEG_DUDY | ZERO_DUDX))
|
||||||
{
|
{
|
||||||
|
GL_INS_FMT("Horizontal flipped sprite detected at {},{}", vertices[0].x, vertices[0].y);
|
||||||
vertices[0].u++;
|
vertices[0].u++;
|
||||||
vertices[1].u++;
|
vertices[1].u++;
|
||||||
vertices[2].u++;
|
vertices[2].u++;
|
||||||
vertices[3].u++;
|
vertices[3].u++;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((neg_dvdx && zero_dvdy) || (neg_dvdy && zero_dvdx))
|
// Flipped vertical sprites: negative dvdx+zero dvdy or negative dvdy+zero dvdx.
|
||||||
|
if ((mask & (NEG_DVDX | ZERO_DVDY)) == (NEG_DVDX | ZERO_DVDY) ||
|
||||||
|
(mask & (NEG_DVDY | ZERO_DVDX)) == (NEG_DVDY | ZERO_DVDX))
|
||||||
{
|
{
|
||||||
|
GL_INS_FMT("Vertical flipped sprite detected at {},{}", vertices[0].x, vertices[0].y);
|
||||||
vertices[0].v++;
|
vertices[0].v++;
|
||||||
vertices[1].v++;
|
vertices[1].v++;
|
||||||
vertices[2].v++;
|
vertices[2].v++;
|
||||||
@ -2208,32 +2195,24 @@ ALWAYS_INLINE_RELEASE void GPU_HW::HandleFlippedQuadTextureCoordinates(const GPU
|
|||||||
|
|
||||||
// 2D polygons should have zero change in V on the X axis, and vice versa.
|
// 2D polygons should have zero change in V on the X axis, and vice versa.
|
||||||
if (m_allow_sprite_mode)
|
if (m_allow_sprite_mode)
|
||||||
SetBatchSpriteMode(cmd, zero_dudy && zero_dvdx);
|
{
|
||||||
|
const bool is_sprite = (mask & (ZERO_DVDX | ZERO_DUDY)) == (ZERO_DVDX | ZERO_DUDY);
|
||||||
|
SetBatchSpriteMode(cmd, is_sprite);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool GPU_HW::IsPossibleSpritePolygon(const BatchVertex* vertices) const
|
bool GPU_HW::IsPossibleSpritePolygon(const BatchVertex* vertices) const
|
||||||
{
|
{
|
||||||
const float abx = vertices[1].x - vertices[0].x;
|
float dudx, dudy, dvdx, dvdy, xy_area;
|
||||||
const float aby = vertices[1].y - vertices[0].y;
|
s32 uv_area;
|
||||||
const float bcx = vertices[2].x - vertices[1].x;
|
ComputeUVPartialDerivatives(vertices, &dudx, &dudy, &dvdx, &dvdy, &xy_area, &uv_area);
|
||||||
const float bcy = vertices[2].y - vertices[1].y;
|
if (xy_area == 0.0f || uv_area == 0)
|
||||||
const float cax = vertices[0].x - vertices[2].x;
|
|
||||||
const float cay = vertices[0].y - vertices[2].y;
|
|
||||||
const float dvdx = -aby * static_cast<float>(vertices[2].v) - bcy * static_cast<float>(vertices[0].v) -
|
|
||||||
cay * static_cast<float>(vertices[1].v);
|
|
||||||
const float dudy = +abx * static_cast<float>(vertices[2].u) + bcx * static_cast<float>(vertices[0].u) +
|
|
||||||
cax * static_cast<float>(vertices[1].u);
|
|
||||||
const float area = bcx * cay - bcy * cax;
|
|
||||||
const s32 texArea = (vertices[1].u - vertices[0].u) * (vertices[2].v - vertices[0].v) -
|
|
||||||
(vertices[2].u - vertices[0].u) * (vertices[1].v - vertices[0].v);
|
|
||||||
|
|
||||||
// Doesn't matter.
|
|
||||||
if (area == 0.0f || texArea == 0)
|
|
||||||
return m_batch.sprite_mode;
|
return m_batch.sprite_mode;
|
||||||
|
|
||||||
const float rcp_area = 1.0f / area;
|
// Could vectorize this, but it's not really worth it as we're only checking two partial derivatives.
|
||||||
const bool zero_dudy = ((dudy * rcp_area) == 0.0f);
|
const float rcp_xy_area = 1.0f / xy_area;
|
||||||
const bool zero_dvdx = ((dvdx * rcp_area) == 0.0f);
|
const bool zero_dudy = ((dudy * rcp_xy_area) == 0.0f);
|
||||||
|
const bool zero_dvdx = ((dvdx * rcp_xy_area) == 0.0f);
|
||||||
return (zero_dudy && zero_dvdx);
|
return (zero_dudy && zero_dvdx);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -250,6 +250,10 @@ private:
|
|||||||
/// Expands a line into two triangles.
|
/// Expands a line into two triangles.
|
||||||
void DrawLine(const GSVector4 bounds, u32 col0, u32 col1, float depth);
|
void DrawLine(const GSVector4 bounds, u32 col0, u32 col1, float depth);
|
||||||
|
|
||||||
|
/// Computes partial derivatives and area for the given triangle. Needed for sprite/line detection.
|
||||||
|
static void ComputeUVPartialDerivatives(const BatchVertex* vertices, float* dudx, float* dudy, float* dvdx,
|
||||||
|
float* dvdy, float* xy_area, s32* uv_area);
|
||||||
|
|
||||||
/// Handles quads with flipped texture coordinate directions.
|
/// Handles quads with flipped texture coordinate directions.
|
||||||
void HandleFlippedQuadTextureCoordinates(const GPUBackendDrawCommand* cmd, BatchVertex* vertices);
|
void HandleFlippedQuadTextureCoordinates(const GPUBackendDrawCommand* cmd, BatchVertex* vertices);
|
||||||
bool IsPossibleSpritePolygon(const BatchVertex* vertices) const;
|
bool IsPossibleSpritePolygon(const BatchVertex* vertices) const;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user