// duckstation/src/core/gpu_sw_rasterizer.inl
// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <stenzek@gmail.com>
// SPDX-License-Identifier: CC-BY-NC-ND-4.0
#ifdef __INTELLISENSE__
#include "common/gsvector.h"
#include "gpu.h"
#include <algorithm>
#define USE_VECTOR 1
#define GSVECTOR_HAS_SRLV 1
#define GSVECTOR_HAS_256 1
extern GPU_SW_Rasterizer::DitherLUT g_dither_lut;
namespace GPU_SW_Rasterizer {
#endif
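// Note: this .inl is meant to be included by the scalar and per-ISA vectorized rasterizer translation units; the
// enclosing GPU_SW_Rasterizer namespace, USE_VECTOR and the GSVECTOR_HAS_* feature macros are expected to be
// provided by the includer. The __INTELLISENSE__ block above only exists to keep the IDE happy.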
// TODO: UpdateVRAM, FillVRAM, etc.
#ifdef USE_VECTOR
// #define CHECK_VECTOR
#ifdef CHECK_VECTOR
static u16 s_vram_backup[VRAM_WIDTH * VRAM_HEIGHT];
static u16 s_new_vram[VRAM_WIDTH * VRAM_HEIGHT];
static u32 s_bad_counter = 0;
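// When CHECK_VECTOR is defined, the vector drawers bracket themselves with BACKUP_VRAM()/CHECK_VRAM(scalar_call):
// VRAM is saved, the vector path draws, then CHECK_VRAM stashes that result, restores the backup, runs the scalar
// drawer, and reports any per-pixel mismatch between the two outputs.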
#define BACKUP_VRAM() \
do \
{ \
std::memcpy(s_vram_backup, g_vram, sizeof(g_vram)); \
s_bad_counter++; \
} while (0)
#define CHECK_VRAM(drawer) \
do \
{ \
std::memcpy(s_new_vram, g_vram, sizeof(g_vram)); \
std::memcpy(g_vram, s_vram_backup, sizeof(g_vram)); \
\
drawer; \
for (u32 vidx = 0; vidx < (VRAM_WIDTH * VRAM_HEIGHT); vidx++) \
{ \
if (s_new_vram[vidx] != g_vram[vidx]) \
{ \
fprintf(stderr, "[%u] Mismatch at %d,%d, expected %04x got %04x\n", s_bad_counter, (vidx % VRAM_WIDTH), \
(vidx / VRAM_WIDTH), g_vram[vidx], s_new_vram[vidx]); \
AssertMsg(false, "Mismatch"); \
} \
} \
/*Assert(std::memcmp(g_vram, s_new_vram, sizeof(g_vram)) == 0)*/ \
} while (0)
#endif
#endif
[[maybe_unused]] ALWAYS_INLINE_RELEASE static u16 GetPixel(const u32 x, const u32 y)
{
return g_vram[VRAM_WIDTH * y + x];
}
[[maybe_unused]] ALWAYS_INLINE_RELEASE static u16* GetPixelPtr(const u32 x, const u32 y)
{
return &g_vram[VRAM_WIDTH * y + x];
}
[[maybe_unused]] ALWAYS_INLINE_RELEASE static void SetPixel(const u32 x, const u32 y, const u16 value)
{
g_vram[VRAM_WIDTH * y + x] = value;
}
[[maybe_unused]] ALWAYS_INLINE_RELEASE static constexpr std::tuple<u8, u8> UnpackTexcoord(u16 texcoord)
{
return std::make_tuple(static_cast<u8>(texcoord), static_cast<u8>(texcoord >> 8));
}
[[maybe_unused]] ALWAYS_INLINE_RELEASE static constexpr std::tuple<u8, u8, u8> UnpackColorRGB24(u32 rgb24)
{
return std::make_tuple(static_cast<u8>(rgb24), static_cast<u8>(rgb24 >> 8), static_cast<u8>(rgb24 >> 16));
}
template<bool texture_enable, bool raw_texture_enable, bool transparency_enable>
[[maybe_unused]] ALWAYS_INLINE_RELEASE static void ShadePixel(const GPUBackendDrawCommand* cmd, u32 x, u32 y,
u8 color_r, u8 color_g, u8 color_b, u8 texcoord_x,
u8 texcoord_y)
{
u16 color;
if constexpr (texture_enable)
{
// Apply texture window
texcoord_x = (texcoord_x & cmd->window.and_x) | cmd->window.or_x;
texcoord_y = (texcoord_y & cmd->window.and_y) | cmd->window.or_y;
u16 texture_color;
switch (cmd->draw_mode.texture_mode)
{
case GPUTextureMode::Palette4Bit:
{
const u16 palette_value =
GetPixel((cmd->draw_mode.GetTexturePageBaseX() + ZeroExtend32(texcoord_x / 4)) % VRAM_WIDTH,
(cmd->draw_mode.GetTexturePageBaseY() + ZeroExtend32(texcoord_y)) % VRAM_HEIGHT);
const size_t palette_index = (palette_value >> ((texcoord_x % 4) * 4)) & 0x0Fu;
texture_color = g_gpu_clut[palette_index];
}
break;
case GPUTextureMode::Palette8Bit:
{
const u16 palette_value =
GetPixel((cmd->draw_mode.GetTexturePageBaseX() + ZeroExtend32(texcoord_x / 2)) % VRAM_WIDTH,
(cmd->draw_mode.GetTexturePageBaseY() + ZeroExtend32(texcoord_y)) % VRAM_HEIGHT);
const size_t palette_index = (palette_value >> ((texcoord_x % 2) * 8)) & 0xFFu;
texture_color = g_gpu_clut[palette_index];
}
break;
default:
{
texture_color = GetPixel((cmd->draw_mode.GetTexturePageBaseX() + ZeroExtend32(texcoord_x)) % VRAM_WIDTH,
(cmd->draw_mode.GetTexturePageBaseY() + ZeroExtend32(texcoord_y)) % VRAM_HEIGHT);
}
break;
}
if (texture_color == 0)
return;
if constexpr (raw_texture_enable)
{
color = texture_color;
}
else
{
const bool dithering_enable = cmd->draw_mode.dither_enable;
const u32 dither_y = (dithering_enable) ? (y & 3u) : 2u;
const u32 dither_x = (dithering_enable) ? (x & 3u) : 3u;
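      // The dither LUT folds the dither offset, the >> 3 conversion to 5 bits and the clamp into one lookup:
      // g_dither_lut[dy][dx][v] is roughly clamp((v + DITHER_MATRIX[dy][dx]) >> 3, 0, 31). With dithering off,
      // indices (2, 3) select what should be a zero entry of the matrix, i.e. plain truncation. For textured pixels
      // v is (5-bit texel * 8-bit vertex color) >> 4, for untextured pixels it is the raw 8-bit color.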
color =
(ZeroExtend16(g_dither_lut[dither_y][dither_x][(u16(texture_color & 0x1Fu) * u16(color_r)) >> 4]) << 0) |
(ZeroExtend16(g_dither_lut[dither_y][dither_x][(u16((texture_color >> 5) & 0x1Fu) * u16(color_g)) >> 4]) << 5) |
(ZeroExtend16(g_dither_lut[dither_y][dither_x][(u16((texture_color >> 10) & 0x1Fu) * u16(color_b)) >> 4])
<< 10) |
(texture_color & 0x8000u);
}
}
else
{
const bool dithering_enable = cmd->draw_mode.dither_enable;
const u32 dither_y = (dithering_enable) ? (y & 3u) : 2u;
const u32 dither_x = (dithering_enable) ? (x & 3u) : 3u;
// Non-textured transparent polygons don't set bit 15, but are treated as transparent.
color = (ZeroExtend16(g_dither_lut[dither_y][dither_x][color_r]) << 0) |
(ZeroExtend16(g_dither_lut[dither_y][dither_x][color_g]) << 5) |
(ZeroExtend16(g_dither_lut[dither_y][dither_x][color_b]) << 10) | (transparency_enable ? 0x8000u : 0);
}
const u16 bg_color = GetPixel(static_cast<u32>(x), static_cast<u32>(y));
if constexpr (transparency_enable)
{
if (color & 0x8000u || !texture_enable)
{
// Based on blargg's efficient 15bpp pixel math.
u32 bg_bits = ZeroExtend32(bg_color);
u32 fg_bits = ZeroExtend32(color);
switch (cmd->draw_mode.transparency_mode)
{
case GPUTransparencyMode::HalfBackgroundPlusHalfForeground:
{
bg_bits |= 0x8000u;
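        // A sketch of the trick: fg + bg == 2 * (fg & bg) + (fg ^ bg), so clearing the low bit of each 5-bit field
        // in the XOR term (0x0421) before the >> 1 stops the halved sum from bleeding across field boundaries; each
        // field of the result is then exactly (fg_field + bg_field) / 2.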
color = Truncate16(((fg_bits + bg_bits) - ((fg_bits ^ bg_bits) & 0x0421u)) >> 1);
}
break;
case GPUTransparencyMode::BackgroundPlusForeground:
{
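        // Field-parallel saturating add: the carry out of each 5-bit field shows up at bits 5/10/15 (0x8420),
        // (sum - carry) strips those overflow bits from the neighbouring field, and (carry - (carry >> 5)) expands
        // each detected overflow into an all-ones 0x1F mask that clamps the channel to 31. The subtractive mode
        // below uses the same idea with borrows, clamping to 0 instead.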
bg_bits &= ~0x8000u;
const u32 sum = fg_bits + bg_bits;
const u32 carry = (sum - ((fg_bits ^ bg_bits) & 0x8421u)) & 0x8420u;
color = Truncate16((sum - carry) | (carry - (carry >> 5)));
}
break;
case GPUTransparencyMode::BackgroundMinusForeground:
{
bg_bits |= 0x8000u;
fg_bits &= ~0x8000u;
const u32 diff = bg_bits - fg_bits + 0x108420u;
const u32 borrow = (diff - ((bg_bits ^ fg_bits) & 0x108420u)) & 0x108420u;
color = Truncate16((diff - borrow) & (borrow - (borrow >> 5)));
}
break;
case GPUTransparencyMode::BackgroundPlusQuarterForeground:
{
bg_bits &= ~0x8000u;
fg_bits = ((fg_bits >> 2) & 0x1CE7u) | 0x8000u;
const u32 sum = fg_bits + bg_bits;
const u32 carry = (sum - ((fg_bits ^ bg_bits) & 0x8421u)) & 0x8420u;
color = Truncate16((sum - carry) | (carry - (carry >> 5)));
}
break;
default:
break;
}
// See above.
if constexpr (!texture_enable)
color &= ~0x8000u;
}
}
const u16 mask_and = cmd->params.GetMaskAND();
if ((bg_color & mask_and) != 0)
return;
DebugAssert(static_cast<u32>(x) < VRAM_WIDTH && static_cast<u32>(y) < VRAM_HEIGHT);
SetPixel(static_cast<u32>(x), static_cast<u32>(y), color | cmd->params.GetMaskOR());
}
#ifndef USE_VECTOR
template<bool texture_enable, bool raw_texture_enable, bool transparency_enable>
static void DrawRectangle(const GPUBackendDrawRectangleCommand* cmd)
{
const s32 origin_x = cmd->x;
const s32 origin_y = cmd->y;
const auto [r, g, b] = UnpackColorRGB24(cmd->color);
const auto [origin_texcoord_x, origin_texcoord_y] = UnpackTexcoord(cmd->texcoord);
for (u32 offset_y = 0; offset_y < cmd->height; offset_y++)
{
const s32 y = origin_y + static_cast<s32>(offset_y);
if (y < static_cast<s32>(g_drawing_area.top) || y > static_cast<s32>(g_drawing_area.bottom) ||
(cmd->params.interlaced_rendering && cmd->params.active_line_lsb == (Truncate8(static_cast<u32>(y)) & 1u)))
{
continue;
}
const u32 draw_y = static_cast<u32>(y) & VRAM_HEIGHT_MASK;
const u8 texcoord_y = Truncate8(ZeroExtend32(origin_texcoord_y) + offset_y);
for (u32 offset_x = 0; offset_x < cmd->width; offset_x++)
{
const s32 x = origin_x + static_cast<s32>(offset_x);
if (x < static_cast<s32>(g_drawing_area.left) || x > static_cast<s32>(g_drawing_area.right))
continue;
const u8 texcoord_x = Truncate8(ZeroExtend32(origin_texcoord_x) + offset_x);
ShadePixel<texture_enable, raw_texture_enable, transparency_enable>(cmd, static_cast<u32>(x), draw_y, r, g, b,
texcoord_x, texcoord_y);
}
}
}
#else // USE_VECTOR
#ifdef GSVECTOR_HAS_256
using GSVectorNi = GSVector8i;
static constexpr GSVector8i SPAN_OFFSET_VEC = GSVector8i::cxpr(0, 1, 2, 3, 4, 5, 6, 7);
static constexpr GSVector8i SPAN_WIDTH_VEC = GSVector8i::cxpr(1, 2, 3, 4, 5, 6, 7, 8);
static constexpr GSVector8i PIXELS_PER_VEC_VEC = GSVector8i::cxpr(8);
static constexpr u32 PIXELS_PER_VEC = 8;
#else
using GSVectorNi = GSVector4i;
static constexpr GSVector4i SPAN_OFFSET_VEC = GSVector4i::cxpr(0, 1, 2, 3);
static constexpr GSVector4i SPAN_WIDTH_VEC = GSVector4i::cxpr(1, 2, 3, 4);
static constexpr GSVector4i PIXELS_PER_VEC_VEC = GSVector4i::cxpr(4);
static constexpr u32 PIXELS_PER_VEC = 4;
#endif
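// With 256-bit vectors we shade 8 pixels per iteration, otherwise 4. SPAN_OFFSET_VEC holds the per-lane X offsets
// within a vector and SPAN_WIDTH_VEC the cumulative pixel count; both feed the preserve mask for lanes that fall
// outside the span or the drawing area. The Gather*/Load/StoreVector helpers below emulate gathers/scatters with
// per-lane extract+insert, and wrap X at the right edge of VRAM when a vector would run past it.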
#ifdef GSVECTOR_HAS_256
ALWAYS_INLINE_RELEASE static GSVector8i GatherVector(GSVector8i coord_x, GSVector8i coord_y)
{
const GSVector8i offsets = coord_y.sll32<10>().add32(coord_x); // y * 1024 + x
GSVector8i pixels = GSVector8i::zext32(g_vram[static_cast<u32>(offsets.extract32<0>())]);
pixels = pixels.insert16<2>(g_vram[static_cast<u32>(offsets.extract32<1>())]);
pixels = pixels.insert16<4>(g_vram[static_cast<u32>(offsets.extract32<2>())]);
pixels = pixels.insert16<6>(g_vram[static_cast<u32>(offsets.extract32<3>())]);
pixels = pixels.insert16<8>(g_vram[static_cast<u32>(offsets.extract32<4>())]);
pixels = pixels.insert16<10>(g_vram[static_cast<u32>(offsets.extract32<5>())]);
pixels = pixels.insert16<12>(g_vram[static_cast<u32>(offsets.extract32<6>())]);
pixels = pixels.insert16<14>(g_vram[static_cast<u32>(offsets.extract32<7>())]);
return pixels;
}
template<u32 mask>
ALWAYS_INLINE_RELEASE static GSVector8i GatherCLUTVector(GSVector8i indices, GSVector8i shifts)
{
const GSVector8i offsets = indices.srlv32(shifts) & GSVector8i::cxpr(mask);
GSVector8i pixels = GSVector8i::zext32(g_gpu_clut[static_cast<u32>(offsets.extract32<0>())]);
pixels = pixels.insert16<2>(g_gpu_clut[static_cast<u32>(offsets.extract32<1>())]);
pixels = pixels.insert16<4>(g_gpu_clut[static_cast<u32>(offsets.extract32<2>())]);
pixels = pixels.insert16<6>(g_gpu_clut[static_cast<u32>(offsets.extract32<3>())]);
pixels = pixels.insert16<8>(g_gpu_clut[static_cast<u32>(offsets.extract32<4>())]);
pixels = pixels.insert16<10>(g_gpu_clut[static_cast<u32>(offsets.extract32<5>())]);
pixels = pixels.insert16<12>(g_gpu_clut[static_cast<u32>(offsets.extract32<6>())]);
pixels = pixels.insert16<14>(g_gpu_clut[static_cast<u32>(offsets.extract32<7>())]);
return pixels;
}
ALWAYS_INLINE_RELEASE static GSVector8i LoadVector(u32 x, u32 y)
{
// TODO: Split into high/low
if (x <= (VRAM_WIDTH - 8))
{
return GSVector8i::u16to32(GSVector4i::load<false>(&g_vram[y * VRAM_WIDTH + x]));
}
else
{
// TODO: Avoid loads for masked pixels if a contiguous region is masked
const u16* line = &g_vram[y * VRAM_WIDTH];
GSVector8i pixels = GSVector8i::zero();
pixels = pixels.insert16<0>(line[(x++) & VRAM_WIDTH_MASK]);
pixels = pixels.insert16<2>(line[(x++) & VRAM_WIDTH_MASK]);
pixels = pixels.insert16<4>(line[(x++) & VRAM_WIDTH_MASK]);
pixels = pixels.insert16<6>(line[(x++) & VRAM_WIDTH_MASK]);
pixels = pixels.insert16<8>(line[(x++) & VRAM_WIDTH_MASK]);
pixels = pixels.insert16<10>(line[(x++) & VRAM_WIDTH_MASK]);
pixels = pixels.insert16<12>(line[(x++) & VRAM_WIDTH_MASK]);
pixels = pixels.insert16<14>(line[(x++) & VRAM_WIDTH_MASK]);
return pixels;
}
}
ALWAYS_INLINE_RELEASE static void StoreVector(u32 x, u32 y, GSVector8i color)
{
// TODO: Split into high/low
const GSVector4i packed = color.low128().pu32(color.high128());
if (x <= (VRAM_WIDTH - 8))
{
GSVector4i::store<false>(&g_vram[y * VRAM_WIDTH + x], packed);
}
else
{
// TODO: Avoid stores for masked pixels if a contiguous region is masked
u16* line = &g_vram[y * VRAM_WIDTH];
line[(x++) & VRAM_WIDTH_MASK] = Truncate16(packed.extract16<0>());
line[(x++) & VRAM_WIDTH_MASK] = Truncate16(packed.extract16<1>());
line[(x++) & VRAM_WIDTH_MASK] = Truncate16(packed.extract16<2>());
line[(x++) & VRAM_WIDTH_MASK] = Truncate16(packed.extract16<3>());
line[(x++) & VRAM_WIDTH_MASK] = Truncate16(packed.extract16<4>());
line[(x++) & VRAM_WIDTH_MASK] = Truncate16(packed.extract16<5>());
line[(x++) & VRAM_WIDTH_MASK] = Truncate16(packed.extract16<6>());
line[(x++) & VRAM_WIDTH_MASK] = Truncate16(packed.extract16<7>());
}
}
#else
ALWAYS_INLINE_RELEASE static GSVector4i GatherVector(GSVector4i coord_x, GSVector4i coord_y)
{
const GSVector4i offsets = coord_y.sll32<10>().add32(coord_x); // y * 1024 + x
// Clang seems to optimize this directly into pextrd+pinsrw, good.
GSVector4i pixels = GSVector4i::zext32(g_vram[static_cast<u32>(offsets.extract32<0>())]);
pixels = pixels.insert16<2>(g_vram[static_cast<u32>(offsets.extract32<1>())]);
pixels = pixels.insert16<4>(g_vram[static_cast<u32>(offsets.extract32<2>())]);
pixels = pixels.insert16<6>(g_vram[static_cast<u32>(offsets.extract32<3>())]);
return pixels;
}
template<u32 mask>
ALWAYS_INLINE_RELEASE static GSVector4i GatherCLUTVector(GSVector4i indices, GSVector4i shifts)
{
#ifdef GSVECTOR_HAS_SRLV
  // Everywhere except RISC-V, we can do the shl 1 (* 2) as part of the load instruction.
const GSVector4i offsets = indices.srlv32(shifts) & GSVector4i::cxpr(mask);
GSVector4i pixels = GSVector4i::zext32(g_gpu_clut[static_cast<u32>(offsets.extract32<0>())]);
pixels = pixels.insert16<2>(g_gpu_clut[static_cast<u32>(offsets.extract32<1>())]);
pixels = pixels.insert16<4>(g_gpu_clut[static_cast<u32>(offsets.extract32<2>())]);
pixels = pixels.insert16<6>(g_gpu_clut[static_cast<u32>(offsets.extract32<3>())]);
return pixels;
#else
  // Without variable shifts, it's probably quicker to do this without vectors: otherwise we would need 4 separate
  // vector shifts, as well as broadcasting the shift amounts...
// Clang seems to turn this into a bunch of extracts, and skips memory. Nice.
alignas(VECTOR_ALIGNMENT) s32 indices_array[4], shifts_array[4];
GSVector4i::store<true>(indices_array, indices);
GSVector4i::store<true>(shifts_array, shifts);
GSVector4i pixels = GSVector4i::zext32(g_gpu_clut[((indices_array[0] >> shifts_array[0]) & mask)]);
pixels = pixels.insert16<2>(g_gpu_clut[((indices_array[1] >> shifts_array[1]) & mask)]);
pixels = pixels.insert16<4>(g_gpu_clut[((indices_array[2] >> shifts_array[2]) & mask)]);
pixels = pixels.insert16<6>(g_gpu_clut[((indices_array[3] >> shifts_array[3]) & mask)]);
return pixels;
#endif
}
ALWAYS_INLINE_RELEASE static GSVector4i LoadVector(u32 x, u32 y)
{
if (x <= (VRAM_WIDTH - 4))
{
return GSVector4i::loadl(&g_vram[y * VRAM_WIDTH + x]).u16to32();
}
else
{
const u16* line = &g_vram[y * VRAM_WIDTH];
GSVector4i pixels = GSVector4i(line[(x++) & VRAM_WIDTH_MASK]);
pixels = pixels.insert16<2>(line[(x++) & VRAM_WIDTH_MASK]);
pixels = pixels.insert16<4>(line[(x++) & VRAM_WIDTH_MASK]);
pixels = pixels.insert16<6>(line[x & VRAM_WIDTH_MASK]);
return pixels;
}
}
ALWAYS_INLINE_RELEASE static void StoreVector(u32 x, u32 y, GSVector4i color)
{
const GSVector4i packed_color = color.pu32();
if (x <= (VRAM_WIDTH - 4))
{
GSVector4i::storel(&g_vram[y * VRAM_WIDTH + x], packed_color);
}
else
{
u16* line = &g_vram[y * VRAM_WIDTH];
line[(x++) & VRAM_WIDTH_MASK] = Truncate16(packed_color.extract16<0>());
line[(x++) & VRAM_WIDTH_MASK] = Truncate16(packed_color.extract16<1>());
line[(x++) & VRAM_WIDTH_MASK] = Truncate16(packed_color.extract16<2>());
line[x & VRAM_WIDTH_MASK] = Truncate16(packed_color.extract16<3>());
}
}
#endif
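// The vector color math below keeps each pixel's channels split across two vectors: 'rg' holds R in the low 16 bits
// and G in the high 16 bits of each 32-bit lane, 'ba' likewise holds B and the mask/alpha bit. This lets the
// texture * vertex-color modulation use per-channel 16-bit multiplies. The two helpers below unpack 15-bit RGB5A1
// into that layout and pack it back.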
ALWAYS_INLINE_RELEASE static void RGB5A1ToRG_BA(GSVectorNi rgb5a1, GSVectorNi& rg, GSVectorNi& ba)
{
rg = rgb5a1 & GSVectorNi::cxpr(0x1F); // R | R | R | R
rg = rg | (rgb5a1 & GSVectorNi::cxpr(0x3E0)).sll32<11>(); // R0G0 | R0G0 | R0G0 | R0G0
ba = rgb5a1.srl32<10>() & GSVectorNi::cxpr(0x1F); // B | B | B | B
ba = ba | (rgb5a1 & GSVectorNi::cxpr(0x8000)).sll32<1>(); // B0A0 | B0A0 | B0A0 | B0A0
}
ALWAYS_INLINE_RELEASE static GSVectorNi RG_BAToRGB5A1(GSVectorNi rg, GSVectorNi ba)
{
GSVectorNi res;
res = rg & GSVectorNi::cxpr(0x1F); // R | R | R | R
res = res | (rg.srl32<11>() & GSVectorNi::cxpr(0x3E0)); // RG | RG | RG | RG
res = res | ((ba & GSVectorNi::cxpr(0x1F)).sll32<10>()); // RGB | RGB | RGB | RGB
res = res | ba.srl32<16>().sll32<15>(); // RGBA | RGBA | RGBA | RGBA
return res;
}
// Color repeated twice for RG packing, then the row duplicated so we can load based on the X offset.
alignas(VECTOR_ALIGNMENT) static constexpr s16 VECTOR_DITHER_MATRIX[4][16] = {
#define P(m, n) static_cast<s16>(DITHER_MATRIX[m][n]), static_cast<s16>(DITHER_MATRIX[m][n])
#define R(m) P(m, 0), P(m, 1), P(m, 2), P(m, 3), P(m, 0), P(m, 1), P(m, 2), P(m, 3)
{R(0)}, {R(1)}, {R(2)}, {R(3)}
#undef R
#undef P
};
namespace {
template<bool texture_enable>
struct PixelVectors
{
struct UnusedField
{
};
GSVectorNi clip_left;
GSVectorNi clip_right;
GSVectorNi mask_and;
GSVectorNi mask_or;
typename std::conditional_t<texture_enable, GSVectorNi, UnusedField> texture_window_and_x;
typename std::conditional_t<texture_enable, GSVectorNi, UnusedField> texture_window_or_x;
typename std::conditional_t<texture_enable, GSVectorNi, UnusedField> texture_window_and_y;
typename std::conditional_t<texture_enable, GSVectorNi, UnusedField> texture_window_or_y;
typename std::conditional_t<texture_enable, GSVectorNi, UnusedField> texture_base_x;
typename std::conditional_t<texture_enable, GSVectorNi, UnusedField> texture_base_y;
PixelVectors(const GPUBackendDrawCommand* cmd)
{
clip_left = GSVectorNi(g_drawing_area.left);
clip_right = GSVectorNi(g_drawing_area.right);
mask_and = GSVectorNi(cmd->params.GetMaskAND());
mask_or = GSVectorNi(cmd->params.GetMaskOR());
if constexpr (texture_enable)
{
texture_window_and_x = GSVectorNi(cmd->window.and_x);
texture_window_or_x = GSVectorNi(cmd->window.or_x);
texture_window_and_y = GSVectorNi(cmd->window.and_y);
texture_window_or_y = GSVectorNi(cmd->window.or_y);
texture_base_x = GSVectorNi(cmd->draw_mode.GetTexturePageBaseX());
texture_base_y = GSVectorNi(cmd->draw_mode.GetTexturePageBaseY());
}
}
};
} // namespace
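// Vectorized ShadePixel: shades PIXELS_PER_VEC horizontally adjacent pixels starting at start_x on row y.
// vertex_color_rg/ba arrive pre-packed in the RG/BA layout above, dither holds the pre-duplicated dither row for
// this scanline, and preserve_mask has all-ones lanes for pixels that must be left untouched (outside the span or
// drawing area); transparent texels and mask-bit-protected pixels get OR'd into it below.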
template<bool texture_enable, bool raw_texture_enable, bool transparency_enable>
ALWAYS_INLINE_RELEASE static void
ShadePixel(const PixelVectors<texture_enable>& pv, GPUTextureMode texture_mode, GPUTransparencyMode transparency_mode,
u32 start_x, u32 y, GSVectorNi vertex_color_rg, GSVectorNi vertex_color_ba, GSVectorNi texcoord_x,
GSVectorNi texcoord_y, GSVectorNi preserve_mask, GSVectorNi dither)
{
static constexpr GSVectorNi coord_mask_x = GSVectorNi::cxpr(VRAM_WIDTH_MASK);
static constexpr GSVectorNi coord_mask_y = GSVectorNi::cxpr(VRAM_HEIGHT_MASK);
GSVectorNi color;
if constexpr (texture_enable)
{
// Apply texture window
texcoord_x = (texcoord_x & pv.texture_window_and_x) | pv.texture_window_or_x;
texcoord_y = (texcoord_y & pv.texture_window_and_y) | pv.texture_window_or_y;
texcoord_y = pv.texture_base_y.add32(texcoord_y) & coord_mask_y;
GSVectorNi texture_color;
switch (texture_mode)
{
case GPUTextureMode::Palette4Bit:
{
GSVectorNi load_texcoord_x = texcoord_x.srl32<2>();
load_texcoord_x = pv.texture_base_x.add32(load_texcoord_x);
load_texcoord_x = load_texcoord_x & coord_mask_x;
const GSVectorNi palette_shift = (texcoord_x & GSVectorNi::cxpr(3)).sll32<2>();
const GSVectorNi palette_indices = GatherVector(load_texcoord_x, texcoord_y);
texture_color = GatherCLUTVector<0x0F>(palette_indices, palette_shift);
}
break;
case GPUTextureMode::Palette8Bit:
{
GSVectorNi load_texcoord_x = texcoord_x.srl32<1>();
load_texcoord_x = pv.texture_base_x.add32(load_texcoord_x);
load_texcoord_x = load_texcoord_x & coord_mask_x;
const GSVectorNi palette_shift = (texcoord_x & GSVectorNi::cxpr(1)).sll32<3>();
const GSVectorNi palette_indices = GatherVector(load_texcoord_x, texcoord_y);
texture_color = GatherCLUTVector<0xFF>(palette_indices, palette_shift);
}
break;
default:
{
texcoord_x = pv.texture_base_x.add32(texcoord_x);
texcoord_x = texcoord_x & coord_mask_x;
texture_color = GatherVector(texcoord_x, texcoord_y);
}
break;
}
    // check for zero texture colour across all pixels in the vector, early out if so
const GSVectorNi texture_transparent_mask = texture_color.eq32(GSVectorNi::zero());
if (texture_transparent_mask.alltrue())
return;
preserve_mask = preserve_mask | texture_transparent_mask;
if constexpr (raw_texture_enable)
{
color = texture_color;
}
else
{
GSVectorNi trg, tba;
RGB5A1ToRG_BA(texture_color, trg, tba);
      // now we have both the texture and vertex color in RG/BA pairs for each pixel in the vector, which we can multiply
GSVectorNi rg = trg.mul16l(vertex_color_rg);
GSVectorNi ba = tba.mul16l(vertex_color_ba);
// Convert to 5bit.
rg = rg.sra16<4>().add16(dither).max_s16(GSVectorNi::zero()).sra16<3>();
ba = ba.sra16<4>().add16(dither).max_s16(GSVectorNi::zero()).sra16<3>();
// Bit15 gets passed through as-is.
ba = ba.blend16<0xaa>(tba);
// Clamp to 5bit.
static constexpr GSVectorNi colclamp = GSVectorNi::cxpr16(0x1F);
rg = rg.min_u16(colclamp);
ba = ba.min_u16(colclamp);
// And interleave back to 16bpp.
color = RG_BAToRGB5A1(rg, ba);
}
}
else
{
// Non-textured transparent polygons don't set bit 15, but are treated as transparent.
GSVectorNi rg = vertex_color_rg.add16(dither).max_s16(GSVectorNi::zero()).sra16<3>();
GSVectorNi ba = vertex_color_ba.add16(dither).max_s16(GSVectorNi::zero()).sra16<3>();
    // Clamp to 5bit. Using a 32-bit constant for BA also clamps the A half of each lane to zero.
rg = rg.min_u16(GSVectorNi::cxpr16(0x1F));
ba = ba.min_u16(GSVectorNi::cxpr(0x1F));
// And interleave back to 16bpp.
color = RG_BAToRGB5A1(rg, ba);
}
GSVectorNi bg_color = LoadVector(start_x, y);
if constexpr (transparency_enable)
{
[[maybe_unused]] GSVectorNi transparent_mask;
if constexpr (texture_enable)
{
// Compute transparent_mask, ffff per lane if transparent otherwise 0000
transparent_mask = color.sra16<15>();
}
// TODO: We don't need to OR color here with 0x8000 for textures.
// 0x8000 is added to match serial path.
GSVectorNi blended_color;
switch (transparency_mode)
{
case GPUTransparencyMode::HalfBackgroundPlusHalfForeground:
{
const GSVectorNi fg_bits = color | GSVectorNi::cxpr(0x8000u);
const GSVectorNi bg_bits = bg_color | GSVectorNi::cxpr(0x8000u);
const GSVectorNi res = fg_bits.add32(bg_bits).sub32((fg_bits ^ bg_bits) & GSVectorNi::cxpr(0x0421u)).srl32<1>();
blended_color = res & GSVectorNi::cxpr(0xffff);
}
break;
case GPUTransparencyMode::BackgroundPlusForeground:
{
const GSVectorNi fg_bits = color | GSVectorNi::cxpr(0x8000u);
const GSVectorNi bg_bits = bg_color & GSVectorNi::cxpr(0x7FFFu);
const GSVectorNi sum = fg_bits.add32(bg_bits);
const GSVectorNi carry =
(sum.sub32((fg_bits ^ bg_bits) & GSVectorNi::cxpr(0x8421u))) & GSVectorNi::cxpr(0x8420u);
const GSVectorNi res = sum.sub32(carry) | carry.sub32(carry.srl32<5>());
blended_color = res & GSVectorNi::cxpr(0xffff);
}
break;
case GPUTransparencyMode::BackgroundMinusForeground:
{
const GSVectorNi bg_bits = bg_color | GSVectorNi::cxpr(0x8000u);
const GSVectorNi fg_bits = color & GSVectorNi::cxpr(0x7FFFu);
const GSVectorNi diff = bg_bits.sub32(fg_bits).add32(GSVectorNi::cxpr(0x108420u));
const GSVectorNi borrow =
diff.sub32((bg_bits ^ fg_bits) & GSVectorNi::cxpr(0x108420u)) & GSVectorNi::cxpr(0x108420u);
const GSVectorNi res = diff.sub32(borrow) & borrow.sub32(borrow.srl32<5>());
blended_color = res & GSVectorNi::cxpr(0xffff);
}
break;
case GPUTransparencyMode::BackgroundPlusQuarterForeground:
default:
{
const GSVectorNi bg_bits = bg_color & GSVectorNi::cxpr(0x7FFFu);
const GSVectorNi fg_bits =
((color | GSVectorNi::cxpr(0x8000)).srl32<2>() & GSVectorNi::cxpr(0x1CE7u)) | GSVectorNi::cxpr(0x8000u);
const GSVectorNi sum = fg_bits.add32(bg_bits);
const GSVectorNi carry = sum.sub32((fg_bits ^ bg_bits) & GSVectorNi::cxpr(0x8421u)) & GSVectorNi::cxpr(0x8420u);
const GSVectorNi res = sum.sub32(carry) | carry.sub32(carry.srl32<5>());
blended_color = res & GSVectorNi::cxpr(0xffff);
}
break;
}
// select blended pixels for transparent pixels, otherwise consider opaque
if constexpr (texture_enable)
color = color.blend8(blended_color, transparent_mask);
else
color = blended_color & GSVectorNi::cxpr(0x7fff);
}
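  // Merge: lanes flagged in preserve_mask keep the original bg_color; all other lanes take the shaded color with
  // the mask OR bit applied.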
GSVectorNi mask_bits_set = bg_color & pv.mask_and; // 8000 if masked else 0000
mask_bits_set = mask_bits_set.sra16<15>(); // ffff if masked else 0000
preserve_mask = preserve_mask | mask_bits_set; // ffff if preserved else 0000
bg_color = bg_color & preserve_mask;
color = (color | pv.mask_or).andnot(preserve_mask);
color = color | bg_color;
StoreVector(start_x, y, color);
}
template<bool texture_enable, bool raw_texture_enable, bool transparency_enable>
static void DrawRectangle(const GPUBackendDrawRectangleCommand* cmd)
{
const s32 origin_x = cmd->x;
const s32 origin_y = cmd->y;
const GSVector4i rgba = GSVector4i(cmd->color); // RGBA | RGBA | RGBA | RGBA
const GSVector4i rgp = rgba.xxxxl(); // RGRG | RGRG | RGRG | RGRG
const GSVector4i bap = rgba.yyyyl(); // BABA | BABA | BABA | BABA
const GSVectorNi rg = GSVectorNi::broadcast128(rgp.u8to16()); // R0G0 | R0G0 | R0G0 | R0G0
const GSVectorNi ba = GSVectorNi::broadcast128(bap.u8to16()); // B0A0 | B0A0 | B0A0 | B0A0
const GSVectorNi texcoord_x = GSVectorNi(cmd->texcoord & 0xFF).add32(SPAN_OFFSET_VEC);
GSVectorNi texcoord_y = GSVectorNi(cmd->texcoord >> 8);
const PixelVectors<texture_enable> pv(cmd);
const u32 width = cmd->width;
#ifdef CHECK_VECTOR
BACKUP_VRAM();
#endif
for (u32 offset_y = 0; offset_y < cmd->height; offset_y++)
{
const s32 y = origin_y + static_cast<s32>(offset_y);
if (y >= static_cast<s32>(g_drawing_area.top) && y <= static_cast<s32>(g_drawing_area.bottom) &&
(!cmd->params.interlaced_rendering || cmd->params.active_line_lsb != (Truncate8(static_cast<u32>(y)) & 1u)))
{
const s32 draw_y = (y & VRAM_HEIGHT_MASK);
GSVectorNi row_texcoord_x = texcoord_x;
GSVectorNi xvec = GSVectorNi(origin_x).add32(SPAN_OFFSET_VEC);
GSVectorNi wvec = GSVectorNi(width).sub32(SPAN_WIDTH_VEC);
for (u32 offset_x = 0; offset_x < width; offset_x += PIXELS_PER_VEC)
{
const s32 x = origin_x + static_cast<s32>(offset_x);
// width test
GSVectorNi preserve_mask = wvec.lt32(GSVectorNi::zero());
// clip test, if all pixels are outside, skip
preserve_mask = preserve_mask | xvec.lt32(pv.clip_left);
preserve_mask = preserve_mask | xvec.gt32(pv.clip_right);
if (!preserve_mask.alltrue())
{
ShadePixel<texture_enable, raw_texture_enable, transparency_enable>(
pv, cmd->draw_mode.texture_mode, cmd->draw_mode.transparency_mode, x, draw_y, rg, ba, row_texcoord_x,
texcoord_y, preserve_mask, GSVectorNi::zero());
}
xvec = xvec.add32(PIXELS_PER_VEC_VEC);
wvec = wvec.sub32(PIXELS_PER_VEC_VEC);
if constexpr (texture_enable)
row_texcoord_x = row_texcoord_x.add32(PIXELS_PER_VEC_VEC) & GSVectorNi::cxpr(0xFF);
}
}
if constexpr (texture_enable)
texcoord_y = texcoord_y.add32(GSVectorNi::cxpr(1)) & GSVectorNi::cxpr(0xFF);
}
#ifdef CHECK_VECTOR
CHECK_VRAM(GPU_SW_Rasterizer::DrawRectangleFunctions[texture_enable][raw_texture_enable][transparency_enable](cmd));
#endif
}
#endif // USE_VECTOR
// TODO: Vectorize line draw.
template<bool shading_enable, bool transparency_enable>
static void DrawLine(const GPUBackendDrawLineCommand* cmd, const GPUBackendDrawLineCommand::Vertex* p0,
const GPUBackendDrawLineCommand::Vertex* p1)
{
static constexpr u32 XY_SHIFT = 32;
static constexpr u32 RGB_SHIFT = 12;
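  // A note on the fixed-point layout: the line DDA uses 32.32 fixed point for X/Y and .12 fixed point for the color
  // channels. makefp_* add a half-step bias so stepping starts from the centre, div_xy rounds the per-step delta
  // away from zero, and unfp_xy masks the integer part with 2047 to match the GPU's 11-bit coordinate wrap.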
static constexpr auto makefp_xy = [](s32 x) { return (static_cast<s64>(x) << XY_SHIFT) | (1LL << (XY_SHIFT - 1)); };
static constexpr auto unfp_xy = [](s64 x) { return static_cast<s32>(x >> XY_SHIFT) & 2047; };
static constexpr auto div_xy = [](s64 delta, s32 dk) {
return ((delta << XY_SHIFT) - ((delta < 0) ? (dk - 1) : 0) + ((delta > 0) ? (dk - 1) : 0)) / dk;
};
static constexpr auto makefp_rgb = [](u32 c) { return (static_cast<s32>(c) << RGB_SHIFT) | (1 << (RGB_SHIFT - 1)); };
static constexpr auto unfp_rgb = [](s32 c) { return static_cast<u8>(c >> RGB_SHIFT); };
static constexpr auto div_rgb = [](u32 c1, u32 c0, s32 dk) {
return ((static_cast<s32>(c1) - static_cast<s32>(c0)) << RGB_SHIFT) / dk;
};
const s32 i_dx = std::abs(p1->x - p0->x);
const s32 i_dy = std::abs(p1->y - p0->y);
const s32 k = (i_dx > i_dy) ? i_dx : i_dy;
if (i_dx >= MAX_PRIMITIVE_WIDTH || i_dy >= MAX_PRIMITIVE_HEIGHT) [[unlikely]]
return;
if (p0->x >= p1->x && k > 0)
std::swap(p0, p1);
s64 dxdk = 0, dydk = 0;
[[maybe_unused]] s32 drdk = 0, dgdk = 0, dbdk = 0;
if (k != 0) [[likely]]
{
dxdk = div_xy(p1->x - p0->x, k);
dydk = div_xy(p1->y - p0->y, k);
if constexpr (shading_enable)
{
drdk = div_rgb(p1->r, p0->r, k);
dgdk = div_rgb(p1->g, p0->g, k);
dbdk = div_rgb(p1->b, p0->b, k);
}
}
s64 curx = makefp_xy(p0->x) - 1024;
s64 cury = makefp_xy(p0->y) - ((dydk < 0) ? 1024 : 0);
[[maybe_unused]] s32 curr, curg, curb;
if constexpr (shading_enable)
{
curr = makefp_rgb(p0->r);
curg = makefp_rgb(p0->g);
curb = makefp_rgb(p0->b);
}
for (s32 i = 0; i <= k; i++)
{
const s32 x = unfp_xy(curx);
const s32 y = unfp_xy(cury);
if ((!cmd->params.interlaced_rendering || cmd->params.active_line_lsb != (Truncate8(static_cast<u32>(y)) & 1u)) &&
x >= static_cast<s32>(g_drawing_area.left) && x <= static_cast<s32>(g_drawing_area.right) &&
y >= static_cast<s32>(g_drawing_area.top) && y <= static_cast<s32>(g_drawing_area.bottom))
{
const u8 r = shading_enable ? unfp_rgb(curr) : p0->r;
const u8 g = shading_enable ? unfp_rgb(curg) : p0->g;
const u8 b = shading_enable ? unfp_rgb(curb) : p0->b;
ShadePixel<false, false, transparency_enable>(cmd, static_cast<u32>(x), static_cast<u32>(y) & VRAM_HEIGHT_MASK, r,
g, b, 0, 0);
}
curx += dxdk;
cury += dydk;
if constexpr (shading_enable)
{
curr += drdk;
curg += dgdk;
curb += dbdk;
}
}
}
// DDA triangle rasterization algorithm originally from Mednafen, rewritten and vectorized for DuckStation.
namespace {
static constexpr u32 ATTRIB_SHIFT = 12;
static constexpr u32 ATTRIB_POST_SHIFT = 12;
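// Triangle attributes (U/V and R/G/B) are interpolated in unsigned fixed point with
// ATTRIB_SHIFT + ATTRIB_POST_SHIFT = 24 fractional bits; deltas are stored in u32 and rely on two's-complement
// wrap-around. Init() adds a half-step bias so Get*() (a plain >> 24) yields the attribute at the sample centre.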
struct UVSteps
{
u32 dudx;
u32 dvdx;
u32 dudy;
u32 dvdy;
};
struct UVStepper
{
u32 u;
u32 v;
ALWAYS_INLINE u8 GetU() const { return Truncate8(u >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT)); }
ALWAYS_INLINE u8 GetV() const { return Truncate8(v >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT)); }
ALWAYS_INLINE void Init(u32 ustart, u32 vstart)
{
u = (((ustart << ATTRIB_SHIFT) + (1u << (ATTRIB_SHIFT - 1))) << ATTRIB_POST_SHIFT);
v = (((vstart << ATTRIB_SHIFT) + (1u << (ATTRIB_SHIFT - 1))) << ATTRIB_POST_SHIFT);
}
ALWAYS_INLINE void StepX(const UVSteps& steps)
{
u = u + steps.dudx;
v = v + steps.dvdx;
}
ALWAYS_INLINE void StepX(const UVSteps& steps, s32 count)
{
u = u + static_cast<u32>(static_cast<s32>(steps.dudx) * count);
v = v + static_cast<u32>(static_cast<s32>(steps.dvdx) * count);
}
template<bool upside_down>
ALWAYS_INLINE void StepY(const UVSteps& steps)
{
u = upside_down ? (u - steps.dudy) : (u + steps.dudy);
v = upside_down ? (v - steps.dvdy) : (v + steps.dvdy);
}
ALWAYS_INLINE void StepY(const UVSteps& steps, s32 count)
{
u = u + static_cast<u32>(static_cast<s32>(steps.dudy) * count);
v = v + static_cast<u32>(static_cast<s32>(steps.dvdy) * count);
}
};
struct RGBSteps
{
u32 drdx;
u32 dgdx;
u32 dbdx;
u32 drdy;
u32 dgdy;
u32 dbdy;
};
struct RGBStepper
{
u32 r;
u32 g;
u32 b;
ALWAYS_INLINE u8 GetR() const { return Truncate8(r >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT)); }
ALWAYS_INLINE u8 GetG() const { return Truncate8(g >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT)); }
ALWAYS_INLINE u8 GetB() const { return Truncate8(b >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT)); }
ALWAYS_INLINE void Init(u32 rstart, u32 gstart, u32 bstart)
{
r = (((rstart << ATTRIB_SHIFT) + (1u << (ATTRIB_SHIFT - 1))) << ATTRIB_POST_SHIFT);
g = (((gstart << ATTRIB_SHIFT) + (1u << (ATTRIB_SHIFT - 1))) << ATTRIB_POST_SHIFT);
b = (((bstart << ATTRIB_SHIFT) + (1u << (ATTRIB_SHIFT - 1))) << ATTRIB_POST_SHIFT);
}
ALWAYS_INLINE void StepX(const RGBSteps& steps)
{
r = r + steps.drdx;
g = g + steps.dgdx;
b = b + steps.dbdx;
}
ALWAYS_INLINE void StepX(const RGBSteps& steps, s32 count)
{
r = r + static_cast<u32>(static_cast<s32>(steps.drdx) * count);
g = g + static_cast<u32>(static_cast<s32>(steps.dgdx) * count);
b = b + static_cast<u32>(static_cast<s32>(steps.dbdx) * count);
}
template<bool upside_down>
ALWAYS_INLINE void StepY(const RGBSteps& steps)
{
r = upside_down ? (r - steps.drdy) : (r + steps.drdy);
g = upside_down ? (g - steps.dgdy) : (g + steps.dgdy);
b = upside_down ? (b - steps.dbdy) : (b + steps.dbdy);
}
ALWAYS_INLINE void StepY(const RGBSteps& steps, s32 count)
{
r = r + static_cast<u32>(static_cast<s32>(steps.drdy) * count);
g = g + static_cast<u32>(static_cast<s32>(steps.dgdy) * count);
b = b + static_cast<u32>(static_cast<s32>(steps.dbdy) * count);
}
};
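// A triangle is rasterized as up to two parts split at the middle vertex; each part tracks its left and right edge
// X in 32.32 fixed point (start_x/step_x) over [start_y, end_y). fill_upside_down means the part is walked
// bottom-up (Y decreasing), stepping the interpolators in the opposite direction.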
struct TrianglePart
{
// left/right edges
u64 start_x[2];
u64 step_x[2];
s32 start_y;
s32 end_y;
bool fill_upside_down;
};
} // namespace
#ifndef USE_VECTOR
template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable>
static void DrawSpan(const GPUBackendDrawPolygonCommand* cmd, s32 y, s32 x_start, s32 x_bound, UVStepper uv,
const UVSteps& uvstep, RGBStepper rgb, const RGBSteps& rgbstep)
{
s32 width = x_bound - x_start;
s32 current_x = TruncateGPUVertexPosition(x_start);
// Skip pixels outside of the scissor rectangle.
if (current_x < static_cast<s32>(g_drawing_area.left))
{
const s32 delta = static_cast<s32>(g_drawing_area.left) - current_x;
x_start += delta;
current_x += delta;
width -= delta;
}
if ((current_x + width) > (static_cast<s32>(g_drawing_area.right) + 1))
width = static_cast<s32>(g_drawing_area.right) + 1 - current_x;
if (width <= 0)
return;
if constexpr (texture_enable)
uv.StepX(uvstep, x_start);
if constexpr (shading_enable)
rgb.StepX(rgbstep, x_start);
do
{
ShadePixel<texture_enable, raw_texture_enable, transparency_enable>(
cmd, static_cast<u32>(current_x), static_cast<u32>(y), rgb.GetR(), rgb.GetG(), rgb.GetB(), uv.GetU(), uv.GetV());
current_x++;
if constexpr (texture_enable)
uv.StepX(uvstep);
if constexpr (shading_enable)
rgb.StepX(rgbstep);
} while (--width > 0);
}
template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable>
ALWAYS_INLINE_RELEASE static void DrawTrianglePart(const GPUBackendDrawPolygonCommand* cmd, const TrianglePart& tp,
const UVStepper& uv, const UVSteps& uvstep, const RGBStepper& rgb,
const RGBSteps& rgbstep)
{
static constexpr auto unfp_xy = [](s64 xfp) -> s32 { return static_cast<s32>(static_cast<u64>(xfp) >> 32); };
const u64 left_x_step = tp.step_x[0];
const u64 right_x_step = tp.step_x[1];
const s32 end_y = tp.end_y;
u64 left_x = tp.start_x[0];
u64 right_x = tp.start_x[1];
s32 current_y = tp.start_y;
if (tp.fill_upside_down)
{
if (current_y <= end_y)
return;
UVStepper luv = uv;
if constexpr (texture_enable)
luv.StepY(uvstep, current_y);
RGBStepper lrgb = rgb;
if constexpr (shading_enable)
lrgb.StepY(rgbstep, current_y);
do
{
current_y--;
left_x -= left_x_step;
right_x -= right_x_step;
const s32 y = TruncateGPUVertexPosition(current_y);
if (y < static_cast<s32>(g_drawing_area.top))
break;
// Opposite direction means we need to subtract when stepping instead of adding.
if constexpr (texture_enable)
luv.StepY<true>(uvstep);
if constexpr (shading_enable)
lrgb.StepY<true>(rgbstep);
if (y > static_cast<s32>(g_drawing_area.bottom) ||
(cmd->params.interlaced_rendering && cmd->params.active_line_lsb == (static_cast<u32>(current_y) & 1u)))
{
continue;
}
DrawSpan<shading_enable, texture_enable, raw_texture_enable, transparency_enable>(
cmd, y & VRAM_HEIGHT_MASK, unfp_xy(left_x), unfp_xy(right_x), luv, uvstep, lrgb, rgbstep);
} while (current_y > end_y);
}
else
{
if (current_y >= end_y)
return;
UVStepper luv = uv;
if constexpr (texture_enable)
luv.StepY(uvstep, current_y);
RGBStepper lrgb = rgb;
if constexpr (shading_enable)
lrgb.StepY(rgbstep, current_y);
do
{
const s32 y = TruncateGPUVertexPosition(current_y);
if (y > static_cast<s32>(g_drawing_area.bottom))
{
break;
}
if (y >= static_cast<s32>(g_drawing_area.top) &&
(!cmd->params.interlaced_rendering || cmd->params.active_line_lsb != (static_cast<u32>(current_y) & 1u)))
{
DrawSpan<shading_enable, texture_enable, raw_texture_enable, transparency_enable>(
cmd, y & VRAM_HEIGHT_MASK, unfp_xy(left_x), unfp_xy(right_x), luv, uvstep, lrgb, rgbstep);
}
current_y++;
left_x += left_x_step;
right_x += right_x_step;
if constexpr (texture_enable)
luv.StepY<false>(uvstep);
if constexpr (shading_enable)
lrgb.StepY<false>(rgbstep);
} while (current_y < end_y);
}
}
#else // USE_VECTOR
namespace {
template<bool shading_enable, bool texture_enable>
struct TriangleVectors : PixelVectors<texture_enable>
{
using UnusedField = PixelVectors<texture_enable>::UnusedField;
typename std::conditional_t<shading_enable, GSVectorNi, UnusedField> drdx;
typename std::conditional_t<shading_enable, GSVectorNi, UnusedField> dgdx;
typename std::conditional_t<shading_enable, GSVectorNi, UnusedField> dbdx;
typename std::conditional_t<texture_enable, GSVectorNi, UnusedField> dudx;
typename std::conditional_t<texture_enable, GSVectorNi, UnusedField> dvdx;
typename std::conditional_t<shading_enable, GSVectorNi, UnusedField> drdx_0123;
typename std::conditional_t<shading_enable, GSVectorNi, UnusedField> dgdx_0123;
typename std::conditional_t<shading_enable, GSVectorNi, UnusedField> dbdx_0123;
typename std::conditional_t<texture_enable, GSVectorNi, UnusedField> dudx_0123;
typename std::conditional_t<texture_enable, GSVectorNi, UnusedField> dvdx_0123;
TriangleVectors(const GPUBackendDrawCommand* cmd, const UVSteps& uvstep, const RGBSteps& rgbstep)
: PixelVectors<texture_enable>(cmd)
{
if constexpr (shading_enable)
{
drdx = GSVectorNi(rgbstep.drdx * PIXELS_PER_VEC);
dgdx = GSVectorNi(rgbstep.dgdx * PIXELS_PER_VEC);
dbdx = GSVectorNi(rgbstep.dbdx * PIXELS_PER_VEC);
drdx_0123 = GSVectorNi(rgbstep.drdx).mul32l(SPAN_OFFSET_VEC);
dgdx_0123 = GSVectorNi(rgbstep.dgdx).mul32l(SPAN_OFFSET_VEC);
dbdx_0123 = GSVectorNi(rgbstep.dbdx).mul32l(SPAN_OFFSET_VEC);
}
if constexpr (texture_enable)
{
dudx = GSVectorNi(uvstep.dudx * PIXELS_PER_VEC);
dvdx = GSVectorNi(uvstep.dvdx * PIXELS_PER_VEC);
dudx_0123 = GSVectorNi(uvstep.dudx).mul32l(SPAN_OFFSET_VEC);
dvdx_0123 = GSVectorNi(uvstep.dvdx).mul32l(SPAN_OFFSET_VEC);
}
}
};
} // namespace
template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable>
ALWAYS_INLINE_RELEASE static void DrawSpan(const GPUBackendDrawPolygonCommand* cmd, s32 y, s32 x_start, s32 x_bound,
UVStepper uv, const UVSteps& uvstep, RGBStepper rgb, const RGBSteps& rgbstep,
const TriangleVectors<shading_enable, texture_enable>& tv)
{
s32 width = x_bound - x_start;
s32 current_x = TruncateGPUVertexPosition(x_start);
// Skip pixels outside of the scissor rectangle.
if (current_x < static_cast<s32>(g_drawing_area.left))
{
const s32 delta = static_cast<s32>(g_drawing_area.left) - current_x;
x_start += delta;
current_x += delta;
width -= delta;
}
if ((current_x + width) > (static_cast<s32>(g_drawing_area.right) + 1))
width = static_cast<s32>(g_drawing_area.right) + 1 - current_x;
if (width <= 0)
return;
GSVectorNi dr, dg, db;
if constexpr (shading_enable)
{
dr = GSVectorNi(rgb.r + rgbstep.drdx * x_start).add32(tv.drdx_0123);
dg = GSVectorNi(rgb.g + rgbstep.dgdx * x_start).add32(tv.dgdx_0123);
db = GSVectorNi(rgb.b + rgbstep.dbdx * x_start).add32(tv.dbdx_0123);
}
else
{
// precompute for flat shading
dr = GSVectorNi(rgb.r >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT));
dg = GSVectorNi((rgb.g >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT)) << 16);
db = GSVectorNi(rgb.b >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT));
}
GSVectorNi du, dv;
if constexpr (texture_enable)
{
du = GSVectorNi(uv.u + uvstep.dudx * x_start).add32(tv.dudx_0123);
dv = GSVectorNi(uv.v + uvstep.dvdx * x_start).add32(tv.dvdx_0123);
}
else
{
// Hopefully optimized out...
du = GSVectorNi::zero();
dv = GSVectorNi::zero();
}
const GSVectorNi dither = cmd->draw_mode.dither_enable ?
GSVectorNi::broadcast128<false>(
&VECTOR_DITHER_MATRIX[static_cast<u32>(y) & 3][(static_cast<u32>(current_x) & 3) * 2]) :
GSVectorNi::zero();
GSVectorNi xvec = GSVectorNi(current_x).add32(SPAN_OFFSET_VEC);
GSVectorNi wvec = GSVectorNi(width).sub32(SPAN_WIDTH_VEC);
for (s32 count = (width + (PIXELS_PER_VEC - 1)) / PIXELS_PER_VEC; count > 0; --count)
{
// R000 | R000 | R000 | R000
// R0G0 | R0G0 | R0G0 | R0G0
const GSVectorNi r = shading_enable ? dr.srl32<ATTRIB_SHIFT + ATTRIB_POST_SHIFT>() : dr;
const GSVectorNi g =
shading_enable ? dg.srl32<ATTRIB_SHIFT + ATTRIB_POST_SHIFT>().sll32<16>() : dg; // get G into the correct position
const GSVectorNi b = shading_enable ? db.srl32<ATTRIB_SHIFT + ATTRIB_POST_SHIFT>() : db;
const GSVectorNi u = du.srl32<ATTRIB_SHIFT + ATTRIB_POST_SHIFT>();
const GSVectorNi v = dv.srl32<ATTRIB_SHIFT + ATTRIB_POST_SHIFT>();
const GSVectorNi rg = r.blend16<0xAA>(g);
// mask based on what's outside the span
GSVectorNi preserve_mask = wvec.lt32(GSVectorNi::zero());
// clip test, if all pixels are outside, skip
preserve_mask = preserve_mask | xvec.lt32(tv.clip_left);
preserve_mask = preserve_mask | xvec.gt32(tv.clip_right);
if (!preserve_mask.alltrue())
{
ShadePixel<texture_enable, raw_texture_enable, transparency_enable>(
tv, cmd->draw_mode.texture_mode, cmd->draw_mode.transparency_mode, static_cast<u32>(current_x),
static_cast<u32>(y), rg, b, u, v, preserve_mask, dither);
}
current_x += PIXELS_PER_VEC;
xvec = xvec.add32(PIXELS_PER_VEC_VEC);
wvec = wvec.sub32(PIXELS_PER_VEC_VEC);
if constexpr (shading_enable)
{
dr = dr.add32(tv.drdx);
dg = dg.add32(tv.dgdx);
db = db.add32(tv.dbdx);
}
if constexpr (texture_enable)
{
du = du.add32(tv.dudx);
dv = dv.add32(tv.dvdx);
}
}
}
template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable>
ALWAYS_INLINE_RELEASE static void DrawTrianglePart(const GPUBackendDrawPolygonCommand* cmd, const TrianglePart& tp,
const UVStepper& uv, const UVSteps& uvstep, const RGBStepper& rgb,
const RGBSteps& rgbstep)
{
static constexpr auto unfp_xy = [](s64 xfp) -> s32 { return static_cast<s32>(static_cast<u64>(xfp) >> 32); };
const u64 left_x_step = tp.step_x[0];
const u64 right_x_step = tp.step_x[1];
const s32 end_y = tp.end_y;
u64 left_x = tp.start_x[0];
u64 right_x = tp.start_x[1];
s32 current_y = tp.start_y;
if (tp.fill_upside_down)
{
if (current_y <= end_y)
return;
UVStepper luv = uv;
if constexpr (texture_enable)
luv.StepY(uvstep, current_y);
RGBStepper lrgb = rgb;
if constexpr (shading_enable)
lrgb.StepY(rgbstep, current_y);
const TriangleVectors<shading_enable, texture_enable> tv(cmd, uvstep, rgbstep);
do
{
current_y--;
left_x -= left_x_step;
right_x -= right_x_step;
const s32 y = TruncateGPUVertexPosition(current_y);
if (y < static_cast<s32>(g_drawing_area.top))
break;
// Opposite direction means we need to subtract when stepping instead of adding.
if constexpr (texture_enable)
luv.StepY<true>(uvstep);
if constexpr (shading_enable)
lrgb.StepY<true>(rgbstep);
if (y > static_cast<s32>(g_drawing_area.bottom) ||
(cmd->params.interlaced_rendering && cmd->params.active_line_lsb == (static_cast<u32>(current_y) & 1u)))
{
continue;
}
DrawSpan<shading_enable, texture_enable, raw_texture_enable, transparency_enable>(
cmd, y & VRAM_HEIGHT_MASK, unfp_xy(left_x), unfp_xy(right_x), luv, uvstep, lrgb, rgbstep, tv);
} while (current_y > end_y);
}
else
{
if (current_y >= end_y)
return;
UVStepper luv = uv;
if constexpr (texture_enable)
luv.StepY(uvstep, current_y);
RGBStepper lrgb = rgb;
if constexpr (shading_enable)
lrgb.StepY(rgbstep, current_y);
const TriangleVectors<shading_enable, texture_enable> tv(cmd, uvstep, rgbstep);
do
{
const s32 y = TruncateGPUVertexPosition(current_y);
if (y > static_cast<s32>(g_drawing_area.bottom))
{
break;
}
if (y >= static_cast<s32>(g_drawing_area.top) &&
(!cmd->params.interlaced_rendering || cmd->params.active_line_lsb != (static_cast<u32>(current_y) & 1u)))
{
DrawSpan<shading_enable, texture_enable, raw_texture_enable, transparency_enable>(
cmd, y & VRAM_HEIGHT_MASK, unfp_xy(left_x), unfp_xy(right_x), luv, uvstep, lrgb, rgbstep, tv);
}
current_y++;
left_x += left_x_step;
right_x += right_x_step;
if constexpr (texture_enable)
luv.StepY<false>(uvstep);
if constexpr (shading_enable)
lrgb.StepY<false>(rgbstep);
} while (current_y < end_y);
}
}
#endif // USE_VECTOR
template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable>
static void DrawTriangle(const GPUBackendDrawPolygonCommand* cmd, const GPUBackendDrawPolygonCommand::Vertex* v0,
const GPUBackendDrawPolygonCommand::Vertex* v1, const GPUBackendDrawPolygonCommand::Vertex* v2)
{
#ifdef CHECK_VECTOR
const GPUBackendDrawPolygonCommand::Vertex* orig_v0 = v0;
const GPUBackendDrawPolygonCommand::Vertex* orig_v1 = v1;
const GPUBackendDrawPolygonCommand::Vertex* orig_v2 = v2;
#endif
  // Sort vertices by Y so that v0 is the top-most vertex and v2 is the bottom-most, with v1 in between.
u32 tl = 0;
if (v1->x <= v0->x)
tl = (v2->x <= v1->x) ? 4 : 2;
else if (v2->x < v0->x)
tl = 4;
else
tl = 1;
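  // tl is a one-hot flag (1/2/4) for whichever of v0/v1/v2 is currently left-most; the bit shuffles in the swaps
  // below keep it pointing at the same vertex, and 'tl >> 1' afterwards turns it into the index of the top-left
  // vertex used as the interpolation origin.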
if (v2->y < v1->y)
{
std::swap(v2, v1);
tl = ((tl >> 1) & 0x2) | ((tl << 1) & 0x4) | (tl & 0x1);
}
if (v1->y < v0->y)
{
std::swap(v1, v0);
tl = ((tl >> 1) & 0x1) | ((tl << 1) & 0x2) | (tl & 0x4);
}
if (v2->y < v1->y)
{
std::swap(v2, v1);
tl = ((tl >> 1) & 0x2) | ((tl << 1) & 0x4) | (tl & 0x1);
}
const GPUBackendDrawPolygonCommand::Vertex* vertices[3] = {v0, v1, v2};
tl = tl >> 1;
// Invalid size early culling.
if (static_cast<u32>(std::abs(v2->x - v0->x)) >= MAX_PRIMITIVE_WIDTH ||
static_cast<u32>(std::abs(v2->x - v1->x)) >= MAX_PRIMITIVE_WIDTH ||
static_cast<u32>(std::abs(v1->x - v0->x)) >= MAX_PRIMITIVE_WIDTH ||
static_cast<u32>(v2->y - v0->y) >= MAX_PRIMITIVE_HEIGHT || v0->y == v2->y)
{
return;
}
// Same as line rasterization, use higher precision for position.
static constexpr auto makefp_xy = [](s32 x) { return (static_cast<s64>(x) << 32) + ((1LL << 32) - (1 << 11)); };
static constexpr auto makestep_xy = [](s32 dx, s32 dy) -> s64 {
return (((static_cast<s64>(dx) << 32) + ((dx < 0) ? -(dy - 1) : ((dx > 0) ? (dy - 1) : 0))) / dy);
};
const s64 base_coord = makefp_xy(v0->x);
const s64 base_step = makestep_xy(v2->x - v0->x, v2->y - v0->y);
const s64 bound_coord_us = (v1->y == v0->y) ? 0 : makestep_xy(v1->x - v0->x, v1->y - v0->y);
const s64 bound_coord_ls = (v2->y == v1->y) ? 0 : makestep_xy(v2->x - v1->x, v2->y - v1->y);
const u32 vo = (tl != 0) ? 1 : 0;
const u32 vp = (tl == 2) ? 3 : 0;
const bool right_facing = (v1->y == v0->y) ? (v1->x > v0->x) : (bound_coord_us > base_step);
const u32 rfi = BoolToUInt32(right_facing);
const u32 ofi = BoolToUInt32(!right_facing);
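  // The triangle is split at the middle vertex into two parts that share the long v0->v2 edge (base_step):
  // triparts[vo] covers the v0..v1 half and triparts[vo ^ 1] the v1..v2 half. rfi/ofi route the short edge and the
  // long edge into start_x/step_x[0] (left) or [1] (right) depending on which way the triangle faces, and vo/vp
  // select whether each half is filled top-down or bottom-up.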
TrianglePart triparts[2];
TrianglePart& tpo = triparts[vo];
TrianglePart& tpp = triparts[vo ^ 1];
tpo.start_y = vertices[0 ^ vo]->y;
tpo.end_y = vertices[1 ^ vo]->y;
tpp.start_y = vertices[1 ^ vp]->y;
tpp.end_y = vertices[2 ^ vp]->y;
tpo.start_x[rfi] = makefp_xy(vertices[0 ^ vo]->x);
tpo.step_x[rfi] = bound_coord_us;
tpo.start_x[ofi] = base_coord + ((vertices[vo]->y - vertices[0]->y) * base_step);
tpo.step_x[ofi] = base_step;
tpo.fill_upside_down = ConvertToBoolUnchecked(vo);
tpp.start_x[rfi] = makefp_xy(vertices[1 ^ vp]->x);
tpp.step_x[rfi] = bound_coord_ls;
tpp.start_x[ofi] = base_coord + ((vertices[1 ^ vp]->y - vertices[0]->y) * base_step);
tpp.step_x[ofi] = base_step;
tpp.fill_upside_down = (vp != 0);
#define ATTRIB_DETERMINANT(x, y) (((v1->x - v0->x) * (v2->y - v1->y)) - ((v2->x - v1->x) * (v1->y - v0->y)))
#define ATTRIB_STEP(x, y) (static_cast<u32>(ATTRIB_DETERMINANT(x, y) * (1 << ATTRIB_SHIFT) / det) << ATTRIB_POST_SHIFT)
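// ATTRIB_DETERMINANT(a, b) is the cross product of the two edge vectors in the (a, b) plane; with (x, y) it is
// twice the signed triangle area. ATTRIB_STEP then gives the plane-equation gradients by Cramer's rule:
// ATTRIB_STEP(u, y) is du/dx and ATTRIB_STEP(x, u) is du/dy, scaled into the 24-bit attribute fixed point
// (shifted by ATTRIB_SHIFT before the divide and ATTRIB_POST_SHIFT after).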
  // Reject degenerate (zero-area) triangles, which would also divide by zero in ATTRIB_STEP.
const s32 det = ATTRIB_DETERMINANT(x, y);
if (det == 0) [[unlikely]]
return;
// Compute step values.
UVSteps uvstep;
RGBSteps rgbstep;
if constexpr (texture_enable)
{
uvstep.dudx = ATTRIB_STEP(u, y);
uvstep.dvdx = ATTRIB_STEP(v, y);
uvstep.dudy = ATTRIB_STEP(x, u);
uvstep.dvdy = ATTRIB_STEP(x, v);
}
if constexpr (shading_enable)
{
rgbstep.drdx = ATTRIB_STEP(r, y);
rgbstep.dgdx = ATTRIB_STEP(g, y);
rgbstep.dbdx = ATTRIB_STEP(b, y);
rgbstep.drdy = ATTRIB_STEP(x, r);
rgbstep.dgdy = ATTRIB_STEP(x, g);
rgbstep.dbdy = ATTRIB_STEP(x, b);
}
#undef ATTRIB_STEP
#undef ATTRIB_DETERMINANT
  // Step the interpolants back by the top-left vertex position, so that adding the absolute X/Y offsets for each
  // line later starts from the correct per-vertex values.
UVStepper uv;
RGBStepper rgb;
const GPUBackendDrawPolygonCommand::Vertex* top_left_vertex = vertices[tl];
if constexpr (texture_enable)
{
uv.Init(top_left_vertex->u, top_left_vertex->v);
uv.StepX(uvstep, -top_left_vertex->x);
uv.StepY(uvstep, -top_left_vertex->y);
}
else
{
uv = {};
}
if constexpr (shading_enable)
{
rgb.Init(top_left_vertex->r, top_left_vertex->g, top_left_vertex->b);
rgb.StepX(rgbstep, -top_left_vertex->x);
rgb.StepY(rgbstep, -top_left_vertex->y);
}
else
{
rgb.Init(top_left_vertex->r, top_left_vertex->g, top_left_vertex->b);
}
#ifdef CHECK_VECTOR
BACKUP_VRAM();
#endif
for (u32 i = 0; i < 2; i++)
{
DrawTrianglePart<shading_enable, texture_enable, raw_texture_enable, transparency_enable>(cmd, triparts[i], uv,
uvstep, rgb, rgbstep);
}
#ifdef CHECK_VECTOR
CHECK_VRAM(
GPU_SW_Rasterizer::DrawTriangleFunctions[shading_enable][texture_enable][raw_texture_enable][transparency_enable](
cmd, orig_v0, orig_v1, orig_v2));
#endif
}
constinit const DrawRectangleFunctionTable DrawRectangleFunctions = {
{{&DrawRectangle<false, false, false>, &DrawRectangle<false, false, true>},
{&DrawRectangle<false, false, false>, &DrawRectangle<false, false, true>}},
{{&DrawRectangle<true, false, false>, &DrawRectangle<true, false, true>},
{&DrawRectangle<true, true, false>, &DrawRectangle<true, true, true>}}};
constinit const DrawLineFunctionTable DrawLineFunctions = {{&DrawLine<false, false>, &DrawLine<false, true>},
{&DrawLine<true, false>, &DrawLine<true, true>}};
constinit const DrawTriangleFunctionTable DrawTriangleFunctions = {
{{{&DrawTriangle<false, false, false, false>, &DrawTriangle<false, false, false, true>},
{&DrawTriangle<false, false, false, false>, &DrawTriangle<false, false, false, true>}},
{{&DrawTriangle<false, true, false, false>, &DrawTriangle<false, true, false, true>},
{&DrawTriangle<false, true, true, false>, &DrawTriangle<false, true, true, true>}}},
{{{&DrawTriangle<true, false, false, false>, &DrawTriangle<true, false, false, true>},
{&DrawTriangle<true, false, false, false>, &DrawTriangle<true, false, false, true>}},
{{&DrawTriangle<true, true, false, false>, &DrawTriangle<true, true, false, true>},
{&DrawTriangle<true, true, true, false>, &DrawTriangle<true, true, true, true>}}}};
static void FillVRAMImpl(u32 x, u32 y, u32 width, u32 height, u32 color, bool interlaced, u8 active_line_lsb)
{
#ifdef USE_VECTOR
const u16 color16 = VRAMRGBA8888ToRGBA5551(color);
const GSVector4i fill = GSVector4i(color16, color16, color16, color16, color16, color16, color16, color16);
constexpr u32 vector_width = 8;
const u32 aligned_width = Common::AlignDownPow2(width, vector_width);
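  // Rows are filled a vector (8 pixels) at a time up to aligned_width, with a scalar tail for the remainder; the
  // paths below where X wraps around fall back to per-pixel stores, and interlaced fills skip rows belonging to the
  // active field.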
if ((x + width) <= VRAM_WIDTH && !interlaced)
{
for (u32 yoffs = 0; yoffs < height; yoffs++)
{
const u32 row = (y + yoffs) % VRAM_HEIGHT;
u16* row_ptr = &g_vram[row * VRAM_WIDTH + x];
u32 xoffs = 0;
for (; xoffs < aligned_width; xoffs += vector_width, row_ptr += vector_width)
GSVector4i::store<false>(row_ptr, fill);
for (; xoffs < width; xoffs++)
*(row_ptr++) = color16;
}
}
else if (interlaced)
{
// Hardware tests show that fills seem to break on the first two lines when the offset matches the displayed field.
const u32 active_field = active_line_lsb;
if ((x + width) <= VRAM_WIDTH)
{
for (u32 yoffs = 0; yoffs < height; yoffs++)
{
const u32 row = (y + yoffs) % VRAM_HEIGHT;
if ((row & u32(1)) == active_field)
continue;
u16* row_ptr = &g_vram[row * VRAM_WIDTH + x];
u32 xoffs = 0;
for (; xoffs < aligned_width; xoffs += vector_width, row_ptr += vector_width)
GSVector4i::store<false>(row_ptr, fill);
for (; xoffs < width; xoffs++)
*(row_ptr++) = color16;
}
}
else
{
for (u32 yoffs = 0; yoffs < height; yoffs++)
{
const u32 row = (y + yoffs) % VRAM_HEIGHT;
if ((row & u32(1)) == active_field)
continue;
u16* row_ptr = &g_vram[row * VRAM_WIDTH];
for (u32 xoffs = 0; xoffs < width; xoffs++)
{
const u32 col = (x + xoffs) % VRAM_WIDTH;
row_ptr[col] = color16;
}
}
}
}
else
{
for (u32 yoffs = 0; yoffs < height; yoffs++)
{
const u32 row = (y + yoffs) % VRAM_HEIGHT;
u16* row_ptr = &g_vram[row * VRAM_WIDTH];
for (u32 xoffs = 0; xoffs < width; xoffs++)
{
const u32 col = (x + xoffs) % VRAM_WIDTH;
row_ptr[col] = color16;
}
}
}
#else
const u16 color16 = VRAMRGBA8888ToRGBA5551(color);
if ((x + width) <= VRAM_WIDTH && !interlaced)
{
for (u32 yoffs = 0; yoffs < height; yoffs++)
{
const u32 row = (y + yoffs) % VRAM_HEIGHT;
std::fill_n(&g_vram[row * VRAM_WIDTH + x], width, color16);
}
}
else if (interlaced)
{
// Hardware tests show that fills seem to break on the first two lines when the offset matches the displayed field.
const u32 active_field = active_line_lsb;
for (u32 yoffs = 0; yoffs < height; yoffs++)
{
const u32 row = (y + yoffs) % VRAM_HEIGHT;
if ((row & u32(1)) == active_field)
continue;
u16* row_ptr = &g_vram[row * VRAM_WIDTH];
for (u32 xoffs = 0; xoffs < width; xoffs++)
{
const u32 col = (x + xoffs) % VRAM_WIDTH;
row_ptr[col] = color16;
}
}
}
else
{
for (u32 yoffs = 0; yoffs < height; yoffs++)
{
const u32 row = (y + yoffs) % VRAM_HEIGHT;
u16* row_ptr = &g_vram[row * VRAM_WIDTH];
for (u32 xoffs = 0; xoffs < width; xoffs++)
{
const u32 col = (x + xoffs) % VRAM_WIDTH;
row_ptr[col] = color16;
}
}
}
#endif
}
static void WriteVRAMImpl(u32 x, u32 y, u32 width, u32 height, const void* data, bool set_mask, bool check_mask)
{
// TODO: Vector implementation
// Fast path when the copy is not oversized.
if ((x + width) <= VRAM_WIDTH && (y + height) <= VRAM_HEIGHT && !set_mask && !check_mask)
{
const u16* src_ptr = static_cast<const u16*>(data);
u16* dst_ptr = &g_vram[y * VRAM_WIDTH + x];
for (u32 yoffs = 0; yoffs < height; yoffs++)
{
std::copy_n(src_ptr, width, dst_ptr);
src_ptr += width;
dst_ptr += VRAM_WIDTH;
}
}
else
{
// Slow path when we need to handle wrap-around.
// During transfer/render operations, if ((dst_pixel & mask_and) == 0) { pixel = src_pixel | mask_or }
const u16* src_ptr = static_cast<const u16*>(data);
const u16 mask_and = check_mask ? 0x8000u : 0x0000u;
const u16 mask_or = set_mask ? 0x8000u : 0x0000u;
for (u32 row = 0; row < height;)
{
u16* dst_row_ptr = &g_vram[((y + row++) % VRAM_HEIGHT) * VRAM_WIDTH];
for (u32 col = 0; col < width;)
{
// TODO: Handle unaligned reads...
u16* pixel_ptr = &dst_row_ptr[(x + col++) % VRAM_WIDTH];
if (((*pixel_ptr) & mask_and) == 0)
*pixel_ptr = *(src_ptr++) | mask_or;
}
}
}
}
static void CopyVRAMImpl(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height, bool set_mask,
bool check_mask)
{
// TODO: Vector implementation.
// Break up oversized copies. This behavior has not been verified on console.
if ((src_x + width) > VRAM_WIDTH || (dst_x + width) > VRAM_WIDTH)
{
u32 remaining_rows = height;
u32 current_src_y = src_y;
u32 current_dst_y = dst_y;
while (remaining_rows > 0)
{
const u32 rows_to_copy =
std::min<u32>(remaining_rows, std::min<u32>(VRAM_HEIGHT - current_src_y, VRAM_HEIGHT - current_dst_y));
u32 remaining_columns = width;
u32 current_src_x = src_x;
u32 current_dst_x = dst_x;
while (remaining_columns > 0)
{
const u32 columns_to_copy =
std::min<u32>(remaining_columns, std::min<u32>(VRAM_WIDTH - current_src_x, VRAM_WIDTH - current_dst_x));
CopyVRAM(current_src_x, current_src_y, current_dst_x, current_dst_y, columns_to_copy, rows_to_copy, set_mask,
check_mask);
current_src_x = (current_src_x + columns_to_copy) % VRAM_WIDTH;
current_dst_x = (current_dst_x + columns_to_copy) % VRAM_WIDTH;
remaining_columns -= columns_to_copy;
}
current_src_y = (current_src_y + rows_to_copy) % VRAM_HEIGHT;
current_dst_y = (current_dst_y + rows_to_copy) % VRAM_HEIGHT;
remaining_rows -= rows_to_copy;
}
return;
}
// This doesn't have a fast path, but do we really need one? It's not common.
const u16 mask_and = check_mask ? 0x8000u : 0x0000u;
const u16 mask_or = set_mask ? 0x8000u : 0x0000u;
  // Copy in reverse when src_x < dst_x; this is verified on console.
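  // (Presumably so an overlapping copy within a row behaves like memmove: source pixels are read before the
  // destination writes catch up to them.)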
if (src_x < dst_x || ((src_x + width - 1) % VRAM_WIDTH) < ((dst_x + width - 1) % VRAM_WIDTH))
{
for (u32 row = 0; row < height; row++)
{
const u16* src_row_ptr = &g_vram[((src_y + row) % VRAM_HEIGHT) * VRAM_WIDTH];
u16* dst_row_ptr = &g_vram[((dst_y + row) % VRAM_HEIGHT) * VRAM_WIDTH];
for (s32 col = static_cast<s32>(width - 1); col >= 0; col--)
{
const u16 src_pixel = src_row_ptr[(src_x + static_cast<u32>(col)) % VRAM_WIDTH];
u16* dst_pixel_ptr = &dst_row_ptr[(dst_x + static_cast<u32>(col)) % VRAM_WIDTH];
*dst_pixel_ptr = ((*dst_pixel_ptr & mask_and) == 0) ? (src_pixel | mask_or) : *dst_pixel_ptr;
}
}
}
else
{
for (u32 row = 0; row < height; row++)
{
const u16* src_row_ptr = &g_vram[((src_y + row) % VRAM_HEIGHT) * VRAM_WIDTH];
u16* dst_row_ptr = &g_vram[((dst_y + row) % VRAM_HEIGHT) * VRAM_WIDTH];
for (u32 col = 0; col < width; col++)
{
const u16 src_pixel = src_row_ptr[(src_x + col) % VRAM_WIDTH];
u16* dst_pixel_ptr = &dst_row_ptr[(dst_x + col) % VRAM_WIDTH];
*dst_pixel_ptr = ((*dst_pixel_ptr & mask_and) == 0) ? (src_pixel | mask_or) : *dst_pixel_ptr;
}
}
}
}
#ifdef __INTELLISENSE__
}
#endif