// duckstation/src/core/gpu_sw_rasterizer.inl
// SPDX-FileCopyrightText: 2019-2024 Connor McLaughlin <stenzek@gmail.com>
// SPDX-License-Identifier: CC-BY-NC-ND-4.0
#ifdef __INTELLISENSE__
#include "common/gsvector.h"
#include "gpu.h"
#include <algorithm>
#define USE_VECTOR 1
#define GSVECTOR_HAS_SRLV 1
#define GSVECTOR_HAS_256 1
extern GPU_SW_Rasterizer::DitherLUT g_dither_lut;
namespace GPU_SW_Rasterizer {
#endif
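// Note: this .inl is meant to be included by the scalar and per-ISA vectorized rasterizer translation units; the
// enclosing GPU_SW_Rasterizer namespace, USE_VECTOR and the GSVECTOR_HAS_* feature macros are expected to be
// provided by the includer. The __INTELLISENSE__ block above only exists to keep the IDE happy.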
// TODO: UpdateVRAM, FillVRAM, etc.
#ifdef USE_VECTOR
// #define CHECK_VECTOR
#ifdef CHECK_VECTOR
static u16 s_vram_backup[VRAM_WIDTH * VRAM_HEIGHT];
static u16 s_new_vram[VRAM_WIDTH * VRAM_HEIGHT];
static u32 s_bad_counter = 0;
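// When CHECK_VECTOR is defined, the vector drawers bracket themselves with BACKUP_VRAM()/CHECK_VRAM(scalar_call):
// VRAM is saved, the vector path draws, then CHECK_VRAM stashes that result, restores the backup, runs the scalar
// drawer, and reports any per-pixel mismatch between the two outputs.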
#define BACKUP_VRAM() \
do \
{ \
std::memcpy(s_vram_backup, g_vram, sizeof(g_vram)); \
s_bad_counter++; \
} while (0)
#define CHECK_VRAM(drawer) \
do \
{ \
std::memcpy(s_new_vram, g_vram, sizeof(g_vram)); \
std::memcpy(g_vram, s_vram_backup, sizeof(g_vram)); \
\
drawer; \
for (u32 vidx = 0; vidx < (VRAM_WIDTH * VRAM_HEIGHT); vidx++) \
{ \
if (s_new_vram[vidx] != g_vram[vidx]) \
{ \
fprintf(stderr, "[%u] Mismatch at %d,%d, expected %04x got %04x\n", s_bad_counter, (vidx % VRAM_WIDTH), \
(vidx / VRAM_WIDTH), g_vram[vidx], s_new_vram[vidx]); \
AssertMsg(false, "Mismatch"); \
} \
} \
/*Assert(std::memcmp(g_vram, s_new_vram, sizeof(g_vram)) == 0)*/ \
} while (0)
#endif
#endif
[[maybe_unused]] ALWAYS_INLINE_RELEASE static u16 GetPixel(const u32 x, const u32 y)
{
return g_vram[VRAM_WIDTH * y + x];
}
[[maybe_unused]] ALWAYS_INLINE_RELEASE static u16* GetPixelPtr(const u32 x, const u32 y)
{
return &g_vram[VRAM_WIDTH * y + x];
}
[[maybe_unused]] ALWAYS_INLINE_RELEASE static void SetPixel(const u32 x, const u32 y, const u16 value)
{
g_vram[VRAM_WIDTH * y + x] = value;
}
[[maybe_unused]] ALWAYS_INLINE_RELEASE static constexpr std::tuple<u8, u8> UnpackTexcoord(u16 texcoord)
{
return std::make_tuple(static_cast<u8>(texcoord), static_cast<u8>(texcoord >> 8));
}
[[maybe_unused]] ALWAYS_INLINE_RELEASE static constexpr std::tuple<u8, u8, u8> UnpackColorRGB24(u32 rgb24)
{
return std::make_tuple(static_cast<u8>(rgb24), static_cast<u8>(rgb24 >> 8), static_cast<u8>(rgb24 >> 16));
}
template<bool texture_enable, bool raw_texture_enable, bool transparency_enable>
[[maybe_unused]] ALWAYS_INLINE_RELEASE static void ShadePixel(const GPUBackendDrawCommand* cmd, u32 x, u32 y,
u8 color_r, u8 color_g, u8 color_b, u8 texcoord_x,
u8 texcoord_y)
{
u16 color;
if constexpr (texture_enable)
{
// Apply texture window
texcoord_x = (texcoord_x & cmd->window.and_x) | cmd->window.or_x;
texcoord_y = (texcoord_y & cmd->window.and_y) | cmd->window.or_y;
u16 texture_color;
switch (cmd->draw_mode.texture_mode)
{
case GPUTextureMode::Palette4Bit:
{
const u16 palette_value =
GetPixel((cmd->draw_mode.GetTexturePageBaseX() + ZeroExtend32(texcoord_x / 4)) % VRAM_WIDTH,
(cmd->draw_mode.GetTexturePageBaseY() + ZeroExtend32(texcoord_y)) % VRAM_HEIGHT);
const size_t palette_index = (palette_value >> ((texcoord_x % 4) * 4)) & 0x0Fu;
texture_color = g_gpu_clut[palette_index];
}
break;
case GPUTextureMode::Palette8Bit:
{
const u16 palette_value =
GetPixel((cmd->draw_mode.GetTexturePageBaseX() + ZeroExtend32(texcoord_x / 2)) % VRAM_WIDTH,
(cmd->draw_mode.GetTexturePageBaseY() + ZeroExtend32(texcoord_y)) % VRAM_HEIGHT);
const size_t palette_index = (palette_value >> ((texcoord_x % 2) * 8)) & 0xFFu;
texture_color = g_gpu_clut[palette_index];
}
break;
default:
{
texture_color = GetPixel((cmd->draw_mode.GetTexturePageBaseX() + ZeroExtend32(texcoord_x)) % VRAM_WIDTH,
(cmd->draw_mode.GetTexturePageBaseY() + ZeroExtend32(texcoord_y)) % VRAM_HEIGHT);
}
break;
}
if (texture_color == 0)
return;
if constexpr (raw_texture_enable)
{
color = texture_color;
}
else
{
const bool dithering_enable = cmd->draw_mode.dither_enable;
const u32 dither_y = (dithering_enable) ? (y & 3u) : 2u;
const u32 dither_x = (dithering_enable) ? (x & 3u) : 3u;
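      // The dither LUT folds the dither offset, the >> 3 conversion to 5 bits and the clamp into one lookup:
      // g_dither_lut[dy][dx][v] is roughly clamp((v + DITHER_MATRIX[dy][dx]) >> 3, 0, 31). With dithering off,
      // indices (2, 3) select what should be a zero entry of the matrix, i.e. plain truncation. For textured pixels
      // v is (5-bit texel * 8-bit vertex color) >> 4, for untextured pixels it is the raw 8-bit color.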
color =
(ZeroExtend16(g_dither_lut[dither_y][dither_x][(u16(texture_color & 0x1Fu) * u16(color_r)) >> 4]) << 0) |
(ZeroExtend16(g_dither_lut[dither_y][dither_x][(u16((texture_color >> 5) & 0x1Fu) * u16(color_g)) >> 4]) << 5) |
(ZeroExtend16(g_dither_lut[dither_y][dither_x][(u16((texture_color >> 10) & 0x1Fu) * u16(color_b)) >> 4])
<< 10) |
(texture_color & 0x8000u);
}
}
else
{
const bool dithering_enable = cmd->draw_mode.dither_enable;
const u32 dither_y = (dithering_enable) ? (y & 3u) : 2u;
const u32 dither_x = (dithering_enable) ? (x & 3u) : 3u;
// Non-textured transparent polygons don't set bit 15, but are treated as transparent.
color = (ZeroExtend16(g_dither_lut[dither_y][dither_x][color_r]) << 0) |
(ZeroExtend16(g_dither_lut[dither_y][dither_x][color_g]) << 5) |
(ZeroExtend16(g_dither_lut[dither_y][dither_x][color_b]) << 10) | (transparency_enable ? 0x8000u : 0);
}
const u16 bg_color = GetPixel(static_cast<u32>(x), static_cast<u32>(y));
if constexpr (transparency_enable)
{
if (color & 0x8000u || !texture_enable)
{
// Based on blargg's efficient 15bpp pixel math.
u32 bg_bits = ZeroExtend32(bg_color);
u32 fg_bits = ZeroExtend32(color);
switch (cmd->draw_mode.transparency_mode)
{
case GPUTransparencyMode::HalfBackgroundPlusHalfForeground:
{
bg_bits |= 0x8000u;
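        // A sketch of the trick: fg + bg == 2 * (fg & bg) + (fg ^ bg), so clearing the low bit of each 5-bit field
        // in the XOR term (0x0421) before the >> 1 stops the halved sum from bleeding across field boundaries; each
        // field of the result is then exactly (fg_field + bg_field) / 2.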
color = Truncate16(((fg_bits + bg_bits) - ((fg_bits ^ bg_bits) & 0x0421u)) >> 1);
}
break;
case GPUTransparencyMode::BackgroundPlusForeground:
{
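        // Field-parallel saturating add: the carry out of each 5-bit field shows up at bits 5/10/15 (0x8420),
        // (sum - carry) strips those overflow bits from the neighbouring field, and (carry - (carry >> 5)) expands
        // each detected overflow into an all-ones 0x1F mask that clamps the channel to 31. The subtractive mode
        // below uses the same idea with borrows, clamping to 0 instead.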
bg_bits &= ~0x8000u;
const u32 sum = fg_bits + bg_bits;
const u32 carry = (sum - ((fg_bits ^ bg_bits) & 0x8421u)) & 0x8420u;
color = Truncate16((sum - carry) | (carry - (carry >> 5)));
}
break;
case GPUTransparencyMode::BackgroundMinusForeground:
{
bg_bits |= 0x8000u;
fg_bits &= ~0x8000u;
const u32 diff = bg_bits - fg_bits + 0x108420u;
const u32 borrow = (diff - ((bg_bits ^ fg_bits) & 0x108420u)) & 0x108420u;
color = Truncate16((diff - borrow) & (borrow - (borrow >> 5)));
}
break;
case GPUTransparencyMode::BackgroundPlusQuarterForeground:
{
bg_bits &= ~0x8000u;
fg_bits = ((fg_bits >> 2) & 0x1CE7u) | 0x8000u;
const u32 sum = fg_bits + bg_bits;
const u32 carry = (sum - ((fg_bits ^ bg_bits) & 0x8421u)) & 0x8420u;
color = Truncate16((sum - carry) | (carry - (carry >> 5)));
}
break;
default:
break;
}
// See above.
if constexpr (!texture_enable)
color &= ~0x8000u;
}
}
const u16 mask_and = cmd->params.GetMaskAND();
if ((bg_color & mask_and) != 0)
return;
DebugAssert(static_cast<u32>(x) < VRAM_WIDTH && static_cast<u32>(y) < VRAM_HEIGHT);
SetPixel(static_cast<u32>(x), static_cast<u32>(y), color | cmd->params.GetMaskOR());
}
#ifndef USE_VECTOR
template<bool texture_enable, bool raw_texture_enable, bool transparency_enable>
static void DrawRectangle(const GPUBackendDrawRectangleCommand* cmd)
{
const s32 origin_x = cmd->x;
const s32 origin_y = cmd->y;
const auto [r, g, b] = UnpackColorRGB24(cmd->color);
const auto [origin_texcoord_x, origin_texcoord_y] = UnpackTexcoord(cmd->texcoord);
for (u32 offset_y = 0; offset_y < cmd->height; offset_y++)
{
const s32 y = origin_y + static_cast<s32>(offset_y);
if (y < static_cast<s32>(g_drawing_area.top) || y > static_cast<s32>(g_drawing_area.bottom) ||
(cmd->params.interlaced_rendering && cmd->params.active_line_lsb == (Truncate8(static_cast<u32>(y)) & 1u)))
{
continue;
}
const u32 draw_y = static_cast<u32>(y) & VRAM_HEIGHT_MASK;
const u8 texcoord_y = Truncate8(ZeroExtend32(origin_texcoord_y) + offset_y);
for (u32 offset_x = 0; offset_x < cmd->width; offset_x++)
{
const s32 x = origin_x + static_cast<s32>(offset_x);
if (x < static_cast<s32>(g_drawing_area.left) || x > static_cast<s32>(g_drawing_area.right))
continue;
const u8 texcoord_x = Truncate8(ZeroExtend32(origin_texcoord_x) + offset_x);
ShadePixel<texture_enable, raw_texture_enable, transparency_enable>(cmd, static_cast<u32>(x), draw_y, r, g, b,
texcoord_x, texcoord_y);
}
}
}
#else // USE_VECTOR
#ifdef GSVECTOR_HAS_256
using GSVectorNi = GSVector8i;
static constexpr GSVector8i SPAN_OFFSET_VEC = GSVector8i::cxpr(0, 1, 2, 3, 4, 5, 6, 7);
static constexpr GSVector8i SPAN_WIDTH_VEC = GSVector8i::cxpr(1, 2, 3, 4, 5, 6, 7, 8);
static constexpr GSVector8i PIXELS_PER_VEC_VEC = GSVector8i::cxpr(8);
static constexpr u32 PIXELS_PER_VEC = 8;
#else
using GSVectorNi = GSVector4i;
static constexpr GSVector4i SPAN_OFFSET_VEC = GSVector4i::cxpr(0, 1, 2, 3);
static constexpr GSVector4i SPAN_WIDTH_VEC = GSVector4i::cxpr(1, 2, 3, 4);
static constexpr GSVector4i PIXELS_PER_VEC_VEC = GSVector4i::cxpr(4);
static constexpr u32 PIXELS_PER_VEC = 4;
#endif
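// With 256-bit vectors we shade 8 pixels per iteration, otherwise 4. SPAN_OFFSET_VEC holds the per-lane X offsets
// within a vector and SPAN_WIDTH_VEC the cumulative pixel count; both feed the preserve mask for lanes that fall
// outside the span or the drawing area. The Gather*/Load/StoreVector helpers below emulate gathers/scatters with
// per-lane extract+insert, and wrap X at the right edge of VRAM when a vector would run past it.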
#ifdef GSVECTOR_HAS_256
ALWAYS_INLINE_RELEASE static GSVector8i GatherVector(GSVector8i coord_x, GSVector8i coord_y)
{
const GSVector8i offsets = coord_y.sll32<10>().add32(coord_x); // y * 1024 + x
GSVector8i pixels = GSVector8i::zext32(g_vram[static_cast<u32>(offsets.extract32<0>())]);
pixels = pixels.insert16<2>(g_vram[static_cast<u32>(offsets.extract32<1>())]);
pixels = pixels.insert16<4>(g_vram[static_cast<u32>(offsets.extract32<2>())]);
pixels = pixels.insert16<6>(g_vram[static_cast<u32>(offsets.extract32<3>())]);
pixels = pixels.insert16<8>(g_vram[static_cast<u32>(offsets.extract32<4>())]);
pixels = pixels.insert16<10>(g_vram[static_cast<u32>(offsets.extract32<5>())]);
pixels = pixels.insert16<12>(g_vram[static_cast<u32>(offsets.extract32<6>())]);
pixels = pixels.insert16<14>(g_vram[static_cast<u32>(offsets.extract32<7>())]);
return pixels;
}
template<u32 mask>
ALWAYS_INLINE_RELEASE static GSVector8i GatherCLUTVector(GSVector8i indices, GSVector8i shifts)
{
const GSVector8i offsets = indices.srlv32(shifts) & GSVector8i::cxpr(mask);
GSVector8i pixels = GSVector8i::zext32(g_gpu_clut[static_cast<u32>(offsets.extract32<0>())]);
pixels = pixels.insert16<2>(g_gpu_clut[static_cast<u32>(offsets.extract32<1>())]);
pixels = pixels.insert16<4>(g_gpu_clut[static_cast<u32>(offsets.extract32<2>())]);
pixels = pixels.insert16<6>(g_gpu_clut[static_cast<u32>(offsets.extract32<3>())]);
pixels = pixels.insert16<8>(g_gpu_clut[static_cast<u32>(offsets.extract32<4>())]);
pixels = pixels.insert16<10>(g_gpu_clut[static_cast<u32>(offsets.extract32<5>())]);
pixels = pixels.insert16<12>(g_gpu_clut[static_cast<u32>(offsets.extract32<6>())]);
pixels = pixels.insert16<14>(g_gpu_clut[static_cast<u32>(offsets.extract32<7>())]);
return pixels;
}
ALWAYS_INLINE_RELEASE static GSVector8i LoadVector(u32 x, u32 y)
{
// TODO: Split into high/low
if (x <= (VRAM_WIDTH - 8))
{
return GSVector8i::u16to32(GSVector4i::load<false>(&g_vram[y * VRAM_WIDTH + x]));
}
else
{
// TODO: Avoid loads for masked pixels if a contiguous region is masked
const u16* line = &g_vram[y * VRAM_WIDTH];
GSVector8i pixels = GSVector8i::zero();
pixels = pixels.insert16<0>(line[(x++) & VRAM_WIDTH_MASK]);
pixels = pixels.insert16<2>(line[(x++) & VRAM_WIDTH_MASK]);
pixels = pixels.insert16<4>(line[(x++) & VRAM_WIDTH_MASK]);
pixels = pixels.insert16<6>(line[(x++) & VRAM_WIDTH_MASK]);
pixels = pixels.insert16<8>(line[(x++) & VRAM_WIDTH_MASK]);
pixels = pixels.insert16<10>(line[(x++) & VRAM_WIDTH_MASK]);
pixels = pixels.insert16<12>(line[(x++) & VRAM_WIDTH_MASK]);
pixels = pixels.insert16<14>(line[(x++) & VRAM_WIDTH_MASK]);
return pixels;
}
}
ALWAYS_INLINE_RELEASE static void StoreVector(u32 x, u32 y, GSVector8i color)
{
// TODO: Split into high/low
const GSVector4i packed = color.low128().pu32(color.high128());
if (x <= (VRAM_WIDTH - 8))
{
GSVector4i::store<false>(&g_vram[y * VRAM_WIDTH + x], packed);
}
else
{
// TODO: Avoid stores for masked pixels if a contiguous region is masked
u16* line = &g_vram[y * VRAM_WIDTH];
line[(x++) & VRAM_WIDTH_MASK] = Truncate16(packed.extract16<0>());
line[(x++) & VRAM_WIDTH_MASK] = Truncate16(packed.extract16<1>());
line[(x++) & VRAM_WIDTH_MASK] = Truncate16(packed.extract16<2>());
line[(x++) & VRAM_WIDTH_MASK] = Truncate16(packed.extract16<3>());
line[(x++) & VRAM_WIDTH_MASK] = Truncate16(packed.extract16<4>());
line[(x++) & VRAM_WIDTH_MASK] = Truncate16(packed.extract16<5>());
line[(x++) & VRAM_WIDTH_MASK] = Truncate16(packed.extract16<6>());
line[(x++) & VRAM_WIDTH_MASK] = Truncate16(packed.extract16<7>());
}
}
#else
ALWAYS_INLINE_RELEASE static GSVector4i GatherVector(GSVector4i coord_x, GSVector4i coord_y)
{
const GSVector4i offsets = coord_y.sll32<10>().add32(coord_x); // y * 1024 + x
// Clang seems to optimize this directly into pextrd+pinsrw, good.
GSVector4i pixels = GSVector4i::zext32(g_vram[static_cast<u32>(offsets.extract32<0>())]);
pixels = pixels.insert16<2>(g_vram[static_cast<u32>(offsets.extract32<1>())]);
pixels = pixels.insert16<4>(g_vram[static_cast<u32>(offsets.extract32<2>())]);
pixels = pixels.insert16<6>(g_vram[static_cast<u32>(offsets.extract32<3>())]);
return pixels;
}
template<u32 mask>
ALWAYS_INLINE_RELEASE static GSVector4i GatherCLUTVector(GSVector4i indices, GSVector4i shifts)
{
#ifdef GSVECTOR_HAS_SRLV
  // Everywhere except RISC-V, we can do the shl 1 (* 2) as part of the load instruction.
const GSVector4i offsets = indices.srlv32(shifts) & GSVector4i::cxpr(mask);
GSVector4i pixels = GSVector4i::zext32(g_gpu_clut[static_cast<u32>(offsets.extract32<0>())]);
pixels = pixels.insert16<2>(g_gpu_clut[static_cast<u32>(offsets.extract32<1>())]);
pixels = pixels.insert16<4>(g_gpu_clut[static_cast<u32>(offsets.extract32<2>())]);
pixels = pixels.insert16<6>(g_gpu_clut[static_cast<u32>(offsets.extract32<3>())]);
return pixels;
#else
  // Without variable shifts, it's probably quicker to do this without vectors: otherwise we would need 4 separate
  // vector shifts, as well as broadcasting the shift amounts...
// Clang seems to turn this into a bunch of extracts, and skips memory. Nice.
alignas(VECTOR_ALIGNMENT) s32 indices_array[4], shifts_array[4];
GSVector4i::store<true>(indices_array, indices);
GSVector4i::store<true>(shifts_array, shifts);
GSVector4i pixels = GSVector4i::zext32(g_gpu_clut[((indices_array[0] >> shifts_array[0]) & mask)]);
pixels = pixels.insert16<2>(g_gpu_clut[((indices_array[1] >> shifts_array[1]) & mask)]);
pixels = pixels.insert16<4>(g_gpu_clut[((indices_array[2] >> shifts_array[2]) & mask)]);
pixels = pixels.insert16<6>(g_gpu_clut[((indices_array[3] >> shifts_array[3]) & mask)]);
return pixels;
#endif
}
ALWAYS_INLINE_RELEASE static GSVector4i LoadVector(u32 x, u32 y)
{
if (x <= (VRAM_WIDTH - 4))
{
return GSVector4i::loadl(&g_vram[y * VRAM_WIDTH + x]).u16to32();
}
else
{
const u16* line = &g_vram[y * VRAM_WIDTH];
GSVector4i pixels = GSVector4i(line[(x++) & VRAM_WIDTH_MASK]);
pixels = pixels.insert16<2>(line[(x++) & VRAM_WIDTH_MASK]);
pixels = pixels.insert16<4>(line[(x++) & VRAM_WIDTH_MASK]);
pixels = pixels.insert16<6>(line[x & VRAM_WIDTH_MASK]);
return pixels;
}
}
ALWAYS_INLINE_RELEASE static void StoreVector(u32 x, u32 y, GSVector4i color)
{
const GSVector4i packed_color = color.pu32();
if (x <= (VRAM_WIDTH - 4))
{
GSVector4i::storel(&g_vram[y * VRAM_WIDTH + x], packed_color);
}
else
{
u16* line = &g_vram[y * VRAM_WIDTH];
line[(x++) & VRAM_WIDTH_MASK] = Truncate16(packed_color.extract16<0>());
line[(x++) & VRAM_WIDTH_MASK] = Truncate16(packed_color.extract16<1>());
line[(x++) & VRAM_WIDTH_MASK] = Truncate16(packed_color.extract16<2>());
line[x & VRAM_WIDTH_MASK] = Truncate16(packed_color.extract16<3>());
}
}
#endif
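// The vector color math below keeps each pixel's channels split across two vectors: 'rg' holds R in the low 16 bits
// and G in the high 16 bits of each 32-bit lane, 'ba' likewise holds B and the mask/alpha bit. This lets the
// texture * vertex-color modulation use per-channel 16-bit multiplies. The two helpers below unpack 15-bit RGB5A1
// into that layout and pack it back.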
ALWAYS_INLINE_RELEASE static void RGB5A1ToRG_BA(GSVectorNi rgb5a1, GSVectorNi& rg, GSVectorNi& ba)
{
rg = rgb5a1 & GSVectorNi::cxpr(0x1F); // R | R | R | R
rg = rg | (rgb5a1 & GSVectorNi::cxpr(0x3E0)).sll32<11>(); // R0G0 | R0G0 | R0G0 | R0G0
ba = rgb5a1.srl32<10>() & GSVectorNi::cxpr(0x1F); // B | B | B | B
ba = ba | (rgb5a1 & GSVectorNi::cxpr(0x8000)).sll32<1>(); // B0A0 | B0A0 | B0A0 | B0A0
}
ALWAYS_INLINE_RELEASE static GSVectorNi RG_BAToRGB5A1(GSVectorNi rg, GSVectorNi ba)
{
GSVectorNi res;
res = rg & GSVectorNi::cxpr(0x1F); // R | R | R | R
res = res | (rg.srl32<11>() & GSVectorNi::cxpr(0x3E0)); // RG | RG | RG | RG
res = res | ((ba & GSVectorNi::cxpr(0x1F)).sll32<10>()); // RGB | RGB | RGB | RGB
res = res | ba.srl32<16>().sll32<15>(); // RGBA | RGBA | RGBA | RGBA
return res;
}
// Color repeated twice for RG packing, then the row duplicated so we can load based on the X offset.
alignas(VECTOR_ALIGNMENT) static constexpr s16 VECTOR_DITHER_MATRIX[4][16] = {
#define P(m, n) static_cast<s16>(DITHER_MATRIX[m][n]), static_cast<s16>(DITHER_MATRIX[m][n])
#define R(m) P(m, 0), P(m, 1), P(m, 2), P(m, 3), P(m, 0), P(m, 1), P(m, 2), P(m, 3)
{R(0)}, {R(1)}, {R(2)}, {R(3)}
#undef R
#undef P
};
namespace {
template<bool texture_enable>
struct PixelVectors
{
struct UnusedField
{
};
GSVectorNi clip_left;
GSVectorNi clip_right;
GSVectorNi mask_and;
GSVectorNi mask_or;
typename std::conditional_t<texture_enable, GSVectorNi, UnusedField> texture_window_and_x;
typename std::conditional_t<texture_enable, GSVectorNi, UnusedField> texture_window_or_x;
typename std::conditional_t<texture_enable, GSVectorNi, UnusedField> texture_window_and_y;
typename std::conditional_t<texture_enable, GSVectorNi, UnusedField> texture_window_or_y;
typename std::conditional_t<texture_enable, GSVectorNi, UnusedField> texture_base_x;
typename std::conditional_t<texture_enable, GSVectorNi, UnusedField> texture_base_y;
PixelVectors(const GPUBackendDrawCommand* cmd)
{
clip_left = GSVectorNi(g_drawing_area.left);
clip_right = GSVectorNi(g_drawing_area.right);
mask_and = GSVectorNi(cmd->params.GetMaskAND());
mask_or = GSVectorNi(cmd->params.GetMaskOR());
if constexpr (texture_enable)
{
texture_window_and_x = GSVectorNi(cmd->window.and_x);
texture_window_or_x = GSVectorNi(cmd->window.or_x);
texture_window_and_y = GSVectorNi(cmd->window.and_y);
texture_window_or_y = GSVectorNi(cmd->window.or_y);
texture_base_x = GSVectorNi(cmd->draw_mode.GetTexturePageBaseX());
texture_base_y = GSVectorNi(cmd->draw_mode.GetTexturePageBaseY());
}
}
};
} // namespace
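// Vectorized ShadePixel: shades PIXELS_PER_VEC horizontally adjacent pixels starting at start_x on row y.
// vertex_color_rg/ba arrive pre-packed in the RG/BA layout above, dither holds the pre-duplicated dither row for
// this scanline, and preserve_mask has all-ones lanes for pixels that must be left untouched (outside the span or
// drawing area); transparent texels and mask-bit-protected pixels get OR'd into it below.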
template<bool texture_enable, bool raw_texture_enable, bool transparency_enable>
ALWAYS_INLINE_RELEASE static void
ShadePixel(const PixelVectors<texture_enable>& pv, GPUTextureMode texture_mode, GPUTransparencyMode transparency_mode,
u32 start_x, u32 y, GSVectorNi vertex_color_rg, GSVectorNi vertex_color_ba, GSVectorNi texcoord_x,
GSVectorNi texcoord_y, GSVectorNi preserve_mask, GSVectorNi dither)
{
static constexpr GSVectorNi coord_mask_x = GSVectorNi::cxpr(VRAM_WIDTH_MASK);
static constexpr GSVectorNi coord_mask_y = GSVectorNi::cxpr(VRAM_HEIGHT_MASK);
GSVectorNi color;
if constexpr (texture_enable)
{
// Apply texture window
texcoord_x = (texcoord_x & pv.texture_window_and_x) | pv.texture_window_or_x;
texcoord_y = (texcoord_y & pv.texture_window_and_y) | pv.texture_window_or_y;
texcoord_y = pv.texture_base_y.add32(texcoord_y) & coord_mask_y;
GSVectorNi texture_color;
switch (texture_mode)
{
case GPUTextureMode::Palette4Bit:
{
GSVectorNi load_texcoord_x = texcoord_x.srl32<2>();
load_texcoord_x = pv.texture_base_x.add32(load_texcoord_x);
load_texcoord_x = load_texcoord_x & coord_mask_x;
const GSVectorNi palette_shift = (texcoord_x & GSVectorNi::cxpr(3)).sll32<2>();
const GSVectorNi palette_indices = GatherVector(load_texcoord_x, texcoord_y);
texture_color = GatherCLUTVector<0x0F>(palette_indices, palette_shift);
}
break;
case GPUTextureMode::Palette8Bit:
{
GSVectorNi load_texcoord_x = texcoord_x.srl32<1>();
load_texcoord_x = pv.texture_base_x.add32(load_texcoord_x);
load_texcoord_x = load_texcoord_x & coord_mask_x;
const GSVectorNi palette_shift = (texcoord_x & GSVectorNi::cxpr(1)).sll32<3>();
const GSVectorNi palette_indices = GatherVector(load_texcoord_x, texcoord_y);
texture_color = GatherCLUTVector<0xFF>(palette_indices, palette_shift);
}
break;
default:
{
texcoord_x = pv.texture_base_x.add32(texcoord_x);
texcoord_x = texcoord_x & coord_mask_x;
texture_color = GatherVector(texcoord_x, texcoord_y);
}
break;
}
    // check for zero texture colour across all pixels in the vector, early out if so
const GSVectorNi texture_transparent_mask = texture_color.eq32(GSVectorNi::zero());
if (texture_transparent_mask.alltrue())
return;
preserve_mask = preserve_mask | texture_transparent_mask;
if constexpr (raw_texture_enable)
{
color = texture_color;
}
else
{
GSVectorNi trg, tba;
RGB5A1ToRG_BA(texture_color, trg, tba);
      // now we have both the texture and vertex color in RG/BA pairs for each pixel in the vector, which we can multiply
GSVectorNi rg = trg.mul16l(vertex_color_rg);
GSVectorNi ba = tba.mul16l(vertex_color_ba);
// Convert to 5bit.
rg = rg.sra16<4>().add16(dither).max_s16(GSVectorNi::zero()).sra16<3>();
ba = ba.sra16<4>().add16(dither).max_s16(GSVectorNi::zero()).sra16<3>();
// Bit15 gets passed through as-is.
ba = ba.blend16<0xaa>(tba);
// Clamp to 5bit.
static constexpr GSVectorNi colclamp = GSVectorNi::cxpr16(0x1F);
rg = rg.min_u16(colclamp);
ba = ba.min_u16(colclamp);
// And interleave back to 16bpp.
color = RG_BAToRGB5A1(rg, ba);
}
}
else
{
// Non-textured transparent polygons don't set bit 15, but are treated as transparent.
GSVectorNi rg = vertex_color_rg.add16(dither).max_s16(GSVectorNi::zero()).sra16<3>();
GSVectorNi ba = vertex_color_ba.add16(dither).max_s16(GSVectorNi::zero()).sra16<3>();
    // Clamp to 5bit. Using a 32-bit constant for BA also clamps the A half of each lane to zero.
rg = rg.min_u16(GSVectorNi::cxpr16(0x1F));
ba = ba.min_u16(GSVectorNi::cxpr(0x1F));
// And interleave back to 16bpp.
color = RG_BAToRGB5A1(rg, ba);
}
GSVectorNi bg_color = LoadVector(start_x, y);
if constexpr (transparency_enable)
{
[[maybe_unused]] GSVectorNi transparent_mask;
if constexpr (texture_enable)
{
// Compute transparent_mask, ffff per lane if transparent otherwise 0000
transparent_mask = color.sra16<15>();
}
// TODO: We don't need to OR color here with 0x8000 for textures.
// 0x8000 is added to match serial path.
GSVectorNi blended_color;
switch (transparency_mode)
{
case GPUTransparencyMode::HalfBackgroundPlusHalfForeground:
{
const GSVectorNi fg_bits = color | GSVectorNi::cxpr(0x8000u);
const GSVectorNi bg_bits = bg_color | GSVectorNi::cxpr(0x8000u);
const GSVectorNi res = fg_bits.add32(bg_bits).sub32((fg_bits ^ bg_bits) & GSVectorNi::cxpr(0x0421u)).srl32<1>();
blended_color = res & GSVectorNi::cxpr(0xffff);
}
break;
case GPUTransparencyMode::BackgroundPlusForeground:
{
const GSVectorNi fg_bits = color | GSVectorNi::cxpr(0x8000u);
const GSVectorNi bg_bits = bg_color & GSVectorNi::cxpr(0x7FFFu);
const GSVectorNi sum = fg_bits.add32(bg_bits);
const GSVectorNi carry =
(sum.sub32((fg_bits ^ bg_bits) & GSVectorNi::cxpr(0x8421u))) & GSVectorNi::cxpr(0x8420u);
const GSVectorNi res = sum.sub32(carry) | carry.sub32(carry.srl32<5>());
blended_color = res & GSVectorNi::cxpr(0xffff);
}
break;
case GPUTransparencyMode::BackgroundMinusForeground:
{
const GSVectorNi bg_bits = bg_color | GSVectorNi::cxpr(0x8000u);
const GSVectorNi fg_bits = color & GSVectorNi::cxpr(0x7FFFu);
const GSVectorNi diff = bg_bits.sub32(fg_bits).add32(GSVectorNi::cxpr(0x108420u));
const GSVectorNi borrow =
diff.sub32((bg_bits ^ fg_bits) & GSVectorNi::cxpr(0x108420u)) & GSVectorNi::cxpr(0x108420u);
const GSVectorNi res = diff.sub32(borrow) & borrow.sub32(borrow.srl32<5>());
blended_color = res & GSVectorNi::cxpr(0xffff);
}
break;
case GPUTransparencyMode::BackgroundPlusQuarterForeground:
default:
{
const GSVectorNi bg_bits = bg_color & GSVectorNi::cxpr(0x7FFFu);
const GSVectorNi fg_bits =
((color | GSVectorNi::cxpr(0x8000)).srl32<2>() & GSVectorNi::cxpr(0x1CE7u)) | GSVectorNi::cxpr(0x8000u);
const GSVectorNi sum = fg_bits.add32(bg_bits);
const GSVectorNi carry = sum.sub32((fg_bits ^ bg_bits) & GSVectorNi::cxpr(0x8421u)) & GSVectorNi::cxpr(0x8420u);
const GSVectorNi res = sum.sub32(carry) | carry.sub32(carry.srl32<5>());
blended_color = res & GSVectorNi::cxpr(0xffff);
}
break;
}
// select blended pixels for transparent pixels, otherwise consider opaque
if constexpr (texture_enable)
color = color.blend8(blended_color, transparent_mask);
else
color = blended_color & GSVectorNi::cxpr(0x7fff);
}
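  // Merge: lanes flagged in preserve_mask keep the original bg_color; all other lanes take the shaded color with
  // the mask OR bit applied.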
GSVectorNi mask_bits_set = bg_color & pv.mask_and; // 8000 if masked else 0000
mask_bits_set = mask_bits_set.sra16<15>(); // ffff if masked else 0000
preserve_mask = preserve_mask | mask_bits_set; // ffff if preserved else 0000
bg_color = bg_color & preserve_mask;
color = (color | pv.mask_or).andnot(preserve_mask);
color = color | bg_color;
StoreVector(start_x, y, color);
}
template<bool texture_enable, bool raw_texture_enable, bool transparency_enable>
static void DrawRectangle(const GPUBackendDrawRectangleCommand* cmd)
{
const s32 origin_x = cmd->x;
const s32 origin_y = cmd->y;
const GSVector4i rgba = GSVector4i(cmd->color); // RGBA | RGBA | RGBA | RGBA
const GSVector4i rgp = rgba.xxxxl(); // RGRG | RGRG | RGRG | RGRG
const GSVector4i bap = rgba.yyyyl(); // BABA | BABA | BABA | BABA
const GSVectorNi rg = GSVectorNi::broadcast128(rgp.u8to16()); // R0G0 | R0G0 | R0G0 | R0G0
const GSVectorNi ba = GSVectorNi::broadcast128(bap.u8to16()); // B0A0 | B0A0 | B0A0 | B0A0
const GSVectorNi texcoord_x = GSVectorNi(cmd->texcoord & 0xFF).add32(SPAN_OFFSET_VEC);
GSVectorNi texcoord_y = GSVectorNi(cmd->texcoord >> 8);
const PixelVectors<texture_enable> pv(cmd);
const u32 width = cmd->width;
#ifdef CHECK_VECTOR
BACKUP_VRAM();
#endif
for (u32 offset_y = 0; offset_y < cmd->height; offset_y++)
{
const s32 y = origin_y + static_cast<s32>(offset_y);
if (y >= static_cast<s32>(g_drawing_area.top) && y <= static_cast<s32>(g_drawing_area.bottom) &&
(!cmd->params.interlaced_rendering || cmd->params.active_line_lsb != (Truncate8(static_cast<u32>(y)) & 1u)))
{
const s32 draw_y = (y & VRAM_HEIGHT_MASK);
GSVectorNi row_texcoord_x = texcoord_x;
GSVectorNi xvec = GSVectorNi(origin_x).add32(SPAN_OFFSET_VEC);
GSVectorNi wvec = GSVectorNi(width).sub32(SPAN_WIDTH_VEC);
for (u32 offset_x = 0; offset_x < width; offset_x += PIXELS_PER_VEC)
{
const s32 x = origin_x + static_cast<s32>(offset_x);
// width test
GSVectorNi preserve_mask = wvec.lt32(GSVectorNi::zero());
// clip test, if all pixels are outside, skip
preserve_mask = preserve_mask | xvec.lt32(pv.clip_left);
preserve_mask = preserve_mask | xvec.gt32(pv.clip_right);
if (!preserve_mask.alltrue())
{
ShadePixel<texture_enable, raw_texture_enable, transparency_enable>(
pv, cmd->draw_mode.texture_mode, cmd->draw_mode.transparency_mode, x, draw_y, rg, ba, row_texcoord_x,
texcoord_y, preserve_mask, GSVectorNi::zero());
}
xvec = xvec.add32(PIXELS_PER_VEC_VEC);
wvec = wvec.sub32(PIXELS_PER_VEC_VEC);
if constexpr (texture_enable)
row_texcoord_x = row_texcoord_x.add32(PIXELS_PER_VEC_VEC) & GSVectorNi::cxpr(0xFF);
}
}
if constexpr (texture_enable)
texcoord_y = texcoord_y.add32(GSVectorNi::cxpr(1)) & GSVectorNi::cxpr(0xFF);
}
#ifdef CHECK_VECTOR
CHECK_VRAM(GPU_SW_Rasterizer::DrawRectangleFunctions[texture_enable][raw_texture_enable][transparency_enable](cmd));
#endif
}
#endif // USE_VECTOR
// TODO: Vectorize line draw.
template<bool shading_enable, bool transparency_enable>
static void DrawLine(const GPUBackendDrawLineCommand* cmd, const GPUBackendDrawLineCommand::Vertex* p0,
const GPUBackendDrawLineCommand::Vertex* p1)
{
static constexpr u32 XY_SHIFT = 32;
static constexpr u32 RGB_SHIFT = 12;
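  // A note on the fixed-point layout: the line DDA uses 32.32 fixed point for X/Y and .12 fixed point for the color
  // channels. makefp_* add a half-step bias so stepping starts from the centre, div_xy rounds the per-step delta
  // away from zero, and unfp_xy masks the integer part with 2047 to match the GPU's 11-bit coordinate wrap.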
static constexpr auto makefp_xy = [](s32 x) { return (static_cast<s64>(x) << XY_SHIFT) | (1LL << (XY_SHIFT - 1)); };
static constexpr auto unfp_xy = [](s64 x) { return static_cast<s32>(x >> XY_SHIFT) & 2047; };
static constexpr auto div_xy = [](s64 delta, s32 dk) {
return ((delta << XY_SHIFT) - ((delta < 0) ? (dk - 1) : 0) + ((delta > 0) ? (dk - 1) : 0)) / dk;
};
static constexpr auto makefp_rgb = [](u32 c) { return (static_cast<s32>(c) << RGB_SHIFT) | (1 << (RGB_SHIFT - 1)); };
static constexpr auto unfp_rgb = [](s32 c) { return static_cast<u8>(c >> RGB_SHIFT); };
static constexpr auto div_rgb = [](u32 c1, u32 c0, s32 dk) {
return ((static_cast<s32>(c1) - static_cast<s32>(c0)) << RGB_SHIFT) / dk;
};
const s32 i_dx = std::abs(p1->x - p0->x);
const s32 i_dy = std::abs(p1->y - p0->y);
const s32 k = (i_dx > i_dy) ? i_dx : i_dy;
if (i_dx >= MAX_PRIMITIVE_WIDTH || i_dy >= MAX_PRIMITIVE_HEIGHT) [[unlikely]]
return;
if (p0->x >= p1->x && k > 0)
std::swap(p0, p1);
s64 dxdk = 0, dydk = 0;
[[maybe_unused]] s32 drdk = 0, dgdk = 0, dbdk = 0;
if (k != 0) [[likely]]
{
dxdk = div_xy(p1->x - p0->x, k);
dydk = div_xy(p1->y - p0->y, k);
if constexpr (shading_enable)
{
drdk = div_rgb(p1->r, p0->r, k);
dgdk = div_rgb(p1->g, p0->g, k);
dbdk = div_rgb(p1->b, p0->b, k);
}
}
s64 curx = makefp_xy(p0->x) - 1024;
s64 cury = makefp_xy(p0->y) - ((dydk < 0) ? 1024 : 0);
[[maybe_unused]] s32 curr, curg, curb;
if constexpr (shading_enable)
{
curr = makefp_rgb(p0->r);
curg = makefp_rgb(p0->g);
curb = makefp_rgb(p0->b);
}
for (s32 i = 0; i <= k; i++)
{
const s32 x = unfp_xy(curx);
const s32 y = unfp_xy(cury);
if ((!cmd->params.interlaced_rendering || cmd->params.active_line_lsb != (Truncate8(static_cast<u32>(y)) & 1u)) &&
x >= static_cast<s32>(g_drawing_area.left) && x <= static_cast<s32>(g_drawing_area.right) &&
y >= static_cast<s32>(g_drawing_area.top) && y <= static_cast<s32>(g_drawing_area.bottom))
{
const u8 r = shading_enable ? unfp_rgb(curr) : p0->r;
const u8 g = shading_enable ? unfp_rgb(curg) : p0->g;
const u8 b = shading_enable ? unfp_rgb(curb) : p0->b;
ShadePixel<false, false, transparency_enable>(cmd, static_cast<u32>(x), static_cast<u32>(y) & VRAM_HEIGHT_MASK, r,
g, b, 0, 0);
}
curx += dxdk;
cury += dydk;
if constexpr (shading_enable)
{
curr += drdk;
curg += dgdk;
curb += dbdk;
}
}
}
// DDA triangle rasterization algorithm originally from Mednafen, rewritten and vectorized for DuckStation.
namespace {
static constexpr u32 ATTRIB_SHIFT = 12;
static constexpr u32 ATTRIB_POST_SHIFT = 12;
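// Triangle attributes (U/V and R/G/B) are interpolated in unsigned fixed point with
// ATTRIB_SHIFT + ATTRIB_POST_SHIFT = 24 fractional bits; deltas are stored in u32 and rely on two's-complement
// wrap-around. Init() adds a half-step bias so Get*() (a plain >> 24) yields the attribute at the sample centre.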
struct UVSteps
{
u32 dudx;
u32 dvdx;
u32 dudy;
u32 dvdy;
};
struct UVStepper
{
u32 u;
u32 v;
ALWAYS_INLINE u8 GetU() const { return Truncate8(u >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT)); }
ALWAYS_INLINE u8 GetV() const { return Truncate8(v >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT)); }
ALWAYS_INLINE void Init(u32 ustart, u32 vstart)
{
u = (((ustart << ATTRIB_SHIFT) + (1u << (ATTRIB_SHIFT - 1))) << ATTRIB_POST_SHIFT);
v = (((vstart << ATTRIB_SHIFT) + (1u << (ATTRIB_SHIFT - 1))) << ATTRIB_POST_SHIFT);
}
ALWAYS_INLINE void StepX(const UVSteps& steps)
{
u = u + steps.dudx;
v = v + steps.dvdx;
}
ALWAYS_INLINE void StepX(const UVSteps& steps, s32 count)
{
u = u + static_cast<u32>(static_cast<s32>(steps.dudx) * count);
v = v + static_cast<u32>(static_cast<s32>(steps.dvdx) * count);
}
template<bool upside_down>
ALWAYS_INLINE void StepY(const UVSteps& steps)
{
u = upside_down ? (u - steps.dudy) : (u + steps.dudy);
v = upside_down ? (v - steps.dvdy) : (v + steps.dvdy);
}
ALWAYS_INLINE void StepY(const UVSteps& steps, s32 count)
{
u = u + static_cast<u32>(static_cast<s32>(steps.dudy) * count);
v = v + static_cast<u32>(static_cast<s32>(steps.dvdy) * count);
}
};
struct RGBSteps
{
u32 drdx;
u32 dgdx;
u32 dbdx;
u32 drdy;
u32 dgdy;
u32 dbdy;
};
struct RGBStepper
{
u32 r;
u32 g;
u32 b;
ALWAYS_INLINE u8 GetR() const { return Truncate8(r >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT)); }
ALWAYS_INLINE u8 GetG() const { return Truncate8(g >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT)); }
ALWAYS_INLINE u8 GetB() const { return Truncate8(b >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT)); }
ALWAYS_INLINE void Init(u32 rstart, u32 gstart, u32 bstart)
{
r = (((rstart << ATTRIB_SHIFT) + (1u << (ATTRIB_SHIFT - 1))) << ATTRIB_POST_SHIFT);
g = (((gstart << ATTRIB_SHIFT) + (1u << (ATTRIB_SHIFT - 1))) << ATTRIB_POST_SHIFT);
b = (((bstart << ATTRIB_SHIFT) + (1u << (ATTRIB_SHIFT - 1))) << ATTRIB_POST_SHIFT);
}
ALWAYS_INLINE void StepX(const RGBSteps& steps)
{
r = r + steps.drdx;
g = g + steps.dgdx;
b = b + steps.dbdx;
}
ALWAYS_INLINE void StepX(const RGBSteps& steps, s32 count)
{
r = r + static_cast<u32>(static_cast<s32>(steps.drdx) * count);
g = g + static_cast<u32>(static_cast<s32>(steps.dgdx) * count);
b = b + static_cast<u32>(static_cast<s32>(steps.dbdx) * count);
}
template<bool upside_down>
ALWAYS_INLINE void StepY(const RGBSteps& steps)
{
r = upside_down ? (r - steps.drdy) : (r + steps.drdy);
g = upside_down ? (g - steps.dgdy) : (g + steps.dgdy);
b = upside_down ? (b - steps.dbdy) : (b + steps.dbdy);
}
ALWAYS_INLINE void StepY(const RGBSteps& steps, s32 count)
{
r = r + static_cast<u32>(static_cast<s32>(steps.drdy) * count);
g = g + static_cast<u32>(static_cast<s32>(steps.dgdy) * count);
b = b + static_cast<u32>(static_cast<s32>(steps.dbdy) * count);
}
};
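// A triangle is rasterized as up to two parts split at the middle vertex; each part tracks its left and right edge
// X in 32.32 fixed point (start_x/step_x) over [start_y, end_y). fill_upside_down means the part is walked
// bottom-up (Y decreasing), stepping the interpolators in the opposite direction.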
struct TrianglePart
{
// left/right edges
u64 start_x[2];
u64 step_x[2];
s32 start_y;
s32 end_y;
bool fill_upside_down;
};
} // namespace
#ifndef USE_VECTOR
template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable>
static void DrawSpan(const GPUBackendDrawPolygonCommand* cmd, s32 y, s32 x_start, s32 x_bound, UVStepper uv,
const UVSteps& uvstep, RGBStepper rgb, const RGBSteps& rgbstep)
{
s32 width = x_bound - x_start;
s32 current_x = TruncateGPUVertexPosition(x_start);
// Skip pixels outside of the scissor rectangle.
if (current_x < static_cast<s32>(g_drawing_area.left))
{
const s32 delta = static_cast<s32>(g_drawing_area.left) - current_x;
x_start += delta;
current_x += delta;
width -= delta;
}
if ((current_x + width) > (static_cast<s32>(g_drawing_area.right) + 1))
width = static_cast<s32>(g_drawing_area.right) + 1 - current_x;
if (width <= 0)
return;
if constexpr (texture_enable)
uv.StepX(uvstep, x_start);
if constexpr (shading_enable)
rgb.StepX(rgbstep, x_start);
do
{
ShadePixel<texture_enable, raw_texture_enable, transparency_enable>(
cmd, static_cast<u32>(current_x), static_cast<u32>(y), rgb.GetR(), rgb.GetG(), rgb.GetB(), uv.GetU(), uv.GetV());
current_x++;
if constexpr (texture_enable)
uv.StepX(uvstep);
if constexpr (shading_enable)
rgb.StepX(rgbstep);
} while (--width > 0);
}
template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable>
ALWAYS_INLINE_RELEASE static void DrawTrianglePart(const GPUBackendDrawPolygonCommand* cmd, const TrianglePart& tp,
const UVStepper& uv, const UVSteps& uvstep, const RGBStepper& rgb,
const RGBSteps& rgbstep)
{
static constexpr auto unfp_xy = [](s64 xfp) -> s32 { return static_cast<s32>(static_cast<u64>(xfp) >> 32); };
const u64 left_x_step = tp.step_x[0];
const u64 right_x_step = tp.step_x[1];
const s32 end_y = tp.end_y;
u64 left_x = tp.start_x[0];
u64 right_x = tp.start_x[1];
s32 current_y = tp.start_y;
if (tp.fill_upside_down)
{
if (current_y <= end_y)
return;
UVStepper luv = uv;
if constexpr (texture_enable)
luv.StepY(uvstep, current_y);
RGBStepper lrgb = rgb;
if constexpr (shading_enable)
lrgb.StepY(rgbstep, current_y);
do
{
current_y--;
left_x -= left_x_step;
right_x -= right_x_step;
const s32 y = TruncateGPUVertexPosition(current_y);
if (y < static_cast<s32>(g_drawing_area.top))
break;
// Opposite direction means we need to subtract when stepping instead of adding.
if constexpr (texture_enable)
luv.StepY<true>(uvstep);
if constexpr (shading_enable)
lrgb.StepY<true>(rgbstep);
if (y > static_cast<s32>(g_drawing_area.bottom) ||
(cmd->params.interlaced_rendering && cmd->params.active_line_lsb == (static_cast<u32>(current_y) & 1u)))
{
continue;
}
DrawSpan<shading_enable, texture_enable, raw_texture_enable, transparency_enable>(
cmd, y & VRAM_HEIGHT_MASK, unfp_xy(left_x), unfp_xy(right_x), luv, uvstep, lrgb, rgbstep);
} while (current_y > end_y);
}
else
{
if (current_y >= end_y)
return;
UVStepper luv = uv;
if constexpr (texture_enable)
luv.StepY(uvstep, current_y);
RGBStepper lrgb = rgb;
if constexpr (shading_enable)
lrgb.StepY(rgbstep, current_y);
do
{
const s32 y = TruncateGPUVertexPosition(current_y);
if (y > static_cast<s32>(g_drawing_area.bottom))
{
break;
}
if (y >= static_cast<s32>(g_drawing_area.top) &&
(!cmd->params.interlaced_rendering || cmd->params.active_line_lsb != (static_cast<u32>(current_y) & 1u)))
{
DrawSpan<shading_enable, texture_enable, raw_texture_enable, transparency_enable>(
cmd, y & VRAM_HEIGHT_MASK, unfp_xy(left_x), unfp_xy(right_x), luv, uvstep, lrgb, rgbstep);
}
current_y++;
left_x += left_x_step;
right_x += right_x_step;
if constexpr (texture_enable)
luv.StepY<false>(uvstep);
if constexpr (shading_enable)
lrgb.StepY<false>(rgbstep);
} while (current_y < end_y);
}
}
#else // USE_VECTOR
namespace {
template<bool shading_enable, bool texture_enable>
struct TriangleVectors : PixelVectors<texture_enable>
{
using UnusedField = PixelVectors<texture_enable>::UnusedField;
typename std::conditional_t<shading_enable, GSVectorNi, UnusedField> drdx;
typename std::conditional_t<shading_enable, GSVectorNi, UnusedField> dgdx;
typename std::conditional_t<shading_enable, GSVectorNi, UnusedField> dbdx;
typename std::conditional_t<texture_enable, GSVectorNi, UnusedField> dudx;
typename std::conditional_t<texture_enable, GSVectorNi, UnusedField> dvdx;
typename std::conditional_t<shading_enable, GSVectorNi, UnusedField> drdx_0123;
typename std::conditional_t<shading_enable, GSVectorNi, UnusedField> dgdx_0123;
typename std::conditional_t<shading_enable, GSVectorNi, UnusedField> dbdx_0123;
typename std::conditional_t<texture_enable, GSVectorNi, UnusedField> dudx_0123;
typename std::conditional_t<texture_enable, GSVectorNi, UnusedField> dvdx_0123;
TriangleVectors(const GPUBackendDrawCommand* cmd, const UVSteps& uvstep, const RGBSteps& rgbstep)
: PixelVectors<texture_enable>(cmd)
{
if constexpr (shading_enable)
{
drdx = GSVectorNi(rgbstep.drdx * PIXELS_PER_VEC);
dgdx = GSVectorNi(rgbstep.dgdx * PIXELS_PER_VEC);
dbdx = GSVectorNi(rgbstep.dbdx * PIXELS_PER_VEC);
drdx_0123 = GSVectorNi(rgbstep.drdx).mul32l(SPAN_OFFSET_VEC);
dgdx_0123 = GSVectorNi(rgbstep.dgdx).mul32l(SPAN_OFFSET_VEC);
dbdx_0123 = GSVectorNi(rgbstep.dbdx).mul32l(SPAN_OFFSET_VEC);
}
if constexpr (texture_enable)
{
dudx = GSVectorNi(uvstep.dudx * PIXELS_PER_VEC);
dvdx = GSVectorNi(uvstep.dvdx * PIXELS_PER_VEC);
dudx_0123 = GSVectorNi(uvstep.dudx).mul32l(SPAN_OFFSET_VEC);
dvdx_0123 = GSVectorNi(uvstep.dvdx).mul32l(SPAN_OFFSET_VEC);
}
}
};
} // namespace
template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable>
ALWAYS_INLINE_RELEASE static void DrawSpan(const GPUBackendDrawPolygonCommand* cmd, s32 y, s32 x_start, s32 x_bound,
UVStepper uv, const UVSteps& uvstep, RGBStepper rgb, const RGBSteps& rgbstep,
const TriangleVectors<shading_enable, texture_enable>& tv)
{
s32 width = x_bound - x_start;
s32 current_x = TruncateGPUVertexPosition(x_start);
// Skip pixels outside of the scissor rectangle.
if (current_x < static_cast<s32>(g_drawing_area.left))
{
const s32 delta = static_cast<s32>(g_drawing_area.left) - current_x;
x_start += delta;
current_x += delta;
width -= delta;
}
if ((current_x + width) > (static_cast<s32>(g_drawing_area.right) + 1))
width = static_cast<s32>(g_drawing_area.right) + 1 - current_x;
if (width <= 0)
return;
GSVectorNi dr, dg, db;
if constexpr (shading_enable)
{
dr = GSVectorNi(rgb.r + rgbstep.drdx * x_start).add32(tv.drdx_0123);
dg = GSVectorNi(rgb.g + rgbstep.dgdx * x_start).add32(tv.dgdx_0123);
db = GSVectorNi(rgb.b + rgbstep.dbdx * x_start).add32(tv.dbdx_0123);
}
else
{
// precompute for flat shading
dr = GSVectorNi(rgb.r >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT));
dg = GSVectorNi((rgb.g >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT)) << 16);
db = GSVectorNi(rgb.b >> (ATTRIB_SHIFT + ATTRIB_POST_SHIFT));
}
GSVectorNi du, dv;
if constexpr (texture_enable)
{
du = GSVectorNi(uv.u + uvstep.dudx * x_start).add32(tv.dudx_0123);
dv = GSVectorNi(uv.v + uvstep.dvdx * x_start).add32(tv.dvdx_0123);
}
else
{
// Hopefully optimized out...
du = GSVectorNi::zero();
dv = GSVectorNi::zero();
}
const GSVectorNi dither = cmd->draw_mode.dither_enable ?
GSVectorNi::broadcast128<false>(
&VECTOR_DITHER_MATRIX[static_cast<u32>(y) & 3][(static_cast<u32>(current_x) & 3) * 2]) :
GSVectorNi::zero();
GSVectorNi xvec = GSVectorNi(current_x).add32(SPAN_OFFSET_VEC);
GSVectorNi wvec = GSVectorNi(width).sub32(SPAN_WIDTH_VEC);
for (s32 count = (width + (PIXELS_PER_VEC - 1)) / PIXELS_PER_VEC; count > 0; --count)
{
// R000 | R000 | R000 | R000
// R0G0 | R0G0 | R0G0 | R0G0
const GSVectorNi r = shading_enable ? dr.srl32<ATTRIB_SHIFT + ATTRIB_POST_SHIFT>() : dr;
const GSVectorNi g =
shading_enable ? dg.srl32<ATTRIB_SHIFT + ATTRIB_POST_SHIFT>().sll32<16>() : dg; // get G into the correct position
const GSVectorNi b = shading_enable ? db.srl32<ATTRIB_SHIFT + ATTRIB_POST_SHIFT>() : db;
const GSVectorNi u = du.srl32<ATTRIB_SHIFT + ATTRIB_POST_SHIFT>();
const GSVectorNi v = dv.srl32<ATTRIB_SHIFT + ATTRIB_POST_SHIFT>();
const GSVectorNi rg = r.blend16<0xAA>(g);
// mask based on what's outside the span
GSVectorNi preserve_mask = wvec.lt32(GSVectorNi::zero());
// clip test, if all pixels are outside, skip
preserve_mask = preserve_mask | xvec.lt32(tv.clip_left);
preserve_mask = preserve_mask | xvec.gt32(tv.clip_right);
if (!preserve_mask.alltrue())
{
ShadePixel<texture_enable, raw_texture_enable, transparency_enable>(
tv, cmd->draw_mode.texture_mode, cmd->draw_mode.transparency_mode, static_cast<u32>(current_x),
static_cast<u32>(y), rg, b, u, v, preserve_mask, dither);
}
current_x += PIXELS_PER_VEC;
xvec = xvec.add32(PIXELS_PER_VEC_VEC);
wvec = wvec.sub32(PIXELS_PER_VEC_VEC);
if constexpr (shading_enable)
{
dr = dr.add32(tv.drdx);
dg = dg.add32(tv.dgdx);
db = db.add32(tv.dbdx);
}
if constexpr (texture_enable)
{
du = du.add32(tv.dudx);
dv = dv.add32(tv.dvdx);
}
}
}
template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable>
ALWAYS_INLINE_RELEASE static void DrawTrianglePart(const GPUBackendDrawPolygonCommand* cmd, const TrianglePart& tp,
const UVStepper& uv, const UVSteps& uvstep, const RGBStepper& rgb,
const RGBSteps& rgbstep)
{
static constexpr auto unfp_xy = [](s64 xfp) -> s32 { return static_cast<s32>(static_cast<u64>(xfp) >> 32); };
const u64 left_x_step = tp.step_x[0];
const u64 right_x_step = tp.step_x[1];
const s32 end_y = tp.end_y;
u64 left_x = tp.start_x[0];
u64 right_x = tp.start_x[1];
s32 current_y = tp.start_y;
if (tp.fill_upside_down)
{
if (current_y <= end_y)
return;
UVStepper luv = uv;
if constexpr (texture_enable)
luv.StepY(uvstep, current_y);
RGBStepper lrgb = rgb;
if constexpr (shading_enable)
lrgb.StepY(rgbstep, current_y);
const TriangleVectors<shading_enable, texture_enable> tv(cmd, uvstep, rgbstep);
do
{
current_y--;
left_x -= left_x_step;
right_x -= right_x_step;
const s32 y = TruncateGPUVertexPosition(current_y);
if (y < static_cast<s32>(g_drawing_area.top))
break;
// Opposite direction means we need to subtract when stepping instead of adding.
if constexpr (texture_enable)
luv.StepY<true>(uvstep);
if constexpr (shading_enable)
lrgb.StepY<true>(rgbstep);
if (y > static_cast<s32>(g_drawing_area.bottom) ||
(cmd->params.interlaced_rendering && cmd->params.active_line_lsb == (static_cast<u32>(current_y) & 1u)))
{
continue;
}
DrawSpan<shading_enable, texture_enable, raw_texture_enable, transparency_enable>(
cmd, y & VRAM_HEIGHT_MASK, unfp_xy(left_x), unfp_xy(right_x), luv, uvstep, lrgb, rgbstep, tv);
} while (current_y > end_y);
}
else
{
if (current_y >= end_y)
return;
UVStepper luv = uv;
if constexpr (texture_enable)
luv.StepY(uvstep, current_y);
RGBStepper lrgb = rgb;
if constexpr (shading_enable)
lrgb.StepY(rgbstep, current_y);
const TriangleVectors<shading_enable, texture_enable> tv(cmd, uvstep, rgbstep);
do
{
const s32 y = TruncateGPUVertexPosition(current_y);
if (y > static_cast<s32>(g_drawing_area.bottom))
{
break;
}
if (y >= static_cast<s32>(g_drawing_area.top) &&
(!cmd->params.interlaced_rendering || cmd->params.active_line_lsb != (static_cast<u32>(current_y) & 1u)))
{
DrawSpan<shading_enable, texture_enable, raw_texture_enable, transparency_enable>(
cmd, y & VRAM_HEIGHT_MASK, unfp_xy(left_x), unfp_xy(right_x), luv, uvstep, lrgb, rgbstep, tv);
}
current_y++;
left_x += left_x_step;
right_x += right_x_step;
if constexpr (texture_enable)
luv.StepY<false>(uvstep);
if constexpr (shading_enable)
lrgb.StepY<false>(rgbstep);
} while (current_y < end_y);
}
}
#endif // USE_VECTOR
template<bool shading_enable, bool texture_enable, bool raw_texture_enable, bool transparency_enable>
static void DrawTriangle(const GPUBackendDrawPolygonCommand* cmd, const GPUBackendDrawPolygonCommand::Vertex* v0,
const GPUBackendDrawPolygonCommand::Vertex* v1, const GPUBackendDrawPolygonCommand::Vertex* v2)
{
#ifdef CHECK_VECTOR
const GPUBackendDrawPolygonCommand::Vertex* orig_v0 = v0;
const GPUBackendDrawPolygonCommand::Vertex* orig_v1 = v1;
const GPUBackendDrawPolygonCommand::Vertex* orig_v2 = v2;
#endif
  // Sort vertices by Y so that v0 is the top-most vertex and v2 is the bottom-most, with v1 in between.
u32 tl = 0;
if (v1->x <= v0->x)
tl = (v2->x <= v1->x) ? 4 : 2;
else if (v2->x < v0->x)
tl = 4;
else
tl = 1;
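  // tl is a one-hot flag (1/2/4) for whichever of v0/v1/v2 is currently left-most; the bit shuffles in the swaps
  // below keep it pointing at the same vertex, and 'tl >> 1' afterwards turns it into the index of the top-left
  // vertex used as the interpolation origin.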
if (v2->y < v1->y)
{
std::swap(v2, v1);
tl = ((tl >> 1) & 0x2) | ((tl << 1) & 0x4) | (tl & 0x1);
}
if (v1->y < v0->y)
{
std::swap(v1, v0);
tl = ((tl >> 1) & 0x1) | ((tl << 1) & 0x2) | (tl & 0x4);
}
if (v2->y < v1->y)
{
std::swap(v2, v1);
tl = ((tl >> 1) & 0x2) | ((tl << 1) & 0x4) | (tl & 0x1);
}
const GPUBackendDrawPolygonCommand::Vertex* vertices[3] = {v0, v1, v2};
tl = tl >> 1;
// Invalid size early culling.
if (static_cast<u32>(std::abs(v2->x - v0->x)) >= MAX_PRIMITIVE_WIDTH ||
static_cast<u32>(std::abs(v2->x - v1->x)) >= MAX_PRIMITIVE_WIDTH ||
static_cast<u32>(std::abs(v1->x - v0->x)) >= MAX_PRIMITIVE_WIDTH ||
static_cast<u32>(v2->y - v0->y) >= MAX_PRIMITIVE_HEIGHT || v0->y == v2->y)
{
return;
}
// Same as line rasterization, use higher precision for position.
static constexpr auto makefp_xy = [](s32 x) { return (static_cast<s64>(x) << 32) + ((1LL << 32) - (1 << 11)); };
static constexpr auto makestep_xy = [](s32 dx, s32 dy) -> s64 {
return (((static_cast<s64>(dx) << 32) + ((dx < 0) ? -(dy - 1) : ((dx > 0) ? (dy - 1) : 0))) / dy);
};
const s64 base_coord = makefp_xy(v0->x);
const s64 base_step = makestep_xy(v2->x - v0->x, v2->y - v0->y);
const s64 bound_coord_us = (v1->y == v0->y) ? 0 : makestep_xy(v1->x - v0->x, v1->y - v0->y);
const s64 bound_coord_ls = (v2->y == v1->y) ? 0 : makestep_xy(v2->x - v1->x, v2->y - v1->y);
const u32 vo = (tl != 0) ? 1 : 0;
const u32 vp = (tl == 2) ? 3 : 0;
const bool right_facing = (v1->y == v0->y) ? (v1->x > v0->x) : (bound_coord_us > base_step);
const u32 rfi = BoolToUInt32(right_facing);
const u32 ofi = BoolToUInt32(!right_facing);
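  // The triangle is split at the middle vertex into two parts that share the long v0->v2 edge (base_step):
  // triparts[vo] covers the v0..v1 half and triparts[vo ^ 1] the v1..v2 half. rfi/ofi route the short edge and the
  // long edge into start_x/step_x[0] (left) or [1] (right) depending on which way the triangle faces, and vo/vp
  // select whether each half is filled top-down or bottom-up.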
TrianglePart triparts[2];
TrianglePart& tpo = triparts[vo];
TrianglePart& tpp = triparts[vo ^ 1];
tpo.start_y = vertices[0 ^ vo]->y;
tpo.end_y = vertices[1 ^ vo]->y;
tpp.start_y = vertices[1 ^ vp]->y;
tpp.end_y = vertices[2 ^ vp]->y;
tpo.start_x[rfi] = makefp_xy(vertices[0 ^ vo]->x);
tpo.step_x[rfi] = bound_coord_us;
tpo.start_x[ofi] = base_coord + ((vertices[vo]->y - vertices[0]->y) * base_step);
tpo.step_x[ofi] = base_step;
tpo.fill_upside_down = ConvertToBoolUnchecked(vo);
tpp.start_x[rfi] = makefp_xy(vertices[1 ^ vp]->x);
tpp.step_x[rfi] = bound_coord_ls;
tpp.start_x[ofi] = base_coord + ((vertices[1 ^ vp]->y - vertices[0]->y) * base_step);
tpp.step_x[ofi] = base_step;
tpp.fill_upside_down = (vp != 0);
#define ATTRIB_DETERMINANT(x, y) (((v1->x - v0->x) * (v2->y - v1->y)) - ((v2->x - v1->x) * (v1->y - v0->y)))
#define ATTRIB_STEP(x, y) (static_cast<u32>(ATTRIB_DETERMINANT(x, y) * (1 << ATTRIB_SHIFT) / det) << ATTRIB_POST_SHIFT)
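// ATTRIB_DETERMINANT(a, b) is the cross product of the two edge vectors in the (a, b) plane; with (x, y) it is
// twice the signed triangle area. ATTRIB_STEP then gives the plane-equation gradients by Cramer's rule:
// ATTRIB_STEP(u, y) is du/dx and ATTRIB_STEP(x, u) is du/dy, scaled into the 24-bit attribute fixed point
// (shifted by ATTRIB_SHIFT before the divide and ATTRIB_POST_SHIFT after).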
  // Reject degenerate (zero-area) triangles, which would also divide by zero in ATTRIB_STEP.
const s32 det = ATTRIB_DETERMINANT(x, y);
if (det == 0) [[unlikely]]
return;
// Compute step values.
UVSteps uvstep;
RGBSteps rgbstep;
if constexpr (texture_enable)
{
uvstep.dudx = ATTRIB_STEP(u, y);
uvstep.dvdx = ATTRIB_STEP(v, y);
uvstep.dudy = ATTRIB_STEP(x, u);
uvstep.dvdy = ATTRIB_STEP(x, v);
}
if constexpr (shading_enable)
{
rgbstep.drdx = ATTRIB_STEP(r, y);
rgbstep.dgdx = ATTRIB_STEP(g, y);
rgbstep.dbdx = ATTRIB_STEP(b, y);
rgbstep.drdy = ATTRIB_STEP(x, r);
rgbstep.dgdy = ATTRIB_STEP(x, g);
rgbstep.dbdy = ATTRIB_STEP(x, b);
}
#undef ATTRIB_STEP
#undef ATTRIB_DETERMINANT
  // Step the interpolants back by the top-left vertex position, so that adding the absolute X/Y offsets for each
  // line later starts from the correct per-vertex values.
UVStepper uv;
RGBStepper rgb;
const GPUBackendDrawPolygonCommand::Vertex* top_left_vertex = vertices[tl];
if constexpr (texture_enable)
{
uv.Init(top_left_vertex->u, top_left_vertex->v);
uv.StepX(uvstep, -top_left_vertex->x);
uv.StepY(uvstep, -top_left_vertex->y);
}
else
{
uv = {};
}
if constexpr (shading_enable)
{
rgb.Init(top_left_vertex->r, top_left_vertex->g, top_left_vertex->b);
rgb.StepX(rgbstep, -top_left_vertex->x);
rgb.StepY(rgbstep, -top_left_vertex->y);
}
else
{
rgb.Init(top_left_vertex->r, top_left_vertex->g, top_left_vertex->b);
}
#ifdef CHECK_VECTOR
BACKUP_VRAM();
#endif
for (u32 i = 0; i < 2; i++)
{
DrawTrianglePart<shading_enable, texture_enable, raw_texture_enable, transparency_enable>(cmd, triparts[i], uv,
uvstep, rgb, rgbstep);
}
#ifdef CHECK_VECTOR
CHECK_VRAM(
GPU_SW_Rasterizer::DrawTriangleFunctions[shading_enable][texture_enable][raw_texture_enable][transparency_enable](
cmd, orig_v0, orig_v1, orig_v2));
#endif
}
constinit const DrawRectangleFunctionTable DrawRectangleFunctions = {
{{&DrawRectangle<false, false, false>, &DrawRectangle<false, false, true>},
{&DrawRectangle<false, false, false>, &DrawRectangle<false, false, true>}},
{{&DrawRectangle<true, false, false>, &DrawRectangle<true, false, true>},
{&DrawRectangle<true, true, false>, &DrawRectangle<true, true, true>}}};
constinit const DrawLineFunctionTable DrawLineFunctions = {{&DrawLine<false, false>, &DrawLine<false, true>},
{&DrawLine<true, false>, &DrawLine<true, true>}};
constinit const DrawTriangleFunctionTable DrawTriangleFunctions = {
{{{&DrawTriangle<false, false, false, false>, &DrawTriangle<false, false, false, true>},
{&DrawTriangle<false, false, false, false>, &DrawTriangle<false, false, false, true>}},
{{&DrawTriangle<false, true, false, false>, &DrawTriangle<false, true, false, true>},
{&DrawTriangle<false, true, true, false>, &DrawTriangle<false, true, true, true>}}},
{{{&DrawTriangle<true, false, false, false>, &DrawTriangle<true, false, false, true>},
{&DrawTriangle<true, false, false, false>, &DrawTriangle<true, false, false, true>}},
{{&DrawTriangle<true, true, false, false>, &DrawTriangle<true, true, false, true>},
{&DrawTriangle<true, true, true, false>, &DrawTriangle<true, true, true, true>}}}};
static void FillVRAMImpl(u32 x, u32 y, u32 width, u32 height, u32 color, bool interlaced, u8 active_line_lsb)
{
#ifdef USE_VECTOR
const u16 color16 = VRAMRGBA8888ToRGBA5551(color);
const GSVector4i fill = GSVector4i(color16, color16, color16, color16, color16, color16, color16, color16);
constexpr u32 vector_width = 8;
const u32 aligned_width = Common::AlignDownPow2(width, vector_width);
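  // Rows are filled a vector (8 pixels) at a time up to aligned_width, with a scalar tail for the remainder; the
  // paths below where X wraps around fall back to per-pixel stores, and interlaced fills skip rows belonging to the
  // active field.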
if ((x + width) <= VRAM_WIDTH && !interlaced)
{
for (u32 yoffs = 0; yoffs < height; yoffs++)
{
const u32 row = (y + yoffs) % VRAM_HEIGHT;
u16* row_ptr = &g_vram[row * VRAM_WIDTH + x];
u32 xoffs = 0;
for (; xoffs < aligned_width; xoffs += vector_width, row_ptr += vector_width)
GSVector4i::store<false>(row_ptr, fill);
for (; xoffs < width; xoffs++)
*(row_ptr++) = color16;
}
}
else if (interlaced)
{
// Hardware tests show that fills seem to break on the first two lines when the offset matches the displayed field.
const u32 active_field = active_line_lsb;
if ((x + width) <= VRAM_WIDTH)
{
for (u32 yoffs = 0; yoffs < height; yoffs++)
{
const u32 row = (y + yoffs) % VRAM_HEIGHT;
if ((row & u32(1)) == active_field)
continue;
u16* row_ptr = &g_vram[row * VRAM_WIDTH + x];
u32 xoffs = 0;
for (; xoffs < aligned_width; xoffs += vector_width, row_ptr += vector_width)
GSVector4i::store<false>(row_ptr, fill);
for (; xoffs < width; xoffs++)
*(row_ptr++) = color16;
}
}
else
{
for (u32 yoffs = 0; yoffs < height; yoffs++)
{
const u32 row = (y + yoffs) % VRAM_HEIGHT;
if ((row & u32(1)) == active_field)
continue;
u16* row_ptr = &g_vram[row * VRAM_WIDTH];
for (u32 xoffs = 0; xoffs < width; xoffs++)
{
const u32 col = (x + xoffs) % VRAM_WIDTH;
row_ptr[col] = color16;
}
}
}
}
else
{
for (u32 yoffs = 0; yoffs < height; yoffs++)
{
const u32 row = (y + yoffs) % VRAM_HEIGHT;
u16* row_ptr = &g_vram[row * VRAM_WIDTH];
for (u32 xoffs = 0; xoffs < width; xoffs++)
{
const u32 col = (x + xoffs) % VRAM_WIDTH;
row_ptr[col] = color16;
}
}
}
#else
const u16 color16 = VRAMRGBA8888ToRGBA5551(color);
if ((x + width) <= VRAM_WIDTH && !interlaced)
{
for (u32 yoffs = 0; yoffs < height; yoffs++)
{
const u32 row = (y + yoffs) % VRAM_HEIGHT;
std::fill_n(&g_vram[row * VRAM_WIDTH + x], width, color16);
}
}
else if (interlaced)
{
// Hardware tests show that fills seem to break on the first two lines when the offset matches the displayed field.
const u32 active_field = active_line_lsb;
for (u32 yoffs = 0; yoffs < height; yoffs++)
{
const u32 row = (y + yoffs) % VRAM_HEIGHT;
if ((row & u32(1)) == active_field)
continue;
u16* row_ptr = &g_vram[row * VRAM_WIDTH];
for (u32 xoffs = 0; xoffs < width; xoffs++)
{
const u32 col = (x + xoffs) % VRAM_WIDTH;
row_ptr[col] = color16;
}
}
}
else
{
for (u32 yoffs = 0; yoffs < height; yoffs++)
{
const u32 row = (y + yoffs) % VRAM_HEIGHT;
u16* row_ptr = &g_vram[row * VRAM_WIDTH];
for (u32 xoffs = 0; xoffs < width; xoffs++)
{
const u32 col = (x + xoffs) % VRAM_WIDTH;
row_ptr[col] = color16;
}
}
}
#endif
}
static void WriteVRAMImpl(u32 x, u32 y, u32 width, u32 height, const void* data, bool set_mask, bool check_mask)
{
// TODO: Vector implementation
// Fast path when the copy is not oversized.
if ((x + width) <= VRAM_WIDTH && (y + height) <= VRAM_HEIGHT && !set_mask && !check_mask)
{
const u16* src_ptr = static_cast<const u16*>(data);
u16* dst_ptr = &g_vram[y * VRAM_WIDTH + x];
for (u32 yoffs = 0; yoffs < height; yoffs++)
{
std::copy_n(src_ptr, width, dst_ptr);
src_ptr += width;
dst_ptr += VRAM_WIDTH;
}
}
else
{
// Slow path when we need to handle wrap-around.
// During transfer/render operations, if ((dst_pixel & mask_and) == 0) { pixel = src_pixel | mask_or }
const u16* src_ptr = static_cast<const u16*>(data);
const u16 mask_and = check_mask ? 0x8000u : 0x0000u;
const u16 mask_or = set_mask ? 0x8000u : 0x0000u;
for (u32 row = 0; row < height;)
{
u16* dst_row_ptr = &g_vram[((y + row++) % VRAM_HEIGHT) * VRAM_WIDTH];
for (u32 col = 0; col < width;)
{
// TODO: Handle unaligned reads...
u16* pixel_ptr = &dst_row_ptr[(x + col++) % VRAM_WIDTH];
if (((*pixel_ptr) & mask_and) == 0)
*pixel_ptr = *(src_ptr++) | mask_or;
}
}
}
}
static void CopyVRAMImpl(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height, bool set_mask,
bool check_mask)
{
// TODO: Vector implementation.
// Break up oversized copies. This behavior has not been verified on console.
if ((src_x + width) > VRAM_WIDTH || (dst_x + width) > VRAM_WIDTH)
{
u32 remaining_rows = height;
u32 current_src_y = src_y;
u32 current_dst_y = dst_y;
while (remaining_rows > 0)
{
const u32 rows_to_copy =
std::min<u32>(remaining_rows, std::min<u32>(VRAM_HEIGHT - current_src_y, VRAM_HEIGHT - current_dst_y));
u32 remaining_columns = width;
u32 current_src_x = src_x;
u32 current_dst_x = dst_x;
while (remaining_columns > 0)
{
const u32 columns_to_copy =
std::min<u32>(remaining_columns, std::min<u32>(VRAM_WIDTH - current_src_x, VRAM_WIDTH - current_dst_x));
CopyVRAM(current_src_x, current_src_y, current_dst_x, current_dst_y, columns_to_copy, rows_to_copy, set_mask,
check_mask);
current_src_x = (current_src_x + columns_to_copy) % VRAM_WIDTH;
current_dst_x = (current_dst_x + columns_to_copy) % VRAM_WIDTH;
remaining_columns -= columns_to_copy;
}
current_src_y = (current_src_y + rows_to_copy) % VRAM_HEIGHT;
current_dst_y = (current_dst_y + rows_to_copy) % VRAM_HEIGHT;
remaining_rows -= rows_to_copy;
}
return;
}
// This doesn't have a fast path, but do we really need one? It's not common.
const u16 mask_and = check_mask ? 0x8000u : 0x0000u;
const u16 mask_or = set_mask ? 0x8000u : 0x0000u;
  // Copy in reverse when src_x < dst_x; this is verified on console.
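  // (Presumably so an overlapping copy within a row behaves like memmove: source pixels are read before the
  // destination writes catch up to them.)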
if (src_x < dst_x || ((src_x + width - 1) % VRAM_WIDTH) < ((dst_x + width - 1) % VRAM_WIDTH))
{
for (u32 row = 0; row < height; row++)
{
const u16* src_row_ptr = &g_vram[((src_y + row) % VRAM_HEIGHT) * VRAM_WIDTH];
u16* dst_row_ptr = &g_vram[((dst_y + row) % VRAM_HEIGHT) * VRAM_WIDTH];
for (s32 col = static_cast<s32>(width - 1); col >= 0; col--)
{
const u16 src_pixel = src_row_ptr[(src_x + static_cast<u32>(col)) % VRAM_WIDTH];
u16* dst_pixel_ptr = &dst_row_ptr[(dst_x + static_cast<u32>(col)) % VRAM_WIDTH];
*dst_pixel_ptr = ((*dst_pixel_ptr & mask_and) == 0) ? (src_pixel | mask_or) : *dst_pixel_ptr;
}
}
}
else
{
for (u32 row = 0; row < height; row++)
{
const u16* src_row_ptr = &g_vram[((src_y + row) % VRAM_HEIGHT) * VRAM_WIDTH];
u16* dst_row_ptr = &g_vram[((dst_y + row) % VRAM_HEIGHT) * VRAM_WIDTH];
for (u32 col = 0; col < width; col++)
{
const u16 src_pixel = src_row_ptr[(src_x + col) % VRAM_WIDTH];
u16* dst_pixel_ptr = &dst_row_ptr[(dst_x + col) % VRAM_WIDTH];
*dst_pixel_ptr = ((*dst_pixel_ptr & mask_and) == 0) ? (src_pixel | mask_or) : *dst_pixel_ptr;
}
}
}
}
#ifdef __INTELLISENSE__
}
#endif