duckstation/src/core/cpu_pgxp.cpp

// SPDX-FileCopyrightText: 2016 iCatButler, 2019-2024 Connor McLaughlin <stenzek@gmail.com>
// SPDX-License-Identifier: CC-BY-NC-ND-4.0
//
// This file has been completely rewritten over the years compared to the original PCSXR-PGXP release.
// No original code remains. The original copyright notice is included above for historical purposes.
//

#include "cpu_pgxp.h"
#include "bus.h"
#include "cpu_core.h"
#include "cpu_disasm.h"
#include "gpu_types.h"
#include "settings.h"

#include "util/gpu_device.h"

#include "common/assert.h"
#include "common/log.h"

#include <climits>
#include <cmath>

LOG_CHANNEL(CPU);

// #define LOG_VALUES 1
// #define LOG_LOOKUPS 1

// TODO: Don't update flags on Validate(), instead return it.

namespace CPU::PGXP {

// Sizes of the PGXP shadow storage, counted in PGXPValue entries.
enum : u32
{
  // Dimensions of the vertex cache, which is indexed by a packed screen-space X/Y pair.
  VERTEX_CACHE_WIDTH = 2048,
  VERTEX_CACHE_HEIGHT = 2048,
  VERTEX_CACHE_SIZE = VERTEX_CACHE_WIDTH * VERTEX_CACHE_HEIGHT,
  // One entry per 32-bit word of RAM, followed by one entry per 32-bit word of scratchpad.
  PGXP_MEM_SIZE = (static_cast<u32>(Bus::RAM_8MB_SIZE) + static_cast<u32>(CPU::SCRATCHPAD_SIZE)) / 4,
  // Index of the first scratchpad entry inside the shadow memory array.
  PGXP_MEM_SCRATCH_OFFSET = Bus::RAM_8MB_SIZE / 4,
};

// Bits stored in PGXPValue::flags, describing which components of a tracked value can be trusted.
enum : u32
{
  VALID_X = (1u << 0),
  VALID_Y = (1u << 1),
  VALID_Z = (1u << 2),
  VALID_LOWZ = (1u << 16),      // Valid Z from the low part of a 32-bit value.
  VALID_HIGHZ = (1u << 17),     // Valid Z from the high part of a 32-bit value.
  VALID_TAINTED_Z = (1u << 31), // X/Y has been changed, Z may not be accurate.

  // Convenience combinations of the per-component bits.
  VALID_XY = (VALID_X | VALID_Y),
  VALID_XYZ = (VALID_X | VALID_Y | VALID_Z),
  VALID_ALL = (VALID_X | VALID_Y | VALID_Z), // Currently the same bits as VALID_XYZ.
};

// Helpers for splitting a 32-bit value into its 16-bit halves, and for replacing one half.
// LOWORD_*/HIWORD_* extract bits 0-15 / 16-31 as unsigned (U16) or signed (S16) quantities.
#define LOWORD_U16(val) (static_cast<u16>(val))
#define HIWORD_U16(val) (static_cast<u16>(static_cast<u32>(val) >> 16))
#define LOWORD_S16(val) (static_cast<s16>(static_cast<u16>(val)))
#define HIWORD_S16(val) (static_cast<s16>(static_cast<u16>(static_cast<u32>(val) >> 16)))
// SET_LOWORD/SET_HIWORD return val with the chosen half replaced; the other half is preserved.
#define SET_LOWORD(val, loword) ((static_cast<u32>(val) & 0xFFFF0000u) | static_cast<u32>(static_cast<u16>(loword)))
#define SET_HIWORD(val, hiword) ((static_cast<u32>(val) & 0x0000FFFFu) | (static_cast<u32>(hiword) << 16))

// 16.16 fixed-point style helpers used when emulating 16-bit halves with floating point.
static double f16Sign(double val);
static double f16Unsign(double val);
static double f16Overflow(double val);

// Vertex cache, indexed by the packed 16-bit screen X/Y of a vertex.
static void CacheVertex(u32 value, const PGXPValue& vertex);
static PGXPValue* GetCachedVertex(u32 value);

// Helpers used when handing precise vertices to the renderer.
static float TruncateVertexPosition(float p);
static bool IsWithinTolerance(float precise_x, float precise_y, int int_x, int int_y);

// Accessors for the PGXP shadow copies of CPU general-purpose registers and the GTE SXY FIFO.
static PGXPValue& GetRdValue(Instruction instr);
static PGXPValue& GetRtValue(Instruction instr);
static PGXPValue& ValidateAndGetRtValue(Instruction instr, u32 rtVal);
static PGXPValue& ValidateAndGetRsValue(Instruction instr, u32 rsVal);
static void SetRtValue(Instruction instr, const PGXPValue& val);
static void SetRtValue(Instruction instr, const PGXPValue& val, u32 rtVal);
static PGXPValue& GetSXY0();
static PGXPValue& GetSXY1();
static PGXPValue& GetSXY2();
static PGXPValue& PushSXY();

// Shadow memory lookup and loads.
static PGXPValue* GetPtr(u32 addr);
static const PGXPValue& ValidateAndLoadMem(u32 addr, u32 value);
static void ValidateAndLoadMem16(PGXPValue& dest, u32 addr, u32 value, bool sign);

// Shared implementations behind several public instruction handlers.
static void CPU_MTC2(u32 reg, const PGXPValue& value, u32 val);
static void CPU_BITWISE(Instruction instr, u32 rdVal, u32 rsVal, u32 rtVal);
static void CPU_SLL(Instruction instr, u32 rtVal, u32 sh);
static void CPU_SRx(Instruction instr, u32 rtVal, u32 sh, bool sign, bool is_variable);

// Shadow memory stores.
static void WriteMem(u32 addr, const PGXPValue& value);
static void WriteMem16(u32 addr, const PGXPValue& value);

// Z propagation helpers.
static void CopyZIfMissing(PGXPValue& dst, const PGXPValue& src);
static void SelectZ(float& dst_z, u32& dst_flags, const PGXPValue& src1, const PGXPValue& src2);

// Optional per-instruction value tracing. When LOG_VALUES is defined, each macro writes the current
// instruction plus the named register/memory values to the log file; otherwise they expand to no-ops.
// The macros expect a variable named `instr` to be in scope at the point of use.
#ifdef LOG_VALUES
static void LogInstruction(u32 pc, Instruction instr);
static void LogValue(const char* name, u32 rval, const PGXPValue* val);
static void LogValueStr(SmallStringBase& str, const char* name, u32 rval, const PGXPValue* val);

// clang-format off
// NV: instruction only. 1: one explicitly named value. C1/C2: one/two CPU registers by number.
// LOAD: the shadow memory entry at addr. STORE: a CPU register plus the destination address.
#define LOG_VALUES_NV() do { LogInstruction(CPU::g_state.current_instruction_pc, instr); } while (0)
#define LOG_VALUES_1(name, rval, val) do { LogInstruction(CPU::g_state.current_instruction_pc, instr); LogValue(name, rval, val); } while (0)
#define LOG_VALUES_C1(rnum, rval) do { LogInstruction(CPU::g_state.current_instruction_pc,instr); LogValue(CPU::GetRegName(static_cast<CPU::Reg>(rnum)), rval, &g_state.pgxp_gpr[static_cast<u32>(rnum)]); } while(0)
#define LOG_VALUES_C2(r1num, r1val, r2num, r2val) do { LogInstruction(CPU::g_state.current_instruction_pc,instr); LogValue(CPU::GetRegName(static_cast<CPU::Reg>(r1num)), r1val, &g_state.pgxp_gpr[static_cast<u32>(r1num)]); LogValue(CPU::GetRegName(static_cast<CPU::Reg>(r2num)), r2val, &g_state.pgxp_gpr[static_cast<u32>(r2num)]); } while(0)
#define LOG_VALUES_LOAD(addr, val) do { LogInstruction(CPU::g_state.current_instruction_pc,instr); LogValue(TinyString::from_format("MEM[{:08X}]", addr).c_str(), val, GetPtr(addr)); } while(0)
#define LOG_VALUES_STORE(rnum, rval, addr) do { LOG_VALUES_C1(rnum, rval); std::fprintf(s_log, " addr=%08X", addr); } while(0)
#else
#define LOG_VALUES_NV() (void)0
#define LOG_VALUES_1(name, rval, val) (void)0
#define LOG_VALUES_C1(rnum, rval) (void)0
#define LOG_VALUES_C2(r1num, r1val, r2num, r2val) (void)0
#define LOG_VALUES_LOAD(addr, val) (void)0
#define LOG_VALUES_STORE(rnum, rval, addr) (void)0
#endif
// clang-format on

// Value-initialized entry (so no VALID_* flag is set); returned or stored when nothing precise is known.
static constexpr const PGXPValue INVALID_VALUE = {};

// Shadow copy of RAM + scratchpad, one entry per 32-bit word (PGXP_MEM_SIZE entries).
static PGXPValue* s_mem = nullptr;
// Optional cache of vertices keyed by packed screen coordinates (VERTEX_CACHE_SIZE entries).
static PGXPValue* s_vertex_cache = nullptr;

#ifdef LOG_VALUES
// Trace output file, opened lazily by LogInstruction().
static std::FILE* s_log;
#endif
} // namespace CPU::PGXP

/// Clears all PGXP register state and makes sure the shadow memory (and, when enabled, the vertex
/// cache) is allocated. Existing allocations are reused; the vertex cache contents are always zeroed.
void CPU::PGXP::Initialize()
{
  // No register carries precise data at startup.
  std::memset(g_state.pgxp_gte, 0, sizeof(g_state.pgxp_gte));
  std::memset(g_state.pgxp_cop0, 0, sizeof(g_state.pgxp_cop0));
  std::memset(g_state.pgxp_gpr, 0, sizeof(g_state.pgxp_gpr));

  // Shadow memory is mandatory, so failing to allocate it is fatal.
  if (s_mem == nullptr)
  {
    s_mem = static_cast<PGXPValue*>(std::calloc(PGXP_MEM_SIZE, sizeof(PGXPValue)));
    if (s_mem == nullptr)
      Panic("Failed to allocate PGXP memory");
  }

  // The vertex cache is optional: if it cannot be allocated, the feature is switched off instead.
  const bool need_cache_allocation = (g_settings.gpu_pgxp_vertex_cache && s_vertex_cache == nullptr);
  if (need_cache_allocation)
  {
    s_vertex_cache = static_cast<PGXPValue*>(std::calloc(VERTEX_CACHE_SIZE, sizeof(PGXPValue)));
    if (s_vertex_cache == nullptr)
    {
      ERROR_LOG("Failed to allocate memory for vertex cache, disabling.");
      g_settings.gpu_pgxp_vertex_cache = false;
    }
  }

  // A cache kept from an earlier initialization may hold stale vertices.
  if (s_vertex_cache != nullptr)
    std::memset(s_vertex_cache, 0, VERTEX_CACHE_SIZE * sizeof(PGXPValue));
}

/// Zeroes every piece of PGXP state without releasing any allocation.
void CPU::PGXP::Reset()
{
  std::memset(g_state.pgxp_gte, 0, sizeof(g_state.pgxp_gte));
  std::memset(g_state.pgxp_cop0, 0, sizeof(g_state.pgxp_cop0));
  std::memset(g_state.pgxp_gpr, 0, sizeof(g_state.pgxp_gpr));

  if (s_mem != nullptr)
    std::memset(s_mem, 0, PGXP_MEM_SIZE * sizeof(PGXPValue));

  // The cache is only cleared while the feature is enabled.
  const bool clear_cache = (g_settings.gpu_pgxp_vertex_cache && s_vertex_cache != nullptr);
  if (clear_cache)
    std::memset(s_vertex_cache, 0, VERTEX_CACHE_SIZE * sizeof(PGXPValue));
}

/// Releases the vertex cache and shadow memory, then clears the PGXP register state.
void CPU::PGXP::Shutdown()
{
  // std::free() accepts a null pointer, so no guard is required.
  std::free(s_vertex_cache);
  s_vertex_cache = nullptr;

  std::free(s_mem);
  s_mem = nullptr;

  std::memset(g_state.pgxp_gpr, 0, sizeof(g_state.pgxp_gpr));
  std::memset(g_state.pgxp_cop0, 0, sizeof(g_state.pgxp_cop0));
  std::memset(g_state.pgxp_gte, 0, sizeof(g_state.pgxp_gte));
}

/// Scales val by 65536, truncates to an integer, wraps that integer to 32 bits (signed) and scales
/// it back down. The result therefore keeps 16 fractional bits and a signed 16-bit integer part.
ALWAYS_INLINE_RELEASE double CPU::PGXP::f16Sign(double val)
{
  const double scale = static_cast<double>(USHRT_MAX + 1);
  const s64 fixed = static_cast<s64>(val * scale);
  const s32 wrapped = static_cast<s32>(fixed);
  return static_cast<double>(wrapped) / scale;
}

/// Maps a negative value into the unsigned 16-bit range by adding 65536; non-negative values pass
/// through unchanged.
ALWAYS_INLINE_RELEASE double CPU::PGXP::f16Unsign(double val)
{
  if (val >= 0)
    return val;

  return val + (USHRT_MAX + 1);
}

/// Returns the part of val that does not fit in 16 bits: val truncated to an integer, shifted right
/// by 16.
ALWAYS_INLINE_RELEASE double CPU::PGXP::f16Overflow(double val)
{
  const s64 whole = static_cast<s64>(val);
  const s64 carry = whole >> 16;
  return static_cast<double>(carry);
}

/// Returns the PGXP shadow of the instruction's rd register.
ALWAYS_INLINE CPU::PGXPValue& CPU::PGXP::GetRdValue(Instruction instr)
{
  const u8 rd_index = static_cast<u8>(instr.r.rd.GetValue());
  return g_state.pgxp_gpr[rd_index];
}

/// Returns the PGXP shadow of the instruction's rt register.
ALWAYS_INLINE CPU::PGXPValue& CPU::PGXP::GetRtValue(Instruction instr)
{
  const u8 rt_index = static_cast<u8>(instr.r.rt.GetValue());
  return g_state.pgxp_gpr[rt_index];
}

/// Validates the PGXP shadow of rt against the register's actual contents (rtVal) and returns it.
ALWAYS_INLINE CPU::PGXPValue& CPU::PGXP::ValidateAndGetRtValue(Instruction instr, u32 rtVal)
{
  const u8 rt_index = static_cast<u8>(instr.r.rt.GetValue());
  PGXPValue& shadow = g_state.pgxp_gpr[rt_index];
  shadow.Validate(rtVal);
  return shadow;
}

/// Validates the PGXP shadow of rs against the register's actual contents (rsVal) and returns it.
ALWAYS_INLINE CPU::PGXPValue& CPU::PGXP::ValidateAndGetRsValue(Instruction instr, u32 rsVal)
{
  const u8 rs_index = static_cast<u8>(instr.r.rs.GetValue());
  PGXPValue& shadow = g_state.pgxp_gpr[rs_index];
  shadow.Validate(rsVal);
  return shadow;
}

/// Overwrites the PGXP shadow of rt with a copy of val.
ALWAYS_INLINE void CPU::PGXP::SetRtValue(Instruction instr, const PGXPValue& val)
{
  const u8 rt_index = static_cast<u8>(instr.r.rt.GetValue());
  g_state.pgxp_gpr[rt_index] = val;
}

/// Overwrites the PGXP shadow of rt with a copy of val, then records rtVal as its integer value.
ALWAYS_INLINE void CPU::PGXP::SetRtValue(Instruction instr, const PGXPValue& val, u32 rtVal)
{
  const u8 rt_index = static_cast<u8>(instr.r.rt.GetValue());
  PGXPValue& target = g_state.pgxp_gpr[rt_index];
  target = val;
  target.value = rtVal;
}

/// Returns the PGXP shadow of GTE data register 12 (first SXY FIFO slot).
ALWAYS_INLINE CPU::PGXPValue& CPU::PGXP::GetSXY0()
{
  constexpr u32 sxy0_index = 12;
  return g_state.pgxp_gte[sxy0_index];
}

/// Returns the PGXP shadow of GTE data register 13 (second SXY FIFO slot).
ALWAYS_INLINE CPU::PGXPValue& CPU::PGXP::GetSXY1()
{
  constexpr u32 sxy1_index = 13;
  return g_state.pgxp_gte[sxy1_index];
}

/// Returns the PGXP shadow of GTE data register 14 (third SXY FIFO slot).
ALWAYS_INLINE CPU::PGXPValue& CPU::PGXP::GetSXY2()
{
  constexpr u32 sxy2_index = 14;
  return g_state.pgxp_gte[sxy2_index];
}

/// Advances the three-entry SXY FIFO (registers 12..14) by one slot and returns the last slot, which
/// the caller is expected to fill. The last slot still holds its previous contents on return.
ALWAYS_INLINE CPU::PGXPValue& CPU::PGXP::PushSXY()
{
  PGXPValue* const fifo = &g_state.pgxp_gte[12];
  fifo[0] = fifo[1];
  fifo[1] = fifo[2];
  return fifo[2];
}

/// Maps a CPU address to its shadow memory entry.
/// Scratchpad addresses map to the region starting at PGXP_MEM_SCRATCH_OFFSET, RAM (including
/// mirrors) maps to the start of the array, and anything else yields nullptr.
ALWAYS_INLINE_RELEASE CPU::PGXPValue* CPU::PGXP::GetPtr(u32 addr)
{
#if 0
  if ((addr & CPU::PHYSICAL_MEMORY_ADDRESS_MASK) >= 0x0017A2B4 &&
      (addr & CPU::PHYSICAL_MEMORY_ADDRESS_MASK) <= 0x0017A2B4)
    __debugbreak();
#endif

  const bool is_scratchpad = ((addr & SCRATCHPAD_ADDR_MASK) == SCRATCHPAD_ADDR);
  if (is_scratchpad)
  {
    const auto word_index = ((addr & SCRATCHPAD_OFFSET_MASK) >> 2);
    return &s_mem[PGXP_MEM_SCRATCH_OFFSET + word_index];
  }

  const u32 paddr = (addr & PHYSICAL_MEMORY_ADDRESS_MASK);
  if (paddr >= Bus::RAM_MIRROR_END)
    return nullptr;

  // Fold RAM mirrors onto the real RAM size before converting to a word index.
  return &s_mem[(paddr & Bus::g_ram_mask) >> 2];
}

/// Looks up the shadow entry for addr, validates it against the 32-bit value actually loaded, and
/// returns it. Addresses without shadow storage yield INVALID_VALUE.
ALWAYS_INLINE_RELEASE const CPU::PGXPValue& CPU::PGXP::ValidateAndLoadMem(u32 addr, u32 value)
{
  if (PGXPValue* const entry = GetPtr(addr)) [[likely]]
  {
    entry->Validate(value);
    return *entry;
  }

  return INVALID_VALUE;
}

/// Loads the 16-bit half of a shadow memory entry selected by addr into dest.
///
/// @param dest  Receives the loaded value; set to INVALID_VALUE when addr has no shadow storage.
/// @param addr  CPU address of the halfword; bit 1 selects the high (Y) or low (X) half of the entry.
/// @param value The integer value actually loaded; used for validation and stored in dest.value.
/// @param sign  True for a sign-extending load, false for a zero-extending load.
ALWAYS_INLINE_RELEASE void CPU::PGXP::ValidateAndLoadMem16(PGXPValue& dest, u32 addr, u32 value, bool sign)
{
  PGXPValue* pMem = GetPtr(addr);
  if (!pMem) [[unlikely]]
  {
    dest = INVALID_VALUE;
    return;
  }

  // determine if high or low word
  const bool hiword = ((addr & 2) != 0);

  // only validate the component we're interested in
  // (a mismatch clears the flag on the memory entry itself, not just on dest)
  pMem->flags = hiword ?
                  ((Truncate16(pMem->value >> 16) == Truncate16(value)) ? pMem->flags : (pMem->flags & ~VALID_Y)) :
                  ((Truncate16(pMem->value) == Truncate16(value)) ? pMem->flags : (pMem->flags & ~VALID_X));

  // copy whole value
  dest = *pMem;

  // if high word then shift
  // (the Y component and its validity bit take the place of X)
  if (hiword)
  {
    dest.x = dest.y;
    dest.flags = (dest.flags & ~VALID_X) | ((dest.flags & VALID_Y) >> 1);
  }

  // only set y as valid if x is also valid.. don't want to make fake values
  if (dest.flags & VALID_X)
  {
    // sign is promoted to 0 or 1: a negative x gives -1.0f for sign-extending loads,
    // and -0.0f for zero-extending loads.
    dest.y = (dest.x < 0) ? -1.0f * sign : 0.0f;
    dest.flags |= VALID_Y;
  }
  else
  {
    dest.y = 0.0f;
    dest.flags &= ~VALID_Y;
  }

  dest.value = value;
}

/// Stores a full 32-bit tracked value into shadow memory. Both halves of the entry are marked as
/// carrying the Z of the stored value. Addresses without shadow storage are ignored.
ALWAYS_INLINE_RELEASE void CPU::PGXP::WriteMem(u32 addr, const PGXPValue& value)
{
  PGXPValue* const slot = GetPtr(addr);
  if (slot) [[likely]]
  {
    *slot = value;
    slot->flags |= (VALID_LOWZ | VALID_HIGHZ);
  }
}

/// Stores the low (X) component of value into one 16-bit half of a shadow memory entry.
/// Bit 1 of addr selects the half: the high half updates y/VALID_Y, the low half updates x/VALID_X.
/// The other half of the entry is left untouched. Addresses without shadow storage are ignored.
ALWAYS_INLINE_RELEASE void CPU::PGXP::WriteMem16(u32 addr, const PGXPValue& value)
{
  PGXPValue* dest = GetPtr(addr);
  if (!dest) [[unlikely]]
    return;

  // determine if high or low word
  const bool hiword = ((addr & 2) != 0);
  if (hiword)
  {
    // source X becomes destination Y; VALID_X is shifted up into the VALID_Y bit position
    dest->y = value.x;
    dest->flags = (dest->flags & ~VALID_Y) | ((value.flags & VALID_X) << 1);
    dest->value = (dest->value & UINT32_C(0x0000FFFF)) | (value.value << 16);
  }
  else
  {
    dest->x = value.x;
    dest->flags = (dest->flags & ~VALID_X) | (value.flags & VALID_X);
    dest->value = (dest->value & UINT32_C(0xFFFF0000)) | (value.value & UINT32_C(0x0000FFFF));
  }

  // overwrite z/w if valid
  // TODO: Check modified
  if (value.flags & VALID_Z)
  {
    // remember which half supplied the Z
    dest->z = value.z;
    dest->flags |= VALID_Z | (hiword ? VALID_HIGHZ : VALID_LOWZ);
  }
  else
  {
    // this half no longer backs the Z; drop VALID_Z once neither half does
    dest->flags &= hiword ? ~VALID_HIGHZ : ~VALID_LOWZ;
    if (dest->flags & VALID_Z && !(dest->flags & (VALID_HIGHZ | VALID_LOWZ)))
      dest->flags &= ~VALID_Z;
  }
}

/// Gives dst the Z of src when dst has no valid Z of its own, and makes dst inherit src's VALID_Z bit.
ALWAYS_INLINE_RELEASE void CPU::PGXP::CopyZIfMissing(PGXPValue& dst, const PGXPValue& src)
{
  const bool dst_has_z = ((dst.flags & VALID_Z) != 0);
  if (!dst_has_z)
    dst.z = src.z;

  dst.flags |= (src.flags & VALID_Z);
}

/// Chooses which operand's Z to propagate into a result.
/// src2's Z is used when src1 has no valid Z, or when src1's Z is tainted while src2's Z is valid and
/// untainted; otherwise src1's Z is used. VALID_Z is added to dst_flags if either operand has it.
ALWAYS_INLINE_RELEASE void CPU::PGXP::SelectZ(float& dst_z, u32& dst_flags, const PGXPValue& src1,
                                              const PGXPValue& src2)
{
  const bool src1_has_z = ((src1.flags & VALID_Z) != 0);
  const bool src1_is_tainted = ((src1.flags & VALID_TAINTED_Z) != 0);
  const bool src2_is_precise = ((src2.flags & (VALID_Z | VALID_TAINTED_Z)) == VALID_Z);
  const bool use_src2 = (!src1_has_z || (src1_is_tainted && src2_is_precise));

  dst_z = use_src2 ? src2.z : src1.z;
  dst_flags |= ((src1.flags | src2.flags) & VALID_Z);
}

#ifdef LOG_VALUES
/// Starts a new trace line containing the program counter, raw instruction bits and disassembly.
/// The log file ("pgxp.log") is opened on first use. If it cannot be opened, nothing is written and
/// the open is retried on the next call.
void CPU::PGXP::LogInstruction(u32 pc, Instruction instr)
{
  if (!s_log) [[unlikely]]
  {
    s_log = std::fopen("pgxp.log", "wb");

    // Without this check a failed open would pass a null stream to std::fprintf() below.
    if (!s_log)
      return;
  }
  else
  {
    // Terminate the previous line; the flush keeps the file useful if the emulator crashes.
    std::fflush(s_log);
    std::fputc('\n', s_log);
  }

  SmallString str;
  DisassembleInstruction(&str, pc, instr.bits);
  std::fprintf(s_log, "%08X %08X %-20s", pc, instr.bits, str.c_str());
}

void CPU::PGXP::LogValue(const char* name, u32 rval, const PGXPValue* val)
{
  if (!s_log) [[unlikely]]
    return;

  SmallString str;
  LogValueStr(str, name, rval, val);
  std::fprintf(s_log, " %s", str.c_str());
}

/// Appends a textual description of a tracked value to str.
/// The output starts with "name=[RRRRRRRR" (the real register/memory value), followed by either
/// ", NULL]" when val is null, or the PGXP integer value (only when it differs from rval), the x/y/z
/// components, and the list of valid components.
void CPU::PGXP::LogValueStr(SmallStringBase& str, const char* name, u32 rval, const PGXPValue* val)
{
  str.append_format("{}=[{:08X}", name, rval);

  if (!val)
  {
    str.append(", NULL]");
    return;
  }

  // Only show the tracked integer value when it has diverged from the real one.
  const bool value_differs = (val->value != rval);
  if (value_differs)
    str.append_format(", PGXP{:08X}", val->value);

  str.append_format(", {{{},{},{}}}", val->x, val->y, val->z);

  const u32 flags = val->flags;
  if ((flags & VALID_ALL) != 0)
  {
    str.append(", valid=");
    if ((flags & VALID_X) != 0)
      str.append('X');
    if ((flags & VALID_Y) != 0)
      str.append('Y');
    if ((flags & VALID_Z) != 0)
      str.append('Z');
  }

  // if (val->flags & VALID_TAINTED_Z)
  // str.append(", tainted");

  str.append(']');
}

#endif

void CPU::PGXP::GTE_RTPS(float x, float y, float z, u32 value)
{
  PGXPValue& pvalue = PushSXY();
  pvalue.x = x;
  pvalue.y = y;
  pvalue.z = z;
  pvalue.value = value;
  pvalue.flags = VALID_ALL;

  if (g_settings.gpu_pgxp_vertex_cache)
    CacheVertex(value, pvalue);
}

/// Validates the three SXY FIFO entries against the integer register contents and reports whether all
/// of them still carry valid X, Y and Z.
bool CPU::PGXP::GTE_HasPreciseVertices(u32 sxy0, u32 sxy1, u32 sxy2)
{
  PGXPValue& v0 = GetSXY0();
  PGXPValue& v1 = GetSXY1();
  PGXPValue& v2 = GetSXY2();
  v0.Validate(sxy0);
  v1.Validate(sxy1);
  v2.Validate(sxy2);

  // Don't use accurate clipping for game-constructed values, which don't have a valid Z.
  const u32 shared_flags = (v0.flags & v1.flags & v2.flags);
  return ((shared_flags & VALID_XYZ) == VALID_XYZ);
}

float CPU::PGXP::GTE_NCLIP()
{
  const PGXPValue& SXY0 = GetSXY0();
  const PGXPValue& SXY1 = GetSXY1();
  const PGXPValue& SXY2 = GetSXY2();
  float nclip = ((SXY0.x * SXY1.y) + (SXY1.x * SXY2.y) + (SXY2.x * SXY0.y) - (SXY0.x * SXY2.y) - (SXY1.x * SXY0.y) -
                 (SXY2.x * SXY1.y));

  // ensure fractional values are not incorrectly rounded to 0
  const float nclip_abs = std::abs(nclip);
  if (0.1f < nclip_abs && nclip_abs < 1.0f)
    nclip += (nclip < 0.0f ? -1.0f : 1.0f);

  return nclip;
}

/// Writes a tracked value to a GTE data register shadow.
/// Register 15 pushes the SXY FIFO and stores value unmodified in the new last slot; registers 29
/// and 31 are read-only and ignored; all other registers receive a copy of value with val recorded
/// as the integer contents.
ALWAYS_INLINE_RELEASE void CPU::PGXP::CPU_MTC2(u32 reg, const PGXPValue& value, u32 val)
{
  // read-only registers
  if (reg == 29 || reg == 31)
    return;

  if (reg == 15)
  {
    // push FIFO
    PGXPValue& fifo_top = PushSXY();
    fifo_top = value;
    return;
  }

  PGXPValue& target = g_state.pgxp_gte[reg];
  target = value;
  target.value = val;
}

/// MFC2: copies a GTE data register shadow into the CPU rt register shadow.
/// rdVal is the integer value read from the GTE register.
void CPU::PGXP::CPU_MFC2(Instruction instr, u32 rdVal)
{
  // CPU[Rt] = GTE_D[Rd]
  const u32 idx = instr.cop.Cop2Index();
  PGXPValue& gte_shadow = g_state.pgxp_gte[idx];
  LOG_VALUES_1(CPU::GetGTERegisterName(idx), rdVal, &gte_shadow);

  gte_shadow.Validate(rdVal);
  SetRtValue(instr, gte_shadow, rdVal);
}

/// MTC2: copies the CPU rt register shadow into a GTE data register shadow.
/// rtVal is the integer value being written.
void CPU::PGXP::CPU_MTC2(Instruction instr, u32 rtVal)
{
  // GTE_D[Rd] = CPU[Rt]
  const u32 gte_index = instr.cop.Cop2Index();
  LOG_VALUES_C1(instr.r.rt.GetValue(), rtVal);

  PGXPValue& source = ValidateAndGetRtValue(instr, rtVal);
  CPU_MTC2(gte_index, source, rtVal);
}

/// LWC2: loads a shadow memory entry into a GTE data register shadow.
/// rtVal is the integer value that was loaded from addr.
void CPU::PGXP::CPU_LWC2(Instruction instr, u32 addr, u32 rtVal)
{
  // GTE_D[Rt] = Mem[addr]
  LOG_VALUES_LOAD(addr, rtVal);

  const u32 gte_index = static_cast<u32>(instr.r.rt.GetValue());
  const PGXPValue& loaded = ValidateAndLoadMem(addr, rtVal);
  CPU_MTC2(gte_index, loaded, rtVal);
}

/// SWC2: stores a GTE data register shadow into shadow memory.
/// rtVal is the integer value being stored to addr.
void CPU::PGXP::CPU_SWC2(Instruction instr, u32 addr, u32 rtVal)
{
  //  Mem[addr] = GTE_D[Rt]
  const u32 gte_index = static_cast<u32>(instr.r.rt.GetValue());
  PGXPValue& source = g_state.pgxp_gte[gte_index];
#ifdef LOG_VALUES
  LOG_VALUES_1(CPU::GetGTERegisterName(gte_index), rtVal, &source);
  std::fprintf(s_log, " addr=%08X", addr);
#endif

  // Drop stale precision before it gets written out.
  source.Validate(rtVal);
  WriteMem(addr, source);
}

/// Stores vertex in the vertex cache slot addressed by the packed screen coordinates in value
/// (X in the low 16 bits, Y in the high 16 bits, both signed).
/// Coordinates are expected to lie in [-1024, 1023]; this is only checked in debug builds.
ALWAYS_INLINE_RELEASE void CPU::PGXP::CacheVertex(u32 value, const PGXPValue& vertex)
{
  const s16 sx = static_cast<s16>(value & 0xFFFFu);
  const s16 sy = static_cast<s16>(value >> 16);
  DebugAssert(sx >= -1024 && sx <= 1023 && sy >= -1024 && sy <= 1023);

  // Bias by 1024 so the signed range maps onto [0, 2047] in each dimension.
  const u32 column = static_cast<u32>(sx + 1024);
  const u32 row = static_cast<u32>(sy + 1024);
  s_vertex_cache[row * VERTEX_CACHE_WIDTH + column] = vertex;
}

/// Returns the vertex cache slot addressed by the packed screen coordinates in value (X in the low
/// 16 bits, Y in the high 16 bits, both signed), or nullptr when either coordinate is outside
/// [-1024, 1023].
///
/// The accepted range matches what CacheVertex() stores. The upper Y bound was previously 1013,
/// which made vertices cached on rows 1014..1023 impossible to look up.
ALWAYS_INLINE_RELEASE CPU::PGXPValue* CPU::PGXP::GetCachedVertex(u32 value)
{
  const s16 sx = static_cast<s16>(value & 0xFFFFu);
  const s16 sy = static_cast<s16>(value >> 16);
  return (sx >= -1024 && sx <= 1023 && sy >= -1024 && sy <= 1023) ?
           &s_vertex_cache[(sy + 1024) * VERTEX_CACHE_WIDTH + (sx + 1024)] :
           nullptr;
}

/// Applies the GPU's integer coordinate truncation to the whole part of p while keeping its
/// fractional part.
ALWAYS_INLINE_RELEASE float CPU::PGXP::TruncateVertexPosition(float p)
{
  // Truncates positions to 11 bits before drawing.
  // Matches GPU command parsing, where the upper 5 bits are dropped.
  // Necessary for Jet Moto and Racingroovy VS.
  const s32 whole = static_cast<s32>(p);
  const float fraction = (p - static_cast<float>(whole));
  return static_cast<float>(TruncateGPUVertexPosition(whole)) + fraction;
}

/// Checks whether a precise position is close enough to the native integer position.
/// A negative configured tolerance disables the check, so every position is accepted.
ALWAYS_INLINE_RELEASE bool CPU::PGXP::IsWithinTolerance(float precise_x, float precise_y, int int_x, int int_y)
{
  const float tolerance = g_settings.gpu_pgxp_tolerance;
  if (tolerance < 0.0f)
    return true;

  const float error_x = std::abs(precise_x - static_cast<float>(int_x));
  const float error_y = std::abs(precise_y - static_cast<float>(int_y));
  const bool x_ok = (error_x <= tolerance);
  const bool y_ok = (error_y <= tolerance);
  return (x_ok && y_ok);
}

/// Looks up the precise position for a vertex about to be drawn.
///
/// @param addr   CPU address the vertex word was read from (used to find its shadow memory entry).
/// @param value  Packed integer vertex word; must match the shadow entry for it to be used.
/// @param x,y    Native integer position, used for the tolerance check and as the fallback.
/// @param xOffs,yOffs  Offsets added to the precise position.
/// @param out_x,out_y,out_w  Receive the chosen position and W; always written before returning.
/// @return True only when the shadow memory entry was used and it has a valid Z. The vertex cache and
///         native fallbacks always return false.
bool CPU::PGXP::GetPreciseVertex(u32 addr, u32 value, int x, int y, int xOffs, int yOffs, float* out_x, float* out_y,
                                 float* out_w)
{
  // First choice: the shadow of the memory word the vertex came from.
  const PGXPValue* vert = GetPtr(addr);
  if (vert && (vert->flags & VALID_XY) == VALID_XY && vert->value == value)
  {
    *out_x = TruncateVertexPosition(vert->x) + static_cast<float>(xOffs);
    *out_y = TruncateVertexPosition(vert->y) + static_cast<float>(yOffs);
    *out_w = vert->z / 32768.0f;

#ifdef LOG_LOOKUPS
    GL_INS_FMT("0x{:08X} {},{} => {},{} ({},{},{}) ({},{})", addr, x, y, *out_x, *out_y,
               TruncateVertexPosition(vert->x), TruncateVertexPosition(vert->y), vert->z, std::abs(*out_x - x),
               std::abs(*out_y - y));
#endif

    if (IsWithinTolerance(*out_x, *out_y, x, y))
    {
      // check validity of z component
      return ((vert->flags & VALID_Z) == VALID_Z);
    }
  }

  // Second choice: a vertex cached under the same packed screen coordinates.
  if (g_settings.gpu_pgxp_vertex_cache)
  {
    vert = GetCachedVertex(value);
    if (vert && (vert->flags & VALID_XY) == VALID_XY)
    {
      *out_x = TruncateVertexPosition(vert->x) + static_cast<float>(xOffs);
      *out_y = TruncateVertexPosition(vert->y) + static_cast<float>(yOffs);
      *out_w = vert->z / 32768.0f;

      // The cached position is kept, but false is returned even on a match.
      // NOTE(review): presumably because the cached W is not trusted - confirm.
      if (IsWithinTolerance(*out_x, *out_y, x, y))
        return false;
    }
  }

  // no valid value can be found anywhere, use the native PSX data
  *out_x = static_cast<float>(x);
  *out_y = static_cast<float>(y);
  *out_w = 1.0f;
  return false;
}

/// LW: copies the validated shadow memory entry at addr into the rt register shadow.
/// rtVal is the integer value that was loaded.
void CPU::PGXP::CPU_LW(Instruction instr, u32 addr, u32 rtVal)
{
  // Rt = Mem[Rs + Im]
  LOG_VALUES_LOAD(addr, rtVal);

  const PGXPValue& loaded = ValidateAndLoadMem(addr, rtVal);
  SetRtValue(instr, loaded);
}

/// LB/LBU: byte loads carry no precise data, so the rt register shadow is invalidated.
/// addr and rtVal are only used for tracing.
void CPU::PGXP::CPU_LBx(Instruction instr, u32 addr, u32 rtVal)
{
  LOG_VALUES_LOAD(addr, rtVal);

  const PGXPValue& no_precision = INVALID_VALUE;
  SetRtValue(instr, no_precision);
}

/// LH: loads one 16-bit half of a shadow memory entry into the rt register shadow, sign-extended.
void CPU::PGXP::CPU_LH(Instruction instr, u32 addr, u32 rtVal)
{
  // Rt = Mem[Rs + Im] (sign extended)
  LOG_VALUES_LOAD(addr, rtVal);

  constexpr bool sign_extend = true;
  PGXPValue& target = GetRtValue(instr);
  ValidateAndLoadMem16(target, addr, rtVal, sign_extend);
}

/// LHU: loads one 16-bit half of a shadow memory entry into the rt register shadow, zero-extended.
void CPU::PGXP::CPU_LHU(Instruction instr, u32 addr, u32 rtVal)
{
  // Rt = Mem[Rs + Im] (zero extended)
  LOG_VALUES_LOAD(addr, rtVal);

  constexpr bool sign_extend = false;
  PGXPValue& target = GetRtValue(instr);
  ValidateAndLoadMem16(target, addr, rtVal, sign_extend);
}

/// SB: a byte store destroys whatever precise value the containing word held, so the whole shadow
/// entry is replaced with an invalid value.
void CPU::PGXP::CPU_SB(Instruction instr, u32 addr, u32 rtVal)
{
  LOG_VALUES_STORE(instr.r.rt.GetValue(), rtVal, addr);

  const PGXPValue& no_precision = INVALID_VALUE;
  WriteMem(addr, no_precision);
}

/// SH: stores the validated rt register shadow into one 16-bit half of a shadow memory entry.
void CPU::PGXP::CPU_SH(Instruction instr, u32 addr, u32 rtVal)
{
  LOG_VALUES_STORE(instr.r.rt.GetValue(), rtVal, addr);

  PGXPValue& source = ValidateAndGetRtValue(instr, rtVal);
  WriteMem16(addr, source);
}

/// SW: stores the validated rt register shadow into the shadow memory entry at addr.
void CPU::PGXP::CPU_SW(Instruction instr, u32 addr, u32 rtVal)
{
  // Mem[Rs + Im] = Rt
  LOG_VALUES_STORE(instr.r.rt.GetValue(), rtVal, addr);

  PGXPValue& source = ValidateAndGetRtValue(instr, rtVal);
  WriteMem(addr, source);
}

/// Register move with both register numbers packed into one argument: the source register number is
/// in bits 0-7 and the destination register number is in the bits above.
void CPU::PGXP::CPU_MOVE_Packed(u32 rd_and_rs, u32 rsVal)
{
  const u32 dest_reg = (rd_and_rs >> 8);
  const u32 source_reg = (rd_and_rs & 0xFFu);
  CPU_MOVE(dest_reg, source_reg, rsVal);
}

/// Copies the shadow of register Rs to register Rd after validating it against rsVal, the integer
/// value being moved.
void CPU::PGXP::CPU_MOVE(u32 Rd, u32 Rs, u32 rsVal)
{
#ifdef LOG_VALUES
  // The logging macro needs an `instr` in scope; there is no real instruction here.
  const Instruction instr = {0};
  LOG_VALUES_C1(Rs, rsVal);
#endif

  PGXPValue& source = g_state.pgxp_gpr[Rs];
  source.Validate(rsVal);
  g_state.pgxp_gpr[Rd] = source;
}

/// ADDI/ADDIU: propagates precision through Rt = Rs + sign-extended immediate.
/// rsVal is the integer value of Rs before the add.
void CPU::PGXP::CPU_ADDI(Instruction instr, u32 rsVal)
{
  LOG_VALUES_C1(instr.i.rs.GetValue(), rsVal);

  // Rt = Rs + Imm (signed)
  PGXPValue& prsVal = ValidateAndGetRsValue(instr, rsVal);

  const u32 immVal = instr.i.imm_sext32();

  PGXPValue& prtVal = GetRtValue(instr);
  prtVal = prsVal;

  // Adding zero is a plain move; the copy above is the whole result.
  if (immVal == 0)
    return;

  // Adding to zero just loads the immediate, so the result is exactly the immediate's two halves.
  if (rsVal == 0)
  {
    // x is low precision value
    prtVal.x = static_cast<float>(LOWORD_S16(immVal));
    prtVal.y = static_cast<float>(HIWORD_S16(immVal));
    prtVal.flags |= VALID_X | VALID_Y | VALID_TAINTED_Z;
    prtVal.value = immVal;
    return;
  }

  // Add the low halves in the unsigned domain so a carry out of bit 15 can be detected.
  prtVal.x = static_cast<float>(f16Unsign(prtVal.x));
  prtVal.x += static_cast<float>(LOWORD_U16(immVal));

  // carry on over/underflow
  const float of = (prtVal.x > USHRT_MAX) ? 1.0f : (prtVal.x < 0.0f) ? -1.0f : 0.0f;
  prtVal.x = static_cast<float>(f16Sign(prtVal.x));
  prtVal.y += HIWORD_S16(immVal) + of;

  // truncate on overflow/underflow
  prtVal.y += (prtVal.y > SHRT_MAX) ? -(USHRT_MAX + 1) : (prtVal.y < SHRT_MIN) ? (USHRT_MAX + 1) : 0.0f;

  prtVal.value = rsVal + immVal;

  // X/Y changed, so any Z carried over from Rs may no longer correspond to them.
  prtVal.flags |= VALID_TAINTED_Z;
}

/// ANDI: propagates precision through Rt = Rs & zero-extended immediate.
/// rsVal is the integer value of Rs. The result never has bits in the upper half, so Y becomes 0.
void CPU::PGXP::CPU_ANDI(Instruction instr, u32 rsVal)
{
  LOG_VALUES_C1(instr.i.rs.GetValue(), rsVal);

  // Rt = Rs & Imm
  const u32 imm = instr.i.imm_zext32();
  const u32 rtVal = rsVal & imm;
  PGXPValue& prsVal = ValidateAndGetRsValue(instr, rsVal);
  PGXPValue& prtVal = GetRtValue(instr);

  // remove upper 16-bits
  // (fields are written individually; x is assigned per case below)
  prtVal.y = 0.0f;
  prtVal.z = prsVal.z;
  prtVal.value = rtVal;
  prtVal.flags = prsVal.flags | VALID_Y | VALID_TAINTED_Z;

  switch (imm)
  {
    case 0:
    {
      // if 0 then x == 0
      prtVal.x = 0.0f;
      prtVal.flags |= VALID_X;
    }
    break;

    case 0xFFFFu:
    {
      // if saturated then x == x
      // (the low half passes through untouched, so Rs's precise x and its validity are kept)
      prtVal.x = prsVal.x;
    }
    break;

    default:
    {
      // otherwise x is low precision value
      prtVal.x = static_cast<float>(LOWORD_S16(rtVal));
      prtVal.flags |= VALID_X;
    }
    break;
  }
}

/// ORI: propagates precision through Rt = Rs | zero-extended immediate.
/// A zero immediate is a plain move. Otherwise the low half is replaced by the integer result, while
/// Y (and any Z, now marked tainted) carries over from Rs.
void CPU::PGXP::CPU_ORI(Instruction instr, u32 rsVal)
{
  LOG_VALUES_C1(instr.i.rs.GetValue(), rsVal);

  // Rt = Rs | Imm
  const u32 imm = instr.i.imm_zext32();
  const u32 result = (rsVal | imm);

  PGXPValue& source = ValidateAndGetRsValue(instr, rsVal);
  PGXPValue& target = GetRtValue(instr);
  target = source;
  target.value = result;

  // if 0 then x == x, otherwise x is low precision value
  if (imm != 0) [[likely]]
  {
    target.x = static_cast<float>(LOWORD_S16(result));
    target.flags |= VALID_X | VALID_TAINTED_Z;
  }
}

/// XORI: propagates precision through Rt = Rs ^ zero-extended immediate.
/// A zero immediate is a plain move. Otherwise the low half is replaced by the integer result, while
/// Y (and any Z, now marked tainted) carries over from Rs.
void CPU::PGXP::CPU_XORI(Instruction instr, u32 rsVal)
{
  LOG_VALUES_C1(instr.i.rs.GetValue(), rsVal);

  // Rt = Rs ^ Imm
  const u32 imm = instr.i.imm_zext32();
  const u32 result = (rsVal ^ imm);

  PGXPValue& source = ValidateAndGetRsValue(instr, rsVal);
  PGXPValue& target = GetRtValue(instr);
  target = source;
  target.value = result;

  // if 0 then x == x, otherwise x is low precision value
  if (imm != 0) [[likely]]
  {
    target.x = static_cast<float>(LOWORD_S16(result));
    target.flags |= VALID_X | VALID_TAINTED_Z;
  }
}

/// SLTI: propagates precision through Rt = (Rs < sign-extended immediate), signed.
/// The precise X of the result is 1 or 0 based on comparing Rs's precise halves with the immediate;
/// the integer value is computed exactly from rsVal.
void CPU::PGXP::CPU_SLTI(Instruction instr, u32 rsVal)
{
  LOG_VALUES_C1(instr.i.rs.GetValue(), rsVal);

  // Rt = Rs < Imm (signed)
  const s32 imm = instr.i.imm_s16();
  PGXPValue& source = ValidateAndGetRsValue(instr, rsVal);

  // Split the immediate into the same low/high representation used for tracked values.
  const float imm_low = static_cast<float>(imm);
  const float imm_high = (imm_low < 0.0f) ? -1.0f : 0.0f;

  // Evaluate the comparison before touching the target; rt may be the same register as rs.
  PGXPValue& target = GetRtValue(instr);
  const bool precise_less = (source.GetValidY(rsVal) < imm_high || source.GetValidX(rsVal) < imm_low);
  target.x = precise_less ? 1.0f : 0.0f;
  target.y = 0.0f;
  target.z = source.z;
  target.flags = source.flags | VALID_X | VALID_Y | VALID_TAINTED_Z;
  target.value = BoolToUInt32(static_cast<s32>(rsVal) < imm);
}

/// SLTIU: propagates precision through Rt = (Rs < immediate), unsigned.
/// The precise X of the result is 1 or 0 based on comparing Rs's precise halves (mapped to the
/// unsigned range) with the immediate; the integer value is computed exactly from rsVal.
void CPU::PGXP::CPU_SLTIU(Instruction instr, u32 rsVal)
{
  LOG_VALUES_C1(instr.i.rs.GetValue(), rsVal);

  // Rt = Rs < Imm (Unsigned)
  const u32 imm = instr.i.imm_u16();
  PGXPValue& source = ValidateAndGetRsValue(instr, rsVal);

  const float imm_low = static_cast<float>(static_cast<s16>(imm)); // deliberately signed
  const float imm_high = (imm_low < 0.0f) ? -1.0f : 0.0f;

  // Evaluate the comparison before touching the target; rt may be the same register as rs.
  PGXPValue& target = GetRtValue(instr);
  const bool precise_less =
    (f16Unsign(source.GetValidY(rsVal)) < f16Unsign(imm_high) || f16Unsign(source.GetValidX(rsVal)) < imm_low);
  target.x = precise_less ? 1.0f : 0.0f;
  target.y = 0.0f;
  target.z = source.z;
  target.flags = source.flags | VALID_X | VALID_Y | VALID_TAINTED_Z;
  target.value = BoolToUInt32(rsVal < imm);
}

/// LUI: Rt receives the immediate in its upper half, so the result is known exactly: X is 0,
/// Y is the signed immediate, and no Z is attached.
void CPU::PGXP::CPU_LUI(Instruction instr)
{
  LOG_VALUES_NV();

  // Rt = Imm << 16
  PGXPValue& target = GetRtValue(instr);
  target.value = (instr.i.imm_zext32() << 16);
  target.flags = VALID_XY;
  target.x = 0.0f;
  target.y = static_cast<float>(instr.i.imm_s16());
  target.z = 0.0f;
}

/// ADD/ADDU: propagates precision through Rd = Rs + Rt.
/// rsVal and rtVal are the integer operand values. Adding zero is treated as a move of the other
/// operand; otherwise the low and high halves are added separately with carry between them.
void CPU::PGXP::CPU_ADD(Instruction instr, u32 rsVal, u32 rtVal)
{
  LOG_VALUES_C2(instr.r.rs.GetValue(), rsVal, instr.r.rt.GetValue(), rtVal);

  // Rd = Rs + Rt (signed)
  PGXPValue& prsVal = ValidateAndGetRsValue(instr, rsVal);
  PGXPValue& prtVal = ValidateAndGetRtValue(instr, rtVal);
  PGXPValue& prdVal = GetRdValue(instr);

  if (rtVal == 0)
  {
    // Rs + 0: move Rs, borrowing Rt's Z if Rs has none.
    prdVal = prsVal;
    CopyZIfMissing(prdVal, prtVal);
  }
  else if (rsVal == 0)
  {
    // 0 + Rt: move Rt, borrowing Rs's Z if Rt has none.
    prdVal = prtVal;
    CopyZIfMissing(prdVal, prsVal);
  }
  else
  {
    // Add the low halves in the unsigned domain so a carry out of bit 15 can be detected.
    const double x = f16Unsign(prsVal.GetValidX(rsVal)) + f16Unsign(prtVal.GetValidX(rtVal));

    // carry on over/underflow
    const float of = (x > USHRT_MAX) ? 1.0f : (x < 0.0f) ? -1.0f : 0.0f;
    prdVal.x = static_cast<float>(f16Sign(x));
    prdVal.y = prsVal.GetValidY(rsVal) + prtVal.GetValidY(rtVal) + of;

    // truncate on overflow/underflow
    prdVal.y += (prdVal.y > SHRT_MAX) ? -(USHRT_MAX + 1) : (prdVal.y < SHRT_MIN) ? (USHRT_MAX + 1) : 0.0f;

    prdVal.value = rsVal + rtVal;

    // valid x/y only if one side had a valid x/y
    prdVal.flags = prsVal.flags | (prtVal.flags & VALID_XY) | VALID_TAINTED_Z;

    SelectZ(prdVal.z, prdVal.flags, prsVal, prtVal);
  }
}

/// SUB/SUBU: propagates precision through Rd = Rs - Rt.
/// rsVal and rtVal are the integer operand values. Subtracting zero is treated as a move of Rs;
/// otherwise the low and high halves are subtracted separately with borrow between them.
void CPU::PGXP::CPU_SUB(Instruction instr, u32 rsVal, u32 rtVal)
{
  LOG_VALUES_C2(instr.r.rs.GetValue(), rsVal, instr.r.rt.GetValue(), rtVal);

  // Rd = Rs - Rt (signed)
  PGXPValue& prsVal = ValidateAndGetRsValue(instr, rsVal);
  PGXPValue& prtVal = ValidateAndGetRtValue(instr, rtVal);
  PGXPValue& prdVal = GetRdValue(instr);

  if (rtVal == 0)
  {
    // Rs - 0: move Rs, borrowing Rt's Z if Rs has none.
    prdVal = prsVal;
    CopyZIfMissing(prdVal, prtVal);
  }
  else
  {
    // Subtract the low halves in the unsigned domain so a borrow can be detected.
    const double x = f16Unsign(prsVal.GetValidX(rsVal)) - f16Unsign(prtVal.GetValidX(rtVal));

    // carry on over/underflow
    const float of = (x > USHRT_MAX) ? 1.0f : (x < 0.0f) ? -1.0f : 0.0f;
    prdVal.x = static_cast<float>(f16Sign(x));
    prdVal.y = prsVal.GetValidY(rsVal) - (prtVal.GetValidY(rtVal) - of);

    // truncate on overflow/underflow
    prdVal.y += (prdVal.y > SHRT_MAX) ? -(USHRT_MAX + 1) : (prdVal.y < SHRT_MIN) ? (USHRT_MAX + 1) : 0.0f;

    prdVal.value = rsVal - rtVal;

    // valid x/y only if one side had a valid x/y
    prdVal.flags = prsVal.flags | (prtVal.flags & VALID_XY) | VALID_TAINTED_Z;

    SelectZ(prdVal.z, prdVal.flags, prsVal, prtVal);
  }
}

/// Shared precision propagation for the register-register bitwise instructions.
/// The callers visible in this file are AND, OR, XOR and NOR.
///
/// @param rdVal  Integer result already computed by the caller.
/// @param rsVal  Integer value of Rs.
/// @param rtVal  Integer value of Rt.
///
/// Each 16-bit half of the result is handled independently: if it is zero the component is 0, if it
/// equals the same half of one operand that operand's precise component is reused, and otherwise the
/// integer half is used as a low-precision value.
ALWAYS_INLINE_RELEASE void CPU::PGXP::CPU_BITWISE(Instruction instr, u32 rdVal, u32 rsVal, u32 rtVal)
{
  // Rd = Rs <op> Rt
  PGXPValue& prsVal = ValidateAndGetRsValue(instr, rsVal);
  PGXPValue& prtVal = ValidateAndGetRtValue(instr, rtVal);

  float x, y;
  // Low half -> X.
  if (LOWORD_U16(rdVal) == 0)
    x = 0.0f;
  else if (LOWORD_U16(rdVal) == LOWORD_U16(rsVal))
    x = prsVal.GetValidX(rsVal);
  else if (LOWORD_U16(rdVal) == LOWORD_U16(rtVal))
    x = prtVal.GetValidX(rtVal);
  else
    x = static_cast<float>(LOWORD_S16(rdVal));

  // High half -> Y.
  if (HIWORD_U16(rdVal) == 0)
    y = 0.0f;
  else if (HIWORD_U16(rdVal) == HIWORD_U16(rsVal))
    y = prsVal.GetValidY(rsVal);
  else if (HIWORD_U16(rdVal) == HIWORD_U16(rtVal))
    y = prtVal.GetValidY(rtVal);
  else
    y = static_cast<float>(HIWORD_S16(rdVal));

  // Why not write directly to prdVal? Because it might be the same as the source.
  // The result is only flagged valid if at least one operand had a valid X or Y.
  u32 flags = ((prsVal.flags | prtVal.flags) & VALID_XY) ? (VALID_XY | VALID_TAINTED_Z) : 0;
  PGXPValue& prdVal = GetRdValue(instr);
  SelectZ(prdVal.z, flags, prsVal, prtVal);
  prdVal.x = x;
  prdVal.y = y;
  prdVal.flags = flags;
  prdVal.value = rdVal;
}

/// AND: computes the integer result and hands it to the shared bitwise propagation helper.
void CPU::PGXP::CPU_AND_(Instruction instr, u32 rsVal, u32 rtVal)
{
  LOG_VALUES_C2(instr.r.rs.GetValue(), rsVal, instr.r.rt.GetValue(), rtVal);

  // Rd = Rs & Rt
  CPU_BITWISE(instr, (rsVal & rtVal), rsVal, rtVal);
}

/// OR: computes the integer result and hands it to the shared bitwise propagation helper.
void CPU::PGXP::CPU_OR_(Instruction instr, u32 rsVal, u32 rtVal)
{
  LOG_VALUES_C2(instr.r.rs.GetValue(), rsVal, instr.r.rt.GetValue(), rtVal);

  // Rd = Rs | Rt
  CPU_BITWISE(instr, (rsVal | rtVal), rsVal, rtVal);
}

/// XOR: computes the integer result and hands it to the shared bitwise propagation helper.
void CPU::PGXP::CPU_XOR_(Instruction instr, u32 rsVal, u32 rtVal)
{
  LOG_VALUES_C2(instr.r.rs.GetValue(), rsVal, instr.r.rt.GetValue(), rtVal);

  // Rd = Rs ^ Rt
  CPU_BITWISE(instr, (rsVal ^ rtVal), rsVal, rtVal);
}

/// NOR: computes the integer result and hands it to the shared bitwise propagation helper.
void CPU::PGXP::CPU_NOR(Instruction instr, u32 rsVal, u32 rtVal)
{
  LOG_VALUES_C2(instr.r.rs.GetValue(), rsVal, instr.r.rt.GetValue(), rtVal);

  // Rd = Rs NOR Rt
  CPU_BITWISE(instr, ~(rsVal | rtVal), rsVal, rtVal);
}

/// SLT: propagates precision through Rd = (Rs < Rt), signed.
/// The precise X of the result is 1 or 0 based on comparing the operands' precise halves; the integer
/// value is computed exactly from rsVal and rtVal. Z and flags are inherited from Rs.
void CPU::PGXP::CPU_SLT(Instruction instr, u32 rsVal, u32 rtVal)
{
  LOG_VALUES_C2(instr.r.rs.GetValue(), rsVal, instr.r.rt.GetValue(), rtVal);

  // Rd = Rs < Rt (signed)
  PGXPValue& lhs = ValidateAndGetRsValue(instr, rsVal);
  PGXPValue& rhs = ValidateAndGetRtValue(instr, rtVal);
  PGXPValue& result = GetRdValue(instr);

  // Evaluate the comparison before touching the result; rd may be the same register as an operand.
  const bool precise_less = (lhs.GetValidY(rsVal) < rhs.GetValidY(rtVal) ||
                             f16Unsign(lhs.GetValidX(rsVal)) < f16Unsign(rhs.GetValidX(rtVal)));
  result.x = precise_less ? 1.0f : 0.0f;
  result.y = 0.0f;
  result.z = lhs.z;
  result.flags = lhs.flags | VALID_TAINTED_Z | VALID_X | VALID_Y;
  result.value = BoolToUInt32(static_cast<s32>(rsVal) < static_cast<s32>(rtVal));
}

void CPU::PGXP::CPU_SLTU(Instruction instr, u32 rsVal, u32 rtVal)
{
  LOG_VALUES_C2(instr.r.rs.GetValue(), rsVal, instr.r.rt.GetValue(), rtVal);

  // Set-on-less-than, unsigned: Rd = (Rs < Rt) ? 1 : 0
  PGXPValue& prsVal = ValidateAndGetRsValue(instr, rsVal);
  PGXPValue& prtVal = ValidateAndGetRtValue(instr, rtVal);
  PGXPValue& prdVal = GetRdValue(instr);

  // Work out the high-precision answer before touching Rd, since Rd may be the same register as Rs or Rt.
  // Both halves go through f16Unsign() here, and the low halves are only consulted when the high-half
  // test did not already succeed.
  // NOTE(review): the two halves are tested independently and OR-ed together, which is not a
  // lexicographic compare - confirm this is intentional.
  bool is_less = f16Unsign(prsVal.GetValidY(rsVal)) < f16Unsign(prtVal.GetValidY(rtVal));
  if (!is_less)
    is_less = f16Unsign(prsVal.GetValidX(rsVal)) < f16Unsign(prtVal.GetValidX(rtVal));

  prdVal.x = is_less ? 1.0f : 0.0f;
  prdVal.y = 0.0f;

  // Z and the existing flags come from Rs; X/Y are always marked valid and Z is marked tainted.
  prdVal.z = prsVal.z;
  prdVal.flags = prsVal.flags | (VALID_XY | VALID_TAINTED_Z);

  // The integer result that the emulated CPU sees.
  prdVal.value = BoolToUInt32(rsVal < rtVal);
}

void CPU::PGXP::CPU_MULT(Instruction instr, u32 rsVal, u32 rtVal)
{
  LOG_VALUES_C2(instr.r.rs.GetValue(), rsVal, instr.r.rt.GetValue(), rtVal);

  // Signed multiply: the 64-bit product of Rs and Rt is split across Hi (upper word) and Lo (lower word).
  PGXPValue& prsVal = ValidateAndGetRsValue(instr, rsVal);
  PGXPValue& prtVal = ValidateAndGetRtValue(instr, rtVal);

  // Both result registers start out as a copy of Rs, so they share its Z and flags.
  // NOTE(review): Lo has just been assigned from Rs when CopyZIfMissing() is handed Rs again - confirm
  // whether Rt was the intended fallback source for Z.
  PGXPValue& ploVal = g_state.pgxp_gpr[static_cast<u8>(Reg::lo)];
  PGXPValue& phiVal = g_state.pgxp_gpr[static_cast<u8>(Reg::hi)];
  ploVal = prsVal;
  CopyZIfMissing(ploVal, prsVal);
  phiVal = ploVal;

  // Fetch the 16-bit halves of each operand (X = low half, Y = high half).
  const float rs_low = prsVal.GetValidX(rsVal);
  const float rs_high = prsVal.GetValidY(rsVal);
  const float rt_low = prtVal.GetValidX(rtVal);
  const float rt_high = prtVal.GetValidY(rtVal);

  // Four partial products. The low halves are passed through f16Unsign(), the high halves are used as-is.
  const double low_low = f16Unsign(rs_low) * f16Unsign(rt_low);
  const double low_high = f16Unsign(rs_low) * (rt_high);
  const double high_low = rs_high * f16Unsign(rt_low);
  const double high_high = rs_high * rt_high;

  // Distribute the partial products over the four 16-bit result slots, carrying the overflow of each
  // slot into the next one up.
  const double lo_x = low_low;
  const double lo_y = f16Overflow(low_low) + (low_high + high_low);
  const double hi_x = f16Overflow(lo_y) + high_high;
  const double hi_y = f16Overflow(hi_x);

  // Z is tainted, and any X/Y validity Rt has is merged into what was copied from Rs.
  const u32 merged_flags = VALID_TAINTED_Z | (prtVal.flags & VALID_XY);

  ploVal.x = static_cast<float>(f16Sign(lo_x));
  ploVal.y = static_cast<float>(f16Sign(lo_y));
  ploVal.flags |= merged_flags;

  phiVal.x = static_cast<float>(f16Sign(hi_x));
  phiVal.y = static_cast<float>(f16Sign(hi_y));
  phiVal.flags |= merged_flags;

  // Integer result that the emulated CPU sees.
  const u64 product = static_cast<u64>(static_cast<s64>(SignExtend64(rsVal)) * static_cast<s64>(SignExtend64(rtVal)));
  ploVal.value = Truncate32(product);
  phiVal.value = Truncate32(product >> 32);
}

void CPU::PGXP::CPU_MULTU(Instruction instr, u32 rsVal, u32 rtVal)
{
  LOG_VALUES_C2(instr.r.rs.GetValue(), rsVal, instr.r.rt.GetValue(), rtVal);

  // Unsigned multiply: the 64-bit product of Rs and Rt is split across Hi (upper word) and Lo (lower word).
  PGXPValue& prsVal = ValidateAndGetRsValue(instr, rsVal);
  PGXPValue& prtVal = ValidateAndGetRtValue(instr, rtVal);

  // Both result registers start out as a copy of Rs, so they share its Z and flags.
  // NOTE(review): Lo has just been assigned from Rs when CopyZIfMissing() is handed Rs again - confirm
  // whether Rt was the intended fallback source for Z.
  PGXPValue& ploVal = g_state.pgxp_gpr[static_cast<u8>(Reg::lo)];
  PGXPValue& phiVal = g_state.pgxp_gpr[static_cast<u8>(Reg::hi)];
  ploVal = prsVal;
  CopyZIfMissing(ploVal, prsVal);
  phiVal = ploVal;

  // Fetch the 16-bit halves of each operand (X = low half, Y = high half).
  const float rs_low = prsVal.GetValidX(rsVal);
  const float rs_high = prsVal.GetValidY(rsVal);
  const float rt_low = prtVal.GetValidX(rtVal);
  const float rt_high = prtVal.GetValidY(rtVal);

  // Four partial products. Unlike the signed variant, every half goes through f16Unsign().
  const double low_low = f16Unsign(rs_low) * f16Unsign(rt_low);
  const double low_high = f16Unsign(rs_low) * f16Unsign(rt_high);
  const double high_low = f16Unsign(rs_high) * f16Unsign(rt_low);
  const double high_high = f16Unsign(rs_high) * f16Unsign(rt_high);

  // Distribute the partial products over the four 16-bit result slots, carrying the overflow of each
  // slot into the next one up.
  const double lo_x = low_low;
  const double lo_y = f16Overflow(low_low) + (low_high + high_low);
  const double hi_x = f16Overflow(lo_y) + high_high;
  const double hi_y = f16Overflow(hi_x);

  // Z is tainted, and any X/Y validity Rt has is merged into what was copied from Rs.
  const u32 merged_flags = VALID_TAINTED_Z | (prtVal.flags & VALID_XY);

  ploVal.x = static_cast<float>(f16Sign(lo_x));
  ploVal.y = static_cast<float>(f16Sign(lo_y));
  ploVal.flags |= merged_flags;

  phiVal.x = static_cast<float>(f16Sign(hi_x));
  phiVal.y = static_cast<float>(f16Sign(hi_y));
  phiVal.flags |= merged_flags;

  // Integer result that the emulated CPU sees.
  const u64 product = ZeroExtend64(rsVal) * ZeroExtend64(rtVal);
  ploVal.value = Truncate32(product);
  phiVal.value = Truncate32(product >> 32);
}

void CPU::PGXP::CPU_DIV(Instruction instr, u32 rsVal, u32 rtVal)
{
  LOG_VALUES_C2(instr.r.rs.GetValue(), rsVal, instr.r.rt.GetValue(), rtVal);

  // Signed divide: Lo receives the quotient Rs / Rt, Hi receives the remainder Rs % Rt.
  PGXPValue& prsVal = ValidateAndGetRsValue(instr, rsVal);
  PGXPValue& prtVal = ValidateAndGetRtValue(instr, rtVal);

  // Both result registers start out as a copy of Rs, so they share its Z and flags.
  // NOTE(review): Lo has just been assigned from Rs when CopyZIfMissing() is handed Rs again - confirm
  // whether Rt was the intended fallback source for Z.
  PGXPValue& ploVal = g_state.pgxp_gpr[static_cast<u8>(Reg::lo)];
  PGXPValue& phiVal = g_state.pgxp_gpr[static_cast<u8>(Reg::hi)];
  ploVal = prsVal;
  CopyZIfMissing(ploVal, prsVal);
  phiVal = ploVal;

  // Recombine each operand into a single number: the low half goes through f16Unsign(), the high half
  // is used as-is and weighted by 2^16.
  const double high_scale = static_cast<double>(1 << 16);
  const double dividend = f16Unsign(prsVal.GetValidX(rsVal)) + prsVal.GetValidY(rsVal) * high_scale;
  const double divisor = f16Unsign(prtVal.GetValidX(rtVal)) + prtVal.GetValidY(rtVal) * high_scale;

  // Z is tainted, and any X/Y validity Rt has is merged into what was copied from Rs.
  const u32 merged_flags = VALID_TAINTED_Z | (prtVal.flags & VALID_XY);

  // Quotient -> Lo, split back into its two halves.
  const double quotient = dividend / divisor;
  ploVal.x = static_cast<float>(f16Sign(quotient));
  ploVal.y = static_cast<float>(f16Sign(f16Overflow(quotient)));
  ploVal.flags |= merged_flags;

  // Remainder -> Hi, split back into its two halves.
  const double remainder = std::fmod(dividend, divisor);
  phiVal.x = static_cast<float>(f16Sign(remainder));
  phiVal.y = static_cast<float>(f16Sign(f16Overflow(remainder)));
  phiVal.flags |= merged_flags;

  // Integer result that the emulated CPU sees, including the two cases a host division cannot express.
  const s32 rs_signed = static_cast<s32>(rsVal);
  const s32 rt_signed = static_cast<s32>(rtVal);
  if (rt_signed == 0)
  {
    // Division by zero: the quotient depends on the dividend's sign, the remainder is the dividend.
    ploVal.value = (rs_signed >= 0) ? UINT32_C(0xFFFFFFFF) : UINT32_C(1);
    phiVal.value = static_cast<u32>(rs_signed);
  }
  else if (rsVal == UINT32_C(0x80000000) && rt_signed == -1)
  {
    // 0x80000000 / -1 does not fit in 32 bits.
    ploVal.value = UINT32_C(0x80000000);
    phiVal.value = 0;
  }
  else
  {
    ploVal.value = static_cast<u32>(rs_signed / rt_signed);
    phiVal.value = static_cast<u32>(rs_signed % rt_signed);
  }
}

void CPU::PGXP::CPU_DIVU(Instruction instr, u32 rsVal, u32 rtVal)
{
  LOG_VALUES_C2(instr.r.rs.GetValue(), rsVal, instr.r.rt.GetValue(), rtVal);

  // Unsigned divide: Lo receives the quotient Rs / Rt, Hi receives the remainder Rs % Rt.
  PGXPValue& prsVal = ValidateAndGetRsValue(instr, rsVal);
  PGXPValue& prtVal = ValidateAndGetRtValue(instr, rtVal);

  // Both result registers start out as a copy of Rs, so they share its Z and flags.
  // NOTE(review): Lo has just been assigned from Rs when CopyZIfMissing() is handed Rs again - confirm
  // whether Rt was the intended fallback source for Z.
  PGXPValue& ploVal = g_state.pgxp_gpr[static_cast<u8>(Reg::lo)];
  PGXPValue& phiVal = g_state.pgxp_gpr[static_cast<u8>(Reg::hi)];
  ploVal = prsVal;
  CopyZIfMissing(ploVal, prsVal);
  phiVal = ploVal;

  // Recombine each operand into a single number. Both halves go through f16Unsign(), and the high half
  // is weighted by 2^16.
  const double high_scale = static_cast<double>(1 << 16);
  const double dividend = f16Unsign(prsVal.GetValidX(rsVal)) + f16Unsign(prsVal.GetValidY(rsVal)) * high_scale;
  const double divisor = f16Unsign(prtVal.GetValidX(rtVal)) + f16Unsign(prtVal.GetValidY(rtVal)) * high_scale;

  // Z is tainted, and any X/Y validity Rt has is merged into what was copied from Rs.
  const u32 merged_flags = VALID_TAINTED_Z | (prtVal.flags & VALID_XY);

  // Quotient -> Lo, split back into its two halves.
  const double quotient = dividend / divisor;
  ploVal.x = static_cast<float>(f16Sign(quotient));
  ploVal.y = static_cast<float>(f16Sign(f16Overflow(quotient)));
  ploVal.flags |= merged_flags;

  // Remainder -> Hi, split back into its two halves.
  const double remainder = std::fmod(dividend, divisor);
  phiVal.x = static_cast<float>(f16Sign(remainder));
  phiVal.y = static_cast<float>(f16Sign(f16Overflow(remainder)));
  phiVal.flags |= merged_flags;

  // Integer result that the emulated CPU sees.
  if (rtVal != 0)
  {
    ploVal.value = rsVal / rtVal;
    phiVal.value = rsVal % rtVal;
  }
  else
  {
    // Division by zero: the quotient is all ones, the remainder is the dividend.
    ploVal.value = UINT32_C(0xFFFFFFFF);
    phiVal.value = rsVal;
  }
}

ALWAYS_INLINE_RELEASE void CPU::PGXP::CPU_SLL(Instruction instr, u32 rtVal, u32 sh)
{
  // Common implementation of the left shifts: Rd = Rt << sh.
  const u32 rdVal = rtVal << sh;
  PGXPValue& prtVal = ValidateAndGetRtValue(instr, rtVal);
  PGXPValue& prdVal = GetRdValue(instr);
  prdVal.z = prtVal.z;
  prdVal.value = rdVal;

  if (sh < 16)
  {
    // Both halves are scaled by 2^sh, and whatever overflows out of the low half is carried into the
    // high half. The products are computed before Rd is written, as Rd may be the same register as Rt.
    const double scale = static_cast<double>(1 << sh);
    const double shifted_x = f16Unsign(prtVal.x) * scale;
    const double shifted_y = (f16Unsign(prtVal.y) * scale) + f16Overflow(shifted_x);
    prdVal.x = static_cast<float>(f16Sign(shifted_x));
    prdVal.y = static_cast<float>(f16Sign(shifted_y));
    prdVal.flags = (prtVal.flags | VALID_TAINTED_Z);
  }
  else if (sh < 32)
  {
    // The low half moves (entirely or partially) into the high half, and the low half becomes zero.
    //
    // Only set valid X if there's also a valid Y. We could use GetValidX() to pull it from the low
    // precision value instead, need to investigate further. Spyro breaks if only X is set even if Y is
    // not valid. The alternative that was tried:
    // prdVal.flags = (prtVal.flags & ~VALID_Y) | ((prtVal.flags & VALID_X) << 1) | VALID_X | VALID_TAINTED_Z;
    const u32 shifted_flags = (prtVal.flags | VALID_TAINTED_Z) | ((prtVal.flags & VALID_Y) >> 1);

    // Y has to be written before X is cleared, because Rd may be the same register as Rt.
    if (sh == 16)
      prdVal.y = prtVal.x;
    else
      prdVal.y = static_cast<float>(f16Sign(f16Unsign(prtVal.x * static_cast<double>(1 << (sh - 16)))));

    prdVal.x = 0.0f;
    prdVal.flags = shifted_flags;
  }
  else [[unlikely]]
  {
    // Everything is shifted out.
    prdVal.x = 0.0f;
    prdVal.y = 0.0f;
    prdVal.flags = prtVal.flags | VALID_XY | VALID_TAINTED_Z;
  }
}

void CPU::PGXP::CPU_SLL(Instruction instr, u32 rtVal)
{
  LOG_VALUES_C1(instr.r.rt.GetValue(), rtVal);

  // Shift left logical by an immediate: Rd = Rt << shamt, where shamt is encoded in the instruction.
  const u32 shift_amount = instr.r.shamt;
  CPU_SLL(instr, rtVal, shift_amount);
}

void CPU::PGXP::CPU_SLLV(Instruction instr, u32 rtVal, u32 rsVal)
{
  LOG_VALUES_C2(instr.r.rt.GetValue(), rtVal, instr.r.rs.GetValue(), rsVal);

  // Shift left logical by a register: Rd = Rt << (Rs & 31). Only the low five bits of Rs are used.
  const u32 shift_amount = rsVal & 0x1F;
  CPU_SLL(instr, rtVal, shift_amount);
}

// Common implementation of the right shifts (SRL/SRLV/SRA/SRAV): Rd = Rt >> sh.
//   rtVal       - integer value of Rt.
//   sh          - shift amount.
//   sign        - true for an arithmetic (sign-propagating) shift, false for a logical shift.
//   is_variable - true when the shift amount came from a register rather than the instruction's shamt field.
// Writes the integer result and a matching high-precision x/y estimate to the PGXP entry for Rd.
ALWAYS_INLINE_RELEASE void CPU::PGXP::CPU_SRx(Instruction instr, u32 rtVal, u32 sh, bool sign, bool is_variable)
{
  // Integer result that the emulated CPU sees.
  const u32 rdVal = sign ? static_cast<u32>(static_cast<s32>(rtVal) >> sh) : (rtVal >> sh);
  PGXPValue& prtVal = ValidateAndGetRtValue(instr, rtVal);

  // Start from the tracked halves of Rt. For a logical shift the high half is passed through f16Unsign().
  double x = prtVal.x;
  double y = sign ? prtVal.y : f16Unsign(prtVal.y);

  // Integer "test values" used to decide how each half of the result has to be reconstructed:
  // iX is the low half of Rt on its own, sign-extended to 32 bits, and iY is Rt with its low half
  // replaced by that sign extension (i.e. by 0x0000 or 0xFFFF).
  const u32 iX = SignExtend32(LOWORD_S16(rtVal));   // remove Y
  const u32 iY = SET_LOWORD(rtVal, HIWORD_U16(iX)); // overwrite x with sign(x)

  // Shift test values
  // dX is always shifted arithmetically; dY follows the kind of shift being emulated.
  const u32 dX = static_cast<u32>(static_cast<s32>(iX) >> sh);
  const u32 dY = sign ? static_cast<u32>(static_cast<s32>(iY) >> sh) : (iY >> sh);

  // HIWORD_S16(iX) is the sign fill of the original low half. If the shifted low half is anything other
  // than that fill, part of the original low half survived the shift, so scale x down by 2^sh.
  if (LOWORD_S16(dX) != HIWORD_S16(iX))
    x = x / static_cast<double>(1 << sh);
  else
    x = LOWORD_S16(dX); // only sign bits left

  // iY's low half started out as the sign fill, so a different low half after shifting presumably means
  // bits of the high half have moved down into it; add the high half's contribution to x.
  if (LOWORD_S16(dY) != HIWORD_S16(iX))
  {
    if (sh == 16)
    {
      // The high half lands exactly on the low half.
      x = y;
    }
    else if (sh < 16)
    {
      // The high half is shifted down by sh bits, which is a weight of 2^(16 - sh) in low-half units.
      x += y * static_cast<double>(1 << (16 - sh));
      // Extra correction of one such unit when the tracked low half is negative.
      if (prtVal.x < 0)
        x += static_cast<double>(1 << (16 - sh));
    }
    else
    {
      // Shifted by more than a whole half; only y / 2^(sh - 16) remains.
      x += y / static_cast<double>(1 << (sh - 16));
    }
  }

  // When the shifted high half is all zeros or all ones use that directly, otherwise scale y by 2^sh.
  if ((HIWORD_S16(dY) == 0) || (HIWORD_S16(dY) == -1))
    y = HIWORD_S16(dY);
  else
    y = y / static_cast<double>(1 << sh);

  // Rd is only fetched and written after everything has been read from Rt, as they may be the same register.
  PGXPValue& prdVal = GetRdValue(instr);

  // Use low precision/rounded values when we're not shifting an entire component,
  // and it's not originally from a 3D value. Too many false positives in P2/etc.
  // What we probably should do is not set the valid flag on non-3D values to begin
  // with, only letting them become valid when used in another expression.
  if (sign && !is_variable && !(prtVal.flags & VALID_Z) && sh < 16)
  {
    // Low-precision path: x/y are taken straight from the integer result, and Z is dropped.
    prdVal.x = static_cast<float>(LOWORD_S16(rdVal));
    prdVal.y = static_cast<float>(HIWORD_S16(rdVal));
    prdVal.z = 0.0f;
    prdVal.value = rdVal;
    prdVal.flags = VALID_XY | VALID_TAINTED_Z;
  }
  else
  {
    // High-precision path: keep the reconstructed halves, carry over Rt's Z and flags, and taint Z.
    prdVal.x = static_cast<float>(f16Sign(x));
    prdVal.y = static_cast<float>(f16Sign(y));
    prdVal.z = prtVal.z;
    prdVal.value = rdVal;
    prdVal.flags = prtVal.flags | VALID_TAINTED_Z;
  }
}

void CPU::PGXP::CPU_SRL(Instruction instr, u32 rtVal)
{
  LOG_VALUES_C1(instr.r.rt.GetValue(), rtVal);

  // Shift right logical by an immediate: Rd = Rt >> shamt (zero-filling, fixed shift amount).
  const u32 shift_amount = instr.r.shamt;
  CPU_SRx(instr, rtVal, shift_amount, /*sign=*/false, /*is_variable=*/false);
}

void CPU::PGXP::CPU_SRLV(Instruction instr, u32 rtVal, u32 rsVal)
{
  LOG_VALUES_C2(instr.r.rt.GetValue(), rtVal, instr.r.rs.GetValue(), rsVal);

  // Shift right logical by a register: Rd = Rt >> (Rs & 31) (zero-filling, variable shift amount).
  const u32 shift_amount = rsVal & 0x1F;
  CPU_SRx(instr, rtVal, shift_amount, /*sign=*/false, /*is_variable=*/true);
}

void CPU::PGXP::CPU_SRA(Instruction instr, u32 rtVal)
{
  LOG_VALUES_C1(instr.r.rt.GetValue(), rtVal);

  // Shift right arithmetic by an immediate: Rd = Rt >> shamt (sign-filling, fixed shift amount).
  const u32 shift_amount = instr.r.shamt;
  CPU_SRx(instr, rtVal, shift_amount, /*sign=*/true, /*is_variable=*/false);
}

void CPU::PGXP::CPU_SRAV(Instruction instr, u32 rtVal, u32 rsVal)
{
  LOG_VALUES_C2(instr.r.rt.GetValue(), rtVal, instr.r.rs.GetValue(), rsVal);

  // Shift right arithmetic by a register: Rd = Rt >> (Rs & 31) (sign-filling, variable shift amount).
  const u32 shift_amount = rsVal & 0x1F;
  CPU_SRx(instr, rtVal, shift_amount, /*sign=*/true, /*is_variable=*/true);
}

void CPU::PGXP::CPU_MFC0(Instruction instr, u32 rdVal)
{
  const u32 cop0_index = static_cast<u8>(instr.r.rd.GetValue());
  LOG_VALUES_1(TinyString::from_format("cop0_{}", cop0_index).c_str(), rdVal, &g_state.pgxp_cop0[cop0_index]);

  // Move from coprocessor 0: CPU[Rt] = CP0[Rd].
  // The tracked COP0 entry is first validated against the value that was actually read.
  PGXPValue& cop0_entry = g_state.pgxp_cop0[cop0_index];
  cop0_entry.Validate(rdVal);

  // Copy the whole entry into the destination GPR, then stamp it with the integer value that was moved.
  PGXPValue& gpr_entry = GetRtValue(instr);
  gpr_entry = cop0_entry;
  gpr_entry.value = rdVal;
}

// Move to coprocessor 0: CP0[Rd] = CPU[Rt].
//   rdVal - integer value to record for the COP0 register.
//   rtVal - integer value of the source GPR, used to validate its tracked PGXP entry.
void CPU::PGXP::CPU_MTC0(Instruction instr, u32 rdVal, u32 rtVal)
{
  LOG_VALUES_C1(instr.r.rt.GetValue(), rtVal);

  // CP0[Rd] = CPU[Rt]
  PGXPValue& prtVal = ValidateAndGetRtValue(instr, rtVal);
  PGXPValue& prdVal = g_state.pgxp_cop0[static_cast<u8>(instr.r.rd.GetValue())];
  prdVal = prtVal;

  // Stamp the *destination* with its integer value, mirroring what CPU_MFC0() does for its destination.
  // Previously this wrote rdVal into the source GPR's entry, which left the GPR's tracked value out of
  // step with the register whenever rdVal != rtVal, and left the COP0 entry without rdVal.
  prdVal.value = rdVal;
}