duckstation/src/core/gpu_backend.cpp

// SPDX-FileCopyrightText: 2019-2025 Connor McLaughlin <stenzek@gmail.com>
// SPDX-License-Identifier: CC-BY-NC-ND-4.0

#include "gpu_backend.h"
#include "gpu.h"
#include "gpu_presenter.h"
#include "gpu_sw_rasterizer.h"
#include "gpu_thread.h"
#include "host.h"
#include "performance_counters.h"
#include "save_state_version.h"
#include "settings.h"
#include "system.h"
#include "system_private.h"

#include "util/gpu_device.h"
#include "util/imgui_manager.h"
#include "util/state_wrapper.h"

#include "common/error.h"
#include "common/file_system.h"
#include "common/log.h"
#include "common/path.h"
#include "common/threading.h"

#include "IconsEmoji.h"
#include "IconsFontAwesome5.h"
#include "fmt/format.h"

LOG_CHANNEL(GPU);

namespace {

/// Raw tallies incremented by GPUBackend::HandleCommand() as commands are processed,
/// and zeroed by GPUBackend::ResetStatistics().
struct Counters
{
  u32 num_reads;      // ReadVRAM commands handled.
  u32 num_writes;     // UpdateVRAM commands handled.
  u32 num_copies;     // CopyVRAM commands handled.
  u32 num_vertices;   // Vertices submitted by draw commands.
  u32 num_primitives; // Primitives submitted by draw commands.
};

/// Per-frame averages computed by GPUBackend::UpdateStatistics(). The inherited Counters
/// fields hold averaged backend counters; the host_* fields hold averaged values taken
/// from GPUDevice::Statistics.
struct Stats : Counters
{
  size_t host_buffer_streamed;
  u32 host_num_draws;
  u32 host_num_barriers;
  u32 host_num_render_passes;
  u32 host_num_copies;
  u32 host_num_downloads;
  u32 host_num_uploads;
};

/// State used to throttle frame queueing; see BeginQueueFrame(), WaitForOneQueuedFrame()
/// and ReleaseQueuedFrame().
/// NOTE(review): ALIGN_TO_CACHE_LINE presumably aligns the struct to a cache line - confirm.
struct ALIGN_TO_CACHE_LINE CPUThreadState
{
  // Values stored in wait_state.
  static constexpr u32 WAIT_NONE = 0;                 // Nobody is waiting.
  static constexpr u32 WAIT_CPU_THREAD_WAITING = 1;   // Set by BeginQueueFrame() when over the queue limit.
  static constexpr u32 WAIT_GPU_THREAD_SIGNALING = 2; // Set by ReleaseQueuedFrame() just before posting.
  static constexpr u32 WAIT_GPU_THREAD_POSTED = 3;    // Set by ReleaseQueuedFrame() after posting.

  std::atomic<u32> queued_frames;             // Incremented in BeginQueueFrame(), decremented in ReleaseQueuedFrame().
  std::atomic<u32> wait_state;                // One of the WAIT_* values above.
  Threading::KernelSemaphore gpu_thread_wait; // Waited on in WaitForOneQueuedFrame(), posted in ReleaseQueuedFrame().
};

} // namespace

// Running counters for the current measurement interval (updated in HandleCommand()).
static Counters s_counters = {};
// Last set of per-frame averages published by UpdateStatistics().
static Stats s_stats = {};

// Frame-queue throttling state used by the queue/wait/release functions below.
static CPUThreadState s_cpu_thread_state = {};

// Stores the presenter reference, selects the software rasterizer implementation,
// and starts with zeroed statistics.
GPUBackend::GPUBackend(GPUPresenter& presenter) : m_presenter(presenter)
{
  GPU_SW_Rasterizer::SelectImplementation();
  ResetStatistics();
}

// Clears the presenter's display texture when the backend is destroyed.
GPUBackend::~GPUBackend()
{
  m_presenter.ClearDisplayTexture();
}

// Fills in the vertex input layout and primitive type used for drawing ScreenVertex quads.
void GPUBackend::SetScreenQuadInputLayout(GPUPipeline::GraphicsConfig& config)
{
  using Attribute = GPUPipeline::VertexAttribute;

  // Two float2 attributes: position at ScreenVertex::x, texture coordinate at ScreenVertex::u.
  static constexpr Attribute s_quad_attributes[] = {
    Attribute::Make(0, Attribute::Semantic::Position, 0, Attribute::Type::Float, 2, OFFSETOF(ScreenVertex, x)),
    Attribute::Make(1, Attribute::Semantic::TexCoord, 0, Attribute::Type::Float, 2, OFFSETOF(ScreenVertex, u)),
  };

  // The quad is drawn as a triangle strip.
  config.primitive = GPUPipeline::Primitive::TriangleStrips;
  config.input_layout.vertex_stride = sizeof(ScreenVertex);
  config.input_layout.vertex_attributes = s_quad_attributes;
}

// Converts a pixel-space rectangle within a render target of rt_size into clip-space
// coordinates: X is mapped as (2 * x / width) - 1, Y as 1 - (2 * y / height).
GSVector4 GPUBackend::GetScreenQuadClipSpaceCoordinates(const GSVector4i bounds, const GSVector2i rt_size)
{
  const GSVector2 one = GSVector2::cxpr(1.0f);
  const GSVector2 two = GSVector2::cxpr(2.0f);

  // Regroup the bounds so that both X values come first, followed by both Y values.
  const GSVector4 grouped_bounds = GSVector4(bounds.xzyw());
  const GSVector2 target_size = GSVector2(rt_size);

  const GSVector2 clip_x = ((grouped_bounds.xy() * two) / target_size.xx()) - one;
  const GSVector2 clip_y = one - (two * (grouped_bounds.zw() / target_size.yy()));

  // Interleave back into the same component order as the input bounds.
  return GSVector4::xyxy(clip_x, clip_y).xzyw();
}

void GPUBackend::DrawScreenQuad(const GSVector4i bounds, const GSVector2i rt_size,
                                const GSVector4 uv_bounds /* = GSVector4::cxpr(0.0f, 0.0f, 1.0f, 1.0f) */)
{
  const GSVector4 xy = GetScreenQuadClipSpaceCoordinates(bounds, rt_size);

  ScreenVertex* vertices;
  u32 space;
  u32 base_vertex;
  g_gpu_device->MapVertexBuffer(sizeof(ScreenVertex), 4, reinterpret_cast<void**>(&vertices), &space, &base_vertex);

  vertices[0].Set(xy.xy(), uv_bounds.xy());
  vertices[1].Set(xy.zyzw().xy(), uv_bounds.zyzw().xy());
  vertices[2].Set(xy.xwzw().xy(), uv_bounds.xwzw().xy());
  vertices[3].Set(xy.zw(), uv_bounds.zw());

  g_gpu_device->UnmapVertexBuffer(sizeof(ScreenVertex), 4);
  g_gpu_device->Draw(4, base_vertex);
}

// Base initialization: caches the clamped form of the rasterizer's current drawing area.
// Neither clear_vram nor error is used here, and the function always returns true.
bool GPUBackend::Initialize(bool clear_vram, Error* error)
{
  m_clamped_drawing_area = GPU::GetClampedDrawingArea(GPU_SW_Rasterizer::g_drawing_area);
  return true;
}

// Base settings handler: restarts statistics collection when the "show GPU stats" setting
// changes. Always returns true; error is not written.
bool GPUBackend::UpdateSettings(const GPUSettings& old_settings, Error* error)
{
  const bool stats_display_changed =
    (g_gpu_settings.display_show_gpu_stats != old_settings.display_show_gpu_stats);
  if (stats_display_changed)
  {
    GPUBackend::ResetStatistics();
  }

  return true;
}

// Base implementation does nothing.
void GPUBackend::UpdatePostProcessingSettings(bool force_reload)
{
}

// Allocates a ClearVRAM command from the GPU thread's command allocator.
GPUThreadCommand* GPUBackend::NewClearVRAMCommand()
{
  using Command = GPUThreadCommand;
  auto* const raw = GPUThread::AllocateCommand(GPUBackendCommandType::ClearVRAM, sizeof(Command));
  return static_cast<Command*>(raw);
}

// Allocates a ClearDisplay command from the GPU thread's command allocator.
GPUThreadCommand* GPUBackend::NewClearDisplayCommand()
{
  using Command = GPUThreadCommand;
  auto* const raw = GPUThread::AllocateCommand(GPUBackendCommandType::ClearDisplay, sizeof(Command));
  return static_cast<Command*>(raw);
}

// Allocates an UpdateDisplay command from the GPU thread's command allocator.
GPUBackendUpdateDisplayCommand* GPUBackend::NewUpdateDisplayCommand()
{
  using Command = GPUBackendUpdateDisplayCommand;
  auto* const raw = GPUThread::AllocateCommand(GPUBackendCommandType::UpdateDisplay, sizeof(Command));
  return static_cast<Command*>(raw);
}

// Allocates a SubmitFrame command from the GPU thread's command allocator.
// The allocation is sized for GPUBackendSubmitFrameCommand, the type that is actually
// returned and later read by HandleCommand(), rather than for the unrelated
// GPUBackendUpdateDisplayCommand.
GPUBackendSubmitFrameCommand* GPUBackend::NewSubmitFrameCommand()
{
  return static_cast<GPUBackendSubmitFrameCommand*>(
    GPUThread::AllocateCommand(GPUBackendCommandType::SubmitFrame, sizeof(GPUBackendSubmitFrameCommand)));
}

// Allocates a ClearCache command from the GPU thread's command allocator.
GPUThreadCommand* GPUBackend::NewClearCacheCommand()
{
  using Command = GPUThreadCommand;
  auto* const raw = GPUThread::AllocateCommand(GPUBackendCommandType::ClearCache, sizeof(Command));
  return static_cast<Command*>(raw);
}

// Allocates a BufferSwapped command from the GPU thread's command allocator.
GPUThreadCommand* GPUBackend::NewBufferSwappedCommand()
{
  using Command = GPUThreadCommand;
  auto* const raw = GPUThread::AllocateCommand(GPUBackendCommandType::BufferSwapped, sizeof(Command));
  return static_cast<Command*>(raw);
}

// Allocates a ReadVRAM command from the GPU thread's command allocator.
GPUBackendReadVRAMCommand* GPUBackend::NewReadVRAMCommand()
{
  using Command = GPUBackendReadVRAMCommand;
  auto* const raw = GPUThread::AllocateCommand(GPUBackendCommandType::ReadVRAM, sizeof(Command));
  return static_cast<Command*>(raw);
}

// Allocates a FillVRAM command from the GPU thread's command allocator.
GPUBackendFillVRAMCommand* GPUBackend::NewFillVRAMCommand()
{
  using Command = GPUBackendFillVRAMCommand;
  auto* const raw = GPUThread::AllocateCommand(GPUBackendCommandType::FillVRAM, sizeof(Command));
  return static_cast<Command*>(raw);
}

// Allocates an UpdateVRAM command with room for num_words trailing 16-bit values.
GPUBackendUpdateVRAMCommand* GPUBackend::NewUpdateVRAMCommand(u32 num_words)
{
  using Command = GPUBackendUpdateVRAMCommand;

  // Command header plus the variable-length payload.
  const u32 total_bytes = sizeof(Command) + (num_words * sizeof(u16));
  auto* const raw = GPUThread::AllocateCommand(GPUBackendCommandType::UpdateVRAM, total_bytes);
  return static_cast<Command*>(raw);
}

// Allocates a CopyVRAM command from the GPU thread's command allocator.
GPUBackendCopyVRAMCommand* GPUBackend::NewCopyVRAMCommand()
{
  using Command = GPUBackendCopyVRAMCommand;
  auto* const raw = GPUThread::AllocateCommand(GPUBackendCommandType::CopyVRAM, sizeof(Command));
  return static_cast<Command*>(raw);
}

// Allocates a SetDrawingArea command from the GPU thread's command allocator.
GPUBackendSetDrawingAreaCommand* GPUBackend::NewSetDrawingAreaCommand()
{
  using Command = GPUBackendSetDrawingAreaCommand;
  auto* const raw = GPUThread::AllocateCommand(GPUBackendCommandType::SetDrawingArea, sizeof(Command));
  return static_cast<Command*>(raw);
}

// Allocates an UpdateCLUT command from the GPU thread's command allocator.
GPUBackendUpdateCLUTCommand* GPUBackend::NewUpdateCLUTCommand()
{
  using Command = GPUBackendUpdateCLUTCommand;
  auto* const raw = GPUThread::AllocateCommand(GPUBackendCommandType::UpdateCLUT, sizeof(Command));
  return static_cast<Command*>(raw);
}

// Allocates a DrawPolygon command with room for num_vertices trailing vertices and records
// the (16-bit truncated) vertex count in the command.
GPUBackendDrawPolygonCommand* GPUBackend::NewDrawPolygonCommand(u32 num_vertices)
{
  using Command = GPUBackendDrawPolygonCommand;

  const u32 total_bytes = sizeof(Command) + (num_vertices * sizeof(Command::Vertex));
  Command* const draw_cmd =
    static_cast<Command*>(GPUThread::AllocateCommand(GPUBackendCommandType::DrawPolygon, total_bytes));
  draw_cmd->num_vertices = Truncate16(num_vertices);
  return draw_cmd;
}

// Allocates a DrawPrecisePolygon command with room for num_vertices trailing vertices and
// records the (16-bit truncated) vertex count in the command.
GPUBackendDrawPrecisePolygonCommand* GPUBackend::NewDrawPrecisePolygonCommand(u32 num_vertices)
{
  using Command = GPUBackendDrawPrecisePolygonCommand;

  const u32 total_bytes = sizeof(Command) + (num_vertices * sizeof(Command::Vertex));
  Command* const draw_cmd =
    static_cast<Command*>(GPUThread::AllocateCommand(GPUBackendCommandType::DrawPrecisePolygon, total_bytes));
  draw_cmd->num_vertices = Truncate16(num_vertices);
  return draw_cmd;
}

// Allocates a DrawRectangle command from the GPU thread's command allocator.
GPUBackendDrawRectangleCommand* GPUBackend::NewDrawRectangleCommand()
{
  using Command = GPUBackendDrawRectangleCommand;
  auto* const raw = GPUThread::AllocateCommand(GPUBackendCommandType::DrawRectangle, sizeof(Command));
  return static_cast<Command*>(raw);
}

// Allocates a DrawLine command with room for num_vertices trailing vertices and records
// the (16-bit truncated) vertex count in the command.
GPUBackendDrawLineCommand* GPUBackend::NewDrawLineCommand(u32 num_vertices)
{
  using Command = GPUBackendDrawLineCommand;

  const u32 total_bytes = sizeof(Command) + (num_vertices * sizeof(Command::Vertex));
  Command* const draw_cmd =
    static_cast<Command*>(GPUThread::AllocateCommand(GPUBackendCommandType::DrawLine, total_bytes));
  draw_cmd->num_vertices = Truncate16(num_vertices);
  return draw_cmd;
}

// Allocates a DrawPreciseLine command with room for num_vertices trailing vertices and
// records the (16-bit truncated) vertex count in the command.
GPUBackendDrawPreciseLineCommand* GPUBackend::NewDrawPreciseLineCommand(u32 num_vertices)
{
  using Command = GPUBackendDrawPreciseLineCommand;

  const u32 total_bytes = sizeof(Command) + (num_vertices * sizeof(Command::Vertex));
  Command* const draw_cmd =
    static_cast<Command*>(GPUThread::AllocateCommand(GPUBackendCommandType::DrawPreciseLine, total_bytes));
  draw_cmd->num_vertices = Truncate16(num_vertices);
  return draw_cmd;
}

// Forwards to GPUThread::PushCommand().
void GPUBackend::PushCommand(GPUThreadCommand* cmd)
{
  GPUThread::PushCommand(cmd);
}

// Forwards to GPUThread::PushCommandAndWakeThread().
void GPUBackend::PushCommandAndWakeThread(GPUThreadCommand* cmd)
{
  GPUThread::PushCommandAndWakeThread(cmd);
}

// Forwards to GPUThread::PushCommandAndSync(), passing the spin flag through unchanged.
void GPUBackend::PushCommandAndSync(GPUThreadCommand* cmd, bool spin)
{
  GPUThread::PushCommandAndSync(cmd, spin);
}

// Forwards to GPUThread::SyncGPUThread(), passing the spin flag through unchanged.
void GPUBackend::SyncGPUThread(bool spin)
{
  GPUThread::SyncGPUThread(spin);
}

bool GPUBackend::IsUsingHardwareBackend()
{
  return (GPUThread::GetRequestedRenderer().value_or(GPURenderer::Software) != GPURenderer::Software);
}

// Registers one more queued frame. Returns false when the new count is still within
// g_settings.gpu_max_queued_frames. Otherwise marks the wait state as
// WAIT_CPU_THREAD_WAITING and returns true; the caller is then expected to call
// WaitForOneQueuedFrame().
bool GPUBackend::BeginQueueFrame()
{
  // fetch_add returns the previous value, so add one to get the count including this frame.
  const u32 queued_frames = s_cpu_thread_state.queued_frames.fetch_add(1, std::memory_order_acq_rel) + 1;
  if (queued_frames <= g_settings.gpu_max_queued_frames)
    return false;

  // Only log when queueing is enabled at all; with a limit of zero every frame would log.
  if (g_settings.gpu_max_queued_frames > 0)
    DEV_LOG("<-- {} queued frames, {} max, blocking CPU thread", queued_frames, g_settings.gpu_max_queued_frames);

  s_cpu_thread_state.wait_state.store(CPUThreadState::WAIT_CPU_THREAD_WAITING, std::memory_order_release);
  return true;
}

// Blocks until the queued frame count is back within the limit, unless that has already
// happened and the GPU thread has not started signaling. Always leaves wait_state at
// WAIT_NONE on return.
void GPUBackend::WaitForOneQueuedFrame()
{
  // In between BeginQueueFrame() and this call, the GPU thread may have finished a frame. Check.
  if (s_cpu_thread_state.queued_frames.load(std::memory_order_acquire) <= g_settings.gpu_max_queued_frames)
  {
    // It's possible that the GPU thread has already signaled the semaphore.
    // If so, then we still need to drain it, otherwise waits in the future will return prematurely.
    // The compare-exchange only succeeds (and lets us skip the wait) if the GPU thread has not
    // yet moved the state on from WAIT_CPU_THREAD_WAITING.
    u32 expected = CPUThreadState::WAIT_CPU_THREAD_WAITING;
    if (s_cpu_thread_state.wait_state.compare_exchange_strong(expected, CPUThreadState::WAIT_NONE,
                                                              std::memory_order_acq_rel, std::memory_order_acquire))
    {
      return;
    }
  }

  s_cpu_thread_state.gpu_thread_wait.Wait();

  // Depending on where the GPU thread is, now we can either be in WAIT_GPU_THREAD_SIGNALING or WAIT_GPU_THREAD_POSTED
  // state. We want to clear the flag here regardless, so a store-release is fine. Because the GPU thread has a
  // compare-exchange on WAIT_GPU_THREAD_SIGNALING, it can't "overwrite" the value we store here.
  s_cpu_thread_state.wait_state.store(CPUThreadState::WAIT_NONE, std::memory_order_release);

  // Sanity check: queued frames should be in range now. If they're not, the semaphore handshake is broken.
  if (const u32 queued_frames = s_cpu_thread_state.queued_frames.load(std::memory_order_acquire);
      queued_frames > g_settings.gpu_max_queued_frames) [[unlikely]]
  {
    ERROR_LOG("queued_frames {} above max queued frames {} after CPU wait", queued_frames,
              g_settings.gpu_max_queued_frames);
  }
}

// Returns the current number of queued frames (acquire load of the shared counter).
u32 GPUBackend::GetQueuedFrameCount()
{
  return s_cpu_thread_state.queued_frames.load(std::memory_order_acquire);
}

// Removes one frame from the queued count and, if the CPU thread has flagged itself as
// waiting, posts the semaphore exactly once to wake it.
void GPUBackend::ReleaseQueuedFrame()
{
  s_cpu_thread_state.queued_frames.fetch_sub(1, std::memory_order_acq_rel);

  // We need two states here in case we get preempted in between the compare_exchange_strong() and Post().
  // This means that we will only release the semaphore once the CPU is guaranteed to be in a waiting state,
  // and ensure that we don't post twice if the CPU thread lags and we process 2 frames before it wakes up.
  u32 expected = CPUThreadState::WAIT_CPU_THREAD_WAITING;
  if (s_cpu_thread_state.wait_state.compare_exchange_strong(expected, CPUThreadState::WAIT_GPU_THREAD_SIGNALING,
                                                            std::memory_order_acq_rel, std::memory_order_acquire))
  {
    if (g_settings.gpu_max_queued_frames > 0)
      DEV_LOG("--> Unblocking CPU thread");

    s_cpu_thread_state.gpu_thread_wait.Post();

    // This needs to be a compare_exchange, because the CPU thread can clear the flag before we execute this line.
    // If it already did, the exchange fails and the WAIT_NONE it stored is left untouched.
    expected = CPUThreadState::WAIT_GPU_THREAD_SIGNALING;
    s_cpu_thread_state.wait_state.compare_exchange_strong(expected, CPUThreadState::WAIT_GPU_THREAD_POSTED,
                                                          std::memory_order_acq_rel, std::memory_order_acquire);
  }
}

// (Re)allocates backend resources for every memory save state in `states`, running on the
// backend via GPUThread::RunOnBackend().
//
// Existing VRAM textures are recycled first. Each state is then allocated through the
// backend's AllocateMemorySaveState(); on failure the texture pool is purged and the
// allocation is retried once. If the retry also fails, every state from index 0 up to and
// including the failing one is released and false is returned (with `error` as filled in by
// the backend). Returns true when all states were allocated.
bool GPUBackend::AllocateMemorySaveStates(std::span<System::MemorySaveState> states, Error* error)
{
  // Default to failure so the return value is well-defined on every path.
  bool result = false;
  GPUThread::RunOnBackend(
    [states, error, &result](GPUBackend* backend) {
      // Free old textures first.
      for (System::MemorySaveState& state : states)
        g_gpu_device->RecycleTexture(std::move(state.vram_texture));

      // Maximize potential for texture reuse by flushing the current command buffer.
      g_gpu_device->WaitForGPUIdle();

      for (size_t i = 0; i < states.size(); i++)
      {
        if (backend->AllocateMemorySaveState(states[i], error))
          continue;

        // Try flushing the pool.
        WARNING_LOG("Failed to allocate memory save state texture, trying flushing pool.");
        g_gpu_device->PurgeTexturePool();
        g_gpu_device->WaitForGPUIdle();
        if (backend->AllocateMemorySaveState(states[i], error))
          continue;

        // Free anything that was allocated, including whatever the failing state i may have
        // partially set up.
        for (size_t j = 0; j <= i; j++)
        {
          states[j].state_data.deallocate();
          states[j].vram_texture.reset();
        }

        result = false;
        return;
      }

      backend->RestoreDeviceContext();
      result = true;
    },
    true, false);
  return result;
}

void GPUBackend::QueueUpdateResolutionScale()
{
  DebugAssert(!GPUThread::IsOnThread());

  GPUThread::RunOnBackend(
    [](GPUBackend* backend) {
      Error error;
      if (!backend->UpdateResolutionScale(&error)) [[unlikely]]
        GPUThread::ReportFatalErrorAndShutdown(
          fmt::format("Failed to update resolution scale: {}", error.GetDescription()));
    },
    false, true);
}

// Dispatches a single command to the matching backend handler, based on cmd->type.
// VRAM transfer and draw commands also bump the statistics counters before being executed.
void GPUBackend::HandleCommand(const GPUThreadCommand* cmd)
{
  switch (cmd->type)
  {
    case GPUBackendCommandType::ClearVRAM:
    {
      ClearVRAM();
    }
    break;

    case GPUBackendCommandType::LoadState:
    {
      LoadState(static_cast<const GPUBackendLoadStateCommand*>(cmd));
    }
    break;

    case GPUBackendCommandType::LoadMemoryState:
    {
      // Read back only the portion of the buffer that was written when the state was saved.
      System::MemorySaveState& mss = *static_cast<const GPUBackendDoMemoryStateCommand*>(cmd)->memory_save_state;
      StateWrapper sw(mss.gpu_state_data.span(0, mss.gpu_state_size), StateWrapper::Mode::Read, SAVE_STATE_VERSION);
      DoMemoryState(sw, mss);
    }
    break;

    case GPUBackendCommandType::SaveMemoryState:
    {
      // Write into the whole buffer, then remember how much of it was used.
      System::MemorySaveState& mss = *static_cast<const GPUBackendDoMemoryStateCommand*>(cmd)->memory_save_state;
      StateWrapper sw(mss.gpu_state_data.span(), StateWrapper::Mode::Write, SAVE_STATE_VERSION);
      DoMemoryState(sw, mss);
      mss.gpu_state_size = static_cast<u32>(sw.GetPosition());
    }
    break;

    case GPUBackendCommandType::ClearDisplay:
    {
      m_presenter.ClearDisplay();
    }
    break;

    case GPUBackendCommandType::UpdateDisplay:
    {
      HandleUpdateDisplayCommand(static_cast<const GPUBackendUpdateDisplayCommand*>(cmd));
    }
    break;

    case GPUBackendCommandType::SubmitFrame:
    {
      HandleSubmitFrameCommand(&static_cast<const GPUBackendSubmitFrameCommand*>(cmd)->frame);
    }
    break;

    case GPUBackendCommandType::ClearCache:
    {
      ClearCache();
    }
    break;

    case GPUBackendCommandType::BufferSwapped:
    {
      OnBufferSwapped();
    }
    break;

    case GPUBackendCommandType::ReadVRAM:
    {
      const GPUBackendReadVRAMCommand* ccmd = static_cast<const GPUBackendReadVRAMCommand*>(cmd);
      s_counters.num_reads++;
      ReadVRAM(ZeroExtend32(ccmd->x), ZeroExtend32(ccmd->y), ZeroExtend32(ccmd->width), ZeroExtend32(ccmd->height));
    }
    break;

    case GPUBackendCommandType::FillVRAM:
    {
      const GPUBackendFillVRAMCommand* ccmd = static_cast<const GPUBackendFillVRAMCommand*>(cmd);
      FillVRAM(ZeroExtend32(ccmd->x), ZeroExtend32(ccmd->y), ZeroExtend32(ccmd->width), ZeroExtend32(ccmd->height),
               ccmd->color, ccmd->interlaced_rendering, ccmd->active_line_lsb);
    }
    break;

    case GPUBackendCommandType::UpdateVRAM:
    {
      const GPUBackendUpdateVRAMCommand* ccmd = static_cast<const GPUBackendUpdateVRAMCommand*>(cmd);
      s_counters.num_writes++;
      UpdateVRAM(ZeroExtend32(ccmd->x), ZeroExtend32(ccmd->y), ZeroExtend32(ccmd->width), ZeroExtend32(ccmd->height),
                 ccmd->data, ccmd->set_mask_while_drawing, ccmd->check_mask_before_draw);
    }
    break;

    case GPUBackendCommandType::CopyVRAM:
    {
      const GPUBackendCopyVRAMCommand* ccmd = static_cast<const GPUBackendCopyVRAMCommand*>(cmd);
      s_counters.num_copies++;
      CopyVRAM(ZeroExtend32(ccmd->src_x), ZeroExtend32(ccmd->src_y), ZeroExtend32(ccmd->dst_x),
               ZeroExtend32(ccmd->dst_y), ZeroExtend32(ccmd->width), ZeroExtend32(ccmd->height),
               ccmd->set_mask_while_drawing, ccmd->check_mask_before_draw);
    }
    break;

    case GPUBackendCommandType::SetDrawingArea:
    {
      // Keep the rasterizer's global drawing area and our clamped copy in sync.
      const GPUBackendSetDrawingAreaCommand* ccmd = static_cast<const GPUBackendSetDrawingAreaCommand*>(cmd);
      GPU_SW_Rasterizer::g_drawing_area = ccmd->new_area;
      m_clamped_drawing_area = GPU::GetClampedDrawingArea(ccmd->new_area);
      DrawingAreaChanged();
    }
    break;

    case GPUBackendCommandType::UpdateCLUT:
    {
      const GPUBackendUpdateCLUTCommand* ccmd = static_cast<const GPUBackendUpdateCLUTCommand*>(cmd);
      GPU_SW_Rasterizer::UpdateCLUT(ccmd->reg, ccmd->clut_is_8bit);
    }
    break;

    case GPUBackendCommandType::DrawPolygon:
    {
      const GPUBackendDrawPolygonCommand* ccmd = static_cast<const GPUBackendDrawPolygonCommand*>(cmd);
      s_counters.num_vertices += ccmd->num_vertices;
      s_counters.num_primitives++;
      DrawPolygon(ccmd);
    }
    break;

    case GPUBackendCommandType::DrawPrecisePolygon:
    {
      // Cast to the command's real type; reading num_vertices through the non-precise polygon
      // type would depend on both structs sharing the same layout for that field.
      const GPUBackendDrawPrecisePolygonCommand* ccmd = static_cast<const GPUBackendDrawPrecisePolygonCommand*>(cmd);
      s_counters.num_vertices += ccmd->num_vertices;
      s_counters.num_primitives++;
      DrawPrecisePolygon(ccmd);
    }
    break;

    case GPUBackendCommandType::DrawRectangle:
    {
      const GPUBackendDrawRectangleCommand* ccmd = static_cast<const GPUBackendDrawRectangleCommand*>(cmd);
      s_counters.num_vertices++;
      s_counters.num_primitives++;
      DrawSprite(ccmd);
    }
    break;

    case GPUBackendCommandType::DrawLine:
    {
      // Lines are counted as one primitive per pair of vertices.
      const GPUBackendDrawLineCommand* ccmd = static_cast<const GPUBackendDrawLineCommand*>(cmd);
      s_counters.num_vertices += ccmd->num_vertices;
      s_counters.num_primitives += ccmd->num_vertices / 2;
      DrawLine(ccmd);
    }
    break;

    case GPUBackendCommandType::DrawPreciseLine:
    {
      const GPUBackendDrawPreciseLineCommand* ccmd = static_cast<const GPUBackendDrawPreciseLineCommand*>(cmd);
      s_counters.num_vertices += ccmd->num_vertices;
      s_counters.num_primitives += ccmd->num_vertices / 2;
      DrawPreciseLine(ccmd);
    }
    break;

      // NOTE(review): project macro; presumably expands to an unreachable default case - confirm.
      DefaultCaseIsUnreachable();
  }
}

// Pushes the new display parameters to the presenter, lets the backend update its display,
// and optionally submits the frame carried by the same command.
void GPUBackend::HandleUpdateDisplayCommand(const GPUBackendUpdateDisplayCommand* cmd)
{
  // Height has to be doubled because we halved it on the GPU side.
  const auto vram_height = cmd->display_vram_height << BoolToUInt32(cmd->interlaced_display_enabled);

  m_presenter.SetDisplayParameters(cmd->display_width, cmd->display_height, cmd->display_origin_left,
                                   cmd->display_origin_top, cmd->display_vram_width, vram_height,
                                   cmd->display_pixel_aspect_ratio);

  UpdateDisplay(cmd);

  if (!cmd->submit_frame)
    return;

  HandleSubmitFrameCommand(&cmd->frame);
}

// Finishes a frame on the GPU thread: notifies the host, feeds media capture if requested,
// presents the frame if requested (releasing one queued frame), optionally updates the
// performance counters, and finally restores the device context.
void GPUBackend::HandleSubmitFrameCommand(const GPUBackendFramePresentationParameters* cmd)
{
  // For regtest.
  Host::FrameDoneOnGPUThread(this, cmd->frame_number);

  if (cmd->media_capture)
    m_presenter.SendDisplayToMediaCapture(cmd->media_capture);

  // If this returns false, our backend object is deleted and replaced with null, so bail out.
  if (cmd->present_frame)
  {
    const bool result = m_presenter.PresentFrame(&m_presenter, this, cmd->allow_present_skip, cmd->present_time);
    // The queued frame is released whether or not presentation succeeded.
    ReleaseQueuedFrame();
    if (!result)
      return;
  }

  // Update perf counters *after* throttling, we want to measure from start-of-frame
  // to start-of-frame, not end-of-frame to end-of-frame (will be noisy due to different
  // amounts of computation happening in each frame).
  if (cmd->update_performance_counters)
    PerformanceCounters::Update(this, cmd->frame_number, cmd->internal_frame_number);

  RestoreDeviceContext();
}

// Formats a one-line summary of the most recent per-frame statistics into str. The layout
// differs between hardware and software renderers.
void GPUBackend::GetStatsString(SmallStringBase& str) const
{
  const auto api_name = GPUDevice::RenderAPIToString(g_gpu_device->GetRenderAPI());
  const char* const thread_suffix = g_gpu_settings.gpu_use_thread ? "-MT" : "";

  if (!IsUsingHardwareBackend())
  {
    // Software: primitives, VRAM reads, copies and writes.
    str.format("{}{} SW | {} P | {} R | {} C | {} W", api_name, thread_suffix, s_stats.num_primitives,
               s_stats.num_reads, s_stats.num_copies, s_stats.num_writes);
    return;
  }

  // Hardware: also reports host draw calls, barriers, render passes and downloads.
  str.format("{}{} HW | {} P | {} DC | {} B | {} RP | {} RB | {} C | {} W", api_name, thread_suffix,
             s_stats.num_primitives, s_stats.host_num_draws, s_stats.host_num_barriers,
             s_stats.host_num_render_passes, s_stats.host_num_downloads, s_stats.num_copies, s_stats.num_writes);
}

// Formats device VRAM usage (rounded up to MB), streamed buffer size (rounded up to KB), and
// the host texture copy/upload counts into str.
void GPUBackend::GetMemoryStatsString(SmallStringBase& str) const
{
  constexpr int BYTES_PER_MB = 1048576;
  constexpr int BYTES_PER_KB = 1024;

  // Adding (unit - 1) before dividing rounds up to the next whole unit.
  const auto vram_bytes = g_gpu_device->GetVRAMUsage();
  const u32 vram_mb = static_cast<u32>((vram_bytes + (BYTES_PER_MB - 1)) / BYTES_PER_MB);
  const u32 streamed_kb = static_cast<u32>((s_stats.host_buffer_streamed + (BYTES_PER_KB - 1)) / BYTES_PER_KB);

  str.format("{} MB VRAM | {} KB STR | {} TC | {} TU", vram_mb, streamed_kb, s_stats.host_num_copies,
             s_stats.host_num_uploads);
}

// Zeroes the backend's running counters and resets the GPU device's own statistics.
// The published averages in s_stats are left untouched.
void GPUBackend::ResetStatistics()
{
  s_counters = {};
  g_gpu_device->ResetStatistics();
}

// Publishes per-frame averages of the running counters and device statistics over
// frame_count frames into s_stats, then resets the running values.
void GPUBackend::UpdateStatistics(u32 frame_count)
{
  const GPUDevice::Statistics& device_stats = g_gpu_device->GetStatistics();

  // Adding (frame_count - 1) before dividing rounds each average up.
  const u32 bias = (frame_count - 1);
  const auto per_frame = [frame_count, bias](const auto total) { return (total + bias) / frame_count; };

  // Backend command counters.
  s_stats.num_reads = per_frame(s_counters.num_reads);
  s_stats.num_writes = per_frame(s_counters.num_writes);
  s_stats.num_copies = per_frame(s_counters.num_copies);
  s_stats.num_vertices = per_frame(s_counters.num_vertices);
  s_stats.num_primitives = per_frame(s_counters.num_primitives);

  // Not currently tracked:
  //   num_read_texture_updates
  //   num_ubo_updates

  // Host device statistics.
  s_stats.host_buffer_streamed = per_frame(device_stats.buffer_streamed);
  s_stats.host_num_draws = per_frame(device_stats.num_draws);
  s_stats.host_num_barriers = per_frame(device_stats.num_barriers);
  s_stats.host_num_render_passes = per_frame(device_stats.num_render_passes);
  s_stats.host_num_copies = per_frame(device_stats.num_copies);
  s_stats.host_num_downloads = per_frame(device_stats.num_downloads);
  s_stats.host_num_uploads = per_frame(device_stats.num_uploads);

  ResetStatistics();
}

// Renders a screenshot into out_image on the backend via GPUThread::RunOnBackend().
// Post-processing is only applied when a main swap chain exists, in which case the image is
// sized to the swap chain; otherwise the size comes from the presenter's display rectangle
// for the requested width/height. Returns false (with error set) when there is no backend
// or when the presenter fails to render.
bool GPUBackend::RenderScreenshotToBuffer(u32 width, u32 height, bool postfx, bool apply_aspect_ratio, Image* out_image,
                                          Error* error)
{
  bool success;
  GPUThread::RunOnBackend(
    [width, height, postfx, apply_aspect_ratio, out_image, error, &success](GPUBackend* backend) {
      if (!backend)
      {
        Error::SetStringView(error, "No GPU backend.");
        success = false;
        return;
      }

      // Post-processing requires that the size match the window.
      const bool use_postfx = postfx && g_gpu_device->HasMainSwapChain();

      u32 target_width, target_height;
      if (!use_postfx)
      {
        // Crop it if border overlay isn't enabled.
        GSVector4i draw_rect, display_rect;
        backend->GetPresenter().CalculateDrawRect(static_cast<s32>(width), static_cast<s32>(height), apply_aspect_ratio,
                                                  false, &display_rect, &draw_rect);
        target_width = static_cast<u32>(display_rect.width());
        target_height = static_cast<u32>(display_rect.height());
      }
      else
      {
        target_width = g_gpu_device->GetMainSwapChain()->GetWidth();
        target_height = g_gpu_device->GetMainSwapChain()->GetHeight();
      }

      success = backend->GetPresenter().RenderScreenshotToBuffer(target_width, target_height, use_postfx,
                                                                 apply_aspect_ratio, out_image, error);
      backend->RestoreDeviceContext();
    },
    true, false);

  return success;
}

// Renders a screenshot on the backend and saves it to `path` from an asynchronous task.
//
// path:             destination file; copied into the queued closure.
// mode:             selects the screenshot size and whether post-processing and aspect ratio
//                   correction are applied.
// quality:          passed through to Image::SaveToFile().
// show_osd_message: when true, progress and result messages are posted to the OSD under a
//                   key derived from the path, so later messages replace earlier ones.
//
// Does nothing if there is no backend or the computed screenshot size is empty.
void GPUBackend::RenderScreenshotToFile(const std::string_view path, DisplayScreenshotMode mode, u8 quality,
                                        bool show_osd_message)
{
  GPUThread::RunOnBackend(
    [path = std::string(path), mode, quality, show_osd_message](GPUBackend* backend) mutable {
      if (!backend)
        return;

      const GSVector2i size = backend->GetPresenter().CalculateScreenshotSize(mode);
      if (size.x == 0 || size.y == 0)
        return;

      // An empty key later signals "no OSD messages" to the async task.
      std::string osd_key;
      if (show_osd_message)
        osd_key = fmt::format("ScreenshotSaver_{}", path);

      const bool internal_resolution = (mode != DisplayScreenshotMode::ScreenResolution);
      const bool apply_aspect_ratio = (mode != DisplayScreenshotMode::UncorrectedInternalResolution);
      Error error;
      Image image;
      if (!backend->m_presenter.RenderScreenshotToBuffer(size.x, size.y, !internal_resolution, apply_aspect_ratio,
                                                         &image, &error))
      {
        ERROR_LOG("Failed to render {}x{} screenshot: {}", size.x, size.y, error.GetDescription());
        if (show_osd_message)
        {
          Host::AddIconOSDWarning(
            std::move(osd_key), ICON_EMOJI_WARNING,
            fmt::format(TRANSLATE_FS("GPU", "Failed to save screenshot:\n{}"), error.GetDescription()));
        }

        backend->RestoreDeviceContext();
        return;
      }

      // no more GPU calls
      backend->RestoreDeviceContext();

      // Open the file up front so that failures are reported before queueing the save.
      auto fp = FileSystem::OpenManagedCFile(path.c_str(), "wb", &error);
      if (!fp)
      {
        ERROR_LOG("Can't open file '{}': {}", Path::GetFileName(path), error.GetDescription());
        if (show_osd_message)
        {
          Host::AddIconOSDWarning(
            std::move(osd_key), ICON_EMOJI_WARNING,
            fmt::format(TRANSLATE_FS("GPU", "Failed to save screenshot:\n{}"), error.GetDescription()));
        }

        return;
      }

      if (show_osd_message)
      {
        // Use a 60 second timeout to give it plenty of time to actually save.
        Host::AddIconOSDMessage(osd_key, ICON_EMOJI_CAMERA_WITH_FLASH,
                                fmt::format(TRANSLATE_FS("GPU", "Saving screenshot to '{}'."), Path::GetFileName(path)),
                                60.0f);
      }

      // Ownership of the file handle passes to the task, which closes it when done.
      System::QueueAsyncTask([path = std::move(path), fp = fp.release(), quality,
                              flip_y = g_gpu_device->UsesLowerLeftOrigin(), image = std::move(image),
                              osd_key = std::move(osd_key)]() mutable {
        Error error;

        if (flip_y)
          image.FlipY();

        if (image.GetFormat() != ImageFormat::RGBA8)
        {
          std::optional<Image> convert_image = image.ConvertToRGBA8(&error);
          if (!convert_image.has_value())
          {
            ERROR_LOG("Failed to convert {} screenshot to RGBA8: {}", Image::GetFormatName(image.GetFormat()),
                      error.GetDescription());
            image.Invalidate();
          }
          else
          {
            image = std::move(convert_image.value());
          }
        }

        bool result = false;
        if (image.IsValid())
        {
          image.SetAllPixelsOpaque();

          result = image.SaveToFile(path.c_str(), fp, quality, &error);
          if (!result)
            ERROR_LOG("Failed to save screenshot to '{}': '{}'", Path::GetFileName(path), error.GetDescription());
        }

        if (!osd_key.empty())
        {
          // The duration belongs to AddIconOSDMessage(), not to fmt::format(): as a format
          // argument it was silently ignored, so the result message never got the intended
          // info/error duration.
          Host::AddIconOSDMessage(std::move(osd_key), ICON_EMOJI_CAMERA,
                                  fmt::format(result ? TRANSLATE_FS("GPU", "Saved screenshot to '{}'.") :
                                                       TRANSLATE_FS("GPU", "Failed to save screenshot to '{}'."),
                                              Path::GetFileName(path)),
                                  result ? Host::OSD_INFO_DURATION : Host::OSD_ERROR_DURATION);
        }

        std::fclose(fp);
        return result;
      });
    },
    false, false);
}

namespace {

/// Backend whose rendering overrides do nothing: every VRAM, draw, display and state
/// override defined below has an empty body. Initialize() and UpdateSettings() defer to the
/// GPUBackend base class. Instances are created through GPUBackend::CreateNullBackend().
class GPUNullBackend final : public GPUBackend
{
public:
  GPUNullBackend(GPUPresenter& presenter);
  ~GPUNullBackend() override;

  bool Initialize(bool upload_vram, Error* error) override;
  bool UpdateSettings(const GPUSettings& old_settings, Error* error) override;

  // Resolution scale is fixed at 1; updating it always succeeds.
  u32 GetResolutionScale() const override;
  bool UpdateResolutionScale(Error* error) override;

  void RestoreDeviceContext() override;
  void FlushRender() override;

  // VRAM transfers: all no-ops.
  void ReadVRAM(u32 x, u32 y, u32 width, u32 height) override;
  void FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color, bool interlaced_rendering,
                u8 interlaced_display_field) override;
  void UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* data, bool set_mask, bool check_mask) override;
  void CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height, bool set_mask,
                bool check_mask) override;

  // Draw commands: all no-ops.
  void DrawPolygon(const GPUBackendDrawPolygonCommand* cmd) override;
  void DrawPrecisePolygon(const GPUBackendDrawPrecisePolygonCommand* cmd) override;
  void DrawSprite(const GPUBackendDrawRectangleCommand* cmd) override;
  void DrawLine(const GPUBackendDrawLineCommand* cmd) override;
  void DrawPreciseLine(const GPUBackendDrawPreciseLineCommand* cmd) override;

  void DrawingAreaChanged() override;
  void ClearCache() override;
  void OnBufferSwapped() override;
  void ClearVRAM() override;

  void UpdateDisplay(const GPUBackendUpdateDisplayCommand* cmd) override;

  void LoadState(const GPUBackendLoadStateCommand* cmd) override;

  // Memory save states are not supported: allocation always returns false.
  bool AllocateMemorySaveState(System::MemorySaveState& mss, Error* error) override;
  void DoMemoryState(StateWrapper& sw, System::MemorySaveState& mss) override;
};

} // namespace

// Forwards the presenter to the GPUBackend base constructor; no additional setup.
GPUNullBackend::GPUNullBackend(GPUPresenter& presenter) : GPUBackend(presenter)
{
}

// Nothing to release beyond what the base class destructor handles.
GPUNullBackend::~GPUNullBackend() = default;

// Defers entirely to GPUBackend::Initialize().
bool GPUNullBackend::Initialize(bool upload_vram, Error* error)
{
  return GPUBackend::Initialize(upload_vram, error);
}

// Defers entirely to GPUBackend::UpdateSettings().
bool GPUNullBackend::UpdateSettings(const GPUSettings& old_settings, Error* error)
{
  return GPUBackend::UpdateSettings(old_settings, error);
}

// The null backend always reports a resolution scale of 1.
u32 GPUNullBackend::GetResolutionScale() const
{
  return 1;
}

// Nothing to update; always reports success without touching error.
bool GPUNullBackend::UpdateResolutionScale(Error* error)
{
  return true;
}

// No-op: the null backend restores no device state.
void GPUNullBackend::RestoreDeviceContext()
{
}

// No-op: the null backend has nothing to flush.
void GPUNullBackend::FlushRender()
{
}

// No-op: VRAM read requests are ignored.
void GPUNullBackend::ReadVRAM(u32 x, u32 y, u32 width, u32 height)
{
}

// No-op: VRAM fill requests are ignored.
void GPUNullBackend::FillVRAM(u32 x, u32 y, u32 width, u32 height, u32 color, bool interlaced_rendering,
                              u8 interlaced_display_field)
{
}

// No-op: VRAM write requests are ignored.
void GPUNullBackend::UpdateVRAM(u32 x, u32 y, u32 width, u32 height, const void* data, bool set_mask, bool check_mask)
{
}

// No-op: VRAM copy requests are ignored.
void GPUNullBackend::CopyVRAM(u32 src_x, u32 src_y, u32 dst_x, u32 dst_y, u32 width, u32 height, bool set_mask,
                              bool check_mask)
{
}

// No-op: polygon draws are ignored.
void GPUNullBackend::DrawPolygon(const GPUBackendDrawPolygonCommand* cmd)
{
}

// No-op: precise polygon draws are ignored.
void GPUNullBackend::DrawPrecisePolygon(const GPUBackendDrawPrecisePolygonCommand* cmd)
{
}

// No-op: rectangle (sprite) draws are ignored.
void GPUNullBackend::DrawSprite(const GPUBackendDrawRectangleCommand* cmd)
{
}

// No-op: line draws are ignored.
void GPUNullBackend::DrawLine(const GPUBackendDrawLineCommand* cmd)
{
}

// No-op: precise line draws are ignored.
void GPUNullBackend::DrawPreciseLine(const GPUBackendDrawPreciseLineCommand* cmd)
{
}

// No-op: the null backend keeps no state that depends on the drawing area.
void GPUNullBackend::DrawingAreaChanged()
{
}

// No-op: there is no cache to clear.
void GPUNullBackend::ClearCache()
{
}

// No-op: buffer swap notifications are ignored.
void GPUNullBackend::OnBufferSwapped()
{
}

// No-op: VRAM clear requests are ignored.
void GPUNullBackend::ClearVRAM()
{
}

// No-op: the null backend performs no display update of its own.
void GPUNullBackend::UpdateDisplay(const GPUBackendUpdateDisplayCommand* cmd)
{
}

// No-op: state load commands are ignored.
void GPUNullBackend::LoadState(const GPUBackendLoadStateCommand* cmd)
{
}

// Always returns false without allocating anything; error is left unset.
bool GPUNullBackend::AllocateMemorySaveState(System::MemorySaveState& mss, Error* error)
{
  return false;
}

// No-op: nothing is read from or written to the state wrapper.
void GPUNullBackend::DoMemoryState(StateWrapper& sw, System::MemorySaveState& mss)
{
}

// Factory for the no-op backend; returns a newly created GPUNullBackend bound to presenter.
std::unique_ptr<GPUBackend> GPUBackend::CreateNullBackend(GPUPresenter& presenter)
{
  return std::make_unique<GPUNullBackend>(presenter);
}