From e647192437e5d69234721ce5a30de5c51ae41906 Mon Sep 17 00:00:00 2001 From: Stenzek Date: Wed, 20 Nov 2024 20:44:37 +1000 Subject: [PATCH] GPUDevice: Add compute shader support --- src/util/d3d11_device.cpp | 84 +++++++++---- src/util/d3d11_device.h | 6 + src/util/d3d11_pipeline.cpp | 170 ++++++++++++++++++------- src/util/d3d11_pipeline.h | 12 +- src/util/d3d12_builders.h | 2 + src/util/d3d12_device.cpp | 128 ++++++++++++++++++- src/util/d3d12_device.h | 5 + src/util/d3d12_pipeline.cpp | 55 ++++++++ src/util/d3d12_pipeline.h | 1 + src/util/gpu_device.cpp | 68 ++++++++-- src/util/gpu_device.h | 34 +++-- src/util/metal_device.h | 32 +++-- src/util/metal_device.mm | 236 +++++++++++++++++++++++++---------- src/util/opengl_device.cpp | 13 ++ src/util/opengl_device.h | 3 + src/util/vulkan_builders.cpp | 5 +- src/util/vulkan_builders.h | 2 +- src/util/vulkan_device.cpp | 113 ++++++++++++++--- src/util/vulkan_device.h | 6 +- src/util/vulkan_pipeline.cpp | 13 ++ 20 files changed, 791 insertions(+), 197 deletions(-) diff --git a/src/util/d3d11_device.cpp b/src/util/d3d11_device.cpp index a8bcfc6a8..c6d44392c 100644 --- a/src/util/d3d11_device.cpp +++ b/src/util/d3d11_device.cpp @@ -185,6 +185,8 @@ void D3D11Device::SetFeatures(FeatureMask disabled_features) m_features.texture_buffers_emulated_with_ssbo = false; m_features.feedback_loops = false; m_features.geometry_shaders = !(disabled_features & FEATURE_MASK_GEOMETRY_SHADERS); + m_features.compute_shaders = + (!(disabled_features & FEATURE_MASK_COMPUTE_SHADERS) && feature_level >= D3D_FEATURE_LEVEL_11_0); m_features.partial_msaa_resolve = false; m_features.memory_import = false; m_features.explicit_present = false; @@ -896,19 +898,7 @@ void D3D11Device::PushUniformBuffer(const void* data, u32 data_size) m_uniform_buffer.Unmap(m_context.Get(), req_size); s_stats.buffer_streamed += data_size; - if (m_uniform_buffer.IsUsingMapNoOverwrite()) - { - const UINT first_constant = (res.index_aligned * UNIFORM_BUFFER_ALIGNMENT) / 16u; - const UINT num_constants = req_size / 16u; - m_context->VSSetConstantBuffers1(0, 1, m_uniform_buffer.GetD3DBufferArray(), &first_constant, &num_constants); - m_context->PSSetConstantBuffers1(0, 1, m_uniform_buffer.GetD3DBufferArray(), &first_constant, &num_constants); - } - else - { - DebugAssert(res.index_aligned == 0); - m_context->VSSetConstantBuffers(0, 1, m_uniform_buffer.GetD3DBufferArray()); - m_context->PSSetConstantBuffers(0, 1, m_uniform_buffer.GetD3DBufferArray()); - } + BindUniformBuffer(res.index_aligned * UNIFORM_BUFFER_ALIGNMENT, req_size); } void* D3D11Device::MapUniformBuffer(u32 size) @@ -930,18 +920,37 @@ void D3D11Device::UnmapUniformBuffer(u32 size) m_uniform_buffer.Unmap(m_context.Get(), req_size); s_stats.buffer_streamed += size; + BindUniformBuffer(pos, req_size); +} + +void D3D11Device::BindUniformBuffer(u32 offset, u32 size) +{ if (m_uniform_buffer.IsUsingMapNoOverwrite()) { - const UINT first_constant = pos / 16u; - const UINT num_constants = req_size / 16u; - m_context->VSSetConstantBuffers1(0, 1, m_uniform_buffer.GetD3DBufferArray(), &first_constant, &num_constants); - m_context->PSSetConstantBuffers1(0, 1, m_uniform_buffer.GetD3DBufferArray(), &first_constant, &num_constants); + const UINT first_constant = offset / 16u; + const UINT num_constants = size / 16u; + if (m_current_compute_shader) + { + m_context->CSSetConstantBuffers1(0, 1, m_uniform_buffer.GetD3DBufferArray(), &first_constant, &num_constants); + } + else + { + m_context->VSSetConstantBuffers1(0, 1, 
m_uniform_buffer.GetD3DBufferArray(), &first_constant, &num_constants); + m_context->PSSetConstantBuffers1(0, 1, m_uniform_buffer.GetD3DBufferArray(), &first_constant, &num_constants); + } } else { - DebugAssert(pos == 0); - m_context->VSSetConstantBuffers(0, 1, m_uniform_buffer.GetD3DBufferArray()); - m_context->PSSetConstantBuffers(0, 1, m_uniform_buffer.GetD3DBufferArray()); + DebugAssert(offset == 0); + if (m_current_compute_shader) + { + m_context->CSSetConstantBuffers(0, 1, m_uniform_buffer.GetD3DBufferArray()); + } + else + { + m_context->VSSetConstantBuffers(0, 1, m_uniform_buffer.GetD3DBufferArray()); + m_context->PSSetConstantBuffers(0, 1, m_uniform_buffer.GetD3DBufferArray()); + } } } @@ -1004,9 +1013,16 @@ void D3D11Device::SetRenderTargets(GPUTexture* const* rts, u32 num_rts, GPUTextu for (u32 i = 0; i < m_num_current_render_targets; i++) uavs[i] = m_current_render_targets[i]->GetD3DUAV(); - m_context->OMSetRenderTargetsAndUnorderedAccessViews( - 0, nullptr, m_current_depth_target ? m_current_depth_target->GetD3DDSV() : nullptr, 0, - m_num_current_render_targets, uavs.data(), nullptr); + if (!m_current_compute_shader) + { + m_context->OMSetRenderTargetsAndUnorderedAccessViews( + 0, nullptr, m_current_depth_target ? m_current_depth_target->GetD3DDSV() : nullptr, 0, + m_num_current_render_targets, uavs.data(), nullptr); + } + else + { + m_context->CSSetUnorderedAccessViews(0, m_num_current_render_targets, uavs.data(), nullptr); + } } else { @@ -1046,11 +1062,15 @@ void D3D11Device::SetTextureSampler(u32 slot, GPUTexture* texture, GPUSampler* s { m_current_textures[slot] = T; m_context->PSSetShaderResources(slot, 1, &T); + if (m_current_compute_shader) + m_context->CSSetShaderResources(slot, 1, &T); } if (m_current_samplers[slot] != S) { m_current_samplers[slot] = S; m_context->PSSetSamplers(slot, 1, &S); + if (m_current_compute_shader) + m_context->CSSetSamplers(slot, 1, &S); } } @@ -1060,6 +1080,8 @@ void D3D11Device::SetTextureBuffer(u32 slot, GPUTextureBuffer* buffer) if (m_current_textures[slot] != B) { m_current_textures[slot] = B; + + // Compute doesn't support texture buffers, yet... 
m_context->PSSetShaderResources(slot, 1, &B); } } @@ -1113,14 +1135,14 @@ void D3D11Device::SetScissor(const GSVector4i rc) void D3D11Device::Draw(u32 vertex_count, u32 base_vertex) { - DebugAssert(!m_vertex_buffer.IsMapped() && !m_index_buffer.IsMapped()); + DebugAssert(!m_vertex_buffer.IsMapped() && !m_index_buffer.IsMapped() && !m_current_compute_shader); s_stats.num_draws++; m_context->Draw(vertex_count, base_vertex); } void D3D11Device::DrawIndexed(u32 index_count, u32 base_index, u32 base_vertex) { - DebugAssert(!m_vertex_buffer.IsMapped() && !m_index_buffer.IsMapped()); + DebugAssert(!m_vertex_buffer.IsMapped() && !m_index_buffer.IsMapped() && !m_current_compute_shader); s_stats.num_draws++; m_context->DrawIndexed(index_count, base_index, base_vertex); } @@ -1129,3 +1151,15 @@ void D3D11Device::DrawIndexedWithBarrier(u32 index_count, u32 base_index, u32 ba { Panic("Barriers are not supported"); } + +void D3D11Device::Dispatch(u32 threads_x, u32 threads_y, u32 threads_z, u32 group_size_x, u32 group_size_y, + u32 group_size_z) +{ + DebugAssert(m_current_compute_shader); + s_stats.num_draws++; + + const u32 groups_x = threads_x / group_size_x; + const u32 groups_y = threads_y / group_size_y; + const u32 groups_z = threads_z / group_size_z; + m_context->Dispatch(groups_x, groups_y, groups_z); +} diff --git a/src/util/d3d11_device.h b/src/util/d3d11_device.h index 2fbb2dfa4..e1b327250 100644 --- a/src/util/d3d11_device.h +++ b/src/util/d3d11_device.h @@ -75,6 +75,7 @@ public: std::string_view source, const char* entry_point, DynamicHeapArray* out_binary, Error* error) override; std::unique_ptr CreatePipeline(const GPUPipeline::GraphicsConfig& config, Error* error) override; + std::unique_ptr CreatePipeline(const GPUPipeline::ComputeConfig& config, Error* error) override; void PushDebugGroup(const char* name) override; void PopDebugGroup() override; @@ -98,6 +99,8 @@ public: void Draw(u32 vertex_count, u32 base_vertex) override; void DrawIndexed(u32 index_count, u32 base_index, u32 base_vertex) override; void DrawIndexedWithBarrier(u32 index_count, u32 base_index, u32 base_vertex, DrawBarrier type) override; + void Dispatch(u32 threads_x, u32 threads_y, u32 threads_z, u32 group_size_x, u32 group_size_y, + u32 group_size_z) override; bool SetGPUTimingEnabled(bool enabled) override; float GetAndResetAccumulatedGPUTime() override; @@ -140,6 +143,8 @@ private: bool CreateBuffers(); void DestroyBuffers(); + void BindUniformBuffer(u32 offset, u32 size); + void UnbindComputePipeline(); bool IsRenderTargetBound(const D3D11Texture* tex) const; @@ -180,6 +185,7 @@ private: ID3D11VertexShader* m_current_vertex_shader = nullptr; ID3D11GeometryShader* m_current_geometry_shader = nullptr; ID3D11PixelShader* m_current_pixel_shader = nullptr; + ID3D11ComputeShader* m_current_compute_shader = nullptr; ID3D11RasterizerState* m_current_rasterizer_state = nullptr; ID3D11DepthStencilState* m_current_depth_state = nullptr; ID3D11BlendState* m_current_blend_state = nullptr; diff --git a/src/util/d3d11_pipeline.cpp b/src/util/d3d11_pipeline.cpp index 0c2301cec..b0d4dd681 100644 --- a/src/util/d3d11_pipeline.cpp +++ b/src/util/d3d11_pipeline.cpp @@ -3,6 +3,7 @@ #include "d3d11_pipeline.h" #include "d3d11_device.h" +#include "d3d11_texture.h" #include "d3d_common.h" #include "common/assert.h" @@ -121,10 +122,10 @@ std::unique_ptr D3D11Device::CreateShaderFromSource(GPUShaderStage st D3D11Pipeline::D3D11Pipeline(ComPtr rs, ComPtr ds, ComPtr bs, ComPtr il, ComPtr vs, - ComPtr gs, ComPtr ps, + ComPtr gs, ComPtr 
ps_or_cs, D3D11_PRIMITIVE_TOPOLOGY topology, u32 vertex_stride, u32 blend_factor) : m_rs(std::move(rs)), m_ds(std::move(ds)), m_bs(std::move(bs)), m_il(std::move(il)), m_vs(std::move(vs)), - m_gs(std::move(gs)), m_ps(std::move(ps)), m_topology(topology), m_vertex_stride(vertex_stride), + m_gs(std::move(gs)), m_ps_or_cs(std::move(ps_or_cs)), m_topology(topology), m_vertex_stride(vertex_stride), m_blend_factor(blend_factor), m_blend_factor_float(GPUDevice::RGBA8ToFloat(blend_factor)) { } @@ -215,7 +216,8 @@ size_t D3D11Device::BlendStateMapHash::operator()(const BlendStateMapKey& key) c return h; } -D3D11Device::ComPtr D3D11Device::GetBlendState(const GPUPipeline::BlendState& bs, u32 num_rts, Error* error) +D3D11Device::ComPtr D3D11Device::GetBlendState(const GPUPipeline::BlendState& bs, u32 num_rts, + Error* error) { ComPtr dbs; @@ -365,69 +367,124 @@ std::unique_ptr D3D11Device::CreatePipeline(const GPUPipeline::Grap primitives[static_cast(config.primitive)], vertex_stride, config.blend.constant)); } +std::unique_ptr D3D11Device::CreatePipeline(const GPUPipeline::ComputeConfig& config, Error* error) +{ + if (!config.compute_shader) [[unlikely]] + { + Error::SetStringView(error, "Missing compute shader."); + return {}; + } + + return std::unique_ptr( + new D3D11Pipeline(nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, + static_cast(config.compute_shader)->GetComputeShader(), + D3D11_PRIMITIVE_TOPOLOGY_UNDEFINED, 0, 0)); +} + void D3D11Device::SetPipeline(GPUPipeline* pipeline) { if (m_current_pipeline == pipeline) return; + const bool was_compute = m_current_pipeline && m_current_pipeline->IsComputePipeline(); D3D11Pipeline* const PL = static_cast(pipeline); m_current_pipeline = PL; - if (ID3D11InputLayout* il = PL->GetInputLayout(); m_current_input_layout != il) + if (!PL->IsComputePipeline()) { - m_current_input_layout = il; - m_context->IASetInputLayout(il); - } + if (was_compute) + UnbindComputePipeline(); - if (const u32 vertex_stride = PL->GetVertexStride(); m_current_vertex_stride != vertex_stride) - { - const UINT offset = 0; - m_current_vertex_stride = PL->GetVertexStride(); - m_context->IASetVertexBuffers(0, 1, m_vertex_buffer.GetD3DBufferArray(), &m_current_vertex_stride, &offset); - } + if (ID3D11InputLayout* il = PL->GetInputLayout(); m_current_input_layout != il) + { + m_current_input_layout = il; + m_context->IASetInputLayout(il); + } - if (D3D_PRIMITIVE_TOPOLOGY topology = PL->GetPrimitiveTopology(); m_current_primitive_topology != topology) - { - m_current_primitive_topology = topology; - m_context->IASetPrimitiveTopology(topology); - } + if (const u32 vertex_stride = PL->GetVertexStride(); m_current_vertex_stride != vertex_stride) + { + const UINT offset = 0; + m_current_vertex_stride = PL->GetVertexStride(); + m_context->IASetVertexBuffers(0, 1, m_vertex_buffer.GetD3DBufferArray(), &m_current_vertex_stride, &offset); + } - if (ID3D11VertexShader* vs = PL->GetVertexShader(); m_current_vertex_shader != vs) - { - m_current_vertex_shader = vs; - m_context->VSSetShader(vs, nullptr, 0); - } + if (D3D_PRIMITIVE_TOPOLOGY topology = PL->GetPrimitiveTopology(); m_current_primitive_topology != topology) + { + m_current_primitive_topology = topology; + m_context->IASetPrimitiveTopology(topology); + } - if (ID3D11GeometryShader* gs = PL->GetGeometryShader(); m_current_geometry_shader != gs) - { - m_current_geometry_shader = gs; - m_context->GSSetShader(gs, nullptr, 0); - } + if (ID3D11VertexShader* vs = PL->GetVertexShader(); m_current_vertex_shader != vs) + { + 
m_current_vertex_shader = vs; + m_context->VSSetShader(vs, nullptr, 0); + } - if (ID3D11PixelShader* ps = PL->GetPixelShader(); m_current_pixel_shader != ps) - { - m_current_pixel_shader = ps; - m_context->PSSetShader(ps, nullptr, 0); - } + if (ID3D11GeometryShader* gs = PL->GetGeometryShader(); m_current_geometry_shader != gs) + { + m_current_geometry_shader = gs; + m_context->GSSetShader(gs, nullptr, 0); + } - if (ID3D11RasterizerState* rs = PL->GetRasterizerState(); m_current_rasterizer_state != rs) - { - m_current_rasterizer_state = rs; - m_context->RSSetState(rs); - } + if (ID3D11PixelShader* ps = PL->GetPixelShader(); m_current_pixel_shader != ps) + { + m_current_pixel_shader = ps; + m_context->PSSetShader(ps, nullptr, 0); + } - if (ID3D11DepthStencilState* ds = PL->GetDepthStencilState(); m_current_depth_state != ds) - { - m_current_depth_state = ds; - m_context->OMSetDepthStencilState(ds, 0); - } + if (ID3D11RasterizerState* rs = PL->GetRasterizerState(); m_current_rasterizer_state != rs) + { + m_current_rasterizer_state = rs; + m_context->RSSetState(rs); + } - if (ID3D11BlendState* bs = PL->GetBlendState(); - m_current_blend_state != bs || m_current_blend_factor != PL->GetBlendFactor()) + if (ID3D11DepthStencilState* ds = PL->GetDepthStencilState(); m_current_depth_state != ds) + { + m_current_depth_state = ds; + m_context->OMSetDepthStencilState(ds, 0); + } + + if (ID3D11BlendState* bs = PL->GetBlendState(); + m_current_blend_state != bs || m_current_blend_factor != PL->GetBlendFactor()) + { + m_current_blend_state = bs; + m_current_blend_factor = PL->GetBlendFactor(); + m_context->OMSetBlendState(bs, RGBA8ToFloat(m_current_blend_factor).data(), 0xFFFFFFFFu); + } + } + else { - m_current_blend_state = bs; - m_current_blend_factor = PL->GetBlendFactor(); - m_context->OMSetBlendState(bs, RGBA8ToFloat(m_current_blend_factor).data(), 0xFFFFFFFFu); + if (ID3D11ComputeShader* cs = m_current_pipeline->GetComputeShader(); cs != m_current_compute_shader) + { + m_current_compute_shader = cs; + m_context->CSSetShader(cs, nullptr, 0); + } + + if (!was_compute) + { + // need to bind all SRVs/samplers + u32 count; + for (count = 0; count < MAX_TEXTURE_SAMPLERS; count++) + { + if (!m_current_textures[count]) + break; + } + if (count > 0) + { + m_context->CSSetShaderResources(0, count, m_current_textures.data()); + m_context->CSSetSamplers(0, count, m_current_samplers.data()); + } + + if (m_current_render_pass_flags & GPUPipeline::BindRenderTargetsAsImages) + { + ID3D11UnorderedAccessView* uavs[MAX_TEXTURE_SAMPLERS]; + for (u32 i = 0; i < m_num_current_render_targets; i++) + uavs[i] = m_current_render_targets[i]->GetD3DUAV(); + + m_context->OMSetRenderTargets(0, nullptr, nullptr); + m_context->CSSetUnorderedAccessViews(0, m_num_current_render_targets, uavs, nullptr); + } + } } } @@ -436,6 +493,23 @@ void D3D11Device::UnbindPipeline(D3D11Pipeline* pl) if (m_current_pipeline != pl) return; + if (pl->IsComputePipeline()) + UnbindComputePipeline(); + // Let the runtime deal with the dead objects... 
m_current_pipeline = nullptr; } + +void D3D11Device::UnbindComputePipeline() +{ + m_current_compute_shader = nullptr; + + ID3D11ShaderResourceView* null_srvs[MAX_TEXTURE_SAMPLERS] = {}; + ID3D11SamplerState* null_samplers[MAX_TEXTURE_SAMPLERS] = {}; + ID3D11UnorderedAccessView* null_uavs[MAX_RENDER_TARGETS] = {}; + m_context->CSSetShader(nullptr, nullptr, 0); + m_context->CSSetShaderResources(0, MAX_TEXTURE_SAMPLERS, null_srvs); + m_context->CSSetSamplers(0, MAX_TEXTURE_SAMPLERS, null_samplers); + if (m_current_render_pass_flags & GPUPipeline::BindRenderTargetsAsImages) + m_context->CSSetUnorderedAccessViews(0, m_num_current_render_targets, null_uavs, nullptr); +} diff --git a/src/util/d3d11_pipeline.h b/src/util/d3d11_pipeline.h index 88e825750..c3d58d3b2 100644 --- a/src/util/d3d11_pipeline.h +++ b/src/util/d3d11_pipeline.h @@ -51,13 +51,18 @@ public: void SetDebugName(std::string_view name) override; + ALWAYS_INLINE bool IsComputePipeline() const { return !m_vs; } ALWAYS_INLINE ID3D11RasterizerState* GetRasterizerState() const { return m_rs.Get(); } ALWAYS_INLINE ID3D11DepthStencilState* GetDepthStencilState() const { return m_ds.Get(); } ALWAYS_INLINE ID3D11BlendState* GetBlendState() const { return m_bs.Get(); } ALWAYS_INLINE ID3D11InputLayout* GetInputLayout() const { return m_il.Get(); } ALWAYS_INLINE ID3D11VertexShader* GetVertexShader() const { return m_vs.Get(); } ALWAYS_INLINE ID3D11GeometryShader* GetGeometryShader() const { return m_gs.Get(); } - ALWAYS_INLINE ID3D11PixelShader* GetPixelShader() const { return m_ps.Get(); } + ALWAYS_INLINE ID3D11PixelShader* GetPixelShader() const { return static_cast(m_ps_or_cs.Get()); } + ALWAYS_INLINE ID3D11ComputeShader* GetComputeShader() const + { + return static_cast(m_ps_or_cs.Get()); + } ALWAYS_INLINE D3D11_PRIMITIVE_TOPOLOGY GetPrimitiveTopology() const { return m_topology; } ALWAYS_INLINE u32 GetVertexStride() const { return m_vertex_stride; } ALWAYS_INLINE u32 GetBlendFactor() const { return m_blend_factor; } @@ -66,7 +71,8 @@ public: private: D3D11Pipeline(ComPtr rs, ComPtr ds, ComPtr bs, ComPtr il, ComPtr vs, ComPtr gs, - ComPtr ps, D3D11_PRIMITIVE_TOPOLOGY topology, u32 vertex_stride, u32 blend_factor); + ComPtr ps_or_cs, D3D11_PRIMITIVE_TOPOLOGY topology, u32 vertex_stride, + u32 blend_factor); ComPtr m_rs; ComPtr m_ds; @@ -74,7 +80,7 @@ private: ComPtr m_il; ComPtr m_vs; ComPtr m_gs; - ComPtr m_ps; + ComPtr m_ps_or_cs; D3D11_PRIMITIVE_TOPOLOGY m_topology; u32 m_vertex_stride; u32 m_blend_factor; diff --git a/src/util/d3d12_builders.h b/src/util/d3d12_builders.h index ae2970716..9dbffcf77 100644 --- a/src/util/d3d12_builders.h +++ b/src/util/d3d12_builders.h @@ -115,6 +115,8 @@ public: ComputePipelineBuilder(); ~ComputePipelineBuilder() = default; + ALWAYS_INLINE const D3D12_COMPUTE_PIPELINE_STATE_DESC* GetDesc() const { return &m_desc; } + void Clear(); Microsoft::WRL::ComPtr Create(ID3D12Device* device, Error* error, bool clear); diff --git a/src/util/d3d12_device.cpp b/src/util/d3d12_device.cpp index cd17a22e4..be58487c9 100644 --- a/src/util/d3d12_device.cpp +++ b/src/util/d3d12_device.cpp @@ -1298,6 +1298,7 @@ void D3D12Device::SetFeatures(D3D_FEATURE_LEVEL feature_level, FeatureMask disab m_features.texture_buffers_emulated_with_ssbo = false; m_features.feedback_loops = false; m_features.geometry_shaders = !(disabled_features & FEATURE_MASK_GEOMETRY_SHADERS); + m_features.compute_shaders = !(disabled_features & FEATURE_MASK_COMPUTE_SHADERS); m_features.partial_msaa_resolve = true; m_features.memory_import = false; 
m_features.explicit_present = true; @@ -1552,6 +1553,7 @@ void D3D12Device::PushUniformBuffer(const void* data, u32 data_size) 1, // SingleTextureBufferAndPushConstants 0, // MultiTextureAndUBO 2, // MultiTextureAndPushConstants + 2, // ComputeSingleTextureAndPushConstants }; DebugAssert(data_size < UNIFORM_PUSH_CONSTANTS_SIZE); @@ -1565,7 +1567,11 @@ void D3D12Device::PushUniformBuffer(const void* data, u32 data_size) const u32 push_param = push_parameters[static_cast(m_current_pipeline_layout)] + BoolToUInt8(IsUsingROVRootSignature()); - GetCommandList()->SetGraphicsRoot32BitConstants(push_param, data_size / 4u, data, 0); + ID3D12GraphicsCommandList4* cmdlist = GetCommandList(); + if (!IsUsingComputeRootSignature()) + cmdlist->SetGraphicsRoot32BitConstants(push_param, data_size / 4u, data, 0); + else + cmdlist->SetComputeRoot32BitConstants(push_param, data_size / 4u, data, 0); } void* D3D12Device::MapUniformBuffer(u32 size) @@ -1687,6 +1693,18 @@ bool D3D12Device::CreateRootSignatures(Error* error) } } + { + auto& rs = m_root_signatures[0][static_cast(GPUPipeline::Layout::ComputeSingleTextureAndPushConstants)]; + + rsb.AddDescriptorTable(D3D12_DESCRIPTOR_RANGE_TYPE_SRV, 0, MAX_TEXTURE_SAMPLERS, D3D12_SHADER_VISIBILITY_ALL); + rsb.AddDescriptorTable(D3D12_DESCRIPTOR_RANGE_TYPE_SAMPLER, 0, MAX_TEXTURE_SAMPLERS, D3D12_SHADER_VISIBILITY_ALL); + rsb.AddDescriptorTable(D3D12_DESCRIPTOR_RANGE_TYPE_UAV, 0, MAX_IMAGE_RENDER_TARGETS, D3D12_SHADER_VISIBILITY_ALL); + rsb.Add32BitConstants(0, UNIFORM_PUSH_CONSTANTS_SIZE / sizeof(u32), D3D12_SHADER_VISIBILITY_ALL); + if (!(rs = rsb.Create(error, true))) + return false; + D3D12::SetObjectName(rs.Get(), "Compute Single Texture Pipeline Layout"); + } + return true; } @@ -1810,6 +1828,7 @@ void D3D12Device::BeginRenderPass() rt->TransitionToState(cmdlist, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); rt->SetUseFenceValue(GetCurrentFenceValue()); rt->CommitClear(cmdlist); + rt->SetState(GPUTexture::State::Dirty); } } if (m_current_depth_target) @@ -2174,15 +2193,88 @@ void D3D12Device::PreDrawCheck() BeginRenderPass(); } +void D3D12Device::PreDispatchCheck() +{ + if (InRenderPass()) + EndRenderPass(); + + // Transition images. + ID3D12GraphicsCommandList4* cmdlist = GetCommandList(); + + // All textures should be in shader read only optimal already, but just in case.. + const u32 num_textures = GetActiveTexturesForLayout(m_current_pipeline_layout); + for (u32 i = 0; i < num_textures; i++) + { + if (m_current_textures[i]) + m_current_textures[i]->TransitionToState(cmdlist, D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE); + } + + if (m_num_current_render_targets > 0 && (m_current_render_pass_flags & GPUPipeline::BindRenderTargetsAsImages)) + { + // Still need to clear the RTs. + for (u32 i = 0; i < m_num_current_render_targets; i++) + { + D3D12Texture* const rt = m_current_render_targets[i]; + rt->TransitionToState(cmdlist, D3D12_RESOURCE_STATE_UNORDERED_ACCESS); + rt->SetUseFenceValue(GetCurrentFenceValue()); + rt->CommitClear(cmdlist); + rt->SetState(GPUTexture::State::Dirty); + } + } + + // If this is a new command buffer, bind the pipeline and such. + if (m_dirty_flags & DIRTY_FLAG_INITIAL) + SetInitialPipelineState(); + + // TODO: Flushing cmdbuffer because of descriptor OOM will lose push constants. 
+ DebugAssert(!(m_dirty_flags & DIRTY_FLAG_INITIAL)); + const u32 dirty = std::exchange(m_dirty_flags, 0); + if (dirty != 0) + { + if (dirty & DIRTY_FLAG_PIPELINE_LAYOUT) + { + UpdateRootSignature(); + if (!UpdateRootParameters(dirty)) + { + SubmitCommandList(false, "out of descriptors"); + PreDispatchCheck(); + return; + } + } + else if (dirty & (DIRTY_FLAG_CONSTANT_BUFFER | DIRTY_FLAG_TEXTURES | DIRTY_FLAG_SAMPLERS | DIRTY_FLAG_RT_UAVS)) + { + if (!UpdateRootParameters(dirty)) + { + SubmitCommandList(false, "out of descriptors"); + PreDispatchCheck(); + return; + } + } + } +} + bool D3D12Device::IsUsingROVRootSignature() const { return ((m_current_render_pass_flags & GPUPipeline::BindRenderTargetsAsImages) != 0); } +bool D3D12Device::IsUsingComputeRootSignature() const +{ + return (m_current_pipeline_layout >= GPUPipeline::Layout::ComputeSingleTextureAndPushConstants); +} + void D3D12Device::UpdateRootSignature() { - GetCommandList()->SetGraphicsRootSignature( - m_root_signatures[BoolToUInt8(IsUsingROVRootSignature())][static_cast(m_current_pipeline_layout)].Get()); + ID3D12GraphicsCommandList4* cmdlist = GetCommandList(); + if (!IsUsingComputeRootSignature()) + { + cmdlist->SetGraphicsRootSignature( + m_root_signatures[BoolToUInt8(IsUsingROVRootSignature())][static_cast(m_current_pipeline_layout)].Get()); + } + else + { + cmdlist->SetComputeRootSignature(m_root_signatures[0][static_cast(m_current_pipeline_layout)].Get()); + } } template @@ -2223,7 +2315,10 @@ bool D3D12Device::UpdateParametersForLayout(u32 dirty) D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); } - cmdlist->SetGraphicsRootDescriptorTable(0, gpu_handle); + if constexpr (layout < GPUPipeline::Layout::ComputeSingleTextureAndPushConstants) + cmdlist->SetGraphicsRootDescriptorTable(0, gpu_handle); + else + cmdlist->SetComputeRootDescriptorTable(0, gpu_handle); } if (dirty & DIRTY_FLAG_SAMPLERS && num_textures > 0) @@ -2241,7 +2336,10 @@ bool D3D12Device::UpdateParametersForLayout(u32 dirty) return false; } - cmdlist->SetGraphicsRootDescriptorTable(1, gpu_handle); + if constexpr (layout < GPUPipeline::Layout::ComputeSingleTextureAndPushConstants) + cmdlist->SetGraphicsRootDescriptorTable(1, gpu_handle); + else + cmdlist->SetComputeRootDescriptorTable(1, gpu_handle); } if (dirty & DIRTY_FLAG_TEXTURES && layout == GPUPipeline::Layout::SingleTextureBufferAndPushConstants) @@ -2283,7 +2381,10 @@ bool D3D12Device::UpdateParametersForLayout(u32 dirty) 1 : ((layout == GPUPipeline::Layout::SingleTextureAndUBO || layout == GPUPipeline::Layout::MultiTextureAndUBO) ? 
3 : 2); - cmdlist->SetGraphicsRootDescriptorTable(rov_param, gpu_handle); + if constexpr (layout < GPUPipeline::Layout::ComputeSingleTextureAndPushConstants) + cmdlist->SetGraphicsRootDescriptorTable(rov_param, gpu_handle); + else + cmdlist->SetComputeRootDescriptorTable(rov_param, gpu_handle); } return true; @@ -2308,6 +2409,9 @@ bool D3D12Device::UpdateRootParameters(u32 dirty) case GPUPipeline::Layout::MultiTextureAndPushConstants: return UpdateParametersForLayout(dirty); + case GPUPipeline::Layout::ComputeSingleTextureAndPushConstants: + return UpdateParametersForLayout(dirty); + default: UnreachableCode(); } @@ -2331,3 +2435,15 @@ void D3D12Device::DrawIndexedWithBarrier(u32 index_count, u32 base_index, u32 ba { Panic("Barriers are not supported"); } + +void D3D12Device::Dispatch(u32 threads_x, u32 threads_y, u32 threads_z, u32 group_size_x, u32 group_size_y, + u32 group_size_z) +{ + PreDispatchCheck(); + s_stats.num_draws++; + + const u32 groups_x = threads_x / group_size_x; + const u32 groups_y = threads_y / group_size_y; + const u32 groups_z = threads_z / group_size_z; + GetCommandList()->Dispatch(groups_x, groups_y, groups_z); +} diff --git a/src/util/d3d12_device.h b/src/util/d3d12_device.h index e20bd525f..ba065cfc5 100644 --- a/src/util/d3d12_device.h +++ b/src/util/d3d12_device.h @@ -96,6 +96,7 @@ public: std::string_view source, const char* entry_point, DynamicHeapArray* out_binary, Error* error) override; std::unique_ptr CreatePipeline(const GPUPipeline::GraphicsConfig& config, Error* error) override; + std::unique_ptr CreatePipeline(const GPUPipeline::ComputeConfig& config, Error* error) override; void PushDebugGroup(const char* name) override; void PopDebugGroup() override; @@ -119,6 +120,8 @@ public: void Draw(u32 vertex_count, u32 base_vertex) override; void DrawIndexed(u32 index_count, u32 base_index, u32 base_vertex) override; void DrawIndexedWithBarrier(u32 index_count, u32 base_index, u32 base_vertex, DrawBarrier type) override; + void Dispatch(u32 threads_x, u32 threads_y, u32 threads_z, u32 group_size_x, u32 group_size_y, + u32 group_size_z) override; bool SetGPUTimingEnabled(bool enabled) override; float GetAndResetAccumulatedGPUTime() override; @@ -275,8 +278,10 @@ private: ID3D12RootSignature* GetCurrentRootSignature() const; void SetInitialPipelineState(); void PreDrawCheck(); + void PreDispatchCheck(); bool IsUsingROVRootSignature() const; + bool IsUsingComputeRootSignature() const; void UpdateRootSignature(); template bool UpdateParametersForLayout(u32 dirty); diff --git a/src/util/d3d12_pipeline.cpp b/src/util/d3d12_pipeline.cpp index c25a67c5c..4b78c2aff 100644 --- a/src/util/d3d12_pipeline.cpp +++ b/src/util/d3d12_pipeline.cpp @@ -107,6 +107,18 @@ std::string D3D12Pipeline::GetPipelineName(const GraphicsConfig& config) return SHA1Digest::DigestToString(digest); } +std::string D3D12Pipeline::GetPipelineName(const ComputeConfig& config) +{ + SHA1Digest hash; + hash.Update(&config.layout, sizeof(config.layout)); + if (const D3D12Shader* shader = static_cast(config.compute_shader)) + hash.Update(shader->GetBytecodeData(), shader->GetBytecodeSize()); + + u8 digest[SHA1Digest::DIGEST_SIZE]; + hash.Final(digest); + return SHA1Digest::DigestToString(digest); +} + std::unique_ptr D3D12Device::CreatePipeline(const GPUPipeline::GraphicsConfig& config, Error* error) { static constexpr std::array(GPUPipeline::Primitive::MaxCount)> primitives = @@ -274,3 +286,46 @@ std::unique_ptr D3D12Device::CreatePipeline(const GPUPipeline::Grap pipeline, config.layout, 
primitives[static_cast(config.primitive)], config.input_layout.vertex_attributes.empty() ? 0 : config.input_layout.vertex_stride, config.blend.constant)); } + +std::unique_ptr D3D12Device::CreatePipeline(const GPUPipeline::ComputeConfig& config, Error* error) +{ + D3D12::ComputePipelineBuilder cpb; + cpb.SetRootSignature(m_root_signatures[0][static_cast(config.layout)].Get()); + cpb.SetShader(static_cast(config.compute_shader)->GetBytecodeData(), + static_cast(config.compute_shader)->GetBytecodeSize()); + + ComPtr pipeline; + if (m_pipeline_library) + { + const std::wstring name = StringUtil::UTF8StringToWideString(D3D12Pipeline::GetPipelineName(config)); + HRESULT hr = + m_pipeline_library->LoadComputePipeline(name.c_str(), cpb.GetDesc(), IID_PPV_ARGS(pipeline.GetAddressOf())); + if (FAILED(hr)) + { + // E_INVALIDARG = not found. + if (hr != E_INVALIDARG) + ERROR_LOG("LoadComputePipeline() failed with HRESULT {:08X}", static_cast(hr)); + + // Need to create it normally. + pipeline = cpb.Create(m_device.Get(), error, false); + + // Store if it wasn't an OOM or something else. + if (pipeline && hr == E_INVALIDARG) + { + hr = m_pipeline_library->StorePipeline(name.c_str(), pipeline.Get()); + if (FAILED(hr)) + ERROR_LOG("StorePipeline() failed with HRESULT {:08X}", static_cast(hr)); + } + } + } + else + { + pipeline = cpb.Create(m_device.Get(), error, false); + } + + if (!pipeline) + return {}; + + return std::unique_ptr( + new D3D12Pipeline(pipeline, config.layout, D3D_PRIMITIVE_TOPOLOGY_UNDEFINED, 0, 0)); +} diff --git a/src/util/d3d12_pipeline.h b/src/util/d3d12_pipeline.h index bca9494fa..e2f83d14f 100644 --- a/src/util/d3d12_pipeline.h +++ b/src/util/d3d12_pipeline.h @@ -51,6 +51,7 @@ public: void SetDebugName(std::string_view name) override; static std::string GetPipelineName(const GraphicsConfig& config); + static std::string GetPipelineName(const ComputeConfig& config); private: D3D12Pipeline(Microsoft::WRL::ComPtr pipeline, Layout layout, D3D12_PRIMITIVE_TOPOLOGY topology, diff --git a/src/util/gpu_device.cpp b/src/util/gpu_device.cpp index d9554289c..6a4c3a6dd 100644 --- a/src/util/gpu_device.cpp +++ b/src/util/gpu_device.cpp @@ -1579,11 +1579,13 @@ bool GPUDevice::TranslateVulkanSpvToLanguage(const std::span spirv, GP // Need to know if there's UBOs for mapping. 
  const spvc_reflected_resource *ubos, *textures;
-  size_t ubos_count, textures_count;
+  size_t ubos_count, textures_count, images_count;
   if ((sres = dyn_libs::spvc_resources_get_resource_list_for_type(resources, SPVC_RESOURCE_TYPE_UNIFORM_BUFFER, &ubos,
                                                                   &ubos_count)) != SPVC_SUCCESS ||
       (sres = dyn_libs::spvc_resources_get_resource_list_for_type(resources, SPVC_RESOURCE_TYPE_SAMPLED_IMAGE,
-                                                                  &textures, &textures_count)) != SPVC_SUCCESS)
+                                                                  &textures, &textures_count)) != SPVC_SUCCESS ||
+      (sres = dyn_libs::spvc_resources_get_resource_list_for_type(resources, SPVC_RESOURCE_TYPE_STORAGE_IMAGE,
+                                                                  &textures, &images_count)) != SPVC_SUCCESS)
   {
     Error::SetStringFmt(error, "spvc_resources_get_resource_list_for_type() failed: {}", static_cast<int>(sres));
     return {};
   }
 
@@ -1592,6 +1594,7 @@
   [[maybe_unused]] const SpvExecutionModel execmodel = dyn_libs::spvc_compiler_get_execution_model(scompiler);
   [[maybe_unused]] static constexpr u32 UBO_DESCRIPTOR_SET = 0;
   [[maybe_unused]] static constexpr u32 TEXTURE_DESCRIPTOR_SET = 1;
+  [[maybe_unused]] static constexpr u32 IMAGE_DESCRIPTOR_SET = 2;
 
   switch (target_language)
   {
@@ -1659,6 +1662,25 @@
           }
         }
       }
+
+      if (stage == GPUShaderStage::Compute)
+      {
+        for (u32 i = 0; i < images_count; i++)
+        {
+          const spvc_hlsl_resource_binding rb = {.stage = execmodel,
+                                                 .desc_set = IMAGE_DESCRIPTOR_SET,
+                                                 .binding = i,
+                                                 .cbv = {},
+                                                 .uav = {.register_space = 0, .register_binding = i},
+                                                 .srv = {},
+                                                 .sampler = {}};
+          if ((sres = dyn_libs::spvc_compiler_hlsl_add_resource_binding(scompiler, &rb)) != SPVC_SUCCESS)
+          {
+            Error::SetStringFmt(error, "spvc_compiler_hlsl_add_resource_binding() failed: {}", static_cast<int>(sres));
+            return {};
+          }
+        }
+      }
     }
     break;
 #endif
@@ -1727,12 +1749,25 @@
       return {};
     }
 
-    if (stage == GPUShaderStage::Fragment)
+    const spvc_msl_resource_binding pc_rb = {.stage = execmodel,
+                                             .desc_set = SPVC_MSL_PUSH_CONSTANT_DESC_SET,
+                                             .binding = SPVC_MSL_PUSH_CONSTANT_BINDING,
+                                             .msl_buffer = 0,
+                                             .msl_texture = 0,
+                                             .msl_sampler = 0};
+    if ((sres = dyn_libs::spvc_compiler_msl_add_resource_binding(scompiler, &pc_rb)) != SPVC_SUCCESS)
+    {
+      Error::SetStringFmt(error, "spvc_compiler_msl_add_resource_binding() for push constant failed: {}",
+                          static_cast<int>(sres));
+      return {};
+    }
+
+    if (stage == GPUShaderStage::Fragment || stage == GPUShaderStage::Compute)
     {
       for (u32 i = 0; i < MAX_TEXTURE_SAMPLERS; i++)
       {
-        const spvc_msl_resource_binding rb = {.stage = SpvExecutionModelFragment,
-                                              .desc_set = 1,
+        const spvc_msl_resource_binding rb = {.stage = execmodel,
+                                              .desc_set = TEXTURE_DESCRIPTOR_SET,
                                               .binding = i,
                                               .msl_buffer = i,
                                               .msl_texture = i,
@@ -1744,16 +1779,31 @@
           return {};
         }
       }
+    }
 
-      if (!m_features.framebuffer_fetch)
+    if (stage == GPUShaderStage::Fragment && !m_features.framebuffer_fetch)
+    {
+      const spvc_msl_resource_binding rb = {
+        .stage = execmodel, .desc_set = 2, .binding = 0, .msl_texture = MAX_TEXTURE_SAMPLERS};
+
+      if ((sres = dyn_libs::spvc_compiler_msl_add_resource_binding(scompiler, &rb)) != SPVC_SUCCESS)
+      {
+        Error::SetStringFmt(error, "spvc_compiler_msl_add_resource_binding() for FB failed: {}",
+                            static_cast<int>(sres));
+        return {};
+      }
+    }
+
+    if (stage == GPUShaderStage::Compute)
+    {
+      for (u32 i = 0; i < MAX_IMAGE_RENDER_TARGETS; i++)
       {
         const spvc_msl_resource_binding rb = {
-          .stage = SpvExecutionModelFragment, .desc_set = 2, .binding = 0, .msl_texture = MAX_TEXTURE_SAMPLERS};
+          .stage = execmodel, .desc_set = 2, .binding = i, .msl_buffer = i, .msl_texture = i, .msl_sampler = i};
 
         if ((sres = dyn_libs::spvc_compiler_msl_add_resource_binding(scompiler, &rb)) != SPVC_SUCCESS)
         {
-          Error::SetStringFmt(error, "spvc_compiler_msl_add_resource_binding() for FB failed: {}",
-                              static_cast<int>(sres));
+          Error::SetStringFmt(error, "spvc_compiler_msl_add_resource_binding() failed: {}", static_cast<int>(sres));
           return {};
         }
       }
diff --git a/src/util/gpu_device.h b/src/util/gpu_device.h
index 309b4db39..6c1060c60 100644
--- a/src/util/gpu_device.h
+++ b/src/util/gpu_device.h
@@ -160,6 +160,9 @@ public:
     // Multiple textures, 128 byte UBO via push constants.
     MultiTextureAndPushConstants,
 
+    // 128 byte UBO via push constants, 1 texture, compute shader.
+    ComputeSingleTextureAndPushConstants,
+
     MaxCount
   };
 
@@ -416,6 +419,12 @@ public:
     u32 GetRenderTargetCount() const;
   };
 
+  struct ComputeConfig
+  {
+    Layout layout;
+    GPUShader* compute_shader;
+  };
+
   GPUPipeline();
   virtual ~GPUPipeline();
 
@@ -501,9 +510,10 @@ public:
     FEATURE_MASK_FRAMEBUFFER_FETCH = (1 << 2),
     FEATURE_MASK_TEXTURE_BUFFERS = (1 << 3),
     FEATURE_MASK_GEOMETRY_SHADERS = (1 << 4),
-    FEATURE_MASK_TEXTURE_COPY_TO_SELF = (1 << 5),
-    FEATURE_MASK_MEMORY_IMPORT = (1 << 6),
-    FEATURE_MASK_RASTER_ORDER_VIEWS = (1 << 7),
+    FEATURE_MASK_COMPUTE_SHADERS = (1 << 5),
+    FEATURE_MASK_TEXTURE_COPY_TO_SELF = (1 << 6),
+    FEATURE_MASK_MEMORY_IMPORT = (1 << 7),
+    FEATURE_MASK_RASTER_ORDER_VIEWS = (1 << 8),
   };
 
   enum class DrawBarrier : u32
@@ -532,6 +542,7 @@ public:
     bool texture_buffers_emulated_with_ssbo : 1;
     bool feedback_loops : 1;
     bool geometry_shaders : 1;
+    bool compute_shaders : 1;
     bool partial_msaa_resolve : 1;
     bool memory_import : 1;
     bool explicit_present : 1;
@@ -625,11 +636,20 @@ public:
      0,                    // SingleTextureBufferAndPushConstants
      MAX_TEXTURE_SAMPLERS, // MultiTextureAndUBO
      MAX_TEXTURE_SAMPLERS, // MultiTextureAndPushConstants
+     1,                    // ComputeSingleTextureAndPushConstants
    };
 
    return counts[static_cast<size_t>(layout)];
  }
 
+  /// Returns the number of thread groups to dispatch for a given total count and local size.
+  static constexpr std::tuple<u32, u32, u32> GetDispatchCount(u32 count_x, u32 count_y, u32 count_z, u32 local_size_x,
+                                                              u32 local_size_y, u32 local_size_z)
+  {
+    return std::make_tuple((count_x + (local_size_x - 1)) / local_size_x, (count_y + (local_size_y - 1)) / local_size_y,
+                           (count_z + (local_size_z - 1)) / local_size_z);
+  }
+
   ALWAYS_INLINE const Features& GetFeatures() const { return m_features; }
   ALWAYS_INLINE RenderAPI GetRenderAPI() const { return m_render_api; }
   ALWAYS_INLINE u32 GetRenderAPIVersion() const { return m_render_api_version; }
@@ -638,10 +658,6 @@ public:
   ALWAYS_INLINE GPUSwapChain* GetMainSwapChain() const { return m_main_swap_chain.get(); }
   ALWAYS_INLINE bool HasMainSwapChain() const { return static_cast<bool>(m_main_swap_chain); }
 
-  // ALWAYS_INLINE u32 GetMainSwapChainWidth() const { return m_main_swap_chain->GetWidth(); }
-  // ALWAYS_INLINE u32 GetMainSwapChainHeight() const { return m_main_swap_chain->GetHeight(); }
-  // ALWAYS_INLINE float GetWindowScale() const { return m_window_info.surface_scale; }
-  // ALWAYS_INLINE GPUTexture::Format GetWindowFormat() const { return m_window_info.surface_format; }
   ALWAYS_INLINE GPUSampler* GetLinearSampler() const { return m_linear_sampler.get(); }
   ALWAYS_INLINE GPUSampler* GetNearestSampler() const { return m_nearest_sampler.get(); }
 
@@ -712,6 +728,8 @@ public:
                                                   Error* error = nullptr, const char* entry_point = "main");
   virtual std::unique_ptr<GPUPipeline> CreatePipeline(const GPUPipeline::GraphicsConfig& config,
                                                       Error* error = nullptr) = 0;
+  virtual std::unique_ptr<GPUPipeline> CreatePipeline(const GPUPipeline::ComputeConfig& config,
+                                                      Error* error = nullptr) = 0;
 
   /// Debug messaging.
   virtual void PushDebugGroup(const char* name) = 0;
@@ -753,6 +771,8 @@ public:
   virtual void Draw(u32 vertex_count, u32 base_vertex) = 0;
   virtual void DrawIndexed(u32 index_count, u32 base_index, u32 base_vertex) = 0;
   virtual void DrawIndexedWithBarrier(u32 index_count, u32 base_index, u32 base_vertex, DrawBarrier type) = 0;
+  virtual void Dispatch(u32 threads_x, u32 threads_y, u32 threads_z, u32 group_size_x, u32 group_size_y,
+                        u32 group_size_z) = 0;
 
   /// Returns false if the window was completely occluded.
virtual PresentResult BeginPresent(GPUSwapChain* swap_chain, u32 clear_color = DEFAULT_CLEAR_COLOR) = 0; diff --git a/src/util/metal_device.h b/src/util/metal_device.h index 418ab174f..b1a502755 100644 --- a/src/util/metal_device.h +++ b/src/util/metal_device.h @@ -78,7 +78,16 @@ class MetalPipeline final : public GPUPipeline public: ~MetalPipeline() override; - ALWAYS_INLINE id GetPipelineState() const { return m_pipeline; } + ALWAYS_INLINE bool IsRenderPipeline() const { return (m_depth != nil); } + ALWAYS_INLINE bool IsComputePipeline() const { return (m_depth == nil); } + ALWAYS_INLINE id GetRenderPipelineState() const + { + return (id)m_pipeline; + } + ALWAYS_INLINE id GetComputePipelineState() const + { + return (id)m_pipeline; + } ALWAYS_INLINE id GetDepthState() const { return m_depth; } ALWAYS_INLINE MTLCullMode GetCullMode() const { return m_cull_mode; } ALWAYS_INLINE MTLPrimitiveType GetPrimitive() const { return m_primitive; } @@ -86,10 +95,9 @@ public: void SetDebugName(std::string_view name) override; private: - MetalPipeline(id pipeline, id depth, MTLCullMode cull_mode, - MTLPrimitiveType primitive); + MetalPipeline(id pipeline, id depth, MTLCullMode cull_mode, MTLPrimitiveType primitive); - id m_pipeline; + id m_pipeline; id m_depth; MTLCullMode m_cull_mode; MTLPrimitiveType m_primitive; @@ -251,6 +259,7 @@ public: std::string_view source, const char* entry_point, DynamicHeapArray* out_binary, Error* error) override; std::unique_ptr CreatePipeline(const GPUPipeline::GraphicsConfig& config, Error* error) override; + std::unique_ptr CreatePipeline(const GPUPipeline::ComputeConfig& config, Error* error) override; void PushDebugGroup(const char* name) override; void PopDebugGroup() override; @@ -265,7 +274,7 @@ public: void* MapUniformBuffer(u32 size) override; void UnmapUniformBuffer(u32 size) override; void SetRenderTargets(GPUTexture* const* rts, u32 num_rts, GPUTexture* ds, - GPUPipeline::RenderPassFlag feedback_loop) override; + GPUPipeline::RenderPassFlag flags) override; void SetPipeline(GPUPipeline* pipeline) override; void SetTextureSampler(u32 slot, GPUTexture* texture, GPUSampler* sampler) override; void SetTextureBuffer(u32 slot, GPUTextureBuffer* buffer) override; @@ -274,6 +283,8 @@ public: void Draw(u32 vertex_count, u32 base_vertex) override; void DrawIndexed(u32 index_count, u32 base_index, u32 base_vertex) override; void DrawIndexedWithBarrier(u32 index_count, u32 base_index, u32 base_vertex, DrawBarrier type) override; + void Dispatch(u32 threads_x, u32 threads_y, u32 threads_z, u32 group_size_x, u32 group_size_y, + u32 group_size_z) override; bool SetGPUTimingEnabled(bool enabled) override; float GetAndResetAccumulatedGPUTime() override; @@ -338,7 +349,6 @@ private: std::unique_ptr CreateShaderFromMSL(GPUShaderStage stage, std::string_view source, std::string_view entry_point, Error* error); id GetFunctionFromLibrary(id library, NSString* name); - id CreateComputePipeline(id function, NSString* name); ClearPipelineConfig GetCurrentClearPipelineConfig() const; id GetClearDepthPipeline(const ClearPipelineConfig& config); id GetDepthState(const GPUPipeline::DepthState& ds); @@ -349,9 +359,12 @@ private: void CleanupObjects(); ALWAYS_INLINE bool InRenderPass() const { return (m_render_encoder != nil); } + ALWAYS_INLINE bool InComputePass() const { return (m_compute_encoder != nil); } ALWAYS_INLINE bool IsInlineUploading() const { return (m_inline_upload_encoder != nil); } void BeginRenderPass(); void EndRenderPass(); + void BeginComputePass(); + void 
EndComputePass(); void EndInlineUploading(); void EndAnyEncoding(); @@ -359,6 +372,8 @@ private: void SetInitialEncoderState(); void SetViewportInRenderEncoder(); void SetScissorInRenderEncoder(); + void CommitRenderTargetClears(); + void BindRenderTargetsAsComputeImages(); void RenderBlankFrame(MetalSwapChain* swap_chain); @@ -384,7 +399,7 @@ private: id m_shaders = nil; id m_pipeline_archive = nil; - std::vector, id>> + std::vector, std::unique_ptr>> m_resolve_pipelines; std::vector>> m_clear_pipelines; @@ -394,9 +409,10 @@ private: id m_render_cmdbuf = nil; id m_render_encoder = nil; + id m_compute_encoder = nil; u8 m_num_current_render_targets = 0; - GPUPipeline::RenderPassFlag m_current_feedback_loop = GPUPipeline::NoRenderPassFlags; + GPUPipeline::RenderPassFlag m_current_render_pass_flags = GPUPipeline::NoRenderPassFlags; std::array m_current_render_targets = {}; MetalTexture* m_current_depth_target = nullptr; diff --git a/src/util/metal_device.mm b/src/util/metal_device.mm index 870ca1da2..ba3cd045b 100644 --- a/src/util/metal_device.mm +++ b/src/util/metal_device.mm @@ -77,7 +77,8 @@ static void LogNSError(NSError* error, std::string_view message) { Log::FastWrite(Log::Channel::GPUDevice, Log::Level::Error, message); Log::FastWrite(Log::Channel::GPUDevice, Log::Level::Error, " NSError Code: {}", static_cast(error.code)); - Log::FastWrite(Log::Channel::GPUDevice, Log::Level::Error, " NSError Description: {}", [error.description UTF8String]); + Log::FastWrite(Log::Channel::GPUDevice, Log::Level::Error, " NSError Description: {}", + [error.description UTF8String]); } static GPUTexture::Format GetTextureFormatForMTLFormat(MTLPixelFormat fmt) @@ -503,28 +504,6 @@ id MetalDevice::GetFunctionFromLibrary(id library, NSSt return function; } -id MetalDevice::CreateComputePipeline(id function, NSString* name) -{ - MTLComputePipelineDescriptor* desc = [MTLComputePipelineDescriptor new]; - if (name != nil) - [desc setLabel:name]; - [desc setComputeFunction:function]; - - NSError* err = nil; - id pipeline = [m_device newComputePipelineStateWithDescriptor:desc - options:MTLPipelineOptionNone - reflection:nil - error:&err]; - [desc release]; - if (pipeline == nil) - { - LogNSError(err, "Create compute pipeline failed:"); - return nil; - } - - return pipeline; -} - void MetalDevice::DestroyDevice() { WaitForPreviousCommandBuffers(); @@ -564,11 +543,6 @@ void MetalDevice::DestroyDevice() [it.second release]; } m_depth_states.clear(); - for (auto& it : m_resolve_pipelines) - { - if (it.second != nil) - [it.second release]; - } m_resolve_pipelines.clear(); for (auto& it : m_clear_pipelines) { @@ -755,7 +729,7 @@ std::unique_ptr MetalDevice::CreateShaderFromSource(GPUShaderStage st return CreateShaderFromMSL(stage, source, entry_point, error); } -MetalPipeline::MetalPipeline(id pipeline, id depth, MTLCullMode cull_mode, +MetalPipeline::MetalPipeline(id pipeline, id depth, MTLCullMode cull_mode, MTLPrimitiveType primitive) : m_pipeline(pipeline), m_depth(depth), m_cull_mode(cull_mode), m_primitive(primitive) { @@ -982,6 +956,29 @@ std::unique_ptr MetalDevice::CreatePipeline(const GPUPipeline::Grap } } +std::unique_ptr MetalDevice::CreatePipeline(const GPUPipeline::ComputeConfig& config, Error* error) +{ + @autoreleasepool + { + MTLComputePipelineDescriptor* desc = [[MTLComputePipelineDescriptor new] autorelease]; + [desc setComputeFunction:static_cast(config.compute_shader)->GetFunction()]; + + NSError* nserror = nil; + id pipeline = [m_device newComputePipelineStateWithDescriptor:desc + 
options:MTLPipelineOptionNone + reflection:nil + error:&nserror]; + if (pipeline == nil) + { + LogNSError(nserror, "Failed to create compute pipeline state"); + CocoaTools::NSErrorToErrorObject(error, "newComputePipelineStateWithDescriptor failed: ", nserror); + return {}; + } + + return std::unique_ptr(new MetalPipeline(pipeline, nil, MTLCullModeNone, MTLPrimitiveTypePoint)); + } +} + MetalTexture::MetalTexture(id texture, u16 width, u16 height, u8 layers, u8 levels, u8 samples, Type type, Format format) : GPUTexture(width, height, layers, levels, samples, type, format), m_texture(texture) @@ -1559,14 +1556,14 @@ void MetalDevice::ResolveTextureRegion(GPUTexture* dst, u32 dst_x, u32 dst_y, u3 const GPUTexture::Format src_format = dst->GetFormat(); const GPUTexture::Format dst_format = dst->GetFormat(); - id resolve_pipeline = nil; + GPUPipeline* resolve_pipeline; if (auto iter = std::find_if(m_resolve_pipelines.begin(), m_resolve_pipelines.end(), [src_format, dst_format](const auto& it) { return it.first.first == src_format && it.first.second == dst_format; }); iter != m_resolve_pipelines.end()) { - resolve_pipeline = iter->second; + resolve_pipeline = iter->second.get(); } else { @@ -1579,32 +1576,41 @@ void MetalDevice::ResolveTextureRegion(GPUTexture* dst, u32 dst_x, u32 dst_y, u3 if (function == nil) Panic("Failed to get resolve kernel"); - resolve_pipeline = [CreateComputePipeline(function, is_depth ? @"Depth Resolve" : @"Color Resolve") autorelease]; - if (resolve_pipeline != nil) - [resolve_pipeline retain]; - m_resolve_pipelines.emplace_back(std::make_pair(src_format, dst_format), resolve_pipeline); + MetalShader temp_shader(GPUShaderStage::Compute, m_shaders, function); + GPUPipeline::ComputeConfig config; + config.layout = GPUPipeline::Layout::ComputeSingleTextureAndPushConstants; + config.compute_shader = &temp_shader; + + std::unique_ptr pipeline = CreatePipeline(config, nullptr); + if (!pipeline) + Panic("Failed to create resolve pipeline"); + + GL_OBJECT_NAME(pipeline, is_depth ? 
"Depth Resolve" : "Color Resolve"); + resolve_pipeline = + m_resolve_pipelines.emplace_back(std::make_pair(src_format, dst_format), std::move(pipeline)).second.get(); } } - if (resolve_pipeline == nil) - Panic("Failed to get resolve pipeline"); if (InRenderPass()) EndRenderPass(); s_stats.num_copies++; - const u32 threadgroupHeight = resolve_pipeline.maxTotalThreadsPerThreadgroup / resolve_pipeline.threadExecutionWidth; - const MTLSize intrinsicThreadgroupSize = MTLSizeMake(resolve_pipeline.threadExecutionWidth, threadgroupHeight, 1); + const id mtl_pipeline = + static_cast(resolve_pipeline)->GetComputePipelineState(); + const u32 threadgroupHeight = mtl_pipeline.maxTotalThreadsPerThreadgroup / mtl_pipeline.threadExecutionWidth; + const MTLSize intrinsicThreadgroupSize = MTLSizeMake(mtl_pipeline.threadExecutionWidth, threadgroupHeight, 1); const MTLSize threadgroupsInGrid = MTLSizeMake((src->GetWidth() + intrinsicThreadgroupSize.width - 1) / intrinsicThreadgroupSize.width, (src->GetHeight() + intrinsicThreadgroupSize.height - 1) / intrinsicThreadgroupSize.height, 1); - id computeEncoder = [m_render_cmdbuf computeCommandEncoder]; - [computeEncoder setComputePipelineState:resolve_pipeline]; - [computeEncoder setTexture:static_cast(src)->GetMTLTexture() atIndex:0]; - [computeEncoder setTexture:static_cast(dst)->GetMTLTexture() atIndex:1]; - [computeEncoder dispatchThreadgroups:threadgroupsInGrid threadsPerThreadgroup:intrinsicThreadgroupSize]; - [computeEncoder endEncoding]; + // Set up manually to not disturb state. + BeginComputePass(); + [m_compute_encoder setComputePipelineState:mtl_pipeline]; + [m_compute_encoder setTexture:static_cast(src)->GetMTLTexture() atIndex:0]; + [m_compute_encoder setTexture:static_cast(dst)->GetMTLTexture() atIndex:1]; + [m_compute_encoder dispatchThreadgroups:threadgroupsInGrid threadsPerThreadgroup:intrinsicThreadgroupSize]; + EndComputePass(); } void MetalDevice::ClearRenderTarget(GPUTexture* t, u32 c) @@ -1645,7 +1651,7 @@ void MetalDevice::ClearDepth(GPUTexture* t, float d) [m_render_encoder setVertexBuffer:m_uniform_buffer.GetBuffer() offset:m_current_uniform_buffer_position atIndex:0]; if (m_current_pipeline) - [m_render_encoder setRenderPipelineState:m_current_pipeline->GetPipelineState()]; + [m_render_encoder setRenderPipelineState:m_current_pipeline->GetRenderPipelineState()]; if (m_current_cull_mode != MTLCullModeNone) [m_render_encoder setCullMode:m_current_cull_mode]; if (depth != m_current_depth_state) @@ -1674,6 +1680,8 @@ void MetalDevice::CommitClear(MetalTexture* tex) // TODO: We could combine it with the current render pass. 
if (InRenderPass()) EndRenderPass(); + else if (InComputePass()) + EndComputePass(); @autoreleasepool { @@ -1896,11 +1904,13 @@ void MetalDevice::UnmapUniformBuffer(u32 size) } void MetalDevice::SetRenderTargets(GPUTexture* const* rts, u32 num_rts, GPUTexture* ds, - GPUPipeline::RenderPassFlag feedback_loop) + GPUPipeline::RenderPassFlag flags) { bool changed = (m_num_current_render_targets != num_rts || m_current_depth_target != ds || - (!m_features.framebuffer_fetch && ((feedback_loop & GPUPipeline::ColorFeedbackLoop) != - (m_current_feedback_loop & GPUPipeline::ColorFeedbackLoop)))); + ((flags & GPUPipeline::BindRenderTargetsAsImages) != + (m_current_render_pass_flags & GPUPipeline::BindRenderTargetsAsImages)) || + (!m_features.framebuffer_fetch && ((flags & GPUPipeline::ColorFeedbackLoop) != + (m_current_render_pass_flags & GPUPipeline::ColorFeedbackLoop)))); bool needs_ds_clear = (ds && ds->IsClearedOrInvalidated()); bool needs_rt_clear = false; @@ -1915,12 +1925,19 @@ void MetalDevice::SetRenderTargets(GPUTexture* const* rts, u32 num_rts, GPUTextu for (u32 i = num_rts; i < m_num_current_render_targets; i++) m_current_render_targets[i] = nullptr; m_num_current_render_targets = static_cast(num_rts); - m_current_feedback_loop = feedback_loop; + m_current_render_pass_flags = flags; if (changed || needs_rt_clear || needs_ds_clear) { if (InRenderPass()) + { EndRenderPass(); + } + else if (InComputePass() && (flags & GPUPipeline::BindRenderTargetsAsImages) != GPUPipeline::NoRenderPassFlags) + { + CommitRenderTargetClears(); + BindRenderTargetsAsComputeImages(); + } } } @@ -1931,26 +1948,34 @@ void MetalDevice::SetPipeline(GPUPipeline* pipeline) return; m_current_pipeline = static_cast(pipeline); - if (InRenderPass()) + if (!m_current_pipeline->IsComputePipeline()) { - [m_render_encoder setRenderPipelineState:m_current_pipeline->GetPipelineState()]; + if (InRenderPass()) + { + [m_render_encoder setRenderPipelineState:m_current_pipeline->GetRenderPipelineState()]; - if (m_current_depth_state != m_current_pipeline->GetDepthState()) - { - m_current_depth_state = m_current_pipeline->GetDepthState(); - [m_render_encoder setDepthStencilState:m_current_depth_state]; + if (m_current_depth_state != m_current_pipeline->GetDepthState()) + { + m_current_depth_state = m_current_pipeline->GetDepthState(); + [m_render_encoder setDepthStencilState:m_current_depth_state]; + } + if (m_current_cull_mode != m_current_pipeline->GetCullMode()) + { + m_current_cull_mode = m_current_pipeline->GetCullMode(); + [m_render_encoder setCullMode:m_current_cull_mode]; + } } - if (m_current_cull_mode != m_current_pipeline->GetCullMode()) + else { + // Still need to set depth state before the draw begins. + m_current_depth_state = m_current_pipeline->GetDepthState(); m_current_cull_mode = m_current_pipeline->GetCullMode(); - [m_render_encoder setCullMode:m_current_cull_mode]; } } else { - // Still need to set depth state before the draw begins. - m_current_depth_state = m_current_pipeline->GetDepthState(); - m_current_cull_mode = m_current_pipeline->GetCullMode(); + if (InComputePass()) + [m_compute_encoder setComputePipelineState:m_current_pipeline->GetComputePipelineState()]; } } @@ -1979,6 +2004,8 @@ void MetalDevice::SetTextureSampler(u32 slot, GPUTexture* texture, GPUSampler* s m_current_textures[slot] = T; if (InRenderPass()) [m_render_encoder setFragmentTexture:T atIndex:slot]; + else if (InComputePass()) + [m_compute_encoder setTexture:T atIndex:slot]; } id S = sampler ? 
static_cast<MetalSampler*>(sampler)->GetSamplerState() : nil;
@@ -1987,6 +2014,8 @@ void MetalDevice::SetTextureSampler(u32 slot, GPUTexture* texture, GPUSampler* s
   if (m_current_samplers[slot] != S)
   {
     m_current_samplers[slot] = S;
     if (InRenderPass())
       [m_render_encoder setFragmentSamplerState:S atIndex:slot];
+    else if (InComputePass())
+      [m_compute_encoder setSamplerState:S atIndex:slot];
   }
 }
 
@@ -2011,6 +2040,8 @@ void MetalDevice::UnbindTexture(MetalTexture* tex)
       {
         m_current_textures[i] = nil;
         if (InRenderPass())
           [m_render_encoder setFragmentTexture:nil atIndex:i];
+        else if (InComputePass())
+          [m_compute_encoder setTexture:nil atIndex:i];
       }
   }
 }
 
@@ -2070,7 +2101,7 @@ void MetalDevice::SetScissor(const GSVector4i rc)
 void MetalDevice::BeginRenderPass()
 {
-  DebugAssert(m_render_encoder == nil);
+  DebugAssert(m_render_encoder == nil && !InComputePass());
 
   // Inline writes :(
   if (m_inline_upload_encoder != nil)
@@ -2180,12 +2211,57 @@ void MetalDevice::BeginRenderPass()
 void MetalDevice::EndRenderPass()
 {
-  DebugAssert(InRenderPass() && !IsInlineUploading());
+  DebugAssert(InRenderPass() && !IsInlineUploading() && !InComputePass());
   [m_render_encoder endEncoding];
   [m_render_encoder release];
   m_render_encoder = nil;
 }
 
+void MetalDevice::BeginComputePass()
+{
+  DebugAssert(!InRenderPass() && !IsInlineUploading() && !InComputePass());
+
+  if ((m_current_render_pass_flags & GPUPipeline::BindRenderTargetsAsImages) != GPUPipeline::NoRenderPassFlags)
+    CommitRenderTargetClears();
+
+  m_compute_encoder = [[m_render_cmdbuf computeCommandEncoder] retain];
+  [m_compute_encoder setTextures:m_current_textures.data() withRange:NSMakeRange(0, MAX_TEXTURE_SAMPLERS)];
+  [m_compute_encoder setSamplerStates:m_current_samplers.data() withRange:NSMakeRange(0, MAX_TEXTURE_SAMPLERS)];
+
+  if ((m_current_render_pass_flags & GPUPipeline::BindRenderTargetsAsImages) != GPUPipeline::NoRenderPassFlags)
+    BindRenderTargetsAsComputeImages();
+
+  if (m_current_pipeline && m_current_pipeline->IsComputePipeline())
+    [m_compute_encoder setComputePipelineState:m_current_pipeline->GetComputePipelineState()];
+}
+
+void MetalDevice::CommitRenderTargetClears()
+{
+  for (u32 i = 0; i < m_num_current_render_targets; i++)
+  {
+    MetalTexture* rt = m_current_render_targets[i];
+    if (rt->GetState() == GPUTexture::State::Invalidated)
+      rt->SetState(GPUTexture::State::Dirty);
+    else if (rt->GetState() == GPUTexture::State::Cleared)
+      CommitClear(rt);
+  }
+}
+
+void MetalDevice::BindRenderTargetsAsComputeImages()
+{
+  for (u32 i = 0; i < m_num_current_render_targets; i++)
+    [m_compute_encoder setTexture:m_current_render_targets[i]->GetMTLTexture() atIndex:MAX_TEXTURE_SAMPLERS + i];
+}
+
+void MetalDevice::EndComputePass()
+{
+  DebugAssert(InComputePass());
+
+  [m_compute_encoder endEncoding];
+  [m_compute_encoder release];
+  m_compute_encoder = nil;
+}
+
 void MetalDevice::EndInlineUploading()
 {
   DebugAssert(IsInlineUploading() && !InRenderPass());
@@ -2198,6 +2274,8 @@ void MetalDevice::EndAnyEncoding()
 {
   if (InRenderPass())
     EndRenderPass();
+  else if (InComputePass())
+    EndComputePass();
   else if (IsInlineUploading())
     EndInlineUploading();
 }
@@ -2213,14 +2291,14 @@ void MetalDevice::SetInitialEncoderState()
   [m_render_encoder setCullMode:m_current_cull_mode];
   if (m_current_depth_state != nil)
     [m_render_encoder setDepthStencilState:m_current_depth_state];
-  if (m_current_pipeline != nil)
-    [m_render_encoder setRenderPipelineState:m_current_pipeline->GetPipelineState()];
+  if (m_current_pipeline && m_current_pipeline->IsRenderPipeline())
+    [m_render_encoder 
@@ -2213,14 +2291,14 @@ void MetalDevice::SetInitialEncoderState()
   [m_render_encoder setCullMode:m_current_cull_mode];
   if (m_current_depth_state != nil)
     [m_render_encoder setDepthStencilState:m_current_depth_state];
-  if (m_current_pipeline != nil)
-    [m_render_encoder setRenderPipelineState:m_current_pipeline->GetPipelineState()];
+  if (m_current_pipeline && m_current_pipeline->IsRenderPipeline())
+    [m_render_encoder setRenderPipelineState:m_current_pipeline->GetRenderPipelineState()];
   [m_render_encoder setFragmentTextures:m_current_textures.data() withRange:NSMakeRange(0, MAX_TEXTURE_SAMPLERS)];
   [m_render_encoder setFragmentSamplerStates:m_current_samplers.data() withRange:NSMakeRange(0, MAX_TEXTURE_SAMPLERS)];
   if (m_current_ssbo)
     [m_render_encoder setFragmentBuffer:m_current_ssbo offset:0 atIndex:1];
-  if (!m_features.framebuffer_fetch && (m_current_feedback_loop & GPUPipeline::ColorFeedbackLoop))
+  if (!m_features.framebuffer_fetch && (m_current_render_pass_flags & GPUPipeline::ColorFeedbackLoop))
   {
     DebugAssert(m_current_render_targets[0]);
     [m_render_encoder setFragmentTexture:m_current_render_targets[0]->GetMTLTexture() atIndex:MAX_TEXTURE_SAMPLERS];
@@ -2249,7 +2327,12 @@ void MetalDevice::SetScissorInRenderEncoder()
 void MetalDevice::PreDrawCheck()
 {
   if (!InRenderPass())
+  {
+    if (InComputePass())
+      EndComputePass();
+
     BeginRenderPass();
+  }
 }
 
 void MetalDevice::Draw(u32 vertex_count, u32 base_vertex)
@@ -2392,6 +2475,25 @@ void MetalDevice::DrawIndexedWithBarrier(u32 index_count, u32 base_index, u32 ba
   }
 }
 
+void MetalDevice::Dispatch(u32 threads_x, u32 threads_y, u32 threads_z, u32 group_size_x, u32 group_size_y,
+                           u32 group_size_z)
+{
+  if (!InComputePass())
+  {
+    if (InRenderPass())
+      EndRenderPass();
+
+    BeginComputePass();
+  }
+
+  DebugAssert(m_current_pipeline && m_current_pipeline->IsComputePipeline());
+  id<MTLComputePipelineState> pipeline = m_current_pipeline->GetComputePipelineState();
+
+  // TODO: We could remap to the optimal group size..
+  [m_compute_encoder dispatchThreads:MTLSizeMake(threads_x, threads_y, threads_z)
+               threadsPerThreadgroup:MTLSizeMake(group_size_x, group_size_y, group_size_z)];
+}
+
 id<MTLBlitCommandEncoder> MetalDevice::GetBlitEncoder(bool is_inline)
 {
   @autoreleasepool
@@ -2450,7 +2552,7 @@ GPUDevice::PresentResult MetalDevice::BeginPresent(GPUSwapChain* swap_chain, u32
   s_stats.num_render_passes++;
   std::memset(m_current_render_targets.data(), 0, sizeof(m_current_render_targets));
   m_num_current_render_targets = 0;
-  m_current_feedback_loop = GPUPipeline::NoRenderPassFlags;
+  m_current_render_pass_flags = GPUPipeline::NoRenderPassFlags;
   m_current_depth_target = nullptr;
   m_current_pipeline = nullptr;
   m_current_depth_state = nil;
diff --git a/src/util/opengl_device.cpp b/src/util/opengl_device.cpp
index e929da63d..29a43df2e 100644
--- a/src/util/opengl_device.cpp
+++ b/src/util/opengl_device.cpp
@@ -207,6 +207,12 @@ void OpenGLDevice::InvalidateRenderTarget(GPUTexture* t)
   }
 }
 
+std::unique_ptr<GPUPipeline> OpenGLDevice::CreatePipeline(const GPUPipeline::ComputeConfig& config, Error* error)
+{
+  ERROR_LOG("Compute shaders are not yet supported.");
+  return {};
+}
+
 void OpenGLDevice::PushDebugGroup(const char* name)
 {
 #ifdef _DEBUG
@@ -488,6 +494,7 @@ bool OpenGLDevice::CheckFeatures(FeatureMask disabled_features)
 
   m_features.geometry_shaders =
     !(disabled_features & FEATURE_MASK_GEOMETRY_SHADERS) && (GLAD_GL_VERSION_3_2 || GLAD_GL_ES_VERSION_3_2);
+  m_features.compute_shaders = false;
 
   m_features.gpu_timing = !(m_gl_context->IsGLES() &&
                             (!GLAD_GL_EXT_disjoint_timer_query || !glGetQueryObjectivEXT || !glGetQueryObjectui64vEXT));
@@ -1078,6 +1085,12 @@ void OpenGLDevice::DrawIndexedWithBarrier(u32 index_count, u32 base_index, u32 b
   Panic("Barriers are not supported");
 }
 
+void OpenGLDevice::Dispatch(u32 threads_x, u32 threads_y, u32 threads_z, u32 group_size_x, u32 group_size_y,
+                            u32 group_size_z)
+{
+  Panic("Compute shaders are not supported");
+}
+
 void OpenGLDevice::MapVertexBuffer(u32 vertex_size, u32 vertex_count, void** map_ptr, u32* map_space,
                                    u32* map_base_vertex)
 {
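
Because the OpenGL backend reports m_features.compute_shaders = false and stubs out both entry points, frontends are expected to gate compute usage on the feature bit rather than on the backend type. A sketch, assuming the usual feature accessor on GPUDevice (`RunComputePath`/`RunFragmentPath` are illustrative placeholders):

  // GetFeatures() is assumed to expose the feature flags set above.
  if (g_gpu_device->GetFeatures().compute_shaders)
    RunComputePath();  // CreatePipeline(ComputeConfig) + Dispatch()
  else
    RunFragmentPath(); // fall back to a fullscreen draw
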
diff --git a/src/util/opengl_device.h b/src/util/opengl_device.h
index e499c5931..d16c3b6d9 100644
--- a/src/util/opengl_device.h
+++ b/src/util/opengl_device.h
@@ -77,6 +77,7 @@ public:
                                                  std::string_view source, const char* entry_point,
                                                  DynamicHeapArray<u8>* out_binary, Error* error) override;
   std::unique_ptr<GPUPipeline> CreatePipeline(const GPUPipeline::GraphicsConfig& config, Error* error) override;
+  std::unique_ptr<GPUPipeline> CreatePipeline(const GPUPipeline::ComputeConfig& config, Error* error) override;
 
   void PushDebugGroup(const char* name) override;
   void PopDebugGroup() override;
@@ -100,6 +101,8 @@ public:
   void Draw(u32 vertex_count, u32 base_vertex) override;
   void DrawIndexed(u32 index_count, u32 base_index, u32 base_vertex) override;
   void DrawIndexedWithBarrier(u32 index_count, u32 base_index, u32 base_vertex, DrawBarrier type) override;
+  void Dispatch(u32 threads_x, u32 threads_y, u32 threads_z, u32 group_size_x, u32 group_size_y,
+                u32 group_size_z) override;
 
   PresentResult BeginPresent(GPUSwapChain* swap_chain, u32 clear_color) override;
   void EndPresent(GPUSwapChain* swap_chain, bool explicit_present, u64 present_time) override;
diff --git a/src/util/vulkan_builders.cpp b/src/util/vulkan_builders.cpp
index d9a64a356..6c6251acb 100644
--- a/src/util/vulkan_builders.cpp
+++ b/src/util/vulkan_builders.cpp
@@ -627,14 +627,15 @@ void Vulkan::ComputePipelineBuilder::Clear()
   m_smap_constants = {};
 }
 
-VkPipeline Vulkan::ComputePipelineBuilder::Create(VkDevice device, VkPipelineCache pipeline_cache /*= VK_NULL_HANDLE*/,
-                                                  bool clear /*= true*/)
+VkPipeline Vulkan::ComputePipelineBuilder::Create(VkDevice device, VkPipelineCache pipeline_cache, bool clear,
+                                                  Error* error)
 {
   VkPipeline pipeline;
   VkResult res = vkCreateComputePipelines(device, pipeline_cache, 1, &m_ci, nullptr, &pipeline);
   if (res != VK_SUCCESS)
   {
     LOG_VULKAN_ERROR(res, "vkCreateComputePipelines() failed: ");
+    SetErrorObject(error, "vkCreateComputePipelines() failed: ", res);
     return VK_NULL_HANDLE;
   }
 
diff --git a/src/util/vulkan_builders.h b/src/util/vulkan_builders.h
index f65f2e1aa..760caecee 100644
--- a/src/util/vulkan_builders.h
+++ b/src/util/vulkan_builders.h
@@ -197,7 +197,7 @@ public:
 
   void Clear();
 
-  VkPipeline Create(VkDevice device, VkPipelineCache pipeline_cache = VK_NULL_HANDLE, bool clear = true);
+  VkPipeline Create(VkDevice device, VkPipelineCache pipeline_cache, bool clear, Error* error);
 
   void SetShader(VkShaderModule module, const char* entry_point);
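
Create() loses its default arguments and gains an Error* out-parameter, so compute pipeline creation failures can be propagated instead of only logged. A sketch of the resulting call pattern (`shader_module`, `pipeline_layout`, `device` and `pipeline_cache` are assumed-valid handles; Error::GetDescription() is assumed from the existing Error helper):

  Error error;
  Vulkan::ComputePipelineBuilder cpb;
  cpb.SetShader(shader_module, "main");
  cpb.SetPipelineLayout(pipeline_layout);
  const VkPipeline pipeline = cpb.Create(device, pipeline_cache, false, &error);
  if (pipeline == VK_NULL_HANDLE)
    ERROR_LOG("Compute pipeline creation failed: {}", error.GetDescription());
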
diff --git a/src/util/vulkan_device.cpp b/src/util/vulkan_device.cpp
index 924ffde3f..4a41a6818 100644
--- a/src/util/vulkan_device.cpp
+++ b/src/util/vulkan_device.cpp
@@ -2447,6 +2447,7 @@ void VulkanDevice::SetFeatures(FeatureMask disabled_features, const VkPhysicalDe
     WARNING_LOG("Emulating texture buffers with SSBOs.");
 
   m_features.geometry_shaders = !(disabled_features & FEATURE_MASK_GEOMETRY_SHADERS) && vk_features.geometryShader;
+  m_features.compute_shaders = !(disabled_features & FEATURE_MASK_COMPUTE_SHADERS);
 
   m_features.partial_msaa_resolve = true;
   m_features.memory_import = m_optional_extensions.vk_ext_external_memory_host;
@@ -2802,7 +2803,8 @@ bool VulkanDevice::CreatePipelineLayouts()
   }
 
   {
-    dslb.AddBinding(0, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, 1, VK_SHADER_STAGE_FRAGMENT_BIT);
+    dslb.AddBinding(0, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, 1,
+                    VK_SHADER_STAGE_FRAGMENT_BIT | VK_SHADER_STAGE_COMPUTE_BIT);
     if ((m_single_texture_ds_layout = dslb.Create(m_device)) == VK_NULL_HANDLE)
       return false;
     Vulkan::SetObjectName(m_device, m_single_texture_ds_layout, "Single Texture Descriptor Set Layout");
@@ -2822,7 +2824,8 @@ bool VulkanDevice::CreatePipelineLayouts()
     if (m_optional_extensions.vk_khr_push_descriptor)
       dslb.SetPushFlag();
     for (u32 i = 0; i < MAX_TEXTURE_SAMPLERS; i++)
-      dslb.AddBinding(i, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, 1, VK_SHADER_STAGE_FRAGMENT_BIT);
+      dslb.AddBinding(i, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, 1,
+                      VK_SHADER_STAGE_FRAGMENT_BIT | VK_SHADER_STAGE_COMPUTE_BIT);
     if ((m_multi_texture_ds_layout = dslb.Create(m_device)) == VK_NULL_HANDLE)
       return false;
     Vulkan::SetObjectName(m_device, m_multi_texture_ds_layout, "Multi Texture Descriptor Set Layout");
@@ -2837,14 +2840,13 @@ bool VulkanDevice::CreatePipelineLayouts()
     Vulkan::SetObjectName(m_device, m_feedback_loop_ds_layout, "Feedback Loop Descriptor Set Layout");
   }
 
-  if (m_features.raster_order_views)
+  for (u32 i = 0; i < MAX_IMAGE_RENDER_TARGETS; i++)
   {
-    for (u32 i = 0; i < MAX_IMAGE_RENDER_TARGETS; i++)
-      dslb.AddBinding(i, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, 1, VK_SHADER_STAGE_FRAGMENT_BIT);
-    if ((m_rov_ds_layout = dslb.Create(m_device)) == VK_NULL_HANDLE)
-      return false;
-    Vulkan::SetObjectName(m_device, m_feedback_loop_ds_layout, "ROV Descriptor Set Layout");
+    dslb.AddBinding(i, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, 1, VK_SHADER_STAGE_FRAGMENT_BIT | VK_SHADER_STAGE_COMPUTE_BIT);
   }
+  if ((m_image_ds_layout = dslb.Create(m_device)) == VK_NULL_HANDLE)
+    return false;
+  Vulkan::SetObjectName(m_device, m_image_ds_layout, "ROV Descriptor Set Layout");
 
   for (u32 type = 0; type < 3; type++)
   {
@@ -2860,7 +2862,7 @@ bool VulkanDevice::CreatePipelineLayouts()
       if (feedback_loop)
         plb.AddDescriptorSet(m_feedback_loop_ds_layout);
       else if (rov)
-        plb.AddDescriptorSet(m_rov_ds_layout);
+        plb.AddDescriptorSet(m_image_ds_layout);
       if ((pl = plb.Create(m_device)) == VK_NULL_HANDLE)
         return false;
       Vulkan::SetObjectName(m_device, pl, "Single Texture + UBO Pipeline Layout");
@@ -2873,7 +2875,7 @@ bool VulkanDevice::CreatePipelineLayouts()
       if (feedback_loop)
         plb.AddDescriptorSet(m_feedback_loop_ds_layout);
      else if (rov)
-        plb.AddDescriptorSet(m_rov_ds_layout);
+        plb.AddDescriptorSet(m_image_ds_layout);
       plb.AddPushConstants(UNIFORM_PUSH_CONSTANTS_STAGES, 0, UNIFORM_PUSH_CONSTANTS_SIZE);
       if ((pl = plb.Create(m_device)) == VK_NULL_HANDLE)
         return false;
@@ -2887,7 +2889,7 @@ bool VulkanDevice::CreatePipelineLayouts()
       if (feedback_loop)
         plb.AddDescriptorSet(m_feedback_loop_ds_layout);
       else if (rov)
-        plb.AddDescriptorSet(m_rov_ds_layout);
+        plb.AddDescriptorSet(m_image_ds_layout);
       plb.AddPushConstants(UNIFORM_PUSH_CONSTANTS_STAGES, 0, UNIFORM_PUSH_CONSTANTS_SIZE);
       if ((pl = plb.Create(m_device)) == VK_NULL_HANDLE)
         return false;
@@ -2901,7 +2903,7 @@ bool VulkanDevice::CreatePipelineLayouts()
       if (feedback_loop)
         plb.AddDescriptorSet(m_feedback_loop_ds_layout);
       else if (rov)
-        plb.AddDescriptorSet(m_rov_ds_layout);
+        plb.AddDescriptorSet(m_image_ds_layout);
       if ((pl = plb.Create(m_device)) == VK_NULL_HANDLE)
         return false;
       Vulkan::SetObjectName(m_device, pl, "Multi Texture + UBO Pipeline Layout");
@@ -2915,13 +2917,24 @@ bool VulkanDevice::CreatePipelineLayouts()
       if (feedback_loop)
         plb.AddDescriptorSet(m_feedback_loop_ds_layout);
       else if (rov)
-        plb.AddDescriptorSet(m_rov_ds_layout);
+        plb.AddDescriptorSet(m_image_ds_layout);
       if ((pl = plb.Create(m_device)) == VK_NULL_HANDLE)
         return false;
       Vulkan::SetObjectName(m_device, pl, "Multi Texture Pipeline Layout");
     }
   }
 
+  {
+    VkPipelineLayout& pl =
+      m_pipeline_layouts[0][static_cast<u8>(GPUPipeline::Layout::ComputeSingleTextureAndPushConstants)];
+    plb.AddDescriptorSet(m_single_texture_ds_layout);
+    plb.AddDescriptorSet(m_image_ds_layout);
+    plb.AddPushConstants(VK_SHADER_STAGE_COMPUTE_BIT, 0, UNIFORM_PUSH_CONSTANTS_SIZE);
+    if ((pl = plb.Create(m_device)) == VK_NULL_HANDLE)
+      return false;
+    Vulkan::SetObjectName(m_device, pl, "Compute Single Texture Pipeline Layout");
+  }
+
   return true;
 }
 
@@ -2942,7 +2955,7 @@ void VulkanDevice::DestroyPipelineLayouts()
       l = VK_NULL_HANDLE;
     }
   };
-  destroy_dsl(m_rov_ds_layout);
+  destroy_dsl(m_image_ds_layout);
   destroy_dsl(m_feedback_loop_ds_layout);
   destroy_dsl(m_multi_texture_ds_layout);
   destroy_dsl(m_single_texture_buffer_ds_layout);
@@ -3674,12 +3687,56 @@ void VulkanDevice::PreDrawCheck()
   }
 }
 
+void VulkanDevice::PreDispatchCheck()
+{
+  // All textures should be in shader read only optimal already, but just in case..
+  const u32 num_textures = GetActiveTexturesForLayout(m_current_pipeline_layout);
+  for (u32 i = 0; i < num_textures; i++)
+  {
+    if (m_current_textures[i])
+      m_current_textures[i]->TransitionToLayout(VulkanTexture::Layout::ShaderReadOnly);
+  }
+
+  // Binding as image, but we still need to clear it.
+  for (u32 i = 0; i < m_num_current_render_targets; i++)
+  {
+    VulkanTexture* rt = m_current_render_targets[i];
+    if (rt->GetState() == GPUTexture::State::Cleared)
+      rt->CommitClear(m_current_command_buffer);
+    rt->SetState(GPUTexture::State::Dirty);
+    rt->TransitionToLayout(VulkanTexture::Layout::ReadWriteImage);
+    rt->SetUseFenceCounter(GetCurrentFenceCounter());
+  }
+
+  // If this is a new command buffer, bind the pipeline and such.
+  if (m_dirty_flags & DIRTY_FLAG_INITIAL)
+    SetInitialPipelineState();
+
+  DebugAssert(!(m_dirty_flags & DIRTY_FLAG_INITIAL));
+  const u32 update_mask = (m_current_render_pass_flags ? ~0u : ~DIRTY_FLAG_INPUT_ATTACHMENT);
+  const u32 dirty = m_dirty_flags & update_mask;
+  m_dirty_flags = m_dirty_flags & ~update_mask;
+
+  if (dirty != 0)
+  {
+    if (!UpdateDescriptorSets(dirty))
+    {
+      SubmitCommandBuffer(false, "out of descriptor sets");
+      PreDispatchCheck();
+      return;
+    }
+  }
+}
+
 template<GPUPipeline::Layout layout>
 bool VulkanDevice::UpdateDescriptorSetsForLayout(u32 dirty)
 {
   [[maybe_unused]] bool new_dynamic_offsets = false;
 
-  VkPipelineLayout const vk_pipeline_layout = GetCurrentVkPipelineLayout();
+  constexpr VkPipelineBindPoint vk_bind_point =
+    ((layout < GPUPipeline::Layout::ComputeSingleTextureAndPushConstants) ? VK_PIPELINE_BIND_POINT_GRAPHICS :
+                                                                            VK_PIPELINE_BIND_POINT_COMPUTE);
+  const VkPipelineLayout vk_pipeline_layout = GetCurrentVkPipelineLayout();
   std::array<VkDescriptorSet, 3> ds;
   u32 first_ds = 0;
   u32 num_ds = 0;
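
The compile-time bind-point selection above relies on enum ordering: every graphics layout must sort before the first compute layout, or `layout < ComputeSingleTextureAndPushConstants` stops partitioning correctly. A sketch of the assumed ordering (the authoritative enum lives in gpu_device.h; any names beyond those used in this patch are illustrative):

  enum class Layout : u8
  {
    // graphics layouts first...
    SingleTextureAndUBO,
    SingleTextureAndPushConstants,
    SingleTextureBufferAndPushConstants,
    MultiTextureAndUBO,
    MultiTextureAndPushConstants,
    // ...compute layouts after, so one comparison selects the bind point.
    ComputeSingleTextureAndPushConstants,
    MaxCount
  };
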
@@ -3700,7 +3757,8 @@ bool VulkanDevice::UpdateDescriptorSetsForLayout(u32 dirty)
   }
 
   if constexpr (layout == GPUPipeline::Layout::SingleTextureAndUBO ||
-                layout == GPUPipeline::Layout::SingleTextureAndPushConstants)
+                layout == GPUPipeline::Layout::SingleTextureAndPushConstants ||
+                layout == GPUPipeline::Layout::ComputeSingleTextureAndPushConstants)
   {
     VulkanTexture* const tex = m_current_textures[0] ? m_current_textures[0] : m_null_texture.get();
     DebugAssert(tex && m_current_samplers[0] != VK_NULL_HANDLE);
@@ -3727,7 +3785,7 @@ bool VulkanDevice::UpdateDescriptorSetsForLayout(u32 dirty)
     }
 
     const u32 set = (layout == GPUPipeline::Layout::MultiTextureAndUBO) ? 1 : 0;
-    dsub.PushUpdate(GetCurrentCommandBuffer(), VK_PIPELINE_BIND_POINT_GRAPHICS, vk_pipeline_layout, set);
+    dsub.PushUpdate(GetCurrentCommandBuffer(), vk_bind_point, vk_pipeline_layout, set);
     if (num_ds == 0)
       return true;
   }
@@ -3757,7 +3815,7 @@ bool VulkanDevice::UpdateDescriptorSetsForLayout(u32 dirty)
   {
     if (m_current_render_pass_flags & GPUPipeline::BindRenderTargetsAsImages)
     {
-      VkDescriptorSet ids = AllocateDescriptorSet(m_rov_ds_layout);
+      VkDescriptorSet ids = AllocateDescriptorSet(m_image_ds_layout);
       if (ids == VK_NULL_HANDLE)
         return false;
 
@@ -3792,8 +3850,8 @@ bool VulkanDevice::UpdateDescriptorSetsForLayout(u32 dirty)
   }
 
   DebugAssert(num_ds > 0);
-  vkCmdBindDescriptorSets(GetCurrentCommandBuffer(), VK_PIPELINE_BIND_POINT_GRAPHICS, vk_pipeline_layout, first_ds,
-                          num_ds, ds.data(), static_cast<u32>(new_dynamic_offsets),
+  vkCmdBindDescriptorSets(GetCurrentCommandBuffer(), vk_bind_point, vk_pipeline_layout, first_ds, num_ds, ds.data(),
+                          static_cast<u32>(new_dynamic_offsets),
                           new_dynamic_offsets ? &m_uniform_buffer_position : nullptr);
 
   return true;
@@ -3818,6 +3876,9 @@ bool VulkanDevice::UpdateDescriptorSets(u32 dirty)
     case GPUPipeline::Layout::MultiTextureAndPushConstants:
      return UpdateDescriptorSetsForLayout<GPUPipeline::Layout::MultiTextureAndPushConstants>(dirty);
 
+    case GPUPipeline::Layout::ComputeSingleTextureAndPushConstants:
+      return UpdateDescriptorSetsForLayout<GPUPipeline::Layout::ComputeSingleTextureAndPushConstants>(dirty);
+
     default:
       UnreachableCode();
   }
@@ -3911,3 +3972,15 @@ void VulkanDevice::DrawIndexedWithBarrier(u32 index_count, u32 base_index, u32 b
       DefaultCaseIsUnreachable();
   }
 }
+
+void VulkanDevice::Dispatch(u32 threads_x, u32 threads_y, u32 threads_z, u32 group_size_x, u32 group_size_y,
+                            u32 group_size_z)
+{
+  PreDispatchCheck();
+  s_stats.num_draws++;
+
+  const u32 groups_x = threads_x / group_size_x;
+  const u32 groups_y = threads_y / group_size_y;
+  const u32 groups_z = threads_z / group_size_z;
+  vkCmdDispatch(GetCurrentCommandBuffer(), groups_x, groups_y, groups_z);
+}
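
Unlike Metal's dispatchThreads, which handles ragged edges natively, this path converts thread counts to group counts with plain integer division, so any remainder is silently dropped; callers should pass thread counts that are multiples of the group size. A small rounding helper callers could use (illustrative, not part of this patch):

  // Round the thread count up to the next multiple of the group size so that
  // threads / group_size yields ceil(threads / group_size) groups.
  static inline u32 AlignThreads(u32 threads, u32 group_size)
  {
    return ((threads + group_size - 1) / group_size) * group_size;
  }
  // e.g. device->Dispatch(AlignThreads(w, 8), AlignThreads(h, 8), 1, 8, 8, 1);
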
diff --git a/src/util/vulkan_device.h b/src/util/vulkan_device.h
index f2e870f93..43a982325 100644
--- a/src/util/vulkan_device.h
+++ b/src/util/vulkan_device.h
@@ -113,6 +113,7 @@ public:
                                                  std::string_view source, const char* entry_point,
                                                  DynamicHeapArray<u8>* out_binary, Error* error) override;
   std::unique_ptr<GPUPipeline> CreatePipeline(const GPUPipeline::GraphicsConfig& config, Error* error) override;
+  std::unique_ptr<GPUPipeline> CreatePipeline(const GPUPipeline::ComputeConfig& config, Error* error) override;
 
   void PushDebugGroup(const char* name) override;
   void PopDebugGroup() override;
@@ -136,6 +137,8 @@ public:
   void Draw(u32 vertex_count, u32 base_vertex) override;
   void DrawIndexed(u32 index_count, u32 base_index, u32 base_vertex) override;
   void DrawIndexedWithBarrier(u32 index_count, u32 base_index, u32 base_vertex, DrawBarrier type) override;
+  void Dispatch(u32 threads_x, u32 threads_y, u32 threads_z, u32 group_size_x, u32 group_size_y,
+                u32 group_size_z) override;
 
   bool SetGPUTimingEnabled(bool enabled) override;
   float GetAndResetAccumulatedGPUTime() override;
@@ -373,6 +376,7 @@ private:
   VkPipelineLayout GetCurrentVkPipelineLayout() const;
   void SetInitialPipelineState();
   void PreDrawCheck();
+  void PreDispatchCheck();
 
   template<GPUPipeline::Layout layout>
   bool UpdateDescriptorSetsForLayout(u32 dirty);
@@ -435,7 +439,7 @@ private:
   VkDescriptorSetLayout m_single_texture_buffer_ds_layout = VK_NULL_HANDLE;
   VkDescriptorSetLayout m_multi_texture_ds_layout = VK_NULL_HANDLE;
   VkDescriptorSetLayout m_feedback_loop_ds_layout = VK_NULL_HANDLE;
-  VkDescriptorSetLayout m_rov_ds_layout = VK_NULL_HANDLE;
+  VkDescriptorSetLayout m_image_ds_layout = VK_NULL_HANDLE;
 
   DimensionalArray<VkPipelineLayout, static_cast<size_t>(GPUPipeline::Layout::MaxCount), static_cast<size_t>(PipelineLayoutType::MaxCount)>
     m_pipeline_layouts = {};
diff --git a/src/util/vulkan_pipeline.cpp b/src/util/vulkan_pipeline.cpp
index 52db0d766..a6d801c77 100644
--- a/src/util/vulkan_pipeline.cpp
+++ b/src/util/vulkan_pipeline.cpp
@@ -275,3 +275,16 @@ std::unique_ptr<GPUPipeline> VulkanDevice::CreatePipeline(const GPUPipeline::Gra
   return std::unique_ptr<GPUPipeline>(
     new VulkanPipeline(pipeline, config.layout, static_cast<u8>(vertices_per_primitive), config.render_pass_flags));
 }
+
+std::unique_ptr<GPUPipeline> VulkanDevice::CreatePipeline(const GPUPipeline::ComputeConfig& config, Error* error)
+{
+  Vulkan::ComputePipelineBuilder cpb;
+  cpb.SetShader(static_cast<VulkanShader*>(config.compute_shader)->GetModule(), "main");
+  cpb.SetPipelineLayout(m_pipeline_layouts[0][static_cast<u8>(config.layout)]);
+
+  const VkPipeline pipeline = cpb.Create(m_device, m_pipeline_cache, false, error);
+  if (!pipeline)
+    return {};
+
+  return std::unique_ptr<GPUPipeline>(new VulkanPipeline(pipeline, config.layout, 0, GPUPipeline::NoRenderPassFlags));
+}
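
For reference, creating a compute pipeline through the new overload mirrors the graphics path; a sketch, assuming `compute_shader` is an already-compiled compute GPUShader and `device` is the active GPUDevice:

  Error error;
  GPUPipeline::ComputeConfig config;
  config.layout = GPUPipeline::Layout::ComputeSingleTextureAndPushConstants;
  config.compute_shader = compute_shader.get(); // std::unique_ptr<GPUShader> assumed
  std::unique_ptr<GPUPipeline> pipeline = device->CreatePipeline(config, &error);
  if (!pipeline)
    ERROR_LOG("Failed to create compute pipeline: {}", error.GetDescription());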