From 804da6f7668bc10161eabec449cbe9883606f50c Mon Sep 17 00:00:00 2001 From: psucien <168137814+psucien@users.noreply.github.com> Date: Fri, 30 Aug 2024 22:59:56 +0200 Subject: [PATCH] video_core: added support for indirect draws (#678) * video_core: added support for indirect draws * barriers simplified --- src/core/libraries/gnmdriver/gnmdriver.cpp | 12 ++-- src/core/libraries/gnmdriver/gnmdriver.h | 4 +- src/video_core/amdgpu/liverpool.cpp | 31 +++++++++ src/video_core/amdgpu/pm4_cmds.h | 61 +++++++++++++---- .../renderer_vulkan/vk_rasterizer.cpp | 65 +++++++++++++++---- .../renderer_vulkan/vk_rasterizer.h | 2 + 6 files changed, 140 insertions(+), 35 deletions(-) diff --git a/src/core/libraries/gnmdriver/gnmdriver.cpp b/src/core/libraries/gnmdriver/gnmdriver.cpp index 6b8b070b8..34d056156 100644 --- a/src/core/libraries/gnmdriver/gnmdriver.cpp +++ b/src/core/libraries/gnmdriver/gnmdriver.cpp @@ -650,12 +650,12 @@ s32 PS4_SYSV_ABI sceGnmDrawIndexAuto(u32* cmdbuf, u32 size, u32 index_count, u32 } s32 PS4_SYSV_ABI sceGnmDrawIndexIndirect(u32* cmdbuf, u32 size, u32 data_offset, u32 shader_stage, - u32 vertex_sgpr_offset, u32 instance_vgpr_offset, + u32 vertex_sgpr_offset, u32 instance_sgpr_offset, u32 flags) { LOG_TRACE(Lib_GnmDriver, "called"); if (cmdbuf && (size == 9) && (shader_stage < ShaderStages::Max) && - (vertex_sgpr_offset < 0x10u) && (instance_vgpr_offset < 0x10u)) { + (vertex_sgpr_offset < 0x10u) && (instance_sgpr_offset < 0x10u)) { const auto predicate = flags & 1 ? PM4Predicate::PredEnable : PM4Predicate::PredDisable; cmdbuf = WriteHeader( @@ -665,7 +665,7 @@ s32 PS4_SYSV_ABI sceGnmDrawIndexIndirect(u32* cmdbuf, u32 size, u32 data_offset, cmdbuf[0] = data_offset; cmdbuf[1] = vertex_sgpr_offset == 0 ? 0 : (vertex_sgpr_offset & 0xffffu) + sgpr_offset; - cmdbuf[2] = instance_vgpr_offset == 0 ? 0 : (instance_vgpr_offset & 0xffffu) + sgpr_offset; + cmdbuf[2] = instance_sgpr_offset == 0 ? 0 : (instance_sgpr_offset & 0xffffu) + sgpr_offset; cmdbuf[3] = 0; cmdbuf += 4; @@ -707,11 +707,11 @@ s32 PS4_SYSV_ABI sceGnmDrawIndexOffset(u32* cmdbuf, u32 size, u32 index_offset, } s32 PS4_SYSV_ABI sceGnmDrawIndirect(u32* cmdbuf, u32 size, u32 data_offset, u32 shader_stage, - u32 vertex_sgpr_offset, u32 instance_vgpr_offset, u32 flags) { + u32 vertex_sgpr_offset, u32 instance_sgpr_offset, u32 flags) { LOG_TRACE(Lib_GnmDriver, "called"); if (cmdbuf && (size == 9) && (shader_stage < ShaderStages::Max) && - (vertex_sgpr_offset < 0x10u) && (instance_vgpr_offset < 0x10u)) { + (vertex_sgpr_offset < 0x10u) && (instance_sgpr_offset < 0x10u)) { const auto predicate = flags & 1 ? PM4Predicate::PredEnable : PM4Predicate::PredDisable; cmdbuf = WriteHeader(cmdbuf, 4, PM4ShaderType::ShaderGraphics, @@ -721,7 +721,7 @@ s32 PS4_SYSV_ABI sceGnmDrawIndirect(u32* cmdbuf, u32 size, u32 data_offset, u32 cmdbuf[0] = data_offset; cmdbuf[1] = vertex_sgpr_offset == 0 ? 0 : (vertex_sgpr_offset & 0xffffu) + sgpr_offset; - cmdbuf[2] = instance_vgpr_offset == 0 ? 0 : (instance_vgpr_offset & 0xffffu) + sgpr_offset; + cmdbuf[2] = instance_sgpr_offset == 0 ? 0 : (instance_sgpr_offset & 0xffffu) + sgpr_offset; cmdbuf[3] = 2; // auto index cmdbuf += 4; diff --git a/src/core/libraries/gnmdriver/gnmdriver.h b/src/core/libraries/gnmdriver/gnmdriver.h index 84872297e..40a6ca5b6 100644 --- a/src/core/libraries/gnmdriver/gnmdriver.h +++ b/src/core/libraries/gnmdriver/gnmdriver.h @@ -45,7 +45,7 @@ s32 PS4_SYSV_ABI sceGnmDrawIndex(u32* cmdbuf, u32 size, u32 index_count, uintptr u32 flags, u32 type); s32 PS4_SYSV_ABI sceGnmDrawIndexAuto(u32* cmdbuf, u32 size, u32 index_count, u32 flags); s32 PS4_SYSV_ABI sceGnmDrawIndexIndirect(u32* cmdbuf, u32 size, u32 data_offset, u32 shader_stage, - u32 vertex_sgpr_offset, u32 instance_vgpr_offset, + u32 vertex_sgpr_offset, u32 instance_sgpr_offset, u32 flags); int PS4_SYSV_ABI sceGnmDrawIndexIndirectCountMulti(); int PS4_SYSV_ABI sceGnmDrawIndexIndirectMulti(); @@ -53,7 +53,7 @@ int PS4_SYSV_ABI sceGnmDrawIndexMultiInstanced(); s32 PS4_SYSV_ABI sceGnmDrawIndexOffset(u32* cmdbuf, u32 size, u32 index_offset, u32 index_count, u32 flags); s32 PS4_SYSV_ABI sceGnmDrawIndirect(u32* cmdbuf, u32 size, u32 data_offset, u32 shader_stage, - u32 vertex_sgpr_offset, u32 instance_vgpr_offset, u32 flags); + u32 vertex_sgpr_offset, u32 instance_sgpr_offset, u32 flags); int PS4_SYSV_ABI sceGnmDrawIndirectCountMulti(); int PS4_SYSV_ABI sceGnmDrawIndirectMulti(); u32 PS4_SYSV_ABI sceGnmDrawInitDefaultHardwareState(u32* cmdbuf, u32 size); diff --git a/src/video_core/amdgpu/liverpool.cpp b/src/video_core/amdgpu/liverpool.cpp index e61f8cec0..2a595516d 100644 --- a/src/video_core/amdgpu/liverpool.cpp +++ b/src/video_core/amdgpu/liverpool.cpp @@ -368,6 +368,36 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::span(header); + const auto offset = draw_indirect->data_offset; + const auto ib_address = mapped_queues[GfxQueueId].indirect_args_addr; + const auto size = sizeof(PM4CmdDrawIndirect::DrawInstancedArgs); + if (rasterizer) { + const auto cmd_address = reinterpret_cast(header); + rasterizer->ScopeMarkerBegin(fmt::format("dcb:{}:DrawIndirect", cmd_address)); + rasterizer->Breadcrumb(u64(cmd_address)); + rasterizer->DrawIndirect(false, ib_address, offset, size); + rasterizer->ScopeMarkerEnd(); + } + break; + } + case PM4ItOpcode::DrawIndexIndirect: { + const auto* draw_index_indirect = + reinterpret_cast(header); + const auto offset = draw_index_indirect->data_offset; + const auto ib_address = mapped_queues[GfxQueueId].indirect_args_addr; + const auto size = sizeof(PM4CmdDrawIndexIndirect::DrawIndexInstancedArgs); + if (rasterizer) { + const auto cmd_address = reinterpret_cast(header); + rasterizer->ScopeMarkerBegin( + fmt::format("dcb:{}:DrawIndexIndirect", cmd_address)); + rasterizer->Breadcrumb(u64(cmd_address)); + rasterizer->DrawIndirect(true, ib_address, offset, size); + rasterizer->ScopeMarkerEnd(); + } + break; + } case PM4ItOpcode::DispatchDirect: { const auto* dispatch_direct = reinterpret_cast(header); regs.cs_program.dim_x = dispatch_direct->dim_x; @@ -488,6 +518,7 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::spanCpSync(); break; } default: diff --git a/src/video_core/amdgpu/pm4_cmds.h b/src/video_core/amdgpu/pm4_cmds.h index 50e4c93a1..58ade221b 100644 --- a/src/video_core/amdgpu/pm4_cmds.h +++ b/src/video_core/amdgpu/pm4_cmds.h @@ -253,20 +253,6 @@ struct PM4CmdDrawIndexAuto { u32 draw_initiator; }; -struct PM4CmdDrawIndirect { - PM4Type3Header header; ///< header - u32 data_offset; ///< DWORD aligned offset - union { - u32 dw2; - BitField<0, 16, u32> base_vtx_loc; ///< base vertex location - }; - union { - u32 dw3; - BitField<0, 16, u32> start_inst_loc; ///< start instance location - }; - u32 draw_initiator; ///< Draw Initiator Register -}; - enum class DataSelect : u32 { None = 0, Data32Low = 1, @@ -740,4 +726,51 @@ struct PM4CmdDispatchIndirect { u32 dispatch_initiator; ///< Dispatch Initiator Register }; +struct PM4CmdDrawIndirect { + struct DrawInstancedArgs { + u32 vertex_count_per_instance; + u32 instance_count; + u32 start_vertex_location; + u32 start_instance_location; + }; + + PM4Type3Header header; ///< header + u32 data_offset; ///< Byte aligned offset where the required data structure starts + union { + u32 dw2; + BitField<0, 16, u32> base_vtx_loc; ///< Offset where the CP will write the + ///< BaseVertexLocation it fetched from memory + }; + union { + u32 dw3; + BitField<0, 16, u32> start_inst_loc; ///< Offset where the CP will write the + ///< StartInstanceLocation it fetched from memory + }; + u32 draw_initiator; ///< Draw Initiator Register +}; + +struct PM4CmdDrawIndexIndirect { + struct DrawIndexInstancedArgs { + u32 index_count_per_instance; + u32 instance_count; + u32 start_index_location; + u32 base_vertex_location; + u32 start_instance_location; + }; + + PM4Type3Header header; ///< header + u32 data_offset; ///< Byte aligned offset where the required data structure starts + union { + u32 dw2; + BitField<0, 16, u32> base_vtx_loc; ///< Offset where the CP will write the + ///< BaseVertexLocation it fetched from memory + }; + union { // NOTE: this one is undocumented in AMD spec, but Gnm driver writes this field + u32 dw3; + BitField<0, 16, u32> start_inst_loc; ///< Offset where the CP will write the + ///< StartInstanceLocation it fetched from memory + }; + u32 draw_initiator; ///< Draw Initiator Register +}; + } // namespace AmdGpu diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 9231c5104..cadce01eb 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -29,6 +29,19 @@ Rasterizer::Rasterizer(const Instance& instance_, Scheduler& scheduler_, Rasterizer::~Rasterizer() = default; +void Rasterizer::CpSync() { + scheduler.EndRendering(); + auto cmdbuf = scheduler.CommandBuffer(); + + const vk::MemoryBarrier ib_barrier{ + .srcAccessMask = vk::AccessFlagBits::eShaderWrite, + .dstAccessMask = vk::AccessFlagBits::eIndirectCommandRead, + }; + cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eComputeShader, + vk::PipelineStageFlagBits::eDrawIndirect, + vk::DependencyFlagBits::eByRegion, ib_barrier, {}, {}); +} + void Rasterizer::Draw(bool is_indexed, u32 index_offset) { RENDERER_TRACE; @@ -66,6 +79,45 @@ void Rasterizer::Draw(bool is_indexed, u32 index_offset) { } } +void Rasterizer::DrawIndirect(bool is_indexed, VAddr address, u32 offset, u32 size) { + RENDERER_TRACE; + + const auto cmdbuf = scheduler.CommandBuffer(); + const auto& regs = liverpool->regs; + const GraphicsPipeline* pipeline = pipeline_cache.GetGraphicsPipeline(); + if (!pipeline) { + return; + } + + ASSERT_MSG(regs.primitive_type != AmdGpu::Liverpool::PrimitiveType::RectList, + "Unsupported primitive type for indirect draw"); + + try { + pipeline->BindResources(regs, buffer_cache, texture_cache); + } catch (...) { + UNREACHABLE(); + } + + const auto& vs_info = pipeline->GetStage(Shader::Stage::Vertex); + buffer_cache.BindVertexBuffers(vs_info); + const u32 num_indices = buffer_cache.BindIndexBuffer(is_indexed, 0); + + BeginRendering(); + UpdateDynamicState(*pipeline); + + const auto [buffer, base] = buffer_cache.ObtainBuffer(address, size, true); + const auto total_offset = base + offset; + + // We can safely ignore both SGPR UD indices and results of fetch shader parsing, as vertex and + // instance offsets will be automatically applied by Vulkan from indirect args buffer. + + if (is_indexed) { + cmdbuf.drawIndexedIndirect(buffer->Handle(), total_offset, 1, 0); + } else { + cmdbuf.drawIndirect(buffer->Handle(), total_offset, 1, 0); + } +} + void Rasterizer::DispatchDirect() { RENDERER_TRACE; @@ -113,19 +165,6 @@ void Rasterizer::DispatchIndirect(VAddr address, u32 offset, u32 size) { cmdbuf.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->Handle()); const auto [buffer, base] = buffer_cache.ObtainBuffer(address, size, true); const auto total_offset = base + offset; - - // Emulate PFP-to-ME sync packet - const vk::BufferMemoryBarrier ib_barrier{ - .srcAccessMask = vk::AccessFlagBits::eShaderWrite, - .dstAccessMask = vk::AccessFlagBits::eIndirectCommandRead, - .buffer = buffer->Handle(), - .offset = total_offset, - .size = size, - }; - cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eComputeShader, - vk::PipelineStageFlagBits::eDrawIndirect, - vk::DependencyFlagBits::eByRegion, {}, ib_barrier, {}); - cmdbuf.dispatchIndirect(buffer->Handle(), total_offset); } diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index 34f6ae726..c38fe6ee9 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -32,6 +32,7 @@ public: } void Draw(bool is_indexed, u32 index_offset = 0); + void DrawIndirect(bool is_indexed, VAddr address, u32 offset, u32 size); void DispatchDirect(); void DispatchIndirect(VAddr address, u32 offset, u32 size); @@ -45,6 +46,7 @@ public: void MapMemory(VAddr addr, u64 size); void UnmapMemory(VAddr addr, u64 size); + void CpSync(); u64 Flush(); private: