From 9df1a8d15b97089eac11041fcffb0fa74a26c4c5 Mon Sep 17 00:00:00 2001 From: psucien Date: Thu, 16 May 2024 23:56:29 +0200 Subject: [PATCH 1/3] amdgpu: added support for several single GFX submits per frame --- src/core/libraries/gnmdriver/gnmdriver.cpp | 2 +- src/video_core/amdgpu/liverpool.cpp | 66 ++++++++++++++++------ src/video_core/amdgpu/liverpool.h | 26 ++++++--- 3 files changed, 69 insertions(+), 25 deletions(-) diff --git a/src/core/libraries/gnmdriver/gnmdriver.cpp b/src/core/libraries/gnmdriver/gnmdriver.cpp index b5dca74b..659c0a5f 100644 --- a/src/core/libraries/gnmdriver/gnmdriver.cpp +++ b/src/core/libraries/gnmdriver/gnmdriver.cpp @@ -1411,7 +1411,7 @@ s32 PS4_SYSV_ABI sceGnmSubmitCommandBuffers(u32 count, void* dcb_gpu_addrs[], } } - liverpool->Submit(reinterpret_cast(dcb_gpu_addrs[0]), dcb_sizes_in_bytes[0]); + liverpool->SubmitGfx(reinterpret_cast(dcb_gpu_addrs[0]), dcb_sizes_in_bytes[0]); return ORBIS_OK; } diff --git a/src/video_core/amdgpu/liverpool.cpp b/src/video_core/amdgpu/liverpool.cpp index 62e12fa2..03e93eb9 100644 --- a/src/video_core/amdgpu/liverpool.cpp +++ b/src/video_core/amdgpu/liverpool.cpp @@ -9,16 +9,50 @@ namespace AmdGpu { -Liverpool::Liverpool() = default; +Liverpool::Liverpool() { + process_thread = std::jthread{std::bind_front(&Liverpool::Process, this)}; +} -void Liverpool::ProcessCmdList(u32* cmdbuf, u32 size_in_bytes) { +Liverpool::~Liverpool() { + process_thread.request_stop(); + cv_submit.notify_one(); +} + +void Liverpool::Process(std::stop_token stoken) { + while (!stoken.stop_requested()) { + std::span dcb{}; + { + std::unique_lock lock{m_ring_access}; + cv_submit.wait(lock, stoken, [&]() { return !gfx_ring.empty(); }); + + if (stoken.stop_requested()) { + break; + } + + dcb = gfx_ring.front(); + gfx_ring.pop(); + } + + ASSERT_MSG(dcb.size() != 0, "Empty command list received"); + ProcessCmdList(dcb.data(), dcb.size()); + + cv_complete.notify_all(); + } +} + +void Liverpool::Wait() { + std::unique_lock 
lock{m_ring_access}; + cv_complete.wait(lock, [this]() { return gfx_ring.empty(); }); +} + +void Liverpool::ProcessCmdList(const u32* cmdbuf, u32 size_in_bytes) { Common::SetCurrentThreadName("CommandProcessor_Gfx"); - auto* header = reinterpret_cast(cmdbuf); + auto* header = reinterpret_cast(cmdbuf); u32 processed_cmd_size = 0; while (processed_cmd_size < size_in_bytes) { - PM4Header* next_header{}; + const PM4Header* next_header{}; const u32 type = header->type; switch (type) { case 3: { @@ -26,7 +60,7 @@ void Liverpool::ProcessCmdList(u32* cmdbuf, u32 size_in_bytes) { const u32 count = header->type3.NumWords(); switch (opcode) { case PM4ItOpcode::Nop: { - const auto* nop = reinterpret_cast(header); + const auto* nop = reinterpret_cast(header); if (nop->header.count.Value() == 0) { break; } @@ -44,30 +78,30 @@ void Liverpool::ProcessCmdList(u32* cmdbuf, u32 size_in_bytes) { break; } case PM4ItOpcode::SetContextReg: { - const auto* set_data = reinterpret_cast(header); + const auto* set_data = reinterpret_cast(header); std::memcpy(&regs.reg_array[ContextRegWordOffset + set_data->reg_offset], header + 2, (count - 1) * sizeof(u32)); break; } case PM4ItOpcode::SetShReg: { - const auto* set_data = reinterpret_cast(header); + const auto* set_data = reinterpret_cast(header); std::memcpy(&regs.reg_array[ShRegWordOffset + set_data->reg_offset], header + 2, (count - 1) * sizeof(u32)); break; } case PM4ItOpcode::SetUconfigReg: { - const auto* set_data = reinterpret_cast(header); + const auto* set_data = reinterpret_cast(header); std::memcpy(&regs.reg_array[UconfigRegWordOffset + set_data->reg_offset], header + 2, (count - 1) * sizeof(u32)); break; } case PM4ItOpcode::IndexType: { - const auto* index_type = reinterpret_cast(header); + const auto* index_type = reinterpret_cast(header); regs.index_buffer_type.raw = index_type->raw; break; } case PM4ItOpcode::DrawIndex2: { - const auto* draw_index = reinterpret_cast(header); + const auto* draw_index = reinterpret_cast(header); 
regs.max_index_size = draw_index->max_size; regs.index_base_address.base_addr_lo = draw_index->index_base_lo; regs.index_base_address.base_addr_hi.Assign(draw_index->index_base_hi); @@ -77,7 +111,7 @@ void Liverpool::ProcessCmdList(u32* cmdbuf, u32 size_in_bytes) { break; } case PM4ItOpcode::DrawIndexAuto: { - const auto* draw_index = reinterpret_cast(header); + const auto* draw_index = reinterpret_cast(header); regs.num_indices = draw_index->index_count; regs.draw_initiator = draw_index->draw_initiator; // rasterizer->DrawIndex(); @@ -88,21 +122,21 @@ void Liverpool::ProcessCmdList(u32* cmdbuf, u32 size_in_bytes) { break; } case PM4ItOpcode::EventWriteEos: { - const auto* event_eos = reinterpret_cast(header); + const auto* event_eos = reinterpret_cast(header); event_eos->SignalFence(); break; } case PM4ItOpcode::EventWriteEop: { - const auto* event_eop = reinterpret_cast(header); + const auto* event_eop = reinterpret_cast(header); event_eop->SignalFence(); break; } case PM4ItOpcode::DmaData: { - const auto* dma_data = reinterpret_cast(header); + const auto* dma_data = reinterpret_cast(header); break; } case PM4ItOpcode::WriteData: { - const auto* write_data = reinterpret_cast(header); + const auto* write_data = reinterpret_cast(header); ASSERT(write_data->dst_sel.Value() == 2 || write_data->dst_sel.Value() == 5); const u32 data_size = (header->type3.count.Value() - 2) * 4; if (!write_data->wr_one_addr.Value()) { @@ -117,7 +151,7 @@ void Liverpool::ProcessCmdList(u32* cmdbuf, u32 size_in_bytes) { break; } case PM4ItOpcode::WaitRegMem: { - const auto* wait_reg_mem = reinterpret_cast(header); + const auto* wait_reg_mem = reinterpret_cast(header); ASSERT(wait_reg_mem->engine.Value() == PM4CmdWaitRegMem::Engine::Me); while (!wait_reg_mem->Test()) { using namespace std::chrono_literals; diff --git a/src/video_core/amdgpu/liverpool.h b/src/video_core/amdgpu/liverpool.h index 01c20c01..c040a2ff 100644 --- a/src/video_core/amdgpu/liverpool.h +++ 
b/src/video_core/amdgpu/liverpool.h @@ -11,6 +11,8 @@ #include #include #include +#include +#include namespace AmdGpu { @@ -614,23 +616,31 @@ struct Liverpool { public: Liverpool(); + ~Liverpool(); - void Submit(u32* cmdbuf, u32 size_in_bytes) { - ASSERT_MSG(!cp.valid(), "Trying to submit while previous submission is pending"); - cp = std::async(&Liverpool::ProcessCmdList, this, cmdbuf, size_in_bytes); + void SubmitGfx(const u32* dcb, u32 dcb_size) { + { + std::scoped_lock lock{m_ring_access}; + gfx_ring.push({dcb, dcb_size}); + } + cv_submit.notify_one(); } void SubmitDone() { // This is wrong as `submitDone()` should never be blocking. The behavior will be // reworked with mutiple queues introduction - if (cp.valid()) { - cp.get(); - } + Wait(); } private: - void ProcessCmdList(u32* cmdbuf, u32 size_in_bytes); + void ProcessCmdList(const u32* cmdbuf, u32 size_in_bytes); + void Process(std::stop_token stoken); + void Wait(); - std::future cp{}; + std::jthread process_thread{}; + std::queue> gfx_ring{}; + std::condition_variable_any cv_submit{}; + std::condition_variable cv_complete{}; + std::mutex m_ring_access{}; }; static_assert(GFX6_3D_REG_INDEX(ps_program) == 0x2C08); From dbdb3dc77ec4e677695373da773d28055a2322ae Mon Sep 17 00:00:00 2001 From: psucien Date: Fri, 17 May 2024 08:22:47 +0200 Subject: [PATCH 2/3] amdgpu: non-blocking submitDone --- src/video_core/amdgpu/liverpool.cpp | 2 +- src/video_core/amdgpu/liverpool.h | 16 ++++++++++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/src/video_core/amdgpu/liverpool.cpp b/src/video_core/amdgpu/liverpool.cpp index 03e93eb9..5bbe6ee2 100644 --- a/src/video_core/amdgpu/liverpool.cpp +++ b/src/video_core/amdgpu/liverpool.cpp @@ -40,7 +40,7 @@ void Liverpool::Process(std::stop_token stoken) { } } -void Liverpool::Wait() { +void Liverpool::WaitGpuIdle() { std::unique_lock lock{m_ring_access}; cv_complete.wait(lock, [this]() { return gfx_ring.empty(); }); } diff --git 
a/src/video_core/amdgpu/liverpool.h b/src/video_core/amdgpu/liverpool.h index c040a2ff..2f0d2500 100644 --- a/src/video_core/amdgpu/liverpool.h +++ b/src/video_core/amdgpu/liverpool.h @@ -619,6 +619,14 @@ public: ~Liverpool(); void SubmitGfx(const u32* dcb, u32 dcb_size) { + if (submission_lock) { + WaitGpuIdle(); + + // Suspend logic goes here + + submission_lock = false; + } + { std::scoped_lock lock{m_ring_access}; gfx_ring.push({dcb, dcb_size}); @@ -626,21 +634,21 @@ public: cv_submit.notify_one(); } void SubmitDone() { - // This is wrong as `submitDone()` should never be blocking. The behavior will be - // reworked with mutiple queues introduction - Wait(); + submission_lock = true; } private: void ProcessCmdList(const u32* cmdbuf, u32 size_in_bytes); void Process(std::stop_token stoken); - void Wait(); + void WaitGpuIdle(); std::jthread process_thread{}; std::queue> gfx_ring{}; std::condition_variable_any cv_submit{}; std::condition_variable cv_complete{}; std::mutex m_ring_access{}; + + bool submission_lock{}; }; static_assert(GFX6_3D_REG_INDEX(ps_program) == 0x2C08); From 64b2e582850c85b0a4c54fc03baa71fb8ee03886 Mon Sep 17 00:00:00 2001 From: psucien Date: Fri, 17 May 2024 08:47:38 +0200 Subject: [PATCH 3/3] a fair multi-submissions support --- src/core/libraries/gnmdriver/gnmdriver.cpp | 21 +++++++++++++-------- src/core/libraries/gnmdriver/gnmdriver.h | 8 ++++---- src/video_core/amdgpu/liverpool.h | 7 +++++-- 3 files changed, 22 insertions(+), 14 deletions(-) diff --git a/src/core/libraries/gnmdriver/gnmdriver.cpp b/src/core/libraries/gnmdriver/gnmdriver.cpp index 659c0a5f..5cb7f5a9 100644 --- a/src/core/libraries/gnmdriver/gnmdriver.cpp +++ b/src/core/libraries/gnmdriver/gnmdriver.cpp @@ -1359,13 +1359,13 @@ static inline s32 PatchFlipRequest(u32* cmdbuf, u32 size, u32 vo_handle, u32 buf return ORBIS_OK; } -s32 PS4_SYSV_ABI sceGnmSubmitAndFlipCommandBuffers(u32 count, void* dcb_gpu_addrs[], - u32* dcb_sizes_in_bytes, void* ccb_gpu_addrs[], +s32 
PS4_SYSV_ABI sceGnmSubmitAndFlipCommandBuffers(u32 count, u32* dcb_gpu_addrs[], + u32* dcb_sizes_in_bytes, u32* ccb_gpu_addrs[], u32* ccb_sizes_in_bytes, u32 vo_handle, u32 buf_idx, u32 flip_mode, u32 flip_arg) { LOG_INFO(Lib_GnmDriver, "called [buf = {}]", buf_idx); - auto* cmdbuf = reinterpret_cast(dcb_gpu_addrs[count - 1]); + auto* cmdbuf = dcb_gpu_addrs[count - 1]; const auto size_dw = dcb_sizes_in_bytes[count - 1] / 4; const s32 patch_result = @@ -1374,7 +1374,8 @@ s32 PS4_SYSV_ABI sceGnmSubmitAndFlipCommandBuffers(u32 count, void* dcb_gpu_addr return patch_result; } - return sceGnmSubmitCommandBuffers(count, dcb_gpu_addrs, dcb_sizes_in_bytes, ccb_gpu_addrs, + return sceGnmSubmitCommandBuffers(count, const_cast(dcb_gpu_addrs), + dcb_sizes_in_bytes, const_cast(ccb_gpu_addrs), ccb_sizes_in_bytes); } @@ -1383,11 +1384,10 @@ int PS4_SYSV_ABI sceGnmSubmitAndFlipCommandBuffersForWorkload() { return ORBIS_OK; } -s32 PS4_SYSV_ABI sceGnmSubmitCommandBuffers(u32 count, void* dcb_gpu_addrs[], - u32* dcb_sizes_in_bytes, void* ccb_gpu_addrs[], +s32 PS4_SYSV_ABI sceGnmSubmitCommandBuffers(u32 count, const u32* dcb_gpu_addrs[], + u32* dcb_sizes_in_bytes, const u32* ccb_gpu_addrs[], u32* ccb_sizes_in_bytes) { LOG_INFO(Lib_GnmDriver, "called"); - ASSERT_MSG(count == 1, "Multiple command buffer submission is unsupported!"); if (!dcb_gpu_addrs || !dcb_sizes_in_bytes) { LOG_ERROR(Lib_GnmDriver, "dcbGpuAddrs and dcbSizesInBytes must not be NULL"); @@ -1411,7 +1411,12 @@ s32 PS4_SYSV_ABI sceGnmSubmitCommandBuffers(u32 count, void* dcb_gpu_addrs[], } } - liverpool->SubmitGfx(reinterpret_cast(dcb_gpu_addrs[0]), dcb_sizes_in_bytes[0]); + for (auto cbpair = 0u; cbpair < count; ++cbpair) { + const auto* ccb = ccb_gpu_addrs ? ccb_gpu_addrs[cbpair] : nullptr; + const auto ccb_size = ccb_sizes_in_bytes ? 
ccb_sizes_in_bytes[cbpair] : 0; + + liverpool->SubmitGfx({dcb_gpu_addrs[cbpair], dcb_sizes_in_bytes[cbpair]}, {ccb, ccb_size}); + } return ORBIS_OK; } diff --git a/src/core/libraries/gnmdriver/gnmdriver.h b/src/core/libraries/gnmdriver/gnmdriver.h index c1aeef8e..0f0e454e 100644 --- a/src/core/libraries/gnmdriver/gnmdriver.h +++ b/src/core/libraries/gnmdriver/gnmdriver.h @@ -194,13 +194,13 @@ int PS4_SYSV_ABI sceGnmSqttStopTrace(); int PS4_SYSV_ABI sceGnmSqttSwitchTraceBuffer(); int PS4_SYSV_ABI sceGnmSqttSwitchTraceBuffer2(); int PS4_SYSV_ABI sceGnmSqttWaitForEvent(); -s32 PS4_SYSV_ABI sceGnmSubmitAndFlipCommandBuffers(u32 count, void* dcb_gpu_addrs[], - u32* dcb_sizes_in_bytes, void* ccb_gpu_addrs[], +s32 PS4_SYSV_ABI sceGnmSubmitAndFlipCommandBuffers(u32 count, u32* dcb_gpu_addrs[], + u32* dcb_sizes_in_bytes, u32* ccb_gpu_addrs[], u32* ccb_sizes_in_bytes, u32 vo_handle, u32 buf_idx, u32 flip_mode, u32 flip_arg); int PS4_SYSV_ABI sceGnmSubmitAndFlipCommandBuffersForWorkload(); -s32 PS4_SYSV_ABI sceGnmSubmitCommandBuffers(u32 count, void* dcb_gpu_addrs[], - u32* dcb_sizes_in_bytes, void* ccb_gpu_addrs[], +s32 PS4_SYSV_ABI sceGnmSubmitCommandBuffers(u32 count, const u32* dcb_gpu_addrs[], + u32* dcb_sizes_in_bytes, const u32* ccb_gpu_addrs[], u32* ccb_sizes_in_bytes); int PS4_SYSV_ABI sceGnmSubmitCommandBuffersForWorkload(); int PS4_SYSV_ABI sceGnmSubmitDone(); diff --git a/src/video_core/amdgpu/liverpool.h b/src/video_core/amdgpu/liverpool.h index 2f0d2500..c8e8eb0d 100644 --- a/src/video_core/amdgpu/liverpool.h +++ b/src/video_core/amdgpu/liverpool.h @@ -12,6 +12,7 @@ #include #include #include +#include #include namespace AmdGpu { @@ -618,7 +619,7 @@ public: Liverpool(); ~Liverpool(); - void SubmitGfx(const u32* dcb, u32 dcb_size) { + void SubmitGfx(std::span dcb, std::span ccb) { if (submission_lock) { WaitGpuIdle(); @@ -629,7 +630,9 @@ public: { std::scoped_lock lock{m_ring_access}; - gfx_ring.push({dcb, dcb_size}); + gfx_ring.emplace(dcb); + + 
ASSERT_MSG(ccb.size() == 0, "CCBs are not supported yet"); } cv_submit.notify_one(); }