diff --git a/src/core/libraries/gnmdriver/gnmdriver.cpp b/src/core/libraries/gnmdriver/gnmdriver.cpp index b5dca74b..5cb7f5a9 100644 --- a/src/core/libraries/gnmdriver/gnmdriver.cpp +++ b/src/core/libraries/gnmdriver/gnmdriver.cpp @@ -1359,13 +1359,13 @@ static inline s32 PatchFlipRequest(u32* cmdbuf, u32 size, u32 vo_handle, u32 buf return ORBIS_OK; } -s32 PS4_SYSV_ABI sceGnmSubmitAndFlipCommandBuffers(u32 count, void* dcb_gpu_addrs[], - u32* dcb_sizes_in_bytes, void* ccb_gpu_addrs[], +s32 PS4_SYSV_ABI sceGnmSubmitAndFlipCommandBuffers(u32 count, u32* dcb_gpu_addrs[], + u32* dcb_sizes_in_bytes, u32* ccb_gpu_addrs[], u32* ccb_sizes_in_bytes, u32 vo_handle, u32 buf_idx, u32 flip_mode, u32 flip_arg) { LOG_INFO(Lib_GnmDriver, "called [buf = {}]", buf_idx); - auto* cmdbuf = reinterpret_cast(dcb_gpu_addrs[count - 1]); + auto* cmdbuf = dcb_gpu_addrs[count - 1]; const auto size_dw = dcb_sizes_in_bytes[count - 1] / 4; const s32 patch_result = @@ -1374,7 +1374,8 @@ s32 PS4_SYSV_ABI sceGnmSubmitAndFlipCommandBuffers(u32 count, void* dcb_gpu_addr return patch_result; } - return sceGnmSubmitCommandBuffers(count, dcb_gpu_addrs, dcb_sizes_in_bytes, ccb_gpu_addrs, + return sceGnmSubmitCommandBuffers(count, const_cast(dcb_gpu_addrs), + dcb_sizes_in_bytes, const_cast(ccb_gpu_addrs), ccb_sizes_in_bytes); } @@ -1383,11 +1384,10 @@ int PS4_SYSV_ABI sceGnmSubmitAndFlipCommandBuffersForWorkload() { return ORBIS_OK; } -s32 PS4_SYSV_ABI sceGnmSubmitCommandBuffers(u32 count, void* dcb_gpu_addrs[], - u32* dcb_sizes_in_bytes, void* ccb_gpu_addrs[], +s32 PS4_SYSV_ABI sceGnmSubmitCommandBuffers(u32 count, const u32* dcb_gpu_addrs[], + u32* dcb_sizes_in_bytes, const u32* ccb_gpu_addrs[], u32* ccb_sizes_in_bytes) { LOG_INFO(Lib_GnmDriver, "called"); - ASSERT_MSG(count == 1, "Multiple command buffer submission is unsupported!"); if (!dcb_gpu_addrs || !dcb_sizes_in_bytes) { LOG_ERROR(Lib_GnmDriver, "dcbGpuAddrs and dcbSizesInBytes must not be NULL"); @@ -1411,7 +1411,12 @@ s32 PS4_SYSV_ABI 
sceGnmSubmitCommandBuffers(u32 count, void* dcb_gpu_addrs[], } } - liverpool->Submit(reinterpret_cast(dcb_gpu_addrs[0]), dcb_sizes_in_bytes[0]); + for (auto cbpair = 0u; cbpair < count; ++cbpair) { + const auto* ccb = ccb_gpu_addrs ? ccb_gpu_addrs[cbpair] : nullptr; + const auto ccb_size = ccb_sizes_in_bytes ? ccb_sizes_in_bytes[cbpair] : 0; + + liverpool->SubmitGfx({dcb_gpu_addrs[cbpair], dcb_sizes_in_bytes[cbpair]}, {ccb, ccb_size}); + } return ORBIS_OK; } diff --git a/src/core/libraries/gnmdriver/gnmdriver.h b/src/core/libraries/gnmdriver/gnmdriver.h index c1aeef8e..0f0e454e 100644 --- a/src/core/libraries/gnmdriver/gnmdriver.h +++ b/src/core/libraries/gnmdriver/gnmdriver.h @@ -194,13 +194,13 @@ int PS4_SYSV_ABI sceGnmSqttStopTrace(); int PS4_SYSV_ABI sceGnmSqttSwitchTraceBuffer(); int PS4_SYSV_ABI sceGnmSqttSwitchTraceBuffer2(); int PS4_SYSV_ABI sceGnmSqttWaitForEvent(); -s32 PS4_SYSV_ABI sceGnmSubmitAndFlipCommandBuffers(u32 count, void* dcb_gpu_addrs[], - u32* dcb_sizes_in_bytes, void* ccb_gpu_addrs[], +s32 PS4_SYSV_ABI sceGnmSubmitAndFlipCommandBuffers(u32 count, u32* dcb_gpu_addrs[], + u32* dcb_sizes_in_bytes, u32* ccb_gpu_addrs[], u32* ccb_sizes_in_bytes, u32 vo_handle, u32 buf_idx, u32 flip_mode, u32 flip_arg); int PS4_SYSV_ABI sceGnmSubmitAndFlipCommandBuffersForWorkload(); -s32 PS4_SYSV_ABI sceGnmSubmitCommandBuffers(u32 count, void* dcb_gpu_addrs[], - u32* dcb_sizes_in_bytes, void* ccb_gpu_addrs[], +s32 PS4_SYSV_ABI sceGnmSubmitCommandBuffers(u32 count, const u32* dcb_gpu_addrs[], + u32* dcb_sizes_in_bytes, const u32* ccb_gpu_addrs[], u32* ccb_sizes_in_bytes); int PS4_SYSV_ABI sceGnmSubmitCommandBuffersForWorkload(); int PS4_SYSV_ABI sceGnmSubmitDone(); diff --git a/src/video_core/amdgpu/liverpool.cpp b/src/video_core/amdgpu/liverpool.cpp index 62e12fa2..5bbe6ee2 100644 --- a/src/video_core/amdgpu/liverpool.cpp +++ b/src/video_core/amdgpu/liverpool.cpp @@ -9,16 +9,50 @@ namespace AmdGpu { -Liverpool::Liverpool() = default; +Liverpool::Liverpool() { + 
process_thread = std::jthread{std::bind_front(&Liverpool::Process, this)}; +} -void Liverpool::ProcessCmdList(u32* cmdbuf, u32 size_in_bytes) { +Liverpool::~Liverpool() { + process_thread.request_stop(); + cv_submit.notify_one(); +} + +void Liverpool::Process(std::stop_token stoken) { + while (!stoken.stop_requested()) { + std::span dcb{}; + { + std::unique_lock lock{m_ring_access}; + cv_submit.wait(lock, stoken, [&]() { return !gfx_ring.empty(); }); + + if (stoken.stop_requested()) { + break; + } + + dcb = gfx_ring.front(); + gfx_ring.pop(); + } + + ASSERT_MSG(dcb.size() != 0, "Empty command list received"); + ProcessCmdList(dcb.data(), dcb.size()); + + cv_complete.notify_all(); + } +} + +void Liverpool::WaitGpuIdle() { + std::unique_lock lock{m_ring_access}; + cv_complete.wait(lock, [this]() { return gfx_ring.empty(); }); +} + +void Liverpool::ProcessCmdList(const u32* cmdbuf, u32 size_in_bytes) { Common::SetCurrentThreadName("CommandProcessor_Gfx"); - auto* header = reinterpret_cast(cmdbuf); + auto* header = reinterpret_cast(cmdbuf); u32 processed_cmd_size = 0; while (processed_cmd_size < size_in_bytes) { - PM4Header* next_header{}; + const PM4Header* next_header{}; const u32 type = header->type; switch (type) { case 3: { @@ -26,7 +60,7 @@ void Liverpool::ProcessCmdList(u32* cmdbuf, u32 size_in_bytes) { const u32 count = header->type3.NumWords(); switch (opcode) { case PM4ItOpcode::Nop: { - const auto* nop = reinterpret_cast(header); + const auto* nop = reinterpret_cast(header); if (nop->header.count.Value() == 0) { break; } @@ -44,30 +78,30 @@ void Liverpool::ProcessCmdList(u32* cmdbuf, u32 size_in_bytes) { break; } case PM4ItOpcode::SetContextReg: { - const auto* set_data = reinterpret_cast(header); + const auto* set_data = reinterpret_cast(header); std::memcpy(&regs.reg_array[ContextRegWordOffset + set_data->reg_offset], header + 2, (count - 1) * sizeof(u32)); break; } case PM4ItOpcode::SetShReg: { - const auto* set_data = reinterpret_cast(header); + const auto* 
set_data = reinterpret_cast(header); std::memcpy(&regs.reg_array[ShRegWordOffset + set_data->reg_offset], header + 2, (count - 1) * sizeof(u32)); break; } case PM4ItOpcode::SetUconfigReg: { - const auto* set_data = reinterpret_cast(header); + const auto* set_data = reinterpret_cast(header); std::memcpy(&regs.reg_array[UconfigRegWordOffset + set_data->reg_offset], header + 2, (count - 1) * sizeof(u32)); break; } case PM4ItOpcode::IndexType: { - const auto* index_type = reinterpret_cast(header); + const auto* index_type = reinterpret_cast(header); regs.index_buffer_type.raw = index_type->raw; break; } case PM4ItOpcode::DrawIndex2: { - const auto* draw_index = reinterpret_cast(header); + const auto* draw_index = reinterpret_cast(header); regs.max_index_size = draw_index->max_size; regs.index_base_address.base_addr_lo = draw_index->index_base_lo; regs.index_base_address.base_addr_hi.Assign(draw_index->index_base_hi); @@ -77,7 +111,7 @@ void Liverpool::ProcessCmdList(u32* cmdbuf, u32 size_in_bytes) { break; } case PM4ItOpcode::DrawIndexAuto: { - const auto* draw_index = reinterpret_cast(header); + const auto* draw_index = reinterpret_cast(header); regs.num_indices = draw_index->index_count; regs.draw_initiator = draw_index->draw_initiator; // rasterizer->DrawIndex(); @@ -88,21 +122,21 @@ void Liverpool::ProcessCmdList(u32* cmdbuf, u32 size_in_bytes) { break; } case PM4ItOpcode::EventWriteEos: { - const auto* event_eos = reinterpret_cast(header); + const auto* event_eos = reinterpret_cast(header); event_eos->SignalFence(); break; } case PM4ItOpcode::EventWriteEop: { - const auto* event_eop = reinterpret_cast(header); + const auto* event_eop = reinterpret_cast(header); event_eop->SignalFence(); break; } case PM4ItOpcode::DmaData: { - const auto* dma_data = reinterpret_cast(header); + const auto* dma_data = reinterpret_cast(header); break; } case PM4ItOpcode::WriteData: { - const auto* write_data = reinterpret_cast(header); + const auto* write_data = reinterpret_cast(header); 
ASSERT(write_data->dst_sel.Value() == 2 || write_data->dst_sel.Value() == 5); const u32 data_size = (header->type3.count.Value() - 2) * 4; if (!write_data->wr_one_addr.Value()) { @@ -117,7 +151,7 @@ void Liverpool::ProcessCmdList(u32* cmdbuf, u32 size_in_bytes) { break; } case PM4ItOpcode::WaitRegMem: { - const auto* wait_reg_mem = reinterpret_cast(header); + const auto* wait_reg_mem = reinterpret_cast(header); ASSERT(wait_reg_mem->engine.Value() == PM4CmdWaitRegMem::Engine::Me); while (!wait_reg_mem->Test()) { using namespace std::chrono_literals; diff --git a/src/video_core/amdgpu/liverpool.h b/src/video_core/amdgpu/liverpool.h index 01c20c01..c8e8eb0d 100644 --- a/src/video_core/amdgpu/liverpool.h +++ b/src/video_core/amdgpu/liverpool.h @@ -11,6 +11,9 @@ #include #include #include +#include +#include +#include namespace AmdGpu { @@ -614,23 +617,41 @@ struct Liverpool { public: Liverpool(); + ~Liverpool(); - void Submit(u32* cmdbuf, u32 size_in_bytes) { - ASSERT_MSG(!cp.valid(), "Trying to submit while previous submission is pending"); - cp = std::async(&Liverpool::ProcessCmdList, this, cmdbuf, size_in_bytes); + void SubmitGfx(std::span dcb, std::span ccb) { + if (submission_lock) { + WaitGpuIdle(); + + // Suspend logic goes here + + submission_lock = false; + } + + { + std::scoped_lock lock{m_ring_access}; + gfx_ring.emplace(dcb); + + ASSERT_MSG(ccb.size() == 0, "CCBs are not supported yet"); + } + cv_submit.notify_one(); } void SubmitDone() { - // This is wrong as `submitDone()` should never be blocking. 
The behavior will be - // reworked with mutiple queues introduction - if (cp.valid()) { - cp.get(); - } + submission_lock = true; } private: - void ProcessCmdList(u32* cmdbuf, u32 size_in_bytes); + void ProcessCmdList(const u32* cmdbuf, u32 size_in_bytes); + void Process(std::stop_token stoken); + void WaitGpuIdle(); - std::future cp{}; + std::jthread process_thread{}; + std::queue> gfx_ring{}; + std::condition_variable_any cv_submit{}; + std::condition_variable cv_complete{}; + std::mutex m_ring_access{}; + + bool submission_lock{}; }; static_assert(GFX6_3D_REG_INDEX(ps_program) == 0x2C08);