From 0fd1ab674bbb6b41f5cc6d46ca5ed4bcd6d6052c Mon Sep 17 00:00:00 2001 From: psucien <168137814+psucien@users.noreply.github.com> Date: Sat, 14 Dec 2024 23:54:46 +0100 Subject: [PATCH] GPU processor refactoring (#1787) * coroutine code prettification * asc queues submission refactoring * better asc ring context handling * final touches and review notes * even more simplification for context saving --- src/common/debug.h | 3 + src/core/debug_state.cpp | 68 +++--- src/core/debug_state.h | 8 +- src/core/libraries/gnmdriver/gnmdriver.cpp | 36 ++-- .../backend/spirv/spirv_emit_context.cpp | 2 +- src/video_core/amdgpu/liverpool.cpp | 204 ++++++++++-------- src/video_core/amdgpu/liverpool.h | 25 ++- .../renderer_vulkan/vk_pipeline_cache.cpp | 9 +- .../renderer_vulkan/vk_rasterizer.cpp | 6 +- .../renderer_vulkan/vk_shader_hle.cpp | 16 +- .../renderer_vulkan/vk_shader_hle.h | 2 +- src/video_core/texture_cache/tile_manager.cpp | 1 + 12 files changed, 234 insertions(+), 146 deletions(-) diff --git a/src/common/debug.h b/src/common/debug.h index 091c6191..4d42aa4a 100644 --- a/src/common/debug.h +++ b/src/common/debug.h @@ -57,3 +57,6 @@ enum MarkersPalette : int { tracy::SourceLocationData{nullptr, name, TracyFile, (uint32_t)TracyLine, 0}; #define FRAME_END FrameMark + +#define FIBER_ENTER(name) TracyFiberEnter(name) +#define FIBER_EXIT TracyFiberLeave diff --git a/src/core/debug_state.cpp b/src/core/debug_state.cpp index c68fd469..daf614bd 100644 --- a/src/core/debug_state.cpp +++ b/src/core/debug_state.cpp @@ -142,41 +142,61 @@ void DebugStateImpl::PushQueueDump(QueueDump dump) { frame.queues.push_back(std::move(dump)); } -void DebugStateImpl::PushRegsDump(uintptr_t base_addr, uintptr_t header_addr, - const AmdGpu::Liverpool::Regs& regs, bool is_compute) { - std::scoped_lock lock{frame_dump_list_mutex}; +std::optional DebugStateImpl::GetRegDump(uintptr_t base_addr, uintptr_t header_addr) { const auto it = waiting_reg_dumps.find(header_addr); if (it == waiting_reg_dumps.end()) { - return; + return std::nullopt; } auto& frame = *it->second; waiting_reg_dumps.erase(it); waiting_reg_dumps_dbg.erase(waiting_reg_dumps_dbg.find(header_addr)); - auto& dump = frame.regs[header_addr - base_addr]; - dump.regs = regs; - if (is_compute) { - dump.is_compute = true; - const auto& cs = dump.regs.cs_program; - dump.cs_data = PipelineComputerProgramDump{ - .cs_program = cs, - .code = std::vector{cs.Code().begin(), cs.Code().end()}, - }; - } else { - for (int i = 0; i < RegDump::MaxShaderStages; i++) { - if (regs.stage_enable.IsStageEnabled(i)) { - auto stage = regs.ProgramForStage(i); - if (stage->address_lo != 0) { - auto code = stage->Code(); - dump.stages[i] = PipelineShaderProgramDump{ - .user_data = *stage, - .code = std::vector{code.begin(), code.end()}, - }; - } + return &frame.regs[header_addr - base_addr]; +} + +void DebugStateImpl::PushRegsDump(uintptr_t base_addr, uintptr_t header_addr, + const AmdGpu::Liverpool::Regs& regs) { + std::scoped_lock lock{frame_dump_list_mutex}; + + auto dump = GetRegDump(base_addr, header_addr); + if (!dump) { + return; + } + + (*dump)->regs = regs; + + for (int i = 0; i < RegDump::MaxShaderStages; i++) { + if ((*dump)->regs.stage_enable.IsStageEnabled(i)) { + auto stage = (*dump)->regs.ProgramForStage(i); + if (stage->address_lo != 0) { + auto code = stage->Code(); + (*dump)->stages[i] = PipelineShaderProgramDump{ + .user_data = *stage, + .code = std::vector{code.begin(), code.end()}, + }; } } } } +void DebugStateImpl::PushRegsDumpCompute(uintptr_t base_addr, uintptr_t header_addr, + const CsState& cs_state) { + std::scoped_lock lock{frame_dump_list_mutex}; + + auto dump = GetRegDump(base_addr, header_addr); + if (!dump) { + return; + } + + (*dump)->is_compute = true; + auto& cs = (*dump)->regs.cs_program; + cs = cs_state; + + (*dump)->cs_data = PipelineComputerProgramDump{ + .cs_program = cs, + .code = std::vector{cs.Code().begin(), cs.Code().end()}, + }; +} + void DebugStateImpl::CollectShader(const std::string& name, Shader::LogicalStage l_stage, vk::ShaderModule module, std::span spv, std::span raw_code, std::span patch_spv, diff --git a/src/core/debug_state.h b/src/core/debug_state.h index 0db5bc46..a0e428b6 100644 --- a/src/core/debug_state.h +++ b/src/core/debug_state.h @@ -11,7 +11,6 @@ #include #include "common/types.h" -#include "video_core/amdgpu/liverpool.h" #include "video_core/renderer_vulkan/vk_graphics_pipeline.h" #ifdef _WIN32 @@ -204,12 +203,17 @@ public: void PushQueueDump(QueueDump dump); void PushRegsDump(uintptr_t base_addr, uintptr_t header_addr, - const AmdGpu::Liverpool::Regs& regs, bool is_compute = false); + const AmdGpu::Liverpool::Regs& regs); + using CsState = AmdGpu::Liverpool::ComputeProgram; + void PushRegsDumpCompute(uintptr_t base_addr, uintptr_t header_addr, const CsState& cs_state); void CollectShader(const std::string& name, Shader::LogicalStage l_stage, vk::ShaderModule module, std::span spv, std::span raw_code, std::span patch_spv, bool is_patched); + +private: + std::optional GetRegDump(uintptr_t base_addr, uintptr_t header_addr); }; } // namespace DebugStateType diff --git a/src/core/libraries/gnmdriver/gnmdriver.cpp b/src/core/libraries/gnmdriver/gnmdriver.cpp index e85b8b89..583339dd 100644 --- a/src/core/libraries/gnmdriver/gnmdriver.cpp +++ b/src/core/libraries/gnmdriver/gnmdriver.cpp @@ -296,17 +296,12 @@ static_assert(CtxInitSequence400.size() == 0x61); // In case if `submitDone` is issued we need to block submissions until GPU idle static u32 submission_lock{}; std::condition_variable cv_lock{}; -static std::mutex m_submission{}; +std::mutex m_submission{}; static u64 frames_submitted{}; // frame counter static bool send_init_packet{true}; // initialize HW state before first game's submit in a frame static int sdk_version{0}; -struct AscQueueInfo { - VAddr map_addr; - u32* read_addr; - u32 ring_size_dw; -}; -static Common::SlotVector asc_queues{}; +static u32 asc_next_offs_dw[Liverpool::NumComputeRings]; static constexpr VAddr tessellation_factors_ring_addr = Core::SYSTEM_RESERVED_MAX - 0xFFFFFFF; static constexpr u32 tessellation_offchip_buffer_size = 0x800000u; @@ -506,11 +501,19 @@ void PS4_SYSV_ABI sceGnmDingDong(u32 gnm_vqid, u32 next_offs_dw) { } auto vqid = gnm_vqid - 1; - auto& asc_queue = asc_queues[{vqid}]; - const auto* acb_ptr = reinterpret_cast(asc_queue.map_addr + *asc_queue.read_addr); - const auto acb_size = next_offs_dw ? (next_offs_dw << 2u) - *asc_queue.read_addr - : (asc_queue.ring_size_dw << 2u) - *asc_queue.read_addr; - const std::span acb_span{acb_ptr, acb_size >> 2u}; + auto& asc_queue = liverpool->asc_queues[{vqid}]; + + const auto& offs_dw = asc_next_offs_dw[vqid]; + + if (next_offs_dw < offs_dw) { + ASSERT_MSG(next_offs_dw == 0, "ACB submission is split at the end of ring buffer"); + } + + const auto* acb_ptr = reinterpret_cast(asc_queue.map_addr) + offs_dw; + const auto acb_size_dw = (next_offs_dw ? next_offs_dw : asc_queue.ring_size_dw) - offs_dw; + const std::span acb_span{acb_ptr, acb_size_dw}; + + asc_next_offs_dw[vqid] = next_offs_dw; if (DebugState.DumpingCurrentFrame()) { static auto last_frame_num = -1LL; @@ -545,9 +548,6 @@ void PS4_SYSV_ABI sceGnmDingDong(u32 gnm_vqid, u32 next_offs_dw) { }); } liverpool->SubmitAsc(gnm_vqid, acb_span); - - *asc_queue.read_addr += acb_size; - *asc_queue.read_addr %= asc_queue.ring_size_dw * 4; } void PS4_SYSV_ABI sceGnmDingDongForWorkload(u32 gnm_vqid, u32 next_offs_dw, u64 workload_id) { @@ -1266,12 +1266,16 @@ int PS4_SYSV_ABI sceGnmMapComputeQueue(u32 pipe_id, u32 queue_id, VAddr ring_bas return ORBIS_GNM_ERROR_COMPUTEQUEUE_INVALID_READ_PTR_ADDR; } - auto vqid = asc_queues.insert(VAddr(ring_base_addr), read_ptr_addr, ring_size_dw); + const auto vqid = + liverpool->asc_queues.insert(VAddr(ring_base_addr), read_ptr_addr, ring_size_dw, pipe_id); // We need to offset index as `dingDong` assumes it to be from the range [1..64] const auto gnm_vqid = vqid.index + 1; LOG_INFO(Lib_GnmDriver, "ASC pipe {} queue {} mapped to vqueue {}", pipe_id, queue_id, gnm_vqid); + const auto& queue = liverpool->asc_queues[vqid]; + *queue.read_addr = 0u; + return gnm_vqid; } diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp index 2e09e70a..5f0ad298 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp @@ -294,7 +294,7 @@ void EmitContext::DefineInputs() { }); // Note that we pass index rather than Id input_params[attrib.semantic] = SpirvAttribute{ - .id = rate_idx, + .id = {rate_idx}, .pointer_type = input_u32, .component_type = U32[1], .num_components = std::min(attrib.num_elements, num_components), diff --git a/src/video_core/amdgpu/liverpool.cpp b/src/video_core/amdgpu/liverpool.cpp index 8db2d63c..8cca636c 100644 --- a/src/video_core/amdgpu/liverpool.cpp +++ b/src/video_core/amdgpu/liverpool.cpp @@ -1,6 +1,8 @@ // SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later +#include + #include "common/assert.h" #include "common/config.h" #include "common/debug.h" @@ -18,7 +20,32 @@ namespace AmdGpu { static const char* dcb_task_name{"DCB_TASK"}; static const char* ccb_task_name{"CCB_TASK"}; -static const char* acb_task_name{"ACB_TASK"}; + +#define MAX_NAMES 56 +static_assert(Liverpool::NumComputeRings <= MAX_NAMES); + +#define NAME_NUM(z, n, name) BOOST_PP_STRINGIZE(name) BOOST_PP_STRINGIZE(n), +#define NAME_ARRAY(name, num) {BOOST_PP_REPEAT(num, NAME_NUM, name)} + +static const char* acb_task_name[] = NAME_ARRAY(ACB_TASK, MAX_NAMES); + +#define YIELD(name) \ + FIBER_EXIT; \ + co_yield {}; \ + FIBER_ENTER(name); + +#define YIELD_CE() YIELD(ccb_task_name) +#define YIELD_GFX() YIELD(dcb_task_name) +#define YIELD_ASC(id) YIELD(acb_task_name[id]) + +#define RESUME(task, name) \ + FIBER_EXIT; \ + task.handle.resume(); \ + FIBER_ENTER(name); + +#define RESUME_CE(task) RESUME(task, ccb_task_name) +#define RESUME_GFX(task) RESUME(task, dcb_task_name) +#define RESUME_ASC(task, id) RESUME(task, acb_task_name[id]) std::array Liverpool::ConstantEngine::constants_heap; @@ -60,7 +87,7 @@ void Liverpool::Process(std::stop_token stoken) { VideoCore::StartCapture(); - int qid = -1; + curr_qid = -1; while (num_submits || num_commands) { @@ -79,9 +106,9 @@ void Liverpool::Process(std::stop_token stoken) { --num_commands; } - qid = (qid + 1) % NumTotalQueues; + curr_qid = (curr_qid + 1) % num_mapped_queues; - auto& queue = mapped_queues[qid]; + auto& queue = mapped_queues[curr_qid]; Task::Handle task{}; { @@ -119,7 +146,7 @@ void Liverpool::Process(std::stop_token stoken) { } Liverpool::Task Liverpool::ProcessCeUpdate(std::span ccb) { - TracyFiberEnter(ccb_task_name); + FIBER_ENTER(ccb_task_name); while (!ccb.empty()) { const auto* header = reinterpret_cast(ccb.data()); @@ -155,9 +182,7 @@ Liverpool::Task Liverpool::ProcessCeUpdate(std::span ccb) { case PM4ItOpcode::WaitOnDeCounterDiff: { const auto diff = it_body[0]; while ((cblock.de_count - cblock.ce_count) >= diff) { - TracyFiberLeave; - co_yield {}; - TracyFiberEnter(ccb_task_name); + YIELD_CE(); } break; } @@ -165,13 +190,12 @@ Liverpool::Task Liverpool::ProcessCeUpdate(std::span ccb) { const auto* indirect_buffer = reinterpret_cast(header); auto task = ProcessCeUpdate({indirect_buffer->Address(), indirect_buffer->ib_size}); - while (!task.handle.done()) { - task.handle.resume(); + RESUME_CE(task); - TracyFiberLeave; - co_yield {}; - TracyFiberEnter(ccb_task_name); - }; + while (!task.handle.done()) { + YIELD_CE(); + RESUME_CE(task); + } break; } default: @@ -182,11 +206,11 @@ Liverpool::Task Liverpool::ProcessCeUpdate(std::span ccb) { ccb = NextPacket(ccb, header->type3.NumWords() + 1); } - TracyFiberLeave; + FIBER_EXIT; } Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::span ccb) { - TracyFiberEnter(dcb_task_name); + FIBER_ENTER(dcb_task_name); cblock.Reset(); @@ -197,9 +221,7 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::span(dcb.data()); @@ -353,8 +375,18 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::span(header); - std::memcpy(®s.reg_array[ShRegWordOffset + set_data->reg_offset], header + 2, - (count - 1) * sizeof(u32)); + const auto set_size = (count - 1) * sizeof(u32); + + if (set_data->reg_offset >= 0x200 && + set_data->reg_offset <= (0x200 + sizeof(ComputeProgram) / 4)) { + ASSERT(set_size <= sizeof(ComputeProgram)); + auto* addr = reinterpret_cast(&mapped_queues[GfxQueueId].cs_state) + + (set_data->reg_offset - 0x200); + std::memcpy(addr, header + 2, set_size); + } else { + std::memcpy(®s.reg_array[ShRegWordOffset + set_data->reg_offset], header + 2, + set_size); + } break; } case PM4ItOpcode::SetUconfigReg: { @@ -474,15 +506,16 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::span(header); - regs.cs_program.dim_x = dispatch_direct->dim_x; - regs.cs_program.dim_y = dispatch_direct->dim_y; - regs.cs_program.dim_z = dispatch_direct->dim_z; - regs.cs_program.dispatch_initiator = dispatch_direct->dispatch_initiator; + auto& cs_program = GetCsRegs(); + cs_program.dim_x = dispatch_direct->dim_x; + cs_program.dim_y = dispatch_direct->dim_y; + cs_program.dim_z = dispatch_direct->dim_z; + cs_program.dispatch_initiator = dispatch_direct->dispatch_initiator; if (DebugState.DumpingCurrentReg()) { - DebugState.PushRegsDump(base_addr, reinterpret_cast(header), regs, - true); + DebugState.PushRegsDumpCompute(base_addr, reinterpret_cast(header), + cs_program); } - if (rasterizer && (regs.cs_program.dispatch_initiator & 1)) { + if (rasterizer && (cs_program.dispatch_initiator & 1)) { const auto cmd_address = reinterpret_cast(header); rasterizer->ScopeMarkerBegin(fmt::format("dcb:{}:Dispatch", cmd_address)); rasterizer->DispatchDirect(); @@ -493,14 +526,15 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::span(header); + auto& cs_program = GetCsRegs(); const auto offset = dispatch_indirect->data_offset; const auto ib_address = mapped_queues[GfxQueueId].indirect_args_addr; const auto size = sizeof(PM4CmdDispatchIndirect::GroupDimensions); if (DebugState.DumpingCurrentReg()) { - DebugState.PushRegsDump(base_addr, reinterpret_cast(header), regs, - true); + DebugState.PushRegsDumpCompute(base_addr, reinterpret_cast(header), + cs_program); } - if (rasterizer && (regs.cs_program.dispatch_initiator & 1)) { + if (rasterizer && (cs_program.dispatch_initiator & 1)) { const auto cmd_address = reinterpret_cast(header); rasterizer->ScopeMarkerBegin( fmt::format("dcb:{}:DispatchIndirect", cmd_address)); @@ -613,11 +647,7 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::span(header); while (!rewind->Valid()) { - mapped_queues[GfxQueueId].cs_state = regs.cs_program; - TracyFiberLeave; - co_yield {}; - TracyFiberEnter(dcb_task_name); - regs.cs_program = mapped_queues[GfxQueueId].cs_state; + YIELD_GFX(); } break; } @@ -633,11 +663,7 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::spanWaitVoLabel([&] { return wait_reg_mem->Test(); }); } while (!wait_reg_mem->Test()) { - mapped_queues[GfxQueueId].cs_state = regs.cs_program; - TracyFiberLeave; - co_yield {}; - TracyFiberEnter(dcb_task_name); - regs.cs_program = mapped_queues[GfxQueueId].cs_state; + YIELD_GFX(); } break; } @@ -645,13 +671,12 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::span(header); auto task = ProcessGraphics( {indirect_buffer->Address(), indirect_buffer->ib_size}, {}); - while (!task.handle.done()) { - task.handle.resume(); + RESUME_GFX(task); - TracyFiberLeave; - co_yield {}; - TracyFiberEnter(dcb_task_name); - }; + while (!task.handle.done()) { + YIELD_GFX(); + RESUME_GFX(task); + } break; } case PM4ItOpcode::IncrementDeCounter: { @@ -660,9 +685,7 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::span dcb, std::span acb, int vqid) { - TracyFiberEnter(acb_task_name); +template +Liverpool::Task Liverpool::ProcessCompute(std::span acb, u32 vqid) { + FIBER_ENTER(acb_task_name[vqid]); + const auto& queue = asc_queues[{vqid}]; auto base_addr = reinterpret_cast(acb.data()); while (!acb.empty()) { @@ -711,15 +736,14 @@ Liverpool::Task Liverpool::ProcessCompute(std::span acb, int vqid) { } case PM4ItOpcode::IndirectBuffer: { const auto* indirect_buffer = reinterpret_cast(header); - auto task = ProcessCompute( + auto task = ProcessCompute( {indirect_buffer->Address(), indirect_buffer->ib_size}, vqid); - while (!task.handle.done()) { - task.handle.resume(); + RESUME_ASC(task, vqid); - TracyFiberLeave; - co_yield {}; - TracyFiberEnter(acb_task_name); - }; + while (!task.handle.done()) { + YIELD_ASC(vqid); + RESUME_ASC(task, vqid); + } break; } case PM4ItOpcode::DmaData: { @@ -757,30 +781,38 @@ Liverpool::Task Liverpool::ProcessCompute(std::span acb, int vqid) { case PM4ItOpcode::Rewind: { const PM4CmdRewind* rewind = reinterpret_cast(header); while (!rewind->Valid()) { - mapped_queues[vqid].cs_state = regs.cs_program; - TracyFiberLeave; - co_yield {}; - TracyFiberEnter(acb_task_name); - regs.cs_program = mapped_queues[vqid].cs_state; + YIELD_ASC(vqid); } break; } case PM4ItOpcode::SetShReg: { const auto* set_data = reinterpret_cast(header); - std::memcpy(®s.reg_array[ShRegWordOffset + set_data->reg_offset], header + 2, - (count - 1) * sizeof(u32)); + const auto set_size = (count - 1) * sizeof(u32); + + if (set_data->reg_offset >= 0x200 && + set_data->reg_offset <= (0x200 + sizeof(ComputeProgram) / 4)) { + ASSERT(set_size <= sizeof(ComputeProgram)); + auto* addr = reinterpret_cast(&mapped_queues[vqid + 1].cs_state) + + (set_data->reg_offset - 0x200); + std::memcpy(addr, header + 2, set_size); + } else { + std::memcpy(®s.reg_array[ShRegWordOffset + set_data->reg_offset], header + 2, + set_size); + } break; } case PM4ItOpcode::DispatchDirect: { const auto* dispatch_direct = reinterpret_cast(header); - regs.cs_program.dim_x = dispatch_direct->dim_x; - regs.cs_program.dim_y = dispatch_direct->dim_y; - regs.cs_program.dim_z = dispatch_direct->dim_z; - regs.cs_program.dispatch_initiator = dispatch_direct->dispatch_initiator; + auto& cs_program = GetCsRegs(); + cs_program.dim_x = dispatch_direct->dim_x; + cs_program.dim_y = dispatch_direct->dim_y; + cs_program.dim_z = dispatch_direct->dim_z; + cs_program.dispatch_initiator = dispatch_direct->dispatch_initiator; if (DebugState.DumpingCurrentReg()) { - DebugState.PushRegsDump(base_addr, reinterpret_cast(header), regs, true); + DebugState.PushRegsDumpCompute(base_addr, reinterpret_cast(header), + cs_program); } - if (rasterizer && (regs.cs_program.dispatch_initiator & 1)) { + if (rasterizer && (cs_program.dispatch_initiator & 1)) { const auto cmd_address = reinterpret_cast(header); rasterizer->ScopeMarkerBegin(fmt::format("acb[{}]:{}:Dispatch", vqid, cmd_address)); rasterizer->DispatchDirect(); @@ -803,17 +835,13 @@ Liverpool::Task Liverpool::ProcessCompute(std::span acb, int vqid) { const auto* wait_reg_mem = reinterpret_cast(header); ASSERT(wait_reg_mem->engine.Value() == PM4CmdWaitRegMem::Engine::Me); while (!wait_reg_mem->Test()) { - mapped_queues[vqid].cs_state = regs.cs_program; - TracyFiberLeave; - co_yield {}; - TracyFiberEnter(acb_task_name); - regs.cs_program = mapped_queues[vqid].cs_state; + YIELD_ASC(vqid); } break; } case PM4ItOpcode::ReleaseMem: { const auto* release_mem = reinterpret_cast(header); - release_mem->SignalFence(Platform::InterruptId::Compute0RelMem); // <--- + release_mem->SignalFence(static_cast(queue.pipe_id)); break; } default: @@ -821,10 +849,16 @@ Liverpool::Task Liverpool::ProcessCompute(std::span acb, int vqid) { static_cast(opcode), count); } - acb = NextPacket(acb, header->type3.NumWords() + 1); + const auto packet_size_dw = header->type3.NumWords() + 1; + acb = NextPacket(acb, packet_size_dw); + + if constexpr (!is_indirect) { + *queue.read_addr += packet_size_dw; + *queue.read_addr %= queue.ring_size_dw; + } } - TracyFiberLeave; + FIBER_EXIT; } std::pair, std::span> Liverpool::CopyCmdBuffers( @@ -881,10 +915,11 @@ void Liverpool::SubmitGfx(std::span dcb, std::span ccb) { submit_cv.notify_one(); } -void Liverpool::SubmitAsc(u32 vqid, std::span acb) { - ASSERT_MSG(vqid >= 0 && vqid < NumTotalQueues, "Invalid virtual ASC queue index"); - auto& queue = mapped_queues[vqid]; +void Liverpool::SubmitAsc(u32 gnm_vqid, std::span acb) { + ASSERT_MSG(gnm_vqid > 0 && gnm_vqid < NumTotalQueues, "Invalid virtual ASC queue index"); + auto& queue = mapped_queues[gnm_vqid]; + const auto vqid = gnm_vqid - 1; const auto& task = ProcessCompute(acb, vqid); { std::scoped_lock lock{queue.m_access}; @@ -892,6 +927,7 @@ void Liverpool::SubmitAsc(u32 vqid, std::span acb) { } std::scoped_lock lk{submit_mutex}; + num_mapped_queues = std::max(num_mapped_queues, gnm_vqid + 1); ++num_submits; submit_cv.notify_one(); } diff --git a/src/video_core/amdgpu/liverpool.h b/src/video_core/amdgpu/liverpool.h index b6172d37..4c74d37d 100644 --- a/src/video_core/amdgpu/liverpool.h +++ b/src/video_core/amdgpu/liverpool.h @@ -16,6 +16,7 @@ #include "common/assert.h" #include "common/bit_field.h" #include "common/polyfill_thread.h" +#include "common/slot_vector.h" #include "common/types.h" #include "common/unique_function.h" #include "shader_recompiler/params.h" @@ -45,7 +46,8 @@ struct Liverpool { static constexpr u32 NumGfxRings = 1u; // actually 2, but HP is reserved by system software static constexpr u32 NumComputePipes = 7u; // actually 8, but #7 is reserved by system software static constexpr u32 NumQueuesPerPipe = 8u; - static constexpr u32 NumTotalQueues = NumGfxRings + (NumComputePipes * NumQueuesPerPipe); + static constexpr u32 NumComputeRings = NumComputePipes * NumQueuesPerPipe; + static constexpr u32 NumTotalQueues = NumGfxRings + NumComputeRings; static_assert(NumTotalQueues < 64u); // need to fit into u64 bitmap for ffs static constexpr u32 NumColorBuffers = 8; @@ -1143,7 +1145,7 @@ struct Liverpool { INSERT_PADDING_WORDS(0x2D48 - 0x2d08 - 20); ShaderProgram ls_program; INSERT_PADDING_WORDS(0xA4); - ComputeProgram cs_program; + ComputeProgram cs_program; // shadowed by `cs_state` in `mapped_queues` INSERT_PADDING_WORDS(0xA008 - 0x2E00 - 80 - 3 - 5); DepthRenderControl depth_render_control; INSERT_PADDING_WORDS(1); @@ -1298,7 +1300,7 @@ public: ~Liverpool(); void SubmitGfx(std::span dcb, std::span ccb); - void SubmitAsc(u32 vqid, std::span acb); + void SubmitAsc(u32 gnm_vqid, std::span acb); void SubmitDone() noexcept { std::scoped_lock lk{submit_mutex}; @@ -1341,6 +1343,18 @@ public: gfx_queue.dcb_buffer.reserve(GfxReservedSize); } + inline ComputeProgram& GetCsRegs() { + return mapped_queues[curr_qid].cs_state; + } + + struct AscQueueInfo { + VAddr map_addr; + u32* read_addr; + u32 ring_size_dw; + u32 pipe_id; + }; + Common::SlotVector asc_queues{}; + private: struct Task { struct promise_type { @@ -1378,7 +1392,8 @@ private: std::span ccb); Task ProcessGraphics(std::span dcb, std::span ccb); Task ProcessCeUpdate(std::span ccb); - Task ProcessCompute(std::span acb, int vqid); + template + Task ProcessCompute(std::span acb, u32 vqid); void Process(std::stop_token stoken); @@ -1393,6 +1408,7 @@ private: VAddr indirect_args_addr{}; }; std::array mapped_queues{}; + u32 num_mapped_queues{1u}; // GFX is always available struct ConstantEngine { void Reset() { @@ -1421,6 +1437,7 @@ private: std::mutex submit_mutex; std::condition_variable_any submit_cv; std::queue> command_queue{}; + int curr_qid{-1}; }; static_assert(GFX6_3D_REG_INDEX(ps_program) == 0x2C08); diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 58473496..50396287 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -173,9 +173,9 @@ Shader::RuntimeInfo PipelineCache::BuildRuntimeInfo(Stage stage, LogicalStage l_ break; } case Stage::Compute: { - const auto& cs_pgm = regs.cs_program; + const auto& cs_pgm = liverpool->GetCsRegs(); info.num_user_data = cs_pgm.settings.num_user_regs; - info.num_allocated_vgprs = regs.cs_program.settings.num_vgprs * 4; + info.num_allocated_vgprs = cs_pgm.settings.num_vgprs * 4; info.cs_info.workgroup_size = {cs_pgm.num_thread_x.full, cs_pgm.num_thread_y.full, cs_pgm.num_thread_z.full}; info.cs_info.tgid_enable = {cs_pgm.IsTgidEnabled(0), cs_pgm.IsTgidEnabled(1), @@ -476,8 +476,8 @@ bool PipelineCache::RefreshGraphicsKey() { bool PipelineCache::RefreshComputeKey() { Shader::Backend::Bindings binding{}; - const auto* cs_pgm = &liverpool->regs.cs_program; - const auto cs_params = Liverpool::GetParams(*cs_pgm); + const auto& cs_pgm = liverpool->GetCsRegs(); + const auto cs_params = Liverpool::GetParams(cs_pgm); std::tie(infos[0], modules[0], fetch_shader, compute_key.value) = GetProgram(Shader::Stage::Compute, LogicalStage::Compute, cs_params, binding); return true; @@ -529,6 +529,7 @@ PipelineCache::Result PipelineCache::GetProgram(Stage stage, LogicalStage l_stag return std::make_tuple(&program->info, module, spec.fetch_shader_data, HashCombine(params.hash, 0)); } + it_pgm.value()->info.user_data = params.user_data; auto& program = it_pgm.value(); auto& info = program->info; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index fef4c7ec..bd8906f8 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -317,14 +317,14 @@ void Rasterizer::DrawIndirect(bool is_indexed, VAddr arg_address, u32 offset, u3 void Rasterizer::DispatchDirect() { RENDERER_TRACE; - const auto& cs_program = liverpool->regs.cs_program; + const auto& cs_program = liverpool->GetCsRegs(); const ComputePipeline* pipeline = pipeline_cache.GetComputePipeline(); if (!pipeline) { return; } const auto& cs = pipeline->GetStage(Shader::LogicalStage::Compute); - if (ExecuteShaderHLE(cs, liverpool->regs, *this)) { + if (ExecuteShaderHLE(cs, liverpool->regs, cs_program, *this)) { return; } @@ -344,7 +344,7 @@ void Rasterizer::DispatchDirect() { void Rasterizer::DispatchIndirect(VAddr address, u32 offset, u32 size) { RENDERER_TRACE; - const auto& cs_program = liverpool->regs.cs_program; + const auto& cs_program = liverpool->GetCsRegs(); const ComputePipeline* pipeline = pipeline_cache.GetComputePipeline(); if (!pipeline) { return; diff --git a/src/video_core/renderer_vulkan/vk_shader_hle.cpp b/src/video_core/renderer_vulkan/vk_shader_hle.cpp index b863dce2..ff78f5d2 100644 --- a/src/video_core/renderer_vulkan/vk_shader_hle.cpp +++ b/src/video_core/renderer_vulkan/vk_shader_hle.cpp @@ -2,17 +2,19 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include "shader_recompiler/info.h" +#include "video_core/renderer_vulkan/vk_rasterizer.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_shader_hle.h" -#include "vk_rasterizer.h" +extern std::unique_ptr liverpool; namespace Vulkan { static constexpr u64 COPY_SHADER_HASH = 0xfefebf9f; -bool ExecuteCopyShaderHLE(const Shader::Info& info, const AmdGpu::Liverpool::Regs& regs, - Rasterizer& rasterizer) { +static bool ExecuteCopyShaderHLE(const Shader::Info& info, + const AmdGpu::Liverpool::ComputeProgram& cs_program, + Rasterizer& rasterizer) { auto& scheduler = rasterizer.GetScheduler(); auto& buffer_cache = rasterizer.GetBufferCache(); @@ -34,9 +36,9 @@ bool ExecuteCopyShaderHLE(const Shader::Info& info, const AmdGpu::Liverpool::Reg static std::vector copies; copies.clear(); - copies.reserve(regs.cs_program.dim_x); + copies.reserve(cs_program.dim_x); - for (u32 i = 0; i < regs.cs_program.dim_x; i++) { + for (u32 i = 0; i < cs_program.dim_x; i++) { const auto& [dst_idx, src_idx, end] = ctl_buf[i]; const u32 local_dst_offset = dst_idx * buf_stride; const u32 local_src_offset = src_idx * buf_stride; @@ -122,10 +124,10 @@ bool ExecuteCopyShaderHLE(const Shader::Info& info, const AmdGpu::Liverpool::Reg } bool ExecuteShaderHLE(const Shader::Info& info, const AmdGpu::Liverpool::Regs& regs, - Rasterizer& rasterizer) { + const AmdGpu::Liverpool::ComputeProgram& cs_program, Rasterizer& rasterizer) { switch (info.pgm_hash) { case COPY_SHADER_HASH: - return ExecuteCopyShaderHLE(info, regs, rasterizer); + return ExecuteCopyShaderHLE(info, cs_program, rasterizer); default: return false; } diff --git a/src/video_core/renderer_vulkan/vk_shader_hle.h b/src/video_core/renderer_vulkan/vk_shader_hle.h index fda9b173..008de800 100644 --- a/src/video_core/renderer_vulkan/vk_shader_hle.h +++ b/src/video_core/renderer_vulkan/vk_shader_hle.h @@ -15,6 +15,6 @@ class Rasterizer; /// Attempts to execute a shader using HLE if possible. bool ExecuteShaderHLE(const Shader::Info& info, const AmdGpu::Liverpool::Regs& regs, - Rasterizer& rasterizer); + const AmdGpu::Liverpool::ComputeProgram& cs_program, Rasterizer& rasterizer); } // namespace Vulkan diff --git a/src/video_core/texture_cache/tile_manager.cpp b/src/video_core/texture_cache/tile_manager.cpp index d8d23c40..a5e09e45 100644 --- a/src/video_core/texture_cache/tile_manager.cpp +++ b/src/video_core/texture_cache/tile_manager.cpp @@ -212,6 +212,7 @@ vk::Format DemoteImageFormatForDetiling(vk::Format format) { case vk::Format::eBc7SrgbBlock: case vk::Format::eBc7UnormBlock: case vk::Format::eBc6HUfloatBlock: + case vk::Format::eR32G32B32A32Uint: case vk::Format::eR32G32B32A32Sfloat: return vk::Format::eR32G32B32A32Uint; default: