better asc ring context handling
Some checks are pending
Build and Release / reuse (push) Waiting to run
Build and Release / clang-format (push) Waiting to run
Build and Release / get-info (push) Waiting to run
Build and Release / windows-sdl (push) Blocked by required conditions
Build and Release / windows-qt (push) Blocked by required conditions
Build and Release / macos-sdl (push) Blocked by required conditions
Build and Release / macos-qt (push) Blocked by required conditions
Build and Release / linux-sdl (push) Blocked by required conditions
Build and Release / linux-qt (push) Blocked by required conditions
Build and Release / pre-release (push) Blocked by required conditions

This commit is contained in:
psucien 2024-12-14 17:18:46 +01:00
parent 133a282be7
commit 472cfebc39
8 changed files with 98 additions and 79 deletions

View file

@ -15,6 +15,7 @@
using namespace DebugStateType;
DebugStateImpl& DebugState = *Common::Singleton<DebugStateImpl>::Instance();
extern std::unique_ptr<AmdGpu::Liverpool> liverpool;
static ThreadID ThisThreadID() {
#ifdef _WIN32
@ -142,8 +143,7 @@ void DebugStateImpl::PushQueueDump(QueueDump dump) {
frame.queues.push_back(std::move(dump));
}
void DebugStateImpl::PushRegsDump(uintptr_t base_addr, uintptr_t header_addr,
const AmdGpu::Liverpool::Regs& regs, bool is_compute) {
void DebugStateImpl::PushRegsDump(uintptr_t base_addr, uintptr_t header_addr, bool is_compute) {
std::scoped_lock lock{frame_dump_list_mutex};
const auto it = waiting_reg_dumps.find(header_addr);
if (it == waiting_reg_dumps.end()) {
@ -153,18 +153,19 @@ void DebugStateImpl::PushRegsDump(uintptr_t base_addr, uintptr_t header_addr,
waiting_reg_dumps.erase(it);
waiting_reg_dumps_dbg.erase(waiting_reg_dumps_dbg.find(header_addr));
auto& dump = frame.regs[header_addr - base_addr];
dump.regs = regs;
dump.regs = liverpool->regs;
if (is_compute) {
dump.is_compute = true;
const auto& cs = dump.regs.cs_program;
auto& cs = dump.regs.cs_program;
cs = liverpool->GetCsRegs();
dump.cs_data = PipelineComputerProgramDump{
.cs_program = cs,
.code = std::vector<u32>{cs.Code().begin(), cs.Code().end()},
};
} else {
for (int i = 0; i < RegDump::MaxShaderStages; i++) {
if (regs.stage_enable.IsStageEnabled(i)) {
auto stage = regs.ProgramForStage(i);
if (dump.regs.stage_enable.IsStageEnabled(i)) {
auto stage = dump.regs.ProgramForStage(i);
if (stage->address_lo != 0) {
auto code = stage->Code();
dump.stages[i] = PipelineShaderProgramDump{

View file

@ -11,7 +11,6 @@
#include <queue>
#include "common/types.h"
#include "video_core/amdgpu/liverpool.h"
#include "video_core/renderer_vulkan/vk_graphics_pipeline.h"
#ifdef _WIN32
@ -203,8 +202,7 @@ public:
void PushQueueDump(QueueDump dump);
void PushRegsDump(uintptr_t base_addr, uintptr_t header_addr,
const AmdGpu::Liverpool::Regs& regs, bool is_compute = false);
void PushRegsDump(uintptr_t base_addr, uintptr_t header_addr, bool is_compute = false);
void CollectShader(const std::string& name, Shader::LogicalStage l_stage,
vk::ShaderModule module, std::span<const u32> spv,

View file

@ -29,32 +29,24 @@ static_assert(Liverpool::NumComputeRings <= MAX_NAMES);
static const char* acb_task_name[] = NAME_ARRAY(ACB_TASK, MAX_NAMES);
#define YIELD_CE() \
mapped_queues[GfxQueueId].cs_state = regs.cs_program; \
#define YIELD(name) \
FIBER_EXIT; \
co_yield {}; \
FIBER_ENTER(ccb_task_name); \
regs.cs_program = mapped_queues[GfxQueueId].cs_state
FIBER_ENTER(name);
#define YIELD_GFX \
mapped_queues[GfxQueueId].cs_state = regs.cs_program; \
FIBER_EXIT; \
co_yield {}; \
FIBER_ENTER(dcb_task_name); \
regs.cs_program = mapped_queues[GfxQueueId].cs_state;
#define YIELD_ASC(id) \
mapped_queues[id + 1].cs_state = regs.cs_program; \
FIBER_EXIT; \
co_yield {}; \
FIBER_ENTER(acb_task_name[id]); \
regs.cs_program = mapped_queues[id + 1].cs_state;
#define YIELD_CE() YIELD(ccb_task_name)
#define YIELD_GFX() YIELD(dcb_task_name)
#define YIELD_ASC(id) YIELD(acb_task_name[id])
#define RESUME(task, name) \
FIBER_EXIT; \
task.handle.resume(); \
FIBER_ENTER(name);
#define RESUME_CE(task) RESUME(task, ccb_task_name)
#define RESUME_GFX(task) RESUME(task, dcb_task_name)
#define RESUME_ASC(task, id) RESUME(task, acb_task_name[id])
std::array<u8, 48_KB> Liverpool::ConstantEngine::constants_heap;
static std::span<const u32> NextPacket(std::span<const u32> span, size_t offset) {
@ -198,10 +190,11 @@ Liverpool::Task Liverpool::ProcessCeUpdate(std::span<const u32> ccb) {
const auto* indirect_buffer = reinterpret_cast<const PM4CmdIndirectBuffer*>(header);
auto task =
ProcessCeUpdate({indirect_buffer->Address<const u32>(), indirect_buffer->ib_size});
RESUME_CE(task);
while (!task.handle.done()) {
YIELD_CE();
RESUME(task, ccb_task_name);
RESUME_CE(task);
}
break;
}
@ -228,6 +221,7 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
if (!ccb.empty()) {
// In case of CCB provided kick off CE asap to have the constant heap ready to use
ce_task = ProcessCeUpdate(ccb);
RESUME_GFX(ce_task);
}
const auto base_addr = reinterpret_cast<uintptr_t>(dcb.data());
@ -404,7 +398,7 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
regs.num_indices = draw_index->index_count;
regs.draw_initiator = draw_index->draw_initiator;
if (DebugState.DumpingCurrentReg()) {
DebugState.PushRegsDump(base_addr, reinterpret_cast<uintptr_t>(header), regs);
DebugState.PushRegsDump(base_addr, reinterpret_cast<uintptr_t>(header));
}
if (rasterizer) {
const auto cmd_address = reinterpret_cast<const void*>(header);
@ -421,7 +415,7 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
regs.num_indices = draw_index_off->index_count;
regs.draw_initiator = draw_index_off->draw_initiator;
if (DebugState.DumpingCurrentReg()) {
DebugState.PushRegsDump(base_addr, reinterpret_cast<uintptr_t>(header), regs);
DebugState.PushRegsDump(base_addr, reinterpret_cast<uintptr_t>(header));
}
if (rasterizer) {
const auto cmd_address = reinterpret_cast<const void*>(header);
@ -437,7 +431,7 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
regs.num_indices = draw_index->index_count;
regs.draw_initiator = draw_index->draw_initiator;
if (DebugState.DumpingCurrentReg()) {
DebugState.PushRegsDump(base_addr, reinterpret_cast<uintptr_t>(header), regs);
DebugState.PushRegsDump(base_addr, reinterpret_cast<uintptr_t>(header));
}
if (rasterizer) {
const auto cmd_address = reinterpret_cast<const void*>(header);
@ -453,7 +447,7 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
const auto ib_address = mapped_queues[GfxQueueId].indirect_args_addr;
const auto size = sizeof(DrawIndirectArgs);
if (DebugState.DumpingCurrentReg()) {
DebugState.PushRegsDump(base_addr, reinterpret_cast<uintptr_t>(header), regs);
DebugState.PushRegsDump(base_addr, reinterpret_cast<uintptr_t>(header));
}
if (rasterizer) {
const auto cmd_address = reinterpret_cast<const void*>(header);
@ -470,7 +464,7 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
const auto ib_address = mapped_queues[GfxQueueId].indirect_args_addr;
const auto size = sizeof(DrawIndexedIndirectArgs);
if (DebugState.DumpingCurrentReg()) {
DebugState.PushRegsDump(base_addr, reinterpret_cast<uintptr_t>(header), regs);
DebugState.PushRegsDump(base_addr, reinterpret_cast<uintptr_t>(header));
}
if (rasterizer) {
const auto cmd_address = reinterpret_cast<const void*>(header);
@ -487,7 +481,7 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
const auto offset = draw_index_indirect->data_offset;
const auto ib_address = mapped_queues[GfxQueueId].indirect_args_addr;
if (DebugState.DumpingCurrentReg()) {
DebugState.PushRegsDump(base_addr, reinterpret_cast<uintptr_t>(header), regs);
DebugState.PushRegsDump(base_addr, reinterpret_cast<uintptr_t>(header));
}
if (rasterizer) {
const auto cmd_address = reinterpret_cast<const void*>(header);
@ -502,15 +496,16 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
}
case PM4ItOpcode::DispatchDirect: {
const auto* dispatch_direct = reinterpret_cast<const PM4CmdDispatchDirect*>(header);
regs.cs_program.dim_x = dispatch_direct->dim_x;
regs.cs_program.dim_y = dispatch_direct->dim_y;
regs.cs_program.dim_z = dispatch_direct->dim_z;
regs.cs_program.dispatch_initiator = dispatch_direct->dispatch_initiator;
SaveDispatchContext();
auto& cs_program = GetCsRegs();
cs_program.dim_x = dispatch_direct->dim_x;
cs_program.dim_y = dispatch_direct->dim_y;
cs_program.dim_z = dispatch_direct->dim_z;
cs_program.dispatch_initiator = dispatch_direct->dispatch_initiator;
if (DebugState.DumpingCurrentReg()) {
DebugState.PushRegsDump(base_addr, reinterpret_cast<uintptr_t>(header), regs,
true);
DebugState.PushRegsDump(base_addr, reinterpret_cast<uintptr_t>(header), true);
}
if (rasterizer && (regs.cs_program.dispatch_initiator & 1)) {
if (rasterizer && (cs_program.dispatch_initiator & 1)) {
const auto cmd_address = reinterpret_cast<const void*>(header);
rasterizer->ScopeMarkerBegin(fmt::format("dcb:{}:Dispatch", cmd_address));
rasterizer->DispatchDirect();
@ -521,14 +516,15 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
case PM4ItOpcode::DispatchIndirect: {
const auto* dispatch_indirect =
reinterpret_cast<const PM4CmdDispatchIndirect*>(header);
SaveDispatchContext();
auto& cs_program = GetCsRegs();
const auto offset = dispatch_indirect->data_offset;
const auto ib_address = mapped_queues[GfxQueueId].indirect_args_addr;
const auto size = sizeof(PM4CmdDispatchIndirect::GroupDimensions);
if (DebugState.DumpingCurrentReg()) {
DebugState.PushRegsDump(base_addr, reinterpret_cast<uintptr_t>(header), regs,
true);
DebugState.PushRegsDump(base_addr, reinterpret_cast<uintptr_t>(header), true);
}
if (rasterizer && (regs.cs_program.dispatch_initiator & 1)) {
if (rasterizer && (cs_program.dispatch_initiator & 1)) {
const auto cmd_address = reinterpret_cast<const void*>(header);
rasterizer->ScopeMarkerBegin(
fmt::format("dcb:{}:DispatchIndirect", cmd_address));
@ -641,7 +637,7 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
case PM4ItOpcode::Rewind: {
const PM4CmdRewind* rewind = reinterpret_cast<const PM4CmdRewind*>(header);
while (!rewind->Valid()) {
YIELD_GFX;
YIELD_GFX();
}
break;
}
@ -657,7 +653,7 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
vo_port->WaitVoLabel([&] { return wait_reg_mem->Test(); });
}
while (!wait_reg_mem->Test()) {
YIELD_GFX;
YIELD_GFX();
}
break;
}
@ -665,11 +661,11 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
const auto* indirect_buffer = reinterpret_cast<const PM4CmdIndirectBuffer*>(header);
auto task = ProcessGraphics(
{indirect_buffer->Address<const u32>(), indirect_buffer->ib_size}, {});
RESUME(task, dcb_task_name);
RESUME_GFX(task);
while (!task.handle.done()) {
YIELD_GFX;
RESUME(task, dcb_task_name);
YIELD_GFX();
RESUME_GFX(task);
}
break;
}
@ -679,7 +675,7 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span<const u32> dcb, std::span<c
}
case PM4ItOpcode::WaitOnCeCounter: {
while (cblock.ce_count <= cblock.de_count) {
RESUME(ce_task, dcb_task_name);
RESUME_GFX(ce_task);
}
break;
}
@ -732,11 +728,11 @@ Liverpool::Task Liverpool::ProcessCompute(std::span<const u32> acb, u32 vqid) {
const auto* indirect_buffer = reinterpret_cast<const PM4CmdIndirectBuffer*>(header);
auto task = ProcessCompute<true>(
{indirect_buffer->Address<const u32>(), indirect_buffer->ib_size}, vqid);
RESUME(task, acb_task_name[vqid]);
RESUME_ASC(task, vqid);
while (!task.handle.done()) {
YIELD_ASC(vqid);
RESUME(task, acb_task_name[vqid]);
RESUME_ASC(task, vqid);
}
break;
}
@ -781,20 +777,32 @@ Liverpool::Task Liverpool::ProcessCompute(std::span<const u32> acb, u32 vqid) {
}
case PM4ItOpcode::SetShReg: {
const auto* set_data = reinterpret_cast<const PM4CmdSetData*>(header);
std::memcpy(&regs.reg_array[ShRegWordOffset + set_data->reg_offset], header + 2,
(count - 1) * sizeof(u32));
const auto set_size = (count - 1) * sizeof(u32);
if (set_data->reg_offset >= 0x200 &&
set_data->reg_offset <= (0x200 + sizeof(ComputeProgram) / 4)) {
ASSERT(set_size <= sizeof(ComputeProgram));
auto* addr =
reinterpret_cast<u32*>(&asc_sh_regs[vqid]) + (set_data->reg_offset - 0x200);
std::memcpy(addr, header + 2, set_size);
} else {
std::memcpy(&regs.reg_array[ShRegWordOffset + set_data->reg_offset], header + 2,
set_size);
}
break;
}
case PM4ItOpcode::DispatchDirect: {
const auto* dispatch_direct = reinterpret_cast<const PM4CmdDispatchDirect*>(header);
regs.cs_program.dim_x = dispatch_direct->dim_x;
regs.cs_program.dim_y = dispatch_direct->dim_y;
regs.cs_program.dim_z = dispatch_direct->dim_z;
regs.cs_program.dispatch_initiator = dispatch_direct->dispatch_initiator;
SaveDispatchContext(vqid);
auto& cs_program = GetCsRegs();
cs_program.dim_x = dispatch_direct->dim_x;
cs_program.dim_y = dispatch_direct->dim_y;
cs_program.dim_z = dispatch_direct->dim_z;
cs_program.dispatch_initiator = dispatch_direct->dispatch_initiator;
if (DebugState.DumpingCurrentReg()) {
DebugState.PushRegsDump(base_addr, reinterpret_cast<uintptr_t>(header), regs, true);
DebugState.PushRegsDump(base_addr, reinterpret_cast<uintptr_t>(header), true);
}
if (rasterizer && (regs.cs_program.dispatch_initiator & 1)) {
if (rasterizer && (cs_program.dispatch_initiator & 1)) {
const auto cmd_address = reinterpret_cast<const void*>(header);
rasterizer->ScopeMarkerBegin(fmt::format("acb[{}]:{}:Dispatch", vqid, cmd_address));
rasterizer->DispatchDirect();

View file

@ -1279,6 +1279,7 @@ struct Liverpool {
};
Regs regs{};
std::array<ComputeProgram, NumComputeRings> asc_sh_regs{};
// See for a comment in context reg parsing code
union CbDbExtent {
@ -1343,6 +1344,10 @@ public:
gfx_queue.dcb_buffer.reserve(GfxReservedSize);
}
// Returns the compute-shader (CS) register block currently selected for
// dispatch. This is whichever set the most recent SaveDispatchContext()
// call installed: the graphics ring's regs.cs_program, or one of the
// per-ASC-ring shadow copies in asc_sh_regs. Defaults to regs.cs_program
// (see curr_cs_regs's initializer) before any dispatch context is saved.
inline ComputeProgram& GetCsRegs() {
return *curr_cs_regs;
}
struct AscQueueInfo {
VAddr map_addr;
u32* read_addr;
@ -1393,6 +1398,14 @@ private:
void Process(std::stop_token stoken);
// Selects the graphics ring's CS register block (regs.cs_program) as the
// source for subsequent GetCsRegs() calls. Invoked from the DCB dispatch
// handlers before programming dispatch dimensions.
inline void SaveDispatchContext() {
curr_cs_regs = &regs.cs_program;
}
// ASC overload: selects the shadow SH register copy for compute ring
// `vqid` (asc_sh_regs[vqid]), which the ACB SetShReg handler populates,
// so each async-compute ring dispatches from its own CS state rather
// than the shared graphics copy.
inline void SaveDispatchContext(u32 vqid) {
curr_cs_regs = &asc_sh_regs[vqid];
}
struct GpuQueue {
std::mutex m_access{};
std::atomic<u32> dcb_buffer_offset;
@ -1400,7 +1413,6 @@ private:
std::vector<u32> dcb_buffer;
std::vector<u32> ccb_buffer;
std::queue<Task::Handle> submits{};
ComputeProgram cs_state{};
VAddr indirect_args_addr{};
};
std::array<GpuQueue, NumTotalQueues> mapped_queues{};
@ -1433,6 +1445,7 @@ private:
std::mutex submit_mutex;
std::condition_variable_any submit_cv;
std::queue<Common::UniqueFunction<void>> command_queue{};
ComputeProgram* curr_cs_regs{&regs.cs_program};
};
static_assert(GFX6_3D_REG_INDEX(ps_program) == 0x2C08);

View file

@ -172,10 +172,10 @@ Shader::RuntimeInfo PipelineCache::BuildRuntimeInfo(Stage stage, LogicalStage l_
}
break;
}
case Stage::Compute: {
const auto& cs_pgm = regs.cs_program;
case Shader::Stage::Compute: {
const auto& cs_pgm = liverpool->GetCsRegs();
info.num_user_data = cs_pgm.settings.num_user_regs;
info.num_allocated_vgprs = regs.cs_program.settings.num_vgprs * 4;
info.num_allocated_vgprs = cs_pgm.settings.num_vgprs * 4;
info.cs_info.workgroup_size = {cs_pgm.num_thread_x.full, cs_pgm.num_thread_y.full,
cs_pgm.num_thread_z.full};
info.cs_info.tgid_enable = {cs_pgm.IsTgidEnabled(0), cs_pgm.IsTgidEnabled(1),
@ -476,8 +476,8 @@ bool PipelineCache::RefreshGraphicsKey() {
bool PipelineCache::RefreshComputeKey() {
Shader::Backend::Bindings binding{};
const auto* cs_pgm = &liverpool->regs.cs_program;
const auto cs_params = Liverpool::GetParams(*cs_pgm);
const auto& cs_pgm = liverpool->GetCsRegs();
const auto cs_params = Liverpool::GetParams(cs_pgm);
std::tie(infos[0], modules[0], fetch_shader, compute_key.value) =
GetProgram(Shader::Stage::Compute, LogicalStage::Compute, cs_params, binding);
return true;
@ -529,6 +529,7 @@ PipelineCache::Result PipelineCache::GetProgram(Stage stage, LogicalStage l_stag
return std::make_tuple(&program->info, module, spec.fetch_shader_data,
HashCombine(params.hash, 0));
}
it_pgm.value()->info.user_data = params.user_data;
auto& program = it_pgm.value();
auto& info = program->info;

View file

@ -317,14 +317,14 @@ void Rasterizer::DrawIndirect(bool is_indexed, VAddr arg_address, u32 offset, u3
void Rasterizer::DispatchDirect() {
RENDERER_TRACE;
const auto& cs_program = liverpool->regs.cs_program;
const auto& cs_program = liverpool->GetCsRegs();
const ComputePipeline* pipeline = pipeline_cache.GetComputePipeline();
if (!pipeline) {
return;
}
const auto& cs = pipeline->GetStage(Shader::LogicalStage::Compute);
if (ExecuteShaderHLE(cs, liverpool->regs, *this)) {
if (ExecuteShaderHLE(cs, *this)) {
return;
}
@ -344,7 +344,7 @@ void Rasterizer::DispatchDirect() {
void Rasterizer::DispatchIndirect(VAddr address, u32 offset, u32 size) {
RENDERER_TRACE;
const auto& cs_program = liverpool->regs.cs_program;
const auto& cs_program = liverpool->GetCsRegs();
const ComputePipeline* pipeline = pipeline_cache.GetComputePipeline();
if (!pipeline) {
return;

View file

@ -5,16 +5,16 @@
#include "video_core/renderer_vulkan/vk_scheduler.h"
#include "video_core/renderer_vulkan/vk_shader_hle.h"
#include "vk_rasterizer.h"
extern std::unique_ptr<AmdGpu::Liverpool> liverpool;
namespace Vulkan {
static constexpr u64 COPY_SHADER_HASH = 0xfefebf9f;
bool ExecuteCopyShaderHLE(const Shader::Info& info, const AmdGpu::Liverpool::Regs& regs,
Rasterizer& rasterizer) {
bool ExecuteCopyShaderHLE(const Shader::Info& info, Rasterizer& rasterizer) {
auto& scheduler = rasterizer.GetScheduler();
auto& buffer_cache = rasterizer.GetBufferCache();
const auto& cs_program = liverpool->GetCsRegs();
// Copy shader defines three formatted buffers as inputs: control, source, and destination.
const auto ctl_buf_sharp = info.texture_buffers[0].GetSharp(info);
@ -34,9 +34,9 @@ bool ExecuteCopyShaderHLE(const Shader::Info& info, const AmdGpu::Liverpool::Reg
static std::vector<vk::BufferCopy> copies;
copies.clear();
copies.reserve(regs.cs_program.dim_x);
copies.reserve(cs_program.dim_x);
for (u32 i = 0; i < regs.cs_program.dim_x; i++) {
for (u32 i = 0; i < cs_program.dim_x; i++) {
const auto& [dst_idx, src_idx, end] = ctl_buf[i];
const u32 local_dst_offset = dst_idx * buf_stride;
const u32 local_src_offset = src_idx * buf_stride;
@ -121,11 +121,10 @@ bool ExecuteCopyShaderHLE(const Shader::Info& info, const AmdGpu::Liverpool::Reg
return true;
}
bool ExecuteShaderHLE(const Shader::Info& info, const AmdGpu::Liverpool::Regs& regs,
Rasterizer& rasterizer) {
bool ExecuteShaderHLE(const Shader::Info& info, Rasterizer& rasterizer) {
switch (info.pgm_hash) {
case COPY_SHADER_HASH:
return ExecuteCopyShaderHLE(info, regs, rasterizer);
return ExecuteCopyShaderHLE(info, rasterizer);
default:
return false;
}

View file

@ -3,7 +3,7 @@
#pragma once
#include "video_core/amdgpu/liverpool.h"
#include "video_core/renderer_vulkan/vk_rasterizer.h"
namespace Shader {
struct Info;
@ -14,7 +14,6 @@ namespace Vulkan {
class Rasterizer;
/// Attempts to execute a shader using HLE if possible.
bool ExecuteShaderHLE(const Shader::Info& info, const AmdGpu::Liverpool::Regs& regs,
Rasterizer& rasterizer);
bool ExecuteShaderHLE(const Shader::Info& info, Rasterizer& rasterizer);
} // namespace Vulkan