From 01bfb3681fc743b0e3223a543b2c7b1da7dc9a8f Mon Sep 17 00:00:00 2001 From: TheTurtle <47210458+raphaelthegreat@users.noreply.github.com> Date: Sat, 19 Oct 2024 15:30:58 +0300 Subject: [PATCH] renderer_vulkan: Commonize and adjust buffer bindings (#1412) * shader_recompiler: Implement finite cmp class * shader_recompiler: Implement more opcodes * renderer_vulkan: Commonize buffer binding * liverpool: More dma data impl * fix * copy_shader: Handle additional instructions from Knack * translator: Add V_CMPX_GE_I32 --- src/core/address_space.h | 4 + src/core/memory.cpp | 11 ++ src/core/memory.h | 2 + .../frontend/copy_shader.cpp | 8 + .../frontend/translate/scalar_alu.cpp | 24 ++- .../frontend/translate/translate.h | 4 +- .../frontend/translate/vector_alu.cpp | 14 ++ .../ir/passes/constant_propagation_pass.cpp | 32 +--- .../ir/passes/resource_tracking_pass.cpp | 2 +- src/shader_recompiler/ir/reg.h | 2 + src/video_core/amdgpu/liverpool.cpp | 70 +++++++- src/video_core/amdgpu/pm4_cmds.h | 31 +++- src/video_core/buffer_cache/buffer.h | 1 + src/video_core/buffer_cache/buffer_cache.cpp | 70 ++++---- src/video_core/buffer_cache/buffer_cache.h | 23 +-- .../renderer_vulkan/vk_compute_pipeline.cpp | 159 ++++-------------- .../renderer_vulkan/vk_graphics_pipeline.cpp | 143 +++------------- .../renderer_vulkan/vk_pipeline_common.cpp | 146 +++++++++++++++- .../renderer_vulkan/vk_pipeline_common.h | 9 + .../renderer_vulkan/vk_rasterizer.cpp | 18 +- .../renderer_vulkan/vk_rasterizer.h | 2 +- src/video_core/texture_cache/sampler.cpp | 3 + .../texture_cache/texture_cache.cpp | 2 +- 23 files changed, 438 insertions(+), 342 deletions(-) diff --git a/src/core/address_space.h b/src/core/address_space.h index 3233c758..7ccc2cd1 100644 --- a/src/core/address_space.h +++ b/src/core/address_space.h @@ -45,6 +45,10 @@ public: explicit AddressSpace(); ~AddressSpace(); + [[nodiscard]] u8* BackingBase() const noexcept { + return backing_base; + } + [[nodiscard]] VAddr SystemManagedVirtualBase() 
noexcept { return reinterpret_cast(system_managed_base); } diff --git a/src/core/memory.cpp b/src/core/memory.cpp index 471ba585..031b7b13 100644 --- a/src/core/memory.cpp +++ b/src/core/memory.cpp @@ -54,6 +54,17 @@ void MemoryManager::SetupMemoryRegions(u64 flexible_size) { total_flexible_size, total_direct_size); } +bool MemoryManager::TryWriteBacking(void* address, const void* data, u32 num_bytes) { + const VAddr virtual_addr = std::bit_cast(address); + const auto& vma = FindVMA(virtual_addr)->second; + if (vma.type != VMAType::Direct) { + return false; + } + u8* backing = impl.BackingBase() + vma.phys_base + (virtual_addr - vma.base); + memcpy(backing, data, num_bytes); + return true; +} + PAddr MemoryManager::PoolExpand(PAddr search_start, PAddr search_end, size_t size, u64 alignment) { std::scoped_lock lk{mutex}; diff --git a/src/core/memory.h b/src/core/memory.h index 320aa367..286f1c97 100644 --- a/src/core/memory.h +++ b/src/core/memory.h @@ -149,6 +149,8 @@ public: return impl.SystemReservedVirtualBase(); } + bool TryWriteBacking(void* address, const void* data, u32 num_bytes); + void SetupMemoryRegions(u64 flexible_size); PAddr PoolExpand(PAddr search_start, PAddr search_end, size_t size, u64 alignment); diff --git a/src/shader_recompiler/frontend/copy_shader.cpp b/src/shader_recompiler/frontend/copy_shader.cpp index b2c79566..8750e2b1 100644 --- a/src/shader_recompiler/frontend/copy_shader.cpp +++ b/src/shader_recompiler/frontend/copy_shader.cpp @@ -29,6 +29,14 @@ CopyShaderData ParseCopyShader(std::span code) { sources[inst.dst[0].code] = inst.control.sopk.simm; break; } + case Gcn::Opcode::S_MOV_B32: { + sources[inst.dst[0].code] = inst.src[0].code; + break; + } + case Gcn::Opcode::S_ADDK_I32: { + sources[inst.dst[0].code] += inst.control.sopk.simm; + break; + } case Gcn::Opcode::EXP: { const auto& exp = inst.control.exp; const IR::Attribute semantic = static_cast(exp.target); diff --git a/src/shader_recompiler/frontend/translate/scalar_alu.cpp 
b/src/shader_recompiler/frontend/translate/scalar_alu.cpp index 36c1ec85..1e627d95 100644 --- a/src/shader_recompiler/frontend/translate/scalar_alu.cpp +++ b/src/shader_recompiler/frontend/translate/scalar_alu.cpp @@ -92,8 +92,12 @@ void Translator::EmitScalarAlu(const GcnInst& inst) { break; case Opcode::S_BREV_B32: return S_BREV_B32(inst); + case Opcode::S_BCNT1_I32_B64: + return S_BCNT1_I32_B64(inst); case Opcode::S_AND_SAVEEXEC_B64: - return S_AND_SAVEEXEC_B64(inst); + return S_SAVEEXEC_B64(NegateMode::None, false, inst); + case Opcode::S_ORN2_SAVEEXEC_B64: + return S_SAVEEXEC_B64(NegateMode::Src1, true, inst); default: LogMissingOpcode(inst); } @@ -540,11 +544,17 @@ void Translator::S_BREV_B32(const GcnInst& inst) { SetDst(inst.dst[0], ir.BitReverse(GetSrc(inst.src[0]))); } -void Translator::S_AND_SAVEEXEC_B64(const GcnInst& inst) { +void Translator::S_BCNT1_I32_B64(const GcnInst& inst) { + const IR::U32 result = ir.BitCount(GetSrc(inst.src[0])); + SetDst(inst.dst[0], result); + ir.SetScc(ir.INotEqual(result, ir.Imm32(0))); +} + +void Translator::S_SAVEEXEC_B64(NegateMode negate, bool is_or, const GcnInst& inst) { // This instruction normally operates on 64-bit data (EXEC, VCC, SGPRs) // However here we flatten it to 1-bit EXEC and 1-bit VCC. For the destination // SGPR we have a special IR opcode for SPGRs that act as thread masks. - const IR::U1 exec{ir.GetExec()}; + IR::U1 exec{ir.GetExec()}; const IR::U1 src = [&] { switch (inst.src[0].field) { case OperandField::VccLo: @@ -568,7 +578,13 @@ void Translator::S_AND_SAVEEXEC_B64(const GcnInst& inst) { } // Update EXEC. - const IR::U1 result = ir.LogicalAnd(exec, src); + if (negate == NegateMode::Src1) { + exec = ir.LogicalNot(exec); + } + IR::U1 result = is_or ? 
ir.LogicalOr(exec, src) : ir.LogicalAnd(exec, src); + if (negate == NegateMode::Result) { + result = ir.LogicalNot(result); + } ir.SetExec(result); ir.SetScc(result); } diff --git a/src/shader_recompiler/frontend/translate/translate.h b/src/shader_recompiler/frontend/translate/translate.h index b70d4b82..79bc33f0 100644 --- a/src/shader_recompiler/frontend/translate/translate.h +++ b/src/shader_recompiler/frontend/translate/translate.h @@ -108,8 +108,9 @@ public: void S_MOV_B64(const GcnInst& inst); void S_NOT_B64(const GcnInst& inst); void S_BREV_B32(const GcnInst& inst); + void S_BCNT1_I32_B64(const GcnInst& inst); void S_GETPC_B64(u32 pc, const GcnInst& inst); - void S_AND_SAVEEXEC_B64(const GcnInst& inst); + void S_SAVEEXEC_B64(NegateMode negate, bool is_or, const GcnInst& inst); // SOPC void S_CMP(ConditionOp cond, bool is_signed, const GcnInst& inst); @@ -225,6 +226,7 @@ public: void V_MED3_I32(const GcnInst& inst); void V_SAD(const GcnInst& inst); void V_SAD_U32(const GcnInst& inst); + void V_CVT_PK_U16_U32(const GcnInst& inst); void V_CVT_PK_U8_F32(const GcnInst& inst); void V_LSHL_B64(const GcnInst& inst); void V_MUL_F64(const GcnInst& inst); diff --git a/src/shader_recompiler/frontend/translate/vector_alu.cpp b/src/shader_recompiler/frontend/translate/vector_alu.cpp index 27969546..433f9dce 100644 --- a/src/shader_recompiler/frontend/translate/vector_alu.cpp +++ b/src/shader_recompiler/frontend/translate/vector_alu.cpp @@ -157,6 +157,8 @@ void Translator::EmitVectorAlu(const GcnInst& inst) { return V_RCP_F64(inst); case Opcode::V_RCP_IFLAG_F32: return V_RCP_F32(inst); + case Opcode::V_RCP_CLAMP_F32: + return V_RCP_F32(inst); case Opcode::V_RSQ_CLAMP_F32: return V_RSQ_F32(inst); case Opcode::V_RSQ_LEGACY_F32: @@ -268,6 +270,8 @@ void Translator::EmitVectorAlu(const GcnInst& inst) { return V_CMP_U32(ConditionOp::GT, true, true, inst); case Opcode::V_CMPX_LG_I32: return V_CMP_U32(ConditionOp::LG, true, true, inst); + case Opcode::V_CMPX_GE_I32: + return 
V_CMP_U32(ConditionOp::GE, true, true, inst); // V_CMP_{OP8}_U32 case Opcode::V_CMP_F_U32: @@ -355,6 +359,8 @@ void Translator::EmitVectorAlu(const GcnInst& inst) { return V_MED3_I32(inst); case Opcode::V_SAD_U32: return V_SAD_U32(inst); + case Opcode::V_CVT_PK_U16_U32: + return V_CVT_PK_U16_U32(inst); case Opcode::V_CVT_PK_U8_F32: return V_CVT_PK_U8_F32(inst); case Opcode::V_LSHL_B64: @@ -1108,6 +1114,14 @@ void Translator::V_SAD_U32(const GcnInst& inst) { SetDst(inst.dst[0], ir.IAdd(result, src2)); } +void Translator::V_CVT_PK_U16_U32(const GcnInst& inst) { + const IR::U32 src0{GetSrc(inst.src[0])}; + const IR::U32 src1{GetSrc(inst.src[1])}; + const IR::U32 lo = ir.IMin(src0, ir.Imm32(0xFFFF), false); + const IR::U32 hi = ir.IMin(src1, ir.Imm32(0xFFFF), false); + SetDst(inst.dst[0], ir.BitFieldInsert(lo, hi, ir.Imm32(16), ir.Imm32(16))); +} + void Translator::V_CVT_PK_U8_F32(const GcnInst& inst) { const IR::F32 src0{GetSrc(inst.src[0])}; const IR::U32 src1{GetSrc(inst.src[1])}; diff --git a/src/shader_recompiler/ir/passes/constant_propagation_pass.cpp b/src/shader_recompiler/ir/passes/constant_propagation_pass.cpp index 775aed5b..a03fe051 100644 --- a/src/shader_recompiler/ir/passes/constant_propagation_pass.cpp +++ b/src/shader_recompiler/ir/passes/constant_propagation_pass.cpp @@ -6,6 +6,7 @@ #include #include "common/func_traits.h" #include "shader_recompiler/ir/basic_block.h" +#include "shader_recompiler/ir/ir_emitter.h" namespace Shader::Optimization { @@ -215,36 +216,17 @@ void FoldAdd(IR::Block& block, IR::Inst& inst) { } } -template -bool IsArgImm(const IR::Inst& inst, u32 imm) { - const IR::Value& arg = inst.Arg(idx); - return arg.IsImmediate() && arg.U32() == imm; -}; - -void FoldBooleanConvert(IR::Inst& inst) { - // Eliminate pattern - // %4 = - // %5 = SelectU32 %4, #1, #0 (uses: 2) - // %8 = INotEqual %5, #0 (uses: 1) - if (!IsArgImm<1>(inst, 0)) { - return; - } - IR::Inst* prod = inst.Arg(0).TryInstRecursive(); - if (!prod || prod->GetOpcode() != 
IR::Opcode::SelectU32) { - return; - } - if (IsArgImm<1>(*prod, 1) && IsArgImm<2>(*prod, 0)) { - inst.ReplaceUsesWith(prod->Arg(0)); - } -} - -void FoldCmpClass(IR::Inst& inst) { +void FoldCmpClass(IR::Block& block, IR::Inst& inst) { ASSERT_MSG(inst.Arg(1).IsImmediate(), "Unable to resolve compare operation"); const auto class_mask = static_cast(inst.Arg(1).U32()); if ((class_mask & IR::FloatClassFunc::NaN) == IR::FloatClassFunc::NaN) { inst.ReplaceOpcode(IR::Opcode::FPIsNan32); } else if ((class_mask & IR::FloatClassFunc::Infinity) == IR::FloatClassFunc::Infinity) { inst.ReplaceOpcode(IR::Opcode::FPIsInf32); + } else if ((class_mask & IR::FloatClassFunc::Finite) == IR::FloatClassFunc::Finite) { + IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; + const IR::F32 value = IR::F32{inst.Arg(0)}; + inst.ReplaceUsesWith(ir.LogicalNot(ir.LogicalOr(ir.FPIsInf(value), ir.FPIsInf(value)))); } else { UNREACHABLE(); } @@ -276,7 +258,7 @@ void ConstantPropagation(IR::Block& block, IR::Inst& inst) { FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a * b; }); return; case IR::Opcode::FPCmpClass32: - FoldCmpClass(inst); + FoldCmpClass(block, inst); return; case IR::Opcode::ShiftLeftLogical32: FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return static_cast(a << b); }); diff --git a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp index 0d91badd..aa05d3ae 100644 --- a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp +++ b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp @@ -605,7 +605,7 @@ void PatchImageSampleInstruction(IR::Block& block, IR::Inst& inst, Info& info, : IR::F32{}; const IR::F32 lod_clamp = inst_info.has_lod_clamp ? 
get_addr_reg(addr_reg++) : IR::F32{}; - const auto new_inst = [&] -> IR::Value { + auto new_inst = [&] -> IR::Value { if (inst_info.is_gather) { if (inst_info.is_depth) { return ir.ImageGatherDref(handle, coords, offset, dref, inst_info); diff --git a/src/shader_recompiler/ir/reg.h b/src/shader_recompiler/ir/reg.h index d7c0b1db..3004d2b8 100644 --- a/src/shader_recompiler/ir/reg.h +++ b/src/shader_recompiler/ir/reg.h @@ -24,6 +24,8 @@ enum class FloatClassFunc : u32 { NaN = SignalingNan | QuietNan, Infinity = PositiveInfinity | NegativeInfinity, + Finite = NegativeNormal | NegativeDenorm | NegativeZero | PositiveNormal | PositiveDenorm | + PositiveZero, }; DECLARE_ENUM_FLAG_OPERATORS(FloatClassFunc) diff --git a/src/video_core/amdgpu/liverpool.cpp b/src/video_core/amdgpu/liverpool.cpp index 7b472708..53aab630 100644 --- a/src/video_core/amdgpu/liverpool.cpp +++ b/src/video_core/amdgpu/liverpool.cpp @@ -8,6 +8,7 @@ #include "common/thread.h" #include "core/debug_state.h" #include "core/libraries/videoout/driver.h" +#include "core/memory.h" #include "video_core/amdgpu/liverpool.h" #include "video_core/amdgpu/pm4_cmds.h" #include "video_core/renderdoc.h" @@ -504,7 +505,12 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::span(header); - event_eos->SignalFence(); + event_eos->SignalFence([](void* address, u64 data, u32 num_bytes) { + auto* memory = Core::Memory::Instance(); + if (!memory->TryWriteBacking(address, &data, num_bytes)) { + memcpy(address, &data, num_bytes); + } + }); if (event_eos->command == PM4CmdEventWriteEos::Command::GdsStore) { ASSERT(event_eos->size == 1); if (rasterizer) { @@ -517,13 +523,42 @@ Liverpool::Task Liverpool::ProcessGraphics(std::span dcb, std::span(header); - event_eop->SignalFence(); + event_eop->SignalFence([](void* address, u64 data, u32 num_bytes) { + auto* memory = Core::Memory::Instance(); + if (!memory->TryWriteBacking(address, &data, num_bytes)) { + memcpy(address, &data, num_bytes); + } + }); break; } case 
PM4ItOpcode::DmaData: { const auto* dma_data = reinterpret_cast(header); + if (dma_data->dst_addr_lo == 0x3022C) { + break; + } if (dma_data->src_sel == DmaDataSrc::Data && dma_data->dst_sel == DmaDataDst::Gds) { - rasterizer->InlineDataToGds(dma_data->dst_addr_lo, dma_data->data); + rasterizer->InlineData(dma_data->dst_addr_lo, &dma_data->data, sizeof(u32), + true); + } else if (dma_data->src_sel == DmaDataSrc::Memory && + dma_data->dst_sel == DmaDataDst::Gds) { + rasterizer->InlineData(dma_data->dst_addr_lo, + dma_data->SrcAddress(), + dma_data->NumBytes(), true); + } else if (dma_data->src_sel == DmaDataSrc::Data && + dma_data->dst_sel == DmaDataDst::Memory) { + rasterizer->InlineData(dma_data->DstAddress(), &dma_data->data, + sizeof(u32), false); + } else if (dma_data->src_sel == DmaDataSrc::Gds && + dma_data->dst_sel == DmaDataDst::Memory) { + LOG_WARNING(Render_Vulkan, "GDS memory read"); + } else if (dma_data->src_sel == DmaDataSrc::Memory && + dma_data->dst_sel == DmaDataDst::Memory) { + rasterizer->InlineData(dma_data->DstAddress(), + dma_data->SrcAddress(), + dma_data->NumBytes(), false); + } else { + UNREACHABLE_MSG("WriteData src_sel = {}, dst_sel = {}", + u32(dma_data->src_sel.Value()), u32(dma_data->dst_sel.Value())); } break; } @@ -631,6 +666,35 @@ Liverpool::Task Liverpool::ProcessCompute(std::span acb, int vqid) { }; break; } + case PM4ItOpcode::DmaData: { + const auto* dma_data = reinterpret_cast(header); + if (dma_data->dst_addr_lo == 0x3022C) { + break; + } + if (dma_data->src_sel == DmaDataSrc::Data && dma_data->dst_sel == DmaDataDst::Gds) { + rasterizer->InlineData(dma_data->dst_addr_lo, &dma_data->data, sizeof(u32), true); + } else if (dma_data->src_sel == DmaDataSrc::Memory && + dma_data->dst_sel == DmaDataDst::Gds) { + rasterizer->InlineData(dma_data->dst_addr_lo, dma_data->SrcAddress(), + dma_data->NumBytes(), true); + } else if (dma_data->src_sel == DmaDataSrc::Data && + dma_data->dst_sel == DmaDataDst::Memory) { + 
rasterizer->InlineData(dma_data->DstAddress(), &dma_data->data, sizeof(u32), + false); + } else if (dma_data->src_sel == DmaDataSrc::Gds && + dma_data->dst_sel == DmaDataDst::Memory) { + LOG_WARNING(Render_Vulkan, "GDS memory read"); + } else if (dma_data->src_sel == DmaDataSrc::Memory && + dma_data->dst_sel == DmaDataDst::Memory) { + rasterizer->InlineData(dma_data->DstAddress(), + dma_data->SrcAddress(), dma_data->NumBytes(), + false); + } else { + UNREACHABLE_MSG("WriteData src_sel = {}, dst_sel = {}", + u32(dma_data->src_sel.Value()), u32(dma_data->dst_sel.Value())); + } + break; + } case PM4ItOpcode::AcquireMem: { break; } diff --git a/src/video_core/amdgpu/pm4_cmds.h b/src/video_core/amdgpu/pm4_cmds.h index a7a862ea..a956b030 100644 --- a/src/video_core/amdgpu/pm4_cmds.h +++ b/src/video_core/amdgpu/pm4_cmds.h @@ -313,25 +313,26 @@ struct PM4CmdEventWriteEop { return data_lo | u64(data_hi) << 32; } - void SignalFence() const { + void SignalFence(auto&& write_mem) const { + u32* address = Address(); switch (data_sel.Value()) { case DataSelect::None: { break; } case DataSelect::Data32Low: { - *Address() = DataDWord(); + write_mem(address, DataDWord(), sizeof(u32)); break; } case DataSelect::Data64: { - *Address() = DataQWord(); + write_mem(address, DataQWord(), sizeof(u64)); break; } case DataSelect::GpuClock64: { - *Address() = GetGpuClock64(); + write_mem(address, GetGpuClock64(), sizeof(u64)); break; } case DataSelect::PerfCounter: { - *Address() = Common::FencedRDTSC(); + write_mem(address, Common::FencedRDTSC(), sizeof(u64)); break; } default: { @@ -401,6 +402,20 @@ struct PM4DmaData { u32 dst_addr_lo; u32 dst_addr_hi; u32 command; + + template + T SrcAddress() const { + return std::bit_cast(src_addr_lo | u64(src_addr_hi) << 32); + } + + template + T DstAddress() const { + return std::bit_cast(dst_addr_lo | u64(dst_addr_hi) << 32); + } + + u32 NumBytes() const noexcept { + return command & 0x1fffff; + } }; struct PM4CmdWaitRegMem { @@ -432,7 +447,7 @@ 
struct PM4CmdWaitRegMem { template T Address() const { - return reinterpret_cast((uintptr_t(poll_addr_hi) << 32) | poll_addr_lo); + return std::bit_cast((uintptr_t(poll_addr_hi) << 32) | poll_addr_lo); } bool Test() const { @@ -534,11 +549,11 @@ struct PM4CmdEventWriteEos { return this->data; } - void SignalFence() const { + void SignalFence(auto&& write_mem) const { const auto cmd = command.Value(); switch (cmd) { case Command::SignalFence: { - *Address() = DataDWord(); + write_mem(Address(), DataDWord(), sizeof(u32)); break; } case Command::GdsStore: { diff --git a/src/video_core/buffer_cache/buffer.h b/src/video_core/buffer_cache/buffer.h index 403d4ed8..f67278f6 100644 --- a/src/video_core/buffer_cache/buffer.h +++ b/src/video_core/buffer_cache/buffer.h @@ -142,6 +142,7 @@ public: VAddr cpu_addr = 0; bool is_picked{}; bool is_coherent{}; + bool is_deleted{}; int stream_score = 0; size_t size_bytes = 0; std::span mapped_data; diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp index c2993f3d..f665ba51 100644 --- a/src/video_core/buffer_cache/buffer_cache.cpp +++ b/src/video_core/buffer_cache/buffer_cache.cpp @@ -20,7 +20,7 @@ static constexpr size_t StagingBufferSize = 1_GB; static constexpr size_t UboStreamBufferSize = 64_MB; BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_, - const AmdGpu::Liverpool* liverpool_, TextureCache& texture_cache_, + AmdGpu::Liverpool* liverpool_, TextureCache& texture_cache_, PageManager& tracker_) : instance{instance_}, scheduler{scheduler_}, liverpool{liverpool_}, texture_cache{texture_cache_}, tracker{tracker_}, @@ -70,11 +70,10 @@ void BufferCache::InvalidateMemory(VAddr device_addr, u64 size) { void BufferCache::DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 size) { boost::container::small_vector copies; u64 total_size_bytes = 0; - u64 largest_copy = 0; memory_tracker.ForEachDownloadRange( device_addr, size, [&](u64 
device_addr_out, u64 range_size) { const VAddr buffer_addr = buffer.CpuAddr(); - const auto add_download = [&](VAddr start, VAddr end, u64) { + const auto add_download = [&](VAddr start, VAddr end) { const u64 new_offset = start - buffer_addr; const u64 new_size = end - start; copies.push_back(vk::BufferCopy{ @@ -82,12 +81,10 @@ void BufferCache::DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 si .dstOffset = total_size_bytes, .size = new_size, }); - // Align up to avoid cache conflicts - constexpr u64 align = 64ULL; - constexpr u64 mask = ~(align - 1ULL); - total_size_bytes += (new_size + align - 1) & mask; - largest_copy = std::max(largest_copy, new_size); + total_size_bytes += new_size; }; + gpu_modified_ranges.ForEachInRange(device_addr_out, range_size, add_download); + gpu_modified_ranges.Subtract(device_addr_out, range_size); }); if (total_size_bytes == 0) { return; @@ -181,6 +178,9 @@ bool BufferCache::BindVertexBuffers(const Shader::Info& vs_info) { .divisor = 1, }); } + if (ranges.empty()) { + return false; + } std::ranges::sort(ranges, [](const BufferRange& lhv, const BufferRange& rhv) { return lhv.base_address < rhv.base_address; @@ -269,48 +269,62 @@ u32 BufferCache::BindIndexBuffer(bool& is_indexed, u32 index_offset) { return regs.num_indices; } -void BufferCache::InlineDataToGds(u32 gds_offset, u32 value) { - ASSERT_MSG(gds_offset % 4 == 0, "GDS offset must be dword aligned"); +void BufferCache::InlineData(VAddr address, const void* value, u32 num_bytes, bool is_gds) { + ASSERT_MSG(address % 4 == 0, "GDS offset must be dword aligned"); + if (!is_gds && !IsRegionRegistered(address, num_bytes)) { + memcpy(std::bit_cast(address), value, num_bytes); + return; + } scheduler.EndRendering(); const auto cmdbuf = scheduler.CommandBuffer(); + const Buffer* buffer = [&] { + if (is_gds) { + return &gds_buffer; + } + const BufferId buffer_id = FindBuffer(address, num_bytes); + return &slot_buffers[buffer_id]; + }(); const vk::BufferMemoryBarrier2 
buf_barrier = { .srcStageMask = vk::PipelineStageFlagBits2::eTransfer, .srcAccessMask = vk::AccessFlagBits2::eTransferWrite, .dstStageMask = vk::PipelineStageFlagBits2::eAllCommands, .dstAccessMask = vk::AccessFlagBits2::eMemoryRead, - .buffer = gds_buffer.Handle(), - .offset = gds_offset, - .size = sizeof(u32), + .buffer = buffer->Handle(), + .offset = buffer->Offset(address), + .size = num_bytes, }; cmdbuf.pipelineBarrier2(vk::DependencyInfo{ .dependencyFlags = vk::DependencyFlagBits::eByRegion, .bufferMemoryBarrierCount = 1, .pBufferMemoryBarriers = &buf_barrier, }); - cmdbuf.updateBuffer(gds_buffer.Handle(), gds_offset, sizeof(u32), &value); + cmdbuf.updateBuffer(buffer->Handle(), buf_barrier.offset, num_bytes, value); } std::pair BufferCache::ObtainBuffer(VAddr device_addr, u32 size, bool is_written, - bool is_texel_buffer) { + bool is_texel_buffer, BufferId buffer_id) { + // For small uniform buffers that have not been modified by gpu + // use device local stream buffer to reduce renderpass breaks. static constexpr u64 StreamThreshold = CACHING_PAGESIZE; const bool is_gpu_dirty = memory_tracker.IsRegionGpuModified(device_addr, size); if (!is_written && size <= StreamThreshold && !is_gpu_dirty) { - // For small uniform buffers that have not been modified by gpu - // use device local stream buffer to reduce renderpass breaks. 
const u64 offset = stream_buffer.Copy(device_addr, size, instance.UniformMinAlignment()); return {&stream_buffer, offset}; } - const BufferId buffer_id = FindBuffer(device_addr, size); + if (!buffer_id || slot_buffers[buffer_id].is_deleted) { + buffer_id = FindBuffer(device_addr, size); + } Buffer& buffer = slot_buffers[buffer_id]; SynchronizeBuffer(buffer, device_addr, size, is_texel_buffer); if (is_written) { memory_tracker.MarkRegionAsGpuModified(device_addr, size); + gpu_modified_ranges.Add(device_addr, size); } return {&buffer, buffer.Offset(device_addr)}; } -std::pair BufferCache::ObtainTempBuffer(VAddr gpu_addr, u32 size) { +std::pair BufferCache::ObtainViewBuffer(VAddr gpu_addr, u32 size) { const u64 page = gpu_addr >> CACHING_PAGEBITS; const BufferId buffer_id = page_table[page]; if (buffer_id) { @@ -474,7 +488,7 @@ void BufferCache::JoinOverlap(BufferId new_buffer_id, BufferId overlap_id, cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eTransfer, vk::PipelineStageFlagBits::eAllCommands, vk::DependencyFlagBits::eByRegion, WRITE_BARRIER, {}, {}); - DeleteBuffer(overlap_id, true); + DeleteBuffer(overlap_id); } BufferId BufferCache::CreateBuffer(VAddr device_addr, u32 wanted_size) { @@ -529,7 +543,7 @@ void BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size, u64 total_size_bytes = 0; u64 largest_copy = 0; VAddr buffer_start = buffer.CpuAddr(); - const auto add_copy = [&](VAddr device_addr_out, u64 range_size) { + memory_tracker.ForEachUploadRange(device_addr, size, [&](u64 device_addr_out, u64 range_size) { copies.push_back(vk::BufferCopy{ .srcOffset = total_size_bytes, .dstOffset = device_addr_out - buffer_start, @@ -537,11 +551,6 @@ void BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size, }); total_size_bytes += range_size; largest_copy = std::max(largest_copy, range_size); - }; - memory_tracker.ForEachUploadRange(device_addr, size, [&](u64 device_addr_out, u64 range_size) { - add_copy(device_addr_out, 
range_size); - // Prevent uploading to gpu modified regions. - // gpu_modified_ranges.ForEachNotInRange(device_addr_out, range_size, add_copy); }); SCOPE_EXIT { if (is_texel_buffer) { @@ -654,14 +663,11 @@ bool BufferCache::SynchronizeBufferFromImage(Buffer& buffer, VAddr device_addr, return true; } -void BufferCache::DeleteBuffer(BufferId buffer_id, bool do_not_mark) { - // Mark the whole buffer as CPU written to stop tracking CPU writes - if (!do_not_mark) { - Buffer& buffer = slot_buffers[buffer_id]; - memory_tracker.MarkRegionAsCpuModified(buffer.CpuAddr(), buffer.SizeBytes()); - } +void BufferCache::DeleteBuffer(BufferId buffer_id) { + Buffer& buffer = slot_buffers[buffer_id]; Unregister(buffer_id); scheduler.DeferOperation([this, buffer_id] { slot_buffers.erase(buffer_id); }); + buffer.is_deleted = true; } } // namespace VideoCore diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 76309363..6710c861 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -12,6 +12,7 @@ #include "common/types.h" #include "video_core/buffer_cache/buffer.h" #include "video_core/buffer_cache/memory_tracker_base.h" +#include "video_core/buffer_cache/range_set.h" #include "video_core/multi_level_page_table.h" namespace AmdGpu { @@ -53,7 +54,7 @@ public: public: explicit BufferCache(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler, - const AmdGpu::Liverpool* liverpool, TextureCache& texture_cache, + AmdGpu::Liverpool* liverpool, TextureCache& texture_cache, PageManager& tracker); ~BufferCache(); @@ -80,15 +81,16 @@ public: /// Bind host index buffer for the current draw. u32 BindIndexBuffer(bool& is_indexed, u32 index_offset); - /// Writes a value to GDS buffer. - void InlineDataToGds(u32 gds_offset, u32 value); + /// Writes a value to GPU buffer. 
+ void InlineData(VAddr address, const void* value, u32 num_bytes, bool is_gds); /// Obtains a buffer for the specified region. [[nodiscard]] std::pair ObtainBuffer(VAddr gpu_addr, u32 size, bool is_written, - bool is_texel_buffer = false); + bool is_texel_buffer = false, + BufferId buffer_id = {}); - /// Obtains a temporary buffer for usage in texture cache. - [[nodiscard]] std::pair ObtainTempBuffer(VAddr gpu_addr, u32 size); + /// Attempts to obtain a buffer without modifying the cache contents. + [[nodiscard]] std::pair ObtainViewBuffer(VAddr gpu_addr, u32 size); /// Return true when a region is registered on the cache [[nodiscard]] bool IsRegionRegistered(VAddr addr, size_t size); @@ -99,6 +101,8 @@ public: /// Return true when a CPU region is modified from the GPU [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size); + [[nodiscard]] BufferId FindBuffer(VAddr device_addr, u32 size); + private: template void ForEachBufferInRange(VAddr device_addr, u64 size, Func&& func) { @@ -119,8 +123,6 @@ private: void DownloadBufferMemory(Buffer& buffer, VAddr device_addr, u64 size); - [[nodiscard]] BufferId FindBuffer(VAddr device_addr, u32 size); - [[nodiscard]] OverlapResult ResolveOverlaps(VAddr device_addr, u32 wanted_size); void JoinOverlap(BufferId new_buffer_id, BufferId overlap_id, bool accumulate_stream_score); @@ -138,11 +140,11 @@ private: bool SynchronizeBufferFromImage(Buffer& buffer, VAddr device_addr, u32 size); - void DeleteBuffer(BufferId buffer_id, bool do_not_mark = false); + void DeleteBuffer(BufferId buffer_id); const Vulkan::Instance& instance; Vulkan::Scheduler& scheduler; - const AmdGpu::Liverpool* liverpool; + AmdGpu::Liverpool* liverpool; TextureCache& texture_cache; PageManager& tracker; StreamBuffer staging_buffer; @@ -150,6 +152,7 @@ private: Buffer gds_buffer; std::mutex mutex; Common::SlotVector slot_buffers; + RangeSet gpu_modified_ranges; vk::BufferView null_buffer_view; MemoryTracker memory_tracker; PageTable page_table; diff 
--git a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp index 37a44dda..7122ca13 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp @@ -3,7 +3,6 @@ #include -#include "common/alignment.h" #include "video_core/buffer_cache/buffer_cache.h" #include "video_core/renderer_vulkan/vk_compute_pipeline.h" #include "video_core/renderer_vulkan/vk_instance.h" @@ -113,140 +112,45 @@ ComputePipeline::~ComputePipeline() = default; bool ComputePipeline::BindResources(VideoCore::BufferCache& buffer_cache, VideoCore::TextureCache& texture_cache) const { // Bind resource buffers and textures. - boost::container::static_vector buffer_views; - boost::container::static_vector buffer_infos; boost::container::small_vector set_writes; - boost::container::small_vector buffer_barriers; + BufferBarriers buffer_barriers; Shader::PushData push_data{}; Shader::Backend::Bindings binding{}; + info->PushUd(binding, push_data); + + buffer_infos.clear(); + buffer_views.clear(); image_infos.clear(); - info->PushUd(binding, push_data); - for (const auto& desc : info->buffers) { - bool is_storage = true; - if (desc.is_gds_buffer) { - auto* vk_buffer = buffer_cache.GetGdsBuffer(); - buffer_infos.emplace_back(vk_buffer->Handle(), 0, vk_buffer->SizeBytes()); - } else { - const auto vsharp = desc.GetSharp(*info); - is_storage = desc.IsStorage(vsharp); - const VAddr address = vsharp.base_address; - // Most of the time when a metadata is updated with a shader it gets cleared. It means - // we can skip the whole dispatch and update the tracked state instead. Also, it is not - // intended to be consumed and in such rare cases (e.g. HTile introspection, CRAA) we - // will need its full emulation anyways. For cases of metadata read a warning will be - // logged. 
- if (desc.is_written) { - if (texture_cache.TouchMeta(address, true)) { - LOG_TRACE(Render_Vulkan, "Metadata update skipped"); - return false; - } - } else { - if (texture_cache.IsMeta(address)) { - LOG_WARNING(Render_Vulkan, "Unexpected metadata read by a CS shader (buffer)"); - } + // Most of the time when a metadata is updated with a shader it gets cleared. It means + // we can skip the whole dispatch and update the tracked state instead. Also, it is not + // intended to be consumed and in such rare cases (e.g. HTile introspection, CRAA) we + // will need its full emulation anyways. For cases of metadata read a warning will be logged. + for (const auto& desc : info->texture_buffers) { + const VAddr address = desc.GetSharp(*info).base_address; + if (desc.is_written) { + if (texture_cache.TouchMeta(address, true)) { + LOG_TRACE(Render_Vulkan, "Metadata update skipped"); + return false; + } + } else { + if (texture_cache.IsMeta(address)) { + LOG_WARNING(Render_Vulkan, "Unexpected metadata read by a CS shader (buffer)"); } - const u32 size = vsharp.GetSize(); - const u32 alignment = - is_storage ? instance.StorageMinAlignment() : instance.UniformMinAlignment(); - const auto [vk_buffer, offset] = - buffer_cache.ObtainBuffer(address, size, desc.is_written); - const u32 offset_aligned = Common::AlignDown(offset, alignment); - const u32 adjust = offset - offset_aligned; - ASSERT(adjust % 4 == 0); - push_data.AddOffset(binding.buffer, adjust); - buffer_infos.emplace_back(vk_buffer->Handle(), offset_aligned, size + adjust); } - set_writes.push_back({ - .dstSet = VK_NULL_HANDLE, - .dstBinding = binding.unified++, - .dstArrayElement = 0, - .descriptorCount = 1, - .descriptorType = is_storage ? vk::DescriptorType::eStorageBuffer - : vk::DescriptorType::eUniformBuffer, - .pBufferInfo = &buffer_infos.back(), - }); - ++binding.buffer; } - const auto null_buffer_view = - instance.IsNullDescriptorSupported() ? 
VK_NULL_HANDLE : buffer_cache.NullBufferView(); - for (const auto& desc : info->texture_buffers) { - const auto vsharp = desc.GetSharp(*info); - vk::BufferView& buffer_view = buffer_views.emplace_back(null_buffer_view); - const u32 size = vsharp.GetSize(); - if (vsharp.GetDataFmt() != AmdGpu::DataFormat::FormatInvalid && size != 0) { - const VAddr address = vsharp.base_address; - if (desc.is_written) { - if (texture_cache.TouchMeta(address, true)) { - LOG_TRACE(Render_Vulkan, "Metadata update skipped"); - return false; - } - } else { - if (texture_cache.IsMeta(address)) { - LOG_WARNING(Render_Vulkan, "Unexpected metadata read by a CS shader (buffer)"); - } - } - const u32 alignment = instance.TexelBufferMinAlignment(); - const auto [vk_buffer, offset] = - buffer_cache.ObtainBuffer(address, size, desc.is_written, true); - const u32 fmt_stride = AmdGpu::NumBits(vsharp.GetDataFmt()) >> 3; - ASSERT_MSG(fmt_stride == vsharp.GetStride(), - "Texel buffer stride must match format stride"); - const u32 offset_aligned = Common::AlignDown(offset, alignment); - const u32 adjust = offset - offset_aligned; - ASSERT(adjust % fmt_stride == 0); - push_data.AddOffset(binding.buffer, adjust / fmt_stride); - buffer_view = vk_buffer->View(offset_aligned, size + adjust, desc.is_written, - vsharp.GetDataFmt(), vsharp.GetNumberFmt()); - if (auto barrier = - vk_buffer->GetBarrier(desc.is_written ? vk::AccessFlagBits2::eShaderWrite - : vk::AccessFlagBits2::eShaderRead, - vk::PipelineStageFlagBits2::eComputeShader)) { - buffer_barriers.emplace_back(*barrier); - } - if (desc.is_written) { - texture_cache.InvalidateMemoryFromGPU(address, size); - } - } - set_writes.push_back({ - .dstSet = VK_NULL_HANDLE, - .dstBinding = binding.unified++, - .dstArrayElement = 0, - .descriptorCount = 1, - .descriptorType = desc.is_written ? 
vk::DescriptorType::eStorageTexelBuffer - : vk::DescriptorType::eUniformTexelBuffer, - .pTexelBufferView = &buffer_view, - }); - ++binding.buffer; - } + BindBuffers(buffer_cache, texture_cache, *info, binding, push_data, set_writes, + buffer_barriers); BindTextures(texture_cache, *info, binding, set_writes); - for (const auto& sampler : info->samplers) { - const auto ssharp = sampler.GetSharp(*info); - if (ssharp.force_degamma) { - LOG_WARNING(Render_Vulkan, "Texture requires gamma correction"); - } - const auto vk_sampler = texture_cache.GetSampler(ssharp); - image_infos.emplace_back(vk_sampler, VK_NULL_HANDLE, vk::ImageLayout::eGeneral); - set_writes.push_back({ - .dstSet = VK_NULL_HANDLE, - .dstBinding = binding.unified++, - .dstArrayElement = 0, - .descriptorCount = 1, - .descriptorType = vk::DescriptorType::eSampler, - .pImageInfo = &image_infos.back(), - }); - } - if (set_writes.empty()) { return false; } const auto cmdbuf = scheduler.CommandBuffer(); - if (!buffer_barriers.empty()) { const auto dependencies = vk::DependencyInfo{ .dependencyFlags = vk::DependencyFlagBits::eByRegion, @@ -257,21 +161,22 @@ bool ComputePipeline::BindResources(VideoCore::BufferCache& buffer_cache, cmdbuf.pipelineBarrier2(dependencies); } + cmdbuf.pushConstants(*pipeline_layout, vk::ShaderStageFlagBits::eCompute, 0u, sizeof(push_data), + &push_data); + + // Bind descriptor set. 
if (uses_push_descriptors) { cmdbuf.pushDescriptorSetKHR(vk::PipelineBindPoint::eCompute, *pipeline_layout, 0, set_writes); - } else { - const auto desc_set = desc_heap.Commit(*desc_layout); - for (auto& set_write : set_writes) { - set_write.dstSet = desc_set; - } - instance.GetDevice().updateDescriptorSets(set_writes, {}); - cmdbuf.bindDescriptorSets(vk::PipelineBindPoint::eCompute, *pipeline_layout, 0, desc_set, - {}); + return true; } + const auto desc_set = desc_heap.Commit(*desc_layout); + for (auto& set_write : set_writes) { + set_write.dstSet = desc_set; + } + instance.GetDevice().updateDescriptorSets(set_writes, {}); + cmdbuf.bindDescriptorSets(vk::PipelineBindPoint::eCompute, *pipeline_layout, 0, desc_set, {}); - cmdbuf.pushConstants(*pipeline_layout, vk::ShaderStageFlagBits::eCompute, 0u, sizeof(push_data), - &push_data); return true; } diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp index cbc0fc5e..f6d0b49b 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp @@ -5,8 +5,8 @@ #include #include -#include "common/alignment.h" #include "common/assert.h" +#include "common/scope_exit.h" #include "video_core/amdgpu/resource.h" #include "video_core/buffer_cache/buffer_cache.h" #include "video_core/renderer_vulkan/vk_graphics_pipeline.h" @@ -384,13 +384,13 @@ void GraphicsPipeline::BindResources(const Liverpool::Regs& regs, VideoCore::BufferCache& buffer_cache, VideoCore::TextureCache& texture_cache) const { // Bind resource buffers and textures. 
- boost::container::static_vector buffer_views; - boost::container::static_vector buffer_infos; boost::container::small_vector set_writes; - boost::container::small_vector buffer_barriers; + BufferBarriers buffer_barriers; Shader::PushData push_data{}; Shader::Backend::Bindings binding{}; + buffer_infos.clear(); + buffer_views.clear(); image_infos.clear(); for (const auto* stage : stages) { @@ -402,111 +402,22 @@ void GraphicsPipeline::BindResources(const Liverpool::Regs& regs, push_data.step1 = regs.vgt_instance_step_rate_1; } stage->PushUd(binding, push_data); - for (const auto& buffer : stage->buffers) { - const auto vsharp = buffer.GetSharp(*stage); - const bool is_storage = buffer.IsStorage(vsharp); - if (vsharp && vsharp.GetSize() > 0) { - const VAddr address = vsharp.base_address; - if (texture_cache.IsMeta(address)) { - LOG_WARNING(Render_Vulkan, "Unexpected metadata read by a PS shader (buffer)"); - } - const u32 size = vsharp.GetSize(); - const u32 alignment = - is_storage ? instance.StorageMinAlignment() : instance.UniformMinAlignment(); - const auto [vk_buffer, offset] = - buffer_cache.ObtainBuffer(address, size, buffer.is_written); - const u32 offset_aligned = Common::AlignDown(offset, alignment); - const u32 adjust = offset - offset_aligned; - ASSERT(adjust % 4 == 0); - push_data.AddOffset(binding.buffer, adjust); - buffer_infos.emplace_back(vk_buffer->Handle(), offset_aligned, size + adjust); - } else if (instance.IsNullDescriptorSupported()) { - buffer_infos.emplace_back(VK_NULL_HANDLE, 0, VK_WHOLE_SIZE); - } else { - auto& null_buffer = buffer_cache.GetBuffer(VideoCore::NULL_BUFFER_ID); - buffer_infos.emplace_back(null_buffer.Handle(), 0, VK_WHOLE_SIZE); - } - set_writes.push_back({ - .dstSet = VK_NULL_HANDLE, - .dstBinding = binding.unified++, - .dstArrayElement = 0, - .descriptorCount = 1, - .descriptorType = is_storage ? 
vk::DescriptorType::eStorageBuffer - : vk::DescriptorType::eUniformBuffer, - .pBufferInfo = &buffer_infos.back(), - }); - ++binding.buffer; - } - const auto null_buffer_view = - instance.IsNullDescriptorSupported() ? VK_NULL_HANDLE : buffer_cache.NullBufferView(); - for (const auto& desc : stage->texture_buffers) { - const auto vsharp = desc.GetSharp(*stage); - vk::BufferView& buffer_view = buffer_views.emplace_back(null_buffer_view); - const u32 size = vsharp.GetSize(); - if (vsharp.GetDataFmt() != AmdGpu::DataFormat::FormatInvalid && size != 0) { - const VAddr address = vsharp.base_address; - const u32 alignment = instance.TexelBufferMinAlignment(); - const auto [vk_buffer, offset] = - buffer_cache.ObtainBuffer(address, size, desc.is_written, true); - const u32 fmt_stride = AmdGpu::NumBits(vsharp.GetDataFmt()) >> 3; - ASSERT_MSG(fmt_stride == vsharp.GetStride(), - "Texel buffer stride must match format stride"); - const u32 offset_aligned = Common::AlignDown(offset, alignment); - const u32 adjust = offset - offset_aligned; - ASSERT(adjust % fmt_stride == 0); - push_data.AddOffset(binding.buffer, adjust / fmt_stride); - buffer_view = vk_buffer->View(offset_aligned, size + adjust, desc.is_written, - vsharp.GetDataFmt(), vsharp.GetNumberFmt()); - const auto dst_access = desc.is_written ? vk::AccessFlagBits2::eShaderWrite - : vk::AccessFlagBits2::eShaderRead; - if (auto barrier = vk_buffer->GetBarrier( - dst_access, vk::PipelineStageFlagBits2::eVertexShader)) { - buffer_barriers.emplace_back(*barrier); - } - if (desc.is_written) { - texture_cache.InvalidateMemoryFromGPU(address, size); - } - } - set_writes.push_back({ - .dstSet = VK_NULL_HANDLE, - .dstBinding = binding.unified++, - .dstArrayElement = 0, - .descriptorCount = 1, - .descriptorType = desc.is_written ? 
vk::DescriptorType::eStorageTexelBuffer - : vk::DescriptorType::eUniformTexelBuffer, - .pTexelBufferView = &buffer_view, - }); - ++binding.buffer; - } + BindBuffers(buffer_cache, texture_cache, *stage, binding, push_data, set_writes, + buffer_barriers); BindTextures(texture_cache, *stage, binding, set_writes); - - for (const auto& sampler : stage->samplers) { - auto ssharp = sampler.GetSharp(*stage); - if (ssharp.force_degamma) { - LOG_WARNING(Render_Vulkan, "Texture requires gamma correction"); - } - if (sampler.disable_aniso) { - const auto& tsharp = stage->images[sampler.associated_image].GetSharp(*stage); - if (tsharp.base_level == 0 && tsharp.last_level == 0) { - ssharp.max_aniso.Assign(AmdGpu::AnisoRatio::One); - } - } - const auto vk_sampler = texture_cache.GetSampler(ssharp); - image_infos.emplace_back(vk_sampler, VK_NULL_HANDLE, vk::ImageLayout::eGeneral); - set_writes.push_back({ - .dstSet = VK_NULL_HANDLE, - .dstBinding = binding.unified++, - .dstArrayElement = 0, - .descriptorCount = 1, - .descriptorType = vk::DescriptorType::eSampler, - .pImageInfo = &image_infos.back(), - }); - } } const auto cmdbuf = scheduler.CommandBuffer(); + SCOPE_EXIT { + cmdbuf.pushConstants(*pipeline_layout, gp_stage_flags, 0U, sizeof(push_data), &push_data); + cmdbuf.bindPipeline(vk::PipelineBindPoint::eGraphics, Handle()); + }; + + if (set_writes.empty()) { + return; + } if (!buffer_barriers.empty()) { const auto dependencies = vk::DependencyInfo{ @@ -518,22 +429,18 @@ void GraphicsPipeline::BindResources(const Liverpool::Regs& regs, cmdbuf.pipelineBarrier2(dependencies); } - if (!set_writes.empty()) { - if (uses_push_descriptors) { - cmdbuf.pushDescriptorSetKHR(vk::PipelineBindPoint::eGraphics, *pipeline_layout, 0, - set_writes); - } else { - const auto desc_set = desc_heap.Commit(*desc_layout); - for (auto& set_write : set_writes) { - set_write.dstSet = desc_set; - } - instance.GetDevice().updateDescriptorSets(set_writes, {}); - 
cmdbuf.bindDescriptorSets(vk::PipelineBindPoint::eGraphics, *pipeline_layout, 0, - desc_set, {}); - } + // Bind descriptor set. + if (uses_push_descriptors) { + cmdbuf.pushDescriptorSetKHR(vk::PipelineBindPoint::eGraphics, *pipeline_layout, 0, + set_writes); + return; } - cmdbuf.pushConstants(*pipeline_layout, gp_stage_flags, 0U, sizeof(push_data), &push_data); - cmdbuf.bindPipeline(vk::PipelineBindPoint::eGraphics, Handle()); + const auto desc_set = desc_heap.Commit(*desc_layout); + for (auto& set_write : set_writes) { + set_write.dstSet = desc_set; + } + instance.GetDevice().updateDescriptorSets(set_writes, {}); + cmdbuf.bindDescriptorSets(vk::PipelineBindPoint::eGraphics, *pipeline_layout, 0, desc_set, {}); } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_pipeline_common.cpp b/src/video_core/renderer_vulkan/vk_pipeline_common.cpp index 61e56415..efe2838e 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_common.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_common.cpp @@ -4,6 +4,7 @@ #include #include "shader_recompiler/info.h" +#include "video_core/buffer_cache/buffer_cache.h" #include "video_core/renderer_vulkan/vk_instance.h" #include "video_core/renderer_vulkan/vk_pipeline_common.h" #include "video_core/renderer_vulkan/vk_scheduler.h" @@ -12,6 +13,8 @@ namespace Vulkan { boost::container::static_vector Pipeline::image_infos; +boost::container::static_vector Pipeline::buffer_views; +boost::container::static_vector Pipeline::buffer_infos; Pipeline::Pipeline(const Instance& instance_, Scheduler& scheduler_, DescriptorHeap& desc_heap_, vk::PipelineCache pipeline_cache) @@ -19,12 +22,133 @@ Pipeline::Pipeline(const Instance& instance_, Scheduler& scheduler_, DescriptorH Pipeline::~Pipeline() = default; +void Pipeline::BindBuffers(VideoCore::BufferCache& buffer_cache, + VideoCore::TextureCache& texture_cache, const Shader::Info& stage, + Shader::Backend::Bindings& binding, Shader::PushData& push_data, + DescriptorWrites& 
set_writes, BufferBarriers& buffer_barriers) const { + using BufferBindingInfo = std::pair; + static boost::container::static_vector buffer_bindings; + + buffer_bindings.clear(); + + for (const auto& desc : stage.buffers) { + const auto vsharp = desc.GetSharp(stage); + if (!desc.is_gds_buffer && vsharp.base_address != 0 && vsharp.GetSize() > 0) { + const auto buffer_id = buffer_cache.FindBuffer(vsharp.base_address, vsharp.GetSize()); + buffer_bindings.emplace_back(buffer_id, vsharp); + } else { + buffer_bindings.emplace_back(VideoCore::BufferId{}, vsharp); + } + } + + using TexBufferBindingInfo = std::pair; + static boost::container::static_vector texbuffer_bindings; + + texbuffer_bindings.clear(); + + for (const auto& desc : stage.texture_buffers) { + const auto vsharp = desc.GetSharp(stage); + if (vsharp.base_address != 0 && vsharp.GetSize() > 0 && + vsharp.GetDataFmt() != AmdGpu::DataFormat::FormatInvalid) { + const auto buffer_id = buffer_cache.FindBuffer(vsharp.base_address, vsharp.GetSize()); + texbuffer_bindings.emplace_back(buffer_id, vsharp); + } else { + texbuffer_bindings.emplace_back(VideoCore::BufferId{}, vsharp); + } + } + + // Second pass to re-bind buffers that were updated after binding + for (u32 i = 0; i < buffer_bindings.size(); i++) { + const auto& [buffer_id, vsharp] = buffer_bindings[i]; + const auto& desc = stage.buffers[i]; + const bool is_storage = desc.IsStorage(vsharp); + if (!buffer_id) { + if (desc.is_gds_buffer) { + const auto* gds_buf = buffer_cache.GetGdsBuffer(); + buffer_infos.emplace_back(gds_buf->Handle(), 0, gds_buf->SizeBytes()); + } else if (instance.IsNullDescriptorSupported()) { + buffer_infos.emplace_back(VK_NULL_HANDLE, 0, VK_WHOLE_SIZE); + } else { + auto& null_buffer = buffer_cache.GetBuffer(VideoCore::NULL_BUFFER_ID); + buffer_infos.emplace_back(null_buffer.Handle(), 0, VK_WHOLE_SIZE); + } + } else { + const auto [vk_buffer, offset] = buffer_cache.ObtainBuffer( + vsharp.base_address, vsharp.GetSize(), desc.is_written, 
false, buffer_id); + const u32 alignment = + is_storage ? instance.StorageMinAlignment() : instance.UniformMinAlignment(); + const u32 offset_aligned = Common::AlignDown(offset, alignment); + const u32 adjust = offset - offset_aligned; + ASSERT(adjust % 4 == 0); + push_data.AddOffset(binding.buffer, adjust); + buffer_infos.emplace_back(vk_buffer->Handle(), offset_aligned, + vsharp.GetSize() + adjust); + } + + set_writes.push_back({ + .dstSet = VK_NULL_HANDLE, + .dstBinding = binding.unified++, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = is_storage ? vk::DescriptorType::eStorageBuffer + : vk::DescriptorType::eUniformBuffer, + .pBufferInfo = &buffer_infos.back(), + }); + ++binding.buffer; + } + + const auto null_buffer_view = + instance.IsNullDescriptorSupported() ? VK_NULL_HANDLE : buffer_cache.NullBufferView(); + for (u32 i = 0; i < texbuffer_bindings.size(); i++) { + const auto& [buffer_id, vsharp] = texbuffer_bindings[i]; + const auto& desc = stage.texture_buffers[i]; + vk::BufferView& buffer_view = buffer_views.emplace_back(null_buffer_view); + if (buffer_id) { + const u32 alignment = instance.TexelBufferMinAlignment(); + const auto [vk_buffer, offset] = buffer_cache.ObtainBuffer( + vsharp.base_address, vsharp.GetSize(), desc.is_written, true, buffer_id); + const u32 fmt_stride = AmdGpu::NumBits(vsharp.GetDataFmt()) >> 3; + ASSERT_MSG(fmt_stride == vsharp.GetStride(), + "Texel buffer stride must match format stride"); + const u32 offset_aligned = Common::AlignDown(offset, alignment); + const u32 adjust = offset - offset_aligned; + ASSERT(adjust % fmt_stride == 0); + push_data.AddOffset(binding.buffer, adjust / fmt_stride); + buffer_view = + vk_buffer->View(offset_aligned, vsharp.GetSize() + adjust, desc.is_written, + vsharp.GetDataFmt(), vsharp.GetNumberFmt()); + if (auto barrier = + vk_buffer->GetBarrier(desc.is_written ? 
vk::AccessFlagBits2::eShaderWrite + : vk::AccessFlagBits2::eShaderRead, + vk::PipelineStageFlagBits2::eAllCommands)) { + buffer_barriers.emplace_back(*barrier); + } + if (desc.is_written) { + texture_cache.InvalidateMemoryFromGPU(vsharp.base_address, vsharp.GetSize()); + } + } + + set_writes.push_back({ + .dstSet = VK_NULL_HANDLE, + .dstBinding = binding.unified++, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = desc.is_written ? vk::DescriptorType::eStorageTexelBuffer + : vk::DescriptorType::eUniformTexelBuffer, + .pTexelBufferView = &buffer_view, + }); + ++binding.buffer; + } +} + void Pipeline::BindTextures(VideoCore::TextureCache& texture_cache, const Shader::Info& stage, Shader::Backend::Bindings& binding, DescriptorWrites& set_writes) const { using ImageBindingInfo = std::tuple; - boost::container::static_vector image_bindings; + static boost::container::static_vector image_bindings; + + image_bindings.clear(); for (const auto& image_desc : stage.images) { const auto tsharp = image_desc.GetSharp(stage); @@ -76,6 +200,26 @@ void Pipeline::BindTextures(VideoCore::TextureCache& texture_cache, const Shader .pImageInfo = &image_infos.back(), }); } + + for (const auto& sampler : stage.samplers) { + auto ssharp = sampler.GetSharp(stage); + if (sampler.disable_aniso) { + const auto& tsharp = stage.images[sampler.associated_image].GetSharp(stage); + if (tsharp.base_level == 0 && tsharp.last_level == 0) { + ssharp.max_aniso.Assign(AmdGpu::AnisoRatio::One); + } + } + const auto vk_sampler = texture_cache.GetSampler(ssharp); + image_infos.emplace_back(vk_sampler, VK_NULL_HANDLE, vk::ImageLayout::eGeneral); + set_writes.push_back({ + .dstSet = VK_NULL_HANDLE, + .dstBinding = binding.unified++, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = vk::DescriptorType::eSampler, + .pImageInfo = &image_infos.back(), + }); + } } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_pipeline_common.h
b/src/video_core/renderer_vulkan/vk_pipeline_common.h index ab99e7b3..75764bfa 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_common.h +++ b/src/video_core/renderer_vulkan/vk_pipeline_common.h @@ -33,6 +33,13 @@ public: } using DescriptorWrites = boost::container::small_vector; + using BufferBarriers = boost::container::small_vector; + + void BindBuffers(VideoCore::BufferCache& buffer_cache, VideoCore::TextureCache& texture_cache, + const Shader::Info& stage, Shader::Backend::Bindings& binding, + Shader::PushData& push_data, DescriptorWrites& set_writes, + BufferBarriers& buffer_barriers) const; + void BindTextures(VideoCore::TextureCache& texture_cache, const Shader::Info& stage, Shader::Backend::Bindings& binding, DescriptorWrites& set_writes) const; @@ -44,6 +51,8 @@ protected: vk::UniquePipelineLayout pipeline_layout; vk::UniqueDescriptorSetLayout desc_layout; static boost::container::static_vector image_infos; + static boost::container::static_vector buffer_views; + static boost::container::static_vector buffer_infos; }; } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index b3c42fcb..14a73261 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -98,10 +98,9 @@ void Rasterizer::DrawIndirect(bool is_indexed, VAddr address, u32 offset, u32 si const auto& vs_info = pipeline->GetStage(Shader::Stage::Vertex); buffer_cache.BindVertexBuffers(vs_info); - const u32 num_indices = buffer_cache.BindIndexBuffer(is_indexed, 0); + buffer_cache.BindIndexBuffer(is_indexed, 0); - const auto [buffer, base] = buffer_cache.ObtainBuffer(address, size, true); - const auto total_offset = base + offset; + const auto [buffer, base] = buffer_cache.ObtainBuffer(address + offset, size, false); BeginRendering(*pipeline); UpdateDynamicState(*pipeline); @@ -110,9 +109,9 @@ void Rasterizer::DrawIndirect(bool is_indexed, VAddr 
address, u32 offset, u32 si // instance offsets will be automatically applied by Vulkan from indirect args buffer. if (is_indexed) { - cmdbuf.drawIndexedIndirect(buffer->Handle(), total_offset, 1, 0); + cmdbuf.drawIndexedIndirect(buffer->Handle(), base, 1, 0); } else { - cmdbuf.drawIndirect(buffer->Handle(), total_offset, 1, 0); + cmdbuf.drawIndirect(buffer->Handle(), base, 1, 0); } } @@ -161,9 +160,8 @@ void Rasterizer::DispatchIndirect(VAddr address, u32 offset, u32 size) { scheduler.EndRendering(); cmdbuf.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->Handle()); - const auto [buffer, base] = buffer_cache.ObtainBuffer(address, size, true); - const auto total_offset = base + offset; - cmdbuf.dispatchIndirect(buffer->Handle(), total_offset); + const auto [buffer, base] = buffer_cache.ObtainBuffer(address + offset, size, false); + cmdbuf.dispatchIndirect(buffer->Handle(), base); } u64 Rasterizer::Flush() { @@ -260,8 +258,8 @@ void Rasterizer::BeginRendering(const GraphicsPipeline& pipeline) { scheduler.BeginRendering(state); } -void Rasterizer::InlineDataToGds(u32 gds_offset, u32 value) { - buffer_cache.InlineDataToGds(gds_offset, value); +void Rasterizer::InlineData(VAddr address, const void* value, u32 num_bytes, bool is_gds) { + buffer_cache.InlineData(address, value, num_bytes, is_gds); } u32 Rasterizer::ReadDataFromGds(u32 gds_offset) { diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index bc14f39a..d5cfbfd6 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -42,7 +42,7 @@ public: void ScopedMarkerInsert(const std::string_view& str); void ScopedMarkerInsertColor(const std::string_view& str, const u32 color); - void InlineDataToGds(u32 gds_offset, u32 value); + void InlineData(VAddr address, const void* value, u32 num_bytes, bool is_gds); u32 ReadDataFromGds(u32 gsd_offset); void InvalidateMemory(VAddr addr, u64 size); void 
MapMemory(VAddr addr, u64 size); diff --git a/src/video_core/texture_cache/sampler.cpp b/src/video_core/texture_cache/sampler.cpp index 179dd664..e47f53ab 100644 --- a/src/video_core/texture_cache/sampler.cpp +++ b/src/video_core/texture_cache/sampler.cpp @@ -8,6 +8,9 @@ namespace VideoCore { Sampler::Sampler(const Vulkan::Instance& instance, const AmdGpu::Sampler& sampler) { + if (sampler.force_degamma) { + LOG_WARNING(Render_Vulkan, "Texture requires gamma correction"); + } using namespace Vulkan; const vk::SamplerCreateInfo sampler_ci = { .magFilter = LiverpoolToVK::Filter(sampler.xy_mag_filter), diff --git a/src/video_core/texture_cache/texture_cache.cpp b/src/video_core/texture_cache/texture_cache.cpp index 00e6bea8..279e0d82 100644 --- a/src/video_core/texture_cache/texture_cache.cpp +++ b/src/video_core/texture_cache/texture_cache.cpp @@ -427,7 +427,7 @@ void TextureCache::RefreshImage(Image& image, Vulkan::Scheduler* custom_schedule const VAddr image_addr = image.info.guest_address; const size_t image_size = image.info.guest_size_bytes; - const auto [vk_buffer, buf_offset] = buffer_cache.ObtainTempBuffer(image_addr, image_size); + const auto [vk_buffer, buf_offset] = buffer_cache.ObtainViewBuffer(image_addr, image_size); // The obtained buffer may be written by a shader so we need to emit a barrier to prevent RAW // hazard if (auto barrier = vk_buffer->GetBarrier(vk::AccessFlagBits2::eTransferRead,