diff --git a/src/core/libraries/gnmdriver/gnmdriver.cpp b/src/core/libraries/gnmdriver/gnmdriver.cpp
index ffec7030..645bcf42 100644
--- a/src/core/libraries/gnmdriver/gnmdriver.cpp
+++ b/src/core/libraries/gnmdriver/gnmdriver.cpp
@@ -2155,6 +2155,7 @@ int PS4_SYSV_ABI sceGnmSubmitCommandBuffersForWorkload() {
 
 int PS4_SYSV_ABI sceGnmSubmitDone() {
     LOG_DEBUG(Lib_GnmDriver, "called");
+    WaitGpuIdle();
     if (!liverpool->IsGpuIdle()) {
         submission_lock = true;
     }
diff --git a/src/emulator.cpp b/src/emulator.cpp
index a469a31c..e631698f 100644
--- a/src/emulator.cpp
+++ b/src/emulator.cpp
@@ -26,7 +26,6 @@
 #include "core/libraries/libs.h"
 #include "core/libraries/ngs2/ngs2.h"
 #include "core/libraries/rtc/rtc.h"
-#include "core/libraries/videoout/video_out.h"
 #include "core/linker.h"
 #include "core/memory.h"
 #include "emulator.h"
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv.cpp b/src/shader_recompiler/backend/spirv/emit_spirv.cpp
index b0298cbb..11d2a1dd 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv.cpp
@@ -208,6 +208,9 @@ void DefineEntryPoint(const IR::Program& program, EmitContext& ctx, Id main) {
     if (info.uses_group_quad) {
         ctx.AddCapability(spv::Capability::GroupNonUniformQuad);
     }
+    if (info.uses_group_ballot) {
+        ctx.AddCapability(spv::Capability::GroupNonUniformBallot);
+    }
     switch (program.info.stage) {
     case Stage::Compute: {
         const std::array workgroup_size{ctx.runtime_info.cs_info.workgroup_size};
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
index 64ce532b..7df62a91 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp
@@ -305,7 +305,7 @@ void EmitStoreBufferFormatF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address,
     const Id tex_buffer = ctx.OpLoad(buffer.image_type, buffer.id);
     const Id coord = ctx.OpIAdd(ctx.U32[1], address, buffer.coord_offset);
     if (buffer.is_integer) {
-        value = ctx.OpBitcast(ctx.U32[4], value);
+        value = ctx.OpBitcast(ctx.S32[4], value);
     }
     ctx.OpImageWrite(tex_buffer, coord, value);
 }
diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp
index 898de8b5..2d13d09f 100644
--- a/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp
+++ b/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp
@@ -27,7 +27,8 @@ Id EmitReadFirstLane(EmitContext& ctx, Id value) {
 }
 
 Id EmitReadLane(EmitContext& ctx, Id value, u32 lane) {
-    UNREACHABLE();
+    return ctx.OpGroupNonUniformBroadcast(ctx.U32[1], SubgroupScope(ctx), value,
+                                          ctx.ConstU32(lane));
 }
 
 Id EmitWriteLane(EmitContext& ctx, Id value, Id write_value, u32 lane) {
diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
index b65cbdf4..8554f861 100644
--- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
+++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
@@ -324,16 +324,18 @@ void EmitContext::DefineOutputs() {
 
 void EmitContext::DefinePushDataBlock() {
     // Create push constants block for instance steps rates
-    const Id struct_type{Name(TypeStruct(U32[1], U32[1], U32[4], U32[4]), "AuxData")};
+    const Id struct_type{Name(TypeStruct(U32[1], U32[1], U32[4], U32[4], U32[4]), "AuxData")};
     Decorate(struct_type, spv::Decoration::Block);
     MemberName(struct_type, 0, "sr0");
     MemberName(struct_type, 1, "sr1");
     MemberName(struct_type, 2, "buf_offsets0");
     MemberName(struct_type, 3, "buf_offsets1");
+    MemberName(struct_type, 4, "buf_offsets2");
     MemberDecorate(struct_type, 0, spv::Decoration::Offset, 0U);
     MemberDecorate(struct_type, 1, spv::Decoration::Offset, 4U);
     MemberDecorate(struct_type, 2, spv::Decoration::Offset, 8U);
     MemberDecorate(struct_type, 3, spv::Decoration::Offset, 24U);
+    MemberDecorate(struct_type, 4, spv::Decoration::Offset, 40U);
     push_data_block = DefineVar(struct_type, spv::StorageClass::PushConstant);
     Name(push_data_block, "push_data");
     interfaces.push_back(push_data_block);
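Note (editorial, not part of the patch): the AuxData push-constant block gains a third uvec4 of packed buffer offsets, growing it from 40 to 56 bytes. A host-side mirror of that layout, as a sketch — the field names come from the member decorations above; the byte packing is an assumption based on PushData::AddOffset in info.h:

```cpp
#include <array>
#include <cstddef>
#include <cstdint>

// Sketch: host mirror of the shader-side "AuxData" block.
struct AuxDataMirror {
    std::uint32_t sr0;                        // instance step rate 0, offset 0
    std::uint32_t sr1;                        // instance step rate 1, offset 4
    std::array<std::uint8_t, 48> buf_offsets; // three uvec4s at offsets 8/24/40
};
static_assert(offsetof(AuxDataMirror, buf_offsets) == 8);
static_assert(sizeof(AuxDataMirror) == 56); // fits the 128-byte push-constant minimum
```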
diff --git a/src/shader_recompiler/frontend/translate/translate.cpp b/src/shader_recompiler/frontend/translate/translate.cpp
index 4e0c110c..c9144fac 100644
--- a/src/shader_recompiler/frontend/translate/translate.cpp
+++ b/src/shader_recompiler/frontend/translate/translate.cpp
@@ -171,7 +171,7 @@ T Translator::GetSrc(const InstOperand& operand) {
         }
     } else {
         if (operand.input_modifier.abs) {
-            LOG_WARNING(Render_Vulkan, "Input abs modifier on integer instruction");
+            value = ir.IAbs(value);
         }
         if (operand.input_modifier.neg) {
             UNREACHABLE();
diff --git a/src/shader_recompiler/frontend/translate/vector_memory.cpp b/src/shader_recompiler/frontend/translate/vector_memory.cpp
index 5af28336..f602e762 100644
--- a/src/shader_recompiler/frontend/translate/vector_memory.cpp
+++ b/src/shader_recompiler/frontend/translate/vector_memory.cpp
@@ -117,6 +117,10 @@ void Translator::EmitVectorMemory(const GcnInst& inst) {
         return BUFFER_ATOMIC(AtomicOp::Add, inst);
     case Opcode::BUFFER_ATOMIC_SWAP:
         return BUFFER_ATOMIC(AtomicOp::Swap, inst);
+    case Opcode::BUFFER_ATOMIC_UMIN:
+        return BUFFER_ATOMIC(AtomicOp::Umin, inst);
+    case Opcode::BUFFER_ATOMIC_UMAX:
+        return BUFFER_ATOMIC(AtomicOp::Umax, inst);
     default:
         LogMissingOpcode(inst);
     }
@@ -280,6 +284,7 @@ void Translator::IMAGE_GATHER(const GcnInst& inst) {
     info.has_bias.Assign(flags.test(MimgModifier::LodBias));
     info.has_lod_clamp.Assign(flags.test(MimgModifier::LodClamp));
     info.force_level0.Assign(flags.test(MimgModifier::Level0));
+    info.has_offset.Assign(flags.test(MimgModifier::Offset));
     // info.explicit_lod.Assign(explicit_lod);
     info.gather_comp.Assign(std::bit_width(mimg.dmask) - 1);
 
diff --git a/src/shader_recompiler/info.h b/src/shader_recompiler/info.h
index 0184a7f6..c4e16b7a 100644
--- a/src/shader_recompiler/info.h
+++ b/src/shader_recompiler/info.h
@@ -1,6 +1,5 @@
 // SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
-
 #pragma once
 
 #include <span>
@@ -89,7 +88,7 @@ struct PushData {
     u32 step0;
     u32 step1;
-    std::array<u8, 32> buf_offsets;
+    std::array<u8, 48> buf_offsets;
 
     void AddOffset(u32 binding, u32 offset) {
         ASSERT(offset < 256 && binding < buf_offsets.size());
         buf_offsets[binding] = offset;
@@ -166,6 +165,7 @@ struct Info {
     bool has_image_query{};
     bool uses_lane_id{};
     bool uses_group_quad{};
+    bool uses_group_ballot{};
     bool uses_shared{};
     bool uses_fp16{};
     bool uses_step_rates{};
@@ -181,6 +181,7 @@
         const u32* base = user_data.data();
         if (ptr_index != IR::NumScalarRegs) {
            std::memcpy(&base, &user_data[ptr_index], sizeof(base));
+            base = reinterpret_cast<const u32*>(VAddr(base) & 0xFFFFFFFFFFFFULL);
         }
         std::memcpy(&data, base + dword_offset, sizeof(T));
         return data;
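Note (editorial, not part of the patch): ReadUd now strips the top 16 bits of a pointer fetched from the user-data registers before dereferencing it, keeping only the canonical 48-bit address; the upper bits of such guest pointers can carry non-address metadata. The same masking in isolation (names are illustrative):

```cpp
#include <cstdint>

// Sketch: keep the low 48 bits of a guest pointer, dropping any tag bits.
inline const std::uint32_t* MaskToCanonical(const std::uint32_t* base) {
    const auto addr = reinterpret_cast<std::uintptr_t>(base) & 0xFFFFFFFFFFFFULL;
    return reinterpret_cast<const std::uint32_t*>(addr);
}
```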
diff --git a/src/shader_recompiler/ir/passes/lower_shared_mem_to_registers.cpp b/src/shader_recompiler/ir/passes/lower_shared_mem_to_registers.cpp
index a87cf31b..76bfcf91 100644
--- a/src/shader_recompiler/ir/passes/lower_shared_mem_to_registers.cpp
+++ b/src/shader_recompiler/ir/passes/lower_shared_mem_to_registers.cpp
@@ -21,8 +21,7 @@ void LowerSharedMemToRegisters(IR::Program& program) {
             const IR::Inst* prod = inst.Arg(0).InstRecursive();
             const auto it = std::ranges::find_if(ds_writes, [&](const IR::Inst* write) {
                 const IR::Inst* write_prod = write->Arg(0).InstRecursive();
-                return write_prod->Arg(1).U32() == prod->Arg(1).U32() &&
-                       write_prod->Arg(0) == prod->Arg(0);
+                return write_prod->Arg(1).U32() == prod->Arg(1).U32();
             });
             ASSERT(it != ds_writes.end());
             // Replace data read with value written.
diff --git a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp
index aa5d39ae..6b2aa8bb 100644
--- a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp
+++ b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp
@@ -98,22 +98,7 @@ bool UseFP16(AmdGpu::DataFormat data_format, AmdGpu::NumberFormat num_format) {
 }
 
 IR::Type BufferDataType(const IR::Inst& inst, AmdGpu::NumberFormat num_format) {
-    switch (inst.GetOpcode()) {
-    case IR::Opcode::LoadBufferU32:
-    case IR::Opcode::LoadBufferU32x2:
-    case IR::Opcode::LoadBufferU32x3:
-    case IR::Opcode::LoadBufferU32x4:
-    case IR::Opcode::StoreBufferU32:
-    case IR::Opcode::StoreBufferU32x2:
-    case IR::Opcode::StoreBufferU32x3:
-    case IR::Opcode::StoreBufferU32x4:
-    case IR::Opcode::ReadConstBuffer:
-    case IR::Opcode::BufferAtomicIAdd32:
-    case IR::Opcode::BufferAtomicSwap32:
-        return IR::Type::U32;
-    default:
-        UNREACHABLE();
-    }
+    return IR::Type::U32;
 }
 
 bool IsImageAtomicInstruction(const IR::Inst& inst) {
@@ -223,12 +208,8 @@ public:
     u32 Add(const SamplerResource& desc) {
         const u32 index{Add(sampler_resources, desc, [this, &desc](const auto& existing) {
-            if (desc.sgpr_base == existing.sgpr_base &&
-                desc.dword_offset == existing.dword_offset) {
-                return true;
-            }
-            // Samplers with different bindings might still be the same.
-            return existing.GetSharp(info) == desc.GetSharp(info);
+            return desc.sgpr_base == existing.sgpr_base &&
+                   desc.dword_offset == existing.dword_offset;
         })};
         return index;
     }
 
diff --git a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp
index 5ce024b4..63fe8a57 100644
--- a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp
+++ b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp
@@ -39,6 +39,11 @@ void Visit(Info& info, IR::Inst& inst) {
     case IR::Opcode::QuadShuffle:
         info.uses_group_quad = true;
         break;
+    case IR::Opcode::ReadLane:
+    case IR::Opcode::ReadFirstLane:
+    case IR::Opcode::WriteLane:
+        info.uses_group_ballot = true;
+        break;
     case IR::Opcode::Discard:
     case IR::Opcode::DiscardCond:
         info.has_discard = true;
diff --git a/src/shader_recompiler/specialization.h b/src/shader_recompiler/specialization.h
index 3dd75dbd..bbcafdb8 100644
--- a/src/shader_recompiler/specialization.h
+++ b/src/shader_recompiler/specialization.h
@@ -37,14 +37,14 @@ struct ImageSpecialization {
  * after the first compilation of a module.
 */
 struct StageSpecialization {
-    static constexpr size_t MaxStageResources = 32;
+    static constexpr size_t MaxStageResources = 64;
 
     const Shader::Info* info;
     RuntimeInfo runtime_info;
     std::bitset<MaxStageResources> bitset{};
     boost::container::small_vector<BufferSpecialization, 16> buffers;
     boost::container::small_vector<TextureBufferSpecialization, 8> tex_buffers;
-    boost::container::small_vector<ImageSpecialization, 16> images;
+    boost::container::small_vector<ImageSpecialization, 32> images;
     u32 start_binding{};
 
     explicit StageSpecialization(const Shader::Info& info_, RuntimeInfo runtime_info_,
diff --git a/src/video_core/amdgpu/pm4_cmds.h b/src/video_core/amdgpu/pm4_cmds.h
index fd7980c1..064b8995 100644
--- a/src/video_core/amdgpu/pm4_cmds.h
+++ b/src/video_core/amdgpu/pm4_cmds.h
@@ -187,6 +187,11 @@ struct PM4CmdSetData {
         BitField<28, 4, u32> index; ///< Index for UCONFIG/CONTEXT on CI+
                                     ///< Program to zero for other opcodes and on SI
     };
+    u32 data[0];
+
+    [[nodiscard]] u32 Size() const {
+        return header.count << 2u;
+    }
 
     template <typename... Args>
     static constexpr u32* SetContextReg(u32* cmdbuf, Args... data) {
@@ -350,6 +355,16 @@ struct PM4CmdEventWriteEop {
     }
 };
 
+struct PM4CmdAcquireMem {
+    PM4Type3Header header;
+    u32 cp_coher_cntl;
+    u32 cp_coher_size_lo;
+    u32 cp_coher_size_hi;
+    u32 cp_coher_base_lo;
+    u32 cp_coher_base_hi;
+    u32 poll_interval;
+};
+
 enum class DmaDataDst : u32 {
     Memory = 0,
     Gds = 1,
@@ -467,6 +482,10 @@ struct PM4CmdWriteData {
     };
     u32 data[0];
 
+    u32 Size() const {
+        return (header.count.Value() - 2) * 4;
+    }
+
     template <typename T>
     void Address(T addr) {
         addr64 = static_cast<u64>(addr);
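Note (editorial, not part of the patch): both Size() helpers lean on the PM4 type-3 rule that header.count holds the number of dwords following the header, minus one. PM4CmdSetData has one register-offset dword before the payload, so count equals the payload dword count; PM4CmdWriteData has a control dword plus two address dwords, hence the `- 2`. The arithmetic in isolation:

```cpp
#include <cstdint>

// Assumed PM4 type-3 convention: header.count = (dwords after header) - 1.
constexpr std::uint32_t SetDataPayloadBytes(std::uint32_t count) {
    return count << 2u; // one reg-offset dword precedes the data, so count == data dwords
}
constexpr std::uint32_t WriteDataPayloadBytes(std::uint32_t count) {
    return (count - 2) * 4; // control + two address dwords precede the data
}
static_assert(SetDataPayloadBytes(4) == 16);   // 4 payload dwords
static_assert(WriteDataPayloadBytes(6) == 16); // 6 - 2 = 4 payload dwords
```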
diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp
index 86af05bf..2ed0ddc8 100644
--- a/src/video_core/buffer_cache/buffer_cache.cpp
+++ b/src/video_core/buffer_cache/buffer_cache.cpp
@@ -577,9 +577,6 @@ bool BufferCache::SynchronizeBufferFromImage(Buffer& buffer, VAddr device_addr,
         return false;
     }
     Image& image = texture_cache.GetImage(image_id);
-    if (image.info.guest_size_bytes > size) {
-        return false;
-    }
     boost::container::small_vector<vk::BufferImageCopy, 8> copies;
     u32 offset = buffer.Offset(image.cpu_addr);
     const u32 num_layers = image.info.resources.layers;
@@ -604,11 +601,13 @@ bool BufferCache::SynchronizeBufferFromImage(Buffer& buffer, VAddr device_addr,
         });
         offset += mip_ofs * num_layers;
     }
-    scheduler.EndRendering();
-    image.Transit(vk::ImageLayout::eTransferSrcOptimal, vk::AccessFlagBits::eTransferRead);
-    const auto cmdbuf = scheduler.CommandBuffer();
-    cmdbuf.copyImageToBuffer(image.image, vk::ImageLayout::eTransferSrcOptimal, buffer.buffer,
-                             copies);
+    if (!copies.empty()) {
+        scheduler.EndRendering();
+        image.Transit(vk::ImageLayout::eTransferSrcOptimal, vk::AccessFlagBits::eTransferRead);
+        const auto cmdbuf = scheduler.CommandBuffer();
+        cmdbuf.copyImageToBuffer(image.image, vk::ImageLayout::eTransferSrcOptimal, buffer.buffer,
+                                 copies);
+    }
 
     return true;
 }
diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp
index aeae0813..96358bf6 100644
--- a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp
+++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp
@@ -12,9 +12,11 @@ namespace Vulkan {
 
 ComputePipeline::ComputePipeline(const Instance& instance_, Scheduler& scheduler_,
-                                 vk::PipelineCache pipeline_cache, u64 compute_key_,
-                                 const Shader::Info& info_, vk::ShaderModule module)
-    : instance{instance_}, scheduler{scheduler_}, compute_key{compute_key_}, info{&info_} {
+                                 DescriptorHeap& desc_heap_, vk::PipelineCache pipeline_cache,
+                                 u64 compute_key_, const Shader::Info& info_,
+                                 vk::ShaderModule module)
+    : instance{instance_}, scheduler{scheduler_}, desc_heap{desc_heap_}, compute_key{compute_key_},
+      info{&info_} {
     const vk::PipelineShaderStageCreateInfo shader_ci = {
         .stage = vk::ShaderStageFlagBits::eCompute,
         .module = module,
         .pName = "main",
@@ -66,8 +68,12 @@
         .size = sizeof(Shader::PushData),
     };
 
+    uses_push_descriptors = binding < instance.MaxPushDescriptors();
+    const auto flags = uses_push_descriptors
+                           ? vk::DescriptorSetLayoutCreateFlagBits::ePushDescriptorKHR
+                           : vk::DescriptorSetLayoutCreateFlagBits{};
     const vk::DescriptorSetLayoutCreateInfo desc_layout_ci = {
-        .flags = vk::DescriptorSetLayoutCreateFlagBits::ePushDescriptorKHR,
+        .flags = flags,
         .bindingCount = static_cast<u32>(bindings.size()),
         .pBindings = bindings.data(),
     };
@@ -101,8 +107,8 @@ bool ComputePipeline::BindResources(VideoCore::BufferCache& buffer_cache,
                                     VideoCore::TextureCache& texture_cache) const {
     // Bind resource buffers and textures.
     boost::container::static_vector<vk::BufferView, 8> buffer_views;
-    boost::container::static_vector<vk::DescriptorBufferInfo, 16> buffer_infos;
-    boost::container::static_vector<vk::DescriptorImageInfo, 16> image_infos;
+    boost::container::static_vector<vk::DescriptorBufferInfo, 32> buffer_infos;
+    boost::container::static_vector<vk::DescriptorImageInfo, 32> image_infos;
     boost::container::small_vector<vk::WriteDescriptorSet, 16> set_writes;
     boost::container::small_vector<vk::BufferMemoryBarrier2, 16> buffer_barriers;
     Shader::PushData push_data{};
@@ -265,9 +271,21 @@ bool ComputePipeline::BindResources(VideoCore::BufferCache& buffer_cache,
         cmdbuf.pipelineBarrier2(dependencies);
     }
 
+    if (uses_push_descriptors) {
+        cmdbuf.pushDescriptorSetKHR(vk::PipelineBindPoint::eCompute, *pipeline_layout, 0,
+                                    set_writes);
+    } else {
+        const auto desc_set = desc_heap.Commit(*desc_layout);
+        for (auto& set_write : set_writes) {
+            set_write.dstSet = desc_set;
+        }
+        instance.GetDevice().updateDescriptorSets(set_writes, {});
+        cmdbuf.bindDescriptorSets(vk::PipelineBindPoint::eCompute, *pipeline_layout, 0, desc_set,
+                                  {});
+    }
+
     cmdbuf.pushConstants(*pipeline_layout, vk::ShaderStageFlagBits::eCompute, 0u,
                          sizeof(push_data), &push_data);
-    cmdbuf.pushDescriptorSetKHR(vk::PipelineBindPoint::eCompute, *pipeline_layout, 0, set_writes);
 
     return true;
 }
diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.h b/src/video_core/renderer_vulkan/vk_compute_pipeline.h
index 54eaf653..8a6213a2 100644
--- a/src/video_core/renderer_vulkan/vk_compute_pipeline.h
+++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.h
@@ -16,12 +16,13 @@ namespace Vulkan {
 
 class Instance;
 class Scheduler;
+class DescriptorHeap;
 
 class ComputePipeline {
 public:
     explicit ComputePipeline(const Instance& instance, Scheduler& scheduler,
-                             vk::PipelineCache pipeline_cache, u64 compute_key,
-                             const Shader::Info& info, vk::ShaderModule module);
+                             DescriptorHeap& desc_heap, vk::PipelineCache pipeline_cache,
+                             u64 compute_key, const Shader::Info& info, vk::ShaderModule module);
     ~ComputePipeline();
 
     [[nodiscard]] vk::Pipeline Handle() const noexcept {
@@ -34,11 +35,13 @@ public:
 private:
     const Instance& instance;
     Scheduler& scheduler;
+    DescriptorHeap& desc_heap;
     vk::UniquePipeline pipeline;
     vk::UniquePipelineLayout pipeline_layout;
     vk::UniqueDescriptorSetLayout desc_layout;
     u64 compute_key;
     const Shader::Info* info;
+    bool uses_push_descriptors{};
 };
 
 } // namespace Vulkan
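Note (editorial, not part of the patch): when a layout needs more bindings than VK_KHR_push_descriptor allows (maxPushDescriptors is commonly 32), the pipeline now falls back to ordinary descriptor sets served by the new DescriptorHeap; push descriptors stay the default because they need no pool management at all. The two paths side by side, as an illustrative sketch (compute flavor — the graphics path below differs only in the bind point):

```cpp
#include <vector>
#include <vulkan/vulkan.hpp>

// Sketch of the two descriptor bind paths introduced by this patch.
void BindSet(vk::CommandBuffer cmdbuf, vk::Device device, vk::PipelineLayout layout,
             bool uses_push_descriptors, vk::DescriptorSet heap_set,
             std::vector<vk::WriteDescriptorSet>& set_writes) {
    if (uses_push_descriptors) {
        // Fast path: no set object exists, writes are consumed by the command buffer.
        cmdbuf.pushDescriptorSetKHR(vk::PipelineBindPoint::eCompute, layout, 0, set_writes);
    } else {
        // Fallback: retarget the prepared writes at a committed heap set, then bind it.
        for (auto& write : set_writes) {
            write.dstSet = heap_set;
        }
        device.updateDescriptorSets(set_writes, {});
        cmdbuf.bindDescriptorSets(vk::PipelineBindPoint::eCompute, layout, 0, heap_set, {});
    }
}
```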
diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
index a548b70a..2f5209eb 100644
--- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
+++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
@@ -17,11 +17,11 @@ namespace Vulkan {
 
 GraphicsPipeline::GraphicsPipeline(const Instance& instance_, Scheduler& scheduler_,
-                                   const GraphicsPipelineKey& key_,
+                                   DescriptorHeap& desc_heap_, const GraphicsPipelineKey& key_,
                                    vk::PipelineCache pipeline_cache,
                                    std::span<const Shader::Info*, MaxShaderStages> infos,
                                    std::span<const vk::ShaderModule> modules)
-    : instance{instance_}, scheduler{scheduler_}, key{key_} {
+    : instance{instance_}, scheduler{scheduler_}, desc_heap{desc_heap_}, key{key_} {
     const vk::Device device = instance.GetDevice();
     std::ranges::copy(infos, stages.begin());
     BuildDescSetLayout();
 
@@ -301,7 +301,6 @@ GraphicsPipeline::~GraphicsPipeline() = default;
 
 void GraphicsPipeline::BuildDescSetLayout() {
     u32 binding{};
-    boost::container::small_vector<vk::DescriptorSetLayoutBinding, 32> bindings;
     for (const auto* stage : stages) {
         if (!stage) {
             continue;
@@ -343,8 +342,12 @@ void GraphicsPipeline::BuildDescSetLayout() {
             });
         }
     }
+    uses_push_descriptors = binding < instance.MaxPushDescriptors();
+    const auto flags = uses_push_descriptors
+                           ? vk::DescriptorSetLayoutCreateFlagBits::ePushDescriptorKHR
+                           : vk::DescriptorSetLayoutCreateFlagBits{};
     const vk::DescriptorSetLayoutCreateInfo desc_layout_ci = {
-        .flags = vk::DescriptorSetLayoutCreateFlagBits::ePushDescriptorKHR,
+        .flags = flags,
         .bindingCount = static_cast<u32>(bindings.size()),
         .pBindings = bindings.data(),
     };
@@ -446,10 +449,10 @@ void GraphicsPipeline::BindResources(const Liverpool::Regs& regs,
         });
     }
 
-    boost::container::static_vector<AmdGpu::Image, 16> tsharps;
+    boost::container::static_vector<AmdGpu::Image, 32> tsharps;
     for (const auto& image_desc : stage->images) {
         const auto tsharp = image_desc.GetSharp(*stage);
-        if (tsharp) {
+        if (tsharp.GetDataFmt() != AmdGpu::DataFormat::FormatInvalid) {
             tsharps.emplace_back(tsharp);
             VideoCore::ImageInfo image_info{tsharp, image_desc.is_depth};
             VideoCore::ImageViewInfo view_info{tsharp, image_desc.is_storage};
@@ -510,8 +513,18 @@ void GraphicsPipeline::BindResources(const Liverpool::Regs& regs,
     }
 
     if (!set_writes.empty()) {
-        cmdbuf.pushDescriptorSetKHR(vk::PipelineBindPoint::eGraphics, *pipeline_layout, 0,
-                                    set_writes);
+        if (uses_push_descriptors) {
+            cmdbuf.pushDescriptorSetKHR(vk::PipelineBindPoint::eGraphics, *pipeline_layout, 0,
+                                        set_writes);
+        } else {
+            const auto desc_set = desc_heap.Commit(*desc_layout);
+            for (auto& set_write : set_writes) {
+                set_write.dstSet = desc_set;
+            }
+            instance.GetDevice().updateDescriptorSets(set_writes, {});
+            cmdbuf.bindDescriptorSets(vk::PipelineBindPoint::eGraphics, *pipeline_layout, 0,
+                                      desc_set, {});
+        }
     }
     cmdbuf.pushConstants(*pipeline_layout,
                         vk::ShaderStageFlagBits::eVertex | vk::ShaderStageFlagBits::eFragment, 0U,
diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h
index c06ddd20..7778c417 100644
--- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h
+++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h
@@ -19,6 +19,7 @@ static constexpr u32 MaxShaderStages = 5;
 
 class Instance;
 class Scheduler;
+class DescriptorHeap;
 
 using Liverpool = AmdGpu::Liverpool;
 
@@ -59,7 +60,8 @@ struct GraphicsPipelineKey {
 class GraphicsPipeline {
 public:
     explicit GraphicsPipeline(const Instance& instance, Scheduler& scheduler,
-                              const GraphicsPipelineKey& key, vk::PipelineCache pipeline_cache,
+                              DescriptorHeap& desc_heap, const GraphicsPipelineKey& key,
+                              vk::PipelineCache pipeline_cache,
                              std::span<const Shader::Info*, MaxShaderStages> stages,
                              std::span<const vk::ShaderModule> modules);
     ~GraphicsPipeline();
@@ -98,11 +100,14 @@ private:
 private:
     const Instance& instance;
     Scheduler& scheduler;
+    DescriptorHeap& desc_heap;
     vk::UniquePipeline pipeline;
    vk::UniquePipelineLayout pipeline_layout;
     vk::UniqueDescriptorSetLayout desc_layout;
     std::array<const Shader::Info*, MaxShaderStages> stages{};
     GraphicsPipelineKey key;
+    bool uses_push_descriptors{};
+    boost::container::small_vector<vk::DescriptorSetLayoutBinding, 32> bindings;
 };
 
 } // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_instance.cpp b/src/video_core/renderer_vulkan/vk_instance.cpp
index a19ee1c7..769a808e 100644
--- a/src/video_core/renderer_vulkan/vk_instance.cpp
+++ b/src/video_core/renderer_vulkan/vk_instance.cpp
@@ -176,8 +176,10 @@ bool Instance::CreateDevice() {
         vk::PhysicalDevicePortabilitySubsetFeaturesKHR>();
     const vk::StructureChain properties_chain = physical_device.getProperties2<
         vk::PhysicalDeviceProperties2, vk::PhysicalDevicePortabilitySubsetPropertiesKHR,
-        vk::PhysicalDeviceExternalMemoryHostPropertiesEXT, vk::PhysicalDeviceVulkan11Properties>();
+        vk::PhysicalDeviceExternalMemoryHostPropertiesEXT, vk::PhysicalDeviceVulkan11Properties,
+        vk::PhysicalDevicePushDescriptorPropertiesKHR>();
     subgroup_size = properties_chain.get<vk::PhysicalDeviceVulkan11Properties>().subgroupSize;
+    push_descriptor_props = properties_chain.get<vk::PhysicalDevicePushDescriptorPropertiesKHR>();
     LOG_INFO(Render_Vulkan, "Physical device subgroup size {}", subgroup_size);
 
     features = feature_chain.get().features;
diff --git a/src/video_core/renderer_vulkan/vk_instance.h b/src/video_core/renderer_vulkan/vk_instance.h
index 52310955..a64c77a5 100644
--- a/src/video_core/renderer_vulkan/vk_instance.h
+++ b/src/video_core/renderer_vulkan/vk_instance.h
@@ -207,6 +207,11 @@ public:
         return properties.limits.maxTexelBufferElements;
     }
 
+    /// Returns the maximum number of push descriptors.
+    u32 MaxPushDescriptors() const {
+        return push_descriptor_props.maxPushDescriptors;
+    }
+
     /// Returns true if shaders can declare the ClipDistance attribute
     bool IsShaderClipDistanceSupported() const {
         return features.shaderClipDistance;
@@ -242,6 +247,7 @@ private:
     vk::PhysicalDevice physical_device;
     vk::UniqueDevice device;
     vk::PhysicalDeviceProperties properties;
+    vk::PhysicalDevicePushDescriptorPropertiesKHR push_descriptor_props;
     vk::PhysicalDeviceFeatures features;
     vk::DriverIdKHR driver_id;
     vk::UniqueDebugUtilsMessengerEXT debug_callback{};
diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
index b4b256bb..e19467b0 100644
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp
@@ -24,6 +24,15 @@ using Shader::VsOutput;
     return seed ^ (hash + 0x9e3779b9 + (seed << 6) + (seed >> 2));
 }
 
+constexpr static std::array DescriptorHeapSizes = {
+    vk::DescriptorPoolSize{vk::DescriptorType::eUniformBuffer, 8192},
+    vk::DescriptorPoolSize{vk::DescriptorType::eStorageBuffer, 1024},
+    vk::DescriptorPoolSize{vk::DescriptorType::eUniformTexelBuffer, 128},
+    vk::DescriptorPoolSize{vk::DescriptorType::eStorageTexelBuffer, 128},
+    vk::DescriptorPoolSize{vk::DescriptorType::eSampledImage, 8192},
+    vk::DescriptorPoolSize{vk::DescriptorType::eSampler, 1024},
+};
+
 void GatherVertexOutputs(Shader::VertexRuntimeInfo& info,
                          const AmdGpu::Liverpool::VsOutputControl& ctl) {
     const auto add_output = [&](VsOutput x, VsOutput y, VsOutput z, VsOutput w) {
@@ -120,7 +129,8 @@ Shader::RuntimeInfo PipelineCache::BuildRuntimeInfo(Shader::Stage stage) {
 
 PipelineCache::PipelineCache(const Instance& instance_, Scheduler& scheduler_,
                              AmdGpu::Liverpool* liverpool_)
-    : instance{instance_}, scheduler{scheduler_}, liverpool{liverpool_} {
+    : instance{instance_}, scheduler{scheduler_}, liverpool{liverpool_},
+      desc_heap{instance, scheduler.GetMasterSemaphore(),
 DescriptorHeapSizes} {
     profile = Shader::Profile{
         .supported_spirv = instance.ApiVersion() >= VK_API_VERSION_1_3 ? 0x00010600U : 0x00010500U,
         .subgroup_size = instance.SubgroupSize(),
@@ -153,8 +163,8 @@ const GraphicsPipeline* PipelineCache::GetGraphicsPipeline() {
     }
     const auto [it, is_new] = graphics_pipelines.try_emplace(graphics_key);
     if (is_new) {
-        it.value() = std::make_unique<GraphicsPipeline>(instance, scheduler, graphics_key,
-                                                        *pipeline_cache, infos, modules);
+        it.value() = std::make_unique<GraphicsPipeline>(
+            instance, scheduler, desc_heap, graphics_key, *pipeline_cache, infos, modules);
     }
     const GraphicsPipeline* pipeline = it->second.get();
     return pipeline;
@@ -166,8 +176,8 @@ const ComputePipeline* PipelineCache::GetComputePipeline() {
     }
     const auto [it, is_new] = compute_pipelines.try_emplace(compute_key);
     if (is_new) {
-        it.value() = std::make_unique<ComputePipeline>(instance, scheduler, *pipeline_cache,
-                                                       compute_key, *infos[0], modules[0]);
+        it.value() = std::make_unique<ComputePipeline>(
+            instance, scheduler, desc_heap, *pipeline_cache, compute_key, *infos[0], modules[0]);
     }
     const ComputePipeline* pipeline = it->second.get();
     return pipeline;
diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.h b/src/video_core/renderer_vulkan/vk_pipeline_cache.h
index 96e2cd04..92dcf826 100644
--- a/src/video_core/renderer_vulkan/vk_pipeline_cache.h
+++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.h
@@ -9,6 +9,7 @@
 #include "shader_recompiler/specialization.h"
 #include "video_core/renderer_vulkan/vk_compute_pipeline.h"
 #include "video_core/renderer_vulkan/vk_graphics_pipeline.h"
+#include "video_core/renderer_vulkan/vk_resource_pool.h"
 
 namespace Shader {
 struct Info;
@@ -66,6 +67,7 @@ private:
     const Instance& instance;
     Scheduler& scheduler;
     AmdGpu::Liverpool* liverpool;
+    DescriptorHeap desc_heap;
     vk::UniquePipelineCache pipeline_cache;
     vk::UniquePipelineLayout pipeline_layout;
     Shader::Profile profile{};
diff --git a/src/video_core/renderer_vulkan/vk_resource_pool.cpp b/src/video_core/renderer_vulkan/vk_resource_pool.cpp
index f9f2ae0a..a5ee22c2 100644
--- a/src/video_core/renderer_vulkan/vk_resource_pool.cpp
+++ b/src/video_core/renderer_vulkan/vk_resource_pool.cpp
@@ -3,8 +3,8 @@
 #include <array>
 #include <cstddef>
-#include <unordered_map>
 
 #include "common/assert.h"
+#include "common/scope_exit.h"
 #include "video_core/renderer_vulkan/vk_instance.h"
 #include "video_core/renderer_vulkan/vk_master_semaphore.h"
 #include "video_core/renderer_vulkan/vk_resource_pool.h"
@@ -103,88 +103,86 @@ vk::CommandBuffer CommandPool::Commit() {
     return cmd_buffers[index];
 }
 
-constexpr u32 DESCRIPTOR_SET_BATCH = 32;
-
-DescriptorHeap::DescriptorHeap(const Instance& instance, MasterSemaphore* master_semaphore,
-                               std::span<const vk::DescriptorSetLayoutBinding> bindings,
+DescriptorHeap::DescriptorHeap(const Instance& instance, MasterSemaphore* master_semaphore_,
+                               std::span<const vk::DescriptorPoolSize> pool_sizes_,
                                u32 descriptor_heap_count_)
-    : ResourcePool{master_semaphore, DESCRIPTOR_SET_BATCH}, device{instance.GetDevice()},
-      descriptor_heap_count{descriptor_heap_count_} {
-    // Create descriptor set layout.
-    const vk::DescriptorSetLayoutCreateInfo layout_ci = {
-        .bindingCount = static_cast<u32>(bindings.size()),
-        .pBindings = bindings.data(),
-    };
-    descriptor_set_layout = device.createDescriptorSetLayoutUnique(layout_ci);
-    if (instance.HasDebuggingToolAttached()) {
-        SetObjectName(device, *descriptor_set_layout, "DescriptorSetLayout");
-    }
-
-    // Build descriptor set pool counts.
-    std::unordered_map<vk::DescriptorType, u32> descriptor_type_counts;
-    for (const auto& binding : bindings) {
-        descriptor_type_counts[binding.descriptorType] += binding.descriptorCount;
-    }
-    for (const auto& [type, count] : descriptor_type_counts) {
-        auto& pool_size = pool_sizes.emplace_back();
-        pool_size.descriptorCount = count * descriptor_heap_count;
-        pool_size.type = type;
-    }
-
-    // Create descriptor pool
-    AppendDescriptorPool();
+    : device{instance.GetDevice()}, master_semaphore{master_semaphore_},
+      descriptor_heap_count{descriptor_heap_count_}, pool_sizes{pool_sizes_} {
+    CreateDescriptorPool();
 }
 
-DescriptorHeap::~DescriptorHeap() = default;
+DescriptorHeap::~DescriptorHeap() {
+    device.destroyDescriptorPool(curr_pool);
+    for (const auto [pool, tick] : pending_pools) {
+        master_semaphore->Wait(tick);
+        device.destroyDescriptorPool(pool);
+    }
+}
 
-void DescriptorHeap::Allocate(std::size_t begin, std::size_t end) {
-    ASSERT(end - begin == DESCRIPTOR_SET_BATCH);
-    descriptor_sets.resize(end);
-    hashes.resize(end);
+vk::DescriptorSet DescriptorHeap::Commit(vk::DescriptorSetLayout set_layout) {
+    const u64 set_key = std::bit_cast<u64>(set_layout);
+    const auto [it, _] = descriptor_sets.try_emplace(set_key);
 
-    std::array<vk::DescriptorSetLayout, DESCRIPTOR_SET_BATCH> layouts;
-    layouts.fill(*descriptor_set_layout);
+    // Check if allocated sets exist and pick one.
+    if (!it->second.empty()) {
+        const auto desc_set = it->second.back();
+        it.value().pop_back();
+        return desc_set;
+    }
+
+    DescSetBatch desc_sets(DescriptorSetBatch);
+    std::array<vk::DescriptorSetLayout, DescriptorSetBatch> layouts;
+    layouts.fill(set_layout);
 
-    u32 current_pool = 0;
     vk::DescriptorSetAllocateInfo alloc_info = {
-        .descriptorPool = *pools[current_pool],
-        .descriptorSetCount = DESCRIPTOR_SET_BATCH,
+        .descriptorPool = curr_pool,
+        .descriptorSetCount = DescriptorSetBatch,
         .pSetLayouts = layouts.data(),
     };
 
-    // Attempt to allocate the descriptor set batch. If the pool has run out of space, use a new
-    // one.
-    while (true) {
-        const auto result =
-            device.allocateDescriptorSets(&alloc_info, descriptor_sets.data() + begin);
-        if (result == vk::Result::eSuccess) {
-            break;
-        }
-        if (result == vk::Result::eErrorOutOfPoolMemory) {
-            current_pool++;
-            if (current_pool == pools.size()) {
-                LOG_INFO(Render_Vulkan, "Run out of pools, creating new one!");
-                AppendDescriptorPool();
-            }
-            alloc_info.descriptorPool = *pools[current_pool];
-        }
+    // Attempt to allocate the descriptor set batch.
+    auto result = device.allocateDescriptorSets(&alloc_info, desc_sets.data());
+    if (result == vk::Result::eSuccess) {
+        const auto desc_set = desc_sets.back();
+        desc_sets.pop_back();
+        it.value() = std::move(desc_sets);
+        return desc_set;
     }
+
+    // The pool has run out. Record current tick and place it in pending list.
+    ASSERT_MSG(result == vk::Result::eErrorOutOfPoolMemory,
+               "Unexpected error during descriptor set allocation {}", vk::to_string(result));
+    pending_pools.emplace_back(curr_pool, master_semaphore->CurrentTick());
+    if (const auto [pool, tick] = pending_pools.front(); master_semaphore->IsFree(tick)) {
+        curr_pool = pool;
+        pending_pools.pop_front();
+        device.resetDescriptorPool(curr_pool);
+    } else {
+        CreateDescriptorPool();
+    }
+
+    // Attempt to allocate again with fresh pool.
+    alloc_info.descriptorPool = curr_pool;
+    result = device.allocateDescriptorSets(&alloc_info, desc_sets.data());
+    ASSERT_MSG(result == vk::Result::eSuccess,
+               "Unexpected error during descriptor set allocation {}", vk::to_string(result));
+
+    // We've changed pool so also reset descriptor batch cache.
+    descriptor_sets.clear();
+    const auto desc_set = desc_sets.back();
+    desc_sets.pop_back();
+    descriptor_sets[set_key] = std::move(desc_sets);
+    return desc_set;
 }
 
-vk::DescriptorSet DescriptorHeap::Commit() {
-    const std::size_t index = CommitResource();
-    return descriptor_sets[index];
-}
-
-void DescriptorHeap::AppendDescriptorPool() {
+void DescriptorHeap::CreateDescriptorPool() {
     const vk::DescriptorPoolCreateInfo pool_info = {
         .flags = vk::DescriptorPoolCreateFlagBits::eUpdateAfterBind,
         .maxSets = descriptor_heap_count,
         .poolSizeCount = static_cast<u32>(pool_sizes.size()),
         .pPoolSizes = pool_sizes.data(),
     };
-    auto& pool = pools.emplace_back();
-    pool = device.createDescriptorPoolUnique(pool_info);
+    curr_pool = device.createDescriptorPool(pool_info);
 }
 
 } // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_resource_pool.h b/src/video_core/renderer_vulkan/vk_resource_pool.h
index b138b969..98c2ddb8 100644
--- a/src/video_core/renderer_vulkan/vk_resource_pool.h
+++ b/src/video_core/renderer_vulkan/vk_resource_pool.h
@@ -3,7 +3,9 @@
 #pragma once
 
+#include <deque>
 #include <span>
+#include <tsl/robin_map.h>
 #include <vector>
 
 #include "common/types.h"
@@ -62,32 +64,29 @@ private:
     std::vector<vk::CommandBuffer> cmd_buffers;
 };
 
-class DescriptorHeap final : public ResourcePool {
+class DescriptorHeap final {
+    static constexpr u32 DescriptorSetBatch = 32;
+
 public:
     explicit DescriptorHeap(const Instance& instance, MasterSemaphore* master_semaphore,
-                            std::span<const vk::DescriptorSetLayoutBinding> bindings,
+                            std::span<const vk::DescriptorPoolSize> pool_sizes,
                             u32 descriptor_heap_count = 1024);
-    ~DescriptorHeap() override;
+    ~DescriptorHeap();
 
-    const vk::DescriptorSetLayout& Layout() const {
-        return *descriptor_set_layout;
-    }
-
-    void Allocate(std::size_t begin, std::size_t end) override;
-
-    vk::DescriptorSet Commit();
+    vk::DescriptorSet Commit(vk::DescriptorSetLayout set_layout);
 
 private:
-    void AppendDescriptorPool();
+    void CreateDescriptorPool();
 
 private:
     vk::Device device;
-    vk::UniqueDescriptorSetLayout descriptor_set_layout;
+    MasterSemaphore* master_semaphore;
     u32 descriptor_heap_count;
-    std::vector<vk::DescriptorPoolSize> pool_sizes;
-    std::vector<vk::UniqueDescriptorPool> pools;
-    std::vector<vk::DescriptorSet> descriptor_sets;
-    std::vector<std::size_t> hashes;
+    std::span<const vk::DescriptorPoolSize> pool_sizes;
+    vk::DescriptorPool curr_pool;
+    std::deque<std::pair<vk::DescriptorPool, u64>> pending_pools;
+    using DescSetBatch = boost::container::static_vector<vk::DescriptorSet, DescriptorSetBatch>;
+    tsl::robin_map<u64, DescSetBatch> descriptor_sets;
 };
 
 } // namespace Vulkan
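Note (editorial, not part of the patch): the reworked DescriptorHeap never frees individual sets. When a pool runs dry it is parked together with the current timeline tick, and the oldest parked pool is reset and reused wholesale (vkResetDescriptorPool recovers every set at once) only after MasterSemaphore reports that tick complete. The policy in isolation, with a stand-in for the semaphore:

```cpp
#include <cstdint>
#include <deque>
#include <utility>

// Stand-in for MasterSemaphore (a timeline-semaphore wrapper).
struct TimelineStub {
    std::uint64_t current_tick{1}; // tick the CPU is currently recording
    std::uint64_t gpu_tick{0};     // last tick the GPU has retired
    bool IsFree(std::uint64_t tick) const {
        return gpu_tick >= tick;
    }
};

// Sketch of the recycling decision inside DescriptorHeap::Commit().
template <typename Pool, typename Reset, typename Create>
Pool RecycleOrCreate(std::deque<std::pair<Pool, std::uint64_t>>& pending, TimelineStub& sem,
                     Pool exhausted, Reset reset, Create create) {
    pending.emplace_back(exhausted, sem.current_tick); // park with the in-flight tick
    if (sem.IsFree(pending.front().second)) {
        Pool pool = pending.front().first; // oldest pool fully retired on the GPU
        pending.pop_front();
        reset(pool);
        return pool;
    }
    return create(); // everything still in flight: grow rather than stall
}
```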
diff --git a/src/video_core/texture_cache/image.cpp b/src/video_core/texture_cache/image.cpp
index 2a5c4c43..d494322a 100644
--- a/src/video_core/texture_cache/image.cpp
+++ b/src/video_core/texture_cache/image.cpp
@@ -73,7 +73,6 @@ static vk::ImageUsageFlags ImageUsageFlags(const ImageInfo& info) {
         if (!info.IsBlockCoded() && !info.IsPacked()) {
             usage |= vk::ImageUsageFlagBits::eColorAttachment;
         }
-
         // In cases where an image is created as a render/depth target and cleared with compute,
         // we cannot predict whether it will be used as a storage image. A proper solution would
         // involve re-creating the resource with a new configuration and copying previous content
diff --git a/src/video_core/texture_cache/image_view.cpp b/src/video_core/texture_cache/image_view.cpp
index e30c1264..f94c1a37 100644
--- a/src/video_core/texture_cache/image_view.cpp
+++ b/src/video_core/texture_cache/image_view.cpp
@@ -69,7 +69,12 @@ vk::Format TrySwizzleFormat(vk::Format format, u32 dst_sel) {
 ImageViewInfo::ImageViewInfo(const AmdGpu::Image& image, bool is_storage_) noexcept
     : is_storage{is_storage_} {
     type = ConvertImageViewType(image.GetType());
-    format = Vulkan::LiverpoolToVK::SurfaceFormat(image.GetDataFmt(), image.GetNumberFmt());
+    const auto dfmt = image.GetDataFmt();
+    auto nfmt = image.GetNumberFmt();
+    if (is_storage && nfmt == AmdGpu::NumberFormat::Srgb) {
+        nfmt = AmdGpu::NumberFormat::Unorm;
+    }
+    format = Vulkan::LiverpoolToVK::SurfaceFormat(dfmt, nfmt);
     range.base.level = image.base_level;
     range.base.layer = image.base_array;
     range.extent.levels = image.last_level + 1;
@@ -143,7 +148,7 @@ ImageView::ImageView(const Vulkan::Instance& instance, const ImageViewInfo& info_,
             .aspectMask = aspect,
             .baseMipLevel = info.range.base.level,
             .levelCount = info.range.extent.levels - info.range.base.level,
-            .baseArrayLayer = info_.range.base.layer,
+            .baseArrayLayer = info.range.base.layer,
             .layerCount = info.range.extent.layers - info.range.base.layer,
         },
     };
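Note (editorial, not part of the patch): Vulkan implementations do not, in practice, advertise storage-image support for *_SRGB formats, so a storage view of an sRGB image has to be created with its UNORM twin, and any linear-to-sRGB conversion becomes the shader's responsibility. The image_view.cpp change does this at the NumberFormat level; expressed directly on Vulkan formats the remap would look like:

```cpp
#include <vulkan/vulkan.hpp>

// Illustrative: swap an sRGB format for its UNORM counterpart before
// creating a storage image view.
inline vk::Format StorageSafeFormat(vk::Format format) {
    switch (format) {
    case vk::Format::eR8G8B8A8Srgb:
        return vk::Format::eR8G8B8A8Unorm;
    case vk::Format::eB8G8R8A8Srgb:
        return vk::Format::eB8G8R8A8Unorm;
    default:
        return format; // already storage-compatible, or left to the caller
    }
}
```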