From 9ec75c3febda3ff7f10b16a690b542b78268924c Mon Sep 17 00:00:00 2001 From: baggins183 Date: Thu, 31 Oct 2024 23:55:53 -0700 Subject: [PATCH] Implement shader resource tables (#1165) * Implement shader resource tables * fix after rebase + squash * address some review comments * fix pipeline_common * cleanup debug stuff * switch to using single codegenerator --- CMakeLists.txt | 1 + src/common/decoder.cpp | 16 +- src/common/decoder.h | 2 + src/common/hash.h | 14 + .../spirv/emit_spirv_context_get_set.cpp | 12 +- .../backend/spirv/emit_spirv_instructions.h | 2 +- .../backend/spirv/spirv_emit_context.cpp | 45 +++- .../backend/spirv/spirv_emit_context.h | 1 + .../frontend/translate/scalar_memory.cpp | 4 + .../frontend/translate/translate.cpp | 5 +- src/shader_recompiler/info.h | 47 +++- src/shader_recompiler/ir/basic_block.cpp | 4 + .../ir/breadth_first_search.h | 30 ++- .../passes/flatten_extended_userdata_pass.cpp | 249 ++++++++++++++++++ src/shader_recompiler/ir/passes/ir_passes.h | 1 + .../ir/passes/resource_tracking_pass.cpp | 92 ++----- .../ir/passes/shader_info_collection_pass.cpp | 3 + src/shader_recompiler/ir/passes/srt.h | 37 +++ src/shader_recompiler/ir/srt_gvn_table.h | 157 +++++++++++ src/shader_recompiler/ir/value.cpp | 53 +++- src/shader_recompiler/ir/value.h | 10 + src/shader_recompiler/recompiler.cpp | 1 + src/shader_recompiler/specialization.h | 10 + src/video_core/buffer_cache/buffer_cache.cpp | 11 +- src/video_core/buffer_cache/buffer_cache.h | 2 + .../renderer_vulkan/vk_compute_pipeline.cpp | 9 + .../renderer_vulkan/vk_graphics_pipeline.cpp | 11 +- .../renderer_vulkan/vk_pipeline_cache.cpp | 10 +- .../renderer_vulkan/vk_pipeline_common.cpp | 16 ++ .../renderer_vulkan/vk_rasterizer.cpp | 4 + 30 files changed, 740 insertions(+), 119 deletions(-) create mode 100644 src/common/hash.h create mode 100644 src/shader_recompiler/ir/passes/flatten_extended_userdata_pass.cpp create mode 100644 src/shader_recompiler/ir/passes/srt.h create mode 100644 src/shader_recompiler/ir/srt_gvn_table.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 04bd6a331..eb085572b 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -590,6 +590,7 @@ set(SHADER_RECOMPILER src/shader_recompiler/exception.h src/shader_recompiler/frontend/structured_control_flow.h src/shader_recompiler/ir/passes/constant_propagation_pass.cpp src/shader_recompiler/ir/passes/dead_code_elimination_pass.cpp + src/shader_recompiler/ir/passes/flatten_extended_userdata_pass.cpp src/shader_recompiler/ir/passes/identity_removal_pass.cpp src/shader_recompiler/ir/passes/ir_passes.h src/shader_recompiler/ir/passes/lower_shared_mem_to_registers.cpp diff --git a/src/common/decoder.cpp b/src/common/decoder.cpp index 249907419..aeaba3ca6 100644 --- a/src/common/decoder.cpp +++ b/src/common/decoder.cpp @@ -13,6 +13,15 @@ DecoderImpl::DecoderImpl() { DecoderImpl::~DecoderImpl() = default; +std::string DecoderImpl::disassembleInst(ZydisDecodedInstruction& inst, + ZydisDecodedOperand* operands, u64 address) { + const int bufLen = 256; + char szBuffer[bufLen]; + ZydisFormatterFormatInstruction(&m_formatter, &inst, operands, inst.operand_count_visible, + szBuffer, sizeof(szBuffer), address, ZYAN_NULL); + return szBuffer; +} + void DecoderImpl::printInstruction(void* code, u64 address) { ZydisDecodedInstruction instruction; ZydisDecodedOperand operands[ZYDIS_MAX_OPERAND_COUNT_VISIBLE]; @@ -27,11 +36,8 @@ void DecoderImpl::printInstruction(void* code, u64 address) { void DecoderImpl::printInst(ZydisDecodedInstruction& inst, ZydisDecodedOperand* 
operands, u64 address) { - const int bufLen = 256; - char szBuffer[bufLen]; - ZydisFormatterFormatInstruction(&m_formatter, &inst, operands, inst.operand_count_visible, - szBuffer, sizeof(szBuffer), address, ZYAN_NULL); - fmt::print("instruction: {}\n", szBuffer); + std::string s = disassembleInst(inst, operands, address); + fmt::print("instruction: {}\n", s); } ZyanStatus DecoderImpl::decodeInstruction(ZydisDecodedInstruction& inst, diff --git a/src/common/decoder.h b/src/common/decoder.h index 1f2219596..a5dadbf19 100644 --- a/src/common/decoder.h +++ b/src/common/decoder.h @@ -14,6 +14,8 @@ public: DecoderImpl(); ~DecoderImpl(); + std::string disassembleInst(ZydisDecodedInstruction& inst, ZydisDecodedOperand* operands, + u64 address); void printInst(ZydisDecodedInstruction& inst, ZydisDecodedOperand* operands, u64 address); void printInstruction(void* code, u64 address); ZyanStatus decodeInstruction(ZydisDecodedInstruction& inst, ZydisDecodedOperand* operands, diff --git a/src/common/hash.h b/src/common/hash.h new file mode 100644 index 000000000..d5cacedd7 --- /dev/null +++ b/src/common/hash.h @@ -0,0 +1,14 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include "common/types.h" + +[[nodiscard]] inline u64 HashCombine(const u64 seed, const u64 hash) { + return seed ^ (hash + 0x9e3779b9 + (seed << 12) + (seed >> 4)); +} + +[[nodiscard]] inline u32 HashCombine(const u32 seed, const u32 hash) { + return seed ^ (hash + 0x9e3779b9 + (seed << 6) + (seed >> 2)); +} \ No newline at end of file diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp index 2d48999c0..064200d99 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp @@ -1,6 +1,7 @@ // SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later +#include "common/assert.h" #include "shader_recompiler/backend/spirv/emit_spirv_instructions.h" #include "shader_recompiler/backend/spirv/spirv_emit_context.h" @@ -146,9 +147,14 @@ void EmitGetGotoVariable(EmitContext&) { UNREACHABLE_MSG("Unreachable instruction"); } -Id EmitReadConst(EmitContext& ctx) { - return ctx.u32_zero_value; - UNREACHABLE_MSG("Unreachable instruction"); +Id EmitReadConst(EmitContext& ctx, IR::Inst* inst) { + u32 flatbuf_off_dw = inst->Flags(); + ASSERT(ctx.srt_flatbuf.binding >= 0); + ASSERT(flatbuf_off_dw > 0); + Id index = ctx.ConstU32(flatbuf_off_dw); + auto& buffer = ctx.srt_flatbuf; + const Id ptr{ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index)}; + return ctx.OpLoad(ctx.U32[1], ptr); } Id EmitReadConstBuffer(EmitContext& ctx, u32 handle, Id index) { diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h index 02b98b343..12361991a 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h +++ b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h @@ -62,7 +62,7 @@ void EmitSetVectorRegister(EmitContext& ctx); void EmitSetGotoVariable(EmitContext& ctx); void EmitGetGotoVariable(EmitContext& ctx); void EmitSetScc(EmitContext& ctx); -Id EmitReadConst(EmitContext& ctx); +Id EmitReadConst(EmitContext& ctx, IR::Inst* inst); Id EmitReadConstBuffer(EmitContext& ctx, u32 handle, Id index); Id 
EmitLoadBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address); Id EmitLoadBufferU32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address); diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp index 6581a7a56..dc404b121 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp @@ -4,12 +4,14 @@ #include "common/assert.h" #include "common/div_ceil.h" #include "shader_recompiler/backend/spirv/spirv_emit_context.h" +#include "shader_recompiler/ir/passes/srt.h" #include "video_core/amdgpu/types.h" #include #include #include +#include namespace Shader::Backend::SPIRV { namespace { @@ -435,14 +437,16 @@ void EmitContext::DefinePushDataBlock() { void EmitContext::DefineBuffers() { boost::container::small_vector type_ids; - const auto define_struct = [&](Id record_array_type, bool is_instance_data) { + const auto define_struct = [&](Id record_array_type, bool is_instance_data, + std::optional explicit_name = {}) { const Id struct_type{TypeStruct(record_array_type)}; if (std::ranges::find(type_ids, record_array_type.value, &Id::value) != type_ids.end()) { return struct_type; } Decorate(record_array_type, spv::Decoration::ArrayStride, 4); - const auto name = is_instance_data ? fmt::format("{}_instance_data_f32", stage) - : fmt::format("{}_cbuf_block_f32", stage); + auto name = is_instance_data ? fmt::format("{}_instance_data_f32", stage) + : fmt::format("{}_cbuf_block_f32", stage); + name = explicit_name.value_or(name); Name(struct_type, name); Decorate(struct_type, spv::Decoration::Block); MemberName(struct_type, 0, "data"); @@ -451,6 +455,29 @@ void EmitContext::DefineBuffers() { return struct_type; }; + if (info.has_readconst) { + const Id data_type = U32[1]; + const auto storage_class = spv::StorageClass::Uniform; + const Id pointer_type = TypePointer(storage_class, data_type); + const Id record_array_type{ + TypeArray(U32[1], ConstU32(static_cast(info.flattened_ud_buf.size())))}; + + const Id struct_type{define_struct(record_array_type, false, "srt_flatbuf_ty")}; + + const Id struct_pointer_type{TypePointer(storage_class, struct_type)}; + const Id id{AddGlobalVariable(struct_pointer_type, storage_class)}; + Decorate(id, spv::Decoration::Binding, binding.unified++); + Decorate(id, spv::Decoration::DescriptorSet, 0U); + Name(id, "srt_flatbuf_ubo"); + + srt_flatbuf = { + .id = id, + .binding = binding.buffer++, + .pointer_type = pointer_type, + }; + interfaces.push_back(id); + } + for (const auto& desc : info.buffers) { const auto sharp = desc.GetSharp(info); const bool is_storage = desc.IsStorage(sharp); @@ -471,7 +498,7 @@ void EmitContext::DefineBuffers() { if (is_storage && !desc.is_written) { Decorate(id, spv::Decoration::NonWritable); } - Name(id, fmt::format("{}_{}", is_storage ? "ssbo" : "cbuf", desc.sgpr_base)); + Name(id, fmt::format("{}_{}", is_storage ? "ssbo" : "cbuf", desc.sharp_idx)); buffers.push_back({ .id = id, @@ -495,7 +522,7 @@ void EmitContext::DefineTextureBuffers() { const Id id{AddGlobalVariable(pointer_type, spv::StorageClass::UniformConstant)}; Decorate(id, spv::Decoration::Binding, binding.unified++); Decorate(id, spv::Decoration::DescriptorSet, 0U); - Name(id, fmt::format("{}_{}", desc.is_written ? "imgbuf" : "texbuf", desc.sgpr_base)); + Name(id, fmt::format("{}_{}", desc.is_written ? 
"imgbuf" : "texbuf", desc.sharp_idx)); texture_buffers.push_back({ .id = id, .binding = binding.buffer++, @@ -582,7 +609,7 @@ spv::ImageFormat GetFormat(const AmdGpu::Image& image) { } Id ImageType(EmitContext& ctx, const ImageResource& desc, Id sampled_type) { - const auto image = ctx.info.ReadUd(desc.sgpr_base, desc.dword_offset); + const auto image = ctx.info.ReadUdSharp(desc.sharp_idx); const auto format = desc.is_atomic ? GetFormat(image) : spv::ImageFormat::Unknown; const u32 sampled = desc.is_storage ? 2 : 1; switch (desc.type) { @@ -618,8 +645,7 @@ void EmitContext::DefineImagesAndSamplers() { const Id id{AddGlobalVariable(pointer_type, spv::StorageClass::UniformConstant)}; Decorate(id, spv::Decoration::Binding, binding.unified++); Decorate(id, spv::Decoration::DescriptorSet, 0U); - Name(id, fmt::format("{}_{}{}_{:02x}", stage, "img", image_desc.sgpr_base, - image_desc.dword_offset)); + Name(id, fmt::format("{}_{}{}", stage, "img", image_desc.sharp_idx)); images.push_back({ .data_types = &data_types, .id = id, @@ -643,8 +669,7 @@ void EmitContext::DefineImagesAndSamplers() { const Id id{AddGlobalVariable(sampler_pointer_type, spv::StorageClass::UniformConstant)}; Decorate(id, spv::Decoration::Binding, binding.unified++); Decorate(id, spv::Decoration::DescriptorSet, 0U); - Name(id, fmt::format("{}_{}{}_{:02x}", stage, "samp", samp_desc.sgpr_base, - samp_desc.dword_offset)); + Name(id, fmt::format("{}_{}{}", stage, "samp", samp_desc.sharp_idx)); samplers.push_back(id); interfaces.push_back(id); } diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.h b/src/shader_recompiler/backend/spirv/spirv_emit_context.h index 147b4c845..fb30a5dd6 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.h +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.h @@ -228,6 +228,7 @@ public: Bindings& binding; boost::container::small_vector buffers; boost::container::small_vector texture_buffers; + BufferDefinition srt_flatbuf; boost::container::small_vector images; boost::container::small_vector samplers; diff --git a/src/shader_recompiler/frontend/translate/scalar_memory.cpp b/src/shader_recompiler/frontend/translate/scalar_memory.cpp index a6f8cafd7..89426e080 100644 --- a/src/shader_recompiler/frontend/translate/scalar_memory.cpp +++ b/src/shader_recompiler/frontend/translate/scalar_memory.cpp @@ -10,6 +10,10 @@ static constexpr u32 SQ_SRC_LITERAL = 0xFF; void Translator::EmitScalarMemory(const GcnInst& inst) { switch (inst.opcode) { // SMRD + case Opcode::S_LOAD_DWORD: + return S_LOAD_DWORD(1, inst); + case Opcode::S_LOAD_DWORDX2: + return S_LOAD_DWORD(2, inst); case Opcode::S_LOAD_DWORDX4: return S_LOAD_DWORD(4, inst); case Opcode::S_LOAD_DWORDX8: diff --git a/src/shader_recompiler/frontend/translate/translate.cpp b/src/shader_recompiler/frontend/translate/translate.cpp index bae6681cb..ccce31a24 100644 --- a/src/shader_recompiler/frontend/translate/translate.cpp +++ b/src/shader_recompiler/frontend/translate/translate.cpp @@ -388,7 +388,7 @@ void Translator::EmitFetch(const GcnInst& inst) { IR::VectorReg dst_reg{attrib.dest_vgpr}; // Read the V# of the attribute to figure out component number and type. 
- const auto buffer = info.ReadUd(attrib.sgpr_base, attrib.dword_offset); + const auto buffer = info.ReadUdReg(attrib.sgpr_base, attrib.dword_offset); for (u32 i = 0; i < 4; i++) { const IR::F32 comp = [&] { switch (buffer.GetSwizzle(i)) { @@ -418,8 +418,7 @@ void Translator::EmitFetch(const GcnInst& inst) { if (step_rate == Info::VsInput::OverStepRate0 || step_rate == Info::VsInput::OverStepRate1) { info.buffers.push_back({ - .sgpr_base = attrib.sgpr_base, - .dword_offset = attrib.dword_offset, + .sharp_idx = info.srt_info.ReserveSharp(attrib.sgpr_base, attrib.dword_offset, 4), .used_types = IR::Type::F32, .is_instance_data = true, }); diff --git a/src/shader_recompiler/info.h b/src/shader_recompiler/info.h index e727c8a08..b69863f4f 100644 --- a/src/shader_recompiler/info.h +++ b/src/shader_recompiler/info.h @@ -2,7 +2,9 @@ // SPDX-License-Identifier: GPL-2.0-or-later #pragma once +#include #include +#include #include #include #include "common/assert.h" @@ -10,6 +12,7 @@ #include "shader_recompiler/backend/bindings.h" #include "shader_recompiler/frontend/copy_shader.h" #include "shader_recompiler/ir/attribute.h" +#include "shader_recompiler/ir/passes/srt.h" #include "shader_recompiler/ir/reg.h" #include "shader_recompiler/ir/type.h" #include "shader_recompiler/params.h" @@ -36,8 +39,7 @@ constexpr u32 NUM_TEXTURE_TYPES = 7; struct Info; struct BufferResource { - u32 sgpr_base; - u32 dword_offset; + u32 sharp_idx; IR::Type used_types; AmdGpu::Buffer inline_cbuf; bool is_gds_buffer{}; @@ -53,8 +55,7 @@ struct BufferResource { using BufferResourceList = boost::container::small_vector; struct TextureBufferResource { - u32 sgpr_base; - u32 dword_offset; + u32 sharp_idx; AmdGpu::NumberFormat nfmt; bool is_written{}; @@ -63,8 +64,7 @@ struct TextureBufferResource { using TextureBufferResourceList = boost::container::small_vector; struct ImageResource { - u32 sgpr_base; - u32 dword_offset; + u32 sharp_idx; AmdGpu::ImageType type; AmdGpu::NumberFormat nfmt; bool is_storage{}; @@ -77,8 +77,7 @@ struct ImageResource { using ImageResourceList = boost::container::small_vector; struct SamplerResource { - u32 sgpr_base; - u32 dword_offset; + u32 sharp_idx; AmdGpu::Sampler inline_sampler{}; u32 associated_image : 4; u32 disable_aniso : 1; @@ -180,6 +179,9 @@ struct Info { ImageResourceList images; SamplerResourceList samplers; + PersistentSrtInfo srt_info; + std::vector flattened_ud_buf; + std::span user_data; Stage stage; @@ -199,6 +201,7 @@ struct Info { bool uses_fp64{}; bool uses_step_rates{}; bool translation_failed{}; // indicates that shader has unsupported instructions + bool has_readconst{}; u8 mrt_mask{0u}; explicit Info(Stage stage_, ShaderParams params) @@ -206,7 +209,12 @@ struct Info { user_data{params.user_data} {} template - T ReadUd(u32 ptr_index, u32 dword_offset) const noexcept { + inline T ReadUdSharp(u32 sharp_idx) const noexcept { + return *reinterpret_cast(&flattened_ud_buf[sharp_idx]); + } + + template + T ReadUdReg(u32 ptr_index, u32 dword_offset) const noexcept { T data; const u32* base = user_data.data(); if (ptr_index != IR::NumScalarRegs) { @@ -228,7 +236,8 @@ struct Info { } void AddBindings(Backend::Bindings& bnd) const { - const auto total_buffers = buffers.size() + texture_buffers.size(); + const auto total_buffers = + buffers.size() + texture_buffers.size() + (has_readconst ? 
1 : 0); bnd.buffer += total_buffers; bnd.unified += total_buffers + images.size() + samplers.size(); bnd.user_data += ud_mask.NumRegs(); @@ -245,22 +254,32 @@ struct Info { } return {vertex_offset, instance_offset}; } + + void RefreshFlatBuf() { + flattened_ud_buf.resize(srt_info.flattened_bufsize_dw); + ASSERT(user_data.size() <= NumUserDataRegs); + std::memcpy(flattened_ud_buf.data(), user_data.data(), user_data.size_bytes()); + // Run the JIT program to walk the SRT and write the leaves to a flat buffer + if (srt_info.walker_func) { + srt_info.walker_func(user_data.data(), flattened_ud_buf.data()); + } + } }; constexpr AmdGpu::Buffer BufferResource::GetSharp(const Info& info) const noexcept { - return inline_cbuf ? inline_cbuf : info.ReadUd(sgpr_base, dword_offset); + return inline_cbuf ? inline_cbuf : info.ReadUdSharp(sharp_idx); } constexpr AmdGpu::Buffer TextureBufferResource::GetSharp(const Info& info) const noexcept { - return info.ReadUd(sgpr_base, dword_offset); + return info.ReadUdSharp(sharp_idx); } constexpr AmdGpu::Image ImageResource::GetSharp(const Info& info) const noexcept { - return info.ReadUd(sgpr_base, dword_offset); + return info.ReadUdSharp(sharp_idx); } constexpr AmdGpu::Sampler SamplerResource::GetSharp(const Info& info) const noexcept { - return inline_sampler ? inline_sampler : info.ReadUd(sgpr_base, dword_offset); + return inline_sampler ? inline_sampler : info.ReadUdSharp(sharp_idx); } } // namespace Shader diff --git a/src/shader_recompiler/ir/basic_block.cpp b/src/shader_recompiler/ir/basic_block.cpp index 60ba0647a..426acb2b8 100644 --- a/src/shader_recompiler/ir/basic_block.cpp +++ b/src/shader_recompiler/ir/basic_block.cpp @@ -118,6 +118,10 @@ std::string DumpBlock(const Block& block, const std::map& } else { ret += fmt::format(" {}", op); // '%00000 = ' -> 1 + 5 + 3 = 9 spaces } + + if (op == Opcode::ReadConst) { + ret += fmt::format(" (flags={}) ", inst.Flags()); + } const size_t arg_count{inst.NumArgs()}; for (size_t arg_index = 0; arg_index < arg_count; ++arg_index) { const Value arg{inst.Arg(arg_index)}; diff --git a/src/shader_recompiler/ir/breadth_first_search.h b/src/shader_recompiler/ir/breadth_first_search.h index 0156303f0..b042ae3d6 100644 --- a/src/shader_recompiler/ir/breadth_first_search.h +++ b/src/shader_recompiler/ir/breadth_first_search.h @@ -11,34 +11,37 @@ namespace Shader::IR { -template -auto BreadthFirstSearch(const Inst* inst, Pred&& pred) -> std::invoke_result_t { +// Use typename Instruction so the function can be used to return either const or mutable +// Insts depending on the context. +template +auto BreadthFirstSearch(Instruction* inst, Pred&& pred) + -> std::invoke_result_t { // Most often case the instruction is the desired already. 
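BreadthFirstSearch is templated on the instruction type so that passes which intend to mutate the producer they find, like the flatten pass added later in this patch, can get a non-const IR::Inst* back. A minimal fragment in the style of that pass (ptr_composite stands for any pointer-forming instruction and is not defined here):

// Locate the GetUserData/ReadConst producer feeding a pointer composite, keeping the
// result mutable so the caller can later rewrite its flags.
const auto pred = [](IR::Inst* inst) -> std::optional<IR::Inst*> {
    if (inst->GetOpcode() == IR::Opcode::GetUserData ||
        inst->GetOpcode() == IR::Opcode::ReadConst) {
        return inst;
    }
    return std::nullopt;
};
const std::optional<IR::Inst*> base = IR::BreadthFirstSearch(ptr_composite->Arg(0), pred);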
- if (const std::optional result = pred(inst)) { + if (std::optional result = pred(inst)) { return result; } // Breadth-first search visiting the right most arguments first - boost::container::small_vector visited; - std::queue queue; + boost::container::small_vector visited; + std::queue queue; queue.push(inst); while (!queue.empty()) { // Pop one instruction from the queue - const Inst* const inst{queue.front()}; + Instruction* inst{queue.front()}; queue.pop(); - if (const std::optional result = pred(inst)) { + if (std::optional result = pred(inst)) { // This is the instruction we were looking for return result; } // Visit the right most arguments first for (size_t arg = inst->NumArgs(); arg--;) { - const Value arg_value{inst->Arg(arg)}; + Value arg_value{inst->Arg(arg)}; if (arg_value.IsImmediate()) { continue; } // Queue instruction if it hasn't been visited - const Inst* const arg_inst{arg_value.InstRecursive()}; + Instruction* arg_inst{arg_value.InstRecursive()}; if (std::ranges::find(visited, arg_inst) == visited.end()) { visited.push_back(arg_inst); queue.push(arg_inst); @@ -59,4 +62,13 @@ auto BreadthFirstSearch(const Value& value, Pred&& pred) return BreadthFirstSearch(value.InstRecursive(), pred); } +template +auto BreadthFirstSearch(Value value, Pred&& pred) -> std::invoke_result_t { + if (value.IsImmediate()) { + // Nothing to do with immediates + return std::nullopt; + } + return BreadthFirstSearch(value.InstRecursive(), pred); +} + } // namespace Shader::IR diff --git a/src/shader_recompiler/ir/passes/flatten_extended_userdata_pass.cpp b/src/shader_recompiler/ir/passes/flatten_extended_userdata_pass.cpp new file mode 100644 index 000000000..6292edfd8 --- /dev/null +++ b/src/shader_recompiler/ir/passes/flatten_extended_userdata_pass.cpp @@ -0,0 +1,249 @@ + +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include +#include +#include "common/config.h" +#include "common/io_file.h" +#include "common/logging/log.h" +#include "common/path_util.h" +#include "shader_recompiler/info.h" +#include "shader_recompiler/ir/breadth_first_search.h" +#include "shader_recompiler/ir/opcodes.h" +#include "shader_recompiler/ir/passes/srt.h" +#include "shader_recompiler/ir/program.h" +#include "shader_recompiler/ir/reg.h" +#include "shader_recompiler/ir/srt_gvn_table.h" +#include "shader_recompiler/ir/value.h" +#include "src/common/arch.h" +#include "src/common/decoder.h" + +using namespace Xbyak::util; + +static Xbyak::CodeGenerator g_srt_codegen(32_MB); + +namespace { + +static void DumpSrtProgram(const Shader::Info& info, const u8* code, size_t codesize) { +#ifdef ARCH_X86_64 + using namespace Common::FS; + + const auto dump_dir = GetUserPath(PathType::ShaderDir) / "dumps"; + if (!std::filesystem::exists(dump_dir)) { + std::filesystem::create_directories(dump_dir); + } + const auto filename = fmt::format("{}_{:#018x}.srtprogram.txt", info.stage, info.pgm_hash); + const auto file = IOFile{dump_dir / filename, FileAccessMode::Write, FileType::TextFile}; + + u64 address = reinterpret_cast(code); + u64 code_end = address + codesize; + ZydisDecodedInstruction instruction; + ZydisDecodedOperand operands[ZYDIS_MAX_OPERAND_COUNT]; + ZyanStatus status = ZYAN_STATUS_SUCCESS; + while (address < code_end && ZYAN_SUCCESS(Common::Decoder::Instance()->decodeInstruction( + instruction, operands, reinterpret_cast(address)))) { + std::string s = + Common::Decoder::Instance()->disassembleInst(instruction, operands, address); + 
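// Note: one disassembled instruction of the generated SRT walker is appended per loop
// iteration, so the dump file opened above (<ShaderDir>/dumps/<stage>_<pgm_hash>.srtprogram.txt)
// ends up as a flat x86 listing of the JIT-ed walker.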
s += "\n"; + file.WriteString(s); + address += instruction.length; + } +#endif +} + +using namespace Shader; + +struct PassInfo { + // map offset to inst + using PtrUserList = boost::container::flat_map; + + Optimization::SrtGvnTable gvn_table; + // keys are GetUserData or ReadConst instructions that are used as pointers + std::unordered_map pointer_uses; + // GetUserData instructions corresponding to sgpr_base of SRT roots + boost::container::small_flat_map srt_roots; + + // pick a single inst for a given value number + std::unordered_map vn_to_inst; + + // Bumped during codegen to assign offsets to readconsts + u32 dst_off_dw; + + PtrUserList* GetUsesAsPointer(IR::Inst* inst) { + auto it = pointer_uses.find(inst); + if (it != pointer_uses.end()) { + return &it->second; + } + return nullptr; + } + + // Return a single instruction that this instruction is identical to, according + // to value number + // The "original" is arbitrary. Here it's the first instruction found for a given value number + IR::Inst* DeduplicateInstruction(IR::Inst* inst) { + auto it = vn_to_inst.try_emplace(gvn_table.GetValueNumber(inst), inst); + return it.first->second; + } +}; +} // namespace + +namespace Shader::Optimization { + +namespace { + +static inline void PushPtr(Xbyak::CodeGenerator& c, u32 off_dw) { + c.push(rdi); + c.mov(rdi, ptr[rdi + (off_dw << 2)]); + c.mov(r10, 0xFFFFFFFFFFFFULL); + c.and_(rdi, r10); +} + +static inline void PopPtr(Xbyak::CodeGenerator& c) { + c.pop(rdi); +}; + +static void VisitPointer(u32 off_dw, IR::Inst* subtree, PassInfo& pass_info, + Xbyak::CodeGenerator& c) { + PushPtr(c, off_dw); + PassInfo::PtrUserList* use_list = pass_info.GetUsesAsPointer(subtree); + ASSERT(use_list); + + // First copy all the src data from this tree level + // That way, all data that was contiguous in the guest SRT is also contiguous in the + // flattened buffer. + // TODO src and dst are contiguous. 
Optimize with wider loads/stores + // TODO if this subtree is dynamically indexed, don't compact it (keep it sparse) + for (auto [src_off_dw, use] : *use_list) { + c.mov(r10d, ptr[rdi + (src_off_dw << 2)]); + c.mov(ptr[rsi + (pass_info.dst_off_dw << 2)], r10d); + + use->SetFlags(pass_info.dst_off_dw); + pass_info.dst_off_dw++; + } + + // Then visit any children used as pointers + for (const auto [src_off_dw, use] : *use_list) { + if (pass_info.GetUsesAsPointer(use)) { + VisitPointer(src_off_dw, use, pass_info, c); + } + } + + PopPtr(c); +} + +static void GenerateSrtProgram(Info& info, PassInfo& pass_info) { + Xbyak::CodeGenerator& c = g_srt_codegen; + + if (info.srt_info.srt_reservations.empty() && pass_info.srt_roots.empty()) { + return; + } + + info.srt_info.walker_func = c.getCurr(); + + pass_info.dst_off_dw = NumUserDataRegs; + + // Special case for V# step rate buffers in fetch shader + for (const auto [sgpr_base, dword_offset, num_dwords] : info.srt_info.srt_reservations) { + // get pointer to V# + c.mov(r10d, ptr[rdi + (sgpr_base << 2)]); + + u32 src_off = dword_offset << 2; + + for (auto j = 0; j < num_dwords; j++) { + c.mov(r11d, ptr[r10d + src_off]); + c.mov(ptr[rsi + (pass_info.dst_off_dw << 2)], r11d); + + src_off += 4; + ++pass_info.dst_off_dw; + } + } + + ASSERT(pass_info.dst_off_dw == info.srt_info.flattened_bufsize_dw); + + for (const auto& [sgpr_base, root] : pass_info.srt_roots) { + VisitPointer(static_cast(sgpr_base), root, pass_info, c); + } + + c.ret(); + c.ready(); + + if (Config::dumpShaders()) { + size_t codesize = c.getCurr() - reinterpret_cast(info.srt_info.walker_func); + DumpSrtProgram(info, reinterpret_cast(info.srt_info.walker_func), codesize); + } + + info.srt_info.flattened_bufsize_dw = pass_info.dst_off_dw; +} + +}; // namespace + +void FlattenExtendedUserdataPass(IR::Program& program) { + Shader::Info& info = program.info; + PassInfo pass_info; + + // traverse at end and assign offsets to duplicate readconsts, using + // vn_to_inst as the source + boost::container::small_vector all_readconsts; + + for (auto r_it = program.post_order_blocks.rbegin(); r_it != program.post_order_blocks.rend(); + r_it++) { + IR::Block* block = *r_it; + for (IR::Inst& inst : *block) { + if (inst.GetOpcode() == IR::Opcode::ReadConst) { + if (!inst.Arg(1).IsImmediate()) { + LOG_WARNING(Render_Recompiler, "ReadConst has non-immediate offset"); + continue; + } + + all_readconsts.push_back(&inst); + if (pass_info.DeduplicateInstruction(&inst) != &inst) { + // This is a duplicate of a readconst we've already visited + continue; + } + + IR::Inst* ptr_composite = inst.Arg(0).InstRecursive(); + + const auto pred = [](IR::Inst* inst) -> std::optional { + if (inst->GetOpcode() == IR::Opcode::GetUserData || + inst->GetOpcode() == IR::Opcode::ReadConst) { + return inst; + } + return std::nullopt; + }; + auto base0 = IR::BreadthFirstSearch(ptr_composite->Arg(0), pred); + auto base1 = IR::BreadthFirstSearch(ptr_composite->Arg(1), pred); + ASSERT_MSG(base0 && base1 && "ReadConst not from constant memory"); + + IR::Inst* ptr_lo = base0.value(); + ptr_lo = pass_info.DeduplicateInstruction(ptr_lo); + + auto ptr_uses_kv = + pass_info.pointer_uses.try_emplace(ptr_lo, PassInfo::PtrUserList{}); + PassInfo::PtrUserList& user_list = ptr_uses_kv.first->second; + + user_list[inst.Arg(1).U32()] = &inst; + + if (ptr_lo->GetOpcode() == IR::Opcode::GetUserData) { + IR::ScalarReg ud_reg = ptr_lo->Arg(0).ScalarReg(); + pass_info.srt_roots[ud_reg] = ptr_lo; + } + } + } + } + + GenerateSrtProgram(info, 
pass_info); + + // Assign offsets to duplicate readconsts + for (IR::Inst* readconst : all_readconsts) { + ASSERT(pass_info.vn_to_inst.contains(pass_info.gvn_table.GetValueNumber(readconst))); + IR::Inst* original = pass_info.DeduplicateInstruction(readconst); + readconst->SetFlags(original->Flags()); + } + + info.RefreshFlatBuf(); +} + +} // namespace Shader::Optimization \ No newline at end of file diff --git a/src/shader_recompiler/ir/passes/ir_passes.h b/src/shader_recompiler/ir/passes/ir_passes.h index e6e389d15..7bd47992c 100644 --- a/src/shader_recompiler/ir/passes/ir_passes.h +++ b/src/shader_recompiler/ir/passes/ir_passes.h @@ -12,6 +12,7 @@ void SsaRewritePass(IR::BlockList& program); void IdentityRemovalPass(IR::BlockList& program); void DeadCodeEliminationPass(IR::Program& program); void ConstantPropagationPass(IR::BlockList& program); +void FlattenExtendedUserdataPass(IR::Program& program); void ResourceTrackingPass(IR::Program& program); void CollectShaderInfoPass(IR::Program& program); void LowerSharedMemToRegisters(IR::Program& program); diff --git a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp index aa05d3aed..6c8809cf0 100644 --- a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp +++ b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp @@ -13,12 +13,7 @@ namespace Shader::Optimization { namespace { -struct SharpLocation { - u32 sgpr_base; - u32 dword_offset; - - auto operator<=>(const SharpLocation&) const = default; -}; +using SharpLocation = u32; bool IsBufferAtomic(const IR::Inst& inst) { switch (inst.GetOpcode()) { @@ -155,9 +150,7 @@ public: if (desc.is_gds_buffer && existing.is_gds_buffer) { return true; } - return desc.sgpr_base == existing.sgpr_base && - desc.dword_offset == existing.dword_offset && - desc.inline_cbuf == existing.inline_cbuf; + return desc.sharp_idx == existing.sharp_idx && desc.inline_cbuf == existing.inline_cbuf; })}; auto& buffer = buffer_resources[index]; buffer.used_types |= desc.used_types; @@ -167,8 +160,7 @@ public: u32 Add(const TextureBufferResource& desc) { const u32 index{Add(texture_buffer_resources, desc, [&desc](const auto& existing) { - return desc.sgpr_base == existing.sgpr_base && - desc.dword_offset == existing.dword_offset; + return desc.sharp_idx == existing.sharp_idx; })}; auto& buffer = texture_buffer_resources[index]; buffer.is_written |= desc.is_written; @@ -177,8 +169,7 @@ public: u32 Add(const ImageResource& desc) { const u32 index{Add(image_resources, desc, [&desc](const auto& existing) { - return desc.sgpr_base == existing.sgpr_base && - desc.dword_offset == existing.dword_offset; + return desc.sharp_idx == existing.sharp_idx; })}; auto& image = image_resources[index]; image.is_storage |= desc.is_storage; @@ -187,8 +178,7 @@ public: u32 Add(const SamplerResource& desc) { const u32 index{Add(sampler_resources, desc, [this, &desc](const auto& existing) { - return desc.sgpr_base == existing.sgpr_base && - desc.dword_offset == existing.dword_offset; + return desc.sharp_idx == existing.sharp_idx; })}; return index; } @@ -259,48 +249,25 @@ std::pair TryDisableAnisoLod0(const IR::Inst* inst) { return {prod2, true}; } -SharpLocation TrackSharp(const IR::Inst* inst) { +SharpLocation TrackSharp(const IR::Inst* inst, const Shader::Info& info) { // Search until we find a potential sharp source. 
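// With this patch a SharpLocation is simply a dword index into info.flattened_ud_buf:
//   GetUserData(s[N])    -> N (the first NumUserDataRegs dwords mirror the user-data registers)
//   ReadConst(base, off) -> the flat-buffer offset the flatten pass stored in the inst flags
// Either way, Info::ReadUdSharp reads the descriptor back out of the flattened buffer.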
- const auto pred0 = [](const IR::Inst* inst) -> std::optional { + const auto pred = [](const IR::Inst* inst) -> std::optional { if (inst->GetOpcode() == IR::Opcode::GetUserData || inst->GetOpcode() == IR::Opcode::ReadConst) { return inst; } return std::nullopt; }; - const auto result = IR::BreadthFirstSearch(inst, pred0); + const auto result = IR::BreadthFirstSearch(inst, pred); ASSERT_MSG(result, "Unable to track sharp source"); inst = result.value(); - // If its from user data not much else to do. if (inst->GetOpcode() == IR::Opcode::GetUserData) { - return SharpLocation{ - .sgpr_base = u32(IR::ScalarReg::Max), - .dword_offset = u32(inst->Arg(0).ScalarReg()), - }; + return static_cast(inst->Arg(0).ScalarReg()); + } else { + ASSERT_MSG(inst->GetOpcode() == IR::Opcode::ReadConst, + "Sharp load not from constant memory"); + return inst->Flags(); } - ASSERT_MSG(inst->GetOpcode() == IR::Opcode::ReadConst, "Sharp load not from constant memory"); - - // Retrieve offset from base. - const u32 dword_offset = inst->Arg(1).U32(); - const IR::Inst* spgpr_base = inst->Arg(0).InstRecursive(); - - // Retrieve SGPR pair that holds sbase - const auto pred1 = [](const IR::Inst* inst) -> std::optional { - ASSERT(inst->GetOpcode() != IR::Opcode::ReadConst); - if (inst->GetOpcode() == IR::Opcode::GetUserData) { - return inst->Arg(0).ScalarReg(); - } - return std::nullopt; - }; - const auto base0 = IR::BreadthFirstSearch(spgpr_base->Arg(0), pred1); - const auto base1 = IR::BreadthFirstSearch(spgpr_base->Arg(1), pred1); - ASSERT_MSG(base0 && base1, "Nested resource loads not supported"); - - // Return retrieved location. - return SharpLocation{ - .sgpr_base = u32(base0.value()), - .dword_offset = dword_offset, - }; } s32 TryHandleInlineCbuf(IR::Inst& inst, Info& info, Descriptors& descriptors, @@ -327,8 +294,7 @@ s32 TryHandleInlineCbuf(IR::Inst& inst, Info& info, Descriptors& descriptors, cbuf = std::bit_cast(buffer); // Assign a binding to this sharp. 
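// Inline cbufs have no slot in the flattened user-data buffer, so sharp_idx is set to the
// u32 sentinel below; BufferResource::GetSharp() returns inline_cbuf directly and the
// sentinel index is never dereferenced.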
return descriptors.Add(BufferResource{ - .sgpr_base = std::numeric_limits::max(), - .dword_offset = 0, + .sharp_idx = std::numeric_limits::max(), .used_types = BufferDataType(inst, cbuf.GetNumberFmt()), .inline_cbuf = cbuf, }); @@ -341,11 +307,10 @@ void PatchBufferInstruction(IR::Block& block, IR::Inst& inst, Info& info, if (binding = TryHandleInlineCbuf(inst, info, descriptors, buffer); binding == -1) { IR::Inst* handle = inst.Arg(0).InstRecursive(); IR::Inst* producer = handle->Arg(0).InstRecursive(); - const auto sharp = TrackSharp(producer); - buffer = info.ReadUd(sharp.sgpr_base, sharp.dword_offset); + const auto sharp = TrackSharp(producer, info); + buffer = info.ReadUdSharp(sharp); binding = descriptors.Add(BufferResource{ - .sgpr_base = sharp.sgpr_base, - .dword_offset = sharp.dword_offset, + .sharp_idx = sharp, .used_types = BufferDataType(inst, buffer.GetNumberFmt()), .is_written = IsBufferStore(inst), }); @@ -404,11 +369,10 @@ void PatchTextureBufferInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descriptors& descriptors) { const IR::Inst* handle = inst.Arg(0).InstRecursive(); const IR::Inst* producer = handle->Arg(0).InstRecursive(); - const auto sharp = TrackSharp(producer); - const auto buffer = info.ReadUd(sharp.sgpr_base, sharp.dword_offset); + const auto sharp = TrackSharp(producer, info); + const auto buffer = info.ReadUdSharp(sharp); const s32 binding = descriptors.Add(TextureBufferResource{ - .sgpr_base = sharp.sgpr_base, - .dword_offset = sharp.dword_offset, + .sharp_idx = sharp, .nfmt = buffer.GetNumberFmt(), .is_written = inst.GetOpcode() == IR::Opcode::StoreBufferFormatF32, }); @@ -456,18 +420,16 @@ void PatchImageSampleInstruction(IR::Block& block, IR::Inst& inst, Info& info, if (handle.IsImmediate()) { LOG_WARNING(Render_Vulkan, "Inline sampler detected"); return descriptors.Add(SamplerResource{ - .sgpr_base = std::numeric_limits::max(), - .dword_offset = 0, + .sharp_idx = std::numeric_limits::max(), .inline_sampler = AmdGpu::Sampler{.raw0 = handle.U32()}, }); } // Normal sampler resource. const auto ssharp_handle = handle.InstRecursive(); const auto& [ssharp_ud, disable_aniso] = TryDisableAnisoLod0(ssharp_handle); - const auto ssharp = TrackSharp(ssharp_ud); + const auto ssharp = TrackSharp(ssharp_ud, info); return descriptors.Add(SamplerResource{ - .sgpr_base = ssharp.sgpr_base, - .dword_offset = ssharp.dword_offset, + .sharp_idx = ssharp, .associated_image = image_binding, .disable_aniso = disable_aniso, }); @@ -647,9 +609,9 @@ void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descrip const auto tsharp_handle = has_sampler ? producer->Arg(0).InstRecursive() : producer; // Read image sharp. - const auto tsharp = TrackSharp(tsharp_handle); + const auto tsharp = TrackSharp(tsharp_handle, info); const auto inst_info = inst.Flags(); - auto image = info.ReadUd(tsharp.sgpr_base, tsharp.dword_offset); + auto image = info.ReadUdSharp(tsharp); if (!image.Valid()) { LOG_ERROR(Render_Vulkan, "Shader compiled with unbound image!"); image = AmdGpu::Image::Null(); @@ -658,8 +620,7 @@ void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descrip const bool is_storage = IsImageStorageInstruction(inst); const auto type = image.IsPartialCubemap() ? 
AmdGpu::ImageType::Color2DArray : image.GetType(); u32 image_binding = descriptors.Add(ImageResource{ - .sgpr_base = tsharp.sgpr_base, - .dword_offset = tsharp.dword_offset, + .sharp_idx = tsharp, .type = type, .nfmt = image.GetNumberFmt(), .is_storage = is_storage, @@ -763,6 +724,7 @@ void PatchDataRingInstruction(IR::Block& block, IR::Inst& inst, Info& info, void ResourceTrackingPass(IR::Program& program) { // Iterate resource instructions and patch them after finding the sharp. auto& info = program.info; + Descriptors descriptors{info}; for (IR::Block* const block : program.blocks) { for (IR::Inst& inst : block->Instructions()) { diff --git a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp index e995852d5..8b93d72e3 100644 --- a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp +++ b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp @@ -63,6 +63,9 @@ void Visit(Info& info, IR::Inst& inst) { case IR::Opcode::LaneId: info.uses_lane_id = true; break; + case IR::Opcode::ReadConst: + info.has_readconst = true; + break; default: break; } diff --git a/src/shader_recompiler/ir/passes/srt.h b/src/shader_recompiler/ir/passes/srt.h new file mode 100644 index 000000000..0ddc15ea6 --- /dev/null +++ b/src/shader_recompiler/ir/passes/srt.h @@ -0,0 +1,37 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include +#include +#include "common/types.h" + +namespace Shader { + +using PFN_SrtWalker = void PS4_SYSV_ABI (*)(const u32* /*user_data*/, u32* /*flat_dst*/); + +struct PersistentSrtInfo { + // Special case when fetch shader uses step rates. + struct SrtSharpReservation { + u32 sgpr_base; + u32 dword_offset; + u32 num_dwords; + }; + + PFN_SrtWalker walker_func{}; + boost::container::small_vector srt_reservations; + u32 flattened_bufsize_dw = 16; // NumUserDataRegs + + // Special case for fetch shaders because we don't generate IR to read from step rate buffers, + // so we won't see usage with GetUserData/ReadConst. + // Reserve space in the flattened buffer for a sharp ahead of time + u32 ReserveSharp(u32 sgpr_base, u32 dword_offset, u32 num_dwords) { + u32 rv = flattened_bufsize_dw; + srt_reservations.emplace_back(sgpr_base, dword_offset, num_dwords); + flattened_bufsize_dw += num_dwords; + return rv; + } +}; + +} // namespace Shader \ No newline at end of file diff --git a/src/shader_recompiler/ir/srt_gvn_table.h b/src/shader_recompiler/ir/srt_gvn_table.h new file mode 100644 index 000000000..232ee6152 --- /dev/null +++ b/src/shader_recompiler/ir/srt_gvn_table.h @@ -0,0 +1,157 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include +#include "common/assert.h" +#include "common/hash.h" +#include "common/types.h" +#include "shader_recompiler/ir/breadth_first_search.h" +#include "shader_recompiler/ir/opcodes.h" +#include "shader_recompiler/ir/value.h" + +namespace Shader::Optimization { + +// Does global value numbering on a subset of instructions that are used +// for loads from shader resource tables. 
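// Illustrative example (SSA numbers made up): two instructions whose (opcode, flags,
// argument value numbers) tuples match share one value number,
//   %10 = ReadConst %5, #8
//   %25 = ReadConst %5, #8   <- same InstVector, so same value number as %10
// which lets the flatten pass copy that dword once and reuse a single flat-buffer offset.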
+// Inspiration from spirv-opt + +class SrtGvnTable { +public: + using ValueNumberTable = std::unordered_map; + using ValueNum = u32; + + SrtGvnTable() : value_numbers(), next_num(0) {} + + u32 GetValueNumber(IR::Inst* inst) { + return GetValueNumber(IR::Value{inst}); + } + + u32 GetValueNumber(IR::Value v) { + v = v.Resolve(); + if (auto it = value_numbers.find(v); it != value_numbers.end()) { + return it->second; + } + if (auto inst = v.TryInstRecursive()) { + return ComputeInstValueNumber(inst); + } + return NextValueNumber(v); + } + +private: + u32 ComputeInstValueNumber(IR::Inst* inst) { + ASSERT(!value_numbers.contains( + IR::Value(inst))); // Should always be checking before calling this function + + if (inst->MayHaveSideEffects()) { + return NextValueNumber(IR::Value(inst)); + } + + u32 vn; + + switch (inst->GetOpcode()) { + case IR::Opcode::Phi: { + // hack to get to parity with main + // Need to fix ssa_rewrite pass to remove certain phis + std::optional source = TryRemoveTrivialPhi(inst); + if (!source) { + const auto pred = [](IR::Inst* inst) -> std::optional { + if (inst->GetOpcode() == IR::Opcode::GetUserData || + inst->GetOpcode() == IR::Opcode::CompositeConstructU32x2 || + inst->GetOpcode() == IR::Opcode::ReadConst) { + return inst; + } + return std::nullopt; + }; + source = IR::BreadthFirstSearch(inst, pred).transform([](auto inst) { + return IR::Value{inst}; + }); + ASSERT(source); + } + vn = GetValueNumber(source.value()); + value_numbers[IR::Value(inst)] = vn; + break; + } + case IR::Opcode::GetUserData: + case IR::Opcode::CompositeConstructU32x2: + case IR::Opcode::ReadConst: { + InstVector iv = MakeInstVector(inst); + if (auto it = iv_to_vn.find(iv); it != iv_to_vn.end()) { + vn = it->second; + value_numbers[IR::Value(inst)] = vn; + } else { + vn = NextValueNumber(IR::Value(inst)); + iv_to_vn.emplace(std::move(iv), vn); + } + break; + } + default: + vn = NextValueNumber(IR::Value(inst)); + break; + } + + return vn; + } + + u32 NextValueNumber(IR::Value v) { + u32 rv = next_num++; + value_numbers[v] = rv; + return rv; + } + + ValueNumberTable value_numbers; + u32 next_num; + + using InstVector = boost::container::small_vector; + + InstVector MakeInstVector(IR::Inst* inst) { + ASSERT(inst->GetOpcode() != IR::Opcode::Identity); + InstVector iv; + iv.reserve(2 + inst->NumArgs()); + iv.push_back(static_cast(inst->GetOpcode())); + iv.push_back(inst->Flags()); + for (auto i = 0; i < inst->NumArgs(); i++) { + iv.push_back(GetValueNumber(inst->Arg(i))); + } + return iv; + } + + // Temp workaround for something like this: + // [0000555558a5baf8] %297 = Phi [ %24, {Block $1} ], [ %297, {Block $5} ] (uses: 4) + // [0000555558a4e038] %305 = CompositeConstructU32x2 %297, %296 (uses: 4) + // [0000555558a4e0a8] %306 = ReadConst %305, #0 (uses: 2) + // Should probably be fixed in ssa_rewrite + std::optional TryRemoveTrivialPhi(IR::Inst* phi) { + IR::Value single_source{}; + + for (auto i = 0; i < phi->NumArgs(); i++) { + IR::Value v = phi->Arg(i).Resolve(); + if (v == IR::Value(phi)) { + continue; + } + if (!single_source.IsEmpty() && single_source != v) { + return std::nullopt; + } + single_source = v; + } + + ASSERT(!single_source.IsEmpty()); + phi->ReplaceUsesWith(single_source); + return single_source; + } + + struct HashInstVector { + size_t operator()(const InstVector& iv) const { + u32 h = 0; + for (auto vn : iv) { + h = HashCombine(vn, h); + } + return h; + } + }; + + std::unordered_map iv_to_vn; +}; + +} // namespace Shader::Optimization \ No newline at end of file diff --git 
a/src/shader_recompiler/ir/value.cpp b/src/shader_recompiler/ir/value.cpp index cf7a70f76..889e99556 100644 --- a/src/shader_recompiler/ir/value.cpp +++ b/src/shader_recompiler/ir/value.cpp @@ -1,7 +1,9 @@ // SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later -#include +#include +#include +#include "common/hash.h" #include "shader_recompiler/ir/value.h" namespace Shader::IR { @@ -97,3 +99,52 @@ bool Value::operator!=(const Value& other) const { } } // namespace Shader::IR + +namespace std { +std::size_t hash::operator()(const Shader::IR::Value& v) const { + using namespace Shader::IR; + + u64 h = HashCombine(static_cast(v.type), 0ULL); + + switch (v.type) { + case Type::Void: + return h; + case Type::Opaque: + return reinterpret_cast(v.InstRecursive()); + case Type::ScalarReg: + return HashCombine(static_cast(v.sreg), h); + case Type::VectorReg: + return HashCombine(static_cast(v.vreg), h); + case Type::Attribute: + return HashCombine(static_cast(v.attribute), h); + case Type::U1: + return HashCombine(static_cast(v.attribute), h); + case Type::U8: + return HashCombine(static_cast(v.imm_u8), h); + case Type::U16: + case Type::F16: + return HashCombine(static_cast(v.imm_u16), h); + case Type::U32: + case Type::F32: + return HashCombine(static_cast(v.imm_u32), h); + case Type::U64: + case Type::F64: + return HashCombine(static_cast(v.imm_u64), h); + case Type::U32x2: + case Type::U32x3: + case Type::U32x4: + case Type::F16x2: + case Type::F16x3: + case Type::F16x4: + case Type::F32x2: + case Type::F32x3: + case Type::F32x4: + case Type::F64x2: + case Type::F64x3: + case Type::F64x4: + default: + break; + } + UNREACHABLE_MSG("Invalid type {}", v.type); +} +} // namespace std diff --git a/src/shader_recompiler/ir/value.h b/src/shader_recompiler/ir/value.h index a282b9168..7e46747b9 100644 --- a/src/shader_recompiler/ir/value.h +++ b/src/shader_recompiler/ir/value.h @@ -29,6 +29,7 @@ class Value { public: Value() noexcept = default; explicit Value(IR::Inst* value) noexcept; + explicit Value(const IR::Inst* value) noexcept; explicit Value(IR::ScalarReg reg) noexcept; explicit Value(IR::VectorReg reg) noexcept; explicit Value(IR::Attribute value) noexcept; @@ -82,6 +83,8 @@ private: f64 imm_f64; const char* string_literal; }; + + friend class std::hash; }; static_assert(static_cast(IR::Type::Void) == 0, "memset relies on IR::Type being zero"); static_assert(std::is_trivially_copyable_v); @@ -364,3 +367,10 @@ inline const char* Value::StringLiteral() const { } } // namespace Shader::IR + +namespace std { +template <> +struct hash { + std::size_t operator()(const Shader::IR::Value& v) const; +}; +} // namespace std \ No newline at end of file diff --git a/src/shader_recompiler/recompiler.cpp b/src/shader_recompiler/recompiler.cpp index e13e5d009..19579f665 100644 --- a/src/shader_recompiler/recompiler.cpp +++ b/src/shader_recompiler/recompiler.cpp @@ -64,6 +64,7 @@ IR::Program TranslateProgram(std::span code, Pools& pools, Info& info Shader::Optimization::LowerSharedMemToRegisters(program); } Shader::Optimization::RingAccessElimination(program, runtime_info, program.info.stage); + Shader::Optimization::FlattenExtendedUserdataPass(program); Shader::Optimization::ResourceTrackingPass(program); Shader::Optimization::IdentityRemovalPass(program.blocks); Shader::Optimization::DeadCodeEliminationPass(program); diff --git a/src/shader_recompiler/specialization.h b/src/shader_recompiler/specialization.h index 0a3a696bc..c25c611e4 100644 --- 
a/src/shader_recompiler/specialization.h +++ b/src/shader_recompiler/specialization.h @@ -8,6 +8,7 @@ #include "common/types.h" #include "shader_recompiler/backend/bindings.h" #include "shader_recompiler/info.h" +#include "shader_recompiler/ir/passes/srt.h" namespace Shader { @@ -52,6 +53,9 @@ struct StageSpecialization { Backend::Bindings start_) : info{&info_}, runtime_info{runtime_info_}, start{start_} { u32 binding{}; + if (info->has_readconst) { + binding++; + } ForEachSharp(binding, buffers, info->buffers, [](auto& spec, const auto& desc, AmdGpu::Buffer sharp) { spec.stride = sharp.GetStride(); @@ -90,6 +94,12 @@ struct StageSpecialization { return false; } u32 binding{}; + if (info->has_readconst != other.info->has_readconst) { + return false; + } + if (info->has_readconst) { + binding++; + } for (u32 i = 0; i < buffers.size(); i++) { if (other.bitset[binding++] && buffers[i] != other.buffers[i]) { return false; diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp index f665ba512..b15eace12 100644 --- a/src/video_core/buffer_cache/buffer_cache.cpp +++ b/src/video_core/buffer_cache/buffer_cache.cpp @@ -4,6 +4,7 @@ #include #include "common/alignment.h" #include "common/scope_exit.h" +#include "common/types.h" #include "shader_recompiler/info.h" #include "video_core/amdgpu/liverpool.h" #include "video_core/buffer_cache/buffer_cache.h" @@ -156,7 +157,7 @@ bool BufferCache::BindVertexBuffers(const Shader::Info& vs_info) { continue; } - const auto& buffer = vs_info.ReadUd(input.sgpr_base, input.dword_offset); + const auto& buffer = vs_info.ReadUdReg(input.sgpr_base, input.dword_offset); if (buffer.GetSize() == 0) { continue; } @@ -301,6 +302,14 @@ void BufferCache::InlineData(VAddr address, const void* value, u32 num_bytes, bo cmdbuf.updateBuffer(buffer->Handle(), buf_barrier.offset, num_bytes, value); } +std::pair BufferCache::ObtainHostUBO(std::span data) { + static constexpr u64 StreamThreshold = CACHING_PAGESIZE; + ASSERT(data.size_bytes() <= StreamThreshold); + const u64 offset = stream_buffer.Copy(reinterpret_cast(data.data()), data.size_bytes(), + instance.UniformMinAlignment()); + return {&stream_buffer, offset}; +} + std::pair BufferCache::ObtainBuffer(VAddr device_addr, u32 size, bool is_written, bool is_texel_buffer, BufferId buffer_id) { // For small uniform buffers that have not been modified by gpu diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 6710c8615..e2519e942 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -84,6 +84,8 @@ public: /// Writes a value to GPU buffer. void InlineData(VAddr address, const void* value, u32 num_bytes, bool is_gds); + [[nodiscard]] std::pair ObtainHostUBO(std::span data); + /// Obtains a buffer for the specified region. 
[[nodiscard]] std::pair ObtainBuffer(VAddr gpu_addr, u32 size, bool is_written, bool is_texel_buffer = false, diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp index 0c3570ab5..4ab290780 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp @@ -25,6 +25,15 @@ ComputePipeline::ComputePipeline(const Instance& instance_, Scheduler& scheduler u32 binding{}; boost::container::small_vector bindings; + + if (info->has_readconst) { + bindings.push_back({ + .binding = binding++, + .descriptorType = vk::DescriptorType::eUniformBuffer, + .descriptorCount = 1, + .stageFlags = vk::ShaderStageFlagBits::eCompute, + }); + } for (const auto& buffer : info->buffers) { const auto sharp = buffer.GetSharp(*info); bindings.push_back({ diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp index f6d0b49b6..32e3bf8f8 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp @@ -60,7 +60,7 @@ GraphicsPipeline::GraphicsPipeline(const Instance& instance_, Scheduler& schedul } const auto buffer = - vs_info->ReadUd(input.sgpr_base, input.dword_offset); + vs_info->ReadUdReg(input.sgpr_base, input.dword_offset); if (buffer.GetSize() == 0) { continue; } @@ -327,6 +327,15 @@ void GraphicsPipeline::BuildDescSetLayout() { if (!stage) { continue; } + + if (stage->has_readconst) { + bindings.push_back({ + .binding = binding++, + .descriptorType = vk::DescriptorType::eUniformBuffer, + .descriptorCount = 1, + .stageFlags = gp_stage_flags, + }); + } for (const auto& buffer : stage->buffers) { const auto sharp = buffer.GetSharp(*stage); bindings.push_back({ diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 4ec2a8db4..c368f2101 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -4,6 +4,7 @@ #include #include "common/config.h" +#include "common/hash.h" #include "common/io_file.h" #include "common/path_util.h" #include "shader_recompiler/backend/spirv/emit_spirv.h" @@ -22,10 +23,6 @@ namespace Vulkan { using Shader::VsOutput; -[[nodiscard]] inline u64 HashCombine(const u64 seed, const u64 hash) { - return seed ^ (hash + 0x9e3779b9 + (seed << 6) + (seed >> 2)); -} - constexpr static std::array DescriptorHeapSizes = { vk::DescriptorPoolSize{vk::DescriptorType::eUniformBuffer, 8192}, vk::DescriptorPoolSize{vk::DescriptorType::eStorageBuffer, 1024}, @@ -351,7 +348,7 @@ bool PipelineCache::RefreshGraphicsKey() { continue; } const auto& buffer = - vs_info->ReadUd(input.sgpr_base, input.dword_offset); + vs_info->ReadUdReg(input.sgpr_base, input.dword_offset); if (buffer.GetSize() == 0) { continue; } @@ -424,7 +421,8 @@ std::tuple PipelineCache::GetProgram } Program* program = it_pgm->second; - const auto& info = program->info; + auto& info = program->info; + info.RefreshFlatBuf(); const auto spec = Shader::StageSpecialization(info, runtime_info, binding); size_t perm_idx = program->modules.size(); vk::ShaderModule module{}; diff --git a/src/video_core/renderer_vulkan/vk_pipeline_common.cpp b/src/video_core/renderer_vulkan/vk_pipeline_common.cpp index efe2838e4..4c297cd42 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_common.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_common.cpp @@ 
-57,6 +57,22 @@ void Pipeline::BindBuffers(VideoCore::BufferCache& buffer_cache, } } + // Bind the flattened user data buffer as a UBO so it's accessible to the shader + if (stage.has_readconst) { + const auto [vk_buffer, offset] = buffer_cache.ObtainHostUBO(stage.flattened_ud_buf); + buffer_infos.emplace_back(vk_buffer->Handle(), offset, + stage.flattened_ud_buf.size() * sizeof(u32)); + set_writes.push_back({ + .dstSet = VK_NULL_HANDLE, + .dstBinding = binding.unified++, + .dstArrayElement = 0, + .descriptorCount = 1, + .descriptorType = vk::DescriptorType::eUniformBuffer, + .pBufferInfo = &buffer_infos.back(), + }); + ++binding.buffer; + } + // Second pass to re-bind buffers that were updated after binding for (u32 i = 0; i < buffer_bindings.size(); i++) { const auto& [buffer_id, vsharp] = buffer_bindings[i]; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index ae7634197..212b8165f 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -12,6 +12,10 @@ #include "video_core/texture_cache/texture_cache.h" #include "vk_rasterizer.h" +#ifdef MemoryBarrier +#undef MemoryBarrier +#endif + namespace Vulkan { Rasterizer::Rasterizer(const Instance& instance_, Scheduler& scheduler_,