From b52741b7140456a5648e4152768066199f5193c3 Mon Sep 17 00:00:00 2001 From: TheTurtle <47210458+raphaelthegreat@users.noreply.github.com> Date: Sat, 24 Aug 2024 22:51:47 +0300 Subject: [PATCH] video_core: Bloodborne stabilization pt1 (#543) * shader_recompiler: Writelane elimination pass + null image fix * spirv: Implement image derivatives * texture_cache: Reduce page bit size * clang format * slot_vector: Back to debug assert * vk_graphics_pipeline: Handle null tsharp * spirv: Revert some change * vk_instance: Support primitive restart on list topology * page_manager: Adjust windows exception handler * clang format * Remove subres tracking * Will be done separately --- src/common/slot_vector.h | 22 ++++--------- .../libraries/kernel/thread_management.cpp | 1 - .../backend/spirv/emit_spirv_image.cpp | 33 ++++++++++++++++--- .../backend/spirv/emit_spirv_instructions.h | 7 ++-- .../backend/spirv/emit_spirv_warp.cpp | 12 +++++++ .../frontend/translate/data_share.cpp | 16 +++++---- .../frontend/translate/scalar_alu.cpp | 7 ++-- .../frontend/translate/vector_memory.cpp | 24 ++++++++++---- src/shader_recompiler/ir/ir_emitter.cpp | 16 ++++++--- src/shader_recompiler/ir/ir_emitter.h | 4 ++- src/shader_recompiler/ir/opcodes.inc | 26 ++++++++------- .../ir/passes/constant_propogation_pass.cpp | 14 ++++++++ .../ir/passes/resource_tracking_pass.cpp | 31 ++++++++--------- .../ir/passes/ssa_rewrite_pass.cpp | 20 +---------- src/shader_recompiler/ir/reg.h | 1 + src/shader_recompiler/recompiler.cpp | 2 +- src/video_core/amdgpu/resource.h | 4 +++ src/video_core/page_manager.cpp | 6 ++-- .../renderer_vulkan/vk_graphics_pipeline.cpp | 19 +++++++---- .../renderer_vulkan/vk_instance.cpp | 8 +++++ .../renderer_vulkan/vk_pipeline_cache.cpp | 3 -- src/video_core/texture_cache/image_info.cpp | 7 +++- src/video_core/texture_cache/image_info.h | 3 ++ .../texture_cache/texture_cache.cpp | 6 +++- src/video_core/texture_cache/texture_cache.h | 2 +- 25 files changed, 187 insertions(+), 107 deletions(-) diff --git a/src/common/slot_vector.h b/src/common/slot_vector.h index f0982e290..36e647971 100644 --- a/src/common/slot_vector.h +++ b/src/common/slot_vector.h @@ -28,9 +28,13 @@ struct SlotId { template class SlotVector { - constexpr static std::size_t InitialCapacity = 1024; + constexpr static std::size_t InitialCapacity = 2048; public: + SlotVector() { + Reserve(InitialCapacity); + } + ~SlotVector() noexcept { std::size_t index = 0; for (u64 bits : stored_bitset) { @@ -67,19 +71,6 @@ public: return SlotId{index}; } - template - [[nodiscard]] SlotId swap_and_insert(SlotId existing_id, Args&&... args) noexcept { - const u32 index = FreeValueIndex(); - T& existing_value = values[existing_id.index].object; - - new (&values[index].object) T(std::move(existing_value)); - existing_value.~T(); - new (&values[existing_id.index].object) T(std::forward(args)...); - SetStorageBit(index); - - return SlotId{index}; - } - void erase(SlotId id) noexcept { values[id.index].object.~T(); free_list.push_back(id.index); @@ -151,7 +142,8 @@ private: const std::size_t old_free_size = free_list.size(); free_list.resize(old_free_size + (new_capacity - values_capacity)); - std::iota(free_list.begin() + old_free_size, free_list.end(), + const std::size_t new_free_size = free_list.size(); + std::iota(free_list.rbegin(), free_list.rbegin() + new_free_size - old_free_size, static_cast(values_capacity)); delete[] values; diff --git a/src/core/libraries/kernel/thread_management.cpp b/src/core/libraries/kernel/thread_management.cpp index 689532693..567fff184 100644 --- a/src/core/libraries/kernel/thread_management.cpp +++ b/src/core/libraries/kernel/thread_management.cpp @@ -1123,7 +1123,6 @@ int PS4_SYSV_ABI posix_pthread_join(ScePthread thread, void** res) { } int PS4_SYSV_ABI scePthreadDetach(ScePthread thread) { - LOG_INFO(Kernel_Pthread, "thread create name = {}", thread->name); thread->is_detached = true; return ORBIS_OK; } diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp index 5526e5411..530f381d7 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp @@ -16,6 +16,12 @@ struct ImageOperands { static_cast(new_mask)); operands.push_back(value); } + void Add(spv::ImageOperandsMask new_mask, Id value1, Id value2) { + mask = static_cast(static_cast(mask) | + static_cast(new_mask)); + operands.push_back(value1); + operands.push_back(value2); + } void AddOffset(EmitContext& ctx, const IR::Value& offset, bool can_use_runtime_offsets = false) { @@ -53,6 +59,15 @@ struct ImageOperands { } } + void AddDerivatives(EmitContext& ctx, Id derivatives) { + if (!Sirit::ValidId(derivatives)) { + return; + } + const Id dx{ctx.OpVectorShuffle(ctx.F32[2], derivatives, derivatives, 0, 1)}; + const Id dy{ctx.OpVectorShuffle(ctx.F32[2], derivatives, derivatives, 2, 3)}; + Add(spv::ImageOperandsMask::Grad, dx, dy); + } + spv::ImageOperandsMask mask{}; boost::container::static_vector operands; }; @@ -117,7 +132,7 @@ Id EmitImageGather(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, const Id sampled_image = ctx.OpSampledImage(texture.sampled_type, image, sampler); const u32 comp = inst->Flags().gather_comp.Value(); ImageOperands operands; - operands.AddOffset(ctx, offset); + operands.AddOffset(ctx, offset, true); return ctx.OpImageGather(ctx.F32[4], sampled_image, coords, ctx.ConstU32(comp), operands.mask, operands.operands); } @@ -129,7 +144,7 @@ Id EmitImageGatherDref(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, const Id sampler = ctx.OpLoad(ctx.sampler_type, ctx.samplers[handle >> 16]); const Id sampled_image = ctx.OpSampledImage(texture.sampled_type, image, sampler); ImageOperands operands; - operands.AddOffset(ctx, offset); + operands.AddOffset(ctx, offset, true); return ctx.OpImageDrefGather(ctx.F32[4], sampled_image, coords, dref, operands.mask, operands.operands); } @@ -181,9 +196,17 @@ Id EmitImageQueryLod(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords) { return ctx.OpImageQueryLod(ctx.F32[2], sampled_image, coords); } -Id EmitImageGradient(EmitContext& ctx, IR::Inst* inst, const IR::Value& index, Id coords, - Id derivatives, const IR::Value& offset, Id lod_clamp) { - UNREACHABLE_MSG("SPIR-V Instruction"); +Id EmitImageGradient(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id derivatives, + const IR::Value& offset, Id lod_clamp) { + const auto& texture = ctx.images[handle & 0xFFFF]; + const Id image = ctx.OpLoad(texture.image_type, texture.id); + const Id sampler = ctx.OpLoad(ctx.sampler_type, ctx.samplers[handle >> 16]); + const Id sampled_image = ctx.OpSampledImage(texture.sampled_type, image, sampler); + ImageOperands operands; + operands.AddDerivatives(ctx, derivatives); + operands.AddOffset(ctx, offset); + return ctx.OpImageSampleExplicitLod(ctx.F32[4], sampled_image, coords, operands.mask, + operands.operands); } Id EmitImageRead(EmitContext& ctx, IR::Inst* inst, const IR::Value& index, Id coords) { diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h index bc39bc0f3..0703efb96 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h +++ b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h @@ -387,8 +387,8 @@ Id EmitImageFetch(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, const Id lod, Id ms); Id EmitImageQueryDimensions(EmitContext& ctx, IR::Inst* inst, u32 handle, Id lod, bool skip_mips); Id EmitImageQueryLod(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords); -Id EmitImageGradient(EmitContext& ctx, IR::Inst* inst, const IR::Value& index, Id coords, - Id derivatives, const IR::Value& offset, Id lod_clamp); +Id EmitImageGradient(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id derivatives, + const IR::Value& offset, Id lod_clamp); Id EmitImageRead(EmitContext& ctx, IR::Inst* inst, const IR::Value& index, Id coords); void EmitImageWrite(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id color); @@ -407,5 +407,8 @@ Id EmitImageAtomicExchange32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id co Id EmitLaneId(EmitContext& ctx); Id EmitWarpId(EmitContext& ctx); Id EmitQuadShuffle(EmitContext& ctx, Id value, Id index); +Id EmitReadFirstLane(EmitContext& ctx, Id value); +Id EmitReadLane(EmitContext& ctx, Id value, u32 lane); +Id EmitWriteLane(EmitContext& ctx, Id value, Id write_value, u32 lane); } // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp index 38afd90f1..c55763c5d 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp @@ -22,4 +22,16 @@ Id EmitQuadShuffle(EmitContext& ctx, Id value, Id index) { return ctx.OpGroupNonUniformQuadBroadcast(ctx.U32[1], SubgroupScope(ctx), value, index); } +Id EmitReadFirstLane(EmitContext& ctx, Id value) { + UNREACHABLE(); +} + +Id EmitReadLane(EmitContext& ctx, Id value, u32 lane) { + UNREACHABLE(); +} + +Id EmitWriteLane(EmitContext& ctx, Id value, Id write_value, u32 lane) { + return ctx.u32_zero_value; +} + } // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/frontend/translate/data_share.cpp b/src/shader_recompiler/frontend/translate/data_share.cpp index b7b5aa138..7580f7444 100644 --- a/src/shader_recompiler/frontend/translate/data_share.cpp +++ b/src/shader_recompiler/frontend/translate/data_share.cpp @@ -127,7 +127,6 @@ void Translator::DS_ADD_U32(const GcnInst& inst, bool rtn) { const IR::U32 data{GetSrc(inst.src[1])}; const IR::U32 offset = ir.Imm32(u32(inst.control.ds.offset0)); const IR::U32 addr_offset = ir.IAdd(addr, offset); - IR::VectorReg dst_reg{inst.dst[0].code}; const IR::Value original_val = ir.SharedAtomicIAdd(addr_offset, data); if (rtn) { SetDst(inst.dst[0], IR::U32{original_val}); @@ -139,7 +138,6 @@ void Translator::DS_MIN_U32(const GcnInst& inst, bool rtn) { const IR::U32 data{GetSrc(inst.src[1])}; const IR::U32 offset = ir.Imm32(u32(inst.control.ds.offset0)); const IR::U32 addr_offset = ir.IAdd(addr, offset); - IR::VectorReg dst_reg{inst.dst[0].code}; const IR::Value original_val = ir.SharedAtomicIMin(addr_offset, data, false); if (rtn) { SetDst(inst.dst[0], IR::U32{original_val}); @@ -151,7 +149,6 @@ void Translator::DS_MAX_U32(const GcnInst& inst, bool rtn) { const IR::U32 data{GetSrc(inst.src[1])}; const IR::U32 offset = ir.Imm32(u32(inst.control.ds.offset0)); const IR::U32 addr_offset = ir.IAdd(addr, offset); - IR::VectorReg dst_reg{inst.dst[0].code}; const IR::Value original_val = ir.SharedAtomicIMax(addr_offset, data, false); if (rtn) { SetDst(inst.dst[0], IR::U32{original_val}); @@ -168,13 +165,18 @@ void Translator::V_READFIRSTLANE_B32(const GcnInst& inst) { } void Translator::V_READLANE_B32(const GcnInst& inst) { - ASSERT(info.stage != Stage::Compute); - SetDst(inst.dst[0], GetSrc(inst.src[0])); + const IR::ScalarReg dst{inst.dst[0].code}; + const IR::U32 value{GetSrc(inst.src[0])}; + const IR::U32 lane{GetSrc(inst.src[1])}; + ir.SetScalarReg(dst, ir.ReadLane(value, lane)); } void Translator::V_WRITELANE_B32(const GcnInst& inst) { - ASSERT(info.stage != Stage::Compute); - SetDst(inst.dst[0], GetSrc(inst.src[0])); + const IR::VectorReg dst{inst.dst[0].code}; + const IR::U32 value{GetSrc(inst.src[0])}; + const IR::U32 lane{GetSrc(inst.src[1])}; + const IR::U32 old_value{GetSrc(inst.dst[0])}; + ir.SetVectorReg(dst, ir.WriteLane(old_value, value, lane)); } } // namespace Shader::Gcn diff --git a/src/shader_recompiler/frontend/translate/scalar_alu.cpp b/src/shader_recompiler/frontend/translate/scalar_alu.cpp index 812d93bae..7f7c9d7e0 100644 --- a/src/shader_recompiler/frontend/translate/scalar_alu.cpp +++ b/src/shader_recompiler/frontend/translate/scalar_alu.cpp @@ -440,13 +440,16 @@ void Translator::S_SUB_U32(const GcnInst& inst) { void Translator::S_GETPC_B64(u32 pc, const GcnInst& inst) { // This only really exists to let resource tracking pass know // there is an inline cbuf. - SetDst(inst.dst[0], ir.Imm32(pc)); + const IR::ScalarReg dst{inst.dst[0].code}; + ir.SetScalarReg(dst, ir.Imm32(pc)); + ir.SetScalarReg(dst + 1, ir.Imm32(0)); } void Translator::S_ADDC_U32(const GcnInst& inst) { const IR::U32 src0{GetSrc(inst.src[0])}; const IR::U32 src1{GetSrc(inst.src[1])}; - SetDst(inst.dst[0], ir.IAdd(ir.IAdd(src0, src1), ir.GetSccLo())); + const IR::U32 carry{ir.Select(ir.GetScc(), ir.Imm32(1U), ir.Imm32(0U))}; + SetDst(inst.dst[0], ir.IAdd(ir.IAdd(src0, src1), carry)); } void Translator::S_MAX_U32(const GcnInst& inst) { diff --git a/src/shader_recompiler/frontend/translate/vector_memory.cpp b/src/shader_recompiler/frontend/translate/vector_memory.cpp index 08674fa2d..41eb91234 100644 --- a/src/shader_recompiler/frontend/translate/vector_memory.cpp +++ b/src/shader_recompiler/frontend/translate/vector_memory.cpp @@ -17,6 +17,7 @@ void Translator::EmitVectorMemory(const GcnInst& inst) { case Opcode::IMAGE_SAMPLE_C_O: case Opcode::IMAGE_SAMPLE_B: case Opcode::IMAGE_SAMPLE_C_LZ_O: + case Opcode::IMAGE_SAMPLE_D: return IMAGE_SAMPLE(inst); case Opcode::IMAGE_GATHER4_C: case Opcode::IMAGE_GATHER4_LZ: @@ -162,12 +163,15 @@ void Translator::IMAGE_SAMPLE(const GcnInst& inst) { flags.test(MimgModifier::LodBias) ? ir.GetVectorReg(addr_reg++) : IR::F32{}; const IR::F32 dref = flags.test(MimgModifier::Pcf) ? ir.GetVectorReg(addr_reg++) : IR::F32{}; - - // Derivatives are tricky because their number depends on the texture type which is located in - // T#. We don't have access to T# though until resource tracking pass. For now assume no - // derivatives are present, otherwise we don't know where coordinates are placed in the address - // stream. - ASSERT_MSG(!flags.test(MimgModifier::Derivative), "Derivative image instruction"); + const IR::Value derivatives = [&] -> IR::Value { + if (!flags.test(MimgModifier::Derivative)) { + return {}; + } + addr_reg = addr_reg + 4; + return ir.CompositeConstruct( + ir.GetVectorReg(addr_reg - 4), ir.GetVectorReg(addr_reg - 3), + ir.GetVectorReg(addr_reg - 2), ir.GetVectorReg(addr_reg - 1)); + }(); // Now we can load body components as noted in Table 8.9 Image Opcodes with Sampler // Since these are at most 4 dwords, we load them into a single uvec4 and place them @@ -177,6 +181,10 @@ void Translator::IMAGE_SAMPLE(const GcnInst& inst) { ir.GetVectorReg(addr_reg), ir.GetVectorReg(addr_reg + 1), ir.GetVectorReg(addr_reg + 2), ir.GetVectorReg(addr_reg + 3)); + // Derivatives are tricky because their number depends on the texture type which is located in + // T#. We don't have access to T# though until resource tracking pass. For now assume if + // derivatives are present, that a 2D image is bound. + const bool has_derivatives = flags.test(MimgModifier::Derivative); const bool explicit_lod = flags.any(MimgModifier::Level0, MimgModifier::Lod); IR::TextureInstInfo info{}; @@ -186,9 +194,13 @@ void Translator::IMAGE_SAMPLE(const GcnInst& inst) { info.force_level0.Assign(flags.test(MimgModifier::Level0)); info.has_offset.Assign(flags.test(MimgModifier::Offset)); info.explicit_lod.Assign(explicit_lod); + info.has_derivatives.Assign(has_derivatives); // Issue IR instruction, leaving unknown fields blank to patch later. const IR::Value texel = [&]() -> IR::Value { + if (has_derivatives) { + return ir.ImageGradient(handle, body, derivatives, offset, {}, info); + } if (!flags.test(MimgModifier::Pcf)) { if (explicit_lod) { return ir.ImageSampleExplicitLod(handle, body, offset, info); diff --git a/src/shader_recompiler/ir/ir_emitter.cpp b/src/shader_recompiler/ir/ir_emitter.cpp index 3ae068072..0f2fb2f7c 100644 --- a/src/shader_recompiler/ir/ir_emitter.cpp +++ b/src/shader_recompiler/ir/ir_emitter.cpp @@ -209,10 +209,6 @@ U1 IREmitter::GetVcc() { return Inst(Opcode::GetVcc); } -U32 IREmitter::GetSccLo() { - return Inst(Opcode::GetSccLo); -} - U32 IREmitter::GetVccLo() { return Inst(Opcode::GetVccLo); } @@ -445,6 +441,18 @@ U32 IREmitter::QuadShuffle(const U32& value, const U32& index) { return Inst(Opcode::QuadShuffle, value, index); } +U32 IREmitter::ReadFirstLane(const U32& value) { + return Inst(Opcode::ReadFirstLane, value); +} + +U32 IREmitter::ReadLane(const U32& value, const U32& lane) { + return Inst(Opcode::ReadLane, value, lane); +} + +U32 IREmitter::WriteLane(const U32& value, const U32& write_value, const U32& lane) { + return Inst(Opcode::WriteLane, value, write_value, lane); +} + F32F64 IREmitter::FPAdd(const F32F64& a, const F32F64& b) { if (a.Type() != b.Type()) { UNREACHABLE_MSG("Mismatching types {} and {}", a.Type(), b.Type()); diff --git a/src/shader_recompiler/ir/ir_emitter.h b/src/shader_recompiler/ir/ir_emitter.h index be7f25153..45fa5f216 100644 --- a/src/shader_recompiler/ir/ir_emitter.h +++ b/src/shader_recompiler/ir/ir_emitter.h @@ -65,7 +65,6 @@ public: [[nodiscard]] U1 GetScc(); [[nodiscard]] U1 GetExec(); [[nodiscard]] U1 GetVcc(); - [[nodiscard]] U32 GetSccLo(); [[nodiscard]] U32 GetVccLo(); [[nodiscard]] U32 GetVccHi(); void SetScc(const U1& value); @@ -122,6 +121,9 @@ public: [[nodiscard]] U32 LaneId(); [[nodiscard]] U32 WarpId(); [[nodiscard]] U32 QuadShuffle(const U32& value, const U32& index); + [[nodiscard]] U32 ReadFirstLane(const U32& value); + [[nodiscard]] U32 ReadLane(const U32& value, const U32& lane); + [[nodiscard]] U32 WriteLane(const U32& value, const U32& write_value, const U32& lane); [[nodiscard]] Value CompositeConstruct(const Value& e1, const Value& e2); [[nodiscard]] Value CompositeConstruct(const Value& e1, const Value& e2, const Value& e3); diff --git a/src/shader_recompiler/ir/opcodes.inc b/src/shader_recompiler/ir/opcodes.inc index e9ecd4350..9be89f648 100644 --- a/src/shader_recompiler/ir/opcodes.inc +++ b/src/shader_recompiler/ir/opcodes.inc @@ -58,7 +58,6 @@ OPCODE(SetAttribute, Void, Attr OPCODE(GetScc, U1, Void, ) OPCODE(GetExec, U1, Void, ) OPCODE(GetVcc, U1, Void, ) -OPCODE(GetSccLo, U32, Void, ) OPCODE(GetVccLo, U32, Void, ) OPCODE(GetVccHi, U32, Void, ) OPCODE(SetScc, Void, U1, ) @@ -330,19 +329,22 @@ OPCODE(ImageRead, U32x4, Opaq OPCODE(ImageWrite, Void, Opaque, Opaque, U32x4, ) // Image atomic operations -OPCODE(ImageAtomicIAdd32, U32, Opaque, Opaque, U32, ) -OPCODE(ImageAtomicSMin32, U32, Opaque, Opaque, U32, ) -OPCODE(ImageAtomicUMin32, U32, Opaque, Opaque, U32, ) -OPCODE(ImageAtomicSMax32, U32, Opaque, Opaque, U32, ) -OPCODE(ImageAtomicUMax32, U32, Opaque, Opaque, U32, ) -OPCODE(ImageAtomicInc32, U32, Opaque, Opaque, U32, ) -OPCODE(ImageAtomicDec32, U32, Opaque, Opaque, U32, ) -OPCODE(ImageAtomicAnd32, U32, Opaque, Opaque, U32, ) -OPCODE(ImageAtomicOr32, U32, Opaque, Opaque, U32, ) -OPCODE(ImageAtomicXor32, U32, Opaque, Opaque, U32, ) -OPCODE(ImageAtomicExchange32, U32, Opaque, Opaque, U32, ) +OPCODE(ImageAtomicIAdd32, U32, Opaque, Opaque, U32, ) +OPCODE(ImageAtomicSMin32, U32, Opaque, Opaque, U32, ) +OPCODE(ImageAtomicUMin32, U32, Opaque, Opaque, U32, ) +OPCODE(ImageAtomicSMax32, U32, Opaque, Opaque, U32, ) +OPCODE(ImageAtomicUMax32, U32, Opaque, Opaque, U32, ) +OPCODE(ImageAtomicInc32, U32, Opaque, Opaque, U32, ) +OPCODE(ImageAtomicDec32, U32, Opaque, Opaque, U32, ) +OPCODE(ImageAtomicAnd32, U32, Opaque, Opaque, U32, ) +OPCODE(ImageAtomicOr32, U32, Opaque, Opaque, U32, ) +OPCODE(ImageAtomicXor32, U32, Opaque, Opaque, U32, ) +OPCODE(ImageAtomicExchange32, U32, Opaque, Opaque, U32, ) // Warp operations OPCODE(LaneId, U32, ) OPCODE(WarpId, U32, ) OPCODE(QuadShuffle, U32, U32, U32 ) +OPCODE(ReadFirstLane, U32, U32, U32 ) +OPCODE(ReadLane, U32, U32, U32 ) +OPCODE(WriteLane, U32, U32, U32, U32 ) diff --git a/src/shader_recompiler/ir/passes/constant_propogation_pass.cpp b/src/shader_recompiler/ir/passes/constant_propogation_pass.cpp index 94218b32f..b0d9dcc45 100644 --- a/src/shader_recompiler/ir/passes/constant_propogation_pass.cpp +++ b/src/shader_recompiler/ir/passes/constant_propogation_pass.cpp @@ -250,6 +250,18 @@ void FoldCmpClass(IR::Inst& inst) { } } +void FoldReadLane(IR::Inst& inst) { + const u32 lane = inst.Arg(1).U32(); + IR::Inst* prod = inst.Arg(0).InstRecursive(); + while (prod->GetOpcode() == IR::Opcode::WriteLane) { + if (prod->Arg(2).U32() == lane) { + inst.ReplaceUsesWith(prod->Arg(1)); + return; + } + prod = prod->Arg(0).InstRecursive(); + } +} + void ConstantPropagation(IR::Block& block, IR::Inst& inst) { switch (inst.GetOpcode()) { case IR::Opcode::IAdd32: @@ -289,6 +301,8 @@ void ConstantPropagation(IR::Block& block, IR::Inst& inst) { case IR::Opcode::SelectF32: case IR::Opcode::SelectF64: return FoldSelect(inst); + case IR::Opcode::ReadLane: + return FoldReadLane(inst); case IR::Opcode::FPNeg32: FoldWhenAllImmediates(inst, [](f32 a) { return -a; }); return; diff --git a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp index 20a66ad0c..efee710db 100644 --- a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp +++ b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp @@ -345,6 +345,7 @@ SharpLocation TrackSharp(const IR::Inst* inst) { // Retrieve SGPR pair that holds sbase const auto pred1 = [](const IR::Inst* inst) -> std::optional { + ASSERT(inst->GetOpcode() != IR::Opcode::ReadConst); if (inst->GetOpcode() == IR::Opcode::GetUserData) { return inst->Arg(0).ScalarReg(); } @@ -402,24 +403,13 @@ s32 TryHandleInlineCbuf(IR::Inst& inst, Info& info, Descriptors& descriptors, // is used to define an inline constant buffer IR::Inst* handle = inst.Arg(0).InstRecursive(); - IR::Inst* p0 = handle->Arg(0).InstRecursive(); - if (p0->GetOpcode() != IR::Opcode::IAdd32 || !p0->Arg(0).IsImmediate() || - !p0->Arg(1).IsImmediate()) { - return -1; - } - IR::Inst* p1 = handle->Arg(1).InstRecursive(); - if (p1->GetOpcode() != IR::Opcode::IAdd32) { - return -1; - } - if (!handle->Arg(3).IsImmediate() || !handle->Arg(2).IsImmediate()) { + if (!handle->AreAllArgsImmediates()) { return -1; } // We have found this pattern. Build the sharp. - std::array buffer; - buffer[0] = info.pgm_base + p0->Arg(0).U32() + p0->Arg(1).U32(); - buffer[1] = 0; - buffer[2] = handle->Arg(2).U32(); - buffer[3] = handle->Arg(3).U32(); + std::array buffer; + buffer[0] = info.pgm_base + (handle->Arg(0).U32() | u64(handle->Arg(1).U32()) << 32); + buffer[1] = handle->Arg(2).U32() | u64(handle->Arg(3).U32()) << 32; cbuf = std::bit_cast(buffer); // Assign a binding to this sharp. return descriptors.Add(BufferResource{ @@ -617,7 +607,11 @@ void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descrip const IR::Value arg = inst.Arg(arg_pos); ASSERT_MSG(arg.Type() == IR::Type::U32, "Unexpected offset type"); - const auto read = [&](u32 offset) -> auto { + const auto read = [&](u32 offset) -> IR::U32 { + if (arg.IsImmediate()) { + const u16 comp = (arg.U32() >> offset) & 0x3F; + return ir.Imm32(s32(comp << 26) >> 26); + } return ir.BitFieldExtract(IR::U32{arg}, ir.Imm32(offset), ir.Imm32(6), true); }; @@ -637,7 +631,10 @@ void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descrip UNREACHABLE(); } } - + if (inst_info.has_derivatives) { + ASSERT_MSG(image.GetType() == AmdGpu::ImageType::Color2D, + "User derivatives only supported for 2D images"); + } if (inst_info.has_lod_clamp) { const u32 arg_pos = [&]() -> u32 { switch (inst.GetOpcode()) { diff --git a/src/shader_recompiler/ir/passes/ssa_rewrite_pass.cpp b/src/shader_recompiler/ir/passes/ssa_rewrite_pass.cpp index eef73a659..9edb157db 100644 --- a/src/shader_recompiler/ir/passes/ssa_rewrite_pass.cpp +++ b/src/shader_recompiler/ir/passes/ssa_rewrite_pass.cpp @@ -32,7 +32,6 @@ struct SccFlagTag : FlagTag {}; struct ExecFlagTag : FlagTag {}; struct VccFlagTag : FlagTag {}; struct VccLoTag : FlagTag {}; -struct SccLoTag : FlagTag {}; struct VccHiTag : FlagTag {}; struct GotoVariable : FlagTag { @@ -45,7 +44,7 @@ struct GotoVariable : FlagTag { }; using Variant = std::variant; + VccFlagTag, VccLoTag, VccHiTag>; using ValueMap = std::unordered_map; struct DefTable { @@ -84,13 +83,6 @@ struct DefTable { exec_flag.insert_or_assign(block, value); } - const IR::Value& Def(IR::Block* block, SccLoTag) { - return scc_lo_flag[block]; - } - void SetDef(IR::Block* block, SccLoTag, const IR::Value& value) { - scc_lo_flag.insert_or_assign(block, value); - } - const IR::Value& Def(IR::Block* block, VccLoTag) { return vcc_lo_flag[block]; } @@ -133,10 +125,6 @@ IR::Opcode UndefOpcode(const VccLoTag) noexcept { return IR::Opcode::UndefU32; } -IR::Opcode UndefOpcode(const SccLoTag) noexcept { - return IR::Opcode::UndefU32; -} - IR::Opcode UndefOpcode(const VccHiTag) noexcept { return IR::Opcode::UndefU32; } @@ -336,9 +324,6 @@ void VisitInst(Pass& pass, IR::Block* block, IR::Inst& inst) { case IR::Opcode::SetVcc: pass.WriteVariable(VccFlagTag{}, block, inst.Arg(0)); break; - case IR::Opcode::SetSccLo: - pass.WriteVariable(SccLoTag{}, block, inst.Arg(0)); - break; case IR::Opcode::SetVccLo: pass.WriteVariable(VccLoTag{}, block, inst.Arg(0)); break; @@ -371,9 +356,6 @@ void VisitInst(Pass& pass, IR::Block* block, IR::Inst& inst) { case IR::Opcode::GetVcc: inst.ReplaceUsesWith(pass.ReadVariable(VccFlagTag{}, block)); break; - case IR::Opcode::GetSccLo: - inst.ReplaceUsesWith(pass.ReadVariable(SccLoTag{}, block)); - break; case IR::Opcode::GetVccLo: inst.ReplaceUsesWith(pass.ReadVariable(VccLoTag{}, block)); break; diff --git a/src/shader_recompiler/ir/reg.h b/src/shader_recompiler/ir/reg.h index e3d04260b..7868a5a3b 100644 --- a/src/shader_recompiler/ir/reg.h +++ b/src/shader_recompiler/ir/reg.h @@ -58,6 +58,7 @@ union TextureInstInfo { BitField<4, 1, u32> explicit_lod; BitField<5, 1, u32> has_offset; BitField<6, 2, u32> gather_comp; + BitField<8, 1, u32> has_derivatives; }; union BufferInstInfo { diff --git a/src/shader_recompiler/recompiler.cpp b/src/shader_recompiler/recompiler.cpp index 0f9fd6d41..0efac4ff1 100644 --- a/src/shader_recompiler/recompiler.cpp +++ b/src/shader_recompiler/recompiler.cpp @@ -56,11 +56,11 @@ IR::Program TranslateProgram(Common::ObjectPool& inst_pool, // Run optimization passes Shader::Optimization::SsaRewritePass(program.post_order_blocks); - Shader::Optimization::ResourceTrackingPass(program); Shader::Optimization::ConstantPropagationPass(program.post_order_blocks); if (program.info.stage != Stage::Compute) { Shader::Optimization::LowerSharedMemToRegisters(program); } + Shader::Optimization::ResourceTrackingPass(program); Shader::Optimization::IdentityRemovalPass(program.blocks); Shader::Optimization::DeadCodeEliminationPass(program); Shader::Optimization::CollectShaderInfoPass(program); diff --git a/src/video_core/amdgpu/resource.h b/src/video_core/amdgpu/resource.h index ef5bf1b66..8c3b675ea 100644 --- a/src/video_core/amdgpu/resource.h +++ b/src/video_core/amdgpu/resource.h @@ -179,6 +179,10 @@ struct Image { return base_address << 8; } + operator bool() const noexcept { + return base_address != 0; + } + u32 DstSelect() const { return dst_sel_x | (dst_sel_y << 3) | (dst_sel_z << 6) | (dst_sel_w << 9); } diff --git a/src/video_core/page_manager.cpp b/src/video_core/page_manager.cpp index 6225f11ba..18b8aee21 100644 --- a/src/video_core/page_manager.cpp +++ b/src/video_core/page_manager.cpp @@ -51,7 +51,8 @@ struct PageManager::Impl { if (ec == EXCEPTION_ACCESS_VIOLATION) { const auto info = pExp->ExceptionRecord->ExceptionInformation; if (info[0] == 1) { // Write violation - rasterizer->InvalidateMemory(info[1], sizeof(u64)); + const VAddr addr_aligned = Common::AlignDown(info[1], PAGESIZE); + rasterizer->InvalidateMemory(addr_aligned, PAGESIZE); return EXCEPTION_CONTINUE_EXECUTION; } /* else { UNREACHABLE(); @@ -199,7 +200,8 @@ struct PageManager::Impl { const greg_t err = ctx->uc_mcontext.gregs[REG_ERR]; #endif if (err & 0x2) { - rasterizer->InvalidateMemory(address, sizeof(u64)); + const VAddr addr_aligned = Common::AlignDown(address, PAGESIZE); + rasterizer->InvalidateMemory(addr_aligned, PAGESIZE); } else { // Read not supported! UNREACHABLE(); diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp index c2649b96c..95d3a4b2d 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp @@ -396,13 +396,18 @@ void GraphicsPipeline::BindResources(const Liverpool::Regs& regs, boost::container::static_vector tsharps; for (const auto& image_desc : stage->images) { - const auto& tsharp = tsharps.emplace_back( - stage->ReadUd(image_desc.sgpr_base, image_desc.dword_offset)); - VideoCore::ImageInfo image_info{tsharp}; - VideoCore::ImageViewInfo view_info{tsharp, image_desc.is_storage}; - const auto& image_view = texture_cache.FindTexture(image_info, view_info); - const auto& image = texture_cache.GetImage(image_view.image_id); - image_infos.emplace_back(VK_NULL_HANDLE, *image_view.image_view, image.layout); + const auto tsharp = + stage->ReadUd(image_desc.sgpr_base, image_desc.dword_offset); + if (tsharp) { + tsharps.emplace_back(tsharp); + VideoCore::ImageInfo image_info{tsharp}; + VideoCore::ImageViewInfo view_info{tsharp, image_desc.is_storage}; + const auto& image_view = texture_cache.FindTexture(image_info, view_info); + const auto& image = texture_cache.GetImage(image_view.image_id); + image_infos.emplace_back(VK_NULL_HANDLE, *image_view.image_view, image.layout); + } else { + image_infos.emplace_back(VK_NULL_HANDLE, VK_NULL_HANDLE, vk::ImageLayout::eGeneral); + } set_writes.push_back({ .dstSet = VK_NULL_HANDLE, .dstBinding = binding++, diff --git a/src/video_core/renderer_vulkan/vk_instance.cpp b/src/video_core/renderer_vulkan/vk_instance.cpp index c0923b743..12db47576 100644 --- a/src/video_core/renderer_vulkan/vk_instance.cpp +++ b/src/video_core/renderer_vulkan/vk_instance.cpp @@ -210,6 +210,8 @@ bool Instance::CreateDevice() { color_write_en &= add_extension(VK_EXT_EXTENDED_DYNAMIC_STATE_3_EXTENSION_NAME); const bool calibrated_timestamps = add_extension(VK_EXT_CALIBRATED_TIMESTAMPS_EXTENSION_NAME); const bool robustness = add_extension(VK_EXT_ROBUSTNESS_2_EXTENSION_NAME); + const bool topology_restart = + add_extension(VK_EXT_PRIMITIVE_TOPOLOGY_LIST_RESTART_EXTENSION_NAME); // These extensions are promoted by Vulkan 1.3, but for greater compatibility we use Vulkan 1.2 // with extensions. @@ -330,6 +332,9 @@ bool Instance::CreateDevice() { vk::PhysicalDeviceVertexInputDynamicStateFeaturesEXT{ .vertexInputDynamicState = true, }, + vk::PhysicalDevicePrimitiveTopologyListRestartFeaturesEXT{ + .primitiveTopologyListRestart = true, + }, #ifdef __APPLE__ feature_chain.get(), #endif @@ -351,6 +356,9 @@ bool Instance::CreateDevice() { if (!workgroup_memory_explicit_layout) { device_chain.unlink(); } + if (!topology_restart) { + device_chain.unlink(); + } if (robustness) { device_chain.get().nullDescriptor = feature_chain.get().nullDescriptor; diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 55f04bac4..139edcf7c 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -280,9 +280,6 @@ std::unique_ptr PipelineCache::CreateGraphicsPipeline() { DumpShader(code, hash, stage, "bin"); } - block_pool.ReleaseContents(); - inst_pool.ReleaseContents(); - if (stage != Shader::Stage::Fragment && stage != Shader::Stage::Vertex) { LOG_ERROR(Render_Vulkan, "Unsupported shader stage {}. PL creation skipped.", stage); return {}; diff --git a/src/video_core/texture_cache/image_info.cpp b/src/video_core/texture_cache/image_info.cpp index 17b78a6d5..a073d046e 100644 --- a/src/video_core/texture_cache/image_info.cpp +++ b/src/video_core/texture_cache/image_info.cpp @@ -219,7 +219,12 @@ ImageInfo::ImageInfo(const AmdGpu::Image& image) noexcept { guest_address = image.Address(); mips_layout.reserve(resources.levels); + tiling_idx = image.tiling_index; + UpdateSize(); +} +void ImageInfo::UpdateSize() { + mips_layout.clear(); MipInfo mip_info{}; guest_size_bytes = 0; for (auto mip = 0u; mip < resources.levels; ++mip) { @@ -265,7 +270,7 @@ ImageInfo::ImageInfo(const AmdGpu::Image& image) noexcept { ASSERT(!props.is_block); ASSERT(num_samples == 1); std::tie(mip_info.pitch, mip_info.size) = - ImageSizeMacroTiled(mip_w, mip_h, bpp, num_samples, image.tiling_index); + ImageSizeMacroTiled(mip_w, mip_h, bpp, num_samples, tiling_idx); break; } default: { diff --git a/src/video_core/texture_cache/image_info.h b/src/video_core/texture_cache/image_info.h index 9dad0dd67..ddad318d9 100644 --- a/src/video_core/texture_cache/image_info.h +++ b/src/video_core/texture_cache/image_info.h @@ -29,6 +29,8 @@ struct ImageInfo { bool IsPacked() const; bool IsDepthStencil() const; + void UpdateSize(); + struct { VAddr cmask_addr; VAddr fmask_addr; @@ -69,6 +71,7 @@ struct ImageInfo { boost::container::small_vector mips_layout; VAddr guest_address{0}; u32 guest_size_bytes{0}; + u32 tiling_idx{0}; // TODO: merge with existing! }; } // namespace VideoCore diff --git a/src/video_core/texture_cache/texture_cache.cpp b/src/video_core/texture_cache/texture_cache.cpp index cd3afc59f..cae124220 100644 --- a/src/video_core/texture_cache/texture_cache.cpp +++ b/src/video_core/texture_cache/texture_cache.cpp @@ -18,11 +18,15 @@ TextureCache::TextureCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& BufferCache& buffer_cache_, PageManager& tracker_) : instance{instance_}, scheduler{scheduler_}, buffer_cache{buffer_cache_}, tracker{tracker_}, tile_manager{instance, scheduler} { - ImageInfo info; + ImageInfo info{}; info.pixel_format = vk::Format::eR8G8B8A8Unorm; info.type = vk::ImageType::e2D; + info.tiling_idx = u32(AmdGpu::TilingMode::Texture_MicroTiled); + info.num_bits = 32; + info.UpdateSize(); const ImageId null_id = slot_images.insert(instance, scheduler, info); ASSERT(null_id.index == 0); + slot_images[null_id].flags = ImageFlagBits{}; ImageViewInfo view_info; void(slot_image_views.insert(instance, view_info, slot_images[null_id], null_id)); diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 7266d7a56..8af68424a 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -28,7 +28,7 @@ class TextureCache { using Entry = boost::container::small_vector; static constexpr size_t AddressSpaceBits = 39; static constexpr size_t FirstLevelBits = 9; - static constexpr size_t PageBits = 22; + static constexpr size_t PageBits = 20; }; using PageTable = MultiLevelPageTable;