From 02a50265f84b062e93314531b3503b3a1db8faa3 Mon Sep 17 00:00:00 2001 From: raphaelthegreat <47210458+raphaelthegreat@users.noreply.github.com> Date: Sat, 1 Jun 2024 20:25:31 +0300 Subject: [PATCH] shader_recompiler: Better branch detection + more opcodes --- .../backend/spirv/emit_spirv.cpp | 11 ++ .../spirv/emit_spirv_context_get_set.cpp | 39 ++-- .../backend/spirv/emit_spirv_instructions.h | 4 + .../backend/spirv/spirv_emit_context.cpp | 20 +- .../backend/spirv/spirv_emit_context.h | 3 + .../frontend/translate/scalar_alu.cpp | 141 +++++++++++++- .../frontend/translate/translate.cpp | 180 ++++++++++++++++-- .../frontend/translate/translate.h | 36 +++- .../frontend/translate/vector_alu.cpp | 168 ++++++++++++++-- .../translate/vector_interpolation.cpp | 1 - src/shader_recompiler/ir/ir_emitter.cpp | 16 ++ src/shader_recompiler/ir/ir_emitter.h | 6 +- src/shader_recompiler/ir/opcodes.inc | 4 + .../ir/passes/resource_tracking_pass.cpp | 22 ++- .../ir/passes/ssa_rewrite_pass.cpp | 27 ++- src/shader_recompiler/ir/value.h | 1 + src/shader_recompiler/recompiler.cpp | 2 +- src/shader_recompiler/runtime_info.h | 2 +- src/video_core/amdgpu/liverpool.cpp | 1 - src/video_core/amdgpu/liverpool.h | 6 + src/video_core/amdgpu/resource.h | 18 ++ .../renderer_vulkan/liverpool_to_vk.cpp | 5 +- .../renderer_vulkan/vk_graphics_pipeline.cpp | 27 ++- .../renderer_vulkan/vk_graphics_pipeline.h | 2 + .../renderer_vulkan/vk_instance.cpp | 1 + .../renderer_vulkan/vk_pipeline_cache.cpp | 12 +- .../renderer_vulkan/vk_rasterizer.cpp | 1 + .../renderer_vulkan/vk_rasterizer.h | 1 + src/video_core/texture_cache/image.cpp | 10 +- src/video_core/texture_cache/image_view.cpp | 9 +- .../texture_cache/texture_cache.cpp | 116 +++++++---- 31 files changed, 772 insertions(+), 120 deletions(-) diff --git a/src/shader_recompiler/backend/spirv/emit_spirv.cpp b/src/shader_recompiler/backend/spirv/emit_spirv.cpp index c211be255..bd3f4f3fb 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv.cpp +++ 
b/src/shader_recompiler/backend/spirv/emit_spirv.cpp @@ -9,6 +9,7 @@ #include "shader_recompiler/backend/spirv/emit_spirv.h" #include "shader_recompiler/backend/spirv/emit_spirv_instructions.h" #include "shader_recompiler/backend/spirv/spirv_emit_context.h" +#include "shader_recompiler/frontend/translate/translate.h" #include "shader_recompiler/ir/basic_block.h" #include "shader_recompiler/ir/program.h" @@ -28,6 +29,8 @@ ArgType Arg(EmitContext& ctx, const IR::Value& arg) { return arg; } else if constexpr (std::is_same_v) { return arg.U32(); + } else if constexpr (std::is_same_v) { + return arg.U64(); } else if constexpr (std::is_same_v) { return arg.Attribute(); } else if constexpr (std::is_same_v) { @@ -279,6 +282,10 @@ void EmitGetVccLo(EmitContext& ctx) { throw LogicError("Unreachable instruction"); } +void EmitGetVccHi(EmitContext& ctx) { + throw LogicError("Unreachable instruction"); +} + void EmitSetScc(EmitContext& ctx) { throw LogicError("Unreachable instruction"); } @@ -295,4 +302,8 @@ void EmitSetVccLo(EmitContext& ctx) { throw LogicError("Unreachable instruction"); } +void EmitSetVccHi(EmitContext& ctx) { + throw LogicError("Unreachable instruction"); +} + } // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp index d6c67ee90..f653c2dd3 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp @@ -33,6 +33,14 @@ Id EmitGetUserData(EmitContext& ctx, IR::ScalarReg reg) { return ctx.ConstU32(ctx.info.user_data[static_cast(reg)]); } +void EmitGetThreadBitScalarReg(EmitContext& ctx) { + throw LogicError("Unreachable instruction"); +} + +void EmitSetThreadBitScalarReg(EmitContext& ctx) { + throw LogicError("Unreachable instruction"); +} + void EmitGetScalarRegister(EmitContext&) { throw LogicError("Unreachable instruction"); } @@ -68,7 
+76,7 @@ Id EmitReadConstBuffer(EmitContext& ctx, u32 handle, Id index) { } Id EmitReadConstBufferU32(EmitContext& ctx, u32 handle, Id index) { - return EmitReadConstBuffer(ctx, handle, index); + return ctx.OpBitcast(ctx.U32[1], EmitReadConstBuffer(ctx, handle, index)); } Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, u32 comp) { @@ -86,7 +94,13 @@ Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, u32 comp) { return ctx.OpLoad(param.component_type, param.id); } } - throw NotImplementedException("Read attribute {}", attr); + switch (attr) { + case IR::Attribute::FragCoord: + return ctx.OpLoad(ctx.F32[1], + ctx.OpAccessChain(ctx.input_f32, ctx.frag_coord, ctx.ConstU32(comp))); + default: + throw NotImplementedException("Read attribute {}", attr); + } } Id EmitGetAttributeU32(EmitContext& ctx, IR::Attribute attr, u32 comp) { @@ -98,6 +112,9 @@ Id EmitGetAttributeU32(EmitContext& ctx, IR::Attribute attr, u32 comp) { case IR::Attribute::LocalInvocationId: return ctx.OpCompositeExtract(ctx.U32[1], ctx.OpLoad(ctx.U32[3], ctx.local_invocation_id), comp); + case IR::Attribute::IsFrontFace: + return ctx.OpSelect(ctx.U32[1], ctx.OpLoad(ctx.U1[1], ctx.front_facing), ctx.u32_one_value, + ctx.u32_zero_value); default: throw NotImplementedException("Read U32 attribute {}", attr); } @@ -136,19 +153,13 @@ Id EmitLoadBufferF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) Id EmitLoadBufferF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { const auto info = inst->Flags(); const auto& buffer = ctx.buffers[handle]; - if (info.index_enable && info.offset_enable) { - UNREACHABLE(); - } else if (info.index_enable) { - boost::container::static_vector ids; - for (u32 i = 0; i < 4; i++) { - const Id index{ctx.OpIAdd(ctx.U32[1], address, ctx.ConstU32(i))}; - const Id ptr{ - ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index)}; - ids.push_back(ctx.OpLoad(buffer.data_types->Get(1), ptr)); - } - return 
ctx.OpCompositeConstruct(buffer.data_types->Get(4), ids); + boost::container::static_vector ids; + for (u32 i = 0; i < 4; i++) { + const Id index{ctx.OpIAdd(ctx.U32[1], address, ctx.ConstU32(i))}; + const Id ptr{ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index)}; + ids.push_back(ctx.OpLoad(buffer.data_types->Get(1), ptr)); } - UNREACHABLE(); + return ctx.OpCompositeConstruct(buffer.data_types->Get(4), ids); } void EmitStoreBufferF32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value) { diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h index 2192b0542..728dd2bc9 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h +++ b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h @@ -34,14 +34,18 @@ void EmitGetScc(EmitContext& ctx); void EmitGetExec(EmitContext& ctx); void EmitGetVcc(EmitContext& ctx); void EmitGetVccLo(EmitContext& ctx); +void EmitGetVccHi(EmitContext& ctx); void EmitSetScc(EmitContext& ctx); void EmitSetExec(EmitContext& ctx); void EmitSetVcc(EmitContext& ctx); void EmitSetVccLo(EmitContext& ctx); +void EmitSetVccHi(EmitContext& ctx); void EmitPrologue(EmitContext& ctx); void EmitEpilogue(EmitContext& ctx); void EmitDiscard(EmitContext& ctx); Id EmitGetUserData(EmitContext& ctx, IR::ScalarReg reg); +void EmitGetThreadBitScalarReg(EmitContext& ctx); +void EmitSetThreadBitScalarReg(EmitContext& ctx); void EmitGetScalarRegister(EmitContext& ctx); void EmitSetScalarRegister(EmitContext& ctx); void EmitGetVectorRegister(EmitContext& ctx); diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp index de0fedd47..87da1a5ba 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp @@ -94,6 +94,7 @@ void EmitContext::DefineArithmeticTypes() { 
true_value = ConstantTrue(U1[1]); false_value = ConstantFalse(U1[1]); + u32_one_value = ConstU32(1U); u32_zero_value = ConstU32(0U); f32_zero_value = ConstF32(0.0f); @@ -177,21 +178,24 @@ void EmitContext::DefineInputs(const Info& info) { } break; case Stage::Fragment: + frag_coord = DefineVariable(F32[4], spv::BuiltIn::FragCoord, spv::StorageClass::Input); + front_facing = DefineVariable(U1[1], spv::BuiltIn::FrontFacing, spv::StorageClass::Input); for (const auto& input : info.ps_inputs) { + const u32 semantic = input.param_index; if (input.is_default) { - input_params[input.semantic] = {MakeDefaultValue(*this, input.default_value), - input_f32, F32[1]}; + input_params[semantic] = {MakeDefaultValue(*this, input.default_value), input_f32, + F32[1]}; continue; } const IR::Attribute param{IR::Attribute::Param0 + input.param_index}; const u32 num_components = info.loads.NumComponents(param); const Id type{F32[num_components]}; - const Id id{DefineInput(type, input.semantic)}; + const Id id{DefineInput(type, semantic)}; if (input.is_flat) { Decorate(id, spv::Decoration::Flat); } - Name(id, fmt::format("fs_in_attr{}", input.semantic)); - input_params[input.semantic] = {id, input_f32, F32[1], num_components}; + Name(id, fmt::format("fs_in_attr{}", semantic)); + input_params[semantic] = {id, input_f32, F32[1], num_components}; interfaces.push_back(id); } break; @@ -260,7 +264,7 @@ void EmitContext::DefineBuffers(const Info& info) { const Id id{AddGlobalVariable(struct_pointer_type, storage_class)}; Decorate(id, spv::Decoration::Binding, binding); Decorate(id, spv::Decoration::DescriptorSet, 0U); - Name(id, fmt::format("{}{}", buffer.is_storage ? "ssbo" : "cbuf", i)); + Name(id, fmt::format("{}_{}", buffer.is_storage ? 
"ssbo" : "cbuf", buffer.sgpr_base)); binding++; buffers.push_back({ @@ -318,7 +322,9 @@ Id ImageType(EmitContext& ctx, const ImageResource& desc, Id sampled_type) { case AmdGpu::ImageType::Color2DArray: return ctx.TypeImage(sampled_type, spv::Dim::Dim2D, false, true, false, 1, format); case AmdGpu::ImageType::Color3D: - return ctx.TypeImage(sampled_type, spv::Dim::Dim3D, false, false, false, 2, format); + return ctx.TypeImage(sampled_type, spv::Dim::Dim3D, false, false, false, 1, format); + case AmdGpu::ImageType::Cube: + return ctx.TypeImage(sampled_type, spv::Dim::Cube, false, false, false, 1, format); case AmdGpu::ImageType::Buffer: throw NotImplementedException("Image buffer"); default: diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.h b/src/shader_recompiler/backend/spirv/spirv_emit_context.h index 53d59f434..67eac9305 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.h +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.h @@ -140,6 +140,7 @@ public: Id true_value{}; Id false_value{}; + Id u32_one_value{}; Id u32_zero_value{}; Id f32_zero_value{}; @@ -154,6 +155,8 @@ public: Id output_position{}; Id vertex_index{}; Id base_vertex{}; + Id frag_coord{}; + Id front_facing{}; std::array frag_color{}; Id workgroup_id{}; diff --git a/src/shader_recompiler/frontend/translate/scalar_alu.cpp b/src/shader_recompiler/frontend/translate/scalar_alu.cpp index 62d3a3782..ac9157344 100644 --- a/src/shader_recompiler/frontend/translate/scalar_alu.cpp +++ b/src/shader_recompiler/frontend/translate/scalar_alu.cpp @@ -38,8 +38,145 @@ void Translator::S_CMP(ConditionOp cond, bool is_signed, const GcnInst& inst) { } void Translator::S_ANDN2_B64(const GcnInst& inst) { - // TODO: Actually implement this. - ir.SetScc(ir.GetVcc()); + // TODO: What if this is used for something other than EXEC masking? 
+ const auto get_src = [&](const InstOperand& operand) { + switch (operand.field) { + case OperandField::VccLo: + return ir.GetVcc(); + case OperandField::ExecLo: + return ir.GetExec(); + case OperandField::ScalarGPR: + return ir.GetThreadBitScalarReg(IR::ScalarReg(operand.code)); + default: + UNREACHABLE(); + } + }; + + const IR::U1 src0{get_src(inst.src[0])}; + const IR::U1 src1{get_src(inst.src[1])}; + const IR::U1 result{ir.LogicalAnd(src0, ir.LogicalNot(src1))}; + SetDst(inst.dst[0], result); + ir.SetScc(result); +} + +void Translator::S_AND_SAVEEXEC_B64(const GcnInst& inst) { + // This instruction normally operates on 64-bit data (EXEC, VCC, SGPRs) + // However here we flatten it to 1-bit EXEC and 1-bit VCC. For the destination + // SGPR we have a special IR opcode for SGPRs that act as thread masks. + const IR::U1 exec{ir.GetExec()}; + + // Mark destination SGPR as an EXEC context. This means we will use 1-bit + // IR instruction whenever it's loaded. + ASSERT(inst.dst[0].field == OperandField::ScalarGPR); + const u32 reg = inst.dst[0].code; + exec_contexts[reg] = true; + ir.SetThreadBitScalarReg(IR::ScalarReg(reg), exec); + + // Update EXEC. + ASSERT(inst.src[0].field == OperandField::VccLo); + ir.SetExec(ir.LogicalAnd(exec, ir.GetVcc())); +} + +void Translator::S_MOV_B64(const GcnInst& inst) { + // TODO: Using VCC as EXEC context.
+ if (inst.src[0].field == OperandField::VccLo || inst.dst[0].field == OperandField::VccLo) { + return; + } + const IR::U1 src0{GetSrc(inst.src[0])}; + if (inst.dst[0].field == OperandField::ScalarGPR && inst.src[0].field == OperandField::ExecLo) { + // Exec context push + exec_contexts[inst.dst[0].code] = true; + } else if (inst.dst[0].field == OperandField::ExecLo && + inst.src[0].field == OperandField::ScalarGPR) { + // Exec context pop + exec_contexts[inst.src[0].code] = false; + } else if (inst.src[0].field != OperandField::ConstZero) { + UNREACHABLE(); + } + SetDst(inst.dst[0], src0); +} + +void Translator::S_OR_B64(bool negate, const GcnInst& inst) { + const auto get_src = [&](const InstOperand& operand) { + switch (operand.field) { + case OperandField::VccLo: + return ir.GetVcc(); + case OperandField::ScalarGPR: + return ir.GetThreadBitScalarReg(IR::ScalarReg(operand.code)); + default: + UNREACHABLE(); + } + }; + + const IR::U1 src0{get_src(inst.src[0])}; + const IR::U1 src1{get_src(inst.src[1])}; + IR::U1 result = ir.LogicalOr(src0, src1); + if (negate) { + result = ir.LogicalNot(result); + } + ASSERT(inst.dst[0].field == OperandField::VccLo); + ir.SetVcc(result); + ir.SetScc(result); +} + +void Translator::S_AND_B64(const GcnInst& inst) { + const auto get_src = [&](const InstOperand& operand) { + switch (operand.field) { + case OperandField::VccLo: + return ir.GetVcc(); + case OperandField::ExecLo: + return ir.GetExec(); + case OperandField::ScalarGPR: + return ir.GetThreadBitScalarReg(IR::ScalarReg(operand.code)); + default: + UNREACHABLE(); + } + }; + const IR::U1 src0{get_src(inst.src[0])}; + const IR::U1 src1{get_src(inst.src[1])}; + const IR::U1 result = ir.LogicalAnd(src0, src1); + ASSERT(inst.dst[0].field == OperandField::VccLo); + ir.SetVcc(result); + ir.SetScc(result); +} + +void Translator::S_ADD_I32(const GcnInst& inst) { + const IR::U32 src0{GetSrc(inst.src[0])}; + const IR::U32 src1{GetSrc(inst.src[1])}; + SetDst(inst.dst[0], ir.IAdd(src0, 
src1)); + // TODO: Overflow flag +} + +void Translator::S_AND_B32(const GcnInst& inst) { + const IR::U32 src0{GetSrc(inst.src[0])}; + const IR::U32 src1{GetSrc(inst.src[1])}; + const IR::U32 result{ir.BitwiseAnd(src0, src1)}; + SetDst(inst.dst[0], result); + ir.SetScc(ir.INotEqual(result, ir.Imm32(0))); +} + +void Translator::S_LSHR_B32(const GcnInst& inst) { + const IR::U32 src0{GetSrc(inst.src[0])}; + const IR::U32 src1{GetSrc(inst.src[1])}; + const IR::U32 result{ir.ShiftRightLogical(src0, src1)}; + SetDst(inst.dst[0], result); + ir.SetScc(ir.INotEqual(result, ir.Imm32(0))); +} + +void Translator::S_CSELECT_B32(const GcnInst& inst) { + const IR::U32 src0{GetSrc(inst.src[0])}; + const IR::U32 src1{GetSrc(inst.src[1])}; + SetDst(inst.dst[0], IR::U32{ir.Select(ir.GetScc(), src0, src1)}); +} + +void Translator::S_BFE_U32(const GcnInst& inst) { + const IR::U32 src0{GetSrc(inst.src[0])}; + const IR::U32 src1{GetSrc(inst.src[1])}; + const IR::U32 offset{ir.BitwiseAnd(src1, ir.Imm32(0x1F))}; + const IR::U32 count{ir.BitFieldExtract(src1, ir.Imm32(16), ir.Imm32(7))}; + const IR::U32 result{ir.BitFieldExtract(src0, offset, count)}; + SetDst(inst.dst[0], result); + ir.SetScc(ir.INotEqual(result, ir.Imm32(0))); } } // namespace Shader::Gcn diff --git a/src/shader_recompiler/frontend/translate/translate.cpp b/src/shader_recompiler/frontend/translate/translate.cpp index 624091527..d5ea8c48d 100644 --- a/src/shader_recompiler/frontend/translate/translate.cpp +++ b/src/shader_recompiler/frontend/translate/translate.cpp @@ -9,11 +9,15 @@ namespace Shader::Gcn { +std::array Translator::exec_contexts{}; + Translator::Translator(IR::Block* block_, Info& info_) : ir{*block_, block_->begin()}, info{info_} {} void Translator::EmitPrologue() { + exec_contexts.fill(false); ir.Prologue(); + ir.SetExec(ir.Imm1(true)); // Initialize user data. 
IR::ScalarReg dst_sreg = IR::ScalarReg::S0; @@ -54,10 +58,16 @@ void Translator::EmitPrologue() { } } -IR::U32F32 Translator::GetSrc(const InstOperand& operand, bool force_flt) { - IR::U32F32 value{}; +IR::U1U32F32 Translator::GetSrc(const InstOperand& operand, bool force_flt) { + // Input modifiers work on float values. + force_flt |= operand.input_modifier.abs | operand.input_modifier.neg; + + IR::U1U32F32 value{}; switch (operand.field) { case OperandField::ScalarGPR: + if (exec_contexts[operand.code]) { + value = ir.GetThreadBitScalarReg(IR::ScalarReg(operand.code)); + } if (operand.type == ScalarType::Float32 || force_flt) { value = ir.GetScalarReg(IR::ScalarReg(operand.code)); } else { @@ -114,9 +124,15 @@ IR::U32F32 Translator::GetSrc(const InstOperand& operand, bool force_flt) { case OperandField::ConstFloatNeg_2_0: value = ir.Imm32(-2.0f); break; + case OperandField::ExecLo: + value = ir.GetExec(); + break; case OperandField::VccLo: value = ir.GetVccLo(); break; + case OperandField::VccHi: + value = ir.GetVccHi(); + break; default: UNREACHABLE(); } @@ -130,8 +146,8 @@ IR::U32F32 Translator::GetSrc(const InstOperand& operand, bool force_flt) { return value; } -void Translator::SetDst(const InstOperand& operand, const IR::U32F32& value) { - IR::U32F32 result = value; +void Translator::SetDst(const InstOperand& operand, const IR::U1U32F32& value) { + IR::U1U32F32 result = value; if (operand.output_modifier.multiplier != 0.f) { result = ir.FPMul(result, ir.Imm32(operand.output_modifier.multiplier)); } @@ -140,14 +156,20 @@ void Translator::SetDst(const InstOperand& operand, const IR::U32F32& value) { } switch (operand.field) { case OperandField::ScalarGPR: + if (value.Type() == IR::Type::U1) { + return ir.SetThreadBitScalarReg(IR::ScalarReg(operand.code), result); + } return ir.SetScalarReg(IR::ScalarReg(operand.code), result); case OperandField::VectorGPR: return ir.SetVectorReg(IR::VectorReg(operand.code), result); + case OperandField::ExecLo: + return 
ir.SetExec(result); case OperandField::VccLo: return ir.SetVccLo(result); case OperandField::VccHi: + return ir.SetVccHi(result); case OperandField::M0: - break; // Ignore for now + break; default: UNREACHABLE(); } @@ -279,11 +301,32 @@ void Translate(IR::Block* block, std::span inst_list, Info& info) case Opcode::IMAGE_SAMPLE: translator.IMAGE_SAMPLE(inst); break; - case Opcode::V_CMP_EQ_U32: - translator.V_CMP_EQ_U32(inst); + case Opcode::V_CMP_EQ_I32: + translator.V_CMP_U32(ConditionOp::EQ, true, false, inst); break; - case Opcode::V_CMPX_GT_U32: - translator.V_CMPX_GT_U32(inst); + case Opcode::V_CMP_NE_U32: + translator.V_CMP_U32(ConditionOp::LG, false, false, inst); + break; + case Opcode::V_CMP_EQ_U32: + translator.V_CMP_U32(ConditionOp::EQ, false, false, inst); + break; + case Opcode::V_CMP_F_U32: + translator.V_CMP_U32(ConditionOp::F, false, false, inst); + break; + case Opcode::V_CMP_LT_U32: + translator.V_CMP_U32(ConditionOp::LT, false, false, inst); + break; + case Opcode::V_CMP_GT_U32: + translator.V_CMP_U32(ConditionOp::GT, false, false, inst); + break; + case Opcode::V_CMP_GE_U32: + translator.V_CMP_U32(ConditionOp::GE, false, false, inst); + break; + case Opcode::V_CMP_TRU_U32: + translator.V_CMP_U32(ConditionOp::TRU, false, false, inst); + break; + case Opcode::V_CMP_NEQ_F32: + translator.V_CMP_F32(ConditionOp::LG, inst); break; case Opcode::V_CMP_F_F32: translator.V_CMP_F32(ConditionOp::F, inst); @@ -309,6 +352,9 @@ void Translate(IR::Block* block, std::span inst_list, Info& info) case Opcode::S_CMP_LG_U32: translator.S_CMP(ConditionOp::LG, false, inst); break; + case Opcode::S_CMP_EQ_I32: + translator.S_CMP(ConditionOp::EQ, true, inst); + break; case Opcode::V_CNDMASK_B32: translator.V_CNDMASK_B32(inst); break; @@ -348,13 +394,125 @@ void Translate(IR::Block* block, std::span inst_list, Info& info) case Opcode::V_MIN3_F32: translator.V_MIN3_F32(inst); break; - case Opcode::S_NOP: + case Opcode::V_MADMK_F32: + translator.V_MADMK_F32(inst); + break; 
+ case Opcode::V_CUBEMA_F32: + translator.V_CUBEMA_F32(inst); + break; + case Opcode::V_CUBESC_F32: + translator.V_CUBESC_F32(inst); + break; + case Opcode::V_CUBETC_F32: + translator.V_CUBETC_F32(inst); + break; + case Opcode::V_CUBEID_F32: + translator.V_CUBEID_F32(inst); + break; + case Opcode::V_CVT_U32_F32: + translator.V_CVT_U32_F32(inst); + break; + case Opcode::V_SUBREV_F32: + translator.V_SUBREV_F32(inst); + break; + case Opcode::S_AND_SAVEEXEC_B64: + translator.S_AND_SAVEEXEC_B64(inst); + break; + case Opcode::S_MOV_B64: + translator.S_MOV_B64(inst); + break; + case Opcode::V_SUBREV_I32: + translator.V_SUBREV_I32(inst); + break; + case Opcode::V_CMP_LE_U32: + translator.V_CMP_U32(ConditionOp::LE, false, false, inst); + break; + case Opcode::V_CMP_GT_I32: + translator.V_CMP_U32(ConditionOp::GT, true, false, inst); + break; + case Opcode::V_CMPX_F_U32: + translator.V_CMP_U32(ConditionOp::F, false, true, inst); + break; + case Opcode::V_CMPX_LT_U32: + translator.V_CMP_U32(ConditionOp::LT, false, true, inst); + break; + case Opcode::V_CMPX_EQ_U32: + translator.V_CMP_U32(ConditionOp::EQ, false, true, inst); + break; + case Opcode::V_CMPX_LE_U32: + translator.V_CMP_U32(ConditionOp::LE, false, true, inst); + break; + case Opcode::V_CMPX_GT_U32: + translator.V_CMP_U32(ConditionOp::GT, false, true, inst); + break; + case Opcode::V_CMPX_NE_U32: + translator.V_CMP_U32(ConditionOp::LG, false, true, inst); + break; + case Opcode::V_CMPX_GE_U32: + translator.V_CMP_U32(ConditionOp::GE, false, true, inst); + break; + case Opcode::V_CMPX_TRU_U32: + translator.V_CMP_U32(ConditionOp::TRU, false, true, inst); + break; + case Opcode::S_OR_B64: + translator.S_OR_B64(false, inst); + break; + case Opcode::S_NOR_B64: + translator.S_OR_B64(true, inst); + break; case Opcode::S_AND_B64: + translator.S_AND_B64(inst); + break; + case Opcode::V_LSHRREV_B32: + translator.V_LSHRREV_B32(inst); + break; + case Opcode::S_ADD_I32: + translator.S_ADD_I32(inst); + break; + case 
Opcode::V_MUL_LO_I32: + translator.V_MUL_LO_I32(inst); + break; + case Opcode::V_SAD_U32: + translator.V_SAD_U32(inst); + break; + case Opcode::V_BFE_U32: + translator.V_BFE_U32(inst); + break; + case Opcode::V_MAD_I32_I24: + translator.V_MAD_I32_I24(inst); + break; + case Opcode::V_MUL_I32_I24: + translator.V_MUL_I32_I24(inst); + break; + case Opcode::V_SUB_I32: + translator.V_SUB_I32(inst); + break; + case Opcode::V_LSHR_B32: + translator.V_LSHR_B32(inst); + break; + case Opcode::V_ASHRREV_I32: + translator.V_ASHRREV_I32(inst); + break; + case Opcode::V_MAD_U32_U24: + translator.V_MAD_U32_U24(inst); + break; + case Opcode::S_AND_B32: + translator.S_AND_B32(inst); + break; + case Opcode::S_LSHR_B32: + translator.S_LSHR_B32(inst); + break; + case Opcode::S_CSELECT_B32: + translator.S_CSELECT_B32(inst); + break; + case Opcode::S_BFE_U32: + translator.S_BFE_U32(inst); + break; + case Opcode::S_NOP: case Opcode::S_CBRANCH_EXECZ: case Opcode::S_CBRANCH_SCC0: case Opcode::S_CBRANCH_SCC1: case Opcode::S_BRANCH: - case Opcode::S_MOV_B64: case Opcode::S_WQM_B64: case Opcode::V_INTERP_P1_F32: case Opcode::S_ENDPGM: diff --git a/src/shader_recompiler/frontend/translate/translate.h b/src/shader_recompiler/frontend/translate/translate.h index e721dad5d..d1efb724e 100644 --- a/src/shader_recompiler/frontend/translate/translate.h +++ b/src/shader_recompiler/frontend/translate/translate.h @@ -23,6 +23,7 @@ enum class ConditionOp : u32 { GE, LT, LE, + TRU, }; class Translator { @@ -37,6 +38,15 @@ public: void S_MUL_I32(const GcnInst& inst); void S_CMP(ConditionOp cond, bool is_signed, const GcnInst& inst); void S_ANDN2_B64(const GcnInst& inst); + void S_AND_SAVEEXEC_B64(const GcnInst& inst); + void S_MOV_B64(const GcnInst& inst); + void S_OR_B64(bool negate, const GcnInst& inst); + void S_AND_B64(const GcnInst& inst); + void S_ADD_I32(const GcnInst& inst); + void S_AND_B32(const GcnInst& inst); + void S_LSHR_B32(const GcnInst& inst); + void S_CSELECT_B32(const GcnInst& inst); + 
void S_BFE_U32(const GcnInst& inst); // Scalar Memory void S_LOAD_DWORD(int num_dwords, const GcnInst& inst); @@ -48,7 +58,6 @@ public: void V_MAC_F32(const GcnInst& inst); void V_CVT_PKRTZ_F16_F32(const GcnInst& inst); void V_MUL_F32(const GcnInst& inst); - void V_CMP_EQ_U32(const GcnInst& inst); void V_CNDMASK_B32(const GcnInst& inst); void V_AND_B32(const GcnInst& inst); void V_LSHLREV_B32(const GcnInst& inst); @@ -63,7 +72,6 @@ public: void V_FLOOR_F32(const GcnInst& inst); void V_SUB_F32(const GcnInst& inst); void V_RCP_F32(const GcnInst& inst); - void V_CMPX_GT_U32(const GcnInst& inst); void V_FMA_F32(const GcnInst& inst); void V_CMP_F32(ConditionOp op, const GcnInst& inst); void V_MAX_F32(const GcnInst& inst); @@ -74,6 +82,25 @@ public: void V_SQRT_F32(const GcnInst& inst); void V_MIN_F32(const GcnInst& inst); void V_MIN3_F32(const GcnInst& inst); + void V_MADMK_F32(const GcnInst& inst); + void V_CUBEMA_F32(const GcnInst& inst); + void V_CUBESC_F32(const GcnInst& inst); + void V_CUBETC_F32(const GcnInst& inst); + void V_CUBEID_F32(const GcnInst& inst); + void V_CVT_U32_F32(const GcnInst& inst); + void V_SUBREV_F32(const GcnInst& inst); + void V_SUBREV_I32(const GcnInst& inst); + void V_CMP_U32(ConditionOp op, bool is_signed, bool set_exec, const GcnInst& inst); + void V_LSHRREV_B32(const GcnInst& inst); + void V_MUL_LO_I32(const GcnInst& inst); + void V_SAD_U32(const GcnInst& inst); + void V_BFE_U32(const GcnInst& inst); + void V_MAD_I32_I24(const GcnInst& inst); + void V_MUL_I32_I24(const GcnInst& inst); + void V_SUB_I32(const GcnInst& inst); + void V_LSHR_B32(const GcnInst& inst); + void V_ASHRREV_I32(const GcnInst& inst); + void V_MAD_U32_U24(const GcnInst& inst); // Vector Memory void BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, const GcnInst& inst); @@ -94,12 +121,13 @@ public: void EXP(const GcnInst& inst); private: - IR::U32F32 GetSrc(const InstOperand& operand, bool flt_zero = false); - void SetDst(const InstOperand& operand, const IR::U32F32& 
value); + IR::U1U32F32 GetSrc(const InstOperand& operand, bool flt_zero = false); + void SetDst(const InstOperand& operand, const IR::U1U32F32& value); private: IR::IREmitter ir; Info& info; + static std::array exec_contexts; }; void Translate(IR::Block* block, std::span inst_list, Info& info); diff --git a/src/shader_recompiler/frontend/translate/vector_alu.cpp b/src/shader_recompiler/frontend/translate/vector_alu.cpp index 81366117f..085d86941 100644 --- a/src/shader_recompiler/frontend/translate/vector_alu.cpp +++ b/src/shader_recompiler/frontend/translate/vector_alu.cpp @@ -29,17 +29,6 @@ void Translator::V_MUL_F32(const GcnInst& inst) { ir.SetVectorReg(dst_reg, ir.FPMul(GetSrc(inst.src[0], true), GetSrc(inst.src[1], true))); } -void Translator::V_CMP_EQ_U32(const GcnInst& inst) { - const IR::U1 result = ir.IEqual(GetSrc(inst.src[0]), GetSrc(inst.src[1])); - if (inst.dst[1].field == OperandField::VccLo) { - return ir.SetVcc(result); - } else if (inst.dst[1].field == OperandField::ScalarGPR) { - const IR::ScalarReg dst_reg{inst.dst[1].code}; - return ir.SetScalarReg(dst_reg, IR::U32{ir.Select(result, ir.Imm32(1U), ir.Imm32(0U))}); - } - UNREACHABLE(); -} - void Translator::V_CNDMASK_B32(const GcnInst& inst) { const IR::VectorReg dst_reg{inst.dst[0].code}; const IR::ScalarReg flag_reg{inst.src[2].code}; @@ -70,9 +59,9 @@ void Translator::V_AND_B32(const GcnInst& inst) { void Translator::V_LSHLREV_B32(const GcnInst& inst) { const IR::U32 src0{GetSrc(inst.src[0])}; - const IR::U32 src1{ir.GetVectorReg(IR::VectorReg(inst.src[1].code))}; + const IR::U32 src1{GetSrc(inst.src[1])}; const IR::VectorReg dst_reg{inst.dst[0].code}; - ir.SetVectorReg(dst_reg, ir.ShiftLeftLogical(src1, src0)); + ir.SetVectorReg(dst_reg, ir.ShiftLeftLogical(src1, ir.BitwiseAnd(src0, ir.Imm32(0x1F)))); } void Translator::V_ADD_I32(const GcnInst& inst) { @@ -148,14 +137,6 @@ void Translator::V_RCP_F32(const GcnInst& inst) { SetDst(inst.dst[0], ir.FPRecip(src0)); } -void 
Translator::V_CMPX_GT_U32(const GcnInst& inst) { - const IR::U32 src0{GetSrc(inst.src[0])}; - const IR::U32 src1{GetSrc(inst.src[1])}; - const IR::U1 result = ir.IGreaterThan(src0, src1, false); - ir.SetVcc(result); - ir.SetExec(result); -} - void Translator::V_FMA_F32(const GcnInst& inst) { const IR::F32 src0{GetSrc(inst.src[0], true)}; const IR::F32 src1{GetSrc(inst.src[1], true)}; @@ -182,6 +163,8 @@ void Translator::V_CMP_F32(ConditionOp op, const GcnInst& inst) { return ir.FPLessThanEqual(src0, src1); case ConditionOp::GE: return ir.FPGreaterThanEqual(src0, src1); + default: + UNREACHABLE(); } }(); ir.SetVcc(result); @@ -231,4 +214,147 @@ void Translator::V_MIN3_F32(const GcnInst& inst) { SetDst(inst.dst[0], ir.FPMin(src0, ir.FPMin(src1, src2))); } +void Translator::V_MADMK_F32(const GcnInst& inst) { + const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src1{GetSrc(inst.src[1], true)}; + const IR::F32 k{GetSrc(inst.src[2], true)}; + SetDst(inst.dst[0], ir.FPFma(src0, k, src1)); +} + +void Translator::V_CUBEMA_F32(const GcnInst& inst) { + SetDst(inst.dst[0], ir.Imm32(1.f)); +} + +void Translator::V_CUBESC_F32(const GcnInst& inst) { + SetDst(inst.dst[0], GetSrc(inst.src[0], true)); +} + +void Translator::V_CUBETC_F32(const GcnInst& inst) { + SetDst(inst.dst[0], GetSrc(inst.src[1], true)); +} + +void Translator::V_CUBEID_F32(const GcnInst& inst) { + SetDst(inst.dst[0], GetSrc(inst.src[2], true)); +} + +void Translator::V_CVT_U32_F32(const GcnInst& inst) { + const IR::F32 src0{GetSrc(inst.src[0], true)}; + SetDst(inst.dst[0], ir.ConvertFToU(32, src0)); +} + +void Translator::V_SUBREV_F32(const GcnInst& inst) { + const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src1{GetSrc(inst.src[1], true)}; + SetDst(inst.dst[0], ir.FPSub(src1, src0)); +} + +void Translator::V_SUBREV_I32(const GcnInst& inst) { + const IR::U32 src0{GetSrc(inst.src[0])}; + const IR::U32 src1{GetSrc(inst.src[1])}; + SetDst(inst.dst[0], ir.ISub(src1, src0)); + // TODO: 
Carry-out +} + +void Translator::V_CMP_U32(ConditionOp op, bool is_signed, bool set_exec, const GcnInst& inst) { + const IR::U32 src0{GetSrc(inst.src[0])}; + const IR::U32 src1{GetSrc(inst.src[1])}; + const IR::U1 result = [&] { + switch (op) { + case ConditionOp::F: + return ir.Imm1(false); + case ConditionOp::TRU: + return ir.Imm1(true); + case ConditionOp::EQ: + return ir.IEqual(src0, src1); + case ConditionOp::LG: + return ir.INotEqual(src0, src1); + case ConditionOp::GT: + return ir.IGreaterThan(src0, src1, is_signed); + case ConditionOp::LT: + return ir.ILessThan(src0, src1, is_signed); + case ConditionOp::LE: + return ir.ILessThanEqual(src0, src1, is_signed); + case ConditionOp::GE: + return ir.IGreaterThanEqual(src0, src1, is_signed); + default: + UNREACHABLE(); + } + }(); + if (set_exec) { + ir.SetExec(result); + } + switch (inst.dst[1].field) { + case OperandField::VccLo: + return ir.SetVcc(result); + case OperandField::ScalarGPR: + return ir.SetThreadBitScalarReg(IR::ScalarReg(inst.dst[1].code), result); + default: + UNREACHABLE(); + } +} + +void Translator::V_LSHRREV_B32(const GcnInst& inst) { + const IR::U32 src0{GetSrc(inst.src[0])}; + const IR::U32 src1{GetSrc(inst.src[1])}; + SetDst(inst.dst[0], ir.ShiftRightLogical(src1, ir.BitwiseAnd(src0, ir.Imm32(0x1F)))); +} + +void Translator::V_MUL_LO_I32(const GcnInst& inst) { + const IR::U32 src0{GetSrc(inst.src[0])}; + const IR::U32 src1{GetSrc(inst.src[1])}; + SetDst(inst.dst[0], ir.IMul(src0, src1)); +} + +void Translator::V_SAD_U32(const GcnInst& inst) { + const IR::U32 src0{GetSrc(inst.src[0])}; + const IR::U32 src1{GetSrc(inst.src[1])}; + const IR::U32 src2{GetSrc(inst.src[2])}; + const IR::U32 max{ir.IMax(src0, src1, false)}; + const IR::U32 min{ir.IMin(src0, src1, false)}; + SetDst(inst.dst[0], ir.IAdd(ir.ISub(max, min), src2)); +} + +void Translator::V_BFE_U32(const GcnInst& inst) { + const IR::U32 src0{GetSrc(inst.src[0])}; + const IR::U32 src1{ir.BitwiseAnd(GetSrc(inst.src[1]), ir.Imm32(0x1F))};
+ const IR::U32 src2{ir.BitwiseAnd(GetSrc(inst.src[2]), ir.Imm32(0x1F))}; + SetDst(inst.dst[0], ir.BitFieldExtract(src0, src1, src2)); +} + +void Translator::V_MAD_I32_I24(const GcnInst& inst) { + const IR::U32 src0{ir.BitFieldExtract(GetSrc(inst.src[0]), ir.Imm32(0), ir.Imm32(24), true)}; + const IR::U32 src1{ir.BitFieldExtract(GetSrc(inst.src[1]), ir.Imm32(0), ir.Imm32(24), true)}; + const IR::U32 src2{GetSrc(inst.src[2])}; + SetDst(inst.dst[0], ir.IAdd(ir.IMul(src0, src1), src2)); +} + +void Translator::V_MUL_I32_I24(const GcnInst& inst) { + const IR::U32 src0{ir.BitFieldExtract(GetSrc(inst.src[0]), ir.Imm32(0), ir.Imm32(24), true)}; + const IR::U32 src1{ir.BitFieldExtract(GetSrc(inst.src[1]), ir.Imm32(0), ir.Imm32(24), true)}; + SetDst(inst.dst[0], ir.IMul(src0, src1)); +} + +void Translator::V_SUB_I32(const GcnInst& inst) { + const IR::U32 src0{GetSrc(inst.src[0])}; + const IR::U32 src1{GetSrc(inst.src[1])}; + SetDst(inst.dst[0], ir.ISub(src0, src1)); +} + +void Translator::V_LSHR_B32(const GcnInst& inst) { + const IR::U32 src0{GetSrc(inst.src[0])}; + const IR::U32 src1{GetSrc(inst.src[1])}; + SetDst(inst.dst[0], ir.ShiftRightLogical(src0, ir.BitwiseAnd(src1, ir.Imm32(0x1F)))); +} + +void Translator::V_ASHRREV_I32(const GcnInst& inst) { + const IR::U32 src0{GetSrc(inst.src[0])}; + const IR::U32 src1{GetSrc(inst.src[1])}; + SetDst(inst.dst[0], ir.ShiftRightArithmetic(src1, ir.BitwiseAnd(src0, ir.Imm32(0x1F)))); +} + +void Translator::V_MAD_U32_U24(const GcnInst& inst) { + // TODO: + V_MAD_I32_I24(inst); +} + } // namespace Shader::Gcn diff --git a/src/shader_recompiler/frontend/translate/vector_interpolation.cpp b/src/shader_recompiler/frontend/translate/vector_interpolation.cpp index 7d41d4306..55a2d624e 100644 --- a/src/shader_recompiler/frontend/translate/vector_interpolation.cpp +++ b/src/shader_recompiler/frontend/translate/vector_interpolation.cpp @@ -8,7 +8,6 @@ namespace Shader::Gcn { void Translator::V_INTERP_P2_F32(const GcnInst& inst) { const 
IR::VectorReg dst_reg{inst.dst[0].code}; auto& attr = info.ps_inputs.at(inst.control.vintrp.attr); - attr.semantic = inst.control.vintrp.attr; const IR::Attribute attrib{IR::Attribute::Param0 + attr.param_index}; ir.SetVectorReg(dst_reg, ir.GetAttribute(attrib, inst.control.vintrp.chan)); } diff --git a/src/shader_recompiler/ir/ir_emitter.cpp b/src/shader_recompiler/ir/ir_emitter.cpp index 43e8e4393..bd41d5876 100644 --- a/src/shader_recompiler/ir/ir_emitter.cpp +++ b/src/shader_recompiler/ir/ir_emitter.cpp @@ -119,6 +119,14 @@ U32 IREmitter::GetUserData(IR::ScalarReg reg) { return Inst(Opcode::GetUserData, reg); } +U1 IREmitter::GetThreadBitScalarReg(IR::ScalarReg reg) { + return Inst(Opcode::GetThreadBitScalarReg, reg); +} + +void IREmitter::SetThreadBitScalarReg(IR::ScalarReg reg, const U1& value) { + Inst(Opcode::SetThreadBitScalarReg, reg, value); +} + template <> U32 IREmitter::GetScalarReg(IR::ScalarReg reg) { return Inst(Opcode::GetScalarRegister, reg); @@ -196,6 +204,10 @@ U32 IREmitter::GetVccLo() { return Inst(Opcode::GetVccLo); } +U32 IREmitter::GetVccHi() { + return Inst(Opcode::GetVccHi); +} + void IREmitter::SetScc(const U1& value) { Inst(Opcode::SetScc, value); } @@ -212,6 +224,10 @@ void IREmitter::SetVccLo(const U32& value) { Inst(Opcode::SetVccLo, value); } +void IREmitter::SetVccHi(const U32& value) { + Inst(Opcode::SetVccHi, value); +} + F32 IREmitter::GetAttribute(IR::Attribute attribute, u32 comp) { return Inst(Opcode::GetAttribute, attribute, Imm32(comp)); } diff --git a/src/shader_recompiler/ir/ir_emitter.h b/src/shader_recompiler/ir/ir_emitter.h index a52437a90..3394c9b65 100644 --- a/src/shader_recompiler/ir/ir_emitter.h +++ b/src/shader_recompiler/ir/ir_emitter.h @@ -43,7 +43,9 @@ public: void Epilogue(); void Discard(); - U32 GetUserData(IR::ScalarReg reg); + [[nodiscard]] U32 GetUserData(IR::ScalarReg reg); + [[nodiscard]] U1 GetThreadBitScalarReg(IR::ScalarReg reg); + void SetThreadBitScalarReg(IR::ScalarReg reg, const U1& value); 
template [[nodiscard]] T GetScalarReg(IR::ScalarReg reg); @@ -59,10 +61,12 @@ public: [[nodiscard]] U1 GetExec(); [[nodiscard]] U1 GetVcc(); [[nodiscard]] U32 GetVccLo(); + [[nodiscard]] U32 GetVccHi(); void SetScc(const U1& value); void SetExec(const U1& value); void SetVcc(const U1& value); void SetVccLo(const U32& value); + void SetVccHi(const U32& value); [[nodiscard]] U1 Condition(IR::Condition cond); diff --git a/src/shader_recompiler/ir/opcodes.inc b/src/shader_recompiler/ir/opcodes.inc index 5fb4dd0f7..a3009575b 100644 --- a/src/shader_recompiler/ir/opcodes.inc +++ b/src/shader_recompiler/ir/opcodes.inc @@ -21,6 +21,8 @@ OPCODE(ReadConstBufferU32, U32, Opaq // Context getters/setters OPCODE(GetUserData, U32, ScalarReg, ) +OPCODE(GetThreadBitScalarReg, U1, ScalarReg, ) +OPCODE(SetThreadBitScalarReg, Void, ScalarReg, U1, ) OPCODE(GetScalarRegister, U32, ScalarReg, ) OPCODE(SetScalarRegister, Void, ScalarReg, U32, ) OPCODE(GetVectorRegister, U32, VectorReg, ) @@ -36,10 +38,12 @@ OPCODE(GetScc, U1, Void, OPCODE(GetExec, U1, Void, ) OPCODE(GetVcc, U1, Void, ) OPCODE(GetVccLo, U32, Void, ) +OPCODE(GetVccHi, U32, Void, ) OPCODE(SetScc, Void, U1, ) OPCODE(SetExec, Void, U1, ) OPCODE(SetVcc, Void, U1, ) OPCODE(SetVccLo, Void, U32, ) +OPCODE(SetVccHi, Void, U32, ) // Undefined OPCODE(UndefU1, U1, ) diff --git a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp index 95ebdf1cf..c8e8d9cf6 100644 --- a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp +++ b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp @@ -206,9 +206,12 @@ void PatchBufferInstruction(IR::Block& block, IR::Inst& inst, Info& info, const u32 dword_offset = inst_info.inst_offset.Value() / sizeof(u32); IR::U32 address = ir.Imm32(dword_offset); if (inst_info.index_enable && inst_info.offset_enable) { - UNREACHABLE(); + const IR::U32 offset{ir.CompositeExtract(inst.Arg(1), 0)}; + const IR::U32 
index{ir.CompositeExtract(inst.Arg(1), 1)}; + address = ir.IAdd(ir.IMul(index, ir.Imm32(dword_stride)), address); + address = ir.IAdd(address, ir.ShiftRightLogical(offset, ir.Imm32(2))); } else if (inst_info.index_enable) { - IR::U32 index{inst.Arg(1)}; + const IR::U32 index{inst.Arg(1)}; address = ir.IAdd(ir.IMul(index, ir.Imm32(dword_stride)), address); } else if (inst_info.offset_enable) { const IR::U32 offset{inst.Arg(1)}; @@ -216,6 +219,17 @@ void PatchBufferInstruction(IR::Block& block, IR::Inst& inst, Info& info, inst.SetArg(1, address); } +IR::Value PatchCubeCoord(IR::IREmitter& ir, const IR::Value& s, const IR::Value& t, + const IR::Value& z) { + // We need to fix x and y coordinate, + // because the s and t coordinate will be scaled and plus 1.5 by v_madak_f32. + // We already force the scale value to be 1.0 when handling v_cubema_f32, + // here we subtract 1.5 to recover the original value. + const IR::Value x = ir.FPSub(IR::F32{s}, ir.Imm32(1.5f)); + const IR::Value y = ir.FPSub(IR::F32{t}, ir.Imm32(1.5f)); + return ir.CompositeConstruct(x, y, z); +} + void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descriptors& descriptors) { IR::Inst* producer = inst.Arg(0).InstRecursive(); ASSERT(producer->GetOpcode() == IR::Opcode::CompositeConstructU32x2); @@ -256,8 +270,9 @@ void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descrip return {ir.CompositeConstruct(body->Arg(0), body->Arg(1)), body->Arg(2)}; case AmdGpu::ImageType::Color2DArray: case AmdGpu::ImageType::Color3D: - case AmdGpu::ImageType::Cube: return {ir.CompositeConstruct(body->Arg(0), body->Arg(1), body->Arg(2)), body->Arg(3)}; + case AmdGpu::ImageType::Cube: + return {PatchCubeCoord(ir, body->Arg(0), body->Arg(1), body->Arg(2)), body->Arg(3)}; default: UNREACHABLE(); } @@ -276,6 +291,7 @@ void ResourceTrackingPass(IR::Program& program) { // Most of the time it is float so that is the default. 
This pass detects float buffer loads // combined with bitcasts and patches them to be integer loads. for (IR::Block* const block : program.post_order_blocks) { + break; for (IR::Inst& inst : block->Instructions()) { if (inst.GetOpcode() != IR::Opcode::BitCastU32F32) { continue; diff --git a/src/shader_recompiler/ir/passes/ssa_rewrite_pass.cpp b/src/shader_recompiler/ir/passes/ssa_rewrite_pass.cpp index 2958d3d1d..9ee019537 100644 --- a/src/shader_recompiler/ir/passes/ssa_rewrite_pass.cpp +++ b/src/shader_recompiler/ir/passes/ssa_rewrite_pass.cpp @@ -32,6 +32,7 @@ struct SccFlagTag : FlagTag {}; struct ExecFlagTag : FlagTag {}; struct VccFlagTag : FlagTag {}; struct VccLoTag : FlagTag {}; +struct VccHiTag : FlagTag {}; struct GotoVariable : FlagTag { GotoVariable() = default; @@ -43,7 +44,7 @@ struct GotoVariable : FlagTag { }; using Variant = std::variant; + VccFlagTag, VccLoTag, VccHiTag>; using ValueMap = std::unordered_map; struct DefTable { @@ -89,6 +90,13 @@ struct DefTable { vcc_lo_flag.insert_or_assign(block, value); } + const IR::Value& Def(IR::Block* block, VccHiTag) { + return vcc_hi_flag[block]; + } + void SetDef(IR::Block* block, VccHiTag, const IR::Value& value) { + vcc_hi_flag.insert_or_assign(block, value); + } + const IR::Value& Def(IR::Block* block, VccFlagTag) { return vcc_flag[block]; } @@ -101,6 +109,7 @@ struct DefTable { ValueMap exec_flag; ValueMap vcc_flag; ValueMap vcc_lo_flag; + ValueMap vcc_hi_flag; }; IR::Opcode UndefOpcode(IR::ScalarReg) noexcept { @@ -111,6 +120,14 @@ IR::Opcode UndefOpcode(IR::VectorReg) noexcept { return IR::Opcode::UndefU32; } +IR::Opcode UndefOpcode(const VccLoTag&) noexcept { + return IR::Opcode::UndefU32; +} + +IR::Opcode UndefOpcode(const VccHiTag&) noexcept { + return IR::Opcode::UndefU32; +} + IR::Opcode UndefOpcode(const FlagTag&) noexcept { return IR::Opcode::UndefU1; } @@ -281,6 +298,7 @@ private: void VisitInst(Pass& pass, IR::Block* block, IR::Inst& inst) { const IR::Opcode opcode{inst.GetOpcode()}; 
switch (opcode) { + case IR::Opcode::SetThreadBitScalarReg: case IR::Opcode::SetScalarRegister: { const IR::ScalarReg reg{inst.Arg(0).ScalarReg()}; pass.WriteVariable(reg, block, inst.Arg(1)); @@ -306,6 +324,10 @@ void VisitInst(Pass& pass, IR::Block* block, IR::Inst& inst) { case IR::Opcode::SetVccLo: pass.WriteVariable(VccLoTag{}, block, inst.Arg(0)); break; + case IR::Opcode::SetVccHi: + pass.WriteVariable(VccHiTag{}, block, inst.Arg(0)); + break; + case IR::Opcode::GetThreadBitScalarReg: case IR::Opcode::GetScalarRegister: { const IR::ScalarReg reg{inst.Arg(0).ScalarReg()}; inst.ReplaceUsesWith(pass.ReadVariable(reg, block)); @@ -331,6 +353,9 @@ void VisitInst(Pass& pass, IR::Block* block, IR::Inst& inst) { case IR::Opcode::GetVccLo: inst.ReplaceUsesWith(pass.ReadVariable(VccLoTag{}, block)); break; + case IR::Opcode::GetVccHi: + inst.ReplaceUsesWith(pass.ReadVariable(VccHiTag{}, block)); + break; default: break; } diff --git a/src/shader_recompiler/ir/value.h b/src/shader_recompiler/ir/value.h index 8c97f4950..82d40a9bd 100644 --- a/src/shader_recompiler/ir/value.h +++ b/src/shader_recompiler/ir/value.h @@ -219,6 +219,7 @@ using U64 = TypedValue; using F16 = TypedValue; using F32 = TypedValue; using F64 = TypedValue; +using U1U32F32 = TypedValue; using U32F32 = TypedValue; using U32U64 = TypedValue; using F32F64 = TypedValue; diff --git a/src/shader_recompiler/recompiler.cpp b/src/shader_recompiler/recompiler.cpp index 0b9c20799..189a2ab12 100644 --- a/src/shader_recompiler/recompiler.cpp +++ b/src/shader_recompiler/recompiler.cpp @@ -61,7 +61,7 @@ IR::Program TranslateProgram(ObjectPool& inst_pool, ObjectPool +#include #include #include "common/assert.h" #include "common/types.h" @@ -81,7 +82,6 @@ struct Info { struct PsInput { u32 param_index; - u32 semantic; bool is_default; bool is_flat; u32 default_value; diff --git a/src/video_core/amdgpu/liverpool.cpp b/src/video_core/amdgpu/liverpool.cpp index 478bc726b..2d645c9d6 100644 --- 
a/src/video_core/amdgpu/liverpool.cpp +++ b/src/video_core/amdgpu/liverpool.cpp @@ -2,7 +2,6 @@ // SPDX-License-Identifier: GPL-2.0-or-later #include "common/assert.h" -#include "common/io_file.h" #include "common/thread.h" #include "video_core/amdgpu/liverpool.h" #include "video_core/amdgpu/pm4_cmds.h" diff --git a/src/video_core/amdgpu/liverpool.h b/src/video_core/amdgpu/liverpool.h index c52a0f97a..56f695ca9 100644 --- a/src/video_core/amdgpu/liverpool.h +++ b/src/video_core/amdgpu/liverpool.h @@ -374,10 +374,16 @@ struct Liverpool { FrontAndBack = 3, }; + enum class FrontFace : u32 { + CounterClockwise = 0, + Clockwise = 1, + }; + union PolygonControl { u32 raw; BitField<0, 1, u32> cull_front; BitField<1, 1, u32> cull_back; + BitField<2, 1, FrontFace> front_face; BitField<3, 2, u32> enable_polygon_mode; BitField<5, 3, PolygonMode> polygon_mode_front; BitField<8, 3, PolygonMode> polygon_mode_back; diff --git a/src/video_core/amdgpu/resource.h b/src/video_core/amdgpu/resource.h index cc7b9722a..ef78c2f41 100644 --- a/src/video_core/amdgpu/resource.h +++ b/src/video_core/amdgpu/resource.h @@ -110,11 +110,29 @@ struct Image { BitField<59, 1, u64> atc; BitField<60, 4, ImageType> type; }; + union { + BitField<0, 13, u64> depth; + BitField<13, 14, u64> pitch; + BitField<32, 13, u64> base_array; + BitField<45, 13, u64> last_array; + }; VAddr Address() const { return base_address << 8; } + u32 Pitch() const { + return pitch; + } + + u32 NumLayers() const { + return last_array - base_array + 1; + } + + u32 NumLevels() const { + return last_level + 1; + } + DataFormat GetDataFmt() const noexcept { return static_cast(data_format.Value()); } diff --git a/src/video_core/renderer_vulkan/liverpool_to_vk.cpp b/src/video_core/renderer_vulkan/liverpool_to_vk.cpp index f13d5f8cb..ccbb400df 100644 --- a/src/video_core/renderer_vulkan/liverpool_to_vk.cpp +++ b/src/video_core/renderer_vulkan/liverpool_to_vk.cpp @@ -287,7 +287,7 @@ vk::Format SurfaceFormat(AmdGpu::DataFormat 
data_format, AmdGpu::NumberFormat nu } if (data_format == AmdGpu::DataFormat::Format8_8_8_8 && num_format == AmdGpu::NumberFormat::Srgb) { - return vk::Format::eR8G8B8A8Srgb; + return vk::Format::eB8G8R8A8Srgb; } if (data_format == AmdGpu::DataFormat::Format32_32_32 && num_format == AmdGpu::NumberFormat::Float) { @@ -304,6 +304,9 @@ vk::Format SurfaceFormat(AmdGpu::DataFormat data_format, AmdGpu::NumberFormat nu if (data_format == AmdGpu::DataFormat::Format8 && num_format == AmdGpu::NumberFormat::Unorm) { return vk::Format::eR8Unorm; } + if (data_format == AmdGpu::DataFormat::FormatBc3 && num_format == AmdGpu::NumberFormat::Srgb) { + return vk::Format::eBc3SrgbBlock; + } UNREACHABLE(); } diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp index 655dc692c..8c78a8573 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp @@ -75,8 +75,10 @@ GraphicsPipeline::GraphicsPipeline(const Instance& instance_, Scheduler& schedul .depthClampEnable = false, .rasterizerDiscardEnable = false, .polygonMode = LiverpoolToVK::PolygonMode(key.polygon_mode), - .cullMode = LiverpoolToVK::CullMode(key.cull_mode), - .frontFace = vk::FrontFace::eClockwise, + .cullMode = vk::CullModeFlagBits::eNone, /*LiverpoolToVK::CullMode(key.cull_mode),*/ + .frontFace = key.front_face == Liverpool::FrontFace::Clockwise + ? 
vk::FrontFace::eClockwise + : vk::FrontFace::eCounterClockwise, .depthBiasEnable = false, .lineWidth = 1.0f, }; @@ -177,14 +179,23 @@ GraphicsPipeline::GraphicsPipeline(const Instance& instance_, Scheduler& schedul std::array attachments; for (u32 i = 0; i < num_color_formats; i++) { const auto& control = key.blend_controls[i]; + const auto src_color = LiverpoolToVK::BlendFactor(control.color_src_factor); + const auto dst_color = LiverpoolToVK::BlendFactor(control.color_dst_factor); + const auto color_blend = LiverpoolToVK::BlendOp(control.color_func); attachments[i] = vk::PipelineColorBlendAttachmentState{ .blendEnable = key.blend_controls[i].enable, - .srcColorBlendFactor = LiverpoolToVK::BlendFactor(control.color_src_factor), - .dstColorBlendFactor = LiverpoolToVK::BlendFactor(control.color_dst_factor), - .colorBlendOp = LiverpoolToVK::BlendOp(control.color_func), - .srcAlphaBlendFactor = LiverpoolToVK::BlendFactor(control.alpha_src_factor), - .dstAlphaBlendFactor = LiverpoolToVK::BlendFactor(control.color_dst_factor), - .alphaBlendOp = LiverpoolToVK::BlendOp(control.alpha_func), + .srcColorBlendFactor = src_color, + .dstColorBlendFactor = dst_color, + .colorBlendOp = color_blend, + .srcAlphaBlendFactor = control.separate_alpha_blend + ? LiverpoolToVK::BlendFactor(control.alpha_src_factor) + : src_color, + .dstAlphaBlendFactor = control.separate_alpha_blend + ? LiverpoolToVK::BlendFactor(control.alpha_dst_factor) + : dst_color, + .alphaBlendOp = control.separate_alpha_blend + ? LiverpoolToVK::BlendOp(control.alpha_func) + : color_blend, .colorWriteMask = instance.IsColorWriteEnableSupported() ? 
vk::ColorComponentFlagBits::eR | vk::ColorComponentFlagBits::eG | diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h index ddc67d8ee..02c1fb5af 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h @@ -38,6 +38,8 @@ struct GraphicsPipelineKey { Liverpool::PrimitiveType prim_type; Liverpool::PolygonMode polygon_mode; Liverpool::CullMode cull_mode; + Liverpool::FrontFace front_face; + u32 pad{}; std::array blend_controls; std::array write_masks; diff --git a/src/video_core/renderer_vulkan/vk_instance.cpp b/src/video_core/renderer_vulkan/vk_instance.cpp index 3cfe8c793..173537150 100644 --- a/src/video_core/renderer_vulkan/vk_instance.cpp +++ b/src/video_core/renderer_vulkan/vk_instance.cpp @@ -207,6 +207,7 @@ bool Instance::CreateDevice() { .shaderDrawParameters = true, }, vk::PhysicalDeviceVulkan12Features{ + .scalarBlockLayout = true, .timelineSemaphore = true, }, vk::PhysicalDeviceVulkan13Features{ diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 66ff94038..3a14a02e1 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -94,6 +94,7 @@ void PipelineCache::RefreshGraphicsKey() { key.prim_type = regs.primitive_type; key.polygon_mode = regs.polygon_control.PolyMode(); key.cull_mode = regs.polygon_control.CullingMode(); + key.front_face = regs.polygon_control.front_face; const auto& db = regs.depth_buffer; key.depth_format = key.depth.depth_enable @@ -163,10 +164,19 @@ std::unique_ptr PipelineCache::CreateGraphicsPipeline() { programs[i] = Shader::TranslateProgram(inst_pool, block_pool, code, std::move(info)); // Compile IR to SPIR-V - const auto spv_code = Shader::Backend::SPIRV::EmitSPIRV(profile, programs[i], binding); + auto spv_code = 
Shader::Backend::SPIRV::EmitSPIRV(profile, programs[i], binding); stages[i] = CompileSPV(spv_code, instance.GetDevice()); infos[i] = &programs[i].info; + // Set module name to hash in renderdoc + const auto name = fmt::format("{}_{:#x}", stage, hash); + const vk::DebugUtilsObjectNameInfoEXT name_info = { + .objectType = vk::ObjectType::eShaderModule, + .objectHandle = std::bit_cast(stages[i]), + .pObjectName = name.c_str(), + }; + instance.GetDevice().setDebugUtilsObjectNameEXT(name_info); + if (Config::dumpShaders()) { DumpShader(spv_code, hash, stage, "spv"); } diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 37d6f72b5..e3b38ca48 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -85,6 +85,7 @@ void Rasterizer::Draw(bool is_indexed) { } void Rasterizer::DispatchDirect() { + compute_done = true; return; const auto cmdbuf = scheduler.CommandBuffer(); const auto& cs_program = liverpool->regs.cs_program; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index a1b6a5a66..8f365f658 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -49,6 +49,7 @@ private: Core::MemoryManager* memory; PipelineCache pipeline_cache; StreamBuffer vertex_index_buffer; + bool compute_done{}; }; } // namespace Vulkan diff --git a/src/video_core/texture_cache/image.cpp b/src/video_core/texture_cache/image.cpp index cc29f010e..3334c4a71 100644 --- a/src/video_core/texture_cache/image.cpp +++ b/src/video_core/texture_cache/image.cpp @@ -39,7 +39,9 @@ using Libraries::VideoOut::TilingMode; if (false /*&& IsDepthStencilFormat(format)*/) { usage |= vk::ImageUsageFlagBits::eDepthStencilAttachment; } else { - usage |= vk::ImageUsageFlagBits::eColorAttachment; + if (format != vk::Format::eBc3SrgbBlock) { + usage |= 
vk::ImageUsageFlagBits::eColorAttachment; + } } return usage; } @@ -101,8 +103,10 @@ ImageInfo::ImageInfo(const AmdGpu::Image& image) noexcept { size.width = image.width + 1; size.height = image.height + 1; size.depth = 1; + pitch = image.Pitch(); + resources.levels = image.NumLevels(); + resources.layers = image.NumLayers(); // TODO: Derive this properly from tiling params - pitch = size.width; guest_size_bytes = size.width * size.height * 4; } @@ -183,7 +187,7 @@ void Image::Transit(vk::ImageLayout dst_layout, vk::Flags ds .subresourceRange{ .aspectMask = aspect_mask, .baseMipLevel = 0, - .levelCount = 1, + .levelCount = VK_REMAINING_MIP_LEVELS, .baseArrayLayer = 0, .layerCount = VK_REMAINING_ARRAY_LAYERS, }}; diff --git a/src/video_core/texture_cache/image_view.cpp b/src/video_core/texture_cache/image_view.cpp index 8e63e0398..353e4e7fa 100644 --- a/src/video_core/texture_cache/image_view.cpp +++ b/src/video_core/texture_cache/image_view.cpp @@ -14,8 +14,9 @@ vk::ImageViewType ConvertImageViewType(AmdGpu::ImageType type) { case AmdGpu::ImageType::Color1DArray: return vk::ImageViewType::e1DArray; case AmdGpu::ImageType::Color2D: - case AmdGpu::ImageType::Cube: return vk::ImageViewType::e2D; + case AmdGpu::ImageType::Cube: + return vk::ImageViewType::eCube; case AmdGpu::ImageType::Color2DArray: return vk::ImageViewType::e2DArray; case AmdGpu::ImageType::Color3D: @@ -47,10 +48,10 @@ vk::ComponentSwizzle ConvertComponentSwizzle(u32 dst_sel) { ImageViewInfo::ImageViewInfo(const AmdGpu::Image& image) noexcept { type = ConvertImageViewType(image.type); format = Vulkan::LiverpoolToVK::SurfaceFormat(image.GetDataFmt(), image.GetNumberFmt()); - range.base.level = image.base_level; + range.base.level = 0; range.base.layer = 0; - range.extent.levels = 1; - range.extent.layers = 1; + range.extent.levels = image.NumLevels(); + range.extent.layers = image.NumLayers(); mapping.r = ConvertComponentSwizzle(image.dst_sel_x); mapping.g = ConvertComponentSwizzle(image.dst_sel_y); 
mapping.b = ConvertComponentSwizzle(image.dst_sel_z); diff --git a/src/video_core/texture_cache/texture_cache.cpp b/src/video_core/texture_cache/texture_cache.cpp index be4bf907b..3e2a7deaf 100644 --- a/src/video_core/texture_cache/texture_cache.cpp +++ b/src/video_core/texture_cache/texture_cache.cpp @@ -175,48 +175,94 @@ void TextureCache::RefreshImage(Image& image) { // Mark image as validated. image.flags &= ~ImageFlagBits::CpuModified; - // Upload data to the staging buffer. - const auto [data, offset, _] = staging.Map(image.info.guest_size_bytes, 4); - const u8* image_data = reinterpret_cast(image.cpu_addr); - if (image.info.is_tiled) { - ConvertTileToLinear(data, image_data, image.info.size.width, image.info.size.height, - Config::isNeoMode()); - } else { - std::memcpy(data, image_data, image.info.guest_size_bytes); - } - staging.Commit(image.info.guest_size_bytes); + { - // Copy to the image. - const vk::BufferImageCopy image_copy = { - .bufferOffset = offset, - .bufferRowLength = 0, - .bufferImageHeight = 0, - .imageSubresource{ + // Upload data to the staging buffer. + const auto [data, offset, _] = staging.Map(image.info.guest_size_bytes, 4); + const u8* image_data = reinterpret_cast(image.cpu_addr); + if (image.info.is_tiled) { + ConvertTileToLinear(data, image_data, image.info.size.width, image.info.size.height, + Config::isNeoMode()); + } else { + std::memcpy(data, image_data, image.info.guest_size_bytes); + } + staging.Commit(image.info.guest_size_bytes); + + // Copy to the image. 
+ const vk::BufferImageCopy image_copy = { + .bufferOffset = offset, + .bufferRowLength = 0, + .bufferImageHeight = 0, + .imageSubresource{ + .aspectMask = vk::ImageAspectFlagBits::eColor, + .mipLevel = 0, + .baseArrayLayer = 0, + .layerCount = 1, + }, + .imageOffset = {0, 0, 0}, + .imageExtent = {image.info.size.width, image.info.size.height, 1}, + }; + + const auto cmdbuf = scheduler.CommandBuffer(); + const vk::ImageSubresourceRange range = { .aspectMask = vk::ImageAspectFlagBits::eColor, - .mipLevel = 0, + .baseMipLevel = 0, + .levelCount = 1, .baseArrayLayer = 0, - .layerCount = 1, - }, - .imageOffset = {0, 0, 0}, - .imageExtent = {image.info.size.width, image.info.size.height, 1}, - }; + .layerCount = VK_REMAINING_ARRAY_LAYERS, + }; - const auto cmdbuf = scheduler.CommandBuffer(); - const vk::ImageSubresourceRange range = { - .aspectMask = vk::ImageAspectFlagBits::eColor, - .baseMipLevel = 0, - .levelCount = 1, - .baseArrayLayer = 0, - .layerCount = VK_REMAINING_ARRAY_LAYERS, - }; + image.Transit(vk::ImageLayout::eTransferDstOptimal, vk::AccessFlagBits::eTransferWrite); - image.Transit(vk::ImageLayout::eTransferDstOptimal, vk::AccessFlagBits::eTransferWrite); + cmdbuf.copyBufferToImage(staging.Handle(), image.image, + vk::ImageLayout::eTransferDstOptimal, image_copy); - cmdbuf.copyBufferToImage(staging.Handle(), image.image, vk::ImageLayout::eTransferDstOptimal, - image_copy); + image.Transit(vk::ImageLayout::eGeneral, + vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eTransferRead); + return; + } - image.Transit(vk::ImageLayout::eGeneral, - vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eTransferRead); + const u8* image_data = reinterpret_cast(image.cpu_addr); + for (u32 l = 0; l < image.info.resources.layers; l++) { + // Upload data to the staging buffer. 
+ for (u32 m = 0; m < image.info.resources.levels; m++) { + const u32 width = image.info.size.width >> m; + const u32 height = image.info.size.height >> m; + const u32 map_size = width * height; + const auto [data, offset, _] = staging.Map(map_size, 16); + if (image.info.is_tiled) { + ConvertTileToLinear(data, image_data, width, height, Config::isNeoMode()); + } else { + std::memcpy(data, image_data, map_size); + } + staging.Commit(map_size); + image_data += map_size; + + // Copy to the image. + const vk::BufferImageCopy image_copy = { + .bufferOffset = offset, + .bufferRowLength = 0, + .bufferImageHeight = 0, + .imageSubresource{ + .aspectMask = vk::ImageAspectFlagBits::eColor, + .mipLevel = m, + .baseArrayLayer = l, + .layerCount = 1, + }, + .imageOffset = {0, 0, 0}, + .imageExtent = {width, height, 1}, + }; + + const auto cmdbuf = scheduler.CommandBuffer(); + image.Transit(vk::ImageLayout::eTransferDstOptimal, vk::AccessFlagBits::eTransferWrite); + + cmdbuf.copyBufferToImage(staging.Handle(), image.image, + vk::ImageLayout::eTransferDstOptimal, image_copy); + + image.Transit(vk::ImageLayout::eGeneral, + vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eTransferRead); + } + } } vk::Sampler TextureCache::GetSampler(const AmdGpu::Sampler& sampler) {