From 769eb6a9ee338e99f47987e4add5d72bdcb7a97f Mon Sep 17 00:00:00 2001 From: TheTurtle <47210458+raphaelthegreat@users.noreply.github.com> Date: Mon, 10 Jun 2024 22:35:14 +0300 Subject: [PATCH] video_core: Preliminary storage image support and more (#188) * vk_rasterizer: Clear depth buffer when DB_RENDER_CONTROL says so * video_core: Preliminary storage image support, more opcodes * renderer_vulkan: a fix for vertex buffers merging * renderer_vulkan: a heuristic for blend override when alpha out is masked --------- Co-authored-by: psucien --- CMakeLists.txt | 1 + .../backend/spirv/emit_spirv.cpp | 5 + .../spirv/emit_spirv_context_get_set.cpp | 21 +++- .../backend/spirv/emit_spirv_image.cpp | 14 ++- .../backend/spirv/emit_spirv_instructions.h | 9 +- .../backend/spirv/emit_spirv_warp.cpp | 21 ++++ .../backend/spirv/spirv_emit_context.cpp | 59 ++++------- .../backend/spirv/spirv_emit_context.h | 2 + .../frontend/translate/data_share.cpp | 13 +++ .../frontend/translate/scalar_alu.cpp | 21 +++- .../frontend/translate/translate.cpp | 98 +++++++++++++++++-- .../frontend/translate/translate.h | 10 +- .../frontend/translate/vector_alu.cpp | 32 +++++- .../frontend/translate/vector_memory.cpp | 42 ++++++++ src/shader_recompiler/ir/ir_emitter.cpp | 8 ++ src/shader_recompiler/ir/ir_emitter.h | 8 +- src/shader_recompiler/ir/microinstruction.cpp | 1 + src/shader_recompiler/ir/opcodes.inc | 4 + .../ir/passes/resource_tracking_pass.cpp | 49 +++++++--- .../ir/passes/shader_info_collection_pass.cpp | 3 + src/shader_recompiler/runtime_info.h | 2 + src/video_core/amdgpu/liverpool.h | 36 ++++++- .../renderer_vulkan/liverpool_to_vk.cpp | 13 +++ .../renderer_vulkan/vk_compute_pipeline.cpp | 5 +- .../renderer_vulkan/vk_graphics_pipeline.cpp | 34 ++++++- .../renderer_vulkan/vk_graphics_pipeline.h | 1 + .../renderer_vulkan/vk_pipeline_cache.cpp | 1 + .../renderer_vulkan/vk_rasterizer.cpp | 7 +- .../texture_cache/texture_cache.cpp | 8 +- src/video_core/texture_cache/texture_cache.h | 2 +- 30 files changed, 429 insertions(+), 101 deletions(-) create mode 100644 src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 71c32157..2e9b3a33 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -322,6 +322,7 @@ set(SHADER_RECOMPILER src/shader_recompiler/exception.h src/shader_recompiler/backend/spirv/emit_spirv_select.cpp src/shader_recompiler/backend/spirv/emit_spirv_special.cpp src/shader_recompiler/backend/spirv/emit_spirv_undefined.cpp + src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp src/shader_recompiler/backend/spirv/spirv_emit_context.cpp src/shader_recompiler/backend/spirv/spirv_emit_context.h src/shader_recompiler/frontend/translate/data_share.cpp diff --git a/src/shader_recompiler/backend/spirv/emit_spirv.cpp b/src/shader_recompiler/backend/spirv/emit_spirv.cpp index bd3f4f3f..1c7ed215 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv.cpp @@ -174,6 +174,7 @@ Id DefineMain(EmitContext& ctx, IR::Program& program) { void DefineEntryPoint(const IR::Program& program, EmitContext& ctx, Id main) { const std::span interfaces(ctx.interfaces.data(), ctx.interfaces.size()); spv::ExecutionModel execution_model{}; + ctx.AddCapability(spv::Capability::StorageImageWriteWithoutFormat); switch (program.info.stage) { case Stage::Compute: { const std::array workgroup_size{program.info.workgroup_size}; @@ -192,6 +193,10 @@ void DefineEntryPoint(const IR::Program& program, EmitContext& ctx, Id main) { } else { ctx.AddExecutionMode(main, spv::ExecutionMode::OriginUpperLeft); } + if (program.info.uses_group_quad) { + ctx.AddCapability(spv::Capability::GroupNonUniform); + ctx.AddCapability(spv::Capability::GroupNonUniformQuad); + } ctx.AddCapability(spv::Capability::DemoteToHelperInvocationEXT); // if (program.info.stores_frag_depth) { // ctx.AddExecutionMode(main, spv::ExecutionMode::DepthReplacing); diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp index d4c4afb3..99547ff4 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp @@ -21,11 +21,20 @@ Id OutputAttrPointer(EmitContext& ctx, IR::Attribute attr, u32 element) { case IR::Attribute::Position0: { return ctx.OpAccessChain(ctx.output_f32, ctx.output_position, ctx.ConstU32(element)); case IR::Attribute::RenderTarget0: - return ctx.OpAccessChain(ctx.output_f32, ctx.frag_color[0], ctx.ConstU32(element)); + case IR::Attribute::RenderTarget1: + case IR::Attribute::RenderTarget2: + case IR::Attribute::RenderTarget3: { + const u32 index = u32(attr) - u32(IR::Attribute::RenderTarget0); + if (ctx.frag_num_comp[index] > 1) { + return ctx.OpAccessChain(ctx.output_f32, ctx.frag_color[index], ctx.ConstU32(element)); + } else { + return ctx.frag_color[index]; + } } default: throw NotImplementedException("Read attribute {}", attr); } + } } } // Anonymous namespace @@ -152,7 +161,15 @@ Id EmitLoadBufferF32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) } Id EmitLoadBufferF32x3(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { - UNREACHABLE(); + const auto info = inst->Flags(); + const auto& buffer = ctx.buffers[handle]; + boost::container::static_vector ids; + for (u32 i = 0; i < 3; i++) { + const Id index{ctx.OpIAdd(ctx.U32[1], address, ctx.ConstU32(i))}; + const Id ptr{ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index)}; + ids.push_back(ctx.OpLoad(buffer.data_types->Get(1), ptr)); + } + return ctx.OpCompositeConstruct(buffer.data_types->Get(3), ids); } Id EmitLoadBufferF32x4(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address) { diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp index 7c21e6fc..f0d01727 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_image.cpp @@ -50,9 +50,11 @@ Id EmitImageGatherDref(EmitContext& ctx, IR::Inst* inst, const IR::Value& index, throw NotImplementedException("SPIR-V Instruction"); } -Id EmitImageFetch(EmitContext& ctx, IR::Inst* inst, const IR::Value& index, Id coords, Id offset, - Id lod, Id ms) { - throw NotImplementedException("SPIR-V Instruction"); +Id EmitImageFetch(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id offset, Id lod, + Id ms) { + const auto& texture = ctx.images[handle & 0xFFFF]; + const Id image = ctx.OpLoad(texture.image_type, texture.id); + return ctx.OpImageFetch(ctx.F32[4], image, coords, spv::ImageOperandsMask::Lod, lod); } Id EmitImageQueryDimensions(EmitContext& ctx, IR::Inst* inst, const IR::Value& index, Id lod, @@ -73,8 +75,10 @@ Id EmitImageRead(EmitContext& ctx, IR::Inst* inst, const IR::Value& index, Id co throw NotImplementedException("SPIR-V Instruction"); } -void EmitImageWrite(EmitContext& ctx, IR::Inst* inst, const IR::Value& index, Id coords, Id color) { - throw NotImplementedException("SPIR-V Instruction"); +void EmitImageWrite(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id color) { + const auto& texture = ctx.images[handle & 0xFFFF]; + const Id image = ctx.OpLoad(texture.image_type, texture.id); + ctx.OpImageWrite(image, ctx.OpBitcast(ctx.S32[2], coords), color); } } // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h index 33a44935..5aa1af55 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h +++ b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h @@ -344,14 +344,17 @@ Id EmitImageGather(EmitContext& ctx, IR::Inst* inst, const IR::Value& index, Id const IR::Value& offset, const IR::Value& offset2); Id EmitImageGatherDref(EmitContext& ctx, IR::Inst* inst, const IR::Value& index, Id coords, const IR::Value& offset, const IR::Value& offset2, Id dref); -Id EmitImageFetch(EmitContext& ctx, IR::Inst* inst, const IR::Value& index, Id coords, Id offset, - Id lod, Id ms); +Id EmitImageFetch(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id offset, Id lod, + Id ms); Id EmitImageQueryDimensions(EmitContext& ctx, IR::Inst* inst, const IR::Value& index, Id lod, const IR::Value& skip_mips); Id EmitImageQueryLod(EmitContext& ctx, IR::Inst* inst, const IR::Value& index, Id coords); Id EmitImageGradient(EmitContext& ctx, IR::Inst* inst, const IR::Value& index, Id coords, Id derivatives, const IR::Value& offset, Id lod_clamp); Id EmitImageRead(EmitContext& ctx, IR::Inst* inst, const IR::Value& index, Id coords); -void EmitImageWrite(EmitContext& ctx, IR::Inst* inst, const IR::Value& index, Id coords, Id color); +void EmitImageWrite(EmitContext& ctx, IR::Inst* inst, u32 handle, Id coords, Id color); + +Id EmitLaneId(EmitContext& ctx); +Id EmitQuadShuffle(EmitContext& ctx, Id value, Id index); } // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp new file mode 100644 index 00000000..a1751588 --- /dev/null +++ b/src/shader_recompiler/backend/spirv/emit_spirv_warp.cpp @@ -0,0 +1,21 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "shader_recompiler/backend/spirv/emit_spirv_instructions.h" +#include "shader_recompiler/backend/spirv/spirv_emit_context.h" + +namespace Shader::Backend::SPIRV { + +Id SubgroupScope(EmitContext& ctx) { + return ctx.ConstU32(static_cast(spv::Scope::Subgroup)); +} + +Id EmitLaneId(EmitContext& ctx) { + return ctx.OpLoad(ctx.U32[1], ctx.subgroup_local_invocation_id); +} + +Id EmitQuadShuffle(EmitContext& ctx, Id value, Id index) { + return ctx.OpGroupNonUniformQuadBroadcast(ctx.U32[1], SubgroupScope(ctx), value, index); +} + +} // namespace Shader::Backend::SPIRV diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp index 87da1a5b..39e552c3 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp @@ -178,6 +178,11 @@ void EmitContext::DefineInputs(const Info& info) { } break; case Stage::Fragment: + if (info.uses_group_quad) { + subgroup_local_invocation_id = DefineVariable( + U32[1], spv::BuiltIn::SubgroupLocalInvocationId, spv::StorageClass::Input); + Decorate(subgroup_local_invocation_id, spv::Decoration::Flat); + } frag_coord = DefineVariable(F32[4], spv::BuiltIn::FragCoord, spv::StorageClass::Input); front_facing = DefineVariable(U1[1], spv::BuiltIn::FrontFacing, spv::StorageClass::Input); for (const auto& input : info.ps_inputs) { @@ -231,7 +236,9 @@ void EmitContext::DefineOutputs(const Info& info) { if (!info.stores.GetAny(mrt)) { continue; } - frag_color[i] = DefineOutput(F32[4], i); + const u32 num_components = info.stores.NumComponents(mrt); + frag_color[i] = DefineOutput(F32[num_components], i); + frag_num_comp[i] = num_components; Name(frag_color[i], fmt::format("frag_color{}", i)); interfaces.push_back(frag_color[i]); } @@ -277,54 +284,22 @@ void EmitContext::DefineBuffers(const Info& info) { } } -Id ImageType(EmitContext& ctx, const ImageResource& desc) { - const spv::ImageFormat format{spv::ImageFormat::Unknown}; - const Id type{ctx.F32[1]}; - const bool depth{desc.is_depth}; - switch (desc.type) { - case AmdGpu::ImageType::Color1D: - return ctx.TypeImage(type, spv::Dim::Dim1D, depth, false, false, 1, format, - spv::AccessQualifier::ReadOnly); - case AmdGpu::ImageType::Color1DArray: - return ctx.TypeImage(type, spv::Dim::Dim1D, depth, true, false, 1, format, - spv::AccessQualifier::ReadOnly); - case AmdGpu::ImageType::Color2D: - case AmdGpu::ImageType::Color2DMsaa: - return ctx.TypeImage(type, spv::Dim::Dim2D, depth, false, - desc.type == AmdGpu::ImageType::Color2DMsaa, 1, format, - spv::AccessQualifier::ReadOnly); - case AmdGpu::ImageType::Color2DArray: - case AmdGpu::ImageType::Color2DMsaaArray: - return ctx.TypeImage(type, spv::Dim::Dim2D, depth, true, - desc.type == AmdGpu::ImageType::Color2DMsaaArray, 1, format, - spv::AccessQualifier::ReadOnly); - case AmdGpu::ImageType::Color3D: - return ctx.TypeImage(type, spv::Dim::Dim3D, depth, false, false, 1, format, - spv::AccessQualifier::ReadOnly); - case AmdGpu::ImageType::Cube: - return ctx.TypeImage(type, spv::Dim::Cube, depth, false, false, 1, format, - spv::AccessQualifier::ReadOnly); - case AmdGpu::ImageType::Buffer: - break; - } - throw InvalidArgument("Invalid texture type {}", desc.type); -} - Id ImageType(EmitContext& ctx, const ImageResource& desc, Id sampled_type) { - const auto format = spv::ImageFormat::Unknown; // Read this from tsharp? + const auto format = spv::ImageFormat::Unknown; + const u32 sampled = desc.is_storage ? 2 : 1; switch (desc.type) { case AmdGpu::ImageType::Color1D: - return ctx.TypeImage(sampled_type, spv::Dim::Dim1D, false, false, false, 1, format); + return ctx.TypeImage(sampled_type, spv::Dim::Dim1D, false, false, false, sampled, format); case AmdGpu::ImageType::Color1DArray: - return ctx.TypeImage(sampled_type, spv::Dim::Dim1D, false, true, false, 1, format); + return ctx.TypeImage(sampled_type, spv::Dim::Dim1D, false, true, false, sampled, format); case AmdGpu::ImageType::Color2D: - return ctx.TypeImage(sampled_type, spv::Dim::Dim2D, false, false, false, 1, format); + return ctx.TypeImage(sampled_type, spv::Dim::Dim2D, false, false, false, sampled, format); case AmdGpu::ImageType::Color2DArray: - return ctx.TypeImage(sampled_type, spv::Dim::Dim2D, false, true, false, 1, format); + return ctx.TypeImage(sampled_type, spv::Dim::Dim2D, false, true, false, sampled, format); case AmdGpu::ImageType::Color3D: - return ctx.TypeImage(sampled_type, spv::Dim::Dim3D, false, false, false, 1, format); + return ctx.TypeImage(sampled_type, spv::Dim::Dim3D, false, false, false, sampled, format); case AmdGpu::ImageType::Cube: - return ctx.TypeImage(sampled_type, spv::Dim::Cube, false, false, false, 1, format); + return ctx.TypeImage(sampled_type, spv::Dim::Cube, false, false, false, sampled, format); case AmdGpu::ImageType::Buffer: throw NotImplementedException("Image buffer"); default: @@ -345,7 +320,7 @@ void EmitContext::DefineImagesAndSamplers(const Info& info) { image_desc.dword_offset)); images.push_back({ .id = id, - .sampled_type = TypeSampledImage(image_type), + .sampled_type = image_desc.is_storage ? sampled_type : TypeSampledImage(image_type), .pointer_type = pointer_type, .image_type = image_type, }); diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.h b/src/shader_recompiler/backend/spirv/spirv_emit_context.h index 67eac930..c4bc722c 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.h +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.h @@ -158,9 +158,11 @@ public: Id frag_coord{}; Id front_facing{}; std::array frag_color{}; + std::array frag_num_comp{}; Id workgroup_id{}; Id local_invocation_id{}; + Id subgroup_local_invocation_id{}; struct TextureDefinition { Id id; diff --git a/src/shader_recompiler/frontend/translate/data_share.cpp b/src/shader_recompiler/frontend/translate/data_share.cpp index 9868ecd5..99883015 100644 --- a/src/shader_recompiler/frontend/translate/data_share.cpp +++ b/src/shader_recompiler/frontend/translate/data_share.cpp @@ -5,6 +5,19 @@ namespace Shader::Gcn { +void Translator::DS_SWIZZLE_B32(const GcnInst& inst) { + const u8 offset0 = inst.control.ds.offset0; + const u8 offset1 = inst.control.ds.offset1; + const IR::U32 src{GetSrc(inst.src[1])}; + ASSERT(offset1 & 0x80); + const IR::U32 lane_id = ir.LaneId(); + const IR::U32 id_in_group = ir.BitwiseAnd(lane_id, ir.Imm32(0b11)); + const IR::U32 base = ir.ShiftLeftLogical(id_in_group, ir.Imm32(1)); + const IR::U32 index = + ir.IAdd(lane_id, ir.BitFieldExtract(ir.Imm32(offset0), base, ir.Imm32(2))); + SetDst(inst.dst[0], ir.QuadShuffle(src, index)); +} + void Translator::DS_READ(int bit_size, bool is_signed, bool is_pair, const GcnInst& inst) { const IR::U32 addr{ir.GetVectorReg(IR::VectorReg(inst.src[0].code))}; const IR::VectorReg dst_reg{inst.dst[0].code}; diff --git a/src/shader_recompiler/frontend/translate/scalar_alu.cpp b/src/shader_recompiler/frontend/translate/scalar_alu.cpp index 8c4c90be..d3db3766 100644 --- a/src/shader_recompiler/frontend/translate/scalar_alu.cpp +++ b/src/shader_recompiler/frontend/translate/scalar_alu.cpp @@ -75,9 +75,17 @@ void Translator::S_AND_SAVEEXEC_B64(const GcnInst& inst) { // This instruction normally operates on 64-bit data (EXEC, VCC, SGPRs) // However here we flatten it to 1-bit EXEC and 1-bit VCC. For the destination // SGPR we have a special IR opcode for SPGRs that act as thread masks. - ASSERT(inst.src[0].field == OperandField::VccLo); const IR::U1 exec{ir.GetExec()}; - const IR::U1 vcc{ir.GetVcc()}; + const IR::U1 src = [&] { + switch (inst.src[0].field) { + case OperandField::VccLo: + return ir.GetVcc(); + case OperandField::ScalarGPR: + return ir.GetThreadBitScalarReg(IR::ScalarReg(inst.src[0].code)); + default: + UNREACHABLE(); + } + }(); // Mark destination SPGR as an EXEC context. This means we will use 1-bit // IR instruction whenever it's loaded. @@ -96,7 +104,7 @@ void Translator::S_AND_SAVEEXEC_B64(const GcnInst& inst) { } // Update EXEC. - ir.SetExec(ir.LogicalAnd(exec, vcc)); + ir.SetExec(ir.LogicalAnd(exec, src)); } void Translator::S_MOV_B64(const GcnInst& inst) { @@ -258,4 +266,11 @@ void Translator::S_LSHL_B32(const GcnInst& inst) { ir.SetScc(ir.INotEqual(result, ir.Imm32(0))); } +void Translator::S_BFM_B32(const GcnInst& inst) { + const IR::U32 src0{ir.BitwiseAnd(GetSrc(inst.src[0]), ir.Imm32(0x1F))}; + const IR::U32 src1{ir.BitwiseAnd(GetSrc(inst.src[1]), ir.Imm32(0x1F))}; + const IR::U32 mask{ir.ISub(ir.ShiftLeftLogical(ir.Imm32(1u), src0), ir.Imm32(1))}; + SetDst(inst.dst[0], ir.ShiftLeftLogical(mask, src1)); +} + } // namespace Shader::Gcn diff --git a/src/shader_recompiler/frontend/translate/translate.cpp b/src/shader_recompiler/frontend/translate/translate.cpp index 510b4b28..c0ddf4ae 100644 --- a/src/shader_recompiler/frontend/translate/translate.cpp +++ b/src/shader_recompiler/frontend/translate/translate.cpp @@ -306,6 +306,15 @@ void Translate(IR::Block* block, std::span inst_list, Info& info) case Opcode::IMAGE_SAMPLE: translator.IMAGE_SAMPLE(inst); break; + case Opcode::IMAGE_STORE: + translator.IMAGE_STORE(inst); + break; + case Opcode::IMAGE_LOAD_MIP: + translator.IMAGE_LOAD_MIP(inst); + break; + case Opcode::V_CMP_GE_I32: + translator.V_CMP_U32(ConditionOp::GE, true, false, inst); + break; case Opcode::V_CMP_EQ_I32: translator.V_CMP_U32(ConditionOp::EQ, true, false, inst); break; @@ -331,28 +340,31 @@ void Translate(IR::Block* block, std::span inst_list, Info& info) translator.V_CMP_U32(ConditionOp::TRU, false, false, inst); break; case Opcode::V_CMP_NEQ_F32: - translator.V_CMP_F32(ConditionOp::LG, inst); + translator.V_CMP_F32(ConditionOp::LG, false, inst); break; case Opcode::V_CMP_F_F32: - translator.V_CMP_F32(ConditionOp::F, inst); + translator.V_CMP_F32(ConditionOp::F, false, inst); break; case Opcode::V_CMP_LT_F32: - translator.V_CMP_F32(ConditionOp::LT, inst); + translator.V_CMP_F32(ConditionOp::LT, false, inst); break; case Opcode::V_CMP_EQ_F32: - translator.V_CMP_F32(ConditionOp::EQ, inst); + translator.V_CMP_F32(ConditionOp::EQ, false, inst); break; case Opcode::V_CMP_LE_F32: - translator.V_CMP_F32(ConditionOp::LE, inst); + translator.V_CMP_F32(ConditionOp::LE, false, inst); break; case Opcode::V_CMP_GT_F32: - translator.V_CMP_F32(ConditionOp::GT, inst); + translator.V_CMP_F32(ConditionOp::GT, false, inst); break; case Opcode::V_CMP_LG_F32: - translator.V_CMP_F32(ConditionOp::LG, inst); + translator.V_CMP_F32(ConditionOp::LG, false, inst); break; case Opcode::V_CMP_GE_F32: - translator.V_CMP_F32(ConditionOp::GE, inst); + translator.V_CMP_F32(ConditionOp::GE, false, inst); + break; + case Opcode::V_CMP_NLE_F32: + translator.V_CMP_F32(ConditionOp::GT, false, inst); break; case Opcode::S_CMP_LG_U32: translator.S_CMP(ConditionOp::LG, false, inst); @@ -378,6 +390,9 @@ void Translate(IR::Block* block, std::span inst_list, Info& info) case Opcode::V_CNDMASK_B32: translator.V_CNDMASK_B32(inst); break; + case Opcode::TBUFFER_LOAD_FORMAT_XYZ: + translator.BUFFER_LOAD_FORMAT(3, true, inst); + break; case Opcode::TBUFFER_LOAD_FORMAT_XYZW: translator.BUFFER_LOAD_FORMAT(4, true, inst); break; @@ -414,6 +429,9 @@ void Translate(IR::Block* block, std::span inst_list, Info& info) case Opcode::V_MIN_F32: translator.V_MIN_F32(inst); break; + case Opcode::V_MIN_I32: + translator.V_MIN_I32(inst); + break; case Opcode::V_MIN3_F32: translator.V_MIN3_F32(inst); break; @@ -435,6 +453,9 @@ void Translate(IR::Block* block, std::span inst_list, Info& info) case Opcode::V_CVT_U32_F32: translator.V_CVT_U32_F32(inst); break; + case Opcode::V_CVT_I32_F32: + translator.V_CVT_I32_F32(inst); + break; case Opcode::V_SUBREV_F32: translator.V_SUBREV_F32(inst); break; @@ -447,12 +468,61 @@ void Translate(IR::Block* block, std::span inst_list, Info& info) case Opcode::V_SUBREV_I32: translator.V_SUBREV_I32(inst); break; + + case Opcode::V_CMPX_F_F32: + translator.V_CMP_F32(ConditionOp::F, true, inst); + break; + case Opcode::V_CMPX_LT_F32: + translator.V_CMP_F32(ConditionOp::LT, true, inst); + break; + case Opcode::V_CMPX_EQ_F32: + translator.V_CMP_F32(ConditionOp::EQ, true, inst); + break; + case Opcode::V_CMPX_LE_F32: + translator.V_CMP_F32(ConditionOp::LE, true, inst); + break; + case Opcode::V_CMPX_GT_F32: + translator.V_CMP_F32(ConditionOp::GT, true, inst); + break; + case Opcode::V_CMPX_LG_F32: + translator.V_CMP_F32(ConditionOp::LG, true, inst); + break; + case Opcode::V_CMPX_GE_F32: + translator.V_CMP_F32(ConditionOp::GE, true, inst); + break; + case Opcode::V_CMPX_NGE_F32: + translator.V_CMP_F32(ConditionOp::LT, true, inst); + break; + case Opcode::V_CMPX_NLG_F32: + translator.V_CMP_F32(ConditionOp::EQ, true, inst); + break; + case Opcode::V_CMPX_NGT_F32: + translator.V_CMP_F32(ConditionOp::LE, true, inst); + break; + case Opcode::V_CMPX_NLE_F32: + translator.V_CMP_F32(ConditionOp::GT, true, inst); + break; + case Opcode::V_CMPX_NEQ_F32: + translator.V_CMP_F32(ConditionOp::LG, true, inst); + break; + case Opcode::V_CMPX_NLT_F32: + translator.V_CMP_F32(ConditionOp::GE, true, inst); + break; + case Opcode::V_CMPX_TRU_F32: + translator.V_CMP_F32(ConditionOp::TRU, true, inst); + break; case Opcode::V_CMP_LE_U32: translator.V_CMP_U32(ConditionOp::LE, false, false, inst); break; case Opcode::V_CMP_GT_I32: translator.V_CMP_U32(ConditionOp::GT, true, false, inst); break; + case Opcode::V_CMP_LT_I32: + translator.V_CMP_U32(ConditionOp::LT, true, false, inst); + break; + case Opcode::V_CMPX_LT_I32: + translator.V_CMP_U32(ConditionOp::LT, true, true, inst); + break; case Opcode::V_CMPX_F_U32: translator.V_CMP_U32(ConditionOp::F, false, true, inst); break; @@ -540,6 +610,18 @@ void Translate(IR::Block* block, std::span inst_list, Info& info) case Opcode::V_BCNT_U32_B32: translator.V_BCNT_U32_B32(inst); break; + case Opcode::V_MAX3_F32: + translator.V_MAX3_F32(inst); + break; + case Opcode::DS_SWIZZLE_B32: + translator.DS_SWIZZLE_B32(inst); + break; + case Opcode::V_MUL_LO_U32: + translator.V_MUL_LO_U32(inst); + break; + case Opcode::S_BFM_B32: + translator.S_BFM_B32(inst); + break; case Opcode::S_NOP: case Opcode::S_CBRANCH_EXECZ: case Opcode::S_CBRANCH_SCC0: diff --git a/src/shader_recompiler/frontend/translate/translate.h b/src/shader_recompiler/frontend/translate/translate.h index 870cb3aa..a8964fc9 100644 --- a/src/shader_recompiler/frontend/translate/translate.h +++ b/src/shader_recompiler/frontend/translate/translate.h @@ -49,6 +49,7 @@ public: void S_CSELECT_B64(const GcnInst& inst); void S_BFE_U32(const GcnInst& inst); void S_LSHL_B32(const GcnInst& inst); + void S_BFM_B32(const GcnInst& inst); // Scalar Memory void S_LOAD_DWORD(int num_dwords, const GcnInst& inst); @@ -75,7 +76,7 @@ public: void V_SUB_F32(const GcnInst& inst); void V_RCP_F32(const GcnInst& inst); void V_FMA_F32(const GcnInst& inst); - void V_CMP_F32(ConditionOp op, const GcnInst& inst); + void V_CMP_F32(ConditionOp op, bool set_exec, const GcnInst& inst); void V_MAX_F32(const GcnInst& inst); void V_RSQ_F32(const GcnInst& inst); void V_SIN_F32(const GcnInst& inst); @@ -106,6 +107,10 @@ public: void V_RNDNE_F32(const GcnInst& inst); void V_BCNT_U32_B32(const GcnInst& inst); void V_COS_F32(const GcnInst& inst); + void V_MAX3_F32(const GcnInst& inst); + void V_CVT_I32_F32(const GcnInst& inst); + void V_MIN_I32(const GcnInst& inst); + void V_MUL_LO_U32(const GcnInst& inst); // Vector Memory void BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, const GcnInst& inst); @@ -115,12 +120,15 @@ public: void V_INTERP_P2_F32(const GcnInst& inst); // Data share + void DS_SWIZZLE_B32(const GcnInst& inst); void DS_READ(int bit_size, bool is_signed, bool is_pair, const GcnInst& inst); void DS_WRITE(int bit_size, bool is_signed, bool is_pair, const GcnInst& inst); // MIMG void IMAGE_GET_RESINFO(const GcnInst& inst); void IMAGE_SAMPLE(const GcnInst& inst); + void IMAGE_STORE(const GcnInst& inst); + void IMAGE_LOAD_MIP(const GcnInst& inst); // Export void EXP(const GcnInst& inst); diff --git a/src/shader_recompiler/frontend/translate/vector_alu.cpp b/src/shader_recompiler/frontend/translate/vector_alu.cpp index 0a3ec92e..dbd9471f 100644 --- a/src/shader_recompiler/frontend/translate/vector_alu.cpp +++ b/src/shader_recompiler/frontend/translate/vector_alu.cpp @@ -20,7 +20,8 @@ void Translator::V_MAC_F32(const GcnInst& inst) { void Translator::V_CVT_PKRTZ_F16_F32(const GcnInst& inst) { const IR::VectorReg dst_reg{inst.dst[0].code}; - const IR::Value vec_f32 = ir.CompositeConstruct(GetSrc(inst.src[0]), GetSrc(inst.src[1])); + const IR::Value vec_f32 = + ir.CompositeConstruct(GetSrc(inst.src[0], true), GetSrc(inst.src[1], true)); ir.SetVectorReg(dst_reg, ir.PackHalf2x16(vec_f32)); } @@ -143,7 +144,7 @@ void Translator::V_FMA_F32(const GcnInst& inst) { SetDst(inst.dst[0], ir.FPFma(src0, src1, src2)); } -void Translator::V_CMP_F32(ConditionOp op, const GcnInst& inst) { +void Translator::V_CMP_F32(ConditionOp op, bool set_exec, const GcnInst& inst) { const IR::F32 src0{GetSrc(inst.src[0], true)}; const IR::F32 src1{GetSrc(inst.src[1], true)}; const IR::U1 result = [&] { @@ -166,6 +167,9 @@ void Translator::V_CMP_F32(ConditionOp op, const GcnInst& inst) { UNREACHABLE(); } }(); + if (set_exec) { + ir.SetExec(result); + } switch (inst.dst[1].field) { case OperandField::VccLo: @@ -382,4 +386,28 @@ void Translator::V_COS_F32(const GcnInst& inst) { SetDst(inst.dst[0], ir.FPCos(src0)); } +void Translator::V_MAX3_F32(const GcnInst& inst) { + const IR::F32 src0{GetSrc(inst.src[0], true)}; + const IR::F32 src1{GetSrc(inst.src[1], true)}; + const IR::F32 src2{GetSrc(inst.src[2], true)}; + SetDst(inst.dst[0], ir.FPMax(src0, ir.FPMax(src1, src2))); +} + +void Translator::V_CVT_I32_F32(const GcnInst& inst) { + const IR::F32 src0{GetSrc(inst.src[0], true)}; + SetDst(inst.dst[0], ir.ConvertFToS(32, src0)); +} + +void Translator::V_MIN_I32(const GcnInst& inst) { + const IR::U32 src0{GetSrc(inst.src[0])}; + const IR::U32 src1{GetSrc(inst.src[1])}; + SetDst(inst.dst[0], ir.SMin(src0, src1)); +} + +void Translator::V_MUL_LO_U32(const GcnInst& inst) { + const IR::U32 src0{GetSrc(inst.src[0])}; + const IR::U32 src1{GetSrc(inst.src[1])}; + SetDst(inst.dst[0], ir.IMul(src0, src1)); +} + } // namespace Shader::Gcn diff --git a/src/shader_recompiler/frontend/translate/vector_memory.cpp b/src/shader_recompiler/frontend/translate/vector_memory.cpp index 9694b06c..aad9ba84 100644 --- a/src/shader_recompiler/frontend/translate/vector_memory.cpp +++ b/src/shader_recompiler/frontend/translate/vector_memory.cpp @@ -107,6 +107,48 @@ void Translator::IMAGE_SAMPLE(const GcnInst& inst) { } } +void Translator::IMAGE_LOAD_MIP(const GcnInst& inst) { + const auto& mimg = inst.control.mimg; + IR::VectorReg addr_reg{inst.src[0].code}; + IR::VectorReg dest_reg{inst.dst[0].code}; + const IR::ScalarReg tsharp_reg{inst.src[2].code * 4}; + + const IR::Value handle = ir.GetScalarReg(tsharp_reg); + const IR::Value body = + ir.CompositeConstruct(ir.GetVectorReg(addr_reg), ir.GetVectorReg(addr_reg + 1), + ir.GetVectorReg(addr_reg + 2), ir.GetVectorReg(addr_reg + 3)); + + IR::TextureInstInfo info{}; + info.explicit_lod.Assign(1); + const IR::Value texel = ir.ImageFetch(handle, body, {}, {}, {}, info); + + for (u32 i = 0; i < 4; i++) { + if (((mimg.dmask >> i) & 1) == 0) { + continue; + } + IR::F32 value = IR::F32{ir.CompositeExtract(texel, i)}; + ir.SetVectorReg(dest_reg++, value); + } +} + +void Translator::IMAGE_STORE(const GcnInst& inst) { + const auto& mimg = inst.control.mimg; + IR::VectorReg addr_reg{inst.src[0].code}; + IR::VectorReg data_reg{inst.dst[0].code}; + const IR::ScalarReg tsharp_reg{inst.src[2].code * 4}; + + const IR::Value handle = ir.GetScalarReg(tsharp_reg); + const IR::Value body = + ir.CompositeConstruct(ir.GetVectorReg(addr_reg), ir.GetVectorReg(addr_reg + 1), + ir.GetVectorReg(addr_reg + 2), ir.GetVectorReg(addr_reg + 3)); + + ASSERT(mimg.dmask == 0xF); + const IR::Value value = ir.CompositeConstruct( + ir.GetVectorReg(data_reg), ir.GetVectorReg(data_reg + 1), + ir.GetVectorReg(data_reg + 2), ir.GetVectorReg(data_reg + 3)); + ir.ImageWrite(handle, body, value, {}); +} + void Translator::BUFFER_LOAD_FORMAT(u32 num_dwords, bool is_typed, const GcnInst& inst) { const auto& mtbuf = inst.control.mtbuf; const IR::VectorReg vaddr{inst.src[0].code}; diff --git a/src/shader_recompiler/ir/ir_emitter.cpp b/src/shader_recompiler/ir/ir_emitter.cpp index cf57939d..276269af 100644 --- a/src/shader_recompiler/ir/ir_emitter.cpp +++ b/src/shader_recompiler/ir/ir_emitter.cpp @@ -318,6 +318,14 @@ void IREmitter::StoreBuffer(int num_dwords, const Value& handle, const Value& ad } } +U32 IREmitter::LaneId() { + return Inst(Opcode::LaneId); +} + +U32 IREmitter::QuadShuffle(const U32& value, const U32& index) { + return Inst(Opcode::QuadShuffle, value, index); +} + F32F64 IREmitter::FPAdd(const F32F64& a, const F32F64& b) { if (a.Type() != b.Type()) { throw InvalidArgument("Mismatching types {} and {}", a.Type(), b.Type()); diff --git a/src/shader_recompiler/ir/ir_emitter.h b/src/shader_recompiler/ir/ir_emitter.h index 707c127e..3e951f82 100644 --- a/src/shader_recompiler/ir/ir_emitter.h +++ b/src/shader_recompiler/ir/ir_emitter.h @@ -85,12 +85,8 @@ public: void StoreBuffer(int num_dwords, const Value& handle, const Value& address, const Value& data, BufferInstInfo info); - [[nodiscard]] U1 GetZeroFromOp(const Value& op); - [[nodiscard]] U1 GetSignFromOp(const Value& op); - [[nodiscard]] U1 GetCarryFromOp(const Value& op); - [[nodiscard]] U1 GetOverflowFromOp(const Value& op); - [[nodiscard]] U1 GetSparseFromOp(const Value& op); - [[nodiscard]] U1 GetInBoundsFromOp(const Value& op); + [[nodiscard]] U32 LaneId(); + [[nodiscard]] U32 QuadShuffle(const U32& value, const U32& index); [[nodiscard]] Value CompositeConstruct(const Value& e1, const Value& e2); [[nodiscard]] Value CompositeConstruct(const Value& e1, const Value& e2, const Value& e3); diff --git a/src/shader_recompiler/ir/microinstruction.cpp b/src/shader_recompiler/ir/microinstruction.cpp index fdbda06f..da4e2e75 100644 --- a/src/shader_recompiler/ir/microinstruction.cpp +++ b/src/shader_recompiler/ir/microinstruction.cpp @@ -52,6 +52,7 @@ bool Inst::MayHaveSideEffects() const noexcept { case Opcode::StoreBufferF32x3: case Opcode::StoreBufferF32x4: case Opcode::StoreBufferU32: + case Opcode::ImageWrite: return true; default: return false; diff --git a/src/shader_recompiler/ir/opcodes.inc b/src/shader_recompiler/ir/opcodes.inc index bd506f44..71933096 100644 --- a/src/shader_recompiler/ir/opcodes.inc +++ b/src/shader_recompiler/ir/opcodes.inc @@ -269,3 +269,7 @@ OPCODE(ImageQueryLod, F32x4, Opaq OPCODE(ImageGradient, F32x4, Opaque, Opaque, Opaque, Opaque, Opaque, ) OPCODE(ImageRead, U32x4, Opaque, Opaque, ) OPCODE(ImageWrite, Void, Opaque, Opaque, U32x4, ) + +// Warp operations +OPCODE(LaneId, U32, ) +OPCODE(QuadShuffle, U32, U32, U32 ) diff --git a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp index 36e816fb..e35a6b08 100644 --- a/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp +++ b/src/shader_recompiler/ir/passes/resource_tracking_pass.cpp @@ -93,6 +93,16 @@ bool IsImageInstruction(const IR::Inst& inst) { } } +bool IsImageStorageInstruction(const IR::Inst& inst) { + switch (inst.GetOpcode()) { + case IR::Opcode::ImageWrite: + case IR::Opcode::ImageRead: + return true; + default: + return false; + } +} + class Descriptors { public: explicit Descriptors(BufferResourceList& buffer_resources_, ImageResourceList& image_resources_, @@ -241,32 +251,42 @@ IR::Value PatchCubeCoord(IR::IREmitter& ir, const IR::Value& s, const IR::Value& void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descriptors& descriptors) { IR::Inst* producer = inst.Arg(0).InstRecursive(); - ASSERT(producer->GetOpcode() == IR::Opcode::CompositeConstructU32x2); + ASSERT(producer->GetOpcode() == IR::Opcode::CompositeConstructU32x2 || + producer->GetOpcode() == IR::Opcode::GetUserData); + const auto [tsharp_handle, ssharp_handle] = [&] -> std::pair { + if (producer->GetOpcode() == IR::Opcode::CompositeConstructU32x2) { + return std::make_pair(producer->Arg(0).InstRecursive(), + producer->Arg(1).InstRecursive()); + } + return std::make_pair(producer, nullptr); + }(); // Read image sharp. - const auto tsharp = TrackSharp(producer->Arg(0).InstRecursive()); + const auto tsharp = TrackSharp(tsharp_handle); const auto image = info.ReadUd(tsharp.sgpr_base, tsharp.dword_offset); const auto inst_info = inst.Flags(); - const u32 image_binding = descriptors.Add(ImageResource{ + u32 image_binding = descriptors.Add(ImageResource{ .sgpr_base = tsharp.sgpr_base, .dword_offset = tsharp.dword_offset, .type = image.type, .nfmt = static_cast(image.num_format.Value()), - .is_storage = false, + .is_storage = IsImageStorageInstruction(inst), .is_depth = bool(inst_info.is_depth), }); - // Read sampler sharp. - const auto ssharp = TrackSharp(producer->Arg(1).InstRecursive()); - const u32 sampler_binding = descriptors.Add(SamplerResource{ - .sgpr_base = ssharp.sgpr_base, - .dword_offset = ssharp.dword_offset, - }); + // Read sampler sharp. This doesn't exist for IMAGE_LOAD/IMAGE_STORE instructions + if (ssharp_handle) { + const auto ssharp = TrackSharp(ssharp_handle); + const u32 sampler_binding = descriptors.Add(SamplerResource{ + .sgpr_base = ssharp.sgpr_base, + .dword_offset = ssharp.dword_offset, + }); + image_binding |= (sampler_binding << 16); + } // Patch image handle - const u32 handle = image_binding | (sampler_binding << 16); IR::IREmitter ir{block, IR::Block::InstructionList::s_iterator_to(inst)}; - inst.SetArg(0, ir.Imm32(handle)); + inst.SetArg(0, ir.Imm32(image_binding)); // Now that we know the image type, adjust texture coordinate vector. const IR::Inst* body = inst.Arg(1).InstRecursive(); @@ -283,7 +303,7 @@ void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descrip case AmdGpu::ImageType::Cube: return {PatchCubeCoord(ir, body->Arg(0), body->Arg(1), body->Arg(2)), body->Arg(3)}; default: - UNREACHABLE(); + UNREACHABLE_MSG("Unknown image type {}", image.type.Value()); } }(); inst.SetArg(1, coords); @@ -293,6 +313,9 @@ void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descrip const u32 arg_pos = inst_info.is_depth ? 5 : 4; inst.SetArg(arg_pos, arg); } + if (inst_info.explicit_lod && inst.GetOpcode() == IR::Opcode::ImageFetch) { + inst.SetArg(3, arg); + } } void ResourceTrackingPass(IR::Program& program) { diff --git a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp index 99aedbc4..ac1cb060 100644 --- a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp +++ b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp @@ -16,6 +16,9 @@ void Visit(Info& info, IR::Inst& inst) { info.stores.Set(inst.Arg(0).Attribute(), inst.Arg(2).U32()); break; } + case IR::Opcode::QuadShuffle: + info.uses_group_quad = true; + break; default: break; } diff --git a/src/shader_recompiler/runtime_info.h b/src/shader_recompiler/runtime_info.h index 672a1f9d..c7318460 100644 --- a/src/shader_recompiler/runtime_info.h +++ b/src/shader_recompiler/runtime_info.h @@ -126,6 +126,8 @@ struct Info { std::span user_data; Stage stage; + bool uses_group_quad{}; + template T ReadUd(u32 ptr_index, u32 dword_offset) const noexcept { T data; diff --git a/src/video_core/amdgpu/liverpool.h b/src/video_core/amdgpu/liverpool.h index e4d22cdc..22b59f74 100644 --- a/src/video_core/amdgpu/liverpool.h +++ b/src/video_core/amdgpu/liverpool.h @@ -420,6 +420,13 @@ struct Liverpool { }; union ColorBufferMask { + enum ColorComponent : u32 { + ComponentR = (1u << 0), + ComponentG = (1u << 1), + ComponentB = (1u << 2), + ComponentA = (1u << 3), + }; + u32 raw; BitField<0, 4, u32> output0_mask; BitField<4, 4, u32> output1_mask; @@ -430,8 +437,8 @@ struct Liverpool { BitField<24, 4, u32> output6_mask; BitField<28, 4, u32> output7_mask; - [[nodiscard]] u8 GetMask(int buf_id) const { - return (raw >> (buf_id * 4)) & 0xffu; + u32 GetMask(int buf_id) const { + return (raw >> (buf_id * 4)) & 0xfu; } }; @@ -732,6 +739,20 @@ struct Liverpool { float back_offset; }; + struct Address { + u32 address; + + VAddr GetAddress() const { + return u64(address) << 8; + } + }; + + union DepthRenderControl { + u32 raw; + BitField<0, 1, u32> depth_clear_enable; + BitField<1, 1, u32> stencil_clear_enable; + }; + union Regs { struct { INSERT_PADDING_WORDS(0x2C08); @@ -740,11 +761,15 @@ struct Liverpool { ShaderProgram vs_program; INSERT_PADDING_WORDS(0x2E00 - 0x2C4C - 16); ComputeProgram cs_program; - INSERT_PADDING_WORDS(0xA008 - 0x2E00 - 80); + INSERT_PADDING_WORDS(0xA008 - 0x2E00 - 80 - 3 - 5); + DepthRenderControl depth_render_control; + INSERT_PADDING_WORDS(4); + Address depth_htile_data_base; + INSERT_PADDING_WORDS(2); float depth_bounds_min; float depth_bounds_max; u32 stencil_clear; - u32 depth_clear; + float depth_clear; Scissor screen_scissor; INSERT_PADDING_WORDS(0xA010 - 0xA00C - 2); DepthBuffer depth_buffer; @@ -925,6 +950,8 @@ static_assert(GFX6_3D_REG_INDEX(cs_program) == 0x2E00); static_assert(GFX6_3D_REG_INDEX(cs_program.dim_z) == 0x2E03); static_assert(GFX6_3D_REG_INDEX(cs_program.address_lo) == 0x2E0C); static_assert(GFX6_3D_REG_INDEX(cs_program.user_data) == 0x2E40); +static_assert(GFX6_3D_REG_INDEX(depth_render_control) == 0xA000); +static_assert(GFX6_3D_REG_INDEX(depth_htile_data_base) == 0xA005); static_assert(GFX6_3D_REG_INDEX(screen_scissor) == 0xA00C); static_assert(GFX6_3D_REG_INDEX(depth_buffer.depth_slice) == 0xA017); static_assert(GFX6_3D_REG_INDEX(color_target_mask) == 0xA08E); @@ -942,6 +969,7 @@ static_assert(GFX6_3D_REG_INDEX(color_export_format) == 0xA1C5); static_assert(GFX6_3D_REG_INDEX(blend_control) == 0xA1E0); static_assert(GFX6_3D_REG_INDEX(index_base_address) == 0xA1F9); static_assert(GFX6_3D_REG_INDEX(draw_initiator) == 0xA1FC); +static_assert(GFX6_3D_REG_INDEX(depth_control) == 0xA200); static_assert(GFX6_3D_REG_INDEX(clipper_control) == 0xA204); static_assert(GFX6_3D_REG_INDEX(viewport_control) == 0xA206); static_assert(GFX6_3D_REG_INDEX(vs_output_control) == 0xA207); diff --git a/src/video_core/renderer_vulkan/liverpool_to_vk.cpp b/src/video_core/renderer_vulkan/liverpool_to_vk.cpp index 000063d5..8ca82f82 100644 --- a/src/video_core/renderer_vulkan/liverpool_to_vk.cpp +++ b/src/video_core/renderer_vulkan/liverpool_to_vk.cpp @@ -334,6 +334,19 @@ vk::Format SurfaceFormat(AmdGpu::DataFormat data_format, AmdGpu::NumberFormat nu if (data_format == AmdGpu::DataFormat::Format32 && num_format == AmdGpu::NumberFormat::Float) { return vk::Format::eR32Sfloat; } + if (data_format == AmdGpu::DataFormat::Format16_16_16_16 && + num_format == AmdGpu::NumberFormat::Float) { + return vk::Format::eR16G16B16A16Sfloat; + } + if (data_format == AmdGpu::DataFormat::Format32 && num_format == AmdGpu::NumberFormat::Uint) { + return vk::Format::eR32Uint; + } + if (data_format == AmdGpu::DataFormat::Format32 && num_format == AmdGpu::NumberFormat::Sint) { + return vk::Format::eR32Sint; + } + if (data_format == AmdGpu::DataFormat::Format8_8 && num_format == AmdGpu::NumberFormat::Unorm) { + return vk::Format::eR8G8Unorm; + } UNREACHABLE_MSG("Unknown data_format={} and num_format={}", u32(data_format), u32(num_format)); } diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp index 6e81a7c9..c851aa9d 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp @@ -111,14 +111,15 @@ void ComputePipeline::BindResources(Core::MemoryManager* memory, StreamBuffer& s for (const auto& image : info.images) { const auto tsharp = info.ReadUd(image.sgpr_base, image.dword_offset); - const auto& image_view = texture_cache.FindImageView(tsharp); + const auto& image_view = texture_cache.FindImageView(tsharp, image.is_storage); image_infos.emplace_back(VK_NULL_HANDLE, *image_view.image_view, vk::ImageLayout::eGeneral); set_writes.push_back({ .dstSet = VK_NULL_HANDLE, .dstBinding = binding++, .dstArrayElement = 0, .descriptorCount = 1, - .descriptorType = vk::DescriptorType::eSampledImage, + .descriptorType = image.is_storage ? vk::DescriptorType::eStorageImage + : vk::DescriptorType::eSampledImage, .pImageInfo = &image_infos.back(), }); } diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp index fe464f0f..3f2195d7 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp @@ -196,7 +196,7 @@ GraphicsPipeline::GraphicsPipeline(const Instance& instance_, Scheduler& schedul const auto dst_color = LiverpoolToVK::BlendFactor(control.color_dst_factor); const auto color_blend = LiverpoolToVK::BlendOp(control.color_func); attachments[i] = vk::PipelineColorBlendAttachmentState{ - .blendEnable = key.blend_controls[i].enable, + .blendEnable = control.enable, .srcColorBlendFactor = src_color, .dstColorBlendFactor = dst_color, .colorBlendOp = color_blend, @@ -215,6 +215,29 @@ GraphicsPipeline::GraphicsPipeline(const Instance& instance_, Scheduler& schedul vk::ColorComponentFlagBits::eB | vk::ColorComponentFlagBits::eA : key.write_masks[i], }; + + // On GCN GPU there is an additional mask which allows to control color components exported + // from a pixel shader. A situation possible, when the game may mask out the alpha channel, + // while it is still need to be used in blending ops. For such cases, HW will default alpha + // to 1 and perform the blending, while shader normally outputs 0 in the last component. + // Unfortunatelly, Vulkan doesn't provide any control on blend inputs, so below we detecting + // such cases and override alpha value in order to emulate HW behaviour. + const auto has_alpha_masked_out = + (key.cb_shader_mask.GetMask(i) & Liverpool::ColorBufferMask::ComponentA) == 0; + const auto has_src_alpha_in_src_blend = src_color == vk::BlendFactor::eSrcAlpha || + src_color == vk::BlendFactor::eOneMinusSrcAlpha; + const auto has_src_alpha_in_dst_blend = dst_color == vk::BlendFactor::eSrcAlpha || + dst_color == vk::BlendFactor::eOneMinusSrcAlpha; + if (has_alpha_masked_out && has_src_alpha_in_src_blend) { + attachments[i].srcColorBlendFactor = src_color == vk::BlendFactor::eSrcAlpha + ? vk::BlendFactor::eOne + : vk::BlendFactor::eZero; // 1-A + } + if (has_alpha_masked_out && has_src_alpha_in_dst_blend) { + attachments[i].dstColorBlendFactor = dst_color == vk::BlendFactor::eSrcAlpha + ? vk::BlendFactor::eOne + : vk::BlendFactor::eZero; // 1-A + } } const vk::PipelineColorBlendStateCreateInfo color_blending = { @@ -318,7 +341,7 @@ void GraphicsPipeline::BindResources(Core::MemoryManager* memory, StreamBuffer& for (const auto& image : stage.images) { const auto tsharp = stage.ReadUd(image.sgpr_base, image.dword_offset); - const auto& image_view = texture_cache.FindImageView(tsharp); + const auto& image_view = texture_cache.FindImageView(tsharp, image.is_storage); image_infos.emplace_back(VK_NULL_HANDLE, *image_view.image_view, vk::ImageLayout::eShaderReadOnlyOptimal); set_writes.push_back({ @@ -326,7 +349,8 @@ void GraphicsPipeline::BindResources(Core::MemoryManager* memory, StreamBuffer& .dstBinding = binding++, .dstArrayElement = 0, .descriptorCount = 1, - .descriptorType = vk::DescriptorType::eSampledImage, + .descriptorType = image.is_storage ? vk::DescriptorType::eStorageImage + : vk::DescriptorType::eSampledImage, .pImageInfo = &image_infos.back(), }); } @@ -387,11 +411,11 @@ void GraphicsPipeline::BindVertexBuffers(StreamBuffer& staging) const { boost::container::static_vector ranges_merged{ranges[0]}; for (auto range : ranges) { - auto& prev_range = ranges.back(); + auto& prev_range = ranges_merged.back(); if (prev_range.end_address < range.base_address) { ranges_merged.emplace_back(range); } else { - ranges_merged.back().end_address = std::max(prev_range.end_address, range.end_address); + prev_range.end_address = std::max(prev_range.end_address, range.end_address); } } diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h index ff512406..5db40524 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h @@ -46,6 +46,7 @@ struct GraphicsPipelineKey { Liverpool::CullMode cull_mode; Liverpool::FrontFace front_face; Liverpool::ClipSpace clip_space; + Liverpool::ColorBufferMask cb_shader_mask{}; std::array blend_controls; std::array write_masks; diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 441d0b78..141ac635 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -132,6 +132,7 @@ void PipelineCache::RefreshGraphicsKey() { key.blend_controls[remapped_cb].enable.Assign(key.blend_controls[remapped_cb].enable && !col_buf.info.blend_bypass); key.write_masks[remapped_cb] = vk::ColorComponentFlags{regs.color_target_mask.GetMask(cb)}; + key.cb_shader_mask = regs.color_shader_mask; ++remapped_cb; } diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index c1340d08..291d38fd 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -60,13 +60,16 @@ void Rasterizer::Draw(bool is_indexed, u32 index_offset) { }); } if (regs.depth_control.depth_enable && regs.depth_buffer.Address() != 0) { + const bool is_clear = regs.depth_render_control.depth_clear_enable; const auto& image_view = texture_cache.DepthTarget(regs.depth_buffer, liverpool->last_db_extent); depth_attachment = { .imageView = *image_view.image_view, .imageLayout = vk::ImageLayout::eGeneral, - .loadOp = vk::AttachmentLoadOp::eLoad, - .storeOp = vk::AttachmentStoreOp::eStore, + .loadOp = is_clear ? vk::AttachmentLoadOp::eClear : vk::AttachmentLoadOp::eLoad, + .storeOp = is_clear ? vk::AttachmentStoreOp::eNone : vk::AttachmentStoreOp::eStore, + .clearValue = vk::ClearValue{.depthStencil = {.depth = regs.depth_clear, + .stencil = regs.stencil_clear}}, }; num_depth_attachments++; } diff --git a/src/video_core/texture_cache/texture_cache.cpp b/src/video_core/texture_cache/texture_cache.cpp index fd6767b7..fca79f49 100644 --- a/src/video_core/texture_cache/texture_cache.cpp +++ b/src/video_core/texture_cache/texture_cache.cpp @@ -160,10 +160,10 @@ ImageView& TextureCache::RegisterImageView(Image& image, const ImageViewInfo& vi return slot_image_views[view_id]; } -ImageView& TextureCache::FindImageView(const AmdGpu::Image& desc) { +ImageView& TextureCache::FindImageView(const AmdGpu::Image& desc, bool is_storage) { Image& image = FindImage(ImageInfo{desc}, desc.Address()); - if (image.info.is_storage) { + if (is_storage) { image.Transit(vk::ImageLayout::eGeneral, vk::AccessFlagBits::eShaderWrite); } else { image.Transit(vk::ImageLayout::eShaderReadOnlyOptimal, vk::AccessFlagBits::eShaderRead); @@ -194,6 +194,10 @@ ImageView& TextureCache::DepthTarget(const AmdGpu::Liverpool::DepthBuffer& buffe auto& image = FindImage(info, buffer.Address(), false); image.flags &= ~ImageFlagBits::CpuModified; + image.Transit(vk::ImageLayout::eDepthStencilAttachmentOptimal, + vk::AccessFlagBits::eDepthStencilAttachmentWrite | + vk::AccessFlagBits::eDepthStencilAttachmentRead); + ImageViewInfo view_info; view_info.format = info.pixel_format; return RegisterImageView(image, view_info); diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index d37acd21..1722fc20 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -41,7 +41,7 @@ public: bool refresh_on_create = true); /// Retrieves an image view with the properties of the specified image descriptor. - [[nodiscard]] ImageView& FindImageView(const AmdGpu::Image& image); + [[nodiscard]] ImageView& FindImageView(const AmdGpu::Image& image, bool is_storage); /// Retrieves the render target with specified properties [[nodiscard]] ImageView& RenderTarget(const AmdGpu::Liverpool::ColorBuffer& buffer,