diff --git a/CMakeLists.txt b/CMakeLists.txt index 1e54f7a0..78d8421a 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -664,6 +664,7 @@ set(SHADER_RECOMPILER src/shader_recompiler/exception.h src/shader_recompiler/ir/passes/constant_propagation_pass.cpp src/shader_recompiler/ir/passes/dead_code_elimination_pass.cpp src/shader_recompiler/ir/passes/flatten_extended_userdata_pass.cpp + src/shader_recompiler/ir/passes/hull_shader_transform.cpp src/shader_recompiler/ir/passes/identity_removal_pass.cpp src/shader_recompiler/ir/passes/ir_passes.h src/shader_recompiler/ir/passes/lower_shared_mem_to_registers.cpp @@ -683,6 +684,8 @@ set(SHADER_RECOMPILER src/shader_recompiler/exception.h src/shader_recompiler/ir/opcodes.cpp src/shader_recompiler/ir/opcodes.h src/shader_recompiler/ir/opcodes.inc + src/shader_recompiler/ir/patch.cpp + src/shader_recompiler/ir/patch.h src/shader_recompiler/ir/post_order.cpp src/shader_recompiler/ir/post_order.h src/shader_recompiler/ir/program.cpp diff --git a/src/core/debug_state.cpp b/src/core/debug_state.cpp index 64962492..c68fd469 100644 --- a/src/core/debug_state.cpp +++ b/src/core/debug_state.cpp @@ -177,10 +177,11 @@ void DebugStateImpl::PushRegsDump(uintptr_t base_addr, uintptr_t header_addr, } } -void DebugStateImpl::CollectShader(const std::string& name, vk::ShaderModule module, - std::span spv, std::span raw_code, - std::span patch_spv, bool is_patched) { - shader_dump_list.emplace_back(name, module, std::vector{spv.begin(), spv.end()}, +void DebugStateImpl::CollectShader(const std::string& name, Shader::LogicalStage l_stage, + vk::ShaderModule module, std::span spv, + std::span raw_code, std::span patch_spv, + bool is_patched) { + shader_dump_list.emplace_back(name, l_stage, module, std::vector{spv.begin(), spv.end()}, std::vector{raw_code.begin(), raw_code.end()}, std::vector{patch_spv.begin(), patch_spv.end()}, is_patched); } diff --git a/src/core/debug_state.h b/src/core/debug_state.h index fa2e5cd9..0db5bc46 100644 --- 
a/src/core/debug_state.h +++ b/src/core/debug_state.h @@ -76,6 +76,7 @@ struct FrameDump { struct ShaderDump { std::string name; + Shader::LogicalStage l_stage; vk::ShaderModule module; std::vector spv; @@ -90,16 +91,17 @@ struct ShaderDump { std::string cache_isa_disasm{}; std::string cache_patch_disasm{}; - ShaderDump(std::string name, vk::ShaderModule module, std::vector spv, - std::vector isa, std::vector patch_spv, bool is_patched) - : name(std::move(name)), module(module), spv(std::move(spv)), isa(std::move(isa)), - patch_spv(std::move(patch_spv)), is_patched(is_patched) {} + ShaderDump(std::string name, Shader::LogicalStage l_stage, vk::ShaderModule module, + std::vector spv, std::vector isa, std::vector patch_spv, + bool is_patched) + : name(std::move(name)), l_stage(l_stage), module(module), spv(std::move(spv)), + isa(std::move(isa)), patch_spv(std::move(patch_spv)), is_patched(is_patched) {} ShaderDump(const ShaderDump& other) = delete; ShaderDump(ShaderDump&& other) noexcept - : name{std::move(other.name)}, module{std::move(other.module)}, spv{std::move(other.spv)}, - isa{std::move(other.isa)}, patch_spv{std::move(other.patch_spv)}, - patch_source{std::move(other.patch_source)}, + : name{std::move(other.name)}, l_stage(other.l_stage), module{std::move(other.module)}, + spv{std::move(other.spv)}, isa{std::move(other.isa)}, + patch_spv{std::move(other.patch_spv)}, patch_source{std::move(other.patch_source)}, cache_spv_disasm{std::move(other.cache_spv_disasm)}, cache_isa_disasm{std::move(other.cache_isa_disasm)}, cache_patch_disasm{std::move(other.cache_patch_disasm)} {} @@ -108,6 +110,7 @@ struct ShaderDump { if (this == &other) return *this; name = std::move(other.name); + l_stage = other.l_stage; module = std::move(other.module); spv = std::move(other.spv); isa = std::move(other.isa); @@ -203,7 +206,8 @@ public: void PushRegsDump(uintptr_t base_addr, uintptr_t header_addr, const AmdGpu::Liverpool::Regs& regs, bool is_compute = false); - void 
CollectShader(const std::string& name, vk::ShaderModule module, std::span spv, + void CollectShader(const std::string& name, Shader::LogicalStage l_stage, + vk::ShaderModule module, std::span spv, std::span raw_code, std::span patch_spv, bool is_patched); }; diff --git a/src/core/devtools/widget/shader_list.cpp b/src/core/devtools/widget/shader_list.cpp index 80c93971..2c97db7f 100644 --- a/src/core/devtools/widget/shader_list.cpp +++ b/src/core/devtools/widget/shader_list.cpp @@ -158,16 +158,17 @@ bool ShaderList::Selection::DrawShader(DebugStateType::ShaderDump& value) { DebugState.ShowDebugMessage(msg); } if (compile) { - static std::map stage_arg = { - {"vs", "vert"}, - {"gs", "geom"}, - {"fs", "frag"}, - {"cs", "comp"}, + static std::map stage_arg = { + {Shader::LogicalStage::Vertex, "vert"}, + {Shader::LogicalStage::TessellationControl, "tesc"}, + {Shader::LogicalStage::TessellationEval, "tese"}, + {Shader::LogicalStage::Geometry, "geom"}, + {Shader::LogicalStage::Fragment, "frag"}, + {Shader::LogicalStage::Compute, "comp"}, }; - auto stage = stage_arg.find(value.name.substr(0, 2)); + auto stage = stage_arg.find(value.l_stage); if (stage == stage_arg.end()) { - DebugState.ShowDebugMessage(std::string{"Invalid shader stage: "} + - value.name.substr(0, 2)); + DebugState.ShowDebugMessage(std::string{"Invalid shader stage"}); } else { std::string cmd = fmt::format("glslc --target-env=vulkan1.3 --target-spv=spv1.6 " diff --git a/src/core/libraries/gnmdriver/gnmdriver.cpp b/src/core/libraries/gnmdriver/gnmdriver.cpp index dbf085fb..e85b8b89 100644 --- a/src/core/libraries/gnmdriver/gnmdriver.cpp +++ b/src/core/libraries/gnmdriver/gnmdriver.cpp @@ -1642,7 +1642,6 @@ s32 PS4_SYSV_ABI sceGnmSetGsShader(u32* cmdbuf, u32 size, const u32* gs_regs) { s32 PS4_SYSV_ABI sceGnmSetHsShader(u32* cmdbuf, u32 size, const u32* hs_regs, u32 param4) { LOG_TRACE(Lib_GnmDriver, "called"); - if (!cmdbuf || size < 0x1E) { return -1; } @@ -1660,11 +1659,13 @@ s32 PS4_SYSV_ABI 
sceGnmSetHsShader(u32* cmdbuf, u32 size, const u32* hs_regs, u3 cmdbuf = PM4CmdSetData::SetShReg(cmdbuf, 0x108u, hs_regs[0], 0u); // SPI_SHADER_PGM_LO_HS cmdbuf = PM4CmdSetData::SetShReg(cmdbuf, 0x10au, hs_regs[2], hs_regs[3]); // SPI_SHADER_PGM_RSRC1_HS/SPI_SHADER_PGM_RSRC2_HS - cmdbuf = PM4CmdSetData::SetContextReg(cmdbuf, 0x286u, hs_regs[5], - hs_regs[5]); // VGT_HOS_MAX_TESS_LEVEL + cmdbuf = PM4CmdSetData::SetContextReg(cmdbuf, 0x286u, + hs_regs[5], // VGT_HOS_MAX_TESS_LEVEL + hs_regs[6]); // VGT_HOS_MIN_TESS_LEVEL cmdbuf = PM4CmdSetData::SetContextReg(cmdbuf, 0x2dbu, hs_regs[4]); // VGT_TF_PARAM cmdbuf = PM4CmdSetData::SetContextReg(cmdbuf, 0x2d6u, param4); // VGT_LS_HS_CONFIG + // right padding? WriteTrailingNop<11>(cmdbuf); return ORBIS_OK; } diff --git a/src/shader_recompiler/backend/spirv/emit_spirv.cpp b/src/shader_recompiler/backend/spirv/emit_spirv.cpp index ab9d6afa..e545e8e3 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv.cpp @@ -1,6 +1,5 @@ // SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later - #include #include #include @@ -13,6 +12,7 @@ #include "shader_recompiler/frontend/translate/translate.h" #include "shader_recompiler/ir/basic_block.h" #include "shader_recompiler/ir/program.h" +#include "shader_recompiler/runtime_info.h" #include "video_core/amdgpu/types.h" namespace Shader::Backend::SPIRV { @@ -72,7 +72,10 @@ ArgType Arg(EmitContext& ctx, const IR::Value& arg) { return arg.VectorReg(); } else if constexpr (std::is_same_v) { return arg.StringLiteral(); + } else if constexpr (std::is_same_v) { + return arg.Patch(); } + UNREACHABLE(); } template @@ -206,6 +209,32 @@ Id DefineMain(EmitContext& ctx, const IR::Program& program) { return main; } +spv::ExecutionMode ExecutionMode(AmdGpu::TessellationType primitive) { + switch (primitive) { + case AmdGpu::TessellationType::Isoline: + return spv::ExecutionMode::Isolines; + 
case AmdGpu::TessellationType::Triangle: + return spv::ExecutionMode::Triangles; + case AmdGpu::TessellationType::Quad: + return spv::ExecutionMode::Quads; + } + UNREACHABLE_MSG("Tessellation primitive {}", primitive); +} + +spv::ExecutionMode ExecutionMode(AmdGpu::TessellationPartitioning spacing) { + switch (spacing) { + case AmdGpu::TessellationPartitioning::Integer: + return spv::ExecutionMode::SpacingEqual; + case AmdGpu::TessellationPartitioning::FracOdd: + return spv::ExecutionMode::SpacingFractionalOdd; + case AmdGpu::TessellationPartitioning::FracEven: + return spv::ExecutionMode::SpacingFractionalEven; + default: + break; + } + UNREACHABLE_MSG("Tessellation spacing {}", spacing); +} + void SetupCapabilities(const Info& info, const Profile& profile, EmitContext& ctx) { ctx.AddCapability(spv::Capability::Image1D); ctx.AddCapability(spv::Capability::Sampled1D); @@ -248,36 +277,55 @@ void SetupCapabilities(const Info& info, const Profile& profile, EmitContext& ct if (info.uses_group_ballot) { ctx.AddCapability(spv::Capability::GroupNonUniformBallot); } - if (info.stage == Stage::Export || info.stage == Stage::Vertex) { + const auto stage = info.l_stage; + if (stage == LogicalStage::Vertex) { ctx.AddExtension("SPV_KHR_shader_draw_parameters"); ctx.AddCapability(spv::Capability::DrawParameters); } - if (info.stage == Stage::Geometry) { + if (stage == LogicalStage::Geometry) { ctx.AddCapability(spv::Capability::Geometry); } if (info.stage == Stage::Fragment && profile.needs_manual_interpolation) { ctx.AddExtension("SPV_KHR_fragment_shader_barycentric"); ctx.AddCapability(spv::Capability::FragmentBarycentricKHR); } + if (stage == LogicalStage::TessellationControl || stage == LogicalStage::TessellationEval) { + ctx.AddCapability(spv::Capability::Tessellation); + } } -void DefineEntryPoint(const IR::Program& program, EmitContext& ctx, Id main) { - const auto& info = program.info; +void DefineEntryPoint(const Info& info, EmitContext& ctx, Id main) { const std::span 
interfaces(ctx.interfaces.data(), ctx.interfaces.size()); spv::ExecutionModel execution_model{}; - switch (program.info.stage) { - case Stage::Compute: { + switch (info.l_stage) { + case LogicalStage::Compute: { const std::array workgroup_size{ctx.runtime_info.cs_info.workgroup_size}; execution_model = spv::ExecutionModel::GLCompute; ctx.AddExecutionMode(main, spv::ExecutionMode::LocalSize, workgroup_size[0], workgroup_size[1], workgroup_size[2]); break; } - case Stage::Export: - case Stage::Vertex: + case LogicalStage::Vertex: execution_model = spv::ExecutionModel::Vertex; break; - case Stage::Fragment: + case LogicalStage::TessellationControl: + execution_model = spv::ExecutionModel::TessellationControl; + ctx.AddCapability(spv::Capability::Tessellation); + ctx.AddExecutionMode(main, spv::ExecutionMode::OutputVertices, + ctx.runtime_info.hs_info.NumOutputControlPoints()); + break; + case LogicalStage::TessellationEval: { + execution_model = spv::ExecutionModel::TessellationEvaluation; + const auto& vs_info = ctx.runtime_info.vs_info; + ctx.AddExecutionMode(main, ExecutionMode(vs_info.tess_type)); + ctx.AddExecutionMode(main, ExecutionMode(vs_info.tess_partitioning)); + ctx.AddExecutionMode(main, + vs_info.tess_topology == AmdGpu::TessellationTopology::TriangleCcw + ? 
spv::ExecutionMode::VertexOrderCcw + : spv::ExecutionMode::VertexOrderCw); + break; + } + case LogicalStage::Fragment: execution_model = spv::ExecutionModel::Fragment; if (ctx.profile.lower_left_origin_mode) { ctx.AddExecutionMode(main, spv::ExecutionMode::OriginLowerLeft); @@ -292,7 +340,7 @@ void DefineEntryPoint(const IR::Program& program, EmitContext& ctx, Id main) { ctx.AddExecutionMode(main, spv::ExecutionMode::DepthReplacing); } break; - case Stage::Geometry: + case LogicalStage::Geometry: execution_model = spv::ExecutionModel::Geometry; ctx.AddExecutionMode(main, GetInputPrimitiveType(ctx.runtime_info.gs_info.in_primitive)); ctx.AddExecutionMode(main, @@ -303,7 +351,7 @@ void DefineEntryPoint(const IR::Program& program, EmitContext& ctx, Id main) { ctx.runtime_info.gs_info.num_invocations); break; default: - throw NotImplementedException("Stage {}", u32(program.info.stage)); + UNREACHABLE_MSG("Stage {}", u32(info.stage)); } ctx.AddEntryPoint(execution_model, main, "main", interfaces); } @@ -349,7 +397,7 @@ std::vector EmitSPIRV(const Profile& profile, const RuntimeInfo& runtime_in const IR::Program& program, Bindings& binding) { EmitContext ctx{profile, runtime_info, program.info, binding}; const Id main{DefineMain(ctx, program)}; - DefineEntryPoint(program, ctx, main); + DefineEntryPoint(program.info, ctx, main); SetupCapabilities(program.info, profile, ctx); SetupFloatMode(ctx, profile, runtime_info, main); PatchPhiNodes(program, ctx); diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_barriers.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_barriers.cpp index 22b3523a..611225e8 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_barriers.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_barriers.cpp @@ -18,9 +18,16 @@ void MemoryBarrier(EmitContext& ctx, spv::Scope scope) { void EmitBarrier(EmitContext& ctx) { const auto execution{spv::Scope::Workgroup}; - const auto memory{spv::Scope::Workgroup}; - const auto 
memory_semantics{spv::MemorySemanticsMask::AcquireRelease | - spv::MemorySemanticsMask::WorkgroupMemory}; + spv::Scope memory; + spv::MemorySemanticsMask memory_semantics; + if (ctx.l_stage == Shader::LogicalStage::TessellationControl) { + memory = spv::Scope::Invocation; + memory_semantics = spv::MemorySemanticsMask::MaskNone; + } else { + memory = spv::Scope::Workgroup; + memory_semantics = + spv::MemorySemanticsMask::AcquireRelease | spv::MemorySemanticsMask::WorkgroupMemory; + } ctx.OpControlBarrier(ctx.ConstU32(static_cast(execution)), ctx.ConstU32(static_cast(memory)), ctx.ConstU32(static_cast(memory_semantics))); diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp index d005169c..f3db6af5 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp @@ -4,6 +4,9 @@ #include "common/assert.h" #include "shader_recompiler/backend/spirv/emit_spirv_instructions.h" #include "shader_recompiler/backend/spirv/spirv_emit_context.h" +#include "shader_recompiler/ir/attribute.h" +#include "shader_recompiler/ir/patch.h" +#include "shader_recompiler/runtime_info.h" #include @@ -45,13 +48,19 @@ Id VsOutputAttrPointer(EmitContext& ctx, VsOutput output) { Id OutputAttrPointer(EmitContext& ctx, IR::Attribute attr, u32 element) { if (IR::IsParam(attr)) { - const u32 index{u32(attr) - u32(IR::Attribute::Param0)}; - const auto& info{ctx.output_params.at(index)}; - ASSERT(info.num_components > 0); - if (info.num_components == 1) { - return info.id; + const u32 attr_index{u32(attr) - u32(IR::Attribute::Param0)}; + if (ctx.stage == Stage::Local && ctx.runtime_info.ls_info.links_with_tcs) { + const auto component_ptr = ctx.TypePointer(spv::StorageClass::Output, ctx.F32[1]); + return ctx.OpAccessChain(component_ptr, ctx.output_attr_array, ctx.ConstU32(attr_index), + ctx.ConstU32(element)); } else { - 
return ctx.OpAccessChain(info.pointer_type, info.id, ctx.ConstU32(element)); + const auto& info{ctx.output_params.at(attr_index)}; + ASSERT(info.num_components > 0); + if (info.num_components == 1) { + return info.id; + } else { + return ctx.OpAccessChain(info.pointer_type, info.id, ctx.ConstU32(element)); + } } } if (IR::IsMrt(attr)) { @@ -82,9 +91,13 @@ Id OutputAttrPointer(EmitContext& ctx, IR::Attribute attr, u32 element) { std::pair OutputAttrComponentType(EmitContext& ctx, IR::Attribute attr) { if (IR::IsParam(attr)) { - const u32 index{u32(attr) - u32(IR::Attribute::Param0)}; - const auto& info{ctx.output_params.at(index)}; - return {info.component_type, info.is_integer}; + if (ctx.stage == Stage::Local && ctx.runtime_info.ls_info.links_with_tcs) { + return {ctx.F32[1], false}; + } else { + const u32 index{u32(attr) - u32(IR::Attribute::Param0)}; + const auto& info{ctx.output_params.at(index)}; + return {info.component_type, info.is_integer}; + } } if (IR::IsMrt(attr)) { const u32 index{u32(attr) - u32(IR::Attribute::RenderTarget0)}; @@ -171,12 +184,11 @@ Id EmitReadStepRate(EmitContext& ctx, int rate_idx) { rate_idx == 0 ? 
ctx.u32_zero_value : ctx.u32_one_value)); } -Id EmitGetAttributeForGeometry(EmitContext& ctx, IR::Attribute attr, u32 comp, u32 index) { +Id EmitGetAttributeForGeometry(EmitContext& ctx, IR::Attribute attr, u32 comp, Id index) { if (IR::IsPosition(attr)) { ASSERT(attr == IR::Attribute::Position0); const auto position_arr_ptr = ctx.TypePointer(spv::StorageClass::Input, ctx.F32[4]); - const auto pointer{ - ctx.OpAccessChain(position_arr_ptr, ctx.gl_in, ctx.ConstU32(index), ctx.ConstU32(0u))}; + const auto pointer{ctx.OpAccessChain(position_arr_ptr, ctx.gl_in, index, ctx.ConstU32(0u))}; const auto position_comp_ptr = ctx.TypePointer(spv::StorageClass::Input, ctx.F32[1]); return ctx.OpLoad(ctx.F32[1], ctx.OpAccessChain(position_comp_ptr, pointer, ctx.ConstU32(comp))); @@ -186,7 +198,7 @@ Id EmitGetAttributeForGeometry(EmitContext& ctx, IR::Attribute attr, u32 comp, u const u32 param_id{u32(attr) - u32(IR::Attribute::Param0)}; const auto param = ctx.input_params.at(param_id).id; const auto param_arr_ptr = ctx.TypePointer(spv::StorageClass::Input, ctx.F32[4]); - const auto pointer{ctx.OpAccessChain(param_arr_ptr, param, ctx.ConstU32(index))}; + const auto pointer{ctx.OpAccessChain(param_arr_ptr, param, index)}; const auto position_comp_ptr = ctx.TypePointer(spv::StorageClass::Input, ctx.F32[1]); return ctx.OpLoad(ctx.F32[1], ctx.OpAccessChain(position_comp_ptr, pointer, ctx.ConstU32(comp))); @@ -194,9 +206,27 @@ Id EmitGetAttributeForGeometry(EmitContext& ctx, IR::Attribute attr, u32 comp, u UNREACHABLE(); } -Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, u32 comp, u32 index) { - if (ctx.info.stage == Stage::Geometry) { +Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, u32 comp, Id index) { + if (ctx.info.l_stage == LogicalStage::Geometry) { return EmitGetAttributeForGeometry(ctx, attr, comp, index); + } else if (ctx.info.l_stage == LogicalStage::TessellationControl || + ctx.info.l_stage == LogicalStage::TessellationEval) { + if 
(IR::IsTessCoord(attr)) { + const u32 component = attr == IR::Attribute::TessellationEvaluationPointU ? 0 : 1; + const auto component_ptr = ctx.TypePointer(spv::StorageClass::Input, ctx.F32[1]); + const auto pointer{ + ctx.OpAccessChain(component_ptr, ctx.tess_coord, ctx.ConstU32(component))}; + return ctx.OpLoad(ctx.F32[1], pointer); + } else if (IR::IsParam(attr)) { + const u32 param_id{u32(attr) - u32(IR::Attribute::Param0)}; + const auto param = ctx.input_params.at(param_id).id; + const auto param_arr_ptr = ctx.TypePointer(spv::StorageClass::Input, ctx.F32[4]); + const auto pointer{ctx.OpAccessChain(param_arr_ptr, param, index)}; + const auto position_comp_ptr = ctx.TypePointer(spv::StorageClass::Input, ctx.F32[1]); + return ctx.OpLoad(ctx.F32[1], + ctx.OpAccessChain(position_comp_ptr, pointer, ctx.ConstU32(comp))); + } + UNREACHABLE(); } if (IR::IsParam(attr)) { @@ -242,8 +272,14 @@ Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, u32 comp, u32 index) { } return coord; } + case IR::Attribute::TessellationEvaluationPointU: + return ctx.OpLoad(ctx.F32[1], + ctx.OpAccessChain(ctx.input_f32, ctx.tess_coord, ctx.u32_zero_value)); + case IR::Attribute::TessellationEvaluationPointV: + return ctx.OpLoad(ctx.F32[1], + ctx.OpAccessChain(ctx.input_f32, ctx.tess_coord, ctx.ConstU32(1U))); default: - throw NotImplementedException("Read attribute {}", attr); + UNREACHABLE_MSG("Read attribute {}", attr); } } @@ -266,10 +302,32 @@ Id EmitGetAttributeU32(EmitContext& ctx, IR::Attribute attr, u32 comp) { return ctx.OpSelect(ctx.U32[1], ctx.OpLoad(ctx.U1[1], ctx.front_facing), ctx.u32_one_value, ctx.u32_zero_value); case IR::Attribute::PrimitiveId: - ASSERT(ctx.info.stage == Stage::Geometry); return ctx.OpLoad(ctx.U32[1], ctx.primitive_id); + case IR::Attribute::InvocationId: + ASSERT(ctx.info.l_stage == LogicalStage::Geometry || + ctx.info.l_stage == LogicalStage::TessellationControl); + return ctx.OpLoad(ctx.U32[1], ctx.invocation_id); + case 
IR::Attribute::PatchVertices: + ASSERT(ctx.info.l_stage == LogicalStage::TessellationControl); + return ctx.OpLoad(ctx.U32[1], ctx.patch_vertices); + case IR::Attribute::PackedHullInvocationInfo: { + ASSERT(ctx.info.l_stage == LogicalStage::TessellationControl); + // [0:8]: patch id within VGT + // [8:12]: output control point id + // But 0:8 should be treated as 0 for attribute addressing purposes + if (ctx.runtime_info.hs_info.IsPassthrough()) { + // Gcn shader would run with 1 thread, but we need to run a thread for + // each output control point. + // If Gcn shader uses this value, we should make sure all threads in the + // Vulkan shader use 0 + return ctx.ConstU32(0u); + } else { + const Id invocation_id = ctx.OpLoad(ctx.U32[1], ctx.invocation_id); + return ctx.OpShiftLeftLogical(ctx.U32[1], invocation_id, ctx.ConstU32(8u)); + } + } default: - throw NotImplementedException("Read U32 attribute {}", attr); + UNREACHABLE_MSG("Read U32 attribute {}", attr); } } @@ -287,6 +345,58 @@ void EmitSetAttribute(EmitContext& ctx, IR::Attribute attr, Id value, u32 elemen } } +Id EmitGetTessGenericAttribute(EmitContext& ctx, Id vertex_index, Id attr_index, Id comp_index) { + const auto attr_comp_ptr = ctx.TypePointer(spv::StorageClass::Input, ctx.F32[1]); + return ctx.OpLoad(ctx.F32[1], ctx.OpAccessChain(attr_comp_ptr, ctx.input_attr_array, + vertex_index, attr_index, comp_index)); +} + +void EmitSetTcsGenericAttribute(EmitContext& ctx, Id value, Id attr_index, Id comp_index) { + // Implied vertex index is invocation_id + const auto component_ptr = ctx.TypePointer(spv::StorageClass::Output, ctx.F32[1]); + Id pointer = + ctx.OpAccessChain(component_ptr, ctx.output_attr_array, + ctx.OpLoad(ctx.U32[1], ctx.invocation_id), attr_index, comp_index); + ctx.OpStore(pointer, value); +} + +Id EmitGetPatch(EmitContext& ctx, IR::Patch patch) { + const u32 index{IR::GenericPatchIndex(patch)}; + const Id element{ctx.ConstU32(IR::GenericPatchElement(patch))}; + const Id type{ctx.l_stage 
== LogicalStage::TessellationControl ? ctx.output_f32 + : ctx.input_f32}; + const Id pointer{ctx.OpAccessChain(type, ctx.patches.at(index), element)}; + return ctx.OpLoad(ctx.F32[1], pointer); +} + +void EmitSetPatch(EmitContext& ctx, IR::Patch patch, Id value) { + const Id pointer{[&] { + if (IR::IsGeneric(patch)) { + const u32 index{IR::GenericPatchIndex(patch)}; + const Id element{ctx.ConstU32(IR::GenericPatchElement(patch))}; + return ctx.OpAccessChain(ctx.output_f32, ctx.patches.at(index), element); + } + switch (patch) { + case IR::Patch::TessellationLodLeft: + case IR::Patch::TessellationLodRight: + case IR::Patch::TessellationLodTop: + case IR::Patch::TessellationLodBottom: { + const u32 index{static_cast(patch) - u32(IR::Patch::TessellationLodLeft)}; + const Id index_id{ctx.ConstU32(index)}; + return ctx.OpAccessChain(ctx.output_f32, ctx.output_tess_level_outer, index_id); + } + case IR::Patch::TessellationLodInteriorU: + return ctx.OpAccessChain(ctx.output_f32, ctx.output_tess_level_inner, + ctx.u32_zero_value); + case IR::Patch::TessellationLodInteriorV: + return ctx.OpAccessChain(ctx.output_f32, ctx.output_tess_level_inner, ctx.ConstU32(1u)); + default: + UNREACHABLE_MSG("Patch {}", u32(patch)); + } + }()}; + ctx.OpStore(pointer, value); +} + template static Id EmitLoadBufferU32xN(EmitContext& ctx, u32 handle, Id address) { auto& buffer = ctx.buffers[handle]; diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h index 057b0d69..f71c61af 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h +++ b/src/shader_recompiler/backend/spirv/emit_spirv_instructions.h @@ -9,6 +9,7 @@ namespace Shader::IR { enum class Attribute : u64; enum class ScalarReg : u32; +enum class Patch : u64; class Inst; class Value; } // namespace Shader::IR @@ -27,8 +28,6 @@ Id EmitConditionRef(EmitContext& ctx, const IR::Value& value); void EmitReference(EmitContext&); void 
EmitPhiMove(EmitContext&); void EmitJoin(EmitContext& ctx); -void EmitWorkgroupMemoryBarrier(EmitContext& ctx); -void EmitDeviceMemoryBarrier(EmitContext& ctx); void EmitGetScc(EmitContext& ctx); void EmitGetExec(EmitContext& ctx); void EmitGetVcc(EmitContext& ctx); @@ -85,9 +84,13 @@ Id EmitBufferAtomicAnd32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id addres Id EmitBufferAtomicOr32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); Id EmitBufferAtomicXor32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); Id EmitBufferAtomicSwap32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address, Id value); -Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, u32 comp, u32 index); +Id EmitGetAttribute(EmitContext& ctx, IR::Attribute attr, u32 comp, Id index); Id EmitGetAttributeU32(EmitContext& ctx, IR::Attribute attr, u32 comp); void EmitSetAttribute(EmitContext& ctx, IR::Attribute attr, Id value, u32 comp); +Id EmitGetTessGenericAttribute(EmitContext& ctx, Id vertex_index, Id attr_index, Id comp_index); +void EmitSetTcsGenericAttribute(EmitContext& ctx, Id value, Id attr_index, Id comp_index); +Id EmitGetPatch(EmitContext& ctx, IR::Patch patch); +void EmitSetPatch(EmitContext& ctx, IR::Patch patch, Id value); void EmitSetFragColor(EmitContext& ctx, u32 index, u32 component, Id value); void EmitSetSampleMask(EmitContext& ctx, Id value); void EmitSetFragDepth(EmitContext& ctx, Id value); diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp index 1ada2f1f..2e09e70a 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp @@ -6,6 +6,7 @@ #include "shader_recompiler/backend/spirv/spirv_emit_context.h" #include "shader_recompiler/frontend/fetch_shader.h" #include "shader_recompiler/ir/passes/srt.h" +#include "shader_recompiler/runtime_info.h" #include 
"video_core/amdgpu/types.h" #include @@ -34,7 +35,7 @@ std::string_view StageName(Stage stage) { case Stage::Compute: return "cs"; } - throw InvalidArgument("Invalid stage {}", u32(stage)); + UNREACHABLE_MSG("Invalid hw stage {}", u32(stage)); } static constexpr u32 NumVertices(AmdGpu::PrimitiveType type) { @@ -65,7 +66,7 @@ void Name(EmitContext& ctx, Id object, std::string_view format_str, Args&&... ar EmitContext::EmitContext(const Profile& profile_, const RuntimeInfo& runtime_info_, const Info& info_, Bindings& binding_) : Sirit::Module(profile_.supported_spirv), info{info_}, runtime_info{runtime_info_}, - profile{profile_}, stage{info.stage}, binding{binding_} { + profile{profile_}, stage{info.stage}, l_stage{info.l_stage}, binding{binding_} { AddCapability(spv::Capability::Shader); DefineArithmeticTypes(); DefineInterfaces(); @@ -268,9 +269,8 @@ void EmitContext::DefineInputs() { U32[1], spv::BuiltIn::SubgroupLocalInvocationId, spv::StorageClass::Input); Decorate(subgroup_local_invocation_id, spv::Decoration::Flat); } - switch (stage) { - case Stage::Export: - case Stage::Vertex: { + switch (l_stage) { + case LogicalStage::Vertex: { vertex_index = DefineVariable(U32[1], spv::BuiltIn::VertexIndex, spv::StorageClass::Input); base_vertex = DefineVariable(U32[1], spv::BuiltIn::BaseVertex, spv::StorageClass::Input); instance_id = DefineVariable(U32[1], spv::BuiltIn::InstanceIndex, spv::StorageClass::Input); @@ -311,12 +311,11 @@ void EmitContext::DefineInputs() { } input_params[attrib.semantic] = GetAttributeInfo(sharp.GetNumberFmt(), id, 4, false); - interfaces.push_back(id); } } break; } - case Stage::Fragment: + case LogicalStage::Fragment: frag_coord = DefineVariable(F32[4], spv::BuiltIn::FragCoord, spv::StorageClass::Input); frag_depth = DefineVariable(F32[1], spv::BuiltIn::FragDepth, spv::StorageClass::Output); front_facing = DefineVariable(U1[1], spv::BuiltIn::FrontFacing, spv::StorageClass::Input); @@ -351,15 +350,14 @@ void EmitContext::DefineInputs() { } 
input_params[semantic] = GetAttributeInfo(AmdGpu::NumberFormat::Float, attr_id, num_components, false); - interfaces.push_back(attr_id); } break; - case Stage::Compute: + case LogicalStage::Compute: workgroup_id = DefineVariable(U32[3], spv::BuiltIn::WorkgroupId, spv::StorageClass::Input); local_invocation_id = DefineVariable(U32[3], spv::BuiltIn::LocalInvocationId, spv::StorageClass::Input); break; - case Stage::Geometry: { + case LogicalStage::Geometry: { primitive_id = DefineVariable(U32[1], spv::BuiltIn::PrimitiveId, spv::StorageClass::Input); const auto gl_per_vertex = Name(TypeStruct(TypeVector(F32[1], 4), F32[1], TypeArray(F32[1], ConstU32(1u))), @@ -389,15 +387,129 @@ void EmitContext::DefineInputs() { } break; } + case LogicalStage::TessellationControl: { + invocation_id = + DefineVariable(U32[1], spv::BuiltIn::InvocationId, spv::StorageClass::Input); + patch_vertices = + DefineVariable(U32[1], spv::BuiltIn::PatchVertices, spv::StorageClass::Input); + primitive_id = DefineVariable(U32[1], spv::BuiltIn::PrimitiveId, spv::StorageClass::Input); + + const u32 num_attrs = runtime_info.hs_info.ls_stride >> 4; + if (num_attrs > 0) { + const Id per_vertex_type{TypeArray(F32[4], ConstU32(num_attrs))}; + // The input vertex count isn't statically known, so make length 32 (what glslang does) + const Id patch_array_type{TypeArray(per_vertex_type, ConstU32(32u))}; + input_attr_array = DefineInput(patch_array_type, 0); + Name(input_attr_array, "in_attrs"); + } + break; + } + case LogicalStage::TessellationEval: { + tess_coord = DefineInput(F32[3], std::nullopt, spv::BuiltIn::TessCoord); + primitive_id = DefineVariable(U32[1], spv::BuiltIn::PrimitiveId, spv::StorageClass::Input); + + const u32 num_attrs = runtime_info.vs_info.hs_output_cp_stride >> 4; + if (num_attrs > 0) { + const Id per_vertex_type{TypeArray(F32[4], ConstU32(num_attrs))}; + // The input vertex count isn't statically known, so make length 32 (what glslang does) + const Id 
patch_array_type{TypeArray(per_vertex_type, ConstU32(32u))}; + input_attr_array = DefineInput(patch_array_type, 0); + Name(input_attr_array, "in_attrs"); + } + + u32 patch_base_location = runtime_info.vs_info.hs_output_cp_stride >> 4; + for (size_t index = 0; index < 30; ++index) { + if (!(info.uses_patches & (1U << index))) { + continue; + } + const Id id{DefineInput(F32[4], patch_base_location + index)}; + Decorate(id, spv::Decoration::Patch); + Name(id, fmt::format("patch_in{}", index)); + patches[index] = id; + } + break; + } default: break; } } void EmitContext::DefineOutputs() { - switch (stage) { - case Stage::Export: - case Stage::Vertex: { + switch (l_stage) { + case LogicalStage::Vertex: { + // No point in defining builtin outputs (i.e. position) unless next stage is fragment? + // Might cause problems linking with tcs + + output_position = DefineVariable(F32[4], spv::BuiltIn::Position, spv::StorageClass::Output); + const bool has_extra_pos_stores = info.stores.Get(IR::Attribute::Position1) || + info.stores.Get(IR::Attribute::Position2) || + info.stores.Get(IR::Attribute::Position3); + if (has_extra_pos_stores) { + const Id type{TypeArray(F32[1], ConstU32(8U))}; + clip_distances = + DefineVariable(type, spv::BuiltIn::ClipDistance, spv::StorageClass::Output); + cull_distances = + DefineVariable(type, spv::BuiltIn::CullDistance, spv::StorageClass::Output); + } + if (stage == Shader::Stage::Local && runtime_info.ls_info.links_with_tcs) { + const u32 num_attrs = runtime_info.ls_info.ls_stride >> 4; + if (num_attrs > 0) { + const Id type{TypeArray(F32[4], ConstU32(num_attrs))}; + output_attr_array = DefineOutput(type, 0); + Name(output_attr_array, "out_attrs"); + } + } else { + for (u32 i = 0; i < IR::NumParams; i++) { + const IR::Attribute param{IR::Attribute::Param0 + i}; + if (!info.stores.GetAny(param)) { + continue; + } + const u32 num_components = info.stores.NumComponents(param); + const Id id{DefineOutput(F32[num_components], i)}; + Name(id, 
fmt::format("out_attr{}", i)); + output_params[i] = + GetAttributeInfo(AmdGpu::NumberFormat::Float, id, num_components, true); + } + } + break; + } + case LogicalStage::TessellationControl: { + if (info.stores_tess_level_outer) { + const Id type{TypeArray(F32[1], ConstU32(4U))}; + output_tess_level_outer = + DefineOutput(type, std::nullopt, spv::BuiltIn::TessLevelOuter); + Decorate(output_tess_level_outer, spv::Decoration::Patch); + } + if (info.stores_tess_level_inner) { + const Id type{TypeArray(F32[1], ConstU32(2U))}; + output_tess_level_inner = + DefineOutput(type, std::nullopt, spv::BuiltIn::TessLevelInner); + Decorate(output_tess_level_inner, spv::Decoration::Patch); + } + + const u32 num_attrs = runtime_info.hs_info.hs_output_cp_stride >> 4; + if (num_attrs > 0) { + const Id per_vertex_type{TypeArray(F32[4], ConstU32(num_attrs))}; + // The input vertex count isn't statically known, so make length 32 (what glslang does) + const Id patch_array_type{TypeArray( + per_vertex_type, ConstU32(runtime_info.hs_info.NumOutputControlPoints()))}; + output_attr_array = DefineOutput(patch_array_type, 0); + Name(output_attr_array, "out_attrs"); + } + + u32 patch_base_location = runtime_info.hs_info.hs_output_cp_stride >> 4; + for (size_t index = 0; index < 30; ++index) { + if (!(info.uses_patches & (1U << index))) { + continue; + } + const Id id{DefineOutput(F32[4], patch_base_location + index)}; + Decorate(id, spv::Decoration::Patch); + Name(id, fmt::format("patch_out{}", index)); + patches[index] = id; + } + break; + } + case LogicalStage::TessellationEval: { output_position = DefineVariable(F32[4], spv::BuiltIn::Position, spv::StorageClass::Output); const bool has_extra_pos_stores = info.stores.Get(IR::Attribute::Position1) || info.stores.Get(IR::Attribute::Position2) || @@ -419,11 +531,10 @@ void EmitContext::DefineOutputs() { Name(id, fmt::format("out_attr{}", i)); output_params[i] = GetAttributeInfo(AmdGpu::NumberFormat::Float, id, num_components, true); - 
interfaces.push_back(id); } break; } - case Stage::Fragment: + case LogicalStage::Fragment: for (u32 i = 0; i < IR::NumRenderTargets; i++) { const IR::Attribute mrt{IR::Attribute::RenderTarget0 + i}; if (!info.stores.GetAny(mrt)) { @@ -435,22 +546,22 @@ void EmitContext::DefineOutputs() { const Id id{DefineOutput(type, i)}; Name(id, fmt::format("frag_color{}", i)); frag_outputs[i] = GetAttributeInfo(num_format, id, num_components, true); - interfaces.push_back(id); } break; - case Stage::Geometry: { + case LogicalStage::Geometry: { output_position = DefineVariable(F32[4], spv::BuiltIn::Position, spv::StorageClass::Output); for (u32 attr_id = 0; attr_id < info.gs_copy_data.num_attrs; attr_id++) { const Id id{DefineOutput(F32[4], attr_id)}; Name(id, fmt::format("out_attr{}", attr_id)); output_params[attr_id] = {id, output_f32, F32[1], 4u}; - interfaces.push_back(id); } break; } - default: + case LogicalStage::Compute: break; + default: + UNREACHABLE(); } } diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.h b/src/shader_recompiler/backend/spirv/spirv_emit_context.h index cd129332..583d96b9 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.h +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.h @@ -46,14 +46,18 @@ public: void DefineBufferOffsets(); void DefineInterpolatedAttribs(); - [[nodiscard]] Id DefineInput(Id type, u32 location) { - const Id input_id{DefineVar(type, spv::StorageClass::Input)}; - Decorate(input_id, spv::Decoration::Location, location); + [[nodiscard]] Id DefineInput(Id type, std::optional location = std::nullopt, + std::optional builtin = std::nullopt) { + const Id input_id{DefineVariable(type, builtin, spv::StorageClass::Input)}; + if (location) { + Decorate(input_id, spv::Decoration::Location, *location); + } return input_id; } - [[nodiscard]] Id DefineOutput(Id type, std::optional location = std::nullopt) { - const Id output_id{DefineVar(type, spv::StorageClass::Output)}; + [[nodiscard]] Id 
DefineOutput(Id type, std::optional location = std::nullopt, + std::optional builtin = std::nullopt) { + const Id output_id{DefineVariable(type, builtin, spv::StorageClass::Output)}; if (location) { Decorate(output_id, spv::Decoration::Location, *location); } @@ -131,7 +135,8 @@ public: const Info& info; const RuntimeInfo& runtime_info; const Profile& profile; - Stage stage{}; + Stage stage; + LogicalStage l_stage{}; Id void_id{}; Id U8{}; @@ -188,8 +193,15 @@ public: Id clip_distances{}; Id cull_distances{}; + Id patch_vertices{}; + Id output_tess_level_outer{}; + Id output_tess_level_inner{}; + Id tess_coord; + std::array patches{}; + Id workgroup_id{}; Id local_invocation_id{}; + Id invocation_id{}; // for instanced geoshaders or output vertices within TCS patch Id subgroup_local_invocation_id{}; Id image_u32{}; @@ -252,6 +264,8 @@ public: bool is_loaded{}; s32 buffer_handle{-1}; }; + Id input_attr_array; + Id output_attr_array; std::array input_params{}; std::array output_params{}; std::array frag_outputs{}; diff --git a/src/shader_recompiler/frontend/tessellation.h b/src/shader_recompiler/frontend/tessellation.h new file mode 100644 index 00000000..bfcaa4fd --- /dev/null +++ b/src/shader_recompiler/frontend/tessellation.h @@ -0,0 +1,38 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include "common/types.h" + +namespace Shader { + +struct TessellationDataConstantBuffer { + u32 ls_stride; + u32 hs_cp_stride; // HullStateConstants::m_cpStride != 0 ? HullStateConstants::m_cpStride : + // ls_stride + u32 num_patches; // num patches submitted in threadgroup + u32 hs_output_base; // HullStateConstants::m_numInputCP::m_cpStride != 0 ? 
+ // HullStateConstants::m_numInputCP * ls_stride * num_patches : 0 + // basically 0 when passthrough + u32 patch_const_size; // 16 * num_patch_attrs + u32 patch_const_base; // hs_output_base + patch_output_size + u32 patch_output_size; // output_cp_stride * num_output_cp_per_patch + f32 off_chip_tessellation_factor_threshold; + u32 first_edge_tess_factor_index; +}; + +// Assign names to dword fields of TessellationDataConstantBuffer +enum class TessConstantAttribute : u32 { + LsStride, + HsCpStride, + HsNumPatch, + HsOutputBase, + PatchConstSize, + PatchConstBase, + PatchOutputSize, + OffChipTessellationFactorThreshold, + FirstEdgeTessFactorIndex, +}; + +} // namespace Shader \ No newline at end of file diff --git a/src/shader_recompiler/frontend/translate/data_share.cpp b/src/shader_recompiler/frontend/translate/data_share.cpp index 5914f9fe..116935b9 100644 --- a/src/shader_recompiler/frontend/translate/data_share.cpp +++ b/src/shader_recompiler/frontend/translate/data_share.cpp @@ -1,8 +1,8 @@ // SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later - #include "shader_recompiler/frontend/translate/translate.h" #include "shader_recompiler/ir/reg.h" +#include "shader_recompiler/runtime_info.h" namespace Shader::Gcn { @@ -73,10 +73,11 @@ void Translator::EmitDataShare(const GcnInst& inst) { void Translator::V_READFIRSTLANE_B32(const GcnInst& inst) { const IR::U32 value{GetSrc(inst.src[0])}; - if (info.stage != Stage::Compute) { - SetDst(inst.dst[0], value); - } else { + if (info.l_stage == LogicalStage::Compute || + info.l_stage == LogicalStage::TessellationControl) { SetDst(inst.dst[0], ir.ReadFirstLane(value)); + } else { + SetDst(inst.dst[0], value); } } diff --git a/src/shader_recompiler/frontend/translate/scalar_alu.cpp b/src/shader_recompiler/frontend/translate/scalar_alu.cpp index 5b411d83..1ef0d82d 100644 --- a/src/shader_recompiler/frontend/translate/scalar_alu.cpp +++ 
b/src/shader_recompiler/frontend/translate/scalar_alu.cpp @@ -1,6 +1,8 @@ // SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later +#include +#include "common/assert.h" #include "shader_recompiler/frontend/translate/translate.h" namespace Shader::Gcn { @@ -78,8 +80,10 @@ void Translator::EmitScalarAlu(const GcnInst& inst) { return S_BFM_B32(inst); case Opcode::S_MUL_I32: return S_MUL_I32(inst); + case Opcode::S_BFE_I32: + return S_BFE(inst, true); case Opcode::S_BFE_U32: - return S_BFE_U32(inst); + return S_BFE(inst, false); case Opcode::S_ABSDIFF_I32: return S_ABSDIFF_I32(inst); @@ -434,12 +438,12 @@ void Translator::S_MUL_I32(const GcnInst& inst) { SetDst(inst.dst[0], ir.IMul(GetSrc(inst.src[0]), GetSrc(inst.src[1]))); } -void Translator::S_BFE_U32(const GcnInst& inst) { +void Translator::S_BFE(const GcnInst& inst, bool is_signed) { const IR::U32 src0{GetSrc(inst.src[0])}; const IR::U32 src1{GetSrc(inst.src[1])}; const IR::U32 offset{ir.BitwiseAnd(src1, ir.Imm32(0x1F))}; const IR::U32 count{ir.BitFieldExtract(src1, ir.Imm32(16), ir.Imm32(7))}; - const IR::U32 result{ir.BitFieldExtract(src0, offset, count)}; + const IR::U32 result{ir.BitFieldExtract(src0, offset, count, is_signed)}; SetDst(inst.dst[0], result); ir.SetScc(ir.INotEqual(result, ir.Imm32(0))); } diff --git a/src/shader_recompiler/frontend/translate/translate.cpp b/src/shader_recompiler/frontend/translate/translate.cpp index 97978ff6..3031e664 100644 --- a/src/shader_recompiler/frontend/translate/translate.cpp +++ b/src/shader_recompiler/frontend/translate/translate.cpp @@ -8,6 +8,8 @@ #include "shader_recompiler/frontend/fetch_shader.h" #include "shader_recompiler/frontend/translate/translate.h" #include "shader_recompiler/info.h" +#include "shader_recompiler/ir/attribute.h" +#include "shader_recompiler/ir/reg.h" #include "shader_recompiler/runtime_info.h" #include "video_core/amdgpu/resource.h" #include "video_core/amdgpu/types.h" @@ -34,9 +36,8 
@@ void Translator::EmitPrologue() { } IR::VectorReg dst_vreg = IR::VectorReg::V0; - switch (info.stage) { - case Stage::Vertex: - case Stage::Export: + switch (info.l_stage) { + case LogicalStage::Vertex: // v0: vertex ID, always present ir.SetVectorReg(dst_vreg++, ir.GetAttributeU32(IR::Attribute::VertexId)); // v1: instance ID, step rate 0 @@ -52,7 +53,7 @@ void Translator::EmitPrologue() { ir.SetVectorReg(dst_vreg++, ir.GetAttributeU32(IR::Attribute::InstanceId)); } break; - case Stage::Fragment: + case LogicalStage::Fragment: dst_vreg = IR::VectorReg::V0; if (runtime_info.fs_info.addr_flags.persp_sample_ena) { ++dst_vreg; // I @@ -122,7 +123,30 @@ void Translator::EmitPrologue() { } } break; - case Stage::Compute: + case LogicalStage::TessellationControl: { + // Should be laid out like: + // [0:8]: patch id within VGT + // [8:12]: output control point id + ir.SetVectorReg(IR::VectorReg::V1, + ir.GetAttributeU32(IR::Attribute::PackedHullInvocationInfo)); + // TODO PrimitiveId is probably V2 but haven't seen it yet + break; + } + case LogicalStage::TessellationEval: + ir.SetVectorReg(IR::VectorReg::V0, + ir.GetAttribute(IR::Attribute::TessellationEvaluationPointU)); + ir.SetVectorReg(IR::VectorReg::V1, + ir.GetAttribute(IR::Attribute::TessellationEvaluationPointV)); + // V2 is similar to PrimitiveID but not the same. It seems to only be used in + // compiler-generated address calculations. Its probably the patch id within the + // patches running locally on a given VGT (or CU, whichever is the granularity of LDS + // memory) + // Set to 0. See explanation in comment describing hull/domain passes + ir.SetVectorReg(IR::VectorReg::V2, ir.Imm32(0u)); + // V3 is the actual PrimitiveID as intended by the shader author. 
+ ir.SetVectorReg(IR::VectorReg::V3, ir.GetAttributeU32(IR::Attribute::PrimitiveId)); + break; + case LogicalStage::Compute: ir.SetVectorReg(dst_vreg++, ir.GetAttributeU32(IR::Attribute::LocalInvocationId, 0)); ir.SetVectorReg(dst_vreg++, ir.GetAttributeU32(IR::Attribute::LocalInvocationId, 1)); ir.SetVectorReg(dst_vreg++, ir.GetAttributeU32(IR::Attribute::LocalInvocationId, 2)); @@ -137,7 +161,7 @@ void Translator::EmitPrologue() { ir.SetScalarReg(dst_sreg++, ir.GetAttributeU32(IR::Attribute::WorkgroupId, 2)); } break; - case Stage::Geometry: + case LogicalStage::Geometry: switch (runtime_info.gs_info.out_primitive[0]) { case AmdGpu::GsOutputPrimitiveType::TriangleStrip: ir.SetVectorReg(IR::VectorReg::V3, ir.Imm32(2u)); // vertex 2 @@ -152,7 +176,7 @@ void Translator::EmitPrologue() { ir.SetVectorReg(IR::VectorReg::V2, ir.GetAttributeU32(IR::Attribute::PrimitiveId)); break; default: - throw NotImplementedException("Unknown shader stage"); + UNREACHABLE_MSG("Unknown shader stage"); } } @@ -503,7 +527,8 @@ void Translate(IR::Block* block, u32 pc, std::span inst_list, Inf // Special case for emitting fetch shader. 
if (inst.opcode == Opcode::S_SWAPPC_B64) { - ASSERT(info.stage == Stage::Vertex || info.stage == Stage::Export); + ASSERT(info.stage == Stage::Vertex || info.stage == Stage::Export || + info.stage == Stage::Local); translator.EmitFetch(inst); continue; } diff --git a/src/shader_recompiler/frontend/translate/translate.h b/src/shader_recompiler/frontend/translate/translate.h index 198cea27..60bad186 100644 --- a/src/shader_recompiler/frontend/translate/translate.h +++ b/src/shader_recompiler/frontend/translate/translate.h @@ -94,7 +94,8 @@ public: void S_ASHR_I32(const GcnInst& inst); void S_BFM_B32(const GcnInst& inst); void S_MUL_I32(const GcnInst& inst); - void S_BFE_U32(const GcnInst& inst); + void S_BFE(const GcnInst& inst, bool is_signed); + void S_BFE_I32(const GcnInst& inst); void S_ABSDIFF_I32(const GcnInst& inst); void S_NOT_B32(const GcnInst& inst); @@ -217,7 +218,7 @@ public: // VOP3a void V_MAD_F32(const GcnInst& inst); - void V_MAD_I32_I24(const GcnInst& inst, bool is_signed = false); + void V_MAD_I32_I24(const GcnInst& inst, bool is_signed = true); void V_MAD_U32_U24(const GcnInst& inst); void V_CUBEID_F32(const GcnInst& inst); void V_CUBESC_F32(const GcnInst& inst); diff --git a/src/shader_recompiler/frontend/translate/vector_alu.cpp b/src/shader_recompiler/frontend/translate/vector_alu.cpp index 3e9e677a..2b32ca2c 100644 --- a/src/shader_recompiler/frontend/translate/vector_alu.cpp +++ b/src/shader_recompiler/frontend/translate/vector_alu.cpp @@ -1060,8 +1060,14 @@ void Translator::V_CUBEMA_F32(const GcnInst& inst) { void Translator::V_BFE_U32(bool is_signed, const GcnInst& inst) { const IR::U32 src0{GetSrc(inst.src[0])}; - const IR::U32 src1{ir.BitwiseAnd(GetSrc(inst.src[1]), ir.Imm32(0x1F))}; - const IR::U32 src2{ir.BitwiseAnd(GetSrc(inst.src[2]), ir.Imm32(0x1F))}; + IR::U32 src1{GetSrc(inst.src[1])}; + IR::U32 src2{GetSrc(inst.src[2])}; + if (!src1.IsImmediate()) { + src1 = ir.BitwiseAnd(src1, ir.Imm32(0x1F)); + } + if (!src2.IsImmediate()) { + 
src2 = ir.BitwiseAnd(src2, ir.Imm32(0x1F)); + } SetDst(inst.dst[0], ir.BitFieldExtract(src0, src1, src2, is_signed)); } diff --git a/src/shader_recompiler/frontend/translate/vector_memory.cpp b/src/shader_recompiler/frontend/translate/vector_memory.cpp index eadd1c4d..072b1f88 100644 --- a/src/shader_recompiler/frontend/translate/vector_memory.cpp +++ b/src/shader_recompiler/frontend/translate/vector_memory.cpp @@ -189,7 +189,8 @@ void Translator::BUFFER_LOAD(u32 num_dwords, bool is_typed, const GcnInst& inst) buffer_info.index_enable.Assign(mtbuf.idxen); buffer_info.offset_enable.Assign(mtbuf.offen); buffer_info.inst_offset.Assign(mtbuf.offset); - buffer_info.ring_access.Assign(is_ring); + buffer_info.globally_coherent.Assign(mtbuf.glc); + buffer_info.system_coherent.Assign(mtbuf.slc); if (is_typed) { const auto dmft = static_cast(mtbuf.dfmt); const auto nfmt = static_cast(mtbuf.nfmt); @@ -247,11 +248,15 @@ void Translator::BUFFER_STORE(u32 num_dwords, bool is_typed, const GcnInst& inst const IR::ScalarReg sharp{inst.src[2].code * 4}; const IR::Value soffset{GetSrc(inst.src[3])}; - if (info.stage != Stage::Export && info.stage != Stage::Geometry) { + if (info.stage != Stage::Export && info.stage != Stage::Hull && info.stage != Stage::Geometry) { ASSERT_MSG(soffset.IsImmediate() && soffset.U32() == 0, "Non immediate offset not supported"); } + if (info.stage == Stage::Hull) { + // printf("here\n"); // break + } + IR::Value address = [&] -> IR::Value { if (is_ring) { return ir.CompositeConstruct(ir.GetVectorReg(vaddr), soffset); @@ -269,7 +274,8 @@ void Translator::BUFFER_STORE(u32 num_dwords, bool is_typed, const GcnInst& inst buffer_info.index_enable.Assign(mtbuf.idxen); buffer_info.offset_enable.Assign(mtbuf.offen); buffer_info.inst_offset.Assign(mtbuf.offset); - buffer_info.ring_access.Assign(is_ring); + buffer_info.globally_coherent.Assign(mtbuf.glc); + buffer_info.system_coherent.Assign(mtbuf.slc); if (is_typed) { const auto dmft = static_cast(mtbuf.dfmt); 
const auto nfmt = static_cast(mtbuf.nfmt); diff --git a/src/shader_recompiler/info.h b/src/shader_recompiler/info.h index 494bbb4b..dbea2af8 100644 --- a/src/shader_recompiler/info.h +++ b/src/shader_recompiler/info.h @@ -11,6 +11,7 @@ #include "common/types.h" #include "shader_recompiler/backend/bindings.h" #include "shader_recompiler/frontend/copy_shader.h" +#include "shader_recompiler/frontend/tessellation.h" #include "shader_recompiler/ir/attribute.h" #include "shader_recompiler/ir/passes/srt.h" #include "shader_recompiler/ir/reg.h" @@ -163,6 +164,7 @@ struct Info { UserDataMask ud_mask{}; CopyShaderData gs_copy_data; + u32 uses_patches{}; BufferResourceList buffers; TextureBufferResourceList texture_buffers; @@ -173,8 +175,12 @@ struct Info { PersistentSrtInfo srt_info; std::vector flattened_ud_buf; + IR::ScalarReg tess_consts_ptr_base = IR::ScalarReg::Max; + s32 tess_consts_dword_offset = -1; + std::span user_data; Stage stage; + LogicalStage l_stage; u64 pgm_hash{}; VAddr pgm_base; @@ -190,14 +196,16 @@ struct Info { bool uses_shared{}; bool uses_fp16{}; bool uses_fp64{}; + bool stores_tess_level_outer{}; + bool stores_tess_level_inner{}; bool translation_failed{}; // indicates that shader has unsupported instructions bool has_readconst{}; u8 mrt_mask{0u}; bool has_fetch_shader{false}; u32 fetch_shader_sgpr_base{0u}; - explicit Info(Stage stage_, ShaderParams params) - : stage{stage_}, pgm_hash{params.hash}, pgm_base{params.Base()}, + explicit Info(Stage stage_, LogicalStage l_stage_, ShaderParams params) + : stage{stage_}, l_stage{l_stage_}, pgm_hash{params.hash}, pgm_base{params.Base()}, user_data{params.user_data} {} template @@ -244,6 +252,16 @@ struct Info { srt_info.walker_func(user_data.data(), flattened_ud_buf.data()); } } + + void ReadTessConstantBuffer(TessellationDataConstantBuffer& tess_constants) const { + ASSERT(tess_consts_dword_offset >= 0); // We've already tracked the V# UD + auto buf = ReadUdReg(static_cast(tess_consts_ptr_base), + 
static_cast(tess_consts_dword_offset)); + VAddr tess_constants_addr = buf.base_address; + memcpy(&tess_constants, + reinterpret_cast(tess_constants_addr), + sizeof(tess_constants)); + } }; constexpr AmdGpu::Buffer BufferResource::GetSharp(const Info& info) const noexcept { diff --git a/src/shader_recompiler/ir/attribute.cpp b/src/shader_recompiler/ir/attribute.cpp index e219dfb6..6a267e21 100644 --- a/src/shader_recompiler/ir/attribute.cpp +++ b/src/shader_recompiler/ir/attribute.cpp @@ -104,6 +104,8 @@ std::string NameOf(Attribute attribute) { return "VertexId"; case Attribute::InstanceId: return "InstanceId"; + case Attribute::PrimitiveId: + return "PrimitiveId"; case Attribute::FragCoord: return "FragCoord"; case Attribute::IsFrontFace: @@ -114,6 +116,16 @@ std::string NameOf(Attribute attribute) { return "LocalInvocationId"; case Attribute::LocalInvocationIndex: return "LocalInvocationIndex"; + case Attribute::InvocationId: + return "InvocationId"; + case Attribute::PatchVertices: + return "PatchVertices"; + case Attribute::TessellationEvaluationPointU: + return "TessellationEvaluationPointU"; + case Attribute::TessellationEvaluationPointV: + return "TessellationEvaluationPointV"; + case Attribute::PackedHullInvocationInfo: + return "PackedHullInvocationInfo"; default: break; } diff --git a/src/shader_recompiler/ir/attribute.h b/src/shader_recompiler/ir/attribute.h index 0890e88f..bcb2b44a 100644 --- a/src/shader_recompiler/ir/attribute.h +++ b/src/shader_recompiler/ir/attribute.h @@ -72,8 +72,13 @@ enum class Attribute : u64 { LocalInvocationId = 75, LocalInvocationIndex = 76, FragCoord = 77, - InstanceId0 = 78, // step rate 0 - InstanceId1 = 79, // step rate 1 + InstanceId0 = 78, // step rate 0 + InstanceId1 = 79, // step rate 1 + InvocationId = 80, // TCS id in output patch and instanced geometry shader id + PatchVertices = 81, + TessellationEvaluationPointU = 82, + TessellationEvaluationPointV = 83, + PackedHullInvocationInfo = 84, // contains patch id 
within the VGT and invocation ID Max, }; @@ -85,6 +90,11 @@ constexpr bool IsPosition(Attribute attribute) noexcept { return attribute >= Attribute::Position0 && attribute <= Attribute::Position3; } +constexpr bool IsTessCoord(Attribute attribute) noexcept { + return attribute >= Attribute::TessellationEvaluationPointU && + attribute <= Attribute::TessellationEvaluationPointV; +} + constexpr bool IsParam(Attribute attribute) noexcept { return attribute >= Attribute::Param0 && attribute <= Attribute::Param31; } diff --git a/src/shader_recompiler/ir/basic_block.cpp b/src/shader_recompiler/ir/basic_block.cpp index b4d1a78c..a312eabd 100644 --- a/src/shader_recompiler/ir/basic_block.cpp +++ b/src/shader_recompiler/ir/basic_block.cpp @@ -94,6 +94,8 @@ static std::string ArgToIndex(std::map& inst_to_index, size return fmt::format("{}", arg.VectorReg()); case Type::Attribute: return fmt::format("{}", arg.Attribute()); + case Type::Patch: + return fmt::format("{}", arg.Patch()); default: return ""; } diff --git a/src/shader_recompiler/ir/ir_emitter.cpp b/src/shader_recompiler/ir/ir_emitter.cpp index 3ebc82e6..21df5339 100644 --- a/src/shader_recompiler/ir/ir_emitter.cpp +++ b/src/shader_recompiler/ir/ir_emitter.cpp @@ -266,8 +266,8 @@ void IREmitter::SetM0(const U32& value) { Inst(Opcode::SetM0, value); } -F32 IREmitter::GetAttribute(IR::Attribute attribute, u32 comp, u32 index) { - return Inst(Opcode::GetAttribute, attribute, Imm32(comp), Imm32(index)); +F32 IREmitter::GetAttribute(IR::Attribute attribute, u32 comp, IR::Value index) { + return Inst(Opcode::GetAttribute, attribute, Imm32(comp), index); } U32 IREmitter::GetAttributeU32(IR::Attribute attribute, u32 comp) { @@ -278,6 +278,24 @@ void IREmitter::SetAttribute(IR::Attribute attribute, const F32& value, u32 comp Inst(Opcode::SetAttribute, attribute, value, Imm32(comp)); } +F32 IREmitter::GetTessGenericAttribute(const U32& vertex_index, const U32& attr_index, + const U32& comp_index) { + return 
Inst(IR::Opcode::GetTessGenericAttribute, vertex_index, attr_index, comp_index); +} + +void IREmitter::SetTcsGenericAttribute(const F32& value, const U32& attr_index, + const U32& comp_index) { + Inst(Opcode::SetTcsGenericAttribute, value, attr_index, comp_index); +} + +F32 IREmitter::GetPatch(Patch patch) { + return Inst(Opcode::GetPatch, patch); +} + +void IREmitter::SetPatch(Patch patch, const F32& value) { + Inst(Opcode::SetPatch, patch, value); +} + Value IREmitter::LoadShared(int bit_size, bool is_signed, const U32& offset) { switch (bit_size) { case 32: @@ -552,6 +570,19 @@ Value IREmitter::CompositeConstruct(const Value& e1, const Value& e2, const Valu } } +Value IREmitter::CompositeConstruct(std::span elements) { + switch (elements.size()) { + case 2: + return CompositeConstruct(elements[0], elements[1]); + case 3: + return CompositeConstruct(elements[0], elements[1], elements[2]); + case 4: + return CompositeConstruct(elements[0], elements[1], elements[2], elements[3]); + default: + UNREACHABLE_MSG("Composite construct with greater than 4 elements"); + } +} + Value IREmitter::CompositeExtract(const Value& vector, size_t element) { const auto read{[&](Opcode opcode, size_t limit) -> Value { if (element >= limit) { diff --git a/src/shader_recompiler/ir/ir_emitter.h b/src/shader_recompiler/ir/ir_emitter.h index 068aba14..95713565 100644 --- a/src/shader_recompiler/ir/ir_emitter.h +++ b/src/shader_recompiler/ir/ir_emitter.h @@ -10,6 +10,7 @@ #include "shader_recompiler/ir/attribute.h" #include "shader_recompiler/ir/basic_block.h" #include "shader_recompiler/ir/condition.h" +#include "shader_recompiler/ir/patch.h" #include "shader_recompiler/ir/value.h" namespace Shader::IR { @@ -80,10 +81,18 @@ public: [[nodiscard]] U1 Condition(IR::Condition cond); - [[nodiscard]] F32 GetAttribute(Attribute attribute, u32 comp = 0, u32 index = 0); + [[nodiscard]] F32 GetAttribute(Attribute attribute, u32 comp = 0, + IR::Value index = IR::Value(u32(0u))); [[nodiscard]] U32 
GetAttributeU32(Attribute attribute, u32 comp = 0); void SetAttribute(Attribute attribute, const F32& value, u32 comp = 0); + [[nodiscard]] F32 GetTessGenericAttribute(const U32& vertex_index, const U32& attr_index, + const U32& comp_index); + void SetTcsGenericAttribute(const F32& value, const U32& attr_index, const U32& comp_index); + + [[nodiscard]] F32 GetPatch(Patch patch); + void SetPatch(Patch patch, const F32& value); + [[nodiscard]] Value LoadShared(int bit_size, bool is_signed, const U32& offset); void WriteShared(int bit_size, const Value& value, const U32& offset); @@ -138,6 +147,8 @@ public: [[nodiscard]] Value CompositeConstruct(const Value& e1, const Value& e2, const Value& e3); [[nodiscard]] Value CompositeConstruct(const Value& e1, const Value& e2, const Value& e3, const Value& e4); + [[nodiscard]] Value CompositeConstruct(std::span values); + [[nodiscard]] Value CompositeExtract(const Value& vector, size_t element); [[nodiscard]] Value CompositeInsert(const Value& vector, const Value& object, size_t element); @@ -335,6 +346,7 @@ private: template T Inst(Opcode op, Args... 
args) { auto it{block->PrependNewInst(insertion_point, op, {Value{args}...})}; + it->SetParent(block); return T{Value{&*it}}; } @@ -352,6 +364,7 @@ private: u32 raw_flags{}; std::memcpy(&raw_flags, &flags.proxy, sizeof(flags.proxy)); auto it{block->PrependNewInst(insertion_point, op, {Value{args}...}, raw_flags)}; + it->SetParent(block); return T{Value{&*it}}; } }; diff --git a/src/shader_recompiler/ir/microinstruction.cpp b/src/shader_recompiler/ir/microinstruction.cpp index 9b4ad63d..6e7bbe66 100644 --- a/src/shader_recompiler/ir/microinstruction.cpp +++ b/src/shader_recompiler/ir/microinstruction.cpp @@ -52,6 +52,8 @@ bool Inst::MayHaveSideEffects() const noexcept { case Opcode::Discard: case Opcode::DiscardCond: case Opcode::SetAttribute: + case Opcode::SetTcsGenericAttribute: + case Opcode::SetPatch: case Opcode::StoreBufferU32: case Opcode::StoreBufferU32x2: case Opcode::StoreBufferU32x3: diff --git a/src/shader_recompiler/ir/opcodes.h b/src/shader_recompiler/ir/opcodes.h index be640297..cd73ace7 100644 --- a/src/shader_recompiler/ir/opcodes.h +++ b/src/shader_recompiler/ir/opcodes.h @@ -30,7 +30,7 @@ constexpr Type Opaque{Type::Opaque}; constexpr Type ScalarReg{Type::ScalarReg}; constexpr Type VectorReg{Type::VectorReg}; constexpr Type Attribute{Type::Attribute}; -constexpr Type SystemValue{Type::SystemValue}; +constexpr Type Patch{Type::Patch}; constexpr Type U1{Type::U1}; constexpr Type U8{Type::U8}; constexpr Type U16{Type::U16}; diff --git a/src/shader_recompiler/ir/opcodes.inc b/src/shader_recompiler/ir/opcodes.inc index 47727582..470f9fbe 100644 --- a/src/shader_recompiler/ir/opcodes.inc +++ b/src/shader_recompiler/ir/opcodes.inc @@ -60,6 +60,10 @@ OPCODE(SetGotoVariable, Void, U32, OPCODE(GetAttribute, F32, Attribute, U32, U32, ) OPCODE(GetAttributeU32, U32, Attribute, U32, ) OPCODE(SetAttribute, Void, Attribute, F32, U32, ) +OPCODE(GetPatch, F32, Patch, ) +OPCODE(SetPatch, Void, Patch, F32, ) +OPCODE(GetTessGenericAttribute, F32, U32, U32, U32, ) 
+OPCODE(SetTcsGenericAttribute, Void, F32, U32, U32, ) // Flags OPCODE(GetScc, U1, Void, ) diff --git a/src/shader_recompiler/ir/passes/constant_propagation_pass.cpp b/src/shader_recompiler/ir/passes/constant_propagation_pass.cpp index 9624ce6a..16b07e1a 100644 --- a/src/shader_recompiler/ir/passes/constant_propagation_pass.cpp +++ b/src/shader_recompiler/ir/passes/constant_propagation_pass.cpp @@ -216,6 +216,18 @@ void FoldAdd(IR::Block& block, IR::Inst& inst) { } } +template +void FoldMul(IR::Block& block, IR::Inst& inst) { + if (!FoldCommutative(inst, [](T a, T b) { return a * b; })) { + return; + } + const IR::Value rhs{inst.Arg(1)}; + if (rhs.IsImmediate() && Arg(rhs) == 0) { + inst.ReplaceUsesWithAndRemove(IR::Value(0u)); + return; + } +} + void FoldCmpClass(IR::Block& block, IR::Inst& inst) { ASSERT_MSG(inst.Arg(1).IsImmediate(), "Unable to resolve compare operation"); const auto class_mask = static_cast(inst.Arg(1).U32()); @@ -292,7 +304,19 @@ void ConstantPropagation(IR::Block& block, IR::Inst& inst) { FoldWhenAllImmediates(inst, [](u32 a) { return static_cast(a); }); return; case IR::Opcode::IMul32: - FoldWhenAllImmediates(inst, [](u32 a, u32 b) { return a * b; }); + FoldMul(block, inst); + return; + case IR::Opcode::UDiv32: + FoldWhenAllImmediates(inst, [](u32 a, u32 b) { + ASSERT_MSG(b != 0, "Folding UDiv32 with divisor 0"); + return a / b; + }); + return; + case IR::Opcode::UMod32: + FoldWhenAllImmediates(inst, [](u32 a, u32 b) { + ASSERT_MSG(b != 0, "Folding UMod32 with modulo 0"); + return a % b; + }); return; case IR::Opcode::FPCmpClass32: FoldCmpClass(block, inst); diff --git a/src/shader_recompiler/ir/passes/constant_propogation.h b/src/shader_recompiler/ir/passes/constant_propogation.h new file mode 100644 index 00000000..313a3cc6 --- /dev/null +++ b/src/shader_recompiler/ir/passes/constant_propogation.h @@ -0,0 +1,4 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma 
once \ No newline at end of file diff --git a/src/shader_recompiler/ir/passes/hull_shader_transform.cpp b/src/shader_recompiler/ir/passes/hull_shader_transform.cpp new file mode 100644 index 00000000..5cf02b6d --- /dev/null +++ b/src/shader_recompiler/ir/passes/hull_shader_transform.cpp @@ -0,0 +1,744 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later +#include "common/assert.h" +#include "shader_recompiler/info.h" +#include "shader_recompiler/ir/attribute.h" +#include "shader_recompiler/ir/breadth_first_search.h" +#include "shader_recompiler/ir/ir_emitter.h" +#include "shader_recompiler/ir/opcodes.h" +#include "shader_recompiler/ir/pattern_matching.h" +#include "shader_recompiler/ir/program.h" +#include "shader_recompiler/runtime_info.h" + +namespace Shader::Optimization { + +/** + * Tessellation shaders pass outputs to the next shader using LDS. + * The Hull shader stage receives input control points stored in LDS. + * + * These passes attempt to resolve LDS accesses to attribute accesses and correctly + * write to the tessellation factor tables. + * + * The LDS layout is: + * - TCS inputs for patch 0 + * - TCS inputs for patch 1 + * - TCS inputs for patch 2 + * - ... + * - TCS outputs for patch 0 + * - TCS outputs for patch 1 + * - TCS outputs for patch 2 + * - ... + * - PatchConst TCS outputs for patch 0 + * - PatchConst TCS outputs for patch 1 + * - PatchConst TCS outputs for patch 2 + * + * + * If the Hull stage does not write any new control points the driver will + * optimize LDS layout so input and output control point spaces overlap. + * (Passthrough) + * + * The gnm driver requires a V# holding special constants to be bound + * for reads by the shader. + * The Hull and Domain shaders read values from this buffer which + * contain size and offset information required to address input, output, + * or PatchConst attributes within the current patch. 
+ * See the TessellationDataConstantBuffer struct to see the layout of this V#. + * + * Tessellation factors are stored to a special tessellation factor V# that is automatically bound + * by the driver. This is the input to the fixed function tessellator that actually subdivides the + * domain. We translate these to writes to SPIR-V builtins for tessellation factors in the Hull + * shader. + * The offset into the tess factor buffer determines which factor the shader is writing. + * Additionally, most hull shaders seem to redundantly write tess factors to PatchConst + * attributes, even if dead in the domain shader. We just treat these as generic PatchConst writes. + * + * LDS reads in the Hull shader can be from input control points, and in the Domain shader can + * be hs output control points (output from the perspective of the Hull shader) and patchconst + * values. + * LDS stores in the Hull shader can either be output control point writes or per-patch + * (PatchConst) data writes. The Domain shader exports attributes using EXP instructions, unless it's + * followed by the geometry stage (but we haven't seen this yet), so nothing special there. + * The address calculations can vary significantly and can't be easily pattern matched. We are at + * the mercy of instruction selection the ps4 compiler wanted to use. 
+ * Generally though, they could look something like this: + * Input control point: + * addr = PatchIdInVgt * input_cp_stride * #input_cp_per_patch + index * input_cp_stride + * + attr# * 16 + component + * Output control point: + * addr = #patches * input_cp_stride * #input_cp_per_patch + * + PatchIdInVgt * output_patch_stride + InvocationID * output_cp_stride + + attr# * 16 + component + * Per patch output: + * addr = #patches * input_cp_stride * #cp_per_input_patch + * + #patches * output_patch_stride + * + PatchIdInVgt * per_patch_output_stride + attr# * 16 + component + * + * output_patch_stride and output_cp_stride are usually compile time constants in the gcn + * + * Hull shaders can probably also read output control points corresponding to other threads, like + * shared memory (but we haven't seen this yet). + * ^ This is an UNREACHABLE for now. We may need to insert additional barriers if this happens. + * They should also be able to read PatchConst values, + * although not sure if this happens in practice. + * + * To determine which type of attribute (input, output, patchconst) we check the users of + * TessConstants V# reads to deduce which type of attribute a given load/store to LDS + * is touching. + * + * In the Hull shader, both the PatchId within the VGT group (PatchIdInVgt) and the output control + * point id (InvocationId) are packed in VGPR1 by the driver like + * V1 = InvocationId << 8 | PatchIdInVgt + * The shader typically uses V_BFE_(U|S)32 to extract them. We use the starting bit_pos to determine + * which is which. + * + * This pass does not attempt to deduce the exact attribute referenced in a LDS load/store. + * Instead, it feeds the address in the LDS load/store to the get/set Insts we use for TCS in/out's, + * TES in's, and PatchConst in/out's. 
+ * + * TCS/TES Input attributes: + * We define input attributes using an array in the shader roughly like this: + * // equivalent GLSL in TCS + * layout (location = 0) in vec4 in_attrs[][NUM_INPUT_ATTRIBUTES]; + * + * Here the NUM_INPUT_ATTRIBUTES is derived from the ls_stride member of the TessConstants V#. + * We divide ls_stride (in bytes) by 16 to get the number of vec4 attributes. + * For TES, the number of attributes comes from hs_cp_stride / 16. + * The first (outer) dimension is unsized but corresponds to the number of vertices in the hs input + * patch (for Hull) or the hs output patch (for Domain). + * + * For input reads in TCS or TES, we emit SPIR-V like: + * float value = in_attrs[addr / ls_stride][(addr % ls_stride) >> 4][(addr & 0xF) >> 2]; + * + * For output writes, we assume the control point index is InvocationId, since high level languages + * impose that restriction (although maybe it's technically possible on hardware). So SPIR-V looks + * like this: + * layout (location = 0) out vec4 out_attrs[][NUM_OUTPUT_ATTRIBUTES]; + * out_attrs[InvocationId][(addr % hs_cp_stride) >> 4][(addr & 0xF) >> 2] = value; + * + * NUM_OUTPUT_ATTRIBUTES is derived by hs_cp_stride / 16, so it can link with the TES in_attrs + * variable. + * + * Another challenge is the fact that the GCN shader needs to address attributes from LDS as a whole + * which contains the attributes from many patches. On the other hand, higher level shading + * languages restrict attribute access to the patch of the current thread, which is naturally a + * restriction in SPIR-V also. + * The addresses the ps4 compiler generates for loads/stores and the fact that LDS holds many + * patches' attributes are just implementation details of the ps4 driver/compiler. To deal with + * this, we can replace certain TessConstant V# reads with 0, which only contribute to the base + * address of the current patch's attributes in LDS and not the indexes within the local patch. 
+ * + * (A perfect implementation might need emulation of the VGTs in mesh/compute, loading/storing + * attributes to buffers and not caring about whether they are hs input, hs output, or patchconst + * attributes) + * + */ + +namespace { + +using namespace Shader::Optimiation::PatternMatching; + +static void InitTessConstants(IR::ScalarReg sharp_ptr_base, s32 sharp_dword_offset, + Shader::Info& info, Shader::RuntimeInfo& runtime_info, + TessellationDataConstantBuffer& tess_constants) { + info.tess_consts_ptr_base = sharp_ptr_base; + info.tess_consts_dword_offset = sharp_dword_offset; + info.ReadTessConstantBuffer(tess_constants); + if (info.l_stage == LogicalStage::TessellationControl) { + runtime_info.hs_info.InitFromTessConstants(tess_constants); + } else { + runtime_info.vs_info.InitFromTessConstants(tess_constants); + } + + return; +} + +struct TessSharpLocation { + IR::ScalarReg ptr_base; + u32 dword_off; +}; + +std::optional FindTessConstantSharp(IR::Inst* read_const_buffer) { + IR::Value sharp_ptr_base; + IR::Value sharp_dword_offset; + + IR::Value rv = IR::Value{read_const_buffer}; + IR::Value handle = read_const_buffer->Arg(0); + + if (M_COMPOSITECONSTRUCTU32X4(M_GETUSERDATA(MatchImm(sharp_dword_offset)), MatchIgnore(), + MatchIgnore(), MatchIgnore()) + .Match(handle)) { + return TessSharpLocation{.ptr_base = IR::ScalarReg::Max, + .dword_off = static_cast(sharp_dword_offset.ScalarReg())}; + } else if (M_COMPOSITECONSTRUCTU32X4( + M_READCONST(M_COMPOSITECONSTRUCTU32X2(M_GETUSERDATA(MatchImm(sharp_ptr_base)), + MatchIgnore()), + MatchImm(sharp_dword_offset)), + MatchIgnore(), MatchIgnore(), MatchIgnore()) + .Match(handle)) { + return TessSharpLocation{.ptr_base = sharp_ptr_base.ScalarReg(), + .dword_off = sharp_dword_offset.U32()}; + } + return {}; +} + +// Walker that helps deduce what type of attribute a DS instruction is reading +// or writing, which could be an input control point, output control point, +// or per-patch constant (PatchConst). 
+// For certain ReadConstBuffer instructions using the tess constants V#,, we visit the users +// recursively and increment a counter on the Load/WriteShared users. +// Namely NumPatch (from m_hsNumPatch), HsOutputBase (m_hsOutputBase), +// and PatchConstBase (m_patchConstBase). +// In addr calculations, the term NumPatch * ls_stride * #input_cp_in_patch +// is used as an addend to skip the region for input control points, and similarly +// NumPatch * hs_cp_stride * #output_cp_in_patch is used to skip the region +// for output control points. +// +// TODO: this will break if AMD compiler used distributive property like +// TcsNumPatches * (ls_stride * #input_cp_in_patch + hs_cp_stride * #output_cp_in_patch) +class TessConstantUseWalker { +public: + void MarkTessAttributeUsers(IR::Inst* read_const_buffer, TessConstantAttribute attr) { + u32 inc; + switch (attr) { + case TessConstantAttribute::HsNumPatch: + case TessConstantAttribute::HsOutputBase: + inc = 1; + break; + case TessConstantAttribute::PatchConstBase: + inc = 2; + break; + default: + UNREACHABLE(); + } + + for (IR::Use use : read_const_buffer->Uses()) { + MarkTessAttributeUsersHelper(use, inc); + } + + ++seq_num; + } + +private: + void MarkTessAttributeUsersHelper(IR::Use use, u32 inc) { + IR::Inst* inst = use.user; + + switch (use.user->GetOpcode()) { + case IR::Opcode::LoadSharedU32: + case IR::Opcode::LoadSharedU64: + case IR::Opcode::LoadSharedU128: + case IR::Opcode::WriteSharedU32: + case IR::Opcode::WriteSharedU64: + case IR::Opcode::WriteSharedU128: { + u32 counter = inst->Flags(); + inst->SetFlags(counter + inc); + // Stop here + return; + } + case IR::Opcode::Phi: { + struct PhiCounter { + u16 seq_num; + u8 unique_edge; + u8 counter; + }; + + PhiCounter count = inst->Flags(); + ASSERT_MSG(count.counter == 0 || count.unique_edge == use.operand); + // the point of seq_num is to tell us if we've already traversed this + // phi on the current walk. 
Alternatively we could keep a set of phi's + // seen on the current walk. This is to handle phi cycles + if (count.seq_num == 0) { + // First time we've encountered this phi + count.seq_num = seq_num; + // Mark the phi as having been traversed originally through this edge + count.unique_edge = use.operand; + count.counter = inc; + } else if (count.seq_num < seq_num) { + count.seq_num = seq_num; + // For now, assume we are visiting this phi via the same edge + // as on other walks. If not, some dataflow analysis might be necessary + ASSERT(count.unique_edge == use.operand); + count.counter += inc; + } else { + // count.seq_num == seq_num + // there's a cycle, and we've already been here on this walk + return; + } + inst->SetFlags(count); + break; + } + default: + break; + } + + for (IR::Use use : inst->Uses()) { + MarkTessAttributeUsersHelper(use, inc); + } + } + + u32 seq_num{1u}; +}; + +enum class AttributeRegion : u32 { InputCP, OutputCP, PatchConst }; + +static AttributeRegion GetAttributeRegionKind(IR::Inst* ring_access, const Shader::Info& info, + const Shader::RuntimeInfo& runtime_info) { + u32 count = ring_access->Flags(); + if (count == 0) { + return AttributeRegion::InputCP; + } else if (info.l_stage == LogicalStage::TessellationControl && + runtime_info.hs_info.IsPassthrough()) { + ASSERT(count <= 1); + return AttributeRegion::PatchConst; + } else { + ASSERT(count <= 2); + return AttributeRegion(count); + } +} + +static bool IsDivisibleByStride(IR::Value term, u32 stride) { + IR::Value a, b; + if (MatchU32(stride).Match(term)) { + return true; + } else if (M_BITFIELDUEXTRACT(MatchValue(a), MatchU32(0), MatchU32(24)).Match(term) || + M_BITFIELDSEXTRACT(MatchValue(a), MatchU32(0), MatchU32(24)).Match(term)) { + return IsDivisibleByStride(a, stride); + } else if (M_IMUL32(MatchValue(a), MatchValue(b)).Match(term)) { + return IsDivisibleByStride(a, stride) || IsDivisibleByStride(b, stride); + } + return false; +} + +// Return true if we can eliminate any 
addends +static bool TryOptimizeAddendInModulo(IR::Value addend, u32 stride, std::vector& addends) { + IR::Value a, b; + if (M_IADD32(MatchValue(a), MatchValue(b)).Match(addend)) { + bool ret = false; + ret = TryOptimizeAddendInModulo(a, stride, addends); + ret |= TryOptimizeAddendInModulo(b, stride, addends); + return ret; + } else if (!IsDivisibleByStride(addend, stride)) { + addends.push_back(IR::U32{addend}); + return false; + } else { + return true; + } +} + +// In calculation (a + b + ...) % stride +// Use this fact +// (a + b) mod N = (a mod N + b mod N) mod N +// If any addend is divisible by stride, then we can replace it with 0 in the attribute +// or component index calculation +static IR::U32 TryOptimizeAddressModulo(IR::U32 addr, u32 stride, IR::IREmitter& ir) { + std::vector addends; + if (TryOptimizeAddendInModulo(addr, stride, addends)) { + addr = ir.Imm32(0); + for (auto& addend : addends) { + addr = ir.IAdd(addr, addend); + } + } + return addr; +} + +// TODO: can optimize div in control point index similarly to mod + +// Read a TCS input (InputCP region) or TES input (OutputCP region) +static IR::F32 ReadTessInputComponent(IR::U32 addr, const u32 stride, IR::IREmitter& ir, + u32 off_dw) { + if (off_dw > 0) { + addr = ir.IAdd(addr, ir.Imm32(off_dw)); + } + const IR::U32 control_point_index = ir.IDiv(addr, ir.Imm32(stride)); + const IR::U32 addr_for_attrs = TryOptimizeAddressModulo(addr, stride, ir); + const IR::U32 attr_index = + ir.ShiftRightLogical(ir.IMod(addr_for_attrs, ir.Imm32(stride)), ir.Imm32(4u)); + const IR::U32 comp_index = + ir.ShiftRightLogical(ir.BitwiseAnd(addr_for_attrs, ir.Imm32(0xFU)), ir.Imm32(2u)); + return ir.GetTessGenericAttribute(control_point_index, attr_index, comp_index); +} + +} // namespace + +void HullShaderTransform(IR::Program& program, RuntimeInfo& runtime_info) { + const Info& info = program.info; + + for (IR::Block* block : program.blocks) { + for (IR::Inst& inst : block->Instructions()) { + const auto opcode = 
inst.GetOpcode(); + switch (opcode) { + case IR::Opcode::StoreBufferU32: + case IR::Opcode::StoreBufferU32x2: + case IR::Opcode::StoreBufferU32x3: + case IR::Opcode::StoreBufferU32x4: { + const auto info = inst.Flags(); + if (!info.globally_coherent) { + break; + } + IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)}; + const auto GetValue = [&](IR::Value data) -> IR::F32 { + if (auto* inst = data.TryInstRecursive(); + inst && inst->GetOpcode() == IR::Opcode::BitCastU32F32) { + return IR::F32{inst->Arg(0)}; + } + return ir.BitCast(IR::U32{data}); + }; + const u32 num_dwords = u32(opcode) - u32(IR::Opcode::StoreBufferU32) + 1; + IR::U32 index = IR::U32{inst.Arg(1)}; + ASSERT(index.IsImmediate()); + const u32 gcn_factor_idx = (info.inst_offset.Value() + index.U32()) >> 2; + + const IR::Value data = inst.Arg(2); + auto get_factor_attr = [&](u32 gcn_factor_idx) -> IR::Patch { + // The hull outputs tess factors in different formats depending on the shader. + // For triangle domains, it seems to pack the entries into 4 consecutive floats, + // with the 3 edge factors followed by the 1 interior factor. + // For quads, it does 4 edge factors then 2 interior. + // There is a tess factor stride member of the GNMX hull constants struct in + // a hull program shader binary archive, but this doesn't seem to be + // communicated to the driver. + // The layout seems to be implied by the type of the abstract domain. 
+ switch (runtime_info.hs_info.tess_type) { + case AmdGpu::TessellationType::Quad: + ASSERT(gcn_factor_idx < 6); + return IR::PatchFactor(gcn_factor_idx); + case AmdGpu::TessellationType::Triangle: + ASSERT(gcn_factor_idx < 4); + if (gcn_factor_idx == 3) { + return IR::Patch::TessellationLodInteriorU; + } + return IR::PatchFactor(gcn_factor_idx); + default: + // Point domain types haven't been seen so far + UNREACHABLE_MSG("Unhandled tess type"); + } + }; + + inst.Invalidate(); + if (num_dwords == 1) { + ir.SetPatch(get_factor_attr(gcn_factor_idx), GetValue(data)); + break; + } + auto* inst = data.TryInstRecursive(); + ASSERT(inst && (inst->GetOpcode() == IR::Opcode::CompositeConstructU32x2 || + inst->GetOpcode() == IR::Opcode::CompositeConstructU32x3 || + inst->GetOpcode() == IR::Opcode::CompositeConstructU32x4)); + for (s32 i = 0; i < num_dwords; i++) { + ir.SetPatch(get_factor_attr(gcn_factor_idx + i), GetValue(inst->Arg(i))); + } + break; + } + + case IR::Opcode::WriteSharedU32: + case IR::Opcode::WriteSharedU64: + case IR::Opcode::WriteSharedU128: { + IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)}; + const u32 num_dwords = opcode == IR::Opcode::WriteSharedU32 + ? 1 + : (opcode == IR::Opcode::WriteSharedU64 ? 
2 : 4); + const IR::U32 addr{inst.Arg(0)}; + const IR::U32 data{inst.Arg(1).Resolve()}; + + const auto SetOutput = [&](IR::U32 addr, IR::U32 value, AttributeRegion output_kind, + u32 off_dw) { + const IR::F32 data_component = ir.BitCast(value); + + if (output_kind == AttributeRegion::OutputCP) { + if (off_dw > 0) { + addr = ir.IAdd(addr, ir.Imm32(off_dw)); + } + u32 stride = runtime_info.hs_info.hs_output_cp_stride; + // Invocation ID array index is implicit, handled by SPIRV backend + const IR::U32 addr_for_attrs = TryOptimizeAddressModulo(addr, stride, ir); + const IR::U32 attr_index = ir.ShiftRightLogical( + ir.IMod(addr_for_attrs, ir.Imm32(stride)), ir.Imm32(4u)); + const IR::U32 comp_index = ir.ShiftRightLogical( + ir.BitwiseAnd(addr_for_attrs, ir.Imm32(0xFU)), ir.Imm32(2u)); + ir.SetTcsGenericAttribute(data_component, attr_index, comp_index); + } else { + ASSERT(output_kind == AttributeRegion::PatchConst); + ASSERT_MSG(addr.IsImmediate(), "patch addr non imm, inst {}", + fmt::ptr(addr.Inst())); + ir.SetPatch(IR::PatchGeneric((addr.U32() >> 2) + off_dw), data_component); + } + }; + + AttributeRegion region = GetAttributeRegionKind(&inst, info, runtime_info); + if (num_dwords == 1) { + SetOutput(addr, data, region, 0); + } else { + for (auto i = 0; i < num_dwords; i++) { + SetOutput(addr, IR::U32{data.Inst()->Arg(i)}, region, i); + } + } + inst.Invalidate(); + break; + } + + case IR::Opcode::LoadSharedU32: { + case IR::Opcode::LoadSharedU64: + case IR::Opcode::LoadSharedU128: + IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)}; + const IR::U32 addr{inst.Arg(0)}; + AttributeRegion region = GetAttributeRegionKind(&inst, info, runtime_info); + const u32 num_dwords = opcode == IR::Opcode::LoadSharedU32 + ? 1 + : (opcode == IR::Opcode::LoadSharedU64 ? 
2 : 4); + ASSERT_MSG(region == AttributeRegion::InputCP, + "Unhandled read of output or patchconst attribute in hull shader"); + IR::Value attr_read; + if (num_dwords == 1) { + attr_read = ir.BitCast( + ReadTessInputComponent(addr, runtime_info.hs_info.ls_stride, ir, 0)); + } else { + boost::container::static_vector read_components; + for (auto i = 0; i < num_dwords; i++) { + const IR::F32 component = + ReadTessInputComponent(addr, runtime_info.hs_info.ls_stride, ir, i); + read_components.push_back(ir.BitCast(component)); + } + attr_read = ir.CompositeConstruct(read_components); + } + inst.ReplaceUsesWithAndRemove(attr_read); + break; + } + + default: + break; + } + } + } + + if (runtime_info.hs_info.IsPassthrough()) { + // Copy input attributes to output attributes, indexed by InvocationID + // Passthrough should imply that input and output patches have same number of vertices + IR::Block* entry_block = *program.blocks.begin(); + auto it = std::ranges::find_if(entry_block->Instructions(), [](IR::Inst& inst) { + return inst.GetOpcode() == IR::Opcode::Prologue; + }); + ASSERT(it != entry_block->end()); + ++it; + ASSERT(it != entry_block->end()); + ++it; + // Prologue + // SetExec #true + // <- insert here + // ... 
+ IR::IREmitter ir{*entry_block, it}; + + ASSERT(runtime_info.hs_info.ls_stride % 16 == 0); + u32 num_attributes = runtime_info.hs_info.ls_stride / 16; + const auto invocation_id = ir.GetAttributeU32(IR::Attribute::InvocationId); + for (u32 attr_no = 0; attr_no < num_attributes; attr_no++) { + for (u32 comp = 0; comp < 4; comp++) { + IR::F32 attr_read = + ir.GetTessGenericAttribute(invocation_id, ir.Imm32(attr_no), ir.Imm32(comp)); + // InvocationId is implicit index for output control point writes + ir.SetTcsGenericAttribute(attr_read, ir.Imm32(attr_no), ir.Imm32(comp)); + } + } + // We could wrap the rest of the program in an if stmt + // CopyInputAttrsToOutputs(); // psuedocode + // if (InvocationId == 0) { + // PatchConstFunction(); + // } + // But as long as we treat invocation ID as 0 for all threads, shouldn't matter functionally + } +} + +void DomainShaderTransform(IR::Program& program, RuntimeInfo& runtime_info) { + Info& info = program.info; + + for (IR::Block* block : program.blocks) { + for (IR::Inst& inst : block->Instructions()) { + IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)}; + const auto opcode = inst.GetOpcode(); + switch (inst.GetOpcode()) { + case IR::Opcode::LoadSharedU32: { + case IR::Opcode::LoadSharedU64: + case IR::Opcode::LoadSharedU128: + const IR::U32 addr{inst.Arg(0)}; + AttributeRegion region = GetAttributeRegionKind(&inst, info, runtime_info); + const u32 num_dwords = opcode == IR::Opcode::LoadSharedU32 + ? 1 + : (opcode == IR::Opcode::LoadSharedU64 ? 
2 : 4); + const auto GetInput = [&](IR::U32 addr, u32 off_dw) -> IR::F32 { + if (region == AttributeRegion::OutputCP) { + return ReadTessInputComponent( + addr, runtime_info.vs_info.hs_output_cp_stride, ir, off_dw); + } else { + ASSERT(region == AttributeRegion::PatchConst); + return ir.GetPatch(IR::PatchGeneric((addr.U32() >> 2) + off_dw)); + } + }; + IR::Value attr_read; + if (num_dwords == 1) { + attr_read = ir.BitCast(GetInput(addr, 0)); + } else { + boost::container::static_vector read_components; + for (auto i = 0; i < num_dwords; i++) { + const IR::F32 component = GetInput(addr, i); + read_components.push_back(ir.BitCast(component)); + } + attr_read = ir.CompositeConstruct(read_components); + } + inst.ReplaceUsesWithAndRemove(attr_read); + break; + } + default: + break; + } + } + } +} + +// Run before either hull or domain transform +void TessellationPreprocess(IR::Program& program, RuntimeInfo& runtime_info) { + TessellationDataConstantBuffer tess_constants; + Shader::Info& info = program.info; + // Find the TessellationDataConstantBuffer V# + for (IR::Block* block : program.blocks) { + for (IR::Inst& inst : block->Instructions()) { + auto found_tess_consts_sharp = [&]() -> bool { + switch (inst.GetOpcode()) { + case IR::Opcode::LoadSharedU32: + case IR::Opcode::LoadSharedU64: + case IR::Opcode::LoadSharedU128: + case IR::Opcode::WriteSharedU32: + case IR::Opcode::WriteSharedU64: + case IR::Opcode::WriteSharedU128: { + IR::Value addr = inst.Arg(0); + auto read_const_buffer = IR::BreadthFirstSearch( + addr, [](IR::Inst* maybe_tess_const) -> std::optional { + if (maybe_tess_const->GetOpcode() == IR::Opcode::ReadConstBuffer) { + return maybe_tess_const; + } + return std::nullopt; + }); + if (read_const_buffer) { + auto sharp_location = FindTessConstantSharp(read_const_buffer.value()); + if (sharp_location) { + if (info.tess_consts_dword_offset >= 0) { + // Its possible theres a readconstbuffer that contributes to an + // LDS address and isnt a TessConstant V# 
read. Could improve on + // this somehow + ASSERT_MSG(static_cast(sharp_location->dword_off) == + info.tess_consts_dword_offset && + sharp_location->ptr_base == + info.tess_consts_ptr_base, + "TessConstants V# is ambiguous"); + } + InitTessConstants(sharp_location->ptr_base, + static_cast(sharp_location->dword_off), info, + runtime_info, tess_constants); + return true; + } + UNREACHABLE_MSG("Failed to match tess constant sharp"); + } + return false; + } + default: + return false; + } + }(); + + if (found_tess_consts_sharp) { + break; + } + } + } + + ASSERT(info.tess_consts_dword_offset >= 0); + + TessConstantUseWalker walker; + + for (IR::Block* block : program.blocks) { + for (IR::Inst& inst : block->Instructions()) { + if (inst.GetOpcode() == IR::Opcode::ReadConstBuffer) { + auto sharp_location = FindTessConstantSharp(&inst); + if (sharp_location && sharp_location->ptr_base == info.tess_consts_ptr_base && + sharp_location->dword_off == info.tess_consts_dword_offset) { + // The shader is reading from the TessConstants V# + IR::Value index = inst.Arg(1); + + ASSERT_MSG(index.IsImmediate(), + "Tessellation constant read with dynamic index"); + u32 off_dw = index.U32(); + ASSERT(off_dw <= + static_cast(TessConstantAttribute::FirstEdgeTessFactorIndex)); + + auto tess_const_attr = static_cast(off_dw); + switch (tess_const_attr) { + case TessConstantAttribute::LsStride: + // If not, we may need to make this runtime state for TES + ASSERT(info.l_stage == LogicalStage::TessellationControl); + inst.ReplaceUsesWithAndRemove(IR::Value(tess_constants.ls_stride)); + break; + case TessConstantAttribute::HsCpStride: + inst.ReplaceUsesWithAndRemove(IR::Value(tess_constants.hs_cp_stride)); + break; + case TessConstantAttribute::HsNumPatch: + case TessConstantAttribute::HsOutputBase: + case TessConstantAttribute::PatchConstBase: + walker.MarkTessAttributeUsers(&inst, tess_const_attr); + // We should be able to safely set these to 0 so that indexing happens only + // within the 
local patch in the recompiled Vulkan shader. This assumes + // these values only contribute to address calculations for in/out + // attributes in the original gcn shader. + // See the explanation for why we set V2 to 0 when emitting the prologue. + inst.ReplaceUsesWithAndRemove(IR::Value(0u)); + break; + case Shader::TessConstantAttribute::PatchConstSize: + case Shader::TessConstantAttribute::PatchOutputSize: + case Shader::TessConstantAttribute::OffChipTessellationFactorThreshold: + case Shader::TessConstantAttribute::FirstEdgeTessFactorIndex: + // May need to replace PatchConstSize and PatchOutputSize with 0 + break; + default: + UNREACHABLE_MSG("Read past end of TessConstantsBuffer"); + } + } + } + } + } + + // These pattern matching are neccessary for now unless we support dynamic indexing of + // PatchConst attributes and tess factors. PatchConst should be easy, turn those into a single + // vec4 array like in/out attrs. Not sure about tess factors. + if (info.l_stage == LogicalStage::TessellationControl) { + // Replace the BFEs on V1 (packed with patch id within VGT and output cp id) + for (IR::Block* block : program.blocks) { + for (auto it = block->Instructions().begin(); it != block->Instructions().end(); it++) { + IR::Inst& inst = *it; + if (M_BITFIELDUEXTRACT( + M_GETATTRIBUTEU32(MatchAttribute(IR::Attribute::PackedHullInvocationInfo), + MatchIgnore()), + MatchU32(0), MatchU32(8)) + .Match(IR::Value{&inst})) { + IR::IREmitter emit(*block, it); + // This is the patch id within the VGT, not the actual PrimitiveId + // in the draw + IR::Value replacement(0u); + inst.ReplaceUsesWithAndRemove(replacement); + } else if (M_BITFIELDUEXTRACT( + M_GETATTRIBUTEU32( + MatchAttribute(IR::Attribute::PackedHullInvocationInfo), + MatchIgnore()), + MatchU32(8), MatchU32(5)) + .Match(IR::Value{&inst})) { + IR::IREmitter ir(*block, it); + IR::Value replacement; + if (runtime_info.hs_info.IsPassthrough()) { + // Deal with annoying pattern in BB where InvocationID use makes 
no + // sense (in addr calculation for patchconst or tess factor write) + replacement = ir.Imm32(0); + } else { + replacement = ir.GetAttributeU32(IR::Attribute::InvocationId); + } + inst.ReplaceUsesWithAndRemove(replacement); + } + } + } + } +} + +} // namespace Shader::Optimization diff --git a/src/shader_recompiler/ir/passes/ir_passes.h b/src/shader_recompiler/ir/passes/ir_passes.h index 7bd47992..61f43e7e 100644 --- a/src/shader_recompiler/ir/passes/ir_passes.h +++ b/src/shader_recompiler/ir/passes/ir_passes.h @@ -18,5 +18,8 @@ void CollectShaderInfoPass(IR::Program& program); void LowerSharedMemToRegisters(IR::Program& program); void RingAccessElimination(const IR::Program& program, const RuntimeInfo& runtime_info, Stage stage); +void TessellationPreprocess(IR::Program& program, RuntimeInfo& runtime_info); +void HullShaderTransform(IR::Program& program, RuntimeInfo& runtime_info); +void DomainShaderTransform(IR::Program& program, RuntimeInfo& runtime_info); } // namespace Shader::Optimization diff --git a/src/shader_recompiler/ir/passes/ring_access_elimination.cpp b/src/shader_recompiler/ir/passes/ring_access_elimination.cpp index eb1be296..d6f1efb1 100644 --- a/src/shader_recompiler/ir/passes/ring_access_elimination.cpp +++ b/src/shader_recompiler/ir/passes/ring_access_elimination.cpp @@ -1,11 +1,13 @@ // SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later +#include "common/assert.h" #include "shader_recompiler/ir/ir_emitter.h" #include "shader_recompiler/ir/opcodes.h" #include "shader_recompiler/ir/program.h" #include "shader_recompiler/ir/reg.h" #include "shader_recompiler/recompiler.h" +#include "shader_recompiler/runtime_info.h" namespace Shader::Optimization { @@ -23,12 +25,45 @@ void RingAccessElimination(const IR::Program& program, const RuntimeInfo& runtim }; switch (stage) { + case Stage::Local: { + ForEachInstruction([=](IR::IREmitter& ir, IR::Inst& inst) { + const auto opcode = 
inst.GetOpcode(); + switch (opcode) { + case IR::Opcode::WriteSharedU64: + case IR::Opcode::WriteSharedU32: { + bool is_composite = opcode == IR::Opcode::WriteSharedU64; + u32 num_components = opcode == IR::Opcode::WriteSharedU32 ? 1 : 2; + + u32 offset = 0; + const auto* addr = inst.Arg(0).InstRecursive(); + if (addr->GetOpcode() == IR::Opcode::IAdd32) { + ASSERT(addr->Arg(1).IsImmediate()); + offset = addr->Arg(1).U32(); + } + IR::Value data = inst.Arg(1).Resolve(); + for (s32 i = 0; i < num_components; i++) { + const auto attrib = IR::Attribute::Param0 + (offset / 16); + const auto comp = (offset / 4) % 4; + const IR::U32 value = IR::U32{is_composite ? data.Inst()->Arg(i) : data}; + ir.SetAttribute(attrib, ir.BitCast(value), comp); + offset += 4; + } + inst.Invalidate(); + break; + } + default: + break; + } + }); + break; + } case Stage::Export: { ForEachInstruction([=](IR::IREmitter& ir, IR::Inst& inst) { const auto opcode = inst.GetOpcode(); switch (opcode) { case IR::Opcode::StoreBufferU32: { - if (!inst.Flags().ring_access) { + const auto info = inst.Flags(); + if (!info.system_coherent || !info.globally_coherent) { break; } @@ -61,12 +96,13 @@ void RingAccessElimination(const IR::Program& program, const RuntimeInfo& runtim const auto opcode = inst.GetOpcode(); switch (opcode) { case IR::Opcode::LoadBufferU32: { - if (!inst.Flags().ring_access) { + const auto info = inst.Flags(); + if (!info.system_coherent || !info.globally_coherent) { break; } const auto shl_inst = inst.Arg(1).TryInstRecursive(); - const auto vertex_id = shl_inst->Arg(0).Resolve().U32() >> 2; + const auto vertex_id = ir.Imm32(shl_inst->Arg(0).Resolve().U32() >> 2); const auto offset = inst.Arg(1).TryInstRecursive()->Arg(1); const auto bucket = offset.Resolve().U32() / 256u; const auto attrib = bucket < 4 ? 
IR::Attribute::Position0 @@ -80,7 +116,8 @@ void RingAccessElimination(const IR::Program& program, const RuntimeInfo& runtim break; } case IR::Opcode::StoreBufferU32: { - if (!inst.Flags().ring_access) { + const auto buffer_info = inst.Flags(); + if (!buffer_info.system_coherent || !buffer_info.globally_coherent) { break; } diff --git a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp index 8b93d72e..c34b59b8 100644 --- a/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp +++ b/src/shader_recompiler/ir/passes/shader_info_collection_pass.cpp @@ -17,6 +17,22 @@ void Visit(Info& info, IR::Inst& inst) { case IR::Opcode::GetUserData: info.ud_mask.Set(inst.Arg(0).ScalarReg()); break; + case IR::Opcode::SetPatch: { + const auto patch = inst.Arg(0).Patch(); + if (patch <= IR::Patch::TessellationLodBottom) { + info.stores_tess_level_outer = true; + } else if (patch <= IR::Patch::TessellationLodInteriorV) { + info.stores_tess_level_inner = true; + } else { + info.uses_patches |= 1U << IR::GenericPatchIndex(patch); + } + break; + } + case IR::Opcode::GetPatch: { + const auto patch = inst.Arg(0).Patch(); + info.uses_patches |= 1U << IR::GenericPatchIndex(patch); + break; + } case IR::Opcode::LoadSharedU32: case IR::Opcode::LoadSharedU64: case IR::Opcode::WriteSharedU32: diff --git a/src/shader_recompiler/ir/patch.cpp b/src/shader_recompiler/ir/patch.cpp new file mode 100644 index 00000000..2485bc5b --- /dev/null +++ b/src/shader_recompiler/ir/patch.cpp @@ -0,0 +1,28 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "shader_recompiler/ir/patch.h" + +namespace Shader::IR { + +std::string NameOf(Patch patch) { + switch (patch) { + case Patch::TessellationLodLeft: + return "TessellationLodLeft"; + case Patch::TessellationLodTop: + return "TessellationLodTop"; + case Patch::TessellationLodRight: + return 
"TessellationLodRight"; + case Patch::TessellationLodBottom: + return "TessellationLodBottom"; + case Patch::TessellationLodInteriorU: + return "TessellationLodInteriorU"; + case Patch::TessellationLodInteriorV: + return "TessellationLodInteriorV"; + default: + const u32 index = u32(patch) - u32(Patch::Component0); + return fmt::format("Component{}", index); + } +} + +} // namespace Shader::IR diff --git a/src/shader_recompiler/ir/patch.h b/src/shader_recompiler/ir/patch.h new file mode 100644 index 00000000..65d2192e --- /dev/null +++ b/src/shader_recompiler/ir/patch.h @@ -0,0 +1,173 @@ +// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include +#include "common/types.h" + +namespace Shader::IR { + +enum class Patch : u64 { + TessellationLodLeft, + TessellationLodTop, + TessellationLodRight, + TessellationLodBottom, + TessellationLodInteriorU, + TessellationLodInteriorV, + Component0, + Component1, + Component2, + Component3, + Component4, + Component5, + Component6, + Component7, + Component8, + Component9, + Component10, + Component11, + Component12, + Component13, + Component14, + Component15, + Component16, + Component17, + Component18, + Component19, + Component20, + Component21, + Component22, + Component23, + Component24, + Component25, + Component26, + Component27, + Component28, + Component29, + Component30, + Component31, + Component32, + Component33, + Component34, + Component35, + Component36, + Component37, + Component38, + Component39, + Component40, + Component41, + Component42, + Component43, + Component44, + Component45, + Component46, + Component47, + Component48, + Component49, + Component50, + Component51, + Component52, + Component53, + Component54, + Component55, + Component56, + Component57, + Component58, + Component59, + Component60, + Component61, + Component62, + Component63, + Component64, + Component65, + Component66, + Component67, + Component68, + 
Component69, + Component70, + Component71, + Component72, + Component73, + Component74, + Component75, + Component76, + Component77, + Component78, + Component79, + Component80, + Component81, + Component82, + Component83, + Component84, + Component85, + Component86, + Component87, + Component88, + Component89, + Component90, + Component91, + Component92, + Component93, + Component94, + Component95, + Component96, + Component97, + Component98, + Component99, + Component100, + Component101, + Component102, + Component103, + Component104, + Component105, + Component106, + Component107, + Component108, + Component109, + Component110, + Component111, + Component112, + Component113, + Component114, + Component115, + Component116, + Component117, + Component118, + Component119, +}; +static_assert(static_cast(Patch::Component119) == 125); + +constexpr bool IsGeneric(Patch patch) noexcept { + return patch >= Patch::Component0 && patch <= Patch::Component119; +} + +constexpr Patch PatchFactor(u32 index) { + return static_cast(index); +} + +constexpr Patch PatchGeneric(u32 index) { + return static_cast(static_cast(Patch::Component0) + index); +} + +constexpr u32 GenericPatchIndex(Patch patch) { + return (static_cast(patch) - static_cast(Patch::Component0)) / 4; +} + +constexpr u32 GenericPatchElement(Patch patch) { + return (static_cast(patch) - static_cast(Patch::Component0)) % 4; +} + +[[nodiscard]] std::string NameOf(Patch patch); + +} // namespace Shader::IR + +template <> +struct fmt::formatter { + constexpr auto parse(format_parse_context& ctx) { + return ctx.begin(); + } + auto format(const Shader::IR::Patch patch, format_context& ctx) const { + return fmt::format_to(ctx.out(), "{}", Shader::IR::NameOf(patch)); + } +}; diff --git a/src/shader_recompiler/ir/pattern_matching.h b/src/shader_recompiler/ir/pattern_matching.h new file mode 100644 index 00000000..1279f14c --- /dev/null +++ b/src/shader_recompiler/ir/pattern_matching.h @@ -0,0 +1,127 @@ +// 
SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "shader_recompiler/ir/attribute.h" +#include "shader_recompiler/ir/value.h" + +namespace Shader::Optimiation::PatternMatching { + +// Attempt at pattern matching for Insts and Values +// Needs improvement, mostly a convenience + +template +struct MatchObject { + inline bool Match(IR::Value v) { + return static_cast(this)->Match(v); + } +}; + +struct MatchValue : MatchObject { + MatchValue(IR::Value& return_val_) : return_val(return_val_) {} + + inline bool Match(IR::Value v) { + return_val = v; + return true; + } + +private: + IR::Value& return_val; +}; + +struct MatchIgnore : MatchObject { + MatchIgnore() {} + + inline bool Match(IR::Value v) { + return true; + } +}; + +struct MatchImm : MatchObject { + MatchImm(IR::Value& v) : return_val(v) {} + + inline bool Match(IR::Value v) { + if (!v.IsImmediate()) { + return false; + } + + return_val = v; + return true; + } + +private: + IR::Value& return_val; +}; + +struct MatchAttribute : MatchObject { + MatchAttribute(IR::Attribute attribute_) : attribute(attribute_) {} + + inline bool Match(IR::Value v) { + return v.Type() == IR::Type::Attribute && v.Attribute() == attribute; + } + +private: + IR::Attribute attribute; +}; + +struct MatchU32 : MatchObject { + MatchU32(u32 imm_) : imm(imm_) {} + + inline bool Match(IR::Value v) { + return v.IsImmediate() && v.Type() == IR::Type::U32 && v.U32() == imm; + } + +private: + u32 imm; +}; + +template +struct MatchInstObject : MatchObject> { + static_assert(sizeof...(Args) == IR::NumArgsOf(opcode)); + MatchInstObject(Args&&... 
args) : pattern(std::forward_as_tuple(args...)) {} + + inline bool Match(IR::Value v) { + IR::Inst* inst = v.TryInstRecursive(); + if (!inst || inst->GetOpcode() != opcode) { + return false; + } + + bool matched = true; + + [&](std::index_sequence) { + ((matched = matched && std::get(pattern).Match(inst->Arg(Is))), ...); + }(std::make_index_sequence{}); + + return matched; + } + +private: + using MatchArgs = std::tuple; + MatchArgs pattern; +}; + +template +inline auto MakeInstPattern(Args&&... args) { + return MatchInstObject(std::forward(args)...); +} + +// Conveniences. TODO probably simpler way of doing this +#define M_READCONST(...) MakeInstPattern(__VA_ARGS__) +#define M_GETUSERDATA(...) MakeInstPattern(__VA_ARGS__) +#define M_BITFIELDUEXTRACT(...) MakeInstPattern(__VA_ARGS__) +#define M_BITFIELDSEXTRACT(...) MakeInstPattern(__VA_ARGS__) +#define M_GETATTRIBUTEU32(...) MakeInstPattern(__VA_ARGS__) +#define M_UMOD32(...) MakeInstPattern(__VA_ARGS__) +#define M_SHIFTRIGHTLOGICAL32(...) MakeInstPattern(__VA_ARGS__) +#define M_IADD32(...) MakeInstPattern(__VA_ARGS__) +#define M_IMUL32(...) MakeInstPattern(__VA_ARGS__) +#define M_BITWISEAND32(...) MakeInstPattern(__VA_ARGS__) +#define M_GETTESSGENERICATTRIBUTE(...) \ + MakeInstPattern(__VA_ARGS__) +#define M_SETTCSGENERICATTRIBUTE(...) \ + MakeInstPattern(__VA_ARGS__) +#define M_COMPOSITECONSTRUCTU32X2(...) \ + MakeInstPattern(__VA_ARGS__) +#define M_COMPOSITECONSTRUCTU32X4(...) 
\ + MakeInstPattern(__VA_ARGS__) + +} // namespace Shader::Optimiation::PatternMatching \ No newline at end of file diff --git a/src/shader_recompiler/ir/reg.h b/src/shader_recompiler/ir/reg.h index ca2e9ceb..19e0da3d 100644 --- a/src/shader_recompiler/ir/reg.h +++ b/src/shader_recompiler/ir/reg.h @@ -49,7 +49,8 @@ union BufferInstInfo { BitField<0, 1, u32> index_enable; BitField<1, 1, u32> offset_enable; BitField<2, 12, u32> inst_offset; - BitField<14, 1, u32> ring_access; // global + system coherency + BitField<14, 1, u32> system_coherent; + BitField<15, 1, u32> globally_coherent; }; enum class ScalarReg : u32 { diff --git a/src/shader_recompiler/ir/type.h b/src/shader_recompiler/ir/type.h index ec855a77..0f043fb6 100644 --- a/src/shader_recompiler/ir/type.h +++ b/src/shader_recompiler/ir/type.h @@ -15,7 +15,7 @@ enum class Type { ScalarReg = 1 << 1, VectorReg = 1 << 2, Attribute = 1 << 3, - SystemValue = 1 << 4, + Patch = 1 << 4, U1 = 1 << 5, U8 = 1 << 6, U16 = 1 << 7, diff --git a/src/shader_recompiler/ir/value.cpp b/src/shader_recompiler/ir/value.cpp index 889e9955..8826b80f 100644 --- a/src/shader_recompiler/ir/value.cpp +++ b/src/shader_recompiler/ir/value.cpp @@ -16,6 +16,8 @@ Value::Value(IR::VectorReg reg) noexcept : type{Type::VectorReg}, vreg{reg} {} Value::Value(IR::Attribute value) noexcept : type{Type::Attribute}, attribute{value} {} +Value::Value(IR::Patch patch) noexcept : type{Type::Patch}, patch{patch} {} + Value::Value(bool value) noexcept : type{Type::U1}, imm_u1{value} {} Value::Value(u8 value) noexcept : type{Type::U8}, imm_u8{value} {} diff --git a/src/shader_recompiler/ir/value.h b/src/shader_recompiler/ir/value.h index dbe8b5cc..ed1e5536 100644 --- a/src/shader_recompiler/ir/value.h +++ b/src/shader_recompiler/ir/value.h @@ -16,6 +16,7 @@ #include "shader_recompiler/exception.h" #include "shader_recompiler/ir/attribute.h" #include "shader_recompiler/ir/opcodes.h" +#include "shader_recompiler/ir/patch.h" #include 
"shader_recompiler/ir/reg.h" #include "shader_recompiler/ir/type.h" @@ -34,6 +35,7 @@ public: explicit Value(IR::ScalarReg reg) noexcept; explicit Value(IR::VectorReg reg) noexcept; explicit Value(IR::Attribute value) noexcept; + explicit Value(IR::Patch patch) noexcept; explicit Value(bool value) noexcept; explicit Value(u8 value) noexcept; explicit Value(u16 value) noexcept; @@ -56,6 +58,7 @@ public: [[nodiscard]] IR::ScalarReg ScalarReg() const; [[nodiscard]] IR::VectorReg VectorReg() const; [[nodiscard]] IR::Attribute Attribute() const; + [[nodiscard]] IR::Patch Patch() const; [[nodiscard]] bool U1() const; [[nodiscard]] u8 U8() const; [[nodiscard]] u16 U16() const; @@ -75,6 +78,7 @@ private: IR::ScalarReg sreg; IR::VectorReg vreg; IR::Attribute attribute; + IR::Patch patch; bool imm_u1; u8 imm_u8; u16 imm_u16; @@ -330,6 +334,11 @@ inline IR::Attribute Value::Attribute() const { return attribute; } +inline IR::Patch Value::Patch() const { + DEBUG_ASSERT(type == Type::Patch); + return patch; +} + inline bool Value::U1() const { if (IsIdentity()) { return inst->Arg(0).U1(); diff --git a/src/shader_recompiler/recompiler.cpp b/src/shader_recompiler/recompiler.cpp index 64f842c4..ad57adb6 100644 --- a/src/shader_recompiler/recompiler.cpp +++ b/src/shader_recompiler/recompiler.cpp @@ -1,6 +1,9 @@ // SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later +#include "common/config.h" +#include "common/io_file.h" +#include "common/path_util.h" #include "shader_recompiler/frontend/control_flow_graph.h" #include "shader_recompiler/frontend/decode.h" #include "shader_recompiler/frontend/structured_control_flow.h" @@ -29,7 +32,7 @@ IR::BlockList GenerateBlocks(const IR::AbstractSyntaxList& syntax_list) { } IR::Program TranslateProgram(std::span code, Pools& pools, Info& info, - const RuntimeInfo& runtime_info, const Profile& profile) { + RuntimeInfo& runtime_info, const Profile& profile) { // Ensure first instruction is 
expected. constexpr u32 token_mov_vcchi = 0xBEEB03FF; if (code[0] != token_mov_vcchi) { @@ -60,12 +63,29 @@ IR::Program TranslateProgram(std::span code, Pools& pools, Info& info program.post_order_blocks = Shader::IR::PostOrder(program.syntax_list.front()); // Run optimization passes + const auto stage = program.info.stage; + Shader::Optimization::SsaRewritePass(program.post_order_blocks); + Shader::Optimization::IdentityRemovalPass(program.blocks); + if (info.l_stage == LogicalStage::TessellationControl) { + // Tess passes require previous const prop passes for now (for simplicity). TODO allow + // fine grained folding or opportunistic folding we set an operand to an immediate + Shader::Optimization::ConstantPropagationPass(program.post_order_blocks); + Shader::Optimization::TessellationPreprocess(program, runtime_info); + Shader::Optimization::ConstantPropagationPass(program.post_order_blocks); + Shader::Optimization::HullShaderTransform(program, runtime_info); + } else if (info.l_stage == LogicalStage::TessellationEval) { + Shader::Optimization::ConstantPropagationPass(program.post_order_blocks); + Shader::Optimization::TessellationPreprocess(program, runtime_info); + Shader::Optimization::ConstantPropagationPass(program.post_order_blocks); + Shader::Optimization::DomainShaderTransform(program, runtime_info); + } Shader::Optimization::ConstantPropagationPass(program.post_order_blocks); - if (program.info.stage != Stage::Compute) { + Shader::Optimization::RingAccessElimination(program, runtime_info, stage); + if (stage != Stage::Compute) { Shader::Optimization::LowerSharedMemToRegisters(program); } - Shader::Optimization::RingAccessElimination(program, runtime_info, program.info.stage); + Shader::Optimization::ConstantPropagationPass(program.post_order_blocks); Shader::Optimization::FlattenExtendedUserdataPass(program); Shader::Optimization::ResourceTrackingPass(program); Shader::Optimization::IdentityRemovalPass(program.blocks); diff --git 
a/src/shader_recompiler/recompiler.h b/src/shader_recompiler/recompiler.h index f8acf6c9..8180c29b 100644 --- a/src/shader_recompiler/recompiler.h +++ b/src/shader_recompiler/recompiler.h @@ -28,6 +28,6 @@ struct Pools { }; [[nodiscard]] IR::Program TranslateProgram(std::span code, Pools& pools, Info& info, - const RuntimeInfo& runtime_info, const Profile& profile); + RuntimeInfo& runtime_info, const Profile& profile); } // namespace Shader diff --git a/src/shader_recompiler/runtime_info.h b/src/shader_recompiler/runtime_info.h index 4c779a36..23e23c11 100644 --- a/src/shader_recompiler/runtime_info.h +++ b/src/shader_recompiler/runtime_info.h @@ -7,6 +7,7 @@ #include #include #include "common/types.h" +#include "shader_recompiler/frontend/tessellation.h" #include "video_core/amdgpu/liverpool.h" #include "video_core/amdgpu/types.h" @@ -21,12 +22,31 @@ enum class Stage : u32 { Local, Compute, }; -constexpr u32 MaxStageTypes = 7; + +// Vertex intentionally comes after TCS/TES due to order of compilation +enum class LogicalStage : u32 { + Fragment, + TessellationControl, + TessellationEval, + Vertex, + Geometry, + Compute, + NumLogicalStages +}; + +constexpr u32 MaxStageTypes = static_cast(LogicalStage::NumLogicalStages); [[nodiscard]] constexpr Stage StageFromIndex(size_t index) noexcept { return static_cast(index); } +struct LocalRuntimeInfo { + u32 ls_stride; + bool links_with_tcs; + + auto operator<=>(const LocalRuntimeInfo&) const noexcept = default; +}; + struct ExportRuntimeInfo { u32 vertex_data_size; @@ -64,9 +84,57 @@ struct VertexRuntimeInfo { u32 num_outputs; std::array outputs; bool emulate_depth_negative_one_to_one{}; + // Domain + AmdGpu::TessellationType tess_type; + AmdGpu::TessellationTopology tess_topology; + AmdGpu::TessellationPartitioning tess_partitioning; + u32 hs_output_cp_stride{}; bool operator==(const VertexRuntimeInfo& other) const noexcept { - return emulate_depth_negative_one_to_one == other.emulate_depth_negative_one_to_one; + return 
emulate_depth_negative_one_to_one == other.emulate_depth_negative_one_to_one && + tess_type == other.tess_type && tess_topology == other.tess_topology && + tess_partitioning == other.tess_partitioning && + hs_output_cp_stride == other.hs_output_cp_stride; + } + + void InitFromTessConstants(Shader::TessellationDataConstantBuffer& tess_constants) { + hs_output_cp_stride = tess_constants.hs_cp_stride; + } +}; + +struct HullRuntimeInfo { + // from registers + u32 num_input_control_points; + u32 num_threads; + AmdGpu::TessellationType tess_type; + + // from tess constants buffer + u32 ls_stride; + u32 hs_output_cp_stride; + u32 hs_output_base; + + auto operator<=>(const HullRuntimeInfo&) const noexcept = default; + + // It might be possible for a non-passthrough TCS to have these conditions, in some + // dumb situation. + // In that case, it should be fine to assume passthrough and declare some extra + // output control points and attributes that shouldnt be read by the TES anyways + bool IsPassthrough() const { + return hs_output_base == 0 && ls_stride == hs_output_cp_stride && num_threads == 1; + }; + + // regs.ls_hs_config.hs_output_control_points contains the number of threads, which + // isn't exactly the number of output control points. + // For passthrough shaders, the register field is set to 1, so use the number of + // input control points + u32 NumOutputControlPoints() const { + return IsPassthrough() ? 
num_input_control_points : num_threads; + } + + void InitFromTessConstants(Shader::TessellationDataConstantBuffer& tess_constants) { + ls_stride = tess_constants.ls_stride; + hs_output_cp_stride = tess_constants.hs_cp_stride; + hs_output_base = tess_constants.hs_output_base; } }; @@ -150,8 +218,10 @@ struct RuntimeInfo { AmdGpu::FpDenormMode fp_denorm_mode32; AmdGpu::FpRoundMode fp_round_mode32; union { + LocalRuntimeInfo ls_info; ExportRuntimeInfo es_info; VertexRuntimeInfo vs_info; + HullRuntimeInfo hs_info; GeometryRuntimeInfo gs_info; FragmentRuntimeInfo fs_info; ComputeRuntimeInfo cs_info; @@ -174,6 +244,10 @@ struct RuntimeInfo { return es_info == other.es_info; case Stage::Geometry: return gs_info == other.gs_info; + case Stage::Hull: + return hs_info == other.hs_info; + case Stage::Local: + return ls_info == other.ls_info; default: return true; } diff --git a/src/shader_recompiler/specialization.h b/src/shader_recompiler/specialization.h index 9b5dd8fa..5799c4c9 100644 --- a/src/shader_recompiler/specialization.h +++ b/src/shader_recompiler/specialization.h @@ -127,6 +127,18 @@ struct StageSpecialization { [](auto& spec, const auto& desc, AmdGpu::Sampler sharp) { spec.force_unnormalized = sharp.force_unnormalized; }); + + // Initialize runtime_info fields that rely on analysis in tessellation passes + if (info->l_stage == LogicalStage::TessellationControl || + info->l_stage == LogicalStage::TessellationEval) { + Shader::TessellationDataConstantBuffer tess_constants; + info->ReadTessConstantBuffer(tess_constants); + if (info->l_stage == LogicalStage::TessellationControl) { + runtime_info.hs_info.InitFromTessConstants(tess_constants); + } else { + runtime_info.vs_info.InitFromTessConstants(tess_constants); + } + } } void ForEachSharp(auto& spec_list, auto& desc_list, auto&& func) { diff --git a/src/video_core/amdgpu/liverpool.h b/src/video_core/amdgpu/liverpool.h index 9bc3454d..b6172d37 100644 --- a/src/video_core/amdgpu/liverpool.h +++ 
b/src/video_core/amdgpu/liverpool.h @@ -143,6 +143,13 @@ struct Liverpool { } }; + struct HsTessFactorClamp { + // I've only seen min=0.0, max=1.0 so far. + // TODO why is max set to 1.0? Makes no sense + float hs_max_tess; + float hs_min_tess; + }; + struct ComputeProgram { u32 dispatch_initiator; u32 dim_x; @@ -956,6 +963,7 @@ struct Liverpool { enum VgtStages : u32 { Vs = 0u, // always enabled EsGs = 0xB0u, + LsHs = 0x45u, }; VgtStages raw; @@ -963,7 +971,8 @@ struct Liverpool { BitField<2, 1, u32> hs_en; BitField<3, 2, u32> es_en; BitField<5, 1, u32> gs_en; - BitField<6, 1, u32> vs_en; + BitField<6, 2, u32> vs_en; + BitField<8, 1, u32> dynamic_hs; bool IsStageEnabled(u32 stage) const { switch (stage) { @@ -1059,6 +1068,28 @@ struct Liverpool { }; }; + union LsHsConfig { + u32 raw; + BitField<0, 8, u32> num_patches; + BitField<8, 6, u32> hs_input_control_points; + BitField<14, 6, u32> hs_output_control_points; + }; + + union TessellationConfig { + u32 raw; + BitField<0, 2, TessellationType> type; + BitField<2, 3, TessellationPartitioning> partitioning; + BitField<5, 3, TessellationTopology> topology; + }; + + union TessFactorMemoryBase { + u32 base; + + u64 MemoryBase() const { + return static_cast(base) << 8; + } + }; + union Eqaa { u32 raw; BitField<0, 1, u32> max_anchor_samples; @@ -1109,7 +1140,7 @@ struct Liverpool { ShaderProgram es_program; INSERT_PADDING_WORDS(0x2C); ShaderProgram hs_program; - INSERT_PADDING_WORDS(0x2C); + INSERT_PADDING_WORDS(0x2D48 - 0x2d08 - 20); ShaderProgram ls_program; INSERT_PADDING_WORDS(0xA4); ComputeProgram cs_program; @@ -1176,7 +1207,9 @@ struct Liverpool { PolygonControl polygon_control; ViewportControl viewport_control; VsOutputControl vs_output_control; - INSERT_PADDING_WORDS(0xA290 - 0xA207 - 1); + INSERT_PADDING_WORDS(0xA287 - 0xA207 - 1); + HsTessFactorClamp hs_clamp; + INSERT_PADDING_WORDS(0xA290 - 0xA287 - 2); GsMode vgt_gs_mode; INSERT_PADDING_WORDS(1); ModeControl mode_control; @@ -1200,9 +1233,10 @@ struct 
Liverpool { BitField<0, 11, u32> vgt_gs_max_vert_out; INSERT_PADDING_WORDS(0xA2D5 - 0xA2CE - 1); ShaderStageEnable stage_enable; - INSERT_PADDING_WORDS(1); + LsHsConfig ls_hs_config; u32 vgt_gs_vert_itemsize[4]; - INSERT_PADDING_WORDS(4); + TessellationConfig tess_config; + INSERT_PADDING_WORDS(3); PolygonOffset poly_offset; GsInstances vgt_gs_instance_cnt; StreamOutConfig vgt_strmout_config; @@ -1216,6 +1250,8 @@ struct Liverpool { INSERT_PADDING_WORDS(0xC24C - 0xC243); u32 num_indices; VgtNumInstances num_instances; + INSERT_PADDING_WORDS(0xC250 - 0xC24D - 1); + TessFactorMemoryBase vgt_tf_memory_base; }; std::array reg_array{}; @@ -1431,6 +1467,7 @@ static_assert(GFX6_3D_REG_INDEX(color_control) == 0xA202); static_assert(GFX6_3D_REG_INDEX(clipper_control) == 0xA204); static_assert(GFX6_3D_REG_INDEX(viewport_control) == 0xA206); static_assert(GFX6_3D_REG_INDEX(vs_output_control) == 0xA207); +static_assert(GFX6_3D_REG_INDEX(hs_clamp) == 0xA287); static_assert(GFX6_3D_REG_INDEX(vgt_gs_mode) == 0xA290); static_assert(GFX6_3D_REG_INDEX(mode_control) == 0xA292); static_assert(GFX6_3D_REG_INDEX(vgt_gs_out_prim_type) == 0xA29B); @@ -1445,6 +1482,7 @@ static_assert(GFX6_3D_REG_INDEX(vgt_gsvs_ring_itemsize) == 0xA2AC); static_assert(GFX6_3D_REG_INDEX(vgt_gs_max_vert_out) == 0xA2CE); static_assert(GFX6_3D_REG_INDEX(stage_enable) == 0xA2D5); static_assert(GFX6_3D_REG_INDEX(vgt_gs_vert_itemsize[0]) == 0xA2D7); +static_assert(GFX6_3D_REG_INDEX(tess_config) == 0xA2DB); static_assert(GFX6_3D_REG_INDEX(poly_offset) == 0xA2DF); static_assert(GFX6_3D_REG_INDEX(vgt_gs_instance_cnt) == 0xA2E4); static_assert(GFX6_3D_REG_INDEX(vgt_strmout_config) == 0xA2E5); @@ -1456,6 +1494,7 @@ static_assert(GFX6_3D_REG_INDEX(color_buffers[0].slice) == 0xA31A); static_assert(GFX6_3D_REG_INDEX(color_buffers[7].base_address) == 0xA381); static_assert(GFX6_3D_REG_INDEX(primitive_type) == 0xC242); static_assert(GFX6_3D_REG_INDEX(num_instances) == 0xC24D); 
+static_assert(GFX6_3D_REG_INDEX(vgt_tf_memory_base) == 0xc250); #undef GFX6_3D_REG_INDEX diff --git a/src/video_core/amdgpu/types.h b/src/video_core/amdgpu/types.h index 6b95ed91..fa849166 100644 --- a/src/video_core/amdgpu/types.h +++ b/src/video_core/amdgpu/types.h @@ -3,6 +3,8 @@ #pragma once +#include +#include #include "common/types.h" namespace AmdGpu { @@ -21,6 +23,69 @@ enum class FpDenormMode : u32 { InOutAllow = 3, }; +enum class TessellationType : u32 { + Isoline = 0, + Triangle = 1, + Quad = 2, +}; + +constexpr std::string_view NameOf(TessellationType type) { + switch (type) { + case TessellationType::Isoline: + return "Isoline"; + case TessellationType::Triangle: + return "Triangle"; + case TessellationType::Quad: + return "Quad"; + default: + return "Unknown"; + } +} + +enum class TessellationPartitioning : u32 { + Integer = 0, + Pow2 = 1, + FracOdd = 2, + FracEven = 3, +}; + +constexpr std::string_view NameOf(TessellationPartitioning partitioning) { + switch (partitioning) { + case TessellationPartitioning::Integer: + return "Integer"; + case TessellationPartitioning::Pow2: + return "Pow2"; + case TessellationPartitioning::FracOdd: + return "FracOdd"; + case TessellationPartitioning::FracEven: + return "FracEven"; + default: + return "Unknown"; + } +} + +enum class TessellationTopology : u32 { + Point = 0, + Line = 1, + TriangleCw = 2, + TriangleCcw = 3, +}; + +constexpr std::string_view NameOf(TessellationTopology topology) { + switch (topology) { + case TessellationTopology::Point: + return "Point"; + case TessellationTopology::Line: + return "Line"; + case TessellationTopology::TriangleCw: + return "TriangleCw"; + case TessellationTopology::TriangleCcw: + return "TriangleCcw"; + default: + return "Unknown"; + } +} + // See `VGT_PRIMITIVE_TYPE` description in [Radeon Sea Islands 3D/Compute Register Reference Guide] enum class PrimitiveType : u32 { None = 0, @@ -118,3 +183,33 @@ enum class NumberFormat : u32 { }; } // namespace AmdGpu + +template 
<> +struct fmt::formatter { + constexpr auto parse(format_parse_context& ctx) { + return ctx.begin(); + } + auto format(AmdGpu::TessellationType type, format_context& ctx) const { + return fmt::format_to(ctx.out(), "{}", AmdGpu::NameOf(type)); + } +}; + +template <> +struct fmt::formatter { + constexpr auto parse(format_parse_context& ctx) { + return ctx.begin(); + } + auto format(AmdGpu::TessellationPartitioning type, format_context& ctx) const { + return fmt::format_to(ctx.out(), "{}", AmdGpu::NameOf(type)); + } +}; + +template <> +struct fmt::formatter { + constexpr auto parse(format_parse_context& ctx) { + return ctx.begin(); + } + auto format(AmdGpu::TessellationTopology type, format_context& ctx) const { + return fmt::format_to(ctx.out(), "{}", AmdGpu::NameOf(type)); + } +}; diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp index 8d495ab0..a39b1837 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp @@ -16,7 +16,7 @@ ComputePipeline::ComputePipeline(const Instance& instance_, Scheduler& scheduler ComputePipelineKey compute_key_, const Shader::Info& info_, vk::ShaderModule module) : Pipeline{instance_, scheduler_, desc_heap_, pipeline_cache, true}, compute_key{compute_key_} { - auto& info = stages[int(Shader::Stage::Compute)]; + auto& info = stages[int(Shader::LogicalStage::Compute)]; info = &info_; const vk::PipelineShaderStageCreateInfo shader_ci = { diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp index 79553757..222ffb5a 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp @@ -8,6 +8,7 @@ #include "common/assert.h" #include "common/scope_exit.h" +#include "shader_recompiler/runtime_info.h" #include "video_core/amdgpu/resource.h" #include 
"video_core/buffer_cache/buffer_cache.h" #include "video_core/renderer_vulkan/vk_graphics_pipeline.h" @@ -52,7 +53,7 @@ GraphicsPipeline::GraphicsPipeline(const Instance& instance_, Scheduler& schedul boost::container::static_vector vertex_bindings; boost::container::static_vector vertex_attributes; if (fetch_shader && !instance.IsVertexInputDynamicState()) { - const auto& vs_info = GetStage(Shader::Stage::Vertex); + const auto& vs_info = GetStage(Shader::LogicalStage::Vertex); for (const auto& attrib : fetch_shader->attributes) { if (attrib.UsesStepRates()) { // Skip attribute binding as the data will be pulled by shader @@ -106,6 +107,10 @@ GraphicsPipeline::GraphicsPipeline(const Instance& instance_, Scheduler& schedul key.primitive_restart_index == 0xFFFFFFFF, "Primitive restart index other than -1 is not supported yet"); + const vk::PipelineTessellationStateCreateInfo tessellation_state = { + .patchControlPoints = key.patch_control_points, + }; + const vk::PipelineRasterizationStateCreateInfo raster_state = { .depthClampEnable = false, .rasterizerDiscardEnable = false, @@ -204,7 +209,7 @@ GraphicsPipeline::GraphicsPipeline(const Instance& instance_, Scheduler& schedul boost::container::static_vector shader_stages; - auto stage = u32(Shader::Stage::Vertex); + auto stage = u32(Shader::LogicalStage::Vertex); if (infos[stage]) { shader_stages.emplace_back(vk::PipelineShaderStageCreateInfo{ .stage = vk::ShaderStageFlagBits::eVertex, @@ -212,7 +217,7 @@ GraphicsPipeline::GraphicsPipeline(const Instance& instance_, Scheduler& schedul .pName = "main", }); } - stage = u32(Shader::Stage::Geometry); + stage = u32(Shader::LogicalStage::Geometry); if (infos[stage]) { shader_stages.emplace_back(vk::PipelineShaderStageCreateInfo{ .stage = vk::ShaderStageFlagBits::eGeometry, @@ -220,7 +225,23 @@ GraphicsPipeline::GraphicsPipeline(const Instance& instance_, Scheduler& schedul .pName = "main", }); } - stage = u32(Shader::Stage::Fragment); + stage = 
u32(Shader::LogicalStage::TessellationControl); + if (infos[stage]) { + shader_stages.emplace_back(vk::PipelineShaderStageCreateInfo{ + .stage = vk::ShaderStageFlagBits::eTessellationControl, + .module = modules[stage], + .pName = "main", + }); + } + stage = u32(Shader::LogicalStage::TessellationEval); + if (infos[stage]) { + shader_stages.emplace_back(vk::PipelineShaderStageCreateInfo{ + .stage = vk::ShaderStageFlagBits::eTessellationEvaluation, + .module = modules[stage], + .pName = "main", + }); + } + stage = u32(Shader::LogicalStage::Fragment); if (infos[stage]) { shader_stages.emplace_back(vk::PipelineShaderStageCreateInfo{ .stage = vk::ShaderStageFlagBits::eFragment, @@ -301,6 +322,8 @@ GraphicsPipeline::GraphicsPipeline(const Instance& instance_, Scheduler& schedul .pStages = shader_stages.data(), .pVertexInputState = !instance.IsVertexInputDynamicState() ? &vertex_input_info : nullptr, .pInputAssemblyState = &input_assembly, + .pTessellationState = + stages[u32(Shader::LogicalStage::TessellationControl)] ? 
&tessellation_state : nullptr, .pViewportState = &viewport_info, .pRasterizationState = &raster_state, .pMultisampleState = &multisampling, @@ -327,7 +350,6 @@ void GraphicsPipeline::BuildDescSetLayout() { if (!stage) { continue; } - if (stage->has_readconst) { bindings.push_back({ .binding = binding++, diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h index 703a0680..444c8517 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h @@ -52,6 +52,7 @@ struct GraphicsPipelineKey { std::array blend_controls; std::array write_masks; std::array vertex_buffer_formats; + u32 patch_control_points; bool operator==(const GraphicsPipelineKey& key) const noexcept { return std::memcmp(this, &key, sizeof(key)) == 0; @@ -73,7 +74,7 @@ public: bool IsEmbeddedVs() const noexcept { static constexpr size_t EmbeddedVsHash = 0x9b2da5cf47f8c29f; - return key.stage_hashes[u32(Shader::Stage::Vertex)] == EmbeddedVsHash; + return key.stage_hashes[u32(Shader::LogicalStage::Vertex)] == EmbeddedVsHash; } auto GetWriteMasks() const { diff --git a/src/video_core/renderer_vulkan/vk_instance.cpp b/src/video_core/renderer_vulkan/vk_instance.cpp index e844150b..76efb215 100644 --- a/src/video_core/renderer_vulkan/vk_instance.cpp +++ b/src/video_core/renderer_vulkan/vk_instance.cpp @@ -327,6 +327,7 @@ bool Instance::CreateDevice() { .imageCubeArray = features.imageCubeArray, .independentBlend = features.independentBlend, .geometryShader = features.geometryShader, + .tessellationShader = features.tessellationShader, .logicOp = features.logicOp, .depthBiasClamp = features.depthBiasClamp, .fillModeNonSolid = features.fillModeNonSolid, @@ -378,6 +379,7 @@ bool Instance::CreateDevice() { vk::PhysicalDeviceExtendedDynamicStateFeaturesEXT{ .extendedDynamicState = true, }, + vk::PhysicalDeviceExtendedDynamicState2FeaturesEXT{}, 
vk::PhysicalDeviceExtendedDynamicState3FeaturesEXT{ .extendedDynamicState3ColorWriteMask = true, }, diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index ff27b742..58473496 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -22,6 +22,8 @@ extern std::unique_ptr presenter; namespace Vulkan { +using Shader::LogicalStage; +using Shader::Stage; using Shader::VsOutput; constexpr static std::array DescriptorHeapSizes = { @@ -78,7 +80,7 @@ void GatherVertexOutputs(Shader::VertexRuntimeInfo& info, : (ctl.IsCullDistEnabled(7) ? VsOutput::CullDist7 : VsOutput::None)); } -Shader::RuntimeInfo PipelineCache::BuildRuntimeInfo(Shader::Stage stage) { +Shader::RuntimeInfo PipelineCache::BuildRuntimeInfo(Stage stage, LogicalStage l_stage) { auto info = Shader::RuntimeInfo{stage}; const auto& regs = liverpool->regs; const auto BuildCommon = [&](const auto& program) { @@ -89,20 +91,47 @@ Shader::RuntimeInfo PipelineCache::BuildRuntimeInfo(Shader::Stage stage) { info.fp_round_mode32 = program.settings.fp_round_mode32; }; switch (stage) { - case Shader::Stage::Export: { + case Stage::Local: { + BuildCommon(regs.ls_program); + if (regs.stage_enable.IsStageEnabled(static_cast(Stage::Hull))) { + info.ls_info.links_with_tcs = true; + Shader::TessellationDataConstantBuffer tess_constants; + const auto* pgm = regs.ProgramForStage(static_cast(Stage::Hull)); + const auto params = Liverpool::GetParams(*pgm); + const auto& hull_info = program_cache.at(params.hash)->info; + hull_info.ReadTessConstantBuffer(tess_constants); + info.ls_info.ls_stride = tess_constants.ls_stride; + } + break; + } + case Stage::Hull: { + BuildCommon(regs.hs_program); + info.hs_info.num_input_control_points = regs.ls_hs_config.hs_input_control_points.Value(); + info.hs_info.num_threads = regs.ls_hs_config.hs_output_control_points.Value(); + info.hs_info.tess_type = 
regs.tess_config.type; + + // We need to initialize most hs_info fields after finding the V# with tess constants + break; + } + case Stage::Export: { BuildCommon(regs.es_program); info.es_info.vertex_data_size = regs.vgt_esgs_ring_itemsize; break; } - case Shader::Stage::Vertex: { + case Stage::Vertex: { BuildCommon(regs.vs_program); GatherVertexOutputs(info.vs_info, regs.vs_output_control); info.vs_info.emulate_depth_negative_one_to_one = !instance.IsDepthClipControlSupported() && regs.clipper_control.clip_space == Liverpool::ClipSpace::MinusWToW; + if (l_stage == LogicalStage::TessellationEval) { + info.vs_info.tess_type = regs.tess_config.type; + info.vs_info.tess_topology = regs.tess_config.topology; + info.vs_info.tess_partitioning = regs.tess_config.partitioning; + } break; } - case Shader::Stage::Geometry: { + case Stage::Geometry: { BuildCommon(regs.gs_program); auto& gs_info = info.gs_info; gs_info.output_vertices = regs.vgt_gs_max_vert_out; @@ -121,7 +150,7 @@ Shader::RuntimeInfo PipelineCache::BuildRuntimeInfo(Shader::Stage stage) { DumpShader(gs_info.vs_copy, gs_info.vs_copy_hash, Shader::Stage::Vertex, 0, "copy.bin"); break; } - case Shader::Stage::Fragment: { + case Stage::Fragment: { BuildCommon(regs.ps_program); info.fs_info.en_flags = regs.ps_input_ena; info.fs_info.addr_flags = regs.ps_input_addr; @@ -143,7 +172,7 @@ Shader::RuntimeInfo PipelineCache::BuildRuntimeInfo(Shader::Stage stage) { } break; } - case Shader::Stage::Compute: { + case Stage::Compute: { const auto& cs_pgm = regs.cs_program; info.num_user_data = cs_pgm.settings.num_user_regs; info.num_allocated_vgprs = regs.cs_program.settings.num_vgprs * 4; @@ -277,6 +306,11 @@ bool PipelineCache::RefreshGraphicsKey() { key.mrt_swizzles.fill(Liverpool::ColorBuffer::SwapMode::Standard); key.vertex_buffer_formats.fill(vk::Format::eUndefined); + key.patch_control_points = 0; + if (regs.stage_enable.hs_en.Value()) { + key.patch_control_points = regs.ls_hs_config.hs_input_control_points.Value(); + 
} + // First pass of bindings check to idenitfy formats and swizzles and pass them to rhe shader // recompiler. for (auto cb = 0u; cb < Liverpool::NumColorBuffers; ++cb) { @@ -305,7 +339,7 @@ bool PipelineCache::RefreshGraphicsKey() { fetch_shader = std::nullopt; Shader::Backend::Bindings binding{}; - const auto& TryBindStageRemap = [&](Shader::Stage stage_in, Shader::Stage stage_out) -> bool { + const auto& TryBindStage = [&](Shader::Stage stage_in, Shader::LogicalStage stage_out) -> bool { const auto stage_in_idx = static_cast(stage_in); const auto stage_out_idx = static_cast(stage_out); if (!regs.stage_enable.IsStageEnabled(stage_in_idx)) { @@ -332,23 +366,23 @@ bool PipelineCache::RefreshGraphicsKey() { auto params = Liverpool::GetParams(*pgm); std::optional fetch_shader_; std::tie(infos[stage_out_idx], modules[stage_out_idx], fetch_shader_, - key.stage_hashes[stage_out_idx]) = GetProgram(stage_in, params, binding); + key.stage_hashes[stage_out_idx]) = + GetProgram(stage_in, stage_out, params, binding); if (fetch_shader_) { fetch_shader = fetch_shader_; } return true; }; - const auto& TryBindStage = [&](Shader::Stage stage) { return TryBindStageRemap(stage, stage); }; - const auto& IsGsFeaturesSupported = [&]() -> bool { // These checks are temporary until all functionality is implemented. return !regs.vgt_gs_mode.onchip && !regs.vgt_strmout_config.raw; }; - TryBindStage(Shader::Stage::Fragment); + infos.fill(nullptr); + TryBindStage(Stage::Fragment, LogicalStage::Fragment); - const auto* fs_info = infos[static_cast(Shader::Stage::Fragment)]; + const auto* fs_info = infos[static_cast(LogicalStage::Fragment)]; key.mrt_mask = fs_info ? 
fs_info->mrt_mask : 0u; switch (regs.stage_enable.raw) { @@ -356,22 +390,36 @@ bool PipelineCache::RefreshGraphicsKey() { if (!instance.IsGeometryStageSupported() || !IsGsFeaturesSupported()) { return false; } - if (!TryBindStageRemap(Shader::Stage::Export, Shader::Stage::Vertex)) { + if (!TryBindStage(Stage::Export, LogicalStage::Vertex)) { return false; } - if (!TryBindStage(Shader::Stage::Geometry)) { + if (!TryBindStage(Stage::Geometry, LogicalStage::Geometry)) { + return false; + } + break; + } + case Liverpool::ShaderStageEnable::VgtStages::LsHs: { + if (!instance.IsTessellationSupported()) { + break; + } + if (!TryBindStage(Stage::Hull, LogicalStage::TessellationControl)) { + return false; + } + if (!TryBindStage(Stage::Vertex, LogicalStage::TessellationEval)) { + return false; + } + if (!TryBindStage(Stage::Local, LogicalStage::Vertex)) { return false; } break; } default: { - TryBindStage(Shader::Stage::Vertex); - infos[static_cast(Shader::Stage::Geometry)] = nullptr; + TryBindStage(Stage::Vertex, LogicalStage::Vertex); break; } } - const auto vs_info = infos[static_cast(Shader::Stage::Vertex)]; + const auto vs_info = infos[static_cast(Shader::LogicalStage::Vertex)]; if (vs_info && fetch_shader && !instance.IsVertexInputDynamicState()) { u32 vertex_binding = 0; for (const auto& attrib : fetch_shader->attributes) { @@ -424,19 +472,18 @@ bool PipelineCache::RefreshGraphicsKey() { key.num_samples = num_samples; return true; -} +} // namespace Vulkan bool PipelineCache::RefreshComputeKey() { Shader::Backend::Bindings binding{}; const auto* cs_pgm = &liverpool->regs.cs_program; const auto cs_params = Liverpool::GetParams(*cs_pgm); std::tie(infos[0], modules[0], fetch_shader, compute_key.value) = - GetProgram(Shader::Stage::Compute, cs_params, binding); + GetProgram(Shader::Stage::Compute, LogicalStage::Compute, cs_params, binding); return true; } -vk::ShaderModule PipelineCache::CompileModule(Shader::Info& info, - const Shader::RuntimeInfo& runtime_info, 
+vk::ShaderModule PipelineCache::CompileModule(Shader::Info& info, Shader::RuntimeInfo& runtime_info, std::span code, size_t perm_idx, Shader::Backend::Bindings& binding) { LOG_INFO(Render_Vulkan, "Compiling {} shader {:#x} {}", info.stage, info.pgm_hash, @@ -461,19 +508,19 @@ vk::ShaderModule PipelineCache::CompileModule(Shader::Info& info, const auto name = fmt::format("{}_{:#018x}_{}", info.stage, info.pgm_hash, perm_idx); Vulkan::SetObjectName(instance.GetDevice(), module, name); if (Config::collectShadersForDebug()) { - DebugState.CollectShader(name, module, spv, code, patch ? *patch : std::span{}, - is_patched); + DebugState.CollectShader(name, info.l_stage, module, spv, code, + patch ? *patch : std::span{}, is_patched); } return module; } -std::tuple, u64> -PipelineCache::GetProgram(Shader::Stage stage, Shader::ShaderParams params, - Shader::Backend::Bindings& binding) { - const auto runtime_info = BuildRuntimeInfo(stage); +PipelineCache::Result PipelineCache::GetProgram(Stage stage, LogicalStage l_stage, + Shader::ShaderParams params, + Shader::Backend::Bindings& binding) { + auto runtime_info = BuildRuntimeInfo(stage, l_stage); auto [it_pgm, new_program] = program_cache.try_emplace(params.hash); if (new_program) { - it_pgm.value() = std::make_unique(stage, params); + it_pgm.value() = std::make_unique(stage, l_stage, params); auto& program = it_pgm.value(); auto start = binding; const auto module = CompileModule(program->info, runtime_info, params.code, 0, binding); @@ -492,7 +539,7 @@ PipelineCache::GetProgram(Shader::Stage stage, Shader::ShaderParams params, const auto it = std::ranges::find(program->modules, spec, &Program::Module::spec); if (it == program->modules.end()) { - auto new_info = Shader::Info(stage, params); + auto new_info = Shader::Info(stage, l_stage, params); module = CompileModule(new_info, runtime_info, params.code, perm_idx, binding); program->AddPermut(module, std::move(spec)); } else { diff --git 
a/src/video_core/renderer_vulkan/vk_pipeline_cache.h b/src/video_core/renderer_vulkan/vk_pipeline_cache.h index c5c2fc98..ec440644 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.h +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.h @@ -34,11 +34,13 @@ struct Program { vk::ShaderModule module; Shader::StageSpecialization spec; }; + using ModuleList = boost::container::small_vector; Shader::Info info; - boost::container::small_vector modules; + ModuleList modules; - explicit Program(Shader::Stage stage, Shader::ShaderParams params) : info{stage, params} {} + explicit Program(Shader::Stage stage, Shader::LogicalStage l_stage, Shader::ShaderParams params) + : info{stage, l_stage, params} {} void AddPermut(vk::ShaderModule module, const Shader::StageSpecialization&& spec) { modules.emplace_back(module, std::move(spec)); @@ -55,10 +57,10 @@ public: const ComputePipeline* GetComputePipeline(); - std::tuple, - u64> - GetProgram(Shader::Stage stage, Shader::ShaderParams params, - Shader::Backend::Bindings& binding); + using Result = std::tuple, u64>; + Result GetProgram(Shader::Stage stage, Shader::LogicalStage l_stage, + Shader::ShaderParams params, Shader::Backend::Bindings& binding); std::optional ReplaceShader(vk::ShaderModule module, std::span spv_code); @@ -71,10 +73,10 @@ private: std::string_view ext); std::optional> GetShaderPatch(u64 hash, Shader::Stage stage, size_t perm_idx, std::string_view ext); - vk::ShaderModule CompileModule(Shader::Info& info, const Shader::RuntimeInfo& runtime_info, + vk::ShaderModule CompileModule(Shader::Info& info, Shader::RuntimeInfo& runtime_info, std::span code, size_t perm_idx, Shader::Backend::Bindings& binding); - Shader::RuntimeInfo BuildRuntimeInfo(Shader::Stage stage); + Shader::RuntimeInfo BuildRuntimeInfo(Shader::Stage stage, Shader::LogicalStage l_stage); private: const Instance& instance; diff --git a/src/video_core/renderer_vulkan/vk_pipeline_common.h 
b/src/video_core/renderer_vulkan/vk_pipeline_common.h index 8c48c83f..1b13a179 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_common.h +++ b/src/video_core/renderer_vulkan/vk_pipeline_common.h @@ -14,9 +14,10 @@ class BufferCache; namespace Vulkan { -static constexpr auto gp_stage_flags = vk::ShaderStageFlagBits::eVertex | - vk::ShaderStageFlagBits::eGeometry | - vk::ShaderStageFlagBits::eFragment; +static constexpr auto gp_stage_flags = + vk::ShaderStageFlagBits::eVertex | vk::ShaderStageFlagBits::eTessellationControl | + vk::ShaderStageFlagBits::eTessellationEvaluation | vk::ShaderStageFlagBits::eGeometry | + vk::ShaderStageFlagBits::eFragment; class Instance; class Scheduler; @@ -37,6 +38,7 @@ public: } auto GetStages() const { + static_assert(static_cast(Shader::LogicalStage::Compute) == Shader::MaxStageTypes - 1); if (is_compute) { return std::span{stages.cend() - 1, stages.cend()}; } else { @@ -44,7 +46,7 @@ public: } } - const Shader::Info& GetStage(Shader::Stage stage) const noexcept { + const Shader::Info& GetStage(Shader::LogicalStage stage) const noexcept { return *stages[u32(stage)]; } diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index eb2ef360..fef4c7ec 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -4,6 +4,7 @@ #include "common/config.h" #include "common/debug.h" #include "core/memory.h" +#include "shader_recompiler/runtime_info.h" #include "video_core/amdgpu/liverpool.h" #include "video_core/renderer_vulkan/vk_instance.h" #include "video_core/renderer_vulkan/vk_rasterizer.h" @@ -48,10 +49,6 @@ void Rasterizer::CpSync() { bool Rasterizer::FilterDraw() { const auto& regs = liverpool->regs; - // Tessellation is unsupported so skip the draw to avoid locking up the driver. - if (regs.primitive_type == AmdGpu::PrimitiveType::PatchPrimitive) { - return false; - } // There are several cases (e.g. 
FCE, FMask/HTile decompression) where we don't need to do an // actual draw hence can skip pipeline creation. if (regs.color_control.mode == Liverpool::ColorControl::OperationMode::EliminateFastClear) { @@ -214,7 +211,7 @@ void Rasterizer::Draw(bool is_indexed, u32 index_offset) { return; } - const auto& vs_info = pipeline->GetStage(Shader::Stage::Vertex); + const auto& vs_info = pipeline->GetStage(Shader::LogicalStage::Vertex); const auto& fetch_shader = pipeline->GetFetchShader(); buffer_cache.BindVertexBuffers(vs_info, fetch_shader); const u32 num_indices = buffer_cache.BindIndexBuffer(is_indexed, index_offset); @@ -271,7 +268,7 @@ void Rasterizer::DrawIndirect(bool is_indexed, VAddr arg_address, u32 offset, u3 return; } - const auto& vs_info = pipeline->GetStage(Shader::Stage::Vertex); + const auto& vs_info = pipeline->GetStage(Shader::LogicalStage::Vertex); const auto& fetch_shader = pipeline->GetFetchShader(); buffer_cache.BindVertexBuffers(vs_info, fetch_shader); buffer_cache.BindIndexBuffer(is_indexed, 0); @@ -326,7 +323,7 @@ void Rasterizer::DispatchDirect() { return; } - const auto& cs = pipeline->GetStage(Shader::Stage::Compute); + const auto& cs = pipeline->GetStage(Shader::LogicalStage::Compute); if (ExecuteShaderHLE(cs, liverpool->regs, *this)) { return; } @@ -387,7 +384,7 @@ bool Rasterizer::BindResources(const Pipeline* pipeline) { const auto& regs = liverpool->regs; if (pipeline->IsCompute()) { - const auto& info = pipeline->GetStage(Shader::Stage::Compute); + const auto& info = pipeline->GetStage(Shader::LogicalStage::Compute); // Most of the time when a metadata is updated with a shader it gets cleared. It means // we can skip the whole dispatch and update the tracked state instead. Also, it is not