Implement V_MOVREL variants (#745)

* shader_recompiler: Implement V_MOVRELS_B32, V_MOVRELD_B32,
V_MOVRELSD_B32

Generates a ton of OpSelects to hardcode reading or writing from each
possible vgpr depending on the value of m0

Future work is to do range analysis to put an upper bound on m0 and
check fewer registers.

* fix runtime info after rebase
This commit is contained in:
baggins183 2024-09-06 13:47:47 -07:00 committed by GitHub
parent 416e23fe76
commit bb29224daf
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
11 changed files with 110 additions and 6 deletions

View file

@ -327,6 +327,10 @@ void EmitGetVccHi(EmitContext& ctx) {
UNREACHABLE_MSG("Unreachable instruction");
}
void EmitGetM0(EmitContext& ctx) {
UNREACHABLE_MSG("Unreachable instruction");
}
void EmitSetScc(EmitContext& ctx) {
UNREACHABLE_MSG("Unreachable instruction");
}
@ -351,4 +355,8 @@ void EmitSetVccHi(EmitContext& ctx) {
UNREACHABLE_MSG("Unreachable instruction");
}
void EmitSetM0(EmitContext& ctx) {
UNREACHABLE_MSG("Unreachable instruction");
}
} // namespace Shader::Backend::SPIRV

View file

@ -36,12 +36,14 @@ void EmitGetVcc(EmitContext& ctx);
void EmitGetSccLo(EmitContext& ctx);
void EmitGetVccLo(EmitContext& ctx);
void EmitGetVccHi(EmitContext& ctx);
void EmitGetM0(EmitContext& ctx);
void EmitSetScc(EmitContext& ctx);
void EmitSetExec(EmitContext& ctx);
void EmitSetVcc(EmitContext& ctx);
void EmitSetSccLo(EmitContext& ctx);
void EmitSetVccLo(EmitContext& ctx);
void EmitSetVccHi(EmitContext& ctx);
void EmitSetM0(EmitContext& ctx);
void EmitFPCmpClass32(EmitContext& ctx);
void EmitPrologue(EmitContext& ctx);
void EmitEpilogue(EmitContext& ctx);

View file

@ -153,10 +153,11 @@ T Translator::GetSrc(const InstOperand& operand) {
break;
case OperandField::M0:
if constexpr (is_float) {
UNREACHABLE();
value = ir.BitCast<IR::F32>(ir.GetM0());
} else {
return m0_value;
value = ir.GetM0();
}
break;
default:
UNREACHABLE();
}
@ -296,8 +297,7 @@ void Translator::SetDst(const InstOperand& operand, const IR::U32F32& value) {
case OperandField::VccHi:
return ir.SetVccHi(result);
case OperandField::M0:
m0_value = result;
break;
return ir.SetM0(result);
default:
UNREACHABLE();
}

View file

@ -192,6 +192,9 @@ public:
void V_MBCNT_U32_B32(bool is_low, const GcnInst& inst);
void V_BFM_B32(const GcnInst& inst);
void V_FFBH_U32(const GcnInst& inst);
void V_MOVRELS_B32(const GcnInst& inst);
void V_MOVRELD_B32(const GcnInst& inst);
void V_MOVRELSD_B32(const GcnInst& inst);
// Vector Memory
void BUFFER_LOAD(u32 num_dwords, bool is_typed, const GcnInst& inst);
@ -233,6 +236,9 @@ private:
void SetDst(const InstOperand& operand, const IR::U32F32& value);
void SetDst64(const InstOperand& operand, const IR::U64F64& value_raw);
IR::U32 VMovRelSHelper(u32 src_vgprno, const IR::U32 m0);
void VMovRelDHelper(u32 dst_vgprno, const IR::U32 src_val, const IR::U32 m0);
void LogMissingOpcode(const GcnInst& inst);
private:
@ -240,7 +246,6 @@ private:
Info& info;
const RuntimeInfo& runtime_info;
const Profile& profile;
IR::U32 m0_value;
bool opcode_missing = false;
};

View file

@ -1,6 +1,7 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include "shader_recompiler/frontend/opcodes.h"
#include "shader_recompiler/frontend/translate/translate.h"
namespace Shader::Gcn {
@ -309,6 +310,12 @@ void Translator::EmitVectorAlu(const GcnInst& inst) {
return V_MBCNT_U32_B32(true, inst);
case Opcode::V_MBCNT_HI_U32_B32:
return V_MBCNT_U32_B32(false, inst);
case Opcode::V_MOVRELS_B32:
return V_MOVRELS_B32(inst);
case Opcode::V_MOVRELD_B32:
return V_MOVRELD_B32(inst);
case Opcode::V_MOVRELSD_B32:
return V_MOVRELSD_B32(inst);
case Opcode::V_NOP:
return;
@ -990,4 +997,52 @@ void Translator::V_FFBH_U32(const GcnInst& inst) {
SetDst(inst.dst[0], IR::U32{ir.Select(cond, pos_from_left, ir.Imm32(~0U))});
}
// TODO: add range analysis pass to hopefully put an upper bound on m0, and only select one of
// [src_vgprno, src_vgprno + max_m0]. Same for dst regs we may write back to
IR::U32 Translator::VMovRelSHelper(u32 src_vgprno, const IR::U32 m0) {
// Read from VGPR0 by default when src_vgprno + m0 > num_allocated_vgprs
IR::U32 src_val = ir.GetVectorReg<IR::U32>(IR::VectorReg::V0);
for (u32 i = src_vgprno; i < runtime_info.num_allocated_vgprs; i++) {
const IR::U1 cond = ir.IEqual(m0, ir.Imm32(i - src_vgprno));
src_val =
IR::U32{ir.Select(cond, ir.GetVectorReg<IR::U32>(IR::VectorReg::V0 + i), src_val)};
}
return src_val;
}
void Translator::VMovRelDHelper(u32 dst_vgprno, const IR::U32 src_val, const IR::U32 m0) {
for (u32 i = dst_vgprno; i < runtime_info.num_allocated_vgprs; i++) {
const IR::U1 cond = ir.IEqual(m0, ir.Imm32(i - dst_vgprno));
const IR::U32 dst_val =
IR::U32{ir.Select(cond, src_val, ir.GetVectorReg<IR::U32>(IR::VectorReg::V0 + i))};
ir.SetVectorReg(IR::VectorReg::V0 + i, dst_val);
}
}
void Translator::V_MOVRELS_B32(const GcnInst& inst) {
u32 src_vgprno = inst.src[0].code - static_cast<u32>(IR::VectorReg::V0);
const IR::U32 m0 = ir.GetM0();
const IR::U32 src_val = VMovRelSHelper(src_vgprno, m0);
SetDst(inst.dst[0], src_val);
}
void Translator::V_MOVRELD_B32(const GcnInst& inst) {
const IR::U32 src_val{GetSrc(inst.src[0])};
u32 dst_vgprno = inst.dst[0].code - static_cast<u32>(IR::VectorReg::V0);
IR::U32 m0 = ir.GetM0();
VMovRelDHelper(dst_vgprno, src_val, m0);
}
void Translator::V_MOVRELSD_B32(const GcnInst& inst) {
u32 src_vgprno = inst.src[0].code - static_cast<u32>(IR::VectorReg::V0);
u32 dst_vgprno = inst.dst[0].code - static_cast<u32>(IR::VectorReg::V0);
IR::U32 m0 = ir.GetM0();
const IR::U32 src_val = VMovRelSHelper(src_vgprno, m0);
VMovRelDHelper(dst_vgprno, src_val, m0);
}
} // namespace Shader::Gcn

View file

@ -217,6 +217,10 @@ U32 IREmitter::GetVccHi() {
return Inst<U32>(Opcode::GetVccHi);
}
U32 IREmitter::GetM0() {
return Inst<U32>(Opcode::GetM0);
}
void IREmitter::SetScc(const U1& value) {
Inst(Opcode::SetScc, value);
}
@ -241,6 +245,10 @@ void IREmitter::SetVccHi(const U32& value) {
Inst(Opcode::SetVccHi, value);
}
void IREmitter::SetM0(const U32& value) {
Inst(Opcode::SetM0, value);
}
F32 IREmitter::GetAttribute(IR::Attribute attribute, u32 comp) {
return Inst<F32>(Opcode::GetAttribute, attribute, Imm32(comp));
}

View file

@ -67,12 +67,14 @@ public:
[[nodiscard]] U1 GetVcc();
[[nodiscard]] U32 GetVccLo();
[[nodiscard]] U32 GetVccHi();
[[nodiscard]] U32 GetM0();
void SetScc(const U1& value);
void SetExec(const U1& value);
void SetVcc(const U1& value);
void SetSccLo(const U32& value);
void SetVccLo(const U32& value);
void SetVccHi(const U32& value);
void SetM0(const U32& value);
[[nodiscard]] U1 Condition(IR::Condition cond);

View file

@ -60,12 +60,14 @@ OPCODE(GetExec, U1, Void,
OPCODE(GetVcc, U1, Void, )
OPCODE(GetVccLo, U32, Void, )
OPCODE(GetVccHi, U32, Void, )
OPCODE(GetM0, U32, Void, )
OPCODE(SetScc, Void, U1, )
OPCODE(SetExec, Void, U1, )
OPCODE(SetVcc, Void, U1, )
OPCODE(SetSccLo, Void, U32, )
OPCODE(SetVccLo, Void, U32, )
OPCODE(SetVccHi, Void, U32, )
OPCODE(SetM0, Void, U32, )
// Undefined
OPCODE(UndefU1, U1, )

View file

@ -33,6 +33,7 @@ struct ExecFlagTag : FlagTag {};
struct VccFlagTag : FlagTag {};
struct VccLoTag : FlagTag {};
struct VccHiTag : FlagTag {};
struct M0Tag : FlagTag {};
struct GotoVariable : FlagTag {
GotoVariable() = default;
@ -44,7 +45,7 @@ struct GotoVariable : FlagTag {
};
using Variant = std::variant<IR::ScalarReg, IR::VectorReg, GotoVariable, SccFlagTag, ExecFlagTag,
VccFlagTag, VccLoTag, VccHiTag>;
VccFlagTag, VccLoTag, VccHiTag, M0Tag>;
using ValueMap = std::unordered_map<IR::Block*, IR::Value>;
struct DefTable {
@ -103,6 +104,12 @@ struct DefTable {
void SetDef(IR::Block* block, VccFlagTag, const IR::Value& value) {
vcc_flag.insert_or_assign(block, value);
}
const IR::Value& Def(IR::Block* block, M0Tag) {
return m0_flag[block];
}
void SetDef(IR::Block* block, M0Tag, const IR::Value& value) {
m0_flag.insert_or_assign(block, value);
}
std::unordered_map<u32, ValueMap> goto_vars;
ValueMap scc_flag;
@ -111,6 +118,7 @@ struct DefTable {
ValueMap scc_lo_flag;
ValueMap vcc_lo_flag;
ValueMap vcc_hi_flag;
ValueMap m0_flag;
};
IR::Opcode UndefOpcode(IR::ScalarReg) noexcept {
@ -129,6 +137,10 @@ IR::Opcode UndefOpcode(const VccHiTag) noexcept {
return IR::Opcode::UndefU32;
}
IR::Opcode UndefOpcode(const M0Tag) noexcept {
return IR::Opcode::UndefU32;
}
IR::Opcode UndefOpcode(const FlagTag) noexcept {
return IR::Opcode::UndefU1;
}
@ -330,6 +342,9 @@ void VisitInst(Pass& pass, IR::Block* block, IR::Inst& inst) {
case IR::Opcode::SetVccHi:
pass.WriteVariable(VccHiTag{}, block, inst.Arg(0));
break;
case IR::Opcode::SetM0:
pass.WriteVariable(M0Tag{}, block, inst.Arg(0));
break;
case IR::Opcode::GetThreadBitScalarReg:
case IR::Opcode::GetScalarRegister: {
const IR::ScalarReg reg{inst.Arg(0).ScalarReg()};
@ -362,6 +377,9 @@ void VisitInst(Pass& pass, IR::Block* block, IR::Inst& inst) {
case IR::Opcode::GetVccHi:
inst.ReplaceUsesWith(pass.ReadVariable(VccHiTag{}, block));
break;
case IR::Opcode::GetM0:
inst.ReplaceUsesWith(pass.ReadVariable(M0Tag{}, block));
break;
default:
break;
}

View file

@ -107,6 +107,7 @@ struct RuntimeInfo {
Stage stage;
u32 num_user_data;
u32 num_input_vgprs;
u32 num_allocated_vgprs;
VertexRuntimeInfo vs_info;
FragmentRuntimeInfo fs_info;
ComputeRuntimeInfo cs_info;

View file

@ -76,6 +76,7 @@ Shader::RuntimeInfo PipelineCache::BuildRuntimeInfo(Shader::Stage stage) {
case Shader::Stage::Vertex: {
info.num_user_data = regs.vs_program.settings.num_user_regs;
info.num_input_vgprs = regs.vs_program.settings.vgpr_comp_cnt;
info.num_allocated_vgprs = regs.vs_program.settings.num_vgprs * 4;
GatherVertexOutputs(info.vs_info, regs.vs_output_control);
info.vs_info.emulate_depth_negative_one_to_one =
!instance.IsDepthClipControlSupported() &&
@ -84,6 +85,7 @@ Shader::RuntimeInfo PipelineCache::BuildRuntimeInfo(Shader::Stage stage) {
}
case Shader::Stage::Fragment: {
info.num_user_data = regs.ps_program.settings.num_user_regs;
info.num_allocated_vgprs = regs.ps_program.settings.num_vgprs * 4;
std::ranges::transform(graphics_key.mrt_swizzles, info.fs_info.mrt_swizzles.begin(),
[](Liverpool::ColorBuffer::SwapMode mode) {
return static_cast<Shader::MrtSwizzle>(mode);
@ -102,6 +104,7 @@ Shader::RuntimeInfo PipelineCache::BuildRuntimeInfo(Shader::Stage stage) {
case Shader::Stage::Compute: {
const auto& cs_pgm = regs.cs_program;
info.num_user_data = cs_pgm.settings.num_user_regs;
info.num_allocated_vgprs = regs.cs_program.settings.num_vgprs * 4;
info.cs_info.workgroup_size = {cs_pgm.num_thread_x.full, cs_pgm.num_thread_y.full,
cs_pgm.num_thread_z.full};
info.cs_info.tgid_enable = {cs_pgm.IsTgidEnabled(0), cs_pgm.IsTgidEnabled(1),