diff --git a/src/shader_recompiler/frontend/translate/data_share.cpp b/src/shader_recompiler/frontend/translate/data_share.cpp
index 4408cae2..62c0423d 100644
--- a/src/shader_recompiler/frontend/translate/data_share.cpp
+++ b/src/shader_recompiler/frontend/translate/data_share.cpp
@@ -205,7 +205,6 @@ void Translator::DS_WRITE(int bit_size, bool is_signed, bool is_pair, bool strid
             addr, ir.Imm32((u32(inst.control.ds.offset1) << 8u) + u32(inst.control.ds.offset0)));
         ir.WriteShared(bit_size, ir.GetVectorReg(data0), addr0);
     }
-    emit_ds_read_barrier = true;
 }
 
 void Translator::DS_SWIZZLE_B32(const GcnInst& inst) {
@@ -222,11 +221,6 @@ void Translator::DS_SWIZZLE_B32(const GcnInst& inst) {
 
 void Translator::DS_READ(int bit_size, bool is_signed, bool is_pair, bool stride64,
                          const GcnInst& inst) {
-    if (emit_ds_read_barrier && profile.needs_lds_barriers) {
-        ir.Barrier();
-        emit_ds_read_barrier = false;
-    }
-
     const IR::U32 addr{ir.GetVectorReg(IR::VectorReg(inst.src[0].code))};
     IR::VectorReg dst_reg{inst.dst[0].code};
     if (is_pair) {
diff --git a/src/shader_recompiler/frontend/translate/translate.h b/src/shader_recompiler/frontend/translate/translate.h
index e8584ec2..9da0844e 100644
--- a/src/shader_recompiler/frontend/translate/translate.h
+++ b/src/shader_recompiler/frontend/translate/translate.h
@@ -308,7 +308,6 @@ private:
     const RuntimeInfo& runtime_info;
     const Profile& profile;
     bool opcode_missing = false;
-    bool emit_ds_read_barrier = false;
 };
 
 void Translate(IR::Block* block, u32 block_base, std::span<const GcnInst> inst_list, Info& info,
diff --git a/src/shader_recompiler/ir/passes/shared_memory_barrier_pass.cpp b/src/shader_recompiler/ir/passes/shared_memory_barrier_pass.cpp
index aad8fb14..ec7d7e98 100644
--- a/src/shader_recompiler/ir/passes/shared_memory_barrier_pass.cpp
+++ b/src/shader_recompiler/ir/passes/shared_memory_barrier_pass.cpp
@@ -8,6 +8,54 @@ namespace Shader::Optimization {
 
+static void EmitBarrierInBlock(IR::Block* block) {
+    // This is intended to insert a barrier when a shared memory write and read
+    // occur in the same basic block. The caller also checks that branch depth is
+    // zero, as we don't want to insert barriers into potentially divergent code.
+    bool emit_barrier_on_write = false;
+    bool emit_barrier_on_read = false;
+    const auto emit_barrier = [block](bool& emit_cond, IR::Inst& inst) {
+        if (emit_cond) {
+            IR::IREmitter ir{*block, IR::Block::InstructionList::s_iterator_to(inst)};
+            ir.Barrier();
+            emit_cond = false;
+        }
+    };
+    for (IR::Inst& inst : block->Instructions()) {
+        if (inst.GetOpcode() == IR::Opcode::LoadSharedU32 ||
+            inst.GetOpcode() == IR::Opcode::LoadSharedU64) {
+            emit_barrier(emit_barrier_on_read, inst);
+            emit_barrier_on_write = true;
+        }
+        if (inst.GetOpcode() == IR::Opcode::WriteSharedU32 ||
+            inst.GetOpcode() == IR::Opcode::WriteSharedU64) {
+            emit_barrier(emit_barrier_on_write, inst);
+            emit_barrier_on_read = true;
+        }
+    }
+}
+
+static void EmitBarrierInMergeBlock(const IR::AbstractSyntaxNode::Data& data) {
+    // Insert a barrier after divergent conditional blocks.
+    // This avoids potential softlocks and crashes when some threads
+    // initialize shared memory and others read from it.
+    const IR::U1 cond = data.if_node.cond;
+    const auto insert_barrier =
+        IR::BreadthFirstSearch(cond, [](IR::Inst* inst) -> std::optional<bool> {
+            if (inst->GetOpcode() == IR::Opcode::GetAttributeU32 &&
+                inst->Arg(0).Attribute() == IR::Attribute::LocalInvocationId) {
+                return true;
+            }
+            return std::nullopt;
+        });
+    if (insert_barrier) {
+        IR::Block* const merge = data.if_node.merge;
+        auto insert_point = std::ranges::find_if_not(merge->Instructions(), IR::IsPhi);
+        IR::IREmitter ir{*merge, insert_point};
+        ir.Barrier();
+    }
+}
+
 void SharedMemoryBarrierPass(IR::Program& program, const Profile& profile) {
     if (!program.info.uses_shared || !profile.needs_lds_barriers) {
         return;
@@ -19,27 +67,12 @@ void SharedMemoryBarrierPass(IR::Program& program, const Profile& profile) {
             --branch_depth;
             continue;
         }
-        if (node.type != Type::If) {
+        if (node.type == Type::If && branch_depth++ == 0) {
+            EmitBarrierInMergeBlock(node.data);
             continue;
         }
-        u32 curr_depth = branch_depth++;
-        if (curr_depth != 0) {
-            continue;
-        }
-        const IR::U1 cond = node.data.if_node.cond;
-        const auto insert_barrier =
-            IR::BreadthFirstSearch(cond, [](IR::Inst* inst) -> std::optional<bool> {
-                if (inst->GetOpcode() == IR::Opcode::GetAttributeU32 &&
-                    inst->Arg(0).Attribute() == IR::Attribute::LocalInvocationId) {
-                    return true;
-                }
-                return std::nullopt;
-            });
-        if (insert_barrier) {
-            IR::Block* const merge = node.data.if_node.merge;
-            auto insert_point = std::ranges::find_if_not(merge->Instructions(), IR::IsPhi);
-            IR::IREmitter ir{*merge, insert_point};
-            ir.Barrier();
-        }
+        if (node.type == Type::Block && branch_depth == 0) {
+            EmitBarrierInBlock(node.data.block);
+        }
     }
 }
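
Note: the `EmitBarrierInBlock` hunk walks a basic block toggling two pending-hazard flags: a shared-memory write arms a barrier for the next read, and a read arms one for the next write, with consecutive reads (or writes) not re-triggering. Below is a minimal standalone sketch of that scan; `Op`, `Inst`, `Block`, and `CountBarriers` are hypothetical stand-ins for illustration, not shadPS4's real IR types.

```cpp
// Standalone sketch of the same-block barrier scan, using simplified
// stand-in types rather than shadPS4's IR classes.
#include <cstdio>
#include <vector>

enum class Op { LoadShared, WriteShared, Other };
struct Inst {
    Op op;
};
using Block = std::vector<Inst>;

// Walk the block in order. After a write, the next read needs a barrier;
// after a read, the next write needs one. Clearing the flag after emitting
// mirrors the emit_cond reset in the pass.
static int CountBarriers(const Block& block) {
    bool barrier_on_write = false; // pending: next write is preceded by a barrier
    bool barrier_on_read = false;  // pending: next read is preceded by a barrier
    int barriers = 0;
    for (const Inst& inst : block) {
        if (inst.op == Op::LoadShared) {
            if (barrier_on_read) {
                ++barriers;
                barrier_on_read = false;
            }
            barrier_on_write = true; // a later write hazards against this read
        } else if (inst.op == Op::WriteShared) {
            if (barrier_on_write) {
                ++barriers;
                barrier_on_write = false;
            }
            barrier_on_read = true; // a later read hazards against this write
        }
    }
    return barriers;
}

int main() {
    // write then read -> one barrier between them; the second read adds none.
    Block block{{Op::WriteShared}, {Op::Other}, {Op::LoadShared}, {Op::LoadShared}};
    std::printf("%d\n", CountBarriers(block)); // prints 1
}
```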
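Note: the reworked `SharedMemoryBarrierPass` loop only treats depth-0 `If` nodes as merge-barrier candidates and only scans depth-0 blocks, so barriers never land inside potentially divergent control flow. A minimal sketch of that depth tracking follows, again with hypothetical stand-ins (`Node`, `Type`, `Walk`) in place of the real `IR::AbstractSyntaxNode` walk.

```cpp
// Sketch of the pass driver's branch-depth tracking over a flattened
// syntax list. Node and Type are simplified stand-ins.
#include <cstdio>
#include <vector>

enum class Type { If, EndIf, Block };
struct Node {
    Type type;
};

static void Walk(const std::vector<Node>& syntax_list) {
    unsigned branch_depth = 0;
    for (const Node& node : syntax_list) {
        if (node.type == Type::EndIf) {
            --branch_depth;
            continue;
        }
        // Post-increment only runs for If nodes thanks to short-circuiting,
        // matching the pass's `node.type == Type::If && branch_depth++ == 0`.
        if (node.type == Type::If && branch_depth++ == 0) {
            // Top-level conditional: if its condition depends on
            // LocalInvocationId, a barrier goes in the merge block.
            std::puts("barrier candidate at merge of top-level if");
            continue;
        }
        if (node.type == Type::Block && branch_depth == 0) {
            // Uniform control flow: safe to scan for same-block hazards.
            std::puts("scan block for same-block write/read hazards");
        }
    }
}

int main() {
    // if (...) { block } endif; block -> one merge candidate, one block scan;
    // the block inside the conditional is skipped as divergent.
    Walk({{Type::If}, {Type::Block}, {Type::EndIf}, {Type::Block}});
}
```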