Implement shader resource tables (#1165)

* Implement shader resource tables

* fix after rebase + squash

* address some review comments

* fix pipeline_common

* cleanup debug stuff

* switch to using single codegenerator
baggins183 2024-10-31 23:55:53 -07:00 committed by GitHub
parent 7b16085c59
commit 9ec75c3feb
30 changed files with 740 additions and 119 deletions

View file

@ -590,6 +590,7 @@ set(SHADER_RECOMPILER src/shader_recompiler/exception.h
src/shader_recompiler/frontend/structured_control_flow.h
src/shader_recompiler/ir/passes/constant_propagation_pass.cpp
src/shader_recompiler/ir/passes/dead_code_elimination_pass.cpp
src/shader_recompiler/ir/passes/flatten_extended_userdata_pass.cpp
src/shader_recompiler/ir/passes/identity_removal_pass.cpp
src/shader_recompiler/ir/passes/ir_passes.h
src/shader_recompiler/ir/passes/lower_shared_mem_to_registers.cpp

View file

@ -13,6 +13,15 @@ DecoderImpl::DecoderImpl() {
DecoderImpl::~DecoderImpl() = default;
std::string DecoderImpl::disassembleInst(ZydisDecodedInstruction& inst,
ZydisDecodedOperand* operands, u64 address) {
const int bufLen = 256;
char szBuffer[bufLen];
ZydisFormatterFormatInstruction(&m_formatter, &inst, operands, inst.operand_count_visible,
szBuffer, sizeof(szBuffer), address, ZYAN_NULL);
return szBuffer;
}
void DecoderImpl::printInstruction(void* code, u64 address) {
ZydisDecodedInstruction instruction;
ZydisDecodedOperand operands[ZYDIS_MAX_OPERAND_COUNT_VISIBLE];
@ -27,11 +36,8 @@ void DecoderImpl::printInstruction(void* code, u64 address) {
void DecoderImpl::printInst(ZydisDecodedInstruction& inst, ZydisDecodedOperand* operands,
u64 address) {
const int bufLen = 256;
char szBuffer[bufLen];
ZydisFormatterFormatInstruction(&m_formatter, &inst, operands, inst.operand_count_visible,
szBuffer, sizeof(szBuffer), address, ZYAN_NULL);
fmt::print("instruction: {}\n", szBuffer);
std::string s = disassembleInst(inst, operands, address);
fmt::print("instruction: {}\n", s);
}
ZyanStatus DecoderImpl::decodeInstruction(ZydisDecodedInstruction& inst,

View file

@ -14,6 +14,8 @@ public:
DecoderImpl();
~DecoderImpl();
std::string disassembleInst(ZydisDecodedInstruction& inst, ZydisDecodedOperand* operands,
u64 address);
void printInst(ZydisDecodedInstruction& inst, ZydisDecodedOperand* operands, u64 address);
void printInstruction(void* code, u64 address);
ZyanStatus decodeInstruction(ZydisDecodedInstruction& inst, ZydisDecodedOperand* operands,

src/common/hash.h (new file, 14 additions)
View file

@ -0,0 +1,14 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#pragma once
#include "common/types.h"
[[nodiscard]] inline u64 HashCombine(const u64 seed, const u64 hash) {
return seed ^ (hash + 0x9e3779b9 + (seed << 12) + (seed >> 4));
}
[[nodiscard]] inline u32 HashCombine(const u32 seed, const u32 hash) {
return seed ^ (hash + 0x9e3779b9 + (seed << 6) + (seed >> 2));
}
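
Both overloads are the classic boost-style combiner: the new value acts as the seed and is xor-mixed with the running hash plus the golden-ratio constant and two shifted copies of the seed. A minimal sketch of the intended call pattern, mirroring how value.cpp and srt_gvn_table.h below chain it (field names hypothetical):

    u64 h = HashCombine(static_cast<u64>(first_field), 0ULL); // start the chain from zero
    h = HashCombine(static_cast<u64>(second_field), h);       // fold each further field into h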

View file

@ -1,6 +1,7 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include "common/assert.h"
#include "shader_recompiler/backend/spirv/emit_spirv_instructions.h"
#include "shader_recompiler/backend/spirv/spirv_emit_context.h"
@ -146,9 +147,14 @@ void EmitGetGotoVariable(EmitContext&) {
UNREACHABLE_MSG("Unreachable instruction");
}
Id EmitReadConst(EmitContext& ctx) {
return ctx.u32_zero_value;
UNREACHABLE_MSG("Unreachable instruction");
Id EmitReadConst(EmitContext& ctx, IR::Inst* inst) {
u32 flatbuf_off_dw = inst->Flags<u32>();
ASSERT(ctx.srt_flatbuf.binding >= 0);
ASSERT(flatbuf_off_dw > 0);
Id index = ctx.ConstU32(flatbuf_off_dw);
auto& buffer = ctx.srt_flatbuf;
const Id ptr{ctx.OpAccessChain(buffer.pointer_type, buffer.id, ctx.u32_zero_value, index)};
return ctx.OpLoad(ctx.U32[1], ptr);
}
Id EmitReadConstBuffer(EmitContext& ctx, u32 handle, Id index) {
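
In short: the flattening pass records a dword offset in the instruction's flags, and EmitReadConst lowers to an OpAccessChain into the srt_flatbuf UBO at that constant index followed by an OpLoad, so the shader reads exactly the dword the CPU-side walker wrote into flattened_ud_buf.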

View file

@ -62,7 +62,7 @@ void EmitSetVectorRegister(EmitContext& ctx);
void EmitSetGotoVariable(EmitContext& ctx);
void EmitGetGotoVariable(EmitContext& ctx);
void EmitSetScc(EmitContext& ctx);
Id EmitReadConst(EmitContext& ctx);
Id EmitReadConst(EmitContext& ctx, IR::Inst* inst);
Id EmitReadConstBuffer(EmitContext& ctx, u32 handle, Id index);
Id EmitLoadBufferU32(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);
Id EmitLoadBufferU32x2(EmitContext& ctx, IR::Inst* inst, u32 handle, Id address);

View file

@ -4,12 +4,14 @@
#include "common/assert.h"
#include "common/div_ceil.h"
#include "shader_recompiler/backend/spirv/spirv_emit_context.h"
#include "shader_recompiler/ir/passes/srt.h"
#include "video_core/amdgpu/types.h"
#include <boost/container/static_vector.hpp>
#include <fmt/format.h>
#include <numbers>
#include <string_view>
namespace Shader::Backend::SPIRV {
namespace {
@ -435,14 +437,16 @@ void EmitContext::DefinePushDataBlock() {
void EmitContext::DefineBuffers() {
boost::container::small_vector<Id, 8> type_ids;
const auto define_struct = [&](Id record_array_type, bool is_instance_data) {
const auto define_struct = [&](Id record_array_type, bool is_instance_data,
std::optional<std::string_view> explicit_name = {}) {
const Id struct_type{TypeStruct(record_array_type)};
if (std::ranges::find(type_ids, record_array_type.value, &Id::value) != type_ids.end()) {
return struct_type;
}
Decorate(record_array_type, spv::Decoration::ArrayStride, 4);
const auto name = is_instance_data ? fmt::format("{}_instance_data_f32", stage)
: fmt::format("{}_cbuf_block_f32", stage);
auto name = is_instance_data ? fmt::format("{}_instance_data_f32", stage)
: fmt::format("{}_cbuf_block_f32", stage);
name = explicit_name.value_or(name);
Name(struct_type, name);
Decorate(struct_type, spv::Decoration::Block);
MemberName(struct_type, 0, "data");
@ -451,6 +455,29 @@ void EmitContext::DefineBuffers() {
return struct_type;
};
if (info.has_readconst) {
const Id data_type = U32[1];
const auto storage_class = spv::StorageClass::Uniform;
const Id pointer_type = TypePointer(storage_class, data_type);
const Id record_array_type{
TypeArray(U32[1], ConstU32(static_cast<u32>(info.flattened_ud_buf.size())))};
const Id struct_type{define_struct(record_array_type, false, "srt_flatbuf_ty")};
const Id struct_pointer_type{TypePointer(storage_class, struct_type)};
const Id id{AddGlobalVariable(struct_pointer_type, storage_class)};
Decorate(id, spv::Decoration::Binding, binding.unified++);
Decorate(id, spv::Decoration::DescriptorSet, 0U);
Name(id, "srt_flatbuf_ubo");
srt_flatbuf = {
.id = id,
.binding = binding.buffer++,
.pointer_type = pointer_type,
};
interfaces.push_back(id);
}
for (const auto& desc : info.buffers) {
const auto sharp = desc.GetSharp(info);
const bool is_storage = desc.IsStorage(sharp);
@ -471,7 +498,7 @@ void EmitContext::DefineBuffers() {
if (is_storage && !desc.is_written) {
Decorate(id, spv::Decoration::NonWritable);
}
Name(id, fmt::format("{}_{}", is_storage ? "ssbo" : "cbuf", desc.sgpr_base));
Name(id, fmt::format("{}_{}", is_storage ? "ssbo" : "cbuf", desc.sharp_idx));
buffers.push_back({
.id = id,
@ -495,7 +522,7 @@ void EmitContext::DefineTextureBuffers() {
const Id id{AddGlobalVariable(pointer_type, spv::StorageClass::UniformConstant)};
Decorate(id, spv::Decoration::Binding, binding.unified++);
Decorate(id, spv::Decoration::DescriptorSet, 0U);
Name(id, fmt::format("{}_{}", desc.is_written ? "imgbuf" : "texbuf", desc.sgpr_base));
Name(id, fmt::format("{}_{}", desc.is_written ? "imgbuf" : "texbuf", desc.sharp_idx));
texture_buffers.push_back({
.id = id,
.binding = binding.buffer++,
@ -582,7 +609,7 @@ spv::ImageFormat GetFormat(const AmdGpu::Image& image) {
}
Id ImageType(EmitContext& ctx, const ImageResource& desc, Id sampled_type) {
const auto image = ctx.info.ReadUd<AmdGpu::Image>(desc.sgpr_base, desc.dword_offset);
const auto image = ctx.info.ReadUdSharp<AmdGpu::Image>(desc.sharp_idx);
const auto format = desc.is_atomic ? GetFormat(image) : spv::ImageFormat::Unknown;
const u32 sampled = desc.is_storage ? 2 : 1;
switch (desc.type) {
@ -618,8 +645,7 @@ void EmitContext::DefineImagesAndSamplers() {
const Id id{AddGlobalVariable(pointer_type, spv::StorageClass::UniformConstant)};
Decorate(id, spv::Decoration::Binding, binding.unified++);
Decorate(id, spv::Decoration::DescriptorSet, 0U);
Name(id, fmt::format("{}_{}{}_{:02x}", stage, "img", image_desc.sgpr_base,
image_desc.dword_offset));
Name(id, fmt::format("{}_{}{}", stage, "img", image_desc.sharp_idx));
images.push_back({
.data_types = &data_types,
.id = id,
@ -643,8 +669,7 @@ void EmitContext::DefineImagesAndSamplers() {
const Id id{AddGlobalVariable(sampler_pointer_type, spv::StorageClass::UniformConstant)};
Decorate(id, spv::Decoration::Binding, binding.unified++);
Decorate(id, spv::Decoration::DescriptorSet, 0U);
Name(id, fmt::format("{}_{}{}_{:02x}", stage, "samp", samp_desc.sgpr_base,
samp_desc.dword_offset));
Name(id, fmt::format("{}_{}{}", stage, "samp", samp_desc.sharp_idx));
samplers.push_back(id);
interfaces.push_back(id);
}

View file

@ -228,6 +228,7 @@ public:
Bindings& binding;
boost::container::small_vector<BufferDefinition, 16> buffers;
boost::container::small_vector<TextureBufferDefinition, 8> texture_buffers;
BufferDefinition srt_flatbuf;
boost::container::small_vector<TextureDefinition, 8> images;
boost::container::small_vector<Id, 4> samplers;

View file

@ -10,6 +10,10 @@ static constexpr u32 SQ_SRC_LITERAL = 0xFF;
void Translator::EmitScalarMemory(const GcnInst& inst) {
switch (inst.opcode) {
// SMRD
case Opcode::S_LOAD_DWORD:
return S_LOAD_DWORD(1, inst);
case Opcode::S_LOAD_DWORDX2:
return S_LOAD_DWORD(2, inst);
case Opcode::S_LOAD_DWORDX4:
return S_LOAD_DWORD(4, inst);
case Opcode::S_LOAD_DWORDX8:

View file

@ -388,7 +388,7 @@ void Translator::EmitFetch(const GcnInst& inst) {
IR::VectorReg dst_reg{attrib.dest_vgpr};
// Read the V# of the attribute to figure out component number and type.
const auto buffer = info.ReadUd<AmdGpu::Buffer>(attrib.sgpr_base, attrib.dword_offset);
const auto buffer = info.ReadUdReg<AmdGpu::Buffer>(attrib.sgpr_base, attrib.dword_offset);
for (u32 i = 0; i < 4; i++) {
const IR::F32 comp = [&] {
switch (buffer.GetSwizzle(i)) {
@ -418,8 +418,7 @@ void Translator::EmitFetch(const GcnInst& inst) {
if (step_rate == Info::VsInput::OverStepRate0 ||
step_rate == Info::VsInput::OverStepRate1) {
info.buffers.push_back({
.sgpr_base = attrib.sgpr_base,
.dword_offset = attrib.dword_offset,
.sharp_idx = info.srt_info.ReserveSharp(attrib.sgpr_base, attrib.dword_offset, 4),
.used_types = IR::Type::F32,
.is_instance_data = true,
});

View file

@ -2,7 +2,9 @@
// SPDX-License-Identifier: GPL-2.0-or-later
#pragma once
#include <algorithm>
#include <span>
#include <vector>
#include <boost/container/small_vector.hpp>
#include <boost/container/static_vector.hpp>
#include "common/assert.h"
@ -10,6 +12,7 @@
#include "shader_recompiler/backend/bindings.h"
#include "shader_recompiler/frontend/copy_shader.h"
#include "shader_recompiler/ir/attribute.h"
#include "shader_recompiler/ir/passes/srt.h"
#include "shader_recompiler/ir/reg.h"
#include "shader_recompiler/ir/type.h"
#include "shader_recompiler/params.h"
@ -36,8 +39,7 @@ constexpr u32 NUM_TEXTURE_TYPES = 7;
struct Info;
struct BufferResource {
u32 sgpr_base;
u32 dword_offset;
u32 sharp_idx;
IR::Type used_types;
AmdGpu::Buffer inline_cbuf;
bool is_gds_buffer{};
@ -53,8 +55,7 @@ struct BufferResource {
using BufferResourceList = boost::container::small_vector<BufferResource, 16>;
struct TextureBufferResource {
u32 sgpr_base;
u32 dword_offset;
u32 sharp_idx;
AmdGpu::NumberFormat nfmt;
bool is_written{};
@ -63,8 +64,7 @@ struct TextureBufferResource {
using TextureBufferResourceList = boost::container::small_vector<TextureBufferResource, 16>;
struct ImageResource {
u32 sgpr_base;
u32 dword_offset;
u32 sharp_idx;
AmdGpu::ImageType type;
AmdGpu::NumberFormat nfmt;
bool is_storage{};
@ -77,8 +77,7 @@ struct ImageResource {
using ImageResourceList = boost::container::small_vector<ImageResource, 16>;
struct SamplerResource {
u32 sgpr_base;
u32 dword_offset;
u32 sharp_idx;
AmdGpu::Sampler inline_sampler{};
u32 associated_image : 4;
u32 disable_aniso : 1;
@ -180,6 +179,9 @@ struct Info {
ImageResourceList images;
SamplerResourceList samplers;
PersistentSrtInfo srt_info;
std::vector<u32> flattened_ud_buf;
std::span<const u32> user_data;
Stage stage;
@ -199,6 +201,7 @@ struct Info {
bool uses_fp64{};
bool uses_step_rates{};
bool translation_failed{}; // indicates that shader has unsupported instructions
bool has_readconst{};
u8 mrt_mask{0u};
explicit Info(Stage stage_, ShaderParams params)
@ -206,7 +209,12 @@ struct Info {
user_data{params.user_data} {}
template <typename T>
T ReadUd(u32 ptr_index, u32 dword_offset) const noexcept {
inline T ReadUdSharp(u32 sharp_idx) const noexcept {
return *reinterpret_cast<const T*>(&flattened_ud_buf[sharp_idx]);
}
template <typename T>
T ReadUdReg(u32 ptr_index, u32 dword_offset) const noexcept {
T data;
const u32* base = user_data.data();
if (ptr_index != IR::NumScalarRegs) {
@ -228,7 +236,8 @@ struct Info {
}
void AddBindings(Backend::Bindings& bnd) const {
const auto total_buffers = buffers.size() + texture_buffers.size();
const auto total_buffers =
buffers.size() + texture_buffers.size() + (has_readconst ? 1 : 0);
bnd.buffer += total_buffers;
bnd.unified += total_buffers + images.size() + samplers.size();
bnd.user_data += ud_mask.NumRegs();
@ -245,22 +254,32 @@ struct Info {
}
return {vertex_offset, instance_offset};
}
void RefreshFlatBuf() {
flattened_ud_buf.resize(srt_info.flattened_bufsize_dw);
ASSERT(user_data.size() <= NumUserDataRegs);
std::memcpy(flattened_ud_buf.data(), user_data.data(), user_data.size_bytes());
// Run the JIT program to walk the SRT and write the leaves to a flat buffer
if (srt_info.walker_func) {
srt_info.walker_func(user_data.data(), flattened_ud_buf.data());
}
}
};
constexpr AmdGpu::Buffer BufferResource::GetSharp(const Info& info) const noexcept {
return inline_cbuf ? inline_cbuf : info.ReadUd<AmdGpu::Buffer>(sgpr_base, dword_offset);
return inline_cbuf ? inline_cbuf : info.ReadUdSharp<AmdGpu::Buffer>(sharp_idx);
}
constexpr AmdGpu::Buffer TextureBufferResource::GetSharp(const Info& info) const noexcept {
return info.ReadUd<AmdGpu::Buffer>(sgpr_base, dword_offset);
return info.ReadUdSharp<AmdGpu::Buffer>(sharp_idx);
}
constexpr AmdGpu::Image ImageResource::GetSharp(const Info& info) const noexcept {
return info.ReadUd<AmdGpu::Image>(sgpr_base, dword_offset);
return info.ReadUdSharp<AmdGpu::Image>(sharp_idx);
}
constexpr AmdGpu::Sampler SamplerResource::GetSharp(const Info& info) const noexcept {
return inline_sampler ? inline_sampler : info.ReadUd<AmdGpu::Sampler>(sgpr_base, dword_offset);
return inline_sampler ? inline_sampler : info.ReadUdSharp<AmdGpu::Sampler>(sharp_idx);
}
} // namespace Shader
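
Taken together, the walker and the flat buffer give sharps a stable home. A hedged sketch of the host-side call order, as the pipeline cache below performs it:

    info.RefreshFlatBuf(); // copy the 16 user-data regs, then run the JIT walker over guest memory
    const auto buffer = info.ReadUdSharp<AmdGpu::Buffer>(desc.sharp_idx); // sharps now come from the flat buffer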

View file

@ -118,6 +118,10 @@ std::string DumpBlock(const Block& block, const std::map<const Block*, size_t>&
} else {
ret += fmt::format(" {}", op); // '%00000 = ' -> 1 + 5 + 3 = 9 spaces
}
if (op == Opcode::ReadConst) {
ret += fmt::format(" (flags={}) ", inst.Flags<u32>());
}
const size_t arg_count{inst.NumArgs()};
for (size_t arg_index = 0; arg_index < arg_count; ++arg_index) {
const Value arg{inst.Arg(arg_index)};

View file

@ -11,34 +11,37 @@
namespace Shader::IR {
template <typename Pred>
auto BreadthFirstSearch(const Inst* inst, Pred&& pred) -> std::invoke_result_t<Pred, const Inst*> {
// Use typename Instruction so the function can be used to return either const or mutable
// Insts depending on the context.
template <typename Instruction, typename Pred>
auto BreadthFirstSearch(Instruction* inst, Pred&& pred)
-> std::invoke_result_t<Pred, Instruction*> {
// Most often, the instruction is already the desired one.
if (const std::optional result = pred(inst)) {
if (std::optional result = pred(inst)) {
return result;
}
// Breadth-first search visiting the right most arguments first
boost::container::small_vector<const Inst*, 2> visited;
std::queue<const Inst*> queue;
boost::container::small_vector<Instruction*, 2> visited;
std::queue<Instruction*> queue;
queue.push(inst);
while (!queue.empty()) {
// Pop one instruction from the queue
const Inst* const inst{queue.front()};
Instruction* inst{queue.front()};
queue.pop();
if (const std::optional result = pred(inst)) {
if (std::optional result = pred(inst)) {
// This is the instruction we were looking for
return result;
}
// Visit the right most arguments first
for (size_t arg = inst->NumArgs(); arg--;) {
const Value arg_value{inst->Arg(arg)};
Value arg_value{inst->Arg(arg)};
if (arg_value.IsImmediate()) {
continue;
}
// Queue instruction if it hasn't been visited
const Inst* const arg_inst{arg_value.InstRecursive()};
Instruction* arg_inst{arg_value.InstRecursive()};
if (std::ranges::find(visited, arg_inst) == visited.end()) {
visited.push_back(arg_inst);
queue.push(arg_inst);
@ -59,4 +62,13 @@ auto BreadthFirstSearch(const Value& value, Pred&& pred)
return BreadthFirstSearch(value.InstRecursive(), pred);
}
template <typename Pred>
auto BreadthFirstSearch(Value value, Pred&& pred) -> std::invoke_result_t<Pred, Inst*> {
if (value.IsImmediate()) {
// Nothing to do with immediates
return std::nullopt;
}
return BreadthFirstSearch(value.InstRecursive(), pred);
}
} // namespace Shader::IR
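
A small usage sketch of the now-mutable traversal, in the shape the flattening pass below uses it (the predicate here is hypothetical):

    const auto pred = [](IR::Inst* inst) -> std::optional<IR::Inst*> {
        if (inst->GetOpcode() == IR::Opcode::GetUserData) {
            return inst;
        }
        return std::nullopt;
    };
    // Returns the first matching Inst*, visiting right-most arguments first.
    std::optional<IR::Inst*> base = IR::BreadthFirstSearch(value, pred);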

View file

@ -0,0 +1,249 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include <unordered_map>
#include <boost/container/flat_map.hpp>
#include <xbyak/xbyak.h>
#include <xbyak/xbyak_util.h>
#include "common/config.h"
#include "common/io_file.h"
#include "common/logging/log.h"
#include "common/path_util.h"
#include "shader_recompiler/info.h"
#include "shader_recompiler/ir/breadth_first_search.h"
#include "shader_recompiler/ir/opcodes.h"
#include "shader_recompiler/ir/passes/srt.h"
#include "shader_recompiler/ir/program.h"
#include "shader_recompiler/ir/reg.h"
#include "shader_recompiler/ir/srt_gvn_table.h"
#include "shader_recompiler/ir/value.h"
#include "src/common/arch.h"
#include "src/common/decoder.h"
using namespace Xbyak::util;
static Xbyak::CodeGenerator g_srt_codegen(32_MB);
namespace {
static void DumpSrtProgram(const Shader::Info& info, const u8* code, size_t codesize) {
#ifdef ARCH_X86_64
using namespace Common::FS;
const auto dump_dir = GetUserPath(PathType::ShaderDir) / "dumps";
if (!std::filesystem::exists(dump_dir)) {
std::filesystem::create_directories(dump_dir);
}
const auto filename = fmt::format("{}_{:#018x}.srtprogram.txt", info.stage, info.pgm_hash);
const auto file = IOFile{dump_dir / filename, FileAccessMode::Write, FileType::TextFile};
u64 address = reinterpret_cast<u64>(code);
u64 code_end = address + codesize;
ZydisDecodedInstruction instruction;
ZydisDecodedOperand operands[ZYDIS_MAX_OPERAND_COUNT];
ZyanStatus status = ZYAN_STATUS_SUCCESS;
while (address < code_end && ZYAN_SUCCESS(Common::Decoder::Instance()->decodeInstruction(
instruction, operands, reinterpret_cast<void*>(address)))) {
std::string s =
Common::Decoder::Instance()->disassembleInst(instruction, operands, address);
s += "\n";
file.WriteString(s);
address += instruction.length;
}
#endif
}
using namespace Shader;
struct PassInfo {
// map offset to inst
using PtrUserList = boost::container::flat_map<u32, Shader::IR::Inst*>;
Optimization::SrtGvnTable gvn_table;
// keys are GetUserData or ReadConst instructions that are used as pointers
std::unordered_map<IR::Inst*, PtrUserList> pointer_uses;
// GetUserData instructions corresponding to sgpr_base of SRT roots
boost::container::small_flat_map<IR::ScalarReg, IR::Inst*, 1> srt_roots;
// pick a single inst for a given value number
std::unordered_map<u32, IR::Inst*> vn_to_inst;
// Bumped during codegen to assign offsets to readconsts
u32 dst_off_dw;
PtrUserList* GetUsesAsPointer(IR::Inst* inst) {
auto it = pointer_uses.find(inst);
if (it != pointer_uses.end()) {
return &it->second;
}
return nullptr;
}
// Return a single instruction that this instruction is identical to, according
// to value number
// The "original" is arbitrary. Here it's the first instruction found for a given value number
IR::Inst* DeduplicateInstruction(IR::Inst* inst) {
auto it = vn_to_inst.try_emplace(gvn_table.GetValueNumber(inst), inst);
return it.first->second;
}
};
} // namespace
namespace Shader::Optimization {
namespace {
static inline void PushPtr(Xbyak::CodeGenerator& c, u32 off_dw) {
c.push(rdi);
c.mov(rdi, ptr[rdi + (off_dw << 2)]);
c.mov(r10, 0xFFFFFFFFFFFFULL);
c.and_(rdi, r10);
}
static inline void PopPtr(Xbyak::CodeGenerator& c) {
c.pop(rdi);
};
static void VisitPointer(u32 off_dw, IR::Inst* subtree, PassInfo& pass_info,
Xbyak::CodeGenerator& c) {
PushPtr(c, off_dw);
PassInfo::PtrUserList* use_list = pass_info.GetUsesAsPointer(subtree);
ASSERT(use_list);
// First copy all the src data from this tree level
// That way, all data that was contiguous in the guest SRT is also contiguous in the
// flattened buffer.
// TODO src and dst are contiguous. Optimize with wider loads/stores
// TODO if this subtree is dynamically indexed, don't compact it (keep it sparse)
for (auto [src_off_dw, use] : *use_list) {
c.mov(r10d, ptr[rdi + (src_off_dw << 2)]);
c.mov(ptr[rsi + (pass_info.dst_off_dw << 2)], r10d);
use->SetFlags<u32>(pass_info.dst_off_dw);
pass_info.dst_off_dw++;
}
// Then visit any children used as pointers
for (const auto [src_off_dw, use] : *use_list) {
if (pass_info.GetUsesAsPointer(use)) {
VisitPointer(src_off_dw, use, pass_info, c);
}
}
PopPtr(c);
}
static void GenerateSrtProgram(Info& info, PassInfo& pass_info) {
Xbyak::CodeGenerator& c = g_srt_codegen;
if (info.srt_info.srt_reservations.empty() && pass_info.srt_roots.empty()) {
return;
}
info.srt_info.walker_func = c.getCurr<PFN_SrtWalker>();
pass_info.dst_off_dw = NumUserDataRegs;
// Special case for V# step rate buffers in fetch shader
for (const auto [sgpr_base, dword_offset, num_dwords] : info.srt_info.srt_reservations) {
// get pointer to V#
c.mov(r10, ptr[rdi + (sgpr_base << 2)]); // 64-bit pointer load; r10d would truncate the address
u32 src_off = dword_offset << 2;
for (auto j = 0; j < num_dwords; j++) {
c.mov(r11d, ptr[r10 + src_off]);
c.mov(ptr[rsi + (pass_info.dst_off_dw << 2)], r11d);
src_off += 4;
++pass_info.dst_off_dw;
}
}
ASSERT(pass_info.dst_off_dw == info.srt_info.flattened_bufsize_dw);
for (const auto& [sgpr_base, root] : pass_info.srt_roots) {
VisitPointer(static_cast<u32>(sgpr_base), root, pass_info, c);
}
c.ret();
c.ready();
if (Config::dumpShaders()) {
size_t codesize = c.getCurr() - reinterpret_cast<const u8*>(info.srt_info.walker_func);
DumpSrtProgram(info, reinterpret_cast<const u8*>(info.srt_info.walker_func), codesize);
}
info.srt_info.flattened_bufsize_dw = pass_info.dst_off_dw;
}
}; // namespace
void FlattenExtendedUserdataPass(IR::Program& program) {
Shader::Info& info = program.info;
PassInfo pass_info;
// traverse at end and assign offsets to duplicate readconsts, using
// vn_to_inst as the source
boost::container::small_vector<IR::Inst*, 32> all_readconsts;
for (auto r_it = program.post_order_blocks.rbegin(); r_it != program.post_order_blocks.rend();
r_it++) {
IR::Block* block = *r_it;
for (IR::Inst& inst : *block) {
if (inst.GetOpcode() == IR::Opcode::ReadConst) {
if (!inst.Arg(1).IsImmediate()) {
LOG_WARNING(Render_Recompiler, "ReadConst has non-immediate offset");
continue;
}
all_readconsts.push_back(&inst);
if (pass_info.DeduplicateInstruction(&inst) != &inst) {
// This is a duplicate of a readconst we've already visited
continue;
}
IR::Inst* ptr_composite = inst.Arg(0).InstRecursive();
const auto pred = [](IR::Inst* inst) -> std::optional<IR::Inst*> {
if (inst->GetOpcode() == IR::Opcode::GetUserData ||
inst->GetOpcode() == IR::Opcode::ReadConst) {
return inst;
}
return std::nullopt;
};
auto base0 = IR::BreadthFirstSearch(ptr_composite->Arg(0), pred);
auto base1 = IR::BreadthFirstSearch(ptr_composite->Arg(1), pred);
ASSERT_MSG(base0 && base1, "ReadConst not from constant memory");
IR::Inst* ptr_lo = base0.value();
ptr_lo = pass_info.DeduplicateInstruction(ptr_lo);
auto ptr_uses_kv =
pass_info.pointer_uses.try_emplace(ptr_lo, PassInfo::PtrUserList{});
PassInfo::PtrUserList& user_list = ptr_uses_kv.first->second;
user_list[inst.Arg(1).U32()] = &inst;
if (ptr_lo->GetOpcode() == IR::Opcode::GetUserData) {
IR::ScalarReg ud_reg = ptr_lo->Arg(0).ScalarReg();
pass_info.srt_roots[ud_reg] = ptr_lo;
}
}
}
}
GenerateSrtProgram(info, pass_info);
// Assign offsets to duplicate readconsts
for (IR::Inst* readconst : all_readconsts) {
ASSERT(pass_info.vn_to_inst.contains(pass_info.gvn_table.GetValueNumber(readconst)));
IR::Inst* original = pass_info.DeduplicateInstruction(readconst);
readconst->SetFlags<u32>(original->Flags<u32>());
}
info.RefreshFlatBuf();
}
} // namespace Shader::Optimization
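
For intuition, suppose one SRT root pointer lives in user-data sgprs 0..1 and two ReadConsts pull dwords 3 and 7 through it; the program emitted above then behaves like this hypothetical C++ equivalent (the offsets, like everything here, are per-shader):

    void PS4_SYSV_ABI Walker(const u32* user_data, u32* flat_dst) {
        // PushPtr: load the 64-bit guest pointer and mask it to 48 bits
        const u32* root = reinterpret_cast<const u32*>(
            *reinterpret_cast<const u64*>(&user_data[0]) & 0xFFFFFFFFFFFFULL);
        // VisitPointer: copy each used dword to the next free flat slot
        flat_dst[16] = root[3]; // dst_off_dw starts at NumUserDataRegs (16)
        flat_dst[17] = root[7]; // the two ReadConsts get flags 16 and 17
    }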

View file

@ -12,6 +12,7 @@ void SsaRewritePass(IR::BlockList& program);
void IdentityRemovalPass(IR::BlockList& program);
void DeadCodeEliminationPass(IR::Program& program);
void ConstantPropagationPass(IR::BlockList& program);
void FlattenExtendedUserdataPass(IR::Program& program);
void ResourceTrackingPass(IR::Program& program);
void CollectShaderInfoPass(IR::Program& program);
void LowerSharedMemToRegisters(IR::Program& program);

View file

@ -13,12 +13,7 @@
namespace Shader::Optimization {
namespace {
struct SharpLocation {
u32 sgpr_base;
u32 dword_offset;
auto operator<=>(const SharpLocation&) const = default;
};
using SharpLocation = u32;
bool IsBufferAtomic(const IR::Inst& inst) {
switch (inst.GetOpcode()) {
@ -155,9 +150,7 @@ public:
if (desc.is_gds_buffer && existing.is_gds_buffer) {
return true;
}
return desc.sgpr_base == existing.sgpr_base &&
desc.dword_offset == existing.dword_offset &&
desc.inline_cbuf == existing.inline_cbuf;
return desc.sharp_idx == existing.sharp_idx && desc.inline_cbuf == existing.inline_cbuf;
})};
auto& buffer = buffer_resources[index];
buffer.used_types |= desc.used_types;
@ -167,8 +160,7 @@ public:
u32 Add(const TextureBufferResource& desc) {
const u32 index{Add(texture_buffer_resources, desc, [&desc](const auto& existing) {
return desc.sgpr_base == existing.sgpr_base &&
desc.dword_offset == existing.dword_offset;
return desc.sharp_idx == existing.sharp_idx;
})};
auto& buffer = texture_buffer_resources[index];
buffer.is_written |= desc.is_written;
@ -177,8 +169,7 @@ public:
u32 Add(const ImageResource& desc) {
const u32 index{Add(image_resources, desc, [&desc](const auto& existing) {
return desc.sgpr_base == existing.sgpr_base &&
desc.dword_offset == existing.dword_offset;
return desc.sharp_idx == existing.sharp_idx;
})};
auto& image = image_resources[index];
image.is_storage |= desc.is_storage;
@ -187,8 +178,7 @@ public:
u32 Add(const SamplerResource& desc) {
const u32 index{Add(sampler_resources, desc, [this, &desc](const auto& existing) {
return desc.sgpr_base == existing.sgpr_base &&
desc.dword_offset == existing.dword_offset;
return desc.sharp_idx == existing.sharp_idx;
})};
return index;
}
@ -259,48 +249,25 @@ std::pair<const IR::Inst*, bool> TryDisableAnisoLod0(const IR::Inst* inst) {
return {prod2, true};
}
SharpLocation TrackSharp(const IR::Inst* inst) {
SharpLocation TrackSharp(const IR::Inst* inst, const Shader::Info& info) {
// Search until we find a potential sharp source.
const auto pred0 = [](const IR::Inst* inst) -> std::optional<const IR::Inst*> {
const auto pred = [](const IR::Inst* inst) -> std::optional<const IR::Inst*> {
if (inst->GetOpcode() == IR::Opcode::GetUserData ||
inst->GetOpcode() == IR::Opcode::ReadConst) {
return inst;
}
return std::nullopt;
};
const auto result = IR::BreadthFirstSearch(inst, pred0);
const auto result = IR::BreadthFirstSearch(inst, pred);
ASSERT_MSG(result, "Unable to track sharp source");
inst = result.value();
// If its from user data not much else to do.
if (inst->GetOpcode() == IR::Opcode::GetUserData) {
return SharpLocation{
.sgpr_base = u32(IR::ScalarReg::Max),
.dword_offset = u32(inst->Arg(0).ScalarReg()),
};
return static_cast<u32>(inst->Arg(0).ScalarReg());
} else {
ASSERT_MSG(inst->GetOpcode() == IR::Opcode::ReadConst,
"Sharp load not from constant memory");
return inst->Flags<u32>();
}
ASSERT_MSG(inst->GetOpcode() == IR::Opcode::ReadConst, "Sharp load not from constant memory");
// Retrieve offset from base.
const u32 dword_offset = inst->Arg(1).U32();
const IR::Inst* spgpr_base = inst->Arg(0).InstRecursive();
// Retrieve SGPR pair that holds sbase
const auto pred1 = [](const IR::Inst* inst) -> std::optional<IR::ScalarReg> {
ASSERT(inst->GetOpcode() != IR::Opcode::ReadConst);
if (inst->GetOpcode() == IR::Opcode::GetUserData) {
return inst->Arg(0).ScalarReg();
}
return std::nullopt;
};
const auto base0 = IR::BreadthFirstSearch(spgpr_base->Arg(0), pred1);
const auto base1 = IR::BreadthFirstSearch(spgpr_base->Arg(1), pred1);
ASSERT_MSG(base0 && base1, "Nested resource loads not supported");
// Return retrieved location.
return SharpLocation{
.sgpr_base = u32(base0.value()),
.dword_offset = dword_offset,
};
}
s32 TryHandleInlineCbuf(IR::Inst& inst, Info& info, Descriptors& descriptors,
@ -327,8 +294,7 @@ s32 TryHandleInlineCbuf(IR::Inst& inst, Info& info, Descriptors& descriptors,
cbuf = std::bit_cast<AmdGpu::Buffer>(buffer);
// Assign a binding to this sharp.
return descriptors.Add(BufferResource{
.sgpr_base = std::numeric_limits<u32>::max(),
.dword_offset = 0,
.sharp_idx = std::numeric_limits<u32>::max(),
.used_types = BufferDataType(inst, cbuf.GetNumberFmt()),
.inline_cbuf = cbuf,
});
@ -341,11 +307,10 @@ void PatchBufferInstruction(IR::Block& block, IR::Inst& inst, Info& info,
if (binding = TryHandleInlineCbuf(inst, info, descriptors, buffer); binding == -1) {
IR::Inst* handle = inst.Arg(0).InstRecursive();
IR::Inst* producer = handle->Arg(0).InstRecursive();
const auto sharp = TrackSharp(producer);
buffer = info.ReadUd<AmdGpu::Buffer>(sharp.sgpr_base, sharp.dword_offset);
const auto sharp = TrackSharp(producer, info);
buffer = info.ReadUdSharp<AmdGpu::Buffer>(sharp);
binding = descriptors.Add(BufferResource{
.sgpr_base = sharp.sgpr_base,
.dword_offset = sharp.dword_offset,
.sharp_idx = sharp,
.used_types = BufferDataType(inst, buffer.GetNumberFmt()),
.is_written = IsBufferStore(inst),
});
@ -404,11 +369,10 @@ void PatchTextureBufferInstruction(IR::Block& block, IR::Inst& inst, Info& info,
Descriptors& descriptors) {
const IR::Inst* handle = inst.Arg(0).InstRecursive();
const IR::Inst* producer = handle->Arg(0).InstRecursive();
const auto sharp = TrackSharp(producer);
const auto buffer = info.ReadUd<AmdGpu::Buffer>(sharp.sgpr_base, sharp.dword_offset);
const auto sharp = TrackSharp(producer, info);
const auto buffer = info.ReadUdSharp<AmdGpu::Buffer>(sharp);
const s32 binding = descriptors.Add(TextureBufferResource{
.sgpr_base = sharp.sgpr_base,
.dword_offset = sharp.dword_offset,
.sharp_idx = sharp,
.nfmt = buffer.GetNumberFmt(),
.is_written = inst.GetOpcode() == IR::Opcode::StoreBufferFormatF32,
});
@ -456,18 +420,16 @@ void PatchImageSampleInstruction(IR::Block& block, IR::Inst& inst, Info& info,
if (handle.IsImmediate()) {
LOG_WARNING(Render_Vulkan, "Inline sampler detected");
return descriptors.Add(SamplerResource{
.sgpr_base = std::numeric_limits<u32>::max(),
.dword_offset = 0,
.sharp_idx = std::numeric_limits<u32>::max(),
.inline_sampler = AmdGpu::Sampler{.raw0 = handle.U32()},
});
}
// Normal sampler resource.
const auto ssharp_handle = handle.InstRecursive();
const auto& [ssharp_ud, disable_aniso] = TryDisableAnisoLod0(ssharp_handle);
const auto ssharp = TrackSharp(ssharp_ud);
const auto ssharp = TrackSharp(ssharp_ud, info);
return descriptors.Add(SamplerResource{
.sgpr_base = ssharp.sgpr_base,
.dword_offset = ssharp.dword_offset,
.sharp_idx = ssharp,
.associated_image = image_binding,
.disable_aniso = disable_aniso,
});
@ -647,9 +609,9 @@ void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descrip
const auto tsharp_handle = has_sampler ? producer->Arg(0).InstRecursive() : producer;
// Read image sharp.
const auto tsharp = TrackSharp(tsharp_handle);
const auto tsharp = TrackSharp(tsharp_handle, info);
const auto inst_info = inst.Flags<IR::TextureInstInfo>();
auto image = info.ReadUd<AmdGpu::Image>(tsharp.sgpr_base, tsharp.dword_offset);
auto image = info.ReadUdSharp<AmdGpu::Image>(tsharp);
if (!image.Valid()) {
LOG_ERROR(Render_Vulkan, "Shader compiled with unbound image!");
image = AmdGpu::Image::Null();
@ -658,8 +620,7 @@ void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descrip
const bool is_storage = IsImageStorageInstruction(inst);
const auto type = image.IsPartialCubemap() ? AmdGpu::ImageType::Color2DArray : image.GetType();
u32 image_binding = descriptors.Add(ImageResource{
.sgpr_base = tsharp.sgpr_base,
.dword_offset = tsharp.dword_offset,
.sharp_idx = tsharp,
.type = type,
.nfmt = image.GetNumberFmt(),
.is_storage = is_storage,
@ -763,6 +724,7 @@ void PatchDataRingInstruction(IR::Block& block, IR::Inst& inst, Info& info,
void ResourceTrackingPass(IR::Program& program) {
// Iterate resource instructions and patch them after finding the sharp.
auto& info = program.info;
Descriptors descriptors{info};
for (IR::Block* const block : program.blocks) {
for (IR::Inst& inst : block->Instructions()) {

View file

@ -63,6 +63,9 @@ void Visit(Info& info, IR::Inst& inst) {
case IR::Opcode::LaneId:
info.uses_lane_id = true;
break;
case IR::Opcode::ReadConst:
info.has_readconst = true;
break;
default:
break;
}

View file

@ -0,0 +1,37 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#pragma once
#include <boost/container/set.hpp>
#include <boost/container/small_vector.hpp>
#include "common/types.h"
namespace Shader {
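// SysV calling convention so the JIT-ed walker can assume its arguments arrive in rdi/rsi on every host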
using PFN_SrtWalker = void PS4_SYSV_ABI (*)(const u32* /*user_data*/, u32* /*flat_dst*/);
struct PersistentSrtInfo {
// Special case when fetch shader uses step rates.
struct SrtSharpReservation {
u32 sgpr_base;
u32 dword_offset;
u32 num_dwords;
};
PFN_SrtWalker walker_func{};
boost::container::small_vector<SrtSharpReservation, 2> srt_reservations;
u32 flattened_bufsize_dw = 16; // NumUserDataRegs
// Special case for fetch shaders because we don't generate IR to read from step rate buffers,
// so we won't see usage with GetUserData/ReadConst.
// Reserve space in the flattened buffer for a sharp ahead of time
u32 ReserveSharp(u32 sgpr_base, u32 dword_offset, u32 num_dwords) {
u32 rv = flattened_bufsize_dw;
srt_reservations.emplace_back(sgpr_base, dword_offset, num_dwords);
flattened_bufsize_dw += num_dwords;
return rv;
}
};
} // namespace Shader
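
EmitFetch above is the lone caller: it reserves four dwords per step-rate V# with ReserveSharp(attrib.sgpr_base, attrib.dword_offset, 4) and stores the returned index as the buffer's sharp_idx, so the walker copies the V# into the flat buffer even though no ReadConst IR exists for it.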

View file

@ -0,0 +1,157 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include <unordered_map>
#include <boost/container/set.hpp>
#include <boost/container/small_vector.hpp>
#include "common/assert.h"
#include "common/hash.h"
#include "common/types.h"
#include "shader_recompiler/ir/breadth_first_search.h"
#include "shader_recompiler/ir/opcodes.h"
#include "shader_recompiler/ir/value.h"
namespace Shader::Optimization {
// Does global value numbering on a subset of instructions that are used
// for loads from shader resource tables.
// Inspiration from spirv-opt
class SrtGvnTable {
public:
using ValueNumberTable = std::unordered_map<IR::Value, u32>;
using ValueNum = u32;
SrtGvnTable() : value_numbers(), next_num(0) {}
u32 GetValueNumber(IR::Inst* inst) {
return GetValueNumber(IR::Value{inst});
}
u32 GetValueNumber(IR::Value v) {
v = v.Resolve();
if (auto it = value_numbers.find(v); it != value_numbers.end()) {
return it->second;
}
if (auto inst = v.TryInstRecursive()) {
return ComputeInstValueNumber(inst);
}
return NextValueNumber(v);
}
private:
u32 ComputeInstValueNumber(IR::Inst* inst) {
ASSERT(!value_numbers.contains(
IR::Value(inst))); // Should always be checking before calling this function
if (inst->MayHaveSideEffects()) {
return NextValueNumber(IR::Value(inst));
}
u32 vn;
switch (inst->GetOpcode()) {
case IR::Opcode::Phi: {
// hack to get to parity with main
// Need to fix ssa_rewrite pass to remove certain phis
std::optional<IR::Value> source = TryRemoveTrivialPhi(inst);
if (!source) {
const auto pred = [](IR::Inst* inst) -> std::optional<IR::Inst*> {
if (inst->GetOpcode() == IR::Opcode::GetUserData ||
inst->GetOpcode() == IR::Opcode::CompositeConstructU32x2 ||
inst->GetOpcode() == IR::Opcode::ReadConst) {
return inst;
}
return std::nullopt;
};
source = IR::BreadthFirstSearch(inst, pred).transform([](auto inst) {
return IR::Value{inst};
});
ASSERT(source);
}
vn = GetValueNumber(source.value());
value_numbers[IR::Value(inst)] = vn;
break;
}
case IR::Opcode::GetUserData:
case IR::Opcode::CompositeConstructU32x2:
case IR::Opcode::ReadConst: {
InstVector iv = MakeInstVector(inst);
if (auto it = iv_to_vn.find(iv); it != iv_to_vn.end()) {
vn = it->second;
value_numbers[IR::Value(inst)] = vn;
} else {
vn = NextValueNumber(IR::Value(inst));
iv_to_vn.emplace(std::move(iv), vn);
}
break;
}
default:
vn = NextValueNumber(IR::Value(inst));
break;
}
return vn;
}
u32 NextValueNumber(IR::Value v) {
u32 rv = next_num++;
value_numbers[v] = rv;
return rv;
}
ValueNumberTable value_numbers;
u32 next_num;
using InstVector = boost::container::small_vector<u32, 8>;
InstVector MakeInstVector(IR::Inst* inst) {
ASSERT(inst->GetOpcode() != IR::Opcode::Identity);
InstVector iv;
iv.reserve(2 + inst->NumArgs());
iv.push_back(static_cast<u32>(inst->GetOpcode()));
iv.push_back(inst->Flags<u32>());
for (auto i = 0; i < inst->NumArgs(); i++) {
iv.push_back(GetValueNumber(inst->Arg(i)));
}
return iv;
}
// Temp workaround for something like this:
// [0000555558a5baf8] %297 = Phi [ %24, {Block $1} ], [ %297, {Block $5} ] (uses: 4)
// [0000555558a4e038] %305 = CompositeConstructU32x2 %297, %296 (uses: 4)
// [0000555558a4e0a8] %306 = ReadConst %305, #0 (uses: 2)
// Should probably be fixed in ssa_rewrite
std::optional<IR::Value> TryRemoveTrivialPhi(IR::Inst* phi) {
IR::Value single_source{};
for (auto i = 0; i < phi->NumArgs(); i++) {
IR::Value v = phi->Arg(i).Resolve();
if (v == IR::Value(phi)) {
continue;
}
if (!single_source.IsEmpty() && single_source != v) {
return std::nullopt;
}
single_source = v;
}
ASSERT(!single_source.IsEmpty());
phi->ReplaceUsesWith(single_source);
return single_source;
}
struct HashInstVector {
size_t operator()(const InstVector& iv) const {
u32 h = 0;
for (auto vn : iv) {
h = HashCombine(vn, h);
}
return h;
}
};
std::unordered_map<InstVector, u32, HashInstVector> iv_to_vn;
};
} // namespace Shader::Optimization
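
A minimal sketch of how the flattening pass uses the table: two ReadConsts whose base chains and flags match receive the same value number, so only the first is assigned a flat-buffer slot and the duplicates copy its flags.

    Optimization::SrtGvnTable gvn;
    // inst_a / inst_b: hypothetical ReadConsts with identical base chains and offsets
    const bool duplicates = gvn.GetValueNumber(inst_a) == gvn.GetValueNumber(inst_b);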

View file

@ -1,7 +1,9 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#include <string>
#include <cstddef>
#include <string_view>
#include "common/hash.h"
#include "shader_recompiler/ir/value.h"
namespace Shader::IR {
@ -97,3 +99,52 @@ bool Value::operator!=(const Value& other) const {
}
} // namespace Shader::IR
namespace std {
std::size_t hash<Shader::IR::Value>::operator()(const Shader::IR::Value& v) const {
using namespace Shader::IR;
u64 h = HashCombine(static_cast<u64>(v.type), 0ULL);
switch (v.type) {
case Type::Void:
return h;
case Type::Opaque:
return reinterpret_cast<u64>(v.InstRecursive());
case Type::ScalarReg:
return HashCombine(static_cast<u64>(v.sreg), h);
case Type::VectorReg:
return HashCombine(static_cast<u64>(v.vreg), h);
case Type::Attribute:
return HashCombine(static_cast<u64>(v.attribute), h);
case Type::U1:
return HashCombine(static_cast<u64>(v.imm_u1), h);
case Type::U8:
return HashCombine(static_cast<u64>(v.imm_u8), h);
case Type::U16:
case Type::F16:
return HashCombine(static_cast<u64>(v.imm_u16), h);
case Type::U32:
case Type::F32:
return HashCombine(static_cast<u64>(v.imm_u32), h);
case Type::U64:
case Type::F64:
return HashCombine(static_cast<u64>(v.imm_u64), h);
case Type::U32x2:
case Type::U32x3:
case Type::U32x4:
case Type::F16x2:
case Type::F16x3:
case Type::F16x4:
case Type::F32x2:
case Type::F32x3:
case Type::F32x4:
case Type::F64x2:
case Type::F64x3:
case Type::F64x4:
default:
break;
}
UNREACHABLE_MSG("Invalid type {}", v.type);
}
} // namespace std

View file

@ -29,6 +29,7 @@ class Value {
public:
Value() noexcept = default;
explicit Value(IR::Inst* value) noexcept;
explicit Value(const IR::Inst* value) noexcept;
explicit Value(IR::ScalarReg reg) noexcept;
explicit Value(IR::VectorReg reg) noexcept;
explicit Value(IR::Attribute value) noexcept;
@ -82,6 +83,8 @@ private:
f64 imm_f64;
const char* string_literal;
};
friend class std::hash<Value>;
};
static_assert(static_cast<u32>(IR::Type::Void) == 0, "memset relies on IR::Type being zero");
static_assert(std::is_trivially_copyable_v<Value>);
@ -364,3 +367,10 @@ inline const char* Value::StringLiteral() const {
}
} // namespace Shader::IR
namespace std {
template <>
struct hash<Shader::IR::Value> {
std::size_t operator()(const Shader::IR::Value& v) const;
};
} // namespace std

View file

@ -64,6 +64,7 @@ IR::Program TranslateProgram(std::span<const u32> code, Pools& pools, Info& info
Shader::Optimization::LowerSharedMemToRegisters(program);
}
Shader::Optimization::RingAccessElimination(program, runtime_info, program.info.stage);
Shader::Optimization::FlattenExtendedUserdataPass(program);
Shader::Optimization::ResourceTrackingPass(program);
Shader::Optimization::IdentityRemovalPass(program.blocks);
Shader::Optimization::DeadCodeEliminationPass(program);

View file

@ -8,6 +8,7 @@
#include "common/types.h"
#include "shader_recompiler/backend/bindings.h"
#include "shader_recompiler/info.h"
#include "shader_recompiler/ir/passes/srt.h"
namespace Shader {
@ -52,6 +53,9 @@ struct StageSpecialization {
Backend::Bindings start_)
: info{&info_}, runtime_info{runtime_info_}, start{start_} {
u32 binding{};
if (info->has_readconst) {
binding++;
}
ForEachSharp(binding, buffers, info->buffers,
[](auto& spec, const auto& desc, AmdGpu::Buffer sharp) {
spec.stride = sharp.GetStride();
@ -90,6 +94,12 @@ struct StageSpecialization {
return false;
}
u32 binding{};
if (info->has_readconst != other.info->has_readconst) {
return false;
}
if (info->has_readconst) {
binding++;
}
for (u32 i = 0; i < buffers.size(); i++) {
if (other.bitset[binding++] && buffers[i] != other.buffers[i]) {
return false;

View file

@ -4,6 +4,7 @@
#include <algorithm>
#include "common/alignment.h"
#include "common/scope_exit.h"
#include "common/types.h"
#include "shader_recompiler/info.h"
#include "video_core/amdgpu/liverpool.h"
#include "video_core/buffer_cache/buffer_cache.h"
@ -156,7 +157,7 @@ bool BufferCache::BindVertexBuffers(const Shader::Info& vs_info) {
continue;
}
const auto& buffer = vs_info.ReadUd<AmdGpu::Buffer>(input.sgpr_base, input.dword_offset);
const auto& buffer = vs_info.ReadUdReg<AmdGpu::Buffer>(input.sgpr_base, input.dword_offset);
if (buffer.GetSize() == 0) {
continue;
}
@ -301,6 +302,14 @@ void BufferCache::InlineData(VAddr address, const void* value, u32 num_bytes, bo
cmdbuf.updateBuffer(buffer->Handle(), buf_barrier.offset, num_bytes, value);
}
std::pair<Buffer*, u32> BufferCache::ObtainHostUBO(std::span<const u32> data) {
static constexpr u64 StreamThreshold = CACHING_PAGESIZE;
ASSERT(data.size_bytes() <= StreamThreshold);
const u64 offset = stream_buffer.Copy(reinterpret_cast<VAddr>(data.data()), data.size_bytes(),
instance.UniformMinAlignment());
return {&stream_buffer, offset};
}
std::pair<Buffer*, u32> BufferCache::ObtainBuffer(VAddr device_addr, u32 size, bool is_written,
bool is_texel_buffer, BufferId buffer_id) {
// For small uniform buffers that have not been modified by gpu

View file

@ -84,6 +84,8 @@ public:
/// Writes a value to GPU buffer.
void InlineData(VAddr address, const void* value, u32 num_bytes, bool is_gds);
[[nodiscard]] std::pair<Buffer*, u32> ObtainHostUBO(std::span<const u32> data);
/// Obtains a buffer for the specified region.
[[nodiscard]] std::pair<Buffer*, u32> ObtainBuffer(VAddr gpu_addr, u32 size, bool is_written,
bool is_texel_buffer = false,

View file

@ -25,6 +25,15 @@ ComputePipeline::ComputePipeline(const Instance& instance_, Scheduler& scheduler
u32 binding{};
boost::container::small_vector<vk::DescriptorSetLayoutBinding, 32> bindings;
if (info->has_readconst) {
bindings.push_back({
.binding = binding++,
.descriptorType = vk::DescriptorType::eUniformBuffer,
.descriptorCount = 1,
.stageFlags = vk::ShaderStageFlagBits::eCompute,
});
}
for (const auto& buffer : info->buffers) {
const auto sharp = buffer.GetSharp(*info);
bindings.push_back({

View file

@ -60,7 +60,7 @@ GraphicsPipeline::GraphicsPipeline(const Instance& instance_, Scheduler& schedul
}
const auto buffer =
vs_info->ReadUd<AmdGpu::Buffer>(input.sgpr_base, input.dword_offset);
vs_info->ReadUdReg<AmdGpu::Buffer>(input.sgpr_base, input.dword_offset);
if (buffer.GetSize() == 0) {
continue;
}
@ -327,6 +327,15 @@ void GraphicsPipeline::BuildDescSetLayout() {
if (!stage) {
continue;
}
if (stage->has_readconst) {
bindings.push_back({
.binding = binding++,
.descriptorType = vk::DescriptorType::eUniformBuffer,
.descriptorCount = 1,
.stageFlags = gp_stage_flags,
});
}
for (const auto& buffer : stage->buffers) {
const auto sharp = buffer.GetSharp(*stage);
bindings.push_back({

View file

@ -4,6 +4,7 @@
#include <ranges>
#include "common/config.h"
#include "common/hash.h"
#include "common/io_file.h"
#include "common/path_util.h"
#include "shader_recompiler/backend/spirv/emit_spirv.h"
@ -22,10 +23,6 @@ namespace Vulkan {
using Shader::VsOutput;
[[nodiscard]] inline u64 HashCombine(const u64 seed, const u64 hash) {
return seed ^ (hash + 0x9e3779b9 + (seed << 6) + (seed >> 2));
}
constexpr static std::array DescriptorHeapSizes = {
vk::DescriptorPoolSize{vk::DescriptorType::eUniformBuffer, 8192},
vk::DescriptorPoolSize{vk::DescriptorType::eStorageBuffer, 1024},
@ -351,7 +348,7 @@ bool PipelineCache::RefreshGraphicsKey() {
continue;
}
const auto& buffer =
vs_info->ReadUd<AmdGpu::Buffer>(input.sgpr_base, input.dword_offset);
vs_info->ReadUdReg<AmdGpu::Buffer>(input.sgpr_base, input.dword_offset);
if (buffer.GetSize() == 0) {
continue;
}
@ -424,7 +421,8 @@ std::tuple<const Shader::Info*, vk::ShaderModule, u64> PipelineCache::GetProgram
}
Program* program = it_pgm->second;
const auto& info = program->info;
auto& info = program->info;
info.RefreshFlatBuf();
const auto spec = Shader::StageSpecialization(info, runtime_info, binding);
size_t perm_idx = program->modules.size();
vk::ShaderModule module{};

View file

@ -57,6 +57,22 @@ void Pipeline::BindBuffers(VideoCore::BufferCache& buffer_cache,
}
}
// Bind the flattened user data buffer as a UBO so it's accessible to the shader
if (stage.has_readconst) {
const auto [vk_buffer, offset] = buffer_cache.ObtainHostUBO(stage.flattened_ud_buf);
buffer_infos.emplace_back(vk_buffer->Handle(), offset,
stage.flattened_ud_buf.size() * sizeof(u32));
set_writes.push_back({
.dstSet = VK_NULL_HANDLE,
.dstBinding = binding.unified++,
.dstArrayElement = 0,
.descriptorCount = 1,
.descriptorType = vk::DescriptorType::eUniformBuffer,
.pBufferInfo = &buffer_infos.back(),
});
++binding.buffer;
}
// Second pass to re-bind buffers that were updated after binding
for (u32 i = 0; i < buffer_bindings.size(); i++) {
const auto& [buffer_id, vsharp] = buffer_bindings[i];

View file

@ -12,6 +12,10 @@
#include "video_core/texture_cache/texture_cache.h"
#include "vk_rasterizer.h"
#ifdef MemoryBarrier
#undef MemoryBarrier
#endif
namespace Vulkan {
Rasterizer::Rasterizer(const Instance& instance_, Scheduler& scheduler_,