Merge pull request #225 from shadps4-emu/stabilization/10

Various fixes and improvements
This commit is contained in:
georgemoralis 2024-07-01 13:09:11 +03:00 committed by GitHub
commit 1f83824a8a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
14 changed files with 220 additions and 38 deletions

View file

@ -8,6 +8,7 @@
#include "common/slot_vector.h"
#include "core/libraries/error_codes.h"
#include "core/libraries/gnmdriver/gnmdriver.h"
#include "core/libraries/kernel/libkernel.h"
#include "core/libraries/libs.h"
#include "core/libraries/videoout/video_out.h"
#include "core/platform.h"
@ -212,12 +213,61 @@ static constexpr std::array InitSequence350{
0xc0016900u, 0x2aau, 0xffu,
};
static_assert(InitSequence350.size() == 0x7c);
static constexpr std::array CtxInitSequence{
0xc0012800u, 0x80000000u, 0x80000000u,
0xc0001200u, 0u,
0xc0002f00u, 1u,
0xc0016900u, 0x102u, 0u,
0xc0016900u, 0x202u, 0xcc0010u,
0xc0111000u, 0u
};
static_assert(CtxInitSequence.size() == 0x0f);
static constexpr std::array CtxInitSequence400{
0xc0012800u, 0x80000000u, 0x80000000u,
0xc0001200u, 0u,
0xc0016900u, 0x2f9u, 0x2du,
0xc0016900u, 0x282u, 8u,
0xc0016900u, 0x280u, 0x80008u,
0xc0016900u, 0x281u, 0xffff0000u,
0xc0016900u, 0x204u, 0u,
0xc0016900u, 0x206u, 0x43fu,
0xc0016900u, 0x83u, 0xffffu,
0xc0016900u, 0x317u, 0x10u,
0xc0016900u, 0x2fau, 0x3f800000u,
0xc0016900u, 0x2fcu, 0x3f800000u,
0xc0016900u, 0x2fbu, 0x3f800000u,
0xc0016900u, 0x2fdu, 0x3f800000u,
0xc0016900u, 0x202u, 0xcc0010u,
0xc0016900u, 0x30eu, 0xffffffffu,
0xc0016900u, 0x30fu, 0xffffffffu,
0xc0002f00u, 1u,
0xc0016900u, 0x1b1u, 2u,
0xc0016900u, 0x101u, 0u,
0xc0016900u, 0x100u, 0xffffffffu,
0xc0016900u, 0x103u, 0u,
0xc0016900u, 0x284u, 0u,
0xc0016900u, 0x290u, 0u,
0xc0016900u, 0x2aeu, 0u,
0xc0016900u, 0x102u, 0u,
0xc0016900u, 0x292u, 0u,
0xc0016900u, 0x293u, 0x6020000u,
0xc0016900u, 0x2f8u, 0u,
0xc0016900u, 0x2deu, 0x1e9u,
0xc0036900u, 0x295u, 0x100u, 0x100u, 4u,
0xc0016900u, 0x2aau, 0xffu,
0xc09e1000u,
};
static_assert(CtxInitSequence400.size() == 0x61);
// clang-format on
// In case if `submitDone` is issued we need to block submissions until GPU idle
static u32 submission_lock{};
static std::mutex m_submission{};
static u64 frames_submitted{}; // frame counter
static u64 frames_submitted{}; // frame counter
static bool send_init_packet{true}; // initialize HW state before first game's submit in a frame
static int sdk_version{0};
struct AscQueueInfo {
VAddr map_addr;
@ -664,9 +714,10 @@ u32 PS4_SYSV_ABI sceGnmDrawInitDefaultHardwareState175(u32* cmdbuf, u32 size) {
cmdbuf = ClearContextState(cmdbuf);
std::memcpy(cmdbuf, InitSequence175.data(), InitSequence175.size() * 4);
cmdbuf += InitSequence175.size();
cmdbuf[0x7f] = 0xc07f1000;
cmdbuf[0x80] = 0;
constexpr auto cmdbuf_left = HwInitPacketSize - InitSequence175.size() - 0xc - 1;
WriteTrailingNop<cmdbuf_left>(cmdbuf);
return HwInitPacketSize;
}
@ -723,14 +774,28 @@ u32 PS4_SYSV_ABI sceGnmDrawInitDefaultHardwareState350(u32* cmdbuf, u32 size) {
return SetupContext350(cmdbuf, size, true);
}
int PS4_SYSV_ABI sceGnmDrawInitToDefaultContextState() {
LOG_ERROR(Lib_GnmDriver, "(STUBBED) called");
return ORBIS_OK;
u32 PS4_SYSV_ABI sceGnmDrawInitToDefaultContextState(u32* cmdbuf, u32 size) {
LOG_TRACE(Lib_GnmDriver, "called");
constexpr auto CtxInitPacketSize = 0x20u;
if (size != CtxInitPacketSize) {
return 0;
}
std::memcpy(cmdbuf, CtxInitSequence.data(), CtxInitSequence.size() * 4);
return CtxInitPacketSize;
}
int PS4_SYSV_ABI sceGnmDrawInitToDefaultContextState400() {
LOG_ERROR(Lib_GnmDriver, "(STUBBED) called");
return ORBIS_OK;
u32 PS4_SYSV_ABI sceGnmDrawInitToDefaultContextState400(u32* cmdbuf, u32 size) {
LOG_TRACE(Lib_GnmDriver, "called");
constexpr auto CtxInitPacketSize = 0x100u;
if (size != CtxInitPacketSize) {
return 0;
}
std::memcpy(cmdbuf, CtxInitSequence400.data(), CtxInitSequence400.size() * 4);
return CtxInitPacketSize;
}
int PS4_SYSV_ABI sceGnmDrawOpaqueAuto() {
@ -1873,6 +1938,17 @@ s32 PS4_SYSV_ABI sceGnmSubmitCommandBuffers(u32 count, const u32* dcb_gpu_addrs[
submission_lock = 0;
}
if (send_init_packet) {
if (sdk_version <= 0x1ffffffu) {
liverpool->SubmitGfx(InitSequence, {});
} else if (sdk_version <= 0x3ffffffu) {
liverpool->SubmitGfx(InitSequence200, {});
} else {
liverpool->SubmitGfx(InitSequence350, {});
}
send_init_packet = false;
}
for (auto cbpair = 0u; cbpair < count; ++cbpair) {
const auto* ccb = ccb_gpu_addrs ? ccb_gpu_addrs[cbpair] : nullptr;
const auto ccb_size_in_bytes = ccb_sizes_in_bytes ? ccb_sizes_in_bytes[cbpair] : 0;
@ -1915,6 +1991,7 @@ int PS4_SYSV_ABI sceGnmSubmitDone() {
submission_lock = true;
}
liverpool->NotifySubmitDone();
send_init_packet = true;
++frames_submitted;
return ORBIS_OK;
}
@ -2388,6 +2465,11 @@ void RegisterlibSceGnmDriver(Core::Loader::SymbolsResolver* sym) {
liverpool = std::make_unique<AmdGpu::Liverpool>();
renderer = std::make_unique<Vulkan::RendererVulkan>(*g_window, liverpool.get());
const int result = sceKernelGetCompiledSdkVersion(&sdk_version);
if (result != ORBIS_OK) {
sdk_version = 0;
}
LIB_FUNCTION("b0xyllnVY-I", "libSceGnmDriver", 1, "libSceGnmDriver", 1, 1, sceGnmAddEqEvent);
LIB_FUNCTION("b08AgtPlHPg", "libSceGnmDriver", 1, "libSceGnmDriver", 1, 1,
sceGnmAreSubmitsAllowed);

View file

@ -60,8 +60,8 @@ u32 PS4_SYSV_ABI sceGnmDrawInitDefaultHardwareState(u32* cmdbuf, u32 size);
u32 PS4_SYSV_ABI sceGnmDrawInitDefaultHardwareState175(u32* cmdbuf, u32 size);
u32 PS4_SYSV_ABI sceGnmDrawInitDefaultHardwareState200(u32* cmdbuf, u32 size);
u32 PS4_SYSV_ABI sceGnmDrawInitDefaultHardwareState350(u32* cmdbuf, u32 size);
int PS4_SYSV_ABI sceGnmDrawInitToDefaultContextState();
int PS4_SYSV_ABI sceGnmDrawInitToDefaultContextState400();
u32 PS4_SYSV_ABI sceGnmDrawInitToDefaultContextState(u32* cmdbuf, u32 size);
u32 PS4_SYSV_ABI sceGnmDrawInitToDefaultContextState400(u32* cmdbuf, u32 size);
int PS4_SYSV_ABI sceGnmDrawOpaqueAuto();
int PS4_SYSV_ABI sceGnmDriverCaptureInProgress();
int PS4_SYSV_ABI sceGnmDriverInternalRetrieveGnmInterface();

View file

@ -161,7 +161,7 @@ int PS4_SYSV_ABI sceKernelGetCompiledSdkVersion(int* ver) {
int version = param_sfo->GetInteger("SYSTEM_VER");
LOG_INFO(Kernel, "returned system version = {:#x}", version);
*ver = version;
return ORBIS_OK;
return (version > 0) ? ORBIS_OK : ORBIS_KERNEL_ERROR_EINVAL;
}
s64 PS4_SYSV_ABI ps4__read(int d, void* buf, u64 nbytes) {

View file

@ -30,6 +30,7 @@ typedef struct {
} OrbisKernelUuid;
int* PS4_SYSV_ABI __Error();
int PS4_SYSV_ABI sceKernelGetCompiledSdkVersion(int* ver);
void LibKernel_Register(Core::Loader::SymbolsResolver* sym);

View file

@ -42,10 +42,6 @@ Emulator::Emulator() : window{WindowWidth, WindowHeight, controller} {
// Start logger.
Common::Log::Initialize();
Common::Log::Start();
// Initialize kernel and library facilities.
Libraries::Kernel::init_pthreads();
Libraries::InitHLELibs(&linker->GetHLESymbols());
}
Emulator::~Emulator() {
@ -93,6 +89,10 @@ void Emulator::Run(const std::filesystem::path& file) {
const auto& mount_temp_dir = Common::FS::GetUserPath(Common::FS::PathType::TempDataDir) / id;
mnt->Mount(mount_temp_dir, "/temp0"); // called in app_content ==> stat/mkdir
// Initialize kernel and library facilities.
Libraries::Kernel::init_pthreads();
Libraries::InitHLELibs(&linker->GetHLESymbols());
// Load the module with the linker
linker->LoadModule(file);

View file

@ -2,7 +2,9 @@
// SPDX-License-Identifier: GPL-2.0-or-later
#include <algorithm>
#include <deque>
#include <boost/container/small_vector.hpp>
#include "shader_recompiler/ir/basic_block.h"
#include "shader_recompiler/ir/ir_emitter.h"
#include "shader_recompiler/ir/program.h"
@ -250,11 +252,25 @@ IR::Value PatchCubeCoord(IR::IREmitter& ir, const IR::Value& s, const IR::Value&
}
void PatchImageInstruction(IR::Block& block, IR::Inst& inst, Info& info, Descriptors& descriptors) {
IR::Inst* producer = inst.Arg(0).InstRecursive();
ASSERT(producer->GetOpcode() ==
IR::Opcode::CompositeConstructU32x2 || // IMAGE_SAMPLE (image+sampler)
producer->GetOpcode() == IR::Opcode::ReadConst || // IMAGE_LOAD (image only)
producer->GetOpcode() == IR::Opcode::GetUserData);
std::deque<IR::Inst*> insts{&inst};
const auto& pred = [](auto opcode) -> bool {
return (opcode == IR::Opcode::CompositeConstructU32x2 || // IMAGE_SAMPLE (image+sampler)
opcode == IR::Opcode::ReadConst || // IMAGE_LOAD (image only)
opcode == IR::Opcode::GetUserData);
};
IR::Inst* producer{};
while (!insts.empty() && (producer = insts.front(), !pred(producer->GetOpcode()))) {
for (auto arg_idx = 0u; arg_idx < producer->NumArgs(); ++arg_idx) {
const auto arg = producer->Arg(arg_idx);
if (arg.TryInstRecursive()) {
insts.push_back(arg.InstRecursive());
}
}
insts.pop_front();
}
ASSERT(pred(producer->GetOpcode()));
const auto [tsharp_handle, ssharp_handle] = [&] -> std::pair<IR::Inst*, IR::Inst*> {
if (producer->GetOpcode() == IR::Opcode::CompositeConstructU32x2) {
return std::make_pair(producer->Arg(0).InstRecursive(),

View file

@ -518,13 +518,14 @@ struct PM4CmdEventWriteEos {
struct PM4WriteConstRam {
PM4Type3Header header;
union {
BitField<0, 16, u32> offset; // in DWs
BitField<0, 16, u32> offset; ///< Starting DW granularity offset into the constant RAM.
///< Thus, bits[1:0] are zero.
u32 dw1;
};
u32 data[0];
[[nodiscard]] u32 Offset() const {
return offset.Value() << 2u;
return offset.Value();
}
[[nodiscard]] u32 Size() const {

View file

@ -3,6 +3,7 @@
set(SHADER_FILES
detile_m8x1.comp
detile_m8x2.comp
detile_m32x1.comp
detile_m32x2.comp
detile_m32x4.comp

View file

@ -0,0 +1,61 @@
// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#version 450
// NOTE: Current subgroup utilization is subotimal on most GPUs, so
// it will be nice to process two tiles at once here.
layout (local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
layout(std430, binding = 0) buffer input_buf {
uint in_data[];
};
layout(rg8ui, binding = 1) uniform writeonly uimage2D output_img;
layout(push_constant) uniform image_info {
uint pitch;
} info;
#define MICRO_TILE_DIM 8
#define TEXELS_PER_ELEMENT 2
// Inverse morton LUT, small enough to fit into K$
uint rmort[16] = {
0x11011000, 0x31213020,
0x13031202, 0x33233222,
0x51415040, 0x71617060,
0x53435242, 0x73637262,
0x15051404, 0x35253424,
0x17071606, 0x37273626,
0x55455444, 0x75657464,
0x57475646, 0x77677666,
};
void main() {
uint src_tx = in_data[gl_GlobalInvocationID.x];
uint p[TEXELS_PER_ELEMENT] = {
(src_tx >> 16) & 0xffff,
src_tx & 0xffff
};
uint bit_ofs = 8 * TEXELS_PER_ELEMENT * (gl_LocalInvocationID.x % 4);
uint packed_pos = rmort[gl_LocalInvocationID.x >> 1] >> bit_ofs;
uint col = bitfieldExtract(packed_pos, 4, 4);
uint row = bitfieldExtract(packed_pos, 0, 4);
uint tiles_per_pitch = info.pitch >> 3; // log2(MICRO_TILE_DIM)
uint target_tile_x = gl_WorkGroupID.x % tiles_per_pitch;
uint target_tile_y = gl_WorkGroupID.x / tiles_per_pitch;
uint dw_ofs_x = target_tile_x * MICRO_TILE_DIM + col;
uint dw_ofs_y = target_tile_y * MICRO_TILE_DIM + row;
ivec2 img_pos = ivec2(dw_ofs_x, dw_ofs_y);
#pragma unroll
for (int ofs = 0; ofs < TEXELS_PER_ELEMENT; ++ofs) {
uint p0 = (p[ofs] >> 8) & 0xff;
uint p1 = p[ofs] & 0xff;
imageStore(output_img, img_pos + ivec2(ofs, 0), uvec4(p0, p1, 0, 0));
}
}

View file

@ -325,6 +325,18 @@ vk::Format SurfaceFormat(AmdGpu::DataFormat data_format, AmdGpu::NumberFormat nu
num_format == AmdGpu::NumberFormat::Sint) {
return vk::Format::eR16G16B16A16Sint;
}
if (data_format == AmdGpu::DataFormat::Format16_16 &&
num_format == AmdGpu::NumberFormat::Float) {
return vk::Format::eR16G16Sfloat;
}
if (data_format == AmdGpu::DataFormat::Format10_11_11 &&
num_format == AmdGpu::NumberFormat::Float) {
return vk::Format::eB10G11R11UfloatPack32;
}
if (data_format == AmdGpu::DataFormat::Format2_10_10_10 &&
num_format == AmdGpu::NumberFormat::Unorm) {
return vk::Format::eA2B10G10R10UnormPack32;
}
if (data_format == AmdGpu::DataFormat::FormatBc7 && num_format == AmdGpu::NumberFormat::Srgb) {
return vk::Format::eBc7SrgbBlock;
}
@ -490,6 +502,8 @@ vk::SampleCountFlagBits NumSamples(u32 num_samples) {
return vk::SampleCountFlagBits::e2;
case 4:
return vk::SampleCountFlagBits::e4;
case 8:
return vk::SampleCountFlagBits::e8;
default:
UNREACHABLE();
}

View file

@ -122,9 +122,8 @@ void PipelineCache::RefreshGraphicsKey() {
key.depth.depth_enable.Assign(key.depth_format != vk::Format::eUndefined);
}
// TODO: Should be a check for `OperationMode::Disable` once we emulate HW state init packet
// sent by system software.
const auto skip_cb_binding = false;
const auto skip_cb_binding =
regs.color_control.mode == AmdGpu::Liverpool::ColorControl::OperationMode::Disable;
// `RenderingInfo` is assumed to be initialized with a contiguous array of valid color
// attachments. This might be not a case as HW color buffers can be bound in an arbitrary order.

View file

@ -91,9 +91,13 @@ static vk::ImageUsageFlags ImageUsageFlags(const ImageInfo& info) {
usage |= vk::ImageUsageFlagBits::eColorAttachment;
}
}
if (info.is_tiled || info.usage.storage) {
usage |= vk::ImageUsageFlagBits::eStorage;
}
// In cases where an image is created as a render/depth target and cleared with compute,
// we cannot predict whether it will be used as a storage image. A proper solution would
// involve re-creating the resource with a new configuration and copying previous content into
// it. However, for now, we will set storage usage for all images (if the format allows),
// sacrificing a bit of performance. Note use of ExtendedUsage flag set by default.
usage |= vk::ImageUsageFlagBits::eStorage;
return usage;
}
@ -217,7 +221,8 @@ Image::Image(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_,
: instance{&instance_}, scheduler{&scheduler_}, info{info_},
image{instance->GetDevice(), instance->GetAllocator()}, cpu_addr{cpu_addr},
cpu_addr_end{cpu_addr + info.guest_size_bytes} {
vk::ImageCreateFlags flags{vk::ImageCreateFlagBits::eMutableFormat};
vk::ImageCreateFlags flags{vk::ImageCreateFlagBits::eMutableFormat |
vk::ImageCreateFlagBits::eExtendedUsage};
if (info.type == vk::ImageType::e2D && info.resources.layers >= 6 &&
info.size.width == info.size.height) {
flags |= vk::ImageCreateFlagBits::eCubeCompatible;
@ -225,11 +230,8 @@ Image::Image(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_,
if (info.type == vk::ImageType::e3D) {
flags |= vk::ImageCreateFlagBits::e2DArrayCompatible;
}
if (info.is_tiled) {
flags |= vk::ImageCreateFlagBits::eExtendedUsage;
if (info.IsBlockCoded()) {
flags |= vk::ImageCreateFlagBits::eBlockTexelViewCompatible;
}
if (info.IsBlockCoded()) {
flags |= vk::ImageCreateFlagBits::eBlockTexelViewCompatible;
}
usage = ImageUsageFlags(info);

View file

@ -12,6 +12,7 @@
#include "video_core/host_shaders/detile_m32x2_comp.h"
#include "video_core/host_shaders/detile_m32x4_comp.h"
#include "video_core/host_shaders/detile_m8x1_comp.h"
#include "video_core/host_shaders/detile_m8x2_comp.h"
#include <boost/container/static_vector.hpp>
#include <magic_enum.hpp>
@ -177,6 +178,8 @@ vk::Format DemoteImageFormatForDetiling(vk::Format format) {
switch (format) {
case vk::Format::eR8Unorm:
return vk::Format::eR8Uint;
case vk::Format::eR8G8Unorm:
return vk::Format::eR8G8Uint;
case vk::Format::eR8G8B8A8Srgb:
[[fallthrough]];
case vk::Format::eB8G8R8A8Srgb:
@ -207,6 +210,8 @@ const DetilerContext* TileManager::GetDetiler(const Image& image) const {
switch (format) {
case vk::Format::eR8Uint:
return &detilers[DetilerType::Micro8x1];
case vk::Format::eR8G8Uint:
return &detilers[DetilerType::Micro8x2];
case vk::Format::eR32Uint:
return &detilers[DetilerType::Micro32x1];
case vk::Format::eR32G32Uint:
@ -229,9 +234,8 @@ TileManager::TileManager(const Vulkan::Instance& instance, Vulkan::Scheduler& sc
staging{instance, scheduler, StagingFlags, 64_MB, Vulkan::BufferType::Upload} {
static const std::array detiler_shaders{
HostShaders::DETILE_M8X1_COMP,
HostShaders::DETILE_M32X1_COMP,
HostShaders::DETILE_M32X2_COMP,
HostShaders::DETILE_M8X1_COMP, HostShaders::DETILE_M8X2_COMP,
HostShaders::DETILE_M32X1_COMP, HostShaders::DETILE_M32X2_COMP,
HostShaders::DETILE_M32X4_COMP,
};

View file

@ -19,6 +19,7 @@ vk::Format DemoteImageFormatForDetiling(vk::Format format);
enum DetilerType : u32 {
Micro8x1,
Micro8x2,
Micro32x1,
Micro32x2,
Micro32x4,