Mirror of https://github.com/shadps4-emu/shadPS4.git
renderer_vulkan: Introduce shader HLE system with copy shader implementation. (#1683)
* renderer_vulkan: Introduce shader HLE system with copy shader implementation.
* buffer_cache: Handle obtaining buffer views partially within buffers.
* vk_shader_hle: Make more efficient

Co-authored-by: TheTurtle <47210458+raphaelthegreat@users.noreply.github.com>
This commit is contained in:
parent 41fd1c84cf
commit e5e1aba241
CMakeLists.txt
@@ -738,6 +738,8 @@ set(VIDEO_CORE src/video_core/amdgpu/liverpool.cpp
                src/video_core/renderer_vulkan/vk_resource_pool.h
                src/video_core/renderer_vulkan/vk_scheduler.cpp
                src/video_core/renderer_vulkan/vk_scheduler.h
+               src/video_core/renderer_vulkan/vk_shader_hle.cpp
+               src/video_core/renderer_vulkan/vk_shader_hle.h
                src/video_core/renderer_vulkan/vk_shader_util.cpp
                src/video_core/renderer_vulkan/vk_shader_util.h
                src/video_core/renderer_vulkan/vk_swapchain.cpp
src/video_core/buffer_cache/buffer_cache.cpp
@@ -360,7 +360,8 @@ std::pair<Buffer*, u32> BufferCache::ObtainBuffer(VAddr device_addr, u32 size, b
     return {&buffer, buffer.Offset(device_addr)};
 }
 
-std::pair<Buffer*, u32> BufferCache::ObtainViewBuffer(VAddr gpu_addr, u32 size) {
+std::pair<Buffer*, u32> BufferCache::ObtainViewBuffer(VAddr gpu_addr, u32 size, bool prefer_gpu) {
+    // Check if any buffer contains the full requested range.
     const u64 page = gpu_addr >> CACHING_PAGEBITS;
     const BufferId buffer_id = page_table[page];
     if (buffer_id) {
@@ -370,6 +371,13 @@ std::pair<Buffer*, u32> BufferCache::ObtainViewBuffer(VAddr gpu_addr, u32 size)
             return {&buffer, buffer.Offset(gpu_addr)};
         }
     }
+    // If no buffer contains the full requested range but some buffer within was GPU-modified,
+    // fall back to ObtainBuffer to create a full buffer and avoid losing GPU modifications.
+    // This is only done if the request prefers to use GPU memory, otherwise we can skip it.
+    if (prefer_gpu && memory_tracker.IsRegionGpuModified(gpu_addr, size)) {
+        return ObtainBuffer(gpu_addr, size, false, false);
+    }
+    // In all other cases, just do a CPU copy to the staging buffer.
     const u32 offset = staging_buffer.Copy(gpu_addr, size, 16);
     return {&staging_buffer, offset};
 }
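The new `prefer_gpu` path is easiest to see in isolation. Below is a minimal, compilable sketch of the three-way decision the function now makes; `FindFullyContainingBuffer`, `ObtainFullBuffer`, and `CopyToStagingBuffer` are invented stand-ins for the cache internals, not the real shadPS4 API:

```cpp
#include <cstdint>
#include <optional>
#include <utility>

using u32 = std::uint32_t;
using VAddr = std::uint64_t;

struct Buffer {};

// Stubbed cache internals (placeholders for illustration only).
std::optional<std::pair<Buffer*, u32>> FindFullyContainingBuffer(VAddr addr, u32 size) {
    return std::nullopt; // pretend the range is only partially cached
}
bool IsRegionGpuModified(VAddr addr, u32 size) {
    return true;
}
std::pair<Buffer*, u32> ObtainFullBuffer(VAddr addr, u32 size) {
    static Buffer gpu_buffer;
    return {&gpu_buffer, 0};
}
std::pair<Buffer*, u32> CopyToStagingBuffer(VAddr addr, u32 size) {
    static Buffer staging;
    return {&staging, 0};
}

std::pair<Buffer*, u32> ObtainViewBufferSketch(VAddr gpu_addr, u32 size, bool prefer_gpu) {
    // 1. Fast path: a cached buffer already spans the whole requested range.
    if (auto hit = FindFullyContainingBuffer(gpu_addr, size)) {
        return *hit;
    }
    // 2. Partial coverage, but the GPU wrote somewhere in the range: materialize
    //    a full buffer so those writes are not lost (only when the caller asks).
    if (prefer_gpu && IsRegionGpuModified(gpu_addr, size)) {
        return ObtainFullBuffer(gpu_addr, size);
    }
    // 3. Otherwise a plain CPU copy into the staging buffer is cheapest.
    return CopyToStagingBuffer(gpu_addr, size);
}

int main() {
    auto [buf, offset] = ObtainViewBufferSketch(0x1000, 256, /*prefer_gpu=*/true);
    (void)buf;
    return offset == 0 ? 0 : 1;
}
```

Because `prefer_gpu` is checked first, callers that do not care about GPU memory never pay for the `IsRegionGpuModified` scan, which is exactly what the added comment in the hunk describes.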
src/video_core/buffer_cache/buffer_cache.h
@@ -96,7 +96,8 @@ public:
                                                    BufferId buffer_id = {});
 
     /// Attempts to obtain a buffer without modifying the cache contents.
-    [[nodiscard]] std::pair<Buffer*, u32> ObtainViewBuffer(VAddr gpu_addr, u32 size);
+    [[nodiscard]] std::pair<Buffer*, u32> ObtainViewBuffer(VAddr gpu_addr, u32 size,
+                                                           bool prefer_gpu);
 
     /// Return true when a region is registered on the cache
     [[nodiscard]] bool IsRegionRegistered(VAddr addr, size_t size);
src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -8,6 +8,7 @@
 #include "video_core/renderer_vulkan/vk_instance.h"
 #include "video_core/renderer_vulkan/vk_rasterizer.h"
 #include "video_core/renderer_vulkan/vk_scheduler.h"
+#include "video_core/renderer_vulkan/vk_shader_hle.h"
 #include "video_core/texture_cache/image_view.h"
 #include "video_core/texture_cache/texture_cache.h"
 #include "vk_rasterizer.h"
@@ -318,6 +319,11 @@ void Rasterizer::DispatchDirect() {
         return;
     }
 
+    const auto& cs = pipeline->GetStage(Shader::Stage::Compute);
+    if (ExecuteShaderHLE(cs, liverpool->regs, *this)) {
+        return;
+    }
+
     if (!BindResources(pipeline)) {
         return;
     }
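Note where the hook sits: after the pipeline validity checks but before `BindResources`, so a recognized shader skips descriptor binding and the real dispatch entirely. A stripped-down, compilable illustration of the pattern (the types and control flow here are reduced stand-ins, not the real signatures):

```cpp
#include <cstdint>

struct ShaderInfo {
    std::uint64_t pgm_hash;
};

// Returns true when the shader was recognized and emulated with native commands.
bool ExecuteShaderHLE(const ShaderInfo& cs) {
    return cs.pgm_hash == 0xfefebf9f; // only the copy shader is matched so far
}

void DispatchDirect(const ShaderInfo& cs) {
    if (ExecuteShaderHLE(cs)) {
        return; // work already recorded; skip BindResources and the real dispatch
    }
    // ... normal path: BindResources(pipeline), then cmdbuf.dispatch(...)
}

int main() {
    DispatchDirect(ShaderInfo{0xfefebf9f}); // taken over by the HLE path
    DispatchDirect(ShaderInfo{0x12345678}); // falls through to a normal dispatch
}
```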
src/video_core/renderer_vulkan/vk_rasterizer.h
@@ -28,6 +28,14 @@ public:
                AmdGpu::Liverpool* liverpool);
     ~Rasterizer();
 
+    [[nodiscard]] Scheduler& GetScheduler() noexcept {
+        return scheduler;
+    }
+
+    [[nodiscard]] VideoCore::BufferCache& GetBufferCache() noexcept {
+        return buffer_cache;
+    }
+
     [[nodiscard]] VideoCore::TextureCache& GetTextureCache() noexcept {
         return texture_cache;
     }
src/video_core/renderer_vulkan/vk_scheduler.h
@@ -10,6 +10,10 @@
 #include "video_core/renderer_vulkan/vk_master_semaphore.h"
 #include "video_core/renderer_vulkan/vk_resource_pool.h"
 
+namespace tracy {
+class VkCtxScope;
+}
+
 namespace Vulkan {
 
 class Instance;
src/video_core/renderer_vulkan/vk_shader_hle.cpp (new file, 139 lines)

// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later

#include "shader_recompiler/info.h"
#include "video_core/renderer_vulkan/vk_scheduler.h"
#include "video_core/renderer_vulkan/vk_shader_hle.h"

#include "vk_rasterizer.h"

namespace Vulkan {

static constexpr u64 COPY_SHADER_HASH = 0xfefebf9f;

bool ExecuteCopyShaderHLE(const Shader::Info& info, const AmdGpu::Liverpool::Regs& regs,
                          Rasterizer& rasterizer) {
    auto& scheduler = rasterizer.GetScheduler();
    auto& buffer_cache = rasterizer.GetBufferCache();

    // Copy shader defines three formatted buffers as inputs: control, source, and destination.
    const auto ctl_buf_sharp = info.texture_buffers[0].GetSharp(info);
    const auto src_buf_sharp = info.texture_buffers[1].GetSharp(info);
    const auto dst_buf_sharp = info.texture_buffers[2].GetSharp(info);
    const auto buf_stride = src_buf_sharp.GetStride();
    ASSERT(buf_stride == dst_buf_sharp.GetStride());

    struct CopyShaderControl {
        u32 dst_idx;
        u32 src_idx;
        u32 end;
    };
    static_assert(sizeof(CopyShaderControl) == 12);
    ASSERT(ctl_buf_sharp.GetStride() == sizeof(CopyShaderControl));
    const auto ctl_buf = reinterpret_cast<const CopyShaderControl*>(ctl_buf_sharp.base_address);

    static std::vector<vk::BufferCopy> copies;
    copies.clear();
    copies.reserve(regs.cs_program.dim_x);

    for (u32 i = 0; i < regs.cs_program.dim_x; i++) {
        const auto& [dst_idx, src_idx, end] = ctl_buf[i];
        const u32 local_dst_offset = dst_idx * buf_stride;
        const u32 local_src_offset = src_idx * buf_stride;
        const u32 local_size = (end + 1) * buf_stride;
        copies.emplace_back(local_src_offset, local_dst_offset, local_size);
    }

    scheduler.EndRendering();

    static constexpr vk::MemoryBarrier READ_BARRIER{
        .srcAccessMask = vk::AccessFlagBits::eMemoryWrite,
        .dstAccessMask = vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite,
    };
    static constexpr vk::MemoryBarrier WRITE_BARRIER{
        .srcAccessMask = vk::AccessFlagBits::eTransferWrite,
        .dstAccessMask = vk::AccessFlagBits::eMemoryRead | vk::AccessFlagBits::eMemoryWrite,
    };
    scheduler.CommandBuffer().pipelineBarrier(
        vk::PipelineStageFlagBits::eAllCommands, vk::PipelineStageFlagBits::eTransfer,
        vk::DependencyFlagBits::eByRegion, READ_BARRIER, {}, {});

    static constexpr vk::DeviceSize MaxDistanceForMerge = 64_MB;
    u32 batch_start = 0;
    u32 batch_end = 1;

    while (batch_end < copies.size()) {
        // Place first copy into the current batch
        const auto& copy = copies[batch_start];
        auto src_offset_min = copy.srcOffset;
        auto src_offset_max = copy.srcOffset + copy.size;
        auto dst_offset_min = copy.dstOffset;
        auto dst_offset_max = copy.dstOffset + copy.size;

        for (int i = batch_start + 1; i < copies.size(); i++) {
            // Compute new src and dst bounds if we were to batch this copy
            const auto [src_offset, dst_offset, size] = copies[i];
            auto new_src_offset_min = std::min(src_offset_min, src_offset);
            auto new_src_offset_max = std::max(src_offset_max, src_offset + size);
            if (new_src_offset_max - new_src_offset_min > MaxDistanceForMerge) {
                continue;
            }

            auto new_dst_offset_min = std::min(dst_offset_min, dst_offset);
            auto new_dst_offset_max = std::max(dst_offset_max, dst_offset + size);
            if (new_dst_offset_max - new_dst_offset_min > MaxDistanceForMerge) {
                continue;
            }

            // We can batch this copy
            src_offset_min = new_src_offset_min;
            src_offset_max = new_src_offset_max;
            dst_offset_min = new_dst_offset_min;
            dst_offset_max = new_dst_offset_max;
            if (i != batch_end) {
                std::swap(copies[i], copies[batch_end]);
            }
            ++batch_end;
        }

        // Obtain buffers for the total source and destination ranges.
        const auto [src_buf, src_buf_offset] =
            buffer_cache.ObtainBuffer(src_buf_sharp.base_address + src_offset_min,
                                      src_offset_max - src_offset_min, false, false);
        const auto [dst_buf, dst_buf_offset] =
            buffer_cache.ObtainBuffer(dst_buf_sharp.base_address + dst_offset_min,
                                      dst_offset_max - dst_offset_min, true, false);

        // Apply found buffer base.
        const auto vk_copies = std::span{copies}.subspan(batch_start, batch_end - batch_start);
        for (auto& copy : vk_copies) {
            copy.srcOffset = copy.srcOffset - src_offset_min + src_buf_offset;
            copy.dstOffset = copy.dstOffset - dst_offset_min + dst_buf_offset;
        }

        // Execute buffer copies.
        LOG_TRACE(Render_Vulkan, "HLE buffer copy: src_size = {}, dst_size = {}",
                  src_offset_max - src_offset_min, dst_offset_max - dst_offset_min);
        scheduler.CommandBuffer().copyBuffer(src_buf->Handle(), dst_buf->Handle(), vk_copies);
        batch_start = batch_end;
        ++batch_end;
    }

    scheduler.CommandBuffer().pipelineBarrier(
        vk::PipelineStageFlagBits::eTransfer, vk::PipelineStageFlagBits::eAllCommands,
        vk::DependencyFlagBits::eByRegion, WRITE_BARRIER, {}, {});

    return true;
}

bool ExecuteShaderHLE(const Shader::Info& info, const AmdGpu::Liverpool::Regs& regs,
                      Rasterizer& rasterizer) {
    switch (info.pgm_hash) {
    case COPY_SHADER_HASH:
        return ExecuteCopyShaderHLE(info, regs, rasterizer);
    default:
        return false;
    }
}

} // namespace Vulkan
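Each entry of the control buffer describes one copy: with `buf_stride = 16`, a record `{dst_idx = 4, src_idx = 2, end = 1}` becomes a 32-byte copy ((end + 1) * stride) from source offset 32 to destination offset 64. The merge pass then groups copies whose combined source span and destination span each stay within `MaxDistanceForMerge`, so a single pair of `ObtainBuffer` calls can back a whole batch. The following dependency-free sketch replays that logic on made-up offsets; its outer condition is `batch_start < copies.size()` so a trailing batch is also flushed, which is slightly more permissive than the loop above:

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

struct Copy {
    std::uint64_t src, dst, size;
};

int main() {
    constexpr std::uint64_t MaxDistanceForMerge = 64ull << 20; // 64 MiB
    std::vector<Copy> copies{
        {0, 0, 4096},
        {8192, 4096, 4096},             // near the first copy -> merges into batch 0
        {1ull << 30, 1ull << 30, 4096}, // ~1 GiB away -> becomes its own batch
    };

    std::size_t batch_start = 0;
    while (batch_start < copies.size()) {
        std::size_t batch_end = batch_start + 1;
        auto src_min = copies[batch_start].src;
        auto src_max = copies[batch_start].src + copies[batch_start].size;
        auto dst_min = copies[batch_start].dst;
        auto dst_max = copies[batch_start].dst + copies[batch_start].size;

        for (std::size_t i = batch_start + 1; i < copies.size(); i++) {
            const auto& c = copies[i];
            const auto ns_min = std::min(src_min, c.src);
            const auto ns_max = std::max(src_max, c.src + c.size);
            const auto nd_min = std::min(dst_min, c.dst);
            const auto nd_max = std::max(dst_max, c.dst + c.size);
            // Reject the copy if merging would stretch either span past the window.
            if (ns_max - ns_min > MaxDistanceForMerge || nd_max - nd_min > MaxDistanceForMerge) {
                continue;
            }
            src_min = ns_min; src_max = ns_max;
            dst_min = nd_min; dst_max = nd_max;
            if (i != batch_end) {
                std::swap(copies[i], copies[batch_end]); // keep batch members contiguous
            }
            ++batch_end;
        }

        // One ObtainBuffer per side would cover [src_min, src_max) and [dst_min, dst_max).
        std::printf("batch [%zu, %zu): src span %llu B, dst span %llu B\n",
                    batch_start, batch_end,
                    static_cast<unsigned long long>(src_max - src_min),
                    static_cast<unsigned long long>(dst_max - dst_min));
        batch_start = batch_end;
    }
    return 0;
}
```

Running this prints one merged batch covering the two nearby copies (source span 12288 B) and a second singleton batch for the far-away copy, mirroring how the shader's per-thread-group copies collapse into a handful of `copyBuffer` calls.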
src/video_core/renderer_vulkan/vk_shader_hle.h (new file, 20 lines)

// SPDX-FileCopyrightText: Copyright 2024 shadPS4 Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later

#pragma once

#include "video_core/amdgpu/liverpool.h"

namespace Shader {
struct Info;
}

namespace Vulkan {

class Rasterizer;

/// Attempts to execute a shader using HLE if possible.
bool ExecuteShaderHLE(const Shader::Info& info, const AmdGpu::Liverpool::Regs& regs,
                      Rasterizer& rasterizer);

} // namespace Vulkan
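Hooking up an additional HLE'd shader would follow the same shape: add a hash constant and a handler, then extend the switch in `ExecuteShaderHLE`. A hypothetical sketch against the file above — `CLEAR_SHADER_HASH` and `ExecuteClearShaderHLE` are invented names, not part of this commit:

```cpp
// Hypothetical extension of the dispatcher in vk_shader_hle.cpp:
static constexpr u64 CLEAR_SHADER_HASH = 0xdeadbeef; // invented hash, for illustration

bool ExecuteShaderHLE(const Shader::Info& info, const AmdGpu::Liverpool::Regs& regs,
                      Rasterizer& rasterizer) {
    switch (info.pgm_hash) {
    case COPY_SHADER_HASH:
        return ExecuteCopyShaderHLE(info, regs, rasterizer);
    case CLEAR_SHADER_HASH: // hypothetical second fast path
        return ExecuteClearShaderHLE(info, regs, rasterizer);
    default:
        return false; // unrecognized shader: run it for real
    }
}
```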
src/video_core/texture_cache/texture_cache.cpp
@@ -466,6 +466,9 @@ void TextureCache::RefreshImage(Image& image, Vulkan::Scheduler* custom_schedule
     const auto& num_mips = image.info.resources.levels;
     ASSERT(num_mips == image.info.mips_layout.size());
 
+    const bool is_gpu_modified = True(image.flags & ImageFlagBits::GpuModified);
+    const bool is_gpu_dirty = True(image.flags & ImageFlagBits::GpuDirty);
+
     boost::container::small_vector<vk::BufferImageCopy, 14> image_copy{};
     for (u32 m = 0; m < num_mips; m++) {
         const u32 width = std::max(image.info.size.width >> m, 1u);
@@ -475,8 +478,6 @@ void TextureCache::RefreshImage(Image& image, Vulkan::Scheduler* custom_schedule
         const auto& mip = image.info.mips_layout[m];
 
         // Protect GPU modified resources from accidental CPU reuploads.
-        const bool is_gpu_modified = True(image.flags & ImageFlagBits::GpuModified);
-        const bool is_gpu_dirty = True(image.flags & ImageFlagBits::GpuDirty);
         if (is_gpu_modified && !is_gpu_dirty) {
             const u8* addr = std::bit_cast<u8*>(image.info.guest_address);
             const u64 hash = XXH3_64bits(addr + mip.offset, mip.size);
@@ -515,7 +516,8 @@ void TextureCache::RefreshImage(Image& image, Vulkan::Scheduler* custom_schedule
 
     const VAddr image_addr = image.info.guest_address;
     const size_t image_size = image.info.guest_size_bytes;
-    const auto [vk_buffer, buf_offset] = buffer_cache.ObtainViewBuffer(image_addr, image_size);
+    const auto [vk_buffer, buf_offset] =
+        buffer_cache.ObtainViewBuffer(image_addr, image_size, is_gpu_dirty);
     // The obtained buffer may be written by a shader so we need to emit a barrier to prevent RAW
     // hazard
     if (auto barrier = vk_buffer->GetBarrier(vk::AccessFlagBits2::eTransferRead,
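The forwarded `is_gpu_dirty` flag (now hoisted out of the mip loop) is what decides whether `ObtainViewBuffer` may take its new GPU fallback: a GPU-dirty image must be refreshed from GPU-visible memory, while a CPU-authoritative image keeps using the cheap staging copy. A compact, compilable restatement of that interaction (the enum and helper are simplified stand-ins, not shadPS4 types):

```cpp
#include <cstdio>

enum class UploadSource { CachedBufferView, FullGpuBuffer, CpuStagingCopy };

// How RefreshImage's is_gpu_dirty flag steers ObtainViewBuffer(addr, size, prefer_gpu):
UploadSource PickUploadSource(bool full_buffer_hit, bool region_gpu_modified, bool is_gpu_dirty) {
    if (full_buffer_hit) {
        return UploadSource::CachedBufferView; // whole range already cached
    }
    if (is_gpu_dirty && region_gpu_modified) {
        return UploadSource::FullGpuBuffer; // keep GPU writes; CPU data would be stale
    }
    return UploadSource::CpuStagingCopy; // cheap path when CPU data is authoritative
}

int main() {
    std::printf("%d\n", static_cast<int>(PickUploadSource(false, true, true)));  // 1: FullGpuBuffer
    std::printf("%d\n", static_cast<int>(PickUploadSource(false, true, false))); // 2: CpuStagingCopy
}
```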