diff --git a/src/core/libraries/kernel/thread_management.cpp b/src/core/libraries/kernel/thread_management.cpp
index 11d472a4..919afcb4 100644
--- a/src/core/libraries/kernel/thread_management.cpp
+++ b/src/core/libraries/kernel/thread_management.cpp
@@ -414,11 +414,6 @@ ScePthreadMutex* createMutex(ScePthreadMutex* addr) {
     if (addr == nullptr || *addr != nullptr) {
         return addr;
     }
-    static std::mutex mutex;
-    std::scoped_lock lk{mutex};
-    if (*addr != nullptr) {
-        return addr;
-    }
     const VAddr vaddr = reinterpret_cast<VAddr>(addr);
     std::string name = fmt::format("mutex{:#x}", vaddr);
     scePthreadMutexInit(addr, nullptr, name.c_str());
@@ -584,8 +579,7 @@ int PS4_SYSV_ABI scePthreadMutexLock(ScePthreadMutex* mutex) {
 }
 
 int PS4_SYSV_ABI scePthreadMutexUnlock(ScePthreadMutex* mutex) {
-    mutex = createMutex(mutex);
-    if (mutex == nullptr) {
+    if (mutex == nullptr || *mutex == nullptr) {
         return SCE_KERNEL_ERROR_EINVAL;
     }
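
Note: the first hunk drops the double-checked locking that guarded lazy mutex creation. A minimal sketch of the removed pattern, with hypothetical names, shows what the static mutex was protecting against: two threads observing `*addr == nullptr` and both running the initializer. After this change, that protection is gone, so first use of a handle is assumed to be serialized by the caller.

    #include <mutex>

    struct Handle;

    // Sketch only (names hypothetical): the guarded variant this diff removes.
    // The static mutex plus the re-check under the lock guarantee that exactly
    // one of two racing callers performs the initialization.
    Handle** lazyInit(Handle** addr, Handle* (*create)()) {
        if (addr == nullptr || *addr != nullptr) {
            return addr;
        }
        static std::mutex guard;
        std::scoped_lock lk{guard};
        if (*addr != nullptr) { // re-check under the lock
            return addr;
        }
        *addr = create();
        return addr;
    }
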
diff --git a/src/video_core/buffer_cache/buffer.cpp b/src/video_core/buffer_cache/buffer.cpp
index adcea000..70295803 100644
--- a/src/video_core/buffer_cache/buffer.cpp
+++ b/src/video_core/buffer_cache/buffer.cpp
@@ -91,10 +91,10 @@ void UniqueBuffer::Create(const vk::BufferCreateInfo& buffer_ci, MemoryUsage usa
     buffer = vk::Buffer{unsafe_buffer};
 }
 
-Buffer::Buffer(const Vulkan::Instance& instance_, MemoryUsage usage_, VAddr cpu_addr_,
-               vk::BufferUsageFlags flags, u64 size_bytes_)
-    : cpu_addr{cpu_addr_}, size_bytes{size_bytes_}, instance{&instance_}, usage{usage_},
-      buffer{instance->GetDevice(), instance->GetAllocator()} {
+Buffer::Buffer(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_, MemoryUsage usage_,
+               VAddr cpu_addr_, vk::BufferUsageFlags flags, u64 size_bytes_)
+    : cpu_addr{cpu_addr_}, size_bytes{size_bytes_}, instance{&instance_}, scheduler{&scheduler_},
+      usage{usage_}, buffer{instance->GetDevice(), instance->GetAllocator()} {
     // Create buffer object.
     const vk::BufferCreateInfo buffer_ci = {
         .size = size_bytes,
@@ -117,13 +117,6 @@ Buffer::Buffer(const Vulkan::Instance& instance_, MemoryUsage usage_, VAddr cpu_
 
 vk::BufferView Buffer::View(u32 offset, u32 size, bool is_written, AmdGpu::DataFormat dfmt,
                             AmdGpu::NumberFormat nfmt) {
-    const auto it{std::ranges::find_if(views, [=](const BufferView& view) {
-        return offset == view.offset && size == view.size && is_written == view.is_written &&
-               dfmt == view.dfmt && nfmt == view.nfmt;
-    })};
-    if (it != views.end()) {
-        return *it->handle;
-    }
     const vk::BufferUsageFlags2CreateInfoKHR usage_flags = {
         .usage = is_written ? vk::BufferUsageFlagBits2KHR::eStorageTexelBuffer
                             : vk::BufferUsageFlagBits2KHR::eUniformTexelBuffer,
@@ -135,23 +128,18 @@ vk::BufferView Buffer::View(u32 offset, u32 size, bool is_written, AmdGpu::DataF
         .offset = offset,
         .range = size,
     };
-    views.push_back({
-        .offset = offset,
-        .size = size,
-        .is_written = is_written,
-        .dfmt = dfmt,
-        .nfmt = nfmt,
-        .handle = instance->GetDevice().createBufferViewUnique(view_ci),
-    });
-    return *views.back().handle;
+    const auto view = instance->GetDevice().createBufferView(view_ci);
+    scheduler->DeferOperation(
+        [view, device = instance->GetDevice()] { device.destroyBufferView(view); });
+    return view;
 }
 
 constexpr u64 WATCHES_INITIAL_RESERVE = 0x4000;
 constexpr u64 WATCHES_RESERVE_CHUNK = 0x1000;
 
-StreamBuffer::StreamBuffer(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler_,
+StreamBuffer::StreamBuffer(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler,
                            MemoryUsage usage, u64 size_bytes)
-    : Buffer{instance, usage, 0, AllFlags, size_bytes}, scheduler{scheduler_} {
+    : Buffer{instance, scheduler, usage, 0, AllFlags, size_bytes} {
     ReserveWatches(current_watches, WATCHES_INITIAL_RESERVE);
     ReserveWatches(previous_watches, WATCHES_INITIAL_RESERVE);
     const auto device = instance.GetDevice();
@@ -206,7 +194,7 @@ void StreamBuffer::Commit() {
 
     auto& watch = current_watches[current_watch_cursor++];
     watch.upper_bound = offset;
-    watch.tick = scheduler.CurrentTick();
+    watch.tick = scheduler->CurrentTick();
 }
 
 void StreamBuffer::ReserveWatches(std::vector<Watch>& watches, std::size_t grow_size) {
@@ -220,7 +208,7 @@ void StreamBuffer::WaitPendingOperations(u64 requested_upper_bound) {
     while (requested_upper_bound > wait_bound && wait_cursor < *invalidation_mark) {
         auto& watch = previous_watches[wait_cursor];
         wait_bound = watch.upper_bound;
-        scheduler.Wait(watch.tick);
+        scheduler->Wait(watch.tick);
         ++wait_cursor;
     }
 }
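
Note: Buffer::View no longer caches views; each transient vk::BufferView is handed to the scheduler for deferred destruction. A minimal sketch of tick-based deferred deletion, assuming a scheduler that tracks a monotonically increasing GPU tick (DeferQueue and its members are illustrative, not the project's API):

    #include <cstdint>
    #include <deque>
    #include <functional>

    class DeferQueue {
    public:
        // Record an operation that must not run before the GPU passes `tick`.
        void Defer(uint64_t tick, std::function<void()> op) {
            pending.push_back({tick, std::move(op)});
        }
        // Called once the GPU is known to have completed `completed_tick`.
        void Collect(uint64_t completed_tick) {
            while (!pending.empty() && pending.front().tick <= completed_tick) {
                pending.front().op(); // e.g. device.destroyBufferView(view)
                pending.pop_front();
            }
        }
    private:
        struct Entry {
            uint64_t tick;
            std::function<void()> op;
        };
        std::deque<Entry> pending;
    };
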
diff --git a/src/video_core/buffer_cache/buffer.h b/src/video_core/buffer_cache/buffer.h
index 33497578..403d4ed8 100644
--- a/src/video_core/buffer_cache/buffer.h
+++ b/src/video_core/buffer_cache/buffer.h
@@ -73,8 +73,9 @@ struct UniqueBuffer {
 
 class Buffer {
 public:
-    explicit Buffer(const Vulkan::Instance& instance, MemoryUsage usage, VAddr cpu_addr_,
-                    vk::BufferUsageFlags flags, u64 size_bytes_);
+    explicit Buffer(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler,
+                    MemoryUsage usage, VAddr cpu_addr_, vk::BufferUsageFlags flags,
+                    u64 size_bytes_);
 
     Buffer& operator=(const Buffer&) = delete;
     Buffer(const Buffer&) = delete;
@@ -144,20 +145,12 @@ public:
     int stream_score = 0;
     size_t size_bytes = 0;
     std::span<u8> mapped_data;
-    const Vulkan::Instance* instance{};
+    const Vulkan::Instance* instance;
+    Vulkan::Scheduler* scheduler;
     MemoryUsage usage;
     UniqueBuffer buffer;
     vk::AccessFlagBits2 access_mask{vk::AccessFlagBits2::eNone};
     vk::PipelineStageFlagBits2 stage{vk::PipelineStageFlagBits2::eNone};
-    struct BufferView {
-        u32 offset;
-        u32 size;
-        bool is_written;
-        AmdGpu::DataFormat dfmt;
-        AmdGpu::NumberFormat nfmt;
-        vk::UniqueBufferView handle;
-    };
-    std::vector<BufferView> views;
 };
 
 class StreamBuffer : public Buffer {
@@ -196,7 +189,6 @@ private:
     void WaitPendingOperations(u64 requested_upper_bound);
 
 private:
-    Vulkan::Scheduler& scheduler;
     u64 offset{};
     u64 mapped_size{};
     std::vector<Watch> current_watches;
diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp
index d67e953e..89032e99 100644
--- a/src/video_core/buffer_cache/buffer_cache.cpp
+++ b/src/video_core/buffer_cache/buffer_cache.cpp
@@ -10,20 +10,24 @@
 #include "video_core/renderer_vulkan/liverpool_to_vk.h"
 #include "video_core/renderer_vulkan/vk_instance.h"
 #include "video_core/renderer_vulkan/vk_scheduler.h"
+#include "video_core/texture_cache/texture_cache.h"
 
 namespace VideoCore {
 
+static constexpr size_t NumVertexBuffers = 32;
 static constexpr size_t StagingBufferSize = 512_MB;
 static constexpr size_t UboStreamBufferSize = 64_MB;
 
 BufferCache::BufferCache(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_,
-                         const AmdGpu::Liverpool* liverpool_, PageManager& tracker_)
-    : instance{instance_}, scheduler{scheduler_}, liverpool{liverpool_}, tracker{tracker_},
+                         const AmdGpu::Liverpool* liverpool_, TextureCache& texture_cache_,
+                         PageManager& tracker_)
+    : instance{instance_}, scheduler{scheduler_}, liverpool{liverpool_},
+      texture_cache{texture_cache_}, tracker{tracker_},
       staging_buffer{instance, scheduler, MemoryUsage::Upload, StagingBufferSize},
       stream_buffer{instance, scheduler, MemoryUsage::Stream, UboStreamBufferSize},
       memory_tracker{&tracker} {
     // Ensure the first slot is used for the null buffer
-    void(slot_buffers.insert(instance, MemoryUsage::DeviceLocal, 0, ReadFlags, 1));
+    void(slot_buffers.insert(instance, scheduler, MemoryUsage::DeviceLocal, 0, ReadFlags, 1));
 }
 
 BufferCache::~BufferCache() = default;
@@ -100,9 +104,9 @@ bool BufferCache::BindVertexBuffers(const Shader::Info& vs_info) {
         return false;
     }
 
-    std::array<vk::Buffer, NUM_VERTEX_BUFFERS> host_buffers;
-    std::array<vk::DeviceSize, NUM_VERTEX_BUFFERS> host_offsets;
-    boost::container::static_vector<AmdGpu::Buffer, NUM_VERTEX_BUFFERS> guest_buffers;
+    std::array<vk::Buffer, NumVertexBuffers> host_buffers;
+    std::array<vk::DeviceSize, NumVertexBuffers> host_offsets;
+    boost::container::static_vector<AmdGpu::Buffer, NumVertexBuffers> guest_buffers;
 
     struct BufferRange {
         VAddr base_address;
@@ -117,7 +121,7 @@ bool BufferCache::BindVertexBuffers(const Shader::Info& vs_info) {
     // Calculate buffers memory overlaps
     bool has_step_rate = false;
-    boost::container::static_vector<BufferRange, NUM_VERTEX_BUFFERS> ranges{};
+    boost::container::static_vector<BufferRange, NumVertexBuffers> ranges{};
     for (const auto& input : vs_info.vs_inputs) {
         if (input.instance_step_rate == Shader::Info::VsInput::InstanceIdType::OverStepRate0 ||
             input.instance_step_rate == Shader::Info::VsInput::InstanceIdType::OverStepRate1) {
@@ -152,7 +156,7 @@ bool BufferCache::BindVertexBuffers(const Shader::Info& vs_info) {
         return lhv.base_address < rhv.base_address;
     });
 
-    boost::container::static_vector<BufferRange, NUM_VERTEX_BUFFERS> ranges_merged{ranges[0]};
+    boost::container::static_vector<BufferRange, NumVertexBuffers> ranges_merged{ranges[0]};
     for (auto range : ranges) {
         auto& prev_range = ranges_merged.back();
         if (prev_range.end_address < range.base_address) {
@@ -232,7 +236,7 @@ std::pair<Buffer*, u32> BufferCache::ObtainBuffer(VAddr device_addr, u32 size, b
                                                   bool is_texel_buffer) {
     static constexpr u64 StreamThreshold = CACHING_PAGESIZE;
     const bool is_gpu_dirty = memory_tracker.IsRegionGpuModified(device_addr, size);
-    if (!is_written && !is_texel_buffer && size <= StreamThreshold && !is_gpu_dirty) {
+    if (!is_written && size <= StreamThreshold && !is_gpu_dirty) {
        // For small uniform buffers that have not been modified by gpu
        // use device local stream buffer to reduce renderpass breaks.
        const u64 offset = stream_buffer.Copy(device_addr, size, instance.UniformMinAlignment());
@@ -241,7 +245,7 @@ std::pair<Buffer*, u32> BufferCache::ObtainBuffer(VAddr device_addr, u32 size, b
     const BufferId buffer_id = FindBuffer(device_addr, size);
     Buffer& buffer = slot_buffers[buffer_id];
-    SynchronizeBuffer(buffer, device_addr, size);
+    SynchronizeBuffer(buffer, device_addr, size, is_texel_buffer);
     if (is_written) {
         memory_tracker.MarkRegionAsGpuModified(device_addr, size);
     }
@@ -420,8 +424,8 @@ BufferId BufferCache::CreateBuffer(VAddr device_addr, u32 wanted_size) {
     wanted_size = static_cast<u32>(device_addr_end - device_addr);
     const OverlapResult overlap = ResolveOverlaps(device_addr, wanted_size);
     const u32 size = static_cast<u32>(overlap.end - overlap.begin);
-    const BufferId new_buffer_id =
-        slot_buffers.insert(instance, MemoryUsage::DeviceLocal, overlap.begin, AllFlags, size);
+    const BufferId new_buffer_id = slot_buffers.insert(
+        instance, scheduler, MemoryUsage::DeviceLocal, overlap.begin, AllFlags, size);
     auto& new_buffer = slot_buffers[new_buffer_id];
     const size_t size_bytes = new_buffer.SizeBytes();
     const auto cmdbuf = scheduler.CommandBuffer();
@@ -459,7 +463,8 @@ void BufferCache::ChangeRegister(BufferId buffer_id) {
     }
 }
 
-bool BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size) {
+void BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size,
+                                    bool is_texel_buffer) {
     std::scoped_lock lk{mutex};
     boost::container::small_vector<vk::BufferCopy, 4> copies;
     u64 total_size_bytes = 0;
@@ -479,8 +484,13 @@ bool BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size)
         // Prevent uploading to gpu modified regions.
         // gpu_modified_ranges.ForEachNotInRange(device_addr_out, range_size, add_copy);
     });
+    SCOPE_EXIT {
+        if (is_texel_buffer) {
+            SynchronizeBufferFromImage(buffer, device_addr, size);
+        }
+    };
     if (total_size_bytes == 0) {
-        return true;
+        return;
     }
     vk::Buffer src_buffer = staging_buffer.Handle();
     if (total_size_bytes < StagingBufferSize) {
@@ -496,7 +506,11 @@ bool BufferCache::SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size)
     } else {
         // For large one time transfers use a temporary host buffer.
         // RenderDoc can lag quite a bit if the stream buffer is too large.
-        Buffer temp_buffer{instance, MemoryUsage::Upload, 0, vk::BufferUsageFlagBits::eTransferSrc,
+        Buffer temp_buffer{instance,
+                           scheduler,
+                           MemoryUsage::Upload,
+                           0,
+                           vk::BufferUsageFlagBits::eTransferSrc,
                            total_size_bytes};
         src_buffer = temp_buffer.Handle();
         u8* const staging = temp_buffer.mapped_data.data();
@@ -524,7 +538,68 @@
     cmdbuf.pipelineBarrier(vk::PipelineStageFlagBits::eTransfer,
                            vk::PipelineStageFlagBits::eAllCommands,
                            vk::DependencyFlagBits::eByRegion, WRITE_BARRIER, {}, {});
-    return false;
+}
+
+bool BufferCache::SynchronizeBufferFromImage(Buffer& buffer, VAddr device_addr, u32 size) {
+    boost::container::small_vector<ImageId, 8> image_ids;
+    const u32 inv_size = std::min(size, MaxInvalidateDist);
+    texture_cache.ForEachImageInRegion(device_addr, inv_size, [&](ImageId image_id, Image& image) {
+        // Only consider GPU modified images, i.e render targets or storage images.
+        // Also avoid any CPU modified images as the image data is likely to be stale.
+        if (True(image.flags & ImageFlagBits::CpuModified) ||
+            False(image.flags & ImageFlagBits::GpuModified)) {
+            return;
+        }
+        // Image must fully overlap with the provided buffer range.
+        if (image.cpu_addr < device_addr || image.cpu_addr_end > device_addr + size) {
+            return;
+        }
+        image_ids.push_back(image_id);
+    });
+    if (image_ids.empty()) {
+        return false;
+    }
+    // Sort images by modification tick. If there are overlaps we want to
+    // copy from least to most recently modified.
+    std::ranges::sort(image_ids, [&](ImageId lhs_id, ImageId rhs_id) {
+        const Image& lhs = texture_cache.GetImage(lhs_id);
+        const Image& rhs = texture_cache.GetImage(rhs_id);
+        return lhs.tick_accessed_last < rhs.tick_accessed_last;
+    });
+    boost::container::small_vector<vk::BufferImageCopy, 8> copies;
+    for (const ImageId image_id : image_ids) {
+        copies.clear();
+        Image& image = texture_cache.GetImage(image_id);
+        u32 offset = buffer.Offset(image.cpu_addr);
+        const u32 num_layers = image.info.resources.layers;
+        for (u32 m = 0; m < image.info.resources.levels; m++) {
+            const u32 width = std::max(image.info.size.width >> m, 1u);
+            const u32 height = std::max(image.info.size.height >> m, 1u);
+            const u32 depth =
+                image.info.props.is_volume ? std::max(image.info.size.depth >> m, 1u) : 1u;
+            const auto& [mip_size, mip_pitch, mip_height, mip_ofs] = image.info.mips_layout[m];
+            copies.push_back({
+                .bufferOffset = offset,
+                .bufferRowLength = static_cast<u32>(mip_pitch),
+                .bufferImageHeight = static_cast<u32>(mip_height),
+                .imageSubresource{
+                    .aspectMask = image.aspect_mask & ~vk::ImageAspectFlagBits::eStencil,
+                    .mipLevel = m,
+                    .baseArrayLayer = 0,
+                    .layerCount = num_layers,
+                },
+                .imageOffset = {0, 0, 0},
+                .imageExtent = {width, height, depth},
+            });
+            offset += mip_ofs * num_layers;
+        }
+        scheduler.EndRendering();
+        image.Transit(vk::ImageLayout::eTransferSrcOptimal, vk::AccessFlagBits::eTransferRead);
+        const auto cmdbuf = scheduler.CommandBuffer();
+        cmdbuf.copyImageToBuffer(image.image, vk::ImageLayout::eTransferSrcOptimal, buffer.buffer,
+                                 copies);
+    }
+    return true;
 }
 
 void BufferCache::DeleteBuffer(BufferId buffer_id, bool do_not_mark) {
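
Note: SynchronizeBufferFromImage walks the mip chain and advances the buffer offset by mip_ofs * num_layers per level, so every layer of a mip is skipped before the next level starts. A standalone sketch with made-up layout numbers illustrates the offset arithmetic:

    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint32_t num_layers = 2;
        // Hypothetical per-layer mip sizes in bytes (stands in for mip_ofs).
        const uint32_t mip_ofs[] = {65536, 16384, 4096};
        uint32_t offset = 0;
        for (int m = 0; m < 3; ++m) {
            std::printf("mip %d copies from bufferOffset %u\n", m,
                        static_cast<unsigned>(offset));
            offset += mip_ofs[m] * num_layers; // advance past all layers of this mip
        }
        return 0; // prints offsets 0, 131072, 163840
    }
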
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index 9be258ab..b38b00f0 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -28,7 +28,7 @@ using BufferId = Common::SlotId;
 
 static constexpr BufferId NULL_BUFFER_ID{0};
-static constexpr u32 NUM_VERTEX_BUFFERS = 32;
+class TextureCache;
 
 class BufferCache {
 public:
@@ -53,7 +53,8 @@ public:
 public:
     explicit BufferCache(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler,
-                         const AmdGpu::Liverpool* liverpool, PageManager& tracker);
+                         const AmdGpu::Liverpool* liverpool, TextureCache& texture_cache,
+                         PageManager& tracker);
     ~BufferCache();
 
     /// Invalidates any buffer in the logical page range.
@@ -116,13 +117,16 @@ private:
     template <bool insert>
     void ChangeRegister(BufferId buffer_id);
 
-    bool SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size);
+    void SynchronizeBuffer(Buffer& buffer, VAddr device_addr, u32 size, bool is_texel_buffer);
+
+    bool SynchronizeBufferFromImage(Buffer& buffer, VAddr device_addr, u32 size);
 
     void DeleteBuffer(BufferId buffer_id, bool do_not_mark = false);
 
     const Vulkan::Instance& instance;
     Vulkan::Scheduler& scheduler;
     const AmdGpu::Liverpool* liverpool;
+    TextureCache& texture_cache;
     PageManager& tracker;
     StreamBuffer staging_buffer;
     StreamBuffer stream_buffer;
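
Note: buffer_cache.h forward-declares TextureCache while buffer_cache.cpp includes the full header, since BufferCache and TextureCache now reference each other. A generic sketch of the pattern (names illustrative):

    // a.h -- the header stores only a reference, so a forward declaration suffices.
    class B; // no #include "b.h" here; breaks the a.h <-> b.h include cycle

    class A {
    public:
        explicit A(B& b_) : b{b_} {}
    private:
        B& b; // references/pointers to an incomplete type are allowed
    };

    // a.cpp -- the full definition is needed only where B's members are used:
    // #include "a.h"
    // #include "b.h"
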
diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp
index b1a23532..b87d3c91 100644
--- a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp
+++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp
@@ -167,9 +167,6 @@ bool ComputePipeline::BindResources(VideoCore::BufferCache& buffer_cache,
                 LOG_WARNING(Render_Vulkan, "Unexpected metadata read by a CS shader (buffer)");
             }
         }
-        if (desc.is_written) {
-            texture_cache.InvalidateMemory(address, size);
-        }
         const u32 alignment = instance.TexelBufferMinAlignment();
         const auto [vk_buffer, offset] =
             buffer_cache.ObtainBuffer(address, size, desc.is_written, true);
@@ -184,13 +181,15 @@
         }
         buffer_view = vk_buffer->View(offset_aligned, size + adjust, desc.is_written,
                                       vsharp.GetDataFmt(), vsharp.GetNumberFmt());
-
         if (auto barrier = vk_buffer->GetBarrier(desc.is_written ? vk::AccessFlagBits2::eShaderWrite
                                                                  : vk::AccessFlagBits2::eShaderRead,
                                                  vk::PipelineStageFlagBits2::eComputeShader)) {
             buffer_barriers.emplace_back(*barrier);
         }
+        if (desc.is_written) {
+            texture_cache.InvalidateMemory(address, size);
+        }
     }
     set_writes.push_back({
         .dstSet = VK_NULL_HANDLE,
@@ -206,7 +205,7 @@ bool ComputePipeline::BindResources(VideoCore::BufferCache& buffer_cache,
     for (const auto& image_desc : info->images) {
         const auto tsharp = image_desc.GetSharp(*info);
         if (tsharp.GetDataFmt() != AmdGpu::DataFormat::FormatInvalid) {
-            VideoCore::ImageInfo image_info{tsharp};
+            VideoCore::ImageInfo image_info{tsharp, image_desc.is_depth};
             VideoCore::ImageViewInfo view_info{tsharp, image_desc.is_storage};
             const auto& image_view = texture_cache.FindTexture(image_info, view_info);
             const auto& image = texture_cache.GetImage(image_view.image_id);
@@ -252,10 +251,12 @@ bool ComputePipeline::BindResources(VideoCore::BufferCache& buffer_cache,
     const auto cmdbuf = scheduler.CommandBuffer();
 
     if (!buffer_barriers.empty()) {
-        auto dependencies = vk::DependencyInfo{
+        const auto dependencies = vk::DependencyInfo{
+            .dependencyFlags = vk::DependencyFlagBits::eByRegion,
             .bufferMemoryBarrierCount = u32(buffer_barriers.size()),
             .pBufferMemoryBarriers = buffer_barriers.data(),
        };
+        scheduler.EndRendering();
        cmdbuf.pipelineBarrier2(dependencies);
     }
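
Note: both pipelines now call scheduler.EndRendering() before recording the buffer barriers, since vkCmdPipelineBarrier2 is generally not permitted inside an active dynamic rendering scope. A condensed sketch of the ordering, assuming the same vulkan.hpp designated-initializer setup the project builds with; FlushBufferBarriers and the end_rendering callback are illustrative stand-ins for the project's Scheduler:

    #include <functional>
    #include <span>
    #include <vulkan/vulkan.hpp>

    void FlushBufferBarriers(vk::CommandBuffer cmdbuf,
                             std::span<const vk::BufferMemoryBarrier2> barriers,
                             const std::function<void()>& end_rendering) {
        if (barriers.empty()) {
            return;
        }
        const vk::DependencyInfo dependencies{
            .dependencyFlags = vk::DependencyFlagBits::eByRegion,
            .bufferMemoryBarrierCount = static_cast<uint32_t>(barriers.size()),
            .pBufferMemoryBarriers = barriers.data(),
        };
        end_rendering(); // leave the vkCmdBeginRendering scope first
        cmdbuf.pipelineBarrier2(dependencies);
    }
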
diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
index 5aec456f..6ac4dcf1 100644
--- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
+++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp
@@ -405,15 +405,15 @@ void GraphicsPipeline::BindResources(const Liverpool::Regs& regs,
         });
     }
 
-    for (const auto& tex_buffer : stage->texture_buffers) {
-        const auto vsharp = tex_buffer.GetSharp(*stage);
+    for (const auto& desc : stage->texture_buffers) {
+        const auto vsharp = desc.GetSharp(*stage);
         vk::BufferView& buffer_view = buffer_views.emplace_back(VK_NULL_HANDLE);
         const u32 size = vsharp.GetSize();
         if (vsharp.GetDataFmt() != AmdGpu::DataFormat::FormatInvalid && size != 0) {
             const VAddr address = vsharp.base_address;
             const u32 alignment = instance.TexelBufferMinAlignment();
             const auto [vk_buffer, offset] =
-                buffer_cache.ObtainBuffer(address, size, tex_buffer.is_written, true);
+                buffer_cache.ObtainBuffer(address, size, desc.is_written, true);
             const u32 fmt_stride = AmdGpu::NumBits(vsharp.GetDataFmt()) >> 3;
             ASSERT_MSG(fmt_stride == vsharp.GetStride(),
                        "Texel buffer stride must match format stride");
@@ -423,22 +423,25 @@ void GraphicsPipeline::BindResources(const Liverpool::Regs& regs,
                 ASSERT(adjust % fmt_stride == 0);
                 push_data.AddOffset(binding, adjust / fmt_stride);
             }
-            buffer_view = vk_buffer->View(offset_aligned, size + adjust, tex_buffer.is_written,
+            buffer_view = vk_buffer->View(offset_aligned, size + adjust, desc.is_written,
                                           vsharp.GetDataFmt(), vsharp.GetNumberFmt());
-            const auto dst_access = tex_buffer.is_written ? vk::AccessFlagBits2::eShaderWrite
-                                                          : vk::AccessFlagBits2::eShaderRead;
+            const auto dst_access = desc.is_written ? vk::AccessFlagBits2::eShaderWrite
+                                                    : vk::AccessFlagBits2::eShaderRead;
             if (auto barrier = vk_buffer->GetBarrier(
                     dst_access, vk::PipelineStageFlagBits2::eVertexShader)) {
                 buffer_barriers.emplace_back(*barrier);
             }
+            if (desc.is_written) {
+                texture_cache.InvalidateMemory(address, size);
+            }
         }
         set_writes.push_back({
             .dstSet = VK_NULL_HANDLE,
             .dstBinding = binding++,
             .dstArrayElement = 0,
             .descriptorCount = 1,
-            .descriptorType = tex_buffer.is_written ? vk::DescriptorType::eStorageTexelBuffer
-                                                    : vk::DescriptorType::eUniformTexelBuffer,
+            .descriptorType = desc.is_written ? vk::DescriptorType::eStorageTexelBuffer
+                                              : vk::DescriptorType::eUniformTexelBuffer,
             .pTexelBufferView = &buffer_view,
         });
     }
@@ -497,10 +500,12 @@ void GraphicsPipeline::BindResources(const Liverpool::Regs& regs,
     const auto cmdbuf = scheduler.CommandBuffer();
 
     if (!buffer_barriers.empty()) {
-        auto dependencies = vk::DependencyInfo{
+        const auto dependencies = vk::DependencyInfo{
+            .dependencyFlags = vk::DependencyFlagBits::eByRegion,
             .bufferMemoryBarrierCount = u32(buffer_barriers.size()),
             .pBufferMemoryBarriers = buffer_barriers.data(),
         };
+        scheduler.EndRendering();
         cmdbuf.pipelineBarrier2(dependencies);
     }
diff --git a/src/video_core/renderer_vulkan/vk_instance.cpp b/src/video_core/renderer_vulkan/vk_instance.cpp
index c0105d8f..a055cf3b 100644
--- a/src/video_core/renderer_vulkan/vk_instance.cpp
+++ b/src/video_core/renderer_vulkan/vk_instance.cpp
@@ -297,6 +297,7 @@ bool Instance::CreateDevice() {
             .shaderFloat16 = vk12_features.shaderFloat16,
             .scalarBlockLayout = vk12_features.scalarBlockLayout,
             .uniformBufferStandardLayout = vk12_features.uniformBufferStandardLayout,
+            .separateDepthStencilLayouts = vk12_features.separateDepthStencilLayouts,
             .hostQueryReset = vk12_features.hostQueryReset,
             .timelineSemaphore = vk12_features.timelineSemaphore,
         },
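
Note: separateDepthStencilLayouts is passed through from the queried VkPhysicalDeviceVulkan12Features, matching the surrounding fields. A sketch of how such a feature is typically queried with a vulkan.hpp structure chain before being re-chained into vk::DeviceCreateInfo (function and variable names illustrative):

    #include <vulkan/vulkan.hpp>

    // Query the supported Vulkan 1.2 features, then enable only what was reported.
    vk::PhysicalDeviceVulkan12Features QueryVk12Features(vk::PhysicalDevice physical_device) {
        const auto chain =
            physical_device.getFeatures2<vk::PhysicalDeviceFeatures2,
                                         vk::PhysicalDeviceVulkan12Features>();
        const auto& supported = chain.get<vk::PhysicalDeviceVulkan12Features>();

        vk::PhysicalDeviceVulkan12Features enabled{};
        enabled.separateDepthStencilLayouts = supported.separateDepthStencilLayouts;
        enabled.timelineSemaphore = supported.timelineSemaphore;
        // `enabled` would then go on the pNext chain of vk::DeviceCreateInfo.
        return enabled;
    }
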
diff --git a/src/video_core/renderer_vulkan/vk_platform.cpp b/src/video_core/renderer_vulkan/vk_platform.cpp
index 2318bb24..feadda96 100644
--- a/src/video_core/renderer_vulkan/vk_platform.cpp
+++ b/src/video_core/renderer_vulkan/vk_platform.cpp
@@ -42,6 +42,8 @@ static VKAPI_ATTR VkBool32 VKAPI_CALL DebugUtilsCallback(
     switch (static_cast<u32>(callback_data->messageIdNumber)) {
     case 0x609a13b: // Vertex attribute at location not consumed by shader
     case 0xc81ad50e:
+    case 0xb7c39078:
+    case 0x32868fde: // vkCreateBufferView(): pCreateInfo->range does not equal VK_WHOLE_SIZE
     case 0x92d66fc1: // `pMultisampleState is NULL` for depth only passes (confirmed VL error)
         return VK_FALSE;
     default:
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index 4207c18d..9f72d044 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -17,7 +17,7 @@ namespace Vulkan {
 Rasterizer::Rasterizer(const Instance& instance_, Scheduler& scheduler_,
                        AmdGpu::Liverpool* liverpool_)
     : instance{instance_}, scheduler{scheduler_}, page_manager{this},
-      buffer_cache{instance, scheduler, liverpool_, page_manager},
+      buffer_cache{instance, scheduler, liverpool_, texture_cache, page_manager},
       texture_cache{instance, scheduler, buffer_cache, page_manager}, liverpool{liverpool_},
       memory{Core::Memory::Instance()}, pipeline_cache{instance, scheduler, liverpool} {
     if (!Config::nullGpu()) {
diff --git a/src/video_core/texture_cache/host_compatibility.h b/src/video_core/texture_cache/host_compatibility.h
index 0b4b6764..a73f7e6b 100644
--- a/src/video_core/texture_cache/host_compatibility.h
+++ b/src/video_core/texture_cache/host_compatibility.h
@@ -7,7 +7,7 @@
 #pragma once
 
 #include <unordered_map>
-#include <vulkan/vulkan_core.h>
+#include "video_core/renderer_vulkan/vk_common.h"
 
 namespace VideoCore {
 /**
@@ -383,9 +383,10 @@ static const std::unordered_map<VkFormat, FormatClasses> vkFormatCl
  * @url
  * https://registry.khronos.org/vulkan/specs/1.3-extensions/html/vkspec.html#formats-compatibility
  */
-static bool IsVulkanFormatCompatible(VkFormat lhs, VkFormat rhs) {
-    if (lhs == rhs)
+static bool IsVulkanFormatCompatible(vk::Format lhs, vk::Format rhs) {
+    if (lhs == rhs) {
         return true;
-    return vkFormatClassTable.at(lhs) == vkFormatClassTable.at(rhs);
+    }
+    return vkFormatClassTable.at(VkFormat(lhs)) == vkFormatClassTable.at(VkFormat(rhs));
 }
 } // namespace VideoCore
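
Note: IsVulkanFormatCompatible now takes vk::Format and keys the class table with the C enum. Usage sketch: two formats may alias only if they share a texel-block compatibility class (also worth noting that at() throws std::out_of_range for a format missing from the table):

    // Both formats are in the 32-bit block class, so views may alias:
    const bool ok = IsVulkanFormatCompatible(vk::Format::eR8G8B8A8Unorm,
                                             vk::Format::eR32Uint); // true
    // 32-bit vs 16-bit block classes do not alias:
    const bool bad = IsVulkanFormatCompatible(vk::Format::eR8G8B8A8Unorm,
                                              vk::Format::eR16Unorm); // false
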
diff --git a/src/video_core/texture_cache/image.cpp b/src/video_core/texture_cache/image.cpp
index 13ea7ce9..2a5c4c43 100644
--- a/src/video_core/texture_cache/image.cpp
+++ b/src/video_core/texture_cache/image.cpp
@@ -166,8 +166,9 @@ Image::Image(const Vulkan::Instance& instance_, Vulkan::Scheduler& scheduler_,
 
     image.Create(image_ci);
 
-    Vulkan::SetObjectName(instance->GetDevice(), (vk::Image)image, "Image {:#x}:{:#x}",
-                          info.guest_address, info.guest_size_bytes);
+    Vulkan::SetObjectName(instance->GetDevice(), (vk::Image)image, "Image {}x{}x{} {:#x}:{:#x}",
+                          info.size.width, info.size.height, info.size.depth, info.guest_address,
+                          info.guest_size_bytes);
 }
 
 void Image::Transit(vk::ImageLayout dst_layout, vk::Flags<vk::AccessFlagBits> dst_mask,
diff --git a/src/video_core/texture_cache/image_info.cpp b/src/video_core/texture_cache/image_info.cpp
index bd467168..66fde5c8 100644
--- a/src/video_core/texture_cache/image_info.cpp
+++ b/src/video_core/texture_cache/image_info.cpp
@@ -187,7 +187,7 @@ ImageInfo::ImageInfo(const AmdGpu::Liverpool::DepthBuffer& buffer, u32 num_slice
     size.width = hint.Valid() ? hint.width : buffer.Pitch();
     size.height = hint.Valid() ? hint.height : buffer.Height();
     size.depth = 1;
-    pitch = size.width;
+    pitch = buffer.Pitch();
     resources.layers = num_slices;
     meta_info.htile_addr = buffer.z_info.tile_surface_en ? htile_address : 0;
     usage.depth_target = true;
@@ -207,7 +207,7 @@ ImageInfo::ImageInfo(const AmdGpu::Image& image, bool force_depth /*= false*/) n
     if (force_depth || tiling_mode == AmdGpu::TilingMode::Depth_MacroTiled) {
         if (pixel_format == vk::Format::eR32Sfloat) {
             pixel_format = vk::Format::eD32SfloatS8Uint;
-        } else if (pixel_format == vk::Format::eR16Sfloat) {
+        } else if (pixel_format == vk::Format::eR16Unorm) {
             pixel_format = vk::Format::eD16UnormS8Uint;
         } else {
             UNREACHABLE();
diff --git a/src/video_core/texture_cache/image_view.cpp b/src/video_core/texture_cache/image_view.cpp
index bcdc11ad..e554bad7 100644
--- a/src/video_core/texture_cache/image_view.cpp
+++ b/src/video_core/texture_cache/image_view.cpp
@@ -123,7 +123,8 @@ ImageView::ImageView(const Vulkan::Instance& instance, const ImageViewInfo& info
     // When sampling D32 texture from shader, the T# specifies R32 Float format so adjust it.
     vk::Format format = info.format;
     vk::ImageAspectFlags aspect = image.aspect_mask;
-    if (image.aspect_mask & vk::ImageAspectFlagBits::eDepth && format == vk::Format::eR32Sfloat) {
+    if (image.aspect_mask & vk::ImageAspectFlagBits::eDepth &&
+        (format == vk::Format::eR32Sfloat || format == vk::Format::eD32Sfloat)) {
         format = image.info.pixel_format;
         aspect = vk::ImageAspectFlagBits::eDepth;
     }
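
Note: the image_info.cpp hunk corrects the color-to-depth promotion key: a depth-tiled surface whose T# reports eR16Unorm (not eR16Sfloat, which was never hit) is promoted to eD16UnormS8Uint. The mapping, isolated as a sketch (PromoteToDepth is a hypothetical name):

    #include <vulkan/vulkan.hpp>

    // Sketch of the promotion table this hunk corrects, not the full function.
    vk::Format PromoteToDepth(vk::Format fmt) {
        switch (fmt) {
        case vk::Format::eR32Sfloat:
            return vk::Format::eD32SfloatS8Uint; // 32-bit float depth + stencil
        case vk::Format::eR16Unorm:              // previously keyed on eR16Sfloat
            return vk::Format::eD16UnormS8Uint;  // 16-bit unorm depth + stencil
        default:
            return fmt;
        }
    }
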
diff --git a/src/video_core/texture_cache/texture_cache.cpp b/src/video_core/texture_cache/texture_cache.cpp
index 90dc7140..0d0c81f5 100644
--- a/src/video_core/texture_cache/texture_cache.cpp
+++ b/src/video_core/texture_cache/texture_cache.cpp
@@ -38,13 +38,14 @@ TextureCache::TextureCache(const Vulkan::Instance& instance_, Vulkan::Scheduler&
 TextureCache::~TextureCache() = default;
 
 void TextureCache::InvalidateMemory(VAddr address, size_t size) {
-    std::unique_lock lock{mutex};
+    std::scoped_lock lock{mutex};
     ForEachImageInRegion(address, size, [&](ImageId image_id, Image& image) {
-        if (!image.Overlaps(address, size)) {
-            return;
+        const size_t image_dist =
+            image.cpu_addr > address ? image.cpu_addr - address : address - image.cpu_addr;
+        if (image_dist < MaxInvalidateDist) {
+            // Ensure image is reuploaded when accessed again.
+            image.flags |= ImageFlagBits::CpuModified;
         }
-        // Ensure image is reuploaded when accessed again.
-        image.flags |= ImageFlagBits::CpuModified;
         // Untrack image, so the range is unprotected and the guest can write freely.
         UntrackImage(image_id);
     });
@@ -144,17 +145,12 @@ ImageId TextureCache::ResolveOverlap(const ImageInfo& image_info, ImageId cache_
             FreeImage(cache_image_id);
         }
-
-        if (tex_cache_image.info.IsSliceOf(image_info)) {
-            UNREACHABLE();
-        }
     }
 
     return merged_image_id;
 }
 
 ImageId TextureCache::ExpandImage(const ImageInfo& info, ImageId image_id) {
-
     const auto new_image_id = slot_images.insert(instance, scheduler, info);
     RegisterImage(new_image_id);
@@ -171,50 +167,37 @@
     return new_image_id;
 }
 
-ImageId TextureCache::FindImage(const ImageInfo& info) {
+ImageId TextureCache::FindImage(const ImageInfo& info, FindFlags flags) {
     if (info.guest_address == 0) [[unlikely]] {
         return NULL_IMAGE_VIEW_ID;
     }
 
-    std::unique_lock lock{mutex};
+    std::scoped_lock lock{mutex};
     boost::container::small_vector<ImageId, 8> image_ids;
-    ForEachImageInRegion(
-        info.guest_address, info.guest_size_bytes, [&](ImageId image_id, Image& image) {
-            // Ignore images scheduled for deletion
-            if (True(image.flags & ImageFlagBits::Deleted)) {
-                return;
-            }
-
-            // Check if image is fully outside of the region
-            const auto in_image_cpu_addr = info.guest_address;
-            const auto in_image_cpu_addr_end = info.guest_address + info.guest_size_bytes;
-            if (in_image_cpu_addr_end <= image.cpu_addr) {
-                return;
-            }
-            if (in_image_cpu_addr >= image.cpu_addr_end) {
-                return;
-            }
-
-            image_ids.push_back(image_id);
-        });
+    ForEachImageInRegion(info.guest_address, info.guest_size_bytes,
+                         [&](ImageId image_id, Image& image) { image_ids.push_back(image_id); });
 
     ImageId image_id{};
 
     // Check for a perfect match first
     for (const auto& cache_id : image_ids) {
         auto& cache_image = slot_images[cache_id];
-
-        if (cache_image.info.guest_address == info.guest_address &&
-            cache_image.info.guest_size_bytes == info.guest_size_bytes &&
-            cache_image.info.size == info.size) {
-
-            ASSERT(cache_image.info.type == info.type);
-            if (IsVulkanFormatCompatible((VkFormat)info.pixel_format,
-                                         (VkFormat)cache_image.info.pixel_format)) {
-                image_id = cache_id;
-            }
-            break;
+        if (cache_image.info.guest_address != info.guest_address) {
+            continue;
         }
+        if (False(flags & FindFlags::RelaxSize) &&
+            cache_image.info.guest_size_bytes != info.guest_size_bytes) {
+            continue;
+        }
+        if (False(flags & FindFlags::RelaxDim) && cache_image.info.size != info.size) {
+            continue;
+        }
+        if (False(flags & FindFlags::RelaxFmt) &&
+            !IsVulkanFormatCompatible(info.pixel_format, cache_image.info.pixel_format)) {
+            continue;
+        }
+        ASSERT(cache_image.info.type == info.type);
+        image_id = cache_id;
     }
 
     // Try to resolve overlaps (if any)
@@ -225,13 +208,18 @@
         }
     }
 
+    if (True(flags & FindFlags::NoCreate) && !image_id) {
+        return {};
+    }
+
     // Create and register a new image
     if (!image_id) {
         image_id = slot_images.insert(instance, scheduler, info);
         RegisterImage(image_id);
     }
 
-    slot_images[image_id].tick_accessed_last = scheduler.CurrentTick();
+    Image& image = slot_images[image_id];
+    image.tick_accessed_last = scheduler.CurrentTick();
 
     return image_id;
 }
@@ -259,8 +247,11 @@ ImageView& TextureCache::RegisterImageView(ImageId image_id, const ImageViewInfo
 
 ImageView& TextureCache::FindTexture(const ImageInfo& info, const ImageViewInfo& view_info) {
     const ImageId image_id = FindImage(info);
-    UpdateImage(image_id);
     Image& image = slot_images[image_id];
+    if (view_info.is_storage) {
+        image.flags |= ImageFlagBits::GpuModified;
+    }
+    UpdateImage(image_id);
     auto& usage = image.info.usage;
 
     if (view_info.is_storage) {
@@ -354,6 +345,10 @@ ImageView& TextureCache::FindDepthTarget(const ImageInfo& image_info,
 }
 
 void TextureCache::RefreshImage(Image& image, Vulkan::Scheduler* custom_scheduler /*= nullptr*/) {
+    if (False(image.flags & ImageFlagBits::CpuModified)) {
+        return;
+    }
+
     // Mark image as validated.
     image.flags &= ~ImageFlagBits::CpuModified;
@@ -407,27 +402,20 @@
     const VAddr image_addr = image.info.guest_address;
     const size_t image_size = image.info.guest_size_bytes;
-    vk::Buffer buffer{};
-    u32 offset{};
-    if (auto upload_buffer = tile_manager.TryDetile(image); upload_buffer) {
-        buffer = *upload_buffer;
-    } else {
-        const auto [vk_buffer, buf_offset] = buffer_cache.ObtainTempBuffer(image_addr, image_size);
-        buffer = vk_buffer->Handle();
-        offset = buf_offset;
-
-        // The obtained buffer may be written by a shader so we need to emit a barrier to prevent
-        // RAW hazard
-        if (auto barrier = vk_buffer->GetBarrier(vk::AccessFlagBits2::eTransferRead,
-                                                 vk::PipelineStageFlagBits2::eTransfer)) {
-            auto dependencies = vk::DependencyInfo{
-                .bufferMemoryBarrierCount = 1,
-                .pBufferMemoryBarriers = &barrier.value(),
-            };
-            cmdbuf.pipelineBarrier2(dependencies);
-        }
+    const auto [vk_buffer, buf_offset] = buffer_cache.ObtainTempBuffer(image_addr, image_size);
+    // The obtained buffer may be written by a shader so we need to emit a barrier to prevent RAW
+    // hazard
+    if (auto barrier = vk_buffer->GetBarrier(vk::AccessFlagBits2::eTransferRead,
+                                             vk::PipelineStageFlagBits2::eTransfer)) {
+        const auto dependencies = vk::DependencyInfo{
+            .dependencyFlags = vk::DependencyFlagBits::eByRegion,
+            .bufferMemoryBarrierCount = 1,
+            .pBufferMemoryBarriers = &barrier.value(),
+        };
+        cmdbuf.pipelineBarrier2(dependencies);
     }
+
+    const auto [buffer, offset] = tile_manager.TryDetile(vk_buffer->Handle(), buf_offset, image);
     for (auto& copy : image_copy) {
         copy.bufferOffset += offset;
     }
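
Note: RefreshImage now always obtains the staging buffer itself and lets TryDetile either substitute a detiled scratch buffer (at offset 0) or return the input pair unchanged. The call-site contract, restated as a sketch:

    // Caller-side contract of the reworked TryDetile (see tile_manager.cpp below):
    // - tiled image with a supported detiler: returns {scratch_buffer, 0}
    // - anything else:                        returns {in_buffer, in_offset} untouched
    const auto [vk_buffer, buf_offset] = buffer_cache.ObtainTempBuffer(image_addr, image_size);
    const auto [buffer, offset] = tile_manager.TryDetile(vk_buffer->Handle(), buf_offset, image);
    for (auto& copy : image_copy) {
        copy.bufferOffset += offset; // correct in both cases
    }
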
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index 14209396..44bc2b43 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -23,6 +23,16 @@ namespace VideoCore {
 class BufferCache;
 class PageManager;
 
+enum class FindFlags {
+    NoCreate = 1 << 0,  ///< Do not create an image if searching for one fails.
+    RelaxDim = 1 << 1,  ///< Do not check the dimensions of image, only address.
+    RelaxSize = 1 << 2, ///< Do not check that the size matches exactly.
+    RelaxFmt = 1 << 3,  ///< Do not check that format is compatible.
+};
+DECLARE_ENUM_FLAG_OPERATORS(FindFlags)
+
+static constexpr u32 MaxInvalidateDist = 12_MB;
+
 class TextureCache {
     struct Traits {
         using Entry = boost::container::small_vector<ImageId, 16>;
@@ -44,7 +54,7 @@ public:
     void UnmapMemory(VAddr cpu_addr, size_t size);
 
     /// Retrieves the image handle of the image with the provided attributes.
-    [[nodiscard]] ImageId FindImage(const ImageInfo& info);
+    [[nodiscard]] ImageId FindImage(const ImageInfo& info, FindFlags flags = {});
 
     /// Retrieves an image view with the properties of the specified image descriptor.
     [[nodiscard]] ImageView& FindTexture(const ImageInfo& image_info,
@@ -61,11 +71,8 @@ public:
     /// Updates image contents if it was modified by CPU.
     void UpdateImage(ImageId image_id, Vulkan::Scheduler* custom_scheduler = nullptr) {
         Image& image = slot_images[image_id];
-        if (False(image.flags & ImageFlagBits::CpuModified)) {
-            return;
-        }
-        RefreshImage(image, custom_scheduler);
         TrackImage(image_id);
+        RefreshImage(image, custom_scheduler);
     }
@@ -109,31 +116,12 @@ public:
         return false;
     }
 
-private:
-    ImageView& RegisterImageView(ImageId image_id, const ImageViewInfo& view_info);
-
-    /// Iterate over all page indices in a range
-    template <typename Func>
-    static void ForEachPage(PAddr addr, size_t size, Func&& func) {
-        static constexpr bool RETURNS_BOOL = std::is_same_v<std::invoke_result_t<Func, u64>, bool>;
-        const u64 page_end = (addr + size - 1) >> Traits::PageBits;
-        for (u64 page = addr >> Traits::PageBits; page <= page_end; ++page) {
-            if constexpr (RETURNS_BOOL) {
-                if (func(page)) {
-                    break;
-                }
-            } else {
-                func(page);
-            }
-        }
-    }
-
     template <typename Func>
     void ForEachImageInRegion(VAddr cpu_addr, size_t size, Func&& func) {
         using FuncReturn = typename std::invoke_result<Func, ImageId, Image&>::type;
         static constexpr bool BOOL_BREAK = std::is_same_v<FuncReturn, bool>;
         boost::container::small_vector<ImageId, 32> images;
-        ForEachPage(cpu_addr, size, [this, &images, func](u64 page) {
+        ForEachPage(cpu_addr, size, [this, &images, cpu_addr, size, func](u64 page) {
             const auto it = page_table.find(page);
             if (it == nullptr) {
                 if constexpr (BOOL_BREAK) {
@@ -147,6 +135,9 @@ private:
                 if (image.flags & ImageFlagBits::Picked) {
                     continue;
                 }
+                if (!image.Overlaps(cpu_addr, size)) {
+                    continue;
+                }
                 image.flags |= ImageFlagBits::Picked;
                 images.push_back(image_id);
                 if constexpr (BOOL_BREAK) {
@@ -166,6 +157,26 @@
         }
     }
 
+private:
+    /// Iterate over all page indices in a range
+    template <typename Func>
+    static void ForEachPage(PAddr addr, size_t size, Func&& func) {
+        static constexpr bool RETURNS_BOOL = std::is_same_v<std::invoke_result_t<Func, u64>, bool>;
+        const u64 page_end = (addr + size - 1) >> Traits::PageBits;
+        for (u64 page = addr >> Traits::PageBits; page <= page_end; ++page) {
+            if constexpr (RETURNS_BOOL) {
+                if (func(page)) {
+                    break;
+                }
+            } else {
+                func(page);
+            }
+        }
+    }
+
+    /// Registers an image view for provided image
+    ImageView& RegisterImageView(ImageId image_id, const ImageViewInfo& view_info);
+
     /// Create an image from the given parameters
     [[nodiscard]] ImageId InsertImage(const ImageInfo& info, VAddr cpu_addr);
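
Note: with DECLARE_ENUM_FLAG_OPERATORS the FindFlags values compose with the bitwise operators. A hypothetical call site that relaxes the match and suppresses creation on a miss:

    // Hypothetical lookup: accept any size/format at this address, and return a
    // null ImageId instead of creating a new image when nothing matches.
    const ImageId image_id = texture_cache.FindImage(
        info, FindFlags::NoCreate | FindFlags::RelaxSize | FindFlags::RelaxFmt);
    if (!image_id) {
        // no cached image matched; fall back to the buffer upload path
    }
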
diff --git a/src/video_core/texture_cache/tile_manager.cpp b/src/video_core/texture_cache/tile_manager.cpp
index 7fe5598d..7e06291e 100644
--- a/src/video_core/texture_cache/tile_manager.cpp
+++ b/src/video_core/texture_cache/tile_manager.cpp
@@ -377,30 +377,23 @@ void TileManager::FreeBuffer(ScratchBuffer buffer) {
     vmaDestroyBuffer(instance.GetAllocator(), buffer.first, buffer.second);
 }
 
-std::optional<vk::Buffer> TileManager::TryDetile(Image& image) {
+std::pair<vk::Buffer, u32> TileManager::TryDetile(vk::Buffer in_buffer, u32 in_offset,
+                                                  Image& image) {
     if (!image.info.props.is_tiled) {
-        return std::nullopt;
+        return {in_buffer, in_offset};
     }
 
     const auto* detiler = GetDetiler(image);
     if (!detiler) {
-        if (image.info.tiling_mode != AmdGpu::TilingMode::Texture_MacroTiled) {
+        if (image.info.tiling_mode != AmdGpu::TilingMode::Texture_MacroTiled &&
+            image.info.tiling_mode != AmdGpu::TilingMode::Display_MacroTiled) {
             LOG_ERROR(Render_Vulkan, "Unsupported tiled image: {} ({})",
                       vk::to_string(image.info.pixel_format), NameOf(image.info.tiling_mode));
         }
-        return std::nullopt;
+        return {in_buffer, in_offset};
     }
 
-    // Prepare input buffer
     const u32 image_size = image.info.guest_size_bytes;
-    const auto [in_buffer, in_offset] = [&] -> std::pair<vk::Buffer, u32> {
-        // Request temporary host buffer for larger sizes.
-        auto in_buffer = AllocBuffer(image_size);
-        const auto addr = reinterpret_cast<const void*>(image.info.guest_address);
-        Upload(in_buffer, addr, image_size);
-        scheduler.DeferOperation([=, this]() { FreeBuffer(in_buffer); });
-        return {in_buffer.first, 0};
-    }();
 
     // Prepare output buffer
     auto out_buffer = AllocBuffer(image_size, true);
@@ -471,7 +464,7 @@ std::optional<vk::Buffer> TileManager::TryDetile(Image& image) {
                            vk::PipelineStageFlagBits::eTransfer,
                            vk::DependencyFlagBits::eByRegion, {}, post_barrier, {});
 
-    return {out_buffer.first};
+    return {out_buffer.first, 0};
 }
 
 } // namespace VideoCore
diff --git a/src/video_core/texture_cache/tile_manager.h b/src/video_core/texture_cache/tile_manager.h
index 0baabf98..d0e5eb0f 100644
--- a/src/video_core/texture_cache/tile_manager.h
+++ b/src/video_core/texture_cache/tile_manager.h
@@ -39,7 +39,7 @@ public:
     TileManager(const Vulkan::Instance& instance, Vulkan::Scheduler& scheduler);
     ~TileManager();
 
-    std::optional<vk::Buffer> TryDetile(Image& image);
+    std::pair<vk::Buffer, u32> TryDetile(vk::Buffer in_buffer, u32 in_offset, Image& image);
 
     ScratchBuffer AllocBuffer(u32 size, bool is_storage = false);
     void Upload(ScratchBuffer buffer, const void* data, size_t size);