diff --git a/CMakeLists.txt b/CMakeLists.txt
index a65e9c59..a5331a7e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -636,7 +636,7 @@ target_link_libraries(shadps4 PRIVATE Boost::headers GPUOpen::VulkanMemoryAlloca
 
 if (APPLE)
     # Reserve system-managed memory space.
-    target_link_options(shadps4 PRIVATE -Wl,-no_pie,-no_fixup_chains,-no_huge,-pagezero_size,0x400000,-segaddr,GUEST_SYSTEM,0x400000,-image_base,0x20000000000)
+    target_link_options(shadps4 PRIVATE -Wl,-no_pie,-no_fixup_chains,-no_huge,-pagezero_size,0x4000,-segaddr,TCB_SPACE,0x4000,-segaddr,GUEST_SYSTEM,0x400000,-image_base,0x20000000000)
 
     # Link MoltenVK for Vulkan support
     find_library(MOLTENVK MoltenVK REQUIRED)
diff --git a/src/core/cpu_patches.cpp b/src/core/cpu_patches.cpp
index e713155a..151d3498 100644
--- a/src/core/cpu_patches.cpp
+++ b/src/core/cpu_patches.cpp
@@ -499,7 +499,6 @@ static bool FilterTcbAccess(const ZydisDecodedOperand* operands) {
 
 static void GenerateTcbAccess(const ZydisDecodedOperand* operands, Xbyak::CodeGenerator& c) {
     const auto dst = ZydisToXbyakRegisterOperand(operands[0]);
-    const auto slot = GetTcbKey();
 
 #if defined(_WIN32)
     // The following logic is based on the Kernel32.dll asm of TlsGetValue
@@ -507,6 +506,8 @@ static void GenerateTcbAccess(const ZydisDecodedOperand* operands, Xbyak::CodeGe
     static constexpr u32 TlsExpansionSlotsOffset = 0x1780;
     static constexpr u32 TlsMinimumAvailable = 64;
 
+    const auto slot = GetTcbKey();
+
     // Load the pointer to the table of TLS slots.
     c.putSeg(gs);
     if (slot < TlsMinimumAvailable) {
@@ -520,11 +521,6 @@ static void GenerateTcbAccess(const ZydisDecodedOperand* operands, Xbyak::CodeGe
         // Load the pointer to our buffer.
         c.mov(dst, qword[dst + tls_index * sizeof(LPVOID)]);
     }
-#elif defined(__APPLE__)
-    // The following logic is based on the Darwin implementation of _os_tsd_get_direct, used by
-    // pthread_getspecific https://github.com/apple/darwin-xnu/blob/main/libsyscall/os/tsd.h#L89-L96
-    c.putSeg(gs);
-    c.mov(dst, qword[reinterpret_cast<void*>(slot * sizeof(void*))]);
 #else
     const auto src = ZydisToXbyakMemoryOperand(operands[1]);
 
@@ -548,10 +544,10 @@ struct PatchInfo {
 };
 
 static const std::unordered_map<ZydisMnemonic, PatchInfo> Patches = {
-#if defined(_WIN32) || defined(__APPLE__)
-    // Windows and Apple need a trampoline.
+#if defined(_WIN32)
+    // Windows needs a trampoline.
     {ZYDIS_MNEMONIC_MOV, {FilterTcbAccess, GenerateTcbAccess, true}},
-#else
+#elif !defined(__APPLE__)
     {ZYDIS_MNEMONIC_MOV, {FilterTcbAccess, GenerateTcbAccess, false}},
 #endif
 
diff --git a/src/core/libraries/kernel/thread_management.cpp b/src/core/libraries/kernel/thread_management.cpp
index b69f8b41..11d472a4 100644
--- a/src/core/libraries/kernel/thread_management.cpp
+++ b/src/core/libraries/kernel/thread_management.cpp
@@ -18,6 +18,7 @@
 #include "core/libraries/kernel/threads/threads.h"
 #include "core/libraries/libs.h"
 #include "core/linker.h"
+#include "core/tls.h"
 #ifdef _WIN64
 #include <windows.h>
 #else
@@ -987,6 +988,7 @@ static void cleanup_thread(void* arg) {
             destructor(value);
         }
     }
+    Core::SetTcbBase(nullptr);
     thread->is_almost_done = true;
 }
 
diff --git a/src/core/linker.cpp b/src/core/linker.cpp
index 0c914cef..0d76f4b9 100644
--- a/src/core/linker.cpp
+++ b/src/core/linker.cpp
@@ -106,6 +106,8 @@ void Linker::Execute() {
             RunMainEntry(m->GetEntryAddress(), &p, ProgramExitFunc);
         }
     }
+
+    SetTcbBase(nullptr);
 }
 
 s32 Linker::LoadModule(const std::filesystem::path& elf_name, bool is_dynamic) {
diff --git a/src/core/tls.cpp b/src/core/tls.cpp
index 3216d0fe..4a0cdb0d 100644
--- a/src/core/tls.cpp
+++ b/src/core/tls.cpp
@@ -9,7 +9,10 @@
 #ifdef _WIN32
 #include <windows.h>
 #elif defined(__APPLE__)
-#include <pthread.h>
+#include <architecture/i386/table.h>
+#include <boost/icl/interval_set.hpp>
+#include <i386/user_ldt.h>
+#include <sys/mman.h>
 #endif
 
 namespace Core {
@@ -17,11 +20,17 @@ namespace Core {
 #ifdef _WIN32
 
 static DWORD slot = 0;
+static std::once_flag slot_alloc_flag;
 
 static void AllocTcbKey() {
     slot = TlsAlloc();
 }
 
+u32 GetTcbKey() {
+    std::call_once(slot_alloc_flag, &AllocTcbKey);
+    return slot;
+}
+
 void SetTcbBase(void* image_address) {
     const BOOL result = TlsSetValue(GetTcbKey(), image_address);
     ASSERT(result != 0);
@@ -33,27 +42,119 @@ Tcb* GetTcbBase() {
 
 #elif defined(__APPLE__)
 
-static pthread_key_t slot = 0;
+// Reserve space in the 32-bit address range for allocating TCB pages.
+asm(".zerofill TCB_SPACE,TCB_SPACE,__guest_system,0x3FC000");
 
-static void AllocTcbKey() {
-    ASSERT(pthread_key_create(&slot, nullptr) == 0);
+// Layout of the reserved TCB region: one block per LDT entry, one entry per thread.
+static constexpr u64 ldt_region_base = 0x4000;
+static constexpr u64 ldt_region_size = 0x3FC000;
+static constexpr u16 ldt_block_size = 0x1000;
+static constexpr u16 ldt_index_base = 8;
+// The size is a byte count, not an end address, so the base must not be subtracted.
+static constexpr u16 ldt_index_total = ldt_region_size / ldt_block_size;
+
+// Pool of unassigned LDT indices; threads access it under free_ldts_lock.
+static boost::icl::interval_set<u16> free_ldts{};
+static std::mutex free_ldts_lock;
+static std::once_flag ldt_region_init_flag;
+
+/// Returns the descriptor index held by the calling thread's FS selector.
+static u16 GetLdtIndex() {
+    sel_t selector;
+    asm volatile("mov %%fs, %0" : "=r"(selector));
+    return selector.index;
+}
+
+/// Returns whether the index lies in the range handed out by SetupThreadLdt.
+static bool IsManagedLdtIndex(u16 ldt_index) {
+    return ldt_index >= ldt_index_base && ldt_index < ldt_index_base + ldt_index_total;
+}
+
+/// Maps the reserved TCB region and marks every managed LDT index as free.
+static void InitLdtRegion() {
+    const void* result =
+        mmap(reinterpret_cast<void*>(ldt_region_base), ldt_region_size, PROT_READ | PROT_WRITE,
+             MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
+    ASSERT_MSG(result != MAP_FAILED, "Failed to map memory region for LDT entries.");
+
+    free_ldts +=
+        boost::icl::interval<u16>::right_open(ldt_index_base, ldt_index_base + ldt_index_total);
+}
+
+/// Assigns a free LDT entry and TCB block to the calling thread, points FS at it, and returns
+/// a pointer to the start of that block.
+static void** SetupThreadLdt() {
+    std::call_once(ldt_region_init_flag, InitLdtRegion);
+
+    // Allocate a new LDT index for the current thread.
+    u16 ldt_index;
+    {
+        std::unique_lock lock{free_ldts_lock};
+        ASSERT_MSG(!free_ldts.empty(), "Out of LDT space.");
+        ldt_index = first(*free_ldts.begin());
+        free_ldts -= ldt_index;
+    }
+    const u64 addr = ldt_region_base + (ldt_index - ldt_index_base) * ldt_block_size;
+
+    // Create an LDT entry for the TCB.
+    const ldt_entry ldt{.data{
+        .base00 = static_cast<u16>(addr),
+        .base16 = static_cast<u8>(addr >> 16),
+        .base24 = static_cast<u8>(addr >> 24),
+        .limit00 = static_cast<u16>(ldt_block_size - 1),
+        .limit16 = 0,
+        .type = DESC_DATA_WRITE,
+        .dpl = 3,     // User accessible
+        .present = 1, // Segment present
+        .stksz = DESC_DATA_32B,
+        .granular = DESC_GRAN_BYTE,
+    }};
+    int ret = i386_set_ldt(ldt_index, &ldt, 1);
+    ASSERT_MSG(ret == ldt_index,
+               "Failed to set LDT for TLS area: expected {}, but syscall returned {}", ldt_index,
+               ret);
+
+    // Set the FS segment to the created LDT.
+    const sel_t sel{
+        .rpl = USER_PRIV,
+        .ti = SEL_LDT,
+        .index = ldt_index,
+    };
+    asm volatile("mov %0, %%fs" ::"r"(sel));
+
+    return reinterpret_cast<void**>(addr);
+}
+
+/// Returns the calling thread's LDT index to the free pool.
+static void FreeThreadLdt() {
+    // A thread that never went through SetupThreadLdt has no managed selector in FS; adding its
+    // index to the pool would later hand out an entry that maps outside the TCB region.
+    const u16 ldt_index = GetLdtIndex();
+    if (!IsManagedLdtIndex(ldt_index)) {
+        return;
+    }
+
+    std::unique_lock lock{free_ldts_lock};
+    free_ldts += ldt_index;
 }
 
 void SetTcbBase(void* image_address) {
-    ASSERT(pthread_setspecific(GetTcbKey(), image_address) == 0);
+    // A null image address releases the thread's LDT entry instead of installing a TCB.
+    if (image_address != nullptr) {
+        *SetupThreadLdt() = image_address;
+    } else {
+        FreeThreadLdt();
+    }
 }
 
 Tcb* GetTcbBase() {
-    return reinterpret_cast<Tcb*>(pthread_getspecific(GetTcbKey()));
+    Tcb* tcb;
+    asm volatile("mov %%fs:0x0, %0" : "=r"(tcb));
+    return tcb;
 }
 
 #else
 
-// Placeholder for code compatibility.
-static constexpr u32 slot = 0;
-
-static void AllocTcbKey() {}
-
 void SetTcbBase(void* image_address) {
     asm volatile("wrgsbase %0" ::"r"(image_address) : "memory");
 }
@@ -66,11 +167,4 @@ Tcb* GetTcbBase() {
 
 #endif
 
-static std::once_flag slot_alloc_flag;
-
-u32 GetTcbKey() {
-    std::call_once(slot_alloc_flag, &AllocTcbKey);
-    return slot;
-}
-
 } // namespace Core
diff --git a/src/core/tls.h b/src/core/tls.h
index 9829c8d9..f5bf3318 100644
--- a/src/core/tls.h
+++ b/src/core/tls.h
@@ -22,8 +22,10 @@ struct Tcb {
     void* tcb_thread;
 };
 
+#ifdef _WIN32
 /// Gets the thread local storage key for the TCB block.
 u32 GetTcbKey();
+#endif
 
 /// Sets the data pointer to the TCB block.
 void SetTcbBase(void* image_address);