Compare commits

..

2 Commits

Author SHA1 Message Date
Caio Oliveira
112b14b564 [chore] fix build errors
Signed-off-by: Caio Oliveira <caiooliveirafarias0@gmail.com>
2025-12-17 21:46:22 -03:00
lizzie
754883db97 [core] pin core threads to logical CPUs 0-3
This basically lets the threads live on these logical CPUs, undisturbed and without thrashing each other's cache.
This could improve performance; it is a very tricky thing to pull off correctly, but again, this is mostly an experiment.
It will mainly benefit: Linux, Android, FreeBSD, Windows (not ARM).
Additionally, this means no context thrashing :)

Signed-off-by: lizzie <lizzie@eden-emu.dev>
2025-12-17 21:23:30 -03:00
9 changed files with 46 additions and 72 deletions

View File

@@ -5,9 +5,11 @@
// SPDX-License-Identifier: GPL-2.0-or-later
#include <string>
#include <thread>
#include "common/error.h"
#include "common/logging/log.h"
#include "common/assert.h"
#include "common/thread.h"
#ifdef __APPLE__
#include <mach/mach.h>
@@ -18,6 +20,8 @@
#include "common/string_util.h"
#else
#if defined(__Bitrig__) || defined(__DragonFly__) || defined(__FreeBSD__) || defined(__OpenBSD__)
#include <sys/cpuset.h>
#include <sys/_cpuset.h>
#include <pthread_np.h>
#endif
#include <pthread.h>
@@ -28,7 +32,7 @@
#endif
#ifdef __FreeBSD__
#define cpu_set_t cpuset_t
# define cpu_set_t cpuset_t
#endif
namespace Common {
@@ -77,22 +81,14 @@ void SetCurrentThreadPriority(ThreadPriority new_priority) {
#endif
}
void SetCurrentThreadName(const char* name) {
#ifdef _MSC_VER
// Sets the debugger-visible name of the current thread.
void SetCurrentThreadName(const char* name) {
static auto pf = (decltype(&SetThreadDescription))(void*)GetProcAddress(GetModuleHandle(TEXT("KernelBase.dll")), "SetThreadDescription");
if (pf)
// Sets the debugger-visible name of the current thread.
if (auto pf = (decltype(&SetThreadDescription))(void*)GetProcAddress(GetModuleHandle(TEXT("KernelBase.dll")), "SetThreadDescription"); pf)
pf(GetCurrentThread(), UTF8ToUTF16W(name).data()); // Windows 10+
}
#else // !MSVC_VER, so must be POSIX threads
// MinGW with the POSIX threading model does not support pthread_setname_np
void SetCurrentThreadName(const char* name) {
// See for reference
// https://gitlab.freedesktop.org/mesa/mesa/-/blame/main/src/util/u_thread.c?ref_type=heads#L75
#ifdef __APPLE__
else
; // No-op
#elif defined(__APPLE__)
pthread_setname_np(name);
#elif defined(__HAIKU__)
rename_thread(find_thread(NULL), name);
@@ -112,13 +108,33 @@ void SetCurrentThreadName(const char* name) {
pthread_setname_np(pthread_self(), buf);
}
#elif defined(_WIN32)
// mingw stub
// MinGW with the POSIX threading model does not support pthread_setname_np
// See for reference
// https://gitlab.freedesktop.org/mesa/mesa/-/blame/main/src/util/u_thread.c?ref_type=heads#L75
(void)name;
#else
pthread_setname_np(pthread_self(), name);
#endif
}
/// Pins the calling thread to a single logical CPU, on a best-effort basis.
///
/// Despite the name, no performance-core detection takes place here: `core_id` is
/// used directly as the index of the logical host CPU the thread gets bound to.
///
/// @param core_id Zero-based index of the logical CPU to pin to. Must be below 4
///                (asserted).
///
/// The call is a no-op when the host reports `core_id` or fewer logical CPUs (this
/// includes `hardware_concurrency()` returning 0) and on platforms that have no
/// affinity implementation below.
void PinCurrentThreadToPerformanceCore(size_t core_id) {
    ASSERT(core_id < 4);
    // Never request a CPU the host does not report: an affinity mask that only names
    // a non-existent CPU must not be handed to the scheduler.
    const auto total_cores = std::thread::hardware_concurrency();
    if (core_id >= total_cores) {
        return;
    }
#if defined(__linux__) || defined(__FreeBSD__)
    cpu_set_t set;
    CPU_ZERO(&set);
    CPU_SET(core_id, &set);
    // Best effort: the result is deliberately discarded, a failed call is not
    // treated as an error here.
    static_cast<void>(pthread_setaffinity_np(pthread_self(), sizeof(set), &set));
#elif defined(_WIN32)
    // SetThreadAffinityMask takes a pointer-sized mask (DWORD_PTR), so build the bit
    // in that type instead of a 32-bit DWORD.
    const DWORD_PTR mask = DWORD_PTR{1} << core_id;
    // Best effort: the result is deliberately discarded, a failed call is not
    // treated as an error here.
    static_cast<void>(SetThreadAffinityMask(GetCurrentThread(), mask));
#else
    // No pin functionality implemented
#endif
}
} // namespace Common

View File

@@ -106,7 +106,7 @@ enum class ThreadPriority : u32 {
};
void SetCurrentThreadPriority(ThreadPriority new_priority);
void SetCurrentThreadName(const char* name);
void PinCurrentThreadToPerformanceCore(size_t core_id);
} // namespace Common

View File

@@ -7,6 +7,7 @@
#include "common/fiber.h"
#include "common/scope_exit.h"
#include "common/thread.h"
#include "common/settings.h"
#include "core/core.h"
#include "core/core_timing.h"
#include "core/cpu_manager.h"
@@ -25,11 +26,8 @@ CpuManager::~CpuManager() = default;
void CpuManager::Initialize() {
num_cores = is_multicore ? Core::Hardware::NUM_CPU_CORES : 1;
gpu_barrier = std::make_unique<Common::Barrier>(num_cores + 1);
for (std::size_t core = 0; core < num_cores; core++) {
core_data[core].host_thread =
std::jthread([this, core](std::stop_token token) { RunThread(token, core); });
}
for (std::size_t core = 0; core < num_cores; core++)
core_data[core].host_thread = std::jthread([this, core](std::stop_token token) { RunThread(token, core); });
}
void CpuManager::Shutdown() {
@@ -188,14 +186,10 @@ void CpuManager::ShutdownThread() {
void CpuManager::RunThread(std::stop_token token, std::size_t core) {
/// Initialization
system.RegisterCoreThread(core);
std::string name;
if (is_multicore) {
name = "CPUCore_" + std::to_string(core);
} else {
name = "CPUThread";
}
std::string name = is_multicore ? ("CPUCore_" + std::to_string(core)) : std::string{"CPUThread"};
Common::SetCurrentThreadName(name.c_str());
Common::SetCurrentThreadPriority(Common::ThreadPriority::Critical);
Common::PinCurrentThreadToPerformanceCore(core);
auto& data = core_data[core];
data.host_context = Common::Fiber::ThreadToFiber();

View File

@@ -194,8 +194,6 @@ HostFeature GetHostFeatures() {
features |= HostFeature::LZCNT;
if (cpu_info.has(Cpu::tGFNI))
features |= HostFeature::GFNI;
if (cpu_info.has(Cpu::tWAITPKG))
features |= HostFeature::WAITPKG;
if (cpu_info.has(Cpu::tBMI2)) {
// BMI2 instructions such as pdep and pext have been very slow up until Zen 3.

View File

@@ -420,11 +420,10 @@ void AxxEmitX64::EmitExclusiveWriteMemoryInline(AxxEmitContext& ctx, IR::Inst* i
const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(code, args[1]);
const Xbyak::Reg32 status = ctx.reg_alloc.ScratchGpr(code).cvt32();
const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr(code);
const Xbyak::Reg64 tmp2 = ctx.reg_alloc.ScratchGpr(code);
const auto wrapped_fn = exclusive_write_fallbacks[std::make_tuple(ordered, bitsize, vaddr.getIdx(), value.getIdx())];
EmitExclusiveLock(code, conf, tmp, tmp2.cvt32());
EmitExclusiveLock(code, conf, tmp, eax);
SharedLabel end = GenSharedLabel();

View File

@@ -346,7 +346,7 @@ void EmitExclusiveLock(BlockOfCode& code, const UserConfig& conf, Xbyak::Reg64 p
}
code.mov(pointer, std::bit_cast<u64>(GetExclusiveMonitorLockPointer(conf.global_monitor)));
EmitSpinLockLock(code, pointer, tmp, code.HasHostFeature(HostFeature::WAITPKG));
EmitSpinLockLock(code, pointer, tmp);
}
template<typename UserConfig>

View File

@@ -35,10 +35,9 @@ enum class HostFeature : u64 {
BMI2 = 1ULL << 19,
LZCNT = 1ULL << 20,
GFNI = 1ULL << 21,
WAITPKG = 1ULL << 22,
// Zen-based BMI2
FastBMI2 = 1ULL << 23,
FastBMI2 = 1ULL << 22,
// Orthographic AVX512 features on 128 and 256 vectors
AVX512_Ortho = AVX512F | AVX512VL,

View File

@@ -22,46 +22,17 @@ static const auto default_cg_mode = nullptr; //Allow RWE
namespace Dynarmic {
void EmitSpinLockLock(Xbyak::CodeGenerator& code, Xbyak::Reg64 ptr, Xbyak::Reg32 tmp, bool waitpkg) {
// TODO: this is because we lack regalloc - so better to be safe :(
if (waitpkg) {
code.push(Xbyak::util::eax);
code.push(Xbyak::util::ebx);
code.push(Xbyak::util::edx);
}
void EmitSpinLockLock(Xbyak::CodeGenerator& code, Xbyak::Reg64 ptr, Xbyak::Reg32 tmp) {
Xbyak::Label start, loop;
code.jmp(start, code.T_NEAR);
code.L(loop);
if (waitpkg) {
// TODO: This clobbers EAX and EDX did we tell the regalloc?
// ARM ptr for address-monitoring
code.umonitor(ptr);
// tmp.bit[0] = 0: C0.1 | Slow Wakup | Better Savings
// tmp.bit[0] = 1: C0.2 | Fast Wakup | Lesser Savings
// edx:eax is implicitly used as a 64-bit deadline timestamp
// Use the maximum so that we use the operating system's maximum
// allowed wait time within the IA32_UMWAIT_CONTROL register
// Enter power state designated by tmp and wait for a write to lock_ptr
code.mov(Xbyak::util::eax, 0xFFFFFFFF);
code.mov(Xbyak::util::edx, Xbyak::util::eax);
// TODO: We can only be here because tmp is 1 already - however we repeatedly overwrite it...
code.mov(Xbyak::util::ebx, 1);
code.umwait(Xbyak::util::ebx);
// CF == 1 if we hit the OS-timeout in IA32_UMWAIT_CONTROL without a write
// CF == 0 if we exited the wait for any other reason
} else {
code.pause();
}
code.pause();
code.L(start);
code.mov(tmp, 1);
/*code.lock();*/ code.xchg(code.dword[ptr], tmp);
code.test(tmp, tmp);
code.jnz(loop, code.T_NEAR);
if (waitpkg) {
code.pop(Xbyak::util::edx);
code.pop(Xbyak::util::ebx);
code.pop(Xbyak::util::eax);
}
}
void EmitSpinLockUnlock(Xbyak::CodeGenerator& code, Xbyak::Reg64 ptr, Xbyak::Reg32 tmp) {
@@ -89,7 +60,7 @@ void SpinLockImpl::Initialize() noexcept {
Xbyak::Reg64 const ABI_PARAM1 = Backend::X64::HostLocToReg64(Backend::X64::ABI_PARAM1);
code.align();
lock = code.getCurr<void (*)(volatile int*)>();
EmitSpinLockLock(code, ABI_PARAM1, code.eax, false);
EmitSpinLockLock(code, ABI_PARAM1, code.eax);
code.ret();
code.align();
unlock = code.getCurr<void (*)(volatile int*)>();

View File

@@ -1,6 +1,3 @@
// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
/* This file is part of the dynarmic project.
* Copyright (c) 2022 MerryMage
* SPDX-License-Identifier: 0BSD
@@ -12,7 +9,7 @@
namespace Dynarmic {
void EmitSpinLockLock(Xbyak::CodeGenerator& code, Xbyak::Reg64 ptr, Xbyak::Reg32 tmp, bool waitpkg);
void EmitSpinLockLock(Xbyak::CodeGenerator& code, Xbyak::Reg64 ptr, Xbyak::Reg32 tmp);
void EmitSpinLockUnlock(Xbyak::CodeGenerator& code, Xbyak::Reg64 ptr, Xbyak::Reg32 tmp);
} // namespace Dynarmic