From 09cf05ab91ac0cc26c7994a99a2e4eca41a2a8c9 Mon Sep 17 00:00:00 2001
From: pineappleEA
Date: Sun, 27 Feb 2022 23:27:57 +0100
Subject: [PATCH] early-access version 2519

---
 README.md | 2 +-
 .../dynarmic/src/dynarmic/CMakeLists.txt | 7 +
 .../src/dynarmic/backend/x64/a32_emit_x64.cpp | 430 -------
 .../src/dynarmic/backend/x64/a32_emit_x64.h | 6 +
 .../backend/x64/a32_emit_x64_memory.cpp | 672 +++++++++++
 .../src/dynarmic/backend/x64/a64_emit_x64.cpp | 747 ------------
 .../src/dynarmic/backend/x64/a64_emit_x64.h | 8 +
 .../backend/x64/a64_emit_x64_memory.cpp | 1025 +++++++++++++++++
 .../dynarmic/backend/x64/emit_x64_memory.h | 62 +
 .../backend/x64/exclusive_monitor.cpp | 4 +-
 .../backend/x64/exclusive_monitor_friend.h | 28 +
 .../dynarmic/src/dynarmic/common/spin_lock.h | 17 +
 .../src/dynarmic/common/spin_lock_x64.cpp | 70 ++
 .../src/dynarmic/common/spin_lock_x64.h | 15 +
 .../src/dynarmic/interface/A32/config.h | 9 +
 .../src/dynarmic/interface/A64/config.h | 9 +
 .../dynarmic/interface/exclusive_monitor.h | 9 +-
 .../dynarmic/interface/optimization_flags.h | 4 +
 src/common/settings.cpp | 1 +
 src/common/settings.h | 3 +
 src/core/arm/dynarmic/arm_dynarmic_32.cpp | 12 +
 src/core/arm/dynarmic/arm_dynarmic_64.cpp | 13 +
 .../arm/dynarmic/arm_exclusive_monitor.cpp | 4 +-
 src/core/arm/dynarmic/arm_exclusive_monitor.h | 2 +-
 src/core/arm/exclusive_monitor.h | 2 +-
 src/core/hle/kernel/k_address_arbiter.cpp | 4 +-
 src/core/hle/kernel/k_memory_manager.cpp | 12 +-
 src/core/hle/kernel/k_memory_manager.h | 4 +-
 src/core/hle/kernel/k_memory_region_type.h | 2 +-
 src/core/hle/kernel/kernel.cpp | 84 +-
 src/yuzu/configuration/config.cpp | 4 +
 src/yuzu/configuration/configure_cpu.cpp | 9 +
 src/yuzu/configuration/configure_cpu.h | 1 +
 src/yuzu/configuration/configure_cpu.ui | 12 +
 .../configuration/configure_cpu_debug.cpp | 8 +
 src/yuzu/configuration/configure_cpu_debug.ui | 29 +-
 src/yuzu_cmd/config.cpp | 3 +
 src/yuzu_cmd/default_ini.h | 13 +
 38 files changed, 2107 insertions(+), 1239 deletions(-)
 create mode 100755 externals/dynarmic/src/dynarmic/backend/x64/a32_emit_x64_memory.cpp
 create mode 100755 externals/dynarmic/src/dynarmic/backend/x64/a64_emit_x64_memory.cpp
 create mode 100755 externals/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.h
 create mode 100755 externals/dynarmic/src/dynarmic/backend/x64/exclusive_monitor_friend.h
 create mode 100755 externals/dynarmic/src/dynarmic/common/spin_lock.h
 create mode 100755 externals/dynarmic/src/dynarmic/common/spin_lock_x64.cpp
 create mode 100755 externals/dynarmic/src/dynarmic/common/spin_lock_x64.h

diff --git a/README.md b/README.md
index 1697c968a..c9e7b7194 100755
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 yuzu emulator early access
 =============
-This is the source code for early-access 2516.
+This is the source code for early-access 2519.
## Legal Notice diff --git a/externals/dynarmic/src/dynarmic/CMakeLists.txt b/externals/dynarmic/src/dynarmic/CMakeLists.txt index b5f9b1762..126b9dbad 100755 --- a/externals/dynarmic/src/dynarmic/CMakeLists.txt +++ b/externals/dynarmic/src/dynarmic/CMakeLists.txt @@ -58,6 +58,7 @@ add_library(dynarmic common/memory_pool.h common/safe_ops.h common/scope_exit.h + common/spin_lock.h common/string_util.h common/u128.cpp common/u128.h @@ -281,6 +282,7 @@ if (ARCHITECTURE STREQUAL "x86_64") backend/x64/emit_x64_crc32.cpp backend/x64/emit_x64_data_processing.cpp backend/x64/emit_x64_floating_point.cpp + backend/x64/emit_x64_memory.h backend/x64/emit_x64_packed.cpp backend/x64/emit_x64_saturation.cpp backend/x64/emit_x64_sm4.cpp @@ -289,6 +291,7 @@ if (ARCHITECTURE STREQUAL "x86_64") backend/x64/emit_x64_vector_saturation.cpp backend/x64/exception_handler.h backend/x64/exclusive_monitor.cpp + backend/x64/exclusive_monitor_friend.h backend/x64/host_feature.h backend/x64/hostloc.cpp backend/x64/hostloc.h @@ -299,12 +302,15 @@ if (ARCHITECTURE STREQUAL "x86_64") backend/x64/reg_alloc.cpp backend/x64/reg_alloc.h backend/x64/stack_layout.h + common/spin_lock_x64.cpp + common/spin_lock_x64.h ) if ("A32" IN_LIST DYNARMIC_FRONTENDS) target_sources(dynarmic PRIVATE backend/x64/a32_emit_x64.cpp backend/x64/a32_emit_x64.h + backend/x64/a32_emit_x64_memory.cpp backend/x64/a32_interface.cpp backend/x64/a32_jitstate.cpp backend/x64/a32_jitstate.h @@ -315,6 +321,7 @@ if (ARCHITECTURE STREQUAL "x86_64") target_sources(dynarmic PRIVATE backend/x64/a64_emit_x64.cpp backend/x64/a64_emit_x64.h + backend/x64/a64_emit_x64_memory.cpp backend/x64/a64_interface.cpp backend/x64/a64_jitstate.cpp backend/x64/a64_jitstate.h diff --git a/externals/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.cpp b/externals/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.cpp index 4f819825f..d85da5134 100755 --- a/externals/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.cpp +++ b/externals/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.cpp @@ -11,7 +11,6 @@ #include #include -#include #include "dynarmic/backend/x64/a32_jitstate.h" #include "dynarmic/backend/x64/abi.h" @@ -26,11 +25,9 @@ #include "dynarmic/common/common_types.h" #include "dynarmic/common/scope_exit.h" #include "dynarmic/common/variant_util.h" -#include "dynarmic/common/x64_disassemble.h" #include "dynarmic/frontend/A32/a32_location_descriptor.h" #include "dynarmic/frontend/A32/a32_types.h" #include "dynarmic/interface/A32/coprocessor.h" -#include "dynarmic/interface/exclusive_monitor.h" #include "dynarmic/ir/basic_block.h" #include "dynarmic/ir/microinstruction.h" #include "dynarmic/ir/opcodes.h" @@ -198,67 +195,6 @@ void A32EmitX64::ClearFastDispatchTable() { } } -void A32EmitX64::GenFastmemFallbacks() { - const std::initializer_list idxes{0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}; - const std::array, 4> read_callbacks{{ - {8, Devirtualize<&A32::UserCallbacks::MemoryRead8>(conf.callbacks)}, - {16, Devirtualize<&A32::UserCallbacks::MemoryRead16>(conf.callbacks)}, - {32, Devirtualize<&A32::UserCallbacks::MemoryRead32>(conf.callbacks)}, - {64, Devirtualize<&A32::UserCallbacks::MemoryRead64>(conf.callbacks)}, - }}; - const std::array, 4> write_callbacks{{ - {8, Devirtualize<&A32::UserCallbacks::MemoryWrite8>(conf.callbacks)}, - {16, Devirtualize<&A32::UserCallbacks::MemoryWrite16>(conf.callbacks)}, - {32, Devirtualize<&A32::UserCallbacks::MemoryWrite32>(conf.callbacks)}, - {64, Devirtualize<&A32::UserCallbacks::MemoryWrite64>(conf.callbacks)}, - }}; - - for (int vaddr_idx : 
idxes) { - for (int value_idx : idxes) { - for (const auto& [bitsize, callback] : read_callbacks) { - code.align(); - read_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)] = code.getCurr(); - ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocRegIdx(value_idx)); - if (vaddr_idx != code.ABI_PARAM2.getIdx()) { - code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); - } - callback.EmitCall(code); - if (value_idx != code.ABI_RETURN.getIdx()) { - code.mov(Xbyak::Reg64{value_idx}, code.ABI_RETURN); - } - ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocRegIdx(value_idx)); - code.ret(); - PerfMapRegister(read_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a32_read_fallback_{}", bitsize)); - } - - for (const auto& [bitsize, callback] : write_callbacks) { - code.align(); - write_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)] = code.getCurr(); - ABI_PushCallerSaveRegistersAndAdjustStack(code); - if (vaddr_idx == code.ABI_PARAM3.getIdx() && value_idx == code.ABI_PARAM2.getIdx()) { - code.xchg(code.ABI_PARAM2, code.ABI_PARAM3); - } else if (vaddr_idx == code.ABI_PARAM3.getIdx()) { - code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); - if (value_idx != code.ABI_PARAM3.getIdx()) { - code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); - } - } else { - if (value_idx != code.ABI_PARAM3.getIdx()) { - code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); - } - if (vaddr_idx != code.ABI_PARAM2.getIdx()) { - code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); - } - } - callback.EmitCall(code); - ABI_PopCallerSaveRegistersAndAdjustStack(code); - code.ret(); - PerfMapRegister(write_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a32_write_fallback_{}", bitsize)); - } - } - } -} - void A32EmitX64::GenTerminalHandlers() { // PC ends up in ebp, location_descriptor ends up in rbx const auto calculate_location_descriptor = [this] { @@ -875,372 +811,6 @@ void A32EmitX64::EmitA32SetFpscrNZCV(A32EmitContext& ctx, IR::Inst* inst) { code.mov(dword[r15 + offsetof(A32JitState, fpsr_nzcv)], value); } -void A32EmitX64::EmitA32ClearExclusive(A32EmitContext&, IR::Inst*) { - code.mov(code.byte[r15 + offsetof(A32JitState, exclusive_state)], u8(0)); -} - -std::optional A32EmitX64::ShouldFastmem(A32EmitContext& ctx, IR::Inst* inst) const { - if (!conf.fastmem_pointer || !exception_handler.SupportsFastmem()) { - return std::nullopt; - } - - const auto marker = std::make_tuple(ctx.Location(), ctx.GetInstOffset(inst)); - if (do_not_fastmem.count(marker) > 0) { - return std::nullopt; - } - return marker; -} - -FakeCall A32EmitX64::FastmemCallback(u64 rip_) { - const auto iter = fastmem_patch_info.find(rip_); - - if (iter == fastmem_patch_info.end()) { - fmt::print("dynarmic: Segfault happened within JITted code at rip = {:016x}\n", rip_); - fmt::print("Segfault wasn't at a fastmem patch location!\n"); - fmt::print("Now dumping code.......\n\n"); - Common::DumpDisassembledX64((void*)(rip_ & ~u64(0xFFF)), 0x1000); - ASSERT_FALSE("iter != fastmem_patch_info.end()"); - } - - if (conf.recompile_on_fastmem_failure) { - const auto marker = iter->second.marker; - do_not_fastmem.emplace(marker); - InvalidateBasicBlocks({std::get<0>(marker)}); - } - FakeCall ret; - ret.call_rip = iter->second.callback; - ret.ret_rip = iter->second.resume_rip; - return ret; -} - -namespace { - -constexpr size_t page_bits = 12; -constexpr size_t page_size = 1 << page_bits; -constexpr size_t page_mask = (1 << page_bits) - 1; - -void 
EmitDetectMisaignedVAddr(BlockOfCode& code, A32EmitContext& ctx, size_t bitsize, Xbyak::Label& abort, Xbyak::Reg32 vaddr, Xbyak::Reg32 tmp) { - if (bitsize == 8 || (ctx.conf.detect_misaligned_access_via_page_table & bitsize) == 0) { - return; - } - - const u32 align_mask = [bitsize]() -> u32 { - switch (bitsize) { - case 16: - return 0b1; - case 32: - return 0b11; - case 64: - return 0b111; - } - UNREACHABLE(); - }(); - - code.test(vaddr, align_mask); - - if (!ctx.conf.only_detect_misalignment_via_page_table_on_page_boundary) { - code.jnz(abort, code.T_NEAR); - return; - } - - const u32 page_align_mask = static_cast(page_size - 1) & ~align_mask; - - Xbyak::Label detect_boundary, resume; - - code.jnz(detect_boundary, code.T_NEAR); - code.L(resume); - - code.SwitchToFarCode(); - code.L(detect_boundary); - code.mov(tmp, vaddr); - code.and_(tmp, page_align_mask); - code.cmp(tmp, page_align_mask); - code.jne(resume, code.T_NEAR); - // NOTE: We expect to fallthrough into abort code here. - code.SwitchToNearCode(); -} - -Xbyak::RegExp EmitVAddrLookup(BlockOfCode& code, A32EmitContext& ctx, size_t bitsize, Xbyak::Label& abort, Xbyak::Reg64 vaddr) { - const Xbyak::Reg64 page = ctx.reg_alloc.ScratchGpr(); - const Xbyak::Reg32 tmp = ctx.conf.absolute_offset_page_table ? page.cvt32() : ctx.reg_alloc.ScratchGpr().cvt32(); - - EmitDetectMisaignedVAddr(code, ctx, bitsize, abort, vaddr.cvt32(), tmp); - - // TODO: This code assumes vaddr has been zext from 32-bits to 64-bits. - - code.mov(tmp, vaddr.cvt32()); - code.shr(tmp, static_cast(page_bits)); - code.mov(page, qword[r14 + tmp.cvt64() * sizeof(void*)]); - if (ctx.conf.page_table_pointer_mask_bits == 0) { - code.test(page, page); - } else { - code.and_(page, ~u32(0) << ctx.conf.page_table_pointer_mask_bits); - } - code.jz(abort, code.T_NEAR); - if (ctx.conf.absolute_offset_page_table) { - return page + vaddr; - } - code.mov(tmp, vaddr.cvt32()); - code.and_(tmp, static_cast(page_mask)); - return page + tmp.cvt64(); -} - -template -void EmitReadMemoryMov(BlockOfCode& code, const Xbyak::Reg64& value, const Xbyak::RegExp& addr) { - switch (bitsize) { - case 8: - code.movzx(value.cvt32(), code.byte[addr]); - return; - case 16: - code.movzx(value.cvt32(), word[addr]); - return; - case 32: - code.mov(value.cvt32(), dword[addr]); - return; - case 64: - code.mov(value, qword[addr]); - return; - default: - ASSERT_FALSE("Invalid bitsize"); - } -} - -template -void EmitWriteMemoryMov(BlockOfCode& code, const Xbyak::RegExp& addr, const Xbyak::Reg64& value) { - switch (bitsize) { - case 8: - code.mov(code.byte[addr], value.cvt8()); - return; - case 16: - code.mov(word[addr], value.cvt16()); - return; - case 32: - code.mov(dword[addr], value.cvt32()); - return; - case 64: - code.mov(qword[addr], value); - return; - default: - ASSERT_FALSE("Invalid bitsize"); - } -} - -} // anonymous namespace - -template -void A32EmitX64::EmitMemoryRead(A32EmitContext& ctx, IR::Inst* inst) { - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const auto fastmem_marker = ShouldFastmem(ctx, inst); - - if (!conf.page_table && !fastmem_marker) { - // Neither fastmem nor page table: Use callbacks - ctx.reg_alloc.HostCall(inst, {}, args[0]); - Devirtualize(conf.callbacks).EmitCall(code); - return; - } - - const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]); - const Xbyak::Reg64 value = ctx.reg_alloc.ScratchGpr(); - - const auto wrapped_fn = read_fallbacks[std::make_tuple(bitsize, vaddr.getIdx(), value.getIdx())]; - - if (fastmem_marker) { - // Use fastmem - const auto src_ptr = r13 + 
vaddr; - - const auto location = code.getCurr(); - EmitReadMemoryMov(code, value, src_ptr); - - fastmem_patch_info.emplace( - Common::BitCast(location), - FastmemPatchInfo{ - Common::BitCast(code.getCurr()), - Common::BitCast(wrapped_fn), - *fastmem_marker, - }); - - ctx.reg_alloc.DefineValue(inst, value); - return; - } - - // Use page table - ASSERT(conf.page_table); - Xbyak::Label abort, end; - - const auto src_ptr = EmitVAddrLookup(code, ctx, bitsize, abort, vaddr); - EmitReadMemoryMov(code, value, src_ptr); - code.L(end); - - code.SwitchToFarCode(); - code.L(abort); - code.call(wrapped_fn); - code.jmp(end, code.T_NEAR); - code.SwitchToNearCode(); - - ctx.reg_alloc.DefineValue(inst, value); -} - -template -void A32EmitX64::EmitMemoryWrite(A32EmitContext& ctx, IR::Inst* inst) { - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const auto fastmem_marker = ShouldFastmem(ctx, inst); - - if (!conf.page_table && !fastmem_marker) { - // Neither fastmem nor page table: Use callbacks - ctx.reg_alloc.HostCall(nullptr, {}, args[0], args[1]); - Devirtualize(conf.callbacks).EmitCall(code); - return; - } - - const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]); - const Xbyak::Reg64 value = ctx.reg_alloc.UseGpr(args[1]); - - const auto wrapped_fn = write_fallbacks[std::make_tuple(bitsize, vaddr.getIdx(), value.getIdx())]; - - if (fastmem_marker) { - // Use fastmem - const auto dest_ptr = r13 + vaddr; - - const auto location = code.getCurr(); - EmitWriteMemoryMov(code, dest_ptr, value); - - fastmem_patch_info.emplace( - Common::BitCast(location), - FastmemPatchInfo{ - Common::BitCast(code.getCurr()), - Common::BitCast(wrapped_fn), - *fastmem_marker, - }); - - return; - } - - // Use page table - ASSERT(conf.page_table); - Xbyak::Label abort, end; - - const auto dest_ptr = EmitVAddrLookup(code, ctx, bitsize, abort, vaddr); - EmitWriteMemoryMov(code, dest_ptr, value); - code.L(end); - - code.SwitchToFarCode(); - code.L(abort); - code.call(wrapped_fn); - code.jmp(end, code.T_NEAR); - code.SwitchToNearCode(); -} - -void A32EmitX64::EmitA32ReadMemory8(A32EmitContext& ctx, IR::Inst* inst) { - EmitMemoryRead<8, &A32::UserCallbacks::MemoryRead8>(ctx, inst); -} - -void A32EmitX64::EmitA32ReadMemory16(A32EmitContext& ctx, IR::Inst* inst) { - EmitMemoryRead<16, &A32::UserCallbacks::MemoryRead16>(ctx, inst); -} - -void A32EmitX64::EmitA32ReadMemory32(A32EmitContext& ctx, IR::Inst* inst) { - EmitMemoryRead<32, &A32::UserCallbacks::MemoryRead32>(ctx, inst); -} - -void A32EmitX64::EmitA32ReadMemory64(A32EmitContext& ctx, IR::Inst* inst) { - EmitMemoryRead<64, &A32::UserCallbacks::MemoryRead64>(ctx, inst); -} - -void A32EmitX64::EmitA32WriteMemory8(A32EmitContext& ctx, IR::Inst* inst) { - EmitMemoryWrite<8, &A32::UserCallbacks::MemoryWrite8>(ctx, inst); -} - -void A32EmitX64::EmitA32WriteMemory16(A32EmitContext& ctx, IR::Inst* inst) { - EmitMemoryWrite<16, &A32::UserCallbacks::MemoryWrite16>(ctx, inst); -} - -void A32EmitX64::EmitA32WriteMemory32(A32EmitContext& ctx, IR::Inst* inst) { - EmitMemoryWrite<32, &A32::UserCallbacks::MemoryWrite32>(ctx, inst); -} - -void A32EmitX64::EmitA32WriteMemory64(A32EmitContext& ctx, IR::Inst* inst) { - EmitMemoryWrite<64, &A32::UserCallbacks::MemoryWrite64>(ctx, inst); -} - -template -void A32EmitX64::ExclusiveReadMemory(A32EmitContext& ctx, IR::Inst* inst) { - using T = mp::unsigned_integer_of_size; - - ASSERT(conf.global_monitor != nullptr); - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - - ctx.reg_alloc.HostCall(inst, {}, args[0]); - - code.mov(code.byte[r15 + 
offsetof(A32JitState, exclusive_state)], u8(1)); - code.mov(code.ABI_PARAM1, reinterpret_cast(&conf)); - code.CallLambda( - [](A32::UserConfig& conf, u32 vaddr) -> T { - return conf.global_monitor->ReadAndMark(conf.processor_id, vaddr, [&]() -> T { - return (conf.callbacks->*callback)(vaddr); - }); - }); -} - -template -void A32EmitX64::ExclusiveWriteMemory(A32EmitContext& ctx, IR::Inst* inst) { - using T = mp::unsigned_integer_of_size; - - ASSERT(conf.global_monitor != nullptr); - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - - ctx.reg_alloc.HostCall(inst, {}, args[0], args[1]); - - Xbyak::Label end; - - code.mov(code.ABI_RETURN, u32(1)); - code.cmp(code.byte[r15 + offsetof(A32JitState, exclusive_state)], u8(0)); - code.je(end); - code.mov(code.byte[r15 + offsetof(A32JitState, exclusive_state)], u8(0)); - code.mov(code.ABI_PARAM1, reinterpret_cast(&conf)); - code.CallLambda( - [](A32::UserConfig& conf, u32 vaddr, T value) -> u32 { - return conf.global_monitor->DoExclusiveOperation(conf.processor_id, vaddr, - [&](T expected) -> bool { - return (conf.callbacks->*callback)(vaddr, value, expected); - }) - ? 0 - : 1; - }); - code.L(end); -} - -void A32EmitX64::EmitA32ExclusiveReadMemory8(A32EmitContext& ctx, IR::Inst* inst) { - ExclusiveReadMemory<8, &A32::UserCallbacks::MemoryRead8>(ctx, inst); -} - -void A32EmitX64::EmitA32ExclusiveReadMemory16(A32EmitContext& ctx, IR::Inst* inst) { - ExclusiveReadMemory<16, &A32::UserCallbacks::MemoryRead16>(ctx, inst); -} - -void A32EmitX64::EmitA32ExclusiveReadMemory32(A32EmitContext& ctx, IR::Inst* inst) { - ExclusiveReadMemory<32, &A32::UserCallbacks::MemoryRead32>(ctx, inst); -} - -void A32EmitX64::EmitA32ExclusiveReadMemory64(A32EmitContext& ctx, IR::Inst* inst) { - ExclusiveReadMemory<64, &A32::UserCallbacks::MemoryRead64>(ctx, inst); -} - -void A32EmitX64::EmitA32ExclusiveWriteMemory8(A32EmitContext& ctx, IR::Inst* inst) { - ExclusiveWriteMemory<8, &A32::UserCallbacks::MemoryWriteExclusive8>(ctx, inst); -} - -void A32EmitX64::EmitA32ExclusiveWriteMemory16(A32EmitContext& ctx, IR::Inst* inst) { - ExclusiveWriteMemory<16, &A32::UserCallbacks::MemoryWriteExclusive16>(ctx, inst); -} - -void A32EmitX64::EmitA32ExclusiveWriteMemory32(A32EmitContext& ctx, IR::Inst* inst) { - ExclusiveWriteMemory<32, &A32::UserCallbacks::MemoryWriteExclusive32>(ctx, inst); -} - -void A32EmitX64::EmitA32ExclusiveWriteMemory64(A32EmitContext& ctx, IR::Inst* inst) { - ExclusiveWriteMemory<64, &A32::UserCallbacks::MemoryWriteExclusive64>(ctx, inst); -} - static void EmitCoprocessorException() { ASSERT_FALSE("Should raise coproc exception here"); } diff --git a/externals/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.h b/externals/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.h index 6684835f7..43f90f6a1 100755 --- a/externals/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.h +++ b/externals/dynarmic/src/dynarmic/backend/x64/a32_emit_x64.h @@ -73,6 +73,7 @@ protected: std::map, void (*)()> read_fallbacks; std::map, void (*)()> write_fallbacks; + std::map, void (*)()> exclusive_write_fallbacks; void GenFastmemFallbacks(); const void* terminal_handler_pop_rsb_hint; @@ -98,6 +99,7 @@ protected: u64 resume_rip; u64 callback; DoNotFastmemMarker marker; + bool compile; }; tsl::robin_map fastmem_patch_info; std::set do_not_fastmem; @@ -113,6 +115,10 @@ protected: void ExclusiveReadMemory(A32EmitContext& ctx, IR::Inst* inst); template void ExclusiveWriteMemory(A32EmitContext& ctx, IR::Inst* inst); + template + void ExclusiveReadMemoryInline(A32EmitContext& ctx, IR::Inst* inst); + 
template + void ExclusiveWriteMemoryInline(A32EmitContext& ctx, IR::Inst* inst); // Terminal instruction emitters void EmitSetUpperLocationDescriptor(IR::LocationDescriptor new_location, IR::LocationDescriptor old_location); diff --git a/externals/dynarmic/src/dynarmic/backend/x64/a32_emit_x64_memory.cpp b/externals/dynarmic/src/dynarmic/backend/x64/a32_emit_x64_memory.cpp new file mode 100755 index 000000000..171db013c --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/a32_emit_x64_memory.cpp @@ -0,0 +1,672 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "dynarmic/backend/x64/a32_emit_x64.h" +#include "dynarmic/backend/x64/abi.h" +#include "dynarmic/backend/x64/devirtualize.h" +#include "dynarmic/backend/x64/emit_x64_memory.h" +#include "dynarmic/backend/x64/exclusive_monitor_friend.h" +#include "dynarmic/backend/x64/perf_map.h" +#include "dynarmic/common/x64_disassemble.h" +#include "dynarmic/interface/exclusive_monitor.h" + +namespace Dynarmic::Backend::X64 { + +using namespace Xbyak::util; + +void A32EmitX64::GenFastmemFallbacks() { + const std::initializer_list idxes{0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}; + const std::array, 4> read_callbacks{{ + {8, Devirtualize<&A32::UserCallbacks::MemoryRead8>(conf.callbacks)}, + {16, Devirtualize<&A32::UserCallbacks::MemoryRead16>(conf.callbacks)}, + {32, Devirtualize<&A32::UserCallbacks::MemoryRead32>(conf.callbacks)}, + {64, Devirtualize<&A32::UserCallbacks::MemoryRead64>(conf.callbacks)}, + }}; + const std::array, 4> write_callbacks{{ + {8, Devirtualize<&A32::UserCallbacks::MemoryWrite8>(conf.callbacks)}, + {16, Devirtualize<&A32::UserCallbacks::MemoryWrite16>(conf.callbacks)}, + {32, Devirtualize<&A32::UserCallbacks::MemoryWrite32>(conf.callbacks)}, + {64, Devirtualize<&A32::UserCallbacks::MemoryWrite64>(conf.callbacks)}, + }}; + const std::array, 4> exclusive_write_callbacks{{ + {8, Devirtualize<&A32::UserCallbacks::MemoryWriteExclusive8>(conf.callbacks)}, + {16, Devirtualize<&A32::UserCallbacks::MemoryWriteExclusive16>(conf.callbacks)}, + {32, Devirtualize<&A32::UserCallbacks::MemoryWriteExclusive32>(conf.callbacks)}, + {64, Devirtualize<&A32::UserCallbacks::MemoryWriteExclusive64>(conf.callbacks)}, + }}; + + for (int vaddr_idx : idxes) { + for (int value_idx : idxes) { + for (const auto& [bitsize, callback] : read_callbacks) { + code.align(); + read_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)] = code.getCurr(); + ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocRegIdx(value_idx)); + if (vaddr_idx != code.ABI_PARAM2.getIdx()) { + code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); + } + callback.EmitCall(code); + if (value_idx != code.ABI_RETURN.getIdx()) { + code.mov(Xbyak::Reg64{value_idx}, code.ABI_RETURN); + } + ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocRegIdx(value_idx)); + code.ret(); + PerfMapRegister(read_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a32_read_fallback_{}", bitsize)); + } + + for (const auto& [bitsize, callback] : write_callbacks) { + code.align(); + write_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)] = code.getCurr(); + ABI_PushCallerSaveRegistersAndAdjustStack(code); + if (vaddr_idx == code.ABI_PARAM3.getIdx() && value_idx == code.ABI_PARAM2.getIdx()) { + code.xchg(code.ABI_PARAM2, code.ABI_PARAM3); + } else if (vaddr_idx == 
code.ABI_PARAM3.getIdx()) { + code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); + if (value_idx != code.ABI_PARAM3.getIdx()) { + code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); + } + } else { + if (value_idx != code.ABI_PARAM3.getIdx()) { + code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); + } + if (vaddr_idx != code.ABI_PARAM2.getIdx()) { + code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); + } + } + callback.EmitCall(code); + ABI_PopCallerSaveRegistersAndAdjustStack(code); + code.ret(); + PerfMapRegister(write_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a32_write_fallback_{}", bitsize)); + } + + for (const auto& [bitsize, callback] : exclusive_write_callbacks) { + code.align(); + exclusive_write_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)] = code.getCurr(); + ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX); + if (vaddr_idx == code.ABI_PARAM3.getIdx() && value_idx == code.ABI_PARAM2.getIdx()) { + code.xchg(code.ABI_PARAM2, code.ABI_PARAM3); + } else if (vaddr_idx == code.ABI_PARAM3.getIdx()) { + code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); + if (value_idx != code.ABI_PARAM3.getIdx()) { + code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); + } + } else { + if (value_idx != code.ABI_PARAM3.getIdx()) { + code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); + } + if (vaddr_idx != code.ABI_PARAM2.getIdx()) { + code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); + } + } + code.mov(code.ABI_PARAM4, rax); + callback.EmitCall(code); + ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX); + code.ret(); + PerfMapRegister(exclusive_write_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a32_exclusive_write_fallback_{}", bitsize)); + } + } + } +} + +std::optional A32EmitX64::ShouldFastmem(A32EmitContext& ctx, IR::Inst* inst) const { + if (!conf.fastmem_pointer || !exception_handler.SupportsFastmem()) { + return std::nullopt; + } + + const auto marker = std::make_tuple(ctx.Location(), ctx.GetInstOffset(inst)); + if (do_not_fastmem.count(marker) > 0) { + return std::nullopt; + } + return marker; +} + +FakeCall A32EmitX64::FastmemCallback(u64 rip_) { + const auto iter = fastmem_patch_info.find(rip_); + + if (iter == fastmem_patch_info.end()) { + fmt::print("dynarmic: Segfault happened within JITted code at rip = {:016x}\n", rip_); + fmt::print("Segfault wasn't at a fastmem patch location!\n"); + fmt::print("Now dumping code.......\n\n"); + Common::DumpDisassembledX64((void*)(rip_ & ~u64(0xFFF)), 0x1000); + ASSERT_FALSE("iter != fastmem_patch_info.end()"); + } + + if (iter->second.compile) { + const auto marker = iter->second.marker; + do_not_fastmem.emplace(marker); + InvalidateBasicBlocks({std::get<0>(marker)}); + } + + return FakeCall{ + .call_rip = iter->second.callback, + .ret_rip = iter->second.resume_rip, + }; +} + +namespace { + +constexpr size_t page_bits = 12; +constexpr size_t page_size = 1 << page_bits; +constexpr size_t page_mask = (1 << page_bits) - 1; + +void EmitDetectMisaignedVAddr(BlockOfCode& code, A32EmitContext& ctx, size_t bitsize, Xbyak::Label& abort, Xbyak::Reg32 vaddr, Xbyak::Reg32 tmp) { + if (bitsize == 8 || (ctx.conf.detect_misaligned_access_via_page_table & bitsize) == 0) { + return; + } + + const u32 align_mask = [bitsize]() -> u32 { + switch (bitsize) { + case 16: + return 0b1; + case 32: + return 0b11; + case 64: + return 0b111; + } + UNREACHABLE(); + }(); + + code.test(vaddr, align_mask); + + if 
(!ctx.conf.only_detect_misalignment_via_page_table_on_page_boundary) { + code.jnz(abort, code.T_NEAR); + return; + } + + const u32 page_align_mask = static_cast(page_size - 1) & ~align_mask; + + Xbyak::Label detect_boundary, resume; + + code.jnz(detect_boundary, code.T_NEAR); + code.L(resume); + + code.SwitchToFarCode(); + code.L(detect_boundary); + code.mov(tmp, vaddr); + code.and_(tmp, page_align_mask); + code.cmp(tmp, page_align_mask); + code.jne(resume, code.T_NEAR); + // NOTE: We expect to fallthrough into abort code here. + code.SwitchToNearCode(); +} + +Xbyak::RegExp EmitVAddrLookup(BlockOfCode& code, A32EmitContext& ctx, size_t bitsize, Xbyak::Label& abort, Xbyak::Reg64 vaddr) { + const Xbyak::Reg64 page = ctx.reg_alloc.ScratchGpr(); + const Xbyak::Reg32 tmp = ctx.conf.absolute_offset_page_table ? page.cvt32() : ctx.reg_alloc.ScratchGpr().cvt32(); + + EmitDetectMisaignedVAddr(code, ctx, bitsize, abort, vaddr.cvt32(), tmp); + + // TODO: This code assumes vaddr has been zext from 32-bits to 64-bits. + + code.mov(tmp, vaddr.cvt32()); + code.shr(tmp, static_cast(page_bits)); + code.mov(page, qword[r14 + tmp.cvt64() * sizeof(void*)]); + if (ctx.conf.page_table_pointer_mask_bits == 0) { + code.test(page, page); + } else { + code.and_(page, ~u32(0) << ctx.conf.page_table_pointer_mask_bits); + } + code.jz(abort, code.T_NEAR); + if (ctx.conf.absolute_offset_page_table) { + return page + vaddr; + } + code.mov(tmp, vaddr.cvt32()); + code.and_(tmp, static_cast(page_mask)); + return page + tmp.cvt64(); +} + +template +void EmitReadMemoryMov(BlockOfCode& code, const Xbyak::Reg64& value, const Xbyak::RegExp& addr) { + switch (bitsize) { + case 8: + code.movzx(value.cvt32(), code.byte[addr]); + return; + case 16: + code.movzx(value.cvt32(), word[addr]); + return; + case 32: + code.mov(value.cvt32(), dword[addr]); + return; + case 64: + code.mov(value, qword[addr]); + return; + default: + ASSERT_FALSE("Invalid bitsize"); + } +} + +template +void EmitWriteMemoryMov(BlockOfCode& code, const Xbyak::RegExp& addr, const Xbyak::Reg64& value) { + switch (bitsize) { + case 8: + code.mov(code.byte[addr], value.cvt8()); + return; + case 16: + code.mov(word[addr], value.cvt16()); + return; + case 32: + code.mov(dword[addr], value.cvt32()); + return; + case 64: + code.mov(qword[addr], value); + return; + default: + ASSERT_FALSE("Invalid bitsize"); + } +} + +} // anonymous namespace + +template +void A32EmitX64::EmitMemoryRead(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const auto fastmem_marker = ShouldFastmem(ctx, inst); + + if (!conf.page_table && !fastmem_marker) { + // Neither fastmem nor page table: Use callbacks + ctx.reg_alloc.HostCall(inst, {}, args[0]); + Devirtualize(conf.callbacks).EmitCall(code); + return; + } + + const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]); + const Xbyak::Reg64 value = ctx.reg_alloc.ScratchGpr(); + + const auto wrapped_fn = read_fallbacks[std::make_tuple(bitsize, vaddr.getIdx(), value.getIdx())]; + + if (fastmem_marker) { + // Use fastmem + const auto src_ptr = r13 + vaddr; + + const auto location = code.getCurr(); + EmitReadMemoryMov(code, value, src_ptr); + + fastmem_patch_info.emplace( + Common::BitCast(location), + FastmemPatchInfo{ + Common::BitCast(code.getCurr()), + Common::BitCast(wrapped_fn), + *fastmem_marker, + conf.recompile_on_fastmem_failure, + }); + + ctx.reg_alloc.DefineValue(inst, value); + return; + } + + // Use page table + ASSERT(conf.page_table); + Xbyak::Label abort, end; + + const auto src_ptr = 
EmitVAddrLookup(code, ctx, bitsize, abort, vaddr); + EmitReadMemoryMov(code, value, src_ptr); + code.L(end); + + code.SwitchToFarCode(); + code.L(abort); + code.call(wrapped_fn); + code.jmp(end, code.T_NEAR); + code.SwitchToNearCode(); + + ctx.reg_alloc.DefineValue(inst, value); +} + +template +void A32EmitX64::EmitMemoryWrite(A32EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const auto fastmem_marker = ShouldFastmem(ctx, inst); + + if (!conf.page_table && !fastmem_marker) { + // Neither fastmem nor page table: Use callbacks + ctx.reg_alloc.HostCall(nullptr, {}, args[0], args[1]); + Devirtualize(conf.callbacks).EmitCall(code); + return; + } + + const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]); + const Xbyak::Reg64 value = ctx.reg_alloc.UseGpr(args[1]); + + const auto wrapped_fn = write_fallbacks[std::make_tuple(bitsize, vaddr.getIdx(), value.getIdx())]; + + if (fastmem_marker) { + // Use fastmem + const auto dest_ptr = r13 + vaddr; + + const auto location = code.getCurr(); + EmitWriteMemoryMov(code, dest_ptr, value); + + fastmem_patch_info.emplace( + Common::BitCast(location), + FastmemPatchInfo{ + Common::BitCast(code.getCurr()), + Common::BitCast(wrapped_fn), + *fastmem_marker, + conf.recompile_on_fastmem_failure, + }); + + return; + } + + // Use page table + ASSERT(conf.page_table); + Xbyak::Label abort, end; + + const auto dest_ptr = EmitVAddrLookup(code, ctx, bitsize, abort, vaddr); + EmitWriteMemoryMov(code, dest_ptr, value); + code.L(end); + + code.SwitchToFarCode(); + code.L(abort); + code.call(wrapped_fn); + code.jmp(end, code.T_NEAR); + code.SwitchToNearCode(); +} + +void A32EmitX64::EmitA32ReadMemory8(A32EmitContext& ctx, IR::Inst* inst) { + EmitMemoryRead<8, &A32::UserCallbacks::MemoryRead8>(ctx, inst); +} + +void A32EmitX64::EmitA32ReadMemory16(A32EmitContext& ctx, IR::Inst* inst) { + EmitMemoryRead<16, &A32::UserCallbacks::MemoryRead16>(ctx, inst); +} + +void A32EmitX64::EmitA32ReadMemory32(A32EmitContext& ctx, IR::Inst* inst) { + EmitMemoryRead<32, &A32::UserCallbacks::MemoryRead32>(ctx, inst); +} + +void A32EmitX64::EmitA32ReadMemory64(A32EmitContext& ctx, IR::Inst* inst) { + EmitMemoryRead<64, &A32::UserCallbacks::MemoryRead64>(ctx, inst); +} + +void A32EmitX64::EmitA32WriteMemory8(A32EmitContext& ctx, IR::Inst* inst) { + EmitMemoryWrite<8, &A32::UserCallbacks::MemoryWrite8>(ctx, inst); +} + +void A32EmitX64::EmitA32WriteMemory16(A32EmitContext& ctx, IR::Inst* inst) { + EmitMemoryWrite<16, &A32::UserCallbacks::MemoryWrite16>(ctx, inst); +} + +void A32EmitX64::EmitA32WriteMemory32(A32EmitContext& ctx, IR::Inst* inst) { + EmitMemoryWrite<32, &A32::UserCallbacks::MemoryWrite32>(ctx, inst); +} + +void A32EmitX64::EmitA32WriteMemory64(A32EmitContext& ctx, IR::Inst* inst) { + EmitMemoryWrite<64, &A32::UserCallbacks::MemoryWrite64>(ctx, inst); +} + +template +void A32EmitX64::ExclusiveReadMemory(A32EmitContext& ctx, IR::Inst* inst) { + using T = mp::unsigned_integer_of_size; + + ASSERT(conf.global_monitor != nullptr); + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + ctx.reg_alloc.HostCall(inst, {}, args[0]); + + code.mov(code.byte[r15 + offsetof(A32JitState, exclusive_state)], u8(1)); + code.mov(code.ABI_PARAM1, reinterpret_cast(&conf)); + code.CallLambda( + [](A32::UserConfig& conf, u32 vaddr) -> T { + return conf.global_monitor->ReadAndMark(conf.processor_id, vaddr, [&]() -> T { + return (conf.callbacks->*callback)(vaddr); + }); + }); +} + +template +void A32EmitX64::ExclusiveWriteMemory(A32EmitContext& ctx, IR::Inst* 
inst) { + using T = mp::unsigned_integer_of_size; + + ASSERT(conf.global_monitor != nullptr); + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + ctx.reg_alloc.HostCall(inst, {}, args[0], args[1]); + + Xbyak::Label end; + + code.mov(code.ABI_RETURN, u32(1)); + code.cmp(code.byte[r15 + offsetof(A32JitState, exclusive_state)], u8(0)); + code.je(end); + code.mov(code.byte[r15 + offsetof(A32JitState, exclusive_state)], u8(0)); + code.mov(code.ABI_PARAM1, reinterpret_cast(&conf)); + code.CallLambda( + [](A32::UserConfig& conf, u32 vaddr, T value) -> u32 { + return conf.global_monitor->DoExclusiveOperation(conf.processor_id, vaddr, + [&](T expected) -> bool { + return (conf.callbacks->*callback)(vaddr, value, expected); + }) + ? 0 + : 1; + }); + code.L(end); +} + +template +void A32EmitX64::ExclusiveReadMemoryInline(A32EmitContext& ctx, IR::Inst* inst) { + ASSERT(conf.global_monitor && conf.fastmem_pointer); + if (!exception_handler.SupportsFastmem()) { + ExclusiveReadMemory(ctx, inst); + return; + } + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]); + const Xbyak::Reg64 value = ctx.reg_alloc.ScratchGpr(); + const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr(); + const Xbyak::Reg64 tmp2 = ctx.reg_alloc.ScratchGpr(); + + const auto wrapped_fn = read_fallbacks[std::make_tuple(bitsize, vaddr.getIdx(), value.getIdx())]; + + EmitExclusiveLock(code, conf, tmp, tmp2.cvt32()); + + code.mov(code.byte[r15 + offsetof(A32JitState, exclusive_state)], u8(1)); + code.mov(tmp, Common::BitCast(GetExclusiveMonitorAddressPointer(conf.global_monitor, conf.processor_id))); + code.mov(qword[tmp], vaddr); + + const auto fastmem_marker = ShouldFastmem(ctx, inst); + if (fastmem_marker) { + Xbyak::Label end; + + const auto src_ptr = r13 + vaddr; + + const auto location = code.getCurr(); + EmitReadMemoryMov(code, value, src_ptr); + + fastmem_patch_info.emplace( + Common::BitCast(location), + FastmemPatchInfo{ + Common::BitCast(code.getCurr()), + Common::BitCast(wrapped_fn), + *fastmem_marker, + conf.recompile_on_exclusive_fastmem_failure, + }); + + code.L(end); + } else { + code.call(wrapped_fn); + } + + code.mov(tmp, Common::BitCast(GetExclusiveMonitorValuePointer(conf.global_monitor, conf.processor_id))); + EmitWriteMemoryMov(code, tmp, value); + + EmitExclusiveUnlock(code, conf, tmp, tmp2.cvt32()); + + ctx.reg_alloc.DefineValue(inst, value); +} + +template +void A32EmitX64::ExclusiveWriteMemoryInline(A32EmitContext& ctx, IR::Inst* inst) { + ASSERT(conf.global_monitor && conf.fastmem_pointer); + if (!exception_handler.SupportsFastmem()) { + ExclusiveWriteMemory(ctx, inst); + return; + } + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + ctx.reg_alloc.ScratchGpr(HostLoc::RAX); + const Xbyak::Reg64 value = ctx.reg_alloc.UseGpr(args[1]); + const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]); + const Xbyak::Reg32 status = ctx.reg_alloc.ScratchGpr().cvt32(); + const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr(); + + const auto fallback_fn = exclusive_write_fallbacks[std::make_tuple(bitsize, vaddr.getIdx(), value.getIdx())]; + + EmitExclusiveLock(code, conf, tmp, eax); + + Xbyak::Label end; + + code.mov(tmp, Common::BitCast(GetExclusiveMonitorAddressPointer(conf.global_monitor, conf.processor_id))); + code.mov(status, u32(1)); + code.cmp(code.byte[r15 + offsetof(A32JitState, exclusive_state)], u8(0)); + code.je(end, code.T_NEAR); + code.cmp(qword[tmp], vaddr); + code.jne(end, code.T_NEAR); + + EmitExclusiveTestAndClear(code, conf, vaddr, tmp, rax); + 
+ code.mov(code.byte[r15 + offsetof(A32JitState, exclusive_state)], u8(0)); + code.mov(tmp, Common::BitCast(GetExclusiveMonitorValuePointer(conf.global_monitor, conf.processor_id))); + + EmitReadMemoryMov(code, rax, tmp); + + const auto fastmem_marker = ShouldFastmem(ctx, inst); + if (fastmem_marker) { + const auto dest_ptr = r13 + vaddr; + + const auto location = code.getCurr(); + + switch (bitsize) { + case 8: + code.lock(); + code.cmpxchg(code.byte[dest_ptr], value.cvt8()); + break; + case 16: + code.lock(); + code.cmpxchg(word[dest_ptr], value.cvt16()); + break; + case 32: + code.lock(); + code.cmpxchg(dword[dest_ptr], value.cvt32()); + break; + case 64: + code.lock(); + code.cmpxchg(qword[dest_ptr], value.cvt64()); + break; + default: + UNREACHABLE(); + } + + code.setnz(status.cvt8()); + + code.SwitchToFarCode(); + + fastmem_patch_info.emplace( + Common::BitCast(location), + FastmemPatchInfo{ + Common::BitCast(code.getCurr()), + Common::BitCast(fallback_fn), + *fastmem_marker, + conf.recompile_on_exclusive_fastmem_failure, + }); + + code.cmp(al, 0); + code.setz(status.cvt8()); + code.movzx(status.cvt32(), status.cvt8()); + code.jmp(end, code.T_NEAR); + code.SwitchToNearCode(); + } else { + code.call(fallback_fn); + code.cmp(al, 0); + code.setz(status.cvt8()); + code.movzx(status.cvt32(), status.cvt8()); + } + + code.L(end); + + EmitExclusiveUnlock(code, conf, tmp, eax); + + ctx.reg_alloc.DefineValue(inst, status); +} + +void A32EmitX64::EmitA32ClearExclusive(A32EmitContext&, IR::Inst*) { + code.mov(code.byte[r15 + offsetof(A32JitState, exclusive_state)], u8(0)); +} + +void A32EmitX64::EmitA32ExclusiveReadMemory8(A32EmitContext& ctx, IR::Inst* inst) { + if (conf.fastmem_exclusive_access) { + ExclusiveReadMemoryInline<8, &A32::UserCallbacks::MemoryRead8>(ctx, inst); + } else { + ExclusiveReadMemory<8, &A32::UserCallbacks::MemoryRead8>(ctx, inst); + } +} + +void A32EmitX64::EmitA32ExclusiveReadMemory16(A32EmitContext& ctx, IR::Inst* inst) { + if (conf.fastmem_exclusive_access) { + ExclusiveReadMemoryInline<16, &A32::UserCallbacks::MemoryRead16>(ctx, inst); + } else { + ExclusiveReadMemory<16, &A32::UserCallbacks::MemoryRead16>(ctx, inst); + } +} + +void A32EmitX64::EmitA32ExclusiveReadMemory32(A32EmitContext& ctx, IR::Inst* inst) { + if (conf.fastmem_exclusive_access) { + ExclusiveReadMemoryInline<32, &A32::UserCallbacks::MemoryRead32>(ctx, inst); + } else { + ExclusiveReadMemory<32, &A32::UserCallbacks::MemoryRead32>(ctx, inst); + } +} + +void A32EmitX64::EmitA32ExclusiveReadMemory64(A32EmitContext& ctx, IR::Inst* inst) { + if (conf.fastmem_exclusive_access) { + ExclusiveReadMemoryInline<64, &A32::UserCallbacks::MemoryRead64>(ctx, inst); + } else { + ExclusiveReadMemory<64, &A32::UserCallbacks::MemoryRead64>(ctx, inst); + } +} + +void A32EmitX64::EmitA32ExclusiveWriteMemory8(A32EmitContext& ctx, IR::Inst* inst) { + if (conf.fastmem_exclusive_access) { + ExclusiveWriteMemoryInline<8, &A32::UserCallbacks::MemoryWriteExclusive8>(ctx, inst); + } else { + ExclusiveWriteMemory<8, &A32::UserCallbacks::MemoryWriteExclusive8>(ctx, inst); + } +} + +void A32EmitX64::EmitA32ExclusiveWriteMemory16(A32EmitContext& ctx, IR::Inst* inst) { + if (conf.fastmem_exclusive_access) { + ExclusiveWriteMemoryInline<16, &A32::UserCallbacks::MemoryWriteExclusive16>(ctx, inst); + } else { + ExclusiveWriteMemory<16, &A32::UserCallbacks::MemoryWriteExclusive16>(ctx, inst); + } +} + +void A32EmitX64::EmitA32ExclusiveWriteMemory32(A32EmitContext& ctx, IR::Inst* inst) { + if (conf.fastmem_exclusive_access) { + 
ExclusiveWriteMemoryInline<32, &A32::UserCallbacks::MemoryWriteExclusive32>(ctx, inst); + } else { + ExclusiveWriteMemory<32, &A32::UserCallbacks::MemoryWriteExclusive32>(ctx, inst); + } +} + +void A32EmitX64::EmitA32ExclusiveWriteMemory64(A32EmitContext& ctx, IR::Inst* inst) { + if (conf.fastmem_exclusive_access) { + ExclusiveWriteMemoryInline<64, &A32::UserCallbacks::MemoryWriteExclusive64>(ctx, inst); + } else { + ExclusiveWriteMemory<64, &A32::UserCallbacks::MemoryWriteExclusive64>(ctx, inst); + } +} + +} // namespace Dynarmic::Backend::X64 diff --git a/externals/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.cpp b/externals/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.cpp index 8de3ee5f4..a4d36aa9f 100755 --- a/externals/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.cpp +++ b/externals/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.cpp @@ -5,8 +5,6 @@ #include "dynarmic/backend/x64/a64_emit_x64.h" -#include - #include #include #include @@ -23,10 +21,8 @@ #include "dynarmic/common/bit_util.h" #include "dynarmic/common/common_types.h" #include "dynarmic/common/scope_exit.h" -#include "dynarmic/common/x64_disassemble.h" #include "dynarmic/frontend/A64/a64_location_descriptor.h" #include "dynarmic/frontend/A64/a64_types.h" -#include "dynarmic/interface/exclusive_monitor.h" #include "dynarmic/ir/basic_block.h" #include "dynarmic/ir/cond.h" #include "dynarmic/ir/microinstruction.h" @@ -156,155 +152,6 @@ void A64EmitX64::ClearFastDispatchTable() { } } -void A64EmitX64::GenMemory128Accessors() { - code.align(); - memory_read_128 = code.getCurr(); -#ifdef _WIN32 - Devirtualize<&A64::UserCallbacks::MemoryRead128>(conf.callbacks).EmitCallWithReturnPointer(code, [&](Xbyak::Reg64 return_value_ptr, [[maybe_unused]] RegList args) { - code.mov(code.ABI_PARAM3, code.ABI_PARAM2); - code.sub(rsp, 8 + 16 + ABI_SHADOW_SPACE); - code.lea(return_value_ptr, ptr[rsp + ABI_SHADOW_SPACE]); - }); - code.movups(xmm1, xword[code.ABI_RETURN]); - code.add(rsp, 8 + 16 + ABI_SHADOW_SPACE); -#else - code.sub(rsp, 8); - Devirtualize<&A64::UserCallbacks::MemoryRead128>(conf.callbacks).EmitCall(code); - if (code.HasHostFeature(HostFeature::SSE41)) { - code.movq(xmm1, code.ABI_RETURN); - code.pinsrq(xmm1, code.ABI_RETURN2, 1); - } else { - code.movq(xmm1, code.ABI_RETURN); - code.movq(xmm2, code.ABI_RETURN2); - code.punpcklqdq(xmm1, xmm2); - } - code.add(rsp, 8); -#endif - code.ret(); - PerfMapRegister(memory_read_128, code.getCurr(), "a64_memory_read_128"); - - code.align(); - memory_write_128 = code.getCurr(); -#ifdef _WIN32 - code.sub(rsp, 8 + 16 + ABI_SHADOW_SPACE); - code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE]); - code.movaps(xword[code.ABI_PARAM3], xmm1); - Devirtualize<&A64::UserCallbacks::MemoryWrite128>(conf.callbacks).EmitCall(code); - code.add(rsp, 8 + 16 + ABI_SHADOW_SPACE); -#else - code.sub(rsp, 8); - if (code.HasHostFeature(HostFeature::SSE41)) { - code.movq(code.ABI_PARAM3, xmm1); - code.pextrq(code.ABI_PARAM4, xmm1, 1); - } else { - code.movq(code.ABI_PARAM3, xmm1); - code.punpckhqdq(xmm1, xmm1); - code.movq(code.ABI_PARAM4, xmm1); - } - Devirtualize<&A64::UserCallbacks::MemoryWrite128>(conf.callbacks).EmitCall(code); - code.add(rsp, 8); -#endif - code.ret(); - PerfMapRegister(memory_read_128, code.getCurr(), "a64_memory_write_128"); -} - -void A64EmitX64::GenFastmemFallbacks() { - const std::initializer_list idxes{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; - const std::array, 4> read_callbacks{{ - {8, Devirtualize<&A64::UserCallbacks::MemoryRead8>(conf.callbacks)}, - {16, 
Devirtualize<&A64::UserCallbacks::MemoryRead16>(conf.callbacks)}, - {32, Devirtualize<&A64::UserCallbacks::MemoryRead32>(conf.callbacks)}, - {64, Devirtualize<&A64::UserCallbacks::MemoryRead64>(conf.callbacks)}, - }}; - const std::array, 4> write_callbacks{{ - {8, Devirtualize<&A64::UserCallbacks::MemoryWrite8>(conf.callbacks)}, - {16, Devirtualize<&A64::UserCallbacks::MemoryWrite16>(conf.callbacks)}, - {32, Devirtualize<&A64::UserCallbacks::MemoryWrite32>(conf.callbacks)}, - {64, Devirtualize<&A64::UserCallbacks::MemoryWrite64>(conf.callbacks)}, - }}; - - for (int vaddr_idx : idxes) { - if (vaddr_idx == 4 || vaddr_idx == 15) { - continue; - } - - for (int value_idx : idxes) { - code.align(); - read_fallbacks[std::make_tuple(128, vaddr_idx, value_idx)] = code.getCurr(); - ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(value_idx)); - if (vaddr_idx != code.ABI_PARAM2.getIdx()) { - code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); - } - code.call(memory_read_128); - if (value_idx != 1) { - code.movaps(Xbyak::Xmm{value_idx}, xmm1); - } - ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(value_idx)); - code.ret(); - PerfMapRegister(read_fallbacks[std::make_tuple(128, vaddr_idx, value_idx)], code.getCurr(), "a64_read_fallback_128"); - - code.align(); - write_fallbacks[std::make_tuple(128, vaddr_idx, value_idx)] = code.getCurr(); - ABI_PushCallerSaveRegistersAndAdjustStack(code); - if (vaddr_idx != code.ABI_PARAM2.getIdx()) { - code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); - } - if (value_idx != 1) { - code.movaps(xmm1, Xbyak::Xmm{value_idx}); - } - code.call(memory_write_128); - ABI_PopCallerSaveRegistersAndAdjustStack(code); - code.ret(); - PerfMapRegister(write_fallbacks[std::make_tuple(128, vaddr_idx, value_idx)], code.getCurr(), "a64_write_fallback_128"); - - if (value_idx == 4 || value_idx == 15) { - continue; - } - - for (const auto& [bitsize, callback] : read_callbacks) { - code.align(); - read_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)] = code.getCurr(); - ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocRegIdx(value_idx)); - if (vaddr_idx != code.ABI_PARAM2.getIdx()) { - code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); - } - callback.EmitCall(code); - if (value_idx != code.ABI_RETURN.getIdx()) { - code.mov(Xbyak::Reg64{value_idx}, code.ABI_RETURN); - } - ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocRegIdx(value_idx)); - code.ret(); - PerfMapRegister(read_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a64_read_fallback_{}", bitsize)); - } - - for (const auto& [bitsize, callback] : write_callbacks) { - code.align(); - write_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)] = code.getCurr(); - ABI_PushCallerSaveRegistersAndAdjustStack(code); - if (vaddr_idx == code.ABI_PARAM3.getIdx() && value_idx == code.ABI_PARAM2.getIdx()) { - code.xchg(code.ABI_PARAM2, code.ABI_PARAM3); - } else if (vaddr_idx == code.ABI_PARAM3.getIdx()) { - code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); - if (value_idx != code.ABI_PARAM3.getIdx()) { - code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); - } - } else { - if (value_idx != code.ABI_PARAM3.getIdx()) { - code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); - } - if (vaddr_idx != code.ABI_PARAM2.getIdx()) { - code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); - } - } - callback.EmitCall(code); - ABI_PopCallerSaveRegistersAndAdjustStack(code); - code.ret(); - PerfMapRegister(write_fallbacks[std::make_tuple(bitsize, 
vaddr_idx, value_idx)], code.getCurr(), fmt::format("a64_write_fallback_{}", bitsize)); - } - } - } -} - void A64EmitX64::GenTerminalHandlers() { // PC ends up in rbp, location_descriptor ends up in rbx const auto calculate_location_descriptor = [this] { @@ -742,600 +589,6 @@ void A64EmitX64::EmitA64SetTPIDR(A64EmitContext& ctx, IR::Inst* inst) { } } -void A64EmitX64::EmitA64ClearExclusive(A64EmitContext&, IR::Inst*) { - code.mov(code.byte[r15 + offsetof(A64JitState, exclusive_state)], u8(0)); -} - -std::optional A64EmitX64::ShouldFastmem(A64EmitContext& ctx, IR::Inst* inst) const { - if (!conf.fastmem_pointer || !exception_handler.SupportsFastmem()) { - return std::nullopt; - } - - const auto marker = std::make_tuple(ctx.Location(), ctx.GetInstOffset(inst)); - if (do_not_fastmem.count(marker) > 0) { - return std::nullopt; - } - return marker; -} - -FakeCall A64EmitX64::FastmemCallback(u64 rip_) { - const auto iter = fastmem_patch_info.find(rip_); - - if (iter == fastmem_patch_info.end()) { - fmt::print("dynarmic: Segfault happened within JITted code at rip = {:016x}\n", rip_); - fmt::print("Segfault wasn't at a fastmem patch location!\n"); - fmt::print("Now dumping code.......\n\n"); - Common::DumpDisassembledX64((void*)(rip_ & ~u64(0xFFF)), 0x1000); - ASSERT_FALSE("iter != fastmem_patch_info.end()"); - } - - if (conf.recompile_on_fastmem_failure) { - const auto marker = iter->second.marker; - do_not_fastmem.emplace(marker); - InvalidateBasicBlocks({std::get<0>(marker)}); - } - FakeCall ret; - ret.call_rip = iter->second.callback; - ret.ret_rip = iter->second.resume_rip; - return ret; -} - -namespace { - -constexpr size_t page_bits = 12; -constexpr size_t page_size = 1 << page_bits; -constexpr size_t page_mask = (1 << page_bits) - 1; - -void EmitDetectMisaignedVAddr(BlockOfCode& code, A64EmitContext& ctx, size_t bitsize, Xbyak::Label& abort, Xbyak::Reg64 vaddr, Xbyak::Reg64 tmp) { - if (bitsize == 8 || (ctx.conf.detect_misaligned_access_via_page_table & bitsize) == 0) { - return; - } - - const u32 align_mask = [bitsize]() -> u32 { - switch (bitsize) { - case 16: - return 0b1; - case 32: - return 0b11; - case 64: - return 0b111; - case 128: - return 0b1111; - } - UNREACHABLE(); - }(); - - code.test(vaddr, align_mask); - - if (!ctx.conf.only_detect_misalignment_via_page_table_on_page_boundary) { - code.jnz(abort, code.T_NEAR); - return; - } - - const u32 page_align_mask = static_cast(page_size - 1) & ~align_mask; - - Xbyak::Label detect_boundary, resume; - - code.jnz(detect_boundary, code.T_NEAR); - code.L(resume); - - code.SwitchToFarCode(); - code.L(detect_boundary); - code.mov(tmp, vaddr); - code.and_(tmp, page_align_mask); - code.cmp(tmp, page_align_mask); - code.jne(resume, code.T_NEAR); - // NOTE: We expect to fallthrough into abort code here. - code.SwitchToNearCode(); -} - -Xbyak::RegExp EmitVAddrLookup(BlockOfCode& code, A64EmitContext& ctx, size_t bitsize, Xbyak::Label& abort, Xbyak::Reg64 vaddr) { - const size_t valid_page_index_bits = ctx.conf.page_table_address_space_bits - page_bits; - const size_t unused_top_bits = 64 - ctx.conf.page_table_address_space_bits; - - const Xbyak::Reg64 page = ctx.reg_alloc.ScratchGpr(); - const Xbyak::Reg64 tmp = ctx.conf.absolute_offset_page_table ? 
page : ctx.reg_alloc.ScratchGpr(); - - EmitDetectMisaignedVAddr(code, ctx, bitsize, abort, vaddr, tmp); - - if (unused_top_bits == 0) { - code.mov(tmp, vaddr); - code.shr(tmp, int(page_bits)); - } else if (ctx.conf.silently_mirror_page_table) { - if (valid_page_index_bits >= 32) { - if (code.HasHostFeature(HostFeature::BMI2)) { - const Xbyak::Reg64 bit_count = ctx.reg_alloc.ScratchGpr(); - code.mov(bit_count, unused_top_bits); - code.bzhi(tmp, vaddr, bit_count); - code.shr(tmp, int(page_bits)); - ctx.reg_alloc.Release(bit_count); - } else { - code.mov(tmp, vaddr); - code.shl(tmp, int(unused_top_bits)); - code.shr(tmp, int(unused_top_bits + page_bits)); - } - } else { - code.mov(tmp, vaddr); - code.shr(tmp, int(page_bits)); - code.and_(tmp, u32((1 << valid_page_index_bits) - 1)); - } - } else { - ASSERT(valid_page_index_bits < 32); - code.mov(tmp, vaddr); - code.shr(tmp, int(page_bits)); - code.test(tmp, u32(-(1 << valid_page_index_bits))); - code.jnz(abort, code.T_NEAR); - } - code.mov(page, qword[r14 + tmp * sizeof(void*)]); - if (ctx.conf.page_table_pointer_mask_bits == 0) { - code.test(page, page); - } else { - code.and_(page, ~u32(0) << ctx.conf.page_table_pointer_mask_bits); - } - code.jz(abort, code.T_NEAR); - if (ctx.conf.absolute_offset_page_table) { - return page + vaddr; - } - code.mov(tmp, vaddr); - code.and_(tmp, static_cast(page_mask)); - return page + tmp; -} - -Xbyak::RegExp EmitFastmemVAddr(BlockOfCode& code, A64EmitContext& ctx, Xbyak::Label& abort, Xbyak::Reg64 vaddr, bool& require_abort_handling) { - const size_t unused_top_bits = 64 - ctx.conf.fastmem_address_space_bits; - - if (unused_top_bits == 0) { - return r13 + vaddr; - } else if (ctx.conf.silently_mirror_fastmem) { - Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr(); - if (unused_top_bits < 32) { - code.mov(tmp, vaddr); - code.shl(tmp, int(unused_top_bits)); - code.shr(tmp, int(unused_top_bits)); - } else if (unused_top_bits == 32) { - code.mov(tmp.cvt32(), vaddr.cvt32()); - } else { - code.mov(tmp.cvt32(), vaddr.cvt32()); - code.and_(tmp, u32((1 << ctx.conf.fastmem_address_space_bits) - 1)); - } - return r13 + tmp; - } else { - if (ctx.conf.fastmem_address_space_bits < 32) { - code.test(vaddr, u32(-(1 << ctx.conf.fastmem_address_space_bits))); - code.jnz(abort, code.T_NEAR); - require_abort_handling = true; - } else { - // TODO: Consider having TEST as above but coalesce 64-bit constant in register allocator - Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr(); - code.mov(tmp, vaddr); - code.shr(tmp, int(ctx.conf.fastmem_address_space_bits)); - code.jnz(abort, code.T_NEAR); - require_abort_handling = true; - } - return r13 + vaddr; - } -} - -template -void EmitReadMemoryMov(BlockOfCode& code, const Xbyak::Reg64& value, const Xbyak::RegExp& addr) { - switch (bitsize) { - case 8: - code.movzx(value.cvt32(), code.byte[addr]); - return; - case 16: - code.movzx(value.cvt32(), word[addr]); - return; - case 32: - code.mov(value.cvt32(), dword[addr]); - return; - case 64: - code.mov(value, qword[addr]); - return; - default: - ASSERT_FALSE("Invalid bitsize"); - } -} - -template -void EmitWriteMemoryMov(BlockOfCode& code, const Xbyak::RegExp& addr, const Xbyak::Reg64& value) { - switch (bitsize) { - case 8: - code.mov(code.byte[addr], value.cvt8()); - return; - case 16: - code.mov(word[addr], value.cvt16()); - return; - case 32: - code.mov(dword[addr], value.cvt32()); - return; - case 64: - code.mov(qword[addr], value); - return; - default: - ASSERT_FALSE("Invalid bitsize"); - } -} - -} // namespace - -template -void 
A64EmitX64::EmitMemoryRead(A64EmitContext& ctx, IR::Inst* inst) { - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const auto fastmem_marker = ShouldFastmem(ctx, inst); - - if (!conf.page_table && !fastmem_marker) { - // Neither fastmem nor page table: Use callbacks - ctx.reg_alloc.HostCall(inst, {}, args[0]); - Devirtualize(conf.callbacks).EmitCall(code); - return; - } - - const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]); - const Xbyak::Reg64 value = ctx.reg_alloc.ScratchGpr(); - - const auto wrapped_fn = read_fallbacks[std::make_tuple(bitsize, vaddr.getIdx(), value.getIdx())]; - - Xbyak::Label abort, end; - bool require_abort_handling = false; - - if (fastmem_marker) { - // Use fastmem - const auto src_ptr = EmitFastmemVAddr(code, ctx, abort, vaddr, require_abort_handling); - - const auto location = code.getCurr(); - EmitReadMemoryMov(code, value, src_ptr); - - fastmem_patch_info.emplace( - Common::BitCast(location), - FastmemPatchInfo{ - Common::BitCast(code.getCurr()), - Common::BitCast(wrapped_fn), - *fastmem_marker, - }); - } else { - // Use page table - ASSERT(conf.page_table); - const auto src_ptr = EmitVAddrLookup(code, ctx, bitsize, abort, vaddr); - require_abort_handling = true; - EmitReadMemoryMov(code, value, src_ptr); - } - code.L(end); - - if (require_abort_handling) { - code.SwitchToFarCode(); - code.L(abort); - code.call(wrapped_fn); - code.jmp(end, code.T_NEAR); - code.SwitchToNearCode(); - } - - ctx.reg_alloc.DefineValue(inst, value); -} - -template -void A64EmitX64::EmitMemoryWrite(A64EmitContext& ctx, IR::Inst* inst) { - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const auto fastmem_marker = ShouldFastmem(ctx, inst); - - if (!conf.page_table && !fastmem_marker) { - // Neither fastmem nor page table: Use callbacks - ctx.reg_alloc.HostCall(nullptr, {}, args[0], args[1]); - Devirtualize(conf.callbacks).EmitCall(code); - return; - } - - const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]); - const Xbyak::Reg64 value = ctx.reg_alloc.UseGpr(args[1]); - - const auto wrapped_fn = write_fallbacks[std::make_tuple(bitsize, vaddr.getIdx(), value.getIdx())]; - - Xbyak::Label abort, end; - bool require_abort_handling = false; - - if (fastmem_marker) { - // Use fastmem - const auto dest_ptr = EmitFastmemVAddr(code, ctx, abort, vaddr, require_abort_handling); - - const auto location = code.getCurr(); - EmitWriteMemoryMov(code, dest_ptr, value); - - fastmem_patch_info.emplace( - Common::BitCast(location), - FastmemPatchInfo{ - Common::BitCast(code.getCurr()), - Common::BitCast(wrapped_fn), - *fastmem_marker, - }); - } else { - // Use page table - ASSERT(conf.page_table); - const auto dest_ptr = EmitVAddrLookup(code, ctx, bitsize, abort, vaddr); - require_abort_handling = true; - EmitWriteMemoryMov(code, dest_ptr, value); - } - code.L(end); - - if (require_abort_handling) { - code.SwitchToFarCode(); - code.L(abort); - code.call(wrapped_fn); - code.jmp(end, code.T_NEAR); - code.SwitchToNearCode(); - } -} - -void A64EmitX64::EmitA64ReadMemory8(A64EmitContext& ctx, IR::Inst* inst) { - EmitMemoryRead<8, &A64::UserCallbacks::MemoryRead8>(ctx, inst); -} - -void A64EmitX64::EmitA64ReadMemory16(A64EmitContext& ctx, IR::Inst* inst) { - EmitMemoryRead<16, &A64::UserCallbacks::MemoryRead16>(ctx, inst); -} - -void A64EmitX64::EmitA64ReadMemory32(A64EmitContext& ctx, IR::Inst* inst) { - EmitMemoryRead<32, &A64::UserCallbacks::MemoryRead32>(ctx, inst); -} - -void A64EmitX64::EmitA64ReadMemory64(A64EmitContext& ctx, IR::Inst* inst) { - EmitMemoryRead<64, 
&A64::UserCallbacks::MemoryRead64>(ctx, inst); -} - -void A64EmitX64::EmitA64ReadMemory128(A64EmitContext& ctx, IR::Inst* inst) { - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const auto fastmem_marker = ShouldFastmem(ctx, inst); - - if (!conf.page_table && !fastmem_marker) { - // Neither fastmem nor page table: Use callbacks - ctx.reg_alloc.HostCall(nullptr, {}, args[0]); - code.CallFunction(memory_read_128); - ctx.reg_alloc.DefineValue(inst, xmm1); - return; - } - - const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]); - const Xbyak::Xmm value = ctx.reg_alloc.ScratchXmm(); - - const auto wrapped_fn = read_fallbacks[std::make_tuple(128, vaddr.getIdx(), value.getIdx())]; - - Xbyak::Label abort, end; - bool require_abort_handling = false; - - if (fastmem_marker) { - // Use fastmem - const auto src_ptr = EmitFastmemVAddr(code, ctx, abort, vaddr, require_abort_handling); - - const auto location = code.getCurr(); - code.movups(value, xword[src_ptr]); - - fastmem_patch_info.emplace( - Common::BitCast(location), - FastmemPatchInfo{ - Common::BitCast(code.getCurr()), - Common::BitCast(wrapped_fn), - *fastmem_marker, - }); - } else { - // Use page table - ASSERT(conf.page_table); - const auto src_ptr = EmitVAddrLookup(code, ctx, 128, abort, vaddr); - require_abort_handling = true; - code.movups(value, xword[src_ptr]); - } - code.L(end); - - if (require_abort_handling) { - code.SwitchToFarCode(); - code.L(abort); - code.call(wrapped_fn); - code.jmp(end, code.T_NEAR); - code.SwitchToNearCode(); - } - - ctx.reg_alloc.DefineValue(inst, value); -} - -void A64EmitX64::EmitA64WriteMemory8(A64EmitContext& ctx, IR::Inst* inst) { - EmitMemoryWrite<8, &A64::UserCallbacks::MemoryWrite8>(ctx, inst); -} - -void A64EmitX64::EmitA64WriteMemory16(A64EmitContext& ctx, IR::Inst* inst) { - EmitMemoryWrite<16, &A64::UserCallbacks::MemoryWrite16>(ctx, inst); -} - -void A64EmitX64::EmitA64WriteMemory32(A64EmitContext& ctx, IR::Inst* inst) { - EmitMemoryWrite<32, &A64::UserCallbacks::MemoryWrite32>(ctx, inst); -} - -void A64EmitX64::EmitA64WriteMemory64(A64EmitContext& ctx, IR::Inst* inst) { - EmitMemoryWrite<64, &A64::UserCallbacks::MemoryWrite64>(ctx, inst); -} - -void A64EmitX64::EmitA64WriteMemory128(A64EmitContext& ctx, IR::Inst* inst) { - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - const auto fastmem_marker = ShouldFastmem(ctx, inst); - - if (!conf.page_table && !fastmem_marker) { - // Neither fastmem nor page table: Use callbacks - ctx.reg_alloc.Use(args[0], ABI_PARAM2); - ctx.reg_alloc.Use(args[1], HostLoc::XMM1); - ctx.reg_alloc.EndOfAllocScope(); - ctx.reg_alloc.HostCall(nullptr); - code.CallFunction(memory_write_128); - return; - } - - const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]); - const Xbyak::Xmm value = ctx.reg_alloc.UseXmm(args[1]); - - const auto wrapped_fn = write_fallbacks[std::make_tuple(128, vaddr.getIdx(), value.getIdx())]; - - Xbyak::Label abort, end; - bool require_abort_handling = false; - - if (fastmem_marker) { - // Use fastmem - const auto dest_ptr = EmitFastmemVAddr(code, ctx, abort, vaddr, require_abort_handling); - - const auto location = code.getCurr(); - code.movups(xword[dest_ptr], value); - - fastmem_patch_info.emplace( - Common::BitCast(location), - FastmemPatchInfo{ - Common::BitCast(code.getCurr()), - Common::BitCast(wrapped_fn), - *fastmem_marker, - }); - } else { - // Use page table - ASSERT(conf.page_table); - const auto dest_ptr = EmitVAddrLookup(code, ctx, 128, abort, vaddr); - require_abort_handling = true; - code.movups(xword[dest_ptr], value); - 
} - code.L(end); - - if (require_abort_handling) { - code.SwitchToFarCode(); - code.L(abort); - code.call(wrapped_fn); - code.jmp(end, code.T_NEAR); - code.SwitchToNearCode(); - } -} - -template -void A64EmitX64::EmitExclusiveReadMemory(A64EmitContext& ctx, IR::Inst* inst) { - ASSERT(conf.global_monitor != nullptr); - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - - if constexpr (bitsize != 128) { - using T = mp::unsigned_integer_of_size; - - ctx.reg_alloc.HostCall(inst, {}, args[0]); - - code.mov(code.byte[r15 + offsetof(A64JitState, exclusive_state)], u8(1)); - code.mov(code.ABI_PARAM1, reinterpret_cast(&conf)); - code.CallLambda( - [](A64::UserConfig& conf, u64 vaddr) -> T { - return conf.global_monitor->ReadAndMark(conf.processor_id, vaddr, [&]() -> T { - return (conf.callbacks->*callback)(vaddr); - }); - }); - } else { - const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); - ctx.reg_alloc.Use(args[0], ABI_PARAM2); - ctx.reg_alloc.EndOfAllocScope(); - ctx.reg_alloc.HostCall(nullptr); - - code.mov(code.byte[r15 + offsetof(A64JitState, exclusive_state)], u8(1)); - code.mov(code.ABI_PARAM1, reinterpret_cast(&conf)); - ctx.reg_alloc.AllocStackSpace(16 + ABI_SHADOW_SPACE); - code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE]); - code.CallLambda( - [](A64::UserConfig& conf, u64 vaddr, A64::Vector& ret) { - ret = conf.global_monitor->ReadAndMark(conf.processor_id, vaddr, [&]() -> A64::Vector { - return (conf.callbacks->*callback)(vaddr); - }); - }); - code.movups(result, xword[rsp + ABI_SHADOW_SPACE]); - ctx.reg_alloc.ReleaseStackSpace(16 + ABI_SHADOW_SPACE); - - ctx.reg_alloc.DefineValue(inst, result); - } -} - -void A64EmitX64::EmitA64ExclusiveReadMemory8(A64EmitContext& ctx, IR::Inst* inst) { - EmitExclusiveReadMemory<8, &A64::UserCallbacks::MemoryRead8>(ctx, inst); -} - -void A64EmitX64::EmitA64ExclusiveReadMemory16(A64EmitContext& ctx, IR::Inst* inst) { - EmitExclusiveReadMemory<16, &A64::UserCallbacks::MemoryRead16>(ctx, inst); -} - -void A64EmitX64::EmitA64ExclusiveReadMemory32(A64EmitContext& ctx, IR::Inst* inst) { - EmitExclusiveReadMemory<32, &A64::UserCallbacks::MemoryRead32>(ctx, inst); -} - -void A64EmitX64::EmitA64ExclusiveReadMemory64(A64EmitContext& ctx, IR::Inst* inst) { - EmitExclusiveReadMemory<64, &A64::UserCallbacks::MemoryRead64>(ctx, inst); -} - -void A64EmitX64::EmitA64ExclusiveReadMemory128(A64EmitContext& ctx, IR::Inst* inst) { - EmitExclusiveReadMemory<128, &A64::UserCallbacks::MemoryRead128>(ctx, inst); -} - -template -void A64EmitX64::EmitExclusiveWriteMemory(A64EmitContext& ctx, IR::Inst* inst) { - ASSERT(conf.global_monitor != nullptr); - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - - if constexpr (bitsize != 128) { - ctx.reg_alloc.HostCall(inst, {}, args[0], args[1]); - } else { - ctx.reg_alloc.Use(args[0], ABI_PARAM2); - ctx.reg_alloc.Use(args[1], HostLoc::XMM1); - ctx.reg_alloc.EndOfAllocScope(); - ctx.reg_alloc.HostCall(inst); - } - - Xbyak::Label end; - - code.mov(code.ABI_RETURN, u32(1)); - code.cmp(code.byte[r15 + offsetof(A64JitState, exclusive_state)], u8(0)); - code.je(end); - code.mov(code.byte[r15 + offsetof(A64JitState, exclusive_state)], u8(0)); - code.mov(code.ABI_PARAM1, reinterpret_cast(&conf)); - if constexpr (bitsize != 128) { - using T = mp::unsigned_integer_of_size; - - code.CallLambda( - [](A64::UserConfig& conf, u64 vaddr, T value) -> u32 { - return conf.global_monitor->DoExclusiveOperation(conf.processor_id, vaddr, - [&](T expected) -> bool { - return (conf.callbacks->*callback)(vaddr, value, expected); - }) - ? 
0 - : 1; - }); - } else { - ctx.reg_alloc.AllocStackSpace(16 + ABI_SHADOW_SPACE); - code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE]); - code.movaps(xword[code.ABI_PARAM3], xmm1); - code.CallLambda( - [](A64::UserConfig& conf, u64 vaddr, A64::Vector& value) -> u32 { - return conf.global_monitor->DoExclusiveOperation(conf.processor_id, vaddr, - [&](A64::Vector expected) -> bool { - return (conf.callbacks->*callback)(vaddr, value, expected); - }) - ? 0 - : 1; - }); - ctx.reg_alloc.ReleaseStackSpace(16 + ABI_SHADOW_SPACE); - } - code.L(end); -} - -void A64EmitX64::EmitA64ExclusiveWriteMemory8(A64EmitContext& ctx, IR::Inst* inst) { - EmitExclusiveWriteMemory<8, &A64::UserCallbacks::MemoryWriteExclusive8>(ctx, inst); -} - -void A64EmitX64::EmitA64ExclusiveWriteMemory16(A64EmitContext& ctx, IR::Inst* inst) { - EmitExclusiveWriteMemory<16, &A64::UserCallbacks::MemoryWriteExclusive16>(ctx, inst); -} - -void A64EmitX64::EmitA64ExclusiveWriteMemory32(A64EmitContext& ctx, IR::Inst* inst) { - EmitExclusiveWriteMemory<32, &A64::UserCallbacks::MemoryWriteExclusive32>(ctx, inst); -} - -void A64EmitX64::EmitA64ExclusiveWriteMemory64(A64EmitContext& ctx, IR::Inst* inst) { - EmitExclusiveWriteMemory<64, &A64::UserCallbacks::MemoryWriteExclusive64>(ctx, inst); -} - -void A64EmitX64::EmitA64ExclusiveWriteMemory128(A64EmitContext& ctx, IR::Inst* inst) { - EmitExclusiveWriteMemory<128, &A64::UserCallbacks::MemoryWriteExclusive128>(ctx, inst); -} - std::string A64EmitX64::LocationDescriptorToFriendlyName(const IR::LocationDescriptor& ir_descriptor) const { const A64::LocationDescriptor descriptor{ir_descriptor}; return fmt::format("a64_{:016X}_fpcr{:08X}", diff --git a/externals/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.h b/externals/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.h index 32ab24e85..cbfa28bd5 100755 --- a/externals/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.h +++ b/externals/dynarmic/src/dynarmic/backend/x64/a64_emit_x64.h @@ -7,6 +7,7 @@ #include #include +#include #include #include "dynarmic/backend/x64/a64_jitstate.h" @@ -67,10 +68,12 @@ protected: void (*memory_read_128)(); void (*memory_write_128)(); + void (*memory_exclusive_write_128)(); void GenMemory128Accessors(); std::map, void (*)()> read_fallbacks; std::map, void (*)()> write_fallbacks; + std::map, void (*)()> exclusive_write_fallbacks; void GenFastmemFallbacks(); const void* terminal_handler_pop_rsb_hint; @@ -97,6 +100,7 @@ protected: u64 resume_rip; u64 callback; DoNotFastmemMarker marker; + bool recompile; }; tsl::robin_map fastmem_patch_info; std::set do_not_fastmem; @@ -112,6 +116,10 @@ protected: void EmitExclusiveReadMemory(A64EmitContext& ctx, IR::Inst* inst); template void EmitExclusiveWriteMemory(A64EmitContext& ctx, IR::Inst* inst); + template + void EmitExclusiveReadMemoryInline(A64EmitContext& ctx, IR::Inst* inst); + template + void EmitExclusiveWriteMemoryInline(A64EmitContext& ctx, IR::Inst* inst); // Terminal instruction emitters void EmitTerminalImpl(IR::Term::Interpret terminal, IR::LocationDescriptor initial_location, bool is_single_step) override; diff --git a/externals/dynarmic/src/dynarmic/backend/x64/a64_emit_x64_memory.cpp b/externals/dynarmic/src/dynarmic/backend/x64/a64_emit_x64_memory.cpp new file mode 100755 index 000000000..7c1aff7bc --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/a64_emit_x64_memory.cpp @@ -0,0 +1,1025 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "dynarmic/backend/x64/a64_emit_x64.h" +#include "dynarmic/backend/x64/abi.h" +#include "dynarmic/backend/x64/devirtualize.h" +#include "dynarmic/backend/x64/emit_x64_memory.h" +#include "dynarmic/backend/x64/exclusive_monitor_friend.h" +#include "dynarmic/backend/x64/perf_map.h" +#include "dynarmic/common/spin_lock_x64.h" +#include "dynarmic/common/x64_disassemble.h" +#include "dynarmic/interface/exclusive_monitor.h" + +namespace Dynarmic::Backend::X64 { + +using namespace Xbyak::util; + +void A64EmitX64::GenMemory128Accessors() { + code.align(); + memory_read_128 = code.getCurr(); +#ifdef _WIN32 + Devirtualize<&A64::UserCallbacks::MemoryRead128>(conf.callbacks).EmitCallWithReturnPointer(code, [&](Xbyak::Reg64 return_value_ptr, [[maybe_unused]] RegList args) { + code.mov(code.ABI_PARAM3, code.ABI_PARAM2); + code.sub(rsp, 8 + 16 + ABI_SHADOW_SPACE); + code.lea(return_value_ptr, ptr[rsp + ABI_SHADOW_SPACE]); + }); + code.movups(xmm1, xword[code.ABI_RETURN]); + code.add(rsp, 8 + 16 + ABI_SHADOW_SPACE); +#else + code.sub(rsp, 8); + Devirtualize<&A64::UserCallbacks::MemoryRead128>(conf.callbacks).EmitCall(code); + if (code.HasHostFeature(HostFeature::SSE41)) { + code.movq(xmm1, code.ABI_RETURN); + code.pinsrq(xmm1, code.ABI_RETURN2, 1); + } else { + code.movq(xmm1, code.ABI_RETURN); + code.movq(xmm2, code.ABI_RETURN2); + code.punpcklqdq(xmm1, xmm2); + } + code.add(rsp, 8); +#endif + code.ret(); + PerfMapRegister(memory_read_128, code.getCurr(), "a64_memory_read_128"); + + code.align(); + memory_write_128 = code.getCurr(); +#ifdef _WIN32 + code.sub(rsp, 8 + 16 + ABI_SHADOW_SPACE); + code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE]); + code.movaps(xword[code.ABI_PARAM3], xmm1); + Devirtualize<&A64::UserCallbacks::MemoryWrite128>(conf.callbacks).EmitCall(code); + code.add(rsp, 8 + 16 + ABI_SHADOW_SPACE); +#else + code.sub(rsp, 8); + if (code.HasHostFeature(HostFeature::SSE41)) { + code.movq(code.ABI_PARAM3, xmm1); + code.pextrq(code.ABI_PARAM4, xmm1, 1); + } else { + code.movq(code.ABI_PARAM3, xmm1); + code.punpckhqdq(xmm1, xmm1); + code.movq(code.ABI_PARAM4, xmm1); + } + Devirtualize<&A64::UserCallbacks::MemoryWrite128>(conf.callbacks).EmitCall(code); + code.add(rsp, 8); +#endif + code.ret(); + PerfMapRegister(memory_write_128, code.getCurr(), "a64_memory_write_128"); + + code.align(); + memory_exclusive_write_128 = code.getCurr(); +#ifdef _WIN32 + code.sub(rsp, 8 + 32 + ABI_SHADOW_SPACE); + code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE]); + code.lea(code.ABI_PARAM4, ptr[rsp + ABI_SHADOW_SPACE + 16]); + code.movaps(xword[code.ABI_PARAM3], xmm1); + code.movaps(xword[code.ABI_PARAM4], xmm2); + Devirtualize<&A64::UserCallbacks::MemoryWriteExclusive128>(conf.callbacks).EmitCall(code); + code.add(rsp, 8 + 16 + ABI_SHADOW_SPACE); +#else + code.sub(rsp, 8); + if (code.HasHostFeature(HostFeature::SSE41)) { + code.movq(code.ABI_PARAM3, xmm1); + code.pextrq(code.ABI_PARAM4, xmm1, 1); + code.movq(code.ABI_PARAM5, xmm2); + code.pextrq(code.ABI_PARAM6, xmm2, 1); + } else { + code.movq(code.ABI_PARAM3, xmm1); + code.punpckhqdq(xmm1, xmm1); + code.movq(code.ABI_PARAM4, xmm1); + code.movq(code.ABI_PARAM5, xmm2); + code.punpckhqdq(xmm2, xmm2); + code.movq(code.ABI_PARAM6, xmm2); + } + Devirtualize<&A64::UserCallbacks::MemoryWriteExclusive128>(conf.callbacks).EmitCall(code); + code.add(rsp, 8); +#endif + code.ret(); + 
PerfMapRegister(memory_exclusive_write_128, code.getCurr(), "a64_memory_exclusive_write_128"); +} + +void A64EmitX64::GenFastmemFallbacks() { + const std::initializer_list idxes{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + const std::array, 4> read_callbacks{{ + {8, Devirtualize<&A64::UserCallbacks::MemoryRead8>(conf.callbacks)}, + {16, Devirtualize<&A64::UserCallbacks::MemoryRead16>(conf.callbacks)}, + {32, Devirtualize<&A64::UserCallbacks::MemoryRead32>(conf.callbacks)}, + {64, Devirtualize<&A64::UserCallbacks::MemoryRead64>(conf.callbacks)}, + }}; + const std::array, 4> write_callbacks{{ + {8, Devirtualize<&A64::UserCallbacks::MemoryWrite8>(conf.callbacks)}, + {16, Devirtualize<&A64::UserCallbacks::MemoryWrite16>(conf.callbacks)}, + {32, Devirtualize<&A64::UserCallbacks::MemoryWrite32>(conf.callbacks)}, + {64, Devirtualize<&A64::UserCallbacks::MemoryWrite64>(conf.callbacks)}, + }}; + const std::array, 4> exclusive_write_callbacks{{ + {8, Devirtualize<&A64::UserCallbacks::MemoryWriteExclusive8>(conf.callbacks)}, + {16, Devirtualize<&A64::UserCallbacks::MemoryWriteExclusive16>(conf.callbacks)}, + {32, Devirtualize<&A64::UserCallbacks::MemoryWriteExclusive32>(conf.callbacks)}, + {64, Devirtualize<&A64::UserCallbacks::MemoryWriteExclusive64>(conf.callbacks)}, + }}; + + for (int vaddr_idx : idxes) { + if (vaddr_idx == 4 || vaddr_idx == 15) { + continue; + } + + for (int value_idx : idxes) { + code.align(); + read_fallbacks[std::make_tuple(128, vaddr_idx, value_idx)] = code.getCurr(); + ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(value_idx)); + if (vaddr_idx != code.ABI_PARAM2.getIdx()) { + code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); + } + code.call(memory_read_128); + if (value_idx != 1) { + code.movaps(Xbyak::Xmm{value_idx}, xmm1); + } + ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(value_idx)); + code.ret(); + PerfMapRegister(read_fallbacks[std::make_tuple(128, vaddr_idx, value_idx)], code.getCurr(), "a64_read_fallback_128"); + + code.align(); + write_fallbacks[std::make_tuple(128, vaddr_idx, value_idx)] = code.getCurr(); + ABI_PushCallerSaveRegistersAndAdjustStack(code); + if (vaddr_idx != code.ABI_PARAM2.getIdx()) { + code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); + } + if (value_idx != 1) { + code.movaps(xmm1, Xbyak::Xmm{value_idx}); + } + code.call(memory_write_128); + ABI_PopCallerSaveRegistersAndAdjustStack(code); + code.ret(); + PerfMapRegister(write_fallbacks[std::make_tuple(128, vaddr_idx, value_idx)], code.getCurr(), "a64_write_fallback_128"); + + code.align(); + exclusive_write_fallbacks[std::make_tuple(128, vaddr_idx, value_idx)] = code.getCurr(); + ABI_PushCallerSaveRegistersAndAdjustStack(code); + if (value_idx != 1) { + code.movaps(xmm1, Xbyak::Xmm{value_idx}); + } + if (code.HasHostFeature(HostFeature::SSE41)) { + code.movq(xmm2, rax); + code.pinsrq(xmm2, rdx, 1); + } else { + code.movq(xmm2, rax); + code.movq(xmm0, rdx); + code.punpcklqdq(xmm2, xmm0); + } + if (vaddr_idx != code.ABI_PARAM2.getIdx()) { + code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); + } + code.call(memory_exclusive_write_128); + ABI_PopCallerSaveRegistersAndAdjustStack(code); + code.ret(); + PerfMapRegister(exclusive_write_fallbacks[std::make_tuple(128, vaddr_idx, value_idx)], code.getCurr(), "a64_write_fallback_128"); + + if (value_idx == 4 || value_idx == 15) { + continue; + } + + for (const auto& [bitsize, callback] : read_callbacks) { + code.align(); + read_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)] = 
code.getCurr(); + ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocRegIdx(value_idx)); + if (vaddr_idx != code.ABI_PARAM2.getIdx()) { + code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); + } + callback.EmitCall(code); + if (value_idx != code.ABI_RETURN.getIdx()) { + code.mov(Xbyak::Reg64{value_idx}, code.ABI_RETURN); + } + ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocRegIdx(value_idx)); + code.ret(); + PerfMapRegister(read_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a64_read_fallback_{}", bitsize)); + } + + for (const auto& [bitsize, callback] : write_callbacks) { + code.align(); + write_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)] = code.getCurr(); + ABI_PushCallerSaveRegistersAndAdjustStack(code); + if (vaddr_idx == code.ABI_PARAM3.getIdx() && value_idx == code.ABI_PARAM2.getIdx()) { + code.xchg(code.ABI_PARAM2, code.ABI_PARAM3); + } else if (vaddr_idx == code.ABI_PARAM3.getIdx()) { + code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); + if (value_idx != code.ABI_PARAM3.getIdx()) { + code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); + } + } else { + if (value_idx != code.ABI_PARAM3.getIdx()) { + code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); + } + if (vaddr_idx != code.ABI_PARAM2.getIdx()) { + code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); + } + } + callback.EmitCall(code); + ABI_PopCallerSaveRegistersAndAdjustStack(code); + code.ret(); + PerfMapRegister(write_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a64_write_fallback_{}", bitsize)); + } + + for (const auto& [bitsize, callback] : exclusive_write_callbacks) { + code.align(); + exclusive_write_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)] = code.getCurr(); + ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX); + if (vaddr_idx == code.ABI_PARAM3.getIdx() && value_idx == code.ABI_PARAM2.getIdx()) { + code.xchg(code.ABI_PARAM2, code.ABI_PARAM3); + } else if (vaddr_idx == code.ABI_PARAM3.getIdx()) { + code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); + if (value_idx != code.ABI_PARAM3.getIdx()) { + code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); + } + } else { + if (value_idx != code.ABI_PARAM3.getIdx()) { + code.mov(code.ABI_PARAM3, Xbyak::Reg64{value_idx}); + } + if (vaddr_idx != code.ABI_PARAM2.getIdx()) { + code.mov(code.ABI_PARAM2, Xbyak::Reg64{vaddr_idx}); + } + } + code.mov(code.ABI_PARAM4, rax); + callback.EmitCall(code); + ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLoc::RAX); + code.ret(); + PerfMapRegister(exclusive_write_fallbacks[std::make_tuple(bitsize, vaddr_idx, value_idx)], code.getCurr(), fmt::format("a64_exclusive_write_fallback_{}", bitsize)); + } + } + } +} + +std::optional A64EmitX64::ShouldFastmem(A64EmitContext& ctx, IR::Inst* inst) const { + if (!conf.fastmem_pointer || !exception_handler.SupportsFastmem()) { + return std::nullopt; + } + + const auto marker = std::make_tuple(ctx.Location(), ctx.GetInstOffset(inst)); + if (do_not_fastmem.count(marker) > 0) { + return std::nullopt; + } + return marker; +} + +FakeCall A64EmitX64::FastmemCallback(u64 rip_) { + const auto iter = fastmem_patch_info.find(rip_); + + if (iter == fastmem_patch_info.end()) { + fmt::print("dynarmic: Segfault happened within JITted code at rip = {:016x}\n", rip_); + fmt::print("Segfault wasn't at a fastmem patch location!\n"); + fmt::print("Now dumping code.......\n\n"); + Common::DumpDisassembledX64((void*)(rip_ & ~u64(0xFFF)), 0x1000); + ASSERT_FALSE("iter 
!= fastmem_patch_info.end()"); + } + + if (iter->second.recompile) { + const auto marker = iter->second.marker; + do_not_fastmem.emplace(marker); + InvalidateBasicBlocks({std::get<0>(marker)}); + } + + return FakeCall{ + .call_rip = iter->second.callback, + .ret_rip = iter->second.resume_rip, + }; +} + +namespace { + +constexpr size_t page_bits = 12; +constexpr size_t page_size = 1 << page_bits; +constexpr size_t page_mask = (1 << page_bits) - 1; + +void EmitDetectMisaignedVAddr(BlockOfCode& code, A64EmitContext& ctx, size_t bitsize, Xbyak::Label& abort, Xbyak::Reg64 vaddr, Xbyak::Reg64 tmp) { + if (bitsize == 8 || (ctx.conf.detect_misaligned_access_via_page_table & bitsize) == 0) { + return; + } + + const u32 align_mask = [bitsize]() -> u32 { + switch (bitsize) { + case 16: + return 0b1; + case 32: + return 0b11; + case 64: + return 0b111; + case 128: + return 0b1111; + } + UNREACHABLE(); + }(); + + code.test(vaddr, align_mask); + + if (!ctx.conf.only_detect_misalignment_via_page_table_on_page_boundary) { + code.jnz(abort, code.T_NEAR); + return; + } + + const u32 page_align_mask = static_cast(page_size - 1) & ~align_mask; + + Xbyak::Label detect_boundary, resume; + + code.jnz(detect_boundary, code.T_NEAR); + code.L(resume); + + code.SwitchToFarCode(); + code.L(detect_boundary); + code.mov(tmp, vaddr); + code.and_(tmp, page_align_mask); + code.cmp(tmp, page_align_mask); + code.jne(resume, code.T_NEAR); + // NOTE: We expect to fallthrough into abort code here. + code.SwitchToNearCode(); +} + +Xbyak::RegExp EmitVAddrLookup(BlockOfCode& code, A64EmitContext& ctx, size_t bitsize, Xbyak::Label& abort, Xbyak::Reg64 vaddr) { + const size_t valid_page_index_bits = ctx.conf.page_table_address_space_bits - page_bits; + const size_t unused_top_bits = 64 - ctx.conf.page_table_address_space_bits; + + const Xbyak::Reg64 page = ctx.reg_alloc.ScratchGpr(); + const Xbyak::Reg64 tmp = ctx.conf.absolute_offset_page_table ? 
page : ctx.reg_alloc.ScratchGpr(); + + EmitDetectMisaignedVAddr(code, ctx, bitsize, abort, vaddr, tmp); + + if (unused_top_bits == 0) { + code.mov(tmp, vaddr); + code.shr(tmp, int(page_bits)); + } else if (ctx.conf.silently_mirror_page_table) { + if (valid_page_index_bits >= 32) { + if (code.HasHostFeature(HostFeature::BMI2)) { + const Xbyak::Reg64 bit_count = ctx.reg_alloc.ScratchGpr(); + code.mov(bit_count, unused_top_bits); + code.bzhi(tmp, vaddr, bit_count); + code.shr(tmp, int(page_bits)); + ctx.reg_alloc.Release(bit_count); + } else { + code.mov(tmp, vaddr); + code.shl(tmp, int(unused_top_bits)); + code.shr(tmp, int(unused_top_bits + page_bits)); + } + } else { + code.mov(tmp, vaddr); + code.shr(tmp, int(page_bits)); + code.and_(tmp, u32((1 << valid_page_index_bits) - 1)); + } + } else { + ASSERT(valid_page_index_bits < 32); + code.mov(tmp, vaddr); + code.shr(tmp, int(page_bits)); + code.test(tmp, u32(-(1 << valid_page_index_bits))); + code.jnz(abort, code.T_NEAR); + } + code.mov(page, qword[r14 + tmp * sizeof(void*)]); + if (ctx.conf.page_table_pointer_mask_bits == 0) { + code.test(page, page); + } else { + code.and_(page, ~u32(0) << ctx.conf.page_table_pointer_mask_bits); + } + code.jz(abort, code.T_NEAR); + if (ctx.conf.absolute_offset_page_table) { + return page + vaddr; + } + code.mov(tmp, vaddr); + code.and_(tmp, static_cast(page_mask)); + return page + tmp; +} + +Xbyak::RegExp EmitFastmemVAddr(BlockOfCode& code, A64EmitContext& ctx, Xbyak::Label& abort, Xbyak::Reg64 vaddr, bool& require_abort_handling, std::optional tmp = std::nullopt) { + const size_t unused_top_bits = 64 - ctx.conf.fastmem_address_space_bits; + + if (unused_top_bits == 0) { + return r13 + vaddr; + } else if (ctx.conf.silently_mirror_fastmem) { + if (!tmp) { + tmp = ctx.reg_alloc.ScratchGpr(); + } + if (unused_top_bits < 32) { + code.mov(*tmp, vaddr); + code.shl(*tmp, int(unused_top_bits)); + code.shr(*tmp, int(unused_top_bits)); + } else if (unused_top_bits == 32) { + code.mov(tmp->cvt32(), vaddr.cvt32()); + } else { + code.mov(tmp->cvt32(), vaddr.cvt32()); + code.and_(*tmp, u32((1 << ctx.conf.fastmem_address_space_bits) - 1)); + } + return r13 + *tmp; + } else { + if (ctx.conf.fastmem_address_space_bits < 32) { + code.test(vaddr, u32(-(1 << ctx.conf.fastmem_address_space_bits))); + code.jnz(abort, code.T_NEAR); + require_abort_handling = true; + } else { + // TODO: Consider having TEST as above but coalesce 64-bit constant in register allocator + if (!tmp) { + tmp = ctx.reg_alloc.ScratchGpr(); + } + code.mov(*tmp, vaddr); + code.shr(*tmp, int(ctx.conf.fastmem_address_space_bits)); + code.jnz(abort, code.T_NEAR); + require_abort_handling = true; + } + return r13 + vaddr; + } +} + +template +void EmitReadMemoryMov(BlockOfCode& code, int value_idx, const Xbyak::RegExp& addr) { + switch (bitsize) { + case 8: + code.movzx(Xbyak::Reg32{value_idx}, code.byte[addr]); + return; + case 16: + code.movzx(Xbyak::Reg32{value_idx}, word[addr]); + return; + case 32: + code.mov(Xbyak::Reg32{value_idx}, dword[addr]); + return; + case 64: + code.mov(Xbyak::Reg64{value_idx}, qword[addr]); + return; + case 128: + code.movups(Xbyak::Xmm{value_idx}, xword[addr]); + return; + default: + ASSERT_FALSE("Invalid bitsize"); + } +} + +template +void EmitWriteMemoryMov(BlockOfCode& code, const Xbyak::RegExp& addr, int value_idx) { + switch (bitsize) { + case 8: + code.mov(code.byte[addr], Xbyak::Reg64{value_idx}.cvt8()); + return; + case 16: + code.mov(word[addr], Xbyak::Reg16{value_idx}); + return; + case 32: + code.mov(dword[addr], 
Xbyak::Reg32{value_idx}); + return; + case 64: + code.mov(qword[addr], Xbyak::Reg64{value_idx}); + return; + case 128: + code.movups(xword[addr], Xbyak::Xmm{value_idx}); + return; + default: + ASSERT_FALSE("Invalid bitsize"); + } +} + +} // namespace + +template +void A64EmitX64::EmitMemoryRead(A64EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const auto fastmem_marker = ShouldFastmem(ctx, inst); + + if (!conf.page_table && !fastmem_marker) { + // Neither fastmem nor page table: Use callbacks + if constexpr (bitsize == 128) { + ctx.reg_alloc.HostCall(nullptr, {}, args[0]); + code.CallFunction(memory_read_128); + ctx.reg_alloc.DefineValue(inst, xmm1); + } else { + ctx.reg_alloc.HostCall(inst, {}, args[0]); + Devirtualize(conf.callbacks).EmitCall(code); + } + return; + } + + const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]); + const int value_idx = bitsize == 128 ? ctx.reg_alloc.ScratchXmm().getIdx() : ctx.reg_alloc.ScratchGpr().getIdx(); + + const auto wrapped_fn = read_fallbacks[std::make_tuple(bitsize, vaddr.getIdx(), value_idx)]; + + Xbyak::Label abort, end; + bool require_abort_handling = false; + + if (fastmem_marker) { + // Use fastmem + const auto src_ptr = EmitFastmemVAddr(code, ctx, abort, vaddr, require_abort_handling); + + const auto location = code.getCurr(); + EmitReadMemoryMov(code, value_idx, src_ptr); + + fastmem_patch_info.emplace( + Common::BitCast(location), + FastmemPatchInfo{ + Common::BitCast(code.getCurr()), + Common::BitCast(wrapped_fn), + *fastmem_marker, + conf.recompile_on_fastmem_failure, + }); + } else { + // Use page table + ASSERT(conf.page_table); + const auto src_ptr = EmitVAddrLookup(code, ctx, bitsize, abort, vaddr); + require_abort_handling = true; + EmitReadMemoryMov(code, value_idx, src_ptr); + } + code.L(end); + + if (require_abort_handling) { + code.SwitchToFarCode(); + code.L(abort); + code.call(wrapped_fn); + code.jmp(end, code.T_NEAR); + code.SwitchToNearCode(); + } + + if constexpr (bitsize == 128) { + ctx.reg_alloc.DefineValue(inst, Xbyak::Xmm{value_idx}); + } else { + ctx.reg_alloc.DefineValue(inst, Xbyak::Reg64{value_idx}); + } +} + +template +void A64EmitX64::EmitMemoryWrite(A64EmitContext& ctx, IR::Inst* inst) { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + const auto fastmem_marker = ShouldFastmem(ctx, inst); + + if (!conf.page_table && !fastmem_marker) { + // Neither fastmem nor page table: Use callbacks + if constexpr (bitsize == 128) { + ctx.reg_alloc.Use(args[0], ABI_PARAM2); + ctx.reg_alloc.Use(args[1], HostLoc::XMM1); + ctx.reg_alloc.EndOfAllocScope(); + ctx.reg_alloc.HostCall(nullptr); + code.CallFunction(memory_write_128); + } else { + ctx.reg_alloc.HostCall(nullptr, {}, args[0], args[1]); + Devirtualize(conf.callbacks).EmitCall(code); + } + return; + } + + const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]); + const int value_idx = bitsize == 128 ? 
ctx.reg_alloc.UseXmm(args[1]).getIdx() : ctx.reg_alloc.UseGpr(args[1]).getIdx(); + + const auto wrapped_fn = write_fallbacks[std::make_tuple(bitsize, vaddr.getIdx(), value_idx)]; + + Xbyak::Label abort, end; + bool require_abort_handling = false; + + if (fastmem_marker) { + // Use fastmem + const auto dest_ptr = EmitFastmemVAddr(code, ctx, abort, vaddr, require_abort_handling); + + const auto location = code.getCurr(); + EmitWriteMemoryMov(code, dest_ptr, value_idx); + + fastmem_patch_info.emplace( + Common::BitCast(location), + FastmemPatchInfo{ + Common::BitCast(code.getCurr()), + Common::BitCast(wrapped_fn), + *fastmem_marker, + conf.recompile_on_fastmem_failure, + }); + } else { + // Use page table + ASSERT(conf.page_table); + const auto dest_ptr = EmitVAddrLookup(code, ctx, bitsize, abort, vaddr); + require_abort_handling = true; + EmitWriteMemoryMov(code, dest_ptr, value_idx); + } + code.L(end); + + if (require_abort_handling) { + code.SwitchToFarCode(); + code.L(abort); + code.call(wrapped_fn); + code.jmp(end, code.T_NEAR); + code.SwitchToNearCode(); + } +} + +void A64EmitX64::EmitA64ReadMemory8(A64EmitContext& ctx, IR::Inst* inst) { + EmitMemoryRead<8, &A64::UserCallbacks::MemoryRead8>(ctx, inst); +} + +void A64EmitX64::EmitA64ReadMemory16(A64EmitContext& ctx, IR::Inst* inst) { + EmitMemoryRead<16, &A64::UserCallbacks::MemoryRead16>(ctx, inst); +} + +void A64EmitX64::EmitA64ReadMemory32(A64EmitContext& ctx, IR::Inst* inst) { + EmitMemoryRead<32, &A64::UserCallbacks::MemoryRead32>(ctx, inst); +} + +void A64EmitX64::EmitA64ReadMemory64(A64EmitContext& ctx, IR::Inst* inst) { + EmitMemoryRead<64, &A64::UserCallbacks::MemoryRead64>(ctx, inst); +} + +void A64EmitX64::EmitA64ReadMemory128(A64EmitContext& ctx, IR::Inst* inst) { + EmitMemoryRead<128, &A64::UserCallbacks::MemoryRead128>(ctx, inst); +} + +void A64EmitX64::EmitA64WriteMemory8(A64EmitContext& ctx, IR::Inst* inst) { + EmitMemoryWrite<8, &A64::UserCallbacks::MemoryWrite8>(ctx, inst); +} + +void A64EmitX64::EmitA64WriteMemory16(A64EmitContext& ctx, IR::Inst* inst) { + EmitMemoryWrite<16, &A64::UserCallbacks::MemoryWrite16>(ctx, inst); +} + +void A64EmitX64::EmitA64WriteMemory32(A64EmitContext& ctx, IR::Inst* inst) { + EmitMemoryWrite<32, &A64::UserCallbacks::MemoryWrite32>(ctx, inst); +} + +void A64EmitX64::EmitA64WriteMemory64(A64EmitContext& ctx, IR::Inst* inst) { + EmitMemoryWrite<64, &A64::UserCallbacks::MemoryWrite64>(ctx, inst); +} + +void A64EmitX64::EmitA64WriteMemory128(A64EmitContext& ctx, IR::Inst* inst) { + EmitMemoryWrite<128, &A64::UserCallbacks::MemoryWrite64>(ctx, inst); +} + +template +void A64EmitX64::EmitExclusiveReadMemory(A64EmitContext& ctx, IR::Inst* inst) { + ASSERT(conf.global_monitor != nullptr); + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + if constexpr (bitsize != 128) { + using T = mp::unsigned_integer_of_size; + + ctx.reg_alloc.HostCall(inst, {}, args[0]); + + code.mov(code.byte[r15 + offsetof(A64JitState, exclusive_state)], u8(1)); + code.mov(code.ABI_PARAM1, reinterpret_cast(&conf)); + code.CallLambda( + [](A64::UserConfig& conf, u64 vaddr) -> T { + return conf.global_monitor->ReadAndMark(conf.processor_id, vaddr, [&]() -> T { + return (conf.callbacks->*callback)(vaddr); + }); + }); + } else { + const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); + ctx.reg_alloc.Use(args[0], ABI_PARAM2); + ctx.reg_alloc.EndOfAllocScope(); + ctx.reg_alloc.HostCall(nullptr); + + code.mov(code.byte[r15 + offsetof(A64JitState, exclusive_state)], u8(1)); + code.mov(code.ABI_PARAM1, reinterpret_cast(&conf)); + 
ctx.reg_alloc.AllocStackSpace(16 + ABI_SHADOW_SPACE); + code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE]); + code.CallLambda( + [](A64::UserConfig& conf, u64 vaddr, A64::Vector& ret) { + ret = conf.global_monitor->ReadAndMark(conf.processor_id, vaddr, [&]() -> A64::Vector { + return (conf.callbacks->*callback)(vaddr); + }); + }); + code.movups(result, xword[rsp + ABI_SHADOW_SPACE]); + ctx.reg_alloc.ReleaseStackSpace(16 + ABI_SHADOW_SPACE); + + ctx.reg_alloc.DefineValue(inst, result); + } +} + +template +void A64EmitX64::EmitExclusiveWriteMemory(A64EmitContext& ctx, IR::Inst* inst) { + ASSERT(conf.global_monitor != nullptr); + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + if constexpr (bitsize != 128) { + ctx.reg_alloc.HostCall(inst, {}, args[0], args[1]); + } else { + ctx.reg_alloc.Use(args[0], ABI_PARAM2); + ctx.reg_alloc.Use(args[1], HostLoc::XMM1); + ctx.reg_alloc.EndOfAllocScope(); + ctx.reg_alloc.HostCall(inst); + } + + Xbyak::Label end; + + code.mov(code.ABI_RETURN, u32(1)); + code.cmp(code.byte[r15 + offsetof(A64JitState, exclusive_state)], u8(0)); + code.je(end); + code.mov(code.byte[r15 + offsetof(A64JitState, exclusive_state)], u8(0)); + code.mov(code.ABI_PARAM1, reinterpret_cast(&conf)); + if constexpr (bitsize != 128) { + using T = mp::unsigned_integer_of_size; + + code.CallLambda( + [](A64::UserConfig& conf, u64 vaddr, T value) -> u32 { + return conf.global_monitor->DoExclusiveOperation(conf.processor_id, vaddr, + [&](T expected) -> bool { + return (conf.callbacks->*callback)(vaddr, value, expected); + }) + ? 0 + : 1; + }); + } else { + ctx.reg_alloc.AllocStackSpace(16 + ABI_SHADOW_SPACE); + code.lea(code.ABI_PARAM3, ptr[rsp + ABI_SHADOW_SPACE]); + code.movaps(xword[code.ABI_PARAM3], xmm1); + code.CallLambda( + [](A64::UserConfig& conf, u64 vaddr, A64::Vector& value) -> u32 { + return conf.global_monitor->DoExclusiveOperation(conf.processor_id, vaddr, + [&](A64::Vector expected) -> bool { + return (conf.callbacks->*callback)(vaddr, value, expected); + }) + ? 0 + : 1; + }); + ctx.reg_alloc.ReleaseStackSpace(16 + ABI_SHADOW_SPACE); + } + code.L(end); +} + +template +void A64EmitX64::EmitExclusiveReadMemoryInline(A64EmitContext& ctx, IR::Inst* inst) { + ASSERT(conf.global_monitor && conf.fastmem_pointer); + if (!exception_handler.SupportsFastmem()) { + EmitExclusiveReadMemory(ctx, inst); + return; + } + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]); + const int value_idx = bitsize == 128 ? 
ctx.reg_alloc.ScratchXmm().getIdx() : ctx.reg_alloc.ScratchGpr().getIdx(); + const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr(); + const Xbyak::Reg64 tmp2 = ctx.reg_alloc.ScratchGpr(); + + const auto wrapped_fn = read_fallbacks[std::make_tuple(bitsize, vaddr.getIdx(), value_idx)]; + + EmitExclusiveLock(code, conf, tmp, tmp2.cvt32()); + + code.mov(code.byte[r15 + offsetof(A64JitState, exclusive_state)], u8(1)); + code.mov(tmp, Common::BitCast(GetExclusiveMonitorAddressPointer(conf.global_monitor, conf.processor_id))); + code.mov(qword[tmp], vaddr); + + const auto fastmem_marker = ShouldFastmem(ctx, inst); + if (fastmem_marker) { + Xbyak::Label abort, end; + bool require_abort_handling = false; + + const auto src_ptr = EmitFastmemVAddr(code, ctx, abort, vaddr, require_abort_handling); + + const auto location = code.getCurr(); + EmitReadMemoryMov(code, value_idx, src_ptr); + + fastmem_patch_info.emplace( + Common::BitCast(location), + FastmemPatchInfo{ + Common::BitCast(code.getCurr()), + Common::BitCast(wrapped_fn), + *fastmem_marker, + conf.recompile_on_exclusive_fastmem_failure, + }); + + code.L(end); + + if (require_abort_handling) { + code.SwitchToFarCode(); + code.L(abort); + code.call(wrapped_fn); + code.jmp(end, code.T_NEAR); + code.SwitchToNearCode(); + } + } else { + code.call(wrapped_fn); + } + + code.mov(tmp, Common::BitCast(GetExclusiveMonitorValuePointer(conf.global_monitor, conf.processor_id))); + EmitWriteMemoryMov(code, tmp, value_idx); + + EmitExclusiveUnlock(code, conf, tmp, tmp2.cvt32()); + + if constexpr (bitsize == 128) { + ctx.reg_alloc.DefineValue(inst, Xbyak::Xmm{value_idx}); + } else { + ctx.reg_alloc.DefineValue(inst, Xbyak::Reg64{value_idx}); + } +} + +template +void A64EmitX64::EmitExclusiveWriteMemoryInline(A64EmitContext& ctx, IR::Inst* inst) { + ASSERT(conf.global_monitor && conf.fastmem_pointer); + if (!exception_handler.SupportsFastmem()) { + EmitExclusiveWriteMemory(ctx, inst); + return; + } + + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + + const auto value = [&] { + if constexpr (bitsize == 128) { + ctx.reg_alloc.ScratchGpr(HostLoc::RAX); + ctx.reg_alloc.ScratchGpr(HostLoc::RBX); + ctx.reg_alloc.ScratchGpr(HostLoc::RCX); + ctx.reg_alloc.ScratchGpr(HostLoc::RDX); + return ctx.reg_alloc.UseXmm(args[1]); + } else { + ctx.reg_alloc.ScratchGpr(HostLoc::RAX); + return ctx.reg_alloc.UseGpr(args[1]); + } + }(); + const Xbyak::Reg64 vaddr = ctx.reg_alloc.UseGpr(args[0]); + const Xbyak::Reg32 status = ctx.reg_alloc.ScratchGpr().cvt32(); + const Xbyak::Reg64 tmp = ctx.reg_alloc.ScratchGpr(); + + const auto fallback_fn = exclusive_write_fallbacks[std::make_tuple(bitsize, vaddr.getIdx(), value.getIdx())]; + + EmitExclusiveLock(code, conf, tmp, eax); + + Xbyak::Label end; + + code.mov(tmp, Common::BitCast(GetExclusiveMonitorAddressPointer(conf.global_monitor, conf.processor_id))); + code.mov(status, u32(1)); + code.cmp(code.byte[r15 + offsetof(A64JitState, exclusive_state)], u8(0)); + code.je(end, code.T_NEAR); + code.cmp(qword[tmp], vaddr); + code.jne(end, code.T_NEAR); + + EmitExclusiveTestAndClear(code, conf, vaddr, tmp, rax); + + code.mov(code.byte[r15 + offsetof(A64JitState, exclusive_state)], u8(0)); + code.mov(tmp, Common::BitCast(GetExclusiveMonitorValuePointer(conf.global_monitor, conf.processor_id))); + + if constexpr (bitsize == 128) { + code.mov(rax, qword[tmp + 0]); + code.mov(rdx, qword[tmp + 8]); + if (code.HasHostFeature(HostFeature::SSE41)) { + code.movq(rbx, value); + code.pextrq(rcx, value, 1); + } else { + code.movaps(xmm0, value); + 
code.movq(rbx, xmm0); + code.punpckhqdq(xmm0, xmm0); + code.movq(rcx, xmm0); + } + } else { + EmitReadMemoryMov(code, rax.getIdx(), tmp); + } + + const auto fastmem_marker = ShouldFastmem(ctx, inst); + if (fastmem_marker) { + Xbyak::Label abort; + bool require_abort_handling = false; + + const auto dest_ptr = EmitFastmemVAddr(code, ctx, abort, vaddr, require_abort_handling, tmp); + + const auto location = code.getCurr(); + + if constexpr (bitsize == 128) { + code.lock(); + code.cmpxchg16b(ptr[dest_ptr]); + } else { + switch (bitsize) { + case 8: + code.lock(); + code.cmpxchg(code.byte[dest_ptr], value.cvt8()); + break; + case 16: + code.lock(); + code.cmpxchg(word[dest_ptr], value.cvt16()); + break; + case 32: + code.lock(); + code.cmpxchg(dword[dest_ptr], value.cvt32()); + break; + case 64: + code.lock(); + code.cmpxchg(qword[dest_ptr], value.cvt64()); + break; + default: + UNREACHABLE(); + } + } + + code.setnz(status.cvt8()); + + code.SwitchToFarCode(); + code.L(abort); + code.call(fallback_fn); + + fastmem_patch_info.emplace( + Common::BitCast(location), + FastmemPatchInfo{ + Common::BitCast(code.getCurr()), + Common::BitCast(fallback_fn), + *fastmem_marker, + conf.recompile_on_exclusive_fastmem_failure, + }); + + code.cmp(al, 0); + code.setz(status.cvt8()); + code.movzx(status.cvt32(), status.cvt8()); + code.jmp(end, code.T_NEAR); + code.SwitchToNearCode(); + } else { + code.call(fallback_fn); + code.cmp(al, 0); + code.setz(status.cvt8()); + code.movzx(status.cvt32(), status.cvt8()); + } + + code.L(end); + + EmitExclusiveUnlock(code, conf, tmp, eax); + + ctx.reg_alloc.DefineValue(inst, status); +} + +void A64EmitX64::EmitA64ClearExclusive(A64EmitContext&, IR::Inst*) { + code.mov(code.byte[r15 + offsetof(A64JitState, exclusive_state)], u8(0)); +} + +void A64EmitX64::EmitA64ExclusiveReadMemory8(A64EmitContext& ctx, IR::Inst* inst) { + if (conf.fastmem_exclusive_access) { + EmitExclusiveReadMemoryInline<8, &A64::UserCallbacks::MemoryRead8>(ctx, inst); + } else { + EmitExclusiveReadMemory<8, &A64::UserCallbacks::MemoryRead8>(ctx, inst); + } +} + +void A64EmitX64::EmitA64ExclusiveReadMemory16(A64EmitContext& ctx, IR::Inst* inst) { + if (conf.fastmem_exclusive_access) { + EmitExclusiveReadMemoryInline<16, &A64::UserCallbacks::MemoryRead16>(ctx, inst); + } else { + EmitExclusiveReadMemory<16, &A64::UserCallbacks::MemoryRead16>(ctx, inst); + } +} + +void A64EmitX64::EmitA64ExclusiveReadMemory32(A64EmitContext& ctx, IR::Inst* inst) { + if (conf.fastmem_exclusive_access) { + EmitExclusiveReadMemoryInline<32, &A64::UserCallbacks::MemoryRead32>(ctx, inst); + } else { + EmitExclusiveReadMemory<32, &A64::UserCallbacks::MemoryRead32>(ctx, inst); + } +} + +void A64EmitX64::EmitA64ExclusiveReadMemory64(A64EmitContext& ctx, IR::Inst* inst) { + if (conf.fastmem_exclusive_access) { + EmitExclusiveReadMemoryInline<64, &A64::UserCallbacks::MemoryRead64>(ctx, inst); + } else { + EmitExclusiveReadMemory<64, &A64::UserCallbacks::MemoryRead64>(ctx, inst); + } +} + +void A64EmitX64::EmitA64ExclusiveReadMemory128(A64EmitContext& ctx, IR::Inst* inst) { + if (conf.fastmem_exclusive_access) { + EmitExclusiveReadMemoryInline<128, &A64::UserCallbacks::MemoryRead128>(ctx, inst); + } else { + EmitExclusiveReadMemory<128, &A64::UserCallbacks::MemoryRead128>(ctx, inst); + } +} + +void A64EmitX64::EmitA64ExclusiveWriteMemory8(A64EmitContext& ctx, IR::Inst* inst) { + if (conf.fastmem_exclusive_access) { + EmitExclusiveWriteMemoryInline<8, &A64::UserCallbacks::MemoryWriteExclusive8>(ctx, inst); + } else { + 
EmitExclusiveWriteMemory<8, &A64::UserCallbacks::MemoryWriteExclusive8>(ctx, inst); + } +} + +void A64EmitX64::EmitA64ExclusiveWriteMemory16(A64EmitContext& ctx, IR::Inst* inst) { + if (conf.fastmem_exclusive_access) { + EmitExclusiveWriteMemoryInline<16, &A64::UserCallbacks::MemoryWriteExclusive16>(ctx, inst); + } else { + EmitExclusiveWriteMemory<16, &A64::UserCallbacks::MemoryWriteExclusive16>(ctx, inst); + } +} + +void A64EmitX64::EmitA64ExclusiveWriteMemory32(A64EmitContext& ctx, IR::Inst* inst) { + if (conf.fastmem_exclusive_access) { + EmitExclusiveWriteMemoryInline<32, &A64::UserCallbacks::MemoryWriteExclusive32>(ctx, inst); + } else { + EmitExclusiveWriteMemory<32, &A64::UserCallbacks::MemoryWriteExclusive32>(ctx, inst); + } +} + +void A64EmitX64::EmitA64ExclusiveWriteMemory64(A64EmitContext& ctx, IR::Inst* inst) { + if (conf.fastmem_exclusive_access) { + EmitExclusiveWriteMemoryInline<64, &A64::UserCallbacks::MemoryWriteExclusive64>(ctx, inst); + } else { + EmitExclusiveWriteMemory<64, &A64::UserCallbacks::MemoryWriteExclusive64>(ctx, inst); + } +} + +void A64EmitX64::EmitA64ExclusiveWriteMemory128(A64EmitContext& ctx, IR::Inst* inst) { + if (conf.fastmem_exclusive_access) { + EmitExclusiveWriteMemoryInline<128, &A64::UserCallbacks::MemoryWriteExclusive128>(ctx, inst); + } else { + EmitExclusiveWriteMemory<128, &A64::UserCallbacks::MemoryWriteExclusive128>(ctx, inst); + } +} + +} // namespace Dynarmic::Backend::X64 diff --git a/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.h b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.h new file mode 100755 index 000000000..e5cf6a483 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_memory.h @@ -0,0 +1,62 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include + +#include "dynarmic/backend/x64/a64_emit_x64.h" +#include "dynarmic/backend/x64/exclusive_monitor_friend.h" +#include "dynarmic/common/spin_lock_x64.h" +#include "dynarmic/interface/exclusive_monitor.h" + +namespace Dynarmic::Backend::X64 { + +namespace { + +using namespace Xbyak::util; + +template +void EmitExclusiveLock(BlockOfCode& code, const UserConfig& conf, Xbyak::Reg64 pointer, Xbyak::Reg32 tmp) { + if (conf.HasOptimization(OptimizationFlag::Unsafe_IgnoreGlobalMonitor)) { + return; + } + + code.mov(pointer, Common::BitCast(GetExclusiveMonitorLockPointer(conf.global_monitor))); + EmitSpinLockLock(code, pointer, tmp); +} + +template +void EmitExclusiveUnlock(BlockOfCode& code, const UserConfig& conf, Xbyak::Reg64 pointer, Xbyak::Reg32 tmp) { + if (conf.HasOptimization(OptimizationFlag::Unsafe_IgnoreGlobalMonitor)) { + return; + } + + code.mov(pointer, Common::BitCast(GetExclusiveMonitorLockPointer(conf.global_monitor))); + EmitSpinLockUnlock(code, pointer, tmp); +} + +template +void EmitExclusiveTestAndClear(BlockOfCode& code, const UserConfig& conf, Xbyak::Reg64 vaddr, Xbyak::Reg64 pointer, Xbyak::Reg64 tmp) { + if (conf.HasOptimization(OptimizationFlag::Unsafe_IgnoreGlobalMonitor)) { + return; + } + + code.mov(tmp, 0xDEAD'DEAD'DEAD'DEAD); + const size_t processor_count = GetExclusiveMonitorProcessorCount(conf.global_monitor); + for (size_t processor_index = 0; processor_index < processor_count; processor_index++) { + if (processor_index == conf.processor_id) { + continue; + } + Xbyak::Label ok; + code.mov(pointer, Common::BitCast(GetExclusiveMonitorAddressPointer(conf.global_monitor, processor_index))); + code.cmp(qword[pointer], vaddr); + code.jne(ok); + code.mov(qword[pointer], tmp); + code.L(ok); + } +} + +} // namespace + +} // namespace Dynarmic::Backend::X64 diff --git a/externals/dynarmic/src/dynarmic/backend/x64/exclusive_monitor.cpp b/externals/dynarmic/src/dynarmic/backend/x64/exclusive_monitor.cpp index 0f66270f0..6a323b9f9 100755 --- a/externals/dynarmic/src/dynarmic/backend/x64/exclusive_monitor.cpp +++ b/externals/dynarmic/src/dynarmic/backend/x64/exclusive_monitor.cpp @@ -21,11 +21,11 @@ size_t ExclusiveMonitor::GetProcessorCount() const { } void ExclusiveMonitor::Lock() { - while (is_locked.test_and_set(std::memory_order_acquire)) {} + lock.Lock(); } void ExclusiveMonitor::Unlock() { - is_locked.clear(std::memory_order_release); + lock.Unlock(); } bool ExclusiveMonitor::CheckAndClear(size_t processor_id, VAddr address) { diff --git a/externals/dynarmic/src/dynarmic/backend/x64/exclusive_monitor_friend.h b/externals/dynarmic/src/dynarmic/backend/x64/exclusive_monitor_friend.h new file mode 100755 index 000000000..7f7fa2425 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/backend/x64/exclusive_monitor_friend.h @@ -0,0 +1,28 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include "dynarmic/interface/exclusive_monitor.h" + +namespace Dynarmic { + +inline volatile int* GetExclusiveMonitorLockPointer(ExclusiveMonitor* monitor) { + return &monitor->lock.storage; +} + +inline size_t GetExclusiveMonitorProcessorCount(ExclusiveMonitor* monitor) { + return monitor->exclusive_addresses.size(); +} + +inline VAddr* GetExclusiveMonitorAddressPointer(ExclusiveMonitor* monitor, size_t index) { + return monitor->exclusive_addresses.data() + index; +} + +inline Vector* GetExclusiveMonitorValuePointer(ExclusiveMonitor* monitor, size_t index) { + return monitor->exclusive_values.data() + index; +} + +} // namespace Dynarmic diff --git a/externals/dynarmic/src/dynarmic/common/spin_lock.h b/externals/dynarmic/src/dynarmic/common/spin_lock.h new file mode 100755 index 000000000..a6ea9b682 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/spin_lock.h @@ -0,0 +1,17 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +namespace Dynarmic { + +struct SpinLock { + void Lock(); + void Unlock(); + + volatile int storage; +}; + +} // namespace Dynarmic diff --git a/externals/dynarmic/src/dynarmic/common/spin_lock_x64.cpp b/externals/dynarmic/src/dynarmic/common/spin_lock_x64.cpp new file mode 100755 index 000000000..0f8499312 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/spin_lock_x64.cpp @@ -0,0 +1,70 @@ +/* This file is part of the dynarmic project. + * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#include + +#include "dynarmic/backend/x64/abi.h" +#include "dynarmic/backend/x64/hostloc.h" +#include "dynarmic/common/spin_lock.h" + +namespace Dynarmic { + +void EmitSpinLockLock(Xbyak::CodeGenerator& code, Xbyak::Reg64 ptr, Xbyak::Reg32 tmp) { + Xbyak::Label start, loop; + + code.jmp(start); + code.L(loop); + code.pause(); + code.L(start); + code.mov(tmp, 1); + code.lock(); + code.xchg(code.dword[ptr], tmp); + code.test(tmp, tmp); + code.jnz(loop); +} + +void EmitSpinLockUnlock(Xbyak::CodeGenerator& code, Xbyak::Reg64 ptr, Xbyak::Reg32 tmp) { + code.xor_(tmp, tmp); + code.xchg(code.dword[ptr], tmp); + code.mfence(); +} + +namespace { + +struct SpinLockImpl { + SpinLockImpl(); + + Xbyak::CodeGenerator code; + void (*lock)(volatile int*); + void (*unlock)(volatile int*); +}; + +SpinLockImpl impl; + +SpinLockImpl::SpinLockImpl() { + const Xbyak::Reg64 ABI_PARAM1 = Backend::X64::HostLocToReg64(Backend::X64::ABI_PARAM1); + + code.align(); + lock = code.getCurr(); + EmitSpinLockLock(code, ABI_PARAM1, code.eax); + code.ret(); + + code.align(); + unlock = code.getCurr(); + EmitSpinLockUnlock(code, ABI_PARAM1, code.eax); + code.ret(); +} + +} // namespace + +void SpinLock::Lock() { + impl.lock(&storage); +} + +void SpinLock::Unlock() { + impl.unlock(&storage); +} + +} // namespace Dynarmic diff --git a/externals/dynarmic/src/dynarmic/common/spin_lock_x64.h b/externals/dynarmic/src/dynarmic/common/spin_lock_x64.h new file mode 100755 index 000000000..df6a3d740 --- /dev/null +++ b/externals/dynarmic/src/dynarmic/common/spin_lock_x64.h @@ -0,0 +1,15 @@ +/* This file is part of the dynarmic project. 
+ * Copyright (c) 2022 MerryMage + * SPDX-License-Identifier: 0BSD + */ + +#pragma once + +#include + +namespace Dynarmic { + +void EmitSpinLockLock(Xbyak::CodeGenerator& code, Xbyak::Reg64 ptr, Xbyak::Reg32 tmp); +void EmitSpinLockUnlock(Xbyak::CodeGenerator& code, Xbyak::Reg64 ptr, Xbyak::Reg32 tmp); + +} // namespace Dynarmic diff --git a/externals/dynarmic/src/dynarmic/interface/A32/config.h b/externals/dynarmic/src/dynarmic/interface/A32/config.h index dbf64c6b2..75c1ec59c 100755 --- a/externals/dynarmic/src/dynarmic/interface/A32/config.h +++ b/externals/dynarmic/src/dynarmic/interface/A32/config.h @@ -177,6 +177,15 @@ struct UserConfig { /// accesses will hit the memory callbacks. bool recompile_on_fastmem_failure = true; + /// Determines if we should use the above fastmem_pointer for exclusive reads and + /// writes. On x64, dynarmic currently relies on x64 cmpxchg semantics which may not + /// provide fully accurate emulation. + bool fastmem_exclusive_access = false; + /// Determines if exclusive access instructions that pagefault should cause + /// recompilation of that block with fastmem disabled. Recompiled code will use memory + /// callbacks. + bool recompile_on_exclusive_fastmem_failure = true; + // Coprocessors std::array, 16> coprocessors{}; diff --git a/externals/dynarmic/src/dynarmic/interface/A64/config.h b/externals/dynarmic/src/dynarmic/interface/A64/config.h index 1d4a758ce..7926fb5ac 100755 --- a/externals/dynarmic/src/dynarmic/interface/A64/config.h +++ b/externals/dynarmic/src/dynarmic/interface/A64/config.h @@ -254,6 +254,15 @@ struct UserConfig { /// This is only used if fastmem_pointer is not nullptr. bool silently_mirror_fastmem = true; + /// Determines if we should use the above fastmem_pointer for exclusive reads and + /// writes. On x64, dynarmic currently relies on x64 cmpxchg semantics which may not + /// provide fully accurate emulation. + bool fastmem_exclusive_access = false; + /// Determines if exclusive access instructions that pagefault should cause + /// recompilation of that block with fastmem disabled. Recompiled code will use memory + /// callbacks. + bool recompile_on_exclusive_fastmem_failure = true; + /// This option relates to translation. Generally when we run into an unpredictable /// instruction the ExceptionRaised callback is called. If this is true, we define /// definite behaviour for some unpredictable instructions. 
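The fastmem_exclusive_access and recompile_on_exclusive_fastmem_failure options above lean on the spin lock and the cmpxchg-based exclusive stores introduced earlier in this patch. As a rough host-side sketch of that protocol only (this is not code from the patch; SpinLockSketch and exclusive_store_sketch are illustrative names), the emitted x64 is roughly equivalent to acquiring the lock word with an atomic exchange loop, releasing it with a store, and letting an exclusive store succeed only when memory still holds the value captured at the exclusive read:

#include <atomic>
#include <cstdint>

struct SpinLockSketch {
    std::atomic<int> storage{0};

    void Lock() {
        // Mirrors the emitted sequence: mov tmp,1; lock xchg [ptr],tmp; test; jnz loop
        while (storage.exchange(1, std::memory_order_acquire) != 0) {
            // spin; the JIT emits `pause` inside this loop
        }
    }

    void Unlock() {
        // Mirrors: xor tmp,tmp; xchg [ptr],tmp  (clear the lock word)
        storage.store(0, std::memory_order_release);
    }
};

// Single-processor view of the inline exclusive write: the store only goes
// through if memory still contains `expected`, the value stashed by the
// exclusive read, matching lock cmpxchg semantics.
inline bool exclusive_store_sketch(std::atomic<std::uint64_t>& mem,
                                   std::uint64_t expected,
                                   std::uint64_t desired) {
    return mem.compare_exchange_strong(expected, desired);
}

Note that the emitted unlock additionally issues an mfence, which is stronger than the release store shown here, and the 128-bit path uses cmpxchg16b with the expected value in rdx:rax; this sketch only conveys the overall locking and compare-exchange shape, not those x64-specific details.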
diff --git a/externals/dynarmic/src/dynarmic/interface/exclusive_monitor.h b/externals/dynarmic/src/dynarmic/interface/exclusive_monitor.h index 70b0c7f81..481367587 100755 --- a/externals/dynarmic/src/dynarmic/interface/exclusive_monitor.h +++ b/externals/dynarmic/src/dynarmic/interface/exclusive_monitor.h @@ -12,6 +12,8 @@ #include #include +#include + namespace Dynarmic { using VAddr = std::uint64_t; @@ -71,9 +73,14 @@ private: void Lock(); void Unlock(); + friend volatile int* GetExclusiveMonitorLockPointer(ExclusiveMonitor*); + friend size_t GetExclusiveMonitorProcessorCount(ExclusiveMonitor*); + friend VAddr* GetExclusiveMonitorAddressPointer(ExclusiveMonitor*, size_t index); + friend Vector* GetExclusiveMonitorValuePointer(ExclusiveMonitor*, size_t index); + static constexpr VAddr RESERVATION_GRANULE_MASK = 0xFFFF'FFFF'FFFF'FFFFull; static constexpr VAddr INVALID_EXCLUSIVE_ADDRESS = 0xDEAD'DEAD'DEAD'DEADull; - std::atomic_flag is_locked; + SpinLock lock; std::vector exclusive_addresses; std::vector exclusive_values; }; diff --git a/externals/dynarmic/src/dynarmic/interface/optimization_flags.h b/externals/dynarmic/src/dynarmic/interface/optimization_flags.h index df7eee3e5..2f65f0bfa 100755 --- a/externals/dynarmic/src/dynarmic/interface/optimization_flags.h +++ b/externals/dynarmic/src/dynarmic/interface/optimization_flags.h @@ -45,6 +45,10 @@ enum class OptimizationFlag : std::uint32_t { /// This is an UNSAFE optimization that causes ASIMD floating-point instructions to be run with incorrect /// rounding modes. This may result in inaccurate results with all floating-point ASIMD instructions. Unsafe_IgnoreStandardFPCRValue = 0x00080000, + /// This is an UNSAFE optimization that causes the global monitor to be ignored. This may + /// result in unexpected behaviour in multithreaded scenarios, including but not limited + /// to data races and deadlocks. 
+ Unsafe_IgnoreGlobalMonitor = 0x00100000, }; constexpr OptimizationFlag no_optimizations = static_cast(0); diff --git a/src/common/settings.cpp b/src/common/settings.cpp index 2810cec15..877e0faa4 100755 --- a/src/common/settings.cpp +++ b/src/common/settings.cpp @@ -176,6 +176,7 @@ void RestoreGlobalState(bool is_powered_on) { values.cpuopt_unsafe_ignore_standard_fpcr.SetGlobal(true); values.cpuopt_unsafe_inaccurate_nan.SetGlobal(true); values.cpuopt_unsafe_fastmem_check.SetGlobal(true); + values.cpuopt_unsafe_ignore_global_monitor.SetGlobal(true); // Renderer values.renderer_backend.SetGlobal(true); diff --git a/src/common/settings.h b/src/common/settings.h index dd36076f3..3de4ba1a7 100755 --- a/src/common/settings.h +++ b/src/common/settings.h @@ -484,12 +484,15 @@ struct Values { BasicSetting cpuopt_misc_ir{true, "cpuopt_misc_ir"}; BasicSetting cpuopt_reduce_misalign_checks{true, "cpuopt_reduce_misalign_checks"}; BasicSetting cpuopt_fastmem{true, "cpuopt_fastmem"}; + BasicSetting cpuopt_fastmem_exclusives{true, "cpuopt_fastmem_exclusives"}; + BasicSetting cpuopt_recompile_exclusives{true, "cpuopt_recompile_exclusives"}; Setting cpuopt_unsafe_unfuse_fma{true, "cpuopt_unsafe_unfuse_fma"}; Setting cpuopt_unsafe_reduce_fp_error{true, "cpuopt_unsafe_reduce_fp_error"}; Setting cpuopt_unsafe_ignore_standard_fpcr{true, "cpuopt_unsafe_ignore_standard_fpcr"}; Setting cpuopt_unsafe_inaccurate_nan{true, "cpuopt_unsafe_inaccurate_nan"}; Setting cpuopt_unsafe_fastmem_check{true, "cpuopt_unsafe_fastmem_check"}; + Setting cpuopt_unsafe_ignore_global_monitor{true, "cpuopt_unsafe_ignore_global_monitor"}; // Renderer RangedSetting renderer_backend{ diff --git a/src/core/arm/dynarmic/arm_dynarmic_32.cpp b/src/core/arm/dynarmic/arm_dynarmic_32.cpp index b0d89c539..286976623 100755 --- a/src/core/arm/dynarmic/arm_dynarmic_32.cpp +++ b/src/core/arm/dynarmic/arm_dynarmic_32.cpp @@ -137,6 +137,8 @@ std::shared_ptr ARM_Dynarmic_32::MakeJit(Common::PageTable* config.page_table_pointer_mask_bits = Common::PageTable::ATTRIBUTE_BITS; config.detect_misaligned_access_via_page_table = 16 | 32 | 64 | 128; config.only_detect_misalignment_via_page_table_on_page_boundary = true; + config.fastmem_exclusive_access = true; + config.recompile_on_exclusive_fastmem_failure = true; // Multi-process state config.processor_id = core_index; @@ -178,6 +180,12 @@ std::shared_ptr ARM_Dynarmic_32::MakeJit(Common::PageTable* if (!Settings::values.cpuopt_fastmem) { config.fastmem_pointer = nullptr; } + if (!Settings::values.cpuopt_fastmem_exclusives) { + config.fastmem_exclusive_access = false; + } + if (!Settings::values.cpuopt_recompile_exclusives) { + config.recompile_on_exclusive_fastmem_failure = false; + } } // Unsafe optimizations @@ -195,6 +203,9 @@ std::shared_ptr ARM_Dynarmic_32::MakeJit(Common::PageTable* if (Settings::values.cpuopt_unsafe_inaccurate_nan) { config.optimizations |= Dynarmic::OptimizationFlag::Unsafe_InaccurateNaN; } + if (Settings::values.cpuopt_unsafe_ignore_global_monitor) { + config.optimizations |= Dynarmic::OptimizationFlag::Unsafe_IgnoreGlobalMonitor; + } } // Curated optimizations @@ -203,6 +214,7 @@ std::shared_ptr ARM_Dynarmic_32::MakeJit(Common::PageTable* config.optimizations |= Dynarmic::OptimizationFlag::Unsafe_UnfuseFMA; config.optimizations |= Dynarmic::OptimizationFlag::Unsafe_IgnoreStandardFPCRValue; config.optimizations |= Dynarmic::OptimizationFlag::Unsafe_InaccurateNaN; + config.optimizations |= Dynarmic::OptimizationFlag::Unsafe_IgnoreGlobalMonitor; } return std::make_unique(config); diff 
--git a/src/core/arm/dynarmic/arm_dynarmic_64.cpp b/src/core/arm/dynarmic/arm_dynarmic_64.cpp index 56836bd05..d96226c41 100755 --- a/src/core/arm/dynarmic/arm_dynarmic_64.cpp +++ b/src/core/arm/dynarmic/arm_dynarmic_64.cpp @@ -185,6 +185,9 @@ std::shared_ptr ARM_Dynarmic_64::MakeJit(Common::PageTable* config.fastmem_pointer = page_table->fastmem_arena; config.fastmem_address_space_bits = address_space_bits; config.silently_mirror_fastmem = false; + + config.fastmem_exclusive_access = true; + config.recompile_on_exclusive_fastmem_failure = true; } // Multi-process state @@ -237,6 +240,12 @@ std::shared_ptr ARM_Dynarmic_64::MakeJit(Common::PageTable* if (!Settings::values.cpuopt_fastmem) { config.fastmem_pointer = nullptr; } + if (!Settings::values.cpuopt_fastmem_exclusives) { + config.fastmem_exclusive_access = false; + } + if (!Settings::values.cpuopt_recompile_exclusives) { + config.recompile_on_exclusive_fastmem_failure = false; + } } // Unsafe optimizations @@ -254,6 +263,9 @@ std::shared_ptr ARM_Dynarmic_64::MakeJit(Common::PageTable* if (Settings::values.cpuopt_unsafe_fastmem_check) { config.fastmem_address_space_bits = 64; } + if (Settings::values.cpuopt_unsafe_ignore_global_monitor) { + config.optimizations |= Dynarmic::OptimizationFlag::Unsafe_IgnoreGlobalMonitor; + } } // Curated optimizations @@ -262,6 +274,7 @@ std::shared_ptr ARM_Dynarmic_64::MakeJit(Common::PageTable* config.optimizations |= Dynarmic::OptimizationFlag::Unsafe_UnfuseFMA; config.optimizations |= Dynarmic::OptimizationFlag::Unsafe_InaccurateNaN; config.fastmem_address_space_bits = 64; + config.optimizations |= Dynarmic::OptimizationFlag::Unsafe_IgnoreGlobalMonitor; } return std::make_shared(config); diff --git a/src/core/arm/dynarmic/arm_exclusive_monitor.cpp b/src/core/arm/dynarmic/arm_exclusive_monitor.cpp index 397d054a8..ea6b224e0 100755 --- a/src/core/arm/dynarmic/arm_exclusive_monitor.cpp +++ b/src/core/arm/dynarmic/arm_exclusive_monitor.cpp @@ -37,8 +37,8 @@ u128 DynarmicExclusiveMonitor::ExclusiveRead128(std::size_t core_index, VAddr ad }); } -void DynarmicExclusiveMonitor::ClearExclusive() { - monitor.Clear(); +void DynarmicExclusiveMonitor::ClearExclusive(std::size_t core_index) { + monitor.ClearProcessor(core_index); } bool DynarmicExclusiveMonitor::ExclusiveWrite8(std::size_t core_index, VAddr vaddr, u8 value) { diff --git a/src/core/arm/dynarmic/arm_exclusive_monitor.h b/src/core/arm/dynarmic/arm_exclusive_monitor.h index 265c4ecef..5a15b43ef 100755 --- a/src/core/arm/dynarmic/arm_exclusive_monitor.h +++ b/src/core/arm/dynarmic/arm_exclusive_monitor.h @@ -29,7 +29,7 @@ public: u32 ExclusiveRead32(std::size_t core_index, VAddr addr) override; u64 ExclusiveRead64(std::size_t core_index, VAddr addr) override; u128 ExclusiveRead128(std::size_t core_index, VAddr addr) override; - void ClearExclusive() override; + void ClearExclusive(std::size_t core_index) override; bool ExclusiveWrite8(std::size_t core_index, VAddr vaddr, u8 value) override; bool ExclusiveWrite16(std::size_t core_index, VAddr vaddr, u16 value) override; diff --git a/src/core/arm/exclusive_monitor.h b/src/core/arm/exclusive_monitor.h index 62f6e6023..9914ca3da 100755 --- a/src/core/arm/exclusive_monitor.h +++ b/src/core/arm/exclusive_monitor.h @@ -23,7 +23,7 @@ public: virtual u32 ExclusiveRead32(std::size_t core_index, VAddr addr) = 0; virtual u64 ExclusiveRead64(std::size_t core_index, VAddr addr) = 0; virtual u128 ExclusiveRead128(std::size_t core_index, VAddr addr) = 0; - virtual void ClearExclusive() = 0; + virtual void 
ClearExclusive(std::size_t core_index) = 0; virtual bool ExclusiveWrite8(std::size_t core_index, VAddr vaddr, u8 value) = 0; virtual bool ExclusiveWrite16(std::size_t core_index, VAddr vaddr, u16 value) = 0; diff --git a/src/core/hle/kernel/k_address_arbiter.cpp b/src/core/hle/kernel/k_address_arbiter.cpp index 783c69858..1d1f5e5f8 100755 --- a/src/core/hle/kernel/k_address_arbiter.cpp +++ b/src/core/hle/kernel/k_address_arbiter.cpp @@ -49,7 +49,7 @@ bool DecrementIfLessThan(Core::System& system, s32* out, VAddr address, s32 valu } } else { // Otherwise, clear our exclusive hold and finish - monitor.ClearExclusive(); + monitor.ClearExclusive(current_core); } // We're done. @@ -78,7 +78,7 @@ bool UpdateIfEqual(Core::System& system, s32* out, VAddr address, s32 value, s32 } } else { // Otherwise, clear our exclusive hold and finish. - monitor.ClearExclusive(); + monitor.ClearExclusive(current_core); } // We're done. diff --git a/src/core/hle/kernel/k_memory_manager.cpp b/src/core/hle/kernel/k_memory_manager.cpp index b989027e5..7c75b38c6 100755 --- a/src/core/hle/kernel/k_memory_manager.cpp +++ b/src/core/hle/kernel/k_memory_manager.cpp @@ -31,7 +31,7 @@ constexpr KMemoryManager::Pool GetPoolFromMemoryRegionType(u32 type) { } else if ((type | KMemoryRegionType_DramSystemNonSecurePool) == type) { return KMemoryManager::Pool::SystemNonSecure; } else { - ASSERT("InvalidMemoryRegionType for conversion to Pool"); + ASSERT_MSG("InvalidMemoryRegionType for conversion to Pool"); return {}; } } @@ -102,9 +102,8 @@ void KMemoryManager::Initialize(VAddr management_region, size_t management_regio Impl* manager = std::addressof(managers[num_managers++]); ASSERT(num_managers <= managers.size()); - const size_t cur_size = - manager->Initialize(system, region_address, region_size, management_region, - management_region_end, region_pool); + const size_t cur_size = manager->Initialize(region_address, region_size, management_region, + management_region_end, region_pool); management_region += cur_size; ASSERT(management_region <= management_region_end); @@ -384,9 +383,8 @@ void KMemoryManager::Open(const KPageLinkedList& pg) { } } -size_t KMemoryManager::Impl::Initialize([[maybe_unused]] Core::System& system, PAddr address, - size_t size, VAddr management, VAddr management_end, - Pool p) { +size_t KMemoryManager::Impl::Initialize(PAddr address, size_t size, VAddr management, + VAddr management_end, Pool p) { // Calculate management sizes. 
const size_t ref_count_size = (size / PageSize) * sizeof(u16); const size_t optimize_map_size = CalculateOptimizedProcessOverheadSize(size); diff --git a/src/core/hle/kernel/k_memory_manager.h b/src/core/hle/kernel/k_memory_manager.h index 86a502e7a..18775b262 100755 --- a/src/core/hle/kernel/k_memory_manager.h +++ b/src/core/hle/kernel/k_memory_manager.h @@ -112,8 +112,8 @@ private: Impl() = default; ~Impl() = default; - size_t Initialize(Core::System& system, PAddr address, size_t size, VAddr management, - VAddr management_end, Pool p); + size_t Initialize(PAddr address, size_t size, VAddr management, VAddr management_end, + Pool p); VAddr AllocateBlock(s32 index, bool random) { return heap.AllocateBlock(index, random); diff --git a/src/core/hle/kernel/k_memory_region_type.h b/src/core/hle/kernel/k_memory_region_type.h index c522f8b9d..0baeddf51 100755 --- a/src/core/hle/kernel/k_memory_region_type.h +++ b/src/core/hle/kernel/k_memory_region_type.h @@ -241,7 +241,7 @@ static_assert(KMemoryRegionType_VirtualDramKernelPtHeap.GetValue() == 0x2A); static_assert(KMemoryRegionType_VirtualDramKernelTraceBuffer.GetValue() == 0x4A); // UNUSED: .DeriveSparse(2, 2, 0); -constexpr inline const auto KMemoryRegionType_VirtualDramUnknownDebug = +constexpr auto KMemoryRegionType_VirtualDramUnknownDebug = KMemoryRegionType_Dram.DeriveSparse(2, 2, 1); static_assert(KMemoryRegionType_VirtualDramUnknownDebug.GetValue() == (0x52)); diff --git a/src/core/hle/kernel/kernel.cpp b/src/core/hle/kernel/kernel.cpp index 67b7e3f55..8c1634ec5 100755 --- a/src/core/hle/kernel/kernel.cpp +++ b/src/core/hle/kernel/kernel.cpp @@ -71,7 +71,7 @@ struct KernelCore::Impl { // Derive the initial memory layout from the emulated board Init::InitializeSlabResourceCounts(kernel); DeriveInitialMemoryLayout(); - Init::InitializeSlabHeaps(system, memory_layout); + Init::InitializeSlabHeaps(system, *memory_layout); // Initialize kernel memory and resources. InitializeSystemResourceLimit(kernel, system.CoreTiming()); @@ -223,7 +223,7 @@ struct KernelCore::Impl { system_resource_limit = KResourceLimit::Create(system.Kernel()); system_resource_limit->Initialize(&core_timing); - const auto [total_size, kernel_size] = memory_layout.GetTotalAndKernelMemorySizes(); + const auto [total_size, kernel_size] = memory_layout->GetTotalAndKernelMemorySizes(); // If setting the default system values fails, then something seriously wrong has occurred. ASSERT(system_resource_limit->SetLimitValue(LimitableResource::PhysicalMemory, total_size) @@ -353,15 +353,17 @@ struct KernelCore::Impl { } void DeriveInitialMemoryLayout() { + memory_layout = std::make_unique(); + // Insert the root region for the virtual memory tree, from which all other regions will // derive. - memory_layout.GetVirtualMemoryRegionTree().InsertDirectly( + memory_layout->GetVirtualMemoryRegionTree().InsertDirectly( KernelVirtualAddressSpaceBase, KernelVirtualAddressSpaceBase + KernelVirtualAddressSpaceSize - 1); // Insert the root region for the physical memory tree, from which all other regions will // derive. 
- memory_layout.GetPhysicalMemoryRegionTree().InsertDirectly( + memory_layout->GetPhysicalMemoryRegionTree().InsertDirectly( KernelPhysicalAddressSpaceBase, KernelPhysicalAddressSpaceBase + KernelPhysicalAddressSpaceSize - 1); @@ -378,7 +380,7 @@ struct KernelCore::Impl { if (!(kernel_region_start + KernelRegionSize - 1 <= KernelVirtualAddressSpaceLast)) { kernel_region_size = KernelVirtualAddressSpaceEnd - kernel_region_start; } - ASSERT(memory_layout.GetVirtualMemoryRegionTree().Insert( + ASSERT(memory_layout->GetVirtualMemoryRegionTree().Insert( kernel_region_start, kernel_region_size, KMemoryRegionType_Kernel)); // Setup the code region. @@ -387,11 +389,11 @@ struct KernelCore::Impl { Common::AlignDown(code_start_virt_addr, CodeRegionAlign); constexpr VAddr code_region_end = Common::AlignUp(code_end_virt_addr, CodeRegionAlign); constexpr size_t code_region_size = code_region_end - code_region_start; - ASSERT(memory_layout.GetVirtualMemoryRegionTree().Insert( + ASSERT(memory_layout->GetVirtualMemoryRegionTree().Insert( code_region_start, code_region_size, KMemoryRegionType_KernelCode)); // Setup board-specific device physical regions. - Init::SetupDevicePhysicalMemoryRegions(memory_layout); + Init::SetupDevicePhysicalMemoryRegions(*memory_layout); // Determine the amount of space needed for the misc region. size_t misc_region_needed_size; @@ -400,7 +402,7 @@ struct KernelCore::Impl { misc_region_needed_size = Core::Hardware::NUM_CPU_CORES * (3 * (PageSize + PageSize)); // Account for each auto-map device. - for (const auto& region : memory_layout.GetPhysicalMemoryRegionTree()) { + for (const auto& region : memory_layout->GetPhysicalMemoryRegionTree()) { if (region.HasTypeAttribute(KMemoryRegionAttr_ShouldKernelMap)) { // Check that the region is valid. ASSERT(region.GetEndAddress() != 0); @@ -425,22 +427,22 @@ struct KernelCore::Impl { // Setup the misc region. const VAddr misc_region_start = - memory_layout.GetVirtualMemoryRegionTree().GetRandomAlignedRegion( + memory_layout->GetVirtualMemoryRegionTree().GetRandomAlignedRegion( misc_region_size, MiscRegionAlign, KMemoryRegionType_Kernel); - ASSERT(memory_layout.GetVirtualMemoryRegionTree().Insert( + ASSERT(memory_layout->GetVirtualMemoryRegionTree().Insert( misc_region_start, misc_region_size, KMemoryRegionType_KernelMisc)); // Setup the stack region. constexpr size_t StackRegionSize = 14_MiB; constexpr size_t StackRegionAlign = KernelAslrAlignment; const VAddr stack_region_start = - memory_layout.GetVirtualMemoryRegionTree().GetRandomAlignedRegion( + memory_layout->GetVirtualMemoryRegionTree().GetRandomAlignedRegion( StackRegionSize, StackRegionAlign, KMemoryRegionType_Kernel); - ASSERT(memory_layout.GetVirtualMemoryRegionTree().Insert( + ASSERT(memory_layout->GetVirtualMemoryRegionTree().Insert( stack_region_start, StackRegionSize, KMemoryRegionType_KernelStack)); // Determine the size of the resource region. - const size_t resource_region_size = memory_layout.GetResourceRegionSizeForInit(); + const size_t resource_region_size = memory_layout->GetResourceRegionSizeForInit(); // Determine the size of the slab region. 
const size_t slab_region_size = @@ -457,23 +459,23 @@ struct KernelCore::Impl { Common::AlignUp(code_end_phys_addr + slab_region_size, SlabRegionAlign) - Common::AlignDown(code_end_phys_addr, SlabRegionAlign); const VAddr slab_region_start = - memory_layout.GetVirtualMemoryRegionTree().GetRandomAlignedRegion( + memory_layout->GetVirtualMemoryRegionTree().GetRandomAlignedRegion( slab_region_needed_size, SlabRegionAlign, KMemoryRegionType_Kernel) + (code_end_phys_addr % SlabRegionAlign); - ASSERT(memory_layout.GetVirtualMemoryRegionTree().Insert( + ASSERT(memory_layout->GetVirtualMemoryRegionTree().Insert( slab_region_start, slab_region_size, KMemoryRegionType_KernelSlab)); // Setup the temp region. constexpr size_t TempRegionSize = 128_MiB; constexpr size_t TempRegionAlign = KernelAslrAlignment; const VAddr temp_region_start = - memory_layout.GetVirtualMemoryRegionTree().GetRandomAlignedRegion( + memory_layout->GetVirtualMemoryRegionTree().GetRandomAlignedRegion( TempRegionSize, TempRegionAlign, KMemoryRegionType_Kernel); - ASSERT(memory_layout.GetVirtualMemoryRegionTree().Insert(temp_region_start, TempRegionSize, - KMemoryRegionType_KernelTemp)); + ASSERT(memory_layout->GetVirtualMemoryRegionTree().Insert(temp_region_start, TempRegionSize, + KMemoryRegionType_KernelTemp)); // Automatically map in devices that have auto-map attributes. - for (auto& region : memory_layout.GetPhysicalMemoryRegionTree()) { + for (auto& region : memory_layout->GetPhysicalMemoryRegionTree()) { // We only care about kernel regions. if (!region.IsDerivedFrom(KMemoryRegionType_Kernel)) { continue; @@ -500,21 +502,21 @@ struct KernelCore::Impl { const size_t map_size = Common::AlignUp(region.GetEndAddress(), PageSize) - map_phys_addr; const VAddr map_virt_addr = - memory_layout.GetVirtualMemoryRegionTree().GetRandomAlignedRegionWithGuard( + memory_layout->GetVirtualMemoryRegionTree().GetRandomAlignedRegionWithGuard( map_size, PageSize, KMemoryRegionType_KernelMisc, PageSize); - ASSERT(memory_layout.GetVirtualMemoryRegionTree().Insert( + ASSERT(memory_layout->GetVirtualMemoryRegionTree().Insert( map_virt_addr, map_size, KMemoryRegionType_KernelMiscMappedDevice)); region.SetPairAddress(map_virt_addr + region.GetAddress() - map_phys_addr); } - Init::SetupDramPhysicalMemoryRegions(memory_layout); + Init::SetupDramPhysicalMemoryRegions(*memory_layout); // Insert a physical region for the kernel code region. - ASSERT(memory_layout.GetPhysicalMemoryRegionTree().Insert( + ASSERT(memory_layout->GetPhysicalMemoryRegionTree().Insert( code_start_phys_addr, code_region_size, KMemoryRegionType_DramKernelCode)); // Insert a physical region for the kernel slab region. - ASSERT(memory_layout.GetPhysicalMemoryRegionTree().Insert( + ASSERT(memory_layout->GetPhysicalMemoryRegionTree().Insert( slab_start_phys_addr, slab_region_size, KMemoryRegionType_DramKernelSlab)); // Determine size available for kernel page table heaps, requiring > 8 MB. @@ -523,12 +525,12 @@ struct KernelCore::Impl { ASSERT(page_table_heap_size / 4_MiB > 2); // Insert a physical region for the kernel page table heap region - ASSERT(memory_layout.GetPhysicalMemoryRegionTree().Insert( + ASSERT(memory_layout->GetPhysicalMemoryRegionTree().Insert( slab_end_phys_addr, page_table_heap_size, KMemoryRegionType_DramKernelPtHeap)); // All DRAM regions that we haven't tagged by this point will be mapped under the linear // mapping. Tag them. 
- for (auto& region : memory_layout.GetPhysicalMemoryRegionTree()) { + for (auto& region : memory_layout->GetPhysicalMemoryRegionTree()) { if (region.GetType() == KMemoryRegionType_Dram) { // Check that the region is valid. ASSERT(region.GetEndAddress() != 0); @@ -540,7 +542,7 @@ struct KernelCore::Impl { // Get the linear region extents. const auto linear_extents = - memory_layout.GetPhysicalMemoryRegionTree().GetDerivedRegionExtents( + memory_layout->GetPhysicalMemoryRegionTree().GetDerivedRegionExtents( KMemoryRegionAttr_LinearMapped); ASSERT(linear_extents.GetEndAddress() != 0); @@ -552,7 +554,7 @@ struct KernelCore::Impl { Common::AlignUp(linear_extents.GetEndAddress(), LinearRegionAlign) - aligned_linear_phys_start; const VAddr linear_region_start = - memory_layout.GetVirtualMemoryRegionTree().GetRandomAlignedRegionWithGuard( + memory_layout->GetVirtualMemoryRegionTree().GetRandomAlignedRegionWithGuard( linear_region_size, LinearRegionAlign, KMemoryRegionType_None, LinearRegionAlign); const u64 linear_region_phys_to_virt_diff = linear_region_start - aligned_linear_phys_start; @@ -561,7 +563,7 @@ struct KernelCore::Impl { { PAddr cur_phys_addr = 0; u64 cur_size = 0; - for (auto& region : memory_layout.GetPhysicalMemoryRegionTree()) { + for (auto& region : memory_layout->GetPhysicalMemoryRegionTree()) { if (!region.HasTypeAttribute(KMemoryRegionAttr_LinearMapped)) { continue; } @@ -580,47 +582,47 @@ struct KernelCore::Impl { const VAddr region_virt_addr = region.GetAddress() + linear_region_phys_to_virt_diff; - ASSERT(memory_layout.GetVirtualMemoryRegionTree().Insert( + ASSERT(memory_layout->GetVirtualMemoryRegionTree().Insert( region_virt_addr, region.GetSize(), GetTypeForVirtualLinearMapping(region.GetType()))); region.SetPairAddress(region_virt_addr); KMemoryRegion* virt_region = - memory_layout.GetVirtualMemoryRegionTree().FindModifiable(region_virt_addr); + memory_layout->GetVirtualMemoryRegionTree().FindModifiable(region_virt_addr); ASSERT(virt_region != nullptr); virt_region->SetPairAddress(region.GetAddress()); } } // Insert regions for the initial page table region. - ASSERT(memory_layout.GetPhysicalMemoryRegionTree().Insert( + ASSERT(memory_layout->GetPhysicalMemoryRegionTree().Insert( resource_end_phys_addr, KernelPageTableHeapSize, KMemoryRegionType_DramKernelInitPt)); - ASSERT(memory_layout.GetVirtualMemoryRegionTree().Insert( + ASSERT(memory_layout->GetVirtualMemoryRegionTree().Insert( resource_end_phys_addr + linear_region_phys_to_virt_diff, KernelPageTableHeapSize, KMemoryRegionType_VirtualDramKernelInitPt)); // All linear-mapped DRAM regions that we haven't tagged by this point will be allocated to // some pool partition. Tag them. - for (auto& region : memory_layout.GetPhysicalMemoryRegionTree()) { + for (auto& region : memory_layout->GetPhysicalMemoryRegionTree()) { if (region.GetType() == (KMemoryRegionType_Dram | KMemoryRegionAttr_LinearMapped)) { region.SetType(KMemoryRegionType_DramPoolPartition); } } // Setup all other memory regions needed to arrange the pool partitions. - Init::SetupPoolPartitionMemoryRegions(memory_layout); + Init::SetupPoolPartitionMemoryRegions(*memory_layout); // Cache all linear regions in their own trees for faster access, later. 
- memory_layout.InitializeLinearMemoryRegionTrees(aligned_linear_phys_start, - linear_region_start); + memory_layout->InitializeLinearMemoryRegionTrees(aligned_linear_phys_start, + linear_region_start); } void InitializeMemoryLayout() { - const auto system_pool = memory_layout.GetKernelSystemPoolRegionPhysicalExtents(); + const auto system_pool = memory_layout->GetKernelSystemPoolRegionPhysicalExtents(); // Initialize the memory manager. memory_manager = std::make_unique(system); - const auto& management_region = memory_layout.GetPoolManagementRegion(); + const auto& management_region = memory_layout->GetPoolManagementRegion(); ASSERT(management_region.GetEndAddress() != 0); memory_manager->Initialize(management_region.GetAddress(), management_region.GetSize()); @@ -773,7 +775,7 @@ struct KernelCore::Impl { Kernel::KSharedMemory* hidbus_shared_mem{}; // Memory layout - KMemoryLayout memory_layout; + std::unique_ptr memory_layout; // Threads used for services std::unordered_set> service_threads; @@ -1149,7 +1151,7 @@ const KWorkerTaskManager& KernelCore::WorkerTaskManager() const { } const KMemoryLayout& KernelCore::MemoryLayout() const { - return impl->memory_layout; + return *impl->memory_layout; } bool KernelCore::IsPhantomModeForSingleCore() const { diff --git a/src/yuzu/configuration/config.cpp b/src/yuzu/configuration/config.cpp index 5865d6c51..5ac7a4533 100755 --- a/src/yuzu/configuration/config.cpp +++ b/src/yuzu/configuration/config.cpp @@ -632,6 +632,7 @@ void Config::ReadCpuValues() { ReadGlobalSetting(Settings::values.cpuopt_unsafe_ignore_standard_fpcr); ReadGlobalSetting(Settings::values.cpuopt_unsafe_inaccurate_nan); ReadGlobalSetting(Settings::values.cpuopt_unsafe_fastmem_check); + ReadGlobalSetting(Settings::values.cpuopt_unsafe_ignore_global_monitor); if (global) { ReadBasicSetting(Settings::values.cpu_debug_mode); @@ -644,6 +645,8 @@ void Config::ReadCpuValues() { ReadBasicSetting(Settings::values.cpuopt_misc_ir); ReadBasicSetting(Settings::values.cpuopt_reduce_misalign_checks); ReadBasicSetting(Settings::values.cpuopt_fastmem); + ReadBasicSetting(Settings::values.cpuopt_fastmem_exclusives); + ReadBasicSetting(Settings::values.cpuopt_recompile_exclusives); } qt_config->endGroup(); @@ -1173,6 +1176,7 @@ void Config::SaveCpuValues() { WriteGlobalSetting(Settings::values.cpuopt_unsafe_ignore_standard_fpcr); WriteGlobalSetting(Settings::values.cpuopt_unsafe_inaccurate_nan); WriteGlobalSetting(Settings::values.cpuopt_unsafe_fastmem_check); + WriteGlobalSetting(Settings::values.cpuopt_unsafe_ignore_global_monitor); if (global) { WriteBasicSetting(Settings::values.cpu_debug_mode); diff --git a/src/yuzu/configuration/configure_cpu.cpp b/src/yuzu/configuration/configure_cpu.cpp index f66cab5d4..bf74ccc7c 100755 --- a/src/yuzu/configuration/configure_cpu.cpp +++ b/src/yuzu/configuration/configure_cpu.cpp @@ -36,6 +36,7 @@ void ConfigureCpu::SetConfiguration() { ui->cpuopt_unsafe_ignore_standard_fpcr->setEnabled(runtime_lock); ui->cpuopt_unsafe_inaccurate_nan->setEnabled(runtime_lock); ui->cpuopt_unsafe_fastmem_check->setEnabled(runtime_lock); + ui->cpuopt_unsafe_ignore_global_monitor->setEnabled(runtime_lock); ui->cpuopt_unsafe_unfuse_fma->setChecked(Settings::values.cpuopt_unsafe_unfuse_fma.GetValue()); ui->cpuopt_unsafe_reduce_fp_error->setChecked( @@ -46,6 +47,8 @@ void ConfigureCpu::SetConfiguration() { Settings::values.cpuopt_unsafe_inaccurate_nan.GetValue()); ui->cpuopt_unsafe_fastmem_check->setChecked( Settings::values.cpuopt_unsafe_fastmem_check.GetValue()); + 
ui->cpuopt_unsafe_ignore_global_monitor->setChecked( + Settings::values.cpuopt_unsafe_ignore_global_monitor.GetValue()); if (Settings::IsConfiguringGlobal()) { ui->accuracy->setCurrentIndex(static_cast(Settings::values.cpu_accuracy.GetValue())); @@ -82,6 +85,9 @@ void ConfigureCpu::ApplyConfiguration() { ConfigurationShared::ApplyPerGameSetting(&Settings::values.cpuopt_unsafe_fastmem_check, ui->cpuopt_unsafe_fastmem_check, cpuopt_unsafe_fastmem_check); + ConfigurationShared::ApplyPerGameSetting(&Settings::values.cpuopt_unsafe_ignore_global_monitor, + ui->cpuopt_unsafe_ignore_global_monitor, + cpuopt_unsafe_ignore_global_monitor); } void ConfigureCpu::changeEvent(QEvent* event) { @@ -120,4 +126,7 @@ void ConfigureCpu::SetupPerGameUI() { ConfigurationShared::SetColoredTristate(ui->cpuopt_unsafe_fastmem_check, Settings::values.cpuopt_unsafe_fastmem_check, cpuopt_unsafe_fastmem_check); + ConfigurationShared::SetColoredTristate(ui->cpuopt_unsafe_ignore_global_monitor, + Settings::values.cpuopt_unsafe_ignore_global_monitor, + cpuopt_unsafe_ignore_global_monitor); } diff --git a/src/yuzu/configuration/configure_cpu.h b/src/yuzu/configuration/configure_cpu.h index ed9af0e9f..733e38be4 100755 --- a/src/yuzu/configuration/configure_cpu.h +++ b/src/yuzu/configuration/configure_cpu.h @@ -45,6 +45,7 @@ private: ConfigurationShared::CheckState cpuopt_unsafe_ignore_standard_fpcr; ConfigurationShared::CheckState cpuopt_unsafe_inaccurate_nan; ConfigurationShared::CheckState cpuopt_unsafe_fastmem_check; + ConfigurationShared::CheckState cpuopt_unsafe_ignore_global_monitor; const Core::System& system; }; diff --git a/src/yuzu/configuration/configure_cpu.ui b/src/yuzu/configuration/configure_cpu.ui index d8064db24..5d80a8c91 100755 --- a/src/yuzu/configuration/configure_cpu.ui +++ b/src/yuzu/configuration/configure_cpu.ui @@ -150,6 +150,18 @@ + + + + + <div>This option improves speed by relying only on the semantics of cmpxchg to ensure safety of exclusive access instructions. 
Please note this may result in deadlocks and other race conditions.</div> + + + + Ignore global monitor + + + diff --git a/src/yuzu/configuration/configure_cpu_debug.cpp b/src/yuzu/configuration/configure_cpu_debug.cpp index 05a90963d..616a0be75 100755 --- a/src/yuzu/configuration/configure_cpu_debug.cpp +++ b/src/yuzu/configuration/configure_cpu_debug.cpp @@ -44,6 +44,12 @@ void ConfigureCpuDebug::SetConfiguration() { Settings::values.cpuopt_reduce_misalign_checks.GetValue()); ui->cpuopt_fastmem->setEnabled(runtime_lock); ui->cpuopt_fastmem->setChecked(Settings::values.cpuopt_fastmem.GetValue()); + ui->cpuopt_fastmem_exclusives->setEnabled(runtime_lock); + ui->cpuopt_fastmem_exclusives->setChecked( + Settings::values.cpuopt_fastmem_exclusives.GetValue()); + ui->cpuopt_recompile_exclusives->setEnabled(runtime_lock); + ui->cpuopt_recompile_exclusives->setChecked( + Settings::values.cpuopt_recompile_exclusives.GetValue()); } void ConfigureCpuDebug::ApplyConfiguration() { @@ -56,6 +62,8 @@ void ConfigureCpuDebug::ApplyConfiguration() { Settings::values.cpuopt_misc_ir = ui->cpuopt_misc_ir->isChecked(); Settings::values.cpuopt_reduce_misalign_checks = ui->cpuopt_reduce_misalign_checks->isChecked(); Settings::values.cpuopt_fastmem = ui->cpuopt_fastmem->isChecked(); + Settings::values.cpuopt_fastmem_exclusives = ui->cpuopt_fastmem_exclusives->isChecked(); + Settings::values.cpuopt_recompile_exclusives = ui->cpuopt_recompile_exclusives->isChecked(); } void ConfigureCpuDebug::changeEvent(QEvent* event) { diff --git a/src/yuzu/configuration/configure_cpu_debug.ui b/src/yuzu/configuration/configure_cpu_debug.ui index 6e635bb2f..2bc268810 100755 --- a/src/yuzu/configuration/configure_cpu_debug.ui +++ b/src/yuzu/configuration/configure_cpu_debug.ui @@ -144,7 +144,34 @@ - Enable Host MMU Emulation + Enable Host MMU Emulation (general memory instructions) + + + + + + + + <div style="white-space: nowrap">This optimization speeds up exclusive memory accesses by the guest program.</div> + <div style="white-space: nowrap">Enabling it causes guest exclusive memory reads/writes to be done directly into memory and make use of Host's MMU.</div> + <div style="white-space: nowrap">Disabling this forces all exclusive memory accesses to use Software MMU Emulation.</div> + + + + Enable Host MMU Emulation (exclusive memory instructions) + + + + + + + + <div style="white-space: nowrap">This optimization speeds up exclusive memory accesses by the guest program.</div> + <div style="white-space: nowrap">Enabling it reduces the overhead of fastmem failure of exclusive memory accesses.</div> + + + + Enable recompilation of exclusive memory instructions diff --git a/src/yuzu_cmd/config.cpp b/src/yuzu_cmd/config.cpp index 30963a8bb..b74411c84 100755 --- a/src/yuzu_cmd/config.cpp +++ b/src/yuzu_cmd/config.cpp @@ -280,11 +280,14 @@ void Config::ReadValues() { ReadSetting("Cpu", Settings::values.cpuopt_misc_ir); ReadSetting("Cpu", Settings::values.cpuopt_reduce_misalign_checks); ReadSetting("Cpu", Settings::values.cpuopt_fastmem); + ReadSetting("Cpu", Settings::values.cpuopt_fastmem_exclusives); + ReadSetting("Cpu", Settings::values.cpuopt_recompile_exclusives); ReadSetting("Cpu", Settings::values.cpuopt_unsafe_unfuse_fma); ReadSetting("Cpu", Settings::values.cpuopt_unsafe_reduce_fp_error); ReadSetting("Cpu", Settings::values.cpuopt_unsafe_ignore_standard_fpcr); ReadSetting("Cpu", Settings::values.cpuopt_unsafe_inaccurate_nan); ReadSetting("Cpu", Settings::values.cpuopt_unsafe_fastmem_check); + ReadSetting("Cpu", 
Settings::values.cpuopt_unsafe_ignore_global_monitor);
 
     // Renderer
     ReadSetting("Renderer", Settings::values.renderer_backend);
diff --git a/src/yuzu_cmd/default_ini.h b/src/yuzu_cmd/default_ini.h
index 6d613bf7a..3ac1440c9 100755
--- a/src/yuzu_cmd/default_ini.h
+++ b/src/yuzu_cmd/default_ini.h
@@ -174,6 +174,14 @@ cpuopt_reduce_misalign_checks =
 # 0: Disabled, 1 (default): Enabled
 cpuopt_fastmem =
 
+# Enable Host MMU Emulation for exclusive memory instructions (faster guest memory access)
+# 0: Disabled, 1 (default): Enabled
+cpuopt_fastmem_exclusives =
+
+# Enable fallback on failure of fastmem of exclusive memory instructions (faster guest memory access)
+# 0: Disabled, 1 (default): Enabled
+cpuopt_recompile_exclusives =
+
 # Enable unfuse FMA (improve performance on CPUs without FMA)
 # Only enabled if cpu_accuracy is set to Unsafe. Automatically chosen with cpu_accuracy = Auto-select.
 # 0: Disabled, 1 (default): Enabled
@@ -199,6 +207,11 @@ cpuopt_unsafe_inaccurate_nan =
 # 0: Disabled, 1 (default): Enabled
 cpuopt_unsafe_fastmem_check =
 
+# Enable faster exclusive instructions
+# Only enabled if cpu_accuracy is set to Unsafe. Automatically chosen with cpu_accuracy = Auto-select.
+# 0: Disabled, 1 (default): Enabled
+cpuopt_unsafe_ignore_global_monitor =
+
 [Renderer]
 # Which backend API to use.
 # 0 (default): OpenGL, 1: Vulkan
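The tooltip added to configure_cpu.ui above describes Unsafe_IgnoreGlobalMonitor as relying only on the semantics of cmpxchg to keep exclusive access instructions safe. As a rough illustration of that idea only (this is not dynarmic's emitter code, and every name below is hypothetical), a guest exclusive store can be approximated by a single host compare-exchange against the value observed by the matching exclusive load:

```cpp
#include <atomic>
#include <cstdint>
#include <cstdio>

// Hypothetical sketch: a STREX-style conditional store reduced to one host
// cmpxchg. The store succeeds only if the word still holds the value seen by
// the exclusive load; that is the property the option leans on, and it skips
// the global monitor bookkeeping entirely (hence "unsafe").
bool ExclusiveStore32(std::atomic<std::uint32_t>& word, std::uint32_t expected,
                      std::uint32_t desired) {
    return word.compare_exchange_strong(expected, desired, std::memory_order_seq_cst);
}

int main() {
    std::atomic<std::uint32_t> guest_word{41};

    const std::uint32_t loaded = guest_word.load();              // stand-in for LDREX/LDAXR
    if (ExclusiveStore32(guest_word, loaded, loaded + 1)) {      // stand-in for STREX/STLXR
        std::printf("exclusive store succeeded: %u\n", guest_word.load());
    } else {
        std::printf("exclusive store failed; the guest would retry\n");
    }
}
```

The tooltip's caveat still applies: without the global monitor, guest code that depends on cross-core exclusive-clear behaviour can deadlock or race.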
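The k_address_arbiter.cpp hunks above now pass current_core into ClearExclusive, matching the per-core ClearExclusive(std::size_t) signature introduced on Core::ExclusiveMonitor. A minimal sketch of how such an interface is driven, using a cut-down stand-in for the monitor (the helper name and retry shape are illustrative, not yuzu's exact arbiter code):

```cpp
#include <cstddef>
#include <cstdint>

// Cut-down stand-in for Core::ExclusiveMonitor, keeping only the members used
// below, with the per-core signatures shown in the diff.
class ExclusiveMonitor {
public:
    virtual ~ExclusiveMonitor() = default;
    virtual std::uint32_t ExclusiveRead32(std::size_t core_index, std::uint64_t addr) = 0;
    virtual bool ExclusiveWrite32(std::size_t core_index, std::uint64_t addr, std::uint32_t value) = 0;
    virtual void ClearExclusive(std::size_t core_index) = 0;
};

// Illustrative conditional decrement: read under an exclusive hold, then either
// attempt the exclusive write (retrying if another core intervened) or release
// this core's hold via ClearExclusive, as the arbiter hunks do.
bool DecrementIfLessThanSketch(ExclusiveMonitor& monitor, std::size_t current_core,
                               std::uint64_t address, std::int32_t value, std::int32_t& out) {
    while (true) {
        const auto current =
            static_cast<std::int32_t>(monitor.ExclusiveRead32(current_core, address));
        out = current;
        if (current >= value) {
            // Otherwise, clear our exclusive hold and finish.
            monitor.ClearExclusive(current_core);
            return false;
        }
        // A failed exclusive write means another core wrote the word; re-read and retry.
        if (monitor.ExclusiveWrite32(current_core, address,
                                     static_cast<std::uint32_t>(current - 1))) {
            return true;
        }
    }
}
```

The real arbiter code does more around this, and its return value has different semantics; the sketch only shows that every operation, including the clear, is now keyed by the calling core.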