early-access version 1279

2021-01-02 22:47:26 +01:00 · 2021-01-02 22:47:26 +01:00 · f7b2c59575
commit f7b2c59575
parent 84d5e05316
35 changed files with 399 additions and 112 deletions
--- a/README.md
+++ b/README.md
@ -1,7 +1,7 @@
 yuzu emulator early access
 =============

-This is the source code for early-access 1277.
+This is the source code for early-access 1279.

 ## Legal Notice

--- a/externals/dynarmic/include/dynarmic/optimization_flags.h
+++ b/externals/dynarmic/include/dynarmic/optimization_flags.h
@ -39,6 +39,9 @@ enum class OptimizationFlag : std::uint32_t {
    /// This is an UNSAFE optimization that reduces accuracy of certain floating-point instructions.
    /// This allows results of FRECPE and FRSQRTE to have **less** error than spec allows.
    Unsafe_ReducedErrorFP   = 0x00020000,
+    /// This is an UNSAFE optimization that causes floating-point instructions to not produce correct NaNs.
+    /// This may also result in inaccurate results when instructions are given certain special values.
+    Unsafe_InaccurateNaN    = 0x00040000,
 };

 constexpr OptimizationFlag no_optimizations = static_cast<OptimizationFlag>(0);
--- a/externals/dynarmic/src/backend/x64/emit_x64_data_processing.cpp
+++ b/externals/dynarmic/src/backend/x64/emit_x64_data_processing.cpp
@ -1080,29 +1080,40 @@ static void EmitSub(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, int bit

    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    auto& carry_in = args[2];
+    const bool is_cmp = inst->UseCount() == size_t(!!carry_inst + !!overflow_inst + !!nzcv_inst) && carry_in.IsImmediate() && carry_in.GetImmediateU1();

    const Xbyak::Reg64 nzcv = DoNZCV(code, ctx.reg_alloc, nzcv_inst);
-    const Xbyak::Reg result = ctx.reg_alloc.UseScratchGpr(args[0]).changeBit(bitsize);
+    const Xbyak::Reg result = (is_cmp ? ctx.reg_alloc.UseGpr(args[0]) : ctx.reg_alloc.UseScratchGpr(args[0])).changeBit(bitsize);
    const Xbyak::Reg8 carry = DoCarry(ctx.reg_alloc, carry_in, carry_inst);
    const Xbyak::Reg8 overflow = overflow_inst ? ctx.reg_alloc.ScratchGpr().cvt8() : Xbyak::Reg8{-1};

    // TODO: Consider using LEA.
-    // TODO: Optimize CMP case.
    // Note that x64 CF is inverse of what the ARM carry flag is here.

-    if (args[1].IsImmediate() && args[1].GetType() == IR::Type::U32) {
+    bool invert_output_carry = true;
+
+    if (is_cmp) {
+        if (args[1].IsImmediate() && args[1].GetType() == IR::Type::U32) {
+            const u32 op_arg = args[1].GetImmediateU32();
+            code.cmp(result, op_arg);
+        } else {
+            OpArg op_arg = ctx.reg_alloc.UseOpArg(args[1]);
+            op_arg.setBit(bitsize);
+            code.cmp(result, *op_arg);
+        }
+    } else if (args[1].IsImmediate() && args[1].GetType() == IR::Type::U32) {
        const u32 op_arg = args[1].GetImmediateU32();
        if (carry_in.IsImmediate()) {
            if (carry_in.GetImmediateU1()) {
                code.sub(result, op_arg);
            } else {
-                code.stc();
-                code.sbb(result, op_arg);
+                code.add(result, ~op_arg);
+                invert_output_carry = false;
            }
        } else {
            code.bt(carry.cvt32(), 0);
-            code.cmc();
-            code.sbb(result, op_arg);
+            code.adc(result, ~op_arg);
+            invert_output_carry = false;
        }
    } else {
        OpArg op_arg = ctx.reg_alloc.UseOpArg(args[1]);
@ -1122,14 +1133,20 @@ static void EmitSub(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, int bit
    }

    if (nzcv_inst) {
-        code.cmc();
+        if (invert_output_carry) {
+            code.cmc();
+        }
        code.lahf();
        code.seto(code.al);
        ctx.reg_alloc.DefineValue(nzcv_inst, nzcv);
        ctx.EraseInstruction(nzcv_inst);
    }
    if (carry_inst) {
-        code.setnc(carry);
+        if (invert_output_carry) {
+            code.setnc(carry);
+        } else {
+            code.setc(carry);
+        }
        ctx.reg_alloc.DefineValue(carry_inst, carry);
        ctx.EraseInstruction(carry_inst);
    }
@ -1138,8 +1155,9 @@ static void EmitSub(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, int bit
        ctx.reg_alloc.DefineValue(overflow_inst, overflow);
        ctx.EraseInstruction(overflow_inst);
    }
-
-    ctx.reg_alloc.DefineValue(inst, result);
+    if (!is_cmp) {
+        ctx.reg_alloc.DefineValue(inst, result);
+    }
 }

 void EmitX64::EmitSub32(EmitContext& ctx, IR::Inst* inst) {
--- a/externals/dynarmic/src/backend/x64/emit_x64_floating_point.cpp
+++ b/externals/dynarmic/src/backend/x64/emit_x64_floating_point.cpp
@ -257,7 +257,7 @@ void FPTwoOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {

    Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);

-    if (!ctx.FPCR().DN()) {
+    if (!ctx.FPCR().DN() && !ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
        end = ProcessNaN<fsize>(code, result);
    }
    if constexpr (std::is_member_function_pointer_v<Function>) {
@ -265,7 +265,9 @@ void FPTwoOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn) {
    } else {
        fn(result);
    }
-    if (ctx.FPCR().DN()) {
+    if (ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
+        // Do nothing
+    } else if (ctx.FPCR().DN()) {
        ForceToDefaultNaN<fsize>(code, result);
    } else {
        PostProcessNaN<fsize>(code, result, ctx.reg_alloc.ScratchXmm());
@ -281,6 +283,20 @@ void FPThreeOp(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Function fn)

    auto args = ctx.reg_alloc.GetArgumentInfo(inst);

+    if (ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
+        const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+        const Xbyak::Xmm operand = ctx.reg_alloc.UseScratchXmm(args[1]);
+
+        if constexpr (std::is_member_function_pointer_v<Function>) {
+            (code.*fn)(result, operand);
+        } else {
+            fn(result, operand);
+        }
+
+        ctx.reg_alloc.DefineValue(inst, result);
+        return;
+    }
+
    if (ctx.FPCR().DN()) {
        const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
        const Xbyak::Xmm operand = ctx.reg_alloc.UseScratchXmm(args[1]);
@ -590,9 +606,20 @@ static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
    using FPT = mp::unsigned_integer_of_size<fsize>;

    if constexpr (fsize != 16) {
-        if (code.HasFMA()) {
-            auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+        auto args = ctx.reg_alloc.GetArgumentInfo(inst);

+        if (code.HasFMA() && ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
+            const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+            const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
+            const Xbyak::Xmm operand3 = ctx.reg_alloc.UseXmm(args[2]);
+
+            FCODE(vfmadd231s)(result, operand2, operand3);
+
+            ctx.reg_alloc.DefineValue(inst, result);
+            return;
+        }
+
+        if (code.HasFMA()) {
            Xbyak::Label end, fallback;

            const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
@ -641,8 +668,6 @@ static void EmitFPMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
        }

        if (ctx.HasOptimization(OptimizationFlag::Unsafe_UnfuseFMA)) {
-            auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
            const Xbyak::Xmm operand1 = ctx.reg_alloc.UseScratchXmm(args[0]);
            const Xbyak::Xmm operand2 = ctx.reg_alloc.UseScratchXmm(args[1]);
            const Xbyak::Xmm operand3 = ctx.reg_alloc.UseXmm(args[2]);
@ -810,6 +835,22 @@ static void EmitFPRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst*
    using FPT = mp::unsigned_integer_of_size<fsize>;

    if constexpr (fsize != 16) {
+        if (code.HasFMA() && ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
+            auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+            Xbyak::Label end, fallback;
+
+            const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
+            const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
+            const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+
+            code.movaps(result, code.MConst(xword, FP::FPValue<FPT, false, 0, 2>()));
+            FCODE(vfnmadd231s)(result, operand1, operand2);
+
+            ctx.reg_alloc.DefineValue(inst, result);
+            return;
+        }
+
        if (code.HasFMA()) {
            auto args = ctx.reg_alloc.GetArgumentInfo(inst);

@ -998,6 +1039,21 @@ static void EmitFPRSqrtStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst*
    using FPT = mp::unsigned_integer_of_size<fsize>;

    if constexpr (fsize != 16) {
+        if (code.HasFMA() && code.HasAVX() && ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
+            auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+            const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
+            const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
+            const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+
+            code.vmovaps(result, code.MConst(xword, FP::FPValue<FPT, false, 0, 3>()));
+            FCODE(vfnmadd231s)(result, operand1, operand2);
+            FCODE(vmuls)(result, result, code.MConst(xword, FP::FPValue<FPT, false, -1, 1>()));
+
+            ctx.reg_alloc.DefineValue(inst, result);
+            return;
+        }
+
        if (code.HasFMA() && code.HasAVX()) {
            auto args = ctx.reg_alloc.GetArgumentInfo(inst);

--- a/externals/dynarmic/src/backend/x64/emit_x64_vector_floating_point.cpp
+++ b/externals/dynarmic/src/backend/x64/emit_x64_vector_floating_point.cpp
@ -290,7 +290,7 @@ void EmitTwoOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* ins
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    const bool fpcr_controlled = args[fpcr_controlled_arg_index].GetImmediateU1();

-    if (ctx.FPCR(fpcr_controlled).DN()) {
+    if (ctx.FPCR(fpcr_controlled).DN() || ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
        Xbyak::Xmm result;

        if constexpr (std::is_member_function_pointer_v<Function>) {
@ -306,7 +306,9 @@ void EmitTwoOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* ins
            });
        }

-        ForceToDefaultNaN<fsize>(code, ctx.FPCR(fpcr_controlled), result);
+        if (!ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
+            ForceToDefaultNaN<fsize>(code, ctx.FPCR(fpcr_controlled), result);
+        }

        ctx.reg_alloc.DefineValue(inst, result);
        return;
@ -342,7 +344,7 @@ void EmitThreeOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* i
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
    const bool fpcr_controlled = args[2].GetImmediateU1();

-    if (ctx.FPCR(fpcr_controlled).DN()) {
+    if (ctx.FPCR(fpcr_controlled).DN() || ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
        const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseScratchXmm(args[0]);
        const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);

@ -356,7 +358,9 @@ void EmitThreeOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* i
            });
        }

-        ForceToDefaultNaN<fsize>(code, ctx.FPCR(fpcr_controlled), xmm_a);
+        if (!ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
+            ForceToDefaultNaN<fsize>(code, ctx.FPCR(fpcr_controlled), xmm_a);
+        }

        ctx.reg_alloc.DefineValue(inst, xmm_a);
        return;
@ -985,11 +989,23 @@ void EmitFPVectorMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
    };

    if constexpr (fsize != 16) {
+        auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+        const bool fpcr_controlled = args[3].GetImmediateU1();
+
+        if (code.HasFMA() && code.HasAVX() && ctx.HasOptimization(OptimizationFlag::Unsafe_UnfuseFMA)) {
+            const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
+            const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
+            const Xbyak::Xmm xmm_c = ctx.reg_alloc.UseXmm(args[2]);
+
+            MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&]{
+                FCODE(vfmadd231p)(result, xmm_b, xmm_c);
+            });
+
+            ctx.reg_alloc.DefineValue(inst, result);
+            return;
+        }
+
        if (code.HasFMA() && code.HasAVX()) {
-            auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
-            const bool fpcr_controlled = args[3].GetImmediateU1();
-
            const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
            const Xbyak::Xmm xmm_a = ctx.reg_alloc.UseXmm(args[0]);
            const Xbyak::Xmm xmm_b = ctx.reg_alloc.UseXmm(args[1]);
@ -1025,8 +1041,6 @@ void EmitFPVectorMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
        }

        if (ctx.HasOptimization(OptimizationFlag::Unsafe_UnfuseFMA)) {
-            auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
            const Xbyak::Xmm operand1 = ctx.reg_alloc.UseScratchXmm(args[0]);
            const Xbyak::Xmm operand2 = ctx.reg_alloc.UseScratchXmm(args[1]);
            const Xbyak::Xmm operand3 = ctx.reg_alloc.UseXmm(args[2]);
@ -1233,10 +1247,24 @@ static void EmitRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
    };

    if constexpr (fsize != 16) {
-        if (code.HasFMA() && code.HasAVX()) {
-            auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-            const bool fpcr_controlled = args[2].GetImmediateU1();
+        auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+        const bool fpcr_controlled = args[2].GetImmediateU1();

+        if (code.HasFMA() && code.HasAVX() && ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
+            const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+            const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
+            const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
+
+            MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&]{
+                code.movaps(result, GetVectorOf<fsize, false, 0, 2>(code));
+                FCODE(vfnmadd231p)(result, operand1, operand2);
+            });
+
+            ctx.reg_alloc.DefineValue(inst, result);
+            return;
+        }
+
+        if (code.HasFMA() && code.HasAVX()) {
            const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
            const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
            const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
@ -1269,8 +1297,6 @@ static void EmitRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
        }

        if (ctx.HasOptimization(OptimizationFlag::Unsafe_UnfuseFMA)) {
-            auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
            const Xbyak::Xmm operand1 = ctx.reg_alloc.UseScratchXmm(args[0]);
            const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
            const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
@ -1428,10 +1454,25 @@ static void EmitRSqrtStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
    };

    if constexpr (fsize != 16) {
-        if (code.HasFMA() && code.HasAVX()) {
-            auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-            const bool fpcr_controlled = args[2].GetImmediateU1();
+        auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+        const bool fpcr_controlled = args[2].GetImmediateU1();

+        if (code.HasFMA() && code.HasAVX() && ctx.HasOptimization(OptimizationFlag::Unsafe_InaccurateNaN)) {
+            const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
+            const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
+            const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
+
+            MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&]{
+                code.vmovaps(result, GetVectorOf<fsize, false, 0, 3>(code));
+                FCODE(vfnmadd231p)(result, operand1, operand2);
+                FCODE(vmulp)(result, result, GetVectorOf<fsize, false, -1, 1>(code));
+            });
+
+            ctx.reg_alloc.DefineValue(inst, result);
+            return;
+        }
+
+        if (code.HasFMA() && code.HasAVX()) {
            const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
            const Xbyak::Xmm operand1 = ctx.reg_alloc.UseXmm(args[0]);
            const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
@ -1470,8 +1511,6 @@ static void EmitRSqrtStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
        }

        if (ctx.HasOptimization(OptimizationFlag::Unsafe_UnfuseFMA)) {
-            auto args = ctx.reg_alloc.GetArgumentInfo(inst);
-
            const Xbyak::Xmm operand1 = ctx.reg_alloc.UseScratchXmm(args[0]);
            const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
            const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
--- a/externals/dynarmic/src/frontend/A64/translate/impl/data_processing_conditional_compare.cpp
+++ b/externals/dynarmic/src/frontend/A64/translate/impl/data_processing_conditional_compare.cpp
@ -27,7 +27,7 @@ bool TranslatorVisitor::CCMP_reg(bool sf, Reg Rm, Cond cond, Reg Rn, Imm<4> nzcv
    const IR::U32U64 operand1 = X(datasize, Rn);
    const IR::U32U64 operand2 = X(datasize, Rm);

-    const IR::NZCV then_flags = ir.NZCVFrom(ir.AddWithCarry(operand1, ir.Not(operand2), ir.Imm1(1)));
+    const IR::NZCV then_flags = ir.NZCVFrom(ir.SubWithCarry(operand1, operand2, ir.Imm1(1)));
    const IR::NZCV else_flags = ir.NZCVFromPackedFlags(ir.Imm32(flags));
    ir.SetNZCV(ir.ConditionalSelect(cond, then_flags, else_flags));
    return true;
@ -53,7 +53,7 @@ bool TranslatorVisitor::CCMP_imm(bool sf, Imm<5> imm5, Cond cond, Reg Rn, Imm<4>
    const IR::U32U64 operand1 = X(datasize, Rn);
    const IR::U32U64 operand2 = I(datasize, imm5.ZeroExtend<u32>());

-    const IR::NZCV then_flags = ir.NZCVFrom(ir.AddWithCarry(operand1, ir.Not(operand2), ir.Imm1(1)));
+    const IR::NZCV then_flags = ir.NZCVFrom(ir.SubWithCarry(operand1, operand2, ir.Imm1(1)));
    const IR::NZCV else_flags = ir.NZCVFromPackedFlags(ir.Imm32(flags));
    ir.SetNZCV(ir.ConditionalSelect(cond, then_flags, else_flags));
    return true;
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@ -98,7 +98,6 @@ add_library(common STATIC
    algorithm.h
    alignment.h
    assert.h
-    atomic_ops.cpp
    atomic_ops.h
    detached_tasks.cpp
    detached_tasks.h
--- a/src/common/atomic_ops.h
+++ b/src/common/atomic_ops.h
@ -4,14 +4,75 @@

 #pragma once

+#include <cstring>
+#include <memory>
+
 #include "common/common_types.h"

+#if _MSC_VER
+#include <intrin.h>
+#endif
+
 namespace Common {

-[[nodiscard]] bool AtomicCompareAndSwap(volatile u8* pointer, u8 value, u8 expected);
-[[nodiscard]] bool AtomicCompareAndSwap(volatile u16* pointer, u16 value, u16 expected);
-[[nodiscard]] bool AtomicCompareAndSwap(volatile u32* pointer, u32 value, u32 expected);
-[[nodiscard]] bool AtomicCompareAndSwap(volatile u64* pointer, u64 value, u64 expected);
-[[nodiscard]] bool AtomicCompareAndSwap(volatile u64* pointer, u128 value, u128 expected);
+#if _MSC_VER
+
+[[nodiscard]] inline bool AtomicCompareAndSwap(volatile u8* pointer, u8 value, u8 expected) {
+    const u8 result =
+        _InterlockedCompareExchange8(reinterpret_cast<volatile char*>(pointer), value, expected);
+    return result == expected; // intrinsic yields the old *pointer; equality means it swapped
+}
+
+[[nodiscard]] inline bool AtomicCompareAndSwap(volatile u16* pointer, u16 value, u16 expected) {
+    const u16 result =
+        _InterlockedCompareExchange16(reinterpret_cast<volatile short*>(pointer), value, expected);
+    return result == expected; // intrinsic yields the old *pointer; equality means it swapped
+}
+
+[[nodiscard]] inline bool AtomicCompareAndSwap(volatile u32* pointer, u32 value, u32 expected) {
+    const u32 result =
+        _InterlockedCompareExchange(reinterpret_cast<volatile long*>(pointer), value, expected);
+    return result == expected; // intrinsic yields the old *pointer; equality means it swapped
+}
+
+[[nodiscard]] inline bool AtomicCompareAndSwap(volatile u64* pointer, u64 value, u64 expected) {
+    const u64 result = _InterlockedCompareExchange64(reinterpret_cast<volatile __int64*>(pointer),
+                                                     value, expected);
+    return result == expected; // intrinsic yields the old *pointer; equality means it swapped
+}
+
+[[nodiscard]] inline bool AtomicCompareAndSwap(volatile u64* pointer, u128 value, u128 expected) {
+    return _InterlockedCompareExchange128(reinterpret_cast<volatile __int64*>(pointer), value[1],
+                                          value[0], // order: (dest, high = [1], low = [0], cmp)
+                                          reinterpret_cast<__int64*>(expected.data())) != 0;
+}
+
+#else
+
+[[nodiscard]] inline bool AtomicCompareAndSwap(volatile u8* pointer, u8 value, u8 expected) {
+    return __sync_bool_compare_and_swap(pointer, expected, value); // builtin: (ptr, old, new)
+}
+
+[[nodiscard]] inline bool AtomicCompareAndSwap(volatile u16* pointer, u16 value, u16 expected) {
+    return __sync_bool_compare_and_swap(pointer, expected, value); // builtin: (ptr, old, new)
+}
+
+[[nodiscard]] inline bool AtomicCompareAndSwap(volatile u32* pointer, u32 value, u32 expected) {
+    return __sync_bool_compare_and_swap(pointer, expected, value); // builtin: (ptr, old, new)
+}
+
+[[nodiscard]] inline bool AtomicCompareAndSwap(volatile u64* pointer, u64 value, u64 expected) {
+    return __sync_bool_compare_and_swap(pointer, expected, value); // builtin: (ptr, old, new)
+}
+
+[[nodiscard]] inline bool AtomicCompareAndSwap(volatile u64* pointer, u128 value, u128 expected) {
+    unsigned __int128 value_a;    // native 128-bit copy of `value`
+    unsigned __int128 expected_a; // native 128-bit copy of `expected`
+    std::memcpy(&value_a, value.data(), sizeof(u128)); // byte copy instead of a pointer cast
+    std::memcpy(&expected_a, expected.data(), sizeof(u128));
+    return __sync_bool_compare_and_swap((unsigned __int128*)pointer, expected_a, value_a);
+}
+
+#endif

 } // namespace Common
--- a/src/common/page_table.h
+++ b/src/common/page_table.h
@ -90,7 +90,7 @@ struct PageTable {
    PageTable& operator=(PageTable&&) noexcept = default;

    /**
-     * Resizes the page table to be able to accomodate enough pages within
+     * Resizes the page table to be able to accommodate enough pages within
     * a given address space.
     *
     * @param address_space_width_in_bits The address size width in bits.
--- a/src/common/swap.h
+++ b/src/common/swap.h
@ -394,7 +394,7 @@ public:
    template <typename S, typename T2, typename F2>
    friend S operator%(const S& p, const swapped_t v);

-    // Arithmetics + assignements
+    // Arithmetics + assignments
    template <typename S, typename T2, typename F2>
    friend S operator+=(const S& p, const swapped_t v);

@ -451,7 +451,7 @@ S operator%(const S& i, const swap_struct_t<T, F> v) {
    return i % v.swap();
 }

-// Arithmetics + assignements
+// Arithmetics + assignments
 template <typename S, typename T, typename F>
 S& operator+=(S& i, const swap_struct_t<T, F> v) {
    i += v.swap();
--- a/src/common/x64/native_clock.cpp
+++ b/src/common/x64/native_clock.cpp
@ -2,19 +2,74 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.

+#include <array>
 #include <chrono>
+#include <limits>
 #include <mutex>
 #include <thread>

 #ifdef _MSC_VER
 #include <intrin.h>
+
+#pragma intrinsic(__umulh)
+#pragma intrinsic(_udiv128)
 #else
 #include <x86intrin.h>
 #endif

+#include "common/atomic_ops.h"
 #include "common/uint128.h"
 #include "common/x64/native_clock.h"

+namespace {
+
+[[nodiscard]] u64 GetFixedPoint64Factor(u64 numerator, u64 divisor) {
+#ifdef __SIZEOF_INT128__ // NOTE(review): quotient fits u64 only if numerator < divisor
+    const auto base = static_cast<unsigned __int128>(numerator) << 64ULL; // numerator * 2^64
+    return static_cast<u64>(base / divisor); // keeps the low 64 bits of the quotient
+#elif defined(_M_X64) || defined(_M_ARM64)
+    std::array<u64, 2> r = {0, numerator}; // r[0] = low word (0), r[1] = high word
+    u64 remainder; // out-parameter only; never read afterwards
+#if _MSC_VER < 1923
+    return udiv128(r[1], r[0], divisor, &remainder);
+#else
+    return _udiv128(r[1], r[0], divisor, &remainder);
+#endif
+#else
+    // This one is a bit more inaccurate (the scale passed below is 2^64 - 1, not 2^64).
+    return MultiplyAndDivide64(std::numeric_limits<u64>::max(), numerator, divisor);
+#endif
+}
+
+[[nodiscard]] u64 MultiplyHigh(u64 a, u64 b) {
+#ifdef __SIZEOF_INT128__ // widen to 128 bits, multiply, keep bits 64..127
+    return (static_cast<unsigned __int128>(a) * static_cast<unsigned __int128>(b)) >> 64;
+#elif defined(_M_X64) || defined(_M_ARM64)
+    return __umulh(a, b); // MSVC intrinsic: upper 64 bits of a * b
+#else
+    // Generic fallback: schoolbook multiplication on 32-bit halves
+    const u64 a_lo = u32(a);
+    const u64 a_hi = a >> 32;
+    const u64 b_lo = u32(b);
+    const u64 b_hi = b >> 32;
+
+    const u64 a_x_b_hi = a_hi * b_hi;  // partial product with weight 2^64
+    const u64 a_x_b_mid = a_hi * b_lo; // partial product with weight 2^32
+    const u64 b_x_a_mid = b_hi * a_lo; // partial product with weight 2^32
+    const u64 a_x_b_lo = a_lo * b_lo;  // partial product with weight 2^0
+
+    const u64 carry_bit = (static_cast<u64>(static_cast<u32>(a_x_b_mid)) +
+                           static_cast<u64>(static_cast<u32>(b_x_a_mid)) + (a_x_b_lo >> 32)) >>
+                          32; // carry out of bits 32..63; despite the name it can be 0, 1 or 2
+
+    const u64 multhi = a_x_b_hi + (a_x_b_mid >> 32) + (b_x_a_mid >> 32) + carry_bit;
+
+    return multhi; // bits 64..127 of the full product
+#endif
+}
+
+} // namespace
+
 namespace Common {

 u64 EstimateRDTSCFrequency() {
@ -48,54 +103,71 @@ NativeClock::NativeClock(u64 emulated_cpu_frequency_, u64 emulated_clock_frequen
    : WallClock(emulated_cpu_frequency_, emulated_clock_frequency_, true), rtsc_frequency{
                                                                               rtsc_frequency_} {
    _mm_mfence();
-    last_measure = __rdtsc();
-    accumulated_ticks = 0U;
+    time_point.inner.last_measure = __rdtsc();
+    time_point.inner.accumulated_ticks = 0U;
+    ns_rtsc_factor = GetFixedPoint64Factor(1000000000, rtsc_frequency);
+    us_rtsc_factor = GetFixedPoint64Factor(1000000, rtsc_frequency);
+    ms_rtsc_factor = GetFixedPoint64Factor(1000, rtsc_frequency);
+    clock_rtsc_factor = GetFixedPoint64Factor(emulated_clock_frequency, rtsc_frequency);
+    cpu_rtsc_factor = GetFixedPoint64Factor(emulated_cpu_frequency, rtsc_frequency);
 }

 u64 NativeClock::GetRTSC() {
-    std::scoped_lock scope{rtsc_serialize};
-    _mm_mfence();
-    const u64 current_measure = __rdtsc();
-    u64 diff = current_measure - last_measure;
-    diff = diff & ~static_cast<u64>(static_cast<s64>(diff) >> 63); // max(diff, 0)
-    if (current_measure > last_measure) {
-        last_measure = current_measure;
-    }
-    accumulated_ticks += diff;
+    TimePoint new_time_point{};     // candidate state to publish
+    TimePoint current_time_point{}; // snapshot the candidate is derived from
+    do { // retry until the compare-and-swap below succeeds
+        current_time_point.pack = time_point.pack; // plain copy of both fields
+        _mm_mfence();
+        const u64 current_measure = __rdtsc();
+        u64 diff = current_measure - current_time_point.inner.last_measure;
+        diff = diff & ~static_cast<u64>(static_cast<s64>(diff) >> 63); // max(diff, 0)
+        new_time_point.inner.last_measure = current_measure > current_time_point.inner.last_measure
+                                                ? current_measure
+                                                : current_time_point.inner.last_measure;
+        new_time_point.inner.accumulated_ticks = current_time_point.inner.accumulated_ticks + diff;
+    } while (!Common::AtomicCompareAndSwap(time_point.pack.data(), new_time_point.pack,
+                                           current_time_point.pack)); // false if snapshot is stale
     /// The clock cannot be more precise than the guest timer, remove the lower bits
-    return accumulated_ticks & inaccuracy_mask;
+    return new_time_point.inner.accumulated_ticks & inaccuracy_mask;
 }

 void NativeClock::Pause(bool is_paused) {
     if (!is_paused) {
-        _mm_mfence();
-        last_measure = __rdtsc();
+        TimePoint current_time_point{};
+        TimePoint new_time_point{};
+        do { // retry until the compare-and-swap below succeeds
+            current_time_point.pack = time_point.pack;
+            new_time_point.pack = current_time_point.pack; // carries accumulated_ticks over
+            _mm_mfence();
+            new_time_point.inner.last_measure = __rdtsc(); // only the baseline is refreshed
+        } while (!Common::AtomicCompareAndSwap(time_point.pack.data(), new_time_point.pack,
+                                               current_time_point.pack));
     }
 }

 std::chrono::nanoseconds NativeClock::GetTimeNS() {
     const u64 rtsc_value = GetRTSC();
-    return std::chrono::nanoseconds{MultiplyAndDivide64(rtsc_value, 1000000000, rtsc_frequency)};
+    return std::chrono::nanoseconds{MultiplyHigh(rtsc_value, ns_rtsc_factor)}; // ticks -> ns
 }

 std::chrono::microseconds NativeClock::GetTimeUS() {
     const u64 rtsc_value = GetRTSC();
-    return std::chrono::microseconds{MultiplyAndDivide64(rtsc_value, 1000000, rtsc_frequency)};
+    return std::chrono::microseconds{MultiplyHigh(rtsc_value, us_rtsc_factor)}; // ticks -> us
 }

 std::chrono::milliseconds NativeClock::GetTimeMS() {
     const u64 rtsc_value = GetRTSC();
-    return std::chrono::milliseconds{MultiplyAndDivide64(rtsc_value, 1000, rtsc_frequency)};
+    return std::chrono::milliseconds{MultiplyHigh(rtsc_value, ms_rtsc_factor)}; // ticks -> ms
 }

 u64 NativeClock::GetClockCycles() {
     const u64 rtsc_value = GetRTSC();
-    return MultiplyAndDivide64(rtsc_value, emulated_clock_frequency, rtsc_frequency);
+    return MultiplyHigh(rtsc_value, clock_rtsc_factor); // host ticks -> emulated clock ticks
 }

 u64 NativeClock::GetCPUCycles() {
     const u64 rtsc_value = GetRTSC();
-    return MultiplyAndDivide64(rtsc_value, emulated_cpu_frequency, rtsc_frequency);
+    return MultiplyHigh(rtsc_value, cpu_rtsc_factor); // host ticks -> emulated CPU cycles
 }

 } // namespace X64
--- a/src/common/x64/native_clock.h
+++ b/src/common/x64/native_clock.h
@ -6,7 +6,6 @@

 #include <optional>

-#include "common/spin_lock.h"
 #include "common/wall_clock.h"

 namespace Common {
@ -32,14 +31,28 @@ public:
 private:
    u64 GetRTSC();

+    union alignas(16) TimePoint { // presumably 16-byte aligned for the 128-bit CAS on `pack`
+        TimePoint() : pack{} {}
+        u128 pack{}; // whole state as one 128-bit value; NOTE(review): aliases `inner`
+        struct Inner {
+            u64 last_measure{};      // TSC reading taken at the previous update
+            u64 accumulated_ticks{}; // running tick total that GetRTSC() masks and returns
+        } inner;
+    };
+
    /// value used to reduce the native clocks accuracy as some apss rely on
    /// undefined behavior where the level of accuracy in the clock shouldn't
    /// be higher.
    static constexpr u64 inaccuracy_mask = ~(UINT64_C(0x400) - 1);

-    SpinLock rtsc_serialize{};
-    u64 last_measure{};
-    u64 accumulated_ticks{};
+    TimePoint time_point;
+    // factors
+    u64 clock_rtsc_factor{};
+    u64 cpu_rtsc_factor{};
+    u64 ns_rtsc_factor{};
+    u64 us_rtsc_factor{};
+    u64 ms_rtsc_factor{};
+
    u64 rtsc_frequency;
 };
 } // namespace X64
--- a/src/core/arm/dynarmic/arm_dynarmic_32.cpp
+++ b/src/core/arm/dynarmic/arm_dynarmic_32.cpp
@ -181,6 +181,9 @@ std::shared_ptr<Dynarmic::A32::Jit> ARM_Dynarmic_32::MakeJit(Common::PageTable&
        if (Settings::values.cpuopt_unsafe_reduce_fp_error) {
            config.optimizations |= Dynarmic::OptimizationFlag::Unsafe_ReducedErrorFP;
        }
+        if (Settings::values.cpuopt_unsafe_inaccurate_nan) {
+            config.optimizations |= Dynarmic::OptimizationFlag::Unsafe_InaccurateNaN;
+        }
    }

    return std::make_unique<Dynarmic::A32::Jit>(config);
--- a/src/core/arm/dynarmic/arm_dynarmic_64.cpp
+++ b/src/core/arm/dynarmic/arm_dynarmic_64.cpp
@ -212,6 +212,9 @@ std::shared_ptr<Dynarmic::A64::Jit> ARM_Dynarmic_64::MakeJit(Common::PageTable&
        if (Settings::values.cpuopt_unsafe_reduce_fp_error) {
            config.optimizations |= Dynarmic::OptimizationFlag::Unsafe_ReducedErrorFP;
        }
+        if (Settings::values.cpuopt_unsafe_inaccurate_nan) {
+            config.optimizations |= Dynarmic::OptimizationFlag::Unsafe_InaccurateNaN;
+        }
    }

    return std::make_shared<Dynarmic::A64::Jit>(config);
--- a/src/core/hle/kernel/memory/memory_block.h
+++ b/src/core/hle/kernel/memory/memory_block.h
@ -73,12 +73,12 @@ enum class MemoryState : u32 {
    ThreadLocal =
        static_cast<u32>(Svc::MemoryState::ThreadLocal) | FlagMapped | FlagReferenceCounted,

-    Transfered = static_cast<u32>(Svc::MemoryState::Transfered) | FlagsMisc |
-                 FlagCanAlignedDeviceMap | FlagCanChangeAttribute | FlagCanUseIpc |
-                 FlagCanUseNonSecureIpc | FlagCanUseNonDeviceIpc,
+    Transferred = static_cast<u32>(Svc::MemoryState::Transferred) | FlagsMisc |
+                  FlagCanAlignedDeviceMap | FlagCanChangeAttribute | FlagCanUseIpc |
+                  FlagCanUseNonSecureIpc | FlagCanUseNonDeviceIpc,

-    SharedTransfered = static_cast<u32>(Svc::MemoryState::SharedTransfered) | FlagsMisc |
-                       FlagCanAlignedDeviceMap | FlagCanUseNonSecureIpc | FlagCanUseNonDeviceIpc,
+    SharedTransferred = static_cast<u32>(Svc::MemoryState::SharedTransferred) | FlagsMisc |
+                        FlagCanAlignedDeviceMap | FlagCanUseNonSecureIpc | FlagCanUseNonDeviceIpc,

    SharedCode = static_cast<u32>(Svc::MemoryState::SharedCode) | FlagMapped |
                 FlagReferenceCounted | FlagCanUseNonSecureIpc | FlagCanUseNonDeviceIpc,
@ -111,8 +111,8 @@ static_assert(static_cast<u32>(MemoryState::AliasCodeData) == 0x03FFBD09);
 static_assert(static_cast<u32>(MemoryState::Ipc) == 0x005C3C0A);
 static_assert(static_cast<u32>(MemoryState::Stack) == 0x005C3C0B);
 static_assert(static_cast<u32>(MemoryState::ThreadLocal) == 0x0040200C);
-static_assert(static_cast<u32>(MemoryState::Transfered) == 0x015C3C0D);
-static_assert(static_cast<u32>(MemoryState::SharedTransfered) == 0x005C380E);
+static_assert(static_cast<u32>(MemoryState::Transferred) == 0x015C3C0D);
+static_assert(static_cast<u32>(MemoryState::SharedTransferred) == 0x005C380E);
 static_assert(static_cast<u32>(MemoryState::SharedCode) == 0x0040380F);
 static_assert(static_cast<u32>(MemoryState::Inaccessible) == 0x00000010);
 static_assert(static_cast<u32>(MemoryState::NonSecureIpc) == 0x005C3811);
--- a/src/core/hle/kernel/memory/page_table.cpp
+++ b/src/core/hle/kernel/memory/page_table.cpp
@ -1007,8 +1007,8 @@ constexpr VAddr PageTable::GetRegionAddress(MemoryState state) const {
    case MemoryState::Shared:
    case MemoryState::AliasCode:
    case MemoryState::AliasCodeData:
-    case MemoryState::Transfered:
-    case MemoryState::SharedTransfered:
+    case MemoryState::Transferred:
+    case MemoryState::SharedTransferred:
    case MemoryState::SharedCode:
    case MemoryState::GeneratedCode:
    case MemoryState::CodeOut:
@ -1042,8 +1042,8 @@ constexpr std::size_t PageTable::GetRegionSize(MemoryState state) const {
    case MemoryState::Shared:
    case MemoryState::AliasCode:
    case MemoryState::AliasCodeData:
-    case MemoryState::Transfered:
-    case MemoryState::SharedTransfered:
+    case MemoryState::Transferred:
+    case MemoryState::SharedTransferred:
    case MemoryState::SharedCode:
    case MemoryState::GeneratedCode:
    case MemoryState::CodeOut:
@ -1080,8 +1080,8 @@ constexpr bool PageTable::CanContain(VAddr addr, std::size_t size, MemoryState s
    case MemoryState::AliasCodeData:
    case MemoryState::Stack:
    case MemoryState::ThreadLocal:
-    case MemoryState::Transfered:
-    case MemoryState::SharedTransfered:
+    case MemoryState::Transferred:
+    case MemoryState::SharedTransferred:
    case MemoryState::SharedCode:
    case MemoryState::GeneratedCode:
    case MemoryState::CodeOut:
--- a/src/core/hle/kernel/svc_types.h
+++ b/src/core/hle/kernel/svc_types.h
@ -23,8 +23,8 @@ enum class MemoryState : u32 {
    Ipc = 0x0A,
    Stack = 0x0B,
    ThreadLocal = 0x0C,
-    Transfered = 0x0D,
-    SharedTransfered = 0x0E,
+    Transferred = 0x0D,
+    SharedTransferred = 0x0E,
    SharedCode = 0x0F,
    Inaccessible = 0x10,
    NonSecureIpc = 0x11,
--- a/src/core/hle/service/am/am.cpp
+++ b/src/core/hle/service/am/am.cpp
@ -560,14 +560,14 @@ void ISelfController::GetAccumulatedSuspendedTickChangedEvent(Kernel::HLERequest

 AppletMessageQueue::AppletMessageQueue(Kernel::KernelCore& kernel) {
    on_new_message =
-        Kernel::WritableEvent::CreateEventPair(kernel, "AMMessageQueue:OnMessageRecieved");
+        Kernel::WritableEvent::CreateEventPair(kernel, "AMMessageQueue:OnMessageReceived");
    on_operation_mode_changed =
        Kernel::WritableEvent::CreateEventPair(kernel, "AMMessageQueue:OperationModeChanged");
 }

 AppletMessageQueue::~AppletMessageQueue() = default;

-const std::shared_ptr<Kernel::ReadableEvent>& AppletMessageQueue::GetMesssageRecieveEvent() const {
+const std::shared_ptr<Kernel::ReadableEvent>& AppletMessageQueue::GetMessageReceiveEvent() const {
    return on_new_message.readable;
 }

@ -675,7 +675,7 @@ void ICommonStateGetter::GetEventHandle(Kernel::HLERequestContext& ctx) {

    IPC::ResponseBuilder rb{ctx, 2, 1};
    rb.Push(RESULT_SUCCESS);
-    rb.PushCopyObjects(msg_queue->GetMesssageRecieveEvent());
+    rb.PushCopyObjects(msg_queue->GetMessageReceiveEvent());
 }

 void ICommonStateGetter::ReceiveMessage(Kernel::HLERequestContext& ctx) {
--- a/src/core/hle/service/am/am.h
+++ b/src/core/hle/service/am/am.h
@ -55,7 +55,7 @@ public:
    explicit AppletMessageQueue(Kernel::KernelCore& kernel);
    ~AppletMessageQueue();

-    const std::shared_ptr<Kernel::ReadableEvent>& GetMesssageRecieveEvent() const;
+    const std::shared_ptr<Kernel::ReadableEvent>& GetMessageReceiveEvent() const;
    const std::shared_ptr<Kernel::ReadableEvent>& GetOperationModeChangedEvent() const;
    void PushMessage(AppletMessage msg);
    AppletMessage PopMessage();
--- a/src/core/settings.h
+++ b/src/core/settings.h
@ -131,6 +131,7 @@ struct Values {

    bool cpuopt_unsafe_unfuse_fma;
    bool cpuopt_unsafe_reduce_fp_error;
+    bool cpuopt_unsafe_inaccurate_nan;

    // Renderer
    Setting<RendererBackend> renderer_backend;
@ -221,7 +222,7 @@ struct Values {
    bool disable_macro_jit;
    bool extended_logging;

-    // Misceallaneous
+    // Miscellaneous
    std::string log_filter;
    bool use_dev_keys;

--- a/src/input_common/gcadapter/gc_adapter.h
+++ b/src/input_common/gcadapter/gc_adapter.h
@ -120,17 +120,17 @@ private:
    /// For use in initialization, querying devices to find the adapter
    void Setup();

-    /// Resets status of all GC controller devices to a disconected state
+    /// Resets status of all GC controller devices to a disconnected state
    void ResetDevices();

-    /// Resets status of device connected to a disconected state
+    /// Resets status of device connected to a disconnected state
    void ResetDevice(std::size_t port);

    /// Returns true if we successfully gain access to GC Adapter
    bool CheckDeviceAccess();

    /// Captures GC Adapter endpoint address
-    /// Returns true if the endpoind was set correctly
+    /// Returns true if the endpoint was set correctly
    bool GetGCEndpoint(libusb_device* device);

    /// For shutting down, clear all data, join all threads, release usb
--- a/src/input_common/motion_input.cpp
+++ b/src/input_common/motion_input.cpp
@ -129,7 +129,7 @@ void MotionInput::UpdateOrientation(u64 elapsed_time) {
            rad_gyro += ki * integral_error;
            rad_gyro += kd * derivative_error;
        } else {
-            // Give more weight to acelerometer values to compensate for the lack of gyro
+            // Give more weight to accelerometer values to compensate for the lack of gyro
            rad_gyro += 35.0f * kp * real_error;
            rad_gyro += 10.0f * ki * integral_error;
            rad_gyro += 10.0f * kd * derivative_error;
--- a/src/input_common/mouse/mouse_input.h
+++ b/src/input_common/mouse/mouse_input.h
@ -20,7 +20,7 @@ enum class MouseButton {
    Left,
    Wheel,
    Right,
-    Foward,
+    Forward,
    Backward,
    Undefined,
 };
--- a/src/input_common/udp/udp.cpp
+++ b/src/input_common/udp/udp.cpp
@ -28,14 +28,14 @@ private:
    mutable std::mutex mutex;
 };

-/// A motion device factory that creates motion devices from JC Adapter
+/// A motion device factory that creates motion devices from a UDP client
 UDPMotionFactory::UDPMotionFactory(std::shared_ptr<CemuhookUDP::Client> client_)
    : client(std::move(client_)) {}

 /**
 * Creates motion device
 * @param params contains parameters for creating the device:
- *     - "port": the nth jcpad on the adapter
+ *     - "port": the UDP port number
 */
 std::unique_ptr<Input::MotionDevice> UDPMotionFactory::Create(const Common::ParamPackage& params) {
    auto ip = params.Get("ip", "127.0.0.1");
@ -90,14 +90,14 @@ private:
    mutable std::mutex mutex;
 };

-/// A motion device factory that creates motion devices from JC Adapter
+/// A touch device factory that creates touch devices from a UDP client
 UDPTouchFactory::UDPTouchFactory(std::shared_ptr<CemuhookUDP::Client> client_)
    : client(std::move(client_)) {}

 /**
 * Creates touch device
 * @param params contains parameters for creating the device:
- *     - "port": the nth jcpad on the adapter
+ *     - "port": the UDP port number
 */
 std::unique_ptr<Input::TouchDevice> UDPTouchFactory::Create(const Common::ParamPackage& params) {
    auto ip = params.Get("ip", "127.0.0.1");
--- a/src/tests/common/fibers.cpp
+++ b/src/tests/common/fibers.cpp
@ -207,7 +207,7 @@ static void ThreadStart2_2(u32 id, TestControl2& test_control) {
 }

 /** This test checks for fiber thread exchange configuration and validates
- *  that a fiber has been succesfully transfered from one thread to another and that the TLS
+ *  that a fiber has been successfully transferred from one thread to another and that the TLS
 *  region of the thread is kept while changing fibers.
 */
 TEST_CASE("Fibers::InterExchange", "[common]") {
@ -299,7 +299,7 @@ static void ThreadStart3(u32 id, TestControl3& test_control) {
 }

 /** This test checks for two threads racing to start the same fiber.
- *  It checks execution occured in an ordered manner and by no time there were
+ *  It checks execution occurred in an ordered manner and by no time there were
 *  two contexts at the same time.
 */
 TEST_CASE("Fibers::StartRace", "[common]") {
--- a/src/video_core/command_classes/vic.cpp
+++ b/src/video_core/command_classes/vic.cpp
@ -53,7 +53,7 @@ void Vic::ProcessMethod(Method method, const std::vector<u32>& arguments) {

 void Vic::Execute() {
    if (output_surface_luma_address == 0) {
-        LOG_ERROR(Service_NVDRV, "VIC Luma address not set. Recieved 0x{:X}",
+        LOG_ERROR(Service_NVDRV, "VIC Luma address not set. Received 0x{:X}",
                  vic_state.output_surface.luma_offset);
        return;
    }
--- a/src/video_core/renderer_vulkan/vk_device.cpp
+++ b/src/video_core/renderer_vulkan/vk_device.cpp
@ -491,7 +491,7 @@ VkFormat VKDevice::GetSupportedFormat(VkFormat wanted_format, VkFormatFeatureFla
 }

 void VKDevice::ReportLoss() const {
-    LOG_CRITICAL(Render_Vulkan, "Device loss occured!");
+    LOG_CRITICAL(Render_Vulkan, "Device loss occurred!");

    // Wait for the log to flush and for Nsight Aftermath to dump the results
    std::this_thread::sleep_for(std::chrono::seconds{15});
--- a/src/yuzu/applets/error.cpp
+++ b/src/yuzu/applets/error.cpp
@ -19,7 +19,7 @@ QtErrorDisplay::~QtErrorDisplay() = default;
 void QtErrorDisplay::ShowError(ResultCode error, std::function<void()> finished) const {
    callback = std::move(finished);
    emit MainWindowDisplayError(
-        tr("An error has occured.\nPlease try again or contact the developer of the "
+        tr("An error has occurred.\nPlease try again or contact the developer of the "
           "software.\n\nError Code: %1-%2 (0x%3)")
            .arg(static_cast<u32>(error.module.Value()) + 2000, 4, 10, QChar::fromLatin1('0'))
            .arg(error.description, 4, 10, QChar::fromLatin1('0'))
@ -32,7 +32,7 @@ void QtErrorDisplay::ShowErrorWithTimestamp(ResultCode error, std::chrono::secon

    const QDateTime date_time = QDateTime::fromSecsSinceEpoch(time.count());
    emit MainWindowDisplayError(
-        tr("An error occured on %1 at %2.\nPlease try again or contact the "
+        tr("An error occurred on %1 at %2.\nPlease try again or contact the "
           "developer of the software.\n\nError Code: %3-%4 (0x%5)")
            .arg(date_time.toString(QStringLiteral("dddd, MMMM d, yyyy")))
            .arg(date_time.toString(QStringLiteral("h:mm:ss A")))
@ -46,7 +46,7 @@ void QtErrorDisplay::ShowCustomErrorText(ResultCode error, std::string dialog_te
                                         std::function<void()> finished) const {
    callback = std::move(finished);
    emit MainWindowDisplayError(
-        tr("An error has occured.\nError Code: %1-%2 (0x%3)\n\n%4\n\n%5")
+        tr("An error has occurred.\nError Code: %1-%2 (0x%3)\n\n%4\n\n%5")
            .arg(static_cast<u32>(error.module.Value()) + 2000, 4, 10, QChar::fromLatin1('0'))
            .arg(error.description, 4, 10, QChar::fromLatin1('0'))
            .arg(error.raw, 8, 16, QChar::fromLatin1('0'))
--- a/src/yuzu/compatdb.cpp
+++ b/src/yuzu/compatdb.cpp
@ -72,7 +72,7 @@ void CompatDB::Submit() {
 void CompatDB::OnTestcaseSubmitted() {
    if (!testcase_watcher.result()) {
        QMessageBox::critical(this, tr("Communication error"),
-                              tr("An error occured while sending the Testcase"));
+                              tr("An error occurred while sending the Testcase"));
        button(NextButton)->setEnabled(true);
        button(NextButton)->setText(tr("Next"));
        button(CancelButton)->setVisible(true);
--- a/src/yuzu/configuration/config.cpp
+++ b/src/yuzu/configuration/config.cpp
@ -764,6 +764,8 @@ void Config::ReadCpuValues() {
            ReadSetting(QStringLiteral("cpuopt_unsafe_unfuse_fma"), true).toBool();
        Settings::values.cpuopt_unsafe_reduce_fp_error =
            ReadSetting(QStringLiteral("cpuopt_unsafe_reduce_fp_error"), true).toBool();
+        Settings::values.cpuopt_unsafe_inaccurate_nan =
+            ReadSetting(QStringLiteral("cpuopt_unsafe_inaccurate_nan"), true).toBool();
    }

    qt_config->endGroup();
@ -1327,6 +1329,8 @@ void Config::SaveCpuValues() {
                     Settings::values.cpuopt_unsafe_unfuse_fma, true);
        WriteSetting(QStringLiteral("cpuopt_unsafe_reduce_fp_error"),
                     Settings::values.cpuopt_unsafe_reduce_fp_error, true);
+        WriteSetting(QStringLiteral("cpuopt_unsafe_inaccurate_nan"),
+                     Settings::values.cpuopt_unsafe_inaccurate_nan, true);
    }

    qt_config->endGroup();
--- a/src/yuzu/configuration/configure_cpu.cpp
+++ b/src/yuzu/configuration/configure_cpu.cpp
@ -36,6 +36,8 @@ void ConfigureCpu::SetConfiguration() {
    ui->cpuopt_unsafe_unfuse_fma->setChecked(Settings::values.cpuopt_unsafe_unfuse_fma);
    ui->cpuopt_unsafe_reduce_fp_error->setEnabled(runtime_lock);
    ui->cpuopt_unsafe_reduce_fp_error->setChecked(Settings::values.cpuopt_unsafe_reduce_fp_error);
+    ui->cpuopt_unsafe_inaccurate_nan->setEnabled(runtime_lock);
+    ui->cpuopt_unsafe_inaccurate_nan->setChecked(Settings::values.cpuopt_unsafe_inaccurate_nan);
 }

 void ConfigureCpu::AccuracyUpdated(int index) {
@ -61,6 +63,7 @@ void ConfigureCpu::ApplyConfiguration() {
        static_cast<Settings::CPUAccuracy>(ui->accuracy->currentIndex());
    Settings::values.cpuopt_unsafe_unfuse_fma = ui->cpuopt_unsafe_unfuse_fma->isChecked();
    Settings::values.cpuopt_unsafe_reduce_fp_error = ui->cpuopt_unsafe_reduce_fp_error->isChecked();
+    Settings::values.cpuopt_unsafe_inaccurate_nan = ui->cpuopt_unsafe_inaccurate_nan->isChecked();
 }

 void ConfigureCpu::changeEvent(QEvent* event) {
--- a/src/yuzu/configuration/configure_cpu.ui
+++ b/src/yuzu/configuration/configure_cpu.ui
@ -109,6 +109,18 @@
          </property>
         </widget>
        </item>
+        <item>
+         <widget class="QCheckBox" name="cpuopt_unsafe_inaccurate_nan">
+          <property name="text">
+           <string>Inaccurate NaN handling</string>
+          </property>
+          <property name="toolTip">
+           <string>
+            &lt;div&gt;This option improves speed by removing NaN checking. Please note this also reduces accuracy of certain floating-point instructions.&lt;/div&gt;
+           </string>
+          </property>
+         </widget>
+        </item>
       </layout>
      </widget>
     </item>
--- a/src/yuzu/main.cpp
+++ b/src/yuzu/main.cpp
@ -142,7 +142,7 @@ constexpr int default_mouse_timeout = 2500;
 /**
 * "Callouts" are one-time instructional messages shown to the user. In the config settings, there
 * is a bitfield "callout_flags" options, used to track if a message has already been shown to the
- * user. This is 32-bits - if we have more than 32 callouts, we should retire and recyle old ones.
+ * user. This is 32-bits - if we have more than 32 callouts, we should retire and recycle old ones.
 */
 enum class CalloutFlag : uint32_t {
    Telemetry = 0x1,
--- a/src/yuzu_cmd/yuzu.cpp
+++ b/src/yuzu_cmd/yuzu.cpp
@ -202,7 +202,7 @@ int main(int argc, char** argv) {
            const u16 loader_id = static_cast<u16>(Core::System::ResultStatus::ErrorLoader);
            const u16 error_id = static_cast<u16>(load_result) - loader_id;
            LOG_CRITICAL(Frontend,
-                         "While attempting to load the ROM requested, an error occured. Please "
+                         "While attempting to load the ROM requested, an error occurred. Please "
                         "refer to the yuzu wiki for more information or the yuzu discord for "
                         "additional help.\n\nError Code: {:04X}-{:04X}\nError Description: {}",
                         loader_id, error_id, static_cast<Loader::ResultStatus>(error_id));
--- a/src/yuzu_tester/yuzu.cpp
+++ b/src/yuzu_tester/yuzu.cpp
@ -242,7 +242,7 @@ int main(int argc, char** argv) {
            const u16 loader_id = static_cast<u16>(Core::System::ResultStatus::ErrorLoader);
            const u16 error_id = static_cast<u16>(load_result) - loader_id;
            LOG_CRITICAL(Frontend,
-                         "While attempting to load the ROM requested, an error occured. Please "
+                         "While attempting to load the ROM requested, an error occurred. Please "
                         "refer to the yuzu wiki for more information or the yuzu discord for "
                         "additional help.\n\nError Code: {:04X}-{:04X}\nError Description: {}",
                         loader_id, error_id, static_cast<Loader::ResultStatus>(error_id));