early-access version 2585

2022-03-21 16:53:01 +01:00 · 2022-03-21 16:53:01 +01:00 · 9c48e94f2d
commit 9c48e94f2d
parent dd6ea95c54
31 changed files with 1072 additions and 483 deletions
--- a/README.md
+++ b/README.md
@ -1,7 +1,7 @@
 yuzu emulator early access
 =============

-This is the source code for early-access 2576.
+This is the source code for early-access 2585.

 ## Legal Notice

--- a/externals/dynarmic/.github/workflows/build-and-test.yml
+++ b/externals/dynarmic/.github/workflows/build-and-test.yml
@ -49,7 +49,24 @@ jobs:
        run: UNICORN_ARCHS=aarch64,arm ./make.sh

      - name: Configure CMake
-      if: ${{matrix.os == 'ubuntu-latest' || matrix.os == 'macos-latest'}}
+        if: ${{matrix.os == 'ubuntu-latest'}}
+        env:
+          CC: gcc-10
+          CXX: g++-10
+        run: >
+          cmake
+          -B ${{github.workspace}}/build
+          -DBoost_INCLUDE_DIRS=${{github.workspace}}/externals/ext-boost
+          -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}}
+          -DDYNARMIC_ENABLE_CPU_FEATURE_DETECTION=${{matrix.cpu_detection}}
+          -DDYNARMIC_TESTS_USE_UNICORN=1
+          -DDYNARMIC_USE_LLVM=1
+          -DLIBUNICORN_INCLUDE_DIR=${{github.workspace}}/externals/unicorn/include
+          -DLIBUNICORN_LIBRARY=${{github.workspace}}/externals/unicorn/libunicorn.a
+          -G Ninja
+
+      - name: Configure CMake
+        if: ${{matrix.os == 'macos-latest'}}
        run: >
          cmake
          -B ${{github.workspace}}/build
--- a/externals/dynarmic/CMakeLists.txt
+++ b/externals/dynarmic/CMakeLists.txt
@ -134,7 +134,6 @@ endif()
 if (DYNARMIC_NO_BUNDLED_VIXL AND ARCHITECTURE STREQUAL "arm64")
    find_package(PkgConfig REQUIRED)
    pkg_check_modules(vixl REQUIRED IMPORTED_TARGET vixl)
-    target_include_directories(PkgConfig::vixl INTERFACE "${vixl_INCLUDE_DIRS}/vixl")
    add_library(vixl ALIAS PkgConfig::vixl)
 endif()

--- a/externals/dynarmic/externals/catch/include/catch2/catch.hpp
+++ b/externals/dynarmic/externals/catch/include/catch2/catch.hpp
--- a/externals/dynarmic/src/dynarmic/CMakeLists.txt
+++ b/externals/dynarmic/src/dynarmic/CMakeLists.txt
@ -92,6 +92,7 @@ add_library(dynarmic
    ir/opt/identity_removal_pass.cpp
    ir/opt/ir_matcher.h
    ir/opt/passes.h
+    ir/opt/polyfill_pass.cpp
    ir/opt/verification_pass.cpp
    ir/terminal.h
    ir/type.cpp
@ -286,6 +287,7 @@ if (ARCHITECTURE STREQUAL "x86_64")
        backend/x64/emit_x64_packed.cpp
        backend/x64/emit_x64_saturation.cpp
        backend/x64/emit_x64_sm4.cpp
+        backend/x64/emit_x64_sha.cpp
        backend/x64/emit_x64_vector.cpp
        backend/x64/emit_x64_vector_floating_point.cpp
        backend/x64/emit_x64_vector_saturation.cpp
--- a/externals/dynarmic/src/dynarmic/backend/x64/a32_interface.cpp
+++ b/externals/dynarmic/src/dynarmic/backend/x64/a32_interface.cpp
@ -50,16 +50,24 @@ static std::function<void(BlockOfCode&)> GenRCP(const A32::UserConfig& conf) {
    };
 }

+static Optimization::PolyfillOptions GenPolyfillOptions(const BlockOfCode& code) {  // Builds the PolyfillPass options from the host features that `code` reports.
+    return Optimization::PolyfillOptions{
+        .sha256 = !code.HasHostFeature(HostFeature::SHA),  // Set only when the host lacks the SHA feature.
+    };
+}
+
 struct Jit::Impl {
    Impl(Jit* jit, A32::UserConfig conf)
            : block_of_code(GenRunCodeCallbacks(conf.callbacks, &GetCurrentBlockThunk, this), JitStateInfo{jit_state}, conf.code_cache_size, conf.far_code_offset, GenRCP(conf))
            , emitter(block_of_code, conf, jit)
+            , polyfill_options(GenPolyfillOptions(block_of_code))
            , conf(std::move(conf))
            , jit_interface(jit) {}

    A32JitState jit_state;
    BlockOfCode block_of_code;
    A32EmitX64 emitter;
+    Optimization::PolyfillOptions polyfill_options;

    const A32::UserConfig conf;

@ -154,6 +162,7 @@ private:
        }

        IR::Block ir_block = A32::Translate(A32::LocationDescriptor{descriptor}, conf.callbacks, {conf.arch_version, conf.define_unpredictable_behaviour, conf.hook_hint_instructions});
+        Optimization::PolyfillPass(ir_block, polyfill_options);
        if (conf.HasOptimization(OptimizationFlag::GetSetElimination)) {
            Optimization::A32GetSetElimination(ir_block);
            Optimization::DeadCodeElimination(ir_block);
--- a/externals/dynarmic/src/dynarmic/backend/x64/a64_interface.cpp
+++ b/externals/dynarmic/src/dynarmic/backend/x64/a64_interface.cpp
@ -45,12 +45,19 @@ static std::function<void(BlockOfCode&)> GenRCP(const A64::UserConfig& conf) {
    };
 }

+static Optimization::PolyfillOptions GenPolyfillOptions(const BlockOfCode& code) {  // Builds the PolyfillPass options from the host features that `code` reports.
+    return Optimization::PolyfillOptions{
+        .sha256 = !code.HasHostFeature(HostFeature::SHA),  // Set only when the host lacks the SHA feature.
+    };
+}
+
 struct Jit::Impl final {
 public:
    Impl(Jit* jit, UserConfig conf)
            : conf(conf)
            , block_of_code(GenRunCodeCallbacks(conf.callbacks, &GetCurrentBlockThunk, this), JitStateInfo{jit_state}, conf.code_cache_size, conf.far_code_offset, GenRCP(conf))
-            , emitter(block_of_code, conf, jit) {
+            , emitter(block_of_code, conf, jit)
+            , polyfill_options(GenPolyfillOptions(block_of_code)) {
        ASSERT(conf.page_table_address_space_bits >= 12 && conf.page_table_address_space_bits <= 64);
    }

@ -253,6 +260,7 @@ private:
        const auto get_code = [this](u64 vaddr) { return conf.callbacks->MemoryReadCode(vaddr); };
        IR::Block ir_block = A64::Translate(A64::LocationDescriptor{current_location}, get_code,
                                            {conf.define_unpredictable_behaviour, conf.wall_clock_cntpct});
+        Optimization::PolyfillPass(ir_block, polyfill_options);
        Optimization::A64CallbackConfigPass(ir_block, conf);
        if (conf.HasOptimization(OptimizationFlag::GetSetElimination)) {
            Optimization::A64GetSetElimination(ir_block);
@ -301,6 +309,7 @@ private:
    A64JitState jit_state;
    BlockOfCode block_of_code;
    A64EmitX64 emitter;
+    Optimization::PolyfillOptions polyfill_options;

    bool invalidate_entire_cache = false;
    boost::icl::interval_set<u64> invalid_cache_ranges;
--- a/externals/dynarmic/src/dynarmic/backend/x64/block_of_code.cpp
+++ b/externals/dynarmic/src/dynarmic/backend/x64/block_of_code.cpp
@ -114,6 +114,8 @@ HostFeature GetHostFeatures() {
        features |= HostFeature::FMA;
    if (cpu_info.has(Cpu::tAESNI))
        features |= HostFeature::AES;
+    if (cpu_info.has(Cpu::tSHA))
+        features |= HostFeature::SHA;
    if (cpu_info.has(Cpu::tPOPCNT))
        features |= HostFeature::POPCNT;
    if (cpu_info.has(Cpu::tBMI1))
--- a/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_sha.cpp
+++ b/externals/dynarmic/src/dynarmic/backend/x64/emit_x64_sha.cpp
@ -0,0 +1,81 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/backend/x64/block_of_code.h"
+#include "dynarmic/backend/x64/emit_x64.h"
+
+namespace Dynarmic::Backend::X64 {
+
+using namespace Xbyak::util;
+
+void EmitX64::EmitSHA256Hash(EmitContext& ctx, IR::Inst* inst) {  // Emits x64 code for the SHA256Hash IR op using two sha256rnds2 instructions.
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    const bool part1 = args[3].GetImmediateU1();  // Fourth IR argument: selects the immediate of the final shuffle below.
+
+    ASSERT(code.HasHostFeature(HostFeature::SHA));  // This emitter has no fallback for hosts without the SHA feature.
+
+    //      3   2   1   0
+    // x =  d   c   b   a
+    // y =  h   g   f   e
+    // w = wk3 wk2 wk1 wk0
+
+    const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);  // Overwritten below.
+    const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]);  // Overwritten below; also holds the final result.
+    const Xbyak::Xmm w = ctx.reg_alloc.UseXmm(args[2]);  // Only read (copied into xmm0).
+
+    // x64 expects:
+    //         3   2   1   0
+    // src1 =  c   d   g   h
+    // src2 =  a   b   e   f
+    // xmm0 =  -   -  wk1 wk0
+
+    code.movaps(xmm0, y);  // Work on a copy of y, because y itself is reshuffled two lines down.
+    code.shufps(xmm0, x, 0b10111011);  // src1
+    code.shufps(y, x, 0b00010001);     // src2
+    code.movaps(x, xmm0);  // x now holds src1, y holds src2.
+
+    code.movaps(xmm0, w);  // xmm0 = w, so wk1/wk0 sit in the low lanes as the layout above requires.
+    code.sha256rnds2(x, y);
+
+    code.punpckhqdq(xmm0, xmm0);  // Bring wk3/wk2 down into the low lanes for the second pair of rounds.
+    code.sha256rnds2(y, x);
+
+    code.shufps(y, x, part1 ? 0b10111011 : 0b00010001);  // Final shuffle of the two state registers; the immediate depends on part1.
+
+    ctx.reg_alloc.DefineValue(inst, y);
+}
+
+void EmitX64::EmitSHA256MessageSchedule0(EmitContext& ctx, IR::Inst* inst) {  // Emits the SHA256MessageSchedule0 IR op as a single sha256msg1.
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    ASSERT(code.HasHostFeature(HostFeature::SHA));  // This emitter has no fallback for hosts without the SHA feature.
+
+    const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);  // Destination operand; becomes the result.
+    const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]);  // Source operand only.
+
+    code.sha256msg1(x, y);
+
+    ctx.reg_alloc.DefineValue(inst, x);
+}
+
+void EmitX64::EmitSHA256MessageSchedule1(EmitContext& ctx, IR::Inst* inst) {  // Emits the SHA256MessageSchedule1 IR op as palignr + paddd + sha256msg2.
+    auto args = ctx.reg_alloc.GetArgumentInfo(inst);
+
+    ASSERT(code.HasHostFeature(HostFeature::SHA));  // This emitter has no fallback for hosts without the SHA feature.
+
+    const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);  // Accumulator; becomes the result.
+    const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]);  // Only read.
+    const Xbyak::Xmm z = ctx.reg_alloc.UseXmm(args[2]);  // Only read (copied into xmm0 first).
+
+    code.movaps(xmm0, z);  // Copy z so the palignr below does not modify it.
+    code.palignr(xmm0, y, 4);  // xmm0 = concatenation z:y shifted right by 4 bytes.
+    code.paddd(x, xmm0);  // Lane-wise 32-bit add into x.
+    code.sha256msg2(x, z);
+
+    ctx.reg_alloc.DefineValue(inst, x);
+}
+
+}  // namespace Dynarmic::Backend::X64
--- a/externals/dynarmic/src/dynarmic/backend/x64/host_feature.h
+++ b/externals/dynarmic/src/dynarmic/backend/x64/host_feature.h
@ -26,14 +26,15 @@ enum class HostFeature : u64 {
    F16C = 1ULL << 13,
    FMA = 1ULL << 14,
    AES = 1ULL << 15,
-    POPCNT = 1ULL << 16,
-    BMI1 = 1ULL << 17,
-    BMI2 = 1ULL << 18,
-    LZCNT = 1ULL << 19,
-    GFNI = 1ULL << 20,
+    SHA = 1ULL << 16,
+    POPCNT = 1ULL << 17,
+    BMI1 = 1ULL << 18,
+    BMI2 = 1ULL << 19,
+    LZCNT = 1ULL << 20,
+    GFNI = 1ULL << 21,

    // Zen-based BMI2
-    FastBMI2 = 1ULL << 21,
+    FastBMI2 = 1ULL << 22,

    // Orthographic AVX512 features on 128 and 256 vectors
    AVX512_Ortho = AVX512F | AVX512VL,
--- a/externals/dynarmic/src/dynarmic/frontend/A32/decoder/asimd.inc
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/decoder/asimd.inc
@ -50,6 +50,9 @@ INST(asimd_VPMAX_float,     "VPMAX (floating-point)",   "111100110D0znnnndddd111
 INST(asimd_VPMIN_float,     "VPMIN (floating-point)",   "111100110D1znnnndddd1111NQM0mmmm") // ASIMD
 INST(asimd_VRECPS,          "VRECPS",                   "111100100D0znnnndddd1111NQM1mmmm") // ASIMD
 INST(asimd_VRSQRTS,         "VRSQRTS",                  "111100100D1znnnndddd1111NQM1mmmm") // ASIMD
+INST(v8_SHA256H,            "SHA256H",                  "111100110D00nnnndddd1100NQM0mmmm") // v8
+INST(v8_SHA256H2,           "SHA256H2",                 "111100110D01nnnndddd1100NQM0mmmm") // v8
+INST(v8_SHA256SU1,          "SHA256SU1",                "111100110D10nnnndddd1100NQM0mmmm") // v8

 // Three registers of different lengths
 INST(asimd_VADDL,           "VADDL/VADDW",              "1111001U1Dzznnnndddd000oN0M0mmmm") // ASIMD
@ -144,7 +147,7 @@ INST(v8_AESIMC,             "AESIMC",                   "111100111D11zz00dddd001
 INST(arm_UDF,               "UNALLOCATED",              "111100111-11--01----001010-0----") // v8
 INST(arm_UDF,               "UNALLOCATED (SHA1H)",      "111100111-11--01----001011-0----") // v8
 INST(arm_UDF,               "UNALLOCATED (SHA1SU1)",    "111100111-11--10----001110-0----") // v8
-INST(arm_UDF,               "UNALLOCATED (SHA256SU0)",  "111100111-11--10----001111-0----") // v8
+INST(v8_SHA256SU0,          "SHA256SU0",                "111100111D11zz10dddd001111M0mmmm") // v8

 // One register and modified immediate
 INST(asimd_VMOV_imm,        "VBIC, VMOV, VMVN, VORR (immediate)",  "1111001a1D000bcdVVVVmmmm0Qo1efgh") // ASIMD
--- a/externals/dynarmic/src/dynarmic/frontend/A32/decoder/thumb32.inc
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/decoder/thumb32.inc
@ -19,6 +19,8 @@ INST(thumb32_LDRD_lit_1,     "LDRD (lit)",               "11101000U1111111ttttss
 INST(thumb32_LDRD_lit_2,     "LDRD (lit)",               "11101001U1W11111ttttssssiiiiiiii")
 INST(thumb32_LDRD_imm_1,     "LDRD (imm)",               "11101000U111nnnnttttssssiiiiiiii")
 INST(thumb32_LDRD_imm_2,     "LDRD (imm)",               "11101001U1W1nnnnttttssssiiiiiiii")
+INST(thumb32_STL,            "STL",                      "111010001100nnnntttt111110101111") // v8
+INST(thumb32_LDA,            "LDA",                      "111010001101nnnntttt111110101111") // v8
 INST(thumb32_STREXB,         "STREXB",                   "111010001100nnnntttt11110100dddd")
 INST(thumb32_STREXH,         "STREXH",                   "111010001100nnnntttt11110101dddd")
 INST(thumb32_STREXD,         "STREXD",                   "111010001100nnnnttttuuuu0111dddd")
--- a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/a32_translate_impl.h
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/a32_translate_impl.h
@ -493,6 +493,7 @@ struct TranslatorVisitor final {
    bool thumb32_STMDB(bool W, Reg n, Imm<15> reg_list);

    // thumb32 load/store dual, load/store exclusive, table branch instructions
+    bool thumb32_LDA(Reg n, Reg t);
    bool thumb32_LDRD_imm_1(bool U, Reg n, Reg t, Reg t2, Imm<8> imm8);
    bool thumb32_LDRD_imm_2(bool U, bool W, Reg n, Reg t, Reg t2, Imm<8> imm8);
    bool thumb32_LDRD_lit_1(bool U, Reg t, Reg t2, Imm<8> imm8);
@ -503,6 +504,7 @@ struct TranslatorVisitor final {
    bool thumb32_LDREXD(Reg n, Reg t, Reg t2);
    bool thumb32_LDREXB(Reg n, Reg t);
    bool thumb32_LDREXH(Reg n, Reg t);
+    bool thumb32_STL(Reg n, Reg t);
    bool thumb32_STREX(Reg n, Reg t, Reg d, Imm<8> imm8);
    bool thumb32_STREXB(Reg n, Reg t, Reg d);
    bool thumb32_STREXD(Reg n, Reg t, Reg t2, Reg d);
@ -875,6 +877,9 @@ struct TranslatorVisitor final {
    bool asimd_VPMIN_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
    bool asimd_VRECPS(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
    bool asimd_VRSQRTS(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+    bool v8_SHA256H(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+    bool v8_SHA256H2(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
+    bool v8_SHA256SU1(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);

    // Advanced SIMD three registers with different lengths
    bool asimd_VADDL(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool op, bool N, bool M, size_t Vm);
@ -918,6 +923,7 @@ struct TranslatorVisitor final {
    bool v8_AESE(bool D, size_t sz, size_t Vd, bool M, size_t Vm);
    bool v8_AESIMC(bool D, size_t sz, size_t Vd, bool M, size_t Vm);
    bool v8_AESMC(bool D, size_t sz, size_t Vd, bool M, size_t Vm);
+    bool v8_SHA256SU0(bool D, size_t sz, size_t Vd, bool M, size_t Vm);
    bool asimd_VCLS(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm);
    bool asimd_VCLZ(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm);
    bool asimd_VCNT(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm);
--- a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_three_regs.cpp
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_three_regs.cpp
@ -831,6 +831,60 @@ bool TranslatorVisitor::asimd_VRSQRTS(bool D, bool sz, size_t Vn, size_t Vd, boo
    });
 }

+bool TranslatorVisitor::v8_SHA256H(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {  // Translates A32 SHA256H into a SHA256Hash IR op with part1 = true.
+    if (!Q || Common::Bit<0>(Vd) || Common::Bit<0>(Vn) || Common::Bit<0>(Vm)) {  // Q must be set and bit 0 of every register field must be clear.
+        return UndefinedInstruction();
+    }
+
+    const auto d = ToVector(Q, Vd, D);
+    const auto n = ToVector(Q, Vn, N);
+    const auto m = ToVector(Q, Vm, M);
+
+    const auto x = ir.GetVector(d);  // The destination register doubles as the first input.
+    const auto y = ir.GetVector(n);
+    const auto w = ir.GetVector(m);
+    const auto result = ir.SHA256Hash(x, y, w, true);
+
+    ir.SetVector(d, result);
+    return true;
+}
+
+bool TranslatorVisitor::v8_SHA256H2(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {  // Translates A32 SHA256H2 into a SHA256Hash IR op with part1 = false.
+    if (!Q || Common::Bit<0>(Vd) || Common::Bit<0>(Vn) || Common::Bit<0>(Vm)) {  // Q must be set and bit 0 of every register field must be clear.
+        return UndefinedInstruction();
+    }
+
+    const auto n = ToVector(Q, Vn, N);
+    const auto d = ToVector(Q, Vd, D);
+    const auto m = ToVector(Q, Vm, M);
+
+    const auto x = ir.GetVector(n);  // Unlike v8_SHA256H, x is read from n ...
+    const auto y = ir.GetVector(d);  // ... and y from the destination register.
+    const auto w = ir.GetVector(m);
+    const auto result = ir.SHA256Hash(x, y, w, false);
+
+    ir.SetVector(d, result);  // The result is still written back to d.
+    return true;
+}
+
+bool TranslatorVisitor::v8_SHA256SU1(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {  // Translates A32 SHA256SU1 into a SHA256MessageSchedule1 IR op.
+    if (!Q || Common::Bit<0>(Vd) || Common::Bit<0>(Vn) || Common::Bit<0>(Vm)) {  // Q must be set and bit 0 of every register field must be clear.
+        return UndefinedInstruction();
+    }
+
+    const auto d = ToVector(Q, Vd, D);
+    const auto n = ToVector(Q, Vn, N);
+    const auto m = ToVector(Q, Vm, M);
+
+    const auto x = ir.GetVector(d);  // The destination register doubles as the first input.
+    const auto y = ir.GetVector(n);
+    const auto z = ir.GetVector(m);
+    const auto result = ir.SHA256MessageSchedule1(x, y, z);
+
+    ir.SetVector(d, result);
+    return true;
+}
+
 // ASIMD Three registers of different length

 bool TranslatorVisitor::asimd_VADDL(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool op, bool N, bool M, size_t Vm) {
--- a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_two_regs_misc.cpp
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/asimd_two_regs_misc.cpp
@ -225,6 +225,21 @@ bool TranslatorVisitor::v8_AESMC(bool D, size_t sz, size_t Vd, bool M, size_t Vm
    return true;
 }

+bool TranslatorVisitor::v8_SHA256SU0(bool D, size_t sz, size_t Vd, bool M, size_t Vm) {  // Translates A32 SHA256SU0 into a SHA256MessageSchedule0 IR op.
+    if (sz != 0b10 || Common::Bit<0>(Vd) || Common::Bit<0>(Vm)) {  // Only sz == 0b10 with bit 0 clear in both register fields is accepted.
+        return UndefinedInstruction();
+    }
+
+    const auto d = ToVector(true, Vd, D);  // `true` is passed where the three-register forms pass Q.
+    const auto m = ToVector(true, Vm, M);
+    const auto x = ir.GetVector(d);  // The destination register doubles as the first input.
+    const auto y = ir.GetVector(m);
+    const auto result = ir.SHA256MessageSchedule0(x, y);
+
+    ir.SetVector(d, result);
+    return true;
+}
+
 bool TranslatorVisitor::asimd_VCLS(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm) {
    if (sz == 0b11) {
        return UndefinedInstruction();
--- a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_load_store_dual.cpp
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/thumb32_load_store_dual.cpp
@ -110,6 +110,16 @@ static bool StoreDual(TranslatorVisitor& v, bool P, bool U, bool W, Reg n, Reg t
    return true;
 }

+bool TranslatorVisitor::thumb32_LDA(Reg n, Reg t) {  // Thumb-2 LDA: loads the 32-bit word addressed by register n into register t.
+    if (t == Reg::PC || n == Reg::PC) {  // PC as either operand is rejected as unpredictable.
+        return UnpredictableInstruction();
+    }
+
+    const auto address = ir.GetRegister(n);
+    ir.SetRegister(t, ir.ReadMemory32(address));  // AccType::Ordered -- NOTE(review): emitted as a plain ReadMemory32; confirm ordering is enforced elsewhere.
+    return true;
+}
+
 bool TranslatorVisitor::thumb32_LDRD_imm_1(bool U, Reg n, Reg t, Reg t2, Imm<8> imm8) {
    return LoadDualImmediate(*this, false, U, true, n, t, t2, imm8);
 }
@ -184,6 +194,16 @@ bool TranslatorVisitor::thumb32_LDREXH(Reg n, Reg t) {
    return true;
 }

+bool TranslatorVisitor::thumb32_STL(Reg n, Reg t) {  // Thumb-2 STL: stores register t as a 32-bit word at the address in register n.
+    if (t == Reg::PC || n == Reg::PC) {  // PC as either operand is rejected as unpredictable.
+        return UnpredictableInstruction();
+    }
+
+    const auto address = ir.GetRegister(n);
+    ir.WriteMemory32(address, ir.GetRegister(t));  // AccType::Ordered -- NOTE(review): emitted as a plain WriteMemory32; confirm ordering is enforced elsewhere.
+    return true;
+}
+
 bool TranslatorVisitor::thumb32_STREX(Reg n, Reg t, Reg d, Imm<8> imm8) {
    if (d == Reg::PC || t == Reg::PC || n == Reg::PC) {
        return UnpredictableInstruction();
--- a/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/vfp.cpp
+++ b/externals/dynarmic/src/dynarmic/frontend/A32/translate/impl/vfp.cpp
@ -1407,7 +1407,7 @@ bool TranslatorVisitor::vfp_VLDM_a1(Cond cond, bool p, bool u, bool D, bool w, R
        return arm_UDF();
    }

-    if (n == Reg::PC && w) {
+    if (n == Reg::PC && (w || ir.current_location.TFlag())) {
        return UnpredictableInstruction();
    }

@ -1457,7 +1457,7 @@ bool TranslatorVisitor::vfp_VLDM_a2(Cond cond, bool p, bool u, bool D, bool w, R
        return arm_UDF();
    }

-    if (n == Reg::PC && w) {
+    if (n == Reg::PC && (w || ir.current_location.TFlag())) {
        return UnpredictableInstruction();
    }

--- a/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_sha.cpp
+++ b/externals/dynarmic/src/dynarmic/frontend/A64/translate/impl/simd_sha.cpp
@ -46,67 +46,6 @@ IR::U128 SHA1HashUpdate(IREmitter& ir, Vec Vm, Vec Vn, Vec Vd, SHA1HashUpdateFun

    return x;
 }
-
-IR::U32 SHAhashSIGMA0(IREmitter& ir, IR::U32 x) {
-    const IR::U32 tmp1 = ir.RotateRight(x, ir.Imm8(2));
-    const IR::U32 tmp2 = ir.RotateRight(x, ir.Imm8(13));
-    const IR::U32 tmp3 = ir.RotateRight(x, ir.Imm8(22));
-
-    return ir.Eor(tmp1, ir.Eor(tmp2, tmp3));
-}
-
-IR::U32 SHAhashSIGMA1(IREmitter& ir, IR::U32 x) {
-    const IR::U32 tmp1 = ir.RotateRight(x, ir.Imm8(6));
-    const IR::U32 tmp2 = ir.RotateRight(x, ir.Imm8(11));
-    const IR::U32 tmp3 = ir.RotateRight(x, ir.Imm8(25));
-
-    return ir.Eor(tmp1, ir.Eor(tmp2, tmp3));
-}
-
-enum class SHA256HashPart {
-    Part1,
-    Part2
-};
-
-IR::U128 SHA256hash(IREmitter& ir, IR::U128 x, IR::U128 y, IR::U128 w, SHA256HashPart part) {
-    for (size_t i = 0; i < 4; i++) {
-        const IR::U32 low_x = ir.VectorGetElement(32, x, 0);
-        const IR::U32 after_low_x = ir.VectorGetElement(32, x, 1);
-        const IR::U32 before_high_x = ir.VectorGetElement(32, x, 2);
-        const IR::U32 high_x = ir.VectorGetElement(32, x, 3);
-
-        const IR::U32 low_y = ir.VectorGetElement(32, y, 0);
-        const IR::U32 after_low_y = ir.VectorGetElement(32, y, 1);
-        const IR::U32 before_high_y = ir.VectorGetElement(32, y, 2);
-        const IR::U32 high_y = ir.VectorGetElement(32, y, 3);
-
-        const IR::U32 choice = SHAchoose(ir, low_y, after_low_y, before_high_y);
-        const IR::U32 majority = SHAmajority(ir, low_x, after_low_x, before_high_x);
-
-        const IR::U32 t = [&] {
-            const IR::U32 w_element = ir.VectorGetElement(32, w, i);
-            const IR::U32 sig = SHAhashSIGMA1(ir, low_y);
-
-            return ir.Add(high_y, ir.Add(sig, ir.Add(choice, w_element)));
-        }();
-
-        const IR::U32 new_low_x = ir.Add(t, ir.Add(SHAhashSIGMA0(ir, low_x), majority));
-        const IR::U32 new_low_y = ir.Add(t, high_x);
-
-        // Shuffle all words left by 1 element: [3, 2, 1, 0] -> [2, 1, 0, 3]
-        const IR::U128 shuffled_x = ir.VectorShuffleWords(x, 0b10010011);
-        const IR::U128 shuffled_y = ir.VectorShuffleWords(y, 0b10010011);
-
-        x = ir.VectorSetElement(32, shuffled_x, 0, new_low_x);
-        y = ir.VectorSetElement(32, shuffled_y, 0, new_low_y);
-    }
-
-    if (part == SHA256HashPart::Part1) {
-        return x;
-    }
-
-    return y;
-}
 }  // Anonymous namespace

 bool TranslatorVisitor::SHA1C(Vec Vm, Vec Vn, Vec Vd) {
@ -175,85 +114,34 @@ bool TranslatorVisitor::SHA1H(Vec Vn, Vec Vd) {
 }

 bool TranslatorVisitor::SHA256SU0(Vec Vn, Vec Vd) {
-    const IR::U128 d = ir.GetQ(Vd);
-    const IR::U128 n = ir.GetQ(Vn);
+    const IR::U128 x = ir.GetQ(Vd);
+    const IR::U128 y = ir.GetQ(Vn);

-    const IR::U128 t = [&] {
-        // Shuffle the upper three elements down: [3, 2, 1, 0] -> [0, 3, 2, 1]
-        const IR::U128 shuffled = ir.VectorShuffleWords(d, 0b00111001);
-
-        return ir.VectorSetElement(32, shuffled, 3, ir.VectorGetElement(32, n, 0));
-    }();
-
-    IR::U128 result = ir.ZeroVector();
-    for (size_t i = 0; i < 4; i++) {
-        const IR::U32 modified_element = [&] {
-            const IR::U32 element = ir.VectorGetElement(32, t, i);
-            const IR::U32 tmp1 = ir.RotateRight(element, ir.Imm8(7));
-            const IR::U32 tmp2 = ir.RotateRight(element, ir.Imm8(18));
-            const IR::U32 tmp3 = ir.LogicalShiftRight(element, ir.Imm8(3));
-
-            return ir.Eor(tmp1, ir.Eor(tmp2, tmp3));
-        }();
-
-        const IR::U32 d_element = ir.VectorGetElement(32, d, i);
-        result = ir.VectorSetElement(32, result, i, ir.Add(modified_element, d_element));
-    }
+    const IR::U128 result = ir.SHA256MessageSchedule0(x, y);

    ir.SetQ(Vd, result);
    return true;
 }

 bool TranslatorVisitor::SHA256SU1(Vec Vm, Vec Vn, Vec Vd) {
-    const IR::U128 d = ir.GetQ(Vd);
-    const IR::U128 m = ir.GetQ(Vm);
-    const IR::U128 n = ir.GetQ(Vn);
+    const IR::U128 x = ir.GetQ(Vd);
+    const IR::U128 y = ir.GetQ(Vn);
+    const IR::U128 z = ir.GetQ(Vm);

-    const IR::U128 T0 = [&] {
-        const IR::U32 low_m = ir.VectorGetElement(32, m, 0);
-        const IR::U128 shuffled_n = ir.VectorShuffleWords(n, 0b00111001);
-
-        return ir.VectorSetElement(32, shuffled_n, 3, low_m);
-    }();
-
-    const IR::U128 lower_half = [&] {
-        const IR::U128 T = ir.VectorShuffleWords(m, 0b01001110);
-        const IR::U128 tmp1 = ir.VectorRotateRight(32, T, 17);
-        const IR::U128 tmp2 = ir.VectorRotateRight(32, T, 19);
-        const IR::U128 tmp3 = ir.VectorLogicalShiftRight(32, T, 10);
-        const IR::U128 tmp4 = ir.VectorEor(tmp1, ir.VectorEor(tmp2, tmp3));
-        const IR::U128 tmp5 = ir.VectorAdd(32, tmp4, ir.VectorAdd(32, d, T0));
-        return ir.VectorZeroUpper(tmp5);
-    }();
-
-    const IR::U64 upper_half = [&] {
-        const IR::U128 tmp1 = ir.VectorRotateRight(32, lower_half, 17);
-        const IR::U128 tmp2 = ir.VectorRotateRight(32, lower_half, 19);
-        const IR::U128 tmp3 = ir.VectorLogicalShiftRight(32, lower_half, 10);
-        const IR::U128 tmp4 = ir.VectorEor(tmp1, ir.VectorEor(tmp2, tmp3));
-
-        // Shuffle the top two 32-bit elements downwards [3, 2, 1, 0] -> [1, 0, 3, 2]
-        const IR::U128 shuffled_d = ir.VectorShuffleWords(d, 0b01001110);
-        const IR::U128 shuffled_T0 = ir.VectorShuffleWords(T0, 0b01001110);
-
-        const IR::U128 tmp5 = ir.VectorAdd(32, tmp4, ir.VectorAdd(32, shuffled_d, shuffled_T0));
-        return ir.VectorGetElement(64, tmp5, 0);
-    }();
-
-    const IR::U128 result = ir.VectorSetElement(64, lower_half, 1, upper_half);
+    const IR::U128 result = ir.SHA256MessageSchedule1(x, y, z);

    ir.SetQ(Vd, result);
    return true;
 }

 bool TranslatorVisitor::SHA256H(Vec Vm, Vec Vn, Vec Vd) {
-    const IR::U128 result = SHA256hash(ir, ir.GetQ(Vd), ir.GetQ(Vn), ir.GetQ(Vm), SHA256HashPart::Part1);
+    const IR::U128 result = ir.SHA256Hash(ir.GetQ(Vd), ir.GetQ(Vn), ir.GetQ(Vm), true);
    ir.SetQ(Vd, result);
    return true;
 }

 bool TranslatorVisitor::SHA256H2(Vec Vm, Vec Vn, Vec Vd) {
-    const IR::U128 result = SHA256hash(ir, ir.GetQ(Vn), ir.GetQ(Vd), ir.GetQ(Vm), SHA256HashPart::Part2);
+    const IR::U128 result = ir.SHA256Hash(ir.GetQ(Vn), ir.GetQ(Vd), ir.GetQ(Vm), false);
    ir.SetQ(Vd, result);
    return true;
 }
--- a/externals/dynarmic/src/dynarmic/ir/ir_emitter.cpp
+++ b/externals/dynarmic/src/dynarmic/ir/ir_emitter.cpp
@ -903,6 +903,18 @@ U8 IREmitter::SM4AccessSubstitutionBox(const U8& a) {
    return Inst<U8>(Opcode::SM4AccessSubstitutionBox, a);
 }

+U128 IREmitter::SHA256Hash(const U128& x, const U128& y, const U128& w, bool part1) {  // Emits one SHA256Hash instruction and returns its U128 result.
+    return Inst<U128>(Opcode::SHA256Hash, x, y, w, Imm1(part1));  // part1 is carried as a U1 immediate (fourth argument).
+}
+
+U128 IREmitter::SHA256MessageSchedule0(const U128& x, const U128& y) {  // Emits one SHA256MessageSchedule0 instruction and returns its U128 result.
+    return Inst<U128>(Opcode::SHA256MessageSchedule0, x, y);
+}
+
+U128 IREmitter::SHA256MessageSchedule1(const U128& x, const U128& y, const U128& z) {  // Emits one SHA256MessageSchedule1 instruction and returns its U128 result.
+    return Inst<U128>(Opcode::SHA256MessageSchedule1, x, y, z);
+}
+
 UAny IREmitter::VectorGetElement(size_t esize, const U128& a, size_t index) {
    ASSERT_MSG(esize * index < 128, "Invalid index");
    switch (esize) {
--- a/externals/dynarmic/src/dynarmic/ir/ir_emitter.h
+++ b/externals/dynarmic/src/dynarmic/ir/ir_emitter.h
@ -219,6 +219,10 @@ public:

    U8 SM4AccessSubstitutionBox(const U8& a);

+    U128 SHA256Hash(const U128& x, const U128& y, const U128& w, bool part1);
+    U128 SHA256MessageSchedule0(const U128& x, const U128& y);
+    U128 SHA256MessageSchedule1(const U128& x, const U128& y, const U128& z);
+
    UAny VectorGetElement(size_t esize, const U128& a, size_t index);
    U128 VectorSetElement(size_t esize, const U128& a, size_t index, const UAny& elem);
    U128 VectorAbs(size_t esize, const U128& a);
--- a/externals/dynarmic/src/dynarmic/ir/opcodes.inc
+++ b/externals/dynarmic/src/dynarmic/ir/opcodes.inc
@ -272,6 +272,11 @@ OPCODE(AESMixColumns,                                       U128,           U128
 // SM4 instructions
 OPCODE(SM4AccessSubstitutionBox,                            U8,             U8                                                              )

+// SHA instructions
+OPCODE(SHA256Hash,                                          U128,           U128,           U128,           U128,           U1              )
+OPCODE(SHA256MessageSchedule0,                              U128,           U128,           U128                                            )
+OPCODE(SHA256MessageSchedule1,                              U128,           U128,           U128,           U128                            )
+
 // Vector instructions
 OPCODE(VectorGetElement8,                                   U8,             U128,           U8                                              )
 OPCODE(VectorGetElement16,                                  U16,            U128,           U8                                              )
--- a/externals/dynarmic/src/dynarmic/ir/opt/passes.h
+++ b/externals/dynarmic/src/dynarmic/ir/opt/passes.h
@ -20,6 +20,13 @@ class Block;

 namespace Dynarmic::Optimization {

+struct PolyfillOptions {  // Options passed to PolyfillPass.
+    bool sha256 = false;  // Off by default; the x64 JIT front ends set it when the host lacks HostFeature::SHA.
+
+    bool operator==(const PolyfillOptions&) const = default;  // Defaulted member-wise equality.
+};
+
+void PolyfillPass(IR::Block& block, const PolyfillOptions& opt);
 void A32ConstantMemoryReads(IR::Block& block, A32::UserCallbacks* cb);
 void A32GetSetElimination(IR::Block& block);
 void A64CallbackConfigPass(IR::Block& block, const A64::UserConfig& conf);
--- a/externals/dynarmic/src/dynarmic/ir/opt/polyfill_pass.cpp
+++ b/externals/dynarmic/src/dynarmic/ir/opt/polyfill_pass.cpp
@ -0,0 +1,175 @@
+/* This file is part of the dynarmic project.
+ * Copyright (c) 2022 MerryMage
+ * SPDX-License-Identifier: 0BSD
+ */
+
+#include "dynarmic/ir/basic_block.h"
+#include "dynarmic/ir/ir_emitter.h"
+#include "dynarmic/ir/microinstruction.h"
+#include "dynarmic/ir/opcodes.h"
+#include "dynarmic/ir/opt/passes.h"
+
+namespace Dynarmic::Optimization {
+
+namespace {
+
+void PolyfillSHA256MessageSchedule0(IR::IREmitter& ir, IR::Inst& inst) {  // Emits IR that computes the SHA256MessageSchedule0 result from inst's two operands and redirects inst's uses to it.
+    const IR::U128 x = (IR::U128)inst.GetArg(0);
+    const IR::U128 y = (IR::U128)inst.GetArg(1);
+
+    const IR::U128 t = ir.VectorExtract(x, y, 32);  // presumably the x/y pair taken at a 32-bit offset; confirm against VectorExtract
+
+    IR::U128 result = ir.ZeroVector();
+    for (size_t i = 0; i < 4; i++) {  // One pass per 32-bit element of t.
+        const IR::U32 modified_element = [&] {
+            const IR::U32 element = ir.VectorGetElement(32, t, i);
+            const IR::U32 tmp1 = ir.RotateRight(element, ir.Imm8(7));
+            const IR::U32 tmp2 = ir.RotateRight(element, ir.Imm8(18));
+            const IR::U32 tmp3 = ir.LogicalShiftRight(element, ir.Imm8(3));
+
+            return ir.Eor(tmp1, ir.Eor(tmp2, tmp3));  // ROR(e, 7) ^ ROR(e, 18) ^ LSR(e, 3)
+        }();
+
+        result = ir.VectorSetElement(32, result, i, modified_element);
+    }
+    result = ir.VectorAdd(32, result, x);  // Adds the first operand using 32-bit elements.
+
+    inst.ReplaceUsesWith(result);  // Consumers of inst now read the polyfilled value.
+}
+
+void PolyfillSHA256MessageSchedule1(IR::IREmitter& ir, IR::Inst& inst) {  // Emits IR that computes the SHA256MessageSchedule1 result from inst's three operands and redirects inst's uses to it.
+    const IR::U128 x = (IR::U128)inst.GetArg(0);
+    const IR::U128 y = (IR::U128)inst.GetArg(1);
+    const IR::U128 z = (IR::U128)inst.GetArg(2);
+
+    const IR::U128 T0 = ir.VectorExtract(y, z, 32);  // presumably the y/z pair taken at a 32-bit offset; confirm against VectorExtract
+
+    const IR::U128 lower_half = [&] {  // First half of the result, derived from z, x and T0.
+        const IR::U128 T = ir.VectorShuffleWords(z, 0b01001110);  // Same mask as the [3, 2, 1, 0] -> [1, 0, 3, 2] shuffle used below.
+        const IR::U128 tmp1 = ir.VectorRotateRight(32, T, 17);
+        const IR::U128 tmp2 = ir.VectorRotateRight(32, T, 19);
+        const IR::U128 tmp3 = ir.VectorLogicalShiftRight(32, T, 10);
+        const IR::U128 tmp4 = ir.VectorEor(tmp1, ir.VectorEor(tmp2, tmp3));  // ROR 17 ^ ROR 19 ^ LSR 10 on 32-bit elements
+        const IR::U128 tmp5 = ir.VectorAdd(32, tmp4, ir.VectorAdd(32, x, T0));
+        return ir.VectorZeroUpper(tmp5);  // presumably clears the upper 64 bits so only the low half is kept; confirm
+    }();
+
+    const IR::U64 upper_half = [&] {  // Second half: the same rotate/shift mix, now fed with the freshly computed lower_half.
+        const IR::U128 tmp1 = ir.VectorRotateRight(32, lower_half, 17);
+        const IR::U128 tmp2 = ir.VectorRotateRight(32, lower_half, 19);
+        const IR::U128 tmp3 = ir.VectorLogicalShiftRight(32, lower_half, 10);
+        const IR::U128 tmp4 = ir.VectorEor(tmp1, ir.VectorEor(tmp2, tmp3));
+
+        // Shuffle the top two 32-bit elements downwards [3, 2, 1, 0] -> [1, 0, 3, 2]
+        const IR::U128 shuffled_d = ir.VectorShuffleWords(x, 0b01001110);
+        const IR::U128 shuffled_T0 = ir.VectorShuffleWords(T0, 0b01001110);
+
+        const IR::U128 tmp5 = ir.VectorAdd(32, tmp4, ir.VectorAdd(32, shuffled_d, shuffled_T0));
+        return ir.VectorGetElement(64, tmp5, 0);  // Only 64-bit element 0 of the sum is used.
+    }();
+
+    const IR::U128 result = ir.VectorSetElement(64, lower_half, 1, upper_half);  // 64-bit element 1 of lower_half is replaced by upper_half.
+
+    inst.ReplaceUsesWith(result);  // Consumers of inst now read the polyfilled value.
+}
+
+IR::U32 SHAchoose(IR::IREmitter& ir, IR::U32 x, IR::U32 y, IR::U32 z) {  // Emits a bitwise select: each result bit comes from y where x is 1 and from z where x is 0.
+    return ir.Eor(ir.And(ir.Eor(y, z), x), z);  // ((y ^ z) & x) ^ z
+}
+
+IR::U32 SHAmajority(IR::IREmitter& ir, IR::U32 x, IR::U32 y, IR::U32 z) {  // Emits a bitwise majority: a result bit is 1 when at least two of x, y, z have it set.
+    return ir.Or(ir.And(x, y), ir.And(ir.Or(x, y), z));  // (x & y) | ((x | y) & z)
+}
+
+IR::U32 SHAhashSIGMA0(IR::IREmitter& ir, IR::U32 x) {  // Emits ROR(x, 2) ^ ROR(x, 13) ^ ROR(x, 22).
+    const IR::U32 tmp1 = ir.RotateRight(x, ir.Imm8(2));
+    const IR::U32 tmp2 = ir.RotateRight(x, ir.Imm8(13));
+    const IR::U32 tmp3 = ir.RotateRight(x, ir.Imm8(22));
+
+    return ir.Eor(tmp1, ir.Eor(tmp2, tmp3));
+}
+
+IR::U32 SHAhashSIGMA1(IR::IREmitter& ir, IR::U32 x) {  // Emits ROR(x, 6) ^ ROR(x, 11) ^ ROR(x, 25).
+    const IR::U32 tmp1 = ir.RotateRight(x, ir.Imm8(6));
+    const IR::U32 tmp2 = ir.RotateRight(x, ir.Imm8(11));
+    const IR::U32 tmp3 = ir.RotateRight(x, ir.Imm8(25));
+
+    return ir.Eor(tmp1, ir.Eor(tmp2, tmp3));
+}
+
+void PolyfillSHA256Hash(IR::IREmitter& ir, IR::Inst& inst) {  // Emits four unrolled hash rounds over operands (x, y, w) and redirects inst's uses to the updated x or y.
+    IR::U128 x = (IR::U128)inst.GetArg(0);
+    IR::U128 y = (IR::U128)inst.GetArg(1);
+    const IR::U128 w = (IR::U128)inst.GetArg(2);
+    const bool part1 = inst.GetArg(3).GetU1();  // Read as a constant when the pass runs; picks which vector is the result (see the final line).
+
+    for (size_t i = 0; i < 4; i++) {  // Each round consumes 32-bit element i of w and updates both x and y.
+        const IR::U32 low_x = ir.VectorGetElement(32, x, 0);
+        const IR::U32 after_low_x = ir.VectorGetElement(32, x, 1);
+        const IR::U32 before_high_x = ir.VectorGetElement(32, x, 2);
+        const IR::U32 high_x = ir.VectorGetElement(32, x, 3);
+
+        const IR::U32 low_y = ir.VectorGetElement(32, y, 0);
+        const IR::U32 after_low_y = ir.VectorGetElement(32, y, 1);
+        const IR::U32 before_high_y = ir.VectorGetElement(32, y, 2);
+        const IR::U32 high_y = ir.VectorGetElement(32, y, 3);
+
+        const IR::U32 choice = SHAchoose(ir, low_y, after_low_y, before_high_y);
+        const IR::U32 majority = SHAmajority(ir, low_x, after_low_x, before_high_x);
+
+        const IR::U32 t = [&] {  // t = high_y + SIGMA1(low_y) + choice + w[i]
+            const IR::U32 w_element = ir.VectorGetElement(32, w, i);
+            const IR::U32 sig = SHAhashSIGMA1(ir, low_y);
+
+            return ir.Add(high_y, ir.Add(sig, ir.Add(choice, w_element)));
+        }();
+
+        const IR::U32 new_low_x = ir.Add(t, ir.Add(SHAhashSIGMA0(ir, low_x), majority));
+        const IR::U32 new_low_y = ir.Add(t, high_x);
+
+        // Shuffle all words left by 1 element: [3, 2, 1, 0] -> [2, 1, 0, 3]
+        const IR::U128 shuffled_x = ir.VectorShuffleWords(x, 0b10010011);
+        const IR::U128 shuffled_y = ir.VectorShuffleWords(y, 0b10010011);
+
+        x = ir.VectorSetElement(32, shuffled_x, 0, new_low_x);  // Element 0 of the rotated vector is overwritten with the new value.
+        y = ir.VectorSetElement(32, shuffled_y, 0, new_low_y);
+    }
+
+    inst.ReplaceUsesWith(part1 ? x : y);  // part1 selects the updated x, otherwise the updated y.
+}
+
+}  // namespace
+
+void PolyfillPass(IR::Block& block, const PolyfillOptions& polyfill) {  // Walks block and expands every instruction whose opcode family is enabled in polyfill.
+    if (polyfill == PolyfillOptions{}) {  // All options at their defaults means nothing is enabled, so skip the walk.
+        return;
+    }
+
+    IR::IREmitter ir{block};
+
+    for (auto& inst : block) {
+        ir.SetInsertionPoint(&inst);  // Replacement IR is emitted at the instruction currently being examined.
+
+        switch (inst.GetOpcode()) {
+        case IR::Opcode::SHA256MessageSchedule0:
+            if (polyfill.sha256) {
+                PolyfillSHA256MessageSchedule0(ir, inst);
+            }
+            break;
+        case IR::Opcode::SHA256MessageSchedule1:
+            if (polyfill.sha256) {
+                PolyfillSHA256MessageSchedule1(ir, inst);
+            }
+            break;
+        case IR::Opcode::SHA256Hash:
+            if (polyfill.sha256) {
+                PolyfillSHA256Hash(ir, inst);
+            }
+            break;
+        default:  // Every other opcode is left untouched.
+            break;
+        }
+    }
+}
+
+}  // namespace Dynarmic::Optimization
--- a/externals/dynarmic/tests/A32/fuzz_arm.cpp
+++ b/externals/dynarmic/tests/A32/fuzz_arm.cpp
@ -535,6 +535,37 @@ TEST_CASE("A32: Single random thumb instruction", "[thumb]") {
    }
 }

+TEST_CASE("A32: Single random thumb instruction (offset)", "[thumb]") {  // Runs a NOP followed by one random Thumb instruction through the JIT and Unicorn from the same initial state.
+    ThumbTestEnv jit_env{};
+    ThumbTestEnv uni_env{};
+
+    Dynarmic::A32::Jit jit{GetUserConfig(jit_env)};
+    A32Unicorn<ThumbTestEnv> uni{uni_env};
+
+    A32Unicorn<ThumbTestEnv>::RegisterArray regs;
+    A32Unicorn<ThumbTestEnv>::ExtRegArray ext_reg;
+    std::vector<u16> instructions;
+
+    for (size_t iteration = 0; iteration < 100000; ++iteration) {
+        std::generate(regs.begin(), regs.end(), [] { return RandInt<u32>(0, ~u32(0)); });  // Fresh random register state every iteration.
+        std::generate(ext_reg.begin(), ext_reg.end(), [] { return RandInt<u32>(0, ~u32(0)); });
+
+        instructions.clear();
+        instructions.push_back(0xbf00);  // NOP
+        const std::vector<u16> inst = GenRandomThumbInst(0, true);
+        instructions.insert(instructions.end(), inst.begin(), inst.end());  // The random instruction follows the NOP, so it sits one halfword past the start.
+
+        const u32 start_address = 100;
+        const u32 cpsr = (RandInt<u32>(0, 0xF) << 28) | 0x1F0;  // Random value in bits 31:28, low bits fixed at 0x1F0.
+        const u32 fpcr = RandomFpcr();
+
+        INFO("Instruction: 0x" << std::hex << inst[0]);  // Only the first halfword of the random instruction is reported.
+
+        regs[15] = start_address;
+        RunTestInstance(jit, uni, jit_env, uni_env, regs, ext_reg, instructions, cpsr, fpcr, 2);  // presumably 2 = number of instructions/ticks to run (NOP + random); TODO confirm
+    }
+}
+
 TEST_CASE("A32: Small random thumb block", "[thumb]") {
    ThumbTestEnv jit_env{};
    ThumbTestEnv uni_env{};
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -38,7 +38,7 @@ if (MSVC)
        /MP
        /Zf
        /Zi
-        /Zm200
+        /Zm300
        /Zo
        /permissive-
        /EHsc
--- a/src/core/memory.cpp
+++ b/src/core/memory.cpp
@ -322,7 +322,7 @@ struct Memory::Impl {
        }

        if (Settings::IsFastmemEnabled()) {
-            const bool is_read_enable = Settings::IsGPULevelHigh() || !cached;
+            const bool is_read_enable = !Settings::IsGPULevelExtreme() || !cached;
            system.DeviceMemory().buffer.Protect(vaddr, size, is_read_enable, !cached);
        }

--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@ -1495,15 +1495,13 @@ typename BufferCache<P>::OverlapResult BufferCache<P>::ResolveOverlaps(VAddr cpu
        overlap_ids.push_back(overlap_id);
        overlap.Pick();
        const VAddr overlap_cpu_addr = overlap.CpuAddr();
-        bool goes_left = false;
-        if (overlap_cpu_addr < begin) {
-            goes_left = true;
+        const bool expands_left = overlap_cpu_addr < begin;
+        if (expands_left) {
            cpu_addr = begin = overlap_cpu_addr;
        }
        const VAddr overlap_end = overlap_cpu_addr + overlap.SizeBytes();
-        bool goes_right = false;
+        const bool expands_right = overlap_end > end;
        if (overlap_end > end) {
-            goes_right = true;
            end = overlap_end;
        }
        stream_score += overlap.StreamScore();
@ -1511,11 +1509,11 @@ typename BufferCache<P>::OverlapResult BufferCache<P>::ResolveOverlaps(VAddr cpu
            // When this memory region has been joined a bunch of times, we assume it's being used
            // as a stream buffer. Increase the size to skip constantly recreating buffers.
            has_stream_leap = true;
-            if (goes_right) {
+            if (expands_right) {
                begin -= PAGE_SIZE * 256;
                cpu_addr = begin;
            }
-            if (goes_left) {
+            if (expands_left) {
                end += PAGE_SIZE * 256;
            }
        }
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@ -2,8 +2,11 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.

+#include <algorithm>
 #include <cstring>
 #include <optional>
+
+#include "common/alignment.h"
 #include "common/assert.h"
 #include "common/settings.h"
 #include "core/core.h"
@ -27,6 +30,7 @@ Maxwell3D::Maxwell3D(Core::System& system_, MemoryManager& memory_manager_)
      upload_state{memory_manager, regs.upload} {
    dirty.flags.flip();
    InitializeRegisterDefaults();
+    accelerated_reads = Settings::IsFastmemEnabled();
 }

 Maxwell3D::~Maxwell3D() = default;
@ -210,28 +214,14 @@ void Maxwell3D::ProcessMethodCall(u32 method, u32 argument, u32 nonshadow_argume
        return ProcessCBBind(4);
    case MAXWELL3D_REG_INDEX(draw.vertex_end_gl):
        return DrawArrays();
-    case MAXWELL3D_REG_INDEX(small_index): {
+    case MAXWELL3D_REG_INDEX(small_index):
        regs.index_array.count = regs.small_index.count;
        regs.index_array.first = regs.small_index.first;
        dirty.flags[VideoCommon::Dirty::IndexBuffer] = true;
-        bool is_extreme = Settings::IsGPULevelExtreme();
-
-        if (!is_extreme) {
-            for (size_t i = 0; i < Regs::NumVertexArrays; i++) {
-                if (!dirty.flags[VideoCommon::Dirty::VertexBuffer0 + i]) {
-                    continue;
-                }
-                const u32 stride = regs.vertex_array[i].stride;
-                const u32 num_vertices = regs.index_array.first + regs.index_array.count;
-                const GPUVAddr gpu_addr_begin =
-                    regs.vertex_array[i].StartAddress() + regs.index_array.first * stride;
-                const GPUVAddr gpu_addr_end = gpu_addr_begin + num_vertices * stride + 1;
-                regs.vertex_array_limit[i].SetAddress(gpu_addr_end);
-            }
-        }
-        DrawArrays();
-        return;
+        if (!Settings::IsGPULevelExtreme()) {
+            RecalculateVertexArrayLimit();
        }
+        return DrawArrays();
    case MAXWELL3D_REG_INDEX(topology_override):
        use_topology_override = true;
        return;
@ -685,4 +675,71 @@ void Maxwell3D::ProcessClearBuffers() {
    rasterizer->Clear();
 }

+void Maxwell3D::RecalculateVertexArrayLimit() {  // Scans not-yet-seen index data for the largest index and grows the limit address of every enabled vertex array to cover it.
+    GPUVAddr start_address = regs.index_array.StartAddress();
+    auto& vn_state = vertex_num_approx_state;
+    if (start_address != vn_state.last_index_array_start ||
+        vn_state.current_min_index != regs.index_array.first) {  // Different index buffer base or different first index: restart tracking.
+        vn_state.last_index_array_start = start_address;
+        vn_state.current_max_index = regs.index_array.first;
+        vn_state.current_min_index = regs.index_array.first;
+        vn_state.current_num_vertices = 0;
+    }
+    const u32 index_count = regs.index_array.first + regs.index_array.count;  // One past the last index position this draw uses.
+    if (index_count <= vn_state.current_max_index) {  // Everything up to here was already scanned.
+        return;
+    }
+    const u32 max_base = std::max(regs.index_array.first, vn_state.current_max_index);  // Start scanning where the previous scan stopped.
+    const u32 num_indices = index_count - max_base;
+    const size_t size_index = regs.index_array.FormatSizeInBytes();
+    const size_t expected_size = num_indices * size_index;
+    const size_t offset = max_base * size_index;
+
+    auto maybe_ptr = memory_manager.GpuToHostPointer(start_address + offset);
+    u8* ptr;
+    if (accelerated_reads && maybe_ptr) {  // Read the indices in place through the host pointer.
+        ptr = *maybe_ptr;  // NOTE(review): assumes the whole range is contiguous in host memory; confirm
+    } else {  // Otherwise copy the range into the reusable scratch buffer.
+        vn_state.index_buffer_cache.resize(Common::DivideUp(expected_size, sizeof(u32)));  // Buffer holds u32 words, so the byte size is rounded up.
+        ptr = reinterpret_cast<u8*>(vn_state.index_buffer_cache.data());
+        memory_manager.ReadBlockUnsafe(start_address + offset, ptr, expected_size);
+    }
+    vn_state.current_max_index = index_count;  // Remember how far the scan has progressed.
+
+    u32 new_num_vertices{};  // Stays 0 when the format matches none of the cases below.
+    switch (regs.index_array.format) {  // Largest index value in the scanned range, plus one.
+    case Regs::IndexFormat::UnsignedByte: {
+        std::span<const u8> span{ptr, num_indices};
+        const auto max = std::max_element(span.begin(), span.end());
+        new_num_vertices = *max + 1;
+        break;
+    }
+    case Regs::IndexFormat::UnsignedShort: {
+        std::span<const u16> span{reinterpret_cast<const u16*>(ptr), num_indices};
+        const auto max = std::max_element(span.begin(), span.end());
+        new_num_vertices = *max + 1;
+        break;
+    }
+    case Regs::IndexFormat::UnsignedInt: {
+        std::span<const u32> span{reinterpret_cast<const u32*>(ptr), num_indices};
+        const auto max = std::max_element(span.begin(), span.end());
+        new_num_vertices = *max + 1;
+        break;
+    }
+    }
+    if (new_num_vertices > vn_state.current_num_vertices) {  // Limits only ever grow while tracking the same index buffer.
+        vn_state.current_num_vertices = new_num_vertices;
+        for (size_t i = 0; i < Regs::NumVertexArrays; i++) {
+            if (!regs.vertex_array[i].enable) {
+                continue;
+            }
+            const u32 stride = regs.vertex_array[i].stride;
+            const GPUVAddr gpu_addr_begin = regs.vertex_array[i].StartAddress();
+            const GPUVAddr gpu_addr_end = gpu_addr_begin + new_num_vertices * stride - 1;  // Address of the last byte of the last vertex.
+            regs.vertex_array_limit[i].SetAddress(gpu_addr_end);
+            dirty.flags[VideoCommon::Dirty::VertexBuffer0 + i] = true;  // Flag the vertex buffer so the new limit is picked up.
+        }
+    }
+}
+
 } // namespace Tegra::Engines
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@ -1498,6 +1498,16 @@ public:
        Tables tables{};
    } dirty;

+    struct VertexNumApproxState {  // Tracking state for RecalculateVertexArrayLimit between draws.
+        GPUVAddr last_index_array_start;  // Index array start address seen on the previous call.
+        u32 current_max_index;  // Index position up to which the index data has been scanned.
+        u32 current_min_index;  // regs.index_array.first recorded when tracking was last reset.
+        u32 current_num_vertices;  // Largest vertex count derived so far (max index value + 1).
+        std::vector<u32> index_buffer_cache;  // Scratch buffer for index data when it cannot be read in place.
+    } vertex_num_approx_state;
+
+    bool accelerated_reads{};  // Set from Settings::IsFastmemEnabled() in the constructor; allows reading index data through a host pointer.
+
 private:
    void InitializeRegisterDefaults();

@ -1566,6 +1576,8 @@ private:
    // Handles an instance drawcall from MME
    void StepInstance(MMEDrawMode expected_mode, u32 count);

+    void RecalculateVertexArrayLimit();  // Updates vertex_array_limit for enabled vertex arrays from the largest index found in the index buffer.
+
    /// Returns a query's value or an empty object if the value will be deferred through a cache.
    std::optional<u64> GetQueryResult();

--- a/src/video_core/memory_manager.cpp
+++ b/src/video_core/memory_manager.cpp
@ -6,6 +6,7 @@

 #include "common/alignment.h"
 #include "common/assert.h"
+#include "common/host_memory.h"
 #include "common/logging/log.h"
 #include "core/core.h"
 #include "core/hle/kernel/k_page_table.h"
@ -186,6 +187,19 @@ std::optional<VAddr> MemoryManager::GpuToCpuAddress(GPUVAddr gpu_addr) const {
    return page_entry.ToAddress() + (gpu_addr & page_mask);
 }

+std::optional<u8*> MemoryManager::GpuToHostPointer(GPUVAddr gpu_addr) const {  // Translates gpu_addr to a host pointer, or std::nullopt when no translation is available.
+    auto cpu_addr = GpuToCpuAddress(gpu_addr);
+    if (!cpu_addr) {  // GPU address has no CPU address.
+        return std::nullopt;
+    }
+    auto& device_memory = system.DeviceMemory();
+    auto base = device_memory.buffer.VirtualBasePointer();
+    if (!base) {  // No virtual base pointer to offset from.
+        return std::nullopt;
+    }
+    return base + *cpu_addr;  // The CPU address is used directly as an offset from the virtual base pointer.
+}
+
 std::optional<VAddr> MemoryManager::GpuToCpuAddress(GPUVAddr addr, std::size_t size) const {
    size_t page_index{addr >> page_bits};
    const size_t page_last{(addr + size + page_size - 1) >> page_bits};
--- a/src/video_core/memory_manager.h
+++ b/src/video_core/memory_manager.h
@ -76,6 +76,8 @@ public:

    [[nodiscard]] std::optional<VAddr> GpuToCpuAddress(GPUVAddr addr) const;

+    [[nodiscard]] std::optional<u8*> GpuToHostPointer(GPUVAddr addr) const;  // Host pointer for addr, or std::nullopt if it cannot be translated.
+
    [[nodiscard]] std::optional<VAddr> GpuToCpuAddress(GPUVAddr addr, std::size_t size) const;

    template <typename T>