early-access version 2585

This commit is contained in:
pineappleEA 2022-03-21 16:53:01 +01:00
parent dd6ea95c54
commit 9c48e94f2d
31 changed files with 1072 additions and 483 deletions

View file

@ -1,7 +1,7 @@
yuzu emulator early access yuzu emulator early access
============= =============
This is the source code for early-access 2576. This is the source code for early-access 2585.
## Legal Notice ## Legal Notice

View file

@ -1,6 +1,6 @@
name: Build and Test name: Build and Test
on: [push, pull_request] on: [ push, pull_request ]
env: env:
BUILD_TYPE: Release BUILD_TYPE: Release
@ -9,76 +9,93 @@ jobs:
build: build:
strategy: strategy:
matrix: matrix:
os: [windows-latest, ubuntu-latest, macos-latest] os: [ windows-latest, ubuntu-latest, macos-latest ]
cpu_detection: [0, 1] cpu_detection: [ 0, 1 ]
fail-fast: false fail-fast: false
runs-on: ${{matrix.os}} runs-on: ${{matrix.os}}
steps: steps:
- name: Install build dependencies - name: Install build dependencies
if: ${{matrix.os == 'ubuntu-latest'}} if: ${{matrix.os == 'ubuntu-latest'}}
run: sudo apt-get install llvm ninja-build run: sudo apt-get install llvm ninja-build
- name: Install build dependencies - name: Install build dependencies
if: ${{matrix.os == 'macos-latest'}} if: ${{matrix.os == 'macos-latest'}}
run: | run: |
brew install llvm ninja brew install llvm ninja
echo "/usr/local/opt/llvm/bin" >> $GITHUB_PATH echo "/usr/local/opt/llvm/bin" >> $GITHUB_PATH
- name: Checkout dynarmic repo
uses: actions/checkout@v2
- name: Checkout ext-boost repo - name: Checkout dynarmic repo
uses: actions/checkout@v2 uses: actions/checkout@v2
with:
repository: MerryMage/ext-boost
path: externals/ext-boost
- name: Checkout unicorn repo - name: Checkout ext-boost repo
if: ${{matrix.os == 'ubuntu-latest' || matrix.os == 'macos-latest'}} uses: actions/checkout@v2
uses: actions/checkout@v2 with:
with: repository: MerryMage/ext-boost
repository: MerryMage/unicorn path: externals/ext-boost
path: externals/unicorn
- name: Build unicorn - name: Checkout unicorn repo
if: ${{matrix.os == 'ubuntu-latest' || matrix.os == 'macos-latest'}} if: ${{matrix.os == 'ubuntu-latest' || matrix.os == 'macos-latest'}}
working-directory: externals/unicorn uses: actions/checkout@v2
run: UNICORN_ARCHS=aarch64,arm ./make.sh with:
repository: MerryMage/unicorn
path: externals/unicorn
- name: Configure CMake - name: Build unicorn
if: ${{matrix.os == 'ubuntu-latest' || matrix.os == 'macos-latest'}} if: ${{matrix.os == 'ubuntu-latest' || matrix.os == 'macos-latest'}}
run: > working-directory: externals/unicorn
cmake run: UNICORN_ARCHS=aarch64,arm ./make.sh
-B ${{github.workspace}}/build
-DBoost_INCLUDE_DIRS=${{github.workspace}}/externals/ext-boost
-DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}}
-DDYNARMIC_ENABLE_CPU_FEATURE_DETECTION=${{matrix.cpu_detection}}
-DDYNARMIC_TESTS_USE_UNICORN=1
-DDYNARMIC_USE_LLVM=1
-DLIBUNICORN_INCLUDE_DIR=${{github.workspace}}/externals/unicorn/include
-DLIBUNICORN_LIBRARY=${{github.workspace}}/externals/unicorn/libunicorn.a
-G Ninja
- name: Configure CMake - name: Configure CMake
if: ${{matrix.os == 'windows-latest'}} if: ${{matrix.os == 'ubuntu-latest'}}
run: > env:
cmake CC: gcc-10
-B ${{github.workspace}}/build CXX: g++-10
-DBoost_INCLUDE_DIRS=${{github.workspace}}/externals/ext-boost run: >
-DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}} cmake
-DDYNARMIC_ENABLE_CPU_FEATURE_DETECTION=${{matrix.cpu_detection}} -B ${{github.workspace}}/build
-G "Visual Studio 17 2022" -DBoost_INCLUDE_DIRS=${{github.workspace}}/externals/ext-boost
-A x64 -DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}}
-DDYNARMIC_ENABLE_CPU_FEATURE_DETECTION=${{matrix.cpu_detection}}
-DDYNARMIC_TESTS_USE_UNICORN=1
-DDYNARMIC_USE_LLVM=1
-DLIBUNICORN_INCLUDE_DIR=${{github.workspace}}/externals/unicorn/include
-DLIBUNICORN_LIBRARY=${{github.workspace}}/externals/unicorn/libunicorn.a
-G Ninja
- name: Build - name: Configure CMake
working-directory: ${{github.workspace}}/build if: ${{matrix.os == 'macos-latest'}}
run: cmake --build . --config Release run: >
cmake
-B ${{github.workspace}}/build
-DBoost_INCLUDE_DIRS=${{github.workspace}}/externals/ext-boost
-DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}}
-DDYNARMIC_ENABLE_CPU_FEATURE_DETECTION=${{matrix.cpu_detection}}
-DDYNARMIC_TESTS_USE_UNICORN=1
-DDYNARMIC_USE_LLVM=1
-DLIBUNICORN_INCLUDE_DIR=${{github.workspace}}/externals/unicorn/include
-DLIBUNICORN_LIBRARY=${{github.workspace}}/externals/unicorn/libunicorn.a
-G Ninja
- name: Test - name: Configure CMake
env: if: ${{matrix.os == 'windows-latest'}}
DYLD_FALLBACK_LIBRARY_PATH: ${{github.workspace}}/externals/unicorn run: >
working-directory: ${{github.workspace}}/build cmake
run: ctest --extra-verbose -C ${{env.BUILD_TYPE}} -B ${{github.workspace}}/build
-DBoost_INCLUDE_DIRS=${{github.workspace}}/externals/ext-boost
-DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}}
-DDYNARMIC_ENABLE_CPU_FEATURE_DETECTION=${{matrix.cpu_detection}}
-G "Visual Studio 17 2022"
-A x64
- name: Build
working-directory: ${{github.workspace}}/build
run: cmake --build . --config Release
- name: Test
env:
DYLD_FALLBACK_LIBRARY_PATH: ${{github.workspace}}/externals/unicorn
working-directory: ${{github.workspace}}/build
run: ctest --extra-verbose -C ${{env.BUILD_TYPE}}

View file

@ -134,7 +134,6 @@ endif()
if (DYNARMIC_NO_BUNDLED_VIXL AND ARCHITECTURE STREQUAL "arm64") if (DYNARMIC_NO_BUNDLED_VIXL AND ARCHITECTURE STREQUAL "arm64")
find_package(PkgConfig REQUIRED) find_package(PkgConfig REQUIRED)
pkg_check_modules(vixl REQUIRED IMPORTED_TARGET vixl) pkg_check_modules(vixl REQUIRED IMPORTED_TARGET vixl)
target_include_directories(PkgConfig::vixl INTERFACE "${vixl_INCLUDE_DIRS}/vixl")
add_library(vixl ALIAS PkgConfig::vixl) add_library(vixl ALIAS PkgConfig::vixl)
endif() endif()

File diff suppressed because it is too large Load diff

View file

@ -92,6 +92,7 @@ add_library(dynarmic
ir/opt/identity_removal_pass.cpp ir/opt/identity_removal_pass.cpp
ir/opt/ir_matcher.h ir/opt/ir_matcher.h
ir/opt/passes.h ir/opt/passes.h
ir/opt/polyfill_pass.cpp
ir/opt/verification_pass.cpp ir/opt/verification_pass.cpp
ir/terminal.h ir/terminal.h
ir/type.cpp ir/type.cpp
@ -286,6 +287,7 @@ if (ARCHITECTURE STREQUAL "x86_64")
backend/x64/emit_x64_packed.cpp backend/x64/emit_x64_packed.cpp
backend/x64/emit_x64_saturation.cpp backend/x64/emit_x64_saturation.cpp
backend/x64/emit_x64_sm4.cpp backend/x64/emit_x64_sm4.cpp
backend/x64/emit_x64_sha.cpp
backend/x64/emit_x64_vector.cpp backend/x64/emit_x64_vector.cpp
backend/x64/emit_x64_vector_floating_point.cpp backend/x64/emit_x64_vector_floating_point.cpp
backend/x64/emit_x64_vector_saturation.cpp backend/x64/emit_x64_vector_saturation.cpp

View file

@ -50,16 +50,24 @@ static std::function<void(BlockOfCode&)> GenRCP(const A32::UserConfig& conf) {
}; };
} }
// Builds the polyfill configuration for this JIT instance from the host CPU
// features reported by `code`. The `sha256` flag is set exactly when the host
// does NOT report HostFeature::SHA.
// NOTE(review): presumably this makes PolyfillPass expand the SHA256 IR ops into
// generic IR on such hosts — confirm against ir/opt/polyfill_pass.cpp.
static Optimization::PolyfillOptions GenPolyfillOptions(const BlockOfCode& code) {
    const bool host_has_sha = code.HasHostFeature(HostFeature::SHA);

    return Optimization::PolyfillOptions{
        .sha256 = !host_has_sha,
    };
}
struct Jit::Impl { struct Jit::Impl {
Impl(Jit* jit, A32::UserConfig conf) Impl(Jit* jit, A32::UserConfig conf)
: block_of_code(GenRunCodeCallbacks(conf.callbacks, &GetCurrentBlockThunk, this), JitStateInfo{jit_state}, conf.code_cache_size, conf.far_code_offset, GenRCP(conf)) : block_of_code(GenRunCodeCallbacks(conf.callbacks, &GetCurrentBlockThunk, this), JitStateInfo{jit_state}, conf.code_cache_size, conf.far_code_offset, GenRCP(conf))
, emitter(block_of_code, conf, jit) , emitter(block_of_code, conf, jit)
, polyfill_options(GenPolyfillOptions(block_of_code))
, conf(std::move(conf)) , conf(std::move(conf))
, jit_interface(jit) {} , jit_interface(jit) {}
A32JitState jit_state; A32JitState jit_state;
BlockOfCode block_of_code; BlockOfCode block_of_code;
A32EmitX64 emitter; A32EmitX64 emitter;
Optimization::PolyfillOptions polyfill_options;
const A32::UserConfig conf; const A32::UserConfig conf;
@ -154,6 +162,7 @@ private:
} }
IR::Block ir_block = A32::Translate(A32::LocationDescriptor{descriptor}, conf.callbacks, {conf.arch_version, conf.define_unpredictable_behaviour, conf.hook_hint_instructions}); IR::Block ir_block = A32::Translate(A32::LocationDescriptor{descriptor}, conf.callbacks, {conf.arch_version, conf.define_unpredictable_behaviour, conf.hook_hint_instructions});
Optimization::PolyfillPass(ir_block, polyfill_options);
if (conf.HasOptimization(OptimizationFlag::GetSetElimination)) { if (conf.HasOptimization(OptimizationFlag::GetSetElimination)) {
Optimization::A32GetSetElimination(ir_block); Optimization::A32GetSetElimination(ir_block);
Optimization::DeadCodeElimination(ir_block); Optimization::DeadCodeElimination(ir_block);

View file

@ -45,12 +45,19 @@ static std::function<void(BlockOfCode&)> GenRCP(const A64::UserConfig& conf) {
}; };
} }
// Builds the polyfill configuration for this JIT instance from the host CPU
// features reported by `code`. The `sha256` flag is set exactly when the host
// does NOT report HostFeature::SHA.
// NOTE(review): presumably this makes PolyfillPass expand the SHA256 IR ops into
// generic IR on such hosts — confirm against ir/opt/polyfill_pass.cpp.
static Optimization::PolyfillOptions GenPolyfillOptions(const BlockOfCode& code) {
    const bool host_has_sha = code.HasHostFeature(HostFeature::SHA);

    return Optimization::PolyfillOptions{
        .sha256 = !host_has_sha,
    };
}
struct Jit::Impl final { struct Jit::Impl final {
public: public:
Impl(Jit* jit, UserConfig conf) Impl(Jit* jit, UserConfig conf)
: conf(conf) : conf(conf)
, block_of_code(GenRunCodeCallbacks(conf.callbacks, &GetCurrentBlockThunk, this), JitStateInfo{jit_state}, conf.code_cache_size, conf.far_code_offset, GenRCP(conf)) , block_of_code(GenRunCodeCallbacks(conf.callbacks, &GetCurrentBlockThunk, this), JitStateInfo{jit_state}, conf.code_cache_size, conf.far_code_offset, GenRCP(conf))
, emitter(block_of_code, conf, jit) { , emitter(block_of_code, conf, jit)
, polyfill_options(GenPolyfillOptions(block_of_code)) {
ASSERT(conf.page_table_address_space_bits >= 12 && conf.page_table_address_space_bits <= 64); ASSERT(conf.page_table_address_space_bits >= 12 && conf.page_table_address_space_bits <= 64);
} }
@ -253,6 +260,7 @@ private:
const auto get_code = [this](u64 vaddr) { return conf.callbacks->MemoryReadCode(vaddr); }; const auto get_code = [this](u64 vaddr) { return conf.callbacks->MemoryReadCode(vaddr); };
IR::Block ir_block = A64::Translate(A64::LocationDescriptor{current_location}, get_code, IR::Block ir_block = A64::Translate(A64::LocationDescriptor{current_location}, get_code,
{conf.define_unpredictable_behaviour, conf.wall_clock_cntpct}); {conf.define_unpredictable_behaviour, conf.wall_clock_cntpct});
Optimization::PolyfillPass(ir_block, polyfill_options);
Optimization::A64CallbackConfigPass(ir_block, conf); Optimization::A64CallbackConfigPass(ir_block, conf);
if (conf.HasOptimization(OptimizationFlag::GetSetElimination)) { if (conf.HasOptimization(OptimizationFlag::GetSetElimination)) {
Optimization::A64GetSetElimination(ir_block); Optimization::A64GetSetElimination(ir_block);
@ -301,6 +309,7 @@ private:
A64JitState jit_state; A64JitState jit_state;
BlockOfCode block_of_code; BlockOfCode block_of_code;
A64EmitX64 emitter; A64EmitX64 emitter;
Optimization::PolyfillOptions polyfill_options;
bool invalidate_entire_cache = false; bool invalidate_entire_cache = false;
boost::icl::interval_set<u64> invalid_cache_ranges; boost::icl::interval_set<u64> invalid_cache_ranges;

View file

@ -114,6 +114,8 @@ HostFeature GetHostFeatures() {
features |= HostFeature::FMA; features |= HostFeature::FMA;
if (cpu_info.has(Cpu::tAESNI)) if (cpu_info.has(Cpu::tAESNI))
features |= HostFeature::AES; features |= HostFeature::AES;
if (cpu_info.has(Cpu::tSHA))
features |= HostFeature::SHA;
if (cpu_info.has(Cpu::tPOPCNT)) if (cpu_info.has(Cpu::tPOPCNT))
features |= HostFeature::POPCNT; features |= HostFeature::POPCNT;
if (cpu_info.has(Cpu::tBMI1)) if (cpu_info.has(Cpu::tBMI1))

View file

@ -0,0 +1,81 @@
/* This file is part of the dynarmic project.
* Copyright (c) 2022 MerryMage
* SPDX-License-Identifier: 0BSD
*/
#include "dynarmic/backend/x64/block_of_code.h"
#include "dynarmic/backend/x64/emit_x64.h"
namespace Dynarmic::Backend::X64 {
using namespace Xbyak::util;
// Emits host code for the SHA256Hash IR instruction using the x64 SHA extension.
//
// IR arguments: args[0] = x, args[1] = y, args[2] = w (all 128-bit vectors),
// args[3] = immediate U1 `part1` flag. The value defined for `inst` is left in
// the register that initially held y.
//
// The host must report HostFeature::SHA; this is asserted rather than handled.
void EmitX64::EmitSHA256Hash(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
const bool part1 = args[3].GetImmediateU1();
ASSERT(code.HasHostFeature(HostFeature::SHA));
// Incoming element layout (element index 3 is the most significant word):
// 3 2 1 0
// x = d c b a
// y = h g f e
// w = wk3 wk2 wk1 wk0
// x and y are overwritten below, so they are taken as scratch registers;
// w is only read.
const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]);
const Xbyak::Xmm w = ctx.reg_alloc.UseXmm(args[2]);
// x64 expects:
// 3 2 1 0
// src1 = c d g h
// src2 = a b e f
// xmm0 = - - wk1 wk0
// xmm0 serves first as a temporary for building src1, then as the implicit
// third operand of sha256rnds2.
// NOTE(review): assumes the register allocator never hands out xmm0 for
// x/y/w — confirm against the allocator's register list.
code.movaps(xmm0, y);
code.shufps(xmm0, x, 0b10111011); // src1
code.shufps(y, x, 0b00010001); // src2
code.movaps(x, xmm0);
// Low two words of xmm0 now hold wk1/wk0 for the first pair of rounds.
code.movaps(xmm0, w);
code.sha256rnds2(x, y);
// Duplicate the high quadword so wk3/wk2 sit in the low two words for the
// second pair of rounds; the roles of x and y are swapped for that call.
code.punpckhqdq(xmm0, xmm0);
code.sha256rnds2(y, x);
// Repack from the x64 register layout back into a single result vector;
// the shuffle immediate chosen depends on the part1 flag.
code.shufps(y, x, part1 ? 0b10111011 : 0b00010001);
ctx.reg_alloc.DefineValue(inst, y);
}
// Emits host code for the SHA256MessageSchedule0 IR instruction as a single
// sha256msg1. IR arguments: args[0] = x, args[1] = y (128-bit vectors).
// The result is produced in place in x's register.
//
// The host must report HostFeature::SHA; this is asserted rather than handled.
void EmitX64::EmitSHA256MessageSchedule0(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
ASSERT(code.HasHostFeature(HostFeature::SHA));
// x is the destination of sha256msg1 and is overwritten, hence scratch;
// y is only read.
const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]);
code.sha256msg1(x, y);
ctx.reg_alloc.DefineValue(inst, x);
}
// Emits host code for the SHA256MessageSchedule1 IR instruction.
// IR arguments: args[0] = x, args[1] = y, args[2] = z (128-bit vectors).
// The result is produced in place in x's register.
//
// The host must report HostFeature::SHA; this is asserted rather than handled.
void EmitX64::EmitSHA256MessageSchedule1(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
ASSERT(code.HasHostFeature(HostFeature::SHA));
// x is accumulated into and overwritten, hence scratch; y and z are only read.
const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm z = ctx.reg_alloc.UseXmm(args[2]);
// Build in xmm0 the 128-bit window of the concatenation z:y shifted right by
// 4 bytes, i.e. words y[1], y[2], y[3], z[0] (lowest element first).
// NOTE(review): assumes the register allocator never hands out xmm0 for
// x/y/z — confirm against the allocator's register list.
code.movaps(xmm0, z);
code.palignr(xmm0, y, 4);
// Add that window to x word-wise before the sha256msg2 step, which also
// reads z.
code.paddd(x, xmm0);
code.sha256msg2(x, z);
ctx.reg_alloc.DefineValue(inst, x);
}
} // namespace Dynarmic::Backend::X64

View file

@ -26,14 +26,15 @@ enum class HostFeature : u64 {
F16C = 1ULL << 13, F16C = 1ULL << 13,
FMA = 1ULL << 14, FMA = 1ULL << 14,
AES = 1ULL << 15, AES = 1ULL << 15,
POPCNT = 1ULL << 16, SHA = 1ULL << 16,
BMI1 = 1ULL << 17, POPCNT = 1ULL << 17,
BMI2 = 1ULL << 18, BMI1 = 1ULL << 18,
LZCNT = 1ULL << 19, BMI2 = 1ULL << 19,
GFNI = 1ULL << 20, LZCNT = 1ULL << 20,
GFNI = 1ULL << 21,
// Zen-based BMI2 // Zen-based BMI2
FastBMI2 = 1ULL << 21, FastBMI2 = 1ULL << 22,
// Orthographic AVX512 features on 128 and 256 vectors // Orthographic AVX512 features on 128 and 256 vectors
AVX512_Ortho = AVX512F | AVX512VL, AVX512_Ortho = AVX512F | AVX512VL,

View file

@ -50,6 +50,9 @@ INST(asimd_VPMAX_float, "VPMAX (floating-point)", "111100110D0znnnndddd111
INST(asimd_VPMIN_float, "VPMIN (floating-point)", "111100110D1znnnndddd1111NQM0mmmm") // ASIMD INST(asimd_VPMIN_float, "VPMIN (floating-point)", "111100110D1znnnndddd1111NQM0mmmm") // ASIMD
INST(asimd_VRECPS, "VRECPS", "111100100D0znnnndddd1111NQM1mmmm") // ASIMD INST(asimd_VRECPS, "VRECPS", "111100100D0znnnndddd1111NQM1mmmm") // ASIMD
INST(asimd_VRSQRTS, "VRSQRTS", "111100100D1znnnndddd1111NQM1mmmm") // ASIMD INST(asimd_VRSQRTS, "VRSQRTS", "111100100D1znnnndddd1111NQM1mmmm") // ASIMD
INST(v8_SHA256H, "SHA256H", "111100110D00nnnndddd1100NQM0mmmm") // v8
INST(v8_SHA256H2, "SHA256H2", "111100110D01nnnndddd1100NQM0mmmm") // v8
INST(v8_SHA256SU1, "SHA256SU1", "111100110D10nnnndddd1100NQM0mmmm") // v8
// Three registers of different lengths // Three registers of different lengths
INST(asimd_VADDL, "VADDL/VADDW", "1111001U1Dzznnnndddd000oN0M0mmmm") // ASIMD INST(asimd_VADDL, "VADDL/VADDW", "1111001U1Dzznnnndddd000oN0M0mmmm") // ASIMD
@ -144,7 +147,7 @@ INST(v8_AESIMC, "AESIMC", "111100111D11zz00dddd001
INST(arm_UDF, "UNALLOCATED", "111100111-11--01----001010-0----") // v8 INST(arm_UDF, "UNALLOCATED", "111100111-11--01----001010-0----") // v8
INST(arm_UDF, "UNALLOCATED (SHA1H)", "111100111-11--01----001011-0----") // v8 INST(arm_UDF, "UNALLOCATED (SHA1H)", "111100111-11--01----001011-0----") // v8
INST(arm_UDF, "UNALLOCATED (SHA1SU1)", "111100111-11--10----001110-0----") // v8 INST(arm_UDF, "UNALLOCATED (SHA1SU1)", "111100111-11--10----001110-0----") // v8
INST(arm_UDF, "UNALLOCATED (SHA256SU0)", "111100111-11--10----001111-0----") // v8 INST(v8_SHA256SU0, "SHA256SU0", "111100111D11zz10dddd001111M0mmmm") // v8
// One register and modified immediate // One register and modified immediate
INST(asimd_VMOV_imm, "VBIC, VMOV, VMVN, VORR (immediate)", "1111001a1D000bcdVVVVmmmm0Qo1efgh") // ASIMD INST(asimd_VMOV_imm, "VBIC, VMOV, VMVN, VORR (immediate)", "1111001a1D000bcdVVVVmmmm0Qo1efgh") // ASIMD

View file

@ -19,6 +19,8 @@ INST(thumb32_LDRD_lit_1, "LDRD (lit)", "11101000U1111111ttttss
INST(thumb32_LDRD_lit_2, "LDRD (lit)", "11101001U1W11111ttttssssiiiiiiii") INST(thumb32_LDRD_lit_2, "LDRD (lit)", "11101001U1W11111ttttssssiiiiiiii")
INST(thumb32_LDRD_imm_1, "LDRD (imm)", "11101000U111nnnnttttssssiiiiiiii") INST(thumb32_LDRD_imm_1, "LDRD (imm)", "11101000U111nnnnttttssssiiiiiiii")
INST(thumb32_LDRD_imm_2, "LDRD (imm)", "11101001U1W1nnnnttttssssiiiiiiii") INST(thumb32_LDRD_imm_2, "LDRD (imm)", "11101001U1W1nnnnttttssssiiiiiiii")
INST(thumb32_STL, "STL", "111010001100nnnntttt111110101111") // v8
INST(thumb32_LDA, "LDA", "111010001101nnnntttt111110101111") // v8
INST(thumb32_STREXB, "STREXB", "111010001100nnnntttt11110100dddd") INST(thumb32_STREXB, "STREXB", "111010001100nnnntttt11110100dddd")
INST(thumb32_STREXH, "STREXH", "111010001100nnnntttt11110101dddd") INST(thumb32_STREXH, "STREXH", "111010001100nnnntttt11110101dddd")
INST(thumb32_STREXD, "STREXD", "111010001100nnnnttttuuuu0111dddd") INST(thumb32_STREXD, "STREXD", "111010001100nnnnttttuuuu0111dddd")

View file

@ -493,6 +493,7 @@ struct TranslatorVisitor final {
bool thumb32_STMDB(bool W, Reg n, Imm<15> reg_list); bool thumb32_STMDB(bool W, Reg n, Imm<15> reg_list);
// thumb32 load/store dual, load/store exclusive, table branch instructions // thumb32 load/store dual, load/store exclusive, table branch instructions
bool thumb32_LDA(Reg n, Reg t);
bool thumb32_LDRD_imm_1(bool U, Reg n, Reg t, Reg t2, Imm<8> imm8); bool thumb32_LDRD_imm_1(bool U, Reg n, Reg t, Reg t2, Imm<8> imm8);
bool thumb32_LDRD_imm_2(bool U, bool W, Reg n, Reg t, Reg t2, Imm<8> imm8); bool thumb32_LDRD_imm_2(bool U, bool W, Reg n, Reg t, Reg t2, Imm<8> imm8);
bool thumb32_LDRD_lit_1(bool U, Reg t, Reg t2, Imm<8> imm8); bool thumb32_LDRD_lit_1(bool U, Reg t, Reg t2, Imm<8> imm8);
@ -503,6 +504,7 @@ struct TranslatorVisitor final {
bool thumb32_LDREXD(Reg n, Reg t, Reg t2); bool thumb32_LDREXD(Reg n, Reg t, Reg t2);
bool thumb32_LDREXB(Reg n, Reg t); bool thumb32_LDREXB(Reg n, Reg t);
bool thumb32_LDREXH(Reg n, Reg t); bool thumb32_LDREXH(Reg n, Reg t);
bool thumb32_STL(Reg n, Reg t);
bool thumb32_STREX(Reg n, Reg t, Reg d, Imm<8> imm8); bool thumb32_STREX(Reg n, Reg t, Reg d, Imm<8> imm8);
bool thumb32_STREXB(Reg n, Reg t, Reg d); bool thumb32_STREXB(Reg n, Reg t, Reg d);
bool thumb32_STREXD(Reg n, Reg t, Reg t2, Reg d); bool thumb32_STREXD(Reg n, Reg t, Reg t2, Reg d);
@ -875,6 +877,9 @@ struct TranslatorVisitor final {
bool asimd_VPMIN_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); bool asimd_VPMIN_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
bool asimd_VRECPS(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); bool asimd_VRECPS(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
bool asimd_VRSQRTS(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm); bool asimd_VRSQRTS(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
bool v8_SHA256H(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
bool v8_SHA256H2(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
bool v8_SHA256SU1(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
// Advanced SIMD three registers with different lengths // Advanced SIMD three registers with different lengths
bool asimd_VADDL(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool op, bool N, bool M, size_t Vm); bool asimd_VADDL(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool op, bool N, bool M, size_t Vm);
@ -918,6 +923,7 @@ struct TranslatorVisitor final {
bool v8_AESE(bool D, size_t sz, size_t Vd, bool M, size_t Vm); bool v8_AESE(bool D, size_t sz, size_t Vd, bool M, size_t Vm);
bool v8_AESIMC(bool D, size_t sz, size_t Vd, bool M, size_t Vm); bool v8_AESIMC(bool D, size_t sz, size_t Vd, bool M, size_t Vm);
bool v8_AESMC(bool D, size_t sz, size_t Vd, bool M, size_t Vm); bool v8_AESMC(bool D, size_t sz, size_t Vd, bool M, size_t Vm);
bool v8_SHA256SU0(bool D, size_t sz, size_t Vd, bool M, size_t Vm);
bool asimd_VCLS(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm); bool asimd_VCLS(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm);
bool asimd_VCLZ(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm); bool asimd_VCLZ(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm);
bool asimd_VCNT(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm); bool asimd_VCNT(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm);

View file

@ -831,6 +831,60 @@ bool TranslatorVisitor::asimd_VRSQRTS(bool D, bool sz, size_t Vn, size_t Vd, boo
}); });
} }
// Translates SHA256H (Advanced SIMD encoding) into one SHA256Hash IR operation
// with part1 = true. The destination vector supplies the first operand, the
// N-register the second and the M-register the third; the result is written
// back to the destination vector.
//
// Returns the result of UndefinedInstruction() when Q is clear or any of the
// Vd/Vn/Vm fields has its lowest bit set; otherwise returns true.
bool TranslatorVisitor::v8_SHA256H(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
    if (!Q) {
        return UndefinedInstruction();
    }
    if (Common::Bit<0>(Vd) || Common::Bit<0>(Vn) || Common::Bit<0>(Vm)) {
        return UndefinedInstruction();
    }

    const auto reg_d = ToVector(Q, Vd, D);
    const auto reg_n = ToVector(Q, Vn, N);
    const auto reg_m = ToVector(Q, Vm, M);

    const auto operand_x = ir.GetVector(reg_d);
    const auto operand_y = ir.GetVector(reg_n);
    const auto operand_w = ir.GetVector(reg_m);

    ir.SetVector(reg_d, ir.SHA256Hash(operand_x, operand_y, operand_w, true));
    return true;
}
// Translates SHA256H2 (Advanced SIMD encoding) into one SHA256Hash IR operation
// with part1 = false. Note the operand roles differ from SHA256H: here the
// N-register supplies the first operand and the destination vector the second;
// the M-register supplies the third. The result is written back to the
// destination vector.
//
// Returns the result of UndefinedInstruction() when Q is clear or any of the
// Vd/Vn/Vm fields has its lowest bit set; otherwise returns true.
bool TranslatorVisitor::v8_SHA256H2(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
    if (!Q) {
        return UndefinedInstruction();
    }
    if (Common::Bit<0>(Vd) || Common::Bit<0>(Vn) || Common::Bit<0>(Vm)) {
        return UndefinedInstruction();
    }

    const auto reg_n = ToVector(Q, Vn, N);
    const auto reg_d = ToVector(Q, Vd, D);
    const auto reg_m = ToVector(Q, Vm, M);

    const auto operand_x = ir.GetVector(reg_n);
    const auto operand_y = ir.GetVector(reg_d);
    const auto operand_w = ir.GetVector(reg_m);

    ir.SetVector(reg_d, ir.SHA256Hash(operand_x, operand_y, operand_w, false));
    return true;
}
// Translates SHA256SU1 (Advanced SIMD encoding) into one SHA256MessageSchedule1
// IR operation. Operands are, in order, the destination vector, the N-register
// and the M-register; the result is written back to the destination vector.
//
// Returns the result of UndefinedInstruction() when Q is clear or any of the
// Vd/Vn/Vm fields has its lowest bit set; otherwise returns true.
bool TranslatorVisitor::v8_SHA256SU1(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
    if (!Q) {
        return UndefinedInstruction();
    }
    if (Common::Bit<0>(Vd) || Common::Bit<0>(Vn) || Common::Bit<0>(Vm)) {
        return UndefinedInstruction();
    }

    const auto reg_d = ToVector(Q, Vd, D);
    const auto reg_n = ToVector(Q, Vn, N);
    const auto reg_m = ToVector(Q, Vm, M);

    const auto operand_x = ir.GetVector(reg_d);
    const auto operand_y = ir.GetVector(reg_n);
    const auto operand_z = ir.GetVector(reg_m);

    ir.SetVector(reg_d, ir.SHA256MessageSchedule1(operand_x, operand_y, operand_z));
    return true;
}
// ASIMD Three registers of different length // ASIMD Three registers of different length
bool TranslatorVisitor::asimd_VADDL(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool op, bool N, bool M, size_t Vm) { bool TranslatorVisitor::asimd_VADDL(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool op, bool N, bool M, size_t Vm) {

View file

@ -225,6 +225,21 @@ bool TranslatorVisitor::v8_AESMC(bool D, size_t sz, size_t Vd, bool M, size_t Vm
return true; return true;
} }
// Translates SHA256SU0 (Advanced SIMD encoding) into one SHA256MessageSchedule0
// IR operation. Operands are the destination vector followed by the M-register;
// both are always decoded as quadword registers. The result is written back to
// the destination vector.
//
// Returns the result of UndefinedInstruction() when sz is not 0b10 or either of
// the Vd/Vm fields has its lowest bit set; otherwise returns true.
bool TranslatorVisitor::v8_SHA256SU0(bool D, size_t sz, size_t Vd, bool M, size_t Vm) {
    if (sz != 0b10) {
        return UndefinedInstruction();
    }
    if (Common::Bit<0>(Vd) || Common::Bit<0>(Vm)) {
        return UndefinedInstruction();
    }

    const auto reg_d = ToVector(true, Vd, D);
    const auto reg_m = ToVector(true, Vm, M);

    const auto operand_x = ir.GetVector(reg_d);
    const auto operand_y = ir.GetVector(reg_m);

    ir.SetVector(reg_d, ir.SHA256MessageSchedule0(operand_x, operand_y));
    return true;
}
bool TranslatorVisitor::asimd_VCLS(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm) { bool TranslatorVisitor::asimd_VCLS(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm) {
if (sz == 0b11) { if (sz == 0b11) {
return UndefinedInstruction(); return UndefinedInstruction();

View file

@ -110,6 +110,16 @@ static bool StoreDual(TranslatorVisitor& v, bool P, bool U, bool W, Reg n, Reg t
return true; return true;
} }
// Translates the Thumb-32 LDA instruction: loads a 32-bit word from the address
// held in register n and stores it in register t.
//
// Returns the result of UnpredictableInstruction() if either register is PC;
// otherwise returns true.
bool TranslatorVisitor::thumb32_LDA(Reg n, Reg t) {
    if (t == Reg::PC) {
        return UnpredictableInstruction();
    }
    if (n == Reg::PC) {
        return UnpredictableInstruction();
    }

    const auto base_address = ir.GetRegister(n);
    // AccType::Ordered
    // NOTE(review): the read is issued through the plain ReadMemory32 call; no
    // explicit ordering is visible here — confirm that is intended.
    const auto loaded_word = ir.ReadMemory32(base_address);
    ir.SetRegister(t, loaded_word);
    return true;
}
bool TranslatorVisitor::thumb32_LDRD_imm_1(bool U, Reg n, Reg t, Reg t2, Imm<8> imm8) { bool TranslatorVisitor::thumb32_LDRD_imm_1(bool U, Reg n, Reg t, Reg t2, Imm<8> imm8) {
return LoadDualImmediate(*this, false, U, true, n, t, t2, imm8); return LoadDualImmediate(*this, false, U, true, n, t, t2, imm8);
} }
@ -184,6 +194,16 @@ bool TranslatorVisitor::thumb32_LDREXH(Reg n, Reg t) {
return true; return true;
} }
// Translates the Thumb-32 STL instruction: stores the 32-bit value of register
// t to the address held in register n.
//
// Returns the result of UnpredictableInstruction() if either register is PC;
// otherwise returns true.
bool TranslatorVisitor::thumb32_STL(Reg n, Reg t) {
    if (t == Reg::PC) {
        return UnpredictableInstruction();
    }
    if (n == Reg::PC) {
        return UnpredictableInstruction();
    }

    const auto base_address = ir.GetRegister(n);
    const auto stored_word = ir.GetRegister(t);
    // AccType::Ordered
    // NOTE(review): the write is issued through the plain WriteMemory32 call; no
    // explicit ordering is visible here — confirm that is intended.
    ir.WriteMemory32(base_address, stored_word);
    return true;
}
bool TranslatorVisitor::thumb32_STREX(Reg n, Reg t, Reg d, Imm<8> imm8) { bool TranslatorVisitor::thumb32_STREX(Reg n, Reg t, Reg d, Imm<8> imm8) {
if (d == Reg::PC || t == Reg::PC || n == Reg::PC) { if (d == Reg::PC || t == Reg::PC || n == Reg::PC) {
return UnpredictableInstruction(); return UnpredictableInstruction();

View file

@ -1407,7 +1407,7 @@ bool TranslatorVisitor::vfp_VLDM_a1(Cond cond, bool p, bool u, bool D, bool w, R
return arm_UDF(); return arm_UDF();
} }
if (n == Reg::PC && w) { if (n == Reg::PC && (w || ir.current_location.TFlag())) {
return UnpredictableInstruction(); return UnpredictableInstruction();
} }
@ -1457,7 +1457,7 @@ bool TranslatorVisitor::vfp_VLDM_a2(Cond cond, bool p, bool u, bool D, bool w, R
return arm_UDF(); return arm_UDF();
} }
if (n == Reg::PC && w) { if (n == Reg::PC && (w || ir.current_location.TFlag())) {
return UnpredictableInstruction(); return UnpredictableInstruction();
} }

View file

@ -46,67 +46,6 @@ IR::U128 SHA1HashUpdate(IREmitter& ir, Vec Vm, Vec Vn, Vec Vd, SHA1HashUpdateFun
return x; return x;
} }
// Emits IR computing ROR(x, 2) ^ ROR(x, 13) ^ ROR(x, 22) on a 32-bit value.
IR::U32 SHAhashSIGMA0(IREmitter& ir, IR::U32 x) {
    const IR::U32 rotated_by_2 = ir.RotateRight(x, ir.Imm8(2));
    const IR::U32 rotated_by_13 = ir.RotateRight(x, ir.Imm8(13));
    const IR::U32 rotated_by_22 = ir.RotateRight(x, ir.Imm8(22));

    const auto upper_pair = ir.Eor(rotated_by_13, rotated_by_22);
    return ir.Eor(rotated_by_2, upper_pair);
}
// Emits IR computing ROR(x, 6) ^ ROR(x, 11) ^ ROR(x, 25) on a 32-bit value.
IR::U32 SHAhashSIGMA1(IREmitter& ir, IR::U32 x) {
    const IR::U32 rotated_by_6 = ir.RotateRight(x, ir.Imm8(6));
    const IR::U32 rotated_by_11 = ir.RotateRight(x, ir.Imm8(11));
    const IR::U32 rotated_by_25 = ir.RotateRight(x, ir.Imm8(25));

    const auto upper_pair = ir.Eor(rotated_by_11, rotated_by_25);
    return ir.Eor(rotated_by_6, upper_pair);
}
// Selects which of the two working vectors a SHA-256 hash helper hands back
// to its caller.
enum class SHA256HashPart { Part1 = 0, Part2 = 1 };
// Emits IR for four iterations of an update over the two 128-bit working
// vectors x and y, consuming one 32-bit element of w per iteration (element i
// on iteration i).
//
// Parameters:
//   ir   - emitter that receives the generated IR.
//   x, y - working vectors; taken by value and updated every iteration.
//   w    - vector whose four 32-bit elements are added in, one per iteration.
//   part - selects the return value: Part1 returns the final x, anything else
//          returns the final y.
//
// SHAchoose and SHAmajority are helpers defined outside this block.
IR::U128 SHA256hash(IREmitter& ir, IR::U128 x, IR::U128 y, IR::U128 w, SHA256HashPart part) {
for (size_t i = 0; i < 4; i++) {
// Unpack the four 32-bit elements of x (element 0 is "low", element 3 "high").
const IR::U32 low_x = ir.VectorGetElement(32, x, 0);
const IR::U32 after_low_x = ir.VectorGetElement(32, x, 1);
const IR::U32 before_high_x = ir.VectorGetElement(32, x, 2);
const IR::U32 high_x = ir.VectorGetElement(32, x, 3);
// Unpack the four 32-bit elements of y in the same way.
const IR::U32 low_y = ir.VectorGetElement(32, y, 0);
const IR::U32 after_low_y = ir.VectorGetElement(32, y, 1);
const IR::U32 before_high_y = ir.VectorGetElement(32, y, 2);
const IR::U32 high_y = ir.VectorGetElement(32, y, 3);
const IR::U32 choice = SHAchoose(ir, low_y, after_low_y, before_high_y);
const IR::U32 majority = SHAmajority(ir, low_x, after_low_x, before_high_x);
// t = high_y + SIGMA1(low_y) + choice + w[i]
const IR::U32 t = [&] {
const IR::U32 w_element = ir.VectorGetElement(32, w, i);
const IR::U32 sig = SHAhashSIGMA1(ir, low_y);
return ir.Add(high_y, ir.Add(sig, ir.Add(choice, w_element)));
}();
// New element 0 of x is t + SIGMA0(low_x) + majority; new element 0 of y
// is t + high_x.
const IR::U32 new_low_x = ir.Add(t, ir.Add(SHAhashSIGMA0(ir, low_x), majority));
const IR::U32 new_low_y = ir.Add(t, high_x);
// Shuffle all words left by 1 element: [3, 2, 1, 0] -> [2, 1, 0, 3]
const IR::U128 shuffled_x = ir.VectorShuffleWords(x, 0b10010011);
const IR::U128 shuffled_y = ir.VectorShuffleWords(y, 0b10010011);
// Element 0 of each shuffled vector is then replaced by the freshly
// computed word.
x = ir.VectorSetElement(32, shuffled_x, 0, new_low_x);
y = ir.VectorSetElement(32, shuffled_y, 0, new_low_y);
}
if (part == SHA256HashPart::Part1) {
return x;
}
return y;
}
} // Anonymous namespace } // Anonymous namespace
bool TranslatorVisitor::SHA1C(Vec Vm, Vec Vn, Vec Vd) { bool TranslatorVisitor::SHA1C(Vec Vm, Vec Vn, Vec Vd) {
@ -175,85 +114,34 @@ bool TranslatorVisitor::SHA1H(Vec Vn, Vec Vd) {
} }
bool TranslatorVisitor::SHA256SU0(Vec Vn, Vec Vd) { bool TranslatorVisitor::SHA256SU0(Vec Vn, Vec Vd) {
const IR::U128 d = ir.GetQ(Vd); const IR::U128 x = ir.GetQ(Vd);
const IR::U128 n = ir.GetQ(Vn); const IR::U128 y = ir.GetQ(Vn);
const IR::U128 t = [&] { const IR::U128 result = ir.SHA256MessageSchedule0(x, y);
// Shuffle the upper three elements down: [3, 2, 1, 0] -> [0, 3, 2, 1]
const IR::U128 shuffled = ir.VectorShuffleWords(d, 0b00111001);
return ir.VectorSetElement(32, shuffled, 3, ir.VectorGetElement(32, n, 0));
}();
IR::U128 result = ir.ZeroVector();
for (size_t i = 0; i < 4; i++) {
const IR::U32 modified_element = [&] {
const IR::U32 element = ir.VectorGetElement(32, t, i);
const IR::U32 tmp1 = ir.RotateRight(element, ir.Imm8(7));
const IR::U32 tmp2 = ir.RotateRight(element, ir.Imm8(18));
const IR::U32 tmp3 = ir.LogicalShiftRight(element, ir.Imm8(3));
return ir.Eor(tmp1, ir.Eor(tmp2, tmp3));
}();
const IR::U32 d_element = ir.VectorGetElement(32, d, i);
result = ir.VectorSetElement(32, result, i, ir.Add(modified_element, d_element));
}
ir.SetQ(Vd, result); ir.SetQ(Vd, result);
return true; return true;
} }
bool TranslatorVisitor::SHA256SU1(Vec Vm, Vec Vn, Vec Vd) { bool TranslatorVisitor::SHA256SU1(Vec Vm, Vec Vn, Vec Vd) {
const IR::U128 d = ir.GetQ(Vd); const IR::U128 x = ir.GetQ(Vd);
const IR::U128 m = ir.GetQ(Vm); const IR::U128 y = ir.GetQ(Vn);
const IR::U128 n = ir.GetQ(Vn); const IR::U128 z = ir.GetQ(Vm);
const IR::U128 T0 = [&] { const IR::U128 result = ir.SHA256MessageSchedule1(x, y, z);
const IR::U32 low_m = ir.VectorGetElement(32, m, 0);
const IR::U128 shuffled_n = ir.VectorShuffleWords(n, 0b00111001);
return ir.VectorSetElement(32, shuffled_n, 3, low_m);
}();
const IR::U128 lower_half = [&] {
const IR::U128 T = ir.VectorShuffleWords(m, 0b01001110);
const IR::U128 tmp1 = ir.VectorRotateRight(32, T, 17);
const IR::U128 tmp2 = ir.VectorRotateRight(32, T, 19);
const IR::U128 tmp3 = ir.VectorLogicalShiftRight(32, T, 10);
const IR::U128 tmp4 = ir.VectorEor(tmp1, ir.VectorEor(tmp2, tmp3));
const IR::U128 tmp5 = ir.VectorAdd(32, tmp4, ir.VectorAdd(32, d, T0));
return ir.VectorZeroUpper(tmp5);
}();
const IR::U64 upper_half = [&] {
const IR::U128 tmp1 = ir.VectorRotateRight(32, lower_half, 17);
const IR::U128 tmp2 = ir.VectorRotateRight(32, lower_half, 19);
const IR::U128 tmp3 = ir.VectorLogicalShiftRight(32, lower_half, 10);
const IR::U128 tmp4 = ir.VectorEor(tmp1, ir.VectorEor(tmp2, tmp3));
// Shuffle the top two 32-bit elements downwards [3, 2, 1, 0] -> [1, 0, 3, 2]
const IR::U128 shuffled_d = ir.VectorShuffleWords(d, 0b01001110);
const IR::U128 shuffled_T0 = ir.VectorShuffleWords(T0, 0b01001110);
const IR::U128 tmp5 = ir.VectorAdd(32, tmp4, ir.VectorAdd(32, shuffled_d, shuffled_T0));
return ir.VectorGetElement(64, tmp5, 0);
}();
const IR::U128 result = ir.VectorSetElement(64, lower_half, 1, upper_half);
ir.SetQ(Vd, result); ir.SetQ(Vd, result);
return true; return true;
} }
bool TranslatorVisitor::SHA256H(Vec Vm, Vec Vn, Vec Vd) { bool TranslatorVisitor::SHA256H(Vec Vm, Vec Vn, Vec Vd) {
const IR::U128 result = SHA256hash(ir, ir.GetQ(Vd), ir.GetQ(Vn), ir.GetQ(Vm), SHA256HashPart::Part1); const IR::U128 result = ir.SHA256Hash(ir.GetQ(Vd), ir.GetQ(Vn), ir.GetQ(Vm), true);
ir.SetQ(Vd, result); ir.SetQ(Vd, result);
return true; return true;
} }
bool TranslatorVisitor::SHA256H2(Vec Vm, Vec Vn, Vec Vd) { bool TranslatorVisitor::SHA256H2(Vec Vm, Vec Vn, Vec Vd) {
const IR::U128 result = SHA256hash(ir, ir.GetQ(Vn), ir.GetQ(Vd), ir.GetQ(Vm), SHA256HashPart::Part2); const IR::U128 result = ir.SHA256Hash(ir.GetQ(Vn), ir.GetQ(Vd), ir.GetQ(Vm), false);
ir.SetQ(Vd, result); ir.SetQ(Vd, result);
return true; return true;
} }

View file

@ -903,6 +903,18 @@ U8 IREmitter::SM4AccessSubstitutionBox(const U8& a) {
return Inst<U8>(Opcode::SM4AccessSubstitutionBox, a); return Inst<U8>(Opcode::SM4AccessSubstitutionBox, a);
} }
// Emits a single SHA256Hash IR instruction taking x, y and w as vector operands.
// part1 is forwarded to the instruction as a 1-bit immediate (Imm1).
U128 IREmitter::SHA256Hash(const U128& x, const U128& y, const U128& w, bool part1) {
return Inst<U128>(Opcode::SHA256Hash, x, y, w, Imm1(part1));
}
// Emits a single SHA256MessageSchedule0 IR instruction with operands x and y.
U128 IREmitter::SHA256MessageSchedule0(const U128& x, const U128& y) {
return Inst<U128>(Opcode::SHA256MessageSchedule0, x, y);
}
// Emits a single SHA256MessageSchedule1 IR instruction with operands x, y and z.
U128 IREmitter::SHA256MessageSchedule1(const U128& x, const U128& y, const U128& z) {
return Inst<U128>(Opcode::SHA256MessageSchedule1, x, y, z);
}
UAny IREmitter::VectorGetElement(size_t esize, const U128& a, size_t index) { UAny IREmitter::VectorGetElement(size_t esize, const U128& a, size_t index) {
ASSERT_MSG(esize * index < 128, "Invalid index"); ASSERT_MSG(esize * index < 128, "Invalid index");
switch (esize) { switch (esize) {

View file

@ -219,6 +219,10 @@ public:
U8 SM4AccessSubstitutionBox(const U8& a); U8 SM4AccessSubstitutionBox(const U8& a);
// SHA-256 helpers. Each one emits the IR opcode of the same name; for SHA256Hash
// the part1 flag becomes a 1-bit immediate operand of that opcode.
U128 SHA256Hash(const U128& x, const U128& y, const U128& w, bool part1);
U128 SHA256MessageSchedule0(const U128& x, const U128& y);
U128 SHA256MessageSchedule1(const U128& x, const U128& y, const U128& z);
UAny VectorGetElement(size_t esize, const U128& a, size_t index); UAny VectorGetElement(size_t esize, const U128& a, size_t index);
U128 VectorSetElement(size_t esize, const U128& a, size_t index, const UAny& elem); U128 VectorSetElement(size_t esize, const U128& a, size_t index, const UAny& elem);
U128 VectorAbs(size_t esize, const U128& a); U128 VectorAbs(size_t esize, const U128& a);

View file

@ -272,6 +272,11 @@ OPCODE(AESMixColumns, U128, U128
// SM4 instructions // SM4 instructions
OPCODE(SM4AccessSubstitutionBox, U8, U8 ) OPCODE(SM4AccessSubstitutionBox, U8, U8 )
// SHA instructions
OPCODE(SHA256Hash, U128, U128, U128, U128, U1 )
OPCODE(SHA256MessageSchedule0, U128, U128, U128 )
OPCODE(SHA256MessageSchedule1, U128, U128, U128, U128 )
// Vector instructions // Vector instructions
OPCODE(VectorGetElement8, U8, U128, U8 ) OPCODE(VectorGetElement8, U8, U128, U8 )
OPCODE(VectorGetElement16, U16, U128, U8 ) OPCODE(VectorGetElement16, U16, U128, U8 )

View file

@ -20,6 +20,13 @@ class Block;
namespace Dynarmic::Optimization { namespace Dynarmic::Optimization {
// Selects which IR opcodes PolyfillPass rewrites into sequences of other IR operations.
// A default-constructed value enables nothing, in which case PolyfillPass returns immediately.
struct PolyfillOptions {
// When true, SHA256Hash, SHA256MessageSchedule0 and SHA256MessageSchedule1 are polyfilled.
bool sha256 = false;
// Defaulted member-wise comparison; PolyfillPass uses it to detect the all-disabled case.
bool operator==(const PolyfillOptions&) const = default;
};
// Walks block and replaces the uses of every opcode enabled in opt with an equivalent
// sequence of other IR operations. Does nothing when opt equals PolyfillOptions{}.
void PolyfillPass(IR::Block& block, const PolyfillOptions& opt);
void A32ConstantMemoryReads(IR::Block& block, A32::UserCallbacks* cb); void A32ConstantMemoryReads(IR::Block& block, A32::UserCallbacks* cb);
void A32GetSetElimination(IR::Block& block); void A32GetSetElimination(IR::Block& block);
void A64CallbackConfigPass(IR::Block& block, const A64::UserConfig& conf); void A64CallbackConfigPass(IR::Block& block, const A64::UserConfig& conf);

View file

@ -0,0 +1,175 @@
/* This file is part of the dynarmic project.
* Copyright (c) 2022 MerryMage
* SPDX-License-Identifier: 0BSD
*/
#include "dynarmic/ir/basic_block.h"
#include "dynarmic/ir/ir_emitter.h"
#include "dynarmic/ir/microinstruction.h"
#include "dynarmic/ir/opcodes.h"
#include "dynarmic/ir/opt/passes.h"
namespace Dynarmic::Optimization {
namespace {
// Expands a SHA256MessageSchedule0 instruction into scalar/vector IR operations.
//
// The instruction's two vector operands are read from `inst`; the replacement value is
// emitted through `ir` (whose insertion point the caller has already set) and then
// substituted for every use of `inst`.
void PolyfillSHA256MessageSchedule0(IR::IREmitter& ir, IR::Inst& inst) {
    const auto operand_x = static_cast<IR::U128>(inst.GetArg(0));
    const auto operand_y = static_cast<IR::U128>(inst.GetArg(1));

    // The four words starting at bit 32 of the y:x concatenation.
    const IR::U128 window = ir.VectorExtract(operand_x, operand_y, 32);

    IR::U128 result = ir.ZeroVector();
    for (size_t word_index = 0; word_index < 4; ++word_index) {
        // SHA-256 small sigma0 of one word: ROR(7) ^ ROR(18) ^ LSR(3).
        const IR::U32 word = ir.VectorGetElement(32, window, word_index);
        const IR::U32 ror7 = ir.RotateRight(word, ir.Imm8(7));
        const IR::U32 ror18 = ir.RotateRight(word, ir.Imm8(18));
        const IR::U32 shr3 = ir.LogicalShiftRight(word, ir.Imm8(3));
        const IR::U32 sigma = ir.Eor(ror7, ir.Eor(ror18, shr3));

        result = ir.VectorSetElement(32, result, word_index, sigma);
    }

    // Lane-wise 32-bit add of the first operand.
    result = ir.VectorAdd(32, result, operand_x);

    inst.ReplaceUsesWith(result);
}
// Expands a SHA256MessageSchedule1 instruction into vector IR operations.
//
// The lower 64 bits of the result are produced first; the upper 64 bits are then derived
// from that freshly computed lower half, so the two stages must stay in this order.
void PolyfillSHA256MessageSchedule1(IR::IREmitter& ir, IR::Inst& inst) {
    const auto operand_x = static_cast<IR::U128>(inst.GetArg(0));
    const auto operand_y = static_cast<IR::U128>(inst.GetArg(1));
    const auto operand_z = static_cast<IR::U128>(inst.GetArg(2));

    const IR::U128 T0 = ir.VectorExtract(operand_y, operand_z, 32);

    // SHA-256 small sigma1 applied to every 32-bit lane: ROR(17) ^ ROR(19) ^ LSR(10).
    const auto sigma1_lanes = [&ir](const IR::U128& lanes) -> IR::U128 {
        const IR::U128 ror17 = ir.VectorRotateRight(32, lanes, 17);
        const IR::U128 ror19 = ir.VectorRotateRight(32, lanes, 19);
        const IR::U128 shr10 = ir.VectorLogicalShiftRight(32, lanes, 10);
        return ir.VectorEor(ror17, ir.VectorEor(ror19, shr10));
    };

    // Stage 1: lower half, computed from the swapped 64-bit halves of z.
    const IR::U128 swapped_z = ir.VectorShuffleWords(operand_z, 0b01001110);
    const IR::U128 low_sigma = sigma1_lanes(swapped_z);
    const IR::U128 low_sum = ir.VectorAdd(32, low_sigma, ir.VectorAdd(32, operand_x, T0));
    const IR::U128 lower_half = ir.VectorZeroUpper(low_sum);

    // Stage 2: upper half, computed from the lower half produced above.
    const IR::U128 high_sigma = sigma1_lanes(lower_half);
    // Shuffle the top two 32-bit elements downwards [3, 2, 1, 0] -> [1, 0, 3, 2]
    const IR::U128 swapped_x = ir.VectorShuffleWords(operand_x, 0b01001110);
    const IR::U128 swapped_T0 = ir.VectorShuffleWords(T0, 0b01001110);
    const IR::U128 high_sum =
        ir.VectorAdd(32, high_sigma, ir.VectorAdd(32, swapped_x, swapped_T0));
    const IR::U64 upper_half = ir.VectorGetElement(64, high_sum, 0);

    const IR::U128 result = ir.VectorSetElement(64, lower_half, 1, upper_half);
    inst.ReplaceUsesWith(result);
}
// Emits ((y ^ z) & x) ^ z, i.e. for each bit: y where x is set, z where x is clear.
IR::U32 SHAchoose(IR::IREmitter& ir, IR::U32 x, IR::U32 y, IR::U32 z) {
    const auto differing_bits = ir.Eor(y, z);
    const auto picked_bits = ir.And(differing_bits, x);
    return ir.Eor(picked_bits, z);
}
// Emits (x & y) | ((x | y) & z): each result bit is set when at least two inputs have it set.
IR::U32 SHAmajority(IR::IREmitter& ir, IR::U32 x, IR::U32 y, IR::U32 z) {
    const auto both_xy = ir.And(x, y);
    const auto either_xy = ir.Or(x, y);
    const auto either_and_z = ir.And(either_xy, z);
    return ir.Or(both_xy, either_and_z);
}
// Emits ROR(x, 2) ^ ROR(x, 13) ^ ROR(x, 22).
IR::U32 SHAhashSIGMA0(IR::IREmitter& ir, IR::U32 x) {
    const IR::U32 ror2 = ir.RotateRight(x, ir.Imm8(2));
    const IR::U32 ror13 = ir.RotateRight(x, ir.Imm8(13));
    const IR::U32 ror22 = ir.RotateRight(x, ir.Imm8(22));

    const auto ror13_xor_ror22 = ir.Eor(ror13, ror22);
    return ir.Eor(ror2, ror13_xor_ror22);
}
// Emits ROR(x, 6) ^ ROR(x, 11) ^ ROR(x, 25).
IR::U32 SHAhashSIGMA1(IR::IREmitter& ir, IR::U32 x) {
    const IR::U32 ror6 = ir.RotateRight(x, ir.Imm8(6));
    const IR::U32 ror11 = ir.RotateRight(x, ir.Imm8(11));
    const IR::U32 ror25 = ir.RotateRight(x, ir.Imm8(25));

    const auto ror11_xor_ror25 = ir.Eor(ror11, ror25);
    return ir.Eor(ror6, ror11_xor_ror25);
}
// Expands a SHA256Hash instruction into four unrolled rounds of scalar IR operations.
//
// Arguments 0..2 of `inst` are the two state vectors and the message-word vector; argument 3
// is a 1-bit immediate choosing which of the two updated state vectors replaces `inst`.
void PolyfillSHA256Hash(IR::IREmitter& ir, IR::Inst& inst) {
    IR::U128 state_x = static_cast<IR::U128>(inst.GetArg(0));
    IR::U128 state_y = static_cast<IR::U128>(inst.GetArg(1));
    const auto schedule_words = static_cast<IR::U128>(inst.GetArg(2));
    const bool part1 = inst.GetArg(3).GetU1();

    for (size_t round = 0; round < 4; ++round) {
        // Unpack both state vectors into their four 32-bit words.
        const IR::U32 x0 = ir.VectorGetElement(32, state_x, 0);
        const IR::U32 x1 = ir.VectorGetElement(32, state_x, 1);
        const IR::U32 x2 = ir.VectorGetElement(32, state_x, 2);
        const IR::U32 x3 = ir.VectorGetElement(32, state_x, 3);

        const IR::U32 y0 = ir.VectorGetElement(32, state_y, 0);
        const IR::U32 y1 = ir.VectorGetElement(32, state_y, 1);
        const IR::U32 y2 = ir.VectorGetElement(32, state_y, 2);
        const IR::U32 y3 = ir.VectorGetElement(32, state_y, 3);

        const IR::U32 choice = SHAchoose(ir, y0, y1, y2);
        const IR::U32 majority = SHAmajority(ir, x0, x1, x2);

        // t = y3 + SIGMA1(y0) + choice + W[round]
        const IR::U32 round_word = ir.VectorGetElement(32, schedule_words, round);
        const IR::U32 sigma1 = SHAhashSIGMA1(ir, y0);
        const IR::U32 t = ir.Add(y3, ir.Add(sigma1, ir.Add(choice, round_word)));

        const IR::U32 next_x0 = ir.Add(t, ir.Add(SHAhashSIGMA0(ir, x0), majority));
        const IR::U32 next_y0 = ir.Add(t, x3);

        // Shuffle all words left by 1 element: [3, 2, 1, 0] -> [2, 1, 0, 3]
        const IR::U128 rotated_x = ir.VectorShuffleWords(state_x, 0b10010011);
        const IR::U128 rotated_y = ir.VectorShuffleWords(state_y, 0b10010011);

        // The wrapped-around word in lane 0 is overwritten with the new round output.
        state_x = ir.VectorSetElement(32, rotated_x, 0, next_x0);
        state_y = ir.VectorSetElement(32, rotated_y, 0, next_y0);
    }

    inst.ReplaceUsesWith(part1 ? state_x : state_y);
}
} // namespace
// Replaces the uses of each enabled opcode in `block` with an equivalent sequence of other IR
// operations, emitted immediately before the instruction being replaced.
//
// Returns without touching the block when no polyfill is enabled.
void PolyfillPass(IR::Block& block, const PolyfillOptions& polyfill) {
    const PolyfillOptions nothing_enabled{};
    if (polyfill == nothing_enabled) {
        return;
    }

    IR::IREmitter ir{block};

    for (auto& inst : block) {
        // New instructions are inserted in front of the one currently being visited.
        ir.SetInsertionPoint(&inst);

        const IR::Opcode opcode = inst.GetOpcode();
        if (opcode == IR::Opcode::SHA256MessageSchedule0) {
            if (polyfill.sha256) {
                PolyfillSHA256MessageSchedule0(ir, inst);
            }
        } else if (opcode == IR::Opcode::SHA256MessageSchedule1) {
            if (polyfill.sha256) {
                PolyfillSHA256MessageSchedule1(ir, inst);
            }
        } else if (opcode == IR::Opcode::SHA256Hash) {
            if (polyfill.sha256) {
                PolyfillSHA256Hash(ir, inst);
            }
        }
        // Every other opcode is left untouched.
    }
}
} // namespace Dynarmic::Optimization

View file

@ -535,6 +535,37 @@ TEST_CASE("A32: Single random thumb instruction", "[thumb]") {
} }
} }
// Differential test: runs a NOP followed by one randomly generated Thumb instruction on both
// the Dynarmic JIT and Unicorn, starting from address 100 rather than 0, via RunTestInstance.
TEST_CASE("A32: Single random thumb instruction (offset)", "[thumb]") {
ThumbTestEnv jit_env{};
ThumbTestEnv uni_env{};
Dynarmic::A32::Jit jit{GetUserConfig(jit_env)};
A32Unicorn<ThumbTestEnv> uni{uni_env};
A32Unicorn<ThumbTestEnv>::RegisterArray regs;
A32Unicorn<ThumbTestEnv>::ExtRegArray ext_reg;
std::vector<u16> instructions;
for (size_t iteration = 0; iteration < 100000; ++iteration) {
// Fresh random register state for every iteration.
std::generate(regs.begin(), regs.end(), [] { return RandInt<u32>(0, ~u32(0)); });
std::generate(ext_reg.begin(), ext_reg.end(), [] { return RandInt<u32>(0, ~u32(0)); });
// Program = leading NOP + the random instruction (which may occupy more than one halfword).
instructions.clear();
instructions.push_back(0xbf00); // NOP
const std::vector<u16> inst = GenRandomThumbInst(0, true);
instructions.insert(instructions.end(), inst.begin(), inst.end());
const u32 start_address = 100;
// Random top nibble OR'd with the fixed low bits 0x1F0.
// NOTE(review): presumably random NZCV flags plus Thumb state/mode bits; confirm.
const u32 cpsr = (RandInt<u32>(0, 0xF) << 28) | 0x1F0;
const u32 fpcr = RandomFpcr();
INFO("Instruction: 0x" << std::hex << inst[0]);
// R15 is overwritten after randomisation so execution begins at start_address.
regs[15] = start_address;
// NOTE(review): the trailing 2 is presumably the number of instructions/ticks to run
// (NOP + random instruction); confirm against RunTestInstance.
RunTestInstance(jit, uni, jit_env, uni_env, regs, ext_reg, instructions, cpsr, fpcr, 2);
}
}
TEST_CASE("A32: Small random thumb block", "[thumb]") { TEST_CASE("A32: Small random thumb block", "[thumb]") {
ThumbTestEnv jit_env{}; ThumbTestEnv jit_env{};
ThumbTestEnv uni_env{}; ThumbTestEnv uni_env{};

View file

@ -38,7 +38,7 @@ if (MSVC)
/MP /MP
/Zf /Zf
/Zi /Zi
/Zm200 /Zm300
/Zo /Zo
/permissive- /permissive-
/EHsc /EHsc

View file

@ -322,7 +322,7 @@ struct Memory::Impl {
} }
if (Settings::IsFastmemEnabled()) { if (Settings::IsFastmemEnabled()) {
const bool is_read_enable = Settings::IsGPULevelHigh() || !cached; const bool is_read_enable = !Settings::IsGPULevelExtreme() || !cached;
system.DeviceMemory().buffer.Protect(vaddr, size, is_read_enable, !cached); system.DeviceMemory().buffer.Protect(vaddr, size, is_read_enable, !cached);
} }

View file

@ -1495,15 +1495,13 @@ typename BufferCache<P>::OverlapResult BufferCache<P>::ResolveOverlaps(VAddr cpu
overlap_ids.push_back(overlap_id); overlap_ids.push_back(overlap_id);
overlap.Pick(); overlap.Pick();
const VAddr overlap_cpu_addr = overlap.CpuAddr(); const VAddr overlap_cpu_addr = overlap.CpuAddr();
bool goes_left = false; const bool expands_left = overlap_cpu_addr < begin;
if (overlap_cpu_addr < begin) { if (expands_left) {
goes_left = true;
cpu_addr = begin = overlap_cpu_addr; cpu_addr = begin = overlap_cpu_addr;
} }
const VAddr overlap_end = overlap_cpu_addr + overlap.SizeBytes(); const VAddr overlap_end = overlap_cpu_addr + overlap.SizeBytes();
bool goes_right = false; const bool expands_right = overlap_end > end;
if (overlap_end > end) { if (overlap_end > end) {
goes_right = true;
end = overlap_end; end = overlap_end;
} }
stream_score += overlap.StreamScore(); stream_score += overlap.StreamScore();
@ -1511,11 +1509,11 @@ typename BufferCache<P>::OverlapResult BufferCache<P>::ResolveOverlaps(VAddr cpu
// When this memory region has been joined a bunch of times, we assume it's being used // When this memory region has been joined a bunch of times, we assume it's being used
// as a stream buffer. Increase the size to skip constantly recreating buffers. // as a stream buffer. Increase the size to skip constantly recreating buffers.
has_stream_leap = true; has_stream_leap = true;
if (goes_right) { if (expands_right) {
begin -= PAGE_SIZE * 256; begin -= PAGE_SIZE * 256;
cpu_addr = begin; cpu_addr = begin;
} }
if (goes_left) { if (expands_left) {
end += PAGE_SIZE * 256; end += PAGE_SIZE * 256;
} }
} }

View file

@ -2,8 +2,11 @@
// Licensed under GPLv2 or any later version // Licensed under GPLv2 or any later version
// Refer to the license.txt file included. // Refer to the license.txt file included.
#include <algorithm>
#include <cstring> #include <cstring>
#include <optional> #include <optional>
#include "common/alignment.h"
#include "common/assert.h" #include "common/assert.h"
#include "common/settings.h" #include "common/settings.h"
#include "core/core.h" #include "core/core.h"
@ -27,6 +30,7 @@ Maxwell3D::Maxwell3D(Core::System& system_, MemoryManager& memory_manager_)
upload_state{memory_manager, regs.upload} { upload_state{memory_manager, regs.upload} {
dirty.flags.flip(); dirty.flags.flip();
InitializeRegisterDefaults(); InitializeRegisterDefaults();
accelerated_reads = Settings::IsFastmemEnabled();
} }
Maxwell3D::~Maxwell3D() = default; Maxwell3D::~Maxwell3D() = default;
@ -210,28 +214,14 @@ void Maxwell3D::ProcessMethodCall(u32 method, u32 argument, u32 nonshadow_argume
return ProcessCBBind(4); return ProcessCBBind(4);
case MAXWELL3D_REG_INDEX(draw.vertex_end_gl): case MAXWELL3D_REG_INDEX(draw.vertex_end_gl):
return DrawArrays(); return DrawArrays();
case MAXWELL3D_REG_INDEX(small_index): { case MAXWELL3D_REG_INDEX(small_index):
regs.index_array.count = regs.small_index.count; regs.index_array.count = regs.small_index.count;
regs.index_array.first = regs.small_index.first; regs.index_array.first = regs.small_index.first;
dirty.flags[VideoCommon::Dirty::IndexBuffer] = true; dirty.flags[VideoCommon::Dirty::IndexBuffer] = true;
bool is_extreme = Settings::IsGPULevelExtreme(); if (!Settings::IsGPULevelExtreme()) {
RecalculateVertexArrayLimit();
if (!is_extreme) {
for (size_t i = 0; i < Regs::NumVertexArrays; i++) {
if (!dirty.flags[VideoCommon::Dirty::VertexBuffer0 + i]) {
continue;
}
const u32 stride = regs.vertex_array[i].stride;
const u32 num_vertices = regs.index_array.first + regs.index_array.count;
const GPUVAddr gpu_addr_begin =
regs.vertex_array[i].StartAddress() + regs.index_array.first * stride;
const GPUVAddr gpu_addr_end = gpu_addr_begin + num_vertices * stride + 1;
regs.vertex_array_limit[i].SetAddress(gpu_addr_end);
}
} }
DrawArrays(); return DrawArrays();
return;
}
case MAXWELL3D_REG_INDEX(topology_override): case MAXWELL3D_REG_INDEX(topology_override):
use_topology_override = true; use_topology_override = true;
return; return;
@ -685,4 +675,71 @@ void Maxwell3D::ProcessClearBuffers() {
rasterizer->Clear(); rasterizer->Clear();
} }
void Maxwell3D::RecalculateVertexArrayLimit() {
GPUVAddr start_address = regs.index_array.StartAddress();
auto& vn_state = vertex_num_approx_state;
if (start_address != vn_state.last_index_array_start ||
vn_state.current_min_index != regs.index_array.first) {
vn_state.last_index_array_start = start_address;
vn_state.current_max_index = regs.index_array.first;
vn_state.current_min_index = regs.index_array.first;
vn_state.current_num_vertices = 0;
}
const u32 index_count = regs.index_array.first + regs.index_array.count;
if (index_count <= vn_state.current_max_index) {
return;
}
const u32 max_base = std::max(regs.index_array.first, vn_state.current_max_index);
const u32 num_indices = index_count - max_base;
const size_t size_index = regs.index_array.FormatSizeInBytes();
const size_t expected_size = num_indices * size_index;
const size_t offset = max_base * size_index;
auto maybe_ptr = memory_manager.GpuToHostPointer(start_address + offset);
u8* ptr;
if (accelerated_reads && maybe_ptr) {
ptr = *maybe_ptr;
} else {
vn_state.index_buffer_cache.resize(Common::DivideUp(expected_size, sizeof(u32)));
ptr = reinterpret_cast<u8*>(vn_state.index_buffer_cache.data());
memory_manager.ReadBlockUnsafe(start_address + offset, ptr, expected_size);
}
vn_state.current_max_index = index_count;
u32 new_num_vertices{};
switch (regs.index_array.format) {
case Regs::IndexFormat::UnsignedByte: {
std::span<const u8> span{ptr, num_indices};
const auto max = std::max_element(span.begin(), span.end());
new_num_vertices = *max + 1;
break;
}
case Regs::IndexFormat::UnsignedShort: {
std::span<const u16> span{reinterpret_cast<const u16*>(ptr), num_indices};
const auto max = std::max_element(span.begin(), span.end());
new_num_vertices = *max + 1;
break;
}
case Regs::IndexFormat::UnsignedInt: {
std::span<const u32> span{reinterpret_cast<const u32*>(ptr), num_indices};
const auto max = std::max_element(span.begin(), span.end());
new_num_vertices = *max + 1;
break;
}
}
if (new_num_vertices > vn_state.current_num_vertices) {
vn_state.current_num_vertices = new_num_vertices;
for (size_t i = 0; i < Regs::NumVertexArrays; i++) {
if (!regs.vertex_array[i].enable) {
continue;
}
const u32 stride = regs.vertex_array[i].stride;
const GPUVAddr gpu_addr_begin = regs.vertex_array[i].StartAddress();
const GPUVAddr gpu_addr_end = gpu_addr_begin + new_num_vertices * stride - 1;
regs.vertex_array_limit[i].SetAddress(gpu_addr_end);
dirty.flags[VideoCommon::Dirty::VertexBuffer0 + i] = true;
}
}
}
} // namespace Tegra::Engines } // namespace Tegra::Engines

View file

@ -1498,6 +1498,16 @@ public:
Tables tables{}; Tables tables{};
} dirty; } dirty;
/// Incremental scan state used by RecalculateVertexArrayLimit().
///
/// Every scalar member is value-initialised: RecalculateVertexArrayLimit() compares
/// last_index_array_start and current_min_index against the live registers before it ever
/// writes them, so leaving them uninitialised would make the first call read indeterminate
/// values.
struct VertexNumApproxState {
    /// Start address of the index buffer the state below was gathered from.
    GPUVAddr last_index_array_start{};
    /// One past the highest index position scanned so far.
    u32 current_max_index{};
    /// First index of the draw the current scan started from.
    u32 current_min_index{};
    /// Largest (index value + 1) seen since the last reset.
    u32 current_num_vertices{};
    /// Scratch storage the indices are copied into when they cannot be read in place.
    std::vector<u32> index_buffer_cache;
} vertex_num_approx_state;
/// Assigned from Settings::IsFastmemEnabled() in the constructor. When true,
/// RecalculateVertexArrayLimit() reads index data through a host pointer if one is available
/// instead of copying it with ReadBlockUnsafe().
bool accelerated_reads{};
private: private:
void InitializeRegisterDefaults(); void InitializeRegisterDefaults();
@ -1566,6 +1576,8 @@ private:
// Handles a instance drawcall from MME // Handles a instance drawcall from MME
void StepInstance(MMEDrawMode expected_mode, u32 count); void StepInstance(MMEDrawMode expected_mode, u32 count);
void RecalculateVertexArrayLimit();
/// Returns a query's value or an empty object if the value will be deferred through a cache. /// Returns a query's value or an empty object if the value will be deferred through a cache.
std::optional<u64> GetQueryResult(); std::optional<u64> GetQueryResult();

View file

@ -6,6 +6,7 @@
#include "common/alignment.h" #include "common/alignment.h"
#include "common/assert.h" #include "common/assert.h"
#include "common/host_memory.h"
#include "common/logging/log.h" #include "common/logging/log.h"
#include "core/core.h" #include "core/core.h"
#include "core/hle/kernel/k_page_table.h" #include "core/hle/kernel/k_page_table.h"
@ -186,6 +187,19 @@ std::optional<VAddr> MemoryManager::GpuToCpuAddress(GPUVAddr gpu_addr) const {
return page_entry.ToAddress() + (gpu_addr & page_mask); return page_entry.ToAddress() + (gpu_addr & page_mask);
} }
/// Translates a GPU virtual address into a host pointer.
///
/// Returns std::nullopt when the GPU address has no CPU mapping or when the device memory
/// buffer reports a null virtual base pointer; otherwise returns that base pointer offset by
/// the translated CPU address.
/// NOTE(review): only gpu_addr itself is translated, so this says nothing about whether the
/// bytes that follow it are contiguous in host memory - callers reading a range should confirm.
std::optional<u8*> MemoryManager::GpuToHostPointer(GPUVAddr gpu_addr) const {
    if (const auto cpu_addr = GpuToCpuAddress(gpu_addr)) {
        const auto host_base = system.DeviceMemory().buffer.VirtualBasePointer();
        if (host_base) {
            return host_base + *cpu_addr;
        }
    }
    return std::nullopt;
}
std::optional<VAddr> MemoryManager::GpuToCpuAddress(GPUVAddr addr, std::size_t size) const { std::optional<VAddr> MemoryManager::GpuToCpuAddress(GPUVAddr addr, std::size_t size) const {
size_t page_index{addr >> page_bits}; size_t page_index{addr >> page_bits};
const size_t page_last{(addr + size + page_size - 1) >> page_bits}; const size_t page_last{(addr + size + page_size - 1) >> page_bits};

View file

@ -76,6 +76,8 @@ public:
[[nodiscard]] std::optional<VAddr> GpuToCpuAddress(GPUVAddr addr) const; [[nodiscard]] std::optional<VAddr> GpuToCpuAddress(GPUVAddr addr) const;
[[nodiscard]] std::optional<u8*> GpuToHostPointer(GPUVAddr addr) const;
[[nodiscard]] std::optional<VAddr> GpuToCpuAddress(GPUVAddr addr, std::size_t size) const; [[nodiscard]] std::optional<VAddr> GpuToCpuAddress(GPUVAddr addr, std::size_t size) const;
template <typename T> template <typename T>