early-access version 2585
This commit is contained in:
parent
dd6ea95c54
commit
9c48e94f2d
31 changed files with 1072 additions and 483 deletions
|
@ -1,7 +1,7 @@
|
|||
yuzu emulator early access
|
||||
=============
|
||||
|
||||
This is the source code for early-access 2576.
|
||||
This is the source code for early-access 2585.
|
||||
|
||||
## Legal Notice
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
name: Build and Test
|
||||
|
||||
on: [push, pull_request]
|
||||
on: [ push, pull_request ]
|
||||
|
||||
env:
|
||||
BUILD_TYPE: Release
|
||||
|
@ -9,8 +9,8 @@ jobs:
|
|||
build:
|
||||
strategy:
|
||||
matrix:
|
||||
os: [windows-latest, ubuntu-latest, macos-latest]
|
||||
cpu_detection: [0, 1]
|
||||
os: [ windows-latest, ubuntu-latest, macos-latest ]
|
||||
cpu_detection: [ 0, 1 ]
|
||||
fail-fast: false
|
||||
|
||||
runs-on: ${{matrix.os}}
|
||||
|
@ -49,7 +49,24 @@ jobs:
|
|||
run: UNICORN_ARCHS=aarch64,arm ./make.sh
|
||||
|
||||
- name: Configure CMake
|
||||
if: ${{matrix.os == 'ubuntu-latest' || matrix.os == 'macos-latest'}}
|
||||
if: ${{matrix.os == 'ubuntu-latest'}}
|
||||
env:
|
||||
CC: gcc-10
|
||||
CXX: g++-10
|
||||
run: >
|
||||
cmake
|
||||
-B ${{github.workspace}}/build
|
||||
-DBoost_INCLUDE_DIRS=${{github.workspace}}/externals/ext-boost
|
||||
-DCMAKE_BUILD_TYPE=${{env.BUILD_TYPE}}
|
||||
-DDYNARMIC_ENABLE_CPU_FEATURE_DETECTION=${{matrix.cpu_detection}}
|
||||
-DDYNARMIC_TESTS_USE_UNICORN=1
|
||||
-DDYNARMIC_USE_LLVM=1
|
||||
-DLIBUNICORN_INCLUDE_DIR=${{github.workspace}}/externals/unicorn/include
|
||||
-DLIBUNICORN_LIBRARY=${{github.workspace}}/externals/unicorn/libunicorn.a
|
||||
-G Ninja
|
||||
|
||||
- name: Configure CMake
|
||||
if: ${{matrix.os == 'macos-latest'}}
|
||||
run: >
|
||||
cmake
|
||||
-B ${{github.workspace}}/build
|
||||
|
|
1
externals/dynarmic/CMakeLists.txt
vendored
1
externals/dynarmic/CMakeLists.txt
vendored
|
@ -134,7 +134,6 @@ endif()
|
|||
if (DYNARMIC_NO_BUNDLED_VIXL AND ARCHITECTURE STREQUAL "arm64")
|
||||
find_package(PkgConfig REQUIRED)
|
||||
pkg_check_modules(vixl REQUIRED IMPORTED_TARGET vixl)
|
||||
target_include_directories(PkgConfig::vixl INTERFACE "${vixl_INCLUDE_DIRS}/vixl")
|
||||
add_library(vixl ALIAS PkgConfig::vixl)
|
||||
endif()
|
||||
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -92,6 +92,7 @@ add_library(dynarmic
|
|||
ir/opt/identity_removal_pass.cpp
|
||||
ir/opt/ir_matcher.h
|
||||
ir/opt/passes.h
|
||||
ir/opt/polyfill_pass.cpp
|
||||
ir/opt/verification_pass.cpp
|
||||
ir/terminal.h
|
||||
ir/type.cpp
|
||||
|
@ -286,6 +287,7 @@ if (ARCHITECTURE STREQUAL "x86_64")
|
|||
backend/x64/emit_x64_packed.cpp
|
||||
backend/x64/emit_x64_saturation.cpp
|
||||
backend/x64/emit_x64_sm4.cpp
|
||||
backend/x64/emit_x64_sha.cpp
|
||||
backend/x64/emit_x64_vector.cpp
|
||||
backend/x64/emit_x64_vector_floating_point.cpp
|
||||
backend/x64/emit_x64_vector_saturation.cpp
|
||||
|
|
|
@ -50,16 +50,24 @@ static std::function<void(BlockOfCode&)> GenRCP(const A32::UserConfig& conf) {
|
|||
};
|
||||
}
|
||||
|
||||
// Derives the polyfill configuration from the host features reported by `code`.
// SHA-256 IR instructions are polyfilled exactly when the host lacks
// HostFeature::SHA.
static Optimization::PolyfillOptions GenPolyfillOptions(const BlockOfCode& code) {
    Optimization::PolyfillOptions options{};
    options.sha256 = !code.HasHostFeature(HostFeature::SHA);
    return options;
}
|
||||
|
||||
struct Jit::Impl {
|
||||
Impl(Jit* jit, A32::UserConfig conf)
|
||||
: block_of_code(GenRunCodeCallbacks(conf.callbacks, &GetCurrentBlockThunk, this), JitStateInfo{jit_state}, conf.code_cache_size, conf.far_code_offset, GenRCP(conf))
|
||||
, emitter(block_of_code, conf, jit)
|
||||
, polyfill_options(GenPolyfillOptions(block_of_code))
|
||||
, conf(std::move(conf))
|
||||
, jit_interface(jit) {}
|
||||
|
||||
A32JitState jit_state;
|
||||
BlockOfCode block_of_code;
|
||||
A32EmitX64 emitter;
|
||||
Optimization::PolyfillOptions polyfill_options;
|
||||
|
||||
const A32::UserConfig conf;
|
||||
|
||||
|
@ -154,6 +162,7 @@ private:
|
|||
}
|
||||
|
||||
IR::Block ir_block = A32::Translate(A32::LocationDescriptor{descriptor}, conf.callbacks, {conf.arch_version, conf.define_unpredictable_behaviour, conf.hook_hint_instructions});
|
||||
Optimization::PolyfillPass(ir_block, polyfill_options);
|
||||
if (conf.HasOptimization(OptimizationFlag::GetSetElimination)) {
|
||||
Optimization::A32GetSetElimination(ir_block);
|
||||
Optimization::DeadCodeElimination(ir_block);
|
||||
|
|
|
@ -45,12 +45,19 @@ static std::function<void(BlockOfCode&)> GenRCP(const A64::UserConfig& conf) {
|
|||
};
|
||||
}
|
||||
|
||||
// Derives the polyfill configuration from the host features reported by `code`.
// SHA-256 IR instructions are polyfilled exactly when the host lacks
// HostFeature::SHA.
static Optimization::PolyfillOptions GenPolyfillOptions(const BlockOfCode& code) {
    Optimization::PolyfillOptions options{};
    options.sha256 = !code.HasHostFeature(HostFeature::SHA);
    return options;
}
|
||||
|
||||
struct Jit::Impl final {
|
||||
public:
|
||||
Impl(Jit* jit, UserConfig conf)
|
||||
: conf(conf)
|
||||
, block_of_code(GenRunCodeCallbacks(conf.callbacks, &GetCurrentBlockThunk, this), JitStateInfo{jit_state}, conf.code_cache_size, conf.far_code_offset, GenRCP(conf))
|
||||
, emitter(block_of_code, conf, jit) {
|
||||
, emitter(block_of_code, conf, jit)
|
||||
, polyfill_options(GenPolyfillOptions(block_of_code)) {
|
||||
ASSERT(conf.page_table_address_space_bits >= 12 && conf.page_table_address_space_bits <= 64);
|
||||
}
|
||||
|
||||
|
@ -253,6 +260,7 @@ private:
|
|||
const auto get_code = [this](u64 vaddr) { return conf.callbacks->MemoryReadCode(vaddr); };
|
||||
IR::Block ir_block = A64::Translate(A64::LocationDescriptor{current_location}, get_code,
|
||||
{conf.define_unpredictable_behaviour, conf.wall_clock_cntpct});
|
||||
Optimization::PolyfillPass(ir_block, polyfill_options);
|
||||
Optimization::A64CallbackConfigPass(ir_block, conf);
|
||||
if (conf.HasOptimization(OptimizationFlag::GetSetElimination)) {
|
||||
Optimization::A64GetSetElimination(ir_block);
|
||||
|
@ -301,6 +309,7 @@ private:
|
|||
A64JitState jit_state;
|
||||
BlockOfCode block_of_code;
|
||||
A64EmitX64 emitter;
|
||||
Optimization::PolyfillOptions polyfill_options;
|
||||
|
||||
bool invalidate_entire_cache = false;
|
||||
boost::icl::interval_set<u64> invalid_cache_ranges;
|
||||
|
|
|
@ -114,6 +114,8 @@ HostFeature GetHostFeatures() {
|
|||
features |= HostFeature::FMA;
|
||||
if (cpu_info.has(Cpu::tAESNI))
|
||||
features |= HostFeature::AES;
|
||||
if (cpu_info.has(Cpu::tSHA))
|
||||
features |= HostFeature::SHA;
|
||||
if (cpu_info.has(Cpu::tPOPCNT))
|
||||
features |= HostFeature::POPCNT;
|
||||
if (cpu_info.has(Cpu::tBMI1))
|
||||
|
|
81
externals/dynarmic/src/dynarmic/backend/x64/emit_x64_sha.cpp
vendored
Executable file
81
externals/dynarmic/src/dynarmic/backend/x64/emit_x64_sha.cpp
vendored
Executable file
|
@ -0,0 +1,81 @@
|
|||
/* This file is part of the dynarmic project.
|
||||
* Copyright (c) 2022 MerryMage
|
||||
* SPDX-License-Identifier: 0BSD
|
||||
*/
|
||||
|
||||
#include "dynarmic/backend/x64/block_of_code.h"
|
||||
#include "dynarmic/backend/x64/emit_x64.h"
|
||||
|
||||
namespace Dynarmic::Backend::X64 {
|
||||
|
||||
using namespace Xbyak::util;
|
||||
|
||||
// Emits host code for the IR SHA256Hash instruction using sha256rnds2.
//
// args[0] and args[1] are the two 128-bit state vectors, args[2] holds the four
// 32-bit round inputs, and args[3] is an immediate flag choosing which of the
// two final shuffles produces the defined value. Asserts that the host reports
// HostFeature::SHA.
void EmitX64::EmitSHA256Hash(EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);

    const bool part1 = args[3].GetImmediateU1();

    ASSERT(code.HasHostFeature(HostFeature::SHA));

    // Incoming lane layout (lane 3 on the left):
    //            3   2   1   0
    // abcd   =   d   c   b   a
    // efgh   =   h   g   f   e
    // rounds =  wk3 wk2 wk1 wk0

    const Xbyak::Xmm abcd = ctx.reg_alloc.UseScratchXmm(args[0]);
    const Xbyak::Xmm efgh = ctx.reg_alloc.UseScratchXmm(args[1]);
    const Xbyak::Xmm rounds = ctx.reg_alloc.UseXmm(args[2]);

    // Layout required by the x64 instruction:
    //          3   2   1   0
    // src1 =   c   d   g   h
    // src2 =   a   b   e   f
    // xmm0 =   -   -  wk1 wk0

    // Rearrange the state into src1 (ends up in `abcd`) and src2 (ends up in `efgh`).
    code.movaps(xmm0, efgh);
    code.shufps(xmm0, abcd, 0b10111011);  // src1
    code.shufps(efgh, abcd, 0b00010001);  // src2
    code.movaps(abcd, xmm0);

    // First pair of rounds: xmm0 carries wk1/wk0 in its low lanes.
    code.movaps(xmm0, rounds);
    code.sha256rnds2(abcd, efgh);

    // Second pair of rounds: move wk3/wk2 down into the low lanes of xmm0.
    code.punpckhqdq(xmm0, xmm0);
    code.sha256rnds2(efgh, abcd);

    // Gather the lanes of the requested half back into a single vector.
    if (part1) {
        code.shufps(efgh, abcd, 0b10111011);
    } else {
        code.shufps(efgh, abcd, 0b00010001);
    }

    ctx.reg_alloc.DefineValue(inst, efgh);
}
|
||||
|
||||
// Emits host code for the IR SHA256MessageSchedule0 instruction as a single
// sha256msg1. The first argument is overwritten in place and becomes the
// result; the second is only read. Asserts that the host reports
// HostFeature::SHA.
void EmitX64::EmitSHA256MessageSchedule0(EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);

    ASSERT(code.HasHostFeature(HostFeature::SHA));

    const Xbyak::Xmm accumulator = ctx.reg_alloc.UseScratchXmm(args[0]);
    const Xbyak::Xmm operand = ctx.reg_alloc.UseXmm(args[1]);

    code.sha256msg1(accumulator, operand);

    ctx.reg_alloc.DefineValue(inst, accumulator);
}
|
||||
|
||||
// Emits host code for the IR SHA256MessageSchedule1 instruction.
//
// xmm0 is used as a temporary: it receives the third argument, is combined
// with the second via palignr (byte offset 4), and is then added lane-wise to
// the first argument before sha256msg2 finishes the computation. The first
// argument's register is overwritten and becomes the result. Asserts that the
// host reports HostFeature::SHA.
void EmitX64::EmitSHA256MessageSchedule1(EmitContext& ctx, IR::Inst* inst) {
    auto args = ctx.reg_alloc.GetArgumentInfo(inst);

    ASSERT(code.HasHostFeature(HostFeature::SHA));

    const Xbyak::Xmm accumulator = ctx.reg_alloc.UseScratchXmm(args[0]);
    const Xbyak::Xmm middle = ctx.reg_alloc.UseXmm(args[1]);
    const Xbyak::Xmm last = ctx.reg_alloc.UseXmm(args[2]);

    code.movaps(xmm0, last);
    code.palignr(xmm0, middle, 4);
    code.paddd(accumulator, xmm0);
    code.sha256msg2(accumulator, last);

    ctx.reg_alloc.DefineValue(inst, accumulator);
}
|
||||
|
||||
} // namespace Dynarmic::Backend::X64
|
|
@ -26,14 +26,15 @@ enum class HostFeature : u64 {
|
|||
F16C = 1ULL << 13,
|
||||
FMA = 1ULL << 14,
|
||||
AES = 1ULL << 15,
|
||||
POPCNT = 1ULL << 16,
|
||||
BMI1 = 1ULL << 17,
|
||||
BMI2 = 1ULL << 18,
|
||||
LZCNT = 1ULL << 19,
|
||||
GFNI = 1ULL << 20,
|
||||
SHA = 1ULL << 16,
|
||||
POPCNT = 1ULL << 17,
|
||||
BMI1 = 1ULL << 18,
|
||||
BMI2 = 1ULL << 19,
|
||||
LZCNT = 1ULL << 20,
|
||||
GFNI = 1ULL << 21,
|
||||
|
||||
// Zen-based BMI2
|
||||
FastBMI2 = 1ULL << 21,
|
||||
FastBMI2 = 1ULL << 22,
|
||||
|
||||
// Orthogonal AVX512 features on 128 and 256 vectors
|
||||
AVX512_Ortho = AVX512F | AVX512VL,
|
||||
|
|
|
@ -50,6 +50,9 @@ INST(asimd_VPMAX_float, "VPMAX (floating-point)", "111100110D0znnnndddd111
|
|||
INST(asimd_VPMIN_float, "VPMIN (floating-point)", "111100110D1znnnndddd1111NQM0mmmm") // ASIMD
|
||||
INST(asimd_VRECPS, "VRECPS", "111100100D0znnnndddd1111NQM1mmmm") // ASIMD
|
||||
INST(asimd_VRSQRTS, "VRSQRTS", "111100100D1znnnndddd1111NQM1mmmm") // ASIMD
|
||||
INST(v8_SHA256H, "SHA256H", "111100110D00nnnndddd1100NQM0mmmm") // v8
|
||||
INST(v8_SHA256H2, "SHA256H2", "111100110D01nnnndddd1100NQM0mmmm") // v8
|
||||
INST(v8_SHA256SU1, "SHA256SU1", "111100110D10nnnndddd1100NQM0mmmm") // v8
|
||||
|
||||
// Three registers of different lengths
|
||||
INST(asimd_VADDL, "VADDL/VADDW", "1111001U1Dzznnnndddd000oN0M0mmmm") // ASIMD
|
||||
|
@ -144,7 +147,7 @@ INST(v8_AESIMC, "AESIMC", "111100111D11zz00dddd001
|
|||
INST(arm_UDF, "UNALLOCATED", "111100111-11--01----001010-0----") // v8
|
||||
INST(arm_UDF, "UNALLOCATED (SHA1H)", "111100111-11--01----001011-0----") // v8
|
||||
INST(arm_UDF, "UNALLOCATED (SHA1SU1)", "111100111-11--10----001110-0----") // v8
|
||||
INST(arm_UDF, "UNALLOCATED (SHA256SU0)", "111100111-11--10----001111-0----") // v8
|
||||
INST(v8_SHA256SU0, "SHA256SU0", "111100111D11zz10dddd001111M0mmmm") // v8
|
||||
|
||||
// One register and modified immediate
|
||||
INST(asimd_VMOV_imm, "VBIC, VMOV, VMVN, VORR (immediate)", "1111001a1D000bcdVVVVmmmm0Qo1efgh") // ASIMD
|
||||
|
|
|
@ -19,6 +19,8 @@ INST(thumb32_LDRD_lit_1, "LDRD (lit)", "11101000U1111111ttttss
|
|||
INST(thumb32_LDRD_lit_2, "LDRD (lit)", "11101001U1W11111ttttssssiiiiiiii")
|
||||
INST(thumb32_LDRD_imm_1, "LDRD (imm)", "11101000U111nnnnttttssssiiiiiiii")
|
||||
INST(thumb32_LDRD_imm_2, "LDRD (imm)", "11101001U1W1nnnnttttssssiiiiiiii")
|
||||
INST(thumb32_STL, "STL", "111010001100nnnntttt111110101111") // v8
|
||||
INST(thumb32_LDA, "LDA", "111010001101nnnntttt111110101111") // v8
|
||||
INST(thumb32_STREXB, "STREXB", "111010001100nnnntttt11110100dddd")
|
||||
INST(thumb32_STREXH, "STREXH", "111010001100nnnntttt11110101dddd")
|
||||
INST(thumb32_STREXD, "STREXD", "111010001100nnnnttttuuuu0111dddd")
|
||||
|
|
|
@ -493,6 +493,7 @@ struct TranslatorVisitor final {
|
|||
bool thumb32_STMDB(bool W, Reg n, Imm<15> reg_list);
|
||||
|
||||
// thumb32 load/store dual, load/store exclusive, table branch instructions
|
||||
bool thumb32_LDA(Reg n, Reg t);
|
||||
bool thumb32_LDRD_imm_1(bool U, Reg n, Reg t, Reg t2, Imm<8> imm8);
|
||||
bool thumb32_LDRD_imm_2(bool U, bool W, Reg n, Reg t, Reg t2, Imm<8> imm8);
|
||||
bool thumb32_LDRD_lit_1(bool U, Reg t, Reg t2, Imm<8> imm8);
|
||||
|
@ -503,6 +504,7 @@ struct TranslatorVisitor final {
|
|||
bool thumb32_LDREXD(Reg n, Reg t, Reg t2);
|
||||
bool thumb32_LDREXB(Reg n, Reg t);
|
||||
bool thumb32_LDREXH(Reg n, Reg t);
|
||||
bool thumb32_STL(Reg n, Reg t);
|
||||
bool thumb32_STREX(Reg n, Reg t, Reg d, Imm<8> imm8);
|
||||
bool thumb32_STREXB(Reg n, Reg t, Reg d);
|
||||
bool thumb32_STREXD(Reg n, Reg t, Reg t2, Reg d);
|
||||
|
@ -875,6 +877,9 @@ struct TranslatorVisitor final {
|
|||
bool asimd_VPMIN_float(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
|
||||
bool asimd_VRECPS(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
|
||||
bool asimd_VRSQRTS(bool D, bool sz, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
|
||||
bool v8_SHA256H(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
|
||||
bool v8_SHA256H2(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
|
||||
bool v8_SHA256SU1(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm);
|
||||
|
||||
// Advanced SIMD three registers with different lengths
|
||||
bool asimd_VADDL(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool op, bool N, bool M, size_t Vm);
|
||||
|
@ -918,6 +923,7 @@ struct TranslatorVisitor final {
|
|||
bool v8_AESE(bool D, size_t sz, size_t Vd, bool M, size_t Vm);
|
||||
bool v8_AESIMC(bool D, size_t sz, size_t Vd, bool M, size_t Vm);
|
||||
bool v8_AESMC(bool D, size_t sz, size_t Vd, bool M, size_t Vm);
|
||||
bool v8_SHA256SU0(bool D, size_t sz, size_t Vd, bool M, size_t Vm);
|
||||
bool asimd_VCLS(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm);
|
||||
bool asimd_VCLZ(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm);
|
||||
bool asimd_VCNT(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm);
|
||||
|
|
|
@ -831,6 +831,60 @@ bool TranslatorVisitor::asimd_VRSQRTS(bool D, bool sz, size_t Vn, size_t Vd, boo
|
|||
});
|
||||
}
|
||||
|
||||
bool TranslatorVisitor::v8_SHA256H(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
|
||||
if (!Q || Common::Bit<0>(Vd) || Common::Bit<0>(Vn) || Common::Bit<0>(Vm)) {
|
||||
return UndefinedInstruction();
|
||||
}
|
||||
|
||||
const auto d = ToVector(Q, Vd, D);
|
||||
const auto n = ToVector(Q, Vn, N);
|
||||
const auto m = ToVector(Q, Vm, M);
|
||||
|
||||
const auto x = ir.GetVector(d);
|
||||
const auto y = ir.GetVector(n);
|
||||
const auto w = ir.GetVector(m);
|
||||
const auto result = ir.SHA256Hash(x, y, w, true);
|
||||
|
||||
ir.SetVector(d, result);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool TranslatorVisitor::v8_SHA256H2(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
|
||||
if (!Q || Common::Bit<0>(Vd) || Common::Bit<0>(Vn) || Common::Bit<0>(Vm)) {
|
||||
return UndefinedInstruction();
|
||||
}
|
||||
|
||||
const auto n = ToVector(Q, Vn, N);
|
||||
const auto d = ToVector(Q, Vd, D);
|
||||
const auto m = ToVector(Q, Vm, M);
|
||||
|
||||
const auto x = ir.GetVector(n);
|
||||
const auto y = ir.GetVector(d);
|
||||
const auto w = ir.GetVector(m);
|
||||
const auto result = ir.SHA256Hash(x, y, w, false);
|
||||
|
||||
ir.SetVector(d, result);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool TranslatorVisitor::v8_SHA256SU1(bool D, size_t Vn, size_t Vd, bool N, bool Q, bool M, size_t Vm) {
|
||||
if (!Q || Common::Bit<0>(Vd) || Common::Bit<0>(Vn) || Common::Bit<0>(Vm)) {
|
||||
return UndefinedInstruction();
|
||||
}
|
||||
|
||||
const auto d = ToVector(Q, Vd, D);
|
||||
const auto n = ToVector(Q, Vn, N);
|
||||
const auto m = ToVector(Q, Vm, M);
|
||||
|
||||
const auto x = ir.GetVector(d);
|
||||
const auto y = ir.GetVector(n);
|
||||
const auto z = ir.GetVector(m);
|
||||
const auto result = ir.SHA256MessageSchedule1(x, y, z);
|
||||
|
||||
ir.SetVector(d, result);
|
||||
return true;
|
||||
}
|
||||
|
||||
// ASIMD Three registers of different length
|
||||
|
||||
bool TranslatorVisitor::asimd_VADDL(bool U, bool D, size_t sz, size_t Vn, size_t Vd, bool op, bool N, bool M, size_t Vm) {
|
||||
|
|
|
@ -225,6 +225,21 @@ bool TranslatorVisitor::v8_AESMC(bool D, size_t sz, size_t Vd, bool M, size_t Vm
|
|||
return true;
|
||||
}
|
||||
|
||||
bool TranslatorVisitor::v8_SHA256SU0(bool D, size_t sz, size_t Vd, bool M, size_t Vm) {
|
||||
if (sz != 0b10 || Common::Bit<0>(Vd) || Common::Bit<0>(Vm)) {
|
||||
return UndefinedInstruction();
|
||||
}
|
||||
|
||||
const auto d = ToVector(true, Vd, D);
|
||||
const auto m = ToVector(true, Vm, M);
|
||||
const auto x = ir.GetVector(d);
|
||||
const auto y = ir.GetVector(m);
|
||||
const auto result = ir.SHA256MessageSchedule0(x, y);
|
||||
|
||||
ir.SetVector(d, result);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool TranslatorVisitor::asimd_VCLS(bool D, size_t sz, size_t Vd, bool Q, bool M, size_t Vm) {
|
||||
if (sz == 0b11) {
|
||||
return UndefinedInstruction();
|
||||
|
|
|
@ -110,6 +110,16 @@ static bool StoreDual(TranslatorVisitor& v, bool P, bool U, bool W, Reg n, Reg t
|
|||
return true;
|
||||
}
|
||||
|
||||
// Translates the Thumb-32 LDA encoding: loads a 32-bit word from the address
// held in register n into register t. Using PC for either register is
// unpredictable. Returns true once the instruction has been translated.
bool TranslatorVisitor::thumb32_LDA(Reg n, Reg t) {
    const bool uses_pc = (t == Reg::PC) || (n == Reg::PC);
    if (uses_pc) {
        return UnpredictableInstruction();
    }

    const auto address = ir.GetRegister(n);
    // The original annotates this access as AccType::Ordered; the plain
    // 32-bit read is what is actually emitted here.
    const auto loaded = ir.ReadMemory32(address);
    ir.SetRegister(t, loaded);
    return true;
}
|
||||
|
||||
bool TranslatorVisitor::thumb32_LDRD_imm_1(bool U, Reg n, Reg t, Reg t2, Imm<8> imm8) {
|
||||
return LoadDualImmediate(*this, false, U, true, n, t, t2, imm8);
|
||||
}
|
||||
|
@ -184,6 +194,16 @@ bool TranslatorVisitor::thumb32_LDREXH(Reg n, Reg t) {
|
|||
return true;
|
||||
}
|
||||
|
||||
// Translates the Thumb-32 STL encoding: stores the 32-bit value of register t
// to the address held in register n. Using PC for either register is
// unpredictable. Returns true once the instruction has been translated.
bool TranslatorVisitor::thumb32_STL(Reg n, Reg t) {
    const bool uses_pc = (t == Reg::PC) || (n == Reg::PC);
    if (uses_pc) {
        return UnpredictableInstruction();
    }

    const auto address = ir.GetRegister(n);
    const auto value = ir.GetRegister(t);
    // The original annotates this access as AccType::Ordered; the plain
    // 32-bit write is what is actually emitted here.
    ir.WriteMemory32(address, value);
    return true;
}
|
||||
|
||||
bool TranslatorVisitor::thumb32_STREX(Reg n, Reg t, Reg d, Imm<8> imm8) {
|
||||
if (d == Reg::PC || t == Reg::PC || n == Reg::PC) {
|
||||
return UnpredictableInstruction();
|
||||
|
|
|
@ -1407,7 +1407,7 @@ bool TranslatorVisitor::vfp_VLDM_a1(Cond cond, bool p, bool u, bool D, bool w, R
|
|||
return arm_UDF();
|
||||
}
|
||||
|
||||
if (n == Reg::PC && w) {
|
||||
if (n == Reg::PC && (w || ir.current_location.TFlag())) {
|
||||
return UnpredictableInstruction();
|
||||
}
|
||||
|
||||
|
@ -1457,7 +1457,7 @@ bool TranslatorVisitor::vfp_VLDM_a2(Cond cond, bool p, bool u, bool D, bool w, R
|
|||
return arm_UDF();
|
||||
}
|
||||
|
||||
if (n == Reg::PC && w) {
|
||||
if (n == Reg::PC && (w || ir.current_location.TFlag())) {
|
||||
return UnpredictableInstruction();
|
||||
}
|
||||
|
||||
|
|
|
@ -46,67 +46,6 @@ IR::U128 SHA1HashUpdate(IREmitter& ir, Vec Vm, Vec Vn, Vec Vd, SHA1HashUpdateFun
|
|||
|
||||
return x;
|
||||
}
|
||||
|
||||
IR::U32 SHAhashSIGMA0(IREmitter& ir, IR::U32 x) {
|
||||
const IR::U32 tmp1 = ir.RotateRight(x, ir.Imm8(2));
|
||||
const IR::U32 tmp2 = ir.RotateRight(x, ir.Imm8(13));
|
||||
const IR::U32 tmp3 = ir.RotateRight(x, ir.Imm8(22));
|
||||
|
||||
return ir.Eor(tmp1, ir.Eor(tmp2, tmp3));
|
||||
}
|
||||
|
||||
IR::U32 SHAhashSIGMA1(IREmitter& ir, IR::U32 x) {
|
||||
const IR::U32 tmp1 = ir.RotateRight(x, ir.Imm8(6));
|
||||
const IR::U32 tmp2 = ir.RotateRight(x, ir.Imm8(11));
|
||||
const IR::U32 tmp3 = ir.RotateRight(x, ir.Imm8(25));
|
||||
|
||||
return ir.Eor(tmp1, ir.Eor(tmp2, tmp3));
|
||||
}
|
||||
|
||||
enum class SHA256HashPart {
|
||||
Part1,
|
||||
Part2
|
||||
};
|
||||
|
||||
IR::U128 SHA256hash(IREmitter& ir, IR::U128 x, IR::U128 y, IR::U128 w, SHA256HashPart part) {
|
||||
for (size_t i = 0; i < 4; i++) {
|
||||
const IR::U32 low_x = ir.VectorGetElement(32, x, 0);
|
||||
const IR::U32 after_low_x = ir.VectorGetElement(32, x, 1);
|
||||
const IR::U32 before_high_x = ir.VectorGetElement(32, x, 2);
|
||||
const IR::U32 high_x = ir.VectorGetElement(32, x, 3);
|
||||
|
||||
const IR::U32 low_y = ir.VectorGetElement(32, y, 0);
|
||||
const IR::U32 after_low_y = ir.VectorGetElement(32, y, 1);
|
||||
const IR::U32 before_high_y = ir.VectorGetElement(32, y, 2);
|
||||
const IR::U32 high_y = ir.VectorGetElement(32, y, 3);
|
||||
|
||||
const IR::U32 choice = SHAchoose(ir, low_y, after_low_y, before_high_y);
|
||||
const IR::U32 majority = SHAmajority(ir, low_x, after_low_x, before_high_x);
|
||||
|
||||
const IR::U32 t = [&] {
|
||||
const IR::U32 w_element = ir.VectorGetElement(32, w, i);
|
||||
const IR::U32 sig = SHAhashSIGMA1(ir, low_y);
|
||||
|
||||
return ir.Add(high_y, ir.Add(sig, ir.Add(choice, w_element)));
|
||||
}();
|
||||
|
||||
const IR::U32 new_low_x = ir.Add(t, ir.Add(SHAhashSIGMA0(ir, low_x), majority));
|
||||
const IR::U32 new_low_y = ir.Add(t, high_x);
|
||||
|
||||
// Shuffle all words left by 1 element: [3, 2, 1, 0] -> [2, 1, 0, 3]
|
||||
const IR::U128 shuffled_x = ir.VectorShuffleWords(x, 0b10010011);
|
||||
const IR::U128 shuffled_y = ir.VectorShuffleWords(y, 0b10010011);
|
||||
|
||||
x = ir.VectorSetElement(32, shuffled_x, 0, new_low_x);
|
||||
y = ir.VectorSetElement(32, shuffled_y, 0, new_low_y);
|
||||
}
|
||||
|
||||
if (part == SHA256HashPart::Part1) {
|
||||
return x;
|
||||
}
|
||||
|
||||
return y;
|
||||
}
|
||||
} // Anonymous namespace
|
||||
|
||||
bool TranslatorVisitor::SHA1C(Vec Vm, Vec Vn, Vec Vd) {
|
||||
|
@ -175,85 +114,34 @@ bool TranslatorVisitor::SHA1H(Vec Vn, Vec Vd) {
|
|||
}
|
||||
|
||||
bool TranslatorVisitor::SHA256SU0(Vec Vn, Vec Vd) {
|
||||
const IR::U128 d = ir.GetQ(Vd);
|
||||
const IR::U128 n = ir.GetQ(Vn);
|
||||
const IR::U128 x = ir.GetQ(Vd);
|
||||
const IR::U128 y = ir.GetQ(Vn);
|
||||
|
||||
const IR::U128 t = [&] {
|
||||
// Shuffle the upper three elements down: [3, 2, 1, 0] -> [0, 3, 2, 1]
|
||||
const IR::U128 shuffled = ir.VectorShuffleWords(d, 0b00111001);
|
||||
|
||||
return ir.VectorSetElement(32, shuffled, 3, ir.VectorGetElement(32, n, 0));
|
||||
}();
|
||||
|
||||
IR::U128 result = ir.ZeroVector();
|
||||
for (size_t i = 0; i < 4; i++) {
|
||||
const IR::U32 modified_element = [&] {
|
||||
const IR::U32 element = ir.VectorGetElement(32, t, i);
|
||||
const IR::U32 tmp1 = ir.RotateRight(element, ir.Imm8(7));
|
||||
const IR::U32 tmp2 = ir.RotateRight(element, ir.Imm8(18));
|
||||
const IR::U32 tmp3 = ir.LogicalShiftRight(element, ir.Imm8(3));
|
||||
|
||||
return ir.Eor(tmp1, ir.Eor(tmp2, tmp3));
|
||||
}();
|
||||
|
||||
const IR::U32 d_element = ir.VectorGetElement(32, d, i);
|
||||
result = ir.VectorSetElement(32, result, i, ir.Add(modified_element, d_element));
|
||||
}
|
||||
const IR::U128 result = ir.SHA256MessageSchedule0(x, y);
|
||||
|
||||
ir.SetQ(Vd, result);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool TranslatorVisitor::SHA256SU1(Vec Vm, Vec Vn, Vec Vd) {
|
||||
const IR::U128 d = ir.GetQ(Vd);
|
||||
const IR::U128 m = ir.GetQ(Vm);
|
||||
const IR::U128 n = ir.GetQ(Vn);
|
||||
const IR::U128 x = ir.GetQ(Vd);
|
||||
const IR::U128 y = ir.GetQ(Vn);
|
||||
const IR::U128 z = ir.GetQ(Vm);
|
||||
|
||||
const IR::U128 T0 = [&] {
|
||||
const IR::U32 low_m = ir.VectorGetElement(32, m, 0);
|
||||
const IR::U128 shuffled_n = ir.VectorShuffleWords(n, 0b00111001);
|
||||
|
||||
return ir.VectorSetElement(32, shuffled_n, 3, low_m);
|
||||
}();
|
||||
|
||||
const IR::U128 lower_half = [&] {
|
||||
const IR::U128 T = ir.VectorShuffleWords(m, 0b01001110);
|
||||
const IR::U128 tmp1 = ir.VectorRotateRight(32, T, 17);
|
||||
const IR::U128 tmp2 = ir.VectorRotateRight(32, T, 19);
|
||||
const IR::U128 tmp3 = ir.VectorLogicalShiftRight(32, T, 10);
|
||||
const IR::U128 tmp4 = ir.VectorEor(tmp1, ir.VectorEor(tmp2, tmp3));
|
||||
const IR::U128 tmp5 = ir.VectorAdd(32, tmp4, ir.VectorAdd(32, d, T0));
|
||||
return ir.VectorZeroUpper(tmp5);
|
||||
}();
|
||||
|
||||
const IR::U64 upper_half = [&] {
|
||||
const IR::U128 tmp1 = ir.VectorRotateRight(32, lower_half, 17);
|
||||
const IR::U128 tmp2 = ir.VectorRotateRight(32, lower_half, 19);
|
||||
const IR::U128 tmp3 = ir.VectorLogicalShiftRight(32, lower_half, 10);
|
||||
const IR::U128 tmp4 = ir.VectorEor(tmp1, ir.VectorEor(tmp2, tmp3));
|
||||
|
||||
// Shuffle the top two 32-bit elements downwards [3, 2, 1, 0] -> [1, 0, 3, 2]
|
||||
const IR::U128 shuffled_d = ir.VectorShuffleWords(d, 0b01001110);
|
||||
const IR::U128 shuffled_T0 = ir.VectorShuffleWords(T0, 0b01001110);
|
||||
|
||||
const IR::U128 tmp5 = ir.VectorAdd(32, tmp4, ir.VectorAdd(32, shuffled_d, shuffled_T0));
|
||||
return ir.VectorGetElement(64, tmp5, 0);
|
||||
}();
|
||||
|
||||
const IR::U128 result = ir.VectorSetElement(64, lower_half, 1, upper_half);
|
||||
const IR::U128 result = ir.SHA256MessageSchedule1(x, y, z);
|
||||
|
||||
ir.SetQ(Vd, result);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool TranslatorVisitor::SHA256H(Vec Vm, Vec Vn, Vec Vd) {
|
||||
const IR::U128 result = SHA256hash(ir, ir.GetQ(Vd), ir.GetQ(Vn), ir.GetQ(Vm), SHA256HashPart::Part1);
|
||||
const IR::U128 result = ir.SHA256Hash(ir.GetQ(Vd), ir.GetQ(Vn), ir.GetQ(Vm), true);
|
||||
ir.SetQ(Vd, result);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool TranslatorVisitor::SHA256H2(Vec Vm, Vec Vn, Vec Vd) {
|
||||
const IR::U128 result = SHA256hash(ir, ir.GetQ(Vn), ir.GetQ(Vd), ir.GetQ(Vm), SHA256HashPart::Part2);
|
||||
const IR::U128 result = ir.SHA256Hash(ir.GetQ(Vn), ir.GetQ(Vd), ir.GetQ(Vm), false);
|
||||
ir.SetQ(Vd, result);
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -903,6 +903,18 @@ U8 IREmitter::SM4AccessSubstitutionBox(const U8& a) {
|
|||
return Inst<U8>(Opcode::SM4AccessSubstitutionBox, a);
|
||||
}
|
||||
|
||||
// Emits a SHA256Hash IR instruction taking x, y and w as its vector operands
// and `part1` as a one-bit immediate; returns the instruction's 128-bit result.
U128 IREmitter::SHA256Hash(const U128& x, const U128& y, const U128& w, bool part1) {
    const auto part_flag = Imm1(part1);
    return Inst<U128>(Opcode::SHA256Hash, x, y, w, part_flag);
}
|
||||
|
||||
// Emits a SHA256MessageSchedule0 IR instruction over x and y and returns its
// 128-bit result.
U128 IREmitter::SHA256MessageSchedule0(const U128& x, const U128& y) {
    const Opcode opcode = Opcode::SHA256MessageSchedule0;
    return Inst<U128>(opcode, x, y);
}
|
||||
|
||||
// Emits a SHA256MessageSchedule1 IR instruction over x, y and z and returns its
// 128-bit result.
U128 IREmitter::SHA256MessageSchedule1(const U128& x, const U128& y, const U128& z) {
    const Opcode opcode = Opcode::SHA256MessageSchedule1;
    return Inst<U128>(opcode, x, y, z);
}
|
||||
|
||||
UAny IREmitter::VectorGetElement(size_t esize, const U128& a, size_t index) {
|
||||
ASSERT_MSG(esize * index < 128, "Invalid index");
|
||||
switch (esize) {
|
||||
|
|
|
@ -219,6 +219,10 @@ public:
|
|||
|
||||
U8 SM4AccessSubstitutionBox(const U8& a);
|
||||
|
||||
U128 SHA256Hash(const U128& x, const U128& y, const U128& w, bool part1);
|
||||
U128 SHA256MessageSchedule0(const U128& x, const U128& y);
|
||||
U128 SHA256MessageSchedule1(const U128& x, const U128& y, const U128& z);
|
||||
|
||||
UAny VectorGetElement(size_t esize, const U128& a, size_t index);
|
||||
U128 VectorSetElement(size_t esize, const U128& a, size_t index, const UAny& elem);
|
||||
U128 VectorAbs(size_t esize, const U128& a);
|
||||
|
|
|
@ -272,6 +272,11 @@ OPCODE(AESMixColumns, U128, U128
|
|||
// SM4 instructions
|
||||
OPCODE(SM4AccessSubstitutionBox, U8, U8 )
|
||||
|
||||
// SHA instructions
|
||||
OPCODE(SHA256Hash, U128, U128, U128, U128, U1 )
|
||||
OPCODE(SHA256MessageSchedule0, U128, U128, U128 )
|
||||
OPCODE(SHA256MessageSchedule1, U128, U128, U128, U128 )
|
||||
|
||||
// Vector instructions
|
||||
OPCODE(VectorGetElement8, U8, U128, U8 )
|
||||
OPCODE(VectorGetElement16, U16, U128, U8 )
|
||||
|
|
|
@ -20,6 +20,13 @@ class Block;
|
|||
|
||||
namespace Dynarmic::Optimization {
|
||||
|
||||
struct PolyfillOptions {
|
||||
bool sha256 = false;
|
||||
|
||||
bool operator==(const PolyfillOptions&) const = default;
|
||||
};
|
||||
|
||||
void PolyfillPass(IR::Block& block, const PolyfillOptions& opt);
|
||||
void A32ConstantMemoryReads(IR::Block& block, A32::UserCallbacks* cb);
|
||||
void A32GetSetElimination(IR::Block& block);
|
||||
void A64CallbackConfigPass(IR::Block& block, const A64::UserConfig& conf);
|
||||
|
|
175
externals/dynarmic/src/dynarmic/ir/opt/polyfill_pass.cpp
vendored
Executable file
175
externals/dynarmic/src/dynarmic/ir/opt/polyfill_pass.cpp
vendored
Executable file
|
@ -0,0 +1,175 @@
|
|||
/* This file is part of the dynarmic project.
|
||||
* Copyright (c) 2022 MerryMage
|
||||
* SPDX-License-Identifier: 0BSD
|
||||
*/
|
||||
|
||||
#include "dynarmic/ir/basic_block.h"
|
||||
#include "dynarmic/ir/ir_emitter.h"
|
||||
#include "dynarmic/ir/microinstruction.h"
|
||||
#include "dynarmic/ir/opcodes.h"
|
||||
#include "dynarmic/ir/opt/passes.h"
|
||||
|
||||
namespace Dynarmic::Optimization {
|
||||
|
||||
namespace {
|
||||
|
||||
// Replaces the uses of a SHA256MessageSchedule0 instruction with generic IR.
//
// With x = argument 0, y = argument 1 and t = VectorExtract(x, y, 32), every
// 32-bit lane i of the replacement value is
//     x[i] + (ROR(t[i], 7) ^ ROR(t[i], 18) ^ (t[i] >> 3))
//
// The lane-wise function is emitted with whole-vector rotate/shift/xor
// operations (the same emitter calls the MessageSchedule1 polyfill uses)
// instead of extracting, transforming and re-inserting each lane separately.
//
// ir   - emitter used to emit the replacement IR.
// inst - the instruction being polyfilled; its uses are redirected via
//        ReplaceUsesWith.
void PolyfillSHA256MessageSchedule0(IR::IREmitter& ir, IR::Inst& inst) {
    const auto x = static_cast<IR::U128>(inst.GetArg(0));
    const auto y = static_cast<IR::U128>(inst.GetArg(1));

    const IR::U128 t = ir.VectorExtract(x, y, 32);

    // ROR(t, 7) ^ ROR(t, 18) ^ (t >> 3), applied to all four lanes at once.
    const IR::U128 rotated_7 = ir.VectorRotateRight(32, t, 7);
    const IR::U128 rotated_18 = ir.VectorRotateRight(32, t, 18);
    const IR::U128 shifted_3 = ir.VectorLogicalShiftRight(32, t, 3);
    const IR::U128 mixed = ir.VectorEor(rotated_7, ir.VectorEor(rotated_18, shifted_3));

    const IR::U128 result = ir.VectorAdd(32, mixed, x);

    inst.ReplaceUsesWith(result);
}
|
||||
|
||||
// Replaces the uses of a SHA256MessageSchedule1 instruction with generic IR.
//
// The result is built in two 64-bit halves: the lower half is computed first
// from the word-swapped third argument, and the upper half is then derived from
// that freshly computed lower half. Each half applies
//     ROR(v, 17) ^ ROR(v, 19) ^ (v >> 10)
// lane-wise and adds the corresponding lanes of the first argument and of
// VectorExtract(arg1, arg2, 32).
//
// ir   - emitter used to emit the replacement IR.
// inst - the instruction being polyfilled; its uses are redirected via
//        ReplaceUsesWith.
void PolyfillSHA256MessageSchedule1(IR::IREmitter& ir, IR::Inst& inst) {
    const auto x = static_cast<IR::U128>(inst.GetArg(0));
    const auto y = static_cast<IR::U128>(inst.GetArg(1));
    const auto z = static_cast<IR::U128>(inst.GetArg(2));

    const IR::U128 extracted = ir.VectorExtract(y, z, 32);

    // ---- Lower 64 bits ----
    const IR::U128 swapped_z = ir.VectorShuffleWords(z, 0b01001110);
    const IR::U128 lo_ror17 = ir.VectorRotateRight(32, swapped_z, 17);
    const IR::U128 lo_ror19 = ir.VectorRotateRight(32, swapped_z, 19);
    const IR::U128 lo_shr10 = ir.VectorLogicalShiftRight(32, swapped_z, 10);
    const IR::U128 lo_sigma = ir.VectorEor(lo_ror17, ir.VectorEor(lo_ror19, lo_shr10));
    const IR::U128 lo_sum = ir.VectorAdd(32, lo_sigma, ir.VectorAdd(32, x, extracted));
    const IR::U128 lower_half = ir.VectorZeroUpper(lo_sum);

    // ---- Upper 64 bits (depends on the lower half just computed) ----
    const IR::U128 hi_ror17 = ir.VectorRotateRight(32, lower_half, 17);
    const IR::U128 hi_ror19 = ir.VectorRotateRight(32, lower_half, 19);
    const IR::U128 hi_shr10 = ir.VectorLogicalShiftRight(32, lower_half, 10);
    const IR::U128 hi_sigma = ir.VectorEor(hi_ror17, ir.VectorEor(hi_ror19, hi_shr10));

    // Bring the top two 32-bit elements down: [3, 2, 1, 0] -> [1, 0, 3, 2]
    const IR::U128 swapped_x = ir.VectorShuffleWords(x, 0b01001110);
    const IR::U128 swapped_extracted = ir.VectorShuffleWords(extracted, 0b01001110);

    const IR::U128 hi_sum = ir.VectorAdd(32, hi_sigma, ir.VectorAdd(32, swapped_x, swapped_extracted));
    const IR::U64 upper_half = ir.VectorGetElement(64, hi_sum, 0);

    const IR::U128 result = ir.VectorSetElement(64, lower_half, 1, upper_half);

    inst.ReplaceUsesWith(result);
}
|
||||
|
||||
// Emits IR computing ((y ^ z) & x) ^ z, i.e. a bitwise select that takes bits
// from y where x is set and from z where x is clear.
IR::U32 SHAchoose(IR::IREmitter& ir, IR::U32 x, IR::U32 y, IR::U32 z) {
    const auto differing_bits = ir.Eor(y, z);
    const auto selected = ir.And(differing_bits, x);
    return ir.Eor(selected, z);
}
|
||||
|
||||
// Emits IR computing (x & y) | ((x | y) & z): each result bit is set when at
// least two of the corresponding bits of x, y and z are set.
//
// The sub-expressions are emitted as separate statements so that the order of
// the generated IR instructions is fixed. When both And operations are written
// as sibling call arguments, their evaluation order is unspecified and the
// emitted instruction order can differ between compilers.
IR::U32 SHAmajority(IR::IREmitter& ir, IR::U32 x, IR::U32 y, IR::U32 z) {
    const auto both_xy = ir.And(x, y);
    const auto either_xy = ir.Or(x, y);
    const auto either_and_z = ir.And(either_xy, z);
    return ir.Or(both_xy, either_and_z);
}
|
||||
|
||||
// Emits IR computing ROR(x, 2) ^ ROR(x, 13) ^ ROR(x, 22).
IR::U32 SHAhashSIGMA0(IR::IREmitter& ir, IR::U32 x) {
    const IR::U32 ror_2 = ir.RotateRight(x, ir.Imm8(2));
    const IR::U32 ror_13 = ir.RotateRight(x, ir.Imm8(13));
    const IR::U32 ror_22 = ir.RotateRight(x, ir.Imm8(22));

    const auto ror_13_xor_22 = ir.Eor(ror_13, ror_22);
    return ir.Eor(ror_2, ror_13_xor_22);
}
|
||||
|
||||
/// Emits IR computing ROR(x, 6) ^ ROR(x, 11) ^ ROR(x, 25).
IR::U32 SHAhashSIGMA1(IR::IREmitter& ir, IR::U32 x) {
    const IR::U32 rotated_by_6 = ir.RotateRight(x, ir.Imm8(6));
    const IR::U32 rotated_by_11 = ir.RotateRight(x, ir.Imm8(11));
    const IR::U32 rotated_by_25 = ir.RotateRight(x, ir.Imm8(25));

    const auto upper_terms = ir.Eor(rotated_by_11, rotated_by_25);
    return ir.Eor(rotated_by_6, upper_terms);
}
|
||||
|
||||
/// Replaces the uses of a SHA256Hash instruction with IR built from scalar 32-bit
/// operations and vector element moves.
///
/// Arguments 0 and 1 of `inst` are the two 128-bit state vectors (x, y), argument 2 is
/// the 128-bit vector w that supplies one 32-bit word per round, and argument 3 selects
/// which of the two updated state vectors becomes the result.
void PolyfillSHA256Hash(IR::IREmitter& ir, IR::Inst& inst) {
    IR::U128 x = static_cast<IR::U128>(inst.GetArg(0));
    IR::U128 y = static_cast<IR::U128>(inst.GetArg(1));
    const IR::U128 w = static_cast<IR::U128>(inst.GetArg(2));
    const bool part1 = inst.GetArg(3).GetU1();

    // Four rounds, consuming one 32-bit element of w each.
    for (size_t round = 0; round < 4; ++round) {
        const IR::U32 x0 = ir.VectorGetElement(32, x, 0);
        const IR::U32 x1 = ir.VectorGetElement(32, x, 1);
        const IR::U32 x2 = ir.VectorGetElement(32, x, 2);
        const IR::U32 x3 = ir.VectorGetElement(32, x, 3);

        const IR::U32 y0 = ir.VectorGetElement(32, y, 0);
        const IR::U32 y1 = ir.VectorGetElement(32, y, 1);
        const IR::U32 y2 = ir.VectorGetElement(32, y, 2);
        const IR::U32 y3 = ir.VectorGetElement(32, y, 3);

        const IR::U32 choice = SHAchoose(ir, y0, y1, y2);
        const IR::U32 majority = SHAmajority(ir, x0, x1, x2);

        // t = y[3] + SIGMA1(y[0]) + choose(y[0], y[1], y[2]) + w[round]
        const IR::U32 w_word = ir.VectorGetElement(32, w, round);
        const IR::U32 sigma1 = SHAhashSIGMA1(ir, y0);
        const IR::U32 t = ir.Add(y3, ir.Add(sigma1, ir.Add(choice, w_word)));

        const IR::U32 next_x0 = ir.Add(t, ir.Add(SHAhashSIGMA0(ir, x0), majority));
        const IR::U32 next_y0 = ir.Add(t, x3);

        // Move every word up by one element ([3, 2, 1, 0] -> [2, 1, 0, 3]); the wrapped
        // element 0 is then overwritten with the freshly computed word.
        const IR::U128 rotated_x = ir.VectorShuffleWords(x, 0b10010011);
        const IR::U128 rotated_y = ir.VectorShuffleWords(y, 0b10010011);

        x = ir.VectorSetElement(32, rotated_x, 0, next_x0);
        y = ir.VectorSetElement(32, rotated_y, 0, next_y0);
    }

    if (part1) {
        inst.ReplaceUsesWith(x);
    } else {
        inst.ReplaceUsesWith(y);
    }
}
|
||||
|
||||
} // namespace
|
||||
|
||||
/// Walks every instruction of `block` and, for each SHA256 opcode whose polyfill is
/// enabled in `polyfill`, emits replacement IR immediately before that instruction.
/// Does nothing when `polyfill` compares equal to a default-constructed PolyfillOptions.
void PolyfillPass(IR::Block& block, const PolyfillOptions& polyfill) {
    const bool nothing_requested = polyfill == PolyfillOptions{};
    if (nothing_requested) {
        return;
    }

    IR::IREmitter ir{block};

    for (auto& inst : block) {
        // Replacement IR is inserted in front of the instruction being examined.
        ir.SetInsertionPoint(&inst);

        switch (inst.GetOpcode()) {
        case IR::Opcode::SHA256MessageSchedule0:
            if (!polyfill.sha256) {
                break;
            }
            PolyfillSHA256MessageSchedule0(ir, inst);
            break;
        case IR::Opcode::SHA256MessageSchedule1:
            if (!polyfill.sha256) {
                break;
            }
            PolyfillSHA256MessageSchedule1(ir, inst);
            break;
        case IR::Opcode::SHA256Hash:
            if (!polyfill.sha256) {
                break;
            }
            PolyfillSHA256Hash(ir, inst);
            break;
        default:
            break;
        }
    }
}
|
||||
|
||||
} // namespace Dynarmic::Optimization
|
31
externals/dynarmic/tests/A32/fuzz_arm.cpp
vendored
31
externals/dynarmic/tests/A32/fuzz_arm.cpp
vendored
|
@ -535,6 +535,37 @@ TEST_CASE("A32: Single random thumb instruction", "[thumb]") {
|
|||
}
|
||||
}
|
||||
|
||||
// Fuzzes one random Thumb instruction that is preceded by a NOP, so the instruction under
// test does not sit at the very start of the executed code, and compares the JIT against
// Unicorn.
TEST_CASE("A32: Single random thumb instruction (offset)", "[thumb]") {
    constexpr size_t iteration_count = 100000;
    constexpr u32 start_address = 100;
    constexpr u16 thumb_nop = 0xbf00;

    ThumbTestEnv jit_env{};
    ThumbTestEnv uni_env{};

    Dynarmic::A32::Jit jit{GetUserConfig(jit_env)};
    A32Unicorn<ThumbTestEnv> uni{uni_env};

    A32Unicorn<ThumbTestEnv>::RegisterArray regs;
    A32Unicorn<ThumbTestEnv>::ExtRegArray ext_reg;
    std::vector<u16> instructions;

    const auto random_word = [] { return RandInt<u32>(0, ~u32(0)); };

    for (size_t iteration = 0; iteration < iteration_count; ++iteration) {
        // Random draws happen in this order: core registers, extension registers,
        // the instruction itself, the CPSR flags and finally the FPCR.
        std::generate(regs.begin(), regs.end(), random_word);
        std::generate(ext_reg.begin(), ext_reg.end(), random_word);
        regs[15] = start_address;

        const std::vector<u16> inst = GenRandomThumbInst(0, true);

        instructions.clear();
        instructions.push_back(thumb_nop);
        instructions.insert(instructions.end(), inst.begin(), inst.end());

        // Random top four CPSR bits combined with the fixed low bits 0x1F0.
        const u32 cpsr = (RandInt<u32>(0, 0xF) << 28) | 0x1F0;
        const u32 fpcr = RandomFpcr();

        INFO("Instruction: 0x" << std::hex << inst[0]);

        // The trailing 2 is presumably the number of instructions to run (the NOP plus
        // the generated one) -- confirm against RunTestInstance.
        RunTestInstance(jit, uni, jit_env, uni_env, regs, ext_reg, instructions, cpsr, fpcr, 2);
    }
}
|
||||
|
||||
TEST_CASE("A32: Small random thumb block", "[thumb]") {
|
||||
ThumbTestEnv jit_env{};
|
||||
ThumbTestEnv uni_env{};
|
||||
|
|
|
@ -38,7 +38,7 @@ if (MSVC)
|
|||
/MP
|
||||
/Zf
|
||||
/Zi
|
||||
/Zm200
|
||||
/Zm300
|
||||
/Zo
|
||||
/permissive-
|
||||
/EHsc
|
||||
|
|
|
@ -322,7 +322,7 @@ struct Memory::Impl {
|
|||
}
|
||||
|
||||
if (Settings::IsFastmemEnabled()) {
|
||||
const bool is_read_enable = Settings::IsGPULevelHigh() || !cached;
|
||||
const bool is_read_enable = !Settings::IsGPULevelExtreme() || !cached;
|
||||
system.DeviceMemory().buffer.Protect(vaddr, size, is_read_enable, !cached);
|
||||
}
|
||||
|
||||
|
|
|
@ -1495,15 +1495,13 @@ typename BufferCache<P>::OverlapResult BufferCache<P>::ResolveOverlaps(VAddr cpu
|
|||
overlap_ids.push_back(overlap_id);
|
||||
overlap.Pick();
|
||||
const VAddr overlap_cpu_addr = overlap.CpuAddr();
|
||||
bool goes_left = false;
|
||||
if (overlap_cpu_addr < begin) {
|
||||
goes_left = true;
|
||||
const bool expands_left = overlap_cpu_addr < begin;
|
||||
if (expands_left) {
|
||||
cpu_addr = begin = overlap_cpu_addr;
|
||||
}
|
||||
const VAddr overlap_end = overlap_cpu_addr + overlap.SizeBytes();
|
||||
bool goes_right = false;
|
||||
const bool expands_right = overlap_end > end;
|
||||
if (overlap_end > end) {
|
||||
goes_right = true;
|
||||
end = overlap_end;
|
||||
}
|
||||
stream_score += overlap.StreamScore();
|
||||
|
@ -1511,11 +1509,11 @@ typename BufferCache<P>::OverlapResult BufferCache<P>::ResolveOverlaps(VAddr cpu
|
|||
// When this memory region has been joined a bunch of times, we assume it's being used
|
||||
// as a stream buffer. Increase the size to skip constantly recreating buffers.
|
||||
has_stream_leap = true;
|
||||
if (goes_right) {
|
||||
if (expands_right) {
|
||||
begin -= PAGE_SIZE * 256;
|
||||
cpu_addr = begin;
|
||||
}
|
||||
if (goes_left) {
|
||||
if (expands_left) {
|
||||
end += PAGE_SIZE * 256;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -2,8 +2,11 @@
|
|||
// Licensed under GPLv2 or any later version
|
||||
// Refer to the license.txt file included.
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstring>
|
||||
#include <optional>
|
||||
|
||||
#include "common/alignment.h"
|
||||
#include "common/assert.h"
|
||||
#include "common/settings.h"
|
||||
#include "core/core.h"
|
||||
|
@ -27,6 +30,7 @@ Maxwell3D::Maxwell3D(Core::System& system_, MemoryManager& memory_manager_)
|
|||
upload_state{memory_manager, regs.upload} {
|
||||
dirty.flags.flip();
|
||||
InitializeRegisterDefaults();
|
||||
accelerated_reads = Settings::IsFastmemEnabled();
|
||||
}
|
||||
|
||||
Maxwell3D::~Maxwell3D() = default;
|
||||
|
@ -210,28 +214,14 @@ void Maxwell3D::ProcessMethodCall(u32 method, u32 argument, u32 nonshadow_argume
|
|||
return ProcessCBBind(4);
|
||||
case MAXWELL3D_REG_INDEX(draw.vertex_end_gl):
|
||||
return DrawArrays();
|
||||
case MAXWELL3D_REG_INDEX(small_index): {
|
||||
case MAXWELL3D_REG_INDEX(small_index):
|
||||
regs.index_array.count = regs.small_index.count;
|
||||
regs.index_array.first = regs.small_index.first;
|
||||
dirty.flags[VideoCommon::Dirty::IndexBuffer] = true;
|
||||
bool is_extreme = Settings::IsGPULevelExtreme();
|
||||
|
||||
if (!is_extreme) {
|
||||
for (size_t i = 0; i < Regs::NumVertexArrays; i++) {
|
||||
if (!dirty.flags[VideoCommon::Dirty::VertexBuffer0 + i]) {
|
||||
continue;
|
||||
}
|
||||
const u32 stride = regs.vertex_array[i].stride;
|
||||
const u32 num_vertices = regs.index_array.first + regs.index_array.count;
|
||||
const GPUVAddr gpu_addr_begin =
|
||||
regs.vertex_array[i].StartAddress() + regs.index_array.first * stride;
|
||||
const GPUVAddr gpu_addr_end = gpu_addr_begin + num_vertices * stride + 1;
|
||||
regs.vertex_array_limit[i].SetAddress(gpu_addr_end);
|
||||
}
|
||||
}
|
||||
DrawArrays();
|
||||
return;
|
||||
if (!Settings::IsGPULevelExtreme()) {
|
||||
RecalculateVertexArrayLimit();
|
||||
}
|
||||
return DrawArrays();
|
||||
case MAXWELL3D_REG_INDEX(topology_override):
|
||||
use_topology_override = true;
|
||||
return;
|
||||
|
@ -685,4 +675,71 @@ void Maxwell3D::ProcessClearBuffers() {
|
|||
rasterizer->Clear();
|
||||
}
|
||||
|
||||
void Maxwell3D::RecalculateVertexArrayLimit() {
|
||||
GPUVAddr start_address = regs.index_array.StartAddress();
|
||||
auto& vn_state = vertex_num_approx_state;
|
||||
if (start_address != vn_state.last_index_array_start ||
|
||||
vn_state.current_min_index != regs.index_array.first) {
|
||||
vn_state.last_index_array_start = start_address;
|
||||
vn_state.current_max_index = regs.index_array.first;
|
||||
vn_state.current_min_index = regs.index_array.first;
|
||||
vn_state.current_num_vertices = 0;
|
||||
}
|
||||
const u32 index_count = regs.index_array.first + regs.index_array.count;
|
||||
if (index_count <= vn_state.current_max_index) {
|
||||
return;
|
||||
}
|
||||
const u32 max_base = std::max(regs.index_array.first, vn_state.current_max_index);
|
||||
const u32 num_indices = index_count - max_base;
|
||||
const size_t size_index = regs.index_array.FormatSizeInBytes();
|
||||
const size_t expected_size = num_indices * size_index;
|
||||
const size_t offset = max_base * size_index;
|
||||
|
||||
auto maybe_ptr = memory_manager.GpuToHostPointer(start_address + offset);
|
||||
u8* ptr;
|
||||
if (accelerated_reads && maybe_ptr) {
|
||||
ptr = *maybe_ptr;
|
||||
} else {
|
||||
vn_state.index_buffer_cache.resize(Common::DivideUp(expected_size, sizeof(u32)));
|
||||
ptr = reinterpret_cast<u8*>(vn_state.index_buffer_cache.data());
|
||||
memory_manager.ReadBlockUnsafe(start_address + offset, ptr, expected_size);
|
||||
}
|
||||
vn_state.current_max_index = index_count;
|
||||
|
||||
u32 new_num_vertices{};
|
||||
switch (regs.index_array.format) {
|
||||
case Regs::IndexFormat::UnsignedByte: {
|
||||
std::span<const u8> span{ptr, num_indices};
|
||||
const auto max = std::max_element(span.begin(), span.end());
|
||||
new_num_vertices = *max + 1;
|
||||
break;
|
||||
}
|
||||
case Regs::IndexFormat::UnsignedShort: {
|
||||
std::span<const u16> span{reinterpret_cast<const u16*>(ptr), num_indices};
|
||||
const auto max = std::max_element(span.begin(), span.end());
|
||||
new_num_vertices = *max + 1;
|
||||
break;
|
||||
}
|
||||
case Regs::IndexFormat::UnsignedInt: {
|
||||
std::span<const u32> span{reinterpret_cast<const u32*>(ptr), num_indices};
|
||||
const auto max = std::max_element(span.begin(), span.end());
|
||||
new_num_vertices = *max + 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (new_num_vertices > vn_state.current_num_vertices) {
|
||||
vn_state.current_num_vertices = new_num_vertices;
|
||||
for (size_t i = 0; i < Regs::NumVertexArrays; i++) {
|
||||
if (!regs.vertex_array[i].enable) {
|
||||
continue;
|
||||
}
|
||||
const u32 stride = regs.vertex_array[i].stride;
|
||||
const GPUVAddr gpu_addr_begin = regs.vertex_array[i].StartAddress();
|
||||
const GPUVAddr gpu_addr_end = gpu_addr_begin + new_num_vertices * stride - 1;
|
||||
regs.vertex_array_limit[i].SetAddress(gpu_addr_end);
|
||||
dirty.flags[VideoCommon::Dirty::VertexBuffer0 + i] = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace Tegra::Engines
|
||||
|
|
|
@ -1498,6 +1498,16 @@ public:
|
|||
Tables tables{};
|
||||
} dirty;
|
||||
|
||||
struct VertexNumApproxState {
|
||||
GPUVAddr last_index_array_start;
|
||||
u32 current_max_index;
|
||||
u32 current_min_index;
|
||||
u32 current_num_vertices;
|
||||
std::vector<u32> index_buffer_cache;
|
||||
} vertex_num_approx_state;
|
||||
|
||||
bool accelerated_reads{};
|
||||
|
||||
private:
|
||||
void InitializeRegisterDefaults();
|
||||
|
||||
|
@ -1566,6 +1576,8 @@ private:
|
|||
// Handles a instance drawcall from MME
|
||||
void StepInstance(MMEDrawMode expected_mode, u32 count);
|
||||
|
||||
void RecalculateVertexArrayLimit();
|
||||
|
||||
/// Returns a query's value or an empty object if the value will be deferred through a cache.
|
||||
std::optional<u64> GetQueryResult();
|
||||
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
|
||||
#include "common/alignment.h"
|
||||
#include "common/assert.h"
|
||||
#include "common/host_memory.h"
|
||||
#include "common/logging/log.h"
|
||||
#include "core/core.h"
|
||||
#include "core/hle/kernel/k_page_table.h"
|
||||
|
@ -186,6 +187,19 @@ std::optional<VAddr> MemoryManager::GpuToCpuAddress(GPUVAddr gpu_addr) const {
|
|||
return page_entry.ToAddress() + (gpu_addr & page_mask);
|
||||
}
|
||||
|
||||
/// Translates a GPU virtual address into a host pointer.
///
/// Returns std::nullopt when the GPU address has no CPU address mapping or when the
/// device memory buffer reports no virtual base pointer; otherwise returns the virtual
/// base pointer advanced by the translated CPU address.
std::optional<u8*> MemoryManager::GpuToHostPointer(GPUVAddr gpu_addr) const {
    const auto cpu_addr = GpuToCpuAddress(gpu_addr);
    if (!cpu_addr.has_value()) {
        return std::nullopt;
    }

    const auto host_base = system.DeviceMemory().buffer.VirtualBasePointer();
    if (!host_base) {
        return std::nullopt;
    }

    return host_base + cpu_addr.value();
}
|
||||
|
||||
std::optional<VAddr> MemoryManager::GpuToCpuAddress(GPUVAddr addr, std::size_t size) const {
|
||||
size_t page_index{addr >> page_bits};
|
||||
const size_t page_last{(addr + size + page_size - 1) >> page_bits};
|
||||
|
|
|
@ -76,6 +76,8 @@ public:
|
|||
|
||||
[[nodiscard]] std::optional<VAddr> GpuToCpuAddress(GPUVAddr addr) const;
|
||||
|
||||
[[nodiscard]] std::optional<u8*> GpuToHostPointer(GPUVAddr addr) const;
|
||||
|
||||
[[nodiscard]] std::optional<VAddr> GpuToCpuAddress(GPUVAddr addr, std::size_t size) const;
|
||||
|
||||
template <typename T>
|
||||
|
|
Loading…
Reference in a new issue