From 242561debf141579a09dbac7ac6bcbd6b481a511 Mon Sep 17 00:00:00 2001
From: Stenzek
Date: Sun, 29 Dec 2024 18:11:39 +1000
Subject: [PATCH] CPU/Recompiler: Align dispatchers and JIT blocks

A couple of percent difference if we're lucky. Practically probably <1%.
---
 src/core/cpu_code_cache.cpp         | 11 ++---
 src/core/cpu_code_cache_private.h   |  1 +
 src/core/cpu_recompiler.cpp         | 27 ++++++------
 src/core/cpu_recompiler.h           | 12 ++++++
 src/core/cpu_recompiler_arm32.cpp   | 14 +++---
 src/core/cpu_recompiler_arm64.cpp   | 67 ++++++++++++++++++-----------
 src/core/cpu_recompiler_riscv64.cpp |  9 +++-
 src/core/cpu_recompiler_x64.cpp     | 41 +++++++++++++++---
 8 files changed, 127 insertions(+), 55 deletions(-)

diff --git a/src/core/cpu_code_cache.cpp b/src/core/cpu_code_cache.cpp
index 23b823095..dcd8c68e3 100644
--- a/src/core/cpu_code_cache.cpp
+++ b/src/core/cpu_code_cache.cpp
@@ -1492,18 +1492,15 @@ void CPU::CodeCache::CommitFarCode(u32 length)
 
 void CPU::CodeCache::AlignCode(u32 alignment)
 {
-#if defined(CPU_ARCH_X64)
-  constexpr u8 padding_value = 0xcc; // int3
-#else
-  constexpr u8 padding_value = 0x00;
-#endif
-
   DebugAssert(Common::IsPow2(alignment));
   const u32 num_padding_bytes =
     std::min(static_cast<u32>(Common::AlignUpPow2(reinterpret_cast<uintptr_t>(s_free_code_ptr), alignment) -
                               reinterpret_cast<uintptr_t>(s_free_code_ptr)),
              GetFreeCodeSpace());
-  std::memset(s_free_code_ptr, padding_value, num_padding_bytes);
+
+  if (num_padding_bytes > 0)
+    EmitAlignmentPadding(s_free_code_ptr, num_padding_bytes);
+
   s_free_code_ptr += num_padding_bytes;
   s_code_used += num_padding_bytes;
 }
diff --git a/src/core/cpu_code_cache_private.h b/src/core/cpu_code_cache_private.h
index 2e574e971..0b6697c24 100644
--- a/src/core/cpu_code_cache_private.h
+++ b/src/core/cpu_code_cache_private.h
@@ -247,6 +247,7 @@ bool HasPreviouslyFaultedOnPC(u32 guest_pc);
 
 u32 EmitASMFunctions(void* code, u32 code_size);
 u32 EmitJump(void* code, const void* dst, bool flush_icache);
+void EmitAlignmentPadding(void* dst, size_t size);
 
 void DisassembleAndLogHostCode(const void* start, u32 size);
 u32 GetHostInstructionCount(const void* start, u32 size);
diff --git a/src/core/cpu_recompiler.cpp b/src/core/cpu_recompiler.cpp
index f8cb96113..bff8a4615 100644
--- a/src/core/cpu_recompiler.cpp
+++ b/src/core/cpu_recompiler.cpp
@@ -34,7 +34,7 @@ CPU::Recompiler::Recompiler::Recompiler() = default;
 CPU::Recompiler::Recompiler::~Recompiler() = default;
 
 void CPU::Recompiler::Recompiler::Reset(CodeCache::Block* block, u8* code_buffer, u32 code_buffer_space,
-                                  u8* far_code_buffer, u32 far_code_space)
+                                        u8* far_code_buffer, u32 far_code_space)
 {
   m_block = block;
   m_compiler_pc = block->pc;
@@ -101,10 +101,12 @@ void CPU::Recompiler::Recompiler::BeginBlock()
 }
 
 const void* CPU::Recompiler::Recompiler::CompileBlock(CodeCache::Block* block, u32* host_code_size,
-                                                u32* host_far_code_size)
+                                                      u32* host_far_code_size)
 {
-  Reset(block, CPU::CodeCache::GetFreeCodePointer(), CPU::CodeCache::GetFreeCodeSpace(),
-        CPU::CodeCache::GetFreeFarCodePointer(), CPU::CodeCache::GetFreeFarCodeSpace());
+  CodeCache::AlignCode(FUNCTION_ALIGNMENT);
+
+  Reset(block, CodeCache::GetFreeCodePointer(), CodeCache::GetFreeCodeSpace(), CodeCache::GetFreeFarCodePointer(),
+        CodeCache::GetFreeFarCodeSpace());
 
   DEBUG_LOG("Block range: {:08X} -> {:08X}", block->pc, block->pc + block->size * 4);
 
@@ -144,8 +146,8 @@ const void* CPU::Recompiler::Recompiler::CompileBlock(CodeCache::Block* block, u
   const void* code = EndCompile(&code_size, &far_code_size);
   *host_code_size = code_size;
   *host_far_code_size = far_code_size;
-  CPU::CodeCache::CommitCode(code_size);
-  CPU::CodeCache::CommitFarCode(far_code_size);
+  CodeCache::CommitCode(code_size);
+  CodeCache::CommitFarCode(far_code_size);
 
   return code;
 }
@@ -651,7 +653,7 @@ const char* CPU::Recompiler::Recompiler::GetReadWriteModeString(u32 flags)
 }
 
 u32 CPU::Recompiler::Recompiler::AllocateHostReg(u32 flags, HostRegAllocType type /* = HR_TYPE_TEMP */,
-                                           Reg reg /* = Reg::count */)
+                                                 Reg reg /* = Reg::count */)
 {
   // Cancel any load delays before booting anything out
   if (flags & HR_MODE_WRITE && (type == HR_TYPE_CPU_REG || type == HR_TYPE_NEXT_LOAD_DELAY_VALUE))
@@ -753,7 +755,7 @@ u32 CPU::Recompiler::Recompiler::AllocateHostReg(u32 flags, HostRegAllocType typ
 }
 
 std::optional<u32> CPU::Recompiler::Recompiler::CheckHostReg(u32 flags, HostRegAllocType type /* = HR_TYPE_TEMP */,
-                                                       Reg reg /* = Reg::count */)
+                                                             Reg reg /* = Reg::count */)
 {
   for (u32 i = 0; i < NUM_HOST_REGS; i++)
   {
@@ -1158,7 +1160,8 @@ void CPU::Recompiler::Recompiler::RestoreHostState()
 }
 
 void CPU::Recompiler::Recompiler::AddLoadStoreInfo(void* code_address, u32 code_size, u32 address_register,
-                                             u32 data_register, MemoryAccessSize size, bool is_signed, bool is_load)
+                                                   u32 data_register, MemoryAccessSize size, bool is_signed,
+                                                   bool is_load)
 {
   DebugAssert(CodeCache::IsUsingFastmem());
   DebugAssert(address_register < NUM_HOST_REGS);
@@ -1367,8 +1370,8 @@ void CPU::Recompiler::Recompiler::CompileBranchDelaySlot(bool dirty_pc /* = true
 }
 
 void CPU::Recompiler::Recompiler::CompileTemplate(void (Recompiler::*const_func)(CompileFlags),
-                                            void (Recompiler::*func)(CompileFlags), const void* pgxp_cpu_func,
-                                            u32 tflags)
+                                                  void (Recompiler::*func)(CompileFlags), const void* pgxp_cpu_func,
+                                                  u32 tflags)
 {
   // TODO: This is where we will do memory operand optimization. Remember to kill constants!
   // TODO: Swap S and T if commutative
@@ -1733,7 +1736,7 @@ const TickCount* CPU::Recompiler::Recompiler::GetFetchMemoryAccessTimePtr() cons
 }
 
 void CPU::Recompiler::Recompiler::FlushForLoadStore(const std::optional<VirtualMemoryAddress>& address, bool store,
-                                              bool use_fastmem)
+                                                    bool use_fastmem)
 {
   if (use_fastmem)
     return;
diff --git a/src/core/cpu_recompiler.h b/src/core/cpu_recompiler.h
index b3f37a7e2..43a973339 100644
--- a/src/core/cpu_recompiler.h
+++ b/src/core/cpu_recompiler.h
@@ -34,6 +34,9 @@ public:
   static constexpr u32 NUM_HOST_REGS = 16;
   static constexpr bool HAS_MEMORY_OPERANDS = true;
 
+  // Align functions to 16 bytes.
+  static constexpr u32 FUNCTION_ALIGNMENT = 16;
+
 #elif defined(CPU_ARCH_ARM32)
 
   // A reasonable "maximum" number of bytes per instruction.
@@ -44,6 +47,9 @@ public:
   static constexpr u32 NUM_HOST_REGS = 16;
   static constexpr bool HAS_MEMORY_OPERANDS = false;
 
+  // Align functions to 4 bytes (word size).
+  static constexpr u32 FUNCTION_ALIGNMENT = 4;
+
 #elif defined(CPU_ARCH_ARM64)
 
   // A reasonable "maximum" number of bytes per instruction.
@@ -56,6 +62,9 @@ public:
   static constexpr u32 NUM_HOST_REGS = 32;
   static constexpr bool HAS_MEMORY_OPERANDS = false;
 
+  // Align functions to 16 bytes.
+  static constexpr u32 FUNCTION_ALIGNMENT = 16;
+
 #elif defined(CPU_ARCH_RISCV64)
 
  // Number of host registers.
@@ -68,6 +77,9 @@ public:
   static constexpr u32 MAX_NEAR_HOST_BYTES_PER_INSTRUCTION = 64;
   static constexpr u32 MIN_CODE_RESERVE_FOR_BLOCK = 512;
 
+  // Align functions to 16 bytes.
+  static constexpr u32 FUNCTION_ALIGNMENT = 16;
+
 #endif
 
 public:
diff --git a/src/core/cpu_recompiler_arm32.cpp b/src/core/cpu_recompiler_arm32.cpp
index e6b6935cb..1ece153ba 100644
--- a/src/core/cpu_recompiler_arm32.cpp
+++ b/src/core/cpu_recompiler_arm32.cpp
@@ -320,14 +320,17 @@ u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
 
   armAsm->FinalizeCode();
 
-#if 0
-  // TODO: align?
   s_trampoline_targets.clear();
   s_trampoline_start_ptr = static_cast<u8*>(code) + armAsm->GetCursorOffset();
   s_trampoline_used = 0;
-#endif
 
-  return static_cast<u32>(armAsm->GetCursorOffset()) /* + TRAMPOLINE_AREA_SIZE*/;
+  return static_cast<u32>(armAsm->GetCursorOffset()) + TRAMPOLINE_AREA_SIZE;
+}
+
+void CPU::CodeCache::EmitAlignmentPadding(void* dst, size_t size)
+{
+  constexpr u8 padding_value = 0x00;
+  std::memset(dst, padding_value, size);
 }
 
 CPU::ARM32Recompiler::ARM32Recompiler() : m_emitter(A32), m_far_emitter(A32)
@@ -1025,7 +1028,8 @@ void CPU::ARM32Recompiler::Flush(u32 flags)
 
 void CPU::ARM32Recompiler::Compile_Fallback()
 {
-  WARNING_LOG("Compiling instruction fallback at PC=0x{:08X}, instruction=0x{:08X}", m_current_instruction_pc, inst->bits);
+  WARNING_LOG("Compiling instruction fallback at PC=0x{:08X}, instruction=0x{:08X}", m_current_instruction_pc,
+              inst->bits);
 
   Flush(FLUSH_FOR_INTERPRETER);
 
diff --git a/src/core/cpu_recompiler_arm64.cpp b/src/core/cpu_recompiler_arm64.cpp
index 3f23402b8..c4aec70cc 100644
--- a/src/core/cpu_recompiler_arm64.cpp
+++ b/src/core/cpu_recompiler_arm64.cpp
@@ -41,19 +41,20 @@ LOG_CHANNEL(Recompiler);
 #define RSTATE vixl::aarch64::x19
 #define RMEMBASE vixl::aarch64::x20
 
-bool armIsCallerSavedRegister(u32 id);
-s64 armGetPCDisplacement(const void* current, const void* target);
-bool armIsInAdrpRange(vixl::aarch64::Assembler* armAsm, const void* addr);
-void armMoveAddressToReg(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg, const void* addr);
-void armEmitMov(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& rd, u64 imm);
-void armEmitJmp(vixl::aarch64::Assembler* armAsm, const void* ptr, bool force_inline);
-void armEmitCall(vixl::aarch64::Assembler* armAsm, const void* ptr, bool force_inline);
-void armEmitCondBranch(vixl::aarch64::Assembler* armAsm, vixl::aarch64::Condition cond, const void* ptr);
-void armEmitFarLoad(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg, const void* addr,
-                    bool sign_extend_word = false);
-void armEmitFarStore(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg, const void* addr,
-                     const vixl::aarch64::Register& tempreg = RXSCRATCH);
-u8* armGetJumpTrampoline(const void* target);
+static bool armIsCallerSavedRegister(u32 id);
+static s64 armGetPCDisplacement(const void* current, const void* target);
+static bool armIsInAdrpRange(vixl::aarch64::Assembler* armAsm, const void* addr);
+static void armMoveAddressToReg(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg, const void* addr);
+static void armEmitMov(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& rd, u64 imm);
+static void armEmitJmp(vixl::aarch64::Assembler* armAsm, const void* ptr, bool force_inline);
+static void armEmitCall(vixl::aarch64::Assembler* armAsm, const void* ptr, bool force_inline);
+static void armEmitCondBranch(vixl::aarch64::Assembler* armAsm, vixl::aarch64::Condition cond, const void* ptr);
+static void armEmitFarLoad(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg, const void* addr,
+                           bool sign_extend_word = false);
+static void armEmitFarStore(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg, const void* addr,
+                            const vixl::aarch64::Register& tempreg = RXSCRATCH);
+static u8* armGetJumpTrampoline(const void* target);
+static void armAlignCode(vixl::aarch64::Assembler* armAsm, size_t alignment);
 
 static constexpr u32 TRAMPOLINE_AREA_SIZE = 4 * 1024;
 static std::unordered_map<const void*, u32> s_trampoline_targets;
@@ -327,8 +328,8 @@ void armEmitFarLoad(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Regis
   armAsm->ldr(reg, memop);
 }
 
-void armEmitFarStore(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg, const void* addr,
-                     const vixl::aarch64::Register& tempreg)
+[[maybe_unused]] void armEmitFarStore(vixl::aarch64::Assembler* armAsm, const vixl::aarch64::Register& reg,
+                                      const void* addr, const vixl::aarch64::Register& tempreg)
 {
   DebugAssert(tempreg.IsX());
 
@@ -359,7 +360,7 @@ u8* armGetJumpTrampoline(const void* target)
     return s_trampoline_start_ptr + it->second;
 
   // align to 16 bytes?
-  const u32 offset = s_trampoline_used; // Common::AlignUpPow2(s_trampoline_used, 16);
+  const u32 offset = Common::AlignUpPow2(s_trampoline_used, CPU::Recompiler::FUNCTION_ALIGNMENT);
 
   // 4 movs plus a jump
   if (TRAMPOLINE_AREA_SIZE - offset < 20)
@@ -387,6 +388,17 @@ u8* armGetJumpTrampoline(const void* target)
   return start;
 }
 
+void armAlignCode(vixl::aarch64::Assembler* armAsm, size_t alignment)
+{
+  size_t addr = armAsm->GetCursorAddress<size_t>();
+  const size_t end_addr = Common::AlignUpPow2(addr, alignment);
+  while (addr != end_addr)
+  {
+    armAsm->nop();
+    addr += vixl::aarch64::kInstructionSize;
+  }
+}
+
 void CPU::CodeCache::DisassembleAndLogHostCode(const void* start, u32 size)
 {
 #ifdef ENABLE_HOST_DISASSEMBLY
@@ -434,7 +446,7 @@ u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
   using namespace vixl::aarch64;
 
   Assembler actual_asm(static_cast<u8*>(code), code_size);
-  Assembler* armAsm = &actual_asm;
+  Assembler* RESTRICT armAsm = &actual_asm;
 
 #ifdef VIXL_DEBUG
   vixl::CodeBufferCheckScope asm_check(armAsm, code_size, vixl::CodeBufferCheckScope::kDontReserveBufferSpace);
 #endif
@@ -455,21 +467,19 @@ u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
   }
 
   // check events then for frame done
+  armAlignCode(armAsm, Recompiler::FUNCTION_ALIGNMENT);
   g_check_events_and_dispatch = armAsm->GetCursorAddress<const void*>();
   {
-    Label skip_event_check;
     armAsm->ldr(RWARG1, PTR(&g_state.pending_ticks));
     armAsm->ldr(RWARG2, PTR(&g_state.downcount));
     armAsm->cmp(RWARG1, RWARG2);
-    armAsm->b(&skip_event_check, lt);
+    armAsm->b(&dispatch, lt);
 
     g_run_events_and_dispatch = armAsm->GetCursorAddress<const void*>();
     armEmitCall(armAsm, reinterpret_cast<const void*>(&TimingEvents::RunEvents), true);
-
-    armAsm->bind(&skip_event_check);
   }
 
-  // TODO: align?
+  armAlignCode(armAsm, Recompiler::FUNCTION_ALIGNMENT);
   g_dispatcher = armAsm->GetCursorAddress<const void*>();
   {
     armAsm->bind(&dispatch);
@@ -486,6 +496,7 @@ u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
     armAsm->br(RXARG1);
   }
 
+  armAlignCode(armAsm, Recompiler::FUNCTION_ALIGNMENT);
   g_compile_or_revalidate_block = armAsm->GetCursorAddress<const void*>();
   {
     armAsm->ldr(RWARG1, PTR(&g_state.pc));
@@ -493,6 +504,7 @@ u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
     armAsm->b(&dispatch);
   }
 
+  armAlignCode(armAsm, Recompiler::FUNCTION_ALIGNMENT);
   g_discard_and_recompile_block = armAsm->GetCursorAddress<const void*>();
   {
     armAsm->ldr(RWARG1, PTR(&g_state.pc));
@@ -500,6 +512,7 @@ u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
     armAsm->b(&dispatch);
   }
 
+  armAlignCode(armAsm, Recompiler::FUNCTION_ALIGNMENT);
   g_interpret_block = armAsm->GetCursorAddress<const void*>();
   {
     armEmitCall(armAsm, reinterpret_cast<const void*>(GetInterpretUncachedBlockFunction()), true);
@@ -508,7 +521,6 @@ u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
 
   armAsm->FinalizeCode();
 
-  // TODO: align?
   s_trampoline_targets.clear();
   s_trampoline_start_ptr = static_cast<u8*>(code) + armAsm->GetCursorOffset();
   s_trampoline_used = 0;
@@ -516,6 +528,12 @@ u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
   return static_cast<u32>(armAsm->GetCursorOffset()) + TRAMPOLINE_AREA_SIZE;
 }
 
+void CPU::CodeCache::EmitAlignmentPadding(void* dst, size_t size)
+{
+  constexpr u8 padding_value = 0x00;
+  std::memset(dst, padding_value, size);
+}
+
 CPU::ARM64Recompiler::ARM64Recompiler() : m_emitter(PositionDependentCode), m_far_emitter(PositionIndependentCode)
 {
 }
@@ -1174,7 +1192,8 @@ void CPU::ARM64Recompiler::Flush(u32 flags)
 
 void CPU::ARM64Recompiler::Compile_Fallback()
 {
-  WARNING_LOG("Compiling instruction fallback at PC=0x{:08X}, instruction=0x{:08X}", m_current_instruction_pc, inst->bits);
+  WARNING_LOG("Compiling instruction fallback at PC=0x{:08X}, instruction=0x{:08X}", m_current_instruction_pc,
+              inst->bits);
 
   Flush(FLUSH_FOR_INTERPRETER);
 
diff --git a/src/core/cpu_recompiler_riscv64.cpp b/src/core/cpu_recompiler_riscv64.cpp
index 628692f82..2366bd591 100644
--- a/src/core/cpu_recompiler_riscv64.cpp
+++ b/src/core/cpu_recompiler_riscv64.cpp
@@ -317,6 +317,12 @@ u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
   return static_cast<u32>(rvAsm->GetCodeBuffer().GetSizeInBytes());
 }
 
+void CPU::CodeCache::EmitAlignmentPadding(void* dst, size_t size)
+{
+  constexpr u8 padding_value = 0x00;
+  std::memset(dst, padding_value, size);
+}
+
 u32 CPU::CodeCache::EmitJump(void* code, const void* dst, bool flush_icache)
 {
   // TODO: get rid of assembler construction here
@@ -998,7 +1004,8 @@ void CPU::RISCV64Recompiler::Flush(u32 flags)
 
 void CPU::RISCV64Recompiler::Compile_Fallback()
 {
-  WARNING_LOG("Compiling instruction fallback at PC=0x{:08X}, instruction=0x{:08X}", m_current_instruction_pc, inst->bits);
+  WARNING_LOG("Compiling instruction fallback at PC=0x{:08X}, instruction=0x{:08X}", m_current_instruction_pc,
+              inst->bits);
 
   Flush(FLUSH_FOR_INTERPRETER);
 
diff --git a/src/core/cpu_recompiler_x64.cpp b/src/core/cpu_recompiler_x64.cpp
index 1201ad73e..be18802cd 100644
--- a/src/core/cpu_recompiler_x64.cpp
+++ b/src/core/cpu_recompiler_x64.cpp
@@ -36,6 +36,7 @@ LOG_CHANNEL(Recompiler);
 // PGXP TODO: LWL etc, MFC0
 // PGXP TODO: Spyro 1 level gates have issues.
 
+static constexpr u32 FUNCTION_ALIGNMENT = 16;
 static constexpr u32 BACKPATCH_JMP_SIZE = 5;
 
 static bool IsCallerSavedRegister(u32 id);
@@ -134,20 +135,18 @@ u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
   }
 
   // check events then for frame done
+  cg->align(FUNCTION_ALIGNMENT);
   g_check_events_and_dispatch = cg->getCurr();
   {
-    Label skip_event_check;
     cg->mov(RWARG1, cg->dword[PTR(&g_state.pending_ticks)]);
     cg->cmp(RWARG1, cg->dword[PTR(&g_state.downcount)]);
-    cg->jl(skip_event_check);
+    cg->jl(dispatch);
 
     g_run_events_and_dispatch = cg->getCurr();
     cg->call(reinterpret_cast<const void*>(&TimingEvents::RunEvents));
-
-    cg->L(skip_event_check);
   }
 
-  // TODO: align?
+  cg->align(FUNCTION_ALIGNMENT);
   g_dispatcher = cg->getCurr();
   {
     cg->L(dispatch);
@@ -164,6 +163,7 @@ u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
     cg->jmp(cg->qword[RXARG2 + RXARG1 * 2]);
   }
 
+  cg->align(FUNCTION_ALIGNMENT);
   g_compile_or_revalidate_block = cg->getCurr();
   {
     cg->mov(RWARG1, cg->dword[PTR(&g_state.pc)]);
@@ -171,6 +171,7 @@ u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
     cg->jmp(dispatch);
   }
 
+  cg->align(FUNCTION_ALIGNMENT);
   g_discard_and_recompile_block = cg->getCurr();
   {
     cg->mov(RWARG1, cg->dword[PTR(&g_state.pc)]);
@@ -178,6 +179,7 @@ u32 CPU::CodeCache::EmitASMFunctions(void* code, u32 code_size)
     cg->jmp(dispatch);
   }
 
+  cg->align(FUNCTION_ALIGNMENT);
   g_interpret_block = cg->getCurr();
   {
     cg->call(CodeCache::GetInterpretUncachedBlockFunction());
@@ -201,6 +203,32 @@ u32 CPU::CodeCache::EmitJump(void* code, const void* dst, bool flush_icache)
   return 5;
 }
 
+void CPU::CodeCache::EmitAlignmentPadding(void* dst, size_t size)
+{
+  // Copied from Xbyak nop(), to avoid constructing a CodeGenerator.
+  static const uint8_t nopTbl[9][9] = {
+    {0x90},
+    {0x66, 0x90},
+    {0x0F, 0x1F, 0x00},
+    {0x0F, 0x1F, 0x40, 0x00},
+    {0x0F, 0x1F, 0x44, 0x00, 0x00},
+    {0x66, 0x0F, 0x1F, 0x44, 0x00, 0x00},
+    {0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00},
+    {0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+    {0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+  };
+  const size_t n = sizeof(nopTbl) / sizeof(nopTbl[0]);
+  u8* dst_ptr = static_cast<u8*>(dst);
+  while (size > 0)
+  {
+    size_t len = (std::min)(n, size);
+    const uint8_t* seq = nopTbl[len - 1];
+    std::memcpy(dst_ptr, seq, len);
+    dst_ptr += len;
+    size -= len;
+  }
+}
+
 #ifdef ENABLE_HOST_DISASSEMBLY
 
 static ZydisFormatterFunc s_old_print_address;
@@ -929,7 +957,8 @@ void CPU::X64Recompiler::Flush(u32 flags)
 
 void CPU::X64Recompiler::Compile_Fallback()
 {
-  WARNING_LOG("Compiling instruction fallback at PC=0x{:08X}, instruction=0x{:08X}", m_current_instruction_pc, inst->bits);
+  WARNING_LOG("Compiling instruction fallback at PC=0x{:08X}, instruction=0x{:08X}", m_current_instruction_pc,
+              inst->bits);
 
   Flush(FLUSH_FOR_INTERPRETER);