Merge pull request #5546 from Cacodemon345/ndr-reg-optimizations

New dynamic recompiler register optimizations
Merged by Miran Grča on 2025-07-11 11:11:14 +02:00 (committed via GitHub)
10 changed files with 377 additions and 139 deletions


@@ -746,8 +746,7 @@ codegen_skip:
             uop_MOV_PTR(ir, IREG_ea_seg, (void *) op_ea_seg);
         if (op_ssegs != last_op_ssegs)
             uop_MOV_IMM(ir, IREG_ssegs, op_ssegs);
-        uop_LOAD_FUNC_ARG_IMM(ir, 0, fetchdat);
-        uop_CALL_INSTRUCTION_FUNC(ir, op);
+        uop_CALL_INSTRUCTION_FUNC(ir, op, fetchdat);
         codegen_flags_changed = 0;
         codegen_mark_code_present(block, cs + cpu_state.pc, 8);
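
With this change fetchdat no longer needs its own UOP_LOAD_FUNC_ARG_IMM uop: it travels inside the CALL_INSTRUCTION_FUNC uop as imm_data, and each backend loads it into the first argument register itself, saving one uop per translated instruction. A minimal sketch of the handler convention this relies on (the typedef and wrapper below are illustrative, not code from the commit):

    #include <stdint.h>

    /* Instruction handlers receive the instruction's fetchdat word and return
       non-zero when the translated block has to be exited early. */
    typedef int (*insn_handler_t)(uint32_t fetchdat);

    /* What the emitted call effectively does at run time (illustrative). */
    static int call_instruction_func(insn_handler_t op, uint32_t fetchdat)
    {
        return op(fetchdat); /* non-zero means branch to codegen_exit_rout */
    }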


@@ -218,6 +218,7 @@ codegen_CALL_FUNC_RESULT(codeblock_t *block, uop_t *uop)
 static int
 codegen_CALL_INSTRUCTION_FUNC(codeblock_t *block, uop_t *uop)
 {
+    host_arm64_mov_imm(block, REG_ARG0, uop->imm_data);
     host_arm64_call(block, uop->p);
     host_arm64_CBNZ(block, REG_X0, (uintptr_t) codegen_exit_rout);


@@ -286,6 +286,7 @@ codegen_CALL_FUNC_RESULT(codeblock_t *block, uop_t *uop)
 static int
 codegen_CALL_INSTRUCTION_FUNC(codeblock_t *block, uop_t *uop)
 {
+    host_arm_MOV_IMM(block, REG_ARG0, uop->imm_data);
     host_arm_call(block, uop->p);
     host_arm_TST_REG(block, REG_R0, REG_R0);
     host_arm_BNE(block, (uintptr_t) codegen_exit_rout);

File diff suppressed because it is too large.


@@ -219,6 +219,11 @@ codegen_CALL_FUNC_RESULT(codeblock_t *block, uop_t *uop)
 static int
 codegen_CALL_INSTRUCTION_FUNC(codeblock_t *block, uop_t *uop)
 {
+#    if _WIN64
+    host_x86_MOV32_REG_IMM(block, REG_ECX, uop->imm_data);
+#    else
+    host_x86_MOV32_REG_IMM(block, REG_EDI, uop->imm_data);
+#    endif
     host_x86_CALL(block, uop->p);
     host_x86_TEST32_REG(block, REG_EAX, REG_EAX);
     host_x86_JNZ(block, codegen_exit_rout);


@@ -221,6 +221,7 @@ codegen_CALL_FUNC_RESULT(codeblock_t *block, uop_t *uop)
 static int
 codegen_CALL_INSTRUCTION_FUNC(codeblock_t *block, uop_t *uop)
 {
+    host_x86_MOV32_STACK_IMM(block, STACK_ARG0, uop->imm_data);
     host_x86_CALL(block, uop->p);
     host_x86_TEST32_REG(block, REG_EAX, REG_EAX);
     host_x86_JNZ(block, codegen_exit_rout);
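
All four backends do the same thing: place uop->imm_data, the fetchdat word, wherever the platform ABI expects the first integer argument. That is W0/X0 (REG_ARG0) on arm64, R0 on 32-bit arm, ECX under the Microsoft x64 convention, EDI under the System V AMD64 ABI, and the first stack argument slot (STACK_ARG0) on 32-bit x86. A plain-C equivalent, illustrative only, where the compiler performs that placement automatically:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    static int dummy_handler(uint32_t fetchdat)
    {
        printf("fetchdat = %08" PRIx32 "\n", fetchdat);
        return 0; /* 0 means keep running the block */
    }

    int main(void)
    {
        /* A C compiler puts the argument in X0, R0, ECX, EDI or on the stack
           according to the target ABI; the JIT backends above have to emit
           that move explicitly before the call. */
        return dummy_handler(0x12345678u);
    }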


@@ -38,7 +38,7 @@ codegen_ir_set_unroll(int count, int start, int first_instruction)
 static void
 duplicate_uop(ir_data_t *ir, uop_t *uop, int offset)
 {
-    uop_t *new_uop = uop_alloc(ir, uop->type);
+    uop_t *new_uop = uop_alloc_unroll(ir, uop->type);

     if (!ir_reg_is_invalid(uop->src_reg_a))
         new_uop->src_reg_a = codegen_reg_read(uop->src_reg_a.reg);


@@ -41,8 +41,8 @@
 #define UOP_LOAD_FUNC_ARG_2_IMM (UOP_TYPE_PARAMS_IMM | 0x0a | UOP_TYPE_BARRIER)
 #define UOP_LOAD_FUNC_ARG_3_IMM (UOP_TYPE_PARAMS_IMM | 0x0b | UOP_TYPE_BARRIER)
 #define UOP_CALL_FUNC (UOP_TYPE_PARAMS_POINTER | 0x10 | UOP_TYPE_BARRIER)
-/*UOP_CALL_INSTRUCTION_FUNC - call instruction handler at p, check return value and exit block if non-zero*/
-#define UOP_CALL_INSTRUCTION_FUNC (UOP_TYPE_PARAMS_POINTER | 0x11 | UOP_TYPE_BARRIER)
+/*UOP_CALL_INSTRUCTION_FUNC - call instruction handler at p with fetchdat, check return value and exit block if non-zero*/
+#define UOP_CALL_INSTRUCTION_FUNC (UOP_TYPE_PARAMS_POINTER | UOP_TYPE_PARAMS_IMM | 0x11 | UOP_TYPE_BARRIER)
 #define UOP_STORE_P_IMM (UOP_TYPE_PARAMS_IMM | 0x12)
 #define UOP_STORE_P_IMM_8 (UOP_TYPE_PARAMS_IMM | 0x13)
 /*UOP_LOAD_SEG - load segment in src_reg_a to segment p via loadseg(), check return value and exit block if non-zero*/
@@ -377,6 +377,34 @@ uop_alloc(ir_data_t *ir, uint32_t uop_type)
     uop->jump_dest_uop = -1;
     uop->jump_list_next = -1;
+    if (uop_type & (UOP_TYPE_BARRIER | UOP_TYPE_ORDER_BARRIER))
+        dirty_ir_regs[0] = dirty_ir_regs[1] = ~0ULL;
     return uop;
 }
+static inline uop_t *
+uop_alloc_unroll(ir_data_t *ir, uint32_t uop_type)
+{
+    uop_t *uop;
+    if (ir->wr_pos >= UOP_NR_MAX)
+        fatal("Exceeded uOP max\n");
+    uop = &ir->uops[ir->wr_pos++];
+    uop->is_a16 = 0;
+    uop->dest_reg_a = invalid_ir_reg;
+    uop->src_reg_a = invalid_ir_reg;
+    uop->src_reg_b = invalid_ir_reg;
+    uop->src_reg_c = invalid_ir_reg;
+    uop->pc = cpu_state.oldpc;
+    uop->jump_dest_uop = -1;
+    uop->jump_list_next = -1;
+    if (uop_type & (UOP_TYPE_BARRIER | UOP_TYPE_ORDER_BARRIER))
+        codegen_reg_mark_as_required();
@@ -662,7 +690,7 @@ uop_gen_reg_src2_pointer(uint32_t uop_type, ir_data_t *ir, int src_reg_a, int sr
 #define uop_CALL_FUNC(ir, p) uop_gen_pointer(UOP_CALL_FUNC, ir, p)
 #define uop_CALL_FUNC_RESULT(ir, dst_reg, p) uop_gen_reg_dst_pointer(UOP_CALL_FUNC_RESULT, ir, dst_reg, p)
-#define uop_CALL_INSTRUCTION_FUNC(ir, p) uop_gen_pointer(UOP_CALL_INSTRUCTION_FUNC, ir, p)
+#define uop_CALL_INSTRUCTION_FUNC(ir, p, imm) uop_gen_pointer_imm(UOP_CALL_INSTRUCTION_FUNC, ir, p, imm)
 #define uop_CMP_IMM_JZ(ir, src_reg, imm, p) uop_gen_reg_src_pointer_imm(UOP_CMP_IMM_JZ, ir, src_reg, p, imm)
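
UOP_CALL_INSTRUCTION_FUNC is now typed as both UOP_TYPE_PARAMS_POINTER and UOP_TYPE_PARAMS_IMM, and the uop_CALL_INSTRUCTION_FUNC macro forwards the extra imm argument through uop_gen_pointer_imm, presumably filling both uop->p (the call target) and uop->imm_data (fetchdat), which are exactly the two fields the backends read back. A simplified, self-contained sketch of such a generator (the struct and names below are stand-ins, not the project's real uop_t):

    #include <stdint.h>

    typedef struct {
        uint32_t type;
        void    *p;        /* instruction handler to call */
        uint32_t imm_data; /* immediate for that handler, here fetchdat */
    } mini_uop_t;

    static mini_uop_t uops[64];
    static int        wr_pos;

    static mini_uop_t *gen_pointer_imm(uint32_t type, void *p, uint32_t imm)
    {
        mini_uop_t *u = &uops[wr_pos++];

        u->type     = type;
        u->p        = p;
        u->imm_data = imm;
        return u;
    }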


@@ -34,6 +34,8 @@ typedef struct host_reg_set_t {
 static host_reg_set_t host_reg_set;
 static host_reg_set_t host_fp_reg_set;
+
+uint64_t dirty_ir_regs[2] = { 0, 0 };

 enum {
     REG_BYTE,
     REG_WORD,
@@ -184,15 +186,36 @@ struct
     [IREG_temp1d] = { REG_DOUBLE, (void *) 48, REG_FP, REG_VOLATILE },
 };

+static const uint8_t native_requested_sizes[9][8] =
+{
+    [REG_BYTE][IREG_SIZE_B >> IREG_SIZE_SHIFT] = 1,
+    [REG_FPU_ST_BYTE][IREG_SIZE_B >> IREG_SIZE_SHIFT] = 1,
+    [REG_WORD][IREG_SIZE_W >> IREG_SIZE_SHIFT] = 1,
+    [REG_DWORD][IREG_SIZE_L >> IREG_SIZE_SHIFT] = 1,
+    [REG_QWORD][IREG_SIZE_D >> IREG_SIZE_SHIFT] = 1,
+    [REG_FPU_ST_QWORD][IREG_SIZE_D >> IREG_SIZE_SHIFT] = 1,
+    [REG_DOUBLE][IREG_SIZE_D >> IREG_SIZE_SHIFT] = 1,
+    [REG_FPU_ST_DOUBLE][IREG_SIZE_D >> IREG_SIZE_SHIFT] = 1,
+    [REG_QWORD][IREG_SIZE_Q >> IREG_SIZE_SHIFT] = 1,
+    [REG_FPU_ST_QWORD][IREG_SIZE_Q >> IREG_SIZE_SHIFT] = 1,
+    [REG_DOUBLE][IREG_SIZE_Q >> IREG_SIZE_SHIFT] = 1,
+    [REG_FPU_ST_DOUBLE][IREG_SIZE_Q >> IREG_SIZE_SHIFT] = 1,
+    [REG_POINTER][(sizeof(void *) == 4) ? (IREG_SIZE_L >> IREG_SIZE_SHIFT) : (IREG_SIZE_Q >> IREG_SIZE_SHIFT)] = 1
+};
+
 void
 codegen_reg_mark_as_required(void)
 {
-    for (uint8_t reg = 0; reg < IREG_COUNT; reg++) {
+    /* This used to start from IREG_EAX, now only starts from IREG_ESP since the first 4 registers are never optimized out. */
+    /* It also no longer iterates through volatile registers unnecessarily. */
+    for (uint8_t reg = IREG_ESP; reg < IREG_temp0; reg++) {
         int last_version = reg_last_version[reg];
-        if (last_version > 0 && ireg_data[reg].is_volatile == REG_PERMANENT)
+        if (last_version > 0)
             reg_version[reg][last_version].flags |= REG_FLAGS_REQUIRED;
     }
+    dirty_ir_regs[0] = dirty_ir_regs[1] = 0;
 }

 int
@@ -201,29 +224,7 @@ reg_is_native_size(ir_reg_t ir_reg)
     int native_size = ireg_data[IREG_GET_REG(ir_reg.reg)].native_size;
     int requested_size = IREG_GET_SIZE(ir_reg.reg);

-    switch (native_size) {
-        case REG_BYTE:
-        case REG_FPU_ST_BYTE:
-            return (requested_size == IREG_SIZE_B);
-        case REG_WORD:
-            return (requested_size == IREG_SIZE_W);
-        case REG_DWORD:
-            return (requested_size == IREG_SIZE_L);
-        case REG_QWORD:
-        case REG_FPU_ST_QWORD:
-        case REG_DOUBLE:
-        case REG_FPU_ST_DOUBLE:
-            return ((requested_size == IREG_SIZE_D) || (requested_size == IREG_SIZE_Q));
-        case REG_POINTER:
-            if (sizeof(void *) == 4)
-                return (requested_size == IREG_SIZE_L);
-            return (requested_size == IREG_SIZE_Q);
-        default:
-            fatal("get_reg_is_native_size: unknown native size %i\n", native_size);
-    }
-    return 0;
+    return native_requested_sizes[native_size][requested_size >> IREG_SIZE_SHIFT];
 }

 void
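
reg_is_native_size() replaces the switch with a single lookup in native_requested_sizes, indexed by the native size and by the requested size shifted down to a small integer. Designated initializers mark the valid combinations with 1 and leave everything else 0; one behavioural difference is that the old default branch called fatal() for an unknown native size, while the table simply yields 0 for any pair that is not listed. A self-contained miniature of the same pattern (the names here are made up):

    #include <stdint.h>
    #include <stdio.h>

    enum { NAT_BYTE, NAT_WORD, NAT_DWORD, NAT_COUNT };
    enum { REQ_B, REQ_W, REQ_L, REQ_COUNT };

    /* Valid (native, requested) pairs are set to 1; every other slot stays
       zero, which stands in for "not the native size". */
    static const uint8_t is_native[NAT_COUNT][REQ_COUNT] = {
        [NAT_BYTE][REQ_B]  = 1,
        [NAT_WORD][REQ_W]  = 1,
        [NAT_DWORD][REQ_L] = 1,
    };

    int main(void)
    {
        /* prints "1 0": word/word is native, word/byte is not */
        printf("%d %d\n", is_native[NAT_WORD][REQ_W], is_native[NAT_WORD][REQ_B]);
        return 0;
    }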
@@ -256,6 +257,8 @@ codegen_reg_reset(void)
     host_fp_reg_set.locked = 0;
     host_fp_reg_set.nr_regs = CODEGEN_HOST_FP_REGS;

+    dirty_ir_regs[0] = dirty_ir_regs[1] = 0;
+
     for (c = 0; c < IREG_COUNT; c++) {
         reg_last_version[c] = 0;
         reg_version[c][0].refcount = 0;


@@ -16,59 +16,45 @@
 #define IREG_SIZE_Q (5 << IREG_SIZE_SHIFT)

 enum {
-    IREG_EAX = 0,
-    IREG_ECX = 1,
-    IREG_EDX = 2,
-    IREG_EBX = 3,
-    IREG_ESP = 4,
-    IREG_EBP = 5,
-    IREG_ESI = 6,
-    IREG_EDI = 7,
+    IREG_EAX,
+    IREG_ECX,
+    IREG_EDX,
+    IREG_EBX,
+    IREG_ESP,
+    IREG_EBP,
+    IREG_ESI,
+    IREG_EDI,

-    IREG_flags_op = 8,
-    IREG_flags_res = 9,
-    IREG_flags_op1 = 10,
-    IREG_flags_op2 = 11,
+    IREG_flags_op,
+    IREG_flags_res,
+    IREG_flags_op1,
+    IREG_flags_op2,

-    IREG_pc = 12,
-    IREG_oldpc = 13,
+    IREG_pc,
+    IREG_oldpc,

-    IREG_eaaddr = 14,
-    IREG_ea_seg = 15,
-    IREG_op32 = 16,
-    IREG_ssegsx = 17,
+    IREG_eaaddr,
+    IREG_ea_seg,
+    IREG_op32,
+    IREG_ssegsx,

-    IREG_rm_mod_reg = 18,
+    IREG_rm_mod_reg,

-    IREG_acycs = 19,
-    IREG_cycles = 20,
+    IREG_cycles,

-    IREG_CS_base = 21,
-    IREG_DS_base = 22,
-    IREG_ES_base = 23,
-    IREG_FS_base = 24,
-    IREG_GS_base = 25,
-    IREG_SS_base = 26,
+    IREG_CS_base,
+    IREG_DS_base,
+    IREG_ES_base,
+    IREG_FS_base,
+    IREG_GS_base,
+    IREG_SS_base,

-    IREG_CS_seg = 27,
-    IREG_DS_seg = 28,
-    IREG_ES_seg = 29,
-    IREG_FS_seg = 30,
-    IREG_GS_seg = 31,
-    IREG_SS_seg = 32,
-    /*Temporary registers are stored on the stack, and are not guaranteed to
-      be preserved across uOPs. They will not be written back if they will
-      not be read again.*/
-    IREG_temp0 = 33,
-    IREG_temp1 = 34,
-    IREG_temp2 = 35,
-    IREG_temp3 = 36,
-    IREG_FPU_TOP = 37,
-    IREG_temp0d = 38,
-    IREG_temp1d = 39,
+    IREG_CS_seg,
+    IREG_DS_seg,
+    IREG_ES_seg,
+    IREG_FS_seg,
+    IREG_GS_seg,
+    IREG_SS_seg,

     /*FPU stack registers are physical registers. Use IREG_ST() / IREG_tag()
       to access.
@@ -76,66 +62,79 @@ enum {
   used directly to index the stack. When it is clear, the difference
   between the current value of TOP and the value when the block was
   first compiled will be added to adjust for any changes in TOP.*/
-    IREG_ST0 = 40,
-    IREG_ST1 = 41,
-    IREG_ST2 = 42,
-    IREG_ST3 = 43,
-    IREG_ST4 = 44,
-    IREG_ST5 = 45,
-    IREG_ST6 = 46,
-    IREG_ST7 = 47,
+    IREG_ST0,
+    IREG_ST1,
+    IREG_ST2,
+    IREG_ST3,
+    IREG_ST4,
+    IREG_ST5,
+    IREG_ST6,
+    IREG_ST7,

-    IREG_tag0 = 48,
-    IREG_tag1 = 49,
-    IREG_tag2 = 50,
-    IREG_tag3 = 51,
-    IREG_tag4 = 52,
-    IREG_tag5 = 53,
-    IREG_tag6 = 54,
-    IREG_tag7 = 55,
+    IREG_tag0,
+    IREG_tag1,
+    IREG_tag2,
+    IREG_tag3,
+    IREG_tag4,
+    IREG_tag5,
+    IREG_tag6,
+    IREG_tag7,

-    IREG_ST0_i64 = 56,
-    IREG_ST1_i64 = 57,
-    IREG_ST2_i64 = 58,
-    IREG_ST3_i64 = 59,
-    IREG_ST4_i64 = 60,
-    IREG_ST5_i64 = 61,
-    IREG_ST6_i64 = 62,
-    IREG_ST7_i64 = 63,
+    IREG_ST0_i64,
+    IREG_ST1_i64,
+    IREG_ST2_i64,
+    IREG_ST3_i64,
+    IREG_ST4_i64,
+    IREG_ST5_i64,
+    IREG_ST6_i64,
+    IREG_ST7_i64,

-    IREG_MM0x = 64,
-    IREG_MM1x = 65,
-    IREG_MM2x = 66,
-    IREG_MM3x = 67,
-    IREG_MM4x = 68,
-    IREG_MM5x = 69,
-    IREG_MM6x = 70,
-    IREG_MM7x = 71,
+    IREG_MM0x,
+    IREG_MM1x,
+    IREG_MM2x,
+    IREG_MM3x,
+    IREG_MM4x,
+    IREG_MM5x,
+    IREG_MM6x,
+    IREG_MM7x,

-    IREG_NPXCx = 72,
-    IREG_NPXSx = 73,
+    IREG_NPXCx,
+    IREG_NPXSx,

-    IREG_flagsx = 74,
-    IREG_eflagsx = 75,
+    IREG_flagsx,
+    IREG_eflagsx,

-    IREG_CS_limit_low = 76,
-    IREG_DS_limit_low = 77,
-    IREG_ES_limit_low = 78,
-    IREG_FS_limit_low = 79,
-    IREG_GS_limit_low = 80,
-    IREG_SS_limit_low = 81,
+    IREG_CS_limit_low,
+    IREG_DS_limit_low,
+    IREG_ES_limit_low,
+    IREG_FS_limit_low,
+    IREG_GS_limit_low,
+    IREG_SS_limit_low,

-    IREG_CS_limit_high = 82,
-    IREG_DS_limit_high = 83,
-    IREG_ES_limit_high = 84,
-    IREG_FS_limit_high = 85,
-    IREG_GS_limit_high = 86,
-    IREG_SS_limit_high = 87,
+    IREG_CS_limit_high,
+    IREG_DS_limit_high,
+    IREG_ES_limit_high,
+    IREG_FS_limit_high,
+    IREG_GS_limit_high,
+    IREG_SS_limit_high,

-    IREG_eaa16 = 88,
-    IREG_x87_op = 89,
+    IREG_eaa16,
+    IREG_x87_op,

-    IREG_COUNT = 90,
+    IREG_FPU_TOP,
+    /*Temporary registers are stored on the stack, and are not guaranteed to
+      be preserved across uOPs. They will not be written back if they will
+      not be read again.*/
+    IREG_temp0,
+    IREG_temp1,
+    IREG_temp2,
+    IREG_temp3,
+    IREG_temp0d,
+    IREG_temp1d,
+    IREG_COUNT,

     IREG_INVALID = 255,
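
The explicit enumerator values are gone, IREG_acycs is dropped, and IREG_FPU_TOP plus the temporaries now come after every register held in permanent storage. With that ordering, a comparison such as reg < IREG_temp0, used by the codegen_reg changes elsewhere in this commit, cleanly means "any non-temporary register", and the compiler renumbers everything automatically whenever an entry is added or removed. A tiny illustration of the idea (names invented):

    enum example_ireg {
        EX_EAX,   /* 0 */
        EX_ECX,   /* 1 */
        /* ... more permanent registers ... */
        EX_temp0, /* first temporary: every value below this one is permanent */
        EX_temp1,
        EX_COUNT
    };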
@@ -279,6 +278,7 @@ ireg_seg_limit_high(x86seg *seg)
 }

 extern uint8_t reg_last_version[IREG_COUNT];
+extern uint64_t dirty_ir_regs[2];

 /*This version of the register must be calculated, regardless of whether it is
   apparently required or not. Do not optimise out.*/
@@ -363,10 +363,12 @@ codegen_reg_write(int reg, int uop_nr)
     int last_version = reg_last_version[IREG_GET_REG(reg)];
     reg_version_t *version;

 #ifndef RELEASE_BUILD
     if (IREG_GET_REG(reg) == IREG_INVALID)
         fatal("codegen_reg_write - IREG_INVALID\n");
 #endif
+    if (dirty_ir_regs[(IREG_GET_REG(reg) >> 6) & 3] & (1ull << ((uint64_t)IREG_GET_REG(reg) & 0x3full))) {
+        dirty_ir_regs[(IREG_GET_REG(reg) >> 6) & 3] &= ~(1ull << ((uint64_t)IREG_GET_REG(reg) & 0x3full));
+        if ((IREG_GET_REG(reg) > IREG_EBX && IREG_GET_REG(reg) < IREG_temp0) && last_version > 0) {
+            reg_version[IREG_GET_REG(reg)][last_version].flags |= REG_FLAGS_REQUIRED;
+        }
+    }

     ireg.reg = reg;
     ireg.version = last_version + 1;
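
codegen_reg_write() now consults dirty_ir_regs before bumping the version: the register index selects one of the two 64-bit words (index >> 6) and a bit inside it (index & 0x3f). A set bit means a barrier uop has been allocated since that register was last handled, so the bit is cleared and, for registers above IREG_EBX and below IREG_temp0, the previous version is flagged REG_FLAGS_REQUIRED so it cannot be optimised away across the barrier. The committed code also masks the word index with & 3, which selects the same word 0 or 1 for any register index below 128. A compact sketch of the bitset operations involved (the helper names are invented):

    #include <stdint.h>

    /* Illustrative helpers for a 2 x 64-bit bitset like dirty_ir_regs,
       covering up to 128 IR register indices. */
    static uint64_t dirty[2];

    static inline void set_all(void)         { dirty[0] = dirty[1] = ~0ULL; }            /* at a barrier uop      */
    static inline void clear_all(void)       { dirty[0] = dirty[1] = 0; }                /* reset / mark required */
    static inline int  test_bit(unsigned r)  { return (dirty[r >> 6] >> (r & 0x3f)) & 1; }
    static inline void clear_bit(unsigned r) { dirty[r >> 6] &= ~(1ULL << (r & 0x3f)); }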
@@ -376,12 +378,8 @@ codegen_reg_write(int reg, int uop_nr)
     }
     reg_last_version[IREG_GET_REG(reg)]++;
-#ifndef RELEASE_BUILD
-    if (!reg_last_version[IREG_GET_REG(reg)])
-        fatal("codegen_reg_write - version overflow\n");
-    else
-#endif
-        if (reg_last_version[IREG_GET_REG(reg)] > REG_VERSION_MAX)
+    if (reg_last_version[IREG_GET_REG(reg)] > REG_VERSION_MAX)
         CPU_BLOCK_END();
     if (reg_last_version[IREG_GET_REG(reg)] > max_version_refcount)
         max_version_refcount = reg_last_version[IREG_GET_REG(reg)];