NDR: For barrier micro-ops, lazily mark written registers instead of iterating

Also remove some dead code in there.

A further 4-5% performance improvement was observed.
This commit is contained in:
Cacodemon345
2025-05-06 16:25:09 +06:00
parent 864e01b0e2
commit aafd2f22f5
4 changed files with 62 additions and 34 deletions

View File

@@ -38,7 +38,7 @@ codegen_ir_set_unroll(int count, int start, int first_instruction)
static void static void
duplicate_uop(ir_data_t *ir, uop_t *uop, int offset) duplicate_uop(ir_data_t *ir, uop_t *uop, int offset)
{ {
uop_t *new_uop = uop_alloc(ir, uop->type); uop_t *new_uop = uop_alloc_unroll(ir, uop->type);
if (!ir_reg_is_invalid(uop->src_reg_a)) if (!ir_reg_is_invalid(uop->src_reg_a))
new_uop->src_reg_a = codegen_reg_read(uop->src_reg_a.reg); new_uop->src_reg_a = codegen_reg_read(uop->src_reg_a.reg);

View File

@@ -377,6 +377,34 @@ uop_alloc(ir_data_t *ir, uint32_t uop_type)
uop->jump_dest_uop = -1; uop->jump_dest_uop = -1;
uop->jump_list_next = -1; uop->jump_list_next = -1;
if (uop_type & (UOP_TYPE_BARRIER | UOP_TYPE_ORDER_BARRIER))
dirty_ir_regs[0] = dirty_ir_regs[1] = ~0ULL;
return uop;
}
static inline uop_t *
uop_alloc_unroll(ir_data_t *ir, uint32_t uop_type)
{
uop_t *uop;
if (ir->wr_pos >= UOP_NR_MAX)
fatal("Exceeded uOP max\n");
uop = &ir->uops[ir->wr_pos++];
uop->is_a16 = 0;
uop->dest_reg_a = invalid_ir_reg;
uop->src_reg_a = invalid_ir_reg;
uop->src_reg_b = invalid_ir_reg;
uop->src_reg_c = invalid_ir_reg;
uop->pc = cpu_state.oldpc;
uop->jump_dest_uop = -1;
uop->jump_list_next = -1;
if (uop_type & (UOP_TYPE_BARRIER | UOP_TYPE_ORDER_BARRIER)) if (uop_type & (UOP_TYPE_BARRIER | UOP_TYPE_ORDER_BARRIER))
codegen_reg_mark_as_required(); codegen_reg_mark_as_required();

View File

@@ -34,6 +34,8 @@ typedef struct host_reg_set_t {
static host_reg_set_t host_reg_set; static host_reg_set_t host_reg_set;
static host_reg_set_t host_fp_reg_set; static host_reg_set_t host_fp_reg_set;
/* Per-IR-register "dirty" bitmask, stored as two 64-bit words (word = reg >> 6,
   bit = reg & 0x3f).
   - uop_alloc() sets every bit when it allocates a barrier or order-barrier uOP.
   - codegen_reg_write() tests the bit of the register being written; if set, it
     clears the bit and, for registers strictly between IREG_EBX and IREG_temp0
     that already have a version > 0, flags that last version REG_FLAGS_REQUIRED.
   - codegen_reg_mark_as_required() and codegen_reg_reset() clear both words.
   NOTE(review): codegen_reg_write() masks the word index with "& 3", which can
   evaluate to 2 or 3 if IREG_GET_REG(reg) is ever 128 or more, while this array
   has only two elements - confirm that no such register number can reach it. */
uint64_t dirty_ir_regs[2] = { 0, 0 };
enum { enum {
REG_BYTE, REG_BYTE,
REG_WORD, REG_WORD,
@@ -184,6 +186,24 @@ struct
[IREG_temp1d] = { REG_DOUBLE, (void *) 48, REG_FP, REG_VOLATILE }, [IREG_temp1d] = { REG_DOUBLE, (void *) 48, REG_FP, REG_VOLATILE },
}; };
/* Truth table consulted by reg_is_native_size():
       native_requested_sizes[native_size][requested_size >> IREG_SIZE_SHIFT]
   An entry is 1 when the requested access size counts as the native size of a
   register with that native type. Every entry not listed below is implicitly 0. */
static const uint8_t native_requested_sizes[9][8] = {
    /* Byte-typed registers: byte accesses only. */
    [REG_BYTE] = { [IREG_SIZE_B >> IREG_SIZE_SHIFT] = 1 },
    [REG_FPU_ST_BYTE] = { [IREG_SIZE_B >> IREG_SIZE_SHIFT] = 1 },

    /* Word and dword registers: only the matching size. */
    [REG_WORD] = { [IREG_SIZE_W >> IREG_SIZE_SHIFT] = 1 },
    [REG_DWORD] = { [IREG_SIZE_L >> IREG_SIZE_SHIFT] = 1 },

    /* Qword and double registers (plain and FPU-stack): both the D and Q sizes. */
    [REG_QWORD] = {
        [IREG_SIZE_D >> IREG_SIZE_SHIFT] = 1,
        [IREG_SIZE_Q >> IREG_SIZE_SHIFT] = 1
    },
    [REG_FPU_ST_QWORD] = {
        [IREG_SIZE_D >> IREG_SIZE_SHIFT] = 1,
        [IREG_SIZE_Q >> IREG_SIZE_SHIFT] = 1
    },
    [REG_DOUBLE] = {
        [IREG_SIZE_D >> IREG_SIZE_SHIFT] = 1,
        [IREG_SIZE_Q >> IREG_SIZE_SHIFT] = 1
    },
    [REG_FPU_ST_DOUBLE] = {
        [IREG_SIZE_D >> IREG_SIZE_SHIFT] = 1,
        [IREG_SIZE_Q >> IREG_SIZE_SHIFT] = 1
    },

    /* Pointer registers: L when the host pointer is 4 bytes wide, otherwise Q. */
    [REG_POINTER] = {
        [(sizeof(void *) == 4) ? (IREG_SIZE_L >> IREG_SIZE_SHIFT) : (IREG_SIZE_Q >> IREG_SIZE_SHIFT)] = 1
    }
};
void void
codegen_reg_mark_as_required(void) codegen_reg_mark_as_required(void)
{ {
@@ -195,6 +215,7 @@ codegen_reg_mark_as_required(void)
if (last_version > 0) if (last_version > 0)
reg_version[reg][last_version].flags |= REG_FLAGS_REQUIRED; reg_version[reg][last_version].flags |= REG_FLAGS_REQUIRED;
} }
dirty_ir_regs[0] = dirty_ir_regs[1] = 0;
} }
int int
@@ -203,29 +224,7 @@ reg_is_native_size(ir_reg_t ir_reg)
int native_size = ireg_data[IREG_GET_REG(ir_reg.reg)].native_size; int native_size = ireg_data[IREG_GET_REG(ir_reg.reg)].native_size;
int requested_size = IREG_GET_SIZE(ir_reg.reg); int requested_size = IREG_GET_SIZE(ir_reg.reg);
switch (native_size) { return native_requested_sizes[native_size][requested_size >> IREG_SIZE_SHIFT];
case REG_BYTE:
case REG_FPU_ST_BYTE:
return (requested_size == IREG_SIZE_B);
case REG_WORD:
return (requested_size == IREG_SIZE_W);
case REG_DWORD:
return (requested_size == IREG_SIZE_L);
case REG_QWORD:
case REG_FPU_ST_QWORD:
case REG_DOUBLE:
case REG_FPU_ST_DOUBLE:
return ((requested_size == IREG_SIZE_D) || (requested_size == IREG_SIZE_Q));
case REG_POINTER:
if (sizeof(void *) == 4)
return (requested_size == IREG_SIZE_L);
return (requested_size == IREG_SIZE_Q);
default:
fatal("get_reg_is_native_size: unknown native size %i\n", native_size);
}
return 0;
} }
void void
@@ -258,6 +257,8 @@ codegen_reg_reset(void)
host_fp_reg_set.locked = 0; host_fp_reg_set.locked = 0;
host_fp_reg_set.nr_regs = CODEGEN_HOST_FP_REGS; host_fp_reg_set.nr_regs = CODEGEN_HOST_FP_REGS;
dirty_ir_regs[0] = dirty_ir_regs[1] = 0;
for (c = 0; c < IREG_COUNT; c++) { for (c = 0; c < IREG_COUNT; c++) {
reg_last_version[c] = 0; reg_last_version[c] = 0;
reg_version[c][0].refcount = 0; reg_version[c][0].refcount = 0;

View File

@@ -278,6 +278,7 @@ ireg_seg_limit_high(x86seg *seg)
} }
extern uint8_t reg_last_version[IREG_COUNT]; extern uint8_t reg_last_version[IREG_COUNT];
extern uint64_t dirty_ir_regs[2];
/*This version of the register must be calculated, regardless of whether it is /*This version of the register must be calculated, regardless of whether it is
apparently required or not. Do not optimise out.*/ apparently required or not. Do not optimise out.*/
@@ -362,10 +363,12 @@ codegen_reg_write(int reg, int uop_nr)
int last_version = reg_last_version[IREG_GET_REG(reg)]; int last_version = reg_last_version[IREG_GET_REG(reg)];
reg_version_t *version; reg_version_t *version;
#ifndef RELEASE_BUILD if (dirty_ir_regs[(IREG_GET_REG(reg) >> 6) & 3] & (1ull << ((uint64_t)IREG_GET_REG(reg) & 0x3full))) {
if (IREG_GET_REG(reg) == IREG_INVALID) dirty_ir_regs[(IREG_GET_REG(reg) >> 6) & 3] &= ~(1ull << ((uint64_t)IREG_GET_REG(reg) & 0x3full));
fatal("codegen_reg_write - IREG_INVALID\n"); if ((IREG_GET_REG(reg) > IREG_EBX && IREG_GET_REG(reg) < IREG_temp0) && last_version > 0) {
#endif reg_version[IREG_GET_REG(reg)][last_version].flags |= REG_FLAGS_REQUIRED;
}
}
ireg.reg = reg; ireg.reg = reg;
ireg.version = last_version + 1; ireg.version = last_version + 1;
@@ -375,11 +378,7 @@ codegen_reg_write(int reg, int uop_nr)
} }
reg_last_version[IREG_GET_REG(reg)]++; reg_last_version[IREG_GET_REG(reg)]++;
#ifndef RELEASE_BUILD
if (!reg_last_version[IREG_GET_REG(reg)])
fatal("codegen_reg_write - version overflow\n");
else
#endif
if (reg_last_version[IREG_GET_REG(reg)] > REG_VERSION_MAX) if (reg_last_version[IREG_GET_REG(reg)] > REG_VERSION_MAX)
CPU_BLOCK_END(); CPU_BLOCK_END();
if (reg_last_version[IREG_GET_REG(reg)] > max_version_refcount) if (reg_last_version[IREG_GET_REG(reg)] > max_version_refcount)