Files
86Box/src/cpu/codegen_timing_p6.c
Jasmine Iwanek ad3eaf17a9 More unique names
2024-08-12 20:02:19 -04:00

2000 lines
90 KiB
C

/*Basic P6 timing model by plant/nerd73. Based on the K6 timing model*/
/*Some cycle timings come from https://www.agner.org/optimize/instruction_tables.pdf*/
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <wchar.h>
#include <86box/86box.h>
#include "cpu.h"
#include <86box/mem.h>
#include <86box/machine.h>
#include <86box/plat_unused.h>
#include "x86.h"
#include "x86_ops.h"
#include "x86seg_common.h"
#include "x87_sf.h"
#include "x87.h"
#include "386_common.h"
#include "codegen.h"
#include "codegen_ops.h"
#include "codegen_timing_common.h"
typedef enum uop_type_t {
UOP_ALU = 0, /*Executes in Port 0 or 1 ALU units*/
UOP_ALUP0, /*Executes in Port 0 ALU unit*/
UOP_LOAD, /*Executes in Load unit*/
UOP_STORED, /*Executes in Data Store unit*/
UOP_STOREA, /*Executes in Address Store unit*/
UOP_FLOAD, /*Executes in Load unit*/
UOP_FSTORED, /*Executes in Data Store unit*/
UOP_FSTOREA, /*Executes in Address Store unit*/
UOP_MLOAD, /*Executes in Load unit*/
UOP_MSTORED, /*Executes in Data Store unit*/
UOP_MSTOREA, /*Executes in Address Store unit*/
UOP_FLOAT, /*Executes in Floating Point unit*/
UOP_MMX, /*Executes in Port 0 or 1 ALU units as MMX*/
UOP_MMX_SHIFT, /*Executes in Port 1 ALU unit. Uses MMX shifter*/
UOP_MMX_MUL, /*Executes in Port 0 ALU unit. Uses MMX multiplier*/
UOP_BRANCH, /*Executes in Branch unit*/
UOP_FXCH /*Does not require an execution unit*/
} uop_type_t;
typedef enum decode_type_t {
DECODE_SIMPLE,
DECODE_COMPLEX,
} decode_type_t;
#define MAX_UOPS 10
typedef struct p6_uop_t {
uop_type_t type;
int latency;
} p6_uop_t;
typedef struct macro_op_t {
int nr_uops;
decode_type_t decode_type;
p6_uop_t uop[MAX_UOPS];
} macro_op_t;
static const macro_op_t alu_op = {
.nr_uops = 1,
.decode_type = DECODE_SIMPLE,
.uop[0] = {.type = UOP_ALU, .latency = 1}
};
static const macro_op_t alup0_op = {
.nr_uops = 1,
.decode_type = DECODE_SIMPLE,
.uop[0] = {.type = UOP_ALUP0, .latency = 1}
};
static const macro_op_t load_alu_op = {
.nr_uops = 2,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_LOAD, .latency = 1},
.uop[1] = { .type = UOP_ALU, .latency = 1}
};
static const macro_op_t load_alup0_op = {
.nr_uops = 2,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_LOAD, .latency = 1},
.uop[1] = { .type = UOP_ALUP0, .latency = 1}
};
static const macro_op_t alu_store_op = {
.nr_uops = 4,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_LOAD, .latency = 1},
.uop[1] = { .type = UOP_ALU, .latency = 1},
.uop[2] = { .type = UOP_STORED, .latency = 1},
.uop[3] = { .type = UOP_STOREA, .latency = 1}
};
static const macro_op_t alup0_store_op = {
.nr_uops = 4,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_LOAD, .latency = 1},
.uop[1] = { .type = UOP_ALUP0, .latency = 1},
.uop[2] = { .type = UOP_STORED, .latency = 1},
.uop[3] = { .type = UOP_STOREA, .latency = 1}
};
static const macro_op_t branch_op = {
.nr_uops = 1,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_BRANCH, .latency = 2}
};
static const macro_op_t fxch_op = {
.nr_uops = 1,
.decode_type = DECODE_SIMPLE,
.uop[0] = {.type = UOP_FXCH, .latency = 1}
};
static const macro_op_t load_op = {
.nr_uops = 1,
.decode_type = DECODE_SIMPLE,
.uop[0] = {.type = UOP_LOAD, .latency = 1}
};
static const macro_op_t store_op = {
.nr_uops = 2,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_STORED, .latency = 1},
.uop[1] = { .type = UOP_STOREA, .latency = 1}
};
static const macro_op_t bswap_op = {
.nr_uops = 2,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_ALU, .latency = 1},
.uop[1] = { .type = UOP_ALU, .latency = 1},
};
static const macro_op_t leave_op = {
.nr_uops = 3,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_LOAD, .latency = 1},
.uop[1] = { .type = UOP_ALU, .latency = 1},
.uop[2] = { .type = UOP_ALU, .latency = 1}
};
static const macro_op_t lods_op = {
.nr_uops = 2,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_LOAD, .latency = 1},
.uop[1] = { .type = UOP_ALU, .latency = 1}
};
static const macro_op_t loop_op = {
.nr_uops = 5,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_ALU, .latency = 1},
.uop[1] = { .type = UOP_ALU, .latency = 1},
.uop[2] = { .type = UOP_ALU, .latency = 1},
.uop[3] = { .type = UOP_ALU, .latency = 1},
.uop[4] = { .type = UOP_BRANCH, .latency = 1}
};
static const macro_op_t mov_reg_seg_op = {
.nr_uops = 1,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_LOAD, .latency = 1},
};
static const macro_op_t movs_op = {
.nr_uops = 4,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_LOAD, .latency = 1},
.uop[1] = { .type = UOP_STORED, .latency = 1},
.uop[2] = { .type = UOP_STOREA, .latency = 1},
.uop[3] = { .type = UOP_ALU, .latency = 1}
};
static const macro_op_t pop_reg_op = {
.nr_uops = 2,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_LOAD, .latency = 1},
.uop[1] = { .type = UOP_ALU, .latency = 1}
};
static const macro_op_t pop_mem_op = {
.nr_uops = 4,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_LOAD, .latency = 1},
.uop[1] = { .type = UOP_STORED, .latency = 1},
.uop[2] = { .type = UOP_STOREA, .latency = 1},
.uop[3] = { .type = UOP_ALU, .latency = 1}
};
static const macro_op_t push_imm_op = {
.nr_uops = 2,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_STORED, .latency = 1},
.uop[1] = { .type = UOP_STOREA, .latency = 1},
};
static const macro_op_t push_mem_op = {
.nr_uops = 3,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_LOAD, .latency = 1},
.uop[1] = { .type = UOP_STORED, .latency = 1},
.uop[2] = { .type = UOP_STOREA, .latency = 1}
};
static const macro_op_t push_seg_op = {
.nr_uops = 3,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_LOAD, .latency = 1},
.uop[1] = { .type = UOP_STORED, .latency = 1},
.uop[2] = { .type = UOP_STOREA, .latency = 1},
.uop[3] = { .type = UOP_ALU, .latency = 1}
};
static const macro_op_t stos_op = {
.nr_uops = 3,
.decode_type = DECODE_COMPLEX,
.uop[1] = {.type = UOP_STORED, .latency = 1},
.uop[2] = { .type = UOP_STOREA, .latency = 1},
.uop[3] = { .type = UOP_ALU, .latency = 1}
};
static const macro_op_t test_reg_op = {
.nr_uops = 1,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_ALU, .latency = 1}
};
static const macro_op_t test_reg_b_op = {
.nr_uops = 1,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_ALUP0, .latency = 1}
};
static const macro_op_t test_mem_imm_op = {
.nr_uops = 2,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_LOAD, .latency = 1},
.uop[1] = { .type = UOP_ALU, .latency = 1}
};
static const macro_op_t test_mem_imm_b_op = {
.nr_uops = 2,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_LOAD, .latency = 1},
.uop[1] = { .type = UOP_ALUP0, .latency = 1}
};
static const macro_op_t xchg_op = {
.nr_uops = 3,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_ALU, .latency = 1},
.uop[1] = { .type = UOP_ALU, .latency = 1},
.uop[2] = { .type = UOP_ALU, .latency = 1}
};
static const macro_op_t mmx_op = {
.nr_uops = 1,
.decode_type = DECODE_SIMPLE,
.uop[0] = {.type = UOP_MMX, .latency = 1}
};
static const macro_op_t mmx_mul_op = {
.nr_uops = 1,
.decode_type = DECODE_SIMPLE,
.uop[0] = {.type = UOP_MMX_MUL, .latency = 1}
};
static const macro_op_t mmx_shift_op = {
.nr_uops = 1,
.decode_type = DECODE_SIMPLE,
.uop[0] = {.type = UOP_MMX_SHIFT, .latency = 1}
};
static const macro_op_t load_mmx_op = {
.nr_uops = 2,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_LOAD, .latency = 2},
.uop[1] = { .type = UOP_MMX, .latency = 2}
};
static const macro_op_t load_mmx_mul_op = {
.nr_uops = 2,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_LOAD, .latency = 2},
.uop[1] = { .type = UOP_MMX_MUL, .latency = 2}
};
static const macro_op_t load_mmx_shift_op = {
.nr_uops = 2,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_LOAD, .latency = 2},
.uop[1] = { .type = UOP_MMX_SHIFT, .latency = 2}
};
static const macro_op_t mload_op = {
.nr_uops = 1,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_MLOAD, .latency = 1},
};
static const macro_op_t mstore_op = {
.nr_uops = 2,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_MSTORED, .latency = 1},
.uop[1] = { .type = UOP_MSTOREA, .latency = 1}
};
static const macro_op_t pmul_op = {
.nr_uops = 1,
.decode_type = DECODE_SIMPLE,
.uop[0] = {.type = UOP_MMX_MUL, .latency = 1}
};
static const macro_op_t pmul_mem_op = {
.nr_uops = 2,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_LOAD, .latency = 2},
.uop[1] = { .type = UOP_MMX_MUL, .latency = 2}
};
static const macro_op_t float_op = {
.nr_uops = 1,
.decode_type = DECODE_SIMPLE,
.uop[0] = {.type = UOP_FLOAT, .latency = 1}
};
static const macro_op_t fadd_op = {
.nr_uops = 1,
.decode_type = DECODE_SIMPLE,
.uop[0] = {.type = UOP_FLOAT, .latency = 2}
};
static const macro_op_t fmul_op = {
.nr_uops = 1,
.decode_type = DECODE_SIMPLE,
.uop[0] = {.type = UOP_ALUP0, .latency = 3}
};
static const macro_op_t float2_op = {
.nr_uops = 2,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_FLOAT, .latency = 1},
.uop[1] = { .type = UOP_FLOAT, .latency = 1}
};
static const macro_op_t fchs_op = {
.nr_uops = 3,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_FLOAT, .latency = 2},
.uop[1] = { .type = UOP_FLOAT, .latency = 2},
.uop[2] = { .type = UOP_FLOAT, .latency = 2}
};
static const macro_op_t load_float_op = {
.nr_uops = 2,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_FLOAD, .latency = 1},
.uop[1] = { .type = UOP_FLOAT, .latency = 1}
};
static const macro_op_t load_fadd_op = {
.nr_uops = 2,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_FLOAD, .latency = 1},
.uop[1] = { .type = UOP_FLOAT, .latency = 2}
};
static const macro_op_t load_fmul_op = {
.nr_uops = 2,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_LOAD, .latency = 1},
.uop[1] = { .type = UOP_ALU, .latency = 4}
};
static const macro_op_t fstore_op = {
.nr_uops = 2,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_FSTORED, .latency = 1},
.uop[1] = { .type = UOP_FSTOREA, .latency = 1},
};
static const macro_op_t load_fiadd_op = {
.nr_uops = 7,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_FLOAD, .latency = 1},
.uop[1] = { .type = UOP_FLOAT, .latency = 1},
.uop[2] = { .type = UOP_FLOAT, .latency = 1},
.uop[3] = { .type = UOP_FLOAT, .latency = 1},
.uop[4] = { .type = UOP_FLOAT, .latency = 1},
.uop[5] = { .type = UOP_FLOAT, .latency = 1},
.uop[6] = { .type = UOP_FLOAT, .latency = 1}
};
static const macro_op_t fdiv_op = {
.nr_uops = 1,
.decode_type = DECODE_SIMPLE,
.uop[0] = {.type = UOP_FLOAT, .latency = 37}
};
static const macro_op_t fdiv_mem_op = {
.nr_uops = 2,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_FLOAD, .latency = 1 },
.uop[1] = { .type = UOP_FLOAT, .latency = 37}
};
static const macro_op_t fsin_op = {
.nr_uops = 1,
.decode_type = DECODE_SIMPLE,
.uop[0] = {.type = UOP_FLOAT, .latency = 62}
};
static const macro_op_t fsqrt_op = {
.nr_uops = 1,
.decode_type = DECODE_SIMPLE,
.uop[0] = {.type = UOP_FLOAT, .latency = 69}
};
static const macro_op_t fldcw_op = {
.nr_uops = 1,
.decode_type = DECODE_SIMPLE,
.uop[0] = {.type = UOP_FLOAT, .latency = 10}
};
static const macro_op_t complex_float_op = {
.nr_uops = 1,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_FLOAT, .latency = 1}
};
static const macro_op_t complex_float_l_op = {
.nr_uops = 1,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_FLOAT, .latency = 50}
};
static const macro_op_t flde_op = {
.nr_uops = 3,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_FLOAD, .latency = 1},
.uop[1] = { .type = UOP_FLOAD, .latency = 1},
.uop[2] = { .type = UOP_FLOAT, .latency = 2}
};
static const macro_op_t fste_op = {
.nr_uops = 3,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_FLOAT, .latency = 2},
.uop[1] = { .type = UOP_FSTORED, .latency = 1},
.uop[2] = { .type = UOP_FSTOREA, .latency = 1}
};
static const macro_op_t complex_alu1_op = {
.nr_uops = 1,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_ALU, .latency = 1}
};
static const macro_op_t alu2_op = {
.nr_uops = 2,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_ALU, .latency = 1},
.uop[1] = { .type = UOP_ALU, .latency = 1}
};
static const macro_op_t alu3_op = {
.nr_uops = 3,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_ALU, .latency = 1},
.uop[1] = { .type = UOP_ALU, .latency = 1},
.uop[2] = { .type = UOP_ALU, .latency = 1}
};
static const macro_op_t alu6_op = {
.nr_uops = 6,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_ALU, .latency = 1},
.uop[1] = { .type = UOP_ALU, .latency = 1},
.uop[2] = { .type = UOP_ALU, .latency = 1},
.uop[3] = { .type = UOP_ALU, .latency = 1},
.uop[4] = { .type = UOP_ALU, .latency = 1},
.uop[5] = { .type = UOP_ALU, .latency = 1}
};
static const macro_op_t complex_alup0_1_op = {
.nr_uops = 1,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_ALUP0, .latency = 1}
};
static const macro_op_t alup0_3_op = {
.nr_uops = 3,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_ALUP0, .latency = 1},
.uop[1] = { .type = UOP_ALUP0, .latency = 1},
.uop[2] = { .type = UOP_ALUP0, .latency = 1}
};
static const macro_op_t alup0_6_op = {
.nr_uops = 6,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_ALUP0, .latency = 1},
.uop[1] = { .type = UOP_ALUP0, .latency = 1},
.uop[2] = { .type = UOP_ALUP0, .latency = 1},
.uop[3] = { .type = UOP_ALUP0, .latency = 1},
.uop[4] = { .type = UOP_ALUP0, .latency = 1},
.uop[5] = { .type = UOP_ALUP0, .latency = 1}
};
static const macro_op_t arpl_op = {
.nr_uops = 2,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_ALU, .latency = 3},
.uop[1] = { .type = UOP_ALU, .latency = 3}
};
static const macro_op_t bound_op = {
.nr_uops = 4,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_LOAD, .latency = 1},
.uop[1] = { .type = UOP_LOAD, .latency = 1},
.uop[2] = { .type = UOP_ALU, .latency = 1},
.uop[3] = { .type = UOP_ALU, .latency = 1}
};
static const macro_op_t bsx_op = {
.nr_uops = 1,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_ALU, .latency = 10}
};
static const macro_op_t call_far_op = {
.nr_uops = 4,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_ALU, .latency = 3},
.uop[1] = { .type = UOP_STORED, .latency = 1},
.uop[2] = { .type = UOP_STOREA, .latency = 1},
.uop[3] = { .type = UOP_BRANCH, .latency = 1}
};
static const macro_op_t cli_sti_op = {
.nr_uops = 1,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_ALU, .latency = 7}
};
static const macro_op_t cmps_op = {
.nr_uops = 3,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_LOAD, .latency = 1},
.uop[1] = { .type = UOP_ALU, .latency = 1},
.uop[2] = { .type = UOP_ALU, .latency = 1}
};
static const macro_op_t cmpsb_op = {
.nr_uops = 3,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_LOAD, .latency = 1},
.uop[1] = { .type = UOP_ALUP0, .latency = 1},
.uop[2] = { .type = UOP_ALU, .latency = 1}
};
static const macro_op_t cmpxchg_op = {
.nr_uops = 4,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_LOAD, .latency = 1},
.uop[1] = { .type = UOP_ALU, .latency = 1},
.uop[2] = { .type = UOP_STORED, .latency = 1},
.uop[3] = { .type = UOP_STOREA, .latency = 1}
};
static const macro_op_t cmpxchg_b_op = {
.nr_uops = 4,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_LOAD, .latency = 1},
.uop[1] = { .type = UOP_ALUP0, .latency = 1},
.uop[2] = { .type = UOP_STORED, .latency = 1},
.uop[3] = { .type = UOP_STOREA, .latency = 1}
};
static const macro_op_t complex_push_mem_op = {
.nr_uops = 2,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_STORED, .latency = 1},
.uop[1] = { .type = UOP_STOREA, .latency = 1}
};
static const macro_op_t cpuid_op = {
.nr_uops = 1,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_ALU, .latency = 23}
};
static const macro_op_t div16_op = {
.nr_uops = 1,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_ALUP0, .latency = 21}
};
static const macro_op_t div16_mem_op = {
.nr_uops = 2,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_LOAD, .latency = 1 },
.uop[1] = { .type = UOP_ALUP0, .latency = 21}
};
static const macro_op_t div32_op = {
.nr_uops = 1,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_ALUP0, .latency = 37}
};
static const macro_op_t div32_mem_op = {
.nr_uops = 2,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_LOAD, .latency = 1 },
.uop[1] = { .type = UOP_ALUP0, .latency = 37}
};
static const macro_op_t emms_op = {
.nr_uops = 1,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_ALU, .latency = 50}
};
static const macro_op_t enter_op = {
.nr_uops = 3,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_STORED, .latency = 1 },
.uop[1] = { .type = UOP_STOREA, .latency = 1 },
.uop[2] = { .type = UOP_ALU, .latency = 10}
};
static const macro_op_t femms_op = {
.nr_uops = 1,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_ALU, .latency = 6}
};
static const macro_op_t in_op = {
.nr_uops = 1,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_LOAD, .latency = 18}
};
static const macro_op_t ins_op = {
.nr_uops = 4,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_LOAD, .latency = 18},
.uop[1] = { .type = UOP_STORED, .latency = 1 },
.uop[2] = { .type = UOP_STOREA, .latency = 1 },
.uop[3] = { .type = UOP_ALU, .latency = 1 }
};
static const macro_op_t int_op = {
.nr_uops = 8,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_ALU, .latency = 20},
.uop[1] = { .type = UOP_STORED, .latency = 1 },
.uop[2] = { .type = UOP_STOREA, .latency = 1 },
.uop[3] = { .type = UOP_STORED, .latency = 1 },
.uop[4] = { .type = UOP_STOREA, .latency = 1 },
.uop[5] = { .type = UOP_STORED, .latency = 1 },
.uop[6] = { .type = UOP_STOREA, .latency = 1 },
.uop[7] = { .type = UOP_BRANCH, .latency = 1 }
};
static const macro_op_t iret_op = {
.nr_uops = 5,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_LOAD, .latency = 3 },
.uop[1] = { .type = UOP_LOAD, .latency = 3 },
.uop[2] = { .type = UOP_LOAD, .latency = 3 },
.uop[3] = { .type = UOP_ALU, .latency = 20},
.uop[4] = { .type = UOP_BRANCH, .latency = 1 }
};
static const macro_op_t invd_op = {
.nr_uops = 1,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_ALU, .latency = 500}
};
static const macro_op_t jmp_far_op = {
.nr_uops = 2,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_ALU, .latency = 3},
.uop[1] = { .type = UOP_BRANCH, .latency = 1}
};
static const macro_op_t lss_op = {
.nr_uops = 3,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_LOAD, .latency = 1},
.uop[1] = { .type = UOP_LOAD, .latency = 1},
.uop[2] = { .type = UOP_ALU, .latency = 3}
};
static const macro_op_t mov_mem_seg_op = {
.nr_uops = 3,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_LOAD, .latency = 1},
.uop[1] = { .type = UOP_STORED, .latency = 1},
.uop[2] = { .type = UOP_STOREA, .latency = 1},
};
static const macro_op_t mov_seg_mem_op = {
.nr_uops = 2,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_LOAD, .latency = 1},
.uop[1] = { .type = UOP_ALU, .latency = 3}
};
static const macro_op_t mov_seg_reg_op = {
.nr_uops = 1,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_ALU, .latency = 3}
};
static const macro_op_t mul_op = {
.nr_uops = 1,
.decode_type = DECODE_SIMPLE,
.uop[0] = {.type = UOP_ALUP0, .latency = 1}
};
static const macro_op_t mul_mem_op = {
.nr_uops = 2,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_LOAD, .latency = 1},
.uop[1] = { .type = UOP_ALUP0, .latency = 1}
};
static const macro_op_t mul64_op = {
.nr_uops = 3,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_ALUP0, .latency = 1},
.uop[1] = { .type = UOP_ALUP0, .latency = 1},
.uop[2] = { .type = UOP_ALUP0, .latency = 1}
};
static const macro_op_t mul64_mem_op = {
.nr_uops = 4,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_LOAD, .latency = 1},
.uop[1] = { .type = UOP_ALUP0, .latency = 1},
.uop[2] = { .type = UOP_ALUP0, .latency = 1},
.uop[3] = { .type = UOP_ALUP0, .latency = 1}
};
static const macro_op_t out_op = {
.nr_uops = 1,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_ALU, .latency = 18}
};
static const macro_op_t outs_op = {
.nr_uops = 3,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_LOAD, .latency = 1 },
.uop[1] = { .type = UOP_ALU, .latency = 18}
};
static const macro_op_t pusha_op = {
.nr_uops = 8,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_STORED, .latency = 2},
.uop[1] = { .type = UOP_STOREA, .latency = 2},
.uop[2] = { .type = UOP_STORED, .latency = 2},
.uop[3] = { .type = UOP_STOREA, .latency = 2},
.uop[4] = { .type = UOP_STORED, .latency = 2},
.uop[5] = { .type = UOP_STOREA, .latency = 2},
.uop[6] = { .type = UOP_STORED, .latency = 2},
.uop[7] = { .type = UOP_STOREA, .latency = 2}
};
static const macro_op_t popa_op = {
.nr_uops = 8,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_LOAD, .latency = 1},
.uop[1] = { .type = UOP_LOAD, .latency = 1},
.uop[2] = { .type = UOP_LOAD, .latency = 1},
.uop[3] = { .type = UOP_LOAD, .latency = 1},
.uop[4] = { .type = UOP_LOAD, .latency = 1},
.uop[5] = { .type = UOP_LOAD, .latency = 1},
.uop[6] = { .type = UOP_LOAD, .latency = 1},
.uop[7] = { .type = UOP_LOAD, .latency = 1}
};
static const macro_op_t popf_op = {
.nr_uops = 3,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_LOAD, .latency = 1 },
.uop[1] = { .type = UOP_ALU, .latency = 6 },
.uop[2] = { .type = UOP_ALUP0, .latency = 10}
};
static const macro_op_t pushf_op = {
.nr_uops = 3,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_ALUP0, .latency = 1},
.uop[1] = { .type = UOP_STORED, .latency = 1},
.uop[2] = { .type = UOP_STOREA, .latency = 1}
};
static const macro_op_t ret_op = {
.nr_uops = 2,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_LOAD, .latency = 1},
.uop[1] = { .type = UOP_BRANCH, .latency = 1}
};
static const macro_op_t retf_op = {
.nr_uops = 3,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_LOAD, .latency = 1},
.uop[1] = { .type = UOP_ALU, .latency = 3},
.uop[2] = { .type = UOP_BRANCH, .latency = 1}
};
static const macro_op_t scas_op = {
.nr_uops = 2,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_LOAD, .latency = 1},
.uop[1] = { .type = UOP_ALU, .latency = 1}
};
static const macro_op_t scasb_op = {
.nr_uops = 2,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_LOAD, .latency = 1},
.uop[1] = { .type = UOP_ALU, .latency = 1}
};
static const macro_op_t setcc_mem_op = {
.nr_uops = 4,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_ALUP0, .latency = 1},
.uop[1] = { .type = UOP_ALUP0, .latency = 1},
.uop[2] = { .type = UOP_FSTORED, .latency = 1},
.uop[3] = { .type = UOP_FSTOREA, .latency = 1}
};
static const macro_op_t setcc_reg_op = {
.nr_uops = 3,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_ALUP0, .latency = 1},
.uop[1] = { .type = UOP_ALUP0, .latency = 1},
.uop[2] = { .type = UOP_ALU, .latency = 1}
};
static const macro_op_t test_mem_op = {
.nr_uops = 2,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_LOAD, .latency = 1},
.uop[1] = { .type = UOP_ALU, .latency = 1}
};
static const macro_op_t test_mem_b_op = {
.nr_uops = 2,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_LOAD, .latency = 1},
.uop[1] = { .type = UOP_ALUP0, .latency = 1}
};
static const macro_op_t xchg_mem_op = {
.nr_uops = 4,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_LOAD, .latency = 1},
.uop[1] = { .type = UOP_STORED, .latency = 1},
.uop[2] = { .type = UOP_STOREA, .latency = 1},
.uop[3] = { .type = UOP_ALU, .latency = 1}
};
static const macro_op_t xlat_op = {
.nr_uops = 2,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_ALU, .latency = 1},
.uop[1] = { .type = UOP_LOAD, .latency = 1}
};
static const macro_op_t wbinvd_op = {
.nr_uops = 1,
.decode_type = DECODE_COMPLEX,
.uop[0] = {.type = UOP_ALU, .latency = 10000}
};
#define INVALID NULL
static const macro_op_t *opcode_timings_p6[256] = {
// clang-format off
/* ADD ADD ADD ADD*/
/*00*/ &alup0_store_op, &alu_store_op, &load_alup0_op, &load_alu_op,
/* ADD ADD PUSH ES POP ES*/
&alup0_op, &alu_op, &push_seg_op, &mov_seg_mem_op,
/* OR OR OR OR*/
&alup0_store_op, &alu_store_op, &load_alup0_op, &load_alu_op,
/* OR OR PUSH CS */
&alup0_op, &alu_op, &push_seg_op, INVALID,
/* ADC ADC ADC ADC*/
/*10*/ &alup0_store_op, &alu_store_op, &load_alup0_op, &load_alu_op,
/* ADC ADC PUSH SS POP SS*/
&complex_alup0_1_op, &complex_alu1_op, &push_seg_op, &mov_seg_mem_op,
/* SBB SBB SBB SBB*/
/*10*/ &alup0_store_op, &alu_store_op, &load_alup0_op, &load_alu_op,
/* SBB SBB PUSH DS POP DS*/
&complex_alup0_1_op, &complex_alu1_op, &push_seg_op, &mov_seg_mem_op,
/* AND AND AND AND*/
/*20*/ &alup0_store_op, &alu_store_op, &load_alup0_op, &load_alu_op,
/* AND AND DAA*/
&alup0_op, &alu_op, INVALID, &complex_alup0_1_op,
/* SUB SUB SUB SUB*/
&alup0_store_op, &alu_store_op, &load_alup0_op, &load_alu_op,
/* SUB SUB DAS*/
&alup0_op, &alu_op, INVALID, &complex_alup0_1_op,
/* XOR XOR XOR XOR*/
/*30*/ &alup0_store_op, &alu_store_op, &load_alup0_op, &load_alu_op,
/* XOR XOR AAA*/
&alup0_op, &alu_op, INVALID, &alup0_6_op,
/* CMP CMP CMP CMP*/
&load_alup0_op, &load_alu_op, &load_alup0_op, &load_alu_op,
/* CMP CMP AAS*/
&alup0_op, &alu_op, INVALID, &alup0_6_op,
/* INC EAX INC ECX INC EDX INC EBX*/
/*40*/ &alu_op, &alu_op, &alu_op, &alu_op,
/* INC ESP INC EBP INC ESI INC EDI*/
&alu_op, &alu_op, &alu_op, &alu_op,
/* DEC EAX DEC ECX DEC EDX DEC EBX*/
&alu_op, &alu_op, &alu_op, &alu_op,
/* DEC ESP DEC EBP DEC ESI DEC EDI*/
&alu_op, &alu_op, &alu_op, &alu_op,
/* PUSH EAX PUSH ECX PUSH EDX PUSH EBX*/
/*50*/ &store_op, &store_op, &store_op, &store_op,
/* PUSH ESP PUSH EBP PUSH ESI PUSH EDI*/
&store_op, &store_op, &store_op, &store_op,
/* POP EAX POP ECX POP EDX POP EBX*/
&pop_reg_op, &pop_reg_op, &pop_reg_op, &pop_reg_op,
/* POP ESP POP EBP POP ESI POP EDI*/
&pop_reg_op, &pop_reg_op, &pop_reg_op, &pop_reg_op,
/* PUSHA POPA BOUND ARPL*/
/*60*/ &pusha_op, &popa_op, &bound_op, &arpl_op,
INVALID, INVALID, INVALID, INVALID,
/* PUSH imm IMUL PUSH imm IMUL*/
&push_imm_op, &mul_op, &push_imm_op, &mul_op,
/* INSB INSW OUTSB OUTSW*/
&ins_op, &ins_op, &outs_op, &outs_op,
/* Jxx*/
/*70*/ &branch_op, &branch_op, &branch_op, &branch_op,
&branch_op, &branch_op, &branch_op, &branch_op,
&branch_op, &branch_op, &branch_op, &branch_op,
&branch_op, &branch_op, &branch_op, &branch_op,
/*80*/ INVALID, INVALID, INVALID, INVALID,
/* TEST TEST XCHG XCHG*/
&test_mem_b_op, &test_mem_op, &xchg_mem_op, &xchg_mem_op,
/* MOV MOV MOV MOV*/
&store_op, &store_op, &load_op, &load_op,
/* MOV from seg LEA MOV to seg POP*/
&mov_mem_seg_op, &store_op, &mov_seg_mem_op, &pop_mem_op,
/* NOP XCHG XCHG XCHG*/
/*90*/ &fxch_op, &xchg_op, &xchg_op, &xchg_op,
/* XCHG XCHG XCHG XCHG*/
&xchg_op, &xchg_op, &xchg_op, &xchg_op,
/* CBW CWD CALL far WAIT*/
&complex_alu1_op, &complex_alu1_op, &call_far_op, &fxch_op,
/* PUSHF POPF SAHF LAHF*/
&pushf_op, &popf_op, &complex_alup0_1_op, &complex_alup0_1_op,
/* MOV MOV MOV MOV*/
/*a0*/ &load_op, &load_op, &store_op, &store_op,
/* MOVSB MOVSW CMPSB CMPSW*/
&movs_op, &movs_op, &cmpsb_op, &cmps_op,
/* TEST TEST STOSB STOSW*/
&test_reg_b_op, &test_reg_op, &stos_op, &stos_op,
/* LODSB LODSW SCASB SCASW*/
&lods_op, &lods_op, &scasb_op, &scas_op,
/* MOV*/
/*b0*/ &alu_op, &alu_op, &alu_op, &alu_op,
&alu_op, &alu_op, &alu_op, &alu_op,
&alu_op, &alu_op, &alu_op, &alu_op,
&alu_op, &alu_op, &alu_op, &alu_op,
/* RET imm RET*/
/*c0*/ INVALID, INVALID, &ret_op, &ret_op,
/* LES LDS MOV MOV*/
&lss_op, &lss_op, &store_op, &store_op,
/* ENTER LEAVE RETF RETF*/
&enter_op, &leave_op, &retf_op, &retf_op,
/* INT3 INT INTO IRET*/
&int_op, &int_op, &int_op, &iret_op,
/*d0*/ INVALID, INVALID, INVALID, INVALID,
/* AAM AAD SETALC XLAT*/
&alup0_6_op, &alup0_3_op, &complex_alup0_1_op, &xlat_op,
INVALID, INVALID, INVALID, INVALID,
INVALID, INVALID, INVALID, INVALID,
/* LOOPNE LOOPE LOOP JCXZ*/
/*e0*/ &loop_op, &loop_op, &loop_op, &loop_op,
/* IN AL IN AX OUT_AL OUT_AX*/
&in_op, &in_op, &out_op, &out_op,
/* CALL JMP JMP JMP*/
&store_op, &branch_op, &jmp_far_op, &branch_op,
/* IN AL IN AX OUT_AL OUT_AX*/
&in_op, &in_op, &out_op, &out_op,
/* REPNE REPE*/
/*f0*/ INVALID, INVALID, INVALID, INVALID,
/* HLT CMC*/
&complex_alup0_1_op, &alu2_op, INVALID, INVALID,
/* CLC STC CLI STI*/
&complex_alu1_op, &complex_alu1_op, &cli_sti_op, &cli_sti_op,
/* CLD STD INCDEC*/
&complex_alu1_op, &complex_alu1_op, &alup0_store_op, INVALID
// clang-format on
};
static const macro_op_t *opcode_timings_p6_mod3[256] = {
// clang-format off
/* ADD ADD ADD ADD*/
/*00*/ &alup0_op, &alu_op, &alup0_op, &alu_op,
/* ADD ADD PUSH ES POP ES*/
&alup0_op, &alu_op, &push_seg_op, &mov_seg_mem_op,
/* OR OR OR OR*/
&alup0_op, &alu_op, &alup0_op, &alu_op,
/* OR OR PUSH CS */
&alup0_op, &alu_op, &push_seg_op, INVALID,
/* ADC ADC ADC ADC*/
/*10*/ &complex_alup0_1_op, &complex_alu1_op, &complex_alup0_1_op, &complex_alu1_op,
/* ADC ADC PUSH SS POP SS*/
&complex_alup0_1_op, &complex_alu1_op, &push_seg_op, &mov_seg_mem_op,
/* SBB SBB SBB SBB*/
&complex_alup0_1_op, &complex_alu1_op, &complex_alup0_1_op, &complex_alu1_op,
/* SBB SBB PUSH DS POP DS*/
&complex_alup0_1_op, &complex_alu1_op, &push_seg_op, &mov_seg_mem_op,
/* AND AND AND AND*/
/*20*/ &alup0_op, &alu_op, &alup0_op, &alu_op,
/* AND AND DAA*/
&alup0_op, &alu_op, INVALID, &complex_alup0_1_op,
/* SUB SUB SUB SUB*/
&alup0_op, &alu_op, &alup0_op, &alu_op,
/* SUB SUB DAS*/
&alup0_op, &alu_op, INVALID, &complex_alup0_1_op,
/* XOR XOR XOR XOR*/
/*30*/ &alup0_op, &alu_op, &alup0_op, &alu_op,
/* XOR XOR AAA*/
&alup0_op, &alu_op, INVALID, &alup0_6_op,
/* CMP CMP CMP CMP*/
&alup0_op, &alu_op, &alup0_op, &alu_op,
/* CMP CMP AAS*/
&alup0_op, &alu_op, INVALID, &alup0_6_op,
/* INC EAX INC ECX INC EDX INC EBX*/
/*40*/ &alu_op, &alu_op, &alu_op, &alu_op,
/* INC ESP INC EBP INC ESI INC EDI*/
&alu_op, &alu_op, &alu_op, &alu_op,
/* DEC EAX DEC ECX DEC EDX DEC EBX*/
&alu_op, &alu_op, &alu_op, &alu_op,
/* DEC ESP DEC EBP DEC ESI DEC EDI*/
&alu_op, &alu_op, &alu_op, &alu_op,
/* PUSH EAX PUSH ECX PUSH EDX PUSH EBX*/
/*50*/ &store_op, &store_op, &store_op, &store_op,
/* PUSH ESP PUSH EBP PUSH ESI PUSH EDI*/
&store_op, &store_op, &store_op, &store_op,
/* POP EAX POP ECX POP EDX POP EBX*/
&pop_reg_op, &pop_reg_op, &pop_reg_op, &pop_reg_op,
/* POP ESP POP EBP POP ESI POP EDI*/
&pop_reg_op, &pop_reg_op, &pop_reg_op, &pop_reg_op,
/* PUSHA POPA BOUND ARPL*/
/*60*/ &pusha_op, &popa_op, &bound_op, &arpl_op,
INVALID, INVALID, INVALID, INVALID,
/* PUSH imm IMUL PUSH imm IMUL*/
&push_imm_op, &mul_op, &push_imm_op, &mul_op,
/* INSB INSW OUTSB OUTSW*/
&ins_op, &ins_op, &outs_op, &outs_op,
/* Jxx*/
/*70*/ &branch_op, &branch_op, &branch_op, &branch_op,
&branch_op, &branch_op, &branch_op, &branch_op,
&branch_op, &branch_op, &branch_op, &branch_op,
&branch_op, &branch_op, &branch_op, &branch_op,
/*80*/ INVALID, INVALID, INVALID, INVALID,
/* TEST TEST XCHG XCHG*/
&complex_alu1_op, &complex_alu1_op, &alu3_op, &alu3_op,
/* MOV MOV MOV MOV*/
&store_op, &store_op, &load_op, &load_op,
/* MOV from seg LEA MOV to seg POP*/
&mov_reg_seg_op, &store_op, &mov_seg_reg_op, &pop_reg_op,
/* NOP XCHG XCHG XCHG*/
/*90*/ &fxch_op, &xchg_op, &xchg_op, &xchg_op,
/* XCHG XCHG XCHG XCHG*/
&xchg_op, &xchg_op, &xchg_op, &xchg_op,
/* CBW CWD CALL far WAIT*/
&complex_alu1_op, &complex_alu1_op, &call_far_op, &fxch_op,
/* PUSHF POPF SAHF LAHF*/
&pushf_op, &popf_op, &complex_alup0_1_op, &complex_alup0_1_op,
/* MOV MOV MOV MOV*/
/*a0*/ &load_op, &load_op, &store_op, &store_op,
/* MOVSB MOVSW CMPSB CMPSW*/
&movs_op, &movs_op, &cmpsb_op, &cmps_op,
/* TEST TEST STOSB STOSW*/
&test_reg_b_op, &test_reg_op, &stos_op, &stos_op,
/* LODSB LODSW SCASB SCASW*/
&lods_op, &lods_op, &scasb_op, &scas_op,
/* MOV*/
/*b0*/ &alu_op, &alu_op, &alu_op, &alu_op,
&alu_op, &alu_op, &alu_op, &alu_op,
&alu_op, &alu_op, &alu_op, &alu_op,
&alu_op, &alu_op, &alu_op, &alu_op,
/* RET imm RET*/
/*c0*/ INVALID, INVALID, &ret_op, &ret_op,
/* LES LDS MOV MOV*/
&lss_op, &lss_op, &store_op, &store_op,
/* ENTER LEAVE RETF RETF*/
&enter_op, &leave_op, &retf_op, &retf_op,
/* INT3 INT INTO IRET*/
&int_op, &int_op, &int_op, &iret_op,
/*d0*/ INVALID, INVALID, INVALID, INVALID,
/* AAM AAD SETALC XLAT*/
&alup0_6_op, &alup0_3_op, &complex_alup0_1_op, &xlat_op,
INVALID, INVALID, INVALID, INVALID,
INVALID, INVALID, INVALID, INVALID,
/* LOOPNE LOOPE LOOP JCXZ*/
/*e0*/ &loop_op, &loop_op, &loop_op, &loop_op,
/* IN AL IN AX OUT_AL OUT_AX*/
&in_op, &in_op, &out_op, &out_op,
/* CALL JMP JMP JMP*/
&store_op, &branch_op, &jmp_far_op, &branch_op,
/* IN AL IN AX OUT_AL OUT_AX*/
&in_op, &in_op, &out_op, &out_op,
/* REPNE REPE*/
/*f0*/ INVALID, INVALID, INVALID, INVALID,
/* HLT CMC*/
&complex_alup0_1_op, &alu2_op, INVALID, INVALID,
/* CLC STC CLI STI*/
&complex_alu1_op, &complex_alu1_op, &cli_sti_op, &cli_sti_op,
/* CLD STD INCDEC*/
&complex_alu1_op, &complex_alu1_op, &complex_alup0_1_op, INVALID
// clang-format on
};
static const macro_op_t *opcode_timings_p6_0f[256] = {
// clang-format off
/*00*/ &alu6_op, &alu6_op, &alu6_op, &alu6_op,
INVALID, &alu6_op, &alu6_op, INVALID,
&invd_op, &wbinvd_op, INVALID, INVALID,
INVALID, &load_op, &femms_op, INVALID,
/*10*/ INVALID, INVALID, INVALID, INVALID,
INVALID, INVALID, INVALID, INVALID,
INVALID, INVALID, INVALID, INVALID,
INVALID, INVALID, INVALID, INVALID,
/*20*/ &alu6_op, &alu6_op, &alu6_op, &alu6_op,
&alu6_op, &alu6_op, INVALID, INVALID,
INVALID, INVALID, INVALID, INVALID,
INVALID, INVALID, INVALID, INVALID,
/*30*/ &alu6_op, &alu6_op, &alu6_op, INVALID,
INVALID, INVALID, INVALID, INVALID,
INVALID, INVALID, INVALID, INVALID,
INVALID, INVALID, INVALID, INVALID,
/*40*/ INVALID, INVALID, INVALID, INVALID,
INVALID, INVALID, INVALID, INVALID,
INVALID, INVALID, INVALID, INVALID,
INVALID, INVALID, INVALID, INVALID,
/*50*/ INVALID, INVALID, INVALID, INVALID,
INVALID, INVALID, INVALID, INVALID,
INVALID, INVALID, INVALID, INVALID,
INVALID, INVALID, INVALID, INVALID,
/*60*/ &load_mmx_op, &load_mmx_op, &load_mmx_op, &load_mmx_op,
&load_mmx_op, &load_mmx_op, &load_mmx_op, &load_mmx_op,
&load_mmx_op, &load_mmx_op, &load_mmx_op, &load_mmx_op,
INVALID, INVALID, &mload_op, &mload_op,
/*70*/ INVALID, &load_mmx_shift_op, &load_mmx_shift_op, &load_mmx_shift_op,
&load_mmx_op, &load_mmx_op, &load_mmx_op, &emms_op,
INVALID, INVALID, INVALID, INVALID,
INVALID, INVALID, &mstore_op, &mstore_op,
/*80*/ &branch_op, &branch_op, &branch_op, &branch_op,
&branch_op, &branch_op, &branch_op, &branch_op,
&branch_op, &branch_op, &branch_op, &branch_op,
&branch_op, &branch_op, &branch_op, &branch_op,
/*90*/ &setcc_reg_op, &setcc_reg_op, &setcc_reg_op, &setcc_reg_op,
&setcc_reg_op, &setcc_reg_op, &setcc_reg_op, &setcc_reg_op,
&setcc_reg_op, &setcc_reg_op, &setcc_reg_op, &setcc_reg_op,
&setcc_reg_op, &setcc_reg_op, &setcc_reg_op, &setcc_reg_op,
/*a0*/ &push_seg_op, &mov_seg_mem_op, &cpuid_op, &load_alu_op,
&alu_store_op, &alu_store_op, INVALID, INVALID,
&push_seg_op, &mov_seg_mem_op, INVALID, &load_alu_op,
&alu_store_op, &alu_store_op, INVALID, &mul_op,
/*b0*/ &cmpxchg_b_op, &cmpxchg_op, &lss_op, &load_alu_op,
&lss_op, &lss_op, &load_alup0_op, &load_alu_op,
INVALID, INVALID, &load_alu_op, &load_alu_op,
&bsx_op, &bsx_op, &load_alup0_op, &load_alu_op,
/*c0*/ &alup0_store_op, &alu_store_op, INVALID, INVALID,
INVALID, INVALID, INVALID, &cmpxchg_op,
&bswap_op, &bswap_op, &bswap_op, &bswap_op,
&bswap_op, &bswap_op, &bswap_op, &bswap_op,
/*d0*/ INVALID, &load_mmx_shift_op, &load_mmx_shift_op, &load_mmx_shift_op,
INVALID, &load_mmx_mul_op, INVALID, INVALID,
&load_mmx_op, &load_mmx_op, INVALID, &load_mmx_op,
&load_mmx_op, &load_mmx_op, INVALID, &load_mmx_op,
/*e0*/ &load_mmx_op, &load_mmx_shift_op, &load_mmx_shift_op, INVALID,
INVALID, &pmul_mem_op, INVALID, INVALID,
&load_mmx_op, &load_mmx_op, INVALID, &load_mmx_op,
&load_mmx_op, &load_mmx_op, INVALID, &load_mmx_op,
/*f0*/ INVALID, &load_mmx_shift_op, &load_mmx_shift_op, &load_mmx_shift_op,
INVALID, &pmul_mem_op, INVALID, INVALID,
&load_mmx_op, &load_mmx_op, &load_mmx_op, INVALID,
&load_mmx_op, &load_mmx_op, &load_mmx_op, INVALID,
// clang-format on
};
static const macro_op_t *opcode_timings_p6_0f_mod3[256] = {
// clang-format off
/*00*/ &alu6_op, &alu6_op, &alu6_op, &alu6_op,
INVALID, &alu6_op, &alu6_op, INVALID,
&invd_op, &wbinvd_op, INVALID, INVALID,
INVALID, INVALID, &femms_op, INVALID,
/*10*/ INVALID, INVALID, INVALID, INVALID,
INVALID, INVALID, INVALID, INVALID,
INVALID, INVALID, INVALID, INVALID,
INVALID, INVALID, INVALID, INVALID,
/*20*/ &alu6_op, &alu6_op, &alu6_op, &alu6_op,
&alu6_op, &alu6_op, INVALID, INVALID,
INVALID, INVALID, INVALID, INVALID,
INVALID, INVALID, INVALID, INVALID,
/*30*/ &alu6_op, &alu6_op, &alu6_op, INVALID,
INVALID, INVALID, INVALID, INVALID,
INVALID, INVALID, INVALID, INVALID,
INVALID, INVALID, INVALID, INVALID,
/*40*/ INVALID, INVALID, INVALID, INVALID,
INVALID, INVALID, INVALID, INVALID,
INVALID, INVALID, INVALID, INVALID,
INVALID, INVALID, INVALID, INVALID,
/*50*/ INVALID, INVALID, INVALID, INVALID,
INVALID, INVALID, INVALID, INVALID,
INVALID, INVALID, INVALID, INVALID,
INVALID, INVALID, INVALID, INVALID,
/*60*/ &mmx_op, &mmx_op, &mmx_op, &mmx_op,
&mmx_op, &mmx_op, &mmx_op, &mmx_op,
&mmx_op, &mmx_op, &mmx_op, &mmx_op,
INVALID, INVALID, &mmx_op, &mmx_op,
/*70*/ INVALID, &mmx_shift_op, &mmx_shift_op, &mmx_shift_op,
&mmx_op, &mmx_op, &mmx_op, &emms_op,
INVALID, INVALID, INVALID, INVALID,
INVALID, INVALID, &mmx_op, &mmx_op,
/*80*/ &branch_op, &branch_op, &branch_op, &branch_op,
&branch_op, &branch_op, &branch_op, &branch_op,
&branch_op, &branch_op, &branch_op, &branch_op,
&branch_op, &branch_op, &branch_op, &branch_op,
/*90*/ &setcc_mem_op, &setcc_mem_op, &setcc_mem_op, &setcc_mem_op,
&setcc_mem_op, &setcc_mem_op, &setcc_mem_op, &setcc_mem_op,
&setcc_mem_op, &setcc_mem_op, &setcc_mem_op, &setcc_mem_op,
&setcc_mem_op, &setcc_mem_op, &setcc_mem_op, &setcc_mem_op,
/*a0*/ &push_seg_op, &mov_seg_mem_op, &cpuid_op, &complex_alu1_op,
&complex_alu1_op, &complex_alu1_op, INVALID, INVALID,
&push_seg_op, &mov_seg_mem_op, INVALID, &complex_alu1_op,
&complex_alu1_op, &complex_alu1_op, INVALID, &mul_op,
/*b0*/ &cmpxchg_b_op, &cmpxchg_op, &lss_op, &complex_alu1_op,
&lss_op, &lss_op, &alup0_op, &alu_op,
INVALID, INVALID, &complex_alu1_op, &complex_alu1_op,
&bsx_op, &bsx_op, &alup0_op, &alu_op,
/*c0*/ &complex_alup0_1_op, &complex_alu1_op, INVALID, INVALID,
INVALID, INVALID, INVALID, INVALID,
&bswap_op, &bswap_op, &bswap_op, &bswap_op,
&bswap_op, &bswap_op, &bswap_op, &bswap_op,
/*d0*/ INVALID, &mmx_shift_op, &mmx_shift_op, &mmx_shift_op,
INVALID, &mmx_mul_op, INVALID, INVALID,
&mmx_op, &mmx_op, INVALID, &mmx_op,
&mmx_op, &mmx_op, INVALID, &mmx_op,
/*e0*/ &mmx_op, &mmx_shift_op, &mmx_shift_op, INVALID,
INVALID, &pmul_op, INVALID, INVALID,
&mmx_op, &mmx_op, INVALID, &mmx_op,
&mmx_op, &mmx_op, INVALID, &mmx_op,
/*f0*/ INVALID, &mmx_shift_op, &mmx_shift_op, &mmx_shift_op,
INVALID, &pmul_op, INVALID, INVALID,
&mmx_op, &mmx_op, &mmx_op, INVALID,
&mmx_op, &mmx_op, &mmx_op, INVALID,
};
static const macro_op_t *opcode_timings_p6_shift[8] =
{
// clang-format off
&alu_store_op, &alu_store_op, &alu_store_op, &alu_store_op,
&alu_store_op, &alu_store_op, &alu_store_op, &alu_store_op
// clang-format on
};
static const macro_op_t *opcode_timings_p6_shift_b[8] = {
// clang-format off
&alup0_store_op, &alup0_store_op, &alup0_store_op, &alup0_store_op,
&alup0_store_op, &alup0_store_op, &alup0_store_op, &alup0_store_op
// clang-format on
};
static const macro_op_t *opcode_timings_p6_shift_mod3[8] = {
// clang-format off
&complex_alu1_op, &complex_alu1_op, &complex_alu1_op, &complex_alu1_op,
&alu_op, &alu_op, &alu_op, &alu_op
// clang-format on
};
static const macro_op_t *opcode_timings_p6_shift_b_mod3[8] = {
// clang-format off
&complex_alup0_1_op, &complex_alup0_1_op, &complex_alup0_1_op, &complex_alup0_1_op,
&alup0_op, &alup0_op, &alup0_op, &alup0_op
// clang-format on
};
static const macro_op_t *opcode_timings_p6_80[8] = {
// clang-format off
&alup0_store_op, &alup0_store_op, &alup0_store_op, &alup0_store_op,
&alup0_store_op, &alup0_store_op, &alup0_store_op, &alup0_store_op,
// clang-format on
};
static const macro_op_t *opcode_timings_p6_80_mod3[8] = {
// clang-format off
&alup0_op, &alup0_op, &alup0_store_op, &alup0_store_op,
&alup0_op, &alup0_op, &alup0_op, &alup0_op,
// clang-format on
};
static const macro_op_t *opcode_timings_p6_8x[8] = {
// clang-format off
&alu_store_op, &alu_store_op, &alu_store_op, &alu_store_op,
&alu_store_op, &alu_store_op, &alu_store_op, &alu_store_op,
// clang-format on
};
static const macro_op_t *opcode_timings_p6_8x_mod3[8] = {
// clang-format off
&alu_op, &alu_op, &alu_store_op, &alu_store_op,
&alu_op, &alu_op, &alu_op, &alu_op,
// clang-format on
};
static const macro_op_t *opcode_timings_p6_f6[8] = {
// clang-format off
/* TST NOT NEG*/
&test_mem_imm_b_op, INVALID, &alup0_store_op, &alup0_store_op,
/* MUL IMUL DIV IDIV*/
&mul_mem_op, &mul_mem_op, &div16_mem_op, &div16_mem_op,
// clang-format on
};
static const macro_op_t *opcode_timings_p6_f6_mod3[8] = {
// clang-format off
/* TST NOT NEG*/
&test_reg_b_op, INVALID, &alup0_op, &alup0_op,
/* MUL IMUL DIV IDIV*/
&mul_op, &mul_op, &div16_op, &div16_op,
// clang-format on
};
static const macro_op_t *opcode_timings_p6_f7[8] = {
// clang-format off
/* TST NOT NEG*/
&test_mem_imm_op, INVALID, &alu_store_op, &alu_store_op,
/* MUL IMUL DIV IDIV*/
&mul64_mem_op, &mul64_mem_op, &div32_mem_op, &div32_mem_op,
// clang-format on
};
static const macro_op_t *opcode_timings_p6_f7_mod3[8] = {
// clang-format off
/* TST NOT NEG*/
&test_reg_op, INVALID, &alu_op, &alu_op,
/* MUL IMUL DIV IDIV*/
&mul64_op, &mul64_op, &div32_op, &div32_op,
// clang-format on
};
static const macro_op_t *opcode_timings_p6_ff[8] = {
// clang-format off
/* INC DEC CALL CALL far*/
&alu_store_op, &alu_store_op, &store_op, &call_far_op,
/* JMP JMP far PUSH*/
&branch_op, &jmp_far_op, &push_mem_op, INVALID
// clang-format on
};
static const macro_op_t *opcode_timings_p6_ff_mod3[8] = {
// clang-format off
/* INC DEC CALL CALL far*/
&complex_alu1_op, &complex_alu1_op, &store_op, &call_far_op,
/* JMP JMP far PUSH*/
&branch_op, &jmp_far_op, &complex_push_mem_op, INVALID
// clang-format on
};
static const macro_op_t *opcode_timings_p6_d8[8] = {
// clang-format off
/* FADDs FMULs FCOMs FCOMPs*/
&load_fadd_op, &load_fmul_op, &load_float_op, &load_float_op,
/* FSUBs FSUBRs FDIVs FDIVRs*/
&load_float_op, &load_float_op, &fdiv_mem_op, &fdiv_mem_op,
// clang-format on
};
static const macro_op_t *opcode_timings_p6_d8_mod3[8] = {
// clang-format off
/* FADD FMUL FCOM FCOMP*/
&fadd_op, &fmul_op, &float_op, &float_op,
/* FSUB FSUBR FDIV FDIVR*/
&float_op, &float_op, &fdiv_op, &fdiv_op,
// clang-format on
};
static const macro_op_t *opcode_timings_p6_d9[8] = {
// clang-format off
/* FLDs FSTs FSTPs*/
&load_float_op, INVALID, &fstore_op, &fstore_op,
/* FLDENV FLDCW FSTENV FSTCW*/
&complex_float_l_op, &fldcw_op, &complex_float_l_op, &complex_float_op
// clang-format on
};
static const macro_op_t *opcode_timings_p6_d9_mod3[64] = {
// clang-format off
/*FLD*/
&float_op, &float_op, &float_op, &float_op,
&float_op, &float_op, &float_op, &float_op,
/*FXCH*/
&fxch_op, &fxch_op, &fxch_op, &fxch_op,
&fxch_op, &fxch_op, &fxch_op, &fxch_op,
/*FNOP*/
&float_op, INVALID, INVALID, INVALID,
INVALID, INVALID, INVALID, INVALID,
/*FSTP*/
&float2_op, &float2_op, &float2_op, &float2_op,
&float2_op, &float2_op, &float2_op, &float2_op,
/* opFCHS opFABS*/
&fchs_op, &float_op, INVALID, INVALID,
/* opFTST opFXAM*/
&float_op, &float_op, INVALID, INVALID,
/* opFLD1 opFLDL2T opFLDL2E opFLDPI*/
&float_op, &float_op, &float_op, &float_op,
/* opFLDEG2 opFLDLN2 opFLDZ*/
&float_op, &float_op, &float_op, INVALID,
/* opF2XM1 opFYL2X opFPTAN opFPATAN*/
&fsin_op, &fsin_op, &fsin_op, &fsin_op,
/* opFDECSTP opFINCSTP,*/
INVALID, INVALID, &float_op, &float_op,
/* opFPREM opFSQRT opFSINCOS*/
&fdiv_op, INVALID, &fsqrt_op, &fsin_op,
/* opFRNDINT opFSCALE opFSIN opFCOS*/
&float_op, &fdiv_op, &fsin_op, &fsin_op
// clang-format on
};
static const macro_op_t *opcode_timings_p6_da[8] = {
// clang-format off
/* FIADDl FIMULl FICOMl FICOMPl*/
&load_fadd_op, &load_fmul_op, &load_float_op, &load_float_op,
/* FISUBl FISUBRl FIDIVl FIDIVRl*/
&load_float_op, &load_float_op, &fdiv_mem_op, &fdiv_mem_op,
// clang-format on
};
static const macro_op_t *opcode_timings_p6_da_mod3[8] = {
// clang-format off
INVALID, INVALID, INVALID, INVALID,
/* FCOMPP*/
INVALID, &float_op, INVALID, INVALID
// clang-format on
};
static const macro_op_t *opcode_timings_p6_db[8] = {
// clang-format off
/* FLDil FSTil FSTPil*/
&load_float_op, INVALID, &fstore_op, &fstore_op,
/* FLDe FSTPe*/
INVALID, &flde_op, INVALID, &fste_op
// clang-format on
};
static const macro_op_t *opcode_timings_p6_db_mod3[64] = {
// clang-format off
INVALID, INVALID, INVALID, INVALID,
INVALID, INVALID, INVALID, INVALID,
INVALID, INVALID, INVALID, INVALID,
INVALID, INVALID, INVALID, INVALID,
INVALID, INVALID, INVALID, INVALID,
INVALID, INVALID, INVALID, INVALID,
INVALID, INVALID, INVALID, INVALID,
INVALID, INVALID, INVALID, INVALID,
/* opFNOP opFCLEX opFINIT*/
INVALID, &float_op, &float_op, &float_op,
/* opFNOP opFNOP*/
&float_op, &float_op, INVALID, INVALID,
INVALID, INVALID, INVALID, INVALID,
INVALID, INVALID, INVALID, INVALID,
INVALID, INVALID, INVALID, INVALID,
INVALID, INVALID, INVALID, INVALID,
INVALID, INVALID, INVALID, INVALID,
INVALID, INVALID, INVALID, INVALID,
// clang-format on
};
static const macro_op_t *opcode_timings_p6_dc[8] = {
// clang-format off
/* FADDd FMULd FCOMd FCOMPd*/
&load_fadd_op, &load_fmul_op, &load_float_op, &load_float_op,
/* FSUBd FSUBRd FDIVd FDIVRd*/
&load_float_op, &load_float_op, &fdiv_mem_op, &fdiv_mem_op,
// clang-format on
};
static const macro_op_t *opcode_timings_p6_dc_mod3[8] = {
// clang-format off
/* opFADDr opFMULr*/
&fadd_op, &fmul_op, INVALID, INVALID,
/* opFSUBRr opFSUBr opFDIVRr opFDIVr*/
&float_op, &float_op, &fdiv_op, &fdiv_op
// clang-format on
};
static const macro_op_t *opcode_timings_p6_dd[8] = {
// clang-format off
/* FLDd FSTd FSTPd*/
&load_float_op, INVALID, &fstore_op, &fstore_op,
/* FRSTOR FSAVE FSTSW*/
&complex_float_l_op, INVALID, &complex_float_l_op, &complex_float_l_op
// clang-format on
};
static const macro_op_t *opcode_timings_p6_dd_mod3[8] = {
// clang-format off
/* FFFREE FST FSTP*/
&float_op, INVALID, &float_op, &float_op,
/* FUCOM FUCOMP*/
&float_op, &float_op, INVALID, INVALID
// clang-format on
};
static const macro_op_t *opcode_timings_p6_de[8] = {
// clang-format off
/* FIADDw FIMULw FICOMw FICOMPw*/
&load_fiadd_op, &load_fiadd_op, &load_fiadd_op, &load_fiadd_op,
/* FISUBw FISUBRw FIDIVw FIDIVRw*/
&load_fiadd_op, &load_fiadd_op, &load_fiadd_op, &load_fiadd_op,
// clang-format on
};
static const macro_op_t *opcode_timings_p6_de_mod3[8] = {
// clang-format off
/* FADDP FMULP FCOMPP*/
&fadd_op, &fmul_op, INVALID, &float_op,
/* FSUBP FSUBRP FDIVP FDIVRP*/
&float_op, &float_op, &fdiv_op, &fdiv_op,
// clang-format on
};
static const macro_op_t *opcode_timings_p6_df[8] = {
// clang-format off
/* FILDiw FISTiw FISTPiw*/
&load_float_op, INVALID, &fstore_op, &fstore_op,
/* FILDiq FBSTP FISTPiq*/
INVALID, &load_float_op, &complex_float_l_op, &fstore_op,
// clang-format on
};
static const macro_op_t *opcode_timings_p6_df_mod3[8] = {
// clang-format off
INVALID, INVALID, INVALID, INVALID,
/* FSTSW AX*/
&float_op, INVALID, INVALID, INVALID
// clang-format on
};
static uint8_t last_prefix;
static int prefixes;
static int decode_timestamp;
static int last_complete_timestamp;
typedef struct p6_unit_t {
uint32_t uop_mask;
double first_available_cycle;
} p6_unit_t;
static int nr_units;
static p6_unit_t *units;
/*Pentium Pro has no MMX*/
static p6_unit_t ppro_units[] = {
{ .uop_mask = (1 << UOP_ALU) | (1 << UOP_ALUP0) | (1 << UOP_FLOAT) }, /*Port 0*/
{ .uop_mask = (1 << UOP_ALU) | (1 << UOP_BRANCH) }, /*Port 1*/
{ .uop_mask = (1 << UOP_LOAD) | (1 << UOP_FLOAD) }, /*Port 2*/
{ .uop_mask = (1 << UOP_STORED) | (1 << UOP_FSTORED) }, /*Port 3*/
{ .uop_mask = (1 << UOP_STOREA) | (1 << UOP_FSTOREA) }, /*Port 4*/
};
#define NR_PPRO_UNITS (sizeof(ppro_units) / sizeof(p6_unit_t))
/*Pentium II/Celeron assigns the multiplier to port 0, the shifter to port 1, and shares the MMX ALU*/
static p6_unit_t p2_units[] = {
{ .uop_mask = (1 << UOP_ALU) | (1 << UOP_ALUP0) | (1 << UOP_FLOAT) | /*Port 0*/
(1 << UOP_MMX) | (1 << UOP_MMX_MUL) },
{ .uop_mask = (1 << UOP_ALU) | (1 << UOP_BRANCH) | /*Port 1*/
(1 << UOP_MMX) | (1 << UOP_MMX_SHIFT) },
{ .uop_mask = (1 << UOP_LOAD) | (1 << UOP_FLOAD) | (1 << UOP_MLOAD) }, /*Port 2*/
{ .uop_mask = (1 << UOP_STORED) | (1 << UOP_FSTORED) | (1 << UOP_MSTORED) }, /*Port 3*/
{ .uop_mask = (1 << UOP_STOREA) | (1 << UOP_FSTOREA) | (1 << UOP_MSTOREA) }, /*Port 4*/
};
#define NR_P2_UNITS (sizeof(p2_units) / sizeof(p6_unit_t))
static int
uop_run(const p6_uop_t *uop, int decode_time)
{
p6_unit_t *best_unit = NULL;
int best_start_cycle = 99999;
/*UOP_FXCH does not require execution*/
if (uop->type == UOP_FXCH)
return decode_time;
/*Find execution unit for this uOP*/
for (int c = 0; c < nr_units; c++) {
if (units[c].uop_mask & (1 << uop->type)) {
if (units[c].first_available_cycle < best_start_cycle) {
best_unit = &units[c];
best_start_cycle = units[c].first_available_cycle;
}
}
}
if (!best_unit)
fatal("uop_run: can not find execution unit\n");
if (best_start_cycle < decode_time)
best_start_cycle = decode_time;
best_unit->first_available_cycle = best_start_cycle + uop->latency;
return best_start_cycle + uop->latency;
}
/*The P6 decoders can decode, per clock :
- 1 to 3 'simple' instructions, each up to 1 uOP and 7 bytes long
- 1 'complex' instruction, up to 4 uOPs or 3 per cycle for instructions longer than 4 uOPs
*/
static struct {
int nr_uops;
const p6_uop_t *uops[6];
/*Earliest time a uop can start. If the timestamp is -1, then the uop is
part of a dependency chain and the start time is the completion time of
the previous uop*/
int earliest_start[6];
} decode_buffer;
#define NR_OPSEQS 3
/*Timestamps of when the last three op sequences completed. Technically this is incorrect,
as the actual size of the opseq buffer is 20 bytes and not 18, but I'm restricted to multiples of 6*/
static int opseq_completion_timestamp[NR_OPSEQS];
static int next_opseq = 0;
#define NR_REGS 8
/*Timestamp of when last operation on an integer register completed*/
static int reg_available_timestamp[NR_REGS];
/*Timestamp of when last operation on an FPU register completed*/
static int fpu_st_timestamp[8];
/*Completion time of the last uop to be processed. Used to calculate timing of
dependent uop chains*/
static int last_uop_timestamp = 0;
void
decode_flush_p6(void)
{
int start_timestamp;
int uop_timestamp = 0;
/*Decoded opseq can not be submitted if there are no free spaces in the
opseq buffer*/
if (decode_timestamp < opseq_completion_timestamp[next_opseq])
decode_timestamp = opseq_completion_timestamp[next_opseq];
/*Ensure that uops can not be submitted before they have been decoded*/
if (decode_timestamp > last_uop_timestamp)
last_uop_timestamp = decode_timestamp;
/*Submit uops to execution units, and determine the latest completion time*/
for (int c = 0; c < (decode_buffer.nr_uops); c++) {
if (decode_buffer.earliest_start[c] == -1)
start_timestamp = last_uop_timestamp;
else
start_timestamp = decode_buffer.earliest_start[c];
last_uop_timestamp = uop_run(decode_buffer.uops[c], start_timestamp);
if (last_uop_timestamp > uop_timestamp)
uop_timestamp = last_uop_timestamp;
}
/*Calculate opseq completion time. Since opseqs complete in order, it
must be after the last completion.*/
if (uop_timestamp <= last_complete_timestamp)
last_complete_timestamp = last_complete_timestamp + 1;
else
last_complete_timestamp = uop_timestamp;
/*Advance to next opseq in buffer*/
opseq_completion_timestamp[next_opseq] = last_complete_timestamp;
next_opseq++;
if (next_opseq == NR_OPSEQS)
next_opseq = 0;
decode_timestamp++;
decode_buffer.nr_uops = 0;
}
/*The instruction is only of interest here if it's longer than 7 bytes, as that's the
limit on P6 simple decoding*/
static int
codegen_timing_instr_length(uint64_t deps, uint32_t fetchdat, int op_32)
{
int len = prefixes + 1; /*Opcode*/
if (deps & MODRM) {
len++; /*ModR/M*/
if (deps & HAS_IMM8)
len++;
if (deps & HAS_IMM1632)
len += (op_32 & 0x100) ? 4 : 2;
if (op_32 & 0x200) {
if ((fetchdat & 7) == 4 && (fetchdat & 0xc0) != 0xc0) {
/* Has SIB*/
len++;
if ((fetchdat & 0xc0) == 0x40)
len++;
else if ((fetchdat & 0xc0) == 0x80)
len += 4;
else if ((fetchdat & 0x700) == 0x500)
len += 4;
} else {
if ((fetchdat & 0xc0) == 0x40)
len++;
else if ((fetchdat & 0xc0) == 0x80)
len += 4;
else if ((fetchdat & 0xc7) == 0x05)
len += 4;
}
} else {
if ((fetchdat & 0xc0) == 0x40)
len++;
else if ((fetchdat & 0xc0) == 0x80)
len += 2;
else if ((fetchdat & 0xc7) == 0x06)
len += 2;
}
}
return len;
}
static void
decode_instruction(const macro_op_t *ins, uint64_t deps, uint32_t fetchdat, int op_32, int bit8)
{
uint32_t regmask_required;
uint32_t regmask_modified;
int c;
int d = 0; /*Complex decoder uOPs*/
int earliest_start = 0;
decode_type_t decode_type = ins->decode_type;
int instr_length = codegen_timing_instr_length(deps, fetchdat, op_32);
/*Generate input register mask, and determine the earliest time this
instruction can start. This is not accurate, as this is calculated per
x86 instruction when it should be handled per uop*/
regmask_required = get_dstdep_mask(deps, fetchdat, bit8);
regmask_required |= get_addr_regmask(deps, fetchdat, op_32);
for (c = 0; c < 8; c++) {
if (regmask_required & (1 << c)) {
if (reg_available_timestamp[c] > decode_timestamp)
earliest_start = reg_available_timestamp[c];
}
}
if ((deps & FPU_RW_ST0) && fpu_st_timestamp[0] > decode_timestamp)
earliest_start = fpu_st_timestamp[0];
if ((deps & FPU_RW_ST1) && fpu_st_timestamp[1] > decode_timestamp)
earliest_start = fpu_st_timestamp[1];
if (deps & FPU_RW_STREG) {
int reg = fetchdat & 7;
if (fpu_st_timestamp[reg] > decode_timestamp)
earliest_start = fpu_st_timestamp[reg];
}
/*Simple decoders are limited to 7 bytes & 1 uOP*/
if ((decode_type == DECODE_SIMPLE && instr_length > 7) || (decode_type == DECODE_SIMPLE && ins->nr_uops > 1))
decode_type = DECODE_COMPLEX;
switch (decode_type) {
case DECODE_SIMPLE:
if (decode_buffer.nr_uops - d == 2) {
decode_buffer.uops[decode_buffer.nr_uops] = &ins->uop[0];
decode_buffer.earliest_start[decode_buffer.nr_uops] = earliest_start;
decode_buffer.nr_uops = 3;
decode_flush_p6();
} else if (decode_buffer.nr_uops - d == 1) {
decode_buffer.uops[decode_buffer.nr_uops] = &ins->uop[0];
decode_buffer.earliest_start[decode_buffer.nr_uops] = earliest_start;
decode_buffer.nr_uops = 2 + d;
if (d)
decode_flush_p6();
} else if (decode_buffer.nr_uops) {
decode_buffer.uops[decode_buffer.nr_uops] = &ins->uop[0];
decode_buffer.earliest_start[decode_buffer.nr_uops] = earliest_start;
decode_buffer.nr_uops = 1 + d;
} else {
decode_buffer.nr_uops = 1;
decode_buffer.uops[0] = &ins->uop[0];
decode_buffer.earliest_start[0] = earliest_start;
}
break;
case DECODE_COMPLEX:
if (decode_buffer.nr_uops)
decode_flush_p6(); /*The 4-1-1 arrangement implies that a complex ins. can't be decoded after a simple one*/
d = 0;
for (c = 0; c < ins->nr_uops; c++) {
decode_buffer.uops[d] = &ins->uop[c];
if (c == 0)
decode_buffer.earliest_start[d] = earliest_start;
else
decode_buffer.earliest_start[d] = -1;
d++;
if ((d == 3) && (ins->nr_uops > 4)) { /*Ins. with >4 uOPs require the use of special units only present on 3 translate PLAs*/
d = 0;
decode_buffer.nr_uops = 3;
decode_flush_p6(); /*The other two decoders are halted to preserve in-order issue*/
}
}
if (d) {
decode_buffer.nr_uops = d;
}
break;
}
/*Update write timestamps for any output registers*/
regmask_modified = get_dstdep_mask(deps, fetchdat, bit8);
for (c = 0; c < 8; c++) {
if (regmask_modified & (1 << c))
reg_available_timestamp[c] = last_complete_timestamp;
}
if (deps & FPU_POP) {
for (c = 0; c < 7; c++)
fpu_st_timestamp[c] = fpu_st_timestamp[c + 1];
fpu_st_timestamp[7] = 0;
}
if (deps & FPU_POP2) {
for (c = 0; c < 6; c++)
fpu_st_timestamp[c] = fpu_st_timestamp[c + 2];
fpu_st_timestamp[6] = fpu_st_timestamp[7] = 0;
}
if (deps & FPU_PUSH) {
for (c = 0; c < 7; c++)
fpu_st_timestamp[c + 1] = fpu_st_timestamp[c];
fpu_st_timestamp[0] = 0;
}
if (deps & FPU_WRITE_ST0)
fpu_st_timestamp[0] = last_complete_timestamp;
if (deps & FPU_WRITE_ST1)
fpu_st_timestamp[1] = last_complete_timestamp;
if (deps & FPU_WRITE_STREG) {
int reg = fetchdat & 7;
if (deps & FPU_POP)
reg--;
if (reg >= 0 && !(reg == 0 && (deps & FPU_WRITE_ST0)) && !(reg == 1 && (deps & FPU_WRITE_ST1)))
fpu_st_timestamp[reg] = last_complete_timestamp;
}
}
void
codegen_timing_p6_block_start(void)
{
int c;
for (c = 0; c < nr_units; c++)
units[c].first_available_cycle = 0;
decode_timestamp = 0;
last_complete_timestamp = 0;
for (c = 0; c < NR_OPSEQS; c++)
opseq_completion_timestamp[c] = 0;
next_opseq = 0;
for (c = 0; c < NR_REGS; c++)
reg_available_timestamp[c] = 0;
for (c = 0; c < 8; c++)
fpu_st_timestamp[c] = 0;
}
void
codegen_timing_p6_start(void)
{
if (cpu_s->cpu_type == CPU_PENTIUMPRO) {
units = ppro_units;
nr_units = NR_PPRO_UNITS;
} else {
units = p2_units;
nr_units = NR_P2_UNITS;
}
last_prefix = 0;
prefixes = 0;
}
void
codegen_timing_p6_prefix(uint8_t prefix, uint32_t fetchdat)
{
if (prefix != 0x0f)
decode_timestamp++;
last_prefix = prefix;
prefixes++;
}
void
codegen_timing_p6_opcode(uint8_t opcode, uint32_t fetchdat, int op_32, UNUSED(uint32_t op_pc))
{
const macro_op_t **ins_table;
const uint64_t *deps;
int mod3 = ((fetchdat & 0xc0) == 0xc0);
int old_last_complete_timestamp = last_complete_timestamp;
int bit8 = !(opcode & 1);
switch (last_prefix) {
case 0x0f:
ins_table = mod3 ? opcode_timings_p6_0f_mod3 : opcode_timings_p6_0f;
deps = mod3 ? opcode_deps_0f_mod3 : opcode_deps_0f;
break;
case 0xd8:
ins_table = mod3 ? opcode_timings_p6_d8_mod3 : opcode_timings_p6_d8;
deps = mod3 ? opcode_deps_d8_mod3 : opcode_deps_d8;
opcode = (opcode >> 3) & 7;
break;
case 0xd9:
ins_table = mod3 ? opcode_timings_p6_d9_mod3 : opcode_timings_p6_d9;
deps = mod3 ? opcode_deps_d9_mod3 : opcode_deps_d9;
opcode = mod3 ? opcode & 0x3f : (opcode >> 3) & 7;
break;
case 0xda:
ins_table = mod3 ? opcode_timings_p6_da_mod3 : opcode_timings_p6_da;
deps = mod3 ? opcode_deps_da_mod3 : opcode_deps_da;
opcode = (opcode >> 3) & 7;
break;
case 0xdb:
ins_table = mod3 ? opcode_timings_p6_db_mod3 : opcode_timings_p6_db;
deps = mod3 ? opcode_deps_db_mod3 : opcode_deps_db;
opcode = mod3 ? opcode & 0x3f : (opcode >> 3) & 7;
break;
case 0xdc:
ins_table = mod3 ? opcode_timings_p6_dc_mod3 : opcode_timings_p6_dc;
deps = mod3 ? opcode_deps_dc_mod3 : opcode_deps_dc;
opcode = (opcode >> 3) & 7;
break;
case 0xdd:
ins_table = mod3 ? opcode_timings_p6_dd_mod3 : opcode_timings_p6_dd;
deps = mod3 ? opcode_deps_dd_mod3 : opcode_deps_dd;
opcode = (opcode >> 3) & 7;
break;
case 0xde:
ins_table = mod3 ? opcode_timings_p6_de_mod3 : opcode_timings_p6_de;
deps = mod3 ? opcode_deps_de_mod3 : opcode_deps_de;
opcode = (opcode >> 3) & 7;
break;
case 0xdf:
ins_table = mod3 ? opcode_timings_p6_df_mod3 : opcode_timings_p6_df;
deps = mod3 ? opcode_deps_df_mod3 : opcode_deps_df;
opcode = (opcode >> 3) & 7;
break;
default:
switch (opcode) {
case 0x80:
case 0x82:
ins_table = mod3 ? opcode_timings_p6_80_mod3 : opcode_timings_p6_80;
deps = mod3 ? opcode_deps_8x_mod3 : opcode_deps_8x;
opcode = (fetchdat >> 3) & 7;
break;
case 0x81:
case 0x83:
ins_table = mod3 ? opcode_timings_p6_8x_mod3 : opcode_timings_p6_8x;
deps = mod3 ? opcode_deps_8x_mod3 : opcode_deps_8x;
opcode = (fetchdat >> 3) & 7;
break;
case 0xc0:
case 0xd0:
case 0xd2:
ins_table = mod3 ? opcode_timings_p6_shift_b_mod3 : opcode_timings_p6_shift_b;
deps = mod3 ? opcode_deps_shift_mod3 : opcode_deps_shift;
opcode = (fetchdat >> 3) & 7;
break;
case 0xc1:
case 0xd1:
case 0xd3:
ins_table = mod3 ? opcode_timings_p6_shift_mod3 : opcode_timings_p6_shift;
deps = mod3 ? opcode_deps_shift_mod3 : opcode_deps_shift;
opcode = (fetchdat >> 3) & 7;
break;
case 0xf6:
ins_table = mod3 ? opcode_timings_p6_f6_mod3 : opcode_timings_p6_f6;
deps = mod3 ? opcode_deps_f6_mod3 : opcode_deps_f6;
opcode = (fetchdat >> 3) & 7;
break;
case 0xf7:
ins_table = mod3 ? opcode_timings_p6_f7_mod3 : opcode_timings_p6_f7;
deps = mod3 ? opcode_deps_f7_mod3 : opcode_deps_f7;
opcode = (fetchdat >> 3) & 7;
break;
case 0xff:
ins_table = mod3 ? opcode_timings_p6_ff_mod3 : opcode_timings_p6_ff;
deps = mod3 ? opcode_deps_ff_mod3 : opcode_deps_ff;
opcode = (fetchdat >> 3) & 7;
break;
default:
ins_table = mod3 ? opcode_timings_p6_mod3 : opcode_timings_p6;
deps = mod3 ? opcode_deps_mod3 : opcode_deps;
break;
}
}
if (ins_table[opcode])
decode_instruction(ins_table[opcode], deps[opcode], fetchdat, op_32, bit8);
else
decode_instruction(&complex_alu1_op, 0, fetchdat, op_32, bit8);
codegen_block_cycles += (last_complete_timestamp - old_last_complete_timestamp);
}
void
codegen_timing_p6_block_end(void)
{
if (decode_buffer.nr_uops) {
int old_last_complete_timestamp = last_complete_timestamp;
decode_flush_p6();
codegen_block_cycles += (last_complete_timestamp - old_last_complete_timestamp);
}
}
int
codegen_timing_p6_jump_cycles(void)
{
if (decode_buffer.nr_uops)
return 1;
return 0;
}
codegen_timing_t codegen_timing_p6 = {
codegen_timing_p6_start,
codegen_timing_p6_prefix,
codegen_timing_p6_opcode,
codegen_timing_p6_block_start,
codegen_timing_p6_block_end,
codegen_timing_p6_jump_cycles
};