qemu-qemu/target/i386/whpx/whpx-all.c

/*
 * QEMU Windows Hypervisor Platform accelerator (WHPX)
 *
 * Copyright Microsoft Corp. 2017
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "system/address-spaces.h"
#include "system/ioport.h"
#include "gdbstub/helpers.h"
#include "qemu/accel.h"
#include "accel/accel-ops.h"
#include "system/memory.h"
#include "system/whpx.h"
#include "system/cpus.h"
#include "system/runstate.h"
#include "qemu/main-loop.h"
#include "hw/core/boards.h"
#include "hw/intc/ioapic.h"
#include "hw/i386/apic_internal.h"
#include "qemu/error-report.h"
#include "qapi/error.h"
#include "qapi/qapi-types-common.h"
#include "qapi/qapi-visit-common.h"
#include "migration/blocker.h"
#include "host-cpu.h"
#include "accel/accel-cpu-target.h"
#include <winerror.h>

#include "system/whpx-internal.h"
#include "system/whpx-accel-ops.h"
#include "system/whpx-all.h"
#include "system/whpx-common.h"

#include "emulate/x86_decode.h"
#include "emulate/x86_emu.h"
#include "emulate/x86_flags.h"
#include "emulate/x86_mmu.h"
#include "trace.h"

#include <winhvplatform.h>

/*
 * APIC bus frequency constant (200,000,000; presumably in Hz -- the unit is
 * not established by the code visible in this part of the file).
 */
#define HYPERV_APIC_BUS_FREQUENCY      (200000000ULL)
/*
 * for kernel-irqchip=off
 *
 * NOTE(review): judging by its name this is the index of a Hyper-V synthetic
 * MSR reporting the APIC frequency; confirm against its users.
 */
#define HV_X64_MSR_APIC_FREQUENCY       0x40000023

/*
 * Names of the registers transferred in bulk between QEMU and the hypervisor
 * by whpx_set_registers() and whpx_get_registers().
 *
 * The order of the entries is significant: both functions walk this table
 * with a running index and assert that the name found at each position is
 * the one they expect. The value buffer (struct whpx_register_set) is sized
 * from this table, so adding or removing an entry requires updating both
 * functions accordingly.
 */
static const WHV_REGISTER_NAME whpx_register_names[] = {

    /* X64 General purpose registers */
    WHvX64RegisterRax,
    WHvX64RegisterRcx,
    WHvX64RegisterRdx,
    WHvX64RegisterRbx,
    WHvX64RegisterRsp,
    WHvX64RegisterRbp,
    WHvX64RegisterRsi,
    WHvX64RegisterRdi,
    WHvX64RegisterR8,
    WHvX64RegisterR9,
    WHvX64RegisterR10,
    WHvX64RegisterR11,
    WHvX64RegisterR12,
    WHvX64RegisterR13,
    WHvX64RegisterR14,
    WHvX64RegisterR15,
    WHvX64RegisterRip,
    WHvX64RegisterRflags,

    /* X64 Segment registers */
    WHvX64RegisterEs,
    WHvX64RegisterCs,
    WHvX64RegisterSs,
    WHvX64RegisterDs,
    WHvX64RegisterFs,
    WHvX64RegisterGs,
    WHvX64RegisterLdtr,
    WHvX64RegisterTr,

    /* X64 Table registers */
    WHvX64RegisterIdtr,
    WHvX64RegisterGdtr,

    /* X64 Control Registers */
    WHvX64RegisterCr0,
    WHvX64RegisterCr2,
    WHvX64RegisterCr3,
    WHvX64RegisterCr4,
    WHvX64RegisterCr8,

    /* X64 Debug Registers (currently not synchronised) */
    /*
     * WHvX64RegisterDr0,
     * WHvX64RegisterDr1,
     * WHvX64RegisterDr2,
     * WHvX64RegisterDr3,
     * WHvX64RegisterDr6,
     * WHvX64RegisterDr7,
     */

    /* X64 Floating Point and Vector Registers */
    WHvX64RegisterXmm0,
    WHvX64RegisterXmm1,
    WHvX64RegisterXmm2,
    WHvX64RegisterXmm3,
    WHvX64RegisterXmm4,
    WHvX64RegisterXmm5,
    WHvX64RegisterXmm6,
    WHvX64RegisterXmm7,
    WHvX64RegisterXmm8,
    WHvX64RegisterXmm9,
    WHvX64RegisterXmm10,
    WHvX64RegisterXmm11,
    WHvX64RegisterXmm12,
    WHvX64RegisterXmm13,
    WHvX64RegisterXmm14,
    WHvX64RegisterXmm15,
    WHvX64RegisterFpMmx0,
    WHvX64RegisterFpMmx1,
    WHvX64RegisterFpMmx2,
    WHvX64RegisterFpMmx3,
    WHvX64RegisterFpMmx4,
    WHvX64RegisterFpMmx5,
    WHvX64RegisterFpMmx6,
    WHvX64RegisterFpMmx7,
    WHvX64RegisterFpControlStatus,
    WHvX64RegisterXmmControlStatus,

    /* X64 MSRs */
    WHvX64RegisterEfer,
#ifdef TARGET_X86_64
    WHvX64RegisterKernelGsBase,
#endif
    WHvX64RegisterApicBase,
    /* WHvX64RegisterPat, */
    WHvX64RegisterSysenterCs,
    WHvX64RegisterSysenterEip,
    WHvX64RegisterSysenterEsp,
    WHvX64RegisterStar,
#ifdef TARGET_X86_64
    WHvX64RegisterLstar,
    WHvX64RegisterCstar,
    WHvX64RegisterSfmask,
#endif

    /* Interrupt / Event Registers (currently not synchronised) */
    /*
     * WHvRegisterPendingInterruption,
     * WHvRegisterInterruptState,
     * WHvRegisterPendingEvent0,
     * WHvRegisterPendingEvent1
     * WHvX64RegisterDeliverabilityNotifications,
     */
};

/*
 * Reduced register list fetched by whpx_get_registers_for_vmexit(): only the
 * 16 general purpose registers. That function takes RIP and RFLAGS from the
 * saved exit context instead of querying them.
 *
 * The order must match the first 16 entries of whpx_register_names, because
 * the values are copied to env->regs[] by position.
 */
static const WHV_REGISTER_NAME whpx_register_names_for_vmexit[] = {
    /* X64 General purpose registers */
    WHvX64RegisterRax,
    WHvX64RegisterRcx,
    WHvX64RegisterRdx,
    WHvX64RegisterRbx,
    WHvX64RegisterRsp,
    WHvX64RegisterRbp,
    WHvX64RegisterRsi,
    WHvX64RegisterRdi,
    WHvX64RegisterR8,
    WHvX64RegisterR9,
    WHvX64RegisterR10,
    WHvX64RegisterR11,
    WHvX64RegisterR12,
    WHvX64RegisterR13,
    WHvX64RegisterR14,
    WHvX64RegisterR15,
};

/*
 * Value buffer used for bulk register transfers: one WHV_REGISTER_VALUE per
 * entry of whpx_register_names, stored at the same index as the name.
 */
struct whpx_register_set {
    WHV_REGISTER_VALUE values[RTL_NUMBER_OF(whpx_register_names)];
};

/*
 * The current implementation of instruction stepping sets the TF flag
 * in RFLAGS, causing the CPU to raise an INT1 after each instruction.
 * This corresponds to the WHvX64ExceptionTypeDebugTrapOrFault exception.
 *
 * This approach has a few limitations:
 *     1. Stepping over a PUSHF/SAHF instruction will save the TF flag
 *        along with the other flags, possibly restoring it later. It would
 *        result in another INT1 when the flags are restored, triggering
 *        a stop in gdb that could be cleared by doing another step.
 *
 *        Stepping over a POPF/LAHF instruction will let it overwrite the
 *        TF flags, ending the stepping mode.
 *
 *     2. Stepping over an instruction raising an exception (e.g. INT, DIV,
 *        or anything that could result in a page fault) will save the flags
 *        to the stack, clear the TF flag, and let the guest execute the
 *        handler. Normally, the guest will restore the original flags,
 *        that will continue single-stepping.
 *
 *     3. Debuggers running on the guest may wish to set TF to do instruction
 *        stepping. INT1 events generated by it would be intercepted by us,
 *        as long as the gdb is connected to QEMU.
 *
 * In practice this means that:
 *     1. Stepping through flags-modifying instructions may cause gdb to
 *        continue or stop in unexpected places. This will be fully recoverable
 *        and will not crash the target.
 *
 *     2. Stepping over an instruction that triggers an exception will step
 *        over the exception handler, not into it.
 *
 *     3. Debugging the guest via gdb, while running debugger on the guest
 *        at the same time may lead to unexpected effects. Removing all
 *        breakpoints set via QEMU will prevent any further interference
 *        with the guest-level debuggers.
 *
 * The limitations can be addressed as shown below:
 *     1. PUSHF/SAHF/POPF/LAHF/IRET instructions can be emulated instead of
 *        stepping through them. The exact semantics of the instructions is
 *        defined in the "Combined Volume Set of Intel 64 and IA-32
 *        Architectures Software Developer's Manuals", however it involves a
 *        fair amount of corner cases due to compatibility with real mode,
 *        virtual 8086 mode, and differences between 64-bit and 32-bit modes.
 *
 *     2. We could step into the guest's exception handlers using the following
 *        sequence:
 *          a. Temporarily enable catching of all exception types via
 *             whpx_set_exception_exit_bitmap().
 *          b. Once an exception is intercepted, read the IDT/GDT and locate
 *             the original handler.
 *          c. Patch the original handler, injecting an INT3 at the beginning.
 *          d. Update the exception exit bitmap to only catch the
 *             WHvX64ExceptionTypeBreakpointTrap exception.
 *          e. Let the affected CPU run in the exclusive mode.
 *          f. Restore the original handler and the exception exit bitmap.
 *        Note that handling all corner cases related to IDT/GDT is harder
 *        than it may seem. See x86_cpu_get_phys_page_attrs_debug() for a
 *        rough idea.
 *
 *     3. In order to properly support guest-level debugging in parallel with
 *        the QEMU-level debugging, we would need to be able to pass some INT1
 *        events to the guest. This could be done via the following methods:
 *          a. Using the WHvRegisterPendingEvent register. As of Windows 21H1,
 *             it seems to only work for interrupts and not software
 *             exceptions.
 *          b. Locating and patching the original handler by parsing IDT/GDT.
 *             This involves relatively complex logic outlined in the previous
 *             paragraph.
 *          c. Emulating the exception invocation (i.e. manually updating RIP,
 *             RFLAGS, and pushing the old values to stack). This is even more
 *             complicated than the previous option, since it involves checking
 *             CPL, gate attributes, and doing various adjustments depending
 *             on the current CPU mode, whether the CPL is changing, etc.
 */
/* Instruction stepping modes (see the discussion of stepping above). */
typedef enum WhpxStepMode {
    /* Zero value: no stepping mode selected */
    WHPX_STEP_NONE = 0,
    /* Halt other VCPUs */
    WHPX_STEP_EXCLUSIVE,
} WhpxStepMode;

/*
 * NOTE(review): not referenced in this part of the file; presumably the
 * highest vCPU index in use -- confirm against its writers.
 */
static uint32_t max_vcpu_index;
/* XSAVE feature set; its XsaveSupport bit is what whpx_has_xsave() reports. */
static WHV_PROCESSOR_XSAVE_FEATURES whpx_xsave_cap;

/* Tell whether the cached XSAVE feature set has the XsaveSupport bit set. */
static bool whpx_has_xsave(void)
{
    const bool xsave_supported = whpx_xsave_cap.XsaveSupport;

    return xsave_supported;
}

/*
 * Convert a QEMU segment cache entry into the hypervisor's segment register
 * layout.
 *
 * @qs:  QEMU segment to convert.
 * @v86: non-zero to ignore the QEMU descriptor flags and emit the fixed
 *       attribute set used below (type 3, present, DPL 3, non-system).
 * @r86: accepted but currently has no effect on the result.
 *
 * Returns the converted segment register by value.
 */
static WHV_X64_SEGMENT_REGISTER whpx_seg_q2h(const SegmentCache *qs, int v86,
                                             int r86)
{
    WHV_X64_SEGMENT_REGISTER seg;

    /* No real-mode specific adjustment is applied to the segment. */
    (void)r86;

    seg.Selector = qs->selector;
    seg.Base = qs->base;
    seg.Limit = qs->limit;

    if (!v86) {
        /* The attribute bits are the QEMU flags starting at the type field. */
        seg.Attributes = qs->flags >> DESC_TYPE_SHIFT;
        return seg;
    }

    /* Clear every attribute first, then set the individual bit-fields. */
    seg.Attributes = 0;
    seg.SegmentType = 3;
    seg.Present = 1;
    seg.DescriptorPrivilegeLevel = 3;
    seg.NonSystemSegment = 1;

    return seg;
}

/*
 * Convert a hypervisor segment register into a QEMU segment cache entry.
 * This is the inverse of the non-v86 path of whpx_seg_q2h(): the attribute
 * word is shifted back up to the position of the descriptor type field.
 */
static SegmentCache whpx_seg_h2q(const WHV_X64_SEGMENT_REGISTER *hs)
{
    SegmentCache cache;
    const uint32_t attributes = (uint32_t)hs->Attributes;

    cache.selector = hs->Selector;
    cache.base = hs->Base;
    cache.limit = hs->Limit;
    cache.flags = attributes << DESC_TYPE_SHIFT;

    return cache;
}

/* X64 Extended Control Registers */
/*
 * Write QEMU's XCR0 value to the virtual processor.
 *
 * Does nothing when whpx_has_xsave() reports no XSAVE support. A failing
 * hypervisor call is reported with error_report() and otherwise ignored.
 */
static void whpx_set_xcrs(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;
    WHV_REGISTER_NAME name = WHvX64RegisterXCr0;
    WHV_REGISTER_VALUE value;
    HRESULT hr;

    if (whpx_has_xsave()) {
        /* Only xcr0 is supported by the hypervisor currently */
        value.Reg64 = cpu_env(cpu)->xcr0;
        hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
            whpx->partition, cpu->cpu_index, &name, 1, &value);
        if (FAILED(hr)) {
            error_report("WHPX: Failed to set register xcr0, hr=%08lx", hr);
        }
    }
}

/*
 * Write QEMU's TSC value to the virtual processor.
 *
 * Returns 0 on success and -1 if the TSC register could not be written.
 * A failure to suspend the partition beforehand only produces a warning.
 */
static int whpx_set_tsc(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;
    WHV_REGISTER_NAME name = WHvX64RegisterTsc;
    WHV_REGISTER_VALUE value;
    HRESULT hr;

    /*
     * Suspend the partition prior to setting the TSC to reduce the variance
     * in TSC across vCPUs. When the first vCPU runs post suspend, the
     * partition is automatically resumed.
     *
     * The suspend entry point is optional, hence the pointer check.
     */
    if (whp_dispatch.WHvSuspendPartitionTime) {
        hr = whp_dispatch.WHvSuspendPartitionTime(whpx->partition);

        /*
         * Unable to suspend partition while setting TSC is not a fatal
         * error. It just increases the likelihood of TSC variance between
         * vCPUs and some guest OS are able to handle that just fine.
         */
        if (FAILED(hr)) {
            warn_report("WHPX: Failed to suspend partition, hr=%08lx", hr);
        }
    }

    value.Reg64 = cpu_env(cpu)->tsc;
    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index, &name, 1, &value);
    if (!FAILED(hr)) {
        return 0;
    }

    error_report("WHPX: Failed to set TSC, hr=%08lx", hr);
    return -1;
}

/*
 * The CR8 register in the CPU is mapped to the TPR register of the APIC,
 * however, they use a slightly different encoding. Specifically:
 *
 *     APIC.TPR[bits 7:4] = CR8[bits 3:0]
 *
 * This mechanism is described in section 10.8.6.1 of Volume 3 of Intel 64
 * and IA-32 Architectures Software Developer's Manual.
 *
 * The functions below translate the value of CR8 to TPR and vice versa.
 */

/* Translate an APIC TPR value to CR8 encoding by dropping its low 4 bits. */
static uint64_t whpx_apic_tpr_to_cr8(uint64_t tpr)
{
    const uint64_t cr8 = tpr / 16;

    return cr8;
}

/* Translate a CR8 value to APIC TPR encoding by moving it up by 4 bits. */
static uint64_t whpx_cr8_to_apic_tpr(uint64_t cr8)
{
    const uint64_t tpr = cr8 * 16;

    return tpr;
}

/*
 * Copy QEMU's CPU state into the WHPX virtual processor.
 *
 * @cpu:   vCPU to update; it must be stopped, or the caller must be running
 *         on its thread (asserted).
 * @level: how much state to write:
 *         - WHPX_LEVEL_FAST_RUNTIME_STATE writes only the general purpose
 *           registers, RIP and RFLAGS;
 *         - any higher level writes every register listed in
 *           whpx_register_names, plus XCR0 via whpx_set_xcrs();
 *         - WHPX_LEVEL_RESET_STATE and above additionally write the TSC.
 *
 * The values are staged in a buffer indexed in parallel with
 * whpx_register_names; the assertions check that the running index still
 * points at the expected register name. Failures of the hypervisor call are
 * reported with error_report() and not propagated.
 */
void whpx_set_registers(CPUState *cpu, WHPXStateLevel level)
{
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    struct whpx_register_set vcxt;
    HRESULT hr;
    int idx;
    int idx_next;
    int i;
    int v86, r86;

    assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));

    /*
     * Following MSRs have side effects on the guest or are too heavy for
     * runtime. Limit them to full state update.
     */
    if (level >= WHPX_LEVEL_RESET_STATE) {
        whpx_set_tsc(cpu);
    }

    /* Slots that are never filled in below are sent as zero. */
    memset(&vcxt, 0, sizeof(struct whpx_register_set));

    v86 = (env->eflags & VM_MASK);
    r86 = !(env->cr[0] & CR0_PE_MASK);

    vcpu->tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));
    vcpu->apic_base = cpu_get_apic_base(x86_cpu->apic_state);

    /*
     * Indexes for first 16 registers match between HV and QEMU definitions.
     * The table always reserves 16 slots for them, even if CPU_NB_REGS is
     * smaller, so continue from slot 16 regardless of where the loop ends.
     */
    idx_next = 16;
    for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
        vcxt.values[idx].Reg64 = (uint64_t)env->regs[idx];
    }
    idx = idx_next;

    /* Same goes for RIP and RFLAGS */
    assert(whpx_register_names[idx] == WHvX64RegisterRip);
    vcxt.values[idx++].Reg64 = env->eip;

    assert(whpx_register_names[idx] == WHvX64RegisterRflags);
    lflags_to_rflags(env);
    vcxt.values[idx++].Reg64 = env->eflags;
    /* Compares the running index with the enumerator's numeric value. */
    assert(idx == WHvX64RegisterEs);

    /*
     * Skip the remaining registers for synchronisation after MMIO accesses
     * as they're not going to be modified in that case.
     */
    if (level > WHPX_LEVEL_FAST_RUNTIME_STATE) {

        /* Translate 6+4 segment registers. HV and QEMU order matches  */
        for (i = 0; i < 6; i += 1, idx += 1) {
            vcxt.values[idx].Segment = whpx_seg_q2h(&env->segs[i], v86, r86);
        }

        assert(idx == WHvX64RegisterLdtr);
        vcxt.values[idx++].Segment = whpx_seg_q2h(&env->ldt, 0, 0);

        assert(idx == WHvX64RegisterTr);
        vcxt.values[idx++].Segment = whpx_seg_q2h(&env->tr, 0, 0);

        assert(idx == WHvX64RegisterIdtr);
        vcxt.values[idx].Table.Base = env->idt.base;
        vcxt.values[idx].Table.Limit = env->idt.limit;
        idx += 1;

        assert(idx == WHvX64RegisterGdtr);
        vcxt.values[idx].Table.Base = env->gdt.base;
        vcxt.values[idx].Table.Limit = env->gdt.limit;
        idx += 1;

        /* CR0, 2, 3, 4, 8 */
        assert(whpx_register_names[idx] == WHvX64RegisterCr0);
        vcxt.values[idx++].Reg64 = env->cr[0];
        assert(whpx_register_names[idx] == WHvX64RegisterCr2);
        vcxt.values[idx++].Reg64 = env->cr[2];
        assert(whpx_register_names[idx] == WHvX64RegisterCr3);
        vcxt.values[idx++].Reg64 = env->cr[3];
        assert(whpx_register_names[idx] == WHvX64RegisterCr4);
        vcxt.values[idx++].Reg64 = env->cr[4];
        assert(whpx_register_names[idx] == WHvX64RegisterCr8);
        vcxt.values[idx++].Reg64 = vcpu->tpr;

        /* 8 Debug Registers - Skipped */

        /*
         * Extended control registers needs to be handled separately depending
         * on whether xsave is supported/enabled or not.
         */
        whpx_set_xcrs(cpu);

        /*
         * 16 XMM registers.
         *
         * The table holds exactly 16 XMM slots, so never copy more than that
         * even if env->xmm_regs has more entries: going further would spill
         * into the FP/MSR slots that follow, and the upper halves of the FP
         * slots are not rewritten afterwards.
         */
        assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
        idx_next = idx + 16;
        for (i = 0;
             i < 16 && i < (int)(sizeof(env->xmm_regs) / sizeof(ZMMReg));
             i += 1, idx += 1) {
            vcxt.values[idx].Reg128.Low64 = env->xmm_regs[i].ZMM_Q(0);
            vcxt.values[idx].Reg128.High64 = env->xmm_regs[i].ZMM_Q(1);
        }
        idx = idx_next;

        /* 8 FP registers; only the low 64 bits are transferred */
        assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
        for (i = 0; i < 8; i += 1, idx += 1) {
            vcxt.values[idx].Fp.AsUINT128.Low64 = env->fpregs[i].mmx.MMX_Q(0);
            /* vcxt.values[idx].Fp.AsUINT128.High64 =
                       env->fpregs[i].mmx.MMX_Q(1);
            */
        }

        /* FP control status register */
        assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
        vcxt.values[idx].FpControlStatus.FpControl = env->fpuc;
        /* Bits 11..13 of the status word carry the stack top (fpstt). */
        vcxt.values[idx].FpControlStatus.FpStatus =
            (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
        vcxt.values[idx].FpControlStatus.FpTag = 0;
        /* The tag bit sent for register i is the inverse of env->fptags[i]. */
        for (i = 0; i < 8; ++i) {
            vcxt.values[idx].FpControlStatus.FpTag |= (!env->fptags[i]) << i;
        }
        vcxt.values[idx].FpControlStatus.Reserved = 0;
        vcxt.values[idx].FpControlStatus.LastFpOp = env->fpop;
        vcxt.values[idx].FpControlStatus.LastFpRip = env->fpip;
        idx += 1;

        /* XMM control status register */
        assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
        vcxt.values[idx].XmmControlStatus.LastFpRdp = 0;
        vcxt.values[idx].XmmControlStatus.XmmStatusControl = env->mxcsr;
        vcxt.values[idx].XmmControlStatus.XmmStatusControlMask = 0x0000ffff;
        idx += 1;

        /* MSRs */
        assert(whpx_register_names[idx] == WHvX64RegisterEfer);
        vcxt.values[idx++].Reg64 = env->efer;
#ifdef TARGET_X86_64
        assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
        vcxt.values[idx++].Reg64 = env->kernelgsbase;
#endif

        assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
        vcxt.values[idx++].Reg64 = vcpu->apic_base;

        /* WHvX64RegisterPat - Skipped */

        assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
        vcxt.values[idx++].Reg64 = env->sysenter_cs;
        assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
        vcxt.values[idx++].Reg64 = env->sysenter_eip;
        assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
        vcxt.values[idx++].Reg64 = env->sysenter_esp;
        assert(whpx_register_names[idx] == WHvX64RegisterStar);
        vcxt.values[idx++].Reg64 = env->star;
#ifdef TARGET_X86_64
        assert(whpx_register_names[idx] == WHvX64RegisterLstar);
        vcxt.values[idx++].Reg64 = env->lstar;
        assert(whpx_register_names[idx] == WHvX64RegisterCstar);
        vcxt.values[idx++].Reg64 = env->cstar;
        assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
        vcxt.values[idx++].Reg64 = env->fmask;
#endif

        /* Interrupt / Event Registers - Skipped */

        assert(idx == RTL_NUMBER_OF(whpx_register_names));
    }

    /*
     * idx is the number of leading table entries that were filled in: 18 for
     * the fast runtime level, the whole table otherwise.
     */
    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index,
        whpx_register_names,
        idx,
        &vcxt.values[0]);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set virtual processor context, hr=%08lx",
                     hr);
    }
}

/*
 * Read the TSC from the virtual processor into QEMU's CPU state.
 *
 * Returns 0 on success; on failure reports the error, leaves the cached TSC
 * untouched and returns -1.
 */
static int whpx_get_tsc(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;
    WHV_REGISTER_NAME name = WHvX64RegisterTsc;
    WHV_REGISTER_VALUE value;
    HRESULT hr;

    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index, &name, 1, &value);
    if (!FAILED(hr)) {
        cpu_env(cpu)->tsc = value.Reg64;
        return 0;
    }

    error_report("WHPX: Failed to get TSC, hr=%08lx", hr);
    return -1;
}

/* X64 Extended Control Registers */
/*
 * Read XCR0 from the virtual processor into QEMU's CPU state.
 *
 * Does nothing when whpx_has_xsave() reports no XSAVE support. On failure
 * the error is reported and the cached XCR0 is left untouched.
 */
static void whpx_get_xcrs(CPUState *cpu)
{
    struct whpx_state *whpx = &whpx_global;
    WHV_REGISTER_NAME name = WHvX64RegisterXCr0;
    WHV_REGISTER_VALUE value;
    HRESULT hr;

    if (whpx_has_xsave()) {
        /* Only xcr0 is supported by the hypervisor currently */
        hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
            whpx->partition, cpu->cpu_index, &name, 1, &value);
        if (FAILED(hr)) {
            error_report("WHPX: Failed to get register xcr0, hr=%08lx", hr);
        } else {
            cpu_env(cpu)->xcr0 = value.Reg64;
        }
    }
}

/*
 * Lightweight register fetch used for WHPX_LEVEL_FAST_RUNTIME_STATE.
 *
 * Only the general purpose registers are queried from the hypervisor; RIP
 * and RFLAGS are taken from the exit context already saved in the vCPU
 * state. The hflags are recomputed afterwards.
 *
 * @cpu:   vCPU to read; it must be stopped, or the caller must be running
 *         on its thread (asserted).
 * @level: unused here.
 *
 * If the hypervisor call fails, the error is reported and env->regs[] is
 * left unchanged rather than being filled from an uninitialised buffer.
 */
static void whpx_get_registers_for_vmexit(CPUState *cpu, WHPXStateLevel level)
{
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    struct whpx_register_set vcxt;
    HRESULT hr;
    int idx;

    assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));

    /* The GPR copy below relies on the reduced table having 16 entries. */
    assert(RTL_NUMBER_OF(whpx_register_names_for_vmexit) == 16);

    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index,
        whpx_register_names_for_vmexit,
        RTL_NUMBER_OF(whpx_register_names_for_vmexit),
        &vcxt.values[0]);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to get virtual processor context, hr=%08lx",
                     hr);
    } else {
        /*
         * Indexes for first 16 registers match between HV and QEMU
         * definitions
         */
        for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
            env->regs[idx] = vcxt.values[idx].Reg64;
        }
    }

    /* RIP and RFLAGS come with the exit context; no extra query is needed. */
    env->eip = vcpu->exit_ctx.VpContext.Rip;
    env->eflags = vcpu->exit_ctx.VpContext.Rflags;
    rflags_to_lflags(env);

    x86_update_hflags(env);
}

/*
 * Copy the WHPX virtual processor state into QEMU's CPU state.
 *
 * @cpu:   vCPU to read; it must be stopped, or the caller must be running
 *         on its thread (asserted).
 * @level: WHPX_LEVEL_FAST_RUNTIME_STATE delegates to
 *         whpx_get_registers_for_vmexit(); any other level reads every
 *         register listed in whpx_register_names, plus XCR0 and (if the
 *         cached value is not marked valid) the TSC.
 *
 * The values are read from a buffer indexed in parallel with
 * whpx_register_names; the assertions check that the running index still
 * points at the expected register name.
 *
 * If the bulk hypervisor query fails, the error is reported and the function
 * returns without modifying the register state, instead of loading it from
 * an uninitialised buffer.
 */
void whpx_get_registers(CPUState *cpu, WHPXStateLevel level)
{
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    struct whpx_register_set vcxt;
    uint64_t tpr, apic_base;
    HRESULT hr;
    int idx;
    int idx_next;
    int i;

    assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));

    if (level == WHPX_LEVEL_FAST_RUNTIME_STATE) {
        whpx_get_registers_for_vmexit(cpu, level);
        return;
    }

    if (!env->tsc_valid) {
        whpx_get_tsc(cpu);
        /* The fetched TSC only stays valid while the VM is not running. */
        env->tsc_valid = !runstate_is_running();
    }

    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx->partition, cpu->cpu_index,
        whpx_register_names,
        RTL_NUMBER_OF(whpx_register_names),
        &vcxt.values[0]);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to get virtual processor context, hr=%08lx",
                     hr);
        /* vcxt holds no valid data; do not copy it into the CPU state. */
        return;
    }

    if (whpx_irqchip_in_kernel()) {
        /*
         * Fetch the TPR value from the emulated APIC. It may get overwritten
         * below with the value from CR8 returned by
         * WHvGetVirtualProcessorRegisters().
         */
        whpx_apic_get(x86_cpu->apic_state);
        vcpu->tpr = whpx_apic_tpr_to_cr8(
            cpu_get_apic_tpr(x86_cpu->apic_state));
    }

    /*
     * Indexes for first 16 registers match between HV and QEMU definitions.
     * The table always reserves 16 slots for them, even if CPU_NB_REGS is
     * smaller, so continue from slot 16 regardless of where the loop ends.
     */
    idx_next = 16;
    for (idx = 0; idx < CPU_NB_REGS; idx += 1) {
        env->regs[idx] = vcxt.values[idx].Reg64;
    }
    idx = idx_next;

    /* Same goes for RIP and RFLAGS */
    assert(whpx_register_names[idx] == WHvX64RegisterRip);
    env->eip = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterRflags);
    env->eflags = vcxt.values[idx++].Reg64;
    rflags_to_lflags(env);

    /* Translate 6+4 segment registers. HV and QEMU order matches  */
    assert(idx == WHvX64RegisterEs);
    for (i = 0; i < 6; i += 1, idx += 1) {
        env->segs[i] = whpx_seg_h2q(&vcxt.values[idx].Segment);
    }

    assert(idx == WHvX64RegisterLdtr);
    env->ldt = whpx_seg_h2q(&vcxt.values[idx++].Segment);
    assert(idx == WHvX64RegisterTr);
    env->tr = whpx_seg_h2q(&vcxt.values[idx++].Segment);
    assert(idx == WHvX64RegisterIdtr);
    env->idt.base = vcxt.values[idx].Table.Base;
    env->idt.limit = vcxt.values[idx].Table.Limit;
    idx += 1;
    assert(idx == WHvX64RegisterGdtr);
    env->gdt.base = vcxt.values[idx].Table.Base;
    env->gdt.limit = vcxt.values[idx].Table.Limit;
    idx += 1;

    /* CR0, 2, 3, 4, 8 */
    assert(whpx_register_names[idx] == WHvX64RegisterCr0);
    env->cr[0] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr2);
    env->cr[2] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr3);
    env->cr[3] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr4);
    env->cr[4] = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCr8);
    tpr = vcxt.values[idx++].Reg64;
    /* Only touch the APIC model when the priority actually changed. */
    if (tpr != vcpu->tpr) {
        vcpu->tpr = tpr;
        cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(tpr));
    }

    /* 8 Debug Registers - Skipped */

    /*
     * Extended control registers needs to be handled separately depending
     * on whether xsave is supported/enabled or not.
     */
    whpx_get_xcrs(cpu);

    /*
     * 16 XMM registers.
     *
     * The table holds exactly 16 XMM slots, so never read more than that
     * even if env->xmm_regs has more entries: going further would load the
     * extra entries from the unrelated FP/MSR slots that follow.
     */
    assert(whpx_register_names[idx] == WHvX64RegisterXmm0);
    idx_next = idx + 16;
    for (i = 0;
         i < 16 && i < (int)(sizeof(env->xmm_regs) / sizeof(ZMMReg));
         i += 1, idx += 1) {
        env->xmm_regs[i].ZMM_Q(0) = vcxt.values[idx].Reg128.Low64;
        env->xmm_regs[i].ZMM_Q(1) = vcxt.values[idx].Reg128.High64;
    }
    idx = idx_next;

    /* 8 FP registers; only the low 64 bits are transferred */
    assert(whpx_register_names[idx] == WHvX64RegisterFpMmx0);
    for (i = 0; i < 8; i += 1, idx += 1) {
        env->fpregs[i].mmx.MMX_Q(0) = vcxt.values[idx].Fp.AsUINT128.Low64;
        /* env->fpregs[i].mmx.MMX_Q(1) =
               vcxt.values[idx].Fp.AsUINT128.High64;
        */
    }

    /* FP control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterFpControlStatus);
    env->fpuc = vcxt.values[idx].FpControlStatus.FpControl;
    /* Bits 11..13 of the status word carry the stack top (fpstt). */
    env->fpstt = (vcxt.values[idx].FpControlStatus.FpStatus >> 11) & 0x7;
    env->fpus = vcxt.values[idx].FpControlStatus.FpStatus & ~0x3800;
    /* env->fptags[i] is the inverse of the tag bit received for register i. */
    for (i = 0; i < 8; ++i) {
        env->fptags[i] = !((vcxt.values[idx].FpControlStatus.FpTag >> i) & 1);
    }
    env->fpop = vcxt.values[idx].FpControlStatus.LastFpOp;
    env->fpip = vcxt.values[idx].FpControlStatus.LastFpRip;
    idx += 1;

    /* XMM control status register */
    assert(whpx_register_names[idx] == WHvX64RegisterXmmControlStatus);
    env->mxcsr = vcxt.values[idx].XmmControlStatus.XmmStatusControl;
    idx += 1;

    /* MSRs */
    assert(whpx_register_names[idx] == WHvX64RegisterEfer);
    env->efer = vcxt.values[idx++].Reg64;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterKernelGsBase);
    env->kernelgsbase = vcxt.values[idx++].Reg64;
#endif

    assert(whpx_register_names[idx] == WHvX64RegisterApicBase);
    apic_base = vcxt.values[idx++].Reg64;
    /* Only touch the APIC model when the base actually changed. */
    if (apic_base != vcpu->apic_base) {
        vcpu->apic_base = apic_base;
        cpu_set_apic_base(x86_cpu->apic_state, vcpu->apic_base);
    }

    /* WHvX64RegisterPat - Skipped */

    assert(whpx_register_names[idx] == WHvX64RegisterSysenterCs);
    env->sysenter_cs = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEip);
    env->sysenter_eip = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterSysenterEsp);
    env->sysenter_esp = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterStar);
    env->star = vcxt.values[idx++].Reg64;
#ifdef TARGET_X86_64
    assert(whpx_register_names[idx] == WHvX64RegisterLstar);
    env->lstar = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterCstar);
    env->cstar = vcxt.values[idx++].Reg64;
    assert(whpx_register_names[idx] == WHvX64RegisterSfmask);
    env->fmask = vcxt.values[idx++].Reg64;
#endif

    /* Interrupt / Event Registers - Skipped */

    assert(idx == RTL_NUMBER_OF(whpx_register_names));

    if (whpx_irqchip_in_kernel()) {
        whpx_apic_get(x86_cpu->apic_state);
    }

    x86_update_hflags(env);
}

/*
 * Decode and execute one guest instruction in software.
 *
 * The fast-runtime register subset is fetched from the hypervisor before the
 * instruction is decoded and executed, and written back afterwards.
 *
 * @cpu:        vCPU on whose behalf the instruction is emulated.
 * @insn_bytes: raw instruction bytes.
 * @insn_len:   number of bytes available at @insn_bytes.
 *
 * Always returns 0.
 */
static int emulate_instruction(CPUState *cpu, const uint8_t *insn_bytes, size_t insn_len)
{
    CPUX86State *env = &X86_CPU(cpu)->env;
    x86_insn_stream insn_stream = { .bytes = insn_bytes, .len = insn_len };
    struct x86_decode decoded = { 0 };

    whpx_get_registers(cpu, WHPX_LEVEL_FAST_RUNTIME_STATE);

    decode_instruction_stream(env, &decoded, &insn_stream);
    exec_instruction(env, &decoded);

    whpx_set_registers(cpu, WHPX_LEVEL_FAST_RUNTIME_STATE);

    return 0;
}

/*
 * Handle a memory access exit by emulating the faulting instruction from the
 * instruction bytes supplied in the exit context.
 *
 * Returns 0 on success and -1 if emulation failed.
 */
static int whpx_handle_mmio(CPUState *cpu, WHV_RUN_VP_EXIT_CONTEXT *exit_ctx)
{
    WHV_MEMORY_ACCESS_CONTEXT *access = &exit_ctx->MemoryAccess;

    if (emulate_instruction(cpu, access->InstructionBytes,
                            access->InstructionByteCount) < 0) {
        error_report("failed to emulate mmio");
        return -1;
    }

    return 0;
}

/*
 * Perform @count consecutive accesses of @size bytes each on I/O port @port.
 *
 * @env:       unused.
 * @buffer:    data area; advanced by @size bytes after every access.
 * @direction: passed through to address_space_rw() as its last argument.
 *
 * The result of address_space_rw() is not checked.
 */
static void handle_io(CPUState *env, uint16_t port, void *buffer,
                  int direction, int size, int count)
{
    uint8_t *cursor = buffer;
    int remaining;

    for (remaining = count; remaining > 0; remaining--) {
        address_space_rw(&address_space_io, port, MEMTXATTRS_UNSPECIFIED,
                         cursor, size, direction);
        cursor += size;
    }
}

/*
 * Advance the guest RIP past the instruction that caused the exit, using the
 * RIP and instruction length recorded in the exit context.
 */
static void whpx_bump_rip(CPUState *cpu, WHV_RUN_VP_EXIT_CONTEXT *exit_ctx)
{
    const uint64_t next_rip = exit_ctx->VpContext.Rip +
                              exit_ctx->VpContext.InstructionLength;
    WHV_REGISTER_VALUE rip;

    /* Fetch first so the whole value is populated; Reg64 is then replaced. */
    whpx_get_reg(cpu, WHvX64RegisterRip, &rip);
    rip.Reg64 = next_rip;
    whpx_set_reg(cpu, WHvX64RegisterRip, rip);
}

/*
 * Handle an I/O port access exit.
 *
 * Non-string accesses are served directly: a read stores the value obtained
 * from the port into RAX (leaving the upper part of RAX untouched for 1- and
 * 2-byte accesses, zero-extending for 4-byte ones), a write sends the RAX
 * value recorded in the exit context. In both cases RIP is advanced past the
 * instruction. String accesses fall back to the instruction emulator.
 *
 * Returns 0 on success and -1 if instruction emulation failed.
 */
static int whpx_handle_portio(CPUState *cpu,
                              WHV_RUN_VP_EXIT_CONTEXT *exit_ctx)
{
    WHV_X64_IO_PORT_ACCESS_CONTEXT *io = &exit_ctx->IoPortAccess;
    CPUX86State *env = &X86_CPU(cpu)->env;

    if (!io->AccessInfo.StringOp) {
        if (io->AccessInfo.IsWrite) {
            /* OUT: the data to send is the RAX value captured at exit. */
            RAX(env) = io->Rax;
            handle_io(cpu, io->PortNumber, &RAX(env), 1, io->AccessInfo.AccessSize, 1);
            whpx_bump_rip(cpu, exit_ctx);
        } else {
            /* IN: merge the value read from the port into the current RAX. */
            uint64_t data = 0;
            WHV_REGISTER_VALUE rax;

            whpx_get_reg(cpu, WHvX64RegisterRax, &rax);
            handle_io(cpu, io->PortNumber, &data, 0, io->AccessInfo.AccessSize, 1);
            switch (io->AccessInfo.AccessSize) {
            case 1:
                rax.Reg8 = data;
                break;
            case 2:
                rax.Reg16 = data;
                break;
            case 4:
                rax.Reg64 = (uint32_t)data;
                break;
            default:
                rax.Reg64 = (uint64_t)data;
                break;
            }
            whpx_bump_rip(cpu, exit_ctx);
            whpx_set_reg(cpu, WHvX64RegisterRax, rax);
        }
        return 0;
    }

    /* String I/O: let the software emulator run the instruction. */
    if (emulate_instruction(cpu, io->InstructionBytes,
                            exit_ctx->VpContext.InstructionLength) < 0) {
        error_report("failed to emulate I/O port access");
        return -1;
    }

    return 0;
}

/*
 * Fill an emulator segment descriptor from a hypervisor segment register.
 *
 * @cpu:  unused.
 * @reg:  hypervisor segment register to convert.
 * @desc: descriptor to fill in.
 */
static void whpx_segment_to_x86_descriptor(CPUState *cpu, WHV_X64_SEGMENT_REGISTER* reg,
                                   struct x86_segment_descriptor *desc)
{
    uint32_t descriptor_limit;

    desc->g = reg->Granularity;

    /*
     * Hyper-V can return reg->Granularity == 0
     * with a higher limit than 0xfffff.
     *
     * Detect that case and set desc->g
     * with shifting the limit properly.
     */
    if (desc->g || reg->Limit > 0xfffff) {
        descriptor_limit = reg->Limit >> 12;
        desc->g = 1;
    } else {
        descriptor_limit = reg->Limit;
    }

    x86_set_segment_limit(desc, descriptor_limit);
    x86_set_segment_base(desc, reg->Base);

    /* The remaining attributes map one to one. */
    desc->p = reg->Present;
    desc->s = reg->NonSystemSegment;
    desc->type = reg->SegmentType;
    desc->dpl = reg->DescriptorPrivilegeLevel;
    desc->db = reg->Default;
    desc->l = reg->Long;
    desc->avl = reg->Available;
}

/*
 * Fetch the hypervisor view of segment register @seg into @reg.
 *
 * Values already present in the saved exit context are used where possible:
 * CS always, and DS/ES when the exit was caused by an I/O port access.
 * Anything else is queried with whpx_get_reg().
 */
static void whpx_read_segment_descriptor(CPUState *cpu, WHV_X64_SEGMENT_REGISTER* reg,
                                    X86Seg seg)
{
    AccelCPUState *vcpu = cpu->accel;
    WHV_REGISTER_VALUE fetched;
    bool is_io_exit;

    if (seg == R_CS) {
        *reg = vcpu->exit_ctx.VpContext.Cs;
        return;
    }

    is_io_exit = vcpu->exit_ctx.ExitReason == WHvRunVpExitReasonX64IoPortAccess;
    if (is_io_exit && seg == R_DS) {
        *reg = vcpu->exit_ctx.IoPortAccess.Ds;
        return;
    }
    if (is_io_exit && seg == R_ES) {
        *reg = vcpu->exit_ctx.IoPortAccess.Es;
        return;
    }

    /* The register name is formed by offsetting from ES by @seg. */
    whpx_get_reg(cpu, WHvX64RegisterEs + seg, &fetched);
    *reg = fetched.Segment;
}

/*
 * x86 emulator callback: fetch segment register @seg_idx of @cpu and
 * convert it into the descriptor layout stored in @desc.
 */
static void read_segment_descriptor(CPUState *cpu,
                                    struct x86_segment_descriptor *desc,
                                    enum X86Seg seg_idx)
{
    WHV_X64_SEGMENT_REGISTER whv_segment;

    whpx_read_segment_descriptor(cpu, &whv_segment, seg_idx);
    whpx_segment_to_x86_descriptor(cpu, &whv_segment, desc);
}

/* x86 emulator callback: CR0.PE as reported by the last exit context. */
static bool is_protected_mode(CPUState *cpu)
{
    return cpu->accel->exit_ctx.VpContext.ExecutionState.Cr0Pe == 1;
}

/* x86 emulator callback: EFER.LMA as reported by the last exit context. */
static bool is_long_mode(CPUState *cpu)
{
    return cpu->accel->exit_ctx.VpContext.ExecutionState.EferLma == 1;
}

/* x86 emulator callback: true when the last exit context reports CPL 3. */
static bool is_user_mode(CPUState *cpu)
{
    return cpu->accel->exit_ctx.VpContext.ExecutionState.Cpl == 3;
}

/*
 * x86 emulator callback: read control register number @cr (0, 2, 3, 4 or 8)
 * of @cpu. Any other register number aborts.
 */
static target_ulong read_cr(CPUState *cpu, int cr)
{
    WHV_REGISTER_NAME cr_name;
    WHV_REGISTER_VALUE cr_value;

    if (cr == 0) {
        cr_name = WHvX64RegisterCr0;
    } else if (cr == 2) {
        cr_name = WHvX64RegisterCr2;
    } else if (cr == 3) {
        cr_name = WHvX64RegisterCr3;
    } else if (cr == 4) {
        cr_name = WHvX64RegisterCr4;
    } else if (cr == 8) {
        cr_name = WHvX64RegisterCr8;
    } else {
        abort();
    }

    whpx_get_reg(cpu, cr_name, &cr_value);
    return cr_value.Reg64;
}

/* Callbacks handed to the shared x86 instruction emulator by WHPX. */
static const struct x86_emul_ops whpx_x86_emul_ops = {
    /* CPU mode queries */
    .is_protected_mode = is_protected_mode,
    .is_long_mode = is_long_mode,
    .is_user_mode = is_user_mode,
    /* Register state accessors */
    .read_cr = read_cr,
    .read_segment_descriptor = read_segment_descriptor,
    /* Port I/O */
    .handle_io = handle_io,
};

/*
 * Initializes the instruction decoder and registers the WHPX callback
 * table (whpx_x86_emul_ops) with the x86 emulator.
 */
static void whpx_init_emu(void)
{
    init_decoder();
    init_emu(&whpx_x86_emul_ops);
}

/*
 * Controls whether we should intercept various exceptions on the guest,
 * namely breakpoint/single-step events.
 *
 * The 'exceptions' argument accepts a bitmask, e.g:
 * (1 << WHvX64ExceptionTypeDebugTrapOrFault) | (...)
 *
 * Returns S_OK without touching the partition when the requested mask is
 * already in effect; otherwise returns the result of the last
 * WHvSetPartitionProperty() call. The cached mask is only updated when both
 * properties were set successfully.
 */
HRESULT whpx_set_exception_exit_bitmap(UINT64 exceptions)
{
    struct whpx_state *whpx = &whpx_global;
    WHV_PARTITION_PROPERTY prop;
    HRESULT hr;

    if (whpx->exception_exit_bitmap == exceptions) {
        return S_OK;
    }

    /*
     * Extended exits: MSR exits are always requested, exception exits only
     * when at least one exception is to be intercepted.
     */
    memset(&prop, 0, sizeof(prop));
    prop.ExtendedVmExits.X64MsrExit = 1;
    prop.ExtendedVmExits.ExceptionExit = (exceptions != 0) ? 1 : 0;

    hr = whp_dispatch.WHvSetPartitionProperty(
        whpx->partition,
        WHvPartitionPropertyCodeExtendedVmExits,
        &prop,
        sizeof(prop));
    if (FAILED(hr)) {
        error_report("WHPX: Failed to enable extended VM exits, hr=%08lx", hr);
        return hr;
    }

    /* Now select which exceptions generate an exit. */
    memset(&prop, 0, sizeof(prop));
    prop.ExceptionExitBitmap = exceptions;

    hr = whp_dispatch.WHvSetPartitionProperty(
        whpx->partition,
        WHvPartitionPropertyCodeExceptionExitBitmap,
        &prop,
        sizeof(prop));
    if (FAILED(hr)) {
        return hr;
    }

    whpx->exception_exit_bitmap = exceptions;
    return hr;
}


/*
 * This function is called before/after stepping over a single instruction.
 * It will update the CPU registers to arm/disarm the instruction stepping
 * accordingly.
 *
 * @cpu: the vCPU whose registers are updated.
 * @set: true to arm single-stepping (set RFLAGS.TF and the interrupt
 *       shadow), false to disarm it (clear both and drop a pending
 *       single-step debug exception).
 * @exit_context_rflags: optional pointer to the RFLAGS copy held in the exit
 *       context. When non-NULL it must match the register value read from
 *       the vCPU (asserted) and is updated with the new RFLAGS value.
 *
 * Returns S_OK on success, or the failing HRESULT after reporting it.
 */
static HRESULT whpx_vcpu_configure_single_stepping(CPUState *cpu,
    bool set,
    uint64_t *exit_context_rflags)
{
    WHV_REGISTER_NAME reg_name;
    WHV_REGISTER_VALUE reg_value;
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;

    /*
     * If we are trying to step over a single instruction, we need to set the
     * TF bit in rflags. Otherwise, clear it.
     */
    reg_name = WHvX64RegisterRflags;
    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx->partition,
        cpu->cpu_index,
        &reg_name,
        1,
        &reg_value);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to get rflags, hr=%08lx", hr);
        return hr;
    }

    if (exit_context_rflags) {
        assert(*exit_context_rflags == reg_value.Reg64);
    }

    if (set) {
        /* Raise WHvX64ExceptionTypeDebugTrapOrFault after each instruction */
        reg_value.Reg64 |= TF_MASK;
    } else {
        reg_value.Reg64 &= ~TF_MASK;
    }

    /* Keep the caller's exit-context copy in sync with the new value. */
    if (exit_context_rflags) {
        *exit_context_rflags = reg_value.Reg64;
    }

    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition,
        cpu->cpu_index,
        &reg_name,
        1,
        &reg_value);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set rflags,"
            " hr=%08lx",
            hr);
        return hr;
    }

    reg_name = WHvRegisterInterruptState;
    reg_value.Reg64 = 0;

    /* Suspend delivery of hardware interrupts during single-stepping. */
    reg_value.InterruptState.InterruptShadow = set;

    hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
        whpx->partition,
        cpu->cpu_index,
        &reg_name,
        1,
        &reg_value);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set InterruptState,"
            " hr=%08lx",
            hr);
        return hr;
    }

    if (!set) {
        /*
         * We have just finished stepping over a single instruction,
         * and intercepted the INT1 generated by it.
         * We need to now hide the INT1 from the guest,
         * as it would not be expecting it.
         */

        reg_name = WHvX64RegisterPendingDebugException;
        hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
            whpx->partition,
            cpu->cpu_index,
            &reg_name,
            1,
            &reg_value);

        if (FAILED(hr)) {
            error_report("WHPX: Failed to get pending debug exceptions,"
                         " hr=%08lx", hr);
            return hr;
        }

        /* Only write the register back when there is something to clear. */
        if (reg_value.PendingDebugException.SingleStep) {
            reg_value.PendingDebugException.SingleStep = 0;

            hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
                whpx->partition,
                cpu->cpu_index,
                &reg_name,
                1,
                &reg_value);

            if (FAILED(hr)) {
                error_report("WHPX: Failed to clear pending debug exceptions,"
                             " hr=%08lx", hr);
                return hr;
            }
        }

    }

    return S_OK;
}

/*
 * Linux uses int3 (0xCC) during startup (see int3_selftest()) and for
 * debugging user-mode applications. Since the WHPX API does not offer
 * an easy way to pass the intercepted exception back to the guest, we
 * resort to using INT1 instead, and let the guest always handle INT3.
 *
 * This single byte (0xF1) is what gets written into guest memory at each
 * breakpoint address.
 */
static const uint8_t whpx_breakpoint_instruction = 0xF1;

/*
 * The WHPX QEMU backend implements breakpoints by writing the INT1
 * instruction into memory (ignoring the DRx registers). This raises a few
 * issues that need to be carefully handled:
 *
 * 1. Although unlikely, other parts of QEMU may set multiple breakpoints
 *    at the same location, and later remove them in arbitrary order.
 *    This should not cause memory corruption, and should only remove the
 *    physical breakpoint instruction when the last QEMU breakpoint is gone.
 *
 * 2. Writing arbitrary virtual memory may fail if it's not mapped to a valid
 *    physical location. Hence, physically adding/removing a breakpoint can
 *    theoretically fail at any time. We need to keep track of it.
 *
 * The function below rebuilds a list of low-level breakpoints (one per
 * address, tracking the original instruction and any errors) from the list of
 * high-level breakpoints (set via cpu_breakpoint_insert()).
 *
 * In order to optimize performance, this function stores the list of
 * high-level breakpoints (a.k.a. CPU breakpoints) used to compute the
 * low-level ones, so that it won't be re-invoked until these breakpoints
 * change.
 *
 * Note that this function decides which breakpoints should be inserted into
 * memory, but doesn't actually do it. The memory accessing is done in
 * whpx_apply_breakpoints().
 *
 * @breakpoints: WHPX breakpoint bookkeeping; its address snapshot and
 *               low-level breakpoint collection are both replaced.
 * @cpu: the CPU whose breakpoint list is translated.
 * @cpu_breakpoint_count: number of entries in @cpu's breakpoint list, used
 *               to size the address snapshot and the new collection.
 */
void whpx_translate_cpu_breakpoints(
    struct whpx_breakpoints *breakpoints,
    CPUState *cpu,
    int cpu_breakpoint_count)
{
    CPUBreakpoint *bp;
    int cpu_bp_index = 0;

    breakpoints->original_addresses =
        g_renew(vaddr, breakpoints->original_addresses, cpu_breakpoint_count);

    breakpoints->original_address_count = cpu_breakpoint_count;

    /* Worst case: every old entry survives and every CPU breakpoint is new. */
    int max_breakpoints = cpu_breakpoint_count +
        (breakpoints->breakpoints ? breakpoints->breakpoints->used : 0);

    struct whpx_breakpoint_collection *new_breakpoints =
        g_malloc0(sizeof(struct whpx_breakpoint_collection)
                  + max_breakpoints * sizeof(struct whpx_breakpoint));

    new_breakpoints->allocated = max_breakpoints;
    new_breakpoints->used = 0;

    /*
     * 1. Preserve all old breakpoints that could not be automatically
     * cleared when the CPU got stopped.
     */
    if (breakpoints->breakpoints) {
        int i;
        for (i = 0; i < breakpoints->breakpoints->used; i++) {
            if (breakpoints->breakpoints->data[i].state != WHPX_BP_CLEARED) {
                new_breakpoints->data[new_breakpoints->used++] =
                    breakpoints->breakpoints->data[i];
            }
        }
    }

    /* 2. Map all CPU breakpoints to WHPX breakpoints */
    QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) {
        int i;
        bool found = false;

        /*
         * This will be used to detect changed CPU breakpoints later.
         * Never write past the snapshot array, even if the list holds more
         * entries than the caller announced.
         */
        if (cpu_bp_index < cpu_breakpoint_count) {
            breakpoints->original_addresses[cpu_bp_index++] = bp->pc;
        }

        for (i = 0; i < new_breakpoints->used; i++) {
            /*
             * WARNING: This loop has O(N^2) complexity, where N is the
             * number of breakpoints. It should not be a bottleneck in
             * real-world scenarios, since it only needs to run once after
             * the breakpoints have been modified.
             * If this ever becomes a concern, it can be optimized by storing
             * high-level breakpoint objects in a tree or hash map.
             */

            if (new_breakpoints->data[i].address == bp->pc) {
                /*
                 * There was already a breakpoint at this address.
                 *
                 * An entry that failed to clear still has the breakpoint
                 * instruction in memory, so it is simply considered set
                 * again. An entry that is already WHPX_BP_SET (e.g. because
                 * a previous CPU breakpoint at the same address was just
                 * matched) must stay that way: demoting it to
                 * WHPX_BP_SET_PENDING would make whpx_apply_breakpoints()
                 * re-read the "original" byte from memory that already holds
                 * the breakpoint instruction and lose the real one.
                 */
                if (new_breakpoints->data[i].state == WHPX_BP_CLEAR_PENDING) {
                    new_breakpoints->data[i].state = WHPX_BP_SET;
                }

                found = true;
                break;
            }
        }

        if (!found && new_breakpoints->used < new_breakpoints->allocated) {
            /* No WHPX breakpoint at this address. Create one. */
            new_breakpoints->data[new_breakpoints->used].address = bp->pc;
            new_breakpoints->data[new_breakpoints->used].state =
                WHPX_BP_SET_PENDING;
            new_breakpoints->used++;
        }
    }

    /*
     * Free the previous breakpoint list. This can be optimized by keeping
     * it as shadow buffer for the next computation instead of freeing
     * it immediately.
     */
    g_free(breakpoints->breakpoints);

    breakpoints->breakpoints = new_breakpoints;
}

/*
 * Physically inserts/removes the breakpoints by reading and writing the
 * physical memory, keeping a track of the failed attempts.
 *
 * Passing resuming=true  will try to set all previously unset breakpoints.
 * Passing resuming=false will remove all inserted ones.
 *
 * An entry whose memory access fails keeps its *_PENDING state so that the
 * operation is attempted again on a later call.
 */
void whpx_apply_breakpoints(
    struct whpx_breakpoint_collection *breakpoints,
    CPUState *cpu,
    bool resuming)
{
    int idx;

    if (!breakpoints) {
        return;
    }

    for (idx = 0; idx < breakpoints->used; idx++) {
        WhpxBreakpointState next = breakpoints->data[idx].state;
        int err;

        /* Pick the target state from the last known one. */
        if (resuming) {
            if (next == WHPX_BP_CLEARED) {
                next = WHPX_BP_SET_PENDING;
            } else if (next == WHPX_BP_CLEAR_PENDING) {
                /* Never got removed from memory: still in place. */
                next = WHPX_BP_SET;
            }
        } else {
            if (next == WHPX_BP_SET_PENDING) {
                /* Never made it into memory: nothing to remove. */
                next = WHPX_BP_CLEARED;
            } else if (next == WHPX_BP_SET) {
                next = WHPX_BP_CLEAR_PENDING;
            }
        }

        if (next == WHPX_BP_SET_PENDING) {
            /* Save the byte being replaced, then write the breakpoint. */
            err = cpu_memory_rw_debug(cpu,
                                      breakpoints->data[idx].address,
                                      &breakpoints->data[idx].original_instruction,
                                      1,
                                      false);
            if (!err) {
                err = cpu_memory_rw_debug(cpu,
                                          breakpoints->data[idx].address,
                                          (void *)&whpx_breakpoint_instruction,
                                          1,
                                          true);
            }
            if (!err) {
                next = WHPX_BP_SET;
            }
        }

        if (next == WHPX_BP_CLEAR_PENDING) {
            /* Put the saved byte back. */
            err = cpu_memory_rw_debug(cpu,
                                      breakpoints->data[idx].address,
                                      &breakpoints->data[idx].original_instruction,
                                      1,
                                      true);
            if (!err) {
                next = WHPX_BP_CLEARED;
            }
        }

        breakpoints->data[idx].state = next;
    }
}

/* Reports that guest debugging is supported on this architecture. */
bool whpx_arch_supports_guest_debug(void)
{
    return true;
}

/* Releases the emulator MMIO buffer owned by the vCPU's x86 state. */
void whpx_arch_destroy_vcpu(CPUState *cpu)
{
    g_free(X86_CPU(cpu)->env.emu_mmio_buf);
}

/*
 * Returns the address of the next instruction that is about to be executed.
 *
 * @exit_context_valid tells whether the RIP stored in the last exit context
 * can be trusted. Returns 0 if the register has to be queried from the
 * hypervisor and that query fails.
 */
static vaddr whpx_vcpu_get_pc(CPUState *cpu, bool exit_context_valid)
{
    struct whpx_state *whpx = &whpx_global;
    WHV_REGISTER_NAME rip_name = WHvX64RegisterRip;
    WHV_REGISTER_VALUE rip_value;
    HRESULT hr;

    /* The CPU registers have been modified by other parts of QEMU. */
    if (cpu->vcpu_dirty) {
        return cpu_env(cpu)->eip;
    }

    /*
     * The CPU registers have been modified neither by other parts of QEMU
     * nor by this port calling WHvSetVirtualProcessorRegisters().
     * This is the most common case.
     */
    if (exit_context_valid) {
        return cpu->accel->exit_ctx.VpContext.Rip;
    }

    /*
     * The CPU registers have been modified by a call to
     * WHvSetVirtualProcessorRegisters() and must be re-queried from
     * the target.
     */
    hr = whp_dispatch.WHvGetVirtualProcessorRegisters(
        whpx->partition,
        cpu->cpu_index,
        &rip_name,
        1,
        &rip_value);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to get PC, hr=%08lx", hr);
        return 0;
    }

    return rip_value.Reg64;
}

/*
 * Handles a HLT exit. Unless an event that wakes the CPU is already pending
 * (a hard interrupt with IF set, or an NMI), the CPU is marked halted.
 *
 * Returns 1 when the CPU was halted, 0 when it should keep running.
 */
static int whpx_handle_halt(CPUState *cpu)
{
    bool wake_pending;

    bql_lock();
    wake_pending = (cpu_test_interrupt(cpu, CPU_INTERRUPT_HARD) &&
                    (cpu_env(cpu)->eflags & IF_MASK)) ||
                   cpu_test_interrupt(cpu, CPU_INTERRUPT_NMI);
    if (!wake_pending) {
        cpu->exception_index = EXCP_HLT;
        cpu->halted = true;
    }
    bql_unlock();

    return wake_pending ? 0 : 1;
}

/*
 * Clears the HaltSuspend bit of the vCPU's internal activity state, if it
 * is set. The register is only written back when it actually changes.
 */
static void whpx_vcpu_kick_out_of_hlt(CPUState *cpu)
{
    WHV_REGISTER_VALUE activity;

    whpx_get_reg(cpu, WHvRegisterInternalActivityState, &activity);
    if (!activity.InternalActivity.HaltSuspend) {
        return;
    }

    activity.InternalActivity.HaltSuspend = 0;
    whpx_set_reg(cpu, WHvRegisterInternalActivityState, activity);
}

/*
 * Prepares the vCPU for the next WHvRunVirtualProcessor() call.
 *
 * Under the BQL, pending QEMU-side interrupt requests are turned into
 * register updates (pending interruption/event, CR8, deliverability
 * notification), which are collected in reg_names[]/reg_values[] and
 * written with a single WHvSetVirtualProcessorRegisters() call afterwards.
 * At most one pending interruption/event, one CR8 update and one
 * notification request are queued, which is what the 3-entry arrays are
 * sized for.
 *
 * May set cpu->exit_request to force the vCPU out of its run loop.
 */
static void whpx_vcpu_pre_run(CPUState *cpu)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    int irq;
    uint8_t tpr;
    WHV_X64_PENDING_INTERRUPTION_REGISTER new_int;
    UINT32 reg_count = 0;
    WHV_REGISTER_VALUE reg_values[3];
    WHV_REGISTER_NAME reg_names[3];

    memset(&new_int, 0, sizeof(new_int));
    memset(reg_values, 0, sizeof(reg_values));

    bql_lock();

    /*
     * Inject NMI.
     * An SMI request is acknowledged (cleared) here but not injected.
     * Note that new_int is only written to the vCPU in the
     * !whpx_irqchip_in_kernel() branch further down.
     */
    if (!vcpu->interruption_pending &&
        cpu_test_interrupt(cpu, CPU_INTERRUPT_NMI | CPU_INTERRUPT_SMI)) {
        if (cpu_test_interrupt(cpu, CPU_INTERRUPT_NMI)) {
            cpu_reset_interrupt(cpu, CPU_INTERRUPT_NMI);
            vcpu->interruptable = false;
            new_int.InterruptionType = WHvX64PendingNmi;
            new_int.InterruptionPending = 1;
            new_int.InterruptionVector = 2;
        }
        if (cpu_test_interrupt(cpu, CPU_INTERRUPT_SMI)) {
            cpu_reset_interrupt(cpu, CPU_INTERRUPT_SMI);
        }
    }

    /*
     * Force the VCPU out of its inner loop to process any INIT requests or
     * commit pending TPR access.
     */
    if (cpu_test_interrupt(cpu, CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
        if (cpu_test_interrupt(cpu, CPU_INTERRUPT_INIT) &&
            !(env->hflags & HF_SMM_MASK)) {
            qatomic_set(&cpu->exit_request, true);
        }
        if (cpu_test_interrupt(cpu, CPU_INTERRUPT_TPR)) {
            qatomic_set(&cpu->exit_request, true);
        }
    }

    /* Get pending hard interruption or replay one that was overwritten */
    if (!whpx_irqchip_in_kernel()) {
        /*
         * A new hard interrupt is only fetched when nothing is pending
         * already, the vCPU is not in an interrupt shadow and IF is set.
         */
        if (!vcpu->interruption_pending &&
            vcpu->interruptable && (env->eflags & IF_MASK)) {
            assert(!new_int.InterruptionPending);
            if (cpu_test_interrupt(cpu, CPU_INTERRUPT_HARD)) {
                cpu_reset_interrupt(cpu, CPU_INTERRUPT_HARD);
                irq = cpu_get_pic_interrupt(env);
                if (irq >= 0) {
                    new_int.InterruptionType = WHvX64PendingInterrupt;
                    new_int.InterruptionPending = 1;
                    new_int.InterruptionVector = irq;
                }
            }
        }

        /* Setup interrupt state if new one was prepared */
        if (new_int.InterruptionPending) {
            reg_values[reg_count].PendingInterruption = new_int;
            reg_names[reg_count] = WHvRegisterPendingInterruption;
            reg_count += 1;
        }
    } else if (vcpu->ready_for_pic_interrupt &&
               cpu_test_interrupt(cpu, CPU_INTERRUPT_HARD)) {
        /*
         * In-kernel irqchip: deliver the PIC interrupt as an external
         * interrupt event, but only after an interrupt-window exit marked
         * the vCPU as ready (ready_for_pic_interrupt).
         */
        cpu_reset_interrupt(cpu, CPU_INTERRUPT_HARD);
        irq = cpu_get_pic_interrupt(env);
        if (irq >= 0) {
            reg_names[reg_count] = WHvRegisterPendingEvent;
            reg_values[reg_count].ExtIntEvent = (WHV_X64_PENDING_EXT_INT_EVENT)
            {
                .EventPending = 1,
                .EventType = WHvX64PendingEventExtInt,
                .Vector = irq,
            };
            reg_count += 1;
            /*
             * When the Hyper-V APIC is enabled, to get out of HLT we
             * either have to request an interrupt or manually get it away
             * from HLT.
             *
             * We also manually do inject some interrupts via WHvRegisterPendingEvent
             * instead of WHVRequestInterrupt, which does not reset the HLT state.
             */
            /*
             * NOTE(review): this arm is the else of the
             * !whpx_irqchip_in_kernel() test above, so the check below
             * looks redundant - confirm before removing.
             */
            if (whpx_irqchip_in_kernel()) {
                whpx_vcpu_kick_out_of_hlt(cpu);
            }
        }
     }

    /* Sync the TPR to the CR8 if was modified during the intercept */
    tpr = whpx_apic_tpr_to_cr8(cpu_get_apic_tpr(x86_cpu->apic_state));
    if (tpr != vcpu->tpr) {
        vcpu->tpr = tpr;
        reg_values[reg_count].Reg64 = tpr;
        qatomic_set(&cpu->exit_request, true);
        reg_names[reg_count] = WHvX64RegisterCr8;
        reg_count += 1;
    }

    /*
     * Update the state of the interrupt delivery notification: request an
     * interrupt-window notification if a hard interrupt is still pending
     * and no notification has been registered yet.
     */
    if (!vcpu->window_registered &&
        cpu_test_interrupt(cpu, CPU_INTERRUPT_HARD)) {
        reg_values[reg_count].DeliverabilityNotifications =
            (WHV_X64_DELIVERABILITY_NOTIFICATIONS_REGISTER) {
                .InterruptNotification = 1
            };
        vcpu->window_registered = 1;
        reg_names[reg_count] = WHvX64RegisterDeliverabilityNotifications;
        reg_count += 1;
    }

    bql_unlock();
    /* The readiness flag is consumed by this call whether or not it was used. */
    vcpu->ready_for_pic_interrupt = false;

    /* Commit everything that was queued above in one call. */
    if (reg_count) {
        hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
            whpx->partition, cpu->cpu_index,
            reg_names, reg_count, reg_values);
        if (FAILED(hr)) {
            error_report("WHPX: Failed to set interrupt state registers,"
                         " hr=%08lx", hr);
        }
    }
}

/*
 * Copies the state reported by the last exit context back into QEMU:
 * RFLAGS, the TPR (when CR8 changed) and the interruption-pending /
 * interrupt-shadow flags.
 */
static void whpx_vcpu_post_run(CPUState *cpu)
{
    AccelCPUState *vcpu = cpu->accel;
    X86CPU *x86_cpu = X86_CPU(cpu);
    uint64_t exit_cr8 = vcpu->exit_ctx.VpContext.Cr8;

    x86_cpu->env.eflags = vcpu->exit_ctx.VpContext.Rflags;

    /* Propagate a guest CR8 change to the APIC model. */
    if (exit_cr8 != vcpu->tpr) {
        vcpu->tpr = exit_cr8;
        bql_lock();
        cpu_set_apic_tpr(x86_cpu->apic_state, whpx_cr8_to_apic_tpr(vcpu->tpr));
        bql_unlock();
    }

    vcpu->interruption_pending =
        vcpu->exit_ctx.VpContext.ExecutionState.InterruptionPending;

    /* Interrupts cannot be injected while in an interrupt shadow. */
    vcpu->interruptable =
        !vcpu->exit_ctx.VpContext.ExecutionState.InterruptShadow;
}


/*
 * Processes the asynchronous interrupt requests (INIT, POLL, SIPI, TPR
 * access report) that have to be handled before entering the vCPU, and
 * clears cpu->halted when a wake-up event is pending.
 */
static void whpx_vcpu_process_async_events(CPUState *cpu)
{
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    AccelCPUState *vcpu = cpu->accel;

    /* INIT is not processed while the SMM flag is set in hflags. */
    if (cpu_test_interrupt(cpu, CPU_INTERRUPT_INIT) &&
        !(env->hflags & HF_SMM_MASK)) {
        whpx_cpu_synchronize_state(cpu);
        do_cpu_init(x86_cpu);
        vcpu->interruptable = true;
    }

    if (cpu_test_interrupt(cpu, CPU_INTERRUPT_POLL)) {
        cpu_reset_interrupt(cpu, CPU_INTERRUPT_POLL);
        apic_poll_irq(x86_cpu->apic_state);
    }

    /* A deliverable hard interrupt or an NMI takes the CPU out of halt. */
    if ((cpu_test_interrupt(cpu, CPU_INTERRUPT_HARD) &&
         (env->eflags & IF_MASK)) ||
        cpu_test_interrupt(cpu, CPU_INTERRUPT_NMI)) {
        cpu->halted = false;
    }

    if (cpu_test_interrupt(cpu, CPU_INTERRUPT_SIPI)) {
        cpu_reset_interrupt(cpu, CPU_INTERRUPT_SIPI);
        whpx_cpu_synchronize_state(cpu);
        do_cpu_sipi(x86_cpu);
    }

    if (cpu_test_interrupt(cpu, CPU_INTERRUPT_TPR)) {
        cpu_reset_interrupt(cpu, CPU_INTERRUPT_TPR);
        /* env->eip must be current before it is reported below. */
        whpx_cpu_synchronize_state(cpu);
        apic_handle_tpr_access_report(x86_cpu->apic_state, env->eip,
                                      env->tpr_access_type);
    }
}

/*
 * If the x86 state carries an exception to inject (env->exception_injected),
 * consumes it and queues it on the vCPU as a pending exception event.
 * For page faults the faulting address (CR2) is passed along as well.
 */
static void whpx_inject_exceptions(CPUState* cpu)
{
    CPUX86State *env = &X86_CPU(cpu)->env;
    WHV_REGISTER_VALUE pending_event = {};

    if (!env->exception_injected) {
        return;
    }
    env->exception_injected = 0;

    pending_event.ExceptionEvent.EventPending = 1;
    pending_event.ExceptionEvent.EventType = WHvX64PendingEventException;
    pending_event.ExceptionEvent.DeliverErrorCode = 1;
    pending_event.ExceptionEvent.Vector = env->exception_nr;
    pending_event.ExceptionEvent.ErrorCode = env->error_code;
    if (env->exception_nr == EXCP0E_PAGE) {
        pending_event.ExceptionEvent.ExceptionParameter = env->cr[2];
    }

    whpx_set_reg(cpu, WHvRegisterPendingEvent, pending_event);
}

/*
 * Runs the vCPU until an exit has to be handled outside of this loop.
 *
 * Must be called with the BQL held; the BQL is dropped while the vCPU runs
 * and is held again on every return path.
 *
 * If the next instruction is covered by an active breakpoint, the
 * breakpoint is temporarily removed and the instruction is single-stepped
 * inside an exclusive section (other CPUs suspended).
 *
 * whpx->running_cpus is incremented on entry and decremented on every
 * return path, so whpx_first_vcpu_starting()/whpx_last_vcpu_stopping()
 * stay paired.
 *
 * Returns 0 on success and non-zero on a fatal error.
 */
int whpx_vcpu_run(CPUState *cpu)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = cpu->accel;
    struct whpx_breakpoint *stepped_over_bp = NULL;
    WhpxStepMode exclusive_step_mode = WHPX_STEP_NONE;
    /* Must start at 0: not every exit handler below assigns it. */
    int ret = 0;

    g_assert(bql_locked());

    if (whpx->running_cpus++ == 0) {
        /* Insert breakpoints into memory, update exception exit bitmap. */
        ret = whpx_first_vcpu_starting(cpu);
        if (ret != 0) {
            /* Starting failed: do not account this vCPU as running. */
            whpx->running_cpus--;
            return ret;
        }
    }

    if (whpx->breakpoints.breakpoints &&
        whpx->breakpoints.breakpoints->used > 0)
    {
        uint64_t pc = whpx_vcpu_get_pc(cpu, true);
        stepped_over_bp = whpx_lookup_breakpoint_by_addr(pc);
        if (stepped_over_bp && stepped_over_bp->state != WHPX_BP_SET) {
            stepped_over_bp = NULL;
        }

        if (stepped_over_bp) {
            /*
             * We are trying to run the instruction overwritten by an active
             * breakpoint. We will temporarily disable the breakpoint, suspend
             * other CPUs, and step over the instruction.
             */
            exclusive_step_mode = WHPX_STEP_EXCLUSIVE;
        }
    }

    if (exclusive_step_mode == WHPX_STEP_NONE) {
        whpx_vcpu_process_async_events(cpu);
        if (cpu->halted && !whpx_irqchip_in_kernel()) {
            cpu->exception_index = EXCP_HLT;
            qatomic_set(&cpu->exit_request, false);
            /* The BQL is still held here; balance the counter and leave. */
            if (--whpx->running_cpus == 0) {
                whpx_last_vcpu_stopping(cpu);
            }
            return 0;
        }
    }

    bql_unlock();

    if (exclusive_step_mode != WHPX_STEP_NONE) {
        start_exclusive();
        g_assert(cpu == current_cpu);
        g_assert(!cpu->running);
        cpu->running = true;

        hr = whpx_set_exception_exit_bitmap(
            1ULL << WHvX64ExceptionTypeDebugTrapOrFault);
        if (!SUCCEEDED(hr)) {
            error_report("WHPX: Failed to update exception exit mask, "
                         "hr=%08lx.", hr);
            /*
             * Leave through the common teardown so that the exclusive
             * section is closed and the BQL is re-acquired. The breakpoint
             * has not been touched yet, so its restore step is skipped.
             */
            ret = -1;
            goto out_stop;
        }

        if (stepped_over_bp) {
            /* Temporarily disable the triggered breakpoint. */
            cpu_memory_rw_debug(cpu,
                stepped_over_bp->address,
                &stepped_over_bp->original_instruction,
                1,
                true);
        }
    } else {
        cpu_exec_start(cpu);
    }

    do {
        if (cpu->vcpu_dirty) {
            whpx_set_registers(cpu, WHPX_LEVEL_RUNTIME_STATE);
            cpu->vcpu_dirty = false;
        }

        if (exclusive_step_mode == WHPX_STEP_NONE) {
            whpx_vcpu_pre_run(cpu);

            /* Corresponding store-release is in cpu_exit. */
            if (qatomic_load_acquire(&cpu->exit_request)) {
                whpx_vcpu_kick(cpu);
            }
        }

        if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
            whpx_vcpu_configure_single_stepping(cpu, true, NULL);
        }

        whpx_inject_exceptions(cpu);

        hr = whp_dispatch.WHvRunVirtualProcessor(
            whpx->partition, cpu->cpu_index,
            &vcpu->exit_ctx, sizeof(vcpu->exit_ctx));

        if (FAILED(hr)) {
            error_report("WHPX: Failed to exec a virtual processor,"
                         " hr=%08lx", hr);
            ret = -1;
            break;
        }

        if (exclusive_step_mode != WHPX_STEP_NONE || cpu->singlestep_enabled) {
            whpx_vcpu_configure_single_stepping(cpu,
                false,
                &vcpu->exit_ctx.VpContext.Rflags);
        }

        whpx_vcpu_post_run(cpu);

        switch (vcpu->exit_ctx.ExitReason) {
        case WHvRunVpExitReasonMemoryAccess:
            ret = whpx_handle_mmio(cpu, &vcpu->exit_ctx);
            break;

        case WHvRunVpExitReasonX64IoPortAccess:
            ret = whpx_handle_portio(cpu, &vcpu->exit_ctx);
            break;

        case WHvRunVpExitReasonX64InterruptWindow:
            vcpu->ready_for_pic_interrupt = 1;
            vcpu->window_registered = 0;
            ret = 0;
            break;

        case WHvRunVpExitReasonX64ApicEoi:
            assert(whpx_irqchip_in_kernel());
            ioapic_eoi_broadcast(vcpu->exit_ctx.ApicEoi.InterruptVector);
            ret = 0;
            break;

        case WHvRunVpExitReasonX64Halt:
            /*
             * Used for kernel-irqchip=off
             */
            ret = whpx_handle_halt(cpu);
            break;

        case WHvRunVpExitReasonCanceled:
            if (exclusive_step_mode != WHPX_STEP_NONE) {
                /*
                 * We are trying to step over a single instruction, and
                 * likely got a request to stop from another thread.
                 * Delay it until we are done stepping
                 * over.
                 */
                ret = 0;
            } else {
                cpu->exception_index = EXCP_INTERRUPT;
                ret = 1;
            }
            break;
        case WHvRunVpExitReasonX64MsrAccess: {
            WHV_REGISTER_VALUE reg_values[3] = {0};
            WHV_REGISTER_NAME reg_names[3];
            UINT32 reg_count;
            bool is_known_msr = 0;

            reg_names[0] = WHvX64RegisterRip;
            reg_names[1] = WHvX64RegisterRax;
            reg_names[2] = WHvX64RegisterRdx;

            /* Skip the MSR instruction. */
            reg_values[0].Reg64 =
                vcpu->exit_ctx.VpContext.Rip +
                vcpu->exit_ctx.VpContext.InstructionLength;

            if (vcpu->exit_ctx.MsrAccess.MsrNumber == HV_X64_MSR_APIC_FREQUENCY
                && !vcpu->exit_ctx.MsrAccess.AccessInfo.IsWrite
                && !whpx_irqchip_in_kernel()) {
                is_known_msr = 1;
                reg_values[1].Reg32 = (uint32_t)X86_CPU(cpu)->env.apic_bus_freq;
            }
            /*
             * For all unsupported MSR access we:
             *     ignore writes
             *     return 0 on read.
             */
            reg_count = vcpu->exit_ctx.MsrAccess.AccessInfo.IsWrite ?
                        1 : 3;

            if (!is_known_msr) {
                trace_whpx_unsupported_msr_access(vcpu->exit_ctx.MsrAccess.MsrNumber,
                    vcpu->exit_ctx.MsrAccess.AccessInfo.IsWrite);
            }

            hr = whp_dispatch.WHvSetVirtualProcessorRegisters(
                whpx->partition,
                cpu->cpu_index,
                reg_names, reg_count,
                reg_values);

            if (FAILED(hr)) {
                error_report("WHPX: Failed to set MsrAccess state "
                             " registers, hr=%08lx", hr);
            }
            ret = 0;
            break;
        }
        case WHvRunVpExitReasonException:
            whpx_get_registers(cpu, WHPX_LEVEL_FULL_STATE);

            if ((vcpu->exit_ctx.VpException.ExceptionType ==
                 WHvX64ExceptionTypeDebugTrapOrFault) &&
                (vcpu->exit_ctx.VpException.InstructionByteCount >= 1) &&
                (vcpu->exit_ctx.VpException.InstructionBytes[0] ==
                 whpx_breakpoint_instruction)) {
                /* Stopped at a software breakpoint. */
                cpu->exception_index = EXCP_DEBUG;
            } else if ((vcpu->exit_ctx.VpException.ExceptionType ==
                        WHvX64ExceptionTypeDebugTrapOrFault) &&
                       !cpu->singlestep_enabled) {
                /*
                 * Just finished stepping over a breakpoint, but the
                 * gdb does not expect us to do single-stepping.
                 * Don't do anything special.
                 */
                cpu->exception_index = EXCP_INTERRUPT;
            } else {
                /* Another exception or debug event. Report it to GDB. */
                cpu->exception_index = EXCP_DEBUG;
            }

            ret = 1;
            break;
        case WHvRunVpExitReasonNone:
        case WHvRunVpExitReasonUnrecoverableException:
        case WHvRunVpExitReasonInvalidVpRegisterValue:
        case WHvRunVpExitReasonUnsupportedFeature:
        default:
            error_report("WHPX: Unexpected VP exit code %d",
                         vcpu->exit_ctx.ExitReason);
            whpx_get_registers(cpu, WHPX_LEVEL_FULL_STATE);
            bql_lock();
            qemu_system_guest_panicked(cpu_get_crash_info(cpu));
            bql_unlock();
            /* ret is left at 0 here, so the loop re-enters the vCPU. */
            break;
        }

    } while (!ret);

    if (stepped_over_bp) {
        /* Restore the breakpoint we stepped over */
        cpu_memory_rw_debug(cpu,
            stepped_over_bp->address,
            (void *)&whpx_breakpoint_instruction,
            1,
            true);
    }

out_stop:
    if (exclusive_step_mode != WHPX_STEP_NONE) {
        g_assert(cpu_in_exclusive_context(cpu));
        cpu->running = false;
        end_exclusive();

        exclusive_step_mode = WHPX_STEP_NONE;
    } else {
        cpu_exec_end(cpu);
    }

    bql_lock();
    current_cpu = cpu;

    if (--whpx->running_cpus == 0) {
        whpx_last_vcpu_stopping(cpu);
    }

    return ret < 0;
}

/*
 * Vcpu support.
 */

/*
 * Migration blocker for WHPX. It starts out NULL; whpx_init_vcpu() fills it
 * in with error_setg() and hands it to migrate_add_blocker() the first time
 * it is called while this pointer is still NULL.
 */
static Error *whpx_migration_blocker;

/*
 * VM change-state handler registered per vCPU by whpx_init_vcpu().
 *
 * @opaque:  the CPUX86State of the vCPU the handler was registered for.
 * @running: true when the VM is transitioning to the running state.
 * @state:   the new run state (unused).
 *
 * Whenever the VM starts running, the cached TSC value is marked as no
 * longer valid. Nothing is done on a transition to a stopped state.
 */
static void whpx_cpu_update_state(void *opaque, bool running, RunState state)
{
    CPUX86State *x86_env = opaque;

    if (!running) {
        return;
    }

    x86_env->tsc_valid = false;
}

/*
 * Create the WHPX virtual processor backing @cpu and set up its
 * accelerator state.
 *
 * The first call also registers the WHPX migration blocker. The TSC and
 * APIC bus frequencies are queried from the hypervisor on a best-effort
 * basis: a failed query is reported as a warning and the existing/default
 * value is kept.
 *
 * Returns 0 on success, or -EINVAL if the migration blocker could not be
 * added or the virtual processor could not be created.
 */
int whpx_init_vcpu(CPUState *cpu)
{
    HRESULT hr;
    struct whpx_state *whpx = &whpx_global;
    AccelCPUState *vcpu = NULL;
    Error *local_error = NULL;
    X86CPU *x86_cpu = X86_CPU(cpu);
    CPUX86State *env = &x86_cpu->env;
    UINT64 freq = 0;
    int ret;

    /*
     * Add migration blockers for all unsupported features of the
     * Windows Hypervisor Platform
     */
    if (whpx_migration_blocker == NULL) {
        error_setg(&whpx_migration_blocker,
               "State blocked due to non-migratable CPUID feature support, "
               "dirty memory tracking support, and XSAVE/XRSTOR support");

        if (migrate_add_blocker(&whpx_migration_blocker, &local_error) < 0) {
            error_report_err(local_error);
            ret = -EINVAL;
            goto error;
        }
    }

    vcpu = g_new0(AccelCPUState, 1);

    hr = whp_dispatch.WHvCreateVirtualProcessor(
        whpx->partition, cpu->cpu_index, 0);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to create a virtual processor,"
                     " hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    /*
     * vcpu's TSC frequency is either specified by user, or use the value
     * provided by Hyper-V if the former is not present. In the latter case, we
     * query it from Hyper-V and record in env->tsc_khz, so that vcpu's TSC
     * frequency can be migrated later via this field.
     */
    if (!env->tsc_khz) {
        hr = whp_dispatch.WHvGetCapability(
            WHvCapabilityCodeProcessorClockFrequency, &freq, sizeof(freq),
                NULL);
        /* An unknown capability is silently ignored; other errors warn. */
        if (hr != WHV_E_UNKNOWN_CAPABILITY) {
            if (FAILED(hr)) {
                warn_report("WHPX: Failed to query tsc frequency, hr=0x%08lx",
                            hr);
            } else {
                env->tsc_khz = freq / 1000; /* Hz to KHz */
            }
        }
    }

    /* Start from the default and override it if the host reports a value. */
    env->apic_bus_freq = HYPERV_APIC_BUS_FREQUENCY;
    hr = whp_dispatch.WHvGetCapability(
        WHvCapabilityCodeInterruptClockFrequency, &freq, sizeof(freq), NULL);
    if (hr != WHV_E_UNKNOWN_CAPABILITY) {
        if (FAILED(hr)) {
            warn_report("WHPX: Failed to query apic bus frequency hr=0x%08lx",
                        hr);
        } else {
            env->apic_bus_freq = freq;
        }
    }

    /* When not using the Hyper-V APIC, the frequency is 1 GHz */
    if (!whpx_irqchip_in_kernel()) {
        env->apic_bus_freq = 1000000000;
    }

    vcpu->interruptable = true;
    cpu->vcpu_dirty = true;
    cpu->accel = vcpu;
    max_vcpu_index = max(max_vcpu_index, cpu->cpu_index);
    qemu_add_vm_change_state_handler(whpx_cpu_update_state, env);

    env->emu_mmio_buf = g_new(char, 4096);

    return 0;

error:
    /* vcpu may still be NULL here; g_free() accepts that. */
    g_free(vcpu);

    return ret;
}

/*
 * Per-CPU instance initialisation hook for WHPX: forwards the x86 view of
 * @cs to host_cpu_instance_init().
 */
void whpx_cpu_instance_init(CPUState *cs)
{
    host_cpu_instance_init(X86_CPU(cs));
}

/*
 * Partition support
 */

/*
 * Initialise the WHPX accelerator: create the partition held in the global
 * whpx state, configure it, and set it up.
 *
 * The steps, in order, are: load the WHP dispatch table, check that a
 * hypervisor is present, query capabilities, create the partition, query
 * XSAVE support, set the processor count, optionally enable the in-kernel
 * local APIC, nested virtualization, processor/perfmon/synthetic feature
 * banks, MSR exits and the exception exit bitmap, and finally call
 * WHvSetupPartition().
 *
 * @as: accelerator state (unused here).
 * @ms: machine state; supplies the RAM size and the vCPU count.
 *
 * Returns 0 on success or a negative errno value on failure. On failure a
 * partition that was already created is deleted again.
 */
int whpx_accel_init(AccelState *as, MachineState *ms)
{
    struct whpx_state *whpx;
    int ret;
    HRESULT hr;
    WHV_CAPABILITY whpx_cap;
    UINT32 whpx_cap_size;
    WHV_PARTITION_PROPERTY prop;
    WHV_CAPABILITY_FEATURES features = {0};
    WHV_PROCESSOR_FEATURES_BANKS processor_features;
    WHV_PROCESSOR_PERFMON_FEATURES perfmon_features;
    bool is_legacy_os = false;

    whpx = &whpx_global;

    if (!init_whp_dispatch()) {
        ret = -ENOSYS;
        goto error;
    }

    whpx->mem_quota = ms->ram_size;

    hr = whp_dispatch.WHvGetCapability(
        WHvCapabilityCodeHypervisorPresent, &whpx_cap,
        sizeof(whpx_cap), &whpx_cap_size);
    if (FAILED(hr) || !whpx_cap.HypervisorPresent) {
        error_report("WHPX: No accelerator found, hr=%08lx", hr);
        ret = -ENOSPC;
        goto error;
    }

    hr = whp_dispatch.WHvGetCapability(
        WHvCapabilityCodeFeatures, &features, sizeof(features), NULL);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to query capabilities, hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    hr = whp_dispatch.WHvCreatePartition(&whpx->partition);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to create partition, hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    /*
     * Query the XSAVE capability of the partition. Any error here is not
     * considered fatal.
     */
    hr = whp_dispatch.WHvGetPartitionProperty(
        whpx->partition,
        WHvPartitionPropertyCodeProcessorXsaveFeatures,
        &whpx_xsave_cap,
        sizeof(whpx_xsave_cap),
        &whpx_cap_size);

    /*
     * Windows versions which don't support this property will return with
     * the specific error code.
     */
    if (FAILED(hr) && hr != WHV_E_UNKNOWN_PROPERTY) {
        error_report("WHPX: Failed to query XSAVE capability, hr=%08lx", hr);
    }

    if (!whpx_has_xsave()) {
        warn_report("WHPX: Partition is not XSAVE capable");
    }

    memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
    prop.ProcessorCount = ms->smp.cpus;
    hr = whp_dispatch.WHvSetPartitionProperty(
        whpx->partition,
        WHvPartitionPropertyCodeProcessorCount,
        &prop,
        sizeof(WHV_PARTITION_PROPERTY));

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set partition processor count to %u,"
                     " hr=%08lx", prop.ProcessorCount, hr);
        ret = -EINVAL;
        goto error;
    }

    /*
     * Error out if WHP doesn't support apic emulation and user is requiring
     * it.
     */
    if (whpx->kernel_irqchip_required && (!features.LocalApicEmulation ||
            !whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2)) {
        error_report("WHPX: kernel irqchip requested, but unavailable. "
            "Try without kernel-irqchip or with kernel-irqchip=off");
        ret = -EINVAL;
        goto error;
    }

    if (whpx->kernel_irqchip_allowed && features.LocalApicEmulation &&
        whp_dispatch.WHvSetVirtualProcessorInterruptControllerState2) {
        WHV_X64_LOCAL_APIC_EMULATION_MODE mode =
            WHvX64LocalApicEmulationModeX2Apic;
        hr = whp_dispatch.WHvSetPartitionProperty(
            whpx->partition,
            WHvPartitionPropertyCodeLocalApicEmulationMode,
            &mode,
            sizeof(mode));
        if (FAILED(hr)) {
            error_report("WHPX: Failed to enable kernel irqchip hr=%08lx", hr);
            /* Only fatal when the user explicitly demanded the irqchip. */
            if (whpx->kernel_irqchip_required) {
                error_report("WHPX: kernel irqchip requested, but unavailable");
                ret = -EINVAL;
                goto error;
            }
        } else {
            whpx_irqchip_in_kernel = true;
        }
    }

    /* Set all the supported features, to follow the MSHV example */
    memset(&processor_features, 0, sizeof(WHV_PROCESSOR_FEATURES_BANKS));
    processor_features.BanksCount = 2;

    hr = whp_dispatch.WHvGetCapability(
        WHvCapabilityCodeProcessorFeaturesBanks, &processor_features,
        sizeof(WHV_PROCESSOR_FEATURES_BANKS), &whpx_cap_size);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to get processor features, hr=%08lx", hr);
        ret = -ENOSPC;
        goto error;
    }

    /*
     * Nested virtualization is only enabled together with the in-kernel
     * irqchip, and only when the host reports support for it.
     */
    if (whpx_irqchip_in_kernel() && processor_features.Bank1.NestedVirtSupport) {
        memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
        prop.NestedVirtualization = 1;
        hr = whp_dispatch.WHvSetPartitionProperty(
            whpx->partition,
            WHvPartitionPropertyCodeNestedVirtualization,
            &prop,
            sizeof(WHV_PARTITION_PROPERTY));
        if (FAILED(hr)) {
            error_report("WHPX: Failed to enable nested virtualization, "
                         "hr=%08lx", hr);
            ret = -EINVAL;
            goto error;
        }
    }

    hr = whp_dispatch.WHvSetPartitionProperty(
            whpx->partition,
            WHvPartitionPropertyCodeProcessorFeaturesBanks,
            &processor_features,
            sizeof(WHV_PROCESSOR_FEATURES_BANKS));
    if (FAILED(hr)) {
        error_report("WHPX: Failed to set processor features, hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    /* Enable supported performance monitoring capabilities */
    hr = whp_dispatch.WHvGetCapability(
        WHvCapabilityCodeProcessorPerfmonFeatures, &perfmon_features,
        sizeof(WHV_PROCESSOR_PERFMON_FEATURES), &whpx_cap_size);
    /*
     * Relying on this is a crutch to maintain Windows 10 support.
     *
     * WHvCapabilityCodeProcessorPerfmonFeatures and
     * WHvPartitionPropertyCodeSyntheticProcessorFeaturesBanks
     * are implemented starting from Windows Server 2022 (build 20348).
     */
    if (FAILED(hr)) {
        warn_report("WHPX: Failed to get performance "
                    "monitoring features, hr=%08lx", hr);
        is_legacy_os = true;
    } else {
        hr = whp_dispatch.WHvSetPartitionProperty(
                whpx->partition,
                WHvPartitionPropertyCodeProcessorPerfmonFeatures,
                &perfmon_features,
                sizeof(WHV_PROCESSOR_PERFMON_FEATURES));
        if (FAILED(hr)) {
            error_report("WHPX: Failed to set performance "
                         "monitoring features, hr=%08lx", hr);
            ret = -EINVAL;
            goto error;
        }
    }

    /* Enable synthetic processor features */
    WHV_SYNTHETIC_PROCESSOR_FEATURES_BANKS synthetic_features;
    memset(&synthetic_features, 0, sizeof(WHV_SYNTHETIC_PROCESSOR_FEATURES_BANKS));
    synthetic_features.BanksCount = 1;

    synthetic_features.Bank0.HypervisorPresent = 1;
    synthetic_features.Bank0.Hv1 = 1;
    synthetic_features.Bank0.AccessVpRunTimeReg = 1;
    synthetic_features.Bank0.AccessPartitionReferenceCounter = 1;
    synthetic_features.Bank0.AccessPartitionReferenceTsc = 1;
    synthetic_features.Bank0.AccessHypercallRegs = 1;
    synthetic_features.Bank0.AccessFrequencyRegs = 1;
    synthetic_features.Bank0.EnableExtendedGvaRangesForFlushVirtualAddressList = 1;
    synthetic_features.Bank0.AccessVpIndex = 1;
    synthetic_features.Bank0.TbFlushHypercalls = 1;

    /* These features are only requested with the in-kernel irqchip. */
    if (whpx_irqchip_in_kernel()) {
        synthetic_features.Bank0.AccessSynicRegs = 1;
        synthetic_features.Bank0.AccessSyntheticTimerRegs = 1;
        synthetic_features.Bank0.AccessIntrCtrlRegs = 1;
        synthetic_features.Bank0.SyntheticClusterIpi = 1;
        synthetic_features.Bank0.DirectSyntheticTimers = 1;
    }

    if (!is_legacy_os && whpx->hyperv_enlightenments_allowed) {
        hr = whp_dispatch.WHvSetPartitionProperty(
                whpx->partition,
                WHvPartitionPropertyCodeSyntheticProcessorFeaturesBanks,
                &synthetic_features,
                sizeof(WHV_SYNTHETIC_PROCESSOR_FEATURES_BANKS));
        if (FAILED(hr)) {
            error_report("WHPX: Failed to set synthetic features, hr=%08lx", hr);
            ret = -EINVAL;
            goto error;
        }
    } else if (is_legacy_os && whpx->hyperv_enlightenments_required) {
        error_report("Hyper-V enlightenments not available on legacy Windows");
        ret = -EINVAL;
        goto error;
    }

    /* Register for MSR exits (only X64MsrExit is requested here) */
    memset(&prop, 0, sizeof(WHV_PARTITION_PROPERTY));
    prop.ExtendedVmExits.X64MsrExit = 1;

    hr = whp_dispatch.WHvSetPartitionProperty(
            whpx->partition,
            WHvPartitionPropertyCodeExtendedVmExits,
            &prop,
            sizeof(WHV_PARTITION_PROPERTY));
    if (FAILED(hr)) {
        error_report("WHPX: Failed to enable extended VM exits, hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    /*
     * We do not want to intercept any exceptions from the guest,
     * until we actually start debugging with gdb.
     */
    whpx->exception_exit_bitmap = -1;
    hr = whpx_set_exception_exit_bitmap(0);

    if (FAILED(hr)) {
        error_report("WHPX: Failed to set exception exit bitmap, hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    hr = whp_dispatch.WHvSetupPartition(whpx->partition);
    if (FAILED(hr)) {
        error_report("WHPX: Failed to setup partition, hr=%08lx", hr);
        ret = -EINVAL;
        goto error;
    }

    whpx_memory_init();
    whpx_init_emu();

    return 0;

error:

    /* Tear down the partition if it was created before the failure. */
    if (whpx->partition != NULL) {
        whp_dispatch.WHvDeletePartition(whpx->partition);
        whpx->partition = NULL;
    }

    return ret;
}