diff --git a/CMakeLists.txt b/CMakeLists.txt
index 57e3b178f..29fedd6ad 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -185,6 +185,7 @@ cmake_dependent_option(SIO_DETECT "Super I/O Detection Helper"
 cmake_dependent_option(WACOM "Wacom Input Devices" ON "DEV_BRANCH" OFF)
 cmake_dependent_option(XL24 "ATI VGA Wonder XL24 (ATI-28800-6)" ON "DEV_BRANCH" OFF)
 cmake_dependent_option(NETSWITCH "Network Switch Support" ON "DEV_BRANCH" OFF)
+cmake_dependent_option(VFIO "Virtual Function I/O" ON "DEV_BRANCH" OFF)
 
 # Ditto but for Qt
 if(QT)
diff --git a/src/86box.c b/src/86box.c
index b90591267..95f714747 100644
--- a/src/86box.c
+++ b/src/86box.c
@@ -107,6 +107,7 @@
 #include <86box/apm.h>
 #include <86box/acpi.h>
 #include <86box/nv/vid_nv_rivatimer.h>
+#include <86box/vfio.h>
 
 // Disable c99-designator to avoid the warnings about int ng
 #ifdef __clang__
@@ -1646,6 +1647,11 @@ pc_reset_hard_init(void)
        the chances of the SCSI controller ending up on the bridge. */
     video_voodoo_init();
 
+#if defined(USE_VFIO) && defined(__linux__)
+    /* Initialize VFIO */
+    vfio_init();
+#endif
+
     /* installs first game port if no device provides one, must be late */
     if (joystick_type[0])
         gameport_update_joystick_type(0);
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 166320061..7c389c368 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -103,6 +103,16 @@ if(VNC)
     endif()
 endif()
 
+if(CMAKE_SYSTEM_NAME MATCHES "Linux")
+    if(VFIO)
+        include(CheckIncludeFile)
+        check_include_file("linux/vfio.h" VFIO_AVAILABLE)
+        if(VFIO_AVAILABLE)
+            add_compile_definitions(USE_VFIO)
+        endif()
+    endif()
+endif()
+
 if(INSTRUMENT)
     add_compile_definitions(USE_INSTRUMENT)
 endif()
diff --git a/src/device/CMakeLists.txt b/src/device/CMakeLists.txt
index 12835e909..dcad64149 100644
--- a/src/device/CMakeLists.txt
+++ b/src/device/CMakeLists.txt
@@ -81,6 +81,17 @@ if(ISAMEM_BRAT)
     target_compile_definitions(dev PRIVATE USE_ISAMEM_BRAT)
 endif()
 
+if(CMAKE_SYSTEM_NAME MATCHES "Linux")
+    if(VFIO)
+        include(CheckIncludeFile)
+        check_include_file("linux/vfio.h" VFIO_AVAILABLE)
+        if(VFIO_AVAILABLE)
+            add_compile_definitions(USE_VFIO)
+            target_sources(dev PRIVATE vfio.c)
+        endif()
+    endif()
+endif()
+
 if(LASERXT)
     target_compile_definitions(dev PRIVATE USE_LASERXT)
 endif()
diff --git a/src/device/vfio.c b/src/device/vfio.c
new file mode 100644
index 000000000..b5fc077b8
--- /dev/null
+++ b/src/device/vfio.c
@@ -0,0 +1,3377 @@
+/*
+ * 86Box    A hypervisor and IBM PC system emulator that specializes in
+ *          running old operating systems and software designed for IBM
+ *          PC systems and compatibles from 1981 through fairly recent
+ *          system designs based on the PCI bus.
+ *
+ *          This file is part of the 86Box distribution.
+ *
+ *          Virtual Function I/O PCI passthrough handler.
+ *
+ * Authors: RichardG,
+ *
+ *          Copyright 2021-2025 RichardG.
+ */ +#define _FILE_OFFSET_BITS 64 +#define _LARGEFILE64_SOURCE 1 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#define HAVE_STDARG_H +#include "cpu.h" +#include <86box/86box.h> +#include <86box/ini.h> +#include <86box/config.h> +#include <86box/device.h> +#include <86box/i2c.h> /* log2i */ +#include <86box/io.h> +#include <86box/mem.h> +#include <86box/path.h> +#include <86box/pci.h> +#include <86box/plat.h> +#include <86box/thread.h> +#include <86box/timer.h> +#include <86box/video.h> + +/* Just so we don't have to include Linux's pci.h, which + has some defines that conflict with our own pci.h */ +#define PCI_SLOT(devfn) (((devfn) >> 3) & 0x1f) +#define PCI_FUNC(devfn) ((devfn) & 0x07) + +enum { + NVIDIA_3D0_NONE = 0, + NVIDIA_3D0_SELECT, + NVIDIA_3D0_WINDOW, + NVIDIA_3D0_READ, + NVIDIA_3D0_WRITE +}; + +typedef struct { + int fd; + uint64_t precalc_offset; + uint64_t offset; + uint64_t size; + uint32_t emulated_offset; + uint8_t *mmap_base; + uint8_t *mmap_precalc; + uint8_t type; + uint8_t bar_id; + uint8_t read : 1; + uint8_t write : 1; + mem_mapping_t mem_mapping; + char name[20]; + struct _vfio_device_ *dev; + + struct { + mem_mapping_t mem_mappings[2]; + + struct { + uint32_t offset; + } iomirror; + + struct { + uint32_t offset; + } configmirror; + + struct { + struct { + uint32_t start; + uint32_t end; + } offset[2]; + uint32_t index; + } configwindow; + } quirks; +} vfio_region_t; + +typedef struct { + struct _vfio_device_ *dev; + int fd; + int type; + int vector; + uint16_t msix_offset; +} vfio_irq_t; + +typedef struct _vfio_device_ { + int fd; + uint8_t mem_enabled : 1; + uint8_t io_enabled : 1; + uint8_t rom_enabled : 1; + uint8_t can_reset : 1; + uint8_t can_flr_reset : 1; + uint8_t can_pm_reset : 1; + uint8_t can_hot_reset : 1; + uint8_t slot; + uint8_t bar_count; + uint8_t pm_cap; + uint8_t msi_cap; + uint8_t msix_cap; + uint8_t pcie_cap; + uint8_t af_cap; + char *name; + char *rom_fn; + + vfio_region_t bars[6]; + vfio_region_t rom; + vfio_region_t config; + vfio_region_t vga_io_lo; + vfio_region_t vga_io_hi; + vfio_region_t vga_mem; + + struct { + uint8_t type; + int vector_count; + vfio_irq_t *vectors; + + struct { + int raised; + uint8_t pin; + uint8_t state; + } intx; + struct { + uint32_t address; + uint32_t address_upper; + uint32_t pending; + uint32_t mask; + uint16_t ctl; + uint16_t data; + uint16_t vector_enable_mask; + uint8_t vector_count; + uint8_t vector_enable_count; + } msi; + struct { + mem_mapping_t table_mapping; + mem_mapping_t pba_mapping; + uint32_t table_offset; + uint32_t pba_offset; + uint32_t table_offset_precalc; + uint32_t pba_offset_precalc; + uint16_t ctl; + uint16_t vector_count; + uint16_t table_size; + uint16_t pba_size; + uint8_t table_bar; + uint8_t pba_bar; + uint8_t *table; + uint8_t *pba; + } msix; + } irq; + + struct { + union { + struct { + vfio_region_t *bar; + } ati3c3; + + struct { + uint64_t master_enable; + uint8_t bar_enable; + } nvidiabar5; + + struct { + uint32_t index; + uint8_t state; + } nvidia3d0; + }; + } quirks; + + struct _vfio_device_ *next; +} vfio_device_t; + +typedef struct _vfio_group_ { + int id; + int fd; + + vfio_device_t *first_device; + vfio_device_t *current_device; + + struct _vfio_group_ *next; +} vfio_group_t; + +static video_timings_t timing_default = { VIDEO_PCI, 8, 16, 32, 8, 16, 32 }; +static int container_fd = -1; +static int epoll_fd = -1; +static int irq_thread_wake_fd = -1; +static int closing = 0; +static 
int intx_high = 0; +static int timing_readb = 0; +static int timing_readw = 0; +static int timing_readl = 0; +static int timing_writeb = 0; +static int timing_writew = 0; +static int timing_writel = 0; +static vfio_group_t *first_group = NULL; +static vfio_group_t *current_group; +static thread_t *irq_thread; +static event_t *irq_event; +static event_t *irq_thread_resume; +static pc_timer_t irq_timer; +static vfio_irq_t *current_irq = NULL; +static const device_t vfio_device; + +#define ENABLE_VFIO_LOG 2 +#ifdef ENABLE_VFIO_LOG +int vfio_do_log = ENABLE_VFIO_LOG; + +static void +vfio_log(const char *fmt, ...) +{ + va_list ap; + + if (vfio_do_log) { + va_start(ap, fmt); + pclog_ex(fmt, ap); + va_end(ap); + } +} + +# if ENABLE_VFIO_LOG == 2 +# define vfio_log_op vfio_log +# else +# define vfio_log_op(fmt, ...) +# endif +#else +# define vfio_log(fmt, ...) +# define vfio_log_op(fmt, ...) +#endif + +static uint8_t vfio_bar_gettype(vfio_device_t *dev, vfio_region_t *bar); +static uint8_t vfio_config_readb(int func, int addr, void *priv); +static uint16_t vfio_config_readw(int func, int addr, void *priv); +static uint32_t vfio_config_readl(int func, int addr, void *priv); +static void vfio_config_writeb(int func, int addr, uint8_t val, void *priv); +static void vfio_config_writew(int func, int addr, uint16_t val, void *priv); +static void vfio_config_writel(int func, int addr, uint32_t val, void *priv); +static void vfio_irq_intx_setpin(vfio_device_t *dev); +static void vfio_irq_msi_disable(vfio_device_t *dev); +static void vfio_irq_msix_disable(vfio_device_t *dev); +static void vfio_irq_msix_updatemask(vfio_device_t *dev, uint16_t offset); +static void vfio_irq_enable(vfio_device_t *dev, int type); + +#define VFIO_RW(space, length_char, addr_type, addr_slength, val_type, val_slength) \ + static val_type \ + vfio_##space##_read##length_char##_fd(addr_type addr, void *priv) \ + { \ + register vfio_region_t *region = (vfio_region_t *) priv; \ + val_type ret; \ + if (pread(region->fd, &ret, sizeof(ret), region->precalc_offset + addr) != sizeof(ret)) \ + ret = -1; \ + vfio_log_op("[%04X:%08X] VFIO: " #space "_read" #length_char "_fd(%0" #addr_slength "X) = %0" #val_slength "X\n", CS, cpu_state.pc, addr, ret); \ + cycles -= timing_read##length_char; \ + intx_high = 0; \ + return ret; \ + } \ + \ + static void \ + vfio_##space##_write##length_char##_fd(addr_type addr, val_type val, void *priv) \ + { \ + register vfio_region_t *region = (vfio_region_t *) priv; \ + vfio_log_op("[%04X:%08X] VFIO: " #space "_write" #length_char "_fd(%0" #addr_slength "X, %0" #val_slength "X)\n", CS, cpu_state.pc, addr, val); \ + (void) !pwrite(region->fd, &val, sizeof(val), region->precalc_offset + addr); \ + cycles -= timing_write##length_char; \ + intx_high = 0; \ + } \ + \ + static val_type \ + vfio_##space##_read##length_char##_mm(addr_type addr, void *priv) \ + { \ + register val_type ret = *((val_type *) &((uint8_t *) priv)[addr]); \ + vfio_log_op("[%04X:%08X] VFIO: " #space "_read" #length_char "_mm(%0" #addr_slength "X) = %0" #val_slength "X\n", CS, cpu_state.pc, addr, ret); \ + cycles -= timing_read##length_char; \ + intx_high = 0; \ + return ret; \ + } \ + \ + static void \ + vfio_##space##_write##length_char##_mm(addr_type addr, val_type val, void *priv) \ + { \ + vfio_log_op("[%04X:%08X] VFIO: " #space "_write" #length_char "_mm(%0" #addr_slength "X, %0" #val_slength "X)\n", CS, cpu_state.pc, addr, val); \ + *((val_type *) &((uint8_t *) priv)[addr]) = val; \ + cycles -= timing_write##length_char; \ + intx_high 
= 0; \ + } + +VFIO_RW(mem, b, uint32_t, 8, uint8_t, 2) +VFIO_RW(mem, w, uint32_t, 8, uint16_t, 4) +VFIO_RW(mem, l, uint32_t, 8, uint32_t, 8) +VFIO_RW(io, b, uint16_t, 4, uint8_t, 2) +VFIO_RW(io, w, uint16_t, 4, uint16_t, 4) +VFIO_RW(io, l, uint16_t, 4, uint32_t, 8) + +static void +vfio_quirk_capture_io(vfio_device_t *dev, vfio_region_t *bar, + uint16_t base, uint16_t size, uint8_t enable, + uint8_t (*inb)(uint16_t addr, void *priv), + uint16_t (*inw)(uint16_t addr, void *priv), + uint32_t (*inl)(uint16_t addr, void *priv), + void (*outb)(uint16_t addr, uint8_t val, void *priv), + void (*outw)(uint16_t addr, uint16_t val, void *priv), + void (*outl)(uint16_t addr, uint32_t val, void *priv)) +{ + /* Remove quirk handler from port range. */ + io_removehandler(base, size, + bar->read ? inb : NULL, + bar->read ? inw : NULL, + bar->read ? inl : NULL, + bar->write ? outb : NULL, + bar->write ? outw : NULL, + bar->write ? outl : NULL, + dev ? ((void *) dev) : ((void *) bar)); + + if (enable) { + /* Remove existing handler from port range. */ + if (bar->mmap_base) /* mmap available */ + io_removehandler(base, size, + bar->read ? vfio_io_readb_mm : NULL, + bar->read ? vfio_io_readw_mm : NULL, + bar->read ? vfio_io_readl_mm : NULL, + bar->write ? vfio_io_writeb_mm : NULL, + bar->write ? vfio_io_writew_mm : NULL, + bar->write ? vfio_io_writel_mm : NULL, + bar->mmap_precalc); + else /* mmap not available */ + io_removehandler(base, size, + bar->read ? vfio_io_readb_fd : NULL, + bar->read ? vfio_io_readw_fd : NULL, + bar->read ? vfio_io_readl_fd : NULL, + bar->write ? vfio_io_writeb_fd : NULL, + bar->write ? vfio_io_writew_fd : NULL, + bar->write ? vfio_io_writel_fd : NULL, + bar); + + /* Add quirk handler to port range. */ + io_sethandler(base, size, + bar->read ? inb : NULL, + bar->read ? inw : NULL, + bar->read ? inl : NULL, + bar->write ? outb : NULL, + bar->write ? outw : NULL, + bar->write ? outl : NULL, + dev ? ((void *) dev) : ((void *) bar)); + } +} + +static uint8_t +vfio_quirk_configmirror_readb(uint32_t addr, void *priv) +{ + vfio_region_t *bar = (vfio_region_t *) priv; + vfio_device_t *dev = bar->dev; + + /* Cascade to the main handler. */ + vfio_mem_readb_fd(addr, bar); + + /* Read configuration register. */ + uint8_t ret = vfio_config_readb(0, addr - bar->quirks.configmirror.offset, dev); + vfio_log_op("VFIO %s: Config mirror: Read %02X from index %02X\n", + dev->name, ret, addr - bar->quirks.configmirror.offset); + + return ret; +} + +static uint16_t +vfio_quirk_configmirror_readw(uint32_t addr, void *priv) +{ + vfio_region_t *bar = (vfio_region_t *) priv; + vfio_device_t *dev = bar->dev; + + /* Cascade to the main handler. */ + vfio_mem_readw_fd(addr, bar); + + /* Read configuration register. */ + uint16_t ret = vfio_config_readw(0, addr - bar->quirks.configmirror.offset, dev); + vfio_log_op("VFIO %s: Config mirror: Read %04X from index %02X\n", + dev->name, ret, addr - bar->quirks.configmirror.offset); + + return ret; +} + +static uint32_t +vfio_quirk_configmirror_readl(uint32_t addr, void *priv) +{ + vfio_region_t *bar = (vfio_region_t *) priv; + vfio_device_t *dev = bar->dev; + + /* Cascade to the main handler. */ + vfio_mem_readl_fd(addr, bar); + + /* Read configuration register. 
*/ + uint32_t ret = vfio_config_readl(0, addr - bar->quirks.configmirror.offset, dev); + vfio_log_op("VFIO %s: Config mirror: Read %08X from index %02X\n", + dev->name, ret, addr - bar->quirks.configmirror.offset); + + return ret; +} + +static void +vfio_quirk_configmirror_writeb(uint32_t addr, uint8_t val, void *priv) +{ + vfio_region_t *bar = (vfio_region_t *) priv; + vfio_device_t *dev = bar->dev; + + /* Write configuration register. */ + vfio_log_op("VFIO %s: Config mirror: Write %02X to index %02X\n", + dev->name, val, addr - bar->quirks.configmirror.offset); + vfio_config_writeb(0, addr - bar->quirks.configmirror.offset, val, dev); +} + +static void +vfio_quirk_configmirror_writew(uint32_t addr, uint16_t val, void *priv) +{ + vfio_region_t *bar = (vfio_region_t *) priv; + vfio_device_t *dev = bar->dev; + + /* Write configuration register. */ + vfio_log_op("VFIO %s: Config mirror: Write %04X to index %02X\n", + dev->name, val, addr - bar->quirks.configmirror.offset); + vfio_config_writew(0, addr - bar->quirks.configmirror.offset, val, dev); +} + +static void +vfio_quirk_configmirror_writel(uint32_t addr, uint32_t val, void *priv) +{ + vfio_region_t *bar = (vfio_region_t *) priv; + vfio_device_t *dev = bar->dev; + + /* Write configuration register. */ + vfio_log_op("VFIO %s: Config mirror: Write %08X to index %02X\n", + dev->name, val, addr - bar->quirks.configmirror.offset); + vfio_config_writel(0, addr - bar->quirks.configmirror.offset, val, dev); +} + +static void +vfio_quirk_configmirror(vfio_device_t *dev, vfio_region_t *bar, + uint32_t offset, uint8_t mapping_slot, uint8_t enable) +{ + /* Get the additional memory mapping structure. */ + mem_mapping_t *mapping = &bar->quirks.mem_mappings[mapping_slot]; + + vfio_log("VFIO %s: %sapping configuration space mirror for %s @ %08X\n", + dev->name, enable ? "M" : "Unm", bar->name, bar->emulated_offset + offset); + + /* Add mapping if it wasn't already added. + Being added after region setup, it should override the main BAR mapping. */ + if (!mapping->base) + mem_mapping_add(mapping, 0, 0, + vfio_quirk_configmirror_readb, + vfio_quirk_configmirror_readw, + vfio_quirk_configmirror_readl, + vfio_quirk_configmirror_writeb, + vfio_quirk_configmirror_writew, + vfio_quirk_configmirror_writel, + NULL, MEM_MAPPING_EXTERNAL, bar); + + /* Store start offset. */ + bar->quirks.configmirror.offset = bar->emulated_offset + offset; + + /* Enable or disable mapping. */ + if (enable) + mem_mapping_set_addr(mapping, bar->emulated_offset + offset, 256); + else + mem_mapping_disable(mapping); +} + +static void +vfio_quirk_configwindow_index_writeb(uint16_t addr, uint8_t val, void *priv) +{ + vfio_region_t *bar = (vfio_region_t *) priv; + vfio_device_t *dev = bar->dev; + + /* Write configuration register index. */ + vfio_log_op("VFIO %s: Config window: Write index[%d] %02X\n", + dev->name, addr & 3, val); + uint8_t offset = (addr & 3) << 3; + bar->quirks.configwindow.index &= ~(0x000000ff << offset); + bar->quirks.configwindow.index |= val << offset; + + /* Cascade to the main handler. */ + vfio_io_writeb_fd(addr, val, bar); +} + +static void +vfio_quirk_configwindow_index_writew(uint16_t addr, uint16_t val, void *priv) +{ + vfio_region_t *bar = (vfio_region_t *) priv; + vfio_device_t *dev = bar->dev; + + /* Write configuration register index. 
*/ + vfio_log_op("VFIO %s: Config window: Write index[%d] %04X\n", + dev->name, addr & 2, val); + uint8_t offset = (addr & 2) << 3; + bar->quirks.configwindow.index &= ~(0x0000ffff << offset); + bar->quirks.configwindow.index |= val << offset; + + /* Cascade to the main handler. */ + vfio_io_writew_fd(addr, val, bar); +} + +static void +vfio_quirk_configwindow_index_writel(uint16_t addr, uint32_t val, void *priv) +{ + vfio_region_t *bar = (vfio_region_t *) priv; + vfio_device_t *dev = bar->dev; + + /* Write configuration register index. */ + vfio_log_op("VFIO %s: Config window: Write index %08X\n", + dev->name, val); + bar->quirks.configwindow.index = val; + + /* Cascade to the main handler. */ + vfio_io_writel_fd(addr, val, bar); +} + +static uint8_t +vfio_quirk_configwindow_data_readb(uint16_t addr, void *priv) +{ + vfio_region_t *bar = (vfio_region_t *) priv; + vfio_device_t *dev = bar->dev; + + /* Cascade to the main handler. */ + uint8_t ret = vfio_io_readb_fd(addr, bar); + + /* Read configuration register if part of the main PCI configuration space. */ + uint32_t index = bar->quirks.configwindow.index; + if ((index >= bar->quirks.configwindow.offset[0].start) && (index <= bar->quirks.configwindow.offset[0].end)) { + ret = vfio_config_readb(0, index - bar->quirks.configwindow.offset[0].start, dev); + vfio_log_op("VFIO %s: Config window: Read %02X from primary index %08X\n", + dev->name, ret, index); + } else if ((index >= bar->quirks.configwindow.offset[1].start) && (index <= bar->quirks.configwindow.offset[1].end)) { + ret = vfio_config_readb(0, index - bar->quirks.configwindow.offset[1].start, dev); + vfio_log_op("VFIO %s: Config window: Read %02X from secondary index %08X\n", + dev->name, ret, index); + } + + return ret; +} + +static uint16_t +vfio_quirk_configwindow_data_readw(uint16_t addr, void *priv) +{ + vfio_region_t *bar = (vfio_region_t *) priv; + vfio_device_t *dev = bar->dev; + + /* Cascade to the main handler. */ + uint16_t ret = vfio_io_readw_fd(addr, bar); + + /* Read configuration register if part of the main PCI configuration space. */ + uint32_t index = bar->quirks.configwindow.index; + if ((index >= bar->quirks.configwindow.offset[0].start) && (index <= bar->quirks.configwindow.offset[0].end)) { + ret = vfio_config_readw(0, index - bar->quirks.configwindow.offset[0].start, dev); + vfio_log_op("VFIO %s: Config window: Read %04X from primary index %08X\n", + dev->name, ret, index); + } else if ((index >= bar->quirks.configwindow.offset[1].start) && (index <= bar->quirks.configwindow.offset[1].end)) { + ret = vfio_config_readw(0, index - bar->quirks.configwindow.offset[1].start, dev); + vfio_log_op("VFIO %s: Config window: Read %04X from secondary index %08X\n", + dev->name, ret, index); + } + + return ret; +} + +static uint32_t +vfio_quirk_configwindow_data_readl(uint16_t addr, void *priv) +{ + vfio_region_t *bar = (vfio_region_t *) priv; + vfio_device_t *dev = bar->dev; + + /* Cascade to the main handler. */ + uint32_t ret = vfio_io_readl_fd(addr, bar); + + /* Read configuration register if part of the main PCI configuration space. 
*/ + uint32_t index = bar->quirks.configwindow.index; + if ((index >= bar->quirks.configwindow.offset[0].start) && (index <= bar->quirks.configwindow.offset[0].end)) { + ret = vfio_config_readl(0, index - bar->quirks.configwindow.offset[0].start, dev); + vfio_log_op("VFIO %s: Config window: Read %08X from primary index %08X\n", + dev->name, ret, index); + } else if ((index >= bar->quirks.configwindow.offset[1].start) && (index <= bar->quirks.configwindow.offset[1].end)) { + ret = vfio_config_readl(0, index - bar->quirks.configwindow.offset[1].start, dev); + vfio_log_op("VFIO %s: Config window: Read %08X from secondary index %08X\n", + dev->name, ret, index); + } + + return ret; +} + +static void +vfio_quirk_configwindow_data_writeb(uint16_t addr, uint8_t val, void *priv) +{ + vfio_region_t *bar = (vfio_region_t *) priv; + vfio_device_t *dev = bar->dev; + + /* Write configuration register if part of the main PCI configuration space. */ + uint32_t index = bar->quirks.configwindow.index; + if ((index >= bar->quirks.configwindow.offset[0].start) && (index <= bar->quirks.configwindow.offset[0].end)) { + vfio_log_op("VFIO %s: Config window: Write %02X to primary index %08X\n", + dev->name, val, index); + vfio_config_writeb(0, index - bar->quirks.configwindow.offset[0].start, val, dev); + return; + } else if ((index >= bar->quirks.configwindow.offset[1].start) && (index <= bar->quirks.configwindow.offset[1].end)) { + vfio_log_op("VFIO %s: Config window: Write %02X to secondary index %08X\n", + dev->name, val, index); + vfio_config_writeb(0, index - bar->quirks.configwindow.offset[1].start, val, dev); + return; + } + + /* Cascade to the main handler. */ + vfio_io_writeb_fd(addr, val, bar); +} + +static void +vfio_quirk_configwindow_data_writew(uint16_t addr, uint16_t val, void *priv) +{ + vfio_region_t *bar = (vfio_region_t *) priv; + vfio_device_t *dev = bar->dev; + + /* Write configuration register if part of the main PCI configuration space. */ + uint32_t index = bar->quirks.configwindow.index; + if ((index >= bar->quirks.configwindow.offset[0].start) && (index <= bar->quirks.configwindow.offset[0].end)) { + vfio_log_op("VFIO %s: Config window: Write %04X to primary index %08X\n", + dev->name, val, index); + vfio_config_writew(0, index - bar->quirks.configwindow.offset[0].start, val, dev); + return; + } else if ((index >= bar->quirks.configwindow.offset[1].start) && (index <= bar->quirks.configwindow.offset[1].end)) { + vfio_log_op("VFIO %s: Config window: Write %04X to secondary index %08X\n", + dev->name, val, index); + vfio_config_writew(0, index - bar->quirks.configwindow.offset[1].start, val, dev); + return; + } + + /* Cascade to the main handler. */ + vfio_io_writew_fd(addr, val, bar); +} + +static void +vfio_quirk_configwindow_data_writel(uint16_t addr, uint32_t val, void *priv) +{ + vfio_region_t *bar = (vfio_region_t *) priv; + vfio_device_t *dev = bar->dev; + + /* Write configuration register if part of the main PCI configuration space. 
*/ + uint32_t index = bar->quirks.configwindow.index; + if ((index >= bar->quirks.configwindow.offset[0].start) && (index <= bar->quirks.configwindow.offset[0].end)) { + vfio_log_op("VFIO %s: Config window: Write %08X to primary index %08X\n", + dev->name, val, index); + vfio_config_writel(0, index - bar->quirks.configwindow.offset[0].start, val, dev); + return; + } else if ((index >= bar->quirks.configwindow.offset[1].start) && (index <= bar->quirks.configwindow.offset[1].end)) { + vfio_log_op("VFIO %s: Config window: Write %08X to secondary index %08X\n", + dev->name, val, index); + vfio_config_writel(0, index - bar->quirks.configwindow.offset[1].start, val, dev); + return; + } + + /* Cascade to the main handler. */ + vfio_io_writel_fd(addr, val, bar); +} + +static void +vfio_quirk_configwindow(vfio_device_t *dev, vfio_region_t *bar, + uint16_t index_offset, uint16_t index_size, + uint16_t data_offset, uint16_t data_size, + uint32_t window_offset0, uint32_t window_offset1, uint8_t enable) +{ + vfio_log("VFIO %s: %sapping configuration space window for %s @ %04X and %04X\n", + dev->name, enable ? "M" : "Unm", bar->name, + bar->emulated_offset + index_offset, bar->emulated_offset + data_offset); + + /* Store start offsets, as well as end offsets to speed up operations. */ + bar->quirks.configwindow.offset[0].start = window_offset0; + bar->quirks.configwindow.offset[0].end = window_offset0 + 255; + bar->quirks.configwindow.offset[1].start = window_offset1; + bar->quirks.configwindow.offset[1].end = window_offset1 + 255; + + /* Enable or disable mapping. */ + vfio_quirk_capture_io(NULL, bar, bar->emulated_offset + index_offset, index_size, enable, + vfio_io_readb_fd, + vfio_io_readw_fd, + vfio_io_readl_fd, + vfio_quirk_configwindow_index_writeb, + vfio_quirk_configwindow_index_writew, + vfio_quirk_configwindow_index_writel); + vfio_quirk_capture_io(NULL, bar, bar->emulated_offset + data_offset, data_size, enable, + vfio_quirk_configwindow_data_readb, + vfio_quirk_configwindow_data_readw, + vfio_quirk_configwindow_data_readl, + vfio_quirk_configwindow_data_writeb, + vfio_quirk_configwindow_data_writew, + vfio_quirk_configwindow_data_writel); +} + +static uint8_t +vfio_quirk_iomirror_readb(uint16_t addr, void *priv) +{ + vfio_region_t *bar = (vfio_region_t *) priv; + + /* Read I/O port mirror from memory-mapped space. */ + uint8_t ret = vfio_mem_readb_fd(bar->emulated_offset + bar->quirks.iomirror.offset + addr, bar); +#ifdef ENABLE_VFIO_LOG + vfio_device_t *dev = bar->dev; + vfio_log_op("VFIO %s: I/O mirror: Read %02X from %04X (%08X)\n", dev->name, + ret, addr, bar->quirks.iomirror.offset + addr); +#endif + return ret; +} + +static uint16_t +vfio_quirk_iomirror_readw(uint16_t addr, void *priv) +{ + vfio_region_t *bar = (vfio_region_t *) priv; + + /* Read I/O port mirror from memory-mapped space. */ + uint16_t ret = vfio_mem_readw_fd(bar->emulated_offset + bar->quirks.iomirror.offset + addr, bar); +#ifdef ENABLE_VFIO_LOG + vfio_device_t *dev = bar->dev; + vfio_log_op("VFIO %s: I/O mirror: Read %04X from %04X (%08X)\n", dev->name, + ret, addr, bar->quirks.iomirror.offset + addr); +#endif + return ret; +} + +static uint32_t +vfio_quirk_iomirror_readl(uint16_t addr, void *priv) +{ + vfio_region_t *bar = (vfio_region_t *) priv; + + /* Read I/O port mirror from memory-mapped space. 
*/ + uint32_t ret = vfio_mem_readl_fd(bar->emulated_offset + bar->quirks.iomirror.offset + addr, bar); +#ifdef ENABLE_VFIO_LOG + vfio_device_t *dev = bar->dev; + vfio_log_op("VFIO %s: I/O mirror: Read %08X from %04X (%08X)\n", dev->name, + ret, addr, bar->quirks.iomirror.offset + addr); +#endif + return ret; +} + +static void +vfio_quirk_iomirror_writeb(uint16_t addr, uint8_t val, void *priv) +{ + vfio_region_t *bar = (vfio_region_t *) priv; + + /* Write I/O port mirror to memory-mapped space. */ +#ifdef ENABLE_VFIO_LOG + vfio_device_t *dev = bar->dev; + vfio_log_op("VFIO %s: I/O mirror: Write %02X to %04X (%08X)\n", dev->name, + val, addr, bar->quirks.iomirror.offset + addr); +#endif + vfio_mem_writeb_fd(bar->emulated_offset + bar->quirks.iomirror.offset + addr, val, bar); +} + +static void +vfio_quirk_iomirror_writew(uint16_t addr, uint16_t val, void *priv) +{ + vfio_region_t *bar = (vfio_region_t *) priv; + + /* Write I/O port mirror to memory-mapped space. */ +#ifdef ENABLE_VFIO_LOG + vfio_device_t *dev = bar->dev; + vfio_log_op("VFIO %s: I/O mirror: Write %04X to %04X (%08X)\n", dev->name, + val, addr, bar->quirks.iomirror.offset + addr); +#endif + vfio_mem_writew_fd(bar->emulated_offset + bar->quirks.iomirror.offset + addr, val, bar); +} + +static void +vfio_quirk_iomirror_writel(uint16_t addr, uint32_t val, void *priv) +{ + vfio_region_t *bar = (vfio_region_t *) priv; + + /* Write I/O port mirror to memory-mapped space. */ +#ifdef ENABLE_VFIO_LOG + vfio_device_t *dev = bar->dev; + vfio_log_op("VFIO %s: I/O mirror: Write %08X to %04X (%08X)\n", dev->name, + val, addr, bar->quirks.iomirror.offset + addr); +#endif + vfio_mem_writel_fd(bar->emulated_offset + bar->quirks.iomirror.offset + addr, val, bar); +} + +static void +vfio_quirk_iomirror(vfio_device_t *dev, vfio_region_t *bar, + uint32_t offset, uint16_t base, uint16_t length, uint8_t enable) +{ + vfio_log("VFIO %s: %sapping I/O mirror for %s @ %08X\n", + dev->name, enable ? "M" : "Unm", bar->name, bar->emulated_offset + offset); + + /* Save I/O mirror offset, only one per BAR for now. */ + bar->quirks.iomirror.offset = offset; + + /* Add or remove quirk handler from port range. */ + if (enable) + io_sethandler(base, length, + bar->read ? vfio_quirk_iomirror_readb : NULL, + bar->read ? vfio_quirk_iomirror_readw : NULL, + bar->read ? vfio_quirk_iomirror_readl : NULL, + bar->write ? vfio_quirk_iomirror_writeb : NULL, + bar->write ? vfio_quirk_iomirror_writew : NULL, + bar->write ? vfio_quirk_iomirror_writel : NULL, + bar); + else + io_removehandler(base, length, + bar->read ? vfio_quirk_iomirror_readb : NULL, + bar->read ? vfio_quirk_iomirror_readw : NULL, + bar->read ? vfio_quirk_iomirror_readl : NULL, + bar->write ? vfio_quirk_iomirror_writeb : NULL, + bar->write ? vfio_quirk_iomirror_writew : NULL, + bar->write ? vfio_quirk_iomirror_writel : NULL, + bar); +} + +static uint8_t +vfio_quirk_ati3c3_readb(uint16_t addr, void *priv) +{ + vfio_device_t *dev = (vfio_device_t *) priv; + + /* Read high byte of the I/O BAR address. */ + uint8_t ret = dev->quirks.ati3c3.bar->emulated_offset >> 8; + vfio_log_op("VFIO %s: ATI 3C3: Read %02X\n", ret); + + return ret; +} + +static void +vfio_quirk_nvidiabar5(vfio_device_t *dev) +{ + /* Remap config window based on BAR enable status and the master/enable registers. 
*/ + vfio_quirk_configwindow(dev, &dev->bars[5], 0x08, 4, 0x0c, 4, 0x1800, 0x88000, + dev->quirks.nvidiabar5.bar_enable && ((dev->quirks.nvidiabar5.master_enable & 0x0000000100000001) == 0x0000000100000001)); +} + +static void +vfio_quirk_nvidiabar5_writeb(uint16_t addr, uint8_t val, void *priv) +{ + vfio_device_t *dev = (vfio_device_t *) priv; + + /* Write master/enable registers. */ + vfio_log_op("VFIO %s: NVIDIA BAR 5: Write [%d] %02X\n", + dev->name, addr & 7, val); + uint8_t offset = (addr & 7) << 3; + dev->quirks.nvidiabar5.master_enable &= ~(0x00000000000000ff << offset); + dev->quirks.nvidiabar5.master_enable |= val << offset; + + /* Update window to account for changes in master/enable registers. */ + vfio_quirk_nvidiabar5(dev); + + /* Cascade to the main handler. */ + vfio_io_writeb_fd(addr, val, &dev->bars[5]); +} + +static void +vfio_quirk_nvidiabar5_writew(uint16_t addr, uint16_t val, void *priv) +{ + vfio_device_t *dev = (vfio_device_t *) priv; + + /* Write master/enable registers. */ + vfio_log_op("VFIO %s: NVIDIA BAR 5: Write [%d] %04X\n", + dev->name, addr & 7, val); + uint8_t offset = (addr & 6) << 3; + dev->quirks.nvidiabar5.master_enable &= ~(0x000000000000ffff << offset); + dev->quirks.nvidiabar5.master_enable |= val << offset; + + /* Update window to account for changes in master/enable registers. */ + vfio_quirk_nvidiabar5(dev); + + /* Cascade to the main handler. */ + vfio_io_writew_fd(addr, val, &dev->bars[5]); +} + +static void +vfio_quirk_nvidiabar5_writel(uint16_t addr, uint32_t val, void *priv) +{ + vfio_device_t *dev = (vfio_device_t *) priv; + + /* Write master/enable registers. */ + vfio_log_op("VFIO %s: NVIDIA BAR 5: Write [%d] %08X\n", + dev->name, addr & 7, val); + uint8_t offset = (addr & 4) << 3; + dev->quirks.nvidiabar5.master_enable &= ~(0x00000000ffffffff << offset); + dev->quirks.nvidiabar5.master_enable |= val << offset; + + /* Update window to account for changes in master/enable registers. */ + vfio_quirk_nvidiabar5(dev); + + /* Cascade to the main handler. */ + vfio_io_writel_fd(addr, val, &dev->bars[5]); +} + +static uint8_t +vfio_quirk_nvidia3d0_state_readb(uint16_t addr, void *priv) +{ + vfio_device_t *dev = (vfio_device_t *) priv; + + /* Reset state on read. */ + dev->quirks.nvidia3d0.state = NVIDIA_3D0_NONE; + vfio_log_op("VFIO %s: NVIDIA 3D0: Switching to NONE state (byte read)\n", dev->name); + + /* Cascade to the main handler. */ + return vfio_io_readb_fd(addr, &dev->vga_io_hi); +} + +static uint16_t +vfio_quirk_nvidia3d0_state_readw(uint16_t addr, void *priv) +{ + vfio_device_t *dev = (vfio_device_t *) priv; + + /* Reset state on read. */ + dev->quirks.nvidia3d0.state = NVIDIA_3D0_NONE; + vfio_log_op("VFIO %s: NVIDIA 3D0: Switching to NONE state (word read)\n", dev->name); + + /* Cascade to the main handler. */ + return vfio_io_readw_fd(addr, &dev->vga_io_hi); +} + +static uint32_t +vfio_quirk_nvidia3d0_state_readl(uint16_t addr, void *priv) +{ + vfio_device_t *dev = (vfio_device_t *) priv; + + /* Reset state on read. */ + dev->quirks.nvidia3d0.state = NVIDIA_3D0_NONE; + vfio_log_op("VFIO %s: NVIDIA 3D0: Switching to NONE state (dword read)\n", dev->name); + + /* Cascade to the main handler. */ + return vfio_io_readl_fd(addr, &dev->vga_io_hi); +} + +static void +vfio_quirk_nvidia3d0_state_writeb(uint16_t addr, uint8_t val, void *priv) +{ + vfio_device_t *dev = (vfio_device_t *) priv; + + /* Commands don't fit in a byte; just reset state and move on. 
*/ + dev->quirks.nvidia3d0.state = NVIDIA_3D0_NONE; + vfio_log_op("VFIO %s: NVIDIA 3D0: Switching to NONE state (byte write)\n", dev->name); + + /* Cascade to the main handler. */ + vfio_io_writeb_fd(addr, val, &dev->vga_io_hi); +} + +static void +vfio_quirk_nvidia3d0_state_writew(uint16_t addr, uint16_t val, void *priv) +{ + vfio_device_t *dev = (vfio_device_t *) priv; + + uint8_t prev_state = dev->quirks.nvidia3d0.state; + dev->quirks.nvidia3d0.state = NVIDIA_3D0_NONE; + + /* Interpret NVIDIA commands. */ + switch (val) { + case 0x338: + if (prev_state == NVIDIA_3D0_NONE) { + dev->quirks.nvidia3d0.state = NVIDIA_3D0_SELECT; + vfio_log_op("VFIO %s: NVIDIA 3D0: Switching to SELECT state (word write)\n", dev->name); + } + break; + + case 0x538: + if (prev_state == NVIDIA_3D0_WINDOW) { + dev->quirks.nvidia3d0.state = NVIDIA_3D0_READ; + vfio_log_op("VFIO %s: NVIDIA 3D0: Switching to READ state (word write)\n", dev->name); + } + break; + + case 0x738: + if (prev_state == NVIDIA_3D0_WINDOW) { + dev->quirks.nvidia3d0.state = NVIDIA_3D0_WRITE; + vfio_log_op("VFIO %s: NVIDIA 3D0: Switching to WRITE state (word write)\n", dev->name); + } + break; + } + + /* Cascade to the main handler. */ + vfio_io_writew_fd(addr, val, &dev->vga_io_hi); +} + +static void +vfio_quirk_nvidia3d0_state_writel(uint16_t addr, uint32_t val, void *priv) +{ + vfio_device_t *dev = (vfio_device_t *) priv; + + uint8_t prev_state = dev->quirks.nvidia3d0.state; + dev->quirks.nvidia3d0.state = NVIDIA_3D0_NONE; + + /* Interpret NVIDIA commands. */ + switch (val) { + case 0x338: + if (prev_state == NVIDIA_3D0_NONE) { + dev->quirks.nvidia3d0.state = NVIDIA_3D0_SELECT; + vfio_log_op("VFIO %s: NVIDIA 3D0: Switching to SELECT state (dword write)\n", dev->name); + } + break; + + case 0x538: + if (prev_state == NVIDIA_3D0_WINDOW) { + dev->quirks.nvidia3d0.state = NVIDIA_3D0_READ; + vfio_log_op("VFIO %s: NVIDIA 3D0: Switching to READ state (dword write)\n", dev->name); + } + break; + + case 0x738: + if (prev_state == NVIDIA_3D0_WINDOW) { + dev->quirks.nvidia3d0.state = NVIDIA_3D0_WRITE; + vfio_log_op("VFIO %s: NVIDIA 3D0: Switching to WRITE state (dword write)\n", dev->name); + } + break; + } + + /* Cascade to the main handler. */ + vfio_io_writel_fd(addr, val, &dev->vga_io_hi); +} + +static uint8_t +vfio_quirk_nvidia3d0_data_readb(uint16_t addr, void *priv) +{ + vfio_device_t *dev = (vfio_device_t *) priv; + + /* Cascade to the main handler. */ + uint8_t prev_state = dev->quirks.nvidia3d0.state; + uint8_t ret = vfio_io_readb_fd(addr, &dev->vga_io_hi); + dev->quirks.nvidia3d0.state = NVIDIA_3D0_NONE; + + /* Read configuration register if part of the main PCI configuration space. */ + if ((prev_state == NVIDIA_3D0_READ) && (((dev->quirks.nvidia3d0.index & 0xffffff00) == 0x00001800) || ((dev->quirks.nvidia3d0.index & 0xffffff00) == 0x00088000))) { + ret = vfio_config_readb(0, dev->quirks.nvidia3d0.index, dev); + vfio_log_op("VFIO %s: NVIDIA 3D0: Read %02X from index %08X\n", dev->name, + ret, dev->quirks.nvidia3d0.index); + } + + return ret; +} + +static uint16_t +vfio_quirk_nvidia3d0_data_readw(uint16_t addr, void *priv) +{ + vfio_device_t *dev = (vfio_device_t *) priv; + + /* Cascade to the main handler. */ + uint8_t prev_state = dev->quirks.nvidia3d0.state; + uint16_t ret = vfio_io_readw_fd(addr, &dev->vga_io_hi); + dev->quirks.nvidia3d0.state = NVIDIA_3D0_NONE; + + /* Read configuration register if part of the main PCI configuration space. 
*/ + if ((prev_state == NVIDIA_3D0_READ) && (((dev->quirks.nvidia3d0.index & 0xffffff00) == 0x00001800) || ((dev->quirks.nvidia3d0.index & 0xffffff00) == 0x00088000))) { + ret = vfio_config_readw(0, dev->quirks.nvidia3d0.index, dev); + vfio_log_op("VFIO %s: NVIDIA 3D0: Read %04X from index %08X\n", dev->name, + ret, dev->quirks.nvidia3d0.index); + } + + return ret; +} + +static uint32_t +vfio_quirk_nvidia3d0_data_readl(uint16_t addr, void *priv) +{ + vfio_device_t *dev = (vfio_device_t *) priv; + + /* Cascade to the main handler. */ + uint8_t prev_state = dev->quirks.nvidia3d0.state; + uint32_t ret = vfio_io_readl_fd(addr, &dev->vga_io_hi); + dev->quirks.nvidia3d0.state = NVIDIA_3D0_NONE; + + /* Read configuration register if part of the main PCI configuration space. */ + if ((prev_state == NVIDIA_3D0_READ) && (((dev->quirks.nvidia3d0.index & 0xffffff00) == 0x00001800) || ((dev->quirks.nvidia3d0.index & 0xffffff00) == 0x00088000))) { + ret = vfio_config_readl(0, dev->quirks.nvidia3d0.index, dev); + vfio_log_op("VFIO %s: NVIDIA 3D0: Read %08X from index %08X\n", dev->name, + ret, dev->quirks.nvidia3d0.index); + } + + return ret; +} + +static void +vfio_quirk_nvidia3d0_data_writeb(uint16_t addr, uint8_t val, void *priv) +{ + vfio_device_t *dev = (vfio_device_t *) priv; + + uint8_t prev_state = dev->quirks.nvidia3d0.state; + dev->quirks.nvidia3d0.state = NVIDIA_3D0_NONE; + + if (prev_state == NVIDIA_3D0_SELECT) { + /* Write MMIO index. */ + dev->quirks.nvidia3d0.index = val; + dev->quirks.nvidia3d0.state = NVIDIA_3D0_WINDOW; + vfio_log_op("VFIO %s: NVIDIA 3D0: Write index %02X\n", dev->name, val); + } else if (prev_state == NVIDIA_3D0_WRITE) { + /* Write configuration register if part of the main PCI configuration space. */ + if (((dev->quirks.nvidia3d0.index & 0xffffff00) == 0x00001800) || ((dev->quirks.nvidia3d0.index & 0xffffff00) == 0x00088000)) { + /* Write configuration register. */ + vfio_log_op("VFIO %s: NVIDIA 3D0: Write %02X to index %08X\n", dev->name, + val, dev->quirks.nvidia3d0.index); + vfio_config_writeb(0, dev->quirks.nvidia3d0.index, val, dev); + return; + } + } + + /* Cascade to the main handler. */ + vfio_io_writeb_fd(addr, val, &dev->vga_io_hi); +} + +static void +vfio_quirk_nvidia3d0_data_writew(uint16_t addr, uint16_t val, void *priv) +{ + vfio_device_t *dev = (vfio_device_t *) priv; + + uint8_t prev_state = dev->quirks.nvidia3d0.state; + dev->quirks.nvidia3d0.state = NVIDIA_3D0_NONE; + + if (prev_state == NVIDIA_3D0_SELECT) { + /* Write MMIO index. */ + dev->quirks.nvidia3d0.index = val; + dev->quirks.nvidia3d0.state = NVIDIA_3D0_WINDOW; + vfio_log_op("VFIO %s: NVIDIA 3D0: Write index %04X\n", dev->name, val); + } else if (prev_state == NVIDIA_3D0_WRITE) { + /* Write configuration register if part of the main PCI configuration space. */ + if (((dev->quirks.nvidia3d0.index & 0xffffff00) == 0x00001800) || ((dev->quirks.nvidia3d0.index & 0xffffff00) == 0x00088000)) { + vfio_log_op("VFIO %s: NVIDIA 3D0: Write %04X to index %08X\n", dev->name, + val, dev->quirks.nvidia3d0.index); + vfio_config_writew(0, dev->quirks.nvidia3d0.index, val, dev); + return; + } + } + + /* Cascade to the main handler. */ + vfio_io_writew_fd(addr, val, &dev->vga_io_hi); +} + +static void +vfio_quirk_nvidia3d0_data_writel(uint16_t addr, uint32_t val, void *priv) +{ + vfio_device_t *dev = (vfio_device_t *) priv; + + uint8_t prev_state = dev->quirks.nvidia3d0.state; + dev->quirks.nvidia3d0.state = NVIDIA_3D0_NONE; + + if (prev_state == NVIDIA_3D0_SELECT) { + /* Write MMIO index. 
*/ + dev->quirks.nvidia3d0.index = val; + dev->quirks.nvidia3d0.state = NVIDIA_3D0_WINDOW; + vfio_log_op("VFIO %s: NVIDIA 3D0: Write index %08X\n", dev->name, val); + } else if (prev_state == NVIDIA_3D0_WRITE) { + /* Write configuration register if part of the main PCI configuration space. */ + if (((dev->quirks.nvidia3d0.index & 0xffffff00) == 0x00001800) || ((dev->quirks.nvidia3d0.index & 0xffffff00) == 0x00088000)) { + /* Write configuration register. */ + vfio_log_op("VFIO %s: NVIDIA 3D0: Write %08X to index %08X\n", dev->name, + val, dev->quirks.nvidia3d0.index); + vfio_config_writel(0, dev->quirks.nvidia3d0.index, val, dev); + return; + } + } + + /* Cascade to the main handler. */ + vfio_io_writel_fd(addr, val, &dev->vga_io_hi); +} + +static void +vfio_quirk_remap(vfio_device_t *dev, vfio_region_t *bar, uint8_t enable) +{ + /* Read vendor ID. */ + uint16_t vendor; + if (pread(dev->config.fd, &vendor, sizeof(vendor), dev->config.offset) != sizeof(vendor)) + vendor = 0x0000; + + int i, j; + switch (vendor) { + case 0x1002: /* ATI */ + i = (vfio_bar_gettype(dev, &dev->bars[1]) == 0x01) && (dev->bars[1].size >= 256); + j = (vfio_bar_gettype(dev, &dev->bars[4]) == 0x01) && (dev->bars[4].size >= 256); + + /* ATI/AMD cards report the I/O BAR's high byte on port 3C3, and according + to the Red Hat slide deck, this is used for VBIOS bootstrapping purposes. + This I/O BAR can be either 1 or 4, so we probe which one it is. If unsure + (shouldn't really happen), pick 1 which is mostly used by older cards. */ + if ((bar == &dev->vga_io_hi) && (i || j)) { + dev->quirks.ati3c3.bar = (j && !i) ? &dev->bars[4] : &dev->bars[1]; + vfio_log("VFIO %s: %sapping ATI 3C3 quirk (BAR %d)\n", dev->name, + enable ? "M" : "Unm", dev->quirks.ati3c3.bar->bar_id); + vfio_quirk_capture_io(dev, bar, 0x3c3, 1, enable, + vfio_quirk_ati3c3_readb, NULL, NULL, + NULL, NULL, NULL); + } + + /* BAR 2 configuration space mirror, and BAR 1/4 configuration space window. */ + if (j && !i) { + /* QEMU only enables the mirror here if BAR 2 is 64-bit capable. */ + if ((bar->bar_id == 2) && ((vfio_config_readb(0, 0x18, dev) & 0x07) == 0x04)) + vfio_quirk_configmirror(dev, bar, 0x4000, 0, enable); + else if (bar->bar_id == 4) + vfio_quirk_configwindow(dev, bar, 0x00, 4, 0x04, 4, 0x4000, 0x4000, enable); + } else { + if (bar->bar_id == 2) + vfio_quirk_configmirror(dev, bar, 0xf00, 0, enable); + else if (bar->bar_id == 1) + vfio_quirk_configwindow(dev, bar, 0x00, 4, 0x04, 4, 0xf00, 0xf00, enable); + } + break; + + case 0x1023: /* Trident */ + /* Mirror TGUI acceleration port range to memory-mapped space, since the PCI bridge + VGA decode policy doesn't allow it to be forwarded directly to the real card. */ + if ((bar->bar_id == 1) && (vfio_bar_gettype(dev, bar) == 0x00) && (bar->size >= 65536)) { + /* Port range from vid_tgui9440.c */ + vfio_quirk_iomirror(dev, bar, 0, 0x2100, 256, enable); + } + break; + + case 0x10de: /* NVIDIA */ + /* BAR 0 configuration space mirrors. */ + if ((bar->bar_id == 0) && (vfio_bar_gettype(dev, bar) == 0x00)) { + vfio_quirk_configmirror(dev, bar, 0x1800, 0, enable); + vfio_quirk_configmirror(dev, bar, 0x88000, 1, enable); + } + + /* BAR 5 configuration space window. */ + if ((bar->bar_id == 5) && (vfio_bar_gettype(dev, bar) == 0x01)) { + vfio_log("VFIO %s: %sapping NVIDIA BAR 5 quirk\n", dev->name, enable ? 
"M" : "Unm"); + vfio_quirk_capture_io(dev, bar, bar->emulated_offset, 8, enable, + vfio_io_readb_fd, + vfio_io_readw_fd, + vfio_io_readl_fd, + vfio_quirk_nvidiabar5_writeb, + vfio_quirk_nvidiabar5_writew, + vfio_quirk_nvidiabar5_writel); + + /* Update window to account for changes in BAR enable status. */ + dev->quirks.nvidiabar5.bar_enable = enable; + vfio_quirk_nvidiabar5(dev); + } + + /* Port 3D0 configuration space window. */ + if ((bar == &dev->vga_io_hi) && dev->bars[1].size) { + vfio_log("VFIO %s: %sapping NVIDIA 3D0 quirk\n", dev->name, enable ? "M" : "Unm"); + vfio_quirk_capture_io(dev, bar, 0x3d0, 1, enable, + vfio_quirk_nvidia3d0_data_readb, + vfio_quirk_nvidia3d0_data_readw, + vfio_quirk_nvidia3d0_data_readl, + vfio_quirk_nvidia3d0_data_writeb, + vfio_quirk_nvidia3d0_data_writew, + vfio_quirk_nvidia3d0_data_writel); + vfio_quirk_capture_io(dev, bar, 0x3d4, 1, enable, + vfio_quirk_nvidia3d0_state_readb, + vfio_quirk_nvidia3d0_state_readw, + vfio_quirk_nvidia3d0_state_readl, + vfio_quirk_nvidia3d0_state_writeb, + vfio_quirk_nvidia3d0_state_writew, + vfio_quirk_nvidia3d0_state_writel); + } + break; + + case 0x5333: /* S3 */ + /* Mirror enhanced command port ranges to memory-mapped space, since the PCI bridge + VGA decode policy doesn't allow those to be forwarded directly to the real card. */ + if (vfio_bar_gettype(dev, &dev->bars[0]) != 0x00) + break; + if ((dev->bars[0].size == 33554432) && (dev->bar_count == 1)) { + /* Older chips can only remap to VGA A0000. We can tell + those through BAR 0 being 32M and the only BAR. */ + if (bar == &dev->vga_mem) { + i = 0; + + /* Main port list from vid_s3.c */ + vfio_quirk_iomirror(dev, bar, i, 0x42e8, 2, enable); + vfio_quirk_iomirror(dev, bar, i, 0x46e8, 2, enable); + vfio_quirk_iomirror(dev, bar, i, 0x4ae8, 2, enable); + +s3_old_mmio: + vfio_quirk_iomirror(dev, bar, i, 0x82e8, 4, enable); + vfio_quirk_iomirror(dev, bar, i, 0x86e8, 4, enable); + vfio_quirk_iomirror(dev, bar, i, 0x8ae8, 4, enable); + vfio_quirk_iomirror(dev, bar, i, 0x8ee8, 4, enable); + vfio_quirk_iomirror(dev, bar, i, 0x92e8, 4, enable); + vfio_quirk_iomirror(dev, bar, i, 0x96e8, 4, enable); + + vfio_quirk_iomirror(dev, bar, i, 0x9ae8, 4, enable); + + vfio_quirk_iomirror(dev, bar, i, 0x9ee8, 2, enable); + vfio_quirk_iomirror(dev, bar, i, 0xa2e8, 4, enable); + vfio_quirk_iomirror(dev, bar, i, 0xa6e8, 4, enable); + vfio_quirk_iomirror(dev, bar, i, 0xaae8, 4, enable); + vfio_quirk_iomirror(dev, bar, i, 0xaee8, 4, enable); + + vfio_quirk_iomirror(dev, bar, i, 0xb2e8, 4, enable); + + vfio_quirk_iomirror(dev, bar, i, 0xb6e8, 2, enable); + vfio_quirk_iomirror(dev, bar, i, 0xbae8, 2, enable); + vfio_quirk_iomirror(dev, bar, i, 0xbee8, 2, enable); + vfio_quirk_iomirror(dev, bar, i, 0xe2e8, 2, enable); + + vfio_quirk_iomirror(dev, bar, i, 0xd2e8, 2, enable); + vfio_quirk_iomirror(dev, bar, i, 0xe6e8, 4, enable); + vfio_quirk_iomirror(dev, bar, i, 0xeae8, 4, enable); + vfio_quirk_iomirror(dev, bar, i, 0xeee8, 4, enable); + } + } else if ((dev->bars[0].size == 67108864) && (dev->bar_count == 1)) { + /* Trio64V+ and ViRGE chips can remap to BAR 0 + 16M. We can tell those through + BAR 0 being 64M = ((16M linear + 16M MMIO) * both endians) and the only BAR. */ + if (bar->bar_id == 0) { + i = 0x1000000; /* 16M MMIO offset */ + +s3_new_mmio: /* There's a configuration space mirror in here as well. */ + vfio_quirk_configmirror(dev, bar, i + 0x8000, 0, enable); + + /* Subsystem Control/Status and Advanced Function Control. 
*/ + vfio_quirk_iomirror(dev, bar, i + 0x8504 - 0x42e8, 0x42e8, 2, enable); + vfio_quirk_iomirror(dev, bar, i + 0x850c - 0x4ae8, 0x4ae8, 2, enable); + + /* The rest maps exactly as older chips. */ + goto s3_old_mmio; + } + } else if ((dev->bars[0].size >= 524288) && (vfio_bar_gettype(dev, &dev->bars[1]) == 0x00)) { + /* Savage chips break the linear framebuffer out to + BAR 1+, eliminating the 16M MMIO offset from BAR 0. */ + if (bar->bar_id == 0) { + i = 0; + goto s3_new_mmio; + } + } + break; + } +} + +static uint8_t +vfio_bar_gettype(vfio_device_t *dev, vfio_region_t *bar) +{ + /* Read and store BAR type from device if unknown. */ + if (bar->type == 0xff) { + if (pread(dev->config.fd, &bar->type, sizeof(bar->type), + dev->config.offset + 0x10 + (bar->bar_id << 2)) + == sizeof(bar->type)) + bar->type &= 0x01; + else + bar->type = 0xff; + } + + /* Return stored BAR type. */ + return bar->type; +} + +static void +vfio_bar_remap(vfio_device_t *dev, vfio_region_t *bar, uint32_t new_offset) +{ + vfio_log("VFIO %s: bar_remap(%s, %08X)\n", dev->name, bar->name, new_offset); + + /* Act according to the BAR type. */ + uint8_t bar_type = vfio_bar_gettype(dev, bar); + if (bar_type == 0x00) { /* Memory BAR */ + if (bar->emulated_offset) { + vfio_log("VFIO %s: Unmapping %s memory @ %08X-%08X\n", dev->name, + bar->name, bar->emulated_offset, bar->emulated_offset + bar->size - 1); + + /* Unmap any quirks. */ + vfio_quirk_remap(dev, bar, 0); + + /* Disable memory mapping. */ + mem_mapping_disable(&bar->mem_mapping); + + /* Disable MSI-X table and PBA mappings if applicable to this BAR. */ + if (dev->irq.msix.table_bar == bar->bar_id) + mem_mapping_disable(&dev->irq.msix.table_mapping); + if (dev->irq.msix.pba_bar == bar->bar_id) + mem_mapping_disable(&dev->irq.msix.pba_mapping); + } + + bar->mmap_precalc = bar->mmap_base - new_offset; + /* Expansion ROM requires both ROM enable and memory enable. */ + if (((bar->bar_id != 0xff) || dev->rom_enabled) && dev->mem_enabled && new_offset) { + vfio_log("VFIO %s: Mapping %s memory @ %08X-%08X\n", dev->name, + bar->name, new_offset, new_offset + bar->size - 1); + + /* Enable memory mapping. */ + if (bar->mmap_base) /* mmap available */ + mem_mapping_set_p(&bar->mem_mapping, bar->mmap_precalc); + mem_mapping_set_addr(&bar->mem_mapping, new_offset, bar->size); + + /* Map any quirks. */ + vfio_quirk_remap(dev, bar, 1); + + /* Enable MSI-X table and PBA mappings if applicable to this BAR. */ + if (dev->irq.msix.table_bar == bar->bar_id) { + dev->irq.msix.table_offset_precalc = new_offset + dev->irq.msix.table_offset; + mem_mapping_set_addr(&dev->irq.msix.table_mapping, + dev->irq.msix.table_offset_precalc, + dev->irq.msix.table_size); + } + if (dev->irq.msix.pba_bar == bar->bar_id) { + dev->irq.msix.pba_offset_precalc = new_offset + dev->irq.msix.pba_offset; + mem_mapping_set_addr(&dev->irq.msix.pba_mapping, + dev->irq.msix.pba_offset_precalc, + dev->irq.msix.pba_size); + } + } + } else if (bar_type == 0x01) { /* I/O BAR */ + if (bar->emulated_offset) { + vfio_log("VFIO %s: Unmapping %s I/O @ %04X-%04X\n", dev->name, + bar->name, bar->emulated_offset, bar->emulated_offset + bar->size - 1); + + /* Unmap any quirks. */ + vfio_quirk_remap(dev, bar, 0); + + /* Disable I/O mapping. */ + if (bar->mmap_base) /* mmap available */ + io_removehandler(bar->emulated_offset, bar->size, + bar->read ? vfio_io_readb_mm : NULL, + bar->read ? vfio_io_readw_mm : NULL, + bar->read ? vfio_io_readl_mm : NULL, + bar->write ? vfio_io_writeb_mm : NULL, + bar->write ? 
vfio_io_writew_mm : NULL, + bar->write ? vfio_io_writel_mm : NULL, + bar->mmap_precalc); + else /* mmap not available */ + io_removehandler(bar->emulated_offset, bar->size, + bar->read ? vfio_io_readb_fd : NULL, + bar->read ? vfio_io_readw_fd : NULL, + bar->read ? vfio_io_readl_fd : NULL, + bar->write ? vfio_io_writeb_fd : NULL, + bar->write ? vfio_io_writew_fd : NULL, + bar->write ? vfio_io_writel_fd : NULL, + bar); + } + + bar->mmap_precalc = bar->mmap_base - new_offset; + if (dev->io_enabled && new_offset) { + vfio_log("VFIO %s: Mapping %s I/O @ %04X-%04X\n", dev->name, + bar->name, new_offset, new_offset + bar->size - 1); + + /* Enable I/O mapping. */ + if (bar->mmap_base) /* mmap available */ + io_sethandler(new_offset, bar->size, + bar->read ? vfio_io_readb_mm : NULL, + bar->read ? vfio_io_readw_mm : NULL, + bar->read ? vfio_io_readl_mm : NULL, + bar->write ? vfio_io_writeb_mm : NULL, + bar->write ? vfio_io_writew_mm : NULL, + bar->write ? vfio_io_writel_mm : NULL, + bar->mmap_precalc); + else /* mmap not available */ + io_sethandler(new_offset, bar->size, + bar->read ? vfio_io_readb_fd : NULL, + bar->read ? vfio_io_readw_fd : NULL, + bar->read ? vfio_io_readl_fd : NULL, + bar->write ? vfio_io_writeb_fd : NULL, + bar->write ? vfio_io_writew_fd : NULL, + bar->write ? vfio_io_writel_fd : NULL, + bar); + + /* Map any quirks. */ + vfio_quirk_remap(dev, bar, 1); + } + } + + /* Set new emulated and precalculated offsets. + The precalculated offsets speed up read/write operations. */ + bar->emulated_offset = new_offset; + bar->precalc_offset = bar->offset - new_offset; +} + +static uint32_t +ceilpow2(uint32_t size) +{ + uint32_t pow_size = 1 << log2i(size); + if (pow_size < size) + return pow_size << 1; + return pow_size; +} + +static uint8_t +vfio_config_readb(int func, int addr, void *priv) +{ + vfio_device_t *dev = (vfio_device_t *) priv; + if (func) + return 0xff; + + intx_high = 0; + + /* Read register from device. */ + addr &= 0xff; + uint8_t ret; + if (pread(dev->config.fd, &ret, 1, dev->config.offset + addr) != 1) { + vfio_log("VFIO %s: config_readb(%d, %02X) failed\n", dev->name, + func, addr); + return 0xff; + } + + /* Change value accordingly. */ + uint8_t bar_id, offset, new; + switch (addr) { + case 0x10 ... 0x27: /* BARs */ + /* Stop if this BAR is absent. */ + bar_id = (addr - 0x10) >> 2; + if (!dev->bars[bar_id].read && !dev->bars[bar_id].write) { + ret = 0x00; + break; + } + + /* Mask off and insert static bits. */ + offset = (addr & 3) << 3; + new = dev->bars[bar_id].emulated_offset >> offset; + if (!offset) { + switch (vfio_bar_gettype(dev, &dev->bars[bar_id])) { + case 0x00: /* Memory BAR */ + new = (new & ~0x07) | (ret & 0x07); + break; + + case 0x01: /* I/O BAR */ + new = (new & ~0x03) | (ret & 0x03); + break; + } + } + ret = new; + break; + + case 0x30 ... 0x33: /* Expansion ROM */ + /* Stop if the ROM is absent. */ + if (!dev->rom.read) { + ret = 0x00; + break; + } + + /* Mask off and insert ROM enable bit. */ + offset = (addr & 3) << 3; + ret = dev->rom.emulated_offset >> offset; + if (!offset) + ret = (ret & ~0x01) | dev->rom_enabled; + break; + + default: /* other (capabilities) */ + if (dev->msi_cap && (addr >= dev->msi_cap)) { /* MSI */ + /* Adjust register offset to account for different structure levels. */ + offset = addr - dev->msi_cap; + if (!(dev->irq.msi.ctl & 0x0080) && (offset >= 0x08)) + offset += 4; + switch (offset) { + case 0x02 ... 
0x03: /* Message Control */ + offset = (offset - 0x02) << 3; + ret = dev->irq.msi.ctl >> offset; + goto end; + + case 0x04 ... 0x07: /* Message Address */ + offset = (offset - 0x04) << 3; + ret = dev->irq.msi.address >> offset; + goto end; + + case 0x08 ... 0x0b: /* Message Upper Address */ + offset = (offset - 0x08) << 3; + ret = dev->irq.msi.address_upper >> offset; + goto end; + + case 0x0c ... 0x0d: /* Message Data */ + offset = (offset - 0x0c) << 3; + ret = dev->irq.msi.data >> offset; + goto end; + + case 0x10 ... 0x13: /* Mask Bits */ + if (dev->irq.msi.ctl & 0x0100) { + offset = (offset - 0x10) << 3; + ret = dev->irq.msi.mask >> offset; + goto end; + } + break; + + case 0x14 ... 0x17: /* Pending Bits */ + if (dev->irq.msi.ctl & 0x0100) { + offset = (offset - 0x14) << 3; + ret = dev->irq.msi.pending >> offset; + goto end; + } + break; + } + } + if (dev->msix_cap && (addr >= dev->msix_cap)) { /* MSI-X */ + offset = addr - dev->msix_cap; + switch (offset) { + case 0x02 ... 0x03: /* Message Control */ + offset = (offset - 0x02) << 3; + ret = dev->irq.msix.ctl >> offset; + goto end; + } + } +end: + break; + } + + vfio_log("VFIO %s: config_readb(%02X) = %02X\n", dev->name, + addr, ret); + + return ret; +} + +static uint16_t +vfio_config_readw(int func, int addr, void *priv) +{ + return vfio_config_readb(func, addr, priv) | (vfio_config_readb(func, addr + 1, priv) << 8); +} + +static uint32_t +vfio_config_readl(int func, int addr, void *priv) +{ + return vfio_config_readb(func, addr, priv) | (vfio_config_readb(func, addr + 1, priv) << 8) | (vfio_config_readb(func, addr + 2, priv) << 16) | (vfio_config_readb(func, addr + 3, priv) << 24); +} + +static void +vfio_config_writeb(int func, int addr, uint8_t val, void *priv) +{ + vfio_device_t *dev = (vfio_device_t *) priv; + if (func) + return; + + addr &= 0xff; + vfio_log("VFIO %s: config_writeb(%02X, %02X)\n", dev->name, addr, val); + + intx_high = 0; + + /* VFIO should block anything we shouldn't write to, such as BARs. */ + (void) !pwrite(dev->config.fd, &val, 1, dev->config.offset + addr); + + /* Act on some written values. */ + uint8_t new_mem_enabled; + uint8_t new_io_enabled; + uint8_t bar_id; + uint8_t offset; + uint32_t new_value; + uint64_t val64; + + switch (addr) { + case 0x04: /* Command */ + /* Determine new memory and I/O enable states. */ + new_mem_enabled = !!(val & PCI_COMMAND_MEM); + new_io_enabled = !!(val & PCI_COMMAND_IO); + + vfio_log("VFIO %s: Command Memory[%d] I/O[%d]\n", dev->name, + new_mem_enabled, new_io_enabled); + + /* Remap regions only if their respective enable bits have changed. */ + if (dev->mem_enabled ^ new_mem_enabled) { + /* Set new memory enable state. */ + dev->mem_enabled = new_mem_enabled; + + /* Remap memory BARs. */ + for (uint8_t i = 0; i < 6; i++) { + if (vfio_bar_gettype(dev, &dev->bars[i]) == 0x00) + vfio_bar_remap(dev, &dev->bars[i], dev->bars[i].emulated_offset); + } + + /* Remap ROM if present. */ + if (dev->rom.read) + vfio_bar_remap(dev, &dev->rom, dev->rom.emulated_offset); + + /* Remap VGA framebuffer region if present. */ + if (dev->vga_mem.bar_id) + vfio_bar_remap(dev, &dev->vga_mem, 0xa0000); + } + if (dev->io_enabled ^ new_io_enabled) { + /* Set new I/O enable state. */ + dev->io_enabled = new_io_enabled; + + /* Remap I/O BARs. */ + for (uint8_t i = 0; i < 6; i++) { + if (vfio_bar_gettype(dev, &dev->bars[i]) == 0x01) + vfio_bar_remap(dev, &dev->bars[i], dev->bars[i].emulated_offset); + } + + /* Remap VGA I/O regions if present. 
*/ + if (dev->vga_io_lo.bar_id) { + vfio_bar_remap(dev, &dev->vga_io_lo, 0x3b0); + vfio_bar_remap(dev, &dev->vga_io_hi, 0x3c0); + } + } + break; + + case 0x10 ... 0x27: /* BARs */ + /* Stop if this BAR is absent. */ + bar_id = (addr - 0x10) >> 2; + if (!dev->bars[bar_id].read && !dev->bars[bar_id].write) + break; + + /* Mask off static bits. */ + offset = (addr & 3) << 3; + if (!offset) { + switch (vfio_bar_gettype(dev, &dev->bars[bar_id])) { + case 0x00: /* Memory BAR */ + val &= ~0x07; + break; + + case 0x01: /* I/O BAR */ + val &= ~0x03; + break; + } + } + + /* Remap BAR. */ + new_value = dev->bars[bar_id].emulated_offset & ~(0x000000ff << offset); + new_value |= val << offset; + new_value &= ~(ceilpow2(dev->bars[bar_id].size) - 1); + vfio_bar_remap(dev, &dev->bars[bar_id], new_value); + break; + + case 0x30 ... 0x33: /* Expansion ROM */ + /* Stop if the ROM is absent. */ + if (!dev->rom.read) + break; + + /* Set ROM enable bit. */ + offset = (addr & 3) << 3; + if (!offset) { + dev->rom_enabled = val & 0x01; + val &= 0xfe; + } + + /* Remap ROM. */ + new_value = (dev->rom.emulated_offset & ~(0x000000ff << offset)); + new_value |= val << offset; + new_value &= ~(ceilpow2(dev->rom.size) - 1); + vfio_bar_remap(dev, &dev->rom, new_value); + break; + + case 0x3d: /* Interrupt Pin */ + if (val != dev->irq.intx.pin) + vfio_irq_intx_setpin(dev); + break; + + default: /* other (capabilities) */ + if (dev->msi_cap && (addr >= dev->msi_cap)) { /* MSI */ + /* Adjust register offset to account for different structure levels. */ + offset = addr - dev->msi_cap; + if (!(dev->irq.msi.ctl & 0x0080) && (offset >= 0x08)) + offset += 4; + switch (offset) { + case 0x00 ... 0x01: /* Capability */ + goto end; + + case 0x02 ... 0x03: /* Message Control */ + offset = (offset - 0x02) << 3; + new_value = dev->irq.msi.ctl & ~(0x00ff << offset); + new_value |= val << offset; + + /* Enable or disable MSI if requested and not conflicting with MSI-X. */ + if (dev->irq.type != VFIO_PCI_MSIX_IRQ_INDEX) { + if (!(dev->irq.msi.ctl & 0x0001) && (new_value & 0x0001)) + vfio_irq_enable(dev, VFIO_PCI_MSI_IRQ_INDEX); + else if ((dev->irq.msi.ctl & 0x0001) && !(new_value & 0x0001)) + vfio_irq_msi_disable(dev); + } + + /* Update control register. */ + dev->irq.msi.ctl = (new_value & 0x0071) | (dev->irq.msi.ctl & 0xff8e); + + /* Update enabled vector count and mask. */ + dev->irq.msi.vector_enable_count = MIN(1 << ((dev->irq.msi.ctl >> 1) & 3), dev->irq.msi.vector_count); + dev->irq.msi.vector_enable_mask = dev->irq.msi.vector_enable_count - 1; + goto end; + + case 0x04 ... 0x07: /* Message Address */ + offset = (offset - 0x04) << 3; + new_value = dev->irq.msi.address & ~(0x000000ff << offset); + new_value |= val << offset; + dev->irq.msi.address = new_value & 0xfffffffc; + goto end; + + case 0x08 ... 0x0b: /* Message Upper Address */ + offset = (offset - 0x08) << 3; + new_value = dev->irq.msi.address_upper & ~(0x000000ff << offset); + new_value |= val << offset; + dev->irq.msi.address_upper = new_value; + goto end; + + case 0x0c ... 0x0d: /* Message Data */ + offset = (offset - 0x0c) << 3; + new_value = dev->irq.msi.data & ~(0x00ff << offset); + new_value |= val << offset; + dev->irq.msi.data = new_value; + goto end; + + case 0x0e ... 0x0f: /* Reserved */ + case 0x14 ... 0x17: /* Pending Bits */ + if (dev->irq.msi.ctl & 0x0100) + goto end; + break; + + case 0x10 ... 
0x13: /* Mask Bits */ + if (dev->irq.msi.ctl & 0x0100) { + offset = (offset - 0x10) << 3; + new_value = dev->irq.msi.mask & ~(0x000000ff << offset); + new_value |= val << offset; + dev->irq.msi.mask = new_value; + + /* Service any unmasked pending interrupts if MSI is enabled. */ + if (dev->irq.msi.ctl & 0x0001) { + new_value = ~new_value; + val64 = 1; + for (uint8_t i = 0; i < dev->irq.msi.vector_enable_count; i++) { + if (dev->irq.msi.pending & ((1 << i) & new_value)) + (void) !write(dev->irq.vectors[i].fd, &val64, sizeof(val64)); + } + dev->irq.msi.pending &= new_value; + } + + goto end; + } + break; + } + } + if (dev->msix_cap && (addr >= dev->msix_cap)) { /* MSI-X */ + offset = addr - dev->msix_cap; + switch (offset) { + case 0x00 ... 0x01: /* Capability */ + case 0x04 ... 0x0b: /* Table/PBA Offset */ + goto end; + + case 0x02 ... 0x03: /* Message Control */ + offset = (offset - 0x02) << 3; + new_value = dev->irq.msix.ctl & ~(0x00ff << offset); + new_value |= val << offset; + + /* Enable or disable MSI-X if requested. */ + if (!(dev->irq.msix.ctl & 0x8000) && (new_value & 0x8000)) + vfio_irq_enable(dev, VFIO_PCI_MSIX_IRQ_INDEX); + else if ((dev->irq.msix.ctl & 0x8000) && !(new_value & 0x8000)) + vfio_irq_msix_disable(dev); + + /* Update control register. */ + dev->irq.msix.ctl = (new_value & 0xc000) | (dev->irq.msix.ctl & 0x3fff); + + /* Service any unmasked pending interrupts if MSI-X + is enabled and the global mask bit was cleared. */ + if ((dev->irq.msix.ctl & 0xc000) == 0x8000) { + for (uint16_t i = 0x000c; i < dev->irq.msix.table_size; i += 0x0010) + vfio_irq_msix_updatemask(dev, i); + } + goto end; + } + } +end: + break; + } +} + +static void +vfio_config_writew(int func, int addr, uint16_t val, void *priv) +{ + vfio_config_writeb(func, addr, val, priv); + vfio_config_writeb(func, addr | 1, val >> 8, priv); +} + +static void +vfio_config_writel(int func, int addr, uint32_t val, void *priv) +{ + vfio_config_writeb(func, addr, val, priv); + vfio_config_writeb(func, addr | 1, val >> 8, priv); + vfio_config_writeb(func, addr | 2, val >> 16, priv); + vfio_config_writeb(func, addr | 3, val >> 24, priv); +} + +static void +vfio_irq_thread(void *priv) +{ + int nfds, i; + uint64_t buf; + struct epoll_event events[16]; + struct vfio_irq_set irq_set = { + .argsz = sizeof(irq_set), + .index = VFIO_PCI_INTX_IRQ_INDEX, + .start = 0, + .count = 1 + }; + vfio_device_t *dev; + vfio_irq_t *irq; + + vfio_log("VFIO: IRQ thread started\n"); + + while (epoll_fd >= 0) { + /* Wait for an interrupt to come in. */ + nfds = epoll_wait(epoll_fd, events, sizeof(events) / sizeof(events[0]), -1); + if (nfds < 0) { + vfio_log("VFIO %s: epoll_wait failed (%d)\n", errno); + break; + } + + /* Process all interrupts which came in. */ + for (i = 0; i < nfds; i++) { + /* Only handle read events. */ + if (!(events[i].events & EPOLLIN)) + continue; + + /* Get the IRQ and device structures for this interrupt. */ + irq = (vfio_irq_t *) events[i].data.ptr; + if (!irq) { + /* Do nothing if this is the wake eventfd, which has no data. */ + (void) !read(irq_thread_wake_fd, &buf, sizeof(buf)); + continue; + } + dev = irq->dev; + + /* Reset eventfd counter. */ + (void) !read(irq->fd, &buf, sizeof(buf)); + + /* Don't hang waiting for the timer if we're closing. */ + if (closing) + continue; + + /* Log VFIO IRQ type and vector. */ + vfio_log_op("VFIO %s: %s IRQ on vector %d\n", dev->name, + ((irq->type == VFIO_PCI_INTX_IRQ_INDEX) ? "INTx" : (((irq->type == VFIO_PCI_MSI_IRQ_INDEX) ? 
"MSI" : ((irq->type == VFIO_PCI_MSIX_IRQ_INDEX) ? "MSI-X" : NULL)))), + irq->vector); + + /* Perform pre-checks for specific IRQ types. */ + switch (irq->type) { + case VFIO_PCI_INTX_IRQ_INDEX: + /* Mask host IRQ. */ + irq_set.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_MASK; + ioctl(dev->fd, VFIO_DEVICE_SET_IRQS, &irq_set); + break; + + case VFIO_PCI_MSI_IRQ_INDEX: + /* Ignore MSI if this vector is not enabled. */ + if (irq->vector >= dev->irq.msi.vector_enable_count) { + vfio_log_op("VFIO %s: MSI vector not enabled (%d >= %d)\n", dev->name, + irq->vector, dev->irq.msi.vector_enable_count); + continue; + } + + /* Ignore MSI if the upper 32 bits of a 64-bit address are non-zero. */ + if (dev->irq.msi.address_upper) { + vfio_log_op("VFIO %s: MSI 64-bit address %08X%08X\n", dev->name, + dev->irq.msi.address_upper, dev->irq.msi.address); + continue; + } + + /* Mark MSI as pending if this vector is masked through per-vector masking. */ + if (dev->irq.msi.mask & (1 << irq->vector)) { + vfio_log_op("VFIO %s: MSI masked\n", dev->name); + dev->irq.msi.pending |= 1 << irq->vector; + continue; + } + break; + + case VFIO_PCI_MSIX_IRQ_INDEX: + /* Ignore MSI-X if the upper 32 bits of a 64-bit address are non-zero. */ + if (*((uint32_t *) &dev->irq.msix.table[irq->msix_offset | 0x4])) { + vfio_log_op("VFIO %s: MSI-X 64-bit address %016X\n", dev->name, + *((uint64_t *) &dev->irq.msix.table[irq->msix_offset])); + continue; + } + + /* Mark MSI-X as pending if this vector or all vectors are masked. */ + if ((dev->irq.msix.ctl & 0x4000) || (dev->irq.msix.table[irq->msix_offset | 0xc] & 0x01)) { + vfio_log_op("VFIO %s: MSI-X masked\n", dev->name); + dev->irq.msix.pba[irq->vector >> 3] |= 1 << (irq->vector & 0x07); + continue; + } + break; + } + + /* Tell the timer to service this interrupt. */ + current_irq = irq; + + /* Wait for the timer to do its job. */ + thread_wait_event(irq_event, -1); + thread_reset_event(irq_event); + vfio_log_op("VFIO %s: IRQ serviced\n", dev->name); + + /* Unmask host IRQ if this is INTx. */ + if (irq->type == VFIO_PCI_INTX_IRQ_INDEX) { + irq_set.flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_UNMASK; + ioctl(dev->fd, VFIO_DEVICE_SET_IRQS, &irq_set); + } + } + + /* Pause if we were asked to. */ + thread_wait_event(irq_thread_resume, -1); + } + + /* We're done here. */ + vfio_log("VFIO: IRQ thread finished\n"); +} + +static void +vfio_irq_timer(void *priv) +{ + /* Schedule next run. */ + timer_on_auto(&irq_timer, 100.0); + + /* Stop if we're not servicing an IRQ at the moment. */ + if (!current_irq) + return; + vfio_device_t *dev = current_irq->dev; + + /* Act according to the IRQ type. */ + uint16_t val; + switch (current_irq->type) { + case VFIO_PCI_INTX_IRQ_INDEX: + if (!dev->irq.intx.raised) { /* rising edge */ + vfio_log_op("VFIO %s: Raising IRQ on pin INT%c\n", dev->name, + '@' + dev->irq.intx.pin); + + /* Raise IRQ. */ + pci_set_irq(dev->slot, dev->irq.intx.pin, &dev->irq.intx.state); + + /* Mark the IRQ as active, so that a BAR read/write can lower it. */ + dev->irq.intx.raised = intx_high = 1; + } else if (!intx_high) { /* falling edge */ + vfio_log_op("VFIO %s: Lowering IRQ on pin INT%c\n", dev->name, + '@' + dev->irq.intx.pin); + + /* Lower IRQ. */ + pci_clear_irq(dev->slot, dev->irq.intx.pin, &dev->irq.intx.state); + + /* Mark the IRQ as no longer high. */ + dev->irq.intx.raised = intx_high = 0; + + /* Allow the IRQ thread to be unblocked. */ + break; + } + + /* Don't unblock the IRQ thread unless otherwise stated. 
*/ + return; + + case VFIO_PCI_MSI_IRQ_INDEX: + /* Insert the vector number into the value's lower bits. */ + val = (dev->irq.msi.data & ~dev->irq.msi.vector_enable_mask) | (current_irq->vector & dev->irq.msi.vector_enable_mask); + + /* Write value. */ + vfio_log_op("VFIO %s: Writing MSI value %04X to %04X\n", dev->name, val, dev->irq.msi.address); + mem_writew_phys(dev->irq.msi.address, val); + break; + + case VFIO_PCI_MSIX_IRQ_INDEX: + /* Write value. */ + vfio_log_op("VFIO %s: Writing MSI-X value %08X to %08X\n", dev->name, + *((uint32_t *) &dev->irq.msix.table[current_irq->msix_offset | 0x8]), + *((uint32_t *) &dev->irq.msix.table[current_irq->msix_offset])); + mem_writel_phys(*((uint32_t *) &dev->irq.msix.table[current_irq->msix_offset]), + *((uint32_t *) &dev->irq.msix.table[current_irq->msix_offset | 0x8])); + break; + } + + /* Unblock the IRQ thread. */ + current_irq = NULL; + thread_set_event(irq_event); +} + +static void +vfio_irq_disabletype(vfio_device_t *dev, int type) +{ + struct vfio_irq_set irq_set = { + .argsz = sizeof(irq_set), + .flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER, + .index = type, + .start = 0, + .count = 0, + }; + ioctl(dev->fd, VFIO_DEVICE_SET_IRQS, &irq_set); +} + +static void +vfio_irq_intx_disable(vfio_device_t *dev) +{ + /* Disable INTx on VFIO. */ + vfio_irq_disabletype(dev, VFIO_PCI_INTX_IRQ_INDEX); + + /* Clear pending interrupts. */ + dev->irq.intx.raised = intx_high = 0; + if (dev->irq.intx.pin) + pci_clear_irq(dev->slot, dev->irq.intx.pin, &dev->irq.intx.state); + + /* Disable interrupts altogether. */ + dev->irq.type = VFIO_PCI_NUM_IRQS; +} + +static void +vfio_irq_intx_setpin(vfio_device_t *dev) +{ + uint8_t val; + if (pread(dev->config.fd, &val, sizeof(val), dev->config.offset + 0x3d) == sizeof(val)) + dev->irq.intx.pin = val; + vfio_log("VFIO %s: IRQ pin is INT%c\n", dev->name, '@' + MIN(dev->irq.intx.pin, 'Z')); +} + +static void +vfio_irq_msi_disable(vfio_device_t *dev) +{ + /* Clear pending interrupts. */ + dev->irq.msi.pending = 0; + + /* Disable MSI on VFIO. */ + vfio_irq_disabletype(dev, VFIO_PCI_MSI_IRQ_INDEX); + + /* Re-enable INTx interrupts. */ + vfio_irq_enable(dev, VFIO_PCI_INTX_IRQ_INDEX); +} + +static void +vfio_irq_msix_disable(vfio_device_t *dev) +{ + /* Clear pending interrupts. */ + memset(dev->irq.msix.pba, 0, dev->irq.vector_count); + + /* Disable MSI-X on VFIO. */ + vfio_irq_disabletype(dev, VFIO_PCI_MSIX_IRQ_INDEX); + + /* Re-enable INTx interrupts. */ + vfio_irq_enable(dev, VFIO_PCI_INTX_IRQ_INDEX); +} + +static void +vfio_irq_msix_updatemask(vfio_device_t *dev, uint16_t offset) +{ + /* Service any unmasked pending interrupts. 
*/ + if (((dev->irq.msix.ctl & 0xc000) == 0x8000) && !(dev->irq.msix.table[offset] & 0x01) && (dev->irq.msix.pba[offset >> 7] & (1 << (offset & 0x07)))) { + uint64_t val = 1; + (void) !write(dev->irq.vectors[offset >> 4].fd, &val, sizeof(val)); + dev->irq.msix.pba[offset >> 7] &= ~(1 << (offset & 0x07)); + } +} + +#define VFIO_RW_MSIX(length_char, val_type, val_slength) \ + static val_type \ + vfio_irq_msix_table_read##length_char(uint32_t addr, void *priv) \ + { \ + vfio_device_t *dev = (vfio_device_t *) priv; \ + val_type ret = dev->irq.msix.table[addr - dev->irq.msix.table_offset_precalc]; \ + vfio_log_op("[%08X:%04X] VFIO %s: msix_table_read" #length_char "(%08X) = %0" #val_slength "X\n", CS, cpu_state.pc, dev->name, addr, ret); \ + return ret; \ + } \ + \ + static void \ + vfio_irq_msix_table_write##length_char(uint32_t addr, val_type val, void *priv) \ + { \ + vfio_device_t *dev = (vfio_device_t *) priv; \ + vfio_log_op("[%08X:%04X] VFIO %s: msix_table_write" #length_char "(%08X, %0" #val_slength "X)\n", CS, cpu_state.pc, dev->name, addr, val); \ + uint16_t offset = addr - dev->irq.msix.table_offset_precalc; \ + dev->irq.msix.table[offset] = val; \ + if ((offset & 0x000f) == 0x000c) \ + vfio_irq_msix_updatemask(dev, offset); \ + } \ + \ + static val_type \ + vfio_irq_msix_pba_read##length_char(uint32_t addr, void *priv) \ + { \ + vfio_device_t *dev = (vfio_device_t *) priv; \ + val_type ret = dev->irq.msix.table[addr - dev->irq.msix.pba_offset_precalc]; \ + vfio_log_op("[%08X:%04X] VFIO %s: msix_pba_read" #length_char "(%08X) = %0" #val_slength "X\n", CS, cpu_state.pc, dev->name, addr, ret); \ + return ret; \ + } \ + \ + static void \ + vfio_irq_msix_pba_write##length_char(uint32_t addr, val_type val, void *priv) \ + { \ + vfio_device_t *dev = (vfio_device_t *) priv; \ + vfio_log_op("[%08X:%04X] VFIO %s: msix_pba_write" #length_char "(%08X, %0" #val_slength "X)\n", CS, cpu_state.pc, dev->name, addr, val); \ + } + +VFIO_RW_MSIX(b, uint8_t, 2) +VFIO_RW_MSIX(w, uint16_t, 4) +VFIO_RW_MSIX(l, uint32_t, 8) + +static void +vfio_irq_disable(vfio_device_t *dev) +{ + /* Do nothing if IRQs are already disabled. */ + if (dev->irq.type == VFIO_PCI_NUM_IRQS) + return; + vfio_log("VFIO %s: irq_disable(%d)\n", dev->name, dev->irq.type); + + /* Pause IRQ thread. */ + thread_reset_event(irq_thread_resume); + uint64_t val = 1; + (void) !write(irq_thread_wake_fd, &val, sizeof(val)); + + /* Always disable INTx after disabling MSI/MSI-X. */ + if (dev->irq.type == VFIO_PCI_MSIX_IRQ_INDEX) + vfio_irq_msix_disable(dev); + else if (dev->irq.type == VFIO_PCI_MSI_IRQ_INDEX) + vfio_irq_msi_disable(dev); + if (dev->irq.type == VFIO_PCI_INTX_IRQ_INDEX) + vfio_irq_intx_disable(dev); + + /* Invalidate all IRQ vectors. */ + if (dev->irq.vectors) { + for (int i = 0; i < dev->irq.vector_count; i++) { + if (dev->irq.vectors[i].fd >= 0) { + /* Remove eventfd from epoll. */ + epoll_ctl(epoll_fd, EPOLL_CTL_DEL, dev->irq.vectors[i].fd, NULL); + close(dev->irq.vectors[i].fd); + } + } + free(dev->irq.vectors); + dev->irq.vectors = NULL; + dev->irq.vector_count = 0; + } + + /* Resume IRQ thread. */ + thread_set_event(irq_thread_resume); +} + +static void +vfio_irq_enable(vfio_device_t *dev, int type) +{ + /* Disable any existing IRQs. */ + vfio_irq_disable(dev); + + vfio_log("VFIO %s: irq_enable(%d)\n", dev->name, type); + + /* Determine the number of vectors needed. */ + switch (type) { + case VFIO_PCI_INTX_IRQ_INDEX: + /* Only one vector needed. 
*/ + dev->irq.vector_count = 1; + break; + + case VFIO_PCI_MSI_IRQ_INDEX: + /* Up to the number of vectors read during init is needed. */ + dev->irq.vector_count = dev->irq.msi.vector_count; + break; + + case VFIO_PCI_MSIX_IRQ_INDEX: + /* The number of vectors read during init is needed. */ + dev->irq.vector_count = dev->irq.msix.vector_count; + break; + } + + /* Prepare structure for enabling the interrupt type. */ + struct vfio_irq_set irq_set = { + .argsz = sizeof(irq_set) + (sizeof(int32_t) * dev->irq.vector_count), + .flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER, + .index = type, + .start = 0, + .count = dev->irq.vector_count + }; + int32_t *fd_list = (int32_t *) &irq_set.data; + struct epoll_event event = { .events = EPOLLIN }; + + /* Create interrupt vectors with their respective eventfds. */ + dev->irq.vectors = (vfio_irq_t *) malloc(sizeof(vfio_irq_t) * dev->irq.vector_count); + for (int i = 0; i < dev->irq.vector_count; i++) { + dev->irq.vectors[i].dev = dev; + dev->irq.vectors[i].type = type; + dev->irq.vectors[i].vector = i; + fd_list[i] = dev->irq.vectors[i].fd = eventfd(0, 0); + if (fd_list[i] < 0) + pclog("VFIO %s: IRQ eventfd %d failed (%d)\n", dev->name, i, errno); + else { + /* Add eventfd to epoll. */ + event.data.ptr = &dev->irq.vectors[i]; + epoll_ctl(epoll_fd, EPOLL_CTL_ADD, fd_list[i], &event); + } + dev->irq.vectors[i].msix_offset = i << 4; /* pre-calculated value to save operations on MSI-X processing */ + } + + /* Enable interrupt type on VFIO. */ + if (ioctl(dev->fd, VFIO_DEVICE_SET_IRQS, &irq_set)) + pclog("VFIO %s: SET_IRQS(%d, %d) failed (%d)\n", dev->name, + type, dev->irq.vector_count, errno); + dev->irq.type = type; +} + +static void +vfio_region_init(vfio_device_t *dev, struct vfio_region_info *reg, vfio_region_t *region) +{ + /* Set region structure information. */ + region->fd = dev->fd; + region->offset = reg->offset; + if (reg->index == VFIO_PCI_VGA_REGION_INDEX) { + region->bar_id = 0xfe; + if (region == &dev->vga_io_lo) { + region->offset += 0x3b0; + region->size = 12; + region->type = 0x01; + } else if (region == &dev->vga_io_hi) { + region->offset += 0x3c0; + region->size = 32; + region->type = 0x01; + } else { + region->offset += 0xa0000; + region->size = 131072; + region->type = 0x00; + } + } else { + region->size = reg->size; + region->type = 0xff; + } + region->read = !!(reg->flags & VFIO_REGION_INFO_FLAG_READ); + region->write = !!(reg->flags & VFIO_REGION_INFO_FLAG_WRITE); + region->dev = dev; + + /* Use special memory mapping for expansion ROMs. */ + if (reg->index == VFIO_PCI_ROM_REGION_INDEX) { + /* Use MMIO only. */ + region->fd = -1; + + /* Open ROM file if one was given. */ + FILE *fp = NULL; + if (dev->rom_fn) { + pclog("VFIO %s: Loading ROM from file: %s\n", dev->name, dev->rom_fn); + fp = fopen(dev->rom_fn, "rb"); + if (fp) { + /* Determine region size if the device has no ROM region. */ + if (!region->size) { + fseek(fp, 0, SEEK_END); + region->size = ceilpow2(ftell(fp)); + if (region->size < 2048) /* minimum size for an expansion ROM */ + region->size = 2048; + fseek(fp, 0, SEEK_SET); + } + } else { + /* Fall back to the device's ROM if it has one. */ + pclog("VFIO %s: Could not read ROM file, ", dev->name); + if (region->size) { + pclog("falling back to device ROM\n"); + } else { + /* Disable ROM. */ + pclog("not enabling ROM\n"); + region->read = region->write = 0; + goto end; + } + } + } + + /* Mark this as the expansion ROM region. 
*/ + region->type = 0x00; + region->bar_id = 0xff; + + /* Allocate ROM shadow area. */ + region->mmap_base = region->mmap_precalc = plat_mmap(region->size, 0); + if (region->mmap_base == ((void *) -1)) { + pclog("VFIO %s: ROM mmap(%" PRIu64 ") failed\n", dev->name, region->size); + region->mmap_base = NULL; + goto end; + } + memset(region->mmap_base, 0xff, region->size); + + int i, j = 0; + if (fp) { + /* Read ROM from file. */ + while ((i = fread(region->mmap_precalc, 1, + region->size - j, + fp)) + != 0) { + region->mmap_precalc += i; + j += i; + } + fclose(fp); + } else { + /* Read ROM from device. */ + while ((i = pread(dev->fd, region->mmap_precalc, + region->size - j, + region->offset + j)) + != 0) { + region->mmap_precalc += i; + j += i; + } + } + + /* Perform a few sanity checks on the ROM, starting with the signature. */ + j = 0; + if (*((uint16_t *) ®ion->mmap_base[0x00]) == 0xaa55) { + /* Check ROM length. */ + uint32_t rom_len = region->mmap_base[0x02] << 9; /* 512-byte blocks */ + if (rom_len > region->size) { + pclog("VFIO %s: Warning: ROM length (%d bytes) is larger than ROM region (%" PRIu64 " bytes)\n", + dev->name, rom_len, region->size); + j = 1; + } + + /* Check PCI pointer. */ + uint16_t pci_ptr = *((uint16_t *) ®ion->mmap_base[0x18]); + if (pci_ptr && (pci_ptr != 0xffff)) { + /* Check PCI pointer bounds. */ + if (pci_ptr <= (region->size - 0x12)) { + /* Check PCI header ROM length only if <= 130048 bytes, as the + ROM length is 8 bits in the main header and 16 bits in here. */ + uint32_t pci_len = *((uint16_t *) ®ion->mmap_base[pci_ptr + 0x18]) << 9; /* 512-byte blocks */ + if ((pci_len <= (254 << 9)) && (pci_len != rom_len)) { + pclog("VFIO %s: Warning: ROM length in main header (%d bytes) is " + "different from length in PCI header (%d bytes)\n", + dev->name, rom_len, pci_len); + j = 1; + } + } else { + pclog("VFIO %s: Warning: ROM has invalid PCI header pointer: %04X\n", + dev->name, pci_ptr); + j = 1; + } + } else { + pclog("VFIO %s: Warning: ROM has no PCI header pointer\n", + dev->name); + j = 1; + } + + /* Compare checksum. */ + uint8_t checksum = 0; + if (rom_len > region->size) /* don't go out of bounds */ + rom_len = region->size; + rom_len -= 1; + for (i = 0; i < rom_len; i++) + checksum -= region->mmap_base[i]; + if (checksum != region->mmap_base[i]) { + pclog("VFIO %s: Warning: ROM has bad checksum; expected %02X, got %02X\n", + dev->name, checksum, region->mmap_base[i]); + j = 1; + } + } else { + pclog("VFIO %s: Warning: ROM has no 55 AA signature\n", dev->name); + j = 1; + } + + /* Add a helpful reminder if a sanity check warning was printed + and no ROM file was specified in this device's configuration. */ + if (j && !dev->rom_fn) + pclog("VFIO %s: A custom ROM can be loaded with the _rom_fn directive.\n", dev->name); + } else { + /* Attempt to mmap the region. */ + region->mmap_base = mmap(NULL, region->size, + (region->read ? PROT_READ : 0) | (region->write ? PROT_WRITE : 0), + MAP_SHARED, region->fd, region->offset); + if (region->mmap_base == ((void *) -1)) /* mmap failed */ + region->mmap_base = NULL; + } + region->mmap_precalc = region->mmap_base; + +end: + vfio_log("VFIO %s: Region: %s (offset %lX) (%d bytes) ", dev->name, + region->name, region->offset, region->size); + + /* Create memory mapping for if we need it. */ + if (region->mmap_base) { /* mmap available */ + vfio_log("(MM)"); + mem_mapping_add(®ion->mem_mapping, 0, 0, + region->read ? vfio_mem_readb_mm : NULL, + region->read ? vfio_mem_readw_mm : NULL, + region->read ? 
vfio_mem_readl_mm : NULL, + region->write ? vfio_mem_writeb_mm : NULL, + region->write ? vfio_mem_writew_mm : NULL, + region->write ? vfio_mem_writel_mm : NULL, + NULL, MEM_MAPPING_EXTERNAL, region->mmap_precalc); + } else if (region->fd >= 0) { /* mmap not available, but fd is */ + vfio_log("(FD)"); + mem_mapping_add(®ion->mem_mapping, 0, 0, + region->read ? vfio_mem_readb_fd : NULL, + region->read ? vfio_mem_readw_fd : NULL, + region->read ? vfio_mem_readl_fd : NULL, + region->write ? vfio_mem_writeb_fd : NULL, + region->write ? vfio_mem_writew_fd : NULL, + region->write ? vfio_mem_writel_fd : NULL, + NULL, MEM_MAPPING_EXTERNAL, region); + } else { + vfio_log("(not mapped)"); + } + + vfio_log(" (%c%c)\n", region->read ? 'R' : '-', region->write ? 'W' : '-'); +} + +static void +vfio_region_close(vfio_device_t *dev, vfio_region_t *region) +{ + /* Stop if this region was not initialized. */ + if (!region->size) + return; + + /* Unmap memory if mmap was available. */ + if (region->mmap_base) + plat_munmap(region->mmap_base, region->size); +} + +static vfio_group_t * +vfio_group_get(int id, uint8_t add) +{ + /* Look for an existing group. */ + vfio_group_t *group = first_group; + while (group) { + if (group->id == id) + return group; + else if (group->next) + group = group->next; + else + break; + } + + /* Don't add a group if told not to. */ + if (!add) + return NULL; + + /* Add group if no matches were found. */ + if (group) { + group->next = (vfio_group_t *) malloc(sizeof(vfio_group_t)); + group = group->next; + } else { + group = first_group = (vfio_group_t *) malloc(sizeof(vfio_group_t)); + } + memset(group, 0, sizeof(vfio_group_t)); + group->id = id; + + /* Open VFIO group. */ + char group_file[32]; + snprintf(group_file, sizeof(group_file), "/dev/vfio/%d", group->id); + group->fd = open(group_file, O_RDWR); + if (group->fd < 0) { + pclog("VFIO: Group %d not found\n", group->id); + goto end; + } + + /* Check if the group is viable. */ + struct vfio_group_status group_status = { .argsz = sizeof(group_status) }; + if (ioctl(group->fd, VFIO_GROUP_GET_STATUS, &group_status)) { + pclog("VFIO: Group %d GET_STATUS failed (%d)\n", group->id, errno); + goto close_fd; + } else if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) { + pclog("VFIO: Group %d not viable\n", group->id); + goto close_fd; + } + + /* Claim the group. */ + if (ioctl(group->fd, VFIO_GROUP_SET_CONTAINER, &container_fd)) { + pclog("VFIO: Group %d SET_CONTAINER failed\n", group->id); + goto close_fd; + } + + goto end; + +close_fd: + close(group->fd); + group->fd = -1; +end: + return group; +} + +static void +vfio_dev_prereset(vfio_device_t *dev) +{ + vfio_log("VFIO %s: prereset()\n", dev->name); + + /* Disable interrupts. */ + vfio_irq_disable(dev); + + /* Extra steps for devices with power management capability. */ + if (dev->pm_cap) { + /* Make sure the device is in D0 state. */ + uint8_t pm_ctrl = vfio_config_readb(0, dev->pm_cap + 4, dev), + state = pm_ctrl & 0x03; + if (state) { + pm_ctrl &= ~0x03; + vfio_config_writeb(0, dev->pm_cap + 4, pm_ctrl, dev); + + pm_ctrl = vfio_config_readb(0, dev->pm_cap + 4, dev); + state = pm_ctrl & 0x03; + if (state) + vfio_log("VFIO %s: Device stuck in D%d state\n", dev->name, state); + } + + /* Enable PM reset if the device supports it. */ + dev->can_pm_reset = !(pm_ctrl & 0x08); + } + + /* Enable function-level reset if supported. 
*/ + dev->can_flr_reset = (dev->pcie_cap && (vfio_config_readb(0, dev->pcie_cap + 7, dev) & 0x10)) || (dev->af_cap && (vfio_config_readb(0, dev->af_cap + 3, dev) & 0x02)); + + /* Disable bus master, BARs, expansion ROM and VGA regions; also enable INTx. */ + vfio_config_writew(0, 0x04, vfio_config_readw(0, 0x04, dev) & ~0x0407, dev); +} + +static void +vfio_dev_postreset(vfio_device_t *dev) +{ + vfio_log("VFIO %s: postreset()\n", dev->name); + + /* Enable INTx interrupts. MSI(-X) can be enabled by the OS later. */ + if (!closing) + vfio_irq_enable(dev, VFIO_PCI_INTX_IRQ_INDEX); + + /* Reset BARs, whatever this does. */ + uint32_t val = 0; + for (uint8_t i = 0x10; i < 0x28; i++) + (void) !pwrite(dev->config.fd, &val, sizeof(val), dev->config.offset + i); +} + +static int +vfio_dev_init(vfio_device_t *dev) +{ + vfio_log("VFIO %s: init()\n", dev->name); + + /* Grab device. */ + dev->fd = ioctl(current_group->fd, VFIO_GROUP_GET_DEVICE_FD, dev->name); + if (dev->fd < 0) { + vfio_log("VFIO %s: GET_DEVICE_FD failed (%d)\n", dev->name, errno); + goto end; + } + + /* Get device information. */ + struct vfio_device_info device_info = { .argsz = sizeof(device_info) }; + if (ioctl(dev->fd, VFIO_DEVICE_GET_INFO, &device_info)) { + pclog("VFIO %s: GET_INFO failed (%d), check for error in kernel log\n", dev->name, errno); + goto end; + } + + /* Check if any regions were returned. */ + if (!device_info.num_regions) { + pclog("VFIO %s: No regions returned, check for error in kernel log\n", dev->name); + goto end; + } + + /* Set main reset flag. */ + dev->can_reset = !!(device_info.flags & VFIO_DEVICE_FLAGS_RESET); + + /* Establish region names. */ + for (uint8_t i = 0; i < 6; i++) { + sprintf(dev->bars[i].name, "BAR #%d", dev->bars[i].bar_id = i); + dev->bars[i].type = 0xff; + } + strcpy(dev->rom.name, "Expansion ROM"); + strcpy(dev->config.name, "Configuration space"); + strcpy(dev->vga_io_lo.name, "VGA MDA"); + strcpy(dev->vga_io_hi.name, "VGA CGA/EGA"); + strcpy(dev->vga_mem.name, "VGA Framebuffer"); + + /* Initialize all regions. */ + struct vfio_region_info reg = { .argsz = sizeof(reg) }; + uint8_t cls; + for (int i = 0; i < device_info.num_regions; i++) { + /* Get region information. */ + reg.index = i; + ioctl(dev->fd, VFIO_DEVICE_GET_REGION_INFO, ®); + + /* Move on to the next region if this one is not valid. */ + if (!reg.size) + continue; + + /* Initialize region according to its type. */ + switch (reg.index) { + case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX: + vfio_region_init(dev, ®, &dev->bars[reg.index - VFIO_PCI_BAR0_REGION_INDEX]); + if (reg.size) + dev->bar_count++; + break; + + case VFIO_PCI_ROM_REGION_INDEX: + vfio_region_init(dev, ®, &dev->rom); + break; + + case VFIO_PCI_CONFIG_REGION_INDEX: + vfio_region_init(dev, ®, &dev->config); + break; + + case VFIO_PCI_VGA_REGION_INDEX: + /* Don't initialize VGA region if this is not a video card. */ + if ((dev->config.fd > 0) && (pread(dev->config.fd, &cls, sizeof(cls), dev->config.offset + 0x0b) == sizeof(cls)) && (cls != 0x03)) + break; + + vfio_region_init(dev, ®, &dev->vga_io_lo); /* I/O [3B0:3BB] */ + vfio_region_init(dev, ®, &dev->vga_io_hi); /* I/O [3C0:3DF] */ + vfio_region_init(dev, ®, &dev->vga_mem); /* memory [A0000:BFFFF] */ + + /* Inform that a PCI VGA video card is attached if no video card is emulated. 
*/ + if (gfxcard == VID_NONE) + video_inform(VIDEO_FLAG_TYPE_SPECIAL, &timing_default); + break; + + default: + vfio_log("VFIO %s: Unknown region %d (offset %lX) (%d bytes) (%c%c)\n", + dev->name, reg.index, reg.offset, reg.size, + (reg.flags & VFIO_REGION_INFO_FLAG_READ) ? 'R' : '-', + (reg.flags & VFIO_REGION_INFO_FLAG_WRITE) ? 'W' : '-'); + break; + } + } + + /* Make sure we have a valid device. */ + if (!dev->config.fd || !dev->config.read) { + pclog("VFIO %s: No configuration space region\n", dev->name); + goto end; + } + + /* Initialize ROM region if the device doesn't have one and we're loading a ROM from file. */ + if (dev->rom_fn && !dev->rom.fd) { + reg.index = VFIO_PCI_ROM_REGION_INDEX; + reg.offset = reg.size = 0; + reg.flags = VFIO_REGION_INFO_FLAG_READ; + vfio_region_init(dev, ®, &dev->rom); + } + + /* Go through PCI capability list if the device declares one. */ + dev->irq.msix.table_bar = dev->irq.msix.pba_bar = 0x07; + uint8_t cap_ptr; + uint8_t cap_id; + if ((pread(dev->config.fd, &cap_ptr, sizeof(cap_ptr), dev->config.offset + 0x06) == sizeof(cap_ptr)) && (cap_ptr & 0x10)) { + vfio_log("VFIO %s: Device capabilities:", dev->name); + + /* Read pointer to the first capability. */ + if (pread(dev->config.fd, &cap_ptr, sizeof(cap_ptr), dev->config.offset + 0x34) != sizeof(cap_ptr)) + cap_ptr = 0; + while (cap_ptr && (cap_ptr != 0xff)) { /* check 0xff just in case */ + /* Read capability ID, and store pointers to ones we care about. */ + if (pread(dev->config.fd, &cap_id, sizeof(cap_id), dev->config.offset + cap_ptr) != sizeof(cap_id)) + cap_id = 0; + switch (cap_id) { + case 0x01: + vfio_log(" PM"); + dev->pm_cap = cap_ptr; + break; + + case 0x05: + vfio_log(" MSI"); + if (dev->msi_cap) /* multiple copies not permitted by spec */ + break; + dev->msi_cap = cap_ptr; + + /* Read control register. */ + if (pread(dev->config.fd, &dev->irq.msi.ctl, sizeof(dev->irq.msi.ctl), + dev->config.offset + dev->msi_cap + 2) + != sizeof(dev->irq.msi.ctl)) + dev->irq.msi.ctl = 0; + + /* Set vector count. */ + dev->irq.msi.vector_count = (dev->irq.msi.ctl >> 1) & 0x07; + break; + + case 0x10: + vfio_log(" PCIe"); + dev->pcie_cap = cap_ptr; + break; + + case 0x11: + vfio_log(" MSI-X"); + if (dev->msix_cap) /* multiple copies not permitted by spec */ + break; + dev->msix_cap = cap_ptr; + + /* Read control register. */ + if (pread(dev->config.fd, &dev->irq.msix.ctl, sizeof(dev->irq.msix.ctl), + dev->config.offset + dev->msix_cap + 2) + != sizeof(dev->irq.msix.ctl)) + dev->irq.msix.ctl = 0; + + /* Set vector count. */ + dev->irq.msix.vector_count = (dev->irq.msix.ctl & 0x07ff) + 1; + + /* Read table and PBA BARs and offsets. */ + if (pread(dev->config.fd, &dev->irq.msix.table_offset, sizeof(dev->irq.msix.table_offset), + dev->config.offset + dev->msix_cap + 4) + != sizeof(dev->irq.msix.table_offset)) + dev->irq.msix.table_offset = 0x00000007; + dev->irq.msix.table_bar = dev->irq.msix.table_offset & 0x00000007; + dev->irq.msix.table_offset &= 0xfffffff8; + + if (pread(dev->config.fd, &dev->irq.msix.pba_offset, sizeof(dev->irq.msix.pba_offset), + dev->config.offset + dev->msix_cap + 8) + != sizeof(dev->irq.msix.pba_offset)) + dev->irq.msix.pba_offset = 0x00000007; + dev->irq.msix.pba_bar = dev->irq.msix.pba_offset & 0x00000007; + dev->irq.msix.pba_offset &= 0xfffffff8; + + /* Allocate table and PBA structures. 
*/ + dev->irq.msix.table_size = dev->irq.msix.vector_count << 4; + dev->irq.msix.table = malloc(dev->irq.msix.table_size); + if (!dev->irq.msix.table) { + pclog("VFIO %s: MSI-X table malloc(%d) failed\n", dev->name, dev->irq.msix.table_size); + dev->irq.msix.table_size = dev->irq.msix.vector_count = 0; + } + + dev->irq.msix.pba_size = ((dev->irq.msix.vector_count - 1) >> 3) + 1; + dev->irq.msix.pba = malloc(dev->irq.msix.pba_size); + if (!dev->irq.msix.pba) { + pclog("VFIO %s: MSI-X PBA malloc(%d) failed\n", dev->name, dev->irq.msix.pba_size); + dev->irq.msix.pba_size = dev->irq.msix.vector_count = 0; + } + + /* Add table and PBA mappings. + Being added after region setup, they should override the main BAR mapping. */ + mem_mapping_add(&dev->irq.msix.table_mapping, 0, 0, + vfio_irq_msix_table_readb, + vfio_irq_msix_table_readw, + vfio_irq_msix_table_readl, + vfio_irq_msix_table_writeb, + vfio_irq_msix_table_writew, + vfio_irq_msix_table_writel, + NULL, MEM_MAPPING_EXTERNAL, dev); + + mem_mapping_add(&dev->irq.msix.pba_mapping, 0, 0, + vfio_irq_msix_pba_readb, + vfio_irq_msix_pba_readw, + vfio_irq_msix_pba_readl, + vfio_irq_msix_pba_writeb, + vfio_irq_msix_pba_writew, + vfio_irq_msix_pba_writel, + NULL, MEM_MAPPING_EXTERNAL, dev); + break; + + case 0x13: + vfio_log(" AF"); + dev->af_cap = cap_ptr; + break; + + default: + vfio_log(" [%02X]", cap_id); + break; + } + + /* Read pointer to the next capability. */ + if (pread(dev->config.fd, &cap_ptr, sizeof(cap_ptr), dev->config.offset + cap_ptr + 1) != sizeof(cap_ptr)) + cap_ptr = 0; + } + + vfio_log("\n"); + } + + /* Read INTx IRQ pin. */ + vfio_irq_intx_setpin(dev); + + /* Add PCI card while mapping the configuration space. */ + pci_add_card(PCI_ADD_NORMAL, vfio_config_readb, vfio_config_writeb, dev, &dev->slot); + + return 0; + +end: + if (dev->fd >= 0) + close(dev->fd); + return 1; +} + +static void +vfio_dev_close(vfio_device_t *dev) +{ + vfio_log("VFIO %s: close()\n", dev->name); + + /* Close all regions. */ + for (uint8_t i = 0; i < 6; i++) + vfio_region_close(dev, &dev->bars[i]); + vfio_region_close(dev, &dev->rom); + vfio_region_close(dev, &dev->config); + vfio_region_close(dev, &dev->vga_io_lo); + vfio_region_close(dev, &dev->vga_io_hi); + vfio_region_close(dev, &dev->vga_mem); + + /* Close device fd. */ + if (dev->fd >= 0) { + close(dev->fd); + dev->fd = -1; + } + + /* Clean up. */ + if (dev->irq.msix.table) + free(dev->irq.msix.table); + if (dev->irq.msix.pba) + free(dev->irq.msix.pba); + free(dev->name); +} + +void +vfio_unmap_dma(uint32_t offset, uint32_t size) +{ + struct vfio_iommu_type1_dma_unmap dma_unmap = { + .argsz = sizeof(dma_unmap), + .iova = offset, + .size = size + }; + + vfio_log("VFIO: unmap_dma(%08X, %d)\n", offset, size); + + /* Unmap DMA region. */ + if (!ioctl(container_fd, VFIO_IOMMU_UNMAP_DMA, &dma_unmap)) + return; + + vfio_log("VFIO: unmap_dma(%08X, %d) failed (%d)\n", offset, size, errno); +} + +void +vfio_map_dma(uint8_t *ptr, uint32_t offset, uint32_t size) +{ + struct vfio_iommu_type1_dma_map dma_map = { + .argsz = sizeof(dma_map), + .vaddr = (uint64_t) ptr, + .iova = offset, + .size = size, + .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE + }; + + vfio_log("VFIO: map_dma(%08X, %d)\n", offset, size); + + /* Map DMA region. */ + if (!ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &dma_map)) + return; + + /* QEMU says mapping should be retried in case of EBUSY. 
*/ + if (errno == EBUSY) { + vfio_unmap_dma(offset, size); + if (!ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &dma_map)) + return; + } + + pclog("VFIO: map_dma(%08X, %d) failed (%d)\n", offset, size, errno); +} + +static void +vfio_reset(void *priv) +{ + vfio_log("VFIO: reset()\n"); + + /* Pre-reset and figure out the reset type for all devices. */ + int size; + int count; + struct vfio_pci_hot_reset_info *hot_reset_info; + struct vfio_pci_dependent_device *devices; + char name[13]; + vfio_group_t *group = first_group; + vfio_device_t *dev; + while (group) { + dev = group->first_device; + while (dev) { + /* Pre-reset this device. */ + vfio_dev_prereset(dev); + + /* Clear hot reset capable flag for this device. */ + dev->can_hot_reset = 0; + + /* Get hot reset information for the first time to get the entry count. */ + size = sizeof(struct vfio_pci_hot_reset_info); + hot_reset_info = (struct vfio_pci_hot_reset_info *) malloc(size); + if (!hot_reset_info) { + vfio_log("VFIO %s: malloc(hot_reset_info) 1 failed\n", dev->name); + goto next1; + } + memset(hot_reset_info, 0, size); + hot_reset_info->argsz = size; + if (ioctl(dev->fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, hot_reset_info) && (errno != ENOSPC)) { + vfio_log("VFIO %s: GET_PCI_HOT_RESET_INFO 1 failed (%d)\n", dev->name, errno); + goto next1; + } + count = hot_reset_info->count; + free(hot_reset_info); + + /* Get hot reset information for the second time to get the actual entries. */ + size = sizeof(struct vfio_pci_hot_reset) + (sizeof(struct vfio_pci_dependent_device) * count); + hot_reset_info = (struct vfio_pci_hot_reset_info *) malloc(size); + if (!hot_reset_info) { + vfio_log("VFIO %s: malloc(hot_reset_info) 2 failed\n", dev->name); + goto next1; + } + memset(hot_reset_info, 0, size); + hot_reset_info->argsz = size; + if (ioctl(dev->fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, hot_reset_info)) { + vfio_log("VFIO %s: GET_PCI_HOT_RESET_INFO 2 failed (%d)\n", dev->name, errno); + goto next1; + } + devices = &hot_reset_info->devices[0]; + + /* Go through the dependent device entries. */ + for (int i = 0; i < count; i++) { + /* Build this dependent device's name. */ + snprintf(name, sizeof(name), "%04x:%02x:%02x.%1x", + devices[i].segment, devices[i].bus, + PCI_SLOT(devices[i].devfn), PCI_FUNC(devices[i].devfn)); + + /* Check if we own this device's group. */ + if (!vfio_group_get(devices[i].group_id, 0)) { + vfio_log("VFIO %s: Cannot hot reset; we don't own" + "group %d for dependent device %s\n", + dev->name, devices[i].group_id, name); + goto next1; + } + } + + /* Mark this device as hot reset capable. */ + dev->can_hot_reset = 1; + +next1: + if (hot_reset_info) + free(hot_reset_info); + dev = dev->next; + } + group = group->next; + } + + /* Count the number of groups we own. */ + count = 0; + group = first_group; + while (group) { + count++; + group = group->next; + } + + /* Allocate hot reset structure. */ + struct vfio_pci_hot_reset *hot_reset; + size = sizeof(struct vfio_pci_hot_reset) + (sizeof(int32_t) * count); + hot_reset = (struct vfio_pci_hot_reset *) calloc(1, size); + hot_reset->argsz = size; + int32_t *fds = &hot_reset->group_fds[0]; + + /* Add group fds. */ + group = first_group; + while (group) { + fds[hot_reset->count++] = group->fd; + group = group->next; + } + + /* Reset all devices. */ + group = first_group; + while (group) { + dev = group->first_device; + while (dev) { + /* Try function-level reset. + I don't really understand the !pm_reset check, but QEMU does it. 
*/ + if (dev->can_reset && (!dev->can_pm_reset || dev->can_flr_reset)) { + if (ioctl(dev->fd, VFIO_DEVICE_RESET)) + vfio_log("VFIO %s: DEVICE_RESET 1 failed (%d)\n", dev->name, errno); + else { + vfio_log("VFIO %s: FLR reset successful\n", dev->name); + goto next2; + } + } + + /* Try hot reset. */ + if (dev->can_hot_reset) { + if (ioctl(dev->fd, VFIO_DEVICE_PCI_HOT_RESET, hot_reset)) + vfio_log("VFIO %s: PCI_HOT_RESET failed (%d)\n", dev->name, errno); + else { + vfio_log("VFIO %s: Hot reset successful\n", dev->name); + goto next2; + } + } + + /* Try PM reset. */ + if (dev->can_reset && dev->can_pm_reset) { + if (ioctl(dev->fd, VFIO_DEVICE_RESET)) + vfio_log("VFIO %s: DEVICE_RESET 2 failed (%d)\n", dev->name, errno); + else { + vfio_log("VFIO %s: PM reset successful\n", dev->name); + goto next2; + } + } + + /* Warn if no reset types were successful. */ + pclog("VFIO %s: Device was not reset!\n", dev->name); + +next2: + dev = dev->next; + } + group = group->next; + } + + /* Clean up. */ + free(hot_reset); + + /* Post-reset all devices. */ + group = first_group; + while (group) { + dev = group->first_device; + while (dev) { + vfio_dev_postreset(dev); + dev = dev->next; + } + group = group->next; + } +} + +void +vfio_init(void) +{ + vfio_log("VFIO: init()\n"); + + /* Stay quiet if VFIO is not configured. */ + char *category = "VFIO", + *devices = config_get_string(category, "devices", NULL); + if (!devices || !strlen(devices)) + return; + + /* Open VFIO container. */ + container_fd = open("/dev/vfio/vfio", O_RDWR); + if (container_fd < 0) { + pclog("VFIO: Container not found (is vfio-pci loaded?)\n"); + return; + } + + /* Check VFIO API version. */ + int api = ioctl(container_fd, VFIO_GET_API_VERSION); + if (api != VFIO_API_VERSION) { + pclog("VFIO: Unknown API version %d (expected %d)\n", api, VFIO_API_VERSION); + goto close_container; + } + + /* Check for Type1 IOMMU support. */ + if (!ioctl(container_fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) { + pclog("VFIO: Type1 IOMMU not supported\n"); + goto close_container; + } + + /* Parse device list. */ + char *strtok_save; + char *token = strtok_r(devices, " ", &strtok_save); + char *p; + char *dev_name; + char *sysfs_device; + char *config_key; + int i; + int domain_id; + int bus_id; + int dev_id; + int func_id; + vfio_device_t *dev = NULL; + vfio_device_t *prev_dev; + vfio_group_t *group; + while (token) { + /* Determine if the device was specified by location or sysfs path. */ + dev_name = NULL; + if (token[0] == '/') { + /* sysfs path: use basename as device name. */ + i = strlen(token); + dev_name = malloc(i + 1); + strncpy(dev_name, path_get_basename(token), i); + + /* Just append iommu_group to the path. */ + sysfs_device = malloc(i + 13); + snprintf(sysfs_device, i + 13, + "%s/iommu_group", token); + } else if (token[0]) { + /* Location: read domain/bus/device/function. */ + i = sscanf(token, "%x:%x:%x.%x", &domain_id, &bus_id, &dev_id, &func_id); + if (i < 3) { + domain_id = 0; + i = sscanf(token, "%x:%x.%x", &bus_id, &dev_id, &func_id); + if (i < 2) { + bus_id = 0; + i = sscanf(token, "%x.%x", &dev_id, &func_id); + if (i < 1) { + pclog("VFIO: Invalid device location: %s\n", token); + goto next; + } else if (i == 1) { + func_id = 0; + } + } else if (i == 2) { + func_id = 0; + } + } else if (i == 3) { + func_id = 0; + } + + /* Use dddd:bb:dd.f as device name. */ + dev_name = malloc(13); + snprintf(dev_name, 13, + "%04x:%02x:%02x.%1x", domain_id, bus_id, dev_id, func_id); + + /* Generate sysfs path. 
*/ + sysfs_device = malloc(46); + snprintf(sysfs_device, 46, + "/sys/bus/pci/devices/%s/iommu_group", dev_name); + } else { + /* Skip blank token. */ + goto next; + } + + pclog("VFIO %s: IOMMU group ", dev_name); + + p = realpath(sysfs_device, NULL); + free(sysfs_device); + if (p) { + /* Parse group ID. */ + if (sscanf(path_get_basename(p), "%d", &i) != 1) { + pclog("path could not be parsed: %s\n", p); + free(p); + goto next; + } + + pclog("%d\n", i); + free(p); + } else { + /* No symlink found, move on to the next device. */ + pclog("not found (%d)\n", errno); + goto next; + } + + /* Get group by ID, and move on to the next device + if the group failed to initialize. (Not viable, etc.) */ + group = vfio_group_get(i, 1); + if (group->fd < 0) { + pclog("VFIO %s: Skipping because group failed to initialize\n", dev_name); + goto next; + } + + /* Allocate device structure. */ + prev_dev = group->current_device; + dev = group->current_device = (vfio_device_t *) calloc(1, sizeof(vfio_device_t)); + + /* Initialize device structure. */ + dev->name = dev_name; + dev_name = NULL; /* don't free it further down */ + dev->irq.type = VFIO_PCI_NUM_IRQS; + + /* Read device-specific settings. */ + i = strlen(token) + 8; + config_key = malloc(i); + snprintf(config_key, i, "%s_rom_fn", token); + dev->rom_fn = config_get_string(category, config_key, NULL); + free(config_key); + + /* Add to linked device list. */ + if (prev_dev) + prev_dev->next = dev; + else + group->first_device = dev; + +next: /* Clean up. */ + if (dev_name) + free(dev_name); + + /* Read next device name. */ + token = strtok_r(NULL, " ", &strtok_save); + } + + /* Stop if no devices were added. */ + if (!dev) + goto close_container; + + /* Set IOMMU type. */ + if (ioctl(container_fd, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU)) { + pclog("VFIO: SET_IOMMU failed (%d)\n", errno); + goto close_container; + } + + /* Map RAM to container for DMA. */ + vfio_map_dma(ram, 0, 1024UL * MIN(mem_size, 1048576)); + if (ram2) + vfio_map_dma(ram2, 1024UL * 1048576, 1024UL * (mem_size - 1048576)); + + /* Initialize epoll. */ + epoll_fd = epoll_create1(0); + if (epoll_fd < 0) { + pclog("VFIO: epoll_create1 failed (%d)\n", errno); + goto close_container; + } + + /* Initialize IRQ thread wake eventfd. */ + irq_thread_wake_fd = eventfd(0, 0); + if (irq_thread_wake_fd <= 0) { + pclog("VFIO: eventfd failed (%d)\n", errno); + goto close_container; + } + struct epoll_event event = { .events = EPOLLIN }; + if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, irq_thread_wake_fd, &event) < 0) { + pclog("VFIO: EPOLL_CTL_ADD failed (%d)\n", errno); + goto close_container; + } + + /* Initialize and start IRQ thread. */ + irq_event = thread_create_event(); + irq_thread_resume = thread_create_event(); + thread_set_event(irq_thread_resume); + irq_thread = thread_create(vfio_irq_thread, NULL); + + /* Start IRQ timer. */ + timer_add(&irq_timer, vfio_irq_timer, NULL, 0); + vfio_irq_timer(NULL); + + /* Initialize all devices. */ + current_group = first_group; + while (current_group) { + prev_dev = NULL; + dev = current_group->first_device; + while (dev) { + current_group->current_device = dev; + if (vfio_dev_init(dev)) { + pclog("VFIO %s: dev_init failed\n", dev->name); + + /* Deallocate this device if initialization failed. 
*/ + if (prev_dev) + prev_dev->next = dev->next; + else + current_group->first_device = dev->next; + dev = dev->next; + free(current_group->current_device); + continue; + } + prev_dev = dev; + dev = dev->next; + } + current_group = current_group->next; + } + + /* Reset all devices. */ + vfio_log("VFIO: Performing initial reset\n"); + closing = 0; + + /* Add device_t to keep track of reset and close. */ + device_add(&vfio_device); + +close_container: + close(container_fd); + container_fd = -1; +} + +void +vfio_close(void *priv) +{ + vfio_log("VFIO: close()\n"); + + /* Reset all devices. */ + closing = 1; + vfio_reset(priv); + + /* Stop IRQ timer. */ + timer_on_auto(&irq_timer, 0.0); + + /* Stop IRQ thread by closing the epoll fd. */ + if (epoll_fd >= 0) { + close(epoll_fd); + epoll_fd = -1; + } + thread_set_event(irq_thread_resume); + + /* Close all groups. */ + while (first_group) { + current_group = first_group; + + /* Close all devices. */ + while (current_group->first_device) { + current_group->current_device = current_group->first_device; + + /* Close device. */ + vfio_dev_close(current_group->current_device); + + /* Deallocate device. */ + current_group->first_device = current_group->current_device->next; + free(current_group->current_device); + } + + /* Close group fd. */ + if (current_group->fd >= 0) + close(current_group->fd); + + /* Deallocate group. */ + first_group = current_group->next; + free(current_group); + } + + /* Close container. */ + if (container_fd >= 0) { + close(container_fd); + container_fd = -1; + } +} + +static void +vfio_speed_changed(void *priv) +{ + /* Set operation timings. */ + timing_readb = (int) (pci_timing * timing_default.read_b); + timing_readw = (int) (pci_timing * timing_default.read_w); + timing_readl = (int) (pci_timing * timing_default.read_l); + timing_writeb = (int) (pci_timing * timing_default.write_b); + timing_writew = (int) (pci_timing * timing_default.write_w); + timing_writel = (int) (pci_timing * timing_default.write_l); +} + +static const device_t vfio_device = { + .name = "VFIO PCI Passthrough", + .internal_name = "vfio", + .flags = DEVICE_PCI, + .local = 0, + .init = NULL, + .close = vfio_close, + .reset = vfio_reset, + .available = NULL, + .speed_changed = vfio_speed_changed, + .force_redraw = NULL, + .config = NULL +}; diff --git a/src/include/86box/vfio.h b/src/include/86box/vfio.h new file mode 100644 index 000000000..2166070a0 --- /dev/null +++ b/src/include/86box/vfio.h @@ -0,0 +1,20 @@ +/* + * 86Box A hypervisor and IBM PC system emulator that specializes in + * running old operating systems and software designed for IBM + * PC systems and compatibles from 1981 through fairly recent + * system designs based on the PCI bus. + * + * This file is part of the 86Box distribution. + * + * Definitions for Virtual Function I/O PCI passthrough. + * + * Authors: RichardG, + * + * Copyright 2021-2025 RichardG. + */ +#if !defined(EMU_VFIO_H) && defined(USE_VFIO) +# define EMU_VFIO_H + +extern void vfio_init(void); + +#endif
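
The sketches below are supplementary notes rather than part of the patch; they show, in minimal standalone C, the Linux VFIO interfaces that the new vfio.c is built on. First, the container/group/device handshake that vfio_init(), vfio_group_get() and vfio_dev_init() perform across /dev/vfio. The group number (42) and the device address string ("0000:0b:00.0") are placeholders, and error paths are collapsed for brevity.

    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <linux/vfio.h>

    int
    open_vfio_device(void)
    {
        /* Open the container and verify API version and Type1 IOMMU support. */
        int container = open("/dev/vfio/vfio", O_RDWR);
        if (container < 0)
            return -1;
        if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION)
            return -1;
        if (!ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU))
            return -1;

        /* Open the IOMMU group the device belongs to (group number is a
           placeholder) and check that the group is viable, i.e. all of its
           devices are bound to vfio-pci. */
        int group = open("/dev/vfio/42", O_RDWR);
        struct vfio_group_status status = { .argsz = sizeof(status) };
        if ((group < 0) || ioctl(group, VFIO_GROUP_GET_STATUS, &status) ||
            !(status.flags & VFIO_GROUP_FLAGS_VIABLE))
            return -1;

        /* Attach the group to the container, then select the IOMMU model;
           VFIO_SET_IOMMU only succeeds once at least one group is attached. */
        if (ioctl(group, VFIO_GROUP_SET_CONTAINER, &container) ||
            ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU))
            return -1;

        /* Finally obtain a file descriptor for the device itself. */
        return ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:0b:00.0");
    }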
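
What vfio_region_init() does for each BAR, reduced to its core: query the region with VFIO_DEVICE_GET_REGION_INFO and, where possible, mmap() it so that guest accesses become plain loads and stores; otherwise fall back to pread()/pwrite() at the reported offset, which is what the _fd handlers do. The patch simply attempts the mmap and falls back on failure; checking VFIO_REGION_INFO_FLAG_MMAP first, as in this sketch, is one way to avoid the failed attempt. device_fd is assumed to come from VFIO_GROUP_GET_DEVICE_FD.

    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <sys/mman.h>
    #include <linux/vfio.h>

    static void *
    map_bar0(int device_fd, uint64_t *size_out)
    {
        struct vfio_region_info reg = {
            .argsz = sizeof(reg),
            .index = VFIO_PCI_BAR0_REGION_INDEX
        };
        if (ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, &reg) || !reg.size)
            return NULL;
        *size_out = reg.size;

        /* Fast path: map the BAR directly into our address space. */
        if (reg.flags & VFIO_REGION_INFO_FLAG_MMAP) {
            void *p = mmap(NULL, reg.size, PROT_READ | PROT_WRITE,
                           MAP_SHARED, device_fd, reg.offset);
            if (p != MAP_FAILED)
                return p;
        }

        /* Slow path: the caller must use pread()/pwrite() on device_fd at
           reg.offset plus the offset within the BAR. */
        return NULL;
    }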
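
The sanity checks applied to the expansion ROM follow the standard PCI option ROM layout: a 55 AA signature at offset 0, the image length in 512-byte blocks at offset 2, a pointer to the PCI data structure at offset 18h, and a checksum byte chosen so that all bytes of the image sum to zero modulo 256. A minimal validator over an in-memory copy, covering only the signature, length and checksum rules:

    #include <stdint.h>

    /* Returns 1 if the option ROM image at 'rom' (with at least 'avail'
       bytes available) passes the basic signature/length/checksum checks. */
    static int
    option_rom_ok(const uint8_t *rom, uint32_t avail)
    {
        if ((avail < 3) || (rom[0] != 0x55) || (rom[1] != 0xaa))
            return 0;

        uint32_t len = (uint32_t) rom[2] << 9;  /* 512-byte blocks */
        if ((len == 0) || (len > avail))
            return 0;

        uint8_t sum = 0;
        for (uint32_t i = 0; i < len; i++)
            sum += rom[i];                      /* includes the checksum byte */
        return sum == 0;
    }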
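
The BAR write path masks the new emulated base with ~(ceilpow2(size) - 1) because a PCI BAR decodes a naturally aligned, power-of-two-sized window: the low address bits are hardwired to zero, which is also how system software sizes a BAR (write all ones, read back, invert, add one). A small worked example, assuming a hypothetical 16 KB memory BAR:

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        uint32_t size = 0x4000;                     /* 16 KB memory BAR */
        uint32_t mask = ~(size - 1);                /* size is already a power of two */

        uint32_t written = 0x12345678;              /* value the guest writes */
        uint32_t base    = written & mask & ~0x07u; /* low flag bits stay read-only */
        printf("effective base = %08X\n", base);    /* 12344000 */

        /* Sizing probe: write all ones, read back, invert, add one. */
        uint32_t readback = 0xffffffff & mask;      /* what the device returns */
        printf("probed size   = %u\n", ~readback + 1);
        return 0;
    }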
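
How vfio_irq_enable() routes device interrupts into the emulator: each vector gets an eventfd, and the eventfds are handed to the kernel through VFIO_DEVICE_SET_IRQS with the TRIGGER action. The same ioctl with VFIO_IRQ_SET_DATA_NONE and the MASK/UNMASK actions is what the INTx path uses to hold the level-triggered line while the guest services it. A single-vector sketch; device_fd is assumed valid.

    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/eventfd.h>
    #include <sys/ioctl.h>
    #include <linux/vfio.h>

    static int
    enable_intx_eventfd(int device_fd)
    {
        int efd = eventfd(0, 0);
        if (efd < 0)
            return -1;

        /* The eventfd list lives in the flexible data[] member, so the
           structure is allocated with room for one int32_t. */
        size_t sz = sizeof(struct vfio_irq_set) + sizeof(int32_t);
        struct vfio_irq_set *set = calloc(1, sz);
        set->argsz = sz;
        set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
        set->index = VFIO_PCI_INTX_IRQ_INDEX;
        set->start = 0;
        set->count = 1;
        memcpy(set->data, &efd, sizeof(int32_t));

        int ret = ioctl(device_fd, VFIO_DEVICE_SET_IRQS, set);
        free(set);
        return ret ? -1 : efd;   /* reading 8 bytes from efd consumes IRQs */
    }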
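
The IRQ thread is a standard eventfd-plus-epoll consumer: every vector's eventfd (plus one wake eventfd used to interrupt the wait) is registered with EPOLLIN, epoll_wait() blocks until the kernel signals an interrupt, and reading eight bytes from the ready eventfd resets its counter. A stripped-down version of that loop for a single eventfd, with the vector bookkeeping and timer handshake left out:

    #include <stdint.h>
    #include <sys/epoll.h>
    #include <unistd.h>

    static void
    irq_wait_loop(int efd)
    {
        int ep = epoll_create1(0);
        struct epoll_event ev = { .events = EPOLLIN, .data.ptr = NULL };
        epoll_ctl(ep, EPOLL_CTL_ADD, efd, &ev);

        struct epoll_event ready[16];
        for (;;) {
            int n = epoll_wait(ep, ready, 16, -1);
            if (n < 0)
                break;
            for (int i = 0; i < n; i++) {
                if (!(ready[i].events & EPOLLIN))
                    continue;
                uint64_t count;
                (void) !read(efd, &count, sizeof(count)); /* reset the counter */
                /* ...service the interrupt for ready[i].data.ptr here... */
            }
        }
        close(ep);
    }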
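
The MSI-X handling relies on the fixed layout of an MSI-X table entry: 16 bytes per vector (hence the precomputed msix_offset = vector << 4), with the message address at +0, the upper address at +4, the data at +8 and the vector control word at +0Ch, whose bit 0 is the per-vector mask. Combined with the function mask (bit 14 of Message Control), that gives the masking test applied before a vector is delivered. A small sketch of that layout and test:

    #include <stdint.h>

    /* Layout of one MSI-X table entry. */
    struct msix_entry {
        uint32_t addr_lo;   /* +0x0: Message Address */
        uint32_t addr_hi;   /* +0x4: Message Upper Address */
        uint32_t data;      /* +0x8: Message Data */
        uint32_t control;   /* +0xC: Vector Control, bit 0 = per-vector mask */
    };

    static int
    msix_vector_masked(const uint8_t *table, uint16_t vector, uint16_t msg_ctl)
    {
        const struct msix_entry *e =
            (const struct msix_entry *) (table + (vector << 4));
        return (msg_ctl & 0x4000) ||      /* function mask */
               (e->control & 0x00000001); /* per-vector mask */
    }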
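
vfio_map_dma() is a thin wrapper around VFIO_IOMMU_MAP_DMA, which pins the emulator's RAM buffer and programs the IOMMU so that device-initiated DMA to a guest physical address lands in that buffer; the retry after VFIO_IOMMU_UNMAP_DMA on EBUSY mirrors QEMU's behaviour. Reduced to the essential call, with guest RAM mapped 1:1 starting at IOVA 0:

    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/vfio.h>

    static int
    map_guest_ram(int container_fd, void *ram, uint64_t bytes)
    {
        struct vfio_iommu_type1_dma_map map = {
            .argsz = sizeof(map),
            .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
            .vaddr = (uint64_t) (uintptr_t) ram, /* process virtual address */
            .iova  = 0,                          /* guest physical address 0 */
            .size  = bytes
        };
        return ioctl(container_fd, VFIO_IOMMU_MAP_DMA, &map);
    }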