Files
qemu/system/physmem.c

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

4448 lines
136 KiB
C
Raw Permalink Normal View History

/*
* RAM allocation and memory access
*
* Copyright (c) 2003 Fabrice Bellard
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
#include "qemu/osdep.h"
#include "exec/page-vary.h"
2016-03-14 09:01:28 +01:00
#include "qapi/error.h"
#include "qemu/cutils.h"
#include "qemu/cacheflush.h"
#include "qemu/hbitmap.h"
#include "qemu/madvise.h"
#include "qemu/lockable.h"
#ifdef CONFIG_TCG
#include "accel/tcg/cpu-ops.h"
#include "accel/tcg/iommu.h"
#endif /* CONFIG_TCG */
#include "exec/cputlb.h"
#include "exec/page-protection.h"
#include "exec/target_page.h"
accel/tcg: Include missing 'exec/translation-block.h' header TB compile flags, tb_page_addr_t type, tb_cflags() and few other methods are defined in "exec/translation-block.h". All these files don't include "exec/translation-block.h" but include "exec/exec-all.h" which include it. Explicitly include "exec/translation-block.h" to be able to remove it from "exec/exec-all.h" later when it won't be necessary. Otherwise we'd get errors such: accel/tcg/internal-target.h:59:20: error: a parameter list without types is only allowed in a function definition 59 | void tb_lock_page0(tb_page_addr_t); | ^ accel/tcg/tb-hash.h:64:23: error: unknown type name 'tb_page_addr_t' 64 | uint32_t tb_hash_func(tb_page_addr_t phys_pc, vaddr pc, | ^ accel/tcg/tcg-accel-ops.c:62:36: error: use of undeclared identifier 'CF_CLUSTER_SHIFT' 62 | cflags = cpu->cluster_index << CF_CLUSTER_SHIFT; | ^ accel/tcg/watchpoint.c:102:47: error: use of undeclared identifier 'CF_NOIRQ' 102 | cpu->cflags_next_tb = 1 | CF_NOIRQ | curr_cflags(cpu); | ^ target/i386/helper.c:536:28: error: use of undeclared identifier 'CF_PCREL' 536 | if (tcg_cflags_has(cs, CF_PCREL)) { | ^ target/rx/cpu.c:51:21: error: incomplete definition of type 'struct TranslationBlock' 51 | cpu->env.pc = tb->pc; | ~~^ system/physmem.c:2977:9: error: call to undeclared function 'tb_invalidate_phys_range'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration] 2977 | tb_invalidate_phys_range(addr, addr + length - 1); | ^ plugins/api.c:96:12: error: call to undeclared function 'tb_cflags'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration] 96 | return tb_cflags(tcg_ctx->gen_tb) & CF_MEMI_ONLY; | ^ Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org> Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org> Reviewed-by: Richard Henderson <richard.henderson@linaro.org> Message-Id: <20241114011310.3615-5-philmd@linaro.org>
2024-11-13 22:46:13 +01:00
#include "exec/translation-block.h"
#include "hw/core/qdev.h"
#include "hw/core/qdev-properties.h"
#include "hw/core/boards.h"
#include "system/xen.h"
#include "system/kvm.h"
#include "system/tcg.h"
#include "system/qtest.h"
#include "system/physmem.h"
#include "system/ramblock.h"
#include "qemu/timer.h"
#include "qemu/config-file.h"
#include "qemu/error-report.h"
#include "qemu/qemu-print.h"
#include "qemu/log.h"
#include "qemu/memalign.h"
#include "qemu/memfd.h"
#include "system/memory.h"
#include "system/memory_cached.h"
#include "system/ioport.h"
#include "system/dma.h"
#include "system/hostmem.h"
#include "system/hw_accel.h"
#include "system/xen-mapcache.h"
#include "trace.h"
#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
#include <linux/falloc.h>
#endif
#include "qemu/rcu_queue.h"
#include "qemu/main-loop.h"
#include "system/replay.h"
#include "system/ramblock.h"
#include "qemu/pmem.h"
#include "qapi/qapi-types-migration.h"
#include "migration/blocker.h"
#include "migration/cpr.h"
#include "migration/options.h"
#include "migration/vmstate.h"
#include "qemu/range.h"
#ifndef _WIN32
#include "qemu/mmap-alloc.h"
#endif
#include "monitor/monitor.h"
#ifdef CONFIG_LIBDAXCTL
#include <daxctl/libdaxctl.h>
#endif
#include "memory-internal.h"
/* ram_list is read under rcu_read_lock()/rcu_read_unlock(). Writes
* are protected by the ramlist lock.
*/
RAMList ram_list = { .blocks = QLIST_HEAD_INITIALIZER(ram_list.blocks) };
/* File-local root regions. NOTE(review): presumably populated by
* memory_map_init() (declared below) -- confirm against its definition.
*/
static MemoryRegion *system_memory;
static MemoryRegion *system_io;
/* Globally visible address spaces. NOTE(review): going by the names these
* are the I/O-port and memory address spaces -- confirm.
*/
AddressSpace address_space_io;
AddressSpace address_space_memory;
/* Region reported by the translation helpers in this file when a lookup
* cannot be satisfied (e.g. an IOMMU refuses the requested access).
*/
static MemoryRegion io_mem_unassigned;
/* One slot of the physical-page lookup tree. Depending on @skip it is
* either a leaf (naming a section) or an interior link (naming the next
* node). The two bit-field widths add up to 32.
*/
typedef struct PhysPageEntry PhysPageEntry;
struct PhysPageEntry {
/* How many bits skip to next level (in units of L2_SIZE). 0 for a leaf. */
uint32_t skip : 6;
/* index into phys_sections (!skip) or phys_map_nodes (skip) */
uint32_t ptr : 26;
};
/* All-ones in the 26 bits of PhysPageEntry.ptr: marks an interior entry
* that has no node allocated behind it yet.
*/
#define PHYS_MAP_NODE_NIL (((uint32_t)~0) >> 6)
/* Size of the L2 (and L3, etc) page tables. */
#define ADDR_SPACE_BITS 64
#define P_L2_BITS 9
#define P_L2_SIZE (1 << P_L2_BITS)
/* Number of P_L2_BITS-wide levels needed to cover the
* (ADDR_SPACE_BITS - TARGET_PAGE_BITS) page-index bits, rounded up.
*/
#define P_L2_LEVELS (((ADDR_SPACE_BITS - TARGET_PAGE_BITS - 1) / P_L2_BITS) + 1)
/* One tree node: P_L2_SIZE entries, selected by P_L2_BITS index bits. */
typedef PhysPageEntry Node[P_L2_SIZE];
/* Backing storage for one lookup tree: a growable array of nodes (see
* phys_map_node_reserve()) plus an array of sections that leaf entries
* index into.
*/
typedef struct PhysPageMap {
struct rcu_head rcu;
/* NOTE(review): presumably sections in use / allocated; the code that
* maintains them is not visible in this part of the file -- confirm.
*/
unsigned sections_nb;
unsigned sections_nb_alloc;
/* Nodes handed out so far, and current capacity of @nodes. */
unsigned nodes_nb;
unsigned nodes_nb_alloc;
Node *nodes;
MemoryRegionSection *sections;
} PhysPageMap;
/* Lookup state for address translation: the root entry of the tree, the
* storage backing it, and a one-entry cache of the last section found.
*/
struct AddressSpaceDispatch {
/* Last section returned by a lookup; accessed with qatomic_read() and
* qatomic_set() in address_space_lookup_region().
*/
MemoryRegionSection *mru_section;
/* This is a multi-level map on the physical address space.
* The bottom level has pointers to MemoryRegionSections.
*/
PhysPageEntry phys_map;
PhysPageMap map;
};
/* Offset of @addr within its target page. */
#define SUBPAGE_IDX(addr) ((addr) & ~TARGET_PAGE_MASK)
/* Per-page table for a region flagged as a subpage: sub_section[] is
* indexed by SUBPAGE_IDX(addr) and yields an index into the dispatch's
* sections array (see address_space_lookup_region()). The structure is
* recovered from its embedded @iomem region with container_of().
*/
typedef struct subpage_t {
MemoryRegion iomem;
FlatView *fv;
hwaddr base;
uint16_t sub_section[];
} subpage_t;
/* Index of the section returned when a lookup finds no mapping. */
#define PHYS_SECTION_UNASSIGNED 0
/* Forward declarations of file-local functions used before their
* definitions.
*/
static void io_mem_init(void);
static void memory_map_init(void);
static void tcg_log_global_after_sync(MemoryListener *listener);
static void tcg_commit(MemoryListener *listener);
static bool ram_is_cpr_compatible(RAMBlock *rb);
/**
* CPUAddressSpace: all the information a CPU needs about an AddressSpace
* @cpu: the CPU whose AddressSpace this is
* @as: the AddressSpace itself
* @tcg_as_listener: listener for tracking changes to the AddressSpace;
*                   cpu_address_space_init() only fills it in and
*                   registers it when tcg_enabled() is true
*/
typedef struct CPUAddressSpace {
CPUState *cpu;
AddressSpace *as;
MemoryListener tcg_as_listener;
} CPUAddressSpace;
/* Snapshot of a dirty bitmap for a range of RAM addresses.
* NOTE(review): the code that fills and reads this is not visible here;
* presumably @start/@end bound the covered range and @dirty holds one bit
* per page -- confirm against the users of this struct.
*/
struct DirtyBitmapSnapshot {
ram_addr_t start;
ram_addr_t end;
/* Flexible array member: storage is allocated together with the struct. */
unsigned long dirty[];
};
/*
 * Ensure @map can hold @nodes more nodes, growing the node array when the
 * current capacity is insufficient.  The capacity picked is remembered in
 * a function-local static so that later growth requests start at least
 * that large.
 */
static void phys_map_node_reserve(PhysPageMap *map, unsigned nodes)
{
    static unsigned alloc_hint = 16;
    unsigned wanted = map->nodes_nb + nodes;

    if (wanted <= map->nodes_nb_alloc) {
        /* Enough room already. */
        return;
    }

    map->nodes_nb_alloc = MAX(alloc_hint, wanted);
    map->nodes = g_renew(Node, map->nodes, map->nodes_nb_alloc);
    alloc_hint = map->nodes_nb_alloc;
}
/*
 * Take the next unused node from @map and initialise all of its entries.
 *
 * For a leaf node every entry points at PHYS_SECTION_UNASSIGNED; for an
 * interior node every entry is an unallocated link (skip 1, ptr NIL).
 * Space must have been reserved beforehand with phys_map_node_reserve().
 *
 * Returns the index of the new node within map->nodes.
 */
static uint32_t phys_map_node_alloc(PhysPageMap *map, bool leaf)
{
    PhysPageEntry fill;
    PhysPageEntry *entries;
    uint32_t node_idx;
    unsigned n;

    node_idx = map->nodes_nb++;
    entries = map->nodes[node_idx];
    assert(node_idx != PHYS_MAP_NODE_NIL);
    assert(node_idx != map->nodes_nb_alloc);

    if (leaf) {
        fill.skip = 0;
        fill.ptr = PHYS_SECTION_UNASSIGNED;
    } else {
        fill.skip = 1;
        fill.ptr = PHYS_MAP_NODE_NIL;
    }

    for (n = 0; n < P_L2_SIZE; ++n) {
        memcpy(&entries[n], &fill, sizeof(fill));
    }
    return node_idx;
}
/*
 * Recursive worker for phys_page_set().
 *
 * Starting at entry @lp (which sits one level above @level), point the
 * page range beginning at *@index and spanning *@nb pages at section
 * @leaf.  Whole aligned sub-ranges of this level's step size are written
 * directly as leaves; anything smaller descends to @level - 1.  *@index
 * and *@nb are advanced as pages are consumed, so the caller's loop
 * resumes where the callee stopped.
 */
static void phys_page_set_level(PhysPageMap *map, PhysPageEntry *lp,
                                hwaddr *index, uint64_t *nb, uint16_t leaf,
                                int level)
{
    const int shift = level * P_L2_BITS;
    const hwaddr span = (hwaddr)1 << shift;
    PhysPageEntry *node;
    PhysPageEntry *entry;

    /* Materialise the child node on first use. */
    if (lp->skip && lp->ptr == PHYS_MAP_NODE_NIL) {
        lp->ptr = phys_map_node_alloc(map, level == 0);
    }
    node = map->nodes[lp->ptr];

    for (entry = &node[(*index >> shift) & (P_L2_SIZE - 1)];
         *nb && entry < &node[P_L2_SIZE];
         ++entry) {
        bool aligned = (*index & (span - 1)) == 0;

        if (aligned && *nb >= span) {
            /* The whole slot belongs to @leaf. */
            entry->skip = 0;
            entry->ptr = leaf;
            *index += span;
            *nb -= span;
        } else {
            phys_page_set_level(map, entry, index, nb, leaf, level - 1);
        }
    }
}
/*
 * Map @nb pages starting at page number @index to section number @leaf
 * in the lookup tree of @d.
 */
static void phys_page_set(AddressSpaceDispatch *d,
                          hwaddr index, uint64_t nb,
                          uint16_t leaf)
{
    PhysPageMap *map = &d->map;

    /* Wildly overreserve - it doesn't matter much. */
    phys_map_node_reserve(map, 3 * P_L2_LEVELS);

    /* Walk down from the root, which sits above the topmost level. */
    phys_page_set_level(map, &d->phys_map, &index, &nb, leaf,
                        P_L2_LEVELS - 1);
}
/*
 * Compact a non-leaf entry.  Children are compacted first; then, if the
 * node behind @lp has exactly one populated slot, @lp is rewritten to
 * point straight at that slot's target so lookups can skip the
 * intermediate node.
 */
static void phys_page_compact(PhysPageEntry *lp, Node *nodes)
{
    PhysPageEntry *children;
    PhysPageEntry *only;
    unsigned last_used = P_L2_SIZE;
    int used = 0;
    int slot;

    if (lp->ptr == PHYS_MAP_NODE_NIL) {
        return;
    }

    children = nodes[lp->ptr];
    for (slot = 0; slot < P_L2_SIZE; slot++) {
        if (children[slot].ptr != PHYS_MAP_NODE_NIL) {
            last_used = slot;
            used++;
            if (children[slot].skip) {
                phys_page_compact(&children[slot], nodes);
            }
        }
    }

    /* We can only compress if there's only one child. */
    if (used != 1) {
        return;
    }

    assert(last_used < P_L2_SIZE);
    only = &children[last_used];

    /* Don't compress if it won't fit in the # of bits we have. */
    if (P_L2_LEVELS >= (1 << 6) &&
        lp->skip + only->skip >= (1 << 6)) {
        return;
    }

    lp->ptr = only->ptr;
    if (only->skip) {
        lp->skip += only->skip;
    } else {
        /*
         * Our only child is a leaf, so become a leaf ourselves.  By
         * design such a node should have been created as a leaf in the
         * first place, so this branch is not expected to run; it is kept
         * because it is trivial and guards against that rule changing.
         */
        lp->skip = 0;
    }
}
/*
 * Compact the lookup tree of @d.  Nothing is done when the root entry is
 * itself a leaf (skip == 0).
 */
void address_space_dispatch_compact(AddressSpaceDispatch *d)
{
    if (!d->phys_map.skip) {
        return;
    }
    phys_page_compact(&d->phys_map, d->map.nodes);
}
/*
 * Report whether @addr lies inside @section's range of the address space.
 */
static inline bool section_covers_addr(const MemoryRegionSection *section,
                                       hwaddr addr)
{
    /*
     * Memory topology clips a memory region to [0, 2^64); a non-zero high
     * half of the size therefore means the section spans the entire
     * address space.
     */
    if (int128_gethi(section->size)) {
        return true;
    }

    return range_covers_byte(section->offset_within_address_space,
                             int128_getlo(section->size), addr);
}
/*
 * Walk the lookup tree of @d for @addr and return the section that covers
 * it, or the PHYS_SECTION_UNASSIGNED section when the walk reaches an
 * unallocated link or the section found does not actually contain @addr.
 */
static MemoryRegionSection *phys_page_find(AddressSpaceDispatch *d, hwaddr addr)
{
    MemoryRegionSection *sections = d->map.sections;
    Node *nodes = d->map.nodes;
    PhysPageEntry cur = d->phys_map;
    hwaddr page_index = addr >> TARGET_PAGE_BITS;
    int level = P_L2_LEVELS;

    /* Descend while the current entry is an interior link. */
    while (cur.skip) {
        PhysPageEntry *node;

        /* A compacted link may jump several levels at once. */
        level -= cur.skip;
        if (level < 0) {
            break;
        }
        if (cur.ptr == PHYS_MAP_NODE_NIL) {
            return &sections[PHYS_SECTION_UNASSIGNED];
        }
        node = nodes[cur.ptr];
        cur = node[(page_index >> (level * P_L2_BITS)) & (P_L2_SIZE - 1)];
    }

    if (!section_covers_addr(&sections[cur.ptr], addr)) {
        return &sections[PHYS_SECTION_UNASSIGNED];
    }
    return &sections[cur.ptr];
}
/*
 * Find the section covering @addr in @d, consulting the one-entry
 * mru_section cache before falling back to a full tree walk.  When
 * @resolve_subpage is set and the section's region is a subpage, the
 * per-offset section recorded in the subpage is returned instead.
 *
 * Called from RCU critical section.
 */
static MemoryRegionSection *address_space_lookup_region(AddressSpaceDispatch *d,
                                                        hwaddr addr,
                                                        bool resolve_subpage)
{
    MemoryRegionSection *hit = qatomic_read(&d->mru_section);
    bool cache_usable = hit &&
                        hit != &d->map.sections[PHYS_SECTION_UNASSIGNED] &&
                        section_covers_addr(hit, addr);

    if (!cache_usable) {
        hit = phys_page_find(d, addr);
        qatomic_set(&d->mru_section, hit);
    }

    if (resolve_subpage && hit->mr->subpage) {
        subpage_t *sp = container_of(hit->mr, subpage_t, iomem);

        hit = &d->map.sections[sp->sub_section[SUBPAGE_IDX(addr)]];
    }
    return hit;
}
/*
 * Look up the section covering @addr in dispatch @d and express @addr as
 * an offset within that section's MemoryRegion.
 *
 * @d: dispatch to search
 * @addr: address within the address space
 * @xlat: out: offset of @addr within the returned section's MemoryRegion
 * @plen: in/out: access length; clamped to what is left of the section
 *        when the region is RAM, left untouched otherwise
 * @resolve_subpage: forwarded to address_space_lookup_region()
 *
 * Returns the section found (never NULL; unmapped addresses yield the
 * unassigned section).
 *
 * Called from RCU critical section.
 */
static MemoryRegionSection *
address_space_translate_internal(AddressSpaceDispatch *d, hwaddr addr, hwaddr *xlat,
                                 hwaddr *plen, bool resolve_subpage)
{
    MemoryRegionSection *section;
    MemoryRegion *mr;
    Int128 diff;

    section = address_space_lookup_region(d, addr, resolve_subpage);
    /* Compute offset within MemoryRegionSection */
    addr -= section->offset_within_address_space;

    /* Compute offset within MemoryRegion */
    *xlat = addr + section->offset_within_region;

    mr = section->mr;

    /* MMIO registers can be expected to perform full-width accesses based only
     * on their address, without considering adjacent registers that could
     * decode to completely different MemoryRegions.  When such registers
     * exist (e.g. I/O ports 0xcf8 and 0xcf9 on most PC chipsets), MMIO
     * regions overlap wildly.  For this reason we cannot clamp the accesses
     * here.
     *
     * If the length is small (as is the case for address_space_ldl/stl),
     * everything works fine.  If the incoming length is large, however,
     * the caller really has to do the clamping through memory_access_size.
     */
    if (memory_region_is_ram(mr)) {
        diff = int128_sub(section->size, int128_make64(addr));
        *plen = int128_get64(int128_min(diff, int128_make64(*plen)));
    }
    return section;
}
/**
 * address_space_translate_iommu - translate an address through an IOMMU
 * memory region and then through the target address space.
 *
 * @iommu_mr: the IOMMU memory region that we start the translation from
 * @xlat: in/out.  On entry, the address to hand to the IOMMU; on
 *        successful return, the translated offset within the destination
 *        memory region.  It cannot be %NULL.
 * @plen_out: in/out valid read/write length of the translated address.
 *            It cannot be %NULL.
 * @page_mask_out: receives the page mask accumulated over every IOMMU
 *                 crossed (only written on success).  It can be %NULL if
 *                 the caller does not care about it.
 * @is_write: whether the translation operation is for write
 * @is_mmio: whether this can be MMIO, set true if it can
 * @target_as: receives the address space targeted by the last IOMMU
 * @attrs: transaction attributes
 *
 * The walk repeats for as long as the translated address lands in another
 * IOMMU region.  If an IOMMU does not grant the requested access, a
 * section whose only set field is .mr = &io_mem_unassigned is returned.
 *
 * This function is called from RCU critical section.  It is the common
 * part of flatview_do_translate and address_space_translate_cached.
 */
static MemoryRegionSection address_space_translate_iommu(IOMMUMemoryRegion *iommu_mr,
                                                         hwaddr *xlat,
                                                         hwaddr *plen_out,
                                                         hwaddr *page_mask_out,
                                                         bool is_write,
                                                         bool is_mmio,
                                                         AddressSpace **target_as,
                                                         MemTxAttrs attrs)
{
    MemoryRegionSection *section;
    hwaddr page_mask = (hwaddr)-1;

    do {
        IOMMUMemoryRegionClass *imrc =
            memory_region_get_iommu_class_nocheck(iommu_mr);
        hwaddr addr = *xlat;
        int iommu_idx = 0;
        IOMMUTLBEntry iotlb;

        /* attrs_to_index is optional; index 0 is used without it. */
        if (imrc->attrs_to_index) {
            iommu_idx = imrc->attrs_to_index(iommu_mr, attrs);
        }

        iotlb = imrc->translate(iommu_mr, addr,
                                is_write ? IOMMU_WO : IOMMU_RO, iommu_idx);

        if (!(iotlb.perm & (1 << is_write))) {
            /* Requested access is not permitted by this IOMMU. */
            return (MemoryRegionSection) { .mr = &io_mem_unassigned };
        }

        /* Page part from the IOMMU, in-page offset from the input. */
        addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
                | (addr & iotlb.addr_mask));
        page_mask &= iotlb.addr_mask;
        /* Never let the access run past the end of the translated page. */
        *plen_out = MIN(*plen_out, (addr | iotlb.addr_mask) - addr + 1);
        *target_as = iotlb.target_as;

        section = address_space_translate_internal(
                address_space_to_dispatch(iotlb.target_as), addr, xlat,
                plen_out, is_mmio);

        iommu_mr = memory_region_get_iommu(section->mr);
    } while (unlikely(iommu_mr));

    if (page_mask_out) {
        *page_mask_out = page_mask;
    }
    return *section;
}
/**
 * flatview_do_translate - translate an address in FlatView
 *
 * @fv: the flat view that we want to translate on
 * @addr: the address to be translated in above address space
 * @xlat: receives the translated address offset within the memory
 *        region.  It cannot be %NULL.
 * @plen_out: in/out valid read/write length of the translated address.
 *            It can be %NULL when the caller does not care about it; an
 *            internal scratch length is used in that case.
 * @page_mask_out: page mask for the translated address.  This should
 *                 only be meaningful for IOMMU translated addresses,
 *                 since there may be huge pages that this bit would
 *                 tell.  It can be %NULL if we don't care about it.
 * @is_write: whether the translation operation is for write
 * @is_mmio: whether this can be MMIO, set true if it can
 * @target_as: the address space targeted by the IOMMU
 * @attrs: memory transaction attributes
 *
 * This function is called from RCU critical section
 */
static MemoryRegionSection flatview_do_translate(FlatView *fv,
                                                 hwaddr addr,
                                                 hwaddr *xlat,
                                                 hwaddr *plen_out,
                                                 hwaddr *page_mask_out,
                                                 bool is_write,
                                                 bool is_mmio,
                                                 AddressSpace **target_as,
                                                 MemTxAttrs attrs)
{
    hwaddr scratch_len = (hwaddr)(-1);
    hwaddr *len = plen_out ? plen_out : &scratch_len;
    MemoryRegionSection *section;
    IOMMUMemoryRegion *iommu_mr;

    section = address_space_translate_internal(flatview_to_dispatch(fv),
                                               addr, xlat, len, is_mmio);
    iommu_mr = memory_region_get_iommu(section->mr);

    if (unlikely(iommu_mr)) {
        /* Landed in an IOMMU: continue the translation through it. */
        return address_space_translate_iommu(iommu_mr, xlat,
                                             len, page_mask_out,
                                             is_write, is_mmio,
                                             target_as, attrs);
    }

    if (page_mask_out) {
        /* Not behind an IOMMU, use default page size. */
        *page_mask_out = ~TARGET_PAGE_MASK;
    }
    return *section;
}
/*
 * Translate @addr in @as and describe the result as an IOMMUTLBEntry
 * covering the page that contains it.  An all-zero entry is returned when
 * the translation ends in the unassigned region.
 *
 * Called from RCU critical section.
 */
IOMMUTLBEntry address_space_get_iotlb_entry(AddressSpace *as, hwaddr addr,
                                            bool is_write, MemTxAttrs attrs)
{
    hwaddr xlat, page_mask;
    MemoryRegionSection section;

    /*
     * This can never be MMIO, and we don't really care about plen,
     * but page mask.
     */
    section = flatview_do_translate(address_space_to_flatview(as), addr, &xlat,
                                    NULL, &page_mask, is_write, false, &as,
                                    attrs);

    if (section.mr == &io_mem_unassigned) {
        /* Illegal translation */
        return (IOMMUTLBEntry) {0};
    }

    /* Convert memory region offset into address space offset */
    xlat += section.offset_within_address_space -
            section.offset_within_region;

    return (IOMMUTLBEntry) {
        .target_as = as,
        .iova = addr & ~page_mask,
        .translated_addr = xlat & ~page_mask,
        .addr_mask = page_mask,
        /* IOTLBs are for DMAs, and DMA only allows on RAMs. */
        .perm = IOMMU_RW,
    };
}
/*
 * Translate @addr in @fv and return the MemoryRegion it resolves to.
 * *@xlat receives the offset within that region and *@plen is clamped by
 * the translation.  Under Xen, directly accessible regions are further
 * limited so that the access does not cross a target page boundary.
 *
 * Called from RCU critical section.
 */
MemoryRegion *flatview_translate(FlatView *fv, hwaddr addr, hwaddr *xlat,
                                 hwaddr *plen, bool is_write,
                                 MemTxAttrs attrs)
{
    AddressSpace *as = NULL;
    MemoryRegionSection section;
    MemoryRegion *mr;

    /* This can be MMIO, so setup MMIO bit. */
    section = flatview_do_translate(fv, addr, xlat, plen, NULL,
                                    is_write, true, &as, attrs);
    mr = section.mr;

    if (xen_enabled() && memory_access_is_direct(mr, is_write, attrs)) {
        /* Bytes from @addr up to the end of its page. */
        hwaddr to_page_end =
            ((addr & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE) - addr;

        *plen = MIN(to_page_end, *plen);
    }

    return mr;
}
#ifdef CONFIG_TCG
/* Per-CPU record of one (IOMMU region, IOMMU index) pair the CPU has
* translated through; created by tcg_register_iommu_notifier().
*/
typedef struct TCGIOMMUNotifier {
/* Embedded notifier; the callback recovers this struct via container_of(). */
IOMMUNotifier n;
/* The IOMMU region this notifier is registered on. */
MemoryRegion *mr;
/* CPU whose TLB is flushed when an unmap notification arrives. */
CPUState *cpu;
int iommu_idx;
/* Set on registration/reuse; cleared by the callback after it flushes. */
bool active;
} TCGIOMMUNotifier;
/*
 * IOMMU unmap callback: flush the owning CPU's TLB once and mark the
 * notifier inactive so further notifications are ignored until it is
 * re-armed by tcg_register_iommu_notifier().
 */
static void tcg_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
{
    TCGIOMMUNotifier *self = container_of(n, TCGIOMMUNotifier, n);

    if (self->active) {
        tlb_flush(self->cpu);
        self->active = false;
        /*
         * We leave the notifier struct on the list to avoid reallocating
         * it later.  Generally the number of IOMMUs a CPU deals with will
         * be small.  In any case we can't unregister the iommu notifier
         * from a notify callback.
         */
    }
}
/*
 * Make sure this CPU has an IOMMU notifier registered for this
 * IOMMU/IOMMU index combination, so that we can flush its TLB
 * when the IOMMU tells us the mappings we've cached have changed.
 * An existing entry is reused; either way the entry ends up active.
 */
static void tcg_register_iommu_notifier(CPUState *cpu,
                                        IOMMUMemoryRegion *iommu_mr,
                                        int iommu_idx)
{
    MemoryRegion *mr = MEMORY_REGION(iommu_mr);
    TCGIOMMUNotifier *notifier = NULL;
    int slot;

    /* Look for an entry already tracking this region/index pair. */
    for (slot = 0; slot < cpu->iommu_notifiers->len; slot++) {
        TCGIOMMUNotifier *cand =
            g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier *, slot);

        if (cand->mr == mr && cand->iommu_idx == iommu_idx) {
            notifier = cand;
            break;
        }
    }

    if (!notifier) {
        /* Not found, add a new entry at the end of the array */
        cpu->iommu_notifiers = g_array_set_size(cpu->iommu_notifiers,
                                                slot + 1);
        notifier = g_new0(TCGIOMMUNotifier, 1);
        g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier *, slot) =
            notifier;

        notifier->mr = mr;
        notifier->iommu_idx = iommu_idx;
        notifier->cpu = cpu;
        /*
         * Rather than trying to register interest in the specific part
         * of the iommu's address space that we've accessed and then
         * expand it later as subsequent accesses touch more of it, we
         * just register interest in the whole thing, on the assumption
         * that iommu reconfiguration will be rare.
         */
        iommu_notifier_init(&notifier->n,
                            tcg_iommu_unmap_notify,
                            IOMMU_NOTIFIER_UNMAP,
                            0,
                            HWADDR_MAX,
                            iommu_idx);
        memory_region_register_iommu_notifier(notifier->mr, &notifier->n,
                                              &error_fatal);
    }

    /* (Re-)arm the notifier; the callback disarms it after flushing. */
    if (!notifier->active) {
        notifier->active = true;
    }
}
/*
 * Destroy the CPU's notifier list: unregister and free every notifier,
 * then release the array that held them.
 */
void tcg_iommu_free_notifier_list(CPUState *cpu)
{
    int idx;

    for (idx = 0; idx < cpu->iommu_notifiers->len; idx++) {
        TCGIOMMUNotifier *entry =
            g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier *, idx);

        memory_region_unregister_iommu_notifier(entry->mr, &entry->n);
        g_free(entry);
    }

    g_array_free(cpu->iommu_notifiers, true);
}
/*
 * Create the CPU's (initially empty) array of TCGIOMMUNotifier pointers.
 * The array is not zero-terminated; elements added by resizing start out
 * cleared.
 */
void tcg_iommu_init_notifier_list(CPUState *cpu)
{
    GArray *list = g_array_new(false, true, sizeof(TCGIOMMUNotifier *));

    cpu->iommu_notifiers = list;
}
/*
 * Translate @orig_addr in address space number @asidx of @cpu for the
 * purpose of filling a TLB entry, following any chain of IOMMUs.
 *
 * @cpu: the CPU performing the access
 * @asidx: index into cpu->cpu_ases
 * @orig_addr: address to translate; must be page aligned if the
 *             translation can fail (asserted on the failure path)
 * @xlat: out: translated offset on success, @orig_addr on failure
 * @plen: in/out: access length, clamped by the translation
 * @attrs: transaction attributes
 * @prot: in/out: PAGE_* bits; permissions an IOMMU denies are removed
 *
 * The dispatch is looked up afresh from the address space on every call
 * rather than being cached per CPU.  For each IOMMU crossed, a notifier
 * is registered so that the CPU's TLB is flushed when the IOMMU mappings
 * change.
 *
 * Returns the final (non-IOMMU) section, or the unassigned section of the
 * last dispatch consulted when no permissions are left.
 *
 * Called from RCU critical section.
 */
MemoryRegionSection *
address_space_translate_for_iotlb(CPUState *cpu, int asidx, hwaddr orig_addr,
                                  hwaddr *xlat, hwaddr *plen,
                                  MemTxAttrs attrs, int *prot)
{
    MemoryRegionSection *section;
    IOMMUMemoryRegion *iommu_mr;
    IOMMUMemoryRegionClass *imrc;
    IOMMUTLBEntry iotlb;
    int iommu_idx;
    hwaddr addr = orig_addr;
    AddressSpaceDispatch *d = address_space_to_dispatch(cpu->cpu_ases[asidx].as);

    for (;;) {
        section = address_space_translate_internal(d, addr, &addr, plen, false);

        iommu_mr = memory_region_get_iommu(section->mr);
        if (!iommu_mr) {
            break;
        }

        imrc = memory_region_get_iommu_class_nocheck(iommu_mr);

        /*
         * attrs_to_index is an optional class hook (the non-TCG
         * translation path in this file checks it too); fall back to
         * index 0 instead of calling through a NULL pointer.
         */
        iommu_idx = imrc->attrs_to_index
                    ? imrc->attrs_to_index(iommu_mr, attrs)
                    : 0;
        tcg_register_iommu_notifier(cpu, iommu_mr, iommu_idx);
        /* We need all the permissions, so pass IOMMU_NONE so the IOMMU
         * doesn't short-cut its translation table walk.
         */
        iotlb = imrc->translate(iommu_mr, addr, IOMMU_NONE, iommu_idx);
        addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
                | (addr & iotlb.addr_mask));
        /* Update the caller's prot bits to remove permissions the IOMMU
         * is giving us a failure response for. If we get down to no
         * permissions left at all we can give up now.
         */
        if (!(iotlb.perm & IOMMU_RO)) {
            *prot &= ~(PAGE_READ | PAGE_EXEC);
        }
        if (!(iotlb.perm & IOMMU_WO)) {
            *prot &= ~PAGE_WRITE;
        }

        if (!*prot) {
            goto translate_fail;
        }

        d = flatview_to_dispatch(address_space_to_flatview(iotlb.target_as));
    }

    assert(!memory_region_is_iommu(section->mr));
    *xlat = addr;
    return section;

translate_fail:
    /*
     * We should be given a page-aligned address -- certainly
     * tlb_set_page_with_attrs() does so.  The page offset of xlat
     * is used to index sections[], and PHYS_SECTION_UNASSIGNED = 0.
     * The page portion of xlat will be logged by memory_region_access_valid()
     * when this memory access is rejected, so use the original untranslated
     * physical address.
     */
    assert((orig_addr & ~TARGET_PAGE_MASK) == 0);
    *xlat = orig_addr;
    return &d->map.sections[PHYS_SECTION_UNASSIGNED];
}
#endif /* CONFIG_TCG */
/*
 * Create a new AddressSpace rooted at @mr, named "<prefix>-<cpu_index>",
 * and install it as address space number @asidx of @cpu.
 *
 * The per-CPU array is allocated on first use with cc->max_as + 1 slots.
 * Index 0 is additionally published through cpu->as.  When TCG is
 * enabled, a memory listener is registered on the new address space.
 */
void cpu_address_space_init(CPUState *cpu, int asidx,
                            const char *prefix, MemoryRegion *mr)
{
    AddressSpace *as = g_new0(AddressSpace, 1);
    CPUAddressSpace *slot;
    MemoryListener *listener;
    char *as_name;

    assert(mr);
    as_name = g_strdup_printf("%s-%d", prefix, cpu->cpu_index);
    address_space_init(as, mr, as_name);
    g_free(as_name);

    /* Target code should have set max_as before calling us */
    assert(asidx <= cpu->cc->max_as);

    if (asidx == 0) {
        /* address space 0 gets the convenience alias */
        cpu->as = as;
    }

    if (!cpu->cpu_ases) {
        cpu->cpu_ases = g_new0(CPUAddressSpace, cpu->cc->max_as + 1);
    }

    slot = &cpu->cpu_ases[asidx];
    slot->cpu = cpu;
    slot->as = as;

    if (tcg_enabled()) {
        listener = &slot->tcg_as_listener;
        listener->log_global_after_sync = tcg_log_global_after_sync;
        listener->commit = tcg_commit;
        listener->name = "tcg";
        memory_listener_register(listener, as);
    }
}
physmem: Destroy all CPU AddressSpaces on unrealize When we unrealize a CPU object (which happens on vCPU hot-unplug), we should destroy all the AddressSpace objects we created via calls to cpu_address_space_init() when the CPU was realized. Commit 24bec42f3d6eae added a function to do this for a specific AddressSpace, but did not add any places where the function was called. Since we always want to destroy all the AddressSpaces on unrealize, regardless of the target architecture, we don't need to try to keep track of how many are still undestroyed, or make the target architecture code manually call a destroy function for each AS it created. Instead we can adjust the function to always completely destroy the whole cpu->ases array, and arrange for it to be called during CPU unrealize as part of the common code. Without this fix, AddressSanitizer will report a leak like this from a run where we hot-plugged and then hot-unplugged an x86 KVM vCPU: Direct leak of 416 byte(s) in 1 object(s) allocated from: #0 0x5b638565053d in calloc (/data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/qemu-system-x86_64+0x1ee153d) (BuildId: c1cd6022b195142106e1bffeca23498c2b752bca) #1 0x7c28083f77b1 in g_malloc0 (/lib/x86_64-linux-gnu/libglib-2.0.so.0+0x637b1) (BuildId: 1eb6131419edb83b2178b682829a6913cf682d75) #2 0x5b6386999c7c in cpu_address_space_init /data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../system/physmem.c:797:25 #3 0x5b638727f049 in kvm_cpu_realizefn /data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../target/i386/kvm/kvm-cpu.c:102:5 #4 0x5b6385745f40 in accel_cpu_common_realize /data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../accel/accel-common.c:101:13 #5 0x5b638568fe3c in cpu_exec_realizefn /data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../hw/core/cpu-common.c:232:10 #6 0x5b63874a2cd5 in x86_cpu_realizefn 
/data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../target/i386/cpu.c:9321:5 #7 0x5b6387a0469a in device_set_realized /data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../hw/core/qdev.c:494:13 #8 0x5b6387a27d9e in property_set_bool /data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../qom/object.c:2375:5 #9 0x5b6387a2090b in object_property_set /data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../qom/object.c:1450:5 #10 0x5b6387a35b05 in object_property_set_qobject /data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../qom/qom-qobject.c:28:10 #11 0x5b6387a21739 in object_property_set_bool /data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../qom/object.c:1520:15 #12 0x5b63879fe510 in qdev_realize /data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../hw/core/qdev.c:276:12 Cc: qemu-stable@nongnu.org Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2517 Signed-off-by: Peter Maydell <peter.maydell@linaro.org> Reviewed-by: David Hildenbrand <david@redhat.com> Link: https://lore.kernel.org/r/20250929144228.1994037-4-peter.maydell@linaro.org Signed-off-by: Peter Xu <peterx@redhat.com>
2025-09-29 15:42:28 +01:00
void cpu_destroy_address_spaces(CPUState *cpu)
{
CPUAddressSpace *cpuas;
physmem: Destroy all CPU AddressSpaces on unrealize When we unrealize a CPU object (which happens on vCPU hot-unplug), we should destroy all the AddressSpace objects we created via calls to cpu_address_space_init() when the CPU was realized. Commit 24bec42f3d6eae added a function to do this for a specific AddressSpace, but did not add any places where the function was called. Since we always want to destroy all the AddressSpaces on unrealize, regardless of the target architecture, we don't need to try to keep track of how many are still undestroyed, or make the target architecture code manually call a destroy function for each AS it created. Instead we can adjust the function to always completely destroy the whole cpu->ases array, and arrange for it to be called during CPU unrealize as part of the common code. Without this fix, AddressSanitizer will report a leak like this from a run where we hot-plugged and then hot-unplugged an x86 KVM vCPU: Direct leak of 416 byte(s) in 1 object(s) allocated from: #0 0x5b638565053d in calloc (/data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/qemu-system-x86_64+0x1ee153d) (BuildId: c1cd6022b195142106e1bffeca23498c2b752bca) #1 0x7c28083f77b1 in g_malloc0 (/lib/x86_64-linux-gnu/libglib-2.0.so.0+0x637b1) (BuildId: 1eb6131419edb83b2178b682829a6913cf682d75) #2 0x5b6386999c7c in cpu_address_space_init /data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../system/physmem.c:797:25 #3 0x5b638727f049 in kvm_cpu_realizefn /data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../target/i386/kvm/kvm-cpu.c:102:5 #4 0x5b6385745f40 in accel_cpu_common_realize /data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../accel/accel-common.c:101:13 #5 0x5b638568fe3c in cpu_exec_realizefn /data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../hw/core/cpu-common.c:232:10 #6 0x5b63874a2cd5 in x86_cpu_realizefn 
/data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../target/i386/cpu.c:9321:5 #7 0x5b6387a0469a in device_set_realized /data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../hw/core/qdev.c:494:13 #8 0x5b6387a27d9e in property_set_bool /data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../qom/object.c:2375:5 #9 0x5b6387a2090b in object_property_set /data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../qom/object.c:1450:5 #10 0x5b6387a35b05 in object_property_set_qobject /data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../qom/qom-qobject.c:28:10 #11 0x5b6387a21739 in object_property_set_bool /data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../qom/object.c:1520:15 #12 0x5b63879fe510 in qdev_realize /data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../hw/core/qdev.c:276:12 Cc: qemu-stable@nongnu.org Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2517 Signed-off-by: Peter Maydell <peter.maydell@linaro.org> Reviewed-by: David Hildenbrand <david@redhat.com> Link: https://lore.kernel.org/r/20250929144228.1994037-4-peter.maydell@linaro.org Signed-off-by: Peter Xu <peterx@redhat.com>
2025-09-29 15:42:28 +01:00
int asidx;
assert(cpu->cpu_ases);
physmem: Destroy all CPU AddressSpaces on unrealize When we unrealize a CPU object (which happens on vCPU hot-unplug), we should destroy all the AddressSpace objects we created via calls to cpu_address_space_init() when the CPU was realized. Commit 24bec42f3d6eae added a function to do this for a specific AddressSpace, but did not add any places where the function was called. Since we always want to destroy all the AddressSpaces on unrealize, regardless of the target architecture, we don't need to try to keep track of how many are still undestroyed, or make the target architecture code manually call a destroy function for each AS it created. Instead we can adjust the function to always completely destroy the whole cpu->ases array, and arrange for it to be called during CPU unrealize as part of the common code. Without this fix, AddressSanitizer will report a leak like this from a run where we hot-plugged and then hot-unplugged an x86 KVM vCPU: Direct leak of 416 byte(s) in 1 object(s) allocated from: #0 0x5b638565053d in calloc (/data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/qemu-system-x86_64+0x1ee153d) (BuildId: c1cd6022b195142106e1bffeca23498c2b752bca) #1 0x7c28083f77b1 in g_malloc0 (/lib/x86_64-linux-gnu/libglib-2.0.so.0+0x637b1) (BuildId: 1eb6131419edb83b2178b682829a6913cf682d75) #2 0x5b6386999c7c in cpu_address_space_init /data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../system/physmem.c:797:25 #3 0x5b638727f049 in kvm_cpu_realizefn /data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../target/i386/kvm/kvm-cpu.c:102:5 #4 0x5b6385745f40 in accel_cpu_common_realize /data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../accel/accel-common.c:101:13 #5 0x5b638568fe3c in cpu_exec_realizefn /data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../hw/core/cpu-common.c:232:10 #6 0x5b63874a2cd5 in x86_cpu_realizefn 
/data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../target/i386/cpu.c:9321:5 #7 0x5b6387a0469a in device_set_realized /data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../hw/core/qdev.c:494:13 #8 0x5b6387a27d9e in property_set_bool /data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../qom/object.c:2375:5 #9 0x5b6387a2090b in object_property_set /data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../qom/object.c:1450:5 #10 0x5b6387a35b05 in object_property_set_qobject /data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../qom/qom-qobject.c:28:10 #11 0x5b6387a21739 in object_property_set_bool /data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../qom/object.c:1520:15 #12 0x5b63879fe510 in qdev_realize /data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../hw/core/qdev.c:276:12 Cc: qemu-stable@nongnu.org Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2517 Signed-off-by: Peter Maydell <peter.maydell@linaro.org> Reviewed-by: David Hildenbrand <david@redhat.com> Link: https://lore.kernel.org/r/20250929144228.1994037-4-peter.maydell@linaro.org Signed-off-by: Peter Xu <peterx@redhat.com>
2025-09-29 15:42:28 +01:00
/* convenience alias just points to some cpu_ases[n] */
cpu->as = NULL;
for (asidx = 0; asidx <= cpu->cc->max_as; asidx++) {
physmem: Destroy all CPU AddressSpaces on unrealize When we unrealize a CPU object (which happens on vCPU hot-unplug), we should destroy all the AddressSpace objects we created via calls to cpu_address_space_init() when the CPU was realized. Commit 24bec42f3d6eae added a function to do this for a specific AddressSpace, but did not add any places where the function was called. Since we always want to destroy all the AddressSpaces on unrealize, regardless of the target architecture, we don't need to try to keep track of how many are still undestroyed, or make the target architecture code manually call a destroy function for each AS it created. Instead we can adjust the function to always completely destroy the whole cpu->ases array, and arrange for it to be called during CPU unrealize as part of the common code. Without this fix, AddressSanitizer will report a leak like this from a run where we hot-plugged and then hot-unplugged an x86 KVM vCPU: Direct leak of 416 byte(s) in 1 object(s) allocated from: #0 0x5b638565053d in calloc (/data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/qemu-system-x86_64+0x1ee153d) (BuildId: c1cd6022b195142106e1bffeca23498c2b752bca) #1 0x7c28083f77b1 in g_malloc0 (/lib/x86_64-linux-gnu/libglib-2.0.so.0+0x637b1) (BuildId: 1eb6131419edb83b2178b682829a6913cf682d75) #2 0x5b6386999c7c in cpu_address_space_init /data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../system/physmem.c:797:25 #3 0x5b638727f049 in kvm_cpu_realizefn /data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../target/i386/kvm/kvm-cpu.c:102:5 #4 0x5b6385745f40 in accel_cpu_common_realize /data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../accel/accel-common.c:101:13 #5 0x5b638568fe3c in cpu_exec_realizefn /data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../hw/core/cpu-common.c:232:10 #6 0x5b63874a2cd5 in x86_cpu_realizefn 
/data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../target/i386/cpu.c:9321:5 #7 0x5b6387a0469a in device_set_realized /data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../hw/core/qdev.c:494:13 #8 0x5b6387a27d9e in property_set_bool /data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../qom/object.c:2375:5 #9 0x5b6387a2090b in object_property_set /data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../qom/object.c:1450:5 #10 0x5b6387a35b05 in object_property_set_qobject /data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../qom/qom-qobject.c:28:10 #11 0x5b6387a21739 in object_property_set_bool /data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../qom/object.c:1520:15 #12 0x5b63879fe510 in qdev_realize /data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../hw/core/qdev.c:276:12 Cc: qemu-stable@nongnu.org Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2517 Signed-off-by: Peter Maydell <peter.maydell@linaro.org> Reviewed-by: David Hildenbrand <david@redhat.com> Link: https://lore.kernel.org/r/20250929144228.1994037-4-peter.maydell@linaro.org Signed-off-by: Peter Xu <peterx@redhat.com>
2025-09-29 15:42:28 +01:00
cpuas = &cpu->cpu_ases[asidx];
if (!cpuas->as) {
/* This index was never initialized; no deinit needed */
continue;
}
if (tcg_enabled()) {
memory_listener_unregister(&cpuas->tcg_as_listener);
}
g_clear_pointer(&cpuas->as, address_space_destroy_free);
}
physmem: Destroy all CPU AddressSpaces on unrealize When we unrealize a CPU object (which happens on vCPU hot-unplug), we should destroy all the AddressSpace objects we created via calls to cpu_address_space_init() when the CPU was realized. Commit 24bec42f3d6eae added a function to do this for a specific AddressSpace, but did not add any places where the function was called. Since we always want to destroy all the AddressSpaces on unrealize, regardless of the target architecture, we don't need to try to keep track of how many are still undestroyed, or make the target architecture code manually call a destroy function for each AS it created. Instead we can adjust the function to always completely destroy the whole cpu->ases array, and arrange for it to be called during CPU unrealize as part of the common code. Without this fix, AddressSanitizer will report a leak like this from a run where we hot-plugged and then hot-unplugged an x86 KVM vCPU: Direct leak of 416 byte(s) in 1 object(s) allocated from: #0 0x5b638565053d in calloc (/data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/qemu-system-x86_64+0x1ee153d) (BuildId: c1cd6022b195142106e1bffeca23498c2b752bca) #1 0x7c28083f77b1 in g_malloc0 (/lib/x86_64-linux-gnu/libglib-2.0.so.0+0x637b1) (BuildId: 1eb6131419edb83b2178b682829a6913cf682d75) #2 0x5b6386999c7c in cpu_address_space_init /data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../system/physmem.c:797:25 #3 0x5b638727f049 in kvm_cpu_realizefn /data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../target/i386/kvm/kvm-cpu.c:102:5 #4 0x5b6385745f40 in accel_cpu_common_realize /data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../accel/accel-common.c:101:13 #5 0x5b638568fe3c in cpu_exec_realizefn /data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../hw/core/cpu-common.c:232:10 #6 0x5b63874a2cd5 in x86_cpu_realizefn 
/data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../target/i386/cpu.c:9321:5 #7 0x5b6387a0469a in device_set_realized /data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../hw/core/qdev.c:494:13 #8 0x5b6387a27d9e in property_set_bool /data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../qom/object.c:2375:5 #9 0x5b6387a2090b in object_property_set /data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../qom/object.c:1450:5 #10 0x5b6387a35b05 in object_property_set_qobject /data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../qom/qom-qobject.c:28:10 #11 0x5b6387a21739 in object_property_set_bool /data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../qom/object.c:1520:15 #12 0x5b63879fe510 in qdev_realize /data_nvme1n1/linaro/qemu-from-laptop/qemu/build/x86-tgts-asan/../../hw/core/qdev.c:276:12 Cc: qemu-stable@nongnu.org Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2517 Signed-off-by: Peter Maydell <peter.maydell@linaro.org> Reviewed-by: David Hildenbrand <david@redhat.com> Link: https://lore.kernel.org/r/20250929144228.1994037-4-peter.maydell@linaro.org Signed-off-by: Peter Xu <peterx@redhat.com>
2025-09-29 15:42:28 +01:00
g_clear_pointer(&cpu->cpu_ases, g_free);
}
/*
 * Look up one of @cpu's address spaces by index.
 *
 * Returns the 'as' pointer stored in slot @asidx of cpu->cpu_ases.
 * Neither @asidx nor cpu->cpu_ases is validated here.
 */
AddressSpace *cpu_get_address_space(CPUState *cpu, int asidx)
{
    return (cpu->cpu_ases + asidx)->as;
}
/*
 * Find the RAMBlock whose [offset, offset + max_length) window contains
 * @addr.  The most-recently-used block cached in ram_list.mru_block is
 * tried first; otherwise the block list is walked and the match is cached.
 * If no block matches, a diagnostic is printed and the process aborts.
 *
 * Called from RCU critical section.
 */
static RAMBlock *qemu_get_ram_block(ram_addr_t addr)
{
    RAMBlock *rb = qatomic_rcu_read(&ram_list.mru_block);

    /*
     * "addr - offset < max_length" is a single-comparison range check
     * (presumably relying on ram_addr_t being unsigned, so that an address
     * below the block's offset wraps to a huge value and fails the test).
     */
    if (rb && addr - rb->offset < rb->max_length) {
        return rb;
    }

    RAMBLOCK_FOREACH(rb) {
        if (addr - rb->offset < rb->max_length) {
            /*
             * It is safe to write mru_block outside the BQL.  This
             * is what happens:
             *
             * mru_block = xxx
             * rcu_read_unlock()
             * xxx removed from list
             * rcu_read_lock()
             * read mru_block
             * mru_block = NULL;
             * call_rcu(reclaim_ramblock, xxx);
             * rcu_read_unlock()
             *
             * qatomic_rcu_set is not needed here.  The block was already
             * published when it was placed into the list.  Here we're just
             * making an extra copy of the pointer.
             */
            ram_list.mru_block = rb;
            return rb;
        }
    }

    /* No RAMBlock covers this address: treat it as a fatal error. */
    fprintf(stderr, "Bad ram offset %" PRIx64 "\n", (uint64_t)addr);
    abort();
}
/*
 * Call tlb_reset_dirty() on every CPU for the RAM range beginning at
 * @start.  Only valid when TCG is enabled (asserted).
 *
 * The page-aligned range [start, start + length) must lie within a single
 * RAMBlock; this is asserted by looking up both its first and last byte.
 */
void tlb_reset_dirty_range_all(ram_addr_t start, ram_addr_t length)
{
    ram_addr_t limit, host_start;
    RAMBlock *rb;
    CPUState *cs;

    assert(tcg_enabled());

    /* Round the end up and the start down to target page boundaries. */
    limit = TARGET_PAGE_ALIGN(start + length);
    start = start & TARGET_PAGE_MASK;

    RCU_READ_LOCK_GUARD();

    rb = qemu_get_ram_block(start);
    assert(rb == qemu_get_ram_block(limit - 1));

    /* tlb_reset_dirty() is handed the address returned by ramblock_ptr(). */
    host_start = (uintptr_t)ramblock_ptr(rb, start - rb->offset);

    CPU_FOREACH(cs) {
        tlb_reset_dirty(cs, host_start, length);
    }
}
/*
 * Notification hook for callers that have just cleared dirty bits in
 * [start, start + length).  When TCG is enabled this forwards to
 * tlb_reset_dirty_range_all(); otherwise it does nothing.
 */
void physical_memory_dirty_bits_cleared(ram_addr_t start, ram_addr_t length)
{
    if (!tcg_enabled()) {
        return;
    }

    tlb_reset_dirty_range_all(start, length);
}
/*
 * Report whether ANY page overlapping [start, start + length) has its
 * dirty bit set for @client.
 *
 * The per-client dirty bitmap is split into chunks of
 * DIRTY_MEMORY_BLOCK_SIZE pages, so the page range is scanned one chunk at
 * a time under the RCU read lock.
 */
static bool physical_memory_get_dirty(ram_addr_t start, ram_addr_t length,
                                      unsigned client)
{
    DirtyMemoryBlocks *blocks;
    unsigned long page, end;
    bool any_dirty = false;

    assert(client < DIRTY_MEMORY_NUM);

    end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
    page = start >> TARGET_PAGE_BITS;

    WITH_RCU_READ_LOCK_GUARD() {
        blocks = qatomic_rcu_read(&ram_list.dirty_memory[client]);

        while (!any_dirty && page < end) {
            /* Locate the chunk holding 'page' and the bit position in it. */
            unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
            unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
            unsigned long base = page - offset;
            /* Stop at the end of this chunk or of the range. */
            unsigned long next = MIN(end, base + DIRTY_MEMORY_BLOCK_SIZE);
            unsigned long num = next - base;

            /* A result below 'num' means a set bit exists in the window. */
            any_dirty = find_next_bit(blocks->blocks[idx], num, offset) < num;

            /* 'next' is chunk-aligned (or == end), so offset becomes 0. */
            page = next;
        }
    }

    return any_dirty;
}
/*
 * Return the @client dirty state of the single page containing @addr.
 */
bool physical_memory_get_dirty_flag(ram_addr_t addr, unsigned client)
{
    /* A one-byte range covers just the page that contains @addr. */
    const ram_addr_t probe_len = 1;

    return physical_memory_get_dirty(addr, probe_len, client);
}
/*
 * Return true if the page containing @addr is clean for AT LEAST ONE of
 * the VGA, CODE or MIGRATION dirty-memory clients; false only when it is
 * dirty for all three.
 *
 * All three flags are always queried, in VGA, CODE, MIGRATION order.
 */
bool physical_memory_is_clean(ram_addr_t addr)
{
    bool vga_clean = !physical_memory_get_dirty_flag(addr, DIRTY_MEMORY_VGA);
    bool code_clean = !physical_memory_get_dirty_flag(addr, DIRTY_MEMORY_CODE);
    bool migration_clean =
        !physical_memory_get_dirty_flag(addr, DIRTY_MEMORY_MIGRATION);

    return vga_clean || code_clean || migration_clean;
}
/*
 * Report whether EVERY page overlapping [start, start + length) has its
 * dirty bit set for @client.
 *
 * The per-client dirty bitmap is split into chunks of
 * DIRTY_MEMORY_BLOCK_SIZE pages, so the page range is scanned one chunk at
 * a time; the RCU read lock guard is held until the function returns.
 */
static bool physical_memory_all_dirty(ram_addr_t start, ram_addr_t length,
                                      unsigned client)
{
    DirtyMemoryBlocks *blocks;
    unsigned long page, end;
    bool all_dirty = true;

    assert(client < DIRTY_MEMORY_NUM);

    end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
    page = start >> TARGET_PAGE_BITS;

    RCU_READ_LOCK_GUARD();

    blocks = qatomic_rcu_read(&ram_list.dirty_memory[client]);

    while (all_dirty && page < end) {
        /* Locate the chunk holding 'page' and the bit position in it. */
        unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
        unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
        unsigned long base = page - offset;
        /* Stop at the end of this chunk or of the range. */
        unsigned long next = MIN(end, base + DIRTY_MEMORY_BLOCK_SIZE);
        unsigned long num = next - base;

        /* A result below 'num' means a clear bit exists in the window. */
        all_dirty = find_next_zero_bit(blocks->blocks[idx], num, offset) >= num;

        /* 'next' is chunk-aligned (or == end), so offset becomes 0. */
        page = next;
    }

    return all_dirty;
}
/*
 * For each dirty-memory client selected in @mask (bit n == client n),
 * check whether [start, start + length) contains at least one page that is
 * NOT dirty for that client.
 *
 * Returns the subset of @mask whose clients have such a clean page.
 * Clients not selected in @mask are not queried at all.
 */
uint8_t physical_memory_range_includes_clean(ram_addr_t start,
                                             ram_addr_t length,
                                             uint8_t mask)
{
    /* Clients are examined in this fixed order. */
    const unsigned clients[] = {
        DIRTY_MEMORY_VGA,
        DIRTY_MEMORY_CODE,
        DIRTY_MEMORY_MIGRATION,
    };
    uint8_t clean_mask = 0;
    unsigned i;

    for (i = 0; i < sizeof(clients) / sizeof(clients[0]); i++) {
        uint8_t bit = 1 << clients[i];

        if ((mask & bit) &&
            !physical_memory_all_dirty(start, length, clients[i])) {
            clean_mask |= bit;
        }
    }

    return clean_mask;
}
/*
 * Atomically set the @client dirty bit of the page containing @addr.
 */
void physical_memory_set_dirty_flag(ram_addr_t addr, unsigned client)
{
    DirtyMemoryBlocks *blocks;
    unsigned long page;

    assert(client < DIRTY_MEMORY_NUM);

    page = addr >> TARGET_PAGE_BITS;

    RCU_READ_LOCK_GUARD();

    blocks = qatomic_rcu_read(&ram_list.dirty_memory[client]);

    /* Chunk index is page / BLOCK_SIZE; bit within it is page % BLOCK_SIZE. */
    set_bit_atomic(page % DIRTY_MEMORY_BLOCK_SIZE,
                   blocks->blocks[page / DIRTY_MEMORY_BLOCK_SIZE]);
}
/*
 * Mark every page overlapping [start, start + length) dirty for each
 * client selected in @mask (bit n == client n).
 *
 * When Xen is enabled, xen_hvm_modified_memory() is additionally called
 * for the range, even if @mask is zero.  With a zero @mask and Xen
 * disabled the function returns immediately.
 */
void physical_memory_set_dirty_range(ram_addr_t start, ram_addr_t length,
                                     uint8_t mask)
{
    DirtyMemoryBlocks *blocks[DIRTY_MEMORY_NUM];
    unsigned long page, end;
    int client;

    if (!mask && !xen_enabled()) {
        return;
    }

    end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
    page = start >> TARGET_PAGE_BITS;

    WITH_RCU_READ_LOCK_GUARD() {
        /* Snapshot all per-client bitmap pointers under the RCU lock. */
        for (client = 0; client < DIRTY_MEMORY_NUM; client++) {
            blocks[client] = qatomic_rcu_read(&ram_list.dirty_memory[client]);
        }

        while (page < end) {
            /* Locate the chunk holding 'page' and the bit position in it. */
            unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
            unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
            /* Stop at the end of this chunk or of the range. */
            unsigned long next = MIN(end,
                                     page - offset + DIRTY_MEMORY_BLOCK_SIZE);
            unsigned long npages = next - page;

            if (likely(mask & (1 << DIRTY_MEMORY_MIGRATION))) {
                bitmap_set_atomic(blocks[DIRTY_MEMORY_MIGRATION]->blocks[idx],
                                  offset, npages);
            }
            if (unlikely(mask & (1 << DIRTY_MEMORY_VGA))) {
                bitmap_set_atomic(blocks[DIRTY_MEMORY_VGA]->blocks[idx],
                                  offset, npages);
            }
            if (unlikely(mask & (1 << DIRTY_MEMORY_CODE))) {
                bitmap_set_atomic(blocks[DIRTY_MEMORY_CODE]->blocks[idx],
                                  offset, npages);
            }

            /* 'next' is chunk-aligned (or == end), so offset becomes 0. */
            page = next;
        }
    }

    if (xen_enabled()) {
        xen_hvm_modified_memory(start, length);
    }
}
migration: merge fragmented clear_dirty ioctls In our long-term experience in Bytedance, we've found that under the same load, live migration of larger VMs with more devices is often more difficult to converge (requiring a larger downtime limit). Through some testing and calculations, we conclude that bitmap sync time affects the calculation of live migration bandwidth. When the addresses processed are not aligned, a large number of clear_dirty ioctl occur (e.g. a 4MB misaligned memory can generate 2048 clear_dirty ioctls from two different memory_listener), which increases the time required for bitmap_sync and makes it more difficult for dirty pages to converge. For a 64C256G vm with 8 vhost-user-net(32 queue per nic) and 16 vhost-user-blk(4 queue per blk), the sync time is as high as *73ms* (tested with 10GBps dirty rate, the sync time increases as the dirty page rate increases), Here are each part of the sync time: - sync from kvm to ram_list: 2.5ms - vhost_log_sync:3ms - sync aligned memory from ram_list to RAMBlock: 5ms - sync misaligned memory from ram_list to RAMBlock: 61ms Attempt to merge those fragmented clear_dirty ioctls, then syncing misaligned memory from ram_list to RAMBlock takes only about 1ms, and the total sync time is only *12ms*. Signed-off-by: Chuang Xu <xuchuangxclwt@bytedance.com> Reviewed-by: Fabiano Rosas <farosas@suse.de> Link: https://lore.kernel.org/r/20251218114220.83354-1-xuchuangxclwt@bytedance.com [peterx: drop var "offset" in physical_memory_sync_dirty_bitmap] Signed-off-by: Peter Xu <peterx@redhat.com>
2025-12-18 19:42:20 +08:00
/*
 * Test-and-clear the @client dirty bit of every guest page overlapping
 * [start, start + length), then call memory_region_clear_dirty_bitmap() on
 * the owning RAMBlock's MemoryRegion for the same page-aligned span.
 *
 * Note: the whole range must lie within a single RAMBlock's used_length
 * (asserted below).  A zero @length returns 0 without doing anything.
 *
 * @bmap usage:
 * - When @bmap is provided, set bits for dirty pages (indexed by page
 *   number relative to the start of the RAMBlock), but only count those
 *   pages if the bit wasn't already set in @bmap.
 * - When @bmap is NULL, count all dirty pages in the range.
 *
 * If any dirty bit was actually cleared, physical_memory_dirty_bits_cleared()
 * is called for the range, regardless of how many pages were counted.
 *
 * @return:
 * - Number of dirty guest pages found within [start, start + length),
 *   subject to the @bmap counting rule above.
 */
uint64_t physical_memory_test_and_clear_dirty(ram_addr_t start,
                                              ram_addr_t length,
                                              unsigned client,
                                              unsigned long *bmap)
{
    DirtyMemoryBlocks *blocks;
    unsigned long end, page, start_page;
    uint64_t num_dirty = 0;
    /* True once any dirty bit has been cleared, even if it was not counted. */
    bool cleared = false;
    RAMBlock *ramblock;
    uint64_t mr_offset, mr_size;

    if (length == 0) {
        return 0;
    }

    end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
    start_page = start >> TARGET_PAGE_BITS;
    page = start_page;

    WITH_RCU_READ_LOCK_GUARD() {
        blocks = qatomic_rcu_read(&ram_list.dirty_memory[client]);
        ramblock = qemu_get_ram_block(start);
        /* Range sanity check on the ramblock */
        assert(start >= ramblock->offset &&
               start + length <= ramblock->offset + ramblock->used_length);

        while (page < end) {
            unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
            unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;

            if (bitmap_test_and_clear_atomic(blocks->blocks[idx], offset, 1)) {
                cleared = true;
                if (bmap) {
                    /* @bmap is indexed relative to the RAMBlock's first page */
                    unsigned long k =
                        page - (ramblock->offset >> TARGET_PAGE_BITS);

                    if (!test_and_set_bit(k, bmap)) {
                        num_dirty++;
                    }
                } else {
                    num_dirty++;
                }
            }

            page++;
        }

        /*
         * Widen before shifting so the byte offset/size are computed in the
         * full 64-bit type rather than in 'unsigned long'.
         */
        mr_offset = ((ram_addr_t)start_page << TARGET_PAGE_BITS) -
                    ramblock->offset;
        mr_size = (uint64_t)(end - start_page) << TARGET_PAGE_BITS;
        memory_region_clear_dirty_bitmap(ramblock->mr, mr_offset, mr_size);
    }

    /*
     * Key the notification on "a bit was cleared", not on num_dirty: with
     * @bmap supplied, pages already recorded in @bmap are cleared here but
     * not counted, and the notification (which resets TLB dirty tracking
     * when TCG is enabled) must still happen for them.
     */
    if (cleared) {
        physical_memory_dirty_bits_cleared(start, length);
    }

    return num_dirty;
}
/*
 * Clear the dirty bits of [addr, addr + length) for the migration, VGA
 * and code dirty-memory clients, in that order.
 */
static void physical_memory_clear_dirty_range(ram_addr_t addr, ram_addr_t length)
{
    static const unsigned clients[] = {
        DIRTY_MEMORY_MIGRATION,
        DIRTY_MEMORY_VGA,
        DIRTY_MEMORY_CODE,
    };
    size_t i;

    for (i = 0; i < sizeof(clients) / sizeof(clients[0]); i++) {
        physical_memory_test_and_clear_dirty(addr, length, clients[i], NULL);
    }
}
/*
 * Copy the dirty bits of @client covering [offset, offset + length) of @mr
 * into a newly allocated snapshot, clearing them in the live bitmap as they
 * are copied.
 *
 * The covered range is widened to a multiple of
 * 1 << (TARGET_PAGE_BITS + BITS_PER_LEVEL) bytes on both sides; the widened
 * bounds are recorded in snap->start / snap->end.
 *
 * Returns the snapshot, allocated with g_malloc0().
 */
DirtyBitmapSnapshot *physical_memory_snapshot_and_clear_dirty
    (MemoryRegion *mr, hwaddr offset, hwaddr length, unsigned client)
{
    const unsigned long align = 1UL << (TARGET_PAGE_BITS + BITS_PER_LEVEL);
    ram_addr_t start = memory_region_get_ram_addr(mr);
    ram_addr_t first, last;
    unsigned long page, end;
    unsigned long dest = 0;
    DirtyMemoryBlocks *blocks;
    DirtyBitmapSnapshot *snap;

    /* We know we're only called for RAM MemoryRegions */
    assert(start != RAM_ADDR_INVALID);
    start += offset;

    first = QEMU_ALIGN_DOWN(start, align);
    last = QEMU_ALIGN_UP(start + length, align);

    /* One bit per target page: (bytes >> TARGET_PAGE_BITS) / 8 bytes. */
    snap = g_malloc0(sizeof(*snap) +
                     ((last - first) >> (TARGET_PAGE_BITS + 3)));
    snap->start = first;
    snap->end = last;

    end = last >> TARGET_PAGE_BITS;

    WITH_RCU_READ_LOCK_GUARD() {
        blocks = qatomic_rcu_read(&ram_list.dirty_memory[client]);

        for (page = first >> TARGET_PAGE_BITS; page < end; ) {
            unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
            unsigned long ofs = page % DIRTY_MEMORY_BLOCK_SIZE;
            /* Never cross the end of the current dirty-memory block. */
            unsigned long num = MIN(end - page,
                                    DIRTY_MEMORY_BLOCK_SIZE - ofs);

            assert(QEMU_IS_ALIGNED(ofs, (1 << BITS_PER_LEVEL)));
            assert(QEMU_IS_ALIGNED(num, (1 << BITS_PER_LEVEL)));

            /* Convert the bit offset into a word offset. */
            ofs >>= BITS_PER_LEVEL;
            bitmap_copy_and_clear_atomic(snap->dirty + dest,
                                         blocks->blocks[idx] + ofs,
                                         num);

            dest += num >> BITS_PER_LEVEL;
            page += num;
        }
    }

    physical_memory_dirty_bits_cleared(start, length);
    memory_region_clear_dirty_bitmap(mr, offset, length);

    return snap;
}
/*
 * Report whether any page overlapping [start, start + length) is marked
 * dirty in @snap.  The range must lie within [snap->start, snap->end].
 */
bool physical_memory_snapshot_get_dirty(DirtyBitmapSnapshot *snap,
                                        ram_addr_t start,
                                        ram_addr_t length)
{
    unsigned long bit, limit;

    assert(start >= snap->start);
    assert(start + length <= snap->end);

    /* Bit indices are relative to the first page held by the snapshot. */
    limit = TARGET_PAGE_ALIGN(start + length - snap->start) >> TARGET_PAGE_BITS;

    for (bit = (start - snap->start) >> TARGET_PAGE_BITS; bit < limit; bit++) {
        if (test_bit(bit, snap->dirty)) {
            return true;
        }
    }

    return false;
}
/*
 * Set dirty bits for guest pages from a little-endian bitmap.
 *
 * @bitmap: array of unsigned long words; each word is decoded with
 *          ldn_le_p() before use
 * @start:  ram address that bit 0 of @bitmap refers to
 * @pages:  number of bits in @bitmap; processing is done in whole words,
 *          so the count is rounded up to a multiple of the word size
 *
 * Returns the sum of the population counts of all non-zero words processed.
 */
uint64_t physical_memory_set_dirty_lebitmap(unsigned long *bitmap,
ram_addr_t start,
ram_addr_t pages)
{
unsigned long i, j;
unsigned long page_number, c, nbits;
hwaddr addr;
ram_addr_t ram_addr;
uint64_t num_dirty = 0;
unsigned long len = (pages + HOST_LONG_BITS - 1) / HOST_LONG_BITS;
/* Number of target pages covered by one host page. */
unsigned long hpratio = qemu_real_host_page_size() / TARGET_PAGE_SIZE;
unsigned long page = BIT_WORD(start >> TARGET_PAGE_BITS);
/* start address is aligned at the start of a word? */
/*
 * Fast path: additionally requires hpratio == 1, so that whole bitmap
 * words can be ORed straight into the dirty-memory blocks.
 */
if ((((page * BITS_PER_LONG) << TARGET_PAGE_BITS) == start) &&
(hpratio == 1)) {
unsigned long **blocks[DIRTY_MEMORY_NUM];
unsigned long idx;
unsigned long offset;
long k;
long nr = BITS_TO_LONGS(pages);
/* idx selects the dirty-memory block, offset the word inside it. */
idx = (start >> TARGET_PAGE_BITS) / DIRTY_MEMORY_BLOCK_SIZE;
offset = BIT_WORD((start >> TARGET_PAGE_BITS) %
DIRTY_MEMORY_BLOCK_SIZE);
WITH_RCU_READ_LOCK_GUARD() {
for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
blocks[i] =
qatomic_rcu_read(&ram_list.dirty_memory[i])->blocks;
}
for (k = 0; k < nr; k++) {
if (bitmap[k]) {
unsigned long temp = ldn_le_p(&bitmap[k],
sizeof(bitmap[k]));
nbits = ctpopl(temp);
/* The VGA client is updated unconditionally. */
qatomic_or(&blocks[DIRTY_MEMORY_VGA][idx][offset], temp);
/* The migration client only while global_dirty_tracking is set. */
if (global_dirty_tracking) {
qatomic_or(
&blocks[DIRTY_MEMORY_MIGRATION][idx][offset],
temp);
if (unlikely(
global_dirty_tracking & GLOBAL_DIRTY_DIRTY_RATE)) {
total_dirty_pages += nbits;
}
}
num_dirty += nbits;
/* The code client only when TCG is enabled. */
if (tcg_enabled()) {
qatomic_or(&blocks[DIRTY_MEMORY_CODE][idx][offset],
temp);
}
}
/* Advance one word; wrap into the next dirty-memory block. */
if (++offset >= BITS_TO_LONGS(DIRTY_MEMORY_BLOCK_SIZE)) {
offset = 0;
idx++;
}
}
}
if (xen_enabled()) {
xen_hvm_modified_memory(start, pages << TARGET_PAGE_BITS);
}
} else {
/*
 * Slow path: walk the set bits one by one and mark the matching
 * range dirty for the selected clients.
 */
uint8_t clients = tcg_enabled() ? DIRTY_CLIENTS_ALL
: DIRTY_CLIENTS_NOCODE;
if (!global_dirty_tracking) {
clients &= ~(1 << DIRTY_MEMORY_MIGRATION);
}
/*
* bitmap-traveling is faster than memory-traveling (for addr...)
* especially when most of the memory is not dirty.
*/
for (i = 0; i < len; i++) {
if (bitmap[i] != 0) {
c = ldn_le_p(&bitmap[i], sizeof(bitmap[i]));
nbits = ctpopl(c);
if (unlikely(global_dirty_tracking & GLOBAL_DIRTY_DIRTY_RATE)) {
total_dirty_pages += nbits;
}
num_dirty += nbits;
do {
/* Take the lowest set bit and clear it from the working copy. */
j = ctzl(c);
c &= ~(1ul << j);
/* Each bitmap bit stands for hpratio consecutive target pages. */
page_number = (i * HOST_LONG_BITS + j) * hpratio;
addr = page_number * TARGET_PAGE_SIZE;
ram_addr = start + addr;
physical_memory_set_dirty_range(ram_addr,
TARGET_PAGE_SIZE * hpratio, clients);
} while (c != 0);
}
}
}
return num_dirty;
}
/* Forward declarations of the subpage helpers called by register_subpage(). */
static int subpage_register(subpage_t *mmio, uint32_t start, uint32_t end,
uint16_t section);
static subpage_t *subpage_init(FlatView *fv, hwaddr base);
/*
 * Append a copy of @section to @map, taking a reference on its
 * MemoryRegion, and return the index of the new entry.
 */
static uint16_t phys_section_add(PhysPageMap *map,
                                 MemoryRegionSection *section)
{
    const bool full = map->sections_nb == map->sections_nb_alloc;

    if (full) {
        /* Double the capacity, starting from at least 16 entries. */
        map->sections_nb_alloc = MAX(map->sections_nb_alloc * 2, 16);
        map->sections = g_renew(MemoryRegionSection, map->sections,
                                map->sections_nb_alloc);
    }

    map->sections[map->sections_nb] = *section;
    memory_region_ref(section->mr);

    return map->sections_nb++;
}
/*
 * Drop the reference held on @mr; when @mr is the iomem region of a
 * subpage, also unref that object and free the containing subpage_t.
 */
static void phys_section_destroy(MemoryRegion *mr)
{
    /* Read the flag before dropping our reference, as callers rely on. */
    const bool is_subpage = mr->subpage;
    subpage_t *subpage;

    memory_region_unref(mr);

    if (!is_subpage) {
        return;
    }

    subpage = container_of(mr, subpage_t, iomem);
    object_unref(OBJECT(&subpage->iomem));
    g_free(subpage);
}
/*
 * Destroy every section of @map, from the last entry down to the first,
 * then free the sections and nodes arrays.
 */
static void phys_sections_free(PhysPageMap *map)
{
    while (map->sections_nb > 0) {
        map->sections_nb--;
        phys_section_destroy(map->sections[map->sections_nb].mr);
    }

    g_free(map->sections);
    g_free(map->nodes);
}
/*
 * Register @section, which covers only part of a target page, with the
 * subpage container of that page.  The container is created and installed
 * in the dispatch map if the page does not have one yet.
 */
static void register_subpage(FlatView *fv, MemoryRegionSection *section)
{
    AddressSpaceDispatch *d = flatview_to_dispatch(fv);
    const hwaddr page_base = section->offset_within_address_space
                             & TARGET_PAGE_MASK;
    MemoryRegionSection *existing = phys_page_find(d, page_base);
    MemoryRegionSection subsection = {
        .offset_within_address_space = page_base,
        .size = int128_make64(TARGET_PAGE_SIZE),
    };
    subpage_t *subpage;
    hwaddr first, last;
    uint16_t section_index;

    assert(existing->mr->subpage || existing->mr == &io_mem_unassigned);

    if (existing->mr->subpage) {
        /* The page already has a subpage container: reuse it. */
        subpage = container_of(existing->mr, subpage_t, iomem);
    } else {
        subpage = subpage_init(fv, page_base);
        subsection.fv = fv;
        subsection.mr = &subpage->iomem;
        phys_page_set(d, page_base >> TARGET_PAGE_BITS, 1,
                      phys_section_add(&d->map, &subsection));
    }

    /* Inclusive byte range of the section within its page. */
    first = section->offset_within_address_space & ~TARGET_PAGE_MASK;
    last = first + int128_get64(section->size) - 1;

    section_index = phys_section_add(&d->map, section);
    subpage_register(subpage, first, last, section_index);
}
/*
 * Register @section, which spans one or more whole target pages, in the
 * dispatch map of @fv.
 */
static void register_multipage(FlatView *fv,
                               MemoryRegionSection *section)
{
    AddressSpaceDispatch *d = flatview_to_dispatch(fv);
    const hwaddr first_page =
        section->offset_within_address_space >> TARGET_PAGE_BITS;
    uint16_t section_index = phys_section_add(&d->map, section);
    Int128 pages = int128_rshift(section->size, TARGET_PAGE_BITS);
    uint64_t num_pages = int128_get64(pages);

    assert(num_pages);
    phys_page_set(d, first_page, num_pages, section_index);
}
/*
* The range in *section* may look like this:
*
* |s|PPPPPPP|s|
*
* where s stands for subpage and P for page.
*/
void flatview_add_to_dispatch(FlatView *fv, MemoryRegionSection *section)
{
MemoryRegionSection remain = *section;
Int128 page_size = int128_make64(TARGET_PAGE_SIZE);
/* register first subpage */
if (remain.offset_within_address_space & ~TARGET_PAGE_MASK) {
uint64_t left = TARGET_PAGE_ALIGN(remain.offset_within_address_space)
- remain.offset_within_address_space;
MemoryRegionSection now = remain;
now.size = int128_min(int128_make64(left), now.size);
register_subpage(fv, &now);
if (int128_eq(remain.size, now.size)) {
return;
}
remain.size = int128_sub(remain.size, now.size);
remain.offset_within_address_space += int128_get64(now.size);
remain.offset_within_region += int128_get64(now.size);
}
/* register whole pages */
if (int128_ge(remain.size, page_size)) {
MemoryRegionSection now = remain;
now.size = int128_and(now.size, int128_neg(page_size));
register_multipage(fv, &now);
if (int128_eq(remain.size, now.size)) {
return;
}
remain.size = int128_sub(remain.size, now.size);
remain.offset_within_address_space += int128_get64(now.size);
remain.offset_within_region += int128_get64(now.size);
}
/* register last subpage */
register_subpage(fv, &remain);
}
/* Flush the KVM coalesced MMIO buffer; does nothing when KVM is not enabled. */
void qemu_flush_coalesced_mmio_buffer(void)
{
    if (!kvm_enabled()) {
        return;
    }

    kvm_flush_coalesced_mmio_buffer();
}
/* Acquire ram_list.mutex. */
void qemu_mutex_lock_ramlist(void)
{
qemu_mutex_lock(&ram_list.mutex);
}
/* Release ram_list.mutex. */
void qemu_mutex_unlock_ramlist(void)
{
qemu_mutex_unlock(&ram_list.mutex);
}
/*
 * Build a table describing every RAMBlock: one header line followed by one
 * line per block (name, page size, offset, used and maximum length, host
 * address, read-only state).
 *
 * Returns a newly allocated GString.
 */
GString *ram_block_format(void)
{
    GString *out = g_string_new("");
    RAMBlock *rb;

    RCU_READ_LOCK_GUARD();

    g_string_append_printf(out, "%24s %8s %18s %18s %18s %18s %3s\n",
                           "Block Name", "PSize", "Offset", "Used", "Total",
                           "HVA", "RO");

    RAMBLOCK_FOREACH(rb) {
        g_autofree char *psize = size_to_str(rb->page_size);
        const char *access = rb->mr->readonly ? "ro" : "rw";

        g_string_append_printf(out, "%24s %8s 0x%016" PRIx64 " 0x%016" PRIx64
                               " 0x%016" PRIx64 " 0x%016" PRIx64 " %3s\n",
                               rb->idstr, psize,
                               (uint64_t)rb->offset,
                               (uint64_t)rb->used_length,
                               (uint64_t)rb->max_length,
                               (uint64_t)(uintptr_t)rb->host,
                               access);
    }

    return out;
}
/*
 * object_child_foreach() callback: lower the long pointed to by @opaque to
 * the page size of @obj when @obj is a mapped memory backend with a smaller
 * page size.  Always returns 0.
 */
static int find_min_backend_pagesize(Object *obj, void *opaque)
{
    long *smallest = opaque;
    HostMemoryBackend *backend;
    long hpsize;

    if (!object_dynamic_cast(obj, TYPE_MEMORY_BACKEND)) {
        return 0;
    }

    backend = MEMORY_BACKEND(obj);
    hpsize = host_memory_backend_pagesize(backend);
    if (host_memory_backend_is_mapped(backend) && hpsize < *smallest) {
        *smallest = hpsize;
    }

    return 0;
}
/*
 * object_child_foreach() callback: raise the long pointed to by @opaque to
 * the page size of @obj when @obj is a mapped memory backend with a larger
 * page size.  Always returns 0.
 */
static int find_max_backend_pagesize(Object *obj, void *opaque)
{
    long *largest = opaque;
    HostMemoryBackend *backend;
    long hpsize;

    if (!object_dynamic_cast(obj, TYPE_MEMORY_BACKEND)) {
        return 0;
    }

    backend = MEMORY_BACKEND(obj);
    hpsize = host_memory_backend_pagesize(backend);
    if (host_memory_backend_is_mapped(backend) && hpsize > *largest) {
        *largest = hpsize;
    }

    return 0;
}
/*
 * Return the smallest page size among the mapped memory backends found
 * under "/objects", or LONG_MAX when there is none.
 *
 * TODO: We assume right now that all mapped host memory backends are
 * used as RAM, however some might be used for different purposes.
 */
long qemu_minrampagesize(void)
{
    Object *memdev_root = object_resolve_path("/objects", NULL);
    long smallest = LONG_MAX;

    object_child_foreach(memdev_root, find_min_backend_pagesize, &smallest);

    return smallest;
}
/*
 * Return the largest page size among the mapped memory backends found
 * under "/objects", or 0 when there is none.
 */
long qemu_maxrampagesize(void)
{
    Object *memdev_root = object_resolve_path("/objects", NULL);
    long largest = 0;

    object_child_foreach(memdev_root, find_max_backend_pagesize, &largest);

    return largest;
}
#if defined(CONFIG_POSIX) && !defined(EMSCRIPTEN)
static int64_t get_file_size(int fd)
{
memory: fetch pmem size in get_file_size() Neither stat(2) nor lseek(2) report the size of Linux devdax pmem character device nodes. Commit 314aec4a6e06844937f1677f6cba21981005f389 ("hostmem-file: reject invalid pmem file sizes") added code to hostmem-file.c to fetch the size from sysfs and compare against the user-provided size=NUM parameter: if (backend->size > size) { error_setg(errp, "size property %" PRIu64 " is larger than " "pmem file \"%s\" size %" PRIu64, backend->size, fb->mem_path, size); return; } It turns out that exec.c:qemu_ram_alloc_from_fd() already has an equivalent size check but it skips devdax pmem character devices because lseek(2) returns 0: if (file_size > 0 && file_size < size) { error_setg(errp, "backing store %s size 0x%" PRIx64 " does not match 'size' option 0x" RAM_ADDR_FMT, mem_path, file_size, size); return NULL; } This patch moves the devdax pmem file size code into get_file_size() so that we check the memory size in a single place: qemu_ram_alloc_from_fd(). This simplifies the code and makes it more general. This also fixes the problem that hostmem-file only checks the devdax pmem file size when the pmem=on parameter is given. An unchecked size=NUM parameter can lead to SIGBUS in QEMU so we must always fetch the file size for Linux devdax pmem character device nodes. Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> Message-Id: <20190830093056.12572-1-stefanha@redhat.com> Reviewed-by: Eduardo Habkost <ehabkost@redhat.com> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2019-08-30 10:30:56 +01:00
int64_t size;
#if defined(__linux__)
struct stat st;
if (fstat(fd, &st) < 0) {
return -errno;
}
/* Special handling for devdax character devices */
if (S_ISCHR(st.st_mode)) {
g_autofree char *subsystem_path = NULL;
g_autofree char *subsystem = NULL;
subsystem_path = g_strdup_printf("/sys/dev/char/%d:%d/subsystem",
major(st.st_rdev), minor(st.st_rdev));
subsystem = g_file_read_link(subsystem_path, NULL);
if (subsystem && g_str_has_suffix(subsystem, "/dax")) {
g_autofree char *size_path = NULL;
g_autofree char *size_str = NULL;
size_path = g_strdup_printf("/sys/dev/char/%d:%d/size",
major(st.st_rdev), minor(st.st_rdev));
if (g_file_get_contents(size_path, &size_str, NULL, NULL)) {
return g_ascii_strtoll(size_str, NULL, 0);
}
}
}
#endif /* defined(__linux__) */
/* st.st_size may be zero for special files yet lseek(2) works */
size = lseek(fd, 0, SEEK_END);
if (size < 0) {
return -errno;
}
return size;
}
/*
 * Return the alignment of the devdax region backing @fd.
 *
 * Returns -1 when no alignment could be determined (including builds
 * without Linux/libdaxctl support, and failure to create a daxctl context),
 * or a negative errno value when fstat() or realpath() fails.
 */
static int64_t get_file_align(int fd)
{
    int64_t align = -1;
#if defined(__linux__) && defined(CONFIG_LIBDAXCTL)
    struct stat st;

    if (fstat(fd, &st) < 0) {
        return -errno;
    }

    /* Special handling for devdax character devices */
    if (S_ISCHR(st.st_mode)) {
        g_autofree char *path =
            g_strdup_printf("/sys/dev/char/%d:%d",
                            major(st.st_rdev), minor(st.st_rdev));
        g_autofree char *rpath = realpath(path, NULL);
        struct daxctl_region *region;
        struct daxctl_ctx *ctx;

        if (!rpath) {
            return -errno;
        }

        if (daxctl_new(&ctx)) {
            return -1;
        }

        /* Use the first region whose path occurs in the resolved path. */
        daxctl_region_foreach(ctx, region) {
            if (strstr(rpath, daxctl_region_get_path(region))) {
                align = daxctl_region_get_align(region);
                break;
            }
        }
        daxctl_unref(ctx);
    }
#endif /* defined(__linux__) && defined(CONFIG_LIBDAXCTL) */

    return align;
}
/*
 * Open, or create, the backing file for a RAM region.
 *
 * @path:        an existing file, a file to be created, or a directory in
 *               which an unlinked temporary file is created
 * @region_name: used to build the temporary file name in the directory case
 * @readonly:    open with O_RDONLY; nothing is ever created in this mode
 * @created:     set to true only when @path itself was newly created
 *
 * Returns a file descriptor, or a negative errno value on failure.
 *
 * errno is captured right after each failing call: the close() and g_free()
 * calls that follow may overwrite it before it is examined.
 */
static int file_ram_open(const char *path,
                         const char *region_name,
                         bool readonly,
                         bool *created)
{
    char *filename;
    char *sanitized_name;
    char *c;
    int fd = -1;
    int err;

    *created = false;
    for (;;) {
        fd = open(path, readonly ? O_RDONLY : O_RDWR);
        if (fd >= 0) {
            /*
             * open(O_RDONLY) won't fail with EISDIR. Check manually if we
             * opened a directory and fail similarly to how we fail ENOENT
             * in readonly mode. Note that mkstemp() would imply O_RDWR.
             */
            if (readonly) {
                struct stat file_stat;

                if (fstat(fd, &file_stat)) {
                    /* Save errno before close() can clobber it. */
                    err = errno;
                    close(fd);
                    if (err == EINTR) {
                        continue;
                    }
                    return -err;
                } else if (S_ISDIR(file_stat.st_mode)) {
                    close(fd);
                    return -EISDIR;
                }
            }
            /* @path names an existing file, use it */
            break;
        }

        err = errno;
        if (err == ENOENT) {
            if (readonly) {
                /* Refuse to create new, readonly files. */
                return -ENOENT;
            }
            /* @path names a file that doesn't exist, create it */
            fd = open(path, O_RDWR | O_CREAT | O_EXCL, 0644);
            if (fd >= 0) {
                *created = true;
                break;
            }
            err = errno;
        } else if (err == EISDIR) {
            /* @path names a directory, create a file there */
            /* Make name safe to use with mkstemp by replacing '/' with '_'. */
            sanitized_name = g_strdup(region_name);
            for (c = sanitized_name; *c != '\0'; c++) {
                if (*c == '/') {
                    *c = '_';
                }
            }

            filename = g_strdup_printf("%s/qemu_back_mem.%s.XXXXXX", path,
                                       sanitized_name);
            g_free(sanitized_name);

            fd = mkstemp(filename);
            if (fd >= 0) {
                unlink(filename);
                g_free(filename);
                break;
            }
            /* Save errno before g_free() can clobber it. */
            err = errno;
            g_free(filename);
        }

        if (err != EEXIST && err != EINTR) {
            return -err;
        }
        /*
         * Try again on EINTR and EEXIST.  The latter happens when
         * something else creates the file between our two open().
         */
    }

    return fd;
}
/*
 * Map @memory bytes of the file @fd, starting at @offset, as the host
 * memory of @block.
 *
 * Sets block->page_size from the file, validates and raises
 * block->mr->align, optionally grows the file with ftruncate() when
 * @truncate is set, and records @fd / @offset in @block on success.
 *
 * Returns the mapped area, or NULL with @errp set on failure.
 */
static void *file_ram_alloc(RAMBlock *block,
                            ram_addr_t memory,
                            int fd,
                            bool truncate,
                            off_t offset,
                            Error **errp)
{
    uint32_t qemu_map_flags = 0;
    void *area;

    block->page_size = qemu_fd_getpagesize(fd);

    if (block->mr->align % block->page_size) {
        error_setg(errp, "alignment 0x%" PRIx64
                   " must be multiples of page size 0x%zx",
                   block->mr->align, block->page_size);
        return NULL;
    }
    if (block->mr->align && !is_power_of_2(block->mr->align)) {
        error_setg(errp, "alignment 0x%" PRIx64
                   " must be a power of two", block->mr->align);
        return NULL;
    }
    if (offset % block->page_size) {
        error_setg(errp, "offset 0x%" PRIx64
                   " must be multiples of page size 0x%zx",
                   offset, block->page_size);
        return NULL;
    }

    block->mr->align = MAX(block->page_size, block->mr->align);
#if defined(__s390x__)
    if (kvm_enabled()) {
        block->mr->align = MAX(block->mr->align, QEMU_VMALLOC_ALIGN);
    }
#endif

    if (memory < block->page_size) {
        error_setg(errp, "memory size 0x" RAM_ADDR_FMT " must be equal to "
                   "or larger than page size 0x%zx",
                   memory, block->page_size);
        return NULL;
    }
    memory = ROUND_UP(memory, block->page_size);

    /*
     * ftruncate is not supported by hugetlbfs in older
     * hosts, so don't bother bailing out on errors.
     * If anything goes wrong with it under other filesystems,
     * mmap will fail.
     *
     * Do not truncate the non-empty backend file to avoid corrupting
     * the existing data in the file. Disabling shrinking is not
     * enough. For example, the current vNVDIMM implementation stores
     * the guest NVDIMM labels at the end of the backend file. If the
     * backend file is later extended, QEMU will not be able to find
     * those labels. Therefore, extending the non-empty backend file
     * is disabled as well.
     */
    if (truncate && ftruncate(fd, offset + memory)) {
        perror("ftruncate");
    }

    /* Translate the RAMBlock flags into qemu_ram_mmap() flags. */
    if (block->flags & RAM_READONLY) {
        qemu_map_flags |= QEMU_MAP_READONLY;
    }
    if (block->flags & RAM_SHARED) {
        qemu_map_flags |= QEMU_MAP_SHARED;
    }
    if (block->flags & RAM_PMEM) {
        qemu_map_flags |= QEMU_MAP_SYNC;
    }
    if (block->flags & RAM_NORESERVE) {
        qemu_map_flags |= QEMU_MAP_NORESERVE;
    }

    area = qemu_ram_mmap(fd, memory, block->mr->align, qemu_map_flags, offset);
    if (area == MAP_FAILED) {
        error_setg_errno(errp, errno,
                         "unable to map backing store for guest RAM");
        return NULL;
    }

    block->fd = fd;
    block->fd_offset = offset;
    return area;
}
#endif
/*
 * Allocate space within the ram_addr_t space that governs the
 * dirty bitmaps.
 *
 * Picks the smallest gap following an existing block that can hold @size
 * bytes and returns its start; returns 0 when there is no block yet.
 * Aborts if no gap is large enough.
 *
 * Called with the ramlist lock held.
 */
static ram_addr_t find_ram_offset(ram_addr_t size)
{
    RAMBlock *block, *other;
    ram_addr_t best = RAM_ADDR_MAX;
    ram_addr_t mingap = RAM_ADDR_MAX;

    assert(size != 0); /* it would hand out same offset multiple times */

    if (QLIST_EMPTY_RCU(&ram_list.blocks)) {
        return 0;
    }

    RAMBLOCK_FOREACH(block) {
        ram_addr_t candidate, gap;
        ram_addr_t next = RAM_ADDR_MAX;

        /*
         * Align blocks to start on a 'long' in the bitmap
         * which makes the bitmap sync'ing take the fast path.
         */
        candidate = block->offset + block->max_length;
        candidate = ROUND_UP(candidate, BITS_PER_LONG << TARGET_PAGE_BITS);

        /* Locate the closest block starting at or after the candidate. */
        RAMBLOCK_FOREACH(other) {
            if (other->offset >= candidate && other->offset < next) {
                next = other->offset;
            }
        }

        /*
         * If it fits remember our place and remember the size
         * of gap, but keep going so that we might find a smaller
         * gap to fill so avoiding fragmentation.
         */
        gap = next - candidate;
        if (gap >= size && gap < mingap) {
            best = candidate;
            mingap = gap;
        }

        trace_find_ram_offset_loop(size, candidate, best, next, mingap);
    }

    if (best == RAM_ADDR_MAX) {
        fprintf(stderr, "Failed to find gap of requested size: %" PRIu64 "\n",
                (uint64_t)size);
        abort();
    }

    trace_find_ram_offset(size, best);

    return best;
}
/*
 * Exclude [addr, addr + size) from core dumps when the machine is
 * configured not to dump guest memory.  A failing madvise is only
 * reported on stderr.
 */
static void qemu_ram_setup_dump(void *addr, ram_addr_t size)
{
    if (machine_dump_guest_core(current_machine)) {
        /* The user wants the guest memory in the core: nothing to do. */
        return;
    }

    /* Use MADV_DONTDUMP, if user doesn't want the guest memory in the core */
    if (qemu_madvise(addr, size, QEMU_MADV_DONTDUMP)) {
        perror("qemu_madvise");
        fprintf(stderr, "madvise doesn't support MADV_DONTDUMP, "
                        "but dump-guest-core=off specified\n");
    }
}
/* Return the idstr field of @rb; the string is not copied. */
const char *qemu_ram_get_idstr(RAMBlock *rb)
{
return rb->idstr;
}
/* Return the host field of @rb. */
void *qemu_ram_get_host_addr(RAMBlock *rb)
{
return rb->host;
}
/* Return the offset field of @rb. */
ram_addr_t qemu_ram_get_offset(RAMBlock *rb)
{
return rb->offset;
}
/* Return the fd_offset field of @rb. */
ram_addr_t qemu_ram_get_fd_offset(RAMBlock *rb)
{
return rb->fd_offset;
}
/* Return the used_length field of @rb. */
ram_addr_t qemu_ram_get_used_length(RAMBlock *rb)
{
return rb->used_length;
}
/* Return the max_length field of @rb. */
ram_addr_t qemu_ram_get_max_length(RAMBlock *rb)
{
return rb->max_length;
}
/* Return true when the RAM_SHARED flag is set on @rb. */
bool qemu_ram_is_shared(RAMBlock *rb)
{
return rb->flags & RAM_SHARED;
}
/* Return true when the RAM_NORESERVE flag is set on @rb. */
bool qemu_ram_is_noreserve(RAMBlock *rb)
{
return rb->flags & RAM_NORESERVE;
}
/*
 * Return true when the RAM_UF_ZEROPAGE flag is set on @rb.
 * Note: Only set at the start of postcopy
 */
bool qemu_ram_is_uf_zeroable(RAMBlock *rb)
{
return rb->flags & RAM_UF_ZEROPAGE;
}
/* Set the RAM_UF_ZEROPAGE flag on @rb. */
void qemu_ram_set_uf_zeroable(RAMBlock *rb)
{
rb->flags |= RAM_UF_ZEROPAGE;
}
/* Return true when the RAM_MIGRATABLE flag is set on @rb. */
bool qemu_ram_is_migratable(RAMBlock *rb)
{
return rb->flags & RAM_MIGRATABLE;
}
/* Set the RAM_MIGRATABLE flag on @rb. */
void qemu_ram_set_migratable(RAMBlock *rb)
{
rb->flags |= RAM_MIGRATABLE;
}
/* Clear the RAM_MIGRATABLE flag on @rb. */
void qemu_ram_unset_migratable(RAMBlock *rb)
{
rb->flags &= ~RAM_MIGRATABLE;
}
/* Return true when the RAM_NAMED_FILE flag is set on @rb. */
bool qemu_ram_is_named_file(RAMBlock *rb)
{
return rb->flags & RAM_NAMED_FILE;
}
/* Return the fd field of @rb. */
int qemu_ram_get_fd(RAMBlock *rb)
{
return rb->fd;
}
/*
 * Give @new_block its identifier: "<device path>/<name>" when @dev has a
 * device path, plain @name otherwise.  The block must not have an
 * identifier yet.  Aborts if another RAMBlock already uses the same one.
 *
 * Called with the BQL held.
 */
void qemu_ram_set_idstr(RAMBlock *new_block, const char *name, DeviceState *dev)
{
    RAMBlock *other;
    char *dev_path;

    assert(new_block);
    assert(!new_block->idstr[0]);

    dev_path = dev ? qdev_get_dev_path(dev) : NULL;
    if (dev_path) {
        snprintf(new_block->idstr, sizeof(new_block->idstr), "%s/", dev_path);
        g_free(dev_path);
    }
    pstrcat(new_block->idstr, sizeof(new_block->idstr), name);

    RCU_READ_LOCK_GUARD();
    RAMBLOCK_FOREACH(other) {
        if (other != new_block &&
            strcmp(other->idstr, new_block->idstr) == 0) {
            fprintf(stderr, "RAMBlock \"%s\" already registered, abort!\n",
                    new_block->idstr);
            abort();
        }
    }
}
/*
 * Erase the identifier of @block; a NULL @block is ignored.
 *
 * Called with the BQL held.
 */
void qemu_ram_unset_idstr(RAMBlock *block)
{
    /* FIXME: arch_init.c assumes that this is not called throughout
     * migration.  Ignore the problem since hot-unplug during migration
     * does not work anyway.
     */
    if (!block) {
        return;
    }

    memset(block->idstr, 0, sizeof(block->idstr));
}
/*
 * Build the name of @mr: "<device path>/<region name>" when the region has
 * a device with a device path, otherwise just the region name.
 *
 * Returns a newly allocated string.
 */
static char *cpr_name(MemoryRegion *mr)
{
    const char *mr_name = memory_region_name(mr);
    g_autofree char *id = mr->dev ? qdev_get_dev_path(mr->dev) : NULL;

    return id ? g_strdup_printf("%s/%s", id, mr_name) : g_strdup(mr_name);
}
/* Return the page_size field of @rb. */
size_t qemu_ram_pagesize(RAMBlock *rb)
{
return rb->page_size;
}
/* Returns the largest size of page in use, or 0 when there is no RAMBlock */
size_t qemu_ram_pagesize_largest(void)
{
    RAMBlock *rb;
    size_t best = 0;

    RAMBLOCK_FOREACH(rb) {
        size_t cur = qemu_ram_pagesize(rb);

        if (cur > best) {
            best = cur;
        }
    }

    return best;
}
/*
 * Advise the host that [addr, addr + len) is mergeable, unless memory
 * merging was disabled by the user.
 *
 * Returns 0 when merging is disabled, else the qemu_madvise() result.
 */
static int memory_try_enable_merging(void *addr, size_t len)
{
    return machine_mem_merge(current_machine)
           ? qemu_madvise(addr, len, QEMU_MADV_MERGEABLE)
           : 0; /* disabled by the user */
}
/*
 * Resize the used length of @block to @newsize.
 *
 * @newsize is rounded up to target and host page granularity for the
 * RAMBlock; the MemoryRegion and the block's resized callback are given
 * the unrounded value.
 *
 * Resizing RAM while migrating can result in the migration being canceled.
 * Care has to be taken if the guest might have already detected the memory.
 *
 * As memory core doesn't know how is memory accessed, it is up to
 * resize callback to update device state and/or add assertions to detect
 * misuse, if necessary.
 *
 * Returns 0 on success.  Returns -EINVAL with @errp set when the block is
 * not resizeable or the rounded size exceeds block->max_length.
 */
int qemu_ram_resize(RAMBlock *block, ram_addr_t newsize, Error **errp)
{
    const ram_addr_t unaligned_size = newsize;
    ram_addr_t oldsize;

    /* Check the pointer before its first dereference. */
    assert(block);
    oldsize = block->used_length;

    newsize = TARGET_PAGE_ALIGN(newsize);
    newsize = REAL_HOST_PAGE_ALIGN(newsize);

    if (block->used_length == newsize) {
        /*
         * We don't have to resize the ram block (which only knows aligned
         * sizes), however, we have to notify if the unaligned size changed.
         */
        if (unaligned_size != memory_region_size(block->mr)) {
            memory_region_set_size(block->mr, unaligned_size);
            if (block->resized) {
                block->resized(block->idstr, unaligned_size, block->host);
            }
        }
        return 0;
    }

    if (!(block->flags & RAM_RESIZEABLE)) {
        error_setg_errno(errp, EINVAL,
                         "Size mismatch: %s: 0x" RAM_ADDR_FMT
                         " != 0x" RAM_ADDR_FMT, block->idstr,
                         newsize, block->used_length);
        return -EINVAL;
    }

    if (block->max_length < newsize) {
        error_setg_errno(errp, EINVAL,
                         "Size too large: %s: 0x" RAM_ADDR_FMT
                         " > 0x" RAM_ADDR_FMT, block->idstr,
                         newsize, block->max_length);
        return -EINVAL;
    }

    /* Notify before modifying the ram block and touching the bitmaps. */
    if (block->host) {
        ram_block_notify_resize(block->host, oldsize, newsize);
    }

    /* Clear the old range, then mark the whole new range dirty. */
    physical_memory_clear_dirty_range(block->offset, block->used_length);
    block->used_length = newsize;
    physical_memory_set_dirty_range(block->offset, block->used_length,
                                    DIRTY_CLIENTS_ALL);
    memory_region_set_size(block->mr, unaligned_size);
    if (block->resized) {
        block->resized(block->idstr, unaligned_size, block->host);
    }
    return 0;
}
/*
 * Synchronously write back @length bytes of @block, starting at block
 * offset @start, to the backing store if there is one; otherwise a no-op.
 *
 * With CONFIG_LIBPMEM, pmem blocks are persisted with pmem_persist();
 * any other block with a file descriptor goes through qemu_msync(), whose
 * failure is reported as a warning only.
 */
void qemu_ram_msync(RAMBlock *block, ram_addr_t start, ram_addr_t length)
{
    void *addr;

    /* The requested range should fit in within the block range */
    g_assert((start + length) <= block->used_length);

#ifdef CONFIG_LIBPMEM
    /* The lack of support for pmem should not block the sync */
    if (ram_block_is_pmem(block)) {
        pmem_persist(ramblock_ptr(block, start), length);
        return;
    }
#endif

    if (block->fd < 0) {
        /* No backing file: nothing to sync. */
        return;
    }

    /*
     * Case there is no support for PMEM or the memory has not been
     * specified as persistent (or is not one) - use the msync.
     * Less optimal but still achieves the same goal
     */
    addr = ramblock_ptr(block, start);
    if (qemu_msync(addr, length, block->fd)) {
        warn_report("%s: failed to sync memory range: start: "
                    RAM_ADDR_FMT " length: " RAM_ADDR_FMT,
                    __func__, start, length);
    }
}
/* Called with ram_list.mutex held */
softmmu/physmem: fix memory leak in dirty_memory_extend() As reported by Peter, we might be leaking memory when removing the highest RAMBlock (in the weird ram_addr_t space), and adding a new one. We will fail to realize that we already allocated bitmaps for more dirty memory blocks, and effectively discard the pointers to them. Fix it by getting rid of last_ram_page() and by remembering the number of dirty memory blocks that have been allocated already. While at it, let's use "unsigned int" for the number of blocks, which should be sufficient until we reach ~32 exabytes. Looks like this leak was introduced as we switched from using a single bitmap_zero_extend() to allocating multiple bitmaps: bitmap_zero_extend() relies on g_renew() which should have taken care of this. Resolves: https://lkml.kernel.org/r/CAFEAcA-k7a+VObGAfCFNygQNfCKL=AfX6A4kScq=VSSK0peqPg@mail.gmail.com Reported-by: Peter Maydell <peter.maydell@linaro.org> Fixes: 5b82b703b69a ("memory: RCU ram_list.dirty_memory[] for safe RAM hotplug") Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> Reviewed-by: Peter Xu <peterx@redhat.com> Tested-by: Peter Maydell <peter.maydell@linaro.org> Cc: qemu-stable@nongnu.org Cc: Stefan Hajnoczi <stefanha@redhat.com> Cc: Paolo Bonzini <pbonzini@redhat.com> Cc: Peter Xu <peterx@redhat.com> Cc: Philippe Mathieu-Daudé <philmd@linaro.org> Signed-off-by: David Hildenbrand <david@redhat.com> Link: https://lore.kernel.org/r/20240828090743.128647-1-david@redhat.com Signed-off-by: Peter Xu <peterx@redhat.com>
2024-08-28 11:07:43 +02:00
static void dirty_memory_extend(ram_addr_t new_ram_size)
{
softmmu/physmem: fix memory leak in dirty_memory_extend() As reported by Peter, we might be leaking memory when removing the highest RAMBlock (in the weird ram_addr_t space), and adding a new one. We will fail to realize that we already allocated bitmaps for more dirty memory blocks, and effectively discard the pointers to them. Fix it by getting rid of last_ram_page() and by remembering the number of dirty memory blocks that have been allocated already. While at it, let's use "unsigned int" for the number of blocks, which should be sufficient until we reach ~32 exabytes. Looks like this leak was introduced as we switched from using a single bitmap_zero_extend() to allocating multiple bitmaps: bitmap_zero_extend() relies on g_renew() which should have taken care of this. Resolves: https://lkml.kernel.org/r/CAFEAcA-k7a+VObGAfCFNygQNfCKL=AfX6A4kScq=VSSK0peqPg@mail.gmail.com Reported-by: Peter Maydell <peter.maydell@linaro.org> Fixes: 5b82b703b69a ("memory: RCU ram_list.dirty_memory[] for safe RAM hotplug") Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> Reviewed-by: Peter Xu <peterx@redhat.com> Tested-by: Peter Maydell <peter.maydell@linaro.org> Cc: qemu-stable@nongnu.org Cc: Stefan Hajnoczi <stefanha@redhat.com> Cc: Paolo Bonzini <pbonzini@redhat.com> Cc: Peter Xu <peterx@redhat.com> Cc: Philippe Mathieu-Daudé <philmd@linaro.org> Signed-off-by: David Hildenbrand <david@redhat.com> Link: https://lore.kernel.org/r/20240828090743.128647-1-david@redhat.com Signed-off-by: Peter Xu <peterx@redhat.com>
2024-08-28 11:07:43 +02:00
unsigned int old_num_blocks = ram_list.num_dirty_blocks;
unsigned int new_num_blocks = DIV_ROUND_UP(new_ram_size,
DIRTY_MEMORY_BLOCK_SIZE);
int i;
/* Only need to extend if block count increased */
if (new_num_blocks <= old_num_blocks) {
return;
}
for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
DirtyMemoryBlocks *old_blocks;
DirtyMemoryBlocks *new_blocks;
int j;
old_blocks = qatomic_rcu_read(&ram_list.dirty_memory[i]);
new_blocks = g_malloc(sizeof(*new_blocks) +
sizeof(new_blocks->blocks[0]) * new_num_blocks);
if (old_num_blocks) {
memcpy(new_blocks->blocks, old_blocks->blocks,
old_num_blocks * sizeof(old_blocks->blocks[0]));
}
for (j = old_num_blocks; j < new_num_blocks; j++) {
new_blocks->blocks[j] = bitmap_new(DIRTY_MEMORY_BLOCK_SIZE);
}
qatomic_rcu_set(&ram_list.dirty_memory[i], new_blocks);
if (old_blocks) {
g_free_rcu(old_blocks, rcu);
}
}
softmmu/physmem: fix memory leak in dirty_memory_extend() As reported by Peter, we might be leaking memory when removing the highest RAMBlock (in the weird ram_addr_t space), and adding a new one. We will fail to realize that we already allocated bitmaps for more dirty memory blocks, and effectively discard the pointers to them. Fix it by getting rid of last_ram_page() and by remembering the number of dirty memory blocks that have been allocated already. While at it, let's use "unsigned int" for the number of blocks, which should be sufficient until we reach ~32 exabytes. Looks like this leak was introduced as we switched from using a single bitmap_zero_extend() to allocating multiple bitmaps: bitmap_zero_extend() relies on g_renew() which should have taken care of this. Resolves: https://lkml.kernel.org/r/CAFEAcA-k7a+VObGAfCFNygQNfCKL=AfX6A4kScq=VSSK0peqPg@mail.gmail.com Reported-by: Peter Maydell <peter.maydell@linaro.org> Fixes: 5b82b703b69a ("memory: RCU ram_list.dirty_memory[] for safe RAM hotplug") Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> Reviewed-by: Peter Xu <peterx@redhat.com> Tested-by: Peter Maydell <peter.maydell@linaro.org> Cc: qemu-stable@nongnu.org Cc: Stefan Hajnoczi <stefanha@redhat.com> Cc: Paolo Bonzini <pbonzini@redhat.com> Cc: Peter Xu <peterx@redhat.com> Cc: Philippe Mathieu-Daudé <philmd@linaro.org> Signed-off-by: David Hildenbrand <david@redhat.com> Link: https://lore.kernel.org/r/20240828090743.128647-1-david@redhat.com Signed-off-by: Peter Xu <peterx@redhat.com>
2024-08-28 11:07:43 +02:00
ram_list.num_dirty_blocks = new_num_blocks;
}
/*
 * Set up @new_block and link it into ram_list.
 *
 * The block gets a ram_addr_t offset from find_ram_offset().  If it has no
 * host memory yet, memory is obtained through xen_ram_alloc() (Xen) or
 * qemu_anon_ram_alloc().  For RAM_GUEST_MEMFD blocks a guest_memfd and the
 * RamBlockAttributes are created as well.  Finally the dirty bitmaps are
 * extended, the block is inserted into the size-sorted list and the add
 * notifiers are run.
 *
 * When one of the setup steps fails, an error is stored in @errp, the block
 * is not linked into the list, and anything acquired here (anonymous host
 * memory, discard requirement, guest_memfd) is released again.  The callers
 * in this file free @new_block themselves in that case.
 *
 * The ramlist mutex is taken and released internally; every exit path
 * leaves it unlocked.
 */
static void ram_block_add(RAMBlock *new_block, Error **errp)
{
    const bool noreserve = qemu_ram_is_noreserve(new_block);
    const bool shared = qemu_ram_is_shared(new_block);
    RAMBlock *block;
    RAMBlock *last_block = NULL;
    bool free_on_error = false;
    ram_addr_t ram_size;
    Error *err = NULL;

    qemu_mutex_lock_ramlist();
    new_block->offset = find_ram_offset(new_block->max_length);

    if (!new_block->host) {
        if (xen_enabled()) {
            xen_ram_alloc(new_block->offset, new_block->max_length,
                          new_block->mr, &err);
            if (err) {
                error_propagate(errp, err);
                qemu_mutex_unlock_ramlist();
                return;
            }
        } else {
            new_block->host = qemu_anon_ram_alloc(new_block->max_length,
                                                  &new_block->mr->align,
                                                  shared, noreserve);
            if (!new_block->host) {
                error_setg_errno(errp, errno,
                                 "cannot set up guest memory '%s'",
                                 memory_region_name(new_block->mr));
                qemu_mutex_unlock_ramlist();
                return;
            }
            memory_try_enable_merging(new_block->host, new_block->max_length);
            /* From here on the host memory is ours to release on failure. */
            free_on_error = true;
        }
    }

    if (new_block->flags & RAM_GUEST_MEMFD) {
        int ret;

        if (!kvm_enabled()) {
            error_setg(errp, "cannot set up private guest memory for %s: KVM required",
                       object_get_typename(OBJECT(current_machine->cgs)));
            goto out_free;
        }
        assert(new_block->guest_memfd < 0);

        ret = ram_block_coordinated_discard_require(true);
        if (ret < 0) {
            error_setg_errno(errp, -ret,
                             "cannot set up private guest memory: discard currently blocked");
            error_append_hint(errp, "Are you using assigned devices?\n");
            goto out_free;
        }

        new_block->guest_memfd = kvm_create_guest_memfd(new_block->max_length,
                                                        0, errp);
        if (new_block->guest_memfd < 0) {
            /* The discard requirement taken above must be dropped again. */
            goto out_discard;
        }

        /*
         * The attribute bitmap of the RamBlockAttributes is default to
         * discarded, which mimics the behavior of kvm_set_phys_mem() when it
         * calls kvm_set_memory_attributes_private().  This leads to a brief
         * period of inconsistency between the creation of the RAMBlock and its
         * mapping into the physical address space.  However, this is not
         * problematic, as no users rely on the attribute status to perform
         * any actions during this interval.
         */
        new_block->attributes = ram_block_attributes_create(new_block);
        if (!new_block->attributes) {
            error_setg(errp, "Failed to create ram block attribute");
            goto out_close_memfd;
        }

        /*
         * Add a specific guest_memfd blocker if a generic one would not be
         * added by ram_block_add_cpr_blocker.
         *
         * NOTE(review): the result of migrate_add_blocker_modes() is not
         * checked; if it can fail and set @errp, the block is still inserted
         * below while callers treat a set error as failure -- confirm.
         */
        if (ram_is_cpr_compatible(new_block)) {
            error_setg(&new_block->cpr_blocker,
                       "Memory region %s uses guest_memfd, "
                       "which is not supported with CPR.",
                       memory_region_name(new_block->mr));
            migrate_add_blocker_modes(&new_block->cpr_blocker,
                                      BIT(MIG_MODE_CPR_TRANSFER), errp);
        }
    }

    ram_size = (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS;
    dirty_memory_extend(ram_size);

    /*
     * Keep the list sorted from biggest to smallest block.  Unlike QTAILQ,
     * QLIST (which has an RCU-friendly variant) does not have insertion at
     * tail, so save the last element in last_block.
     */
    RAMBLOCK_FOREACH(block) {
        last_block = block;
        if (block->max_length < new_block->max_length) {
            break;
        }
    }
    if (block) {
        QLIST_INSERT_BEFORE_RCU(block, new_block, next);
    } else if (last_block) {
        QLIST_INSERT_AFTER_RCU(last_block, new_block, next);
    } else { /* list is empty */
        QLIST_INSERT_HEAD_RCU(&ram_list.blocks, new_block, next);
    }
    ram_list.mru_block = NULL;

    /* Write list before version */
    smp_wmb();
    ram_list.version++;
    qemu_mutex_unlock_ramlist();

    physical_memory_set_dirty_range(new_block->offset,
                                    new_block->used_length,
                                    DIRTY_CLIENTS_ALL);

    if (new_block->host) {
        qemu_ram_setup_dump(new_block->host, new_block->max_length);
        qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_HUGEPAGE);
        /*
         * MADV_DONTFORK is also needed by KVM in absence of synchronous MMU
         * Configure it unless the machine is a qtest server, in which case
         * KVM is not used and it may be forked (eg for fuzzing purposes).
         */
        if (!qtest_enabled()) {
            qemu_madvise(new_block->host, new_block->max_length,
                         QEMU_MADV_DONTFORK);
        }
        ram_block_notify_add(new_block->host, new_block->used_length,
                             new_block->max_length);
    }
    return;

    /*
     * Error unwinding for the guest_memfd setup.  All labels are entered
     * with the ramlist mutex held; it is released exactly once in out_free.
     */
out_close_memfd:
    close(new_block->guest_memfd);
out_discard:
    ram_block_coordinated_discard_require(false);
out_free:
    qemu_mutex_unlock_ramlist();
    if (free_on_error) {
        qemu_anon_ram_free(new_block->host, new_block->max_length);
        new_block->host = NULL;
    }
}
#if defined(CONFIG_POSIX) && !defined(EMSCRIPTEN)
/*
 * Create a RAMBlock for @mr that is backed by the open file descriptor @fd.
 *
 * @size:      initial used length; rounded up to target and host page size
 * @max_size:  maximum length; rounded up the same way, must be >= @size
 * @resized:   resize callback stored in the block
 * @ram_flags: RAM_* flags; RAM_PRIVATE is stripped before being stored
 * @offset:    offset into the file
 * @grow:      when true, a non-empty file smaller than @offset + @max_size
 *             is not treated as an error
 * @errp:      error destination
 *
 * Not supported with Xen, nor with KVM lacking a synchronous MMU.
 *
 * Returns the new block, or NULL with @errp set.  The descriptor is not
 * closed here on failure.
 */
RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, ram_addr_t max_size,
                                 qemu_ram_resize_cb resized, MemoryRegion *mr,
                                 uint32_t ram_flags, int fd, off_t offset,
                                 bool grow,
                                 Error **errp)
{
    ERRP_GUARD();
    RAMBlock *rb;
    Error *add_err = NULL;
    int64_t file_size, file_align, share_flags;
    bool file_is_short;

    share_flags = ram_flags & (RAM_PRIVATE | RAM_SHARED);
    assert(share_flags != (RAM_SHARED | RAM_PRIVATE));
    ram_flags &= ~RAM_PRIVATE;

    /* Just support these ram flags by now. */
    assert((ram_flags & ~(RAM_SHARED | RAM_PMEM | RAM_NORESERVE |
                          RAM_PROTECTED | RAM_NAMED_FILE | RAM_READONLY |
                          RAM_READONLY_FD | RAM_GUEST_MEMFD |
                          RAM_RESIZEABLE)) == 0);
    assert(max_size >= size);

    if (xen_enabled()) {
        error_setg(errp, "-mem-path not supported with Xen");
        return NULL;
    }

    if (kvm_enabled() && !kvm_has_sync_mmu()) {
        error_setg(errp,
                   "host lacks kvm mmu notifiers, -mem-path unsupported");
        return NULL;
    }

    /* Both lengths must be multiples of the target and host page size. */
    size = REAL_HOST_PAGE_ALIGN(TARGET_PAGE_ALIGN(size));
    max_size = REAL_HOST_PAGE_ALIGN(TARGET_PAGE_ALIGN(max_size));

    file_size = get_file_size(fd);
    file_is_short = file_size < offset + max_size;
    if (file_size && file_is_short && !grow) {
        error_setg(errp, "%s backing store size 0x%" PRIx64
                   " is too small for 'size' option 0x" RAM_ADDR_FMT
                   " plus 'offset' option 0x%" PRIx64,
                   memory_region_name(mr), file_size, max_size,
                   (uint64_t)offset);
        return NULL;
    }

    file_align = get_file_align(fd);
    if (file_align > 0 && file_align > mr->align) {
        error_setg(errp, "backing store align 0x%" PRIx64
                   " is larger than 'align' option 0x%" PRIx64,
                   file_align, mr->align);
        return NULL;
    }

    rb = g_malloc0(sizeof(*rb));
    rb->mr = mr;
    rb->used_length = size;
    rb->max_length = max_size;
    rb->resized = resized;
    rb->flags = ram_flags;
    rb->guest_memfd = -1;

    /*
     * NOTE(review): the "file is short" flag is presumably what makes
     * file_ram_alloc() extend the file -- confirm against its definition.
     */
    rb->host = file_ram_alloc(rb, max_size, fd, file_is_short, offset, errp);
    if (!rb->host) {
        g_free(rb);
        return NULL;
    }

    ram_block_add(rb, &add_err);
    if (add_err) {
        g_free(rb);
        error_propagate(errp, add_err);
        return NULL;
    }
    return rb;
}
/*
 * Create a RAMBlock of @size bytes for @mr, backed by the file @mem_path
 * starting at @offset.
 *
 * The file is opened with file_ram_open(), read-only if RAM_READONLY_FD is
 * set in @ram_flags.  If opening fails with EACCES for a writable, private
 * (non-RAM_SHARED) mapping, a read-only open is tried just to find out
 * whether a hint about using a read-only backing store can be attached to
 * the error; the allocation still fails in that case.
 *
 * Returns the new block, or NULL with @errp set.  On failure after the file
 * was opened, the descriptor is closed and the file is unlinked if
 * file_ram_open() reported having created it.
 */
RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
                                   uint32_t ram_flags, const char *mem_path,
                                   off_t offset, Error **errp)
{
    /*
     * Required because error_append_hint() is used on @errp below: without
     * the guard the hint would never be attached when the caller passes
     * &error_fatal.
     */
    ERRP_GUARD();
    int fd;
    bool created;
    RAMBlock *block;

    fd = file_ram_open(mem_path, memory_region_name(mr),
                       !!(ram_flags & RAM_READONLY_FD), &created);
    if (fd < 0) {
        error_setg_errno(errp, -fd, "can't open backing store %s for guest RAM",
                         mem_path);
        if (!(ram_flags & RAM_READONLY_FD) && !(ram_flags & RAM_SHARED) &&
            fd == -EACCES) {
            /*
             * If we can open the file R/O (note: will never create a new file)
             * and we are dealing with a private mapping, there are still ways
             * to consume such files and get RAM instead of ROM.
             */
            fd = file_ram_open(mem_path, memory_region_name(mr), true,
                               &created);
            if (fd < 0) {
                return NULL;
            }
            assert(!created);
            /* The probe descriptor is only used to decide on the hint. */
            close(fd);
            error_append_hint(errp, "Consider opening the backing store"
                " read-only but still creating writable RAM using"
                " '-object memory-backend-file,readonly=on,rom=off...'"
                " (see \"VM templating\" documentation)\n");
        }
        return NULL;
    }

    block = qemu_ram_alloc_from_fd(size, size, NULL, mr, ram_flags, fd, offset,
                                   false, errp);
    if (!block) {
        /* Do not leave behind a file that did not exist before the call. */
        if (created) {
            unlink(mem_path);
        }
        close(fd);
        return NULL;
    }

    return block;
}
#endif
#ifdef CONFIG_POSIX
/*
 * Create MAP_SHARED RAMBlocks by mmap'ing a file descriptor, so it can be
 * shared with another process if CPR is being used.  Use memfd if available
 * because it has no size limits, else use POSIX shm.
 *
 * A descriptor already registered under @name with CPR is returned as is
 * and *@reused is set to true.  Otherwise a new descriptor is created,
 * registered with CPR when creation succeeded, and *@reused is set to false.
 *
 * Returns the descriptor, or the negative value of the failed creation call.
 */
static int qemu_ram_get_shared_fd(const char *name, bool *reused, Error **errp)
{
    int fd = cpr_find_fd(name, 0);
    const bool found = fd >= 0;

    if (!found) {
        fd = qemu_memfd_check(0) ? qemu_memfd_create(name, 0, 0, 0, 0, errp)
                                 : qemu_shm_alloc(0, errp);
        if (fd >= 0) {
            cpr_save_fd(name, 0, fd);
        }
    }

    *reused = found;
    return fd;
}
#endif
/*
 * Common backend of the qemu_ram_alloc*() entry points.
 *
 * @size/@max_size: initial and maximum length; both are rounded up to the
 *                  larger of host and target page size
 * @resized:        resize callback stored in the block
 * @host:           caller-provided memory; must be set exactly when
 *                  RAM_PREALLOC is set in @ram_flags
 * @ram_flags:      RAM_* flags; RAM_PRIVATE is stripped before being stored
 *
 * Without @host, on POSIX (non-emscripten) builds, a RAM_SHARED block is
 * first attempted on top of a shared file descriptor; if that allocation
 * fails the function falls back to the generic path below.
 *
 * Returns the new block, or NULL with @errp set.
 */
static
RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
                                  qemu_ram_resize_cb resized,
                                  void *host, uint32_t ram_flags,
                                  MemoryRegion *mr, Error **errp)
{
    RAMBlock *rb;
    Error *add_err = NULL;
    int alignment, share_flags;

    share_flags = ram_flags & (RAM_PRIVATE | RAM_SHARED);
    assert(share_flags != (RAM_SHARED | RAM_PRIVATE));
    ram_flags &= ~RAM_PRIVATE;

    assert((ram_flags & ~(RAM_SHARED | RAM_RESIZEABLE | RAM_PREALLOC |
                          RAM_NORESERVE | RAM_GUEST_MEMFD)) == 0);
    assert(!host ^ (ram_flags & RAM_PREALLOC));
    assert(max_size >= size);

    /* ignore RAM_SHARED for Windows and emscripten*/
#if defined(CONFIG_POSIX) && !defined(EMSCRIPTEN)
    /* Neither shared nor private requested: apply the machine default. */
    if (!host && !share_flags && current_machine->aux_ram_share) {
        ram_flags |= RAM_SHARED;
    }

    if (!host && (ram_flags & RAM_SHARED)) {
        bool reused;
        g_autofree char *name = cpr_name(mr);
        int fd = qemu_ram_get_shared_fd(name, &reused, errp);

        if (fd < 0) {
            return NULL;
        }

        /* Use same alignment as qemu_anon_ram_alloc */
        mr->align = QEMU_VMALLOC_ALIGN;

        /*
         * This can fail if the shm mount size is too small, or alloc from
         * fd is not supported, but previous QEMU versions that called
         * qemu_anon_ram_alloc for anonymous shared memory could have
         * succeeded.  Quietly fail and fall back.
         *
         * After cpr-transfer, new QEMU could create a memory region
         * with a larger max size than old, so pass reused to grow the
         * region if necessary.  The extra space will be usable after a
         * guest reset.
         */
        rb = qemu_ram_alloc_from_fd(size, max_size, resized, mr,
                                    ram_flags, fd, 0, reused, NULL);
        if (rb) {
            trace_qemu_ram_alloc_shared(name, rb->used_length,
                                        rb->max_length, fd,
                                        rb->host);
            return rb;
        }

        cpr_delete_fd(name, 0);
        close(fd);
        /* fall back to anon allocation */
    }
#endif

    alignment = qemu_real_host_page_size();
    alignment = MAX(alignment, TARGET_PAGE_SIZE);
    size = ROUND_UP(size, alignment);
    max_size = ROUND_UP(max_size, alignment);

    rb = g_malloc0(sizeof(*rb));
    rb->mr = mr;
    rb->resized = resized;
    rb->used_length = size;
    rb->max_length = max_size;
    rb->fd = -1;
    rb->guest_memfd = -1;
    rb->page_size = qemu_real_host_page_size();
    rb->host = host;
    rb->flags = ram_flags;

    ram_block_add(rb, &add_err);
    if (add_err) {
        g_free(rb);
        error_propagate(errp, add_err);
        return NULL;
    }
    return rb;
}
/*
 * Create a fixed-size RAMBlock for @mr on top of the caller-provided memory
 * @host (RAM_PREALLOC).  Returns NULL with @errp set on failure.
 */
RAMBlock *qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
                                  MemoryRegion *mr, Error **errp)
{
    const uint32_t ram_flags = RAM_PREALLOC;

    /* Used length and maximum length are the same: not resizeable. */
    return qemu_ram_alloc_internal(size, size, NULL, host, ram_flags, mr,
                                   errp);
}
/*
 * Create a fixed-size RAMBlock of @size bytes for @mr whose memory is
 * allocated by QEMU.  Only RAM_SHARED, RAM_NORESERVE, RAM_GUEST_MEMFD and
 * RAM_PRIVATE may be set in @ram_flags.  Returns NULL with @errp set on
 * failure.
 */
RAMBlock *qemu_ram_alloc(ram_addr_t size, uint32_t ram_flags,
                         MemoryRegion *mr, Error **errp)
{
    const uint32_t allowed_flags = RAM_SHARED | RAM_NORESERVE |
                                   RAM_GUEST_MEMFD | RAM_PRIVATE;

    assert((ram_flags & ~allowed_flags) == 0);

    return qemu_ram_alloc_internal(size, size, NULL, NULL, ram_flags, mr,
                                   errp);
}
/*
 * Create a RAM_RESIZEABLE RAMBlock for @mr with initial length @size and
 * maximum length @maxsz; @resized is stored as the block's resize callback.
 * Returns NULL with @errp set on failure.
 */
RAMBlock *qemu_ram_alloc_resizeable(ram_addr_t size, ram_addr_t maxsz,
                                    qemu_ram_resize_cb resized,
                                    MemoryRegion *mr, Error **errp)
{
    const uint32_t ram_flags = RAM_RESIZEABLE;

    return qemu_ram_alloc_internal(size, maxsz, resized, NULL, ram_flags, mr,
                                   errp);
}
/*
 * Release everything owned by @block and free the block itself.
 *
 * Host memory of RAM_PREALLOC blocks is left alone.  Otherwise it is
 * released according to how it was obtained: Xen map cache entry, file
 * mapping (the descriptor is closed too), or anonymous memory.  A
 * guest_memfd, if present, is torn down together with the block attributes
 * and the coordinated-discard requirement.
 *
 * Passed to call_rcu() by qemu_ram_free().
 */
static void reclaim_ramblock(RAMBlock *block)
{
    if (!(block->flags & RAM_PREALLOC)) {
        if (xen_enabled()) {
            xen_invalidate_map_cache_entry(block->host);
#if !defined(_WIN32) && !defined(EMSCRIPTEN)
        } else if (block->fd >= 0) {
            qemu_ram_munmap(block->fd, block->host, block->max_length);
            close(block->fd);
#endif
        } else {
            qemu_anon_ram_free(block->host, block->max_length);
        }
    }

    if (block->guest_memfd >= 0) {
        ram_block_attributes_destroy(block->attributes);
        close(block->guest_memfd);
        ram_block_coordinated_discard_require(false);
    }

    g_free(block);
}
/*
 * Remove @block from ram_list and schedule it for reclamation.
 *
 * A NULL @block is ignored.  Remove notifiers run first (only if the block
 * has host memory).  Under the ramlist mutex the block's CPR descriptor
 * entry is deleted, the block is unlinked from the list and the list
 * version is bumped; the actual release is deferred to reclaim_ramblock()
 * via call_rcu().
 */
void qemu_ram_free(RAMBlock *block)
{
    g_autofree char *name = NULL;

    if (block == NULL) {
        return;
    }

    if (block->host != NULL) {
        ram_block_notify_remove(block->host, block->used_length,
                                block->max_length);
    }

    qemu_mutex_lock_ramlist();

    name = cpr_name(block->mr);
    cpr_delete_fd(name, 0);

    QLIST_REMOVE_RCU(block, next);
    ram_list.mru_block = NULL;

    /* Write list before version */
    smp_wmb();
    ram_list.version += 1;

    call_rcu(block, reclaim_ramblock, rcu);

    qemu_mutex_unlock_ramlist();
}
#ifndef _WIN32
/*
 * Replace @length bytes of @block's host mapping, starting @start bytes
 * into it, with a fresh anonymous mapping at the same address (MAP_FIXED).
 *
 * Sharing, reservation and write permission of the new mapping follow the
 * block's RAM_SHARED, RAM_NORESERVE and RAM_READONLY flags.  Only valid for
 * blocks without a file descriptor.
 *
 * Returns 0 on success, -errno if mmap() did not return the requested
 * address.
 */
static int qemu_ram_remap_mmap(RAMBlock *block, uint64_t start, size_t length)
{
    void *wanted = block->host + start;
    int flags = MAP_FIXED | MAP_ANONYMOUS;
    int prot = PROT_READ;

    assert(block->fd < 0);

    if (block->flags & RAM_SHARED) {
        flags |= MAP_SHARED;
    } else {
        flags |= MAP_PRIVATE;
    }
    if (block->flags & RAM_NORESERVE) {
        flags |= MAP_NORESERVE;
    }
    if (!(block->flags & RAM_READONLY)) {
        prot |= PROT_WRITE;
    }

    if (mmap(wanted, length, prot, flags, -1, 0) != wanted) {
        return -errno;
    }
    return 0;
}
/*
 * qemu_ram_remap - remap a single RAM page
 *
 * @addr: address in ram_addr_t address space.
 *
 * Remaps the one page of guest RAM that contains @addr, which discards its
 * contents; this is how previously poisoned memory (MCE) is recovered.  The
 * page size is that of the owning RAMBlock (e.g. hugetlb), and @addr may
 * point anywhere inside the page.
 *
 * Nothing is remapped for RAM_PREALLOC blocks, and the function aborts when
 * running under Xen.
 *
 * This function is only to be used during system resets; it will kill the
 * VM if remapping failed.
 */
void qemu_ram_remap(ram_addr_t addr)
{
    RAMBlock *block;

    RAMBLOCK_FOREACH(block) {
        uint64_t offset = addr - block->offset;
        size_t page_size;
        void *vaddr;

        if (offset >= block->max_length) {
            /* @addr does not belong to this block. */
            continue;
        }

        /* Respect the pagesize of our RAMBlock */
        page_size = qemu_ram_pagesize(block);
        offset = QEMU_ALIGN_DOWN(offset, page_size);
        vaddr = ramblock_ptr(block, offset);

        if (block->flags & RAM_PREALLOC) {
            break;
        }
        if (xen_enabled()) {
            abort();
        }

        if (ram_block_discard_range(block, offset, page_size) != 0) {
            /*
             * Fall back to using mmap() only for anonymous mapping,
             * as if a backing file is associated we may not be able
             * to recover the memory in all cases.
             * So don't take the risk of using only mmap and fail now.
             */
            if (block->fd >= 0) {
                error_report("Could not remap RAM %s:%" PRIx64 "+%"
                             PRIx64 " +%zx", block->idstr, offset,
                             block->fd_offset, page_size);
                exit(1);
            }
            if (qemu_ram_remap_mmap(block, offset, page_size) != 0) {
                error_report("Could not remap RAM %s:%" PRIx64 " +%zx",
                             block->idstr, offset, page_size);
                exit(1);
            }
        }

        memory_try_enable_merging(vaddr, page_size);
        qemu_ram_setup_dump(vaddr, page_size);
        break;
    }
}
#endif /* !_WIN32 */
/*
 * Return a host pointer to guest's ram.
 * For Xen, foreign mappings get created if they don't already exist.
 *
 * @block: block for the RAM to lookup (optional and may be NULL).  When
 *         NULL, @addr is a ram_addr_t and the block is looked up from it.
 * @addr: address within the memory region.
 * @size: pointer to requested size (optional and may be NULL).
 *        size may get modified and return a value smaller than
 *        what was requested.  A requested size of 0 yields NULL.
 * @lock: whether to lock the mapping in xen-mapcache until invalidated.
 * @is_write: hint whether to map RW or RO in the xen-mapcache.
 *            (optional and may always be set to true).
 *
 * Called within RCU critical section.
 */
static void *qemu_ram_ptr_length(RAMBlock *block, ram_addr_t addr,
                                 hwaddr *size, bool lock,
                                 bool is_write)
{
    hwaddr map_len = 0;

    if (size != NULL && *size == 0) {
        return NULL;
    }

    if (!block) {
        block = qemu_get_ram_block(addr);
        addr -= block->offset;
    }

    if (size != NULL) {
        /* Never hand out more than what is left in the block. */
        *size = MIN(*size, block->max_length - addr);
        map_len = *size;
    }

    if (!xen_enabled() || block->host != NULL) {
        return ramblock_ptr(block, addr);
    }

    /*
     * We need to check if the requested address is in the RAM
     * because we don't want to map the entire memory in QEMU.
     * In that case just map the requested area.
     */
    if (xen_mr_is_memory(block->mr)) {
        return xen_map_cache(block->mr, block->offset + addr,
                             map_len, block->offset,
                             lock, lock, is_write);
    }

    block->host = xen_map_cache(block->mr, block->offset,
                                block->max_length,
                                block->offset,
                                1, lock, is_write);
    return ramblock_ptr(block, addr);
}
/*
 * Return a host pointer to ram allocated with qemu_ram_alloc.
 * This should not be used for general purpose DMA.  Use address_space_map
 * or address_space_rw instead.  For local memory (e.g. video ram) that the
 * device owns, use memory_region_get_ram_ptr.
 *
 * Called within RCU critical section.
 */
void *qemu_map_ram_ptr(RAMBlock *ram_block, ram_addr_t addr)
{
    const bool lock = false;
    const bool is_write = true;

    /* No size limit is requested, hence the NULL size pointer. */
    return qemu_ram_ptr_length(ram_block, addr, NULL, lock, is_write);
}
/*
 * Return the offset of the host pointer @host within the ramblock @rb.
 * Asserts that @host lies inside the block's maximum length.
 */
ram_addr_t qemu_ram_block_host_offset(RAMBlock *rb, void *host)
{
    uint8_t *base = (uint8_t *)rb->host;
    uint8_t *ptr = (uint8_t *)host;
    ram_addr_t delta;

    assert((uintptr_t)ptr >= (uintptr_t)base);
    delta = ptr - base;
    assert(delta < rb->max_length);

    return delta;
}
/*
 * Find the RAMBlock whose host mapping contains @ptr.
 *
 * @ptr:          host pointer to look up
 * @round_offset: when true, the returned offset is masked with
 *                TARGET_PAGE_MASK (not applied on the Xen path)
 * @offset:       receives the offset of @ptr inside the block found
 *
 * Under Xen the lookup goes through the map cache.  Otherwise the most
 * recently used block is tried first, then every block in ram_list.
 *
 * Returns the block, or NULL if @ptr is not inside any mapped block.
 */
RAMBlock *qemu_ram_block_from_host(void *ptr, bool round_offset,
                                   ram_addr_t *offset)
{
    RAMBlock *block;
    uint8_t *host = ptr;

    if (xen_enabled()) {
        ram_addr_t ram_addr;

        RCU_READ_LOCK_GUARD();
        ram_addr = xen_ram_addr_from_mapcache(ptr);
        if (ram_addr == RAM_ADDR_INVALID) {
            return NULL;
        }

        block = qemu_get_ram_block(ram_addr);
        if (block) {
            *offset = ram_addr - block->offset;
        }
        return block;
    }

    RCU_READ_LOCK_GUARD();
    block = qatomic_rcu_read(&ram_list.mru_block);
    if (!block || !block->host || host - block->host >= block->max_length) {
        /* The cached block does not match: walk the whole list. */
        RAMBLOCK_FOREACH(block) {
            /* Blocks without a host pointer are not mapped; skip them. */
            if (block->host && host - block->host < block->max_length) {
                break;
            }
        }
        if (!block) {
            return NULL;
        }
    }

    *offset = (host - block->host);
    if (round_offset) {
        *offset &= TARGET_PAGE_MASK;
    }
    return block;
}
/*
 * Finds the named RAMBlock
 *
 * name: The name of RAMBlock to find; compared against each block's idstr
 *
 * Returns: RAMBlock (or NULL if not found)
 */
RAMBlock *qemu_ram_block_by_name(const char *name)
{
    RAMBlock *block;

    RAMBLOCK_FOREACH(block) {
        if (strcmp(name, block->idstr) == 0) {
            break;
        }
    }

    /* The iterator is NULL once the list has been exhausted. */
    return block;
}
/*
 * Some of the system routines need to translate from a host pointer
 * (typically a TLB entry) back to a ram offset.
 *
 * Returns the ram_addr_t corresponding to @ptr, or RAM_ADDR_INVALID if
 * @ptr does not belong to any RAMBlock.
 */
ram_addr_t qemu_ram_addr_from_host(void *ptr)
{
    ram_addr_t offset;
    RAMBlock *block = qemu_ram_block_from_host(ptr, false, &offset);

    return block ? block->offset + offset : RAM_ADDR_INVALID;
}
/*
 * Like qemu_ram_addr_from_host(), but a pointer outside of guest RAM is
 * reported and aborts instead of returning RAM_ADDR_INVALID.
 */
ram_addr_t qemu_ram_addr_from_host_nofail(void *ptr)
{
    ram_addr_t ram_addr = qemu_ram_addr_from_host(ptr);

    if (ram_addr != RAM_ADDR_INVALID) {
        return ram_addr;
    }

    error_report("Bad ram pointer %p", ptr);
    abort();
}
/*
 * Forward declarations of the FlatView accessors that the subpage
 * read/write/accepts handlers call.
 */
static MemTxResult flatview_read(FlatView *fv, hwaddr addr,
MemTxAttrs attrs, void *buf, hwaddr len);
static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
const void *buf, hwaddr len);
static bool flatview_access_valid(FlatView *fv, hwaddr addr, hwaddr len,
bool is_write, MemTxAttrs attrs);
/*
 * MemoryRegionOps read handler for a subpage.
 *
 * Reads @len bytes at @addr (relative to the subpage) through the subpage's
 * FlatView at subpage->base + @addr and stores the value loaded from those
 * bytes in *@data.  A failing flatview_read() result is returned unchanged
 * and leaves *@data untouched.
 */
static MemTxResult subpage_read(void *opaque, hwaddr addr, uint64_t *data,
                                unsigned len, MemTxAttrs attrs)
{
    subpage_t *sp = opaque;
    uint8_t bytes[8];
    MemTxResult res;

    trace_subpage_read(sp, len, addr);

    res = flatview_read(sp->fv, addr + sp->base, attrs, bytes, len);
    if (!res) {
        *data = ldn_p(bytes, len);
        return MEMTX_OK;
    }
    return res;
}
/*
 * MemoryRegionOps write handler for a subpage.
 *
 * Stores the low @len bytes of @value into a byte buffer and writes them
 * through the subpage's FlatView at subpage->base + @addr.  Returns the
 * result of flatview_write().
 */
static MemTxResult subpage_write(void *opaque, hwaddr addr,
                                 uint64_t value, unsigned len, MemTxAttrs attrs)
{
    subpage_t *sp = opaque;
    uint8_t bytes[8];
    hwaddr target;

    trace_subpage_write(sp, len, addr, value);

    stn_p(bytes, len, value);
    target = addr + sp->base;
    return flatview_write(sp->fv, target, attrs, bytes, len);
}
/*
 * MemoryRegionOps "accepts" handler for a subpage: reports whether an
 * access of @len bytes at subpage->base + @addr is valid in the subpage's
 * FlatView.
 */
static bool subpage_accepts(void *opaque, hwaddr addr,
                            unsigned len, bool is_write,
                            MemTxAttrs attrs)
{
    subpage_t *sp = opaque;
    const char direction = is_write ? 'w' : 'r';
    hwaddr target;

    trace_subpage_accepts(sp, direction, len, addr);

    target = addr + sp->base;
    return flatview_access_valid(sp->fv, target, len, is_write, attrs);
}
/*
 * Operations of the I/O region that backs a subpage: accesses of 1 to 8
 * bytes are forwarded to the subpage handlers, native endianness.
 */
static const MemoryRegionOps subpage_ops = {
    .read_with_attrs = subpage_read,
    .write_with_attrs = subpage_write,
    .endianness = DEVICE_NATIVE_ENDIAN,
    .valid = {
        .min_access_size = 1,
        .max_access_size = 8,
        .accepts = subpage_accepts,
    },
    .impl = {
        .min_access_size = 1,
        .max_access_size = 8,
    },
};
/*
 * Assign @section to every sub_section slot of @mmio from the slot of
 * @start up to and including the slot of @end.
 *
 * Returns 0 on success, -1 if @start or @end lies outside the target page.
 */
static int subpage_register(subpage_t *mmio, uint32_t start, uint32_t end,
                            uint16_t section)
{
    int first, last, i;

    if (start >= TARGET_PAGE_SIZE || end >= TARGET_PAGE_SIZE) {
        return -1;
    }

    first = SUBPAGE_IDX(start);
    last = SUBPAGE_IDX(end);
    trace_subpage_register(mmio, start, end, first, last, section);

    for (i = first; i <= last; i++) {
        mmio->sub_section[i] = section;
    }

    return 0;
}
/*
 * Allocate a subpage for the page at @base in @fv and initialise its
 * TARGET_PAGE_SIZE-sized I/O region with subpage_ops.
 */
static subpage_t *subpage_init(FlatView *fv, hwaddr base)
{
    const size_t table_bytes = TARGET_PAGE_SIZE * sizeof(uint16_t);
    subpage_t *sp;

    /* sp->sub_section is set to PHYS_SECTION_UNASSIGNED with g_malloc0 */
    sp = g_malloc0(sizeof(subpage_t) + table_bytes);
    sp->fv = fv;
    sp->base = base;

    memory_region_init_io(&sp->iomem, NULL, &subpage_ops, sp,
                          NULL, TARGET_PAGE_SIZE);
    sp->iomem.subpage = true;

    trace_subpage_init(sp, base, TARGET_PAGE_SIZE);
    return sp;
}
/*
 * Add to @map a section for @mr in @fv that starts at offset 0 and has
 * size int128_2_64(); returns the value of phys_section_add().
 */
static uint16_t dummy_section(PhysPageMap *map, FlatView *fv, MemoryRegion *mr)
{
    MemoryRegionSection whole = {
        .fv = fv,
        .mr = mr,
        .offset_within_address_space = 0,
        .offset_within_region = 0,
        .size = int128_2_64(),
    };

    assert(fv);

    return phys_section_add(map, &whole);
}
/*
 * Initialise the global io_mem_unassigned region, which backs every
 * unassigned address, and allow it to be accessed without the BQL.
 */
static void io_mem_init(void)
{
    memory_region_init_io(&io_mem_unassigned, NULL, &unassigned_mem_ops, NULL,
                          NULL, UINT64_MAX);

    /*
     * Trivially thread-safe since memory accesses are rejected.
     *
     * Lockless I/O matters here: when a PCI device's Bus Master bit is
     * cleared, its DMA address space becomes unassigned memory.  An
     * IOThread accessing it would otherwise try to take the BQL and could
     * deadlock against a vCPU thread that holds the BQL while waiting for
     * that IOThread.
     */
    memory_region_enable_lockless_io(&io_mem_unassigned);
}
/*
 * Create an empty dispatch structure for @fv.
 *
 * The first section added is the unassigned-I/O dummy section, which must
 * land at index PHYS_SECTION_UNASSIGNED; the page map starts out with no
 * nodes.
 */
AddressSpaceDispatch *address_space_dispatch_new(FlatView *fv)
{
    AddressSpaceDispatch *dispatch = g_new0(AddressSpaceDispatch, 1);
    uint16_t unassigned_idx;

    unassigned_idx = dummy_section(&dispatch->map, fv, &io_mem_unassigned);
    assert(unassigned_idx == PHYS_SECTION_UNASSIGNED);

    dispatch->phys_map = (PhysPageEntry) {
        .ptr = PHYS_MAP_NODE_NIL,
        .skip = 1,
    };

    return dispatch;
}
/*
 * Release a dispatch structure: first the sections held by its map, then
 * the structure itself.
 */
void address_space_dispatch_free(AddressSpaceDispatch *d)
{
    PhysPageMap *map = &d->map;

    phys_sections_free(map);
    g_free(d);
}
/*
 * Empty work item.  Scheduling it with run_on_cpu() is used purely to wait
 * until the target CPU gets around to running queued work.
 */
static void do_nothing(CPUState *cpu, run_on_cpu_data d)
{
    /* intentionally empty */
}
/*
 * log_global_after_sync hook of the per-CPU TCG memory listener.
 *
 * Synchronously runs an empty work item on the CPU that owns @listener,
 * unless record/replay is active.
 */
static void tcg_log_global_after_sync(MemoryListener *listener)
{
    CPUAddressSpace *cpuas;

    /*
     * VGA can make calls to this function while updating the screen.
     * In record/replay mode this causes a deadlock, because
     * run_on_cpu waits for rr mutex. Therefore no races are possible
     * in this case and no need for making run_on_cpu when
     * record/replay is enabled.
     */
    if (replay_mode != REPLAY_MODE_NONE) {
        return;
    }

    /*
     * Wait for the CPU to end the current TB.  This avoids the following
     * incorrect race:
     *
     *      vCPU                         migration
     *      ----------------------       -------------------------
     *      TLB check -> slow path
     *        notdirty_mem_write
     *          write to RAM
     *          mark dirty
     *                                   clear dirty flag
     *      TLB check -> fast path
     *                                   read memory
     *        write to RAM
     *
     * by pushing the migration thread's memory read after the vCPU thread
     * has written the memory.
     */
    cpuas = container_of(listener, CPUAddressSpace, tcg_as_listener);
    run_on_cpu(cpuas->cpu, do_nothing, RUN_ON_CPU_NULL);
}
/*
 * Work item used by tcg_commit(): flush the TLB of @cpu.
 * The @data argument is not used.
 */
static void tcg_commit_cpu(CPUState *cpu, run_on_cpu_data data)
{
    tlb_flush(cpu);
}
/*
 * commit hook of the per-CPU TCG memory listener.
 *
 * Arranges for the TLB of the CPU that owns @listener to be flushed after
 * a memory map change.  No dispatch pointer is cached per CPU; the only
 * work left to do here is dropping stale TLB entries.
 */
static void tcg_commit(MemoryListener *listener)
{
    CPUAddressSpace *cpuas;
    CPUState *cpu;

    assert(tcg_enabled());
    /* since each CPU stores ram addresses in its TLB cache, we must
       reset the modified entries */
    cpuas = container_of(listener, CPUAddressSpace, tcg_as_listener);
    cpu = cpuas->cpu;

    /*
     * Queueing the work function will kick the cpu back to
     * the main loop, which will end the RCU critical section and reclaim
     * the memory data structures.
     *
     * That said, the listener is also called during realize, before
     * all of the tcg machinery for run-on is initialized: thus halt_cond.
     */
    if (cpu->halt_cond) {
        async_run_on_cpu(cpu, tcg_commit_cpu, RUN_ON_CPU_HOST_PTR(cpuas));
    } else {
        tcg_commit_cpu(cpu, RUN_ON_CPU_HOST_PTR(cpuas));
    }
}
/*
 * Create the two root regions and their address spaces:
 *  - "system", UINT64_MAX bytes, root of address_space_memory ("memory");
 *  - "io", 65536 bytes with unassigned_io_ops, root of address_space_io
 *    ("I/O").
 */
static void memory_map_init(void)
{
    /* Memory root */
    system_memory = g_malloc(sizeof(*system_memory));
    memory_region_init(system_memory, NULL, "system", UINT64_MAX);
    address_space_init(&address_space_memory, system_memory, "memory");

    /* I/O root */
    system_io = g_malloc(sizeof(*system_io));
    memory_region_init_io(system_io, NULL, &unassigned_io_ops, NULL,
                          "io", 65536);
    address_space_init(&address_space_io, system_io, "I/O");
}
/* Return the root memory region created by memory_map_init(). */
MemoryRegion *get_system_memory(void)
{
    MemoryRegion *root = system_memory;

    return root;
}
/* Return the root I/O region created by memory_map_init(). */
MemoryRegion *get_system_io(void)
{
    MemoryRegion *root = system_io;

    return root;
}
/*
 * Post-write bookkeeping for @length bytes at offset @addr of the RAM
 * region @mr: invalidate translated code covering the range when the code
 * dirty bit is being tracked and the range is still clean, then mark the
 * range dirty for the remaining clients.
 */
static void invalidate_and_set_dirty(MemoryRegion *mr, hwaddr addr,
                                     hwaddr length)
{
    const int code_bit = 1 << DIRTY_MEMORY_CODE;
    uint8_t mask = memory_region_get_dirty_log_mask(mr);
    ram_addr_t region_base = memory_region_get_ram_addr(mr);
    hwaddr ram_start;

    /* We know we're only called for RAM MemoryRegions */
    assert(region_base != RAM_ADDR_INVALID);
    ram_start = addr + region_base;

    /*
     * No early return if the mask is or becomes 0, because
     * physical_memory_set_dirty_range will still call
     * xen_modified_memory.
     */
    if (mask) {
        mask = physical_memory_range_includes_clean(ram_start, length, mask);
    }

    if (mask & code_bit) {
        assert(tcg_enabled());
        tb_invalidate_phys_range(NULL, ram_start, ram_start + length - 1);
        mask &= ~code_bit;
    }

    physical_memory_set_dirty_range(ram_start, length, mask);
}
/*
 * Invalidate translated code and set dirty bits for @size bytes at offset
 * @addr of the ROM device region @mr.
 */
void memory_region_flush_rom_device(MemoryRegion *mr, hwaddr addr, hwaddr size)
{
    bool is_romd = memory_region_is_romd(mr);

    /*
     * In principle this function would work on other memory region types
     * too, but the ROM device use case is the only one where this operation
     * is necessary.  Other memory regions should use the
     * address_space_read/write() APIs.
     */
    assert(is_romd);

    invalidate_and_set_dirty(mr, addr, size);
}
/*
 * Compute the size of a single access to @mr at @addr, given that the
 * caller wants to transfer @l bytes.
 *
 * The result is @l clamped to the region's maximum valid access size
 * (4 when unspecified) and, unless the implementation handles unaligned
 * accesses, to the natural alignment of @addr; it is then rounded down
 * with pow2floor().
 */
int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr)
{
    unsigned limit = mr->ops->valid.max_access_size;
    unsigned size;

    /* Regions are assumed to support 1-4 byte accesses unless
       otherwise specified.  */
    if (limit == 0) {
        limit = 4;
    }

    /* Bound the maximum access by the alignment of the address.  */
    if (!mr->ops->impl.unaligned) {
        /* Lowest set bit of addr; 0 imposes no alignment bound. */
        unsigned natural = addr & -addr;

        if (natural != 0 && natural < limit) {
            limit = natural;
        }
    }

    /* Don't attempt accesses larger than the maximum.  */
    size = l > limit ? limit : l;
    size = pow2floor(size);

    return size;
}
/*
 * Get ready to dispatch an MMIO access to @mr.
 *
 * Takes the BQL when the calling thread does not hold it and the region
 * has not been marked for lockless I/O, and flushes the coalesced MMIO
 * buffer when the region asks for it.
 *
 * Returns: true if this call acquired the BQL, in which case the caller
 * must call bql_unlock() once the access is done; false otherwise.
 */
bool prepare_mmio_access(MemoryRegion *mr)
{
    bool release_lock = false;

    if (!bql_locked() && !mr->lockless_io) {
        bql_lock();
        release_lock = true;
    }
    if (mr->flush_coalesced_mmio) {
        qemu_flush_coalesced_mmio_buffer();
    }

    return release_lock;
}
/**
 * flatview_access_allowed
 * @mr: #MemoryRegion to be accessed
 * @attrs: memory transaction attributes
 * @addr: address within that memory region
 * @len: the number of bytes to access
 *
 * Decide whether a memory transaction may proceed.  Transactions that do
 * not carry the 'memory' attribute are always allowed; those that do are
 * allowed only on RAM regions, and a denial is logged.
 *
 * Returns: true if transaction is allowed, false if denied.
 */
static bool flatview_access_allowed(MemoryRegion *mr, MemTxAttrs attrs,
                                    hwaddr addr, hwaddr len)
{
    if (likely(!attrs.memory) || memory_region_is_ram(mr)) {
        return true;
    }

    qemu_log_mask(LOG_INVALID_MEM,
                  "Invalid access to non-RAM device at "
                  "addr 0x%" HWADDR_PRIX ", size %" HWADDR_PRIu ", "
                  "region '%s'\n", addr, len, memory_region_name(mr));
    return false;
}
/*
 * Perform one step of a FlatView write: store to @mr at @mr_addr from
 * @buf.  On entry *@l is the number of bytes the caller would like to
 * write in this step; on a successful return it holds the number of bytes
 * this step actually handled.  @len is the number of bytes left in @buf.
 *
 * Returns MEMTX_ACCESS_ERROR (with *@l untouched) if the access is not
 * allowed, otherwise the result of the store.
 */
static MemTxResult flatview_write_continue_step(MemTxAttrs attrs,
                                                const uint8_t *buf,
                                                hwaddr len, hwaddr mr_addr,
                                                hwaddr *l, MemoryRegion *mr)
{
    uint64_t val;
    MemTxResult res;
    bool locked_here;

    if (!flatview_access_allowed(mr, attrs, mr_addr, *l)) {
        return MEMTX_ACCESS_ERROR;
    }

    if (memory_access_is_direct(mr, true, attrs)) {
        /* RAM case */
        uint8_t *host = qemu_ram_ptr_length(mr->ram_block, mr_addr, l,
                                            false, true);

        memmove(host, buf, *l);
        invalidate_and_set_dirty(mr, mr_addr, *l);

        return MEMTX_OK;
    }

    /* I/O case: a single access of at most 8 bytes */
    locked_here = prepare_mmio_access(mr);
    *l = memory_access_size(mr, *l, mr_addr);
    /*
     * XXX: could force current_cpu to NULL to avoid
     * potential bugs
     */

    /*
     * Assure Coverity (and ourselves) that we are not going to OVERRUN
     * the buffer by following ldn_he_p().
     */
#ifdef QEMU_STATIC_ANALYSIS
    assert((*l == 1 && len >= 1) ||
           (*l == 2 && len >= 2) ||
           (*l == 4 && len >= 4) ||
           (*l == 8 && len >= 8));
#endif
    val = ldn_he_p(buf, *l);
    res = memory_region_dispatch_write(mr, mr_addr, val,
                                       size_memop(*l), attrs);
    if (locked_here) {
        bql_unlock();
    }

    return res;
}
/*
 * Write @len bytes from @ptr at @addr, starting from an already translated
 * first chunk (@mr, @mr_addr, @l) and re-translating after every step.
 * The results of all steps are OR-ed together.
 *
 * Called within RCU critical section.
 */
static MemTxResult flatview_write_continue(FlatView *fv, hwaddr addr,
                                           MemTxAttrs attrs,
                                           const void *ptr,
                                           hwaddr len, hwaddr mr_addr,
                                           hwaddr l, MemoryRegion *mr)
{
    const uint8_t *src = ptr;
    MemTxResult status = MEMTX_OK;

    for (;;) {
        status |= flatview_write_continue_step(attrs, src, len, mr_addr,
                                               &l, mr);

        /* Advance past what this step consumed. */
        len -= l;
        src += l;
        addr += l;

        if (len == 0) {
            return status;
        }

        /* Translate the next chunk. */
        l = len;
        mr = flatview_translate(fv, addr, &mr_addr, &l, true, attrs);
    }
}
/*
 * Write @len bytes from @buf to @addr in @fv.  The first chunk is
 * translated and checked here; a denied first chunk fails the whole write
 * with MEMTX_ACCESS_ERROR before anything is written.
 *
 * Called from RCU critical section.
 */
static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
                                  const void *buf, hwaddr len)
{
    hwaddr chunk = len;
    hwaddr mr_addr;
    MemoryRegion *mr;

    mr = flatview_translate(fv, addr, &mr_addr, &chunk, true, attrs);
    if (flatview_access_allowed(mr, attrs, mr_addr, chunk)) {
        return flatview_write_continue(fv, addr, attrs, buf, len,
                                       mr_addr, chunk, mr);
    }

    return MEMTX_ACCESS_ERROR;
}
/*
 * Perform one step of a FlatView read: load from @mr at @mr_addr into
 * @buf.  On entry *@l is the number of bytes the caller would like to
 * read in this step; on a successful return it holds the number of bytes
 * this step actually handled.  @len is the room left in @buf.
 *
 * Returns MEMTX_ACCESS_ERROR (with *@l untouched) if the access is not
 * allowed, otherwise the result of the load.
 */
static MemTxResult flatview_read_continue_step(MemTxAttrs attrs, uint8_t *buf,
                                               hwaddr len, hwaddr mr_addr,
                                               hwaddr *l,
                                               MemoryRegion *mr)
{
    uint64_t val;
    MemTxResult res;
    bool locked_here;

    if (!flatview_access_allowed(mr, attrs, mr_addr, *l)) {
        return MEMTX_ACCESS_ERROR;
    }

    if (memory_access_is_direct(mr, false, attrs)) {
        /* RAM case */
        uint8_t *host = qemu_ram_ptr_length(mr->ram_block, mr_addr, l,
                                            false, false);

        memcpy(buf, host, *l);

        return MEMTX_OK;
    }

    /* I/O case */
    locked_here = prepare_mmio_access(mr);
    *l = memory_access_size(mr, *l, mr_addr);
    res = memory_region_dispatch_read(mr, mr_addr, &val, size_memop(*l),
                                      attrs);

    /*
     * Assure Coverity (and ourselves) that we are not going to OVERRUN
     * the buffer by following stn_he_p().
     */
#ifdef QEMU_STATIC_ANALYSIS
    assert((*l == 1 && len >= 1) ||
           (*l == 2 && len >= 2) ||
           (*l == 4 && len >= 4) ||
           (*l == 8 && len >= 8));
#endif
    stn_he_p(buf, *l, val);

    if (locked_here) {
        bql_unlock();
    }

    return res;
}
/*
 * Read @len bytes at @addr into @ptr, starting from an already translated
 * first chunk (@mr, @mr_addr, @l) and re-translating after every step.
 * The results of all steps are OR-ed together.
 *
 * Called within RCU critical section.
 */
MemTxResult flatview_read_continue(FlatView *fv, hwaddr addr,
                                   MemTxAttrs attrs, void *ptr,
                                   hwaddr len, hwaddr mr_addr, hwaddr l,
                                   MemoryRegion *mr)
{
    uint8_t *dst = ptr;
    MemTxResult status = MEMTX_OK;

    fuzz_dma_read_cb(addr, len, mr);

    for (;;) {
        status |= flatview_read_continue_step(attrs, dst, len, mr_addr,
                                              &l, mr);

        /* Advance past what this step produced. */
        len -= l;
        dst += l;
        addr += l;

        if (len == 0) {
            return status;
        }

        /* Translate the next chunk. */
        l = len;
        mr = flatview_translate(fv, addr, &mr_addr, &l, false, attrs);
    }
}
/*
 * Read @len bytes at @addr in @fv into @buf.  The first chunk is
 * translated and checked here; a denied first chunk fails the whole read
 * with MEMTX_ACCESS_ERROR before anything is read.
 *
 * Called from RCU critical section.
 */
static MemTxResult flatview_read(FlatView *fv, hwaddr addr,
                                 MemTxAttrs attrs, void *buf, hwaddr len)
{
    hwaddr chunk = len;
    hwaddr mr_addr;
    MemoryRegion *mr;

    mr = flatview_translate(fv, addr, &mr_addr, &chunk, false, attrs);
    if (flatview_access_allowed(mr, attrs, mr_addr, chunk)) {
        return flatview_read_continue(fv, addr, attrs, buf, len,
                                      mr_addr, chunk, mr);
    }

    return MEMTX_ACCESS_ERROR;
}
/*
 * Read @len bytes at @addr of @as into @buf.  A zero-length read does
 * nothing and returns MEMTX_OK; otherwise the read goes through the
 * address space's current FlatView under the RCU read lock.
 */
MemTxResult address_space_read_full(AddressSpace *as, hwaddr addr,
                                    MemTxAttrs attrs, void *buf, hwaddr len)
{
    if (len > 0) {
        RCU_READ_LOCK_GUARD();
        FlatView *view = address_space_to_flatview(as);

        return flatview_read(view, addr, attrs, buf, len);
    }

    return MEMTX_OK;
}
/*
 * Write @len bytes from @buf to @addr of @as.  A zero-length write does
 * nothing and returns MEMTX_OK; otherwise the write goes through the
 * address space's current FlatView under the RCU read lock.
 */
MemTxResult address_space_write(AddressSpace *as, hwaddr addr,
                                MemTxAttrs attrs,
                                const void *buf, hwaddr len)
{
    if (len > 0) {
        RCU_READ_LOCK_GUARD();
        FlatView *view = address_space_to_flatview(as);

        return flatview_write(view, addr, attrs, buf, len);
    }

    return MEMTX_OK;
}
/*
 * Direction-selectable access: forwards to address_space_write() when
 * @is_write is true and to address_space_read_full() otherwise.
 */
MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
                             void *buf, hwaddr len, bool is_write)
{
    return is_write
        ? address_space_write(as, addr, attrs, buf, len)
        : address_space_read_full(as, addr, attrs, buf, len);
}
/*
 * Fill @len bytes at @addr of @as with the byte @c, writing in chunks of
 * at most FILLBUF_SIZE bytes.  Returns the OR of all chunk results.
 */
MemTxResult address_space_set(AddressSpace *as, hwaddr addr,
                              uint8_t c, hwaddr len, MemTxAttrs attrs)
{
#define FILLBUF_SIZE 512
    uint8_t pattern[FILLBUF_SIZE];
    MemTxResult status = MEMTX_OK;
    int chunk;

    memset(pattern, c, FILLBUF_SIZE);

    for (; len > 0; len -= chunk, addr += chunk) {
        chunk = len < FILLBUF_SIZE ? len : FILLBUF_SIZE;
        status |= address_space_write(as, addr, attrs, pattern, chunk);
    }

    return status;
}
/*
 * Read @len bytes at @addr of the system memory address space into @buf,
 * with unspecified transaction attributes.  The transaction result is
 * discarded.
 */
void cpu_physical_memory_read(hwaddr addr, void *buf, hwaddr len)
{
    AddressSpace *as = &address_space_memory;

    address_space_read(as, addr, MEMTXATTRS_UNSPECIFIED, buf, len);
}
/*
 * Write @len bytes from @buf to @addr of the system memory address space,
 * with unspecified transaction attributes.  The transaction result is
 * discarded.
 */
void cpu_physical_memory_write(hwaddr addr, const void *buf, hwaddr len)
{
    AddressSpace *as = &address_space_memory;

    address_space_write(as, addr, MEMTXATTRS_UNSPECIFIED, buf, len);
}
/*
 * Used for ROM loading: can write in RAM and ROM.
 *
 * Copies @len bytes from @buf to @addr of @as.  Spans backed by regions
 * that support direct access are written through their host mapping;
 * everything else is silently skipped.  Always returns MEMTX_OK.
 */
MemTxResult address_space_write_rom(AddressSpace *as, hwaddr addr,
                                    MemTxAttrs attrs,
                                    const void *buf, hwaddr len)
{
    RCU_READ_LOCK_GUARD();
    while (len > 0) {
        hwaddr addr1, l = len;
        MemoryRegion *mr = address_space_translate(as, addr, &addr1, &l,
                                                   true, attrs);

        if (!memory_region_supports_direct_access(mr)) {
            /*
             * Translation does not clamp the length for MMIO regions, so
             * bound the skipped span the same way address_space_rw does.
             * Otherwise RAM/ROM that follows the MMIO region would be
             * skipped as well.
             */
            l = memory_access_size(mr, l, addr1);
        } else {
            /* ROM/RAM case */
            void *ram_ptr = qemu_map_ram_ptr(mr->ram_block, addr1);

            memcpy(ram_ptr, buf, l);
            invalidate_and_set_dirty(mr, addr1, l);
        }
        len -= l;
        addr += l;
        buf += l;
    }
    return MEMTX_OK;
}
/*
 * Flush the host instruction cache for the directly accessible parts of
 * [@addr, @addr + @len) in @as.  Does nothing under TCG.
 */
void address_space_flush_icache_range(AddressSpace *as, hwaddr addr, hwaddr len)
{
    /*
     * This function should do the same thing as an icache flush that was
     * triggered from within the guest. For TCG we are always cache coherent,
     * so there is no need to flush anything. For KVM / Xen we need to flush
     * the host's instruction cache at least.
     */
    if (tcg_enabled()) {
        return;
    }

    RCU_READ_LOCK_GUARD();

    while (len > 0) {
        hwaddr xlat;
        hwaddr chunk = len;
        MemoryRegion *mr = address_space_translate(as, addr, &xlat, &chunk,
                                                   true,
                                                   MEMTXATTRS_UNSPECIFIED);

        if (memory_region_supports_direct_access(mr)) {
            /* ROM/RAM case */
            void *host = qemu_map_ram_ptr(mr->ram_block, xlat);

            flush_idcache_range((uintptr_t)host, (uintptr_t)host, chunk);
        } else {
            /* Nothing to flush; just step over one access worth. */
            chunk = memory_access_size(mr, chunk, xlat);
        }

        len -= chunk;
        addr += chunk;
    }
}
2024-08-19 06:54:54 -07:00
/*
* A magic value stored in the first 8 bytes of the bounce buffer struct. Used
* to detect illegal pointers passed to address_space_unmap.
*/
#define BOUNCE_BUFFER_MAGIC 0xb4017ceb4ffe12ed
typedef struct {
uint64_t magic;
MemoryRegion *mr;
hwaddr addr;
size_t len;
uint8_t buffer[];
} BounceBuffer;
/*
 * Unlink @client from the list it is on and free it.
 * NOTE(review): callers in this file hold map_client_list_lock - confirm
 * for any new caller.
 */
static void
address_space_unregister_map_client_do(AddressSpaceMapClient *client)
{
    QLIST_REMOVE(client, link);

    g_free(client);
}
/*
 * Drain the map-client list of @as: schedule each client's bottom half and
 * drop its registration.  The "_locked" variant expects the caller to
 * already hold map_client_list_lock.
 */
static void address_space_notify_map_clients_locked(AddressSpace *as)
{
    while (!QLIST_EMPTY(&as->map_client_list)) {
        AddressSpaceMapClient *head = QLIST_FIRST(&as->map_client_list);

        qemu_bh_schedule(head->bh);
        address_space_unregister_map_client_do(head);
    }
}
/*
 * Register @bh to be scheduled once retrying address_space_map() on @as is
 * likely to succeed.  If bounce buffer space is already available at the
 * time of registration, all registered clients (including this one) are
 * notified immediately.
 */
void address_space_register_map_client(AddressSpace *as, QEMUBH *bh)
{
    AddressSpaceMapClient *client = g_malloc(sizeof(*client));

    QEMU_LOCK_GUARD(&as->map_client_list_lock);
    client->bh = bh;
    QLIST_INSERT_HEAD(&as->map_client_list, client, link);
    /* Write map_client_list before reading bounce_buffer_size. */
    smp_mb();
    if (qatomic_read(&as->bounce_buffer_size) < as->max_bounce_buffer_size) {
        address_space_notify_map_clients_locked(as);
    }
}
/*
 * One-time set-up of the physical memory core: the RAM list mutex, the
 * final target page size, the unassigned I/O region and the root memory
 * and I/O address spaces, in that order.
 */
void cpu_exec_init_all(void)
{
    qemu_mutex_init(&ram_list.mutex);

    /*
     * The data structures we set up here depend on knowing the page size,
     * so no more changes can be made after this point.
     * In an ideal world, nothing we did before we had finished the
     * machine setup would care about the target page size, and we could
     * do this much later, rather than requiring board models to state
     * up front what their requirements are.
     */
    finalize_target_page_bits();

    io_mem_init();
    memory_map_init();
}
/*
 * Remove the first map client of @as whose bottom half is @bh, if any.
 * The list lock is held for the duration of the search.
 */
void address_space_unregister_map_client(AddressSpace *as, QEMUBH *bh)
{
    AddressSpaceMapClient *it;

    QEMU_LOCK_GUARD(&as->map_client_list_lock);

    QLIST_FOREACH(it, &as->map_client_list, link) {
        if (it->bh != bh) {
            continue;
        }
        /* The entry is freed here, so stop iterating right away. */
        address_space_unregister_map_client_do(it);
        break;
    }
}
/*
 * Take map_client_list_lock of @as and notify all registered map clients.
 */
static void address_space_notify_map_clients(AddressSpace *as)
{
    QEMU_LOCK_GUARD(&as->map_client_list_lock);

    address_space_notify_map_clients_locked(as);
}
/*
 * Check whether an access of @len bytes at @addr in @fv would be valid.
 *
 * The range is walked chunk by chunk.  Directly accessible chunks are
 * always fine; for the others the owning region is asked about one access
 * at a time.  Returns false as soon as a region rejects an access.
 */
static bool flatview_access_valid(FlatView *fv, hwaddr addr, hwaddr len,
                                  bool is_write, MemTxAttrs attrs)
{
    MemoryRegion *mr;
    hwaddr chunk, xlat;

    for (; len > 0; len -= chunk, addr += chunk) {
        chunk = len;
        mr = flatview_translate(fv, addr, &xlat, &chunk, is_write, attrs);

        if (memory_access_is_direct(mr, is_write, attrs)) {
            continue;
        }

        chunk = memory_access_size(mr, chunk, addr);
        if (!memory_region_access_valid(mr, xlat, chunk, is_write, attrs)) {
            return false;
        }
    }

    return true;
}
/*
 * Check whether an access of @len bytes at @addr of @as would be valid,
 * using the address space's current FlatView under the RCU read lock.
 */
bool address_space_access_valid(AddressSpace *as, hwaddr addr,
                                hwaddr len, bool is_write,
                                MemTxAttrs attrs)
{
    RCU_READ_LOCK_GUARD();
    FlatView *view = address_space_to_flatview(as);

    return flatview_access_valid(view, addr, len, is_write, attrs);
}
/*
 * Return true if the region that @addr of @as translates to (for a read
 * with unspecified attributes) is neither RAM nor a ROM device.
 */
bool address_space_is_io(AddressSpace *as, hwaddr addr)
{
    MemoryRegion *target;

    RCU_READ_LOCK_GUARD();
    target = address_space_translate(as, addr, &addr, NULL, false,
                                     MEMTXATTRS_UNSPECIFIED);

    return !memory_region_is_ram(target) && !memory_region_is_romd(target);
}
/*
 * Given that the first @len bytes at @addr translate to offset @base of
 * @mr, work out how many of the requested @target_len bytes translate
 * contiguously into the same region.
 *
 * Returns the length of that contiguous run, which is at least @len and
 * at most @target_len.
 */
static hwaddr
flatview_extend_translation(FlatView *fv, hwaddr addr,
                            hwaddr target_len,
                            MemoryRegion *mr, hwaddr base, hwaddr len,
                            bool is_write, MemTxAttrs attrs)
{
    hwaddr covered = 0;
    hwaddr xlat;
    MemoryRegion *next_mr;

    do {
        /* Account for the chunk already known to be contiguous. */
        target_len -= len;
        addr += len;
        covered += len;

        if (target_len == 0) {
            break;
        }

        len = target_len;
        next_mr = flatview_translate(fv, addr, &xlat,
                                     &len, is_write, attrs);
        /* Keep going only while the next chunk follows on in the same mr. */
    } while (next_mr == mr && xlat == base + covered);

    return covered;
}
/* Map a physical memory region into a host virtual address.
 * May map a subset of the requested range, given by and returned in *plen.
 * May return NULL if resources needed to perform the mapping are exhausted.
 * Use only for reads OR writes - not for read-modify-write operations.
 * Use address_space_register_map_client() to know when retrying the map
 * operation is likely to succeed.
 *
 * When the target is not directly accessible, a bounce buffer is handed
 * out instead; its size is charged against the address space's
 * max_bounce_buffer_size budget.  A reference on the target region is
 * held until address_space_unmap().
 */
void *address_space_map(AddressSpace *as,
                        hwaddr addr,
                        hwaddr *plen,
                        bool is_write,
                        MemTxAttrs attrs)
{
    hwaddr len = *plen;
    hwaddr l, xlat;
    MemoryRegion *mr;
    FlatView *fv;

    trace_address_space_map(as, addr, len, is_write, *(uint32_t *) &attrs);

    if (len == 0) {
        return NULL;
    }

    l = len;
    RCU_READ_LOCK_GUARD();
    fv = address_space_to_flatview(as);
    mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs);

    if (!memory_access_is_direct(mr, is_write, attrs)) {
        size_t used = qatomic_read(&as->bounce_buffer_size);

        /*
         * Reserve as much of the remaining bounce buffer budget as this
         * mapping needs, retrying if another thread changed the counter
         * in the meantime.
         */
        for (;;) {
            hwaddr alloc = MIN(as->max_bounce_buffer_size - used, l);
            size_t new_size = used + alloc;
            size_t actual =
                qatomic_cmpxchg(&as->bounce_buffer_size, used, new_size);
            if (actual == used) {
                l = alloc;
                break;
            }
            used = actual;
        }

        /* Budget exhausted: nothing can be mapped right now. */
        if (l == 0) {
            *plen = 0;
            return NULL;
        }

        BounceBuffer *bounce = g_malloc0(l + sizeof(BounceBuffer));
        bounce->magic = BOUNCE_BUFFER_MAGIC;
        memory_region_ref(mr);
        bounce->mr = mr;
        bounce->addr = addr;
        bounce->len = l;

        /* For a read mapping, pre-fill the buffer with the current data. */
        if (!is_write) {
            flatview_read(fv, addr, attrs,
                          bounce->buffer, l);
        }

        *plen = l;
        return bounce->buffer;
    }

    /* Direct case: extend the mapping as far as it stays contiguous. */
    memory_region_ref(mr);
    *plen = flatview_extend_translation(fv, addr, len, mr, xlat,
                                        l, is_write, attrs);
    fuzz_dma_read_cb(addr, *plen, mr);
    return qemu_ram_ptr_length(mr->ram_block, xlat, plen, true, is_write);
}
/* Unmaps a memory region previously mapped by address_space_map().
 * Will also mark the memory as dirty if is_write is true.  access_len gives
 * the amount of memory that was actually read or written by the caller.
 *
 * @buffer is either a pointer into guest RAM (direct mapping) or the data
 * area of a bounce buffer; the two cases are told apart by looking the
 * pointer up with memory_region_from_host().
 */
void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
                         bool is_write, hwaddr access_len)
{
    MemoryRegion *mr;
    ram_addr_t addr1;

    mr = memory_region_from_host(buffer, &addr1);
    if (mr != NULL) {
        /* Direct mapping */
        if (is_write) {
            invalidate_and_set_dirty(mr, addr1, access_len);
        }
        if (xen_enabled()) {
            xen_invalidate_map_cache_entry(buffer);
        }
        memory_region_unref(mr);
        return;
    }

    /* Bounce buffer: recover the header and sanity-check it. */
    BounceBuffer *bounce = container_of(buffer, BounceBuffer, buffer);
    assert(bounce->magic == BOUNCE_BUFFER_MAGIC);

    if (is_write) {
        address_space_write(as, bounce->addr, MEMTXATTRS_UNSPECIFIED,
                            bounce->buffer, access_len);
    }

    /* Return the space to the budget and poison the magic before freeing. */
    qatomic_sub(&as->bounce_buffer_size, bounce->len);
    bounce->magic = ~BOUNCE_BUFFER_MAGIC;
    memory_region_unref(bounce->mr);
    g_free(bounce);
    /* Write bounce_buffer_size before reading map_client_list. */
    smp_mb();
    address_space_notify_map_clients(as);
}
/*
 * Convenience wrapper: map guest physical memory through the global
 * address_space_memory with unspecified transaction attributes.
 * Arguments and return value are passed straight to/from address_space_map().
 */
void *cpu_physical_memory_map(hwaddr addr,
                              hwaddr *plen,
                              bool is_write)
{
    AddressSpace *system_as = &address_space_memory;

    return address_space_map(system_as, addr, plen, is_write,
                             MEMTXATTRS_UNSPECIFIED);
}
/*
 * Convenience wrapper: unmap a buffer from the global address_space_memory.
 * All arguments are forwarded unchanged to address_space_unmap().
 */
void cpu_physical_memory_unmap(void *buffer, hwaddr len,
                               bool is_write, hwaddr access_len)
{
    /*
     * Plain call rather than "return f();": ISO C does not allow a return
     * statement with an expression in a function returning void.
     */
    address_space_unmap(&address_space_memory, buffer, len, is_write,
                        access_len);
}
/*
 * Parameters for the memory_ldst.c.inc template included below, instantiated
 * for a plain AddressSpace: ARG1_DECL/ARG1 describe the leading argument,
 * SUFFIX is empty, TRANSLATE() forwards to address_space_translate() on that
 * address space, and the RCU macros expand to the real rcu_read_lock() and
 * rcu_read_unlock() calls.
 * NOTE(review): presumably the template emits the load/store accessors and
 * undefines these macros itself (they are defined again further down without
 * an intervening #undef) -- confirm in memory_ldst.c.inc.
 */
#define ARG1_DECL AddressSpace *as
#define ARG1 as
#define SUFFIX
#define TRANSLATE(...) address_space_translate(as, __VA_ARGS__)
#define RCU_READ_LOCK(...) rcu_read_lock()
#define RCU_READ_UNLOCK(...) rcu_read_unlock()
#include "memory_ldst.c.inc"
/*
 * Set up @cache to cover [@addr, @addr + @len) of @as.
 *
 * Takes a reference on the address space's flat view and on the memory
 * region that @addr translates to.  When the region can be accessed
 * directly, cache->ptr is set to the host pointer for the range; otherwise
 * cache->ptr is NULL.
 *
 * @len must be non-zero.  Returns the number of bytes actually covered,
 * which is also stored in cache->len.
 */
int64_t address_space_cache_init(MemoryRegionCache *cache,
                                 AddressSpace *as,
                                 hwaddr addr,
                                 hwaddr len,
                                 bool is_write)
{
    AddressSpaceDispatch *dispatch;
    MemoryRegion *region;
    Int128 bytes_left;
    hwaddr covered;

    assert(len > 0);
    covered = len;

    cache->fv = address_space_get_flatview(as);
    dispatch = flatview_to_dispatch(cache->fv);
    cache->mrs = *address_space_translate_internal(dispatch, addr,
                                                   &cache->xlat, &covered,
                                                   true);

    /*
     * cache->xlat is now relative to cache->mrs.mr, not to the section itself.
     * Take that into account to compute how many bytes are there between
     * cache->xlat and the end of the section.
     */
    bytes_left = int128_sub(
        cache->mrs.size,
        int128_make64(cache->xlat - cache->mrs.offset_within_region));
    covered = int128_get64(int128_min(bytes_left, int128_make64(covered)));

    region = cache->mrs.mr;
    memory_region_ref(region);

    if (!memory_access_is_direct(region, is_write, MEMTXATTRS_UNSPECIFIED)) {
        cache->ptr = NULL;
    } else {
        /*
         * We don't care about the memory attributes here as we're only
         * doing this if we found actual RAM, which behaves the same
         * regardless of attributes; so UNSPECIFIED is fine.
         */
        covered = flatview_extend_translation(cache->fv, addr, len, region,
                                              cache->xlat, covered, is_write,
                                              MEMTXATTRS_UNSPECIFIED);
        cache->ptr = qemu_ram_ptr_length(region->ram_block, cache->xlat,
                                         &covered, true, is_write);
    }

    cache->len = covered;
    cache->is_write = is_write;
    return covered;
}
/*
 * Mark @access_len bytes at offset @addr within @cache as dirty.
 * The cache must have been initialised for writing.  Nothing is done when
 * the cache has no direct host pointer.
 */
void address_space_cache_invalidate(MemoryRegionCache *cache,
                                    hwaddr addr,
                                    hwaddr access_len)
{
    assert(cache->is_write);

    if (!likely(cache->ptr)) {
        return;
    }
    invalidate_and_set_dirty(cache->mrs.mr, addr + cache->xlat, access_len);
}
/*
 * Release the references held by @cache and reset it.
 * A cache whose memory region pointer is already NULL is left untouched.
 */
void address_space_cache_destroy(MemoryRegionCache *cache)
{
    MemoryRegion *region = cache->mrs.mr;

    if (region) {
        if (xen_enabled()) {
            xen_invalidate_map_cache_entry(cache->ptr);
        }
        memory_region_unref(region);
        flatview_unref(cache->fv);

        /* Clear the pointers so a later destroy returns early. */
        cache->mrs.mr = NULL;
        cache->fv = NULL;
    }
}
/*
 * Called from RCU critical section.  This function has the same
 * semantics as address_space_translate, but it only works on a
 * predefined range of a MemoryRegion that was mapped with
 * address_space_cache_init.
 *
 * Only valid for caches without a direct host pointer (asserted below).
 * On return *xlat holds the translated offset; the target region is returned.
 */
static inline MemoryRegion *address_space_translate_cached(
    MemoryRegionCache *cache, hwaddr addr, hwaddr *xlat,
    hwaddr *plen, bool is_write, MemTxAttrs attrs)
{
    MemoryRegion *region;
    IOMMUMemoryRegion *iommu;
    AddressSpace *target_as;

    assert(!cache->ptr);
    *xlat = addr + cache->xlat;

    region = cache->mrs.mr;
    iommu = memory_region_get_iommu(region);
    if (iommu) {
        MemoryRegionSection translated =
            address_space_translate_iommu(iommu, xlat, plen,
                                          NULL, is_write, true,
                                          &target_as, attrs);
        return translated.mr;
    }

    /* MMIO region. */
    return region;
}
/*
 * Called within RCU critical section.
 *
 * Write @len bytes from @ptr to @mr starting at @mr_addr, one step at a
 * time.  @l is the size of the first step; each later step starts out sized
 * to everything that is still left.  The per-step results are OR-ed together.
 */
static MemTxResult address_space_write_continue_cached(MemTxAttrs attrs,
                                                       const void *ptr,
                                                       hwaddr len,
                                                       hwaddr mr_addr,
                                                       hwaddr l,
                                                       MemoryRegion *mr)
{
    const uint8_t *src = ptr;
    MemTxResult status = MEMTX_OK;

    do {
        /* The step helper may adjust l to what it actually handled. */
        status |= flatview_write_continue_step(attrs, src, len, mr_addr, &l,
                                               mr);
        len -= l;
        src += l;
        mr_addr += l;
        l = len;
    } while (len);

    return status;
}
/*
 * Called within RCU critical section.
 *
 * Read @len bytes from @mr starting at @mr_addr into @ptr, one step at a
 * time.  @l is the size of the first step; each later step starts out sized
 * to everything that is still left.  The per-step results are OR-ed together.
 */
static MemTxResult address_space_read_continue_cached(MemTxAttrs attrs,
                                                      void *ptr, hwaddr len,
                                                      hwaddr mr_addr, hwaddr l,
                                                      MemoryRegion *mr)
{
    uint8_t *dst = ptr;
    MemTxResult status = MEMTX_OK;

    do {
        /* The step helper may adjust l to what it actually handled. */
        status |= flatview_read_continue_step(attrs, dst, len, mr_addr, &l,
                                              mr);
        len -= l;
        dst += l;
        mr_addr += l;
        l = len;
    } while (len);

    return status;
}
/*
 * Called from RCU critical section.  address_space_read_cached uses this
 * out of line function when the target is an MMIO or IOMMU region.
 *
 * Translates @addr through @cache, then reads @len bytes into @buf using
 * unspecified transaction attributes.
 */
MemTxResult
address_space_read_cached_slow(MemoryRegionCache *cache, hwaddr addr,
                               void *buf, hwaddr len)
{
    hwaddr first_len = len;
    hwaddr mr_addr;
    MemoryRegion *target;

    target = address_space_translate_cached(cache, addr, &mr_addr,
                                            &first_len, false,
                                            MEMTXATTRS_UNSPECIFIED);
    return address_space_read_continue_cached(MEMTXATTRS_UNSPECIFIED,
                                              buf, len, mr_addr, first_len,
                                              target);
}
/*
 * Called from RCU critical section.  address_space_write_cached uses this
 * out of line function when the target is an MMIO or IOMMU region.
 *
 * Translates @addr through @cache, then writes @len bytes from @buf using
 * unspecified transaction attributes.
 */
MemTxResult
address_space_write_cached_slow(MemoryRegionCache *cache, hwaddr addr,
                                const void *buf, hwaddr len)
{
    hwaddr first_len = len;
    hwaddr mr_addr;
    MemoryRegion *target;

    target = address_space_translate_cached(cache, addr, &mr_addr,
                                            &first_len, true,
                                            MEMTXATTRS_UNSPECIFIED);
    return address_space_write_continue_cached(MEMTXATTRS_UNSPECIFIED,
                                               buf, len, mr_addr, first_len,
                                               target);
}
/*
 * Second instantiation of the memory_ldst.c.inc template, this time for a
 * MemoryRegionCache: generated names carry the _cached_slow suffix and
 * TRANSLATE() goes through address_space_translate_cached().  The RCU macros
 * expand to no-ops here.
 * NOTE(review): presumably callers already hold the RCU read lock, matching
 * the "Called from RCU critical section" notes on the cached helpers --
 * confirm against the callers.
 */
#define ARG1_DECL MemoryRegionCache *cache
#define ARG1 cache
#define SUFFIX _cached_slow
#define TRANSLATE(...) address_space_translate_cached(cache, __VA_ARGS__)
#define RCU_READ_LOCK() ((void)0)
#define RCU_READ_UNLOCK() ((void)0)
#include "memory_ldst.c.inc"
/*
 * virtual memory access for debug (includes writing to ROM)
 *
 * Transfers @len bytes between @ptr and guest virtual address @addr of @cpu,
 * writing to the guest when @is_write is true and reading otherwise.  The
 * range is processed one target page at a time, because each page is
 * translated to a physical address separately.
 *
 * Returns 0 on success, -1 if a page has no physical mapping or the memory
 * access fails.
 */
int cpu_memory_rw_debug(CPUState *cpu, vaddr addr,
                        void *ptr, size_t len, bool is_write)
{
    hwaddr phys_addr;
    vaddr l, page;
    uint8_t *buf = ptr;

    cpu_synchronize_state(cpu);
    while (len > 0) {
        int asidx;
        MemTxAttrs attrs;
        MemTxResult res;

        page = addr & TARGET_PAGE_MASK;
        phys_addr = cpu_get_phys_page_attrs_debug(cpu, page, &attrs);
        asidx = cpu_asidx_from_attrs(cpu, attrs);
        /* if no physical page mapped, return an error */
        if (phys_addr == -1) {
            return -1;
        }
        /* Clamp this chunk to the end of the current page. */
        l = (page + TARGET_PAGE_SIZE) - addr;
        if (l > len) {
            l = len;
        }
        phys_addr += (addr & ~TARGET_PAGE_MASK);
        res = address_space_rw(cpu->cpu_ases[asidx].as, phys_addr, attrs, buf,
                               l, is_write);
        if (res != MEMTX_OK) {
            return -1;
        }
        len -= l;
        buf += l;
        addr += l;
    }
    return 0;
}
/*
 * Invoke @func(block, @opaque) on each RAM block under the RCU read lock.
 * Iteration stops at the first non-zero return value, which is then
 * returned; 0 is returned if every call returned 0.
 */
int qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque)
{
    RAMBlock *rb;
    int rc = 0;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH(rb) {
        rc = func(rb, opaque);
        if (rc != 0) {
            return rc;
        }
    }
    return rc;
}
/*
 * Unmap pages of memory from offset to offset+length such that
 * they a) read as 0, b) Trigger whatever fault mechanism
 * the OS provides for postcopy.
 * The pages must be unmapped by the end of the function.
 *
 * The host start address and @length must be multiples of the block's page
 * size, and [@offset, @offset + @length) must lie within rb->max_length.
 *
 * Returns: 0 on success, non-0 on failure
 */
int ram_block_discard_range(RAMBlock *rb, uint64_t offset, size_t length)
{
    int ret = -1;
    bool need_madvise, need_fallocate;
    uint8_t *host_startaddr = rb->host + offset;

    if (!QEMU_PTR_IS_ALIGNED(host_startaddr, rb->page_size)) {
        error_report("%s: Unaligned start address: %p",
                     __func__, host_startaddr);
        goto err;
    }

    /*
     * Range check written so that it cannot wrap: "offset + length" could
     * overflow and make an out-of-range request look valid.
     */
    if (offset > rb->max_length || length > rb->max_length - offset) {
        error_report("%s: Overrun block '%s' (%" PRIu64 "/%zx/" RAM_ADDR_FMT")",
                     __func__, rb->idstr, offset, length, rb->max_length);
        goto err;
    }

    if (!QEMU_IS_ALIGNED(length, rb->page_size)) {
        error_report("%s: Unaligned length: %zx", __func__, length);
        goto err;
    }

    errno = ENOTSUP; /* If we are missing MADVISE etc */

    /* The logic here is messy;
     *    madvise DONTNEED fails for hugepages
     *    fallocate works on hugepages and shmem
     *    shared anonymous memory requires madvise REMOVE
     *
     * If neither mechanism applies, ret keeps its initial value of -1.
     */
    need_madvise = (rb->page_size == qemu_real_host_page_size());
    need_fallocate = rb->fd != -1;
    if (need_fallocate) {
        /* For a file, this causes the area of the file to be zero'd
         * if read, and for hugetlbfs also causes it to be unmapped
         * so a userfault will trigger.
         */
#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
        /*
         * fallocate() will fail with readonly files. Let's print a
         * proper error message.
         */
        if (rb->flags & RAM_READONLY_FD) {
            error_report("%s: Discarding RAM with readonly files is not"
                         " supported", __func__);
            goto err;
        }
        /*
         * We'll discard data from the actual file, even though we only
         * have a MAP_PRIVATE mapping, possibly messing with other
         * MAP_PRIVATE/MAP_SHARED mappings. There is no easy way to
         * change that behavior without violating the promised
         * semantics of ram_block_discard_range().
         *
         * Only warn, because it works as long as nobody else uses that
         * file.
         */
        if (!qemu_ram_is_shared(rb)) {
            warn_report_once("%s: Discarding RAM"
                             " in private file mappings is possibly"
                             " dangerous, because it will modify the"
                             " underlying file and will affect other"
                             " users of the file", __func__);
        }
        ret = fallocate(rb->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                        offset + rb->fd_offset, length);
        if (ret) {
            ret = -errno;
            error_report("%s: Failed to fallocate %s:%" PRIx64 "+%" PRIx64
                         " +%zx (%d)", __func__, rb->idstr, offset,
                         rb->fd_offset, length, ret);
            goto err;
        }
#else
        ret = -ENOSYS;
        error_report("%s: fallocate not available/file"
                     "%s:%" PRIx64 "+%" PRIx64 " +%zx (%d)", __func__,
                     rb->idstr, offset, rb->fd_offset, length, ret);
        goto err;
#endif
    }
    if (need_madvise) {
        /* For normal RAM this causes it to be unmapped,
         * for shared memory it causes the local mapping to disappear
         * and to fall back on the file contents (which we just
         * fallocate'd away).
         */
#if defined(CONFIG_MADVISE)
        if (qemu_ram_is_shared(rb) && rb->fd < 0) {
            ret = madvise(host_startaddr, length, QEMU_MADV_REMOVE);
        } else {
            ret = madvise(host_startaddr, length, QEMU_MADV_DONTNEED);
        }
        if (ret) {
            ret = -errno;
            error_report("%s: Failed to discard range "
                         "%s:%" PRIx64 " +%zx (%d)",
                         __func__, rb->idstr, offset, length, ret);
            goto err;
        }
#else
        ret = -ENOSYS;
        error_report("%s: MADVISE not available %s:%" PRIx64 " +%zx (%d)",
                     __func__, rb->idstr, offset, length, ret);
        goto err;
#endif
    }
    trace_ram_block_discard_range(rb->idstr, host_startaddr, length,
                                  need_madvise, need_fallocate, ret);

err:
    return ret;
}
/*
 * Punch a hole of @length bytes at @offset in the guest_memfd of @rb.
 *
 * Returns 0 on success, -errno if fallocate() fails, or -ENOSYS when hole
 * punching is not compiled in.  Failures are also reported via error_report().
 */
int ram_block_discard_guest_memfd_range(RAMBlock *rb, uint64_t offset,
                                        size_t length)
{
#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
    const int punch_mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
    int ret;

    /* ignore fd_offset with guest_memfd */
    if (fallocate(rb->guest_memfd, punch_mode, offset, length) == 0) {
        return 0;
    }

    ret = -errno;
    error_report("%s: Failed to fallocate %s:%" PRIx64 " +%zx (%d)",
                 __func__, rb->idstr, offset, length, ret);
    return ret;
#else
    int ret = -ENOSYS;

    error_report("%s: fallocate not available %s:%" PRIx64 " +%zx (%d)",
                 __func__, rb->idstr, offset, length, ret);
    return ret;
#endif
}
/* Return true if @rb has the RAM_PMEM flag set. */
bool ram_block_is_pmem(RAMBlock *rb)
{
    return (rb->flags & RAM_PMEM) != 0;
}
/*
 * Print one line describing the run of identical entries [@start, @end):
 * the index (or index range), the skip value and the pointer.  The pointer
 * is shown as NIL, as "#n" when @skip is zero, or as "[n]" otherwise.
 */
static void mtree_print_phys_entries(int start, int end, int skip, int ptr)
{
    const int last = end - 1;

    if (start != last) {
        qemu_printf("\t%3d..%-3d ", start, last);
    } else {
        qemu_printf("\t%3d ", start);
    }

    qemu_printf(" skip=%d ", skip);

    if (ptr == PHYS_MAP_NODE_NIL) {
        qemu_printf(" ptr=NIL");
    } else if (skip) {
        qemu_printf(" ptr=[%d]", ptr);
    } else {
        qemu_printf(" ptr=#%d", ptr);
    }

    qemu_printf("\n");
}
/* Size minus one as a hwaddr, or 0 when @size is zero. */
#define MR_SIZE(size) (int128_nz(size) ? (hwaddr)int128_get64( \
                       int128_sub((size), int128_one())) : 0)

/*
 * Dump the dispatch structure @d: first every physical section, then every
 * node of the page table with runs of identical entries collapsed into a
 * single line.  Sections that belong to @root are tagged [ROOT].
 */
void mtree_print_dispatch(AddressSpaceDispatch *d, MemoryRegion *root)
{
    int idx;

    qemu_printf(" Dispatch\n");
    qemu_printf(" Physical sections\n");

    for (idx = 0; idx < d->map.sections_nb; ++idx) {
        MemoryRegionSection *sec = &d->map.sections[idx];
        /* Labels for the first few section indexes. */
        const char *names[] = { " [unassigned]", " [not dirty]",
                                " [ROM]", " [watch]" };
        const char *mr_name = sec->mr->name ? sec->mr->name : "(noname)";
        const char *idx_tag = idx < ARRAY_SIZE(names) ? names[idx] : "";
        const char *root_tag = sec->mr == root ? " [ROOT]" : "";
        const char *mru_tag = sec == d->mru_section ? " [MRU]" : "";
        const char *iommu_tag = sec->mr->is_iommu ? " [iommu]" : "";

        qemu_printf(" #%d @" HWADDR_FMT_plx ".." HWADDR_FMT_plx
                    " %s%s%s%s%s",
                    idx,
                    sec->offset_within_address_space,
                    sec->offset_within_address_space + MR_SIZE(sec->size),
                    mr_name, idx_tag, root_tag, mru_tag, iommu_tag);
        if (sec->mr->alias) {
            qemu_printf(" alias=%s", sec->mr->alias->name ?
                        sec->mr->alias->name : "noname");
        }
        qemu_printf("\n");
    }

    qemu_printf(" Nodes (%d bits per level, %d levels) ptr=[%d] skip=%d\n",
                P_L2_BITS, P_L2_LEVELS, d->phys_map.ptr, d->phys_map.skip);

    for (idx = 0; idx < d->map.nodes_nb; ++idx) {
        Node *node = &d->map.nodes[idx];
        PhysPageEntry run;
        int run_start = 0;
        int pos;

        qemu_printf(" [%d]\n", idx);

        run = (*node)[0];
        for (pos = 0; pos < ARRAY_SIZE(*node); ++pos) {
            PhysPageEntry *pe = &(*node)[pos];

            /* A differing entry ends the current run: print it, start anew. */
            if (pe->ptr != run.ptr || pe->skip != run.skip) {
                mtree_print_phys_entries(run_start, pos, run.skip, run.ptr);
                run_start = pos;
                run = *pe;
            }
        }

        /* Print the final run, which the loop never closed. */
        if (run_start != ARRAY_SIZE(*node)) {
            mtree_print_phys_entries(run_start, pos, run.skip, run.ptr);
        }
    }
}
/*
 * Counters recording how many users require RAM block discards to work and
 * how many have disabled them.  The enable/disable functions below update
 * them while holding ram_block_discard_disable_mutex; the query functions
 * read them with qatomic_read() without taking the mutex.
 */
/* Require any discards to work. */
static unsigned int ram_block_discard_required_cnt;
/* Require only coordinated discards to work. */
static unsigned int ram_block_coordinated_discard_required_cnt;
/* Disable any discards. */
static unsigned int ram_block_discard_disabled_cnt;
/* Disable only uncoordinated discards. */
static unsigned int ram_block_uncoordinated_discard_disabled_cnt;
/* Serializes counter updates; initialized the first time it is locked. */
static QemuMutex ram_block_discard_disable_mutex;
/*
 * Lock ram_block_discard_disable_mutex, initializing it first if this is
 * the first call.
 */
static void ram_block_discard_disable_mutex_lock(void)
{
    static gsize mutex_ready;

    if (g_once_init_enter(&mutex_ready)) {
        qemu_mutex_init(&ram_block_discard_disable_mutex);
        g_once_init_leave(&mutex_ready, 1);
    }

    qemu_mutex_lock(&ram_block_discard_disable_mutex);
}
static void ram_block_discard_disable_mutex_unlock(void)
{
qemu_mutex_unlock(&ram_block_discard_disable_mutex);
}
/*
 * Disable (@state == true) or re-enable (@state == false) all RAM block
 * discards.
 *
 * Disabling fails with -EBUSY while any user requires discards, coordinated
 * or not.  Re-enabling drops one disable count and always returns 0.
 */
int ram_block_discard_disable(bool state)
{
    int rc = 0;

    ram_block_discard_disable_mutex_lock();
    if (state) {
        if (ram_block_discard_required_cnt ||
            ram_block_coordinated_discard_required_cnt) {
            rc = -EBUSY;
        } else {
            ram_block_discard_disabled_cnt++;
        }
    } else {
        ram_block_discard_disabled_cnt--;
    }
    ram_block_discard_disable_mutex_unlock();

    return rc;
}
/*
 * Disable (@state == true) or re-enable (@state == false) uncoordinated RAM
 * block discards only.
 *
 * Disabling fails with -EBUSY while any user requires all discards to work.
 * Re-enabling drops one disable count and always returns 0.
 */
int ram_block_uncoordinated_discard_disable(bool state)
{
    int rc = 0;

    ram_block_discard_disable_mutex_lock();
    if (state) {
        if (ram_block_discard_required_cnt) {
            rc = -EBUSY;
        } else {
            ram_block_uncoordinated_discard_disabled_cnt++;
        }
    } else {
        ram_block_uncoordinated_discard_disabled_cnt--;
    }
    ram_block_discard_disable_mutex_unlock();

    return rc;
}
/*
 * Register (@state == true) or drop (@state == false) a requirement that
 * any RAM block discard works.
 *
 * Registering fails with -EBUSY while discards are disabled, whether fully
 * or only for uncoordinated discards.  Dropping always returns 0.
 */
int ram_block_discard_require(bool state)
{
    int rc = 0;

    ram_block_discard_disable_mutex_lock();
    if (state) {
        if (ram_block_discard_disabled_cnt ||
            ram_block_uncoordinated_discard_disabled_cnt) {
            rc = -EBUSY;
        } else {
            ram_block_discard_required_cnt++;
        }
    } else {
        ram_block_discard_required_cnt--;
    }
    ram_block_discard_disable_mutex_unlock();

    return rc;
}
/*
 * Register (@state == true) or drop (@state == false) a requirement that
 * coordinated RAM block discards work.
 *
 * Registering fails with -EBUSY while all discards are disabled.
 * Dropping always returns 0.
 */
int ram_block_coordinated_discard_require(bool state)
{
    int rc = 0;

    ram_block_discard_disable_mutex_lock();
    if (state) {
        if (ram_block_discard_disabled_cnt) {
            rc = -EBUSY;
        } else {
            ram_block_coordinated_discard_required_cnt++;
        }
    } else {
        ram_block_coordinated_discard_required_cnt--;
    }
    ram_block_discard_disable_mutex_unlock();

    return rc;
}
/*
 * Return true if discards are currently disabled, either entirely or for
 * uncoordinated discards only.
 */
bool ram_block_discard_is_disabled(void)
{
    if (qatomic_read(&ram_block_discard_disabled_cnt)) {
        return true;
    }
    return qatomic_read(&ram_block_uncoordinated_discard_disabled_cnt) != 0;
}
/*
 * Return true if some user currently requires discards to work, whether
 * any discard or coordinated discards only.
 */
bool ram_block_discard_is_required(void)
{
    if (qatomic_read(&ram_block_discard_required_cnt)) {
        return true;
    }
    return qatomic_read(&ram_block_coordinated_discard_required_cnt) != 0;
}
/*
 * Return true if ram is compatible with CPR.  Do not exclude rom,
 * because the rom file could change in new QEMU.
 */
static bool ram_is_cpr_compatible(RAMBlock *rb)
{
    MemoryRegion *region = rb->mr;

    /* Blocks without a RAM memory region are treated as compatible. */
    if (region == NULL || !memory_region_is_ram(region)) {
        return true;
    }

    /* Ram device is remapped in new QEMU */
    if (memory_region_is_ram_device(region)) {
        return true;
    }

    /*
     * A file descriptor is passed to new QEMU and remapped, or its backing
     * file is reopened and mapped.  It must be shared to avoid COW.
     */
    return rb->fd >= 0 && qemu_ram_is_shared(rb);
}
/*
 * Add a blocker for each volatile ram block.  This function should only be
 * called after we know that the block is migratable.  Non-migratable blocks
 * are either re-created in new QEMU, or are handled specially, or are covered
 * by a device-level CPR blocker.
 *
 * Nothing is added for blocks that are CPR compatible.
 */
void ram_block_add_cpr_blocker(RAMBlock *rb, Error **errp)
{
    assert(qemu_ram_is_migratable(rb));

    if (!ram_is_cpr_compatible(rb)) {
        error_setg(&rb->cpr_blocker,
                   "Memory region %s is not compatible with CPR. share=on is "
                   "required for memory-backend objects, and aux-ram-share=on is "
                   "required.", memory_region_name(rb->mr));
        migrate_add_blocker_modes(&rb->cpr_blocker,
                                  BIT(MIG_MODE_CPR_TRANSFER), errp);
    }
}
/* Remove the CPR migration blocker stored in @rb. */
void ram_block_del_cpr_blocker(RAMBlock *rb)
{
    Error **blocker = &rb->cpr_blocker;

    migrate_del_blocker(blocker);
}