2003-05-13 00:25:15 +00:00
|
|
|
/*
|
2020-10-06 09:05:29 +02:00
|
|
|
* RAM allocation and memory access
|
2007-09-16 21:08:06 +00:00
|
|
|
*
|
2003-05-13 00:25:15 +00:00
|
|
|
* Copyright (c) 2003 Fabrice Bellard
|
|
|
|
|
*
|
|
|
|
|
* This library is free software; you can redistribute it and/or
|
|
|
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
|
|
|
* License as published by the Free Software Foundation; either
|
2020-10-23 12:44:24 +00:00
|
|
|
* version 2.1 of the License, or (at your option) any later version.
|
2003-05-13 00:25:15 +00:00
|
|
|
*
|
|
|
|
|
* This library is distributed in the hope that it will be useful,
|
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
|
* Lesser General Public License for more details.
|
|
|
|
|
*
|
|
|
|
|
* You should have received a copy of the GNU Lesser General Public
|
2009-07-16 20:47:01 +00:00
|
|
|
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
|
2003-05-13 00:25:15 +00:00
|
|
|
*/
|
2019-05-23 16:35:05 +02:00
|
|
|
|
2016-01-26 18:16:56 +00:00
|
|
|
#include "qemu/osdep.h"
|
2022-03-23 19:57:34 +04:00
|
|
|
#include "exec/page-vary.h"
|
include/qemu/osdep.h: Don't include qapi/error.h
Commit 57cb38b included qapi/error.h into qemu/osdep.h to get the
Error typedef. Since then, we've moved to include qemu/osdep.h
everywhere. Its file comment explains: "To avoid getting into
possible circular include dependencies, this file should not include
any other QEMU headers, with the exceptions of config-host.h,
compiler.h, os-posix.h and os-win32.h, all of which are doing a
similar job to this file and are under similar constraints."
qapi/error.h doesn't do a similar job, and it doesn't adhere to
similar constraints: it includes qapi-types.h. That's in excess of
100KiB of crap most .c files don't actually need.
Add the typedef to qemu/typedefs.h, and include that instead of
qapi/error.h. Include qapi/error.h in .c files that need it and don't
get it now. Include qapi-types.h in qom/object.h for uint16List.
Update scripts/clean-includes accordingly. Update it further to match
reality: replace config.h by config-target.h, add sysemu/os-posix.h,
sysemu/os-win32.h. Update the list of includes in the qemu/osdep.h
comment quoted above similarly.
This reduces the number of objects depending on qapi/error.h from "all
of them" to less than a third. Unfortunately, the number depending on
qapi-types.h shrinks only a little. More work is needed for that one.
Signed-off-by: Markus Armbruster <armbru@redhat.com>
[Fix compilation without the spice devel packages. - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-03-14 09:01:28 +01:00
|
|
|
#include "qapi/error.h"
|
2003-05-13 00:25:15 +00:00
|
|
|
|
2016-03-20 19:16:19 +02:00
|
|
|
#include "qemu/cutils.h"
|
2020-12-14 08:02:33 -06:00
|
|
|
#include "qemu/cacheflush.h"
|
2022-12-21 14:35:49 +01:00
|
|
|
#include "qemu/hbitmap.h"
|
2022-02-08 20:08:52 +00:00
|
|
|
#include "qemu/madvise.h"
|
2024-05-07 14:12:46 +02:00
|
|
|
#include "qemu/lockable.h"
|
2021-02-04 17:39:23 +01:00
|
|
|
|
|
|
|
|
#ifdef CONFIG_TCG
|
2025-01-24 00:03:40 +01:00
|
|
|
#include "accel/tcg/cpu-ops.h"
|
2025-04-24 22:24:10 +02:00
|
|
|
#include "accel/tcg/iommu.h"
|
2021-02-04 17:39:23 +01:00
|
|
|
#endif /* CONFIG_TCG */
|
|
|
|
|
|
2024-11-14 02:12:58 +01:00
|
|
|
#include "exec/cputlb.h"
|
2023-12-06 20:27:32 +01:00
|
|
|
#include "exec/page-protection.h"
|
2017-04-24 20:50:19 +02:00
|
|
|
#include "exec/target_page.h"
|
2024-11-13 22:46:13 +01:00
|
|
|
#include "exec/translation-block.h"
|
2025-11-27 08:37:58 +01:00
|
|
|
#include "hw/core/qdev.h"
|
2025-11-27 08:38:05 +01:00
|
|
|
#include "hw/core/qdev-properties.h"
|
2025-11-27 08:37:19 +01:00
|
|
|
#include "hw/core/boards.h"
|
2024-12-03 15:20:13 +01:00
|
|
|
#include "system/xen.h"
|
|
|
|
|
#include "system/kvm.h"
|
|
|
|
|
#include "system/tcg.h"
|
|
|
|
|
#include "system/qtest.h"
|
2025-09-30 09:08:54 +02:00
|
|
|
#include "system/physmem.h"
|
2025-09-29 17:12:12 +02:00
|
|
|
#include "system/ramblock.h"
|
2012-12-17 18:20:00 +01:00
|
|
|
#include "qemu/timer.h"
|
|
|
|
|
#include "qemu/config-file.h"
|
2013-09-02 16:57:02 +02:00
|
|
|
#include "qemu/error-report.h"
|
2019-04-17 21:17:56 +02:00
|
|
|
#include "qemu/qemu-print.h"
|
2021-12-15 19:24:21 +01:00
|
|
|
#include "qemu/log.h"
|
2022-02-26 18:07:23 +00:00
|
|
|
#include "qemu/memalign.h"
|
physmem: fd-based shared memory
Create MAP_SHARED RAMBlocks by mmap'ing a file descriptor rather than using
MAP_ANON, so the memory can be accessed in another process by passing and
mmap'ing the fd. This will allow CPR to support memory-backend-ram and
memory-backend-shm objects, provided the user creates them with share=on.
Use memfd_create if available because it has no constraints. If not, use
POSIX shm_open. However, allocation on the opened fd may fail if the shm
mount size is too small, even if the system has free memory, so for backwards
compatibility fall back to qemu_anon_ram_alloc/MAP_ANON on failure.
For backwards compatibility on Windows, always use MAP_ANON. share=on has
no purpose there, but the syntax is accepted, and must continue to work.
Lastly, quietly fall back to MAP_ANON if the system does not support
qemu_ram_alloc_from_fd.
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Link: https://lore.kernel.org/r/1736967650-129648-5-git-send-email-steven.sistare@oracle.com
Signed-off-by: Fabiano Rosas <farosas@suse.de>
2025-01-15 11:00:30 -08:00
|
|
|
#include "qemu/memfd.h"
|
2025-03-12 12:11:21 -07:00
|
|
|
#include "system/memory.h"
|
2025-12-29 23:31:08 +01:00
|
|
|
#include "system/memory_cached.h"
|
2025-03-12 12:49:38 -07:00
|
|
|
#include "system/ioport.h"
|
2024-12-03 15:20:13 +01:00
|
|
|
#include "system/dma.h"
|
|
|
|
|
#include "system/hostmem.h"
|
|
|
|
|
#include "system/hw_accel.h"
|
|
|
|
|
#include "system/xen-mapcache.h"
|
2024-07-05 09:40:10 +01:00
|
|
|
#include "trace.h"
|
2017-02-24 18:28:32 +00:00
|
|
|
|
2017-02-24 18:28:33 +00:00
|
|
|
#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
|
|
|
|
|
#include <linux/falloc.h>
|
|
|
|
|
#endif
|
|
|
|
|
|
2013-09-05 14:41:35 -04:00
|
|
|
#include "qemu/rcu_queue.h"
|
2015-06-18 18:47:22 +02:00
|
|
|
#include "qemu/main-loop.h"
|
2024-12-03 15:20:13 +01:00
|
|
|
#include "system/replay.h"
|
2012-04-09 16:50:52 +00:00
|
|
|
|
2025-11-27 09:41:14 +01:00
|
|
|
#include "system/ramblock.h"
|
2011-12-15 15:25:22 +02:00
|
|
|
|
2019-11-21 00:08:41 +00:00
|
|
|
#include "qemu/pmem.h"
|
|
|
|
|
|
2025-02-27 06:48:01 -08:00
|
|
|
#include "qapi/qapi-types-migration.h"
|
|
|
|
|
#include "migration/blocker.h"
|
2025-01-15 11:00:34 -08:00
|
|
|
#include "migration/cpr.h"
|
2025-02-27 06:48:01 -08:00
|
|
|
#include "migration/options.h"
|
2016-05-12 09:18:12 +05:30
|
|
|
#include "migration/vmstate.h"
|
|
|
|
|
|
2013-11-11 17:52:07 +02:00
|
|
|
#include "qemu/range.h"
|
2015-09-24 14:41:17 +03:00
|
|
|
#ifndef _WIN32
|
|
|
|
|
#include "qemu/mmap-alloc.h"
|
|
|
|
|
#endif
|
2013-11-11 17:52:07 +02:00
|
|
|
|
2017-05-12 12:17:41 +08:00
|
|
|
#include "monitor/monitor.h"
|
|
|
|
|
|
2020-04-29 16:50:09 +08:00
|
|
|
#ifdef CONFIG_LIBDAXCTL
|
|
|
|
|
#include <daxctl/libdaxctl.h>
|
|
|
|
|
#endif
|
|
|
|
|
|
2025-03-17 17:13:29 +01:00
|
|
|
#include "memory-internal.h"
|
|
|
|
|
|
2013-09-05 14:41:35 -04:00
|
|
|
/* ram_list is read under rcu_read_lock()/rcu_read_unlock(). Writes
|
|
|
|
|
* are protected by the ramlist lock.
|
|
|
|
|
*/
|
2015-01-21 13:45:24 +01:00
|
|
|
RAMList ram_list = { .blocks = QLIST_HEAD_INITIALIZER(ram_list.blocks) };
|
2011-07-26 14:26:14 +03:00
|
|
|
|
|
|
|
|
static MemoryRegion *system_memory;
|
2011-08-08 16:09:03 +03:00
|
|
|
static MemoryRegion *system_io;
|
2011-07-26 14:26:14 +03:00
|
|
|
|
2012-10-02 20:13:51 +02:00
|
|
|
AddressSpace address_space_io;
|
|
|
|
|
AddressSpace address_space_memory;
|
2012-10-02 18:49:28 +02:00
|
|
|
|
2013-05-26 21:55:37 +02:00
|
|
|
static MemoryRegion io_mem_unassigned;
|
2012-02-10 17:00:01 +02:00
|
|
|
|
2013-05-21 12:07:21 +02:00
|
|
|
typedef struct PhysPageEntry PhysPageEntry;

/*
 * One slot of the physical page map.  The two bit-fields share a single
 * 32-bit word: 6 bits of skip count and 26 bits of index.
 */
struct PhysPageEntry {
    /* How many bits skip to next level (in units of L2_SIZE). 0 for a leaf. */
    uint32_t skip : 6;
    /* index into phys_sections (!skip) or phys_map_nodes (skip) */
    uint32_t ptr : 26;
};

/*
 * "No node" marker for PhysPageEntry.ptr: all 26 bits of the field set.
 * The shift count must stay equal to the width of the skip field above.
 */
#define PHYS_MAP_NODE_NIL (((uint32_t)~0) >> 6)
|
|
|
|
|
|
2013-11-07 17:14:36 +01:00
|
|
|
/* Size of the L2 (and L3, etc) page tables. */
|
2013-11-07 17:14:37 +01:00
|
|
|
#define ADDR_SPACE_BITS 64
|
2013-11-07 17:14:36 +01:00
|
|
|
|
2013-11-13 20:13:03 +02:00
|
|
|
#define P_L2_BITS 9
|
2013-11-07 17:14:36 +01:00
|
|
|
#define P_L2_SIZE (1 << P_L2_BITS)
|
|
|
|
|
|
|
|
|
|
#define P_L2_LEVELS (((ADDR_SPACE_BITS - TARGET_PAGE_BITS - 1) / P_L2_BITS) + 1)
|
|
|
|
|
|
|
|
|
|
typedef PhysPageEntry Node[P_L2_SIZE];
|
2013-05-29 12:28:21 +02:00
|
|
|
|
2013-12-01 14:02:23 +02:00
|
|
|
typedef struct PhysPageMap {
|
2015-01-21 12:09:14 +01:00
|
|
|
struct rcu_head rcu;
|
|
|
|
|
|
2013-12-01 14:02:23 +02:00
|
|
|
unsigned sections_nb;
|
|
|
|
|
unsigned sections_nb_alloc;
|
|
|
|
|
unsigned nodes_nb;
|
|
|
|
|
unsigned nodes_nb_alloc;
|
|
|
|
|
Node *nodes;
|
|
|
|
|
MemoryRegionSection *sections;
|
|
|
|
|
} PhysPageMap;
|
|
|
|
|
|
2013-05-21 12:07:21 +02:00
|
|
|
struct AddressSpaceDispatch {
|
2016-03-01 14:18:24 +08:00
|
|
|
MemoryRegionSection *mru_section;
|
2013-05-21 12:07:21 +02:00
|
|
|
/* This is a multi-level map on the physical address space.
|
|
|
|
|
* The bottom level has pointers to MemoryRegionSections.
|
|
|
|
|
*/
|
|
|
|
|
PhysPageEntry phys_map;
|
2013-12-01 14:02:23 +02:00
|
|
|
PhysPageMap map;
|
2013-05-21 12:07:21 +02:00
|
|
|
};
|
|
|
|
|
|
2013-05-26 21:46:51 +02:00
|
|
|
#define SUBPAGE_IDX(addr) ((addr) & ~TARGET_PAGE_MASK)
|
|
|
|
|
typedef struct subpage_t {
|
|
|
|
|
MemoryRegion iomem;
|
2017-09-21 18:50:58 +10:00
|
|
|
FlatView *fv;
|
2013-05-26 21:46:51 +02:00
|
|
|
hwaddr base;
|
2016-10-24 16:26:49 +01:00
|
|
|
uint16_t sub_section[];
|
2013-05-26 21:46:51 +02:00
|
|
|
} subpage_t;
|
|
|
|
|
|
2013-05-29 11:09:17 +02:00
|
|
|
#define PHYS_SECTION_UNASSIGNED 0
|
2012-02-12 18:32:55 +02:00
|
|
|
|
2008-06-08 01:09:01 +00:00
|
|
|
static void io_mem_init(void);
|
2011-07-26 14:26:14 +03:00
|
|
|
static void memory_map_init(void);
|
2018-02-06 18:37:39 +01:00
|
|
|
static void tcg_log_global_after_sync(MemoryListener *listener);
|
2013-12-17 13:06:51 +10:00
|
|
|
static void tcg_commit(MemoryListener *listener);
|
2025-03-27 07:52:10 -07:00
|
|
|
static bool ram_is_cpr_compatible(RAMBlock *rb);
|
2008-06-08 01:09:01 +00:00
|
|
|
|
2015-10-01 15:29:50 +01:00
|
|
|
/**
|
|
|
|
|
* CPUAddressSpace: all the information a CPU needs about an AddressSpace
|
|
|
|
|
* @cpu: the CPU whose AddressSpace this is
|
|
|
|
|
* @as: the AddressSpace itself
|
|
|
|
|
* @tcg_as_listener: listener for tracking changes to the AddressSpace
|
|
|
|
|
*/
|
2024-05-02 17:14:42 +02:00
|
|
|
typedef struct CPUAddressSpace {
|
2015-10-01 15:29:50 +01:00
|
|
|
CPUState *cpu;
|
|
|
|
|
AddressSpace *as;
|
|
|
|
|
MemoryListener tcg_as_listener;
|
2024-05-02 17:14:42 +02:00
|
|
|
} CPUAddressSpace;
|
2015-10-01 15:29:50 +01:00
|
|
|
|
2017-04-21 11:16:25 +02:00
|
|
|
struct DirtyBitmapSnapshot {
|
|
|
|
|
ram_addr_t start;
|
|
|
|
|
ram_addr_t end;
|
|
|
|
|
unsigned long dirty[];
|
|
|
|
|
};
|
|
|
|
|
|
2013-12-01 14:02:23 +02:00
|
|
|
/*
 * Make sure @map has room for @nodes more nodes beyond those already in
 * use, growing map->nodes when necessary.
 *
 * The capacity chosen by the latest growth is kept in a function-local
 * static (shared by every map) and used as the minimum size for the
 * next growth.
 */
static void phys_map_node_reserve(PhysPageMap *map, unsigned nodes)
{
    static unsigned alloc_hint = 16;

    if (map->nodes_nb + nodes > map->nodes_nb_alloc) {
        map->nodes_nb_alloc = MAX(alloc_hint, map->nodes_nb + nodes);
        map->nodes = g_renew(Node, map->nodes, map->nodes_nb_alloc);
        alloc_hint = map->nodes_nb_alloc;
    }
}
|
|
|
|
|
|
2015-05-21 15:12:29 +02:00
|
|
|
/*
 * Take the next unused node of @map and initialise all of its entries.
 *
 * @leaf: true to fill the node with leaf entries pointing at
 *        PHYS_SECTION_UNASSIGNED, false to fill it with one-level skip
 *        entries pointing at PHYS_MAP_NODE_NIL.
 *
 * Space must have been reserved beforehand; running out of reserved
 * nodes, or reaching the NIL index, trips an assertion.
 *
 * Returns the index of the new node in map->nodes.
 */
static uint32_t phys_map_node_alloc(PhysPageMap *map, bool leaf)
{
    unsigned i;
    uint32_t ret;
    PhysPageEntry e;
    PhysPageEntry *p;

    ret = map->nodes_nb++;
    p = map->nodes[ret];
    assert(ret != PHYS_MAP_NODE_NIL);
    assert(ret != map->nodes_nb_alloc);

    e.skip = leaf ? 0 : 1;
    e.ptr = leaf ? PHYS_SECTION_UNASSIGNED : PHYS_MAP_NODE_NIL;
    for (i = 0; i < P_L2_SIZE; ++i) {
        memcpy(&p[i], &e, sizeof(e));
    }
    return ret;
}
|
|
|
|
|
|
2013-12-01 14:02:23 +02:00
|
|
|
static void phys_page_set_level(PhysPageMap *map, PhysPageEntry *lp,
|
2019-03-21 16:25:50 +08:00
|
|
|
hwaddr *index, uint64_t *nb, uint16_t leaf,
|
2012-02-13 20:21:20 +02:00
|
|
|
int level)
|
2012-02-13 20:12:05 +02:00
|
|
|
{
|
|
|
|
|
PhysPageEntry *p;
|
2013-11-07 17:14:36 +01:00
|
|
|
hwaddr step = (hwaddr)1 << (level * P_L2_BITS);
|
2005-07-24 12:55:09 +00:00
|
|
|
|
2013-11-11 14:42:43 +02:00
|
|
|
if (lp->skip && lp->ptr == PHYS_MAP_NODE_NIL) {
|
2015-05-21 15:12:29 +02:00
|
|
|
lp->ptr = phys_map_node_alloc(map, level == 0);
|
2004-05-21 14:52:29 +00:00
|
|
|
}
|
2015-05-21 15:12:29 +02:00
|
|
|
p = map->nodes[lp->ptr];
|
2013-11-07 17:14:36 +01:00
|
|
|
lp = &p[(*index >> (level * P_L2_BITS)) & (P_L2_SIZE - 1)];
|
2012-02-13 20:12:05 +02:00
|
|
|
|
2013-11-07 17:14:36 +01:00
|
|
|
while (*nb && lp < &p[P_L2_SIZE]) {
|
2012-02-13 20:45:32 +02:00
|
|
|
if ((*index & (step - 1)) == 0 && *nb >= step) {
|
2013-11-11 14:42:43 +02:00
|
|
|
lp->skip = 0;
|
2012-02-13 20:25:31 +02:00
|
|
|
lp->ptr = leaf;
|
2012-02-13 20:45:32 +02:00
|
|
|
*index += step;
|
|
|
|
|
*nb -= step;
|
2012-02-13 20:21:20 +02:00
|
|
|
} else {
|
2013-12-01 14:02:23 +02:00
|
|
|
phys_page_set_level(map, lp, index, nb, leaf, level - 1);
|
2012-02-13 20:21:20 +02:00
|
|
|
}
|
|
|
|
|
++lp;
|
2012-02-13 20:12:05 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2012-10-03 16:22:53 +02:00
|
|
|
static void phys_page_set(AddressSpaceDispatch *d,
|
2019-03-21 16:25:50 +08:00
|
|
|
hwaddr index, uint64_t nb,
|
2012-02-13 20:21:20 +02:00
|
|
|
uint16_t leaf)
|
2012-02-13 20:12:05 +02:00
|
|
|
{
|
2012-02-13 20:21:20 +02:00
|
|
|
/* Wildly overreserve - it doesn't matter much. */
|
2013-12-01 14:02:23 +02:00
|
|
|
phys_map_node_reserve(&d->map, 3 * P_L2_LEVELS);
|
2010-03-10 15:53:37 -08:00
|
|
|
|
2013-12-01 14:02:23 +02:00
|
|
|
phys_page_set_level(&d->map, &d->phys_map, &index, &nb, leaf, P_L2_LEVELS - 1);
|
2004-05-21 14:52:29 +00:00
|
|
|
}
|
|
|
|
|
|
2013-11-11 17:52:07 +02:00
|
|
|
/* Compact a non leaf page entry. Simply detect that the entry has a single child,
|
|
|
|
|
* and update our entry so we can skip it and go directly to the destination.
|
|
|
|
|
*/
|
2016-09-28 16:37:20 +04:00
|
|
|
static void phys_page_compact(PhysPageEntry *lp, Node *nodes)
|
2013-11-11 17:52:07 +02:00
|
|
|
{
|
|
|
|
|
unsigned valid_ptr = P_L2_SIZE;
|
|
|
|
|
int valid = 0;
|
|
|
|
|
PhysPageEntry *p;
|
|
|
|
|
int i;
|
|
|
|
|
|
|
|
|
|
if (lp->ptr == PHYS_MAP_NODE_NIL) {
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
p = nodes[lp->ptr];
|
|
|
|
|
for (i = 0; i < P_L2_SIZE; i++) {
|
|
|
|
|
if (p[i].ptr == PHYS_MAP_NODE_NIL) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
valid_ptr = i;
|
|
|
|
|
valid++;
|
|
|
|
|
if (p[i].skip) {
|
2016-09-28 16:37:20 +04:00
|
|
|
phys_page_compact(&p[i], nodes);
|
2013-11-11 17:52:07 +02:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* We can only compress if there's only one child. */
|
|
|
|
|
if (valid != 1) {
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
assert(valid_ptr < P_L2_SIZE);
|
|
|
|
|
|
|
|
|
|
/* Don't compress if it won't fit in the # of bits we have. */
|
2019-03-21 16:25:55 +08:00
|
|
|
if (P_L2_LEVELS >= (1 << 6) &&
|
|
|
|
|
lp->skip + p[valid_ptr].skip >= (1 << 6)) {
|
2013-11-11 17:52:07 +02:00
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
lp->ptr = p[valid_ptr].ptr;
|
|
|
|
|
if (!p[valid_ptr].skip) {
|
|
|
|
|
/* If our only child is a leaf, make this a leaf. */
|
|
|
|
|
/* By design, we should have made this node a leaf to begin with so we
|
|
|
|
|
* should never reach here.
|
|
|
|
|
* But since it's so simple to handle this, let's do it just in case we
|
|
|
|
|
* change this rule.
|
|
|
|
|
*/
|
|
|
|
|
lp->skip = 0;
|
|
|
|
|
} else {
|
|
|
|
|
lp->skip += p[valid_ptr].skip;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2017-09-21 18:51:00 +10:00
|
|
|
/*
 * Compact the page map of @d, starting from its root entry.  Nothing is
 * done when the root entry is already a leaf.
 */
void address_space_dispatch_compact(AddressSpaceDispatch *d)
{
    if (d->phys_map.skip) {
        phys_page_compact(&d->phys_map, d->map.nodes);
    }
}
|
|
|
|
|
|
2016-03-01 14:18:23 +08:00
|
|
|
static inline bool section_covers_addr(const MemoryRegionSection *section,
|
|
|
|
|
hwaddr addr)
|
|
|
|
|
{
|
|
|
|
|
/* Memory topology clips a memory region to [0, 2^64); size.hi > 0 means
|
|
|
|
|
* the section must cover the entire address space.
|
|
|
|
|
*/
|
2016-06-29 15:48:03 -07:00
|
|
|
return int128_gethi(section->size) ||
|
2016-03-01 14:18:23 +08:00
|
|
|
range_covers_byte(section->offset_within_address_space,
|
2016-06-29 15:48:03 -07:00
|
|
|
int128_getlo(section->size), addr);
|
2016-03-01 14:18:23 +08:00
|
|
|
}
|
|
|
|
|
|
2017-05-15 16:50:57 +08:00
|
|
|
/*
 * Look up the section that maps @addr in the page map of @d.
 *
 * The map is walked from the root, each entry's skip count telling how
 * many levels to descend.  The walk stops at a leaf or when the levels
 * are exhausted.
 *
 * Returns the section found if it covers @addr; otherwise (including
 * when a NIL node pointer is met on the way down) the
 * PHYS_SECTION_UNASSIGNED section.
 */
static MemoryRegionSection *phys_page_find(AddressSpaceDispatch *d, hwaddr addr)
{
    PhysPageEntry lp = d->phys_map, *p;
    Node *nodes = d->map.nodes;
    MemoryRegionSection *sections = d->map.sections;
    hwaddr index = addr >> TARGET_PAGE_BITS;
    int i;

    for (i = P_L2_LEVELS; lp.skip && (i -= lp.skip) >= 0;) {
        if (lp.ptr == PHYS_MAP_NODE_NIL) {
            return &sections[PHYS_SECTION_UNASSIGNED];
        }
        p = nodes[lp.ptr];
        lp = p[(index >> (i * P_L2_BITS)) & (P_L2_SIZE - 1)];
    }

    if (section_covers_addr(&sections[lp.ptr], addr)) {
        return &sections[lp.ptr];
    } else {
        return &sections[PHYS_SECTION_UNASSIGNED];
    }
}
|
|
|
|
|
|
2015-01-21 12:09:14 +01:00
|
|
|
/* Called from RCU critical section */
|
2013-06-02 15:27:39 +02:00
|
|
|
static MemoryRegionSection *address_space_lookup_region(AddressSpaceDispatch *d,
|
2013-05-26 21:46:51 +02:00
|
|
|
hwaddr addr,
|
|
|
|
|
bool resolve_subpage)
|
2013-05-06 16:48:02 +02:00
|
|
|
{
|
2020-09-23 11:56:46 +01:00
|
|
|
MemoryRegionSection *section = qatomic_read(&d->mru_section);
|
2013-05-26 21:46:51 +02:00
|
|
|
subpage_t *subpage;
|
|
|
|
|
|
2017-11-15 15:11:03 +01:00
|
|
|
if (!section || section == &d->map.sections[PHYS_SECTION_UNASSIGNED] ||
|
|
|
|
|
!section_covers_addr(section, addr)) {
|
2017-05-15 16:50:57 +08:00
|
|
|
section = phys_page_find(d, addr);
|
2020-09-23 11:56:46 +01:00
|
|
|
qatomic_set(&d->mru_section, section);
|
2016-03-01 14:18:24 +08:00
|
|
|
}
|
2013-05-26 21:46:51 +02:00
|
|
|
if (resolve_subpage && section->mr->subpage) {
|
|
|
|
|
subpage = container_of(section->mr, subpage_t, iomem);
|
2013-12-01 14:02:23 +02:00
|
|
|
section = &d->map.sections[subpage->sub_section[SUBPAGE_IDX(addr)]];
|
2013-05-26 21:46:51 +02:00
|
|
|
}
|
|
|
|
|
return section;
|
2013-05-06 16:48:02 +02:00
|
|
|
}
|
|
|
|
|
|
2015-01-21 12:09:14 +01:00
|
|
|
/* Called from RCU critical section */
|
2013-05-26 21:46:51 +02:00
|
|
|
static MemoryRegionSection *
|
2013-06-02 15:27:39 +02:00
|
|
|
address_space_translate_internal(AddressSpaceDispatch *d, hwaddr addr, hwaddr *xlat,
|
2013-05-26 21:46:51 +02:00
|
|
|
hwaddr *plen, bool resolve_subpage)
|
2013-05-24 12:59:37 +02:00
|
|
|
{
|
|
|
|
|
MemoryRegionSection *section;
|
2015-06-17 10:40:27 +02:00
|
|
|
MemoryRegion *mr;
|
2014-02-07 15:47:46 +01:00
|
|
|
Int128 diff;
|
2013-05-24 12:59:37 +02:00
|
|
|
|
2013-06-02 15:27:39 +02:00
|
|
|
section = address_space_lookup_region(d, addr, resolve_subpage);
|
2013-05-24 12:59:37 +02:00
|
|
|
/* Compute offset within MemoryRegionSection */
|
|
|
|
|
addr -= section->offset_within_address_space;
|
|
|
|
|
|
|
|
|
|
/* Compute offset within MemoryRegion */
|
|
|
|
|
*xlat = addr + section->offset_within_region;
|
|
|
|
|
|
2015-06-17 10:40:27 +02:00
|
|
|
mr = section->mr;
|
exec: skip MMIO regions correctly in cpu_physical_memory_write_rom_internal
Loading the BIOS in the mac99 machine is interesting, because there is a
PROM in the middle of the BIOS region (from 16K to 32K). Before memory
region accesses were clamped, when QEMU was asked to load a BIOS from
0xfff00000 to 0xffffffff it would put even those 16K from the BIOS file
into the region. This is weird because those 16K were not actually
visible between 0xfff04000 and 0xfff07fff. However, it worked.
After clamping was added, this also worked. In this case, the
cpu_physical_memory_write_rom_internal function split the write in
three parts: the first 16K were copied, the PROM area (second 16K) were
ignored, then the rest was copied.
Problems then started with commit 965eb2f (exec: do not clamp accesses
to MMIO regions, 2015-06-17). Clamping accesses is not done for MMIO
regions because they can overlap wildly, and MMIO registers can be
expected to perform full-width accesses based only on their address
(with no respect for adjacent registers that could decode to completely
different MemoryRegions). However, this lack of clamping also applied
to the PROM area! cpu_physical_memory_write_rom_internal thus failed
to copy the third range above, i.e. only copied the first 16K of the BIOS.
In effect, address_space_translate is expecting _something else_ to do
the clamping for MMIO regions if the incoming length is large. This
"something else" is memory_access_size in the case of address_space_rw,
so use the same logic in cpu_physical_memory_write_rom_internal.
Reported-by: Alexander Graf <agraf@redhat.com>
Reviewed-by: Laurent Vivier <lvivier@redhat.com>
Tested-by: Laurent Vivier <lvivier@redhat.com>
Fixes: 965eb2f
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2015-07-04 00:24:51 +02:00
|
|
|
|
|
|
|
|
/* MMIO registers can be expected to perform full-width accesses based only
|
|
|
|
|
* on their address, without considering adjacent registers that could
|
|
|
|
|
* decode to completely different MemoryRegions. When such registers
|
|
|
|
|
* exist (e.g. I/O ports 0xcf8 and 0xcf9 on most PC chipsets), MMIO
|
|
|
|
|
* regions overlap wildly. For this reason we cannot clamp the accesses
|
|
|
|
|
* here.
|
|
|
|
|
*
|
|
|
|
|
* If the length is small (as is the case for address_space_ldl/stl),
|
|
|
|
|
* everything works fine. If the incoming length is large, however,
|
|
|
|
|
* the caller really has to do the clamping through memory_access_size.
|
|
|
|
|
*/
|
2015-06-17 10:40:27 +02:00
|
|
|
if (memory_region_is_ram(mr)) {
|
2015-06-17 10:36:54 +02:00
|
|
|
diff = int128_sub(section->size, int128_make64(addr));
|
2015-06-17 10:40:27 +02:00
|
|
|
*plen = int128_get64(int128_min(diff, int128_make64(*plen)));
|
|
|
|
|
}
|
2013-05-24 12:59:37 +02:00
|
|
|
return section;
|
|
|
|
|
}
|
2013-05-26 21:46:51 +02:00
|
|
|
|
2018-03-03 17:24:04 +01:00
|
|
|
/**
|
|
|
|
|
* address_space_translate_iommu - translate an address through an IOMMU
|
|
|
|
|
* memory region and then through the target address space.
|
|
|
|
|
*
|
|
|
|
|
* @iommu_mr: the IOMMU memory region that we start the translation from
|
|
|
|
|
* @addr: the address to be translated through the MMU
|
|
|
|
|
* @xlat: the translated address offset within the destination memory region.
|
|
|
|
|
* It cannot be %NULL.
|
|
|
|
|
* @plen_out: valid read/write length of the translated address. It
|
|
|
|
|
* cannot be %NULL.
|
|
|
|
|
* @page_mask_out: page mask for the translated address. This
|
|
|
|
|
* should only be meaningful for IOMMU translated
|
|
|
|
|
* addresses, since there may be huge pages that this bit
|
|
|
|
|
* would tell. It can be %NULL if we don't care about it.
|
|
|
|
|
* @is_write: whether the translation operation is for write
|
|
|
|
|
* @is_mmio: whether this can be MMIO, set true if it can
|
|
|
|
|
* @target_as: the address space targeted by the IOMMU
|
2018-05-31 14:50:53 +01:00
|
|
|
* @attrs: transaction attributes
|
2018-03-03 17:24:04 +01:00
|
|
|
*
|
|
|
|
|
* This function is called from RCU critical section. It is the common
|
|
|
|
|
* part of flatview_do_translate and address_space_translate_cached.
|
|
|
|
|
*/
|
|
|
|
|
static MemoryRegionSection address_space_translate_iommu(IOMMUMemoryRegion *iommu_mr,
|
|
|
|
|
hwaddr *xlat,
|
|
|
|
|
hwaddr *plen_out,
|
|
|
|
|
hwaddr *page_mask_out,
|
|
|
|
|
bool is_write,
|
|
|
|
|
bool is_mmio,
|
2018-05-31 14:50:53 +01:00
|
|
|
AddressSpace **target_as,
|
|
|
|
|
MemTxAttrs attrs)
|
2018-03-03 17:24:04 +01:00
|
|
|
{
|
|
|
|
|
MemoryRegionSection *section;
|
|
|
|
|
hwaddr page_mask = (hwaddr)-1;
|
|
|
|
|
|
|
|
|
|
do {
|
|
|
|
|
hwaddr addr = *xlat;
|
|
|
|
|
IOMMUMemoryRegionClass *imrc = memory_region_get_iommu_class_nocheck(iommu_mr);
|
2018-06-15 14:57:16 +01:00
|
|
|
int iommu_idx = 0;
|
|
|
|
|
IOMMUTLBEntry iotlb;
|
|
|
|
|
|
|
|
|
|
if (imrc->attrs_to_index) {
|
|
|
|
|
iommu_idx = imrc->attrs_to_index(iommu_mr, attrs);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
iotlb = imrc->translate(iommu_mr, addr, is_write ?
|
|
|
|
|
IOMMU_WO : IOMMU_RO, iommu_idx);
|
2018-03-03 17:24:04 +01:00
|
|
|
|
|
|
|
|
if (!(iotlb.perm & (1 << is_write))) {
|
|
|
|
|
goto unassigned;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
|
|
|
|
|
| (addr & iotlb.addr_mask));
|
|
|
|
|
page_mask &= iotlb.addr_mask;
|
|
|
|
|
*plen_out = MIN(*plen_out, (addr | iotlb.addr_mask) - addr + 1);
|
|
|
|
|
*target_as = iotlb.target_as;
|
|
|
|
|
|
|
|
|
|
section = address_space_translate_internal(
|
|
|
|
|
address_space_to_dispatch(iotlb.target_as), addr, xlat,
|
|
|
|
|
plen_out, is_mmio);
|
|
|
|
|
|
|
|
|
|
iommu_mr = memory_region_get_iommu(section->mr);
|
|
|
|
|
} while (unlikely(iommu_mr));
|
|
|
|
|
|
|
|
|
|
if (page_mask_out) {
|
|
|
|
|
*page_mask_out = page_mask;
|
|
|
|
|
}
|
|
|
|
|
return *section;
|
|
|
|
|
|
|
|
|
|
unassigned:
|
|
|
|
|
return (MemoryRegionSection) { .mr = &io_mem_unassigned };
|
|
|
|
|
}
|
|
|
|
|
|
exec: add page_mask for flatview_do_translate
The function is originally used for flatview_space_translate() and what
we care about most is (xlat, plen) range. However for iotlb requests, we
don't really care about "plen", but the size of the page that "xlat" is
located on. While, plen cannot really contain this information.
A simple example to show why "plen" is not good for IOTLB translations:
E.g., for huge pages, it is possible that guest mapped 1G huge page on
device side that used this GPA range:
0x100000000 - 0x13fffffff
Then let's say we want to translate one IOVA that finally mapped to GPA
0x13ffffe00 (which is located on this 1G huge page). Then here we'll
get:
(xlat, plen) = (0x13fffe00, 0x200)
So the IOTLB would be only covering a very small range since from
"plen" (which is 0x200 bytes) we cannot tell the size of the page.
Actually we can really know that this is a huge page - we just throw the
information away in flatview_do_translate().
This patch introduced "page_mask" optional parameter to capture that
page mask info. Also, I made "plen" an optional parameter as well, with
some comments for the whole function.
No functional change yet.
Signed-off-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
Message-Id: <20171010094247.10173-2-maxime.coquelin@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2017-10-10 11:42:45 +02:00
|
|
|
/**
|
|
|
|
|
* flatview_do_translate - translate an address in FlatView
|
|
|
|
|
*
|
|
|
|
|
* @fv: the flat view that we want to translate on
|
|
|
|
|
* @addr: the address to be translated in above address space
|
|
|
|
|
* @xlat: the translated address offset within memory region. It
|
|
|
|
|
* cannot be @NULL.
|
|
|
|
|
* @plen_out: valid read/write length of the translated address. It
|
|
|
|
|
* can be @NULL when we don't care about it.
|
|
|
|
|
* @page_mask_out: page mask for the translated address. This
|
|
|
|
|
* should only be meaningful for IOMMU translated
|
|
|
|
|
* addresses, since there may be huge pages that this bit
|
|
|
|
|
* would tell. It can be @NULL if we don't care about it.
|
|
|
|
|
* @is_write: whether the translation operation is for write
|
|
|
|
|
* @is_mmio: whether this can be MMIO, set true if it can
|
2018-04-17 11:39:35 +02:00
|
|
|
* @target_as: the address space targeted by the IOMMU
|
2018-05-31 14:50:53 +01:00
|
|
|
* @attrs: memory transaction attributes
|
exec: add page_mask for flatview_do_translate
The function is originally used for flatview_space_translate() and what
we care about most is (xlat, plen) range. However for iotlb requests, we
don't really care about "plen", but the size of the page that "xlat" is
located on. While, plen cannot really contain this information.
A simple example to show why "plen" is not good for IOTLB translations:
E.g., for huge pages, it is possible that guest mapped 1G huge page on
device side that used this GPA range:
0x100000000 - 0x13fffffff
Then let's say we want to translate one IOVA that finally mapped to GPA
0x13ffffe00 (which is located on this 1G huge page). Then here we'll
get:
(xlat, plen) = (0x13fffe00, 0x200)
So the IOTLB would be only covering a very small range since from
"plen" (which is 0x200 bytes) we cannot tell the size of the page.
Actually we can really know that this is a huge page - we just throw the
information away in flatview_do_translate().
This patch introduced "page_mask" optional parameter to capture that
page mask info. Also, I made "plen" an optional parameter as well, with
some comments for the whole function.
No functional change yet.
Signed-off-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
Message-Id: <20171010094247.10173-2-maxime.coquelin@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2017-10-10 11:42:45 +02:00
|
|
|
*
|
|
|
|
|
* This function is called from RCU critical section
|
|
|
|
|
*/
|
2017-09-21 18:50:58 +10:00
|
|
|
static MemoryRegionSection flatview_do_translate(FlatView *fv,
|
|
|
|
|
hwaddr addr,
|
|
|
|
|
hwaddr *xlat,
|
exec: add page_mask for flatview_do_translate
The function is originally used for flatview_space_translate() and what
we care about most is (xlat, plen) range. However for iotlb requests, we
don't really care about "plen", but the size of the page that "xlat" is
located on. While, plen cannot really contain this information.
A simple example to show why "plen" is not good for IOTLB translations:
E.g., for huge pages, it is possible that guest mapped 1G huge page on
device side that used this GPA range:
0x100000000 - 0x13fffffff
Then let's say we want to translate one IOVA that finally mapped to GPA
0x13ffffe00 (which is located on this 1G huge page). Then here we'll
get:
(xlat, plen) = (0x13fffe00, 0x200)
So the IOTLB would be only covering a very small range since from
"plen" (which is 0x200 bytes) we cannot tell the size of the page.
Actually we can really know that this is a huge page - we just throw the
information away in flatview_do_translate().
This patch introduced "page_mask" optional parameter to capture that
page mask info. Also, I made "plen" an optional parameter as well, with
some comments for the whole function.
No functional change yet.
Signed-off-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
Message-Id: <20171010094247.10173-2-maxime.coquelin@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2017-10-10 11:42:45 +02:00
|
|
|
hwaddr *plen_out,
|
|
|
|
|
hwaddr *page_mask_out,
|
2017-09-21 18:50:58 +10:00
|
|
|
bool is_write,
|
|
|
|
|
bool is_mmio,
|
2018-05-31 14:50:53 +01:00
|
|
|
AddressSpace **target_as,
|
|
|
|
|
MemTxAttrs attrs)
|
2016-12-30 18:09:13 +08:00
|
|
|
{
|
|
|
|
|
MemoryRegionSection *section;
|
2017-07-11 13:56:19 +10:00
|
|
|
IOMMUMemoryRegion *iommu_mr;
|
exec: add page_mask for flatview_do_translate
The function is originally used for flatview_space_translate() and what
we care about most is (xlat, plen) range. However for iotlb requests, we
don't really care about "plen", but the size of the page that "xlat" is
located on. While, plen cannot really contain this information.
A simple example to show why "plen" is not good for IOTLB translations:
E.g., for huge pages, it is possible that guest mapped 1G huge page on
device side that used this GPA range:
0x100000000 - 0x13fffffff
Then let's say we want to translate one IOVA that finally mapped to GPA
0x13ffffe00 (which is located on this 1G huge page). Then here we'll
get:
(xlat, plen) = (0x13fffe00, 0x200)
So the IOTLB would be only covering a very small range since from
"plen" (which is 0x200 bytes) we cannot tell the size of the page.
Actually we can really know that this is a huge page - we just throw the
information away in flatview_do_translate().
This patch introduced "page_mask" optional parameter to capture that
page mask info. Also, I made "plen" an optional parameter as well, with
some comments for the whole function.
No functional change yet.
Signed-off-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
Message-Id: <20171010094247.10173-2-maxime.coquelin@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2017-10-10 11:42:45 +02:00
|
|
|
hwaddr plen = (hwaddr)(-1);
|
|
|
|
|
|
2018-04-17 11:39:35 +02:00
|
|
|
if (!plen_out) {
|
|
|
|
|
plen_out = &plen;
|
exec: add page_mask for flatview_do_translate
The function is originally used for flatview_space_translate() and what
we care about most is (xlat, plen) range. However for iotlb requests, we
don't really care about "plen", but the size of the page that "xlat" is
located on. While, plen cannot really contain this information.
A simple example to show why "plen" is not good for IOTLB translations:
E.g., for huge pages, it is possible that guest mapped 1G huge page on
device side that used this GPA range:
0x100000000 - 0x13fffffff
Then let's say we want to translate one IOVA that finally mapped to GPA
0x13ffffe00 (which is located on this 1G huge page). Then here we'll
get:
(xlat, plen) = (0x13fffe00, 0x200)
So the IOTLB would be only covering a very small range since from
"plen" (which is 0x200 bytes) we cannot tell the size of the page.
Actually we can really know that this is a huge page - we just throw the
information away in flatview_do_translate().
This patch introduced "page_mask" optional parameter to capture that
page mask info. Also, I made "plen" an optional parameter as well, with
some comments for the whole function.
No functional change yet.
Signed-off-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
Message-Id: <20171010094247.10173-2-maxime.coquelin@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2017-10-10 11:42:45 +02:00
|
|
|
}
|
2016-12-30 18:09:13 +08:00
|
|
|
|
2018-03-03 17:24:04 +01:00
|
|
|
section = address_space_translate_internal(
|
|
|
|
|
flatview_to_dispatch(fv), addr, xlat,
|
|
|
|
|
plen_out, is_mmio);
|
2016-12-30 18:09:13 +08:00
|
|
|
|
2018-03-03 17:24:04 +01:00
|
|
|
iommu_mr = memory_region_get_iommu(section->mr);
|
|
|
|
|
if (unlikely(iommu_mr)) {
|
|
|
|
|
return address_space_translate_iommu(iommu_mr, xlat,
|
|
|
|
|
plen_out, page_mask_out,
|
|
|
|
|
is_write, is_mmio,
|
2018-05-31 14:50:53 +01:00
|
|
|
target_as, attrs);
|
2016-12-30 18:09:13 +08:00
|
|
|
}
|
exec: add page_mask for flatview_do_translate
The function is originally used for flatview_space_translate() and what
we care about most is (xlat, plen) range. However for iotlb requests, we
don't really care about "plen", but the size of the page that "xlat" is
located on. While, plen cannot really contain this information.
A simple example to show why "plen" is not good for IOTLB translations:
E.g., for huge pages, it is possible that guest mapped 1G huge page on
device side that used this GPA range:
0x100000000 - 0x13fffffff
Then let's say we want to translate one IOVA that finally mapped to GPA
0x13ffffe00 (which is located on this 1G huge page). Then here we'll
get:
(xlat, plen) = (0x13fffe00, 0x200)
So the IOTLB would be only covering a very small range since from
"plen" (which is 0x200 bytes) we cannot tell the size of the page.
Actually we can really know that this is a huge page - we just throw the
information away in flatview_do_translate().
This patch introduced "page_mask" optional parameter to capture that
page mask info. Also, I made "plen" an optional parameter as well, with
some comments for the whole function.
No functional change yet.
Signed-off-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
Message-Id: <20171010094247.10173-2-maxime.coquelin@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2017-10-10 11:42:45 +02:00
|
|
|
if (page_mask_out) {
|
2018-03-03 17:24:04 +01:00
|
|
|
/* Not behind an IOMMU, use default page size. */
|
|
|
|
|
*page_mask_out = ~TARGET_PAGE_MASK;
|
exec: add page_mask for flatview_do_translate
The function is originally used for flatview_space_translate() and what
we care about most is (xlat, plen) range. However for iotlb requests, we
don't really care about "plen", but the size of the page that "xlat" is
located on. While, plen cannot really contain this information.
A simple example to show why "plen" is not good for IOTLB translations:
E.g., for huge pages, it is possible that guest mapped 1G huge page on
device side that used this GPA range:
0x100000000 - 0x13fffffff
Then let's say we want to translate one IOVA that finally mapped to GPA
0x13ffffe00 (which is located on this 1G huge page). Then here we'll
get:
(xlat, plen) = (0x13fffe00, 0x200)
So the IOTLB would be only covering a very small range since from
"plen" (which is 0x200 bytes) we cannot tell the size of the page.
Actually we can really know that this is a huge page - we just throw the
information away in flatview_do_translate().
This patch introduced "page_mask" optional parameter to capture that
page mask info. Also, I made "plen" an optional parameter as well, with
some comments for the whole function.
No functional change yet.
Signed-off-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
Message-Id: <20171010094247.10173-2-maxime.coquelin@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2017-10-10 11:42:45 +02:00
|
|
|
}
|
|
|
|
|
|
2017-05-17 16:57:42 +08:00
|
|
|
return *section;
|
2016-12-30 18:09:13 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Called from RCU critical section */
|
2017-05-17 16:57:42 +08:00
|
|
|
IOMMUTLBEntry address_space_get_iotlb_entry(AddressSpace *as, hwaddr addr,
|
2018-05-31 14:50:53 +01:00
|
|
|
bool is_write, MemTxAttrs attrs)
|
2013-05-26 21:46:51 +02:00
|
|
|
{
|
2017-05-17 16:57:42 +08:00
|
|
|
MemoryRegionSection section;
|
2017-10-10 11:42:46 +02:00
|
|
|
hwaddr xlat, page_mask;
|
2012-10-30 13:47:46 +02:00
|
|
|
|
2017-10-10 11:42:46 +02:00
|
|
|
/*
|
|
|
|
|
* This can never be MMIO, and we don't really care about plen,
|
|
|
|
|
* but page mask.
|
|
|
|
|
*/
|
|
|
|
|
section = flatview_do_translate(address_space_to_flatview(as), addr, &xlat,
|
2018-05-31 14:50:53 +01:00
|
|
|
NULL, &page_mask, is_write, false, &as,
|
|
|
|
|
attrs);
|
2012-10-30 13:47:46 +02:00
|
|
|
|
2017-05-17 16:57:42 +08:00
|
|
|
/* Illegal translation */
|
|
|
|
|
if (section.mr == &io_mem_unassigned) {
|
|
|
|
|
goto iotlb_fail;
|
|
|
|
|
}
|
2012-10-30 13:47:46 +02:00
|
|
|
|
2017-05-17 16:57:42 +08:00
|
|
|
/* Convert memory region offset into address space offset */
|
|
|
|
|
xlat += section.offset_within_address_space -
|
|
|
|
|
section.offset_within_region;
|
|
|
|
|
|
|
|
|
|
return (IOMMUTLBEntry) {
|
2017-09-21 18:50:53 +10:00
|
|
|
.target_as = as,
|
2017-10-10 11:42:46 +02:00
|
|
|
.iova = addr & ~page_mask,
|
|
|
|
|
.translated_addr = xlat & ~page_mask,
|
|
|
|
|
.addr_mask = page_mask,
|
2017-05-17 16:57:42 +08:00
|
|
|
/* IOTLBs are for DMAs, and DMA only allows on RAMs. */
|
|
|
|
|
.perm = IOMMU_RW,
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
iotlb_fail:
|
|
|
|
|
return (IOMMUTLBEntry) {0};
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Called from RCU critical section */
|
2017-09-21 18:50:58 +10:00
|
|
|
MemoryRegion *flatview_translate(FlatView *fv, hwaddr addr, hwaddr *xlat,
|
2018-05-31 14:50:52 +01:00
|
|
|
hwaddr *plen, bool is_write,
|
|
|
|
|
MemTxAttrs attrs)
|
2017-05-17 16:57:42 +08:00
|
|
|
{
|
|
|
|
|
MemoryRegion *mr;
|
|
|
|
|
MemoryRegionSection section;
|
2017-09-21 18:50:58 +10:00
|
|
|
AddressSpace *as = NULL;
|
2017-05-17 16:57:42 +08:00
|
|
|
|
|
|
|
|
/* This can be MMIO, so setup MMIO bit. */
|
exec: add page_mask for flatview_do_translate
The function is originally used for flatview_space_translate() and what
we care about most is (xlat, plen) range. However for iotlb requests, we
don't really care about "plen", but the size of the page that "xlat" is
located on. While, plen cannot really contain this information.
A simple example to show why "plen" is not good for IOTLB translations:
E.g., for huge pages, it is possible that guest mapped 1G huge page on
device side that used this GPA range:
0x100000000 - 0x13fffffff
Then let's say we want to translate one IOVA that finally mapped to GPA
0x13ffffe00 (which is located on this 1G huge page). Then here we'll
get:
(xlat, plen) = (0x13fffe00, 0x200)
So the IOTLB would be only covering a very small range since from
"plen" (which is 0x200 bytes) we cannot tell the size of the page.
Actually we can really know that this is a huge page - we just throw the
information away in flatview_do_translate().
This patch introduced "page_mask" optional parameter to capture that
page mask info. Also, I made "plen" an optional parameter as well, with
some comments for the whole function.
No functional change yet.
Signed-off-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
Message-Id: <20171010094247.10173-2-maxime.coquelin@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2017-10-10 11:42:45 +02:00
|
|
|
section = flatview_do_translate(fv, addr, xlat, plen, NULL,
|
2018-05-31 14:50:53 +01:00
|
|
|
is_write, true, &as, attrs);
|
2017-05-17 16:57:42 +08:00
|
|
|
mr = section.mr;
|
|
|
|
|
|
2025-02-10 09:46:46 +01:00
|
|
|
if (xen_enabled() && memory_access_is_direct(mr, is_write, attrs)) {
|
2014-02-07 15:47:46 +01:00
|
|
|
hwaddr page = ((addr & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE) - addr;
|
2015-03-16 22:35:54 -07:00
|
|
|
*plen = MIN(page, *plen);
|
2014-02-07 15:47:46 +01:00
|
|
|
}
|
|
|
|
|
|
2012-10-30 13:47:46 +02:00
|
|
|
return mr;
|
2013-05-26 21:46:51 +02:00
|
|
|
}
|
|
|
|
|
|
2025-04-24 22:24:10 +02:00
|
|
|
#ifdef CONFIG_TCG
|
|
|
|
|
|
2018-06-15 14:57:16 +01:00
|
|
|
typedef struct TCGIOMMUNotifier {
|
|
|
|
|
IOMMUNotifier n;
|
|
|
|
|
MemoryRegion *mr;
|
|
|
|
|
CPUState *cpu;
|
|
|
|
|
int iommu_idx;
|
|
|
|
|
bool active;
|
|
|
|
|
} TCGIOMMUNotifier;
|
|
|
|
|
|
|
|
|
|
/*
 * IOMMU notifier callback: flush the owning CPU's TLB once, then mark the
 * notifier inactive so further notifications are ignored until it is
 * re-activated.  @iotlb is not inspected.
 */
static void tcg_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
{
    TCGIOMMUNotifier *notifier = container_of(n, TCGIOMMUNotifier, n);

    if (!notifier->active) {
        return;
    }
    tlb_flush(notifier->cpu);
    notifier->active = false;
    /* We leave the notifier struct on the list to avoid reallocating it later.
     * Generally the number of IOMMUs a CPU deals with will be small.
     * In any case we can't unregister the iommu notifier from a notify
     * callback.
     */
}
|
|
|
|
|
|
|
|
|
|
static void tcg_register_iommu_notifier(CPUState *cpu,
|
|
|
|
|
IOMMUMemoryRegion *iommu_mr,
|
|
|
|
|
int iommu_idx)
|
|
|
|
|
{
|
|
|
|
|
/* Make sure this CPU has an IOMMU notifier registered for this
|
|
|
|
|
* IOMMU/IOMMU index combination, so that we can flush its TLB
|
|
|
|
|
* when the IOMMU tells us the mappings we've cached have changed.
|
|
|
|
|
*/
|
|
|
|
|
MemoryRegion *mr = MEMORY_REGION(iommu_mr);
|
2021-01-17 18:04:11 +01:00
|
|
|
TCGIOMMUNotifier *notifier = NULL;
|
2020-07-22 10:40:48 +02:00
|
|
|
int i;
|
2018-06-15 14:57:16 +01:00
|
|
|
|
|
|
|
|
for (i = 0; i < cpu->iommu_notifiers->len; i++) {
|
2019-02-01 14:55:45 +00:00
|
|
|
notifier = g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier *, i);
|
2018-06-15 14:57:16 +01:00
|
|
|
if (notifier->mr == mr && notifier->iommu_idx == iommu_idx) {
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if (i == cpu->iommu_notifiers->len) {
|
|
|
|
|
/* Not found, add a new entry at the end of the array */
|
|
|
|
|
cpu->iommu_notifiers = g_array_set_size(cpu->iommu_notifiers, i + 1);
|
2019-02-01 14:55:45 +00:00
|
|
|
notifier = g_new0(TCGIOMMUNotifier, 1);
|
|
|
|
|
g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier *, i) = notifier;
|
2018-06-15 14:57:16 +01:00
|
|
|
|
|
|
|
|
notifier->mr = mr;
|
|
|
|
|
notifier->iommu_idx = iommu_idx;
|
|
|
|
|
notifier->cpu = cpu;
|
|
|
|
|
/* Rather than trying to register interest in the specific part
|
|
|
|
|
* of the iommu's address space that we've accessed and then
|
|
|
|
|
* expand it later as subsequent accesses touch more of it, we
|
|
|
|
|
* just register interest in the whole thing, on the assumption
|
|
|
|
|
* that iommu reconfiguration will be rare.
|
|
|
|
|
*/
|
|
|
|
|
iommu_notifier_init(¬ifier->n,
|
|
|
|
|
tcg_iommu_unmap_notify,
|
|
|
|
|
IOMMU_NOTIFIER_UNMAP,
|
|
|
|
|
0,
|
|
|
|
|
HWADDR_MAX,
|
|
|
|
|
iommu_idx);
|
2020-07-22 10:40:48 +02:00
|
|
|
memory_region_register_iommu_notifier(notifier->mr, ¬ifier->n,
|
|
|
|
|
&error_fatal);
|
2018-06-15 14:57:16 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (!notifier->active) {
|
|
|
|
|
notifier->active = true;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2020-10-06 09:05:29 +02:00
|
|
|
/*
 * Unregister and free every notifier recorded in @cpu->iommu_notifiers,
 * then free the array itself.
 */
void tcg_iommu_free_notifier_list(CPUState *cpu)
{
    /* Destroy the CPU's notifier list */
    int i;
    TCGIOMMUNotifier *notifier;

    for (i = 0; i < cpu->iommu_notifiers->len; i++) {
        notifier = g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier *, i);
        /* Detach from the memory region before releasing the storage. */
        memory_region_unregister_iommu_notifier(notifier->mr, &notifier->n);
        g_free(notifier);
    }
    g_array_free(cpu->iommu_notifiers, true);
}
|
|
|
|
|
|
2020-10-06 09:05:29 +02:00
|
|
|
/*
 * Create the empty per-CPU array that holds TCGIOMMUNotifier pointers.
 * The array is created non-zero-terminated with new elements cleared.
 */
void tcg_iommu_init_notifier_list(CPUState *cpu)
{
    cpu->iommu_notifiers = g_array_new(false, true, sizeof(TCGIOMMUNotifier *));
}
|
|
|
|
|
|
2015-01-21 12:09:14 +01:00
|
|
|
/* Called from RCU critical section */
|
2013-05-26 21:46:51 +02:00
|
|
|
MemoryRegionSection *
|
2022-06-21 08:38:29 -07:00
|
|
|
address_space_translate_for_iotlb(CPUState *cpu, int asidx, hwaddr orig_addr,
|
2018-06-15 14:57:16 +01:00
|
|
|
hwaddr *xlat, hwaddr *plen,
|
|
|
|
|
MemTxAttrs attrs, int *prot)
|
2013-05-26 21:46:51 +02:00
|
|
|
{
|
2012-10-30 13:47:46 +02:00
|
|
|
MemoryRegionSection *section;
|
2018-06-15 14:57:16 +01:00
|
|
|
IOMMUMemoryRegion *iommu_mr;
|
|
|
|
|
IOMMUMemoryRegionClass *imrc;
|
|
|
|
|
IOMMUTLBEntry iotlb;
|
|
|
|
|
int iommu_idx;
|
2022-06-21 08:38:29 -07:00
|
|
|
hwaddr addr = orig_addr;
|
system/physmem: fix use-after-free with dispatch
A use-after-free bug was reported when booting a Linux kernel during the
pci setup phase. It's quite hard to reproduce (needs smp, and favored by
having several pci devices with BAR and specific Linux config, which
is Debian default one in this case).
After investigation (see the associated bug ticket), it appears that,
under specific conditions, we might access a cached AddressSpaceDispatch
that was reclaimed by RCU thread meanwhile.
In the Linux boot scenario, during the pci phase, memory region are
destroyed/recreated, resulting in exposition of the bug.
The core of the issue is that we cache the dispatch associated to
current cpu in cpu->cpu_ases[asidx].memory_dispatch. It is updated with
tcg_commit, which runs asynchronously on a given cpu.
At some point, we leave the rcu critial section, and the RCU thread
starts reclaiming it, but tcg_commit is not yet invoked, resulting in
the use-after-free.
It's not the first problem around this area, and commit 0d58c660689 [1]
("softmmu: Use async_run_on_cpu in tcg_commit") already tried to
address it. It did a good job, but it seems that we found a specific
situation where it's not enough.
This patch takes a simple approach: remove the cached value creating the
issue, and make sure we always get the current mapping for address
space, using address_space_to_dispatch(cpu->cpu_ases[asidx].as).
It's equivalent to qatomic_rcu_read(&as->current_map)->dispatch;
This is not really costly, we just need two dereferences,
including one atomic (rcu) read, which is negligible considering we are
already on mmu slow path anyway.
Note that tcg_commit is still needed, as it's taking care of flushing
TLB, removing previously mapped entries.
Another solution would be to cache directly values under the dispatch
(dispatch themselves are not ref counted), keep an active reference on
associated memory section, and release it when appropriate (tricky).
Given the time already spent debugging this area now and previously, I
strongly prefer eliminating the root of the issue, instead of adding
more complexity for a hypothetical performance gain. RCU is precisely
used to ensure good performance when reading data, so caching is not as
beneficial as it might seem IMHO.
[1] https://gitlab.com/qemu-project/qemu/-/commit/0d58c660689f6da1e3feff8a997014003d928b3b
Cc: qemu-stable@nongnu.org
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/3040
Signed-off-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Michael Tokarev <mjt@tls.msk.ru>
Tested-by: Michael Tokarev <mjt@tls.msk.ru>
Message-ID: <20250724161142.2803091-1-pierrick.bouvier@linaro.org>
Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
2025-07-24 09:11:42 -07:00
|
|
|
AddressSpaceDispatch *d = address_space_to_dispatch(cpu->cpu_ases[asidx].as);
|
2016-01-21 14:15:05 +00:00
|
|
|
|
2018-06-15 14:57:16 +01:00
|
|
|
for (;;) {
|
|
|
|
|
section = address_space_translate_internal(d, addr, &addr, plen, false);
|
|
|
|
|
|
|
|
|
|
iommu_mr = memory_region_get_iommu(section->mr);
|
|
|
|
|
if (!iommu_mr) {
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
imrc = memory_region_get_iommu_class_nocheck(iommu_mr);
|
|
|
|
|
|
|
|
|
|
iommu_idx = imrc->attrs_to_index(iommu_mr, attrs);
|
|
|
|
|
tcg_register_iommu_notifier(cpu, iommu_mr, iommu_idx);
|
|
|
|
|
/* We need all the permissions, so pass IOMMU_NONE so the IOMMU
|
|
|
|
|
* doesn't short-cut its translation table walk.
|
|
|
|
|
*/
|
|
|
|
|
iotlb = imrc->translate(iommu_mr, addr, IOMMU_NONE, iommu_idx);
|
|
|
|
|
addr = ((iotlb.translated_addr & ~iotlb.addr_mask)
|
|
|
|
|
| (addr & iotlb.addr_mask));
|
|
|
|
|
/* Update the caller's prot bits to remove permissions the IOMMU
|
|
|
|
|
* is giving us a failure response for. If we get down to no
|
|
|
|
|
* permissions left at all we can give up now.
|
|
|
|
|
*/
|
|
|
|
|
if (!(iotlb.perm & IOMMU_RO)) {
|
|
|
|
|
*prot &= ~(PAGE_READ | PAGE_EXEC);
|
|
|
|
|
}
|
|
|
|
|
if (!(iotlb.perm & IOMMU_WO)) {
|
|
|
|
|
*prot &= ~PAGE_WRITE;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (!*prot) {
|
|
|
|
|
goto translate_fail;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
d = flatview_to_dispatch(address_space_to_flatview(iotlb.target_as));
|
|
|
|
|
}
|
2012-10-30 13:47:46 +02:00
|
|
|
|
2017-07-11 13:56:19 +10:00
|
|
|
assert(!memory_region_is_iommu(section->mr));
|
2018-06-15 14:57:16 +01:00
|
|
|
*xlat = addr;
|
2012-10-30 13:47:46 +02:00
|
|
|
return section;
|
2018-06-15 14:57:16 +01:00
|
|
|
|
|
|
|
|
translate_fail:
|
2022-06-21 08:38:29 -07:00
|
|
|
/*
|
|
|
|
|
* We should be given a page-aligned address -- certainly
|
|
|
|
|
* tlb_set_page_with_attrs() does so. The page offset of xlat
|
|
|
|
|
* is used to index sections[], and PHYS_SECTION_UNASSIGNED = 0.
|
|
|
|
|
* The page portion of xlat will be logged by memory_region_access_valid()
|
|
|
|
|
* when this memory access is rejected, so use the original untranslated
|
|
|
|
|
* physical address.
|
|
|
|
|
*/
|
|
|
|
|
assert((orig_addr & ~TARGET_PAGE_MASK) == 0);
|
|
|
|
|
*xlat = orig_addr;
|
2018-06-15 14:57:16 +01:00
|
|
|
return &d->map.sections[PHYS_SECTION_UNASSIGNED];
|
2013-05-26 21:46:51 +02:00
|
|
|
}
|
2013-06-17 04:09:11 +02:00
|
|
|
|
2025-04-24 22:24:10 +02:00
|
|
|
#endif /* CONFIG_TCG */
|
|
|
|
|
|
2017-11-23 17:23:32 +08:00
|
|
|
/*
 * Create a new AddressSpace rooted at @mr, named "<prefix>-<cpu_index>",
 * and install it as address space @asidx of @cpu.
 *
 * The cpu_ases array is allocated on first use with max_as + 1 entries.
 * When TCG is enabled a memory listener is registered on the new
 * address space.
 */
void cpu_address_space_init(CPUState *cpu, int asidx,
                            const char *prefix, MemoryRegion *mr)
{
    CPUAddressSpace *newas;
    AddressSpace *as = g_new0(AddressSpace, 1);
    char *as_name;

    assert(mr);
    /* The name string is only needed for the address_space_init() call. */
    as_name = g_strdup_printf("%s-%d", prefix, cpu->cpu_index);
    address_space_init(as, mr, as_name);
    g_free(as_name);

    /* Target code should have set max_as before calling us */
    assert(asidx <= cpu->cc->max_as);

    if (asidx == 0) {
        /* address space 0 gets the convenience alias */
        cpu->as = as;
    }

    if (!cpu->cpu_ases) {
        cpu->cpu_ases = g_new0(CPUAddressSpace, cpu->cc->max_as + 1);
    }

    newas = &cpu->cpu_ases[asidx];
    newas->cpu = cpu;
    newas->as = as;
    if (tcg_enabled()) {
        newas->tcg_as_listener.log_global_after_sync = tcg_log_global_after_sync;
        newas->tcg_as_listener.commit = tcg_commit;
        newas->tcg_as_listener.name = "tcg";
        memory_listener_register(&newas->tcg_as_listener, as);
    }
}
|
2016-01-21 14:15:05 +00:00
|
|
|
|
2025-09-29 15:42:28 +01:00
|
|
|
/*
 * Tear down every initialized address space of @cpu and free the
 * cpu_ases array.  Slots whose 'as' pointer is NULL are skipped.
 */
void cpu_destroy_address_spaces(CPUState *cpu)
{
    CPUAddressSpace *cpuas;
    int asidx;

    assert(cpu->cpu_ases);

    /* convenience alias just points to some cpu_ases[n] */
    cpu->as = NULL;

    for (asidx = 0; asidx <= cpu->cc->max_as; asidx++) {
        cpuas = &cpu->cpu_ases[asidx];
        if (!cpuas->as) {
            /* This index was never initialized; no deinit needed */
            continue;
        }
        if (tcg_enabled()) {
            memory_listener_unregister(&cpuas->tcg_as_listener);
        }
        /* Destroys the address space and resets the slot to NULL. */
        g_clear_pointer(&cpuas->as, address_space_destroy_free);
    }

    g_clear_pointer(&cpu->cpu_ases, g_free);
}
|
|
|
|
|
|
2016-01-21 14:15:05 +00:00
|
|
|
/*
 * Return the AddressSpace stored at index @asidx of @cpu's cpu_ases array.
 * No bounds or NULL checking is performed on @asidx or cpu->cpu_ases.
 */
AddressSpace *cpu_get_address_space(CPUState *cpu, int asidx)
{
    /* Return the AddressSpace corresponding to the specified index */
    return cpu->cpu_ases[asidx].as;
}
|
2018-05-30 11:58:36 +02:00
|
|
|
|
2013-09-05 14:41:35 -04:00
|
|
|
/* Called from RCU critical section */
|
2013-09-09 17:49:45 +02:00
|
|
|
static RAMBlock *qemu_get_ram_block(ram_addr_t addr)
|
|
|
|
|
{
|
|
|
|
|
RAMBlock *block;
|
|
|
|
|
|
2020-09-23 11:56:46 +01:00
|
|
|
block = qatomic_rcu_read(&ram_list.mru_block);
|
2014-12-15 22:55:32 +02:00
|
|
|
if (block && addr - block->offset < block->max_length) {
|
2015-10-22 13:51:30 +02:00
|
|
|
return block;
|
2013-09-09 17:49:45 +02:00
|
|
|
}
|
2017-05-12 12:17:39 +08:00
|
|
|
RAMBLOCK_FOREACH(block) {
|
2014-12-15 22:55:32 +02:00
|
|
|
if (addr - block->offset < block->max_length) {
|
2013-09-09 17:49:45 +02:00
|
|
|
goto found;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fprintf(stderr, "Bad ram offset %" PRIx64 "\n", (uint64_t)addr);
|
|
|
|
|
abort();
|
|
|
|
|
|
|
|
|
|
found:
|
2024-01-02 10:35:28 -05:00
|
|
|
/* It is safe to write mru_block outside the BQL. This
|
2013-09-09 17:58:40 +02:00
|
|
|
* is what happens:
|
|
|
|
|
*
|
|
|
|
|
* mru_block = xxx
|
|
|
|
|
* rcu_read_unlock()
|
|
|
|
|
* xxx removed from list
|
|
|
|
|
* rcu_read_lock()
|
|
|
|
|
* read mru_block
|
|
|
|
|
* mru_block = NULL;
|
|
|
|
|
* call_rcu(reclaim_ramblock, xxx);
|
|
|
|
|
* rcu_read_unlock()
|
|
|
|
|
*
|
2020-09-23 11:56:46 +01:00
|
|
|
* qatomic_rcu_set is not needed here. The block was already published
|
2013-09-09 17:58:40 +02:00
|
|
|
* when it was placed into the list. Here we're just making an extra
|
|
|
|
|
* copy of the pointer.
|
|
|
|
|
*/
|
2013-09-09 17:49:45 +02:00
|
|
|
ram_list.mru_block = block;
|
|
|
|
|
return block;
|
|
|
|
|
}
|
|
|
|
|
|
2024-03-12 21:14:56 +01:00
|
|
|
/*
 * Call tlb_reset_dirty() on every CPU for the RAM range starting at
 * @start.  Asserts that TCG is enabled and that the page-aligned range
 * lies within a single RAMBlock.
 */
void tlb_reset_dirty_range_all(ram_addr_t start, ram_addr_t length)
{
    CPUState *cpu;
    ram_addr_t start1;
    RAMBlock *block;
    ram_addr_t end;

    assert(tcg_enabled());
    /* Widen the range outwards to target-page boundaries. */
    end = TARGET_PAGE_ALIGN(start + length);
    start &= TARGET_PAGE_MASK;

    RCU_READ_LOCK_GUARD();
    block = qemu_get_ram_block(start);
    assert(block == qemu_get_ram_block(end - 1));
    /* tlb_reset_dirty() is given the ramblock_ptr() address as an integer. */
    start1 = (uintptr_t)ramblock_ptr(block, start - block->offset);
    CPU_FOREACH(cpu) {
        tlb_reset_dirty(cpu, start1, length);
    }
}
|
|
|
|
|
|
2025-09-30 09:08:44 +02:00
|
|
|
/*
 * Hook invoked after dirty bits for [@start, @start + @length) have been
 * cleared: forwards to tlb_reset_dirty_range_all() when TCG is enabled,
 * otherwise does nothing.
 */
void physical_memory_dirty_bits_cleared(ram_addr_t start, ram_addr_t length)
{
    if (tcg_enabled()) {
        tlb_reset_dirty_range_all(start, length);
    }
}
|
|
|
|
|
|
2025-09-29 13:31:32 +02:00
|
|
|
/*
 * Return true if at least one page touched by [@start, @start + @length)
 * has its bit set in the dirty bitmap for @client.
 *
 * The bitmap is split into chunks of DIRTY_MEMORY_BLOCK_SIZE pages, so
 * the range is scanned one chunk at a time under the RCU read lock.
 */
static bool physical_memory_get_dirty(ram_addr_t start, ram_addr_t length,
                                      unsigned client)
{
    DirtyMemoryBlocks *blocks;
    unsigned long end, page;
    unsigned long idx, offset, base;
    bool dirty = false;

    assert(client < DIRTY_MEMORY_NUM);

    /* Page numbers: 'end' is exclusive and rounded up to a whole page. */
    end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
    page = start >> TARGET_PAGE_BITS;

    WITH_RCU_READ_LOCK_GUARD() {
        blocks = qatomic_rcu_read(&ram_list.dirty_memory[client]);

        idx = page / DIRTY_MEMORY_BLOCK_SIZE;
        offset = page % DIRTY_MEMORY_BLOCK_SIZE;
        base = page - offset;
        while (page < end) {
            /* Limit this step to the current chunk or to 'end'. */
            unsigned long next = MIN(end, base + DIRTY_MEMORY_BLOCK_SIZE);
            unsigned long num = next - base;
            unsigned long found = find_next_bit(blocks->blocks[idx],
                                                num, offset);
            if (found < num) {
                dirty = true;
                break;
            }

            /* Advance to the start of the next chunk. */
            page = next;
            idx++;
            offset = 0;
            base += DIRTY_MEMORY_BLOCK_SIZE;
        }
    }

    return dirty;
}
|
|
|
|
|
|
2025-09-30 09:08:44 +02:00
|
|
|
/*
 * Return the dirty state for @client of the page containing @addr,
 * implemented as a one-byte range query.
 */
bool physical_memory_get_dirty_flag(ram_addr_t addr, unsigned client)
{
    return physical_memory_get_dirty(addr, 1, client);
}
|
|
|
|
|
|
2025-09-30 09:08:44 +02:00
|
|
|
/*
 * Return true unless the page containing @addr is dirty for all three of
 * the VGA, CODE and MIGRATION clients; i.e. the page counts as clean as
 * soon as any one of those dirty flags is unset.
 */
bool physical_memory_is_clean(ram_addr_t addr)
{
    bool vga = physical_memory_get_dirty_flag(addr, DIRTY_MEMORY_VGA);
    bool code = physical_memory_get_dirty_flag(addr, DIRTY_MEMORY_CODE);
    bool migration =
        physical_memory_get_dirty_flag(addr, DIRTY_MEMORY_MIGRATION);
    return !(vga && code && migration);
}
|
|
|
|
|
|
2025-09-29 13:35:49 +02:00
|
|
|
/*
 * Return true if every page touched by [@start, @start + @length) has
 * its bit set in the dirty bitmap for @client, false as soon as a clear
 * bit is found.
 *
 * The bitmap is split into chunks of DIRTY_MEMORY_BLOCK_SIZE pages, so
 * the range is scanned one chunk at a time under the RCU read lock.
 */
static bool physical_memory_all_dirty(ram_addr_t start, ram_addr_t length,
                                      unsigned client)
{
    DirtyMemoryBlocks *blocks;
    unsigned long end, page;
    unsigned long idx, offset, base;
    bool dirty = true;

    assert(client < DIRTY_MEMORY_NUM);

    /* Page numbers: 'end' is exclusive and rounded up to a whole page. */
    end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
    page = start >> TARGET_PAGE_BITS;

    RCU_READ_LOCK_GUARD();

    blocks = qatomic_rcu_read(&ram_list.dirty_memory[client]);

    idx = page / DIRTY_MEMORY_BLOCK_SIZE;
    offset = page % DIRTY_MEMORY_BLOCK_SIZE;
    base = page - offset;
    while (page < end) {
        /* Limit this step to the current chunk or to 'end'. */
        unsigned long next = MIN(end, base + DIRTY_MEMORY_BLOCK_SIZE);
        unsigned long num = next - base;
        unsigned long found = find_next_zero_bit(blocks->blocks[idx],
                                                 num, offset);
        if (found < num) {
            dirty = false;
            break;
        }

        /* Advance to the start of the next chunk. */
        page = next;
        idx++;
        offset = 0;
        base += DIRTY_MEMORY_BLOCK_SIZE;
    }

    return dirty;
}
|
|
|
|
|
|
2025-09-30 09:08:44 +02:00
|
|
|
/*
 * For each of the VGA, CODE and MIGRATION clients selected in @mask,
 * test whether [@start, @start + @length) contains a page that is not
 * dirty for that client.
 *
 * Returns a bitmask (1 << client) of the selected clients for which the
 * range is not entirely dirty.
 */
uint8_t physical_memory_range_includes_clean(ram_addr_t start,
                                             ram_addr_t length,
                                             uint8_t mask)
{
    /* Clients are probed in this fixed order. */
    static const unsigned clients[] = {
        DIRTY_MEMORY_VGA,
        DIRTY_MEMORY_CODE,
        DIRTY_MEMORY_MIGRATION,
    };
    uint8_t clean = 0;
    unsigned i;

    for (i = 0; i < ARRAY_SIZE(clients); i++) {
        const uint8_t bit = 1 << clients[i];

        /* Clients absent from @mask are never queried. */
        if ((mask & bit) &&
            !physical_memory_all_dirty(start, length, clients[i])) {
            clean |= bit;
        }
    }

    return clean;
}
|
|
|
|
|
|
2025-09-30 09:08:44 +02:00
|
|
|
/*
 * Atomically set the dirty bit of the target page containing @addr in the
 * dirty bitmap belonging to @client.
 */
void physical_memory_set_dirty_flag(ram_addr_t addr, unsigned client)
{
    unsigned long page_nr, blk, bit;
    DirtyMemoryBlocks *dirty_blocks;

    assert(client < DIRTY_MEMORY_NUM);

    /* Split the page number into (block index, bit within block). */
    page_nr = addr >> TARGET_PAGE_BITS;
    blk = page_nr / DIRTY_MEMORY_BLOCK_SIZE;
    bit = page_nr % DIRTY_MEMORY_BLOCK_SIZE;

    WITH_RCU_READ_LOCK_GUARD() {
        dirty_blocks = qatomic_rcu_read(&ram_list.dirty_memory[client]);
        set_bit_atomic(bit, dirty_blocks->blocks[blk]);
    }
}
|
|
|
|
|
|
2025-09-30 09:08:44 +02:00
|
|
|
void physical_memory_set_dirty_range(ram_addr_t start, ram_addr_t length,
|
2025-09-29 13:40:29 +02:00
|
|
|
uint8_t mask)
|
|
|
|
|
{
|
|
|
|
|
DirtyMemoryBlocks *blocks[DIRTY_MEMORY_NUM];
|
|
|
|
|
unsigned long end, page;
|
|
|
|
|
unsigned long idx, offset, base;
|
|
|
|
|
int i;
|
|
|
|
|
|
|
|
|
|
if (!mask && !xen_enabled()) {
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
|
|
|
|
|
page = start >> TARGET_PAGE_BITS;
|
|
|
|
|
|
|
|
|
|
WITH_RCU_READ_LOCK_GUARD() {
|
|
|
|
|
for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
|
|
|
|
|
blocks[i] = qatomic_rcu_read(&ram_list.dirty_memory[i]);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
idx = page / DIRTY_MEMORY_BLOCK_SIZE;
|
|
|
|
|
offset = page % DIRTY_MEMORY_BLOCK_SIZE;
|
|
|
|
|
base = page - offset;
|
|
|
|
|
while (page < end) {
|
|
|
|
|
unsigned long next = MIN(end, base + DIRTY_MEMORY_BLOCK_SIZE);
|
|
|
|
|
|
|
|
|
|
if (likely(mask & (1 << DIRTY_MEMORY_MIGRATION))) {
|
|
|
|
|
bitmap_set_atomic(blocks[DIRTY_MEMORY_MIGRATION]->blocks[idx],
|
|
|
|
|
offset, next - page);
|
|
|
|
|
}
|
|
|
|
|
if (unlikely(mask & (1 << DIRTY_MEMORY_VGA))) {
|
|
|
|
|
bitmap_set_atomic(blocks[DIRTY_MEMORY_VGA]->blocks[idx],
|
|
|
|
|
offset, next - page);
|
|
|
|
|
}
|
|
|
|
|
if (unlikely(mask & (1 << DIRTY_MEMORY_CODE))) {
|
|
|
|
|
bitmap_set_atomic(blocks[DIRTY_MEMORY_CODE]->blocks[idx],
|
|
|
|
|
offset, next - page);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
page = next;
|
|
|
|
|
idx++;
|
|
|
|
|
offset = 0;
|
|
|
|
|
base += DIRTY_MEMORY_BLOCK_SIZE;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (xen_enabled()) {
|
|
|
|
|
xen_hvm_modified_memory(start, length);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
migration: merge fragmented clear_dirty ioctls
In our long-term experience in Bytedance, we've found that under
the same load, live migration of larger VMs with more devices is
often more difficult to converge (requiring a larger downtime limit).
Through some testing and calculations, we conclude that bitmap sync time
affects the calculation of live migration bandwidth.
When the addresses processed are not aligned, a large number of
clear_dirty ioctl occur (e.g. a 4MB misaligned memory can generate
2048 clear_dirty ioctls from two different memory_listener),
which increases the time required for bitmap_sync and makes it
more difficult for dirty pages to converge.
For a 64C256G vm with 8 vhost-user-net(32 queue per nic) and
16 vhost-user-blk(4 queue per blk), the sync time is as high as *73ms*
(tested with 10GBps dirty rate, the sync time increases as the dirty
page rate increases), Here are each part of the sync time:
- sync from kvm to ram_list: 2.5ms
- vhost_log_sync:3ms
- sync aligned memory from ram_list to RAMBlock: 5ms
- sync misaligned memory from ram_list to RAMBlock: 61ms
Attempt to merge those fragmented clear_dirty ioctls, then syncing
misaligned memory from ram_list to RAMBlock takes only about 1ms,
and the total sync time is only *12ms*.
Signed-off-by: Chuang Xu <xuchuangxclwt@bytedance.com>
Reviewed-by: Fabiano Rosas <farosas@suse.de>
Link: https://lore.kernel.org/r/20251218114220.83354-1-xuchuangxclwt@bytedance.com
[peterx: drop var "offset" in physical_memory_sync_dirty_bitmap]
Signed-off-by: Peter Xu <peterx@redhat.com>
2025-12-18 19:42:20 +08:00
|
|
|
/*
|
|
|
|
|
* Note: start and end must be within the same ram block.
|
|
|
|
|
*
|
|
|
|
|
* @bmap usage:
|
|
|
|
|
* - When @bmap is provided, set bits for dirty pages, but
|
|
|
|
|
* only count those pages if the bit wasn't already set in @bmap.
|
|
|
|
|
* - When @bmap is NULL, count all dirty pages in the range.
|
|
|
|
|
*
|
|
|
|
|
* @return:
|
|
|
|
|
* - Number of dirty guest pages found within [start, start + length).
|
|
|
|
|
*/
|
|
|
|
|
uint64_t physical_memory_test_and_clear_dirty(ram_addr_t start,
|
2014-12-02 11:23:18 +00:00
|
|
|
ram_addr_t length,
|
migration: merge fragmented clear_dirty ioctls
In our long-term experience in Bytedance, we've found that under
the same load, live migration of larger VMs with more devices is
often more difficult to converge (requiring a larger downtime limit).
Through some testing and calculations, we conclude that bitmap sync time
affects the calculation of live migration bandwidth.
When the addresses processed are not aligned, a large number of
clear_dirty ioctl occur (e.g. a 4MB misaligned memory can generate
2048 clear_dirty ioctls from two different memory_listener),
which increases the time required for bitmap_sync and makes it
more difficult for dirty pages to converge.
For a 64C256G vm with 8 vhost-user-net(32 queue per nic) and
16 vhost-user-blk(4 queue per blk), the sync time is as high as *73ms*
(tested with 10GBps dirty rate, the sync time increases as the dirty
page rate increases), Here are each part of the sync time:
- sync from kvm to ram_list: 2.5ms
- vhost_log_sync:3ms
- sync aligned memory from ram_list to RAMBlock: 5ms
- sync misaligned memory from ram_list to RAMBlock: 61ms
Attempt to merge those fragmented clear_dirty ioctls, then syncing
misaligned memory from ram_list to RAMBlock takes only about 1ms,
and the total sync time is only *12ms*.
Signed-off-by: Chuang Xu <xuchuangxclwt@bytedance.com>
Reviewed-by: Fabiano Rosas <farosas@suse.de>
Link: https://lore.kernel.org/r/20251218114220.83354-1-xuchuangxclwt@bytedance.com
[peterx: drop var "offset" in physical_memory_sync_dirty_bitmap]
Signed-off-by: Peter Xu <peterx@redhat.com>
2025-12-18 19:42:20 +08:00
|
|
|
unsigned client,
|
|
|
|
|
unsigned long *bmap)
|
2004-02-06 19:46:14 +00:00
|
|
|
{
|
2016-01-25 13:33:20 +00:00
|
|
|
DirtyMemoryBlocks *blocks;
|
2020-02-18 03:19:10 -07:00
|
|
|
unsigned long end, page, start_page;
|
migration: merge fragmented clear_dirty ioctls
In our long-term experience in Bytedance, we've found that under
the same load, live migration of larger VMs with more devices is
often more difficult to converge (requiring a larger downtime limit).
Through some testing and calculations, we conclude that bitmap sync time
affects the calculation of live migration bandwidth.
When the addresses processed are not aligned, a large number of
clear_dirty ioctl occur (e.g. a 4MB misaligned memory can generate
2048 clear_dirty ioctls from two different memory_listener),
which increases the time required for bitmap_sync and makes it
more difficult for dirty pages to converge.
For a 64C256G vm with 8 vhost-user-net(32 queue per nic) and
16 vhost-user-blk(4 queue per blk), the sync time is as high as *73ms*
(tested with 10GBps dirty rate, the sync time increases as the dirty
page rate increases), Here are each part of the sync time:
- sync from kvm to ram_list: 2.5ms
- vhost_log_sync:3ms
- sync aligned memory from ram_list to RAMBlock: 5ms
- sync misaligned memory from ram_list to RAMBlock: 61ms
Attempt to merge those fragmented clear_dirty ioctls, then syncing
misaligned memory from ram_list to RAMBlock takes only about 1ms,
and the total sync time is only *12ms*.
Signed-off-by: Chuang Xu <xuchuangxclwt@bytedance.com>
Reviewed-by: Fabiano Rosas <farosas@suse.de>
Link: https://lore.kernel.org/r/20251218114220.83354-1-xuchuangxclwt@bytedance.com
[peterx: drop var "offset" in physical_memory_sync_dirty_bitmap]
Signed-off-by: Peter Xu <peterx@redhat.com>
2025-12-18 19:42:20 +08:00
|
|
|
uint64_t num_dirty = 0;
|
2019-06-03 14:50:51 +08:00
|
|
|
RAMBlock *ramblock;
|
|
|
|
|
uint64_t mr_offset, mr_size;
|
2014-12-02 11:23:18 +00:00
|
|
|
|
|
|
|
|
if (length == 0) {
|
migration: merge fragmented clear_dirty ioctls
In our long-term experience in Bytedance, we've found that under
the same load, live migration of larger VMs with more devices is
often more difficult to converge (requiring a larger downtime limit).
Through some testing and calculations, we conclude that bitmap sync time
affects the calculation of live migration bandwidth.
When the addresses processed are not aligned, a large number of
clear_dirty ioctl occur (e.g. a 4MB misaligned memory can generate
2048 clear_dirty ioctls from two different memory_listener),
which increases the time required for bitmap_sync and makes it
more difficult for dirty pages to converge.
For a 64C256G vm with 8 vhost-user-net(32 queue per nic) and
16 vhost-user-blk(4 queue per blk), the sync time is as high as *73ms*
(tested with 10GBps dirty rate, the sync time increases as the dirty
page rate increases), Here are each part of the sync time:
- sync from kvm to ram_list: 2.5ms
- vhost_log_sync:3ms
- sync aligned memory from ram_list to RAMBlock: 5ms
- sync misaligned memory from ram_list to RAMBlock: 61ms
Attempt to merge those fragmented clear_dirty ioctls, then syncing
misaligned memory from ram_list to RAMBlock takes only about 1ms,
and the total sync time is only *12ms*.
Signed-off-by: Chuang Xu <xuchuangxclwt@bytedance.com>
Reviewed-by: Fabiano Rosas <farosas@suse.de>
Link: https://lore.kernel.org/r/20251218114220.83354-1-xuchuangxclwt@bytedance.com
[peterx: drop var "offset" in physical_memory_sync_dirty_bitmap]
Signed-off-by: Peter Xu <peterx@redhat.com>
2025-12-18 19:42:20 +08:00
|
|
|
return 0;
|
2014-12-02 11:23:18 +00:00
|
|
|
}
|
2005-08-21 19:12:28 +00:00
|
|
|
|
2014-12-02 11:23:18 +00:00
|
|
|
end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS;
|
2020-02-18 03:19:10 -07:00
|
|
|
start_page = start >> TARGET_PAGE_BITS;
|
|
|
|
|
page = start_page;
|
2016-01-25 13:33:20 +00:00
|
|
|
|
2019-10-07 15:36:41 +01:00
|
|
|
WITH_RCU_READ_LOCK_GUARD() {
|
2020-09-23 11:56:46 +01:00
|
|
|
blocks = qatomic_rcu_read(&ram_list.dirty_memory[client]);
|
2019-10-07 15:36:41 +01:00
|
|
|
ramblock = qemu_get_ram_block(start);
|
|
|
|
|
/* Range sanity check on the ramblock */
|
|
|
|
|
assert(start >= ramblock->offset &&
|
|
|
|
|
start + length <= ramblock->offset + ramblock->used_length);
|
2016-01-25 13:33:20 +00:00
|
|
|
|
2019-10-07 15:36:41 +01:00
|
|
|
while (page < end) {
|
|
|
|
|
unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
|
|
|
|
|
unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE;
|
2016-01-25 13:33:20 +00:00
|
|
|
|
migration: merge fragmented clear_dirty ioctls
In our long-term experience in Bytedance, we've found that under
the same load, live migration of larger VMs with more devices is
often more difficult to converge (requiring a larger downtime limit).
Through some testing and calculations, we conclude that bitmap sync time
affects the calculation of live migration bandwidth.
When the addresses processed are not aligned, a large number of
clear_dirty ioctl occur (e.g. a 4MB misaligned memory can generate
2048 clear_dirty ioctls from two different memory_listener),
which increases the time required for bitmap_sync and makes it
more difficult for dirty pages to converge.
For a 64C256G vm with 8 vhost-user-net(32 queue per nic) and
16 vhost-user-blk(4 queue per blk), the sync time is as high as *73ms*
(tested with 10GBps dirty rate, the sync time increases as the dirty
page rate increases), Here are each part of the sync time:
- sync from kvm to ram_list: 2.5ms
- vhost_log_sync:3ms
- sync aligned memory from ram_list to RAMBlock: 5ms
- sync misaligned memory from ram_list to RAMBlock: 61ms
Attempt to merge those fragmented clear_dirty ioctls, then syncing
misaligned memory from ram_list to RAMBlock takes only about 1ms,
and the total sync time is only *12ms*.
Signed-off-by: Chuang Xu <xuchuangxclwt@bytedance.com>
Reviewed-by: Fabiano Rosas <farosas@suse.de>
Link: https://lore.kernel.org/r/20251218114220.83354-1-xuchuangxclwt@bytedance.com
[peterx: drop var "offset" in physical_memory_sync_dirty_bitmap]
Signed-off-by: Peter Xu <peterx@redhat.com>
2025-12-18 19:42:20 +08:00
|
|
|
if (bitmap_test_and_clear_atomic(blocks->blocks[idx], offset, 1)) {
|
|
|
|
|
if (bmap) {
|
|
|
|
|
unsigned long k = page - (ramblock->offset >> TARGET_PAGE_BITS);
|
|
|
|
|
if (!test_and_set_bit(k, bmap)) {
|
|
|
|
|
num_dirty++;
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
num_dirty++;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
page++;
|
2019-10-07 15:36:41 +01:00
|
|
|
}
|
2016-01-25 13:33:20 +00:00
|
|
|
|
2020-02-18 03:19:10 -07:00
|
|
|
mr_offset = (ram_addr_t)(start_page << TARGET_PAGE_BITS) - ramblock->offset;
|
|
|
|
|
mr_size = (end - start_page) << TARGET_PAGE_BITS;
|
2019-10-07 15:36:41 +01:00
|
|
|
memory_region_clear_dirty_bitmap(ramblock->mr, mr_offset, mr_size);
|
2016-01-25 13:33:20 +00:00
|
|
|
}
|
|
|
|
|
|
migration: merge fragmented clear_dirty ioctls
In our long-term experience in Bytedance, we've found that under
the same load, live migration of larger VMs with more devices is
often more difficult to converge (requiring a larger downtime limit).
Through some testing and calculations, we conclude that bitmap sync time
affects the calculation of live migration bandwidth.
When the addresses processed are not aligned, a large number of
clear_dirty ioctl occur (e.g. a 4MB misaligned memory can generate
2048 clear_dirty ioctls from two different memory_listener),
which increases the time required for bitmap_sync and makes it
more difficult for dirty pages to converge.
For a 64C256G vm with 8 vhost-user-net(32 queue per nic) and
16 vhost-user-blk(4 queue per blk), the sync time is as high as *73ms*
(tested with 10GBps dirty rate, the sync time increases as the dirty
page rate increases), Here are each part of the sync time:
- sync from kvm to ram_list: 2.5ms
- vhost_log_sync:3ms
- sync aligned memory from ram_list to RAMBlock: 5ms
- sync misaligned memory from ram_list to RAMBlock: 61ms
Attempt to merge those fragmented clear_dirty ioctls, then syncing
misaligned memory from ram_list to RAMBlock takes only about 1ms,
and the total sync time is only *12ms*.
Signed-off-by: Chuang Xu <xuchuangxclwt@bytedance.com>
Reviewed-by: Fabiano Rosas <farosas@suse.de>
Link: https://lore.kernel.org/r/20251218114220.83354-1-xuchuangxclwt@bytedance.com
[peterx: drop var "offset" in physical_memory_sync_dirty_bitmap]
Signed-off-by: Peter Xu <peterx@redhat.com>
2025-12-18 19:42:20 +08:00
|
|
|
if (num_dirty) {
|
2025-09-30 09:08:44 +02:00
|
|
|
physical_memory_dirty_bits_cleared(start, length);
|
2009-04-11 14:47:08 +00:00
|
|
|
}
|
2014-12-02 11:23:18 +00:00
|
|
|
|
migration: merge fragmented clear_dirty ioctls
In our long-term experience in Bytedance, we've found that under
the same load, live migration of larger VMs with more devices is
often more difficult to converge (requiring a larger downtime limit).
Through some testing and calculations, we conclude that bitmap sync time
affects the calculation of live migration bandwidth.
When the addresses processed are not aligned, a large number of
clear_dirty ioctl occur (e.g. a 4MB misaligned memory can generate
2048 clear_dirty ioctls from two different memory_listener),
which increases the time required for bitmap_sync and makes it
more difficult for dirty pages to converge.
For a 64C256G vm with 8 vhost-user-net(32 queue per nic) and
16 vhost-user-blk(4 queue per blk), the sync time is as high as *73ms*
(tested with 10GBps dirty rate, the sync time increases as the dirty
page rate increases), Here are each part of the sync time:
- sync from kvm to ram_list: 2.5ms
- vhost_log_sync:3ms
- sync aligned memory from ram_list to RAMBlock: 5ms
- sync misaligned memory from ram_list to RAMBlock: 61ms
Attempt to merge those fragmented clear_dirty ioctls, then syncing
misaligned memory from ram_list to RAMBlock takes only about 1ms,
and the total sync time is only *12ms*.
Signed-off-by: Chuang Xu <xuchuangxclwt@bytedance.com>
Reviewed-by: Fabiano Rosas <farosas@suse.de>
Link: https://lore.kernel.org/r/20251218114220.83354-1-xuchuangxclwt@bytedance.com
[peterx: drop var "offset" in physical_memory_sync_dirty_bitmap]
Signed-off-by: Peter Xu <peterx@redhat.com>
2025-12-18 19:42:20 +08:00
|
|
|
return num_dirty;
|
2004-02-06 19:46:14 +00:00
|
|
|
}
|
|
|
|
|
|
2025-09-29 13:55:15 +02:00
|
|
|
/*
 * Clear the dirty bits of [@addr, @addr + @length) for the MIGRATION, VGA
 * and CODE clients, in that order.  The per-client dirty counts are
 * discarded.
 */
static void physical_memory_clear_dirty_range(ram_addr_t addr, ram_addr_t length)
{
    static const unsigned clients[] = {
        DIRTY_MEMORY_MIGRATION,
        DIRTY_MEMORY_VGA,
        DIRTY_MEMORY_CODE,
    };
    unsigned i;

    for (i = 0; i < ARRAY_SIZE(clients); i++) {
        physical_memory_test_and_clear_dirty(addr, length, clients[i], NULL);
    }
}
|
|
|
|
|
|
2025-09-30 09:08:44 +02:00
|
|
|
DirtyBitmapSnapshot *physical_memory_snapshot_and_clear_dirty
|
2019-06-03 14:50:50 +08:00
|
|
|
(MemoryRegion *mr, hwaddr offset, hwaddr length, unsigned client)
|
2017-04-21 11:16:25 +02:00
|
|
|
{
|
|
|
|
|
DirtyMemoryBlocks *blocks;
|
2024-07-23 18:05:13 +01:00
|
|
|
ram_addr_t start, first, last;
|
2017-04-21 11:16:25 +02:00
|
|
|
unsigned long align = 1UL << (TARGET_PAGE_BITS + BITS_PER_LEVEL);
|
|
|
|
|
DirtyBitmapSnapshot *snap;
|
|
|
|
|
unsigned long page, end, dest;
|
|
|
|
|
|
2024-07-23 18:05:13 +01:00
|
|
|
start = memory_region_get_ram_addr(mr);
|
|
|
|
|
/* We know we're only called for RAM MemoryRegions */
|
|
|
|
|
assert(start != RAM_ADDR_INVALID);
|
|
|
|
|
start += offset;
|
|
|
|
|
|
|
|
|
|
first = QEMU_ALIGN_DOWN(start, align);
|
|
|
|
|
last = QEMU_ALIGN_UP(start + length, align);
|
|
|
|
|
|
2017-04-21 11:16:25 +02:00
|
|
|
snap = g_malloc0(sizeof(*snap) +
|
|
|
|
|
((last - first) >> (TARGET_PAGE_BITS + 3)));
|
|
|
|
|
snap->start = first;
|
|
|
|
|
snap->end = last;
|
|
|
|
|
|
|
|
|
|
page = first >> TARGET_PAGE_BITS;
|
|
|
|
|
end = last >> TARGET_PAGE_BITS;
|
|
|
|
|
dest = 0;
|
|
|
|
|
|
2019-10-07 15:36:41 +01:00
|
|
|
WITH_RCU_READ_LOCK_GUARD() {
|
2020-09-23 11:56:46 +01:00
|
|
|
blocks = qatomic_rcu_read(&ram_list.dirty_memory[client]);
|
2017-04-21 11:16:25 +02:00
|
|
|
|
2019-10-07 15:36:41 +01:00
|
|
|
while (page < end) {
|
|
|
|
|
unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE;
|
2023-09-04 18:12:34 +02:00
|
|
|
unsigned long ofs = page % DIRTY_MEMORY_BLOCK_SIZE;
|
2019-10-07 15:36:41 +01:00
|
|
|
unsigned long num = MIN(end - page,
|
2023-09-04 18:12:34 +02:00
|
|
|
DIRTY_MEMORY_BLOCK_SIZE - ofs);
|
2017-04-21 11:16:25 +02:00
|
|
|
|
2023-09-04 18:12:34 +02:00
|
|
|
assert(QEMU_IS_ALIGNED(ofs, (1 << BITS_PER_LEVEL)));
|
2019-10-07 15:36:41 +01:00
|
|
|
assert(QEMU_IS_ALIGNED(num, (1 << BITS_PER_LEVEL)));
|
2023-09-04 18:12:34 +02:00
|
|
|
ofs >>= BITS_PER_LEVEL;
|
2017-04-21 11:16:25 +02:00
|
|
|
|
2019-10-07 15:36:41 +01:00
|
|
|
bitmap_copy_and_clear_atomic(snap->dirty + dest,
|
2023-09-04 18:12:34 +02:00
|
|
|
blocks->blocks[idx] + ofs,
|
2019-10-07 15:36:41 +01:00
|
|
|
num);
|
|
|
|
|
page += num;
|
|
|
|
|
dest += num >> BITS_PER_LEVEL;
|
|
|
|
|
}
|
2017-04-21 11:16:25 +02:00
|
|
|
}
|
|
|
|
|
|
2025-09-30 09:08:44 +02:00
|
|
|
physical_memory_dirty_bits_cleared(start, length);
|
2017-04-21 11:16:25 +02:00
|
|
|
|
2019-06-03 14:50:51 +08:00
|
|
|
memory_region_clear_dirty_bitmap(mr, offset, length);
|
|
|
|
|
|
2017-04-21 11:16:25 +02:00
|
|
|
return snap;
|
|
|
|
|
}
|
|
|
|
|
|
2025-09-30 09:08:44 +02:00
|
|
|
bool physical_memory_snapshot_get_dirty(DirtyBitmapSnapshot *snap,
|
2017-04-21 11:16:25 +02:00
|
|
|
ram_addr_t start,
|
|
|
|
|
ram_addr_t length)
|
|
|
|
|
{
|
|
|
|
|
unsigned long page, end;
|
|
|
|
|
|
|
|
|
|
assert(start >= snap->start);
|
|
|
|
|
assert(start + length <= snap->end);
|
|
|
|
|
|
|
|
|
|
end = TARGET_PAGE_ALIGN(start + length - snap->start) >> TARGET_PAGE_BITS;
|
|
|
|
|
page = (start - snap->start) >> TARGET_PAGE_BITS;
|
|
|
|
|
|
|
|
|
|
while (page < end) {
|
|
|
|
|
if (test_bit(page, snap->dirty)) {
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
page++;
|
|
|
|
|
}
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2025-09-30 09:08:44 +02:00
|
|
|
uint64_t physical_memory_set_dirty_lebitmap(unsigned long *bitmap,
|
2025-09-29 13:43:30 +02:00
|
|
|
ram_addr_t start,
|
|
|
|
|
ram_addr_t pages)
|
|
|
|
|
{
|
|
|
|
|
unsigned long i, j;
|
|
|
|
|
unsigned long page_number, c, nbits;
|
|
|
|
|
hwaddr addr;
|
|
|
|
|
ram_addr_t ram_addr;
|
|
|
|
|
uint64_t num_dirty = 0;
|
|
|
|
|
unsigned long len = (pages + HOST_LONG_BITS - 1) / HOST_LONG_BITS;
|
|
|
|
|
unsigned long hpratio = qemu_real_host_page_size() / TARGET_PAGE_SIZE;
|
|
|
|
|
unsigned long page = BIT_WORD(start >> TARGET_PAGE_BITS);
|
|
|
|
|
|
|
|
|
|
/* start address is aligned at the start of a word? */
|
|
|
|
|
if ((((page * BITS_PER_LONG) << TARGET_PAGE_BITS) == start) &&
|
|
|
|
|
(hpratio == 1)) {
|
|
|
|
|
unsigned long **blocks[DIRTY_MEMORY_NUM];
|
|
|
|
|
unsigned long idx;
|
|
|
|
|
unsigned long offset;
|
|
|
|
|
long k;
|
|
|
|
|
long nr = BITS_TO_LONGS(pages);
|
|
|
|
|
|
|
|
|
|
idx = (start >> TARGET_PAGE_BITS) / DIRTY_MEMORY_BLOCK_SIZE;
|
|
|
|
|
offset = BIT_WORD((start >> TARGET_PAGE_BITS) %
|
|
|
|
|
DIRTY_MEMORY_BLOCK_SIZE);
|
|
|
|
|
|
|
|
|
|
WITH_RCU_READ_LOCK_GUARD() {
|
|
|
|
|
for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
|
|
|
|
|
blocks[i] =
|
|
|
|
|
qatomic_rcu_read(&ram_list.dirty_memory[i])->blocks;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for (k = 0; k < nr; k++) {
|
|
|
|
|
if (bitmap[k]) {
|
2025-12-09 21:45:06 +01:00
|
|
|
unsigned long temp = ldn_le_p(&bitmap[k],
|
|
|
|
|
sizeof(bitmap[k]));
|
2025-09-29 13:43:30 +02:00
|
|
|
|
|
|
|
|
nbits = ctpopl(temp);
|
|
|
|
|
qatomic_or(&blocks[DIRTY_MEMORY_VGA][idx][offset], temp);
|
|
|
|
|
|
|
|
|
|
if (global_dirty_tracking) {
|
|
|
|
|
qatomic_or(
|
|
|
|
|
&blocks[DIRTY_MEMORY_MIGRATION][idx][offset],
|
|
|
|
|
temp);
|
|
|
|
|
if (unlikely(
|
|
|
|
|
global_dirty_tracking & GLOBAL_DIRTY_DIRTY_RATE)) {
|
|
|
|
|
total_dirty_pages += nbits;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
num_dirty += nbits;
|
|
|
|
|
|
|
|
|
|
if (tcg_enabled()) {
|
|
|
|
|
qatomic_or(&blocks[DIRTY_MEMORY_CODE][idx][offset],
|
|
|
|
|
temp);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (++offset >= BITS_TO_LONGS(DIRTY_MEMORY_BLOCK_SIZE)) {
|
|
|
|
|
offset = 0;
|
|
|
|
|
idx++;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (xen_enabled()) {
|
|
|
|
|
xen_hvm_modified_memory(start, pages << TARGET_PAGE_BITS);
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
uint8_t clients = tcg_enabled() ? DIRTY_CLIENTS_ALL
|
|
|
|
|
: DIRTY_CLIENTS_NOCODE;
|
|
|
|
|
|
|
|
|
|
if (!global_dirty_tracking) {
|
|
|
|
|
clients &= ~(1 << DIRTY_MEMORY_MIGRATION);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* bitmap-traveling is faster than memory-traveling (for addr...)
|
|
|
|
|
* especially when most of the memory is not dirty.
|
|
|
|
|
*/
|
|
|
|
|
for (i = 0; i < len; i++) {
|
|
|
|
|
if (bitmap[i] != 0) {
|
2025-12-09 21:45:06 +01:00
|
|
|
c = ldn_le_p(&bitmap[i], sizeof(bitmap[i]));
|
2025-09-29 13:43:30 +02:00
|
|
|
nbits = ctpopl(c);
|
|
|
|
|
if (unlikely(global_dirty_tracking & GLOBAL_DIRTY_DIRTY_RATE)) {
|
|
|
|
|
total_dirty_pages += nbits;
|
|
|
|
|
}
|
|
|
|
|
num_dirty += nbits;
|
|
|
|
|
do {
|
|
|
|
|
j = ctzl(c);
|
|
|
|
|
c &= ~(1ul << j);
|
|
|
|
|
page_number = (i * HOST_LONG_BITS + j) * hpratio;
|
|
|
|
|
addr = page_number * TARGET_PAGE_SIZE;
|
|
|
|
|
ram_addr = start + addr;
|
2025-09-30 09:08:44 +02:00
|
|
|
physical_memory_set_dirty_range(ram_addr,
|
2025-09-29 13:43:30 +02:00
|
|
|
TARGET_PAGE_SIZE * hpratio, clients);
|
|
|
|
|
} while (c != 0);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return num_dirty;
|
|
|
|
|
}
|
|
|
|
|
|
2019-03-21 16:25:53 +08:00
|
|
|
static int subpage_register(subpage_t *mmio, uint32_t start, uint32_t end,
|
|
|
|
|
uint16_t section);
|
2017-09-21 18:50:58 +10:00
|
|
|
static subpage_t *subpage_init(FlatView *fv, hwaddr base);
|
2012-02-09 17:34:32 +02:00
|
|
|
|
2013-12-01 14:02:23 +02:00
|
|
|
static uint16_t phys_section_add(PhysPageMap *map,
|
|
|
|
|
MemoryRegionSection *section)
|
2012-02-12 18:32:55 +02:00
|
|
|
{
|
2013-12-01 14:02:23 +02:00
|
|
|
if (map->sections_nb == map->sections_nb_alloc) {
|
|
|
|
|
map->sections_nb_alloc = MAX(map->sections_nb_alloc * 2, 16);
|
|
|
|
|
map->sections = g_renew(MemoryRegionSection, map->sections,
|
|
|
|
|
map->sections_nb_alloc);
|
2012-02-12 18:32:55 +02:00
|
|
|
}
|
2013-12-01 14:02:23 +02:00
|
|
|
map->sections[map->sections_nb] = *section;
|
2013-05-06 10:46:11 +02:00
|
|
|
memory_region_ref(section->mr);
|
2013-12-01 14:02:23 +02:00
|
|
|
return map->sections_nb++;
|
2012-02-12 18:32:55 +02:00
|
|
|
}
|
|
|
|
|
|
2013-06-25 09:30:48 +02:00
|
|
|
/*
 * Drop the reference held on @mr.  If @mr is the iomem region embedded in
 * a subpage_t, additionally unref that object and free the subpage.
 */
static void phys_section_destroy(MemoryRegion *mr)
{
    /*
     * The flag is read before the unref.
     * NOTE(review): presumably because @mr must not be dereferenced
     * afterwards -- confirm against memory_region_unref() semantics.
     */
    const bool is_subpage = mr->subpage;
    subpage_t *subpage;

    memory_region_unref(mr);

    if (!is_subpage) {
        return;
    }

    subpage = container_of(mr, subpage_t, iomem);
    object_unref(OBJECT(&subpage->iomem));
    g_free(subpage);
}
|
|
|
|
|
|
2013-05-29 12:30:26 +02:00
|
|
|
/*
 * Destroy every section recorded in @map, from the last entry down to the
 * first, then free the sections and nodes arrays.
 */
static void phys_sections_free(PhysPageMap *map)
{
    for (; map->sections_nb > 0; ) {
        MemoryRegionSection *last;

        /* Shrink the count first, then destroy the entry it now excludes. */
        map->sections_nb--;
        last = &map->sections[map->sections_nb];
        phys_section_destroy(last->mr);
    }

    g_free(map->sections);
    g_free(map->nodes);
}
|
|
|
|
|
|
2017-09-21 18:50:59 +10:00
|
|
|
/*
 * Register @section, which covers only part of a target page, in the
 * dispatch of @fv.
 *
 * The page containing the section must currently map either to an
 * existing subpage or to io_mem_unassigned (asserted).  In the latter
 * case a new subpage is created and installed for that page.  The section
 * is then registered within the subpage for its in-page byte range.
 */
static void register_subpage(FlatView *fv, MemoryRegionSection *section)
{
    AddressSpaceDispatch *d = flatview_to_dispatch(fv);
    hwaddr base = section->offset_within_address_space & TARGET_PAGE_MASK;
    MemoryRegionSection *existing = phys_page_find(d, base);
    subpage_t *subpage;
    hwaddr start, end;

    assert(existing->mr->subpage || existing->mr == &io_mem_unassigned);

    if (existing->mr->subpage) {
        /* Reuse the subpage already installed for this page. */
        subpage = container_of(existing->mr, subpage_t, iomem);
    } else {
        /* First partial mapping in this page: install a fresh subpage. */
        MemoryRegionSection subsection = {
            .offset_within_address_space = base,
            .size = int128_make64(TARGET_PAGE_SIZE),
        };

        subpage = subpage_init(fv, base);
        subsection.fv = fv;
        subsection.mr = &subpage->iomem;
        phys_page_set(d, base >> TARGET_PAGE_BITS, 1,
                      phys_section_add(&d->map, &subsection));
    }

    /* Inclusive byte range of the section inside its page. */
    start = section->offset_within_address_space & ~TARGET_PAGE_MASK;
    end = start + int128_get64(section->size) - 1;
    subpage_register(subpage, start, end,
                     phys_section_add(&d->map, section));
}
|
|
|
|
|
|
|
|
|
|
|
2017-09-21 18:50:59 +10:00
|
|
|
static void register_multipage(FlatView *fv,
|
2013-05-27 10:08:27 +02:00
|
|
|
MemoryRegionSection *section)
|
2003-08-10 21:47:01 +00:00
|
|
|
{
|
2017-09-21 18:50:59 +10:00
|
|
|
AddressSpaceDispatch *d = flatview_to_dispatch(fv);
|
2012-10-23 12:30:10 +02:00
|
|
|
hwaddr start_addr = section->offset_within_address_space;
|
2013-12-01 14:02:23 +02:00
|
|
|
uint16_t section_index = phys_section_add(&d->map, section);
|
2013-05-27 10:08:27 +02:00
|
|
|
uint64_t num_pages = int128_get64(int128_rshift(section->size,
|
|
|
|
|
TARGET_PAGE_BITS));
|
2012-01-02 12:17:03 +02:00
|
|
|
|
2013-05-27 10:47:10 +02:00
|
|
|
assert(num_pages);
|
|
|
|
|
phys_page_set(d, start_addr >> TARGET_PAGE_BITS, num_pages, section_index);
|
2003-08-10 21:47:01 +00:00
|
|
|
}
|
|
|
|
|
|
2019-03-11 13:42:52 +08:00
|
|
|
/*
|
|
|
|
|
* The range in *section* may look like this:
|
|
|
|
|
*
|
|
|
|
|
* |s|PPPPPPP|s|
|
|
|
|
|
*
|
|
|
|
|
* where s stands for subpage and P for page.
|
|
|
|
|
*/
|
2017-09-21 18:51:00 +10:00
|
|
|
void flatview_add_to_dispatch(FlatView *fv, MemoryRegionSection *section)
|
2012-02-13 17:14:32 +02:00
|
|
|
{
|
2019-03-11 13:42:52 +08:00
|
|
|
MemoryRegionSection remain = *section;
|
2013-05-27 10:08:27 +02:00
|
|
|
Int128 page_size = int128_make64(TARGET_PAGE_SIZE);
|
2012-02-13 17:14:32 +02:00
|
|
|
|
2019-03-11 13:42:52 +08:00
|
|
|
/* register first subpage */
|
|
|
|
|
if (remain.offset_within_address_space & ~TARGET_PAGE_MASK) {
|
|
|
|
|
uint64_t left = TARGET_PAGE_ALIGN(remain.offset_within_address_space)
|
|
|
|
|
- remain.offset_within_address_space;
|
2013-05-27 10:47:10 +02:00
|
|
|
|
2019-03-11 13:42:52 +08:00
|
|
|
MemoryRegionSection now = remain;
|
2013-05-27 10:08:27 +02:00
|
|
|
now.size = int128_min(int128_make64(left), now.size);
|
2017-09-21 18:50:59 +10:00
|
|
|
register_subpage(fv, &now);
|
2019-03-11 13:42:52 +08:00
|
|
|
if (int128_eq(remain.size, now.size)) {
|
|
|
|
|
return;
|
|
|
|
|
}
|
2013-05-27 10:08:27 +02:00
|
|
|
remain.size = int128_sub(remain.size, now.size);
|
|
|
|
|
remain.offset_within_address_space += int128_get64(now.size);
|
|
|
|
|
remain.offset_within_region += int128_get64(now.size);
|
2019-03-11 13:42:52 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* register whole pages */
|
|
|
|
|
if (int128_ge(remain.size, page_size)) {
|
|
|
|
|
MemoryRegionSection now = remain;
|
|
|
|
|
now.size = int128_and(now.size, int128_neg(page_size));
|
|
|
|
|
register_multipage(fv, &now);
|
|
|
|
|
if (int128_eq(remain.size, now.size)) {
|
|
|
|
|
return;
|
2012-07-25 18:45:04 -04:00
|
|
|
}
|
2019-03-11 13:42:52 +08:00
|
|
|
remain.size = int128_sub(remain.size, now.size);
|
|
|
|
|
remain.offset_within_address_space += int128_get64(now.size);
|
|
|
|
|
remain.offset_within_region += int128_get64(now.size);
|
2012-02-13 17:14:32 +02:00
|
|
|
}
|
2019-03-11 13:42:52 +08:00
|
|
|
|
|
|
|
|
/* register last subpage */
|
|
|
|
|
register_subpage(fv, &remain);
|
2012-02-13 17:14:32 +02:00
|
|
|
}
|
|
|
|
|
|
2010-01-26 19:21:16 +08:00
|
|
|
/*
 * Flush the coalesced MMIO buffer via KVM.
 * Does nothing unless kvm_enabled() reports true.
 */
void qemu_flush_coalesced_mmio_buffer(void)
{
    if (kvm_enabled()) {
        kvm_flush_coalesced_mmio_buffer();
    }
}
|
|
|
|
|
|
2011-08-17 00:01:33 -07:00
|
|
|
void qemu_mutex_lock_ramlist(void)
|
|
|
|
|
{
|
|
|
|
|
qemu_mutex_lock(&ram_list.mutex);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void qemu_mutex_unlock_ramlist(void)
|
|
|
|
|
{
|
|
|
|
|
qemu_mutex_unlock(&ram_list.mutex);
|
|
|
|
|
}
|
|
|
|
|
|
2021-09-08 10:35:43 +01:00
|
|
|
GString *ram_block_format(void)
|
2017-05-12 12:17:41 +08:00
|
|
|
{
|
|
|
|
|
RAMBlock *block;
|
|
|
|
|
char *psize;
|
2021-09-08 10:35:43 +01:00
|
|
|
GString *buf = g_string_new("");
|
2017-05-12 12:17:41 +08:00
|
|
|
|
2019-10-07 15:36:41 +01:00
|
|
|
RCU_READ_LOCK_GUARD();
|
2022-12-05 20:07:12 +08:00
|
|
|
g_string_append_printf(buf, "%24s %8s %18s %18s %18s %18s %3s\n",
|
|
|
|
|
"Block Name", "PSize", "Offset", "Used", "Total",
|
|
|
|
|
"HVA", "RO");
|
|
|
|
|
|
2017-05-12 12:17:41 +08:00
|
|
|
RAMBLOCK_FOREACH(block) {
|
|
|
|
|
psize = size_to_str(block->page_size);
|
2021-09-08 10:35:43 +01:00
|
|
|
g_string_append_printf(buf, "%24s %8s 0x%016" PRIx64 " 0x%016" PRIx64
|
2022-12-05 20:07:12 +08:00
|
|
|
" 0x%016" PRIx64 " 0x%016" PRIx64 " %3s\n",
|
|
|
|
|
block->idstr, psize,
|
2021-09-08 10:35:43 +01:00
|
|
|
(uint64_t)block->offset,
|
|
|
|
|
(uint64_t)block->used_length,
|
2022-12-05 20:07:12 +08:00
|
|
|
(uint64_t)block->max_length,
|
|
|
|
|
(uint64_t)(uintptr_t)block->host,
|
|
|
|
|
block->mr->readonly ? "ro" : "rw");
|
|
|
|
|
|
2017-05-12 12:17:41 +08:00
|
|
|
g_free(psize);
|
|
|
|
|
}
|
2021-09-08 10:35:43 +01:00
|
|
|
|
|
|
|
|
return buf;
|
2017-05-12 12:17:41 +08:00
|
|
|
}
|
|
|
|
|
|
2019-04-17 13:31:43 +02:00
|
|
|
/*
 * object_child_foreach() callback: lower *opaque (a long) to the page size
 * of @obj when @obj is a mapped memory backend with a smaller page size.
 * Objects that are not memory backends are ignored.
 *
 * Always returns 0.
 */
static int find_min_backend_pagesize(Object *obj, void *opaque)
{
    long *smallest = opaque;
    HostMemoryBackend *backend;
    long hpsize;

    if (!object_dynamic_cast(obj, TYPE_MEMORY_BACKEND)) {
        return 0;
    }

    backend = MEMORY_BACKEND(obj);
    hpsize = host_memory_backend_pagesize(backend);
    if (host_memory_backend_is_mapped(backend) && hpsize < *smallest) {
        *smallest = hpsize;
    }

    return 0;
}
|
|
|
|
|
|
2019-04-17 13:31:43 +02:00
|
|
|
/*
 * object_child_foreach() callback: raise *opaque (a long) to the page size
 * of @obj when @obj is a mapped memory backend with a larger page size.
 * Objects that are not memory backends are ignored.
 *
 * Always returns 0.
 */
static int find_max_backend_pagesize(Object *obj, void *opaque)
{
    long *largest = opaque;
    HostMemoryBackend *backend;
    long hpsize;

    if (!object_dynamic_cast(obj, TYPE_MEMORY_BACKEND)) {
        return 0;
    }

    backend = MEMORY_BACKEND(obj);
    hpsize = host_memory_backend_pagesize(backend);
    if (host_memory_backend_is_mapped(backend) && hpsize > *largest) {
        *largest = hpsize;
    }

    return 0;
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* TODO: We assume right now that all mapped host memory backends are
|
|
|
|
|
* used as RAM, however some might be used for different purposes.
|
|
|
|
|
*/
|
|
|
|
|
long qemu_minrampagesize(void)
|
2017-03-02 13:36:11 +11:00
|
|
|
{
|
|
|
|
|
long hpsize = LONG_MAX;
|
2020-02-19 11:09:47 -05:00
|
|
|
Object *memdev_root = object_resolve_path("/objects", NULL);
|
2017-03-02 13:36:11 +11:00
|
|
|
|
2020-02-19 11:09:47 -05:00
|
|
|
object_child_foreach(memdev_root, find_min_backend_pagesize, &hpsize);
|
2017-03-02 13:36:11 +11:00
|
|
|
return hpsize;
|
|
|
|
|
}
|
2019-04-17 13:31:43 +02:00
|
|
|
|
|
|
|
|
long qemu_maxrampagesize(void)
|
|
|
|
|
{
|
2020-02-19 11:09:47 -05:00
|
|
|
long pagesize = 0;
|
2019-04-17 13:31:43 +02:00
|
|
|
Object *memdev_root = object_resolve_path("/objects", NULL);
|
|
|
|
|
|
2020-02-19 11:09:47 -05:00
|
|
|
object_child_foreach(memdev_root, find_max_backend_pagesize, &pagesize);
|
2019-04-17 13:31:43 +02:00
|
|
|
return pagesize;
|
|
|
|
|
}
|
2017-03-02 13:36:11 +11:00
|
|
|
|
2025-04-28 15:39:05 +09:00
|
|
|
#if defined(CONFIG_POSIX) && !defined(EMSCRIPTEN)
|
2016-10-27 12:22:58 +08:00
|
|
|
static int64_t get_file_size(int fd)
|
|
|
|
|
{
|
2019-08-30 10:30:56 +01:00
|
|
|
int64_t size;
|
|
|
|
|
#if defined(__linux__)
|
|
|
|
|
struct stat st;
|
|
|
|
|
|
|
|
|
|
if (fstat(fd, &st) < 0) {
|
|
|
|
|
return -errno;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Special handling for devdax character devices */
|
|
|
|
|
if (S_ISCHR(st.st_mode)) {
|
|
|
|
|
g_autofree char *subsystem_path = NULL;
|
|
|
|
|
g_autofree char *subsystem = NULL;
|
|
|
|
|
|
|
|
|
|
subsystem_path = g_strdup_printf("/sys/dev/char/%d:%d/subsystem",
|
|
|
|
|
major(st.st_rdev), minor(st.st_rdev));
|
|
|
|
|
subsystem = g_file_read_link(subsystem_path, NULL);
|
|
|
|
|
|
|
|
|
|
if (subsystem && g_str_has_suffix(subsystem, "/dax")) {
|
|
|
|
|
g_autofree char *size_path = NULL;
|
|
|
|
|
g_autofree char *size_str = NULL;
|
|
|
|
|
|
|
|
|
|
size_path = g_strdup_printf("/sys/dev/char/%d:%d/size",
|
|
|
|
|
major(st.st_rdev), minor(st.st_rdev));
|
|
|
|
|
|
|
|
|
|
if (g_file_get_contents(size_path, &size_str, NULL, NULL)) {
|
|
|
|
|
return g_ascii_strtoll(size_str, NULL, 0);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
#endif /* defined(__linux__) */
|
|
|
|
|
|
|
|
|
|
/* st.st_size may be zero for special files yet lseek(2) works */
|
|
|
|
|
size = lseek(fd, 0, SEEK_END);
|
2016-10-27 12:22:58 +08:00
|
|
|
if (size < 0) {
|
|
|
|
|
return -errno;
|
|
|
|
|
}
|
|
|
|
|
return size;
|
|
|
|
|
}
|
|
|
|
|
|
2020-04-29 16:50:09 +08:00
|
|
|
/*
 * Look up the alignment required for mapping the file behind @fd.
 *
 * Only built with real logic on Linux with CONFIG_LIBDAXCTL: for a
 * character device, the daxctl regions are scanned and the alignment of the
 * first region whose path occurs in the device's resolved sysfs path is
 * returned.
 *
 * Returns the alignment, -1 when none was found (or daxctl_new() failed),
 * or a negative errno value if fstat() or realpath() fails.
 */
static int64_t get_file_align(int fd)
{
    int64_t align = -1;
#if defined(__linux__) && defined(CONFIG_LIBDAXCTL)
    struct stat st;

    if (fstat(fd, &st) < 0) {
        return -errno;
    }

    /* Special handling for devdax character devices */
    if (S_ISCHR(st.st_mode)) {
        g_autofree char *sysfs_path =
            g_strdup_printf("/sys/dev/char/%d:%d",
                            major(st.st_rdev), minor(st.st_rdev));
        g_autofree char *resolved = realpath(sysfs_path, NULL);
        struct daxctl_ctx *ctx;
        struct daxctl_region *region;

        if (resolved == NULL) {
            return -errno;
        }

        if (daxctl_new(&ctx) != 0) {
            return -1;
        }

        daxctl_region_foreach(ctx, region) {
            if (strstr(resolved, daxctl_region_get_path(region)) != NULL) {
                align = daxctl_region_get_align(region);
                break;
            }
        }
        daxctl_unref(ctx);
    }
#endif /* defined(__linux__) && defined(CONFIG_LIBDAXCTL) */

    return align;
}
|
|
|
|
|
|
2017-06-02 18:12:22 +04:00
|
|
|
/*
 * Open, or create, the backing file for a RAM region.
 *
 * @path:        an existing file, a file to be created, or a directory
 * @region_name: used to build the temporary file name when @path is a
 *               directory ('/' is replaced by '_')
 * @readonly:    open with O_RDONLY; in this mode a missing file yields
 *               -ENOENT and a directory yields -EISDIR
 * @created:     set to true only when @path itself was created here with
 *               open(O_CREAT | O_EXCL); false in every other case,
 *               including the unlinked temporary file made in a directory
 *
 * The open is retried on EINTR, and on EEXIST (something else created the
 * file between our two open() calls).
 *
 * Returns the open file descriptor, or a negative errno value.
 */
static int file_ram_open(const char *path,
                         const char *region_name,
                         bool readonly,
                         bool *created)
{
    int fd = -1;
    int err;

    *created = false;
    for (;;) {
        fd = open(path, readonly ? O_RDONLY : O_RDWR);
        if (fd >= 0) {
            /*
             * open(O_RDONLY) won't fail with EISDIR. Check manually if we
             * opened a directory and fail similarly to how we fail ENOENT
             * in readonly mode. Note that mkstemp() would imply O_RDWR.
             */
            if (readonly) {
                struct stat file_stat;

                if (fstat(fd, &file_stat)) {
                    /* Capture errno first: close() may overwrite it. */
                    err = errno;
                    close(fd);
                    if (err == EINTR) {
                        continue;
                    }
                    return -err;
                } else if (S_ISDIR(file_stat.st_mode)) {
                    close(fd);
                    return -EISDIR;
                }
            }
            /* @path names an existing file, use it */
            break;
        }

        err = errno;
        if (err == ENOENT) {
            if (readonly) {
                /* Refuse to create new, readonly files. */
                return -ENOENT;
            }
            /* @path names a file that doesn't exist, create it */
            fd = open(path, O_RDWR | O_CREAT | O_EXCL, 0644);
            if (fd >= 0) {
                *created = true;
                break;
            }
            err = errno;
        } else if (err == EISDIR) {
            char *sanitized_name;
            char *filename;
            char *c;

            /* @path names a directory, create a file there */
            /* Make name safe to use with mkstemp by replacing '/' with '_'. */
            sanitized_name = g_strdup(region_name);
            for (c = sanitized_name; *c != '\0'; c++) {
                if (*c == '/') {
                    *c = '_';
                }
            }

            filename = g_strdup_printf("%s/qemu_back_mem.%s.XXXXXX", path,
                                       sanitized_name);
            g_free(sanitized_name);

            fd = mkstemp(filename);
            if (fd >= 0) {
                unlink(filename);
                g_free(filename);
                break;
            }
            /* Capture errno before g_free() gets a chance to change it. */
            err = errno;
            g_free(filename);
        }

        if (err != EEXIST && err != EINTR) {
            return -err;
        }
        /*
         * Try again on EINTR and EEXIST.  The latter happens when
         * something else creates the file between our two open().
         */
    }

    return fd;
}
|
|
|
|
|
|
|
|
|
|
static void *file_ram_alloc(RAMBlock *block,
|
|
|
|
|
ram_addr_t memory,
|
|
|
|
|
int fd,
|
|
|
|
|
bool truncate,
|
2021-01-29 11:46:04 -05:00
|
|
|
off_t offset,
|
2017-06-02 18:12:22 +04:00
|
|
|
Error **errp)
|
|
|
|
|
{
|
2021-05-10 13:43:20 +02:00
|
|
|
uint32_t qemu_map_flags;
|
2017-06-02 18:12:22 +04:00
|
|
|
void *area;
|
|
|
|
|
|
2016-09-29 20:09:37 +01:00
|
|
|
block->page_size = qemu_fd_getpagesize(fd);
|
hostmem-file: add "align" option
When mmap(2) the backend files, QEMU uses the host page size
(getpagesize(2)) by default as the alignment of mapping address.
However, some backends may require alignments different than the page
size. For example, mmap a device DAX (e.g., /dev/dax0.0) on Linux
kernel 4.13 to an address, which is 4K-aligned but not 2M-aligned,
fails with a kernel message like
[617494.969768] dax dax0.0: qemu-system-x86: dax_mmap: fail, unaligned vma (0x7fa37c579000 - 0x7fa43c579000, 0x1fffff)
Because there is no common approach to get such alignment requirement,
we add the 'align' option to 'memory-backend-file', so that users or
management utils, which have enough knowledge about the backend, can
specify a proper alignment via this option.
Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
Message-Id: <20171211072806.2812-2-haozhong.zhang@intel.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
[ehabkost: fixed typo, fixed error_setg() format string]
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
2017-12-11 15:28:04 +08:00
|
|
|
if (block->mr->align % block->page_size) {
|
|
|
|
|
error_setg(errp, "alignment 0x%" PRIx64
|
|
|
|
|
" must be multiples of page size 0x%zx",
|
|
|
|
|
block->mr->align, block->page_size);
|
|
|
|
|
return NULL;
|
2018-06-07 17:47:05 +02:00
|
|
|
} else if (block->mr->align && !is_power_of_2(block->mr->align)) {
|
|
|
|
|
error_setg(errp, "alignment 0x%" PRIx64
|
|
|
|
|
" must be a power of two", block->mr->align);
|
|
|
|
|
return NULL;
|
2023-04-03 22:14:21 +00:00
|
|
|
} else if (offset % block->page_size) {
|
|
|
|
|
error_setg(errp, "offset 0x%" PRIx64
|
|
|
|
|
" must be multiples of page size 0x%zx",
|
|
|
|
|
offset, block->page_size);
|
|
|
|
|
return NULL;
|
hostmem-file: add "align" option
When mmap(2) the backend files, QEMU uses the host page size
(getpagesize(2)) by default as the alignment of mapping address.
However, some backends may require alignments different than the page
size. For example, mmap a device DAX (e.g., /dev/dax0.0) on Linux
kernel 4.13 to an address, which is 4K-aligned but not 2M-aligned,
fails with a kernel message like
[617494.969768] dax dax0.0: qemu-system-x86: dax_mmap: fail, unaligned vma (0x7fa37c579000 - 0x7fa43c579000, 0x1fffff)
Because there is no common approach to get such alignment requirement,
we add the 'align' option to 'memory-backend-file', so that users or
management utils, which have enough knowledge about the backend, can
specify a proper alignment via this option.
Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
Message-Id: <20171211072806.2812-2-haozhong.zhang@intel.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
[ehabkost: fixed typo, fixed error_setg() format string]
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
2017-12-11 15:28:04 +08:00
|
|
|
}
|
|
|
|
|
block->mr->align = MAX(block->page_size, block->mr->align);
|
exec.c: workaround regression caused by alignment change in d2f39ad
Commit d2f39ad "exec.c: Ensure right alignment also for file backed ram"
added an additional alignment requirement on the size of backend file
besides the previous page size. On x86, the alignment is changed from
4KB in QEMU 2.6 to 2MB in QEMU 2.7.
This change breaks certain usages in QEMU 2.7 on x86, e.g.
-object memory-backend-file,id=mem1,mem-path=/tmp/,size=$SZ
-device pc-dimm,id=dimm1,memdev=mem1
where $SZ is multiple of 4KB but not 2MB (e.g. 1023M). QEMU 2.7
reports the following error message and aborts:
qemu-system-x86_64: -device pc-dimm,memdev=mem1,id=nv1: backend memory size must be multiple of 0x200000
The same regression may also happen in other platforms as indicated by
Igor Mammedov. This change is however necessary for s390 according to
the commit message of d2f39ad, so we workaround the regression by taking
the change only on s390.
Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
Reported-by: "Xu, Anthony" <anthony.xu@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-10-24 20:49:37 +08:00
|
|
|
#if defined(__s390x__)
|
|
|
|
|
if (kvm_enabled()) {
|
|
|
|
|
block->mr->align = MAX(block->mr->align, QEMU_VMALLOC_ALIGN);
|
|
|
|
|
}
|
|
|
|
|
#endif
|
exec: Fix memory allocation when memory path names new file
Commit 8d31d6b extended file_ram_alloc() to accept file names in
addition to directory names. Even though it passes O_CREAT to open(),
it actually works only for existing files. Reproducer adapted from
the commit's qemu-doc.texi update:
$ qemu-system-x86_64 -object memory-backend-file,size=2M,mem-path=/dev/hugepages/my-shmem-file,id=mb1
qemu-system-x86_64: -object memory-backend-file,size=2M,mem-path=/dev/hugepages/my-shmem-file,id=mb1: failed to get page size of file /dev/hugepages/my-shmem-file: No such file or directory
This is because we first get the page size for @path, then open the
actual file. Unwise even before the flawed commit, because the
directory could change in between, invalidating the page size.
Unlikely to bite in practice.
Rearrange the code to create the file (if necessary) before getting
its page size. Carefully avoid TOCTTOU conditions with a method
suggested by Paolo Bonzini.
While there, replace "hugepages" by "guest RAM" in error messages,
because host memory backends can be used for purposes other than huge
pages, e.g. /dev/shm/ shared memory. Help text of -mem-path agrees.
Cc: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Message-Id: <1457378754-21649-2-git-send-email-armbru@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-03-07 20:25:13 +01:00
|
|
|
|
2016-09-29 20:09:37 +01:00
|
|
|
if (memory < block->page_size) {
|
exec: Fix memory allocation when memory path names new file
Commit 8d31d6b extended file_ram_alloc() to accept file names in
addition to directory names. Even though it passes O_CREAT to open(),
it actually works only for existing files. Reproducer adapted from
the commit's qemu-doc.texi update:
$ qemu-system-x86_64 -object memory-backend-file,size=2M,mem-path=/dev/hugepages/my-shmem-file,id=mb1
qemu-system-x86_64: -object memory-backend-file,size=2M,mem-path=/dev/hugepages/my-shmem-file,id=mb1: failed to get page size of file /dev/hugepages/my-shmem-file: No such file or directory
This is because we first get the page size for @path, then open the
actual file. Unwise even before the flawed commit, because the
directory could change in between, invalidating the page size.
Unlikely to bite in practice.
Rearrange the code to create the file (if necessary) before getting
its page size. Carefully avoid TOCTTOU conditions with a method
suggested by Paolo Bonzini.
While there, replace "hugepages" by "guest RAM" in error messages,
because host memory backends can be used for purposes other than huge
pages, e.g. /dev/shm/ shared memory. Help text of -mem-path agrees.
Cc: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Message-Id: <1457378754-21649-2-git-send-email-armbru@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-03-07 20:25:13 +01:00
|
|
|
error_setg(errp, "memory size 0x" RAM_ADDR_FMT " must be equal to "
|
2016-09-29 20:09:37 +01:00
|
|
|
"or larger than page size 0x%zx",
|
|
|
|
|
memory, block->page_size);
|
2017-06-02 18:12:22 +04:00
|
|
|
return NULL;
|
2016-11-02 09:05:51 +08:00
|
|
|
}
|
|
|
|
|
|
2016-09-29 20:09:37 +01:00
|
|
|
memory = ROUND_UP(memory, block->page_size);
|
2010-03-01 20:25:08 -03:00
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* ftruncate is not supported by hugetlbfs in older
|
|
|
|
|
* hosts, so don't bother bailing out on errors.
|
|
|
|
|
* If anything goes wrong with it under other filesystems,
|
|
|
|
|
* mmap will fail.
|
2016-10-27 12:22:58 +08:00
|
|
|
*
|
|
|
|
|
* Do not truncate the non-empty backend file to avoid corrupting
|
|
|
|
|
* the existing data in the file. Disabling shrinking is not
|
|
|
|
|
* enough. For example, the current vNVDIMM implementation stores
|
|
|
|
|
* the guest NVDIMM labels at the end of the backend file. If the
|
|
|
|
|
* backend file is later extended, QEMU will not be able to find
|
|
|
|
|
* those labels. Therefore, extending the non-empty backend file
|
|
|
|
|
* is disabled as well.
|
2010-03-01 20:25:08 -03:00
|
|
|
*/
|
2023-04-03 22:14:21 +00:00
|
|
|
if (truncate && ftruncate(fd, offset + memory)) {
|
2010-08-18 13:30:13 +09:00
|
|
|
perror("ftruncate");
|
2014-05-14 17:43:20 +08:00
|
|
|
}
|
2010-03-01 20:25:08 -03:00
|
|
|
|
2023-09-06 14:04:54 +02:00
|
|
|
qemu_map_flags = (block->flags & RAM_READONLY) ? QEMU_MAP_READONLY : 0;
|
2021-05-10 13:43:20 +02:00
|
|
|
qemu_map_flags |= (block->flags & RAM_SHARED) ? QEMU_MAP_SHARED : 0;
|
|
|
|
|
qemu_map_flags |= (block->flags & RAM_PMEM) ? QEMU_MAP_SYNC : 0;
|
2021-05-10 13:43:21 +02:00
|
|
|
qemu_map_flags |= (block->flags & RAM_NORESERVE) ? QEMU_MAP_NORESERVE : 0;
|
2021-05-10 13:43:20 +02:00
|
|
|
area = qemu_ram_mmap(fd, memory, block->mr->align, qemu_map_flags, offset);
|
2010-03-01 20:25:08 -03:00
|
|
|
if (area == MAP_FAILED) {
|
2014-05-14 17:43:20 +08:00
|
|
|
error_setg_errno(errp, errno,
|
exec: Fix memory allocation when memory path names new file
Commit 8d31d6b extended file_ram_alloc() to accept file names in
addition to directory names. Even though it passes O_CREAT to open(),
it actually works only for existing files. Reproducer adapted from
the commit's qemu-doc.texi update:
$ qemu-system-x86_64 -object memory-backend-file,size=2M,mem-path=/dev/hugepages/my-shmem-file,id=mb1
qemu-system-x86_64: -object memory-backend-file,size=2M,mem-path=/dev/hugepages/my-shmem-file,id=mb1: failed to get page size of file /dev/hugepages/my-shmem-file: No such file or directory
This is because we first get the page size for @path, then open the
actual file. Unwise even before the flawed commit, because the
directory could change in between, invalidating the page size.
Unlikely to bite in practice.
Rearrange the code to create the file (if necessary) before getting
its page size. Carefully avoid TOCTTOU conditions with a method
suggested by Paolo Bonzini.
While there, replace "hugepages" by "guest RAM" in error messages,
because host memory backends can be used for purposes other than huge
pages, e.g. /dev/shm/ shared memory. Help text of -mem-path agrees.
Cc: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Markus Armbruster <armbru@redhat.com>
Message-Id: <1457378754-21649-2-git-send-email-armbru@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2016-03-07 20:25:13 +01:00
|
|
|
"unable to map backing store for guest RAM");
|
2017-06-02 18:12:22 +04:00
|
|
|
return NULL;
|
2010-03-01 20:25:08 -03:00
|
|
|
}
|
2013-10-28 18:51:46 -02:00
|
|
|
|
2010-07-02 11:13:17 -06:00
|
|
|
block->fd = fd;
|
2023-04-03 22:14:21 +00:00
|
|
|
block->fd_offset = offset;
|
2010-03-01 20:25:08 -03:00
|
|
|
return area;
|
|
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
|
2018-01-05 17:01:37 +00:00
|
|
|
/* Allocate space within the ram_addr_t space that governs the
|
|
|
|
|
* dirty bitmaps.
|
|
|
|
|
* Called with the ramlist lock held.
|
|
|
|
|
*/
|
2010-06-25 11:08:38 -06:00
|
|
|
static ram_addr_t find_ram_offset(ram_addr_t size)
|
2010-07-02 11:13:17 -06:00
|
|
|
{
|
|
|
|
|
RAMBlock *block, *next_block;
|
2011-10-31 08:54:09 -06:00
|
|
|
ram_addr_t offset = RAM_ADDR_MAX, mingap = RAM_ADDR_MAX;
|
2010-07-02 11:13:17 -06:00
|
|
|
|
2013-03-11 10:20:21 +01:00
|
|
|
assert(size != 0); /* it would hand out same offset multiple times */
|
|
|
|
|
|
2013-09-05 14:41:35 -04:00
|
|
|
if (QLIST_EMPTY_RCU(&ram_list.blocks)) {
|
2010-07-02 11:13:17 -06:00
|
|
|
return 0;
|
2015-01-21 13:45:24 +01:00
|
|
|
}
|
2010-07-02 11:13:17 -06:00
|
|
|
|
2017-05-12 12:17:39 +08:00
|
|
|
RAMBLOCK_FOREACH(block) {
|
2018-01-05 17:01:37 +00:00
|
|
|
ram_addr_t candidate, next = RAM_ADDR_MAX;
|
2010-07-02 11:13:17 -06:00
|
|
|
|
2018-01-05 17:01:38 +00:00
|
|
|
/* Align blocks to start on a 'long' in the bitmap
|
|
|
|
|
* which makes the bitmap sync'ing take the fast path.
|
|
|
|
|
*/
|
2018-01-05 17:01:37 +00:00
|
|
|
candidate = block->offset + block->max_length;
|
2018-01-05 17:01:38 +00:00
|
|
|
candidate = ROUND_UP(candidate, BITS_PER_LONG << TARGET_PAGE_BITS);
|
2010-07-02 11:13:17 -06:00
|
|
|
|
2018-01-05 17:01:37 +00:00
|
|
|
/* Search for the closest following block
|
|
|
|
|
* and find the gap.
|
|
|
|
|
*/
|
2017-05-12 12:17:39 +08:00
|
|
|
RAMBLOCK_FOREACH(next_block) {
|
2018-01-05 17:01:37 +00:00
|
|
|
if (next_block->offset >= candidate) {
|
2010-07-02 11:13:17 -06:00
|
|
|
next = MIN(next, next_block->offset);
|
|
|
|
|
}
|
|
|
|
|
}
|
2018-01-05 17:01:37 +00:00
|
|
|
|
|
|
|
|
/* If it fits remember our place and remember the size
|
|
|
|
|
* of gap, but keep going so that we might find a smaller
|
|
|
|
|
* gap to fill so avoiding fragmentation.
|
|
|
|
|
*/
|
|
|
|
|
if (next - candidate >= size && next - candidate < mingap) {
|
|
|
|
|
offset = candidate;
|
|
|
|
|
mingap = next - candidate;
|
2010-07-02 11:13:17 -06:00
|
|
|
}
|
2018-01-05 17:01:37 +00:00
|
|
|
|
|
|
|
|
trace_find_ram_offset_loop(size, candidate, offset, next, mingap);
|
2010-07-02 11:13:17 -06:00
|
|
|
}
|
2011-10-31 08:54:09 -06:00
|
|
|
|
|
|
|
|
if (offset == RAM_ADDR_MAX) {
|
|
|
|
|
fprintf(stderr, "Failed to find gap of requested size: %" PRIu64 "\n",
|
|
|
|
|
(uint64_t)size);
|
|
|
|
|
abort();
|
|
|
|
|
}
|
|
|
|
|
|
2018-01-05 17:01:37 +00:00
|
|
|
trace_find_ram_offset(size, offset);
|
|
|
|
|
|
2010-07-02 11:13:17 -06:00
|
|
|
return offset;
|
|
|
|
|
}
|
|
|
|
|
|
2012-08-02 15:44:16 -04:00
|
|
|
/*
 * Exclude the range [@addr, @addr + @size) from core dumps when the machine
 * is configured not to dump guest memory.  A failing madvise is reported on
 * stderr and otherwise ignored.
 */
static void qemu_ram_setup_dump(void *addr, ram_addr_t size)
{
    /* Guest memory is wanted in the core: leave the mapping alone. */
    if (machine_dump_guest_core(current_machine)) {
        return;
    }

    /* Use MADV_DONTDUMP, if user doesn't want the guest memory in the core */
    if (qemu_madvise(addr, size, QEMU_MADV_DONTDUMP)) {
        perror("qemu_madvise");
        fprintf(stderr, "madvise doesn't support MADV_DONTDUMP, "
                "but dump-guest-core=off specified\n");
    }
}
|
|
|
|
|
|
2015-11-05 18:10:32 +00:00
|
|
|
/* Return the identifier string stored in @rb (not a copy). */
const char *qemu_ram_get_idstr(RAMBlock *rb)
{
    return rb->idstr;
}
|
|
|
|
|
|
2019-02-15 20:45:44 +03:00
|
|
|
/* Return the host pointer recorded in @rb. */
void *qemu_ram_get_host_addr(RAMBlock *rb)
{
    return rb->host;
}
|
|
|
|
|
|
|
|
|
|
/* Return the offset of @rb within the ram_addr_t space. */
ram_addr_t qemu_ram_get_offset(RAMBlock *rb)
{
    return rb->offset;
}
|
|
|
|
|
|
2025-07-02 14:58:44 -07:00
|
|
|
/* Return the file offset (fd_offset) recorded in @rb. */
ram_addr_t qemu_ram_get_fd_offset(RAMBlock *rb)
{
    return rb->fd_offset;
}
|
|
|
|
|
|
2019-02-15 20:45:44 +03:00
|
|
|
/* Return the currently used length of @rb. */
ram_addr_t qemu_ram_get_used_length(RAMBlock *rb)
{
    return rb->used_length;
}
|
|
|
|
|
|
2021-04-29 13:26:59 +02:00
|
|
|
/* Return the maximum length of @rb. */
ram_addr_t qemu_ram_get_max_length(RAMBlock *rb)
{
    return rb->max_length;
}
|
|
|
|
|
|
2017-03-07 18:36:36 +00:00
|
|
|
/* Return true when @rb has the RAM_SHARED flag set. */
bool qemu_ram_is_shared(RAMBlock *rb)
{
    return (rb->flags & RAM_SHARED) != 0;
}
|
|
|
|
|
|
2021-05-10 13:43:21 +02:00
|
|
|
/* Return true when @rb has the RAM_NORESERVE flag set. */
bool qemu_ram_is_noreserve(RAMBlock *rb)
{
    return (rb->flags & RAM_NORESERVE) != 0;
}
|
|
|
|
|
|
2018-03-12 17:20:58 +00:00
|
|
|
/* Note: Only set at the start of postcopy */
|
|
|
|
|
bool qemu_ram_is_uf_zeroable(RAMBlock *rb)
|
|
|
|
|
{
|
|
|
|
|
return rb->flags & RAM_UF_ZEROPAGE;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Set the RAM_UF_ZEROPAGE flag on @rb. */
void qemu_ram_set_uf_zeroable(RAMBlock *rb)
{
    rb->flags |= RAM_UF_ZEROPAGE;
}
|
|
|
|
|
|
2018-05-14 08:57:00 +02:00
|
|
|
/* Return true when @rb has the RAM_MIGRATABLE flag set. */
bool qemu_ram_is_migratable(RAMBlock *rb)
{
    return (rb->flags & RAM_MIGRATABLE) != 0;
}
|
|
|
|
|
|
|
|
|
|
/* Set the RAM_MIGRATABLE flag on @rb. */
void qemu_ram_set_migratable(RAMBlock *rb)
{
    rb->flags |= RAM_MIGRATABLE;
}
|
|
|
|
|
|
|
|
|
|
/* Clear the RAM_MIGRATABLE flag on @rb. */
void qemu_ram_unset_migratable(RAMBlock *rb)
{
    rb->flags &= ~RAM_MIGRATABLE;
}
|
|
|
|
|
|
2023-06-07 08:18:36 -07:00
|
|
|
/* Return true when @rb has the RAM_NAMED_FILE flag set. */
bool qemu_ram_is_named_file(RAMBlock *rb)
{
    return (rb->flags & RAM_NAMED_FILE) != 0;
}
|
|
|
|
|
|
2022-10-13 14:59:05 -04:00
|
|
|
/* Return the file descriptor recorded in @rb. */
int qemu_ram_get_fd(RAMBlock *rb)
{
    return rb->fd;
}
|
|
|
|
|
|
2024-01-02 10:35:28 -05:00
|
|
|
/* Called with the BQL held. */
|
2016-05-10 10:04:59 +08:00
|
|
|
void qemu_ram_set_idstr(RAMBlock *new_block, const char *name, DeviceState *dev)
|
2014-04-02 15:13:26 +08:00
|
|
|
{
|
2016-05-10 10:04:59 +08:00
|
|
|
RAMBlock *block;
|
2014-04-02 15:13:26 +08:00
|
|
|
|
2011-12-20 15:59:12 +02:00
|
|
|
assert(new_block);
|
|
|
|
|
assert(!new_block->idstr[0]);
|
2010-07-26 18:10:57 -06:00
|
|
|
|
2012-02-03 12:28:43 -06:00
|
|
|
if (dev) {
|
|
|
|
|
char *id = qdev_get_dev_path(dev);
|
2010-07-26 18:10:57 -06:00
|
|
|
if (id) {
|
|
|
|
|
snprintf(new_block->idstr, sizeof(new_block->idstr), "%s/", id);
|
2011-08-20 22:09:37 -05:00
|
|
|
g_free(id);
|
2010-07-26 18:10:57 -06:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
pstrcat(new_block->idstr, sizeof(new_block->idstr), name);
|
|
|
|
|
|
2019-10-07 15:36:41 +01:00
|
|
|
RCU_READ_LOCK_GUARD();
|
2017-05-12 12:17:39 +08:00
|
|
|
RAMBLOCK_FOREACH(block) {
|
2016-05-10 10:04:59 +08:00
|
|
|
if (block != new_block &&
|
|
|
|
|
!strcmp(block->idstr, new_block->idstr)) {
|
2010-07-26 18:10:57 -06:00
|
|
|
fprintf(stderr, "RAMBlock \"%s\" already registered, abort!\n",
|
|
|
|
|
new_block->idstr);
|
|
|
|
|
abort();
|
|
|
|
|
}
|
|
|
|
|
}
|
2011-12-20 15:59:12 +02:00
|
|
|
}
|
|
|
|
|
|
2024-01-02 10:35:28 -05:00
|
|
|
/* Called with the BQL held. */
|
2016-05-10 10:04:59 +08:00
|
|
|
void qemu_ram_unset_idstr(RAMBlock *block)
|
2014-04-02 15:13:26 +08:00
|
|
|
{
|
2013-09-05 14:41:35 -04:00
|
|
|
/* FIXME: arch_init.c assumes that this is not called throughout
|
|
|
|
|
* migration. Ignore the problem since hot-unplug during migration
|
|
|
|
|
* does not work anyway.
|
|
|
|
|
*/
|
2014-04-02 15:13:26 +08:00
|
|
|
if (block) {
|
|
|
|
|
memset(block->idstr, 0, sizeof(block->idstr));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2025-01-15 11:00:34 -08:00
|
|
|
/*
 * Build a newly allocated name for @mr: "<device path>/<region name>" when
 * the region has a device with a device path, otherwise just the region
 * name.  The caller owns the returned string.
 */
static char *cpr_name(MemoryRegion *mr)
{
    const char *region = memory_region_name(mr);
    g_autofree char *dev_path = mr->dev ? qdev_get_dev_path(mr->dev) : NULL;

    if (!dev_path) {
        return g_strdup(region);
    }

    return g_strdup_printf("%s/%s", dev_path, region);
}
|
|
|
|
|
|
2016-09-29 20:09:37 +01:00
|
|
|
/* Return the page size recorded in @rb. */
size_t qemu_ram_pagesize(RAMBlock *rb)
{
    return rb->page_size;
}
|
|
|
|
|
|
2017-02-24 18:28:34 +00:00
|
|
|
/* Returns the largest size of page in use */
|
|
|
|
|
size_t qemu_ram_pagesize_largest(void)
|
|
|
|
|
{
|
|
|
|
|
RAMBlock *block;
|
|
|
|
|
size_t largest = 0;
|
|
|
|
|
|
2017-05-12 12:17:39 +08:00
|
|
|
RAMBLOCK_FOREACH(block) {
|
2017-02-24 18:28:34 +00:00
|
|
|
largest = MAX(largest, qemu_ram_pagesize(block));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return largest;
|
|
|
|
|
}
|
|
|
|
|
|
2012-09-05 16:50:16 -03:00
|
|
|
static int memory_try_enable_merging(void *addr, size_t len)
|
|
|
|
|
{
|
2015-02-04 17:43:55 +02:00
|
|
|
if (!machine_mem_merge(current_machine)) {
|
2012-09-05 16:50:16 -03:00
|
|
|
/* disabled by the user */
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return qemu_madvise(addr, len, QEMU_MADV_MERGEABLE);
|
|
|
|
|
}
|
|
|
|
|
|
2021-04-29 13:27:02 +02:00
|
|
|
/*
|
|
|
|
|
* Resizing RAM while migrating can result in the migration being canceled.
|
|
|
|
|
* Care has to be taken if the guest might have already detected the memory.
|
2014-11-12 14:27:41 +02:00
|
|
|
*
|
|
|
|
|
* As memory core doesn't know how is memory accessed, it is up to
|
|
|
|
|
* resize callback to update device state and/or add assertions to detect
|
|
|
|
|
* misuse, if necessary.
|
|
|
|
|
*/
|
2016-05-10 10:04:59 +08:00
|
|
|
int qemu_ram_resize(RAMBlock *block, ram_addr_t newsize, Error **errp)
|
2014-11-12 14:27:41 +02:00
|
|
|
{
|
2021-04-29 13:27:00 +02:00
|
|
|
const ram_addr_t oldsize = block->used_length;
|
2020-04-03 11:18:27 +01:00
|
|
|
const ram_addr_t unaligned_size = newsize;
|
|
|
|
|
|
2014-11-12 14:27:41 +02:00
|
|
|
assert(block);
|
|
|
|
|
|
2024-01-02 12:57:49 +11:00
|
|
|
newsize = TARGET_PAGE_ALIGN(newsize);
|
|
|
|
|
newsize = REAL_HOST_PAGE_ALIGN(newsize);
|
2015-02-17 10:15:30 +01:00
|
|
|
|
2014-11-12 14:27:41 +02:00
|
|
|
if (block->used_length == newsize) {
|
2020-04-03 11:18:27 +01:00
|
|
|
/*
|
|
|
|
|
* We don't have to resize the ram block (which only knows aligned
|
|
|
|
|
* sizes), however, we have to notify if the unaligned size changed.
|
|
|
|
|
*/
|
|
|
|
|
if (unaligned_size != memory_region_size(block->mr)) {
|
|
|
|
|
memory_region_set_size(block->mr, unaligned_size);
|
|
|
|
|
if (block->resized) {
|
|
|
|
|
block->resized(block->idstr, unaligned_size, block->host);
|
|
|
|
|
}
|
|
|
|
|
}
|
2014-11-12 14:27:41 +02:00
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (!(block->flags & RAM_RESIZEABLE)) {
|
|
|
|
|
error_setg_errno(errp, EINVAL,
|
2020-10-22 13:13:02 +02:00
|
|
|
"Size mismatch: %s: 0x" RAM_ADDR_FMT
|
|
|
|
|
" != 0x" RAM_ADDR_FMT, block->idstr,
|
2014-11-12 14:27:41 +02:00
|
|
|
newsize, block->used_length);
|
|
|
|
|
return -EINVAL;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (block->max_length < newsize) {
|
|
|
|
|
error_setg_errno(errp, EINVAL,
|
2020-10-22 13:13:02 +02:00
|
|
|
"Size too large: %s: 0x" RAM_ADDR_FMT
|
2014-11-12 14:27:41 +02:00
|
|
|
" > 0x" RAM_ADDR_FMT, block->idstr,
|
|
|
|
|
newsize, block->max_length);
|
|
|
|
|
return -EINVAL;
|
|
|
|
|
}
|
|
|
|
|
|
2021-04-29 13:27:00 +02:00
|
|
|
/* Notify before modifying the ram block and touching the bitmaps. */
|
|
|
|
|
if (block->host) {
|
|
|
|
|
ram_block_notify_resize(block->host, oldsize, newsize);
|
|
|
|
|
}
|
|
|
|
|
|
2025-09-29 13:55:15 +02:00
|
|
|
physical_memory_clear_dirty_range(block->offset, block->used_length);
|
2014-11-12 14:27:41 +02:00
|
|
|
block->used_length = newsize;
|
2025-09-30 09:08:44 +02:00
|
|
|
physical_memory_set_dirty_range(block->offset, block->used_length,
|
2015-03-23 11:56:01 +01:00
|
|
|
DIRTY_CLIENTS_ALL);
|
2020-04-03 11:18:27 +01:00
|
|
|
memory_region_set_size(block->mr, unaligned_size);
|
2014-11-12 14:27:41 +02:00
|
|
|
if (block->resized) {
|
2020-04-03 11:18:27 +01:00
|
|
|
block->resized(block->idstr, unaligned_size, block->host);
|
2014-11-12 14:27:41 +02:00
|
|
|
}
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
2019-11-21 00:08:41 +00:00
|
|
|
/*
|
|
|
|
|
* Trigger sync on the given ram block for range [start, start + length]
|
|
|
|
|
* with the backing store if one is available.
|
|
|
|
|
* Otherwise no-op.
|
|
|
|
|
* @Note: this is supposed to be a synchronous op.
|
|
|
|
|
*/
|
2020-05-08 08:24:56 +02:00
|
|
|
void qemu_ram_msync(RAMBlock *block, ram_addr_t start, ram_addr_t length)
|
2019-11-21 00:08:41 +00:00
|
|
|
{
|
|
|
|
|
/* The requested range should fit in within the block range */
|
|
|
|
|
g_assert((start + length) <= block->used_length);
|
|
|
|
|
|
|
|
|
|
#ifdef CONFIG_LIBPMEM
|
|
|
|
|
/* The lack of support for pmem should not block the sync */
|
2025-09-29 17:12:12 +02:00
|
|
|
if (ram_block_is_pmem(block)) {
|
2019-12-19 15:43:22 +00:00
|
|
|
void *addr = ramblock_ptr(block, start);
|
2019-11-21 00:08:41 +00:00
|
|
|
pmem_persist(addr, length);
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
if (block->fd >= 0) {
|
|
|
|
|
/**
|
|
|
|
|
* Case there is no support for PMEM or the memory has not been
|
|
|
|
|
* specified as persistent (or is not one) - use the msync.
|
|
|
|
|
* Less optimal but still achieves the same goal
|
|
|
|
|
*/
|
2019-12-19 15:43:22 +00:00
|
|
|
void *addr = ramblock_ptr(block, start);
|
2019-11-21 00:08:41 +00:00
|
|
|
if (qemu_msync(addr, length, block->fd)) {
|
|
|
|
|
warn_report("%s: failed to sync memory range: start: "
|
|
|
|
|
RAM_ADDR_FMT " length: " RAM_ADDR_FMT,
|
|
|
|
|
__func__, start, length);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2016-01-25 13:33:20 +00:00
|
|
|
/* Called with ram_list.mutex held */
|
2024-08-28 11:07:43 +02:00
|
|
|
static void dirty_memory_extend(ram_addr_t new_ram_size)
|
2016-01-25 13:33:20 +00:00
|
|
|
{
|
2024-08-28 11:07:43 +02:00
|
|
|
unsigned int old_num_blocks = ram_list.num_dirty_blocks;
|
|
|
|
|
unsigned int new_num_blocks = DIV_ROUND_UP(new_ram_size,
|
|
|
|
|
DIRTY_MEMORY_BLOCK_SIZE);
|
2016-01-25 13:33:20 +00:00
|
|
|
int i;
|
|
|
|
|
|
|
|
|
|
/* Only need to extend if block count increased */
|
|
|
|
|
if (new_num_blocks <= old_num_blocks) {
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for (i = 0; i < DIRTY_MEMORY_NUM; i++) {
|
|
|
|
|
DirtyMemoryBlocks *old_blocks;
|
|
|
|
|
DirtyMemoryBlocks *new_blocks;
|
|
|
|
|
int j;
|
|
|
|
|
|
2020-09-23 11:56:46 +01:00
|
|
|
old_blocks = qatomic_rcu_read(&ram_list.dirty_memory[i]);
|
2016-01-25 13:33:20 +00:00
|
|
|
new_blocks = g_malloc(sizeof(*new_blocks) +
|
|
|
|
|
sizeof(new_blocks->blocks[0]) * new_num_blocks);
|
|
|
|
|
|
|
|
|
|
if (old_num_blocks) {
|
|
|
|
|
memcpy(new_blocks->blocks, old_blocks->blocks,
|
|
|
|
|
old_num_blocks * sizeof(old_blocks->blocks[0]));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for (j = old_num_blocks; j < new_num_blocks; j++) {
|
|
|
|
|
new_blocks->blocks[j] = bitmap_new(DIRTY_MEMORY_BLOCK_SIZE);
|
|
|
|
|
}
|
|
|
|
|
|
2020-09-23 11:56:46 +01:00
|
|
|
qatomic_rcu_set(&ram_list.dirty_memory[i], new_blocks);
|
2016-01-25 13:33:20 +00:00
|
|
|
|
|
|
|
|
if (old_blocks) {
|
|
|
|
|
g_free_rcu(old_blocks, rcu);
|
|
|
|
|
}
|
|
|
|
|
}
|
2024-08-28 11:07:43 +02:00
|
|
|
|
|
|
|
|
ram_list.num_dirty_blocks = new_num_blocks;
|
2016-01-25 13:33:20 +00:00
|
|
|
}
|
|
|
|
|
|
2021-04-06 10:01:24 +02:00
|
|
|
/*
 * Register @new_block in the global RAM block list.
 *
 * Under the ramlist lock this picks an offset for the block, allocates host
 * memory when the block has none (through Xen or anonymous RAM), sets up
 * guest_memfd state for RAM_GUEST_MEMFD blocks, extends the dirty bitmaps
 * and inserts the block into ram_list.blocks.  After dropping the lock it
 * marks the block's used range dirty and, when the block has host memory,
 * applies the madvise hints and notifies RAM block listeners.
 *
 * On failure @errp is set and the block is not inserted.  Every exit path
 * releases the ramlist lock, and host memory allocated here is freed again.
 */
static void ram_block_add(RAMBlock *new_block, Error **errp)
{
    const bool noreserve = qemu_ram_is_noreserve(new_block);
    const bool shared = qemu_ram_is_shared(new_block);
    RAMBlock *block;
    RAMBlock *last_block = NULL;
    bool free_on_error = false;
    ram_addr_t ram_size;
    Error *err = NULL;

    qemu_mutex_lock_ramlist();
    new_block->offset = find_ram_offset(new_block->max_length);

    if (!new_block->host) {
        if (xen_enabled()) {
            xen_ram_alloc(new_block->offset, new_block->max_length,
                          new_block->mr, &err);
            if (err) {
                error_propagate(errp, err);
                goto out_free;
            }
        } else {
            new_block->host = qemu_anon_ram_alloc(new_block->max_length,
                                                  &new_block->mr->align,
                                                  shared, noreserve);
            if (!new_block->host) {
                error_setg_errno(errp, errno,
                                 "cannot set up guest memory '%s'",
                                 memory_region_name(new_block->mr));
                goto out_free;
            }
            memory_try_enable_merging(new_block->host, new_block->max_length);
            /* From here on an error must release the anonymous mapping. */
            free_on_error = true;
        }
    }

    if (new_block->flags & RAM_GUEST_MEMFD) {
        int ret;

        if (!kvm_enabled()) {
            error_setg(errp, "cannot set up private guest memory for %s: KVM required",
                       object_get_typename(OBJECT(current_machine->cgs)));
            goto out_free;
        }
        assert(new_block->guest_memfd < 0);

        ret = ram_block_coordinated_discard_require(true);
        if (ret < 0) {
            error_setg_errno(errp, -ret,
                             "cannot set up private guest memory: discard currently blocked");
            error_append_hint(errp, "Are you using assigned devices?\n");
            goto out_free;
        }

        new_block->guest_memfd = kvm_create_guest_memfd(new_block->max_length,
                                                        0, errp);
        if (new_block->guest_memfd < 0) {
            /* Balance the successful discard requirement taken above. */
            ram_block_coordinated_discard_require(false);
            goto out_free;
        }

        /*
         * The attribute bitmap of the RamBlockAttributes is default to
         * discarded, which mimics the behavior of kvm_set_phys_mem() when it
         * calls kvm_set_memory_attributes_private().  This leads to a brief
         * period of inconsistency between the creation of the RAMBlock and its
         * mapping into the physical address space.  However, this is not
         * problematic, as no users rely on the attribute status to perform
         * any actions during this interval.
         */
        new_block->attributes = ram_block_attributes_create(new_block);
        if (!new_block->attributes) {
            error_setg(errp, "Failed to create ram block attribute");
            close(new_block->guest_memfd);
            /* Do not leave a stale, already closed descriptor behind. */
            new_block->guest_memfd = -1;
            ram_block_coordinated_discard_require(false);
            goto out_free;
        }

        /*
         * Add a specific guest_memfd blocker if a generic one would not be
         * added by ram_block_add_cpr_blocker.
         */
        if (ram_is_cpr_compatible(new_block)) {
            error_setg(&new_block->cpr_blocker,
                       "Memory region %s uses guest_memfd, "
                       "which is not supported with CPR.",
                       memory_region_name(new_block->mr));
            migrate_add_blocker_modes(&new_block->cpr_blocker,
                                      BIT(MIG_MODE_CPR_TRANSFER), errp);
        }
    }

    ram_size = (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS;
    dirty_memory_extend(ram_size);
    /* Keep the list sorted from biggest to smallest block.  Unlike QTAILQ,
     * QLIST (which has an RCU-friendly variant) does not have insertion at
     * tail, so save the last element in last_block.
     */
    RAMBLOCK_FOREACH(block) {
        last_block = block;
        if (block->max_length < new_block->max_length) {
            break;
        }
    }
    if (block) {
        QLIST_INSERT_BEFORE_RCU(block, new_block, next);
    } else if (last_block) {
        QLIST_INSERT_AFTER_RCU(last_block, new_block, next);
    } else { /* list is empty */
        QLIST_INSERT_HEAD_RCU(&ram_list.blocks, new_block, next);
    }
    ram_list.mru_block = NULL;

    /* Write list before version */
    smp_wmb();
    ram_list.version++;
    qemu_mutex_unlock_ramlist();

    physical_memory_set_dirty_range(new_block->offset,
                                    new_block->used_length,
                                    DIRTY_CLIENTS_ALL);

    if (new_block->host) {
        qemu_ram_setup_dump(new_block->host, new_block->max_length);
        qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_HUGEPAGE);
        /*
         * MADV_DONTFORK is also needed by KVM in absence of synchronous MMU
         * Configure it unless the machine is a qtest server, in which case
         * KVM is not used and it may be forked (eg for fuzzing purposes).
         */
        if (!qtest_enabled()) {
            qemu_madvise(new_block->host, new_block->max_length,
                         QEMU_MADV_DONTFORK);
        }
        ram_block_notify_add(new_block->host, new_block->used_length,
                             new_block->max_length);
    }
    return;

out_free:
    /*
     * Every jump here happens with the ramlist lock held, so release it in
     * this one place instead of relying on each error site to remember.
     */
    qemu_mutex_unlock_ramlist();
    if (free_on_error) {
        qemu_anon_ram_free(new_block->host, new_block->max_length);
        new_block->host = NULL;
    }
}
|
2007-02-08 23:08:38 +00:00
|
|
|
|
2025-04-28 15:39:05 +09:00
|
|
|
#if defined(CONFIG_POSIX) && !defined(EMSCRIPTEN)
|
2025-01-15 11:00:29 -08:00
|
|
|
RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, ram_addr_t max_size,
|
|
|
|
|
qemu_ram_resize_cb resized, MemoryRegion *mr,
|
2021-01-29 11:46:04 -05:00
|
|
|
uint32_t ram_flags, int fd, off_t offset,
|
2025-01-15 11:00:29 -08:00
|
|
|
bool grow,
|
2023-09-06 14:04:54 +02:00
|
|
|
Error **errp)
|
2014-05-14 17:43:18 +08:00
|
|
|
{
|
physmem: fd-based shared memory
Create MAP_SHARED RAMBlocks by mmap'ing a file descriptor rather than using
MAP_ANON, so the memory can be accessed in another process by passing and
mmap'ing the fd. This will allow CPR to support memory-backend-ram and
memory-backend-shm objects, provided the user creates them with share=on.
Use memfd_create if available because it has no constraints. If not, use
POSIX shm_open. However, allocation on the opened fd may fail if the shm
mount size is too small, even if the system has free memory, so for backwards
compatibility fall back to qemu_anon_ram_alloc/MAP_ANON on failure.
For backwards compatibility on Windows, always use MAP_ANON. share=on has
no purpose there, but the syntax is accepted, and must continue to work.
Lastly, quietly fall back to MAP_ANON if the system does not support
qemu_ram_alloc_from_fd.
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Link: https://lore.kernel.org/r/1736967650-129648-5-git-send-email-steven.sistare@oracle.com
Signed-off-by: Fabiano Rosas <farosas@suse.de>
2025-01-15 11:00:30 -08:00
|
|
|
ERRP_GUARD();
|
2014-05-14 17:43:18 +08:00
|
|
|
RAMBlock *new_block;
|
2014-09-09 13:27:54 +08:00
|
|
|
Error *local_err = NULL;
|
2025-01-15 11:00:31 -08:00
|
|
|
int64_t file_size, file_align, share_flags;
|
|
|
|
|
|
|
|
|
|
share_flags = ram_flags & (RAM_PRIVATE | RAM_SHARED);
|
|
|
|
|
assert(share_flags != (RAM_SHARED | RAM_PRIVATE));
|
|
|
|
|
ram_flags &= ~RAM_PRIVATE;
|
2014-05-14 17:43:18 +08:00
|
|
|
|
2018-07-18 15:48:00 +08:00
|
|
|
/* Just support these ram flags by now. */
|
2021-07-19 19:21:04 +08:00
|
|
|
assert((ram_flags & ~(RAM_SHARED | RAM_PMEM | RAM_NORESERVE |
|
2023-09-06 14:04:54 +02:00
|
|
|
RAM_PROTECTED | RAM_NAMED_FILE | RAM_READONLY |
|
2025-01-15 11:00:29 -08:00
|
|
|
RAM_READONLY_FD | RAM_GUEST_MEMFD |
|
|
|
|
|
RAM_RESIZEABLE)) == 0);
|
|
|
|
|
assert(max_size >= size);
|
2018-07-18 15:48:00 +08:00
|
|
|
|
2014-05-14 17:43:18 +08:00
|
|
|
if (xen_enabled()) {
|
2014-05-14 17:43:20 +08:00
|
|
|
error_setg(errp, "-mem-path not supported with Xen");
|
2016-03-01 14:18:18 +08:00
|
|
|
return NULL;
|
2014-05-14 17:43:18 +08:00
|
|
|
}
|
|
|
|
|
|
2017-06-02 18:12:21 +04:00
|
|
|
if (kvm_enabled() && !kvm_has_sync_mmu()) {
|
|
|
|
|
error_setg(errp,
|
|
|
|
|
"host lacks kvm mmu notifiers, -mem-path unsupported");
|
|
|
|
|
return NULL;
|
|
|
|
|
}
|
|
|
|
|
|
2024-01-02 12:57:49 +11:00
|
|
|
size = TARGET_PAGE_ALIGN(size);
|
|
|
|
|
size = REAL_HOST_PAGE_ALIGN(size);
|
2025-01-15 11:00:29 -08:00
|
|
|
max_size = TARGET_PAGE_ALIGN(max_size);
|
|
|
|
|
max_size = REAL_HOST_PAGE_ALIGN(max_size);
|
2024-01-02 12:57:49 +11:00
|
|
|
|
2017-06-02 18:12:22 +04:00
|
|
|
file_size = get_file_size(fd);
|
2025-01-15 11:00:29 -08:00
|
|
|
if (file_size && file_size < offset + max_size && !grow) {
|
physmem: fix qemu_ram_alloc_from_fd size calculation
qemu_ram_alloc_from_fd allocates space if file_size == 0. If non-zero,
it uses the existing space and verifies it is large enough, but the
verification was broken when the offset parameter was introduced. As
a result, a file smaller than offset passes the verification and causes
errors later. Fix that, and update the error message to include offset.
Peter provides this concise reproducer:
$ touch ramfile
$ truncate -s 64M ramfile
$ ./qemu-system-x86_64 -object memory-backend-file,mem-path=./ramfile,offset=128M,size=128M,id=mem1,prealloc=on
qemu-system-x86_64: qemu_prealloc_mem: preallocating memory failed: Bad address
With the fix, the error message is:
qemu-system-x86_64: mem1 backing store size 0x4000000 is too small for 'size' option 0x8000000 plus 'offset' option 0x8000000
Cc: qemu-stable@nongnu.org
Fixes: 4b870dc4d0c0 ("hostmem-file: add offset option")
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Acked-by: David Hildenbrand <david@redhat.com>
Link: https://lore.kernel.org/r/1736967650-129648-3-git-send-email-steven.sistare@oracle.com
Signed-off-by: Fabiano Rosas <farosas@suse.de>
2025-01-15 11:00:28 -08:00
|
|
|
error_setg(errp, "%s backing store size 0x%" PRIx64
|
|
|
|
|
" is too small for 'size' option 0x" RAM_ADDR_FMT
|
|
|
|
|
" plus 'offset' option 0x%" PRIx64,
|
2025-01-15 11:00:29 -08:00
|
|
|
memory_region_name(mr), file_size, max_size,
|
physmem: fix qemu_ram_alloc_from_fd size calculation
qemu_ram_alloc_from_fd allocates space if file_size == 0. If non-zero,
it uses the existing space and verifies it is large enough, but the
verification was broken when the offset parameter was introduced. As
a result, a file smaller than offset passes the verification and causes
errors later. Fix that, and update the error message to include offset.
Peter provides this concise reproducer:
$ touch ramfile
$ truncate -s 64M ramfile
$ ./qemu-system-x86_64 -object memory-backend-file,mem-path=./ramfile,offset=128M,size=128M,id=mem1,prealloc=on
qemu-system-x86_64: qemu_prealloc_mem: preallocating memory failed: Bad address
With the fix, the error message is:
qemu-system-x86_64: mem1 backing store size 0x4000000 is too small for 'size' option 0x8000000 plus 'offset' option 0x8000000
Cc: qemu-stable@nongnu.org
Fixes: 4b870dc4d0c0 ("hostmem-file: add offset option")
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Acked-by: David Hildenbrand <david@redhat.com>
Link: https://lore.kernel.org/r/1736967650-129648-3-git-send-email-steven.sistare@oracle.com
Signed-off-by: Fabiano Rosas <farosas@suse.de>
2025-01-15 11:00:28 -08:00
|
|
|
(uint64_t)offset);
|
2017-06-02 18:12:22 +04:00
|
|
|
return NULL;
|
|
|
|
|
}
|
|
|
|
|
|
2020-04-29 16:50:09 +08:00
|
|
|
file_align = get_file_align(fd);
|
2021-08-12 16:06:24 +01:00
|
|
|
if (file_align > 0 && file_align > mr->align) {
|
2020-04-29 16:50:09 +08:00
|
|
|
error_setg(errp, "backing store align 0x%" PRIx64
|
2020-04-29 16:50:10 +08:00
|
|
|
" is larger than 'align' option 0x%" PRIx64,
|
2020-04-29 16:50:09 +08:00
|
|
|
file_align, mr->align);
|
|
|
|
|
return NULL;
|
|
|
|
|
}
|
|
|
|
|
|
2014-05-14 17:43:18 +08:00
|
|
|
new_block = g_malloc0(sizeof(*new_block));
|
|
|
|
|
new_block->mr = mr;
|
2014-12-15 22:55:32 +02:00
|
|
|
new_block->used_length = size;
|
2025-01-15 11:00:29 -08:00
|
|
|
new_block->max_length = max_size;
|
|
|
|
|
new_block->resized = resized;
|
2018-07-18 15:47:58 +08:00
|
|
|
new_block->flags = ram_flags;
|
2024-03-20 03:39:02 -05:00
|
|
|
new_block->guest_memfd = -1;
|
2025-01-15 11:00:29 -08:00
|
|
|
new_block->host = file_ram_alloc(new_block, max_size, fd,
|
|
|
|
|
file_size < offset + max_size,
|
|
|
|
|
offset, errp);
|
2014-05-14 17:43:20 +08:00
|
|
|
if (!new_block->host) {
|
|
|
|
|
g_free(new_block);
|
2016-03-01 14:18:18 +08:00
|
|
|
return NULL;
|
2014-05-14 17:43:20 +08:00
|
|
|
}
|
|
|
|
|
|
2021-04-06 10:01:24 +02:00
|
|
|
ram_block_add(new_block, &local_err);
|
2014-09-09 13:27:54 +08:00
|
|
|
if (local_err) {
|
|
|
|
|
g_free(new_block);
|
|
|
|
|
error_propagate(errp, local_err);
|
2016-03-01 14:18:18 +08:00
|
|
|
return NULL;
|
2014-09-09 13:27:54 +08:00
|
|
|
}
|
2016-03-01 14:18:18 +08:00
|
|
|
return new_block;
|
2017-06-02 18:12:23 +04:00
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/*
 * Allocate a RAMBlock backed by the file at @mem_path.
 *
 * @size:      block size; used as both the initial and the maximum length
 * @mr:        memory region the block belongs to
 * @ram_flags: RAM_* flags; RAM_READONLY_FD requests a read-only open
 * @mem_path:  path of the backing file
 * @offset:    offset into the file at which the block starts
 * @errp:      error object
 *
 * Returns the new block, or NULL with @errp set on failure.  On failure the
 * file descriptor is closed again, and the file is unlinked if
 * file_ram_open() reported that it created it.
 */
RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
                                   uint32_t ram_flags, const char *mem_path,
                                   off_t offset, Error **errp)
{
    /*
     * Required because error_append_hint() is used on @errp below: without
     * the guard the hint is lost when the caller passes &error_fatal.
     */
    ERRP_GUARD();
    int fd;
    bool created;
    RAMBlock *block;

    fd = file_ram_open(mem_path, memory_region_name(mr),
                       !!(ram_flags & RAM_READONLY_FD), &created);
    if (fd < 0) {
        error_setg_errno(errp, -fd, "can't open backing store %s for guest RAM",
                         mem_path);
        if (!(ram_flags & RAM_READONLY_FD) && !(ram_flags & RAM_SHARED) &&
            fd == -EACCES) {
            /*
             * If we can open the file R/O (note: will never create a new file)
             * and we are dealing with a private mapping, there are still ways
             * to consume such files and get RAM instead of ROM.
             */
            fd = file_ram_open(mem_path, memory_region_name(mr), true,
                               &created);
            if (fd < 0) {
                return NULL;
            }
            assert(!created);
            /* The R/O open was only a probe; the descriptor is not used */
            close(fd);
            error_append_hint(errp, "Consider opening the backing store"
                " read-only but still creating writable RAM using"
                " '-object memory-backend-file,readonly=on,rom=off...'"
                " (see \"VM templating\" documentation)\n");
        }
        return NULL;
    }

    block = qemu_ram_alloc_from_fd(size, size, NULL, mr, ram_flags, fd, offset,
                                   false, errp);
    if (!block) {
        /* Do not leave behind a file that only exists because of this call */
        if (created) {
            unlink(mem_path);
        }
        close(fd);
        return NULL;
    }

    return block;
}
|
2014-05-14 17:43:19 +08:00
|
|
|
#endif
|
2014-05-14 17:43:18 +08:00
|
|
|
|
physmem: fd-based shared memory
Create MAP_SHARED RAMBlocks by mmap'ing a file descriptor rather than using
MAP_ANON, so the memory can be accessed in another process by passing and
mmap'ing the fd. This will allow CPR to support memory-backend-ram and
memory-backend-shm objects, provided the user creates them with share=on.
Use memfd_create if available because it has no constraints. If not, use
POSIX shm_open. However, allocation on the opened fd may fail if the shm
mount size is too small, even if the system has free memory, so for backwards
compatibility fall back to qemu_anon_ram_alloc/MAP_ANON on failure.
For backwards compatibility on Windows, always use MAP_ANON. share=on has
no purpose there, but the syntax is accepted, and must continue to work.
Lastly, quietly fall back to MAP_ANON if the system does not support
qemu_ram_alloc_from_fd.
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Link: https://lore.kernel.org/r/1736967650-129648-5-git-send-email-steven.sistare@oracle.com
Signed-off-by: Fabiano Rosas <farosas@suse.de>
2025-01-15 11:00:30 -08:00
|
|
|
#ifdef CONFIG_POSIX
|
|
|
|
|
/*
|
|
|
|
|
* Create MAP_SHARED RAMBlocks by mmap'ing a file descriptor, so it can be
|
|
|
|
|
* shared with another process if CPR is being used. Use memfd if available
|
|
|
|
|
* because it has no size limits, else use POSIX shm.
|
|
|
|
|
*/
|
2025-01-15 11:00:34 -08:00
|
|
|
static int qemu_ram_get_shared_fd(const char *name, bool *reused, Error **errp)
|
physmem: fd-based shared memory
Create MAP_SHARED RAMBlocks by mmap'ing a file descriptor rather than using
MAP_ANON, so the memory can be accessed in another process by passing and
mmap'ing the fd. This will allow CPR to support memory-backend-ram and
memory-backend-shm objects, provided the user creates them with share=on.
Use memfd_create if available because it has no constraints. If not, use
POSIX shm_open. However, allocation on the opened fd may fail if the shm
mount size is too small, even if the system has free memory, so for backwards
compatibility fall back to qemu_anon_ram_alloc/MAP_ANON on failure.
For backwards compatibility on Windows, always use MAP_ANON. share=on has
no purpose there, but the syntax is accepted, and must continue to work.
Lastly, quietly fall back to MAP_ANON if the system does not support
qemu_ram_alloc_from_fd.
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Link: https://lore.kernel.org/r/1736967650-129648-5-git-send-email-steven.sistare@oracle.com
Signed-off-by: Fabiano Rosas <farosas@suse.de>
2025-01-15 11:00:30 -08:00
|
|
|
{
|
2025-01-15 11:00:34 -08:00
|
|
|
int fd = cpr_find_fd(name, 0);
|
|
|
|
|
|
|
|
|
|
if (fd >= 0) {
|
|
|
|
|
*reused = true;
|
|
|
|
|
return fd;
|
|
|
|
|
}
|
physmem: fd-based shared memory
Create MAP_SHARED RAMBlocks by mmap'ing a file descriptor rather than using
MAP_ANON, so the memory can be accessed in another process by passing and
mmap'ing the fd. This will allow CPR to support memory-backend-ram and
memory-backend-shm objects, provided the user creates them with share=on.
Use memfd_create if available because it has no constraints. If not, use
POSIX shm_open. However, allocation on the opened fd may fail if the shm
mount size is too small, even if the system has free memory, so for backwards
compatibility fall back to qemu_anon_ram_alloc/MAP_ANON on failure.
For backwards compatibility on Windows, always use MAP_ANON. share=on has
no purpose there, but the syntax is accepted, and must continue to work.
Lastly, quietly fall back to MAP_ANON if the system does not support
qemu_ram_alloc_from_fd.
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Link: https://lore.kernel.org/r/1736967650-129648-5-git-send-email-steven.sistare@oracle.com
Signed-off-by: Fabiano Rosas <farosas@suse.de>
2025-01-15 11:00:30 -08:00
|
|
|
|
|
|
|
|
if (qemu_memfd_check(0)) {
|
|
|
|
|
fd = qemu_memfd_create(name, 0, 0, 0, 0, errp);
|
|
|
|
|
} else {
|
|
|
|
|
fd = qemu_shm_alloc(0, errp);
|
|
|
|
|
}
|
2025-01-15 11:00:34 -08:00
|
|
|
|
|
|
|
|
if (fd >= 0) {
|
|
|
|
|
cpr_save_fd(name, 0, fd);
|
|
|
|
|
}
|
|
|
|
|
*reused = false;
|
physmem: fd-based shared memory
Create MAP_SHARED RAMBlocks by mmap'ing a file descriptor rather than using
MAP_ANON, so the memory can be accessed in another process by passing and
mmap'ing the fd. This will allow CPR to support memory-backend-ram and
memory-backend-shm objects, provided the user creates them with share=on.
Use memfd_create if available because it has no constraints. If not, use
POSIX shm_open. However, allocation on the opened fd may fail if the shm
mount size is too small, even if the system has free memory, so for backwards
compatibility fall back to qemu_anon_ram_alloc/MAP_ANON on failure.
For backwards compatibility on Windows, always use MAP_ANON. share=on has
no purpose there, but the syntax is accepted, and must continue to work.
Lastly, quietly fall back to MAP_ANON if the system does not support
qemu_ram_alloc_from_fd.
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Link: https://lore.kernel.org/r/1736967650-129648-5-git-send-email-steven.sistare@oracle.com
Signed-off-by: Fabiano Rosas <farosas@suse.de>
2025-01-15 11:00:30 -08:00
|
|
|
return fd;
|
|
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
|
2014-11-12 14:27:41 +02:00
|
|
|
static
|
2016-03-01 14:18:18 +08:00
|
|
|
RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
|
2025-01-15 11:00:29 -08:00
|
|
|
qemu_ram_resize_cb resized,
|
2021-05-10 13:43:19 +02:00
|
|
|
void *host, uint32_t ram_flags,
|
2016-03-01 14:18:18 +08:00
|
|
|
MemoryRegion *mr, Error **errp)
|
2014-05-14 17:43:18 +08:00
|
|
|
{
|
|
|
|
|
RAMBlock *new_block;
|
2014-09-09 13:27:54 +08:00
|
|
|
Error *local_err = NULL;
|
2025-01-15 11:00:31 -08:00
|
|
|
int align, share_flags;
|
|
|
|
|
|
|
|
|
|
share_flags = ram_flags & (RAM_PRIVATE | RAM_SHARED);
|
|
|
|
|
assert(share_flags != (RAM_SHARED | RAM_PRIVATE));
|
|
|
|
|
ram_flags &= ~RAM_PRIVATE;
|
2014-05-14 17:43:18 +08:00
|
|
|
|
2021-05-10 13:43:21 +02:00
|
|
|
assert((ram_flags & ~(RAM_SHARED | RAM_RESIZEABLE | RAM_PREALLOC |
|
2024-03-20 03:39:02 -05:00
|
|
|
RAM_NORESERVE | RAM_GUEST_MEMFD)) == 0);
|
2021-05-10 13:43:19 +02:00
|
|
|
assert(!host ^ (ram_flags & RAM_PREALLOC));
|
physmem: fd-based shared memory
Create MAP_SHARED RAMBlocks by mmap'ing a file descriptor rather than using
MAP_ANON, so the memory can be accessed in another process by passing and
mmap'ing the fd. This will allow CPR to support memory-backend-ram and
memory-backend-shm objects, provided the user creates them with share=on.
Use memfd_create if available because it has no constraints. If not, use
POSIX shm_open. However, allocation on the opened fd may fail if the shm
mount size is too small, even if the system has free memory, so for backwards
compatibility fall back to qemu_anon_ram_alloc/MAP_ANON on failure.
For backwards compatibility on Windows, always use MAP_ANON. share=on has
no purpose there, but the syntax is accepted, and must continue to work.
Lastly, quietly fall back to MAP_ANON if the system does not support
qemu_ram_alloc_from_fd.
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Link: https://lore.kernel.org/r/1736967650-129648-5-git-send-email-steven.sistare@oracle.com
Signed-off-by: Fabiano Rosas <farosas@suse.de>
2025-01-15 11:00:30 -08:00
|
|
|
assert(max_size >= size);
|
|
|
|
|
|
2025-04-28 15:39:05 +09:00
|
|
|
/* ignore RAM_SHARED for Windows and emscripten*/
|
|
|
|
|
#if defined(CONFIG_POSIX) && !defined(EMSCRIPTEN)
|
physmem: fd-based shared memory
Create MAP_SHARED RAMBlocks by mmap'ing a file descriptor rather than using
MAP_ANON, so the memory can be accessed in another process by passing and
mmap'ing the fd. This will allow CPR to support memory-backend-ram and
memory-backend-shm objects, provided the user creates them with share=on.
Use memfd_create if available because it has no constraints. If not, use
POSIX shm_open. However, allocation on the opened fd may fail if the shm
mount size is too small, even if the system has free memory, so for backwards
compatibility fall back to qemu_anon_ram_alloc/MAP_ANON on failure.
For backwards compatibility on Windows, always use MAP_ANON. share=on has
no purpose there, but the syntax is accepted, and must continue to work.
Lastly, quietly fall back to MAP_ANON if the system does not support
qemu_ram_alloc_from_fd.
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Link: https://lore.kernel.org/r/1736967650-129648-5-git-send-email-steven.sistare@oracle.com
Signed-off-by: Fabiano Rosas <farosas@suse.de>
2025-01-15 11:00:30 -08:00
|
|
|
if (!host) {
|
2025-01-15 11:00:32 -08:00
|
|
|
if (!share_flags && current_machine->aux_ram_share) {
|
|
|
|
|
ram_flags |= RAM_SHARED;
|
|
|
|
|
}
|
physmem: fd-based shared memory
Create MAP_SHARED RAMBlocks by mmap'ing a file descriptor rather than using
MAP_ANON, so the memory can be accessed in another process by passing and
mmap'ing the fd. This will allow CPR to support memory-backend-ram and
memory-backend-shm objects, provided the user creates them with share=on.
Use memfd_create if available because it has no constraints. If not, use
POSIX shm_open. However, allocation on the opened fd may fail if the shm
mount size is too small, even if the system has free memory, so for backwards
compatibility fall back to qemu_anon_ram_alloc/MAP_ANON on failure.
For backwards compatibility on Windows, always use MAP_ANON. share=on has
no purpose there, but the syntax is accepted, and must continue to work.
Lastly, quietly fall back to MAP_ANON if the system does not support
qemu_ram_alloc_from_fd.
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Link: https://lore.kernel.org/r/1736967650-129648-5-git-send-email-steven.sistare@oracle.com
Signed-off-by: Fabiano Rosas <farosas@suse.de>
2025-01-15 11:00:30 -08:00
|
|
|
if (ram_flags & RAM_SHARED) {
|
2025-01-15 11:00:34 -08:00
|
|
|
bool reused;
|
|
|
|
|
g_autofree char *name = cpr_name(mr);
|
|
|
|
|
int fd = qemu_ram_get_shared_fd(name, &reused, errp);
|
physmem: fd-based shared memory
Create MAP_SHARED RAMBlocks by mmap'ing a file descriptor rather than using
MAP_ANON, so the memory can be accessed in another process by passing and
mmap'ing the fd. This will allow CPR to support memory-backend-ram and
memory-backend-shm objects, provided the user creates them with share=on.
Use memfd_create if available because it has no constraints. If not, use
POSIX shm_open. However, allocation on the opened fd may fail if the shm
mount size is too small, even if the system has free memory, so for backwards
compatibility fall back to qemu_anon_ram_alloc/MAP_ANON on failure.
For backwards compatibility on Windows, always use MAP_ANON. share=on has
no purpose there, but the syntax is accepted, and must continue to work.
Lastly, quietly fall back to MAP_ANON if the system does not support
qemu_ram_alloc_from_fd.
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Link: https://lore.kernel.org/r/1736967650-129648-5-git-send-email-steven.sistare@oracle.com
Signed-off-by: Fabiano Rosas <farosas@suse.de>
2025-01-15 11:00:30 -08:00
|
|
|
|
|
|
|
|
if (fd < 0) {
|
|
|
|
|
return NULL;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Use same alignment as qemu_anon_ram_alloc */
|
|
|
|
|
mr->align = QEMU_VMALLOC_ALIGN;
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* This can fail if the shm mount size is too small, or alloc from
|
|
|
|
|
* fd is not supported, but previous QEMU versions that called
|
|
|
|
|
* qemu_anon_ram_alloc for anonymous shared memory could have
|
|
|
|
|
* succeeded. Quietly fail and fall back.
|
2025-01-15 11:00:34 -08:00
|
|
|
*
|
|
|
|
|
* After cpr-transfer, new QEMU could create a memory region
|
|
|
|
|
* with a larger max size than old, so pass reused to grow the
|
|
|
|
|
* region if necessary. The extra space will be usable after a
|
|
|
|
|
* guest reset.
|
physmem: fd-based shared memory
Create MAP_SHARED RAMBlocks by mmap'ing a file descriptor rather than using
MAP_ANON, so the memory can be accessed in another process by passing and
mmap'ing the fd. This will allow CPR to support memory-backend-ram and
memory-backend-shm objects, provided the user creates them with share=on.
Use memfd_create if available because it has no constraints. If not, use
POSIX shm_open. However, allocation on the opened fd may fail if the shm
mount size is too small, even if the system has free memory, so for backwards
compatibility fall back to qemu_anon_ram_alloc/MAP_ANON on failure.
For backwards compatibility on Windows, always use MAP_ANON. share=on has
no purpose there, but the syntax is accepted, and must continue to work.
Lastly, quietly fall back to MAP_ANON if the system does not support
qemu_ram_alloc_from_fd.
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Link: https://lore.kernel.org/r/1736967650-129648-5-git-send-email-steven.sistare@oracle.com
Signed-off-by: Fabiano Rosas <farosas@suse.de>
2025-01-15 11:00:30 -08:00
|
|
|
*/
|
|
|
|
|
new_block = qemu_ram_alloc_from_fd(size, max_size, resized, mr,
|
2025-01-15 11:00:34 -08:00
|
|
|
ram_flags, fd, 0, reused, NULL);
|
physmem: fd-based shared memory
Create MAP_SHARED RAMBlocks by mmap'ing a file descriptor rather than using
MAP_ANON, so the memory can be accessed in another process by passing and
mmap'ing the fd. This will allow CPR to support memory-backend-ram and
memory-backend-shm objects, provided the user creates them with share=on.
Use memfd_create if available because it has no constraints. If not, use
POSIX shm_open. However, allocation on the opened fd may fail if the shm
mount size is too small, even if the system has free memory, so for backwards
compatibility fall back to qemu_anon_ram_alloc/MAP_ANON on failure.
For backwards compatibility on Windows, always use MAP_ANON. share=on has
no purpose there, but the syntax is accepted, and must continue to work.
Lastly, quietly fall back to MAP_ANON if the system does not support
qemu_ram_alloc_from_fd.
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Link: https://lore.kernel.org/r/1736967650-129648-5-git-send-email-steven.sistare@oracle.com
Signed-off-by: Fabiano Rosas <farosas@suse.de>
2025-01-15 11:00:30 -08:00
|
|
|
if (new_block) {
|
|
|
|
|
trace_qemu_ram_alloc_shared(name, new_block->used_length,
|
|
|
|
|
new_block->max_length, fd,
|
|
|
|
|
new_block->host);
|
|
|
|
|
return new_block;
|
|
|
|
|
}
|
|
|
|
|
|
2025-01-15 11:00:34 -08:00
|
|
|
cpr_delete_fd(name, 0);
|
physmem: fd-based shared memory
Create MAP_SHARED RAMBlocks by mmap'ing a file descriptor rather than using
MAP_ANON, so the memory can be accessed in another process by passing and
mmap'ing the fd. This will allow CPR to support memory-backend-ram and
memory-backend-shm objects, provided the user creates them with share=on.
Use memfd_create if available because it has no constraints. If not, use
POSIX shm_open. However, allocation on the opened fd may fail if the shm
mount size is too small, even if the system has free memory, so for backwards
compatibility fall back to qemu_anon_ram_alloc/MAP_ANON on failure.
For backwards compatibility on Windows, always use MAP_ANON. share=on has
no purpose there, but the syntax is accepted, and must continue to work.
Lastly, quietly fall back to MAP_ANON if the system does not support
qemu_ram_alloc_from_fd.
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Peter Xu <peterx@redhat.com>
Link: https://lore.kernel.org/r/1736967650-129648-5-git-send-email-steven.sistare@oracle.com
Signed-off-by: Fabiano Rosas <farosas@suse.de>
2025-01-15 11:00:30 -08:00
|
|
|
close(fd);
|
|
|
|
|
/* fall back to anon allocation */
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
#endif
|
2021-05-10 13:43:19 +02:00
|
|
|
|
2024-01-02 12:57:49 +11:00
|
|
|
align = qemu_real_host_page_size();
|
|
|
|
|
align = MAX(align, TARGET_PAGE_SIZE);
|
|
|
|
|
size = ROUND_UP(size, align);
|
|
|
|
|
max_size = ROUND_UP(max_size, align);
|
|
|
|
|
|
2014-05-14 17:43:18 +08:00
|
|
|
new_block = g_malloc0(sizeof(*new_block));
|
|
|
|
|
new_block->mr = mr;
|
2014-11-12 14:27:41 +02:00
|
|
|
new_block->resized = resized;
|
2014-12-15 22:55:32 +02:00
|
|
|
new_block->used_length = size;
|
|
|
|
|
new_block->max_length = max_size;
|
2014-05-14 17:43:18 +08:00
|
|
|
new_block->fd = -1;
|
2024-03-20 03:39:02 -05:00
|
|
|
new_block->guest_memfd = -1;
|
2022-03-23 19:57:22 +04:00
|
|
|
new_block->page_size = qemu_real_host_page_size();
|
2014-05-14 17:43:18 +08:00
|
|
|
new_block->host = host;
|
2021-05-10 13:43:19 +02:00
|
|
|
new_block->flags = ram_flags;
|
2021-04-06 10:01:24 +02:00
|
|
|
ram_block_add(new_block, &local_err);
|
2014-09-09 13:27:54 +08:00
|
|
|
if (local_err) {
|
|
|
|
|
g_free(new_block);
|
|
|
|
|
error_propagate(errp, local_err);
|
2016-03-01 14:18:18 +08:00
|
|
|
return NULL;
|
2014-09-09 13:27:54 +08:00
|
|
|
}
|
2016-03-01 14:18:18 +08:00
|
|
|
return new_block;
|
2014-05-14 17:43:18 +08:00
|
|
|
}
|
|
|
|
|
|
2016-03-01 14:18:18 +08:00
|
|
|
RAMBlock *qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
|
2014-11-12 14:27:41 +02:00
|
|
|
MemoryRegion *mr, Error **errp)
|
|
|
|
|
{
|
2021-05-10 13:43:19 +02:00
|
|
|
return qemu_ram_alloc_internal(size, size, NULL, host, RAM_PREALLOC, mr,
|
|
|
|
|
errp);
|
2014-11-12 14:27:41 +02:00
|
|
|
}
|
|
|
|
|
|
2021-05-10 13:43:19 +02:00
|
|
|
RAMBlock *qemu_ram_alloc(ram_addr_t size, uint32_t ram_flags,
|
2017-12-13 16:37:37 +02:00
|
|
|
MemoryRegion *mr, Error **errp)
|
2010-08-18 15:41:49 +09:00
|
|
|
{
|
2025-01-15 11:00:31 -08:00
|
|
|
assert((ram_flags & ~(RAM_SHARED | RAM_NORESERVE | RAM_GUEST_MEMFD |
|
|
|
|
|
RAM_PRIVATE)) == 0);
|
2021-05-10 13:43:19 +02:00
|
|
|
return qemu_ram_alloc_internal(size, size, NULL, NULL, ram_flags, mr, errp);
|
2014-11-12 14:27:41 +02:00
|
|
|
}
|
|
|
|
|
|
2016-03-01 14:18:18 +08:00
|
|
|
RAMBlock *qemu_ram_alloc_resizeable(ram_addr_t size, ram_addr_t maxsz,
|
2025-01-15 11:00:29 -08:00
|
|
|
qemu_ram_resize_cb resized,
|
|
|
|
|
MemoryRegion *mr, Error **errp)
|
2014-11-12 14:27:41 +02:00
|
|
|
{
|
2021-05-10 13:43:19 +02:00
|
|
|
return qemu_ram_alloc_internal(size, maxsz, resized, NULL,
|
|
|
|
|
RAM_RESIZEABLE, mr, errp);
|
2010-08-18 15:41:49 +09:00
|
|
|
}
|
|
|
|
|
|
2013-09-09 17:58:40 +02:00
|
|
|
/*
 * Release the host resources of @block and free the RAMBlock itself.
 *
 * qemu_ram_free() schedules this through call_rcu() once the block has been
 * unlinked from the RAM list.
 */
static void reclaim_ramblock(RAMBlock *block)
{
    const bool preallocated = block->flags & RAM_PREALLOC;

    /*
     * RAM_PREALLOC blocks wrap host memory that was handed in at allocation
     * time (see qemu_ram_alloc_from_ptr()), so there is nothing to unmap.
     */
    if (!preallocated) {
        if (xen_enabled()) {
            xen_invalidate_map_cache_entry(block->host);
#if !defined(_WIN32) && !defined(EMSCRIPTEN)
        } else if (block->fd >= 0) {
            /* fd-backed mapping: unmap it and drop the descriptor */
            qemu_ram_munmap(block->fd, block->host, block->max_length);
            close(block->fd);
#endif
        } else {
            qemu_anon_ram_free(block->host, block->max_length);
        }
    }

    if (block->guest_memfd >= 0) {
        ram_block_attributes_destroy(block->attributes);
        close(block->guest_memfd);
        ram_block_coordinated_discard_require(false);
    }

    g_free(block);
}
|
|
|
|
|
|
2016-03-01 14:18:22 +08:00
|
|
|
/*
 * Unlink @block from the RAM list and schedule it for reclamation.
 *
 * A NULL @block is a no-op.  The actual teardown (reclaim_ramblock()) is
 * deferred via call_rcu().
 */
void qemu_ram_free(RAMBlock *block)
{
    g_autofree char *cpr_id = NULL;

    if (block == NULL) {
        return;
    }

    /* Tell RAM block notifiers before the block disappears. */
    if (block->host != NULL) {
        ram_block_notify_remove(block->host, block->used_length,
                                block->max_length);
    }

    qemu_mutex_lock_ramlist();

    /* Drop the fd that CPR recorded under this region's name. */
    cpr_id = cpr_name(block->mr);
    cpr_delete_fd(cpr_id, 0);

    QLIST_REMOVE_RCU(block, next);
    ram_list.mru_block = NULL;
    /* Write list before version */
    smp_wmb();
    ram_list.version++;

    call_rcu(block, reclaim_ramblock, rcu);

    qemu_mutex_unlock_ramlist();
}
|
|
|
|
|
|
2011-03-02 08:56:19 +01:00
|
|
|
#ifndef _WIN32
|
2025-02-11 21:27:06 +00:00
|
|
|
/* Simply remap the given VM memory location from start to start+length */
|
|
|
|
|
static int qemu_ram_remap_mmap(RAMBlock *block, uint64_t start, size_t length)
|
|
|
|
|
{
|
|
|
|
|
int flags, prot;
|
|
|
|
|
void *area;
|
|
|
|
|
void *host_startaddr = block->host + start;
|
|
|
|
|
|
|
|
|
|
assert(block->fd < 0);
|
|
|
|
|
flags = MAP_FIXED | MAP_ANONYMOUS;
|
|
|
|
|
flags |= block->flags & RAM_SHARED ? MAP_SHARED : MAP_PRIVATE;
|
|
|
|
|
flags |= block->flags & RAM_NORESERVE ? MAP_NORESERVE : 0;
|
|
|
|
|
prot = PROT_READ;
|
|
|
|
|
prot |= block->flags & RAM_READONLY ? 0 : PROT_WRITE;
|
|
|
|
|
area = mmap(host_startaddr, length, prot, flags, -1, 0);
|
|
|
|
|
return area != host_startaddr ? -errno : 0;
|
|
|
|
|
}
|
|
|
|
|
|
2025-02-11 21:27:05 +00:00
|
|
|
/*
|
|
|
|
|
* qemu_ram_remap - remap a single RAM page
|
|
|
|
|
*
|
|
|
|
|
* @addr: address in ram_addr_t address space.
|
|
|
|
|
*
|
|
|
|
|
* This function will try remapping a single page of guest RAM identified by
|
|
|
|
|
* @addr, essentially discarding memory to recover from previously poisoned
|
|
|
|
|
* memory (MCE). The page size depends on the RAMBlock (i.e., hugetlb). @addr
|
|
|
|
|
* does not have to point at the start of the page.
|
|
|
|
|
*
|
|
|
|
|
* This function is only to be used during system resets; it will kill the
|
|
|
|
|
* VM if remapping failed.
|
|
|
|
|
*/
|
|
|
|
|
void qemu_ram_remap(ram_addr_t addr)
|
2011-03-02 08:56:19 +01:00
|
|
|
{
|
|
|
|
|
RAMBlock *block;
|
2025-02-11 21:27:05 +00:00
|
|
|
uint64_t offset;
|
2025-02-11 21:27:06 +00:00
|
|
|
void *vaddr;
|
2025-02-11 21:27:05 +00:00
|
|
|
size_t page_size;
|
2011-03-02 08:56:19 +01:00
|
|
|
|
2017-05-12 12:17:39 +08:00
|
|
|
RAMBLOCK_FOREACH(block) {
|
2011-03-02 08:56:19 +01:00
|
|
|
offset = addr - block->offset;
|
2014-12-15 22:55:32 +02:00
|
|
|
if (offset < block->max_length) {
|
2025-02-11 21:27:05 +00:00
|
|
|
/* Respect the pagesize of our RAMBlock */
|
|
|
|
|
page_size = qemu_ram_pagesize(block);
|
|
|
|
|
offset = QEMU_ALIGN_DOWN(offset, page_size);
|
|
|
|
|
|
2014-11-12 11:44:41 +02:00
|
|
|
vaddr = ramblock_ptr(block, offset);
|
2014-05-14 17:43:22 +08:00
|
|
|
if (block->flags & RAM_PREALLOC) {
|
2011-03-02 08:56:19 +01:00
|
|
|
;
|
2013-07-31 15:11:05 +02:00
|
|
|
} else if (xen_enabled()) {
|
|
|
|
|
abort();
|
2011-03-02 08:56:19 +01:00
|
|
|
} else {
|
2025-02-11 21:27:06 +00:00
|
|
|
if (ram_block_discard_range(block, offset, page_size) != 0) {
|
|
|
|
|
/*
|
|
|
|
|
* Fall back to using mmap() only for anonymous mapping,
|
|
|
|
|
* as if a backing file is associated we may not be able
|
|
|
|
|
* to recover the memory in all cases.
|
|
|
|
|
* So don't take the risk of using only mmap and fail now.
|
|
|
|
|
*/
|
|
|
|
|
if (block->fd >= 0) {
|
|
|
|
|
error_report("Could not remap RAM %s:%" PRIx64 "+%"
|
|
|
|
|
PRIx64 " +%zx", block->idstr, offset,
|
|
|
|
|
block->fd_offset, page_size);
|
|
|
|
|
exit(1);
|
|
|
|
|
}
|
|
|
|
|
if (qemu_ram_remap_mmap(block, offset, page_size) != 0) {
|
|
|
|
|
error_report("Could not remap RAM %s:%" PRIx64 " +%zx",
|
|
|
|
|
block->idstr, offset, page_size);
|
|
|
|
|
exit(1);
|
|
|
|
|
}
|
2011-03-02 08:56:19 +01:00
|
|
|
}
|
2025-02-11 21:27:05 +00:00
|
|
|
memory_try_enable_merging(vaddr, page_size);
|
|
|
|
|
qemu_ram_setup_dump(vaddr, page_size);
|
2011-03-02 08:56:19 +01:00
|
|
|
}
|
2025-02-11 21:27:05 +00:00
|
|
|
|
|
|
|
|
break;
|
2011-03-02 08:56:19 +01:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
#endif /* !_WIN32 */
|
|
|
|
|
|
2023-10-05 11:17:13 -07:00
|
|
|
/*
|
|
|
|
|
* Return a host pointer to guest's ram.
|
2024-04-30 18:49:35 +02:00
|
|
|
* For Xen, foreign mappings get created if they don't already exist.
|
2013-09-05 14:41:35 -04:00
|
|
|
*
|
2024-04-30 18:49:35 +02:00
|
|
|
* @block: block for the RAM to lookup (optional and may be NULL).
|
|
|
|
|
* @addr: address within the memory region.
|
|
|
|
|
* @size: pointer to requested size (optional and may be NULL).
|
|
|
|
|
* size may get modified and return a value smaller than
|
|
|
|
|
* what was requested.
|
|
|
|
|
* @lock: wether to lock the mapping in xen-mapcache until invalidated.
|
|
|
|
|
* @is_write: hint wether to map RW or RO in the xen-mapcache.
|
|
|
|
|
* (optional and may always be set to true).
|
2013-09-05 14:41:35 -04:00
|
|
|
*
|
2015-12-16 10:31:26 +01:00
|
|
|
* Called within RCU critical section.
|
2013-09-05 14:41:35 -04:00
|
|
|
*/
|
2024-02-15 11:15:06 +02:00
|
|
|
static void *qemu_ram_ptr_length(RAMBlock *block, ram_addr_t addr,
|
2024-04-30 18:49:35 +02:00
|
|
|
hwaddr *size, bool lock,
|
|
|
|
|
bool is_write)
|
2011-05-19 18:35:45 +01:00
|
|
|
{
|
2023-10-05 11:17:13 -07:00
|
|
|
hwaddr len = 0;
|
|
|
|
|
|
|
|
|
|
if (size && *size == 0) {
|
2011-06-27 18:26:06 +01:00
|
|
|
return NULL;
|
|
|
|
|
}
|
2015-12-16 10:31:26 +01:00
|
|
|
|
2016-02-20 10:35:20 +08:00
|
|
|
if (block == NULL) {
|
|
|
|
|
block = qemu_get_ram_block(addr);
|
2016-02-22 11:02:12 +01:00
|
|
|
addr -= block->offset;
|
2016-02-20 10:35:20 +08:00
|
|
|
}
|
2023-10-05 11:17:13 -07:00
|
|
|
if (size) {
|
|
|
|
|
*size = MIN(*size, block->max_length - addr);
|
|
|
|
|
len = *size;
|
|
|
|
|
}
|
2015-12-16 10:31:26 +01:00
|
|
|
|
|
|
|
|
if (xen_enabled() && block->host == NULL) {
|
|
|
|
|
/* We need to check if the requested address is in the RAM
|
|
|
|
|
* because we don't want to map the entire memory in QEMU.
|
|
|
|
|
* In that case just map the requested area.
|
|
|
|
|
*/
|
2024-05-03 03:44:46 +02:00
|
|
|
if (xen_mr_is_memory(block->mr)) {
|
2024-05-03 03:44:45 +02:00
|
|
|
return xen_map_cache(block->mr, block->offset + addr,
|
2024-04-30 10:14:01 +02:00
|
|
|
len, block->offset,
|
|
|
|
|
lock, lock, is_write);
|
2011-05-19 18:35:45 +01:00
|
|
|
}
|
|
|
|
|
|
2024-04-30 18:49:35 +02:00
|
|
|
block->host = xen_map_cache(block->mr, block->offset,
|
2024-04-30 10:14:01 +02:00
|
|
|
block->max_length,
|
|
|
|
|
block->offset,
|
|
|
|
|
1, lock, is_write);
|
2011-05-19 18:35:45 +01:00
|
|
|
}
|
2015-12-16 10:31:26 +01:00
|
|
|
|
2016-02-22 11:02:12 +01:00
|
|
|
return ramblock_ptr(block, addr);
|
2011-05-19 18:35:45 +01:00
|
|
|
}
|
|
|
|
|
|
2023-10-05 11:17:13 -07:00
|
|
|
/*
|
|
|
|
|
* Return a host pointer to ram allocated with qemu_ram_alloc.
|
|
|
|
|
* This should not be used for general purpose DMA. Use address_space_map
|
|
|
|
|
* or address_space_rw instead. For local memory (e.g. video ram) that the
|
|
|
|
|
* device owns, use memory_region_get_ram_ptr.
|
|
|
|
|
*
|
|
|
|
|
* Called within RCU critical section.
|
|
|
|
|
*/
|
|
|
|
|
void *qemu_map_ram_ptr(RAMBlock *ram_block, ram_addr_t addr)
|
|
|
|
|
{
|
2024-04-30 18:49:35 +02:00
|
|
|
return qemu_ram_ptr_length(ram_block, addr, NULL, false, true);
|
2023-10-05 11:17:13 -07:00
|
|
|
}
|
|
|
|
|
|
2018-03-12 17:20:57 +00:00
|
|
|
/* Return the offset of a hostpointer within a ramblock */
|
|
|
|
|
ram_addr_t qemu_ram_block_host_offset(RAMBlock *rb, void *host)
|
|
|
|
|
{
|
|
|
|
|
ram_addr_t res = (uint8_t *)host - (uint8_t *)rb->host;
|
|
|
|
|
assert((uintptr_t)host >= (uintptr_t)rb->host);
|
|
|
|
|
assert(res < rb->max_length);
|
|
|
|
|
|
|
|
|
|
return res;
|
|
|
|
|
}
|
|
|
|
|
|
2015-11-05 18:10:32 +00:00
|
|
|
RAMBlock *qemu_ram_block_from_host(void *ptr, bool round_offset,
|
|
|
|
|
ram_addr_t *offset)
|
2009-04-11 14:47:08 +00:00
|
|
|
{
|
2009-04-11 17:15:54 +00:00
|
|
|
RAMBlock *block;
|
|
|
|
|
uint8_t *host = ptr;
|
|
|
|
|
|
2011-06-21 22:59:09 +02:00
|
|
|
if (xen_enabled()) {
|
2016-05-26 10:07:50 +02:00
|
|
|
ram_addr_t ram_addr;
|
2019-10-07 15:36:41 +01:00
|
|
|
RCU_READ_LOCK_GUARD();
|
2016-05-26 10:07:50 +02:00
|
|
|
ram_addr = xen_ram_addr_from_mapcache(ptr);
|
2024-07-02 00:44:20 +02:00
|
|
|
if (ram_addr == RAM_ADDR_INVALID) {
|
|
|
|
|
return NULL;
|
|
|
|
|
}
|
|
|
|
|
|
2016-05-26 10:07:50 +02:00
|
|
|
block = qemu_get_ram_block(ram_addr);
|
2015-11-05 18:10:32 +00:00
|
|
|
if (block) {
|
2016-06-09 16:56:17 +01:00
|
|
|
*offset = ram_addr - block->offset;
|
2015-11-05 18:10:32 +00:00
|
|
|
}
|
|
|
|
|
return block;
|
2011-05-19 18:35:46 +01:00
|
|
|
}
|
|
|
|
|
|
2019-10-07 15:36:41 +01:00
|
|
|
RCU_READ_LOCK_GUARD();
|
2020-09-23 11:56:46 +01:00
|
|
|
block = qatomic_rcu_read(&ram_list.mru_block);
|
2014-12-15 22:55:32 +02:00
|
|
|
if (block && block->host && host - block->host < block->max_length) {
|
2013-05-06 14:28:39 +02:00
|
|
|
goto found;
|
|
|
|
|
}
|
|
|
|
|
|
2017-05-12 12:17:39 +08:00
|
|
|
RAMBLOCK_FOREACH(block) {
|
2010-08-31 16:41:25 +01:00
|
|
|
/* This case append when the block is not mapped. */
|
|
|
|
|
if (block->host == NULL) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
2014-12-15 22:55:32 +02:00
|
|
|
if (host - block->host < block->max_length) {
|
2013-05-06 14:28:39 +02:00
|
|
|
goto found;
|
2010-06-11 11:11:42 -06:00
|
|
|
}
|
2009-04-11 17:15:54 +00:00
|
|
|
}
|
2010-08-31 16:41:25 +01:00
|
|
|
|
2013-05-06 14:36:15 +02:00
|
|
|
return NULL;
|
2013-05-06 14:28:39 +02:00
|
|
|
|
|
|
|
|
found:
|
2015-11-05 18:10:32 +00:00
|
|
|
*offset = (host - block->host);
|
|
|
|
|
if (round_offset) {
|
|
|
|
|
*offset &= TARGET_PAGE_MASK;
|
|
|
|
|
}
|
|
|
|
|
return block;
|
|
|
|
|
}
|
|
|
|
|
|
2015-11-05 18:10:33 +00:00
|
|
|
/*
|
|
|
|
|
* Finds the named RAMBlock
|
|
|
|
|
*
|
|
|
|
|
* name: The name of RAMBlock to find
|
|
|
|
|
*
|
|
|
|
|
* Returns: RAMBlock (or NULL if not found)
|
|
|
|
|
*/
|
|
|
|
|
RAMBlock *qemu_ram_block_by_name(const char *name)
|
|
|
|
|
{
|
|
|
|
|
RAMBlock *block;
|
|
|
|
|
|
2017-05-12 12:17:39 +08:00
|
|
|
RAMBLOCK_FOREACH(block) {
|
2015-11-05 18:10:33 +00:00
|
|
|
if (!strcmp(name, block->idstr)) {
|
|
|
|
|
return block;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
|
}
|
|
|
|
|
|
2023-10-04 11:06:28 +02:00
|
|
|
/*
|
|
|
|
|
* Some of the system routines need to translate from a host pointer
|
|
|
|
|
* (typically a TLB entry) back to a ram offset.
|
|
|
|
|
*/
|
2016-03-25 12:55:08 +01:00
|
|
|
ram_addr_t qemu_ram_addr_from_host(void *ptr)
|
2015-11-05 18:10:32 +00:00
|
|
|
{
|
|
|
|
|
RAMBlock *block;
|
2016-05-26 10:07:50 +02:00
|
|
|
ram_addr_t offset;
|
2015-11-05 18:10:32 +00:00
|
|
|
|
2016-05-26 10:07:50 +02:00
|
|
|
block = qemu_ram_block_from_host(ptr, false, &offset);
|
2015-11-05 18:10:32 +00:00
|
|
|
if (!block) {
|
2016-03-25 12:55:08 +01:00
|
|
|
return RAM_ADDR_INVALID;
|
2015-11-05 18:10:32 +00:00
|
|
|
}
|
|
|
|
|
|
2016-03-25 12:55:08 +01:00
|
|
|
return block->offset + offset;
|
2010-10-11 15:31:19 -03:00
|
|
|
}
|
2010-06-11 11:11:42 -06:00
|
|
|
|
2022-08-10 12:04:15 -07:00
|
|
|
ram_addr_t qemu_ram_addr_from_host_nofail(void *ptr)
|
|
|
|
|
{
|
|
|
|
|
ram_addr_t ram_addr;
|
|
|
|
|
|
|
|
|
|
ram_addr = qemu_ram_addr_from_host(ptr);
|
|
|
|
|
if (ram_addr == RAM_ADDR_INVALID) {
|
|
|
|
|
error_report("Bad ram pointer %p", ptr);
|
|
|
|
|
abort();
|
|
|
|
|
}
|
|
|
|
|
return ram_addr;
|
|
|
|
|
}
|
|
|
|
|
|
2018-03-05 00:19:49 +01:00
|
|
|
static MemTxResult flatview_read(FlatView *fv, hwaddr addr,
|
2020-02-19 19:52:44 +01:00
|
|
|
MemTxAttrs attrs, void *buf, hwaddr len);
|
2017-09-21 18:50:58 +10:00
|
|
|
static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
|
2020-02-19 19:52:44 +01:00
|
|
|
const void *buf, hwaddr len);
|
2019-01-17 20:49:01 +08:00
|
|
|
static bool flatview_access_valid(FlatView *fv, hwaddr addr, hwaddr len,
|
2018-05-31 14:50:52 +01:00
|
|
|
bool is_write, MemTxAttrs attrs);
|
2017-09-21 18:50:58 +10:00
|
|
|
|
2015-04-26 16:49:24 +01:00
|
|
|
/*
 * Read handler for a subpage region.
 *
 * Forwards the access to the subpage's FlatView at @addr + subpage base,
 * reading @len bytes into a bounce buffer, and on success stores the value
 * decoded by ldn_p() into *@data.
 *
 * Returns the flatview_read() error unchanged on failure, else MEMTX_OK.
 */
static MemTxResult subpage_read(void *opaque, hwaddr addr, uint64_t *data,
                                unsigned len, MemTxAttrs attrs)
{
    subpage_t *subpage = opaque;
    uint8_t bounce[8];
    MemTxResult res;

    trace_subpage_read(subpage, len, addr);

    res = flatview_read(subpage->fv, addr + subpage->base, attrs,
                        bounce, len);
    if (!res) {
        *data = ldn_p(bounce, len);
        return MEMTX_OK;
    }
    return res;
}
|
|
|
|
|
|
2015-04-26 16:49:24 +01:00
|
|
|
/*
 * Write handler for a subpage region.
 *
 * Encodes @value into @len bytes with stn_p() and forwards them to the
 * subpage's FlatView at @addr + subpage base.
 *
 * Returns the result of flatview_write().
 */
static MemTxResult subpage_write(void *opaque, hwaddr addr,
                                 uint64_t value, unsigned len, MemTxAttrs attrs)
{
    subpage_t *subpage = opaque;
    uint8_t bounce[8];

    trace_subpage_write(subpage, len, addr, value);

    stn_p(bounce, len, value);
    return flatview_write(subpage->fv, addr + subpage->base, attrs,
                          bounce, len);
}
|
|
|
|
|
|
2013-05-24 14:02:39 +02:00
|
|
|
static bool subpage_accepts(void *opaque, hwaddr addr,
|
2018-05-31 14:50:52 +01:00
|
|
|
unsigned len, bool is_write,
|
|
|
|
|
MemTxAttrs attrs)
|
2013-05-24 14:02:39 +02:00
|
|
|
{
|
2013-05-26 21:55:37 +02:00
|
|
|
subpage_t *subpage = opaque;
|
2025-12-19 14:14:36 +01:00
|
|
|
|
|
|
|
|
trace_subpage_accepts(subpage, is_write ? 'w' : 'r', len, addr);
|
2013-05-24 14:02:39 +02:00
|
|
|
|
2017-09-21 18:50:58 +10:00
|
|
|
return flatview_access_valid(subpage->fv, addr + subpage->base,
|
2018-05-31 14:50:52 +01:00
|
|
|
len, is_write, attrs);
|
2013-05-24 14:02:39 +02:00
|
|
|
}
|
|
|
|
|
|
2012-01-02 12:32:48 +02:00
|
|
|
static const MemoryRegionOps subpage_ops = {
|
2015-04-26 16:49:24 +01:00
|
|
|
.read_with_attrs = subpage_read,
|
|
|
|
|
.write_with_attrs = subpage_write,
|
2014-12-22 13:11:39 +01:00
|
|
|
.impl.min_access_size = 1,
|
|
|
|
|
.impl.max_access_size = 8,
|
|
|
|
|
.valid.min_access_size = 1,
|
|
|
|
|
.valid.max_access_size = 8,
|
2013-05-24 14:02:39 +02:00
|
|
|
.valid.accepts = subpage_accepts,
|
2012-01-02 12:32:48 +02:00
|
|
|
.endianness = DEVICE_NATIVE_ENDIAN,
|
2007-05-26 17:36:03 +00:00
|
|
|
};
|
|
|
|
|
|
2019-03-21 16:25:53 +08:00
|
|
|
/*
 * Point the subpage slots covering [start, end] (inclusive, offsets within
 * the page) at physical section index @section.
 *
 * Returns 0 on success, -1 if either bound lies outside the target page.
 */
static int subpage_register(subpage_t *mmio, uint32_t start, uint32_t end,
                            uint16_t section)
{
    int first, last, i;

    if (start >= TARGET_PAGE_SIZE || end >= TARGET_PAGE_SIZE) {
        return -1;
    }

    first = SUBPAGE_IDX(start);
    last = SUBPAGE_IDX(end);
    trace_subpage_register(mmio, start, end, first, last, section);

    for (i = first; i <= last; i++) {
        mmio->sub_section[i] = section;
    }

    return 0;
}
|
|
|
|
|
|
2017-09-21 18:50:58 +10:00
|
|
|
/*
 * Allocate and initialise a subpage container for the page at @base in
 * FlatView @fv.
 *
 * The embedded I/O region is TARGET_PAGE_SIZE bytes, uses subpage_ops with
 * the new subpage as opaque, and is marked as a subpage region.
 */
static subpage_t *subpage_init(FlatView *fv, hwaddr base)
{
    subpage_t *mmio;
    size_t table_bytes = TARGET_PAGE_SIZE * sizeof(uint16_t);

    /* mmio->sub_section is set to PHYS_SECTION_UNASSIGNED with g_malloc0 */
    mmio = g_malloc0(sizeof(*mmio) + table_bytes);
    mmio->fv = fv;
    mmio->base = base;

    memory_region_init_io(&mmio->iomem, NULL, &subpage_ops, mmio,
                          NULL, TARGET_PAGE_SIZE);
    mmio->iomem.subpage = true;

    trace_subpage_init(mmio, base, TARGET_PAGE_SIZE);

    return mmio;
}
|
|
|
|
|
|
2017-09-21 18:50:58 +10:00
|
|
|
/*
 * Add a placeholder section for @mr to @map and return its index.
 *
 * The section starts at offset 0 in both the address space and the region,
 * and its size is int128_2_64().  @fv must not be NULL (asserted).
 *
 * Returns the section index handed back by phys_section_add().
 */
static uint16_t dummy_section(PhysPageMap *map, FlatView *fv, MemoryRegion *mr)
{
    assert(fv);
    MemoryRegionSection section = {
        .fv = fv,
        .mr = mr,
        .offset_within_address_space = 0,
        .offset_within_region = 0,
        .size = int128_2_64(),
    };

    /* Pass the address of the local section; phys_section_add() takes a pointer. */
    return phys_section_add(map, &section);
}
|
|
|
|
|
|
2009-06-14 11:38:52 +03:00
|
|
|
static void io_mem_init(void)
|
|
|
|
|
{
|
2013-06-06 05:41:28 -04:00
|
|
|
memory_region_init_io(&io_mem_unassigned, NULL, &unassigned_mem_ops, NULL,
|
2014-06-13 10:48:06 +02:00
|
|
|
NULL, UINT64_MAX);
|
system/physmem: mark io_mem_unassigned lockless
When the Bus Master bit is disabled in a PCI device's Command Register,
the device's DMA address space becomes unassigned memory (i.e. the
io_mem_unassigned MemoryRegion).
This can lead to deadlocks with IOThreads since io_mem_unassigned
accesses attempt to acquire the Big QEMU Lock (BQL). For example,
virtio-pci devices deadlock in virtio_write_config() ->
virtio_pci_stop_ioeventfd() when waiting for the IOThread while holding
the BQL. The IOThread is unable to acquire the BQL but the vcpu thread
won't release the BQL while waiting for the IOThread.
io_mem_unassigned is trivially thread-safe since it has no state, it
simply rejects all load/store accesses. Therefore it is safe to enable
lockless I/O on io_mem_unassigned to eliminate this deadlock.
Here is the backtrace described above:
Thread 9 (Thread 0x7fccfcdff6c0 (LWP 247832) "CPU 4/KVM"):
#0 0x00007fcd11529d46 in ppoll () from target:/lib64/libc.so.6
#1 0x000056468a1a9bad in ppoll (__fds=<optimized out>, __nfds=<optimized out>, __timeout=0x0, __ss=0x0) at /usr/include/bits/poll2.h:88
#2 0x000056468a18f9d9 in fdmon_poll_wait (ctx=0x5646c6a1dc30, ready_list=0x7fccfcdfb310, timeout=-1) at ../util/fdmon-poll.c:79
#3 0x000056468a18f14f in aio_poll (ctx=<optimized out>, blocking=blocking@entry=true) at ../util/aio-posix.c:730
#4 0x000056468a1ad842 in aio_wait_bh_oneshot (ctx=<optimized out>, cb=cb@entry=0x564689faa420 <virtio_blk_ioeventfd_stop_vq_bh>, opaque=<optimized out>) at ../util/aio-wait.c:85
#5 0x0000564689faaa89 in virtio_blk_stop_ioeventfd (vdev=0x5646c8fd7e90) at ../hw/block/virtio-blk.c:1644
#6 0x0000564689d77880 in virtio_bus_stop_ioeventfd (bus=bus@entry=0x5646c8fd7e08) at ../hw/virtio/virtio-bus.c:264
#7 0x0000564689d780db in virtio_bus_stop_ioeventfd (bus=bus@entry=0x5646c8fd7e08) at ../hw/virtio/virtio-bus.c:256
#8 0x0000564689d7d98a in virtio_pci_stop_ioeventfd (proxy=0x5646c8fcf8e0) at ../hw/virtio/virtio-pci.c:413
#9 virtio_write_config (pci_dev=0x5646c8fcf8e0, address=4, val=<optimized out>, len=<optimized out>) at ../hw/virtio/virtio-pci.c:803
#10 0x0000564689dcb45a in memory_region_write_accessor (mr=mr@entry=0x5646c6dc2d30, addr=3145732, value=value@entry=0x7fccfcdfb528, size=size@entry=2, shift=<optimized out>, mask=mask@entry=65535, attrs=...) at ../system/memory.c:491
#11 0x0000564689dcaeb0 in access_with_adjusted_size (addr=addr@entry=3145732, value=value@entry=0x7fccfcdfb528, size=size@entry=2, access_size_min=<optimized out>, access_size_max=<optimized out>, access_fn=0x564689dcb3f0 <memory_region_write_accessor>, mr=0x5646c6dc2d30, attrs=...) at ../system/memory.c:567
#12 0x0000564689dcb156 in memory_region_dispatch_write (mr=mr@entry=0x5646c6dc2d30, addr=addr@entry=3145732, data=<optimized out>, op=<optimized out>, attrs=attrs@entry=...) at ../system/memory.c:1554
#13 0x0000564689dd389a in flatview_write_continue_step (attrs=..., attrs@entry=..., buf=buf@entry=0x7fcd05b87028 "", mr_addr=3145732, l=l@entry=0x7fccfcdfb5f0, mr=0x5646c6dc2d30, len=2) at ../system/physmem.c:3266
#14 0x0000564689dd3adb in flatview_write_continue (fv=0x7fcadc0d8930, addr=3761242116, attrs=..., ptr=0xe0300004, len=2, mr_addr=<optimized out>, l=<optimized out>, mr=<optimized out>) at ../system/physmem.c:3296
#15 flatview_write (fv=0x7fcadc0d8930, addr=addr@entry=3761242116, attrs=attrs@entry=..., buf=buf@entry=0x7fcd05b87028, len=len@entry=2) at ../system/physmem.c:3327
#16 0x0000564689dd7191 in address_space_write (as=0x56468b433600 <address_space_memory>, addr=3761242116, attrs=..., buf=0x7fcd05b87028, len=2) at ../system/physmem.c:3447
#17 address_space_rw (as=0x56468b433600 <address_space_memory>, addr=3761242116, attrs=attrs@entry=..., buf=buf@entry=0x7fcd05b87028, len=2, is_write=<optimized out>) at ../system/physmem.c:3457
#18 0x0000564689ff1ef6 in kvm_cpu_exec (cpu=cpu@entry=0x5646c6dab810) at ../accel/kvm/kvm-all.c:3248
#19 0x0000564689ff32f5 in kvm_vcpu_thread_fn (arg=arg@entry=0x5646c6dab810) at ../accel/kvm/kvm-accel-ops.c:53
#20 0x000056468a19225c in qemu_thread_start (args=0x5646c6db6190) at ../util/qemu-thread-posix.c:393
#21 0x00007fcd114c5b68 in start_thread () from target:/lib64/libc.so.6
#22 0x00007fcd115364e4 in clone () from target:/lib64/libc.so.6
Thread 3 (Thread 0x7fcd0503a6c0 (LWP 247825) "IO iothread1"):
#0 0x00007fcd114c2d30 in __lll_lock_wait () from target:/lib64/libc.so.6
#1 0x00007fcd114c8fe2 in pthread_mutex_lock@@GLIBC_2.2.5 () from target:/lib64/libc.so.6
#2 0x000056468a192538 in qemu_mutex_lock_impl (mutex=0x56468b432e60 <bql>, file=0x56468a1e26a5 "../system/physmem.c", line=3198) at ../util/qemu-thread-posix.c:94
#3 0x0000564689dc12e2 in bql_lock_impl (file=file@entry=0x56468a1e26a5 "../system/physmem.c", line=line@entry=3198) at ../system/cpus.c:566
#4 0x0000564689ddc151 in prepare_mmio_access (mr=0x56468b433800 <io_mem_unassigned>) at ../system/physmem.c:3198
#5 address_space_lduw_internal_cached_slow (cache=<optimized out>, addr=2, attrs=..., result=0x0, endian=DEVICE_LITTLE_ENDIAN) at ../system/memory_ldst.c.inc:211
#6 address_space_lduw_le_cached_slow (cache=<optimized out>, addr=addr@entry=2, attrs=attrs@entry=..., result=result@entry=0x0) at ../system/memory_ldst.c.inc:253
#7 0x0000564689fd692c in address_space_lduw_le_cached (result=0x0, cache=<optimized out>, addr=2, attrs=...) at /var/tmp/qemu/include/exec/memory_ldst_cached.h.inc:35
#8 lduw_le_phys_cached (cache=<optimized out>, addr=2) at /var/tmp/qemu/include/exec/memory_ldst_phys.h.inc:66
#9 virtio_lduw_phys_cached (vdev=<optimized out>, cache=<optimized out>, pa=2) at /var/tmp/qemu/include/hw/virtio/virtio-access.h:166
#10 vring_avail_idx (vq=0x5646c8fe2470) at ../hw/virtio/virtio.c:396
#11 virtio_queue_split_set_notification (vq=0x5646c8fe2470, enable=0) at ../hw/virtio/virtio.c:534
#12 virtio_queue_set_notification (vq=0x5646c8fe2470, enable=0) at ../hw/virtio/virtio.c:595
#13 0x000056468a18e7a8 in poll_set_started (ctx=ctx@entry=0x5646c6c74e30, ready_list=ready_list@entry=0x7fcd050366a0, started=started@entry=true) at ../util/aio-posix.c:247
#14 0x000056468a18f2bb in poll_set_started (ctx=0x5646c6c74e30, ready_list=0x7fcd050366a0, started=true) at ../util/aio-posix.c:226
#15 try_poll_mode (ctx=0x5646c6c74e30, ready_list=0x7fcd050366a0, timeout=<synthetic pointer>) at ../util/aio-posix.c:612
#16 aio_poll (ctx=0x5646c6c74e30, blocking=blocking@entry=true) at ../util/aio-posix.c:689
#17 0x000056468a032c26 in iothread_run (opaque=opaque@entry=0x5646c69f3380) at ../iothread.c:63
#18 0x000056468a19225c in qemu_thread_start (args=0x5646c6c75410) at ../util/qemu-thread-posix.c:393
#19 0x00007fcd114c5b68 in start_thread () from target:/lib64/libc.so.6
#20 0x00007fcd115364e4 in clone () from target:/lib64/libc.so.6
Buglink: https://issues.redhat.com/browse/RHEL-71933
Reported-by: Peixiu Hou <phou@redhat.com>
Cc: Kevin Wolf <kwolf@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org>
Link: https://lore.kernel.org/r/20251029185224.420261-1-stefanha@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
2025-10-29 14:52:24 -04:00
|
|
|
|
|
|
|
|
/* Trivially thread-safe since memory accesses are rejected */
|
|
|
|
|
memory_region_enable_lockless_io(&io_mem_unassigned);
|
2009-06-14 11:38:52 +03:00
|
|
|
}
|
|
|
|
|
|
2017-09-21 18:51:00 +10:00
|
|
|
/*
 * Allocate an empty dispatch structure for FlatView @fv.
 *
 * The first section registered is the placeholder for io_mem_unassigned,
 * which must land at index PHYS_SECTION_UNASSIGNED (asserted).  The page
 * map starts out with a single NIL entry.
 *
 * Release the result with address_space_dispatch_free().
 */
AddressSpaceDispatch *address_space_dispatch_new(FlatView *fv)
{
    AddressSpaceDispatch *dispatch = g_new0(AddressSpaceDispatch, 1);
    uint16_t unassigned_idx = dummy_section(&dispatch->map, fv,
                                            &io_mem_unassigned);

    assert(unassigned_idx == PHYS_SECTION_UNASSIGNED);

    dispatch->phys_map = (PhysPageEntry) {
        .ptr = PHYS_MAP_NODE_NIL,
        .skip = 1,
    };

    return dispatch;
}
|
|
|
|
|
|
2017-09-21 18:50:56 +10:00
|
|
|
/*
 * Destroy a dispatch structure created by address_space_dispatch_new():
 * free its section map first, then the structure itself.
 */
void address_space_dispatch_free(AddressSpaceDispatch *d)
{
    PhysPageMap *map = &d->map;

    phys_sections_free(map);
    g_free(d);
}
|
|
|
|
|
|
2018-02-06 18:37:39 +01:00
|
|
|
/*
 * Empty work item.  tcg_log_global_after_sync() passes it to run_on_cpu()
 * purely to wait for the target vCPU; there is nothing to do here.
 */
static void do_nothing(CPUState *cpu, run_on_cpu_data d)
{
    /* intentionally empty */
}
|
|
|
|
|
|
|
|
|
|
/*
 * MemoryListener hook that synchronises with the vCPU owning this
 * listener's CPUAddressSpace by running a no-op work item on it.
 */
static void tcg_log_global_after_sync(MemoryListener *listener)
{
    CPUAddressSpace *cpuas;

    /*
     * VGA can make calls to this function while updating the screen.
     * In record/replay mode this causes a deadlock, because
     * run_on_cpu waits for rr mutex. Therefore no races are possible
     * in this case and no need for making run_on_cpu when
     * record/replay is enabled.
     */
    if (replay_mode != REPLAY_MODE_NONE) {
        return;
    }

    /*
     * Wait for the CPU to end the current TB.  This avoids the following
     * incorrect race:
     *
     *      vCPU                         migration
     *      ----------------------       -------------------------
     *      TLB check -> slow path
     *        notdirty_mem_write
     *          write to RAM
     *          mark dirty
     *                                   clear dirty flag
     *      TLB check -> fast path
     *                                   read memory
     *        write to RAM
     *
     * by pushing the migration thread's memory read after the vCPU thread has
     * written the memory.
     */
    cpuas = container_of(listener, CPUAddressSpace, tcg_as_listener);
    run_on_cpu(cpuas->cpu, do_nothing, RUN_ON_CPU_NULL);
}
|
|
|
|
|
|
2023-08-25 16:13:17 -07:00
|
|
|
/*
 * Work item used by tcg_commit(): flush @cpu's TLB.  @data is not used.
 */
static void tcg_commit_cpu(CPUState *cpu, run_on_cpu_data data)
{
    tlb_flush(cpu);
}
|
|
|
|
|
|
2012-10-02 18:54:45 +02:00
|
|
|
/*
 * MemoryListener commit hook for TCG: make sure the vCPU that owns this
 * listener's CPUAddressSpace gets its TLB flushed after a memory map change.
 *
 * No AddressSpaceDispatch pointer is cached per CPU any more (a stale one
 * could be reclaimed by RCU before this hook ran); the hook is still needed
 * because it flushes the TLB and thereby drops previously mapped entries.
 */
static void tcg_commit(MemoryListener *listener)
{
    CPUAddressSpace *cpuas;
    CPUState *cpu;
    run_on_cpu_data arg;

    assert(tcg_enabled());

    /*
     * since each CPU stores ram addresses in its TLB cache, we must
     * reset the modified entries
     */
    cpuas = container_of(listener, CPUAddressSpace, tcg_as_listener);
    cpu = cpuas->cpu;
    arg = RUN_ON_CPU_HOST_PTR(cpuas);

    /*
     * The listener is also called during realize, before all of the tcg
     * machinery for run-on is initialized: thus the halt_cond check.  In
     * that case flush directly.
     */
    if (!cpu->halt_cond) {
        tcg_commit_cpu(cpu, arg);
        return;
    }

    /*
     * Queueing the work function will kick the cpu back to
     * the main loop, which will end the RCU critical section and reclaim
     * the memory data structures.
     */
    async_run_on_cpu(cpu, tcg_commit_cpu, arg);
}
|
|
|
|
|
|
2011-07-26 14:26:14 +03:00
|
|
|
static void memory_map_init(void)
|
|
|
|
|
{
|
2011-08-20 22:09:37 -05:00
|
|
|
system_memory = g_malloc(sizeof(*system_memory));
|
2013-11-07 17:14:36 +01:00
|
|
|
|
2013-11-07 17:14:37 +01:00
|
|
|
memory_region_init(system_memory, NULL, "system", UINT64_MAX);
|
2013-04-29 16:25:51 +00:00
|
|
|
address_space_init(&address_space_memory, system_memory, "memory");
|
2011-08-08 16:09:03 +03:00
|
|
|
|
2011-08-20 22:09:37 -05:00
|
|
|
system_io = g_malloc(sizeof(*system_io));
|
2013-09-02 18:43:30 +02:00
|
|
|
memory_region_init_io(system_io, NULL, &unassigned_io_ops, NULL, "io",
|
|
|
|
|
65536);
|
2013-04-29 16:25:51 +00:00
|
|
|
address_space_init(&address_space_io, system_io, "I/O");
|
2011-07-26 14:26:14 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
MemoryRegion *get_system_memory(void)
|
|
|
|
|
{
|
|
|
|
|
return system_memory;
|
|
|
|
|
}
|
|
|
|
|
|
2011-08-08 16:09:03 +03:00
|
|
|
MemoryRegion *get_system_io(void)
|
|
|
|
|
{
|
|
|
|
|
return system_io;
|
|
|
|
|
}
|
|
|
|
|
|
2015-03-23 11:45:53 +01:00
|
|
|
/*
 * Record that @length bytes at offset @addr of the RAM region @mr were
 * written: invalidate translated code covering the range when the CODE
 * dirty bit is being tracked, then mark the range dirty for the remaining
 * dirty-log clients.
 *
 * @mr must be RAM-backed; this is asserted.
 */
static void invalidate_and_set_dirty(MemoryRegion *mr, hwaddr addr,
                                     hwaddr length)
{
    const int code_flag = 1 << DIRTY_MEMORY_CODE;
    uint8_t mask = memory_region_get_dirty_log_mask(mr);
    ram_addr_t ram_base = memory_region_get_ram_addr(mr);

    /* We know we're only called for RAM MemoryRegions */
    assert(ram_base != RAM_ADDR_INVALID);

    /* From here on @addr is a ram_addr_t-space address, not an MR offset. */
    addr += ram_base;

    /*
     * Narrow the mask down to the clients that still see clean pages in the
     * range.  There is deliberately no early return when the mask is (or
     * becomes) 0: physical_memory_set_dirty_range still has to run because
     * it will call xen_modified_memory.
     */
    if (mask != 0) {
        mask = physical_memory_range_includes_clean(addr, length, mask);
    }

    if (mask & code_flag) {
        assert(tcg_enabled());
        /* The invalidation range is inclusive, hence the "- 1". */
        tb_invalidate_phys_range(NULL, addr, addr + length - 1);
        mask &= ~code_flag;
    }

    physical_memory_set_dirty_range(addr, length, mask);
}
|
|
|
|
|
|
2019-01-29 11:46:04 +00:00
|
|
|
/*
 * Invalidate and mark dirty @size bytes at offset @addr of the ROM device
 * region @mr.
 *
 * Nothing in the implementation is specific to ROM devices, but that is the
 * only use case that needs this operation, so it is enforced with an
 * assertion.  Other kinds of region should go through the
 * address_space_read/write() APIs instead.
 */
void memory_region_flush_rom_device(MemoryRegion *mr, hwaddr addr, hwaddr size)
{
    assert(memory_region_is_romd(mr));
    invalidate_and_set_dirty(mr, addr, size);
}
|
|
|
|
|
|
2022-06-13 16:26:32 -04:00
|
|
|
/*
 * Compute the size of the next single access to @mr.
 *
 * @l is the number of bytes the caller would like to access and @addr the
 * address used for the alignment bound.  The result is @l clamped to the
 * region's maximum access size (and, unless the implementation accepts
 * unaligned accesses, to the natural alignment of @addr), rounded down to a
 * power of two.
 */
int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr)
{
    unsigned limit = mr->ops->valid.max_access_size;

    /* A region that does not say otherwise supports 1-4 byte accesses. */
    if (!limit) {
        limit = 4;
    }

    if (!mr->ops->impl.unaligned) {
        /* Lowest set bit of the address == its natural alignment. */
        unsigned alignment = addr & -addr;

        /* alignment == 0 means "no bound" (e.g. address 0). */
        if (alignment && alignment < limit) {
            limit = alignment;
        }
    }

    /* Never attempt more than the limit, and only power-of-two sizes. */
    l = l > limit ? limit : l;
    l = pow2floor(l);

    return l;
}
|
|
|
|
|
|
2022-06-13 16:26:32 -04:00
|
|
|
/*
 * Get ready to dispatch an MMIO access to @mr.
 *
 * Takes the BQL when it is not already held and the region does not declare
 * lockless I/O, then flushes the coalesced MMIO buffer if the region asks
 * for it.
 *
 * Returns: true if the BQL was taken here, in which case the caller has to
 * release it with bql_unlock() once the access is done.
 */
bool prepare_mmio_access(MemoryRegion *mr)
{
    bool took_bql = !bql_locked() && !mr->lockless_io;

    if (took_bql) {
        bql_lock();
    }

    if (mr->flush_coalesced_mmio) {
        qemu_flush_coalesced_mmio_buffer();
    }

    return took_bql;
}
|
|
|
|
|
|
2021-12-15 19:24:21 +01:00
|
|
|
/**
|
|
|
|
|
* flatview_access_allowed
|
|
|
|
|
* @mr: #MemoryRegion to be accessed
|
|
|
|
|
* @attrs: memory transaction attributes
|
|
|
|
|
* @addr: address within that memory region
|
|
|
|
|
* @len: the number of bytes to access
|
|
|
|
|
*
|
|
|
|
|
* Check if a memory transaction is allowed.
|
|
|
|
|
*
|
|
|
|
|
* Returns: true if transaction is allowed, false if denied.
|
|
|
|
|
*/
|
|
|
|
|
static bool flatview_access_allowed(MemoryRegion *mr, MemTxAttrs attrs,
|
|
|
|
|
hwaddr addr, hwaddr len)
|
|
|
|
|
{
|
|
|
|
|
if (likely(!attrs.memory)) {
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
if (memory_region_is_ram(mr)) {
|
|
|
|
|
return true;
|
|
|
|
|
}
|
2024-11-02 13:17:35 +01:00
|
|
|
qemu_log_mask(LOG_INVALID_MEM,
|
2021-12-15 19:24:21 +01:00
|
|
|
"Invalid access to non-RAM device at "
|
|
|
|
|
"addr 0x%" HWADDR_PRIX ", size %" HWADDR_PRIu ", "
|
|
|
|
|
"region '%s'\n", addr, len, memory_region_name(mr));
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2024-03-07 15:37:09 +00:00
|
|
|
/*
 * Perform one step of a write: a single access to @mr at offset @mr_addr.
 *
 * @buf points at the data still to be written and @len is how much of it is
 * left.  On entry *@l is the number of bytes the translation allows for this
 * region; on a successful step it is updated to the number of bytes that
 * were actually consumed.  *@l is left untouched when the access is denied.
 */
static MemTxResult flatview_write_continue_step(MemTxAttrs attrs,
                                                const uint8_t *buf,
                                                hwaddr len, hwaddr mr_addr,
                                                hwaddr *l, MemoryRegion *mr)
{
    uint64_t val;
    MemTxResult result;
    bool release_lock;

    if (!flatview_access_allowed(mr, attrs, mr_addr, *l)) {
        return MEMTX_ACCESS_ERROR;
    }

    if (memory_access_is_direct(mr, true, attrs)) {
        /* RAM case: copy straight into host memory. */
        uint8_t *host = qemu_ram_ptr_length(mr->ram_block, mr_addr, l,
                                            false, true);

        memmove(host, buf, *l);
        invalidate_and_set_dirty(mr, mr_addr, *l);

        return MEMTX_OK;
    }

    /* I/O case: one dispatch of at most 8 bytes. */
    release_lock = prepare_mmio_access(mr);
    *l = memory_access_size(mr, *l, mr_addr);
    /*
     * XXX: could force current_cpu to NULL to avoid
     * potential bugs
     */

    /*
     * Assure Coverity (and ourselves) that we are not going to OVERRUN
     * the buffer by following ldn_he_p().
     */
#ifdef QEMU_STATIC_ANALYSIS
    assert((*l == 1 && len >= 1) ||
           (*l == 2 && len >= 2) ||
           (*l == 4 && len >= 4) ||
           (*l == 8 && len >= 8));
#endif
    val = ldn_he_p(buf, *l);
    result = memory_region_dispatch_write(mr, mr_addr, val,
                                          size_memop(*l), attrs);

    /* Only drop the BQL if prepare_mmio_access() took it for us. */
    if (release_lock) {
        bql_unlock();
    }

    return result;
}
|
|
|
|
|
|
2015-12-09 10:18:57 +01:00
|
|
|
/* Called within RCU critical section. */
|
2017-09-21 18:50:58 +10:00
|
|
|
static MemTxResult flatview_write_continue(FlatView *fv, hwaddr addr,
|
|
|
|
|
MemTxAttrs attrs,
|
2020-02-19 19:52:44 +01:00
|
|
|
const void *ptr,
|
2024-03-07 15:37:07 +00:00
|
|
|
hwaddr len, hwaddr mr_addr,
|
2017-09-21 18:50:58 +10:00
|
|
|
hwaddr l, MemoryRegion *mr)
|
2004-01-24 15:23:36 +00:00
|
|
|
{
|
2015-04-26 16:49:23 +01:00
|
|
|
MemTxResult result = MEMTX_OK;
|
2020-02-19 19:52:44 +01:00
|
|
|
const uint8_t *buf = ptr;
|
2007-09-17 08:09:54 +00:00
|
|
|
|
2015-12-09 10:18:57 +01:00
|
|
|
for (;;) {
|
2024-03-07 15:37:09 +00:00
|
|
|
result |= flatview_write_continue_step(attrs, buf, len, mr_addr, &l,
|
|
|
|
|
mr);
|
2015-06-18 18:47:22 +02:00
|
|
|
|
2004-01-24 15:23:36 +00:00
|
|
|
len -= l;
|
|
|
|
|
buf += l;
|
|
|
|
|
addr += l;
|
2015-12-09 10:18:57 +01:00
|
|
|
|
|
|
|
|
if (!len) {
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
l = len;
|
2024-03-07 15:37:07 +00:00
|
|
|
mr = flatview_translate(fv, addr, &mr_addr, &l, true, attrs);
|
2004-01-24 15:23:36 +00:00
|
|
|
}
|
2013-05-21 09:56:55 +02:00
|
|
|
|
2015-04-26 16:49:23 +01:00
|
|
|
return result;
|
2004-01-24 15:23:36 +00:00
|
|
|
}
|
2005-01-28 22:37:22 +00:00
|
|
|
|
2018-03-05 09:23:56 +01:00
|
|
|
/* Called from RCU critical section. */
|
2017-09-21 18:50:58 +10:00
|
|
|
static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs,
|
2020-02-19 19:52:44 +01:00
|
|
|
const void *buf, hwaddr len)
|
2012-10-03 16:22:53 +02:00
|
|
|
{
|
2015-12-09 10:06:31 +01:00
|
|
|
hwaddr l;
|
2024-03-07 15:37:07 +00:00
|
|
|
hwaddr mr_addr;
|
2015-12-09 10:06:31 +01:00
|
|
|
MemoryRegion *mr;
|
|
|
|
|
|
2018-03-05 09:23:56 +01:00
|
|
|
l = len;
|
2024-03-07 15:37:07 +00:00
|
|
|
mr = flatview_translate(fv, addr, &mr_addr, &l, true, attrs);
|
2025-09-03 10:29:32 -04:00
|
|
|
if (!flatview_access_allowed(mr, attrs, mr_addr, l)) {
|
2021-12-15 19:24:21 +01:00
|
|
|
return MEMTX_ACCESS_ERROR;
|
|
|
|
|
}
|
2021-12-15 19:24:20 +01:00
|
|
|
return flatview_write_continue(fv, addr, attrs, buf, len,
|
2024-03-07 15:37:07 +00:00
|
|
|
mr_addr, l, mr);
|
2015-12-09 10:18:57 +01:00
|
|
|
}
|
|
|
|
|
|
2024-03-07 15:37:09 +00:00
|
|
|
static MemTxResult flatview_read_continue_step(MemTxAttrs attrs, uint8_t *buf,
|
|
|
|
|
hwaddr len, hwaddr mr_addr,
|
|
|
|
|
hwaddr *l,
|
|
|
|
|
MemoryRegion *mr)
|
|
|
|
|
{
|
|
|
|
|
if (!flatview_access_allowed(mr, attrs, mr_addr, *l)) {
|
|
|
|
|
return MEMTX_ACCESS_ERROR;
|
|
|
|
|
}
|
|
|
|
|
|
2025-02-10 09:46:46 +01:00
|
|
|
if (!memory_access_is_direct(mr, false, attrs)) {
|
2024-03-07 15:37:09 +00:00
|
|
|
/* I/O case */
|
|
|
|
|
uint64_t val;
|
|
|
|
|
MemTxResult result;
|
|
|
|
|
bool release_lock = prepare_mmio_access(mr);
|
|
|
|
|
|
|
|
|
|
*l = memory_access_size(mr, *l, mr_addr);
|
|
|
|
|
result = memory_region_dispatch_read(mr, mr_addr, &val, size_memop(*l),
|
|
|
|
|
attrs);
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Assure Coverity (and ourselves) that we are not going to OVERRUN
|
|
|
|
|
* the buffer by following stn_he_p().
|
|
|
|
|
*/
|
|
|
|
|
#ifdef QEMU_STATIC_ANALYSIS
|
|
|
|
|
assert((*l == 1 && len >= 1) ||
|
|
|
|
|
(*l == 2 && len >= 2) ||
|
|
|
|
|
(*l == 4 && len >= 4) ||
|
|
|
|
|
(*l == 8 && len >= 8));
|
|
|
|
|
#endif
|
|
|
|
|
stn_he_p(buf, *l, val);
|
|
|
|
|
|
|
|
|
|
if (release_lock) {
|
|
|
|
|
bql_unlock();
|
|
|
|
|
}
|
|
|
|
|
return result;
|
|
|
|
|
} else {
|
|
|
|
|
/* RAM case */
|
|
|
|
|
uint8_t *ram_ptr = qemu_ram_ptr_length(mr->ram_block, mr_addr, l,
|
2024-04-30 18:49:35 +02:00
|
|
|
false, false);
|
2024-03-07 15:37:09 +00:00
|
|
|
|
|
|
|
|
memcpy(buf, ram_ptr, *l);
|
|
|
|
|
|
|
|
|
|
return MEMTX_OK;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2015-12-09 10:18:57 +01:00
|
|
|
/* Called within RCU critical section. */
|
2017-09-21 18:50:58 +10:00
|
|
|
MemTxResult flatview_read_continue(FlatView *fv, hwaddr addr,
|
2020-02-19 19:52:44 +01:00
|
|
|
MemTxAttrs attrs, void *ptr,
|
2024-03-07 15:37:07 +00:00
|
|
|
hwaddr len, hwaddr mr_addr, hwaddr l,
|
2017-09-21 18:50:58 +10:00
|
|
|
MemoryRegion *mr)
|
2015-12-09 10:18:57 +01:00
|
|
|
{
|
|
|
|
|
MemTxResult result = MEMTX_OK;
|
2020-02-19 19:52:44 +01:00
|
|
|
uint8_t *buf = ptr;
|
2015-12-09 10:06:31 +01:00
|
|
|
|
2021-03-15 10:05:12 -04:00
|
|
|
fuzz_dma_read_cb(addr, len, mr);
|
2015-12-09 10:18:57 +01:00
|
|
|
for (;;) {
|
2024-03-07 15:37:09 +00:00
|
|
|
result |= flatview_read_continue_step(attrs, buf, len, mr_addr, &l, mr);
|
2015-12-09 10:06:31 +01:00
|
|
|
|
|
|
|
|
len -= l;
|
|
|
|
|
buf += l;
|
|
|
|
|
addr += l;
|
2015-12-09 10:18:57 +01:00
|
|
|
|
|
|
|
|
if (!len) {
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
l = len;
|
2024-03-07 15:37:07 +00:00
|
|
|
mr = flatview_translate(fv, addr, &mr_addr, &l, false, attrs);
|
2015-12-09 10:18:57 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return result;
|
|
|
|
|
}
|
|
|
|
|
|
2018-03-05 00:19:49 +01:00
|
|
|
/* Called from RCU critical section. */
|
|
|
|
|
static MemTxResult flatview_read(FlatView *fv, hwaddr addr,
|
2020-02-19 19:52:44 +01:00
|
|
|
MemTxAttrs attrs, void *buf, hwaddr len)
|
2015-12-09 10:18:57 +01:00
|
|
|
{
|
|
|
|
|
hwaddr l;
|
2024-03-07 15:37:07 +00:00
|
|
|
hwaddr mr_addr;
|
2015-12-09 10:18:57 +01:00
|
|
|
MemoryRegion *mr;
|
2015-12-09 10:06:31 +01:00
|
|
|
|
2018-03-05 00:19:49 +01:00
|
|
|
l = len;
|
2024-03-07 15:37:07 +00:00
|
|
|
mr = flatview_translate(fv, addr, &mr_addr, &l, false, attrs);
|
2025-09-03 10:29:32 -04:00
|
|
|
if (!flatview_access_allowed(mr, attrs, mr_addr, l)) {
|
2021-12-15 19:24:21 +01:00
|
|
|
return MEMTX_ACCESS_ERROR;
|
|
|
|
|
}
|
2018-03-05 00:19:49 +01:00
|
|
|
return flatview_read_continue(fv, addr, attrs, buf, len,
|
2024-03-07 15:37:07 +00:00
|
|
|
mr_addr, l, mr);
|
2012-10-03 16:22:53 +02:00
|
|
|
}
|
|
|
|
|
|
2018-03-05 00:19:49 +01:00
|
|
|
MemTxResult address_space_read_full(AddressSpace *as, hwaddr addr,
|
2020-02-19 19:54:35 +01:00
|
|
|
MemTxAttrs attrs, void *buf, hwaddr len)
|
2018-03-05 00:19:49 +01:00
|
|
|
{
|
|
|
|
|
MemTxResult result = MEMTX_OK;
|
|
|
|
|
FlatView *fv;
|
|
|
|
|
|
|
|
|
|
if (len > 0) {
|
2019-10-07 15:36:41 +01:00
|
|
|
RCU_READ_LOCK_GUARD();
|
2018-03-05 00:19:49 +01:00
|
|
|
fv = address_space_to_flatview(as);
|
|
|
|
|
result = flatview_read(fv, addr, attrs, buf, len);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return result;
|
|
|
|
|
}
|
|
|
|
|
|
2018-03-05 09:23:56 +01:00
|
|
|
MemTxResult address_space_write(AddressSpace *as, hwaddr addr,
|
|
|
|
|
MemTxAttrs attrs,
|
2020-02-19 19:54:35 +01:00
|
|
|
const void *buf, hwaddr len)
|
2018-03-05 09:23:56 +01:00
|
|
|
{
|
|
|
|
|
MemTxResult result = MEMTX_OK;
|
|
|
|
|
FlatView *fv;
|
|
|
|
|
|
|
|
|
|
if (len > 0) {
|
2019-10-07 15:36:41 +01:00
|
|
|
RCU_READ_LOCK_GUARD();
|
2018-03-05 09:23:56 +01:00
|
|
|
fv = address_space_to_flatview(as);
|
|
|
|
|
result = flatview_write(fv, addr, attrs, buf, len);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return result;
|
|
|
|
|
}
|
|
|
|
|
|
2018-03-05 09:29:04 +01:00
|
|
|
/*
 * Direction-agnostic front end: forwards to address_space_write() when
 * @is_write is true and to address_space_read_full() otherwise.
 */
MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs,
                             void *buf, hwaddr len, bool is_write)
{
    return is_write ? address_space_write(as, addr, attrs, buf, len)
                    : address_space_read_full(as, addr, attrs, buf, len);
}
|
|
|
|
|
|
2022-01-15 21:37:23 +01:00
|
|
|
/*
 * Fill @len bytes of @as starting at @addr with the byte @c.
 *
 * The fill is done through address_space_write() in chunks of at most
 * FILLBUF_SIZE bytes taken from a pattern buffer on the stack.
 *
 * Returns: the OR of the results of all the chunk writes.
 */
MemTxResult address_space_set(AddressSpace *as, hwaddr addr,
                              uint8_t c, hwaddr len, MemTxAttrs attrs)
{
#define FILLBUF_SIZE 512
    uint8_t fillbuf[FILLBUF_SIZE];
    MemTxResult status = MEMTX_OK;

    memset(fillbuf, c, sizeof(fillbuf));

    while (len > 0) {
        /* Full buffer, except possibly for the last chunk. */
        int chunk = MIN(len, FILLBUF_SIZE);

        status |= address_space_write(as, addr, attrs, fillbuf, chunk);
        len -= chunk;
        addr += chunk;
    }

    return status;
}
|
|
|
|
|
|
2025-09-29 15:57:57 +02:00
|
|
|
/*
 * Read @len bytes at @addr of address_space_memory into @buf, with
 * unspecified transaction attributes.  The transaction result is discarded.
 */
void cpu_physical_memory_read(hwaddr addr, void *buf, hwaddr len)
{
    address_space_read(&address_space_memory, addr, MEMTXATTRS_UNSPECIFIED,
                       buf, len);
}
|
|
|
|
|
|
|
|
|
|
/*
 * Write @len bytes from @buf at @addr of address_space_memory, with
 * unspecified transaction attributes.  The transaction result is discarded.
 */
void cpu_physical_memory_write(hwaddr addr, const void *buf, hwaddr len)
{
    address_space_write(&address_space_memory, addr, MEMTXATTRS_UNSPECIFIED,
                        buf, len);
}
|
|
|
|
|
|
2025-09-22 12:29:40 -07:00
|
|
|
/* used for ROM loading : can write in RAM and ROM */
|
|
|
|
|
MemTxResult address_space_write_rom(AddressSpace *as, hwaddr addr,
|
|
|
|
|
MemTxAttrs attrs,
|
|
|
|
|
const void *buf, hwaddr len)
|
2006-04-23 17:14:48 +00:00
|
|
|
{
|
2019-10-07 15:36:41 +01:00
|
|
|
RCU_READ_LOCK_GUARD();
|
2006-04-23 17:14:48 +00:00
|
|
|
while (len > 0) {
|
2025-09-22 12:29:40 -07:00
|
|
|
hwaddr addr1, l = len;
|
|
|
|
|
MemoryRegion *mr = address_space_translate(as, addr, &addr1, &l,
|
|
|
|
|
true, attrs);
|
2007-09-17 08:09:54 +00:00
|
|
|
|
2025-02-10 09:46:45 +01:00
|
|
|
if (!memory_region_supports_direct_access(mr)) {
|
exec: skip MMIO regions correctly in cpu_physical_memory_write_rom_internal
Loading the BIOS in the mac99 machine is interesting, because there is a
PROM in the middle of the BIOS region (from 16K to 32K). Before memory
region accesses were clamped, when QEMU was asked to load a BIOS from
0xfff00000 to 0xffffffff it would put even those 16K from the BIOS file
into the region. This is weird because those 16K were not actually
visible between 0xfff04000 and 0xfff07fff. However, it worked.
After clamping was added, this also worked. In this case, the
cpu_physical_memory_write_rom_internal function split the write in
three parts: the first 16K were copied, the PROM area (second 16K) were
ignored, then the rest was copied.
Problems then started with commit 965eb2f (exec: do not clamp accesses
to MMIO regions, 2015-06-17). Clamping accesses is not done for MMIO
regions because they can overlap wildly, and MMIO registers can be
expected to perform full-width accesses based only on their address
(with no respect for adjacent registers that could decode to completely
different MemoryRegions). However, this lack of clamping also applied
to the PROM area! cpu_physical_memory_write_rom_internal thus failed
to copy the third range above, i.e. only copied the first 16K of the BIOS.
In effect, address_space_translate is expecting _something else_ to do
the clamping for MMIO regions if the incoming length is large. This
"something else" is memory_access_size in the case of address_space_rw,
so use the same logic in cpu_physical_memory_write_rom_internal.
Reported-by: Alexander Graf <agraf@redhat.com>
Reviewed-by: Laurent Vivier <lvivier@redhat.com>
Tested-by: Laurent Vivier <lvivier@redhat.com>
Fixes: 965eb2f
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2015-07-04 00:24:51 +02:00
|
|
|
l = memory_access_size(mr, l, addr1);
|
2006-04-23 17:14:48 +00:00
|
|
|
} else {
|
|
|
|
|
/* ROM/RAM case */
|
2025-09-22 12:29:40 -07:00
|
|
|
void *ram_ptr = qemu_map_ram_ptr(mr->ram_block, addr1);
|
|
|
|
|
memcpy(ram_ptr, buf, l);
|
|
|
|
|
invalidate_and_set_dirty(mr, addr1, l);
|
2006-04-23 17:14:48 +00:00
|
|
|
}
|
|
|
|
|
len -= l;
|
|
|
|
|
addr += l;
|
2025-09-22 12:29:40 -07:00
|
|
|
buf += l;
|
2006-04-23 17:14:48 +00:00
|
|
|
}
|
2018-12-14 13:30:48 +00:00
|
|
|
return MEMTX_OK;
|
2006-04-23 17:14:48 +00:00
|
|
|
}
|
|
|
|
|
|
2025-09-29 15:40:33 +02:00
|
|
|
/*
 * Flush the host caches for the host memory backing @len bytes of @as at
 * @addr.
 *
 * This function should do the same thing as an icache flush that was
 * triggered from within the guest.  For TCG we are always cache coherent,
 * so there is no need to flush anything.  For KVM / Xen we need to flush
 * the host's instruction cache at least.
 */
void address_space_flush_icache_range(AddressSpace *as, hwaddr addr, hwaddr len)
{
    if (tcg_enabled()) {
        return;
    }

    RCU_READ_LOCK_GUARD();
    while (len > 0) {
        hwaddr offset, chunk = len;
        MemoryRegion *mr = address_space_translate(as, addr, &offset, &chunk,
                                                   true,
                                                   MEMTXATTRS_UNSPECIFIED);

        if (memory_region_supports_direct_access(mr)) {
            /* ROM/RAM case */
            void *host = qemu_map_ram_ptr(mr->ram_block, offset);

            flush_idcache_range((uintptr_t)host, (uintptr_t)host, chunk);
        } else {
            /* Nothing to flush; just work out how far to skip ahead. */
            chunk = memory_access_size(mr, chunk, offset);
        }

        len -= chunk;
        addr += chunk;
    }
}
|
|
|
|
|
|
2024-08-19 06:54:54 -07:00
|
|
|
/*
|
|
|
|
|
* A magic value stored in the first 8 bytes of the bounce buffer struct. Used
|
|
|
|
|
* to detect illegal pointers passed to address_space_unmap.
|
|
|
|
|
*/
|
|
|
|
|
#define BOUNCE_BUFFER_MAGIC 0xb4017ceb4ffe12ed
|
|
|
|
|
|
|
|
|
|
typedef struct {
|
|
|
|
|
uint64_t magic;
|
|
|
|
|
MemoryRegion *mr;
|
|
|
|
|
hwaddr addr;
|
|
|
|
|
size_t len;
|
|
|
|
|
uint8_t buffer[];
|
|
|
|
|
} BounceBuffer;
|
|
|
|
|
|
2023-09-07 06:04:23 -07:00
|
|
|
static void
|
|
|
|
|
address_space_unregister_map_client_do(AddressSpaceMapClient *client)
|
2015-03-16 17:03:37 +08:00
|
|
|
{
|
|
|
|
|
QLIST_REMOVE(client, link);
|
|
|
|
|
g_free(client);
|
|
|
|
|
}
|
|
|
|
|
|
2023-09-07 06:04:23 -07:00
|
|
|
/*
 * Schedule the bottom half of every registered map client of @as and drop
 * each client from the list, leaving the list empty.
 *
 * As the name says, the caller must hold as->map_client_list_lock.
 */
static void address_space_notify_map_clients_locked(AddressSpace *as)
{
    AddressSpaceMapClient *head;

    while ((head = QLIST_FIRST(&as->map_client_list)) != NULL) {
        qemu_bh_schedule(head->bh);
        address_space_unregister_map_client_do(head);
    }
}
|
|
|
|
|
|
2023-09-07 06:04:23 -07:00
|
|
|
/*
 * Register @bh to be scheduled once bounce-buffer space of @as is available.
 *
 * If space is already available at registration time, all registered
 * clients (including the new one) are notified right away.
 */
void address_space_register_map_client(AddressSpace *as, QEMUBH *bh)
{
    AddressSpaceMapClient *entry = g_malloc(sizeof(*entry));

    QEMU_LOCK_GUARD(&as->map_client_list_lock);
    entry->bh = bh;
    QLIST_INSERT_HEAD(&as->map_client_list, entry, link);

    /*
     * Write map_client_list before reading bounce_buffer_size.  This pairs
     * with the barrier in address_space_unmap, which orders the same two
     * accesses the other way round.
     */
    smp_mb();
    if (qatomic_read(&as->bounce_buffer_size) < as->max_bounce_buffer_size) {
        address_space_notify_map_clients_locked(as);
    }
}
|
|
|
|
|
|
2015-03-16 17:03:35 +08:00
|
|
|
void cpu_exec_init_all(void)
|
2009-01-22 16:59:16 +00:00
|
|
|
{
|
2015-03-16 17:03:35 +08:00
|
|
|
qemu_mutex_init(&ram_list.mutex);
|
2016-10-24 16:26:49 +01:00
|
|
|
/* The data structures we set up here depend on knowing the page size,
|
|
|
|
|
* so no more changes can be made after this point.
|
|
|
|
|
* In an ideal world, nothing we did before we had finished the
|
|
|
|
|
* machine setup would care about the target page size, and we could
|
|
|
|
|
* do this much later, rather than requiring board models to state
|
|
|
|
|
* up front what their requirements are.
|
|
|
|
|
*/
|
|
|
|
|
finalize_target_page_bits();
|
2015-03-16 17:03:35 +08:00
|
|
|
io_mem_init();
|
2015-11-02 09:23:52 +01:00
|
|
|
memory_map_init();
|
2009-01-22 16:59:16 +00:00
|
|
|
}
|
|
|
|
|
|
2023-09-07 06:04:23 -07:00
|
|
|
/*
 * Remove the first map client of @as registered with @bh, if there is one.
 * Does nothing when @bh is not registered.
 */
void address_space_unregister_map_client(AddressSpace *as, QEMUBH *bh)
{
    AddressSpaceMapClient *entry;

    QEMU_LOCK_GUARD(&as->map_client_list_lock);
    QLIST_FOREACH(entry, &as->map_client_list, link) {
        if (entry->bh != bh) {
            continue;
        }
        /* @entry is freed here, so the iteration must not go on. */
        address_space_unregister_map_client_do(entry);
        return;
    }
}
|
|
|
|
|
|
2023-09-07 06:04:23 -07:00
|
|
|
/*
 * Locking wrapper: takes as->map_client_list_lock and notifies all map
 * clients of @as through address_space_notify_map_clients_locked().
 */
static void address_space_notify_map_clients(AddressSpace *as)
{
    QEMU_LOCK_GUARD(&as->map_client_list_lock);
    address_space_notify_map_clients_locked(as);
}
|
|
|
|
|
|
2019-01-17 20:49:01 +08:00
|
|
|
/*
 * Check whether an access of @len bytes at @addr of @fv would be accepted.
 *
 * The range is walked chunk by chunk.  Directly accessible chunks are always
 * fine; any other chunk is checked with memory_region_access_valid() for the
 * size a real access would use.
 *
 * Returns: false as soon as one chunk is rejected, true otherwise.
 */
static bool flatview_access_valid(FlatView *fv, hwaddr addr, hwaddr len,
                                  bool is_write, MemTxAttrs attrs)
{
    hwaddr chunk, xlat;

    for (; len > 0; len -= chunk, addr += chunk) {
        MemoryRegion *mr;

        chunk = len;
        mr = flatview_translate(fv, addr, &xlat, &chunk, is_write, attrs);
        if (memory_access_is_direct(mr, is_write, attrs)) {
            continue;
        }

        /*
         * NOTE(review): the alignment bound is computed from @addr here,
         * whereas the read/write paths in this file pass the region-relative
         * offset to memory_access_size() -- confirm this is intended.
         */
        chunk = memory_access_size(mr, chunk, addr);
        if (!memory_region_access_valid(mr, xlat, chunk, is_write, attrs)) {
            return false;
        }
    }

    return true;
}
|
|
|
|
|
|
2017-09-21 18:50:58 +10:00
|
|
|
bool address_space_access_valid(AddressSpace *as, hwaddr addr,
|
2019-01-17 20:49:01 +08:00
|
|
|
hwaddr len, bool is_write,
|
2018-05-31 14:50:52 +01:00
|
|
|
MemTxAttrs attrs)
|
2017-09-21 18:50:58 +10:00
|
|
|
{
|
2018-03-05 00:23:26 +01:00
|
|
|
FlatView *fv;
|
|
|
|
|
|
2019-10-07 15:36:41 +01:00
|
|
|
RCU_READ_LOCK_GUARD();
|
2018-03-05 00:23:26 +01:00
|
|
|
fv = address_space_to_flatview(as);
|
2021-12-15 19:24:20 +01:00
|
|
|
return flatview_access_valid(fv, addr, len, is_write, attrs);
|
2017-09-21 18:50:58 +10:00
|
|
|
}
|
|
|
|
|
|
2025-09-29 14:36:19 +02:00
|
|
|
/*
 * Tell whether @addr of @as is backed by something other than RAM or a ROM
 * device.
 *
 * Returns: true if the region @addr translates to (as a read, with
 * unspecified attributes) is neither RAM nor ROMD.
 */
bool address_space_is_io(AddressSpace *as, hwaddr addr)
{
    hwaddr offset;
    MemoryRegion *mr;

    RCU_READ_LOCK_GUARD();
    /* Only the region matters; the offset and length are not used. */
    mr = address_space_translate(as, addr, &offset, NULL, false,
                                 MEMTXATTRS_UNSPECIFIED);

    return !memory_region_is_ram(mr) && !memory_region_is_romd(mr);
}
|
|
|
|
|
|
2016-11-22 12:04:31 +01:00
|
|
|
static hwaddr
|
2017-09-21 18:50:58 +10:00
|
|
|
flatview_extend_translation(FlatView *fv, hwaddr addr,
|
2018-05-31 14:50:52 +01:00
|
|
|
hwaddr target_len,
|
|
|
|
|
MemoryRegion *mr, hwaddr base, hwaddr len,
|
|
|
|
|
bool is_write, MemTxAttrs attrs)
|
2016-11-22 12:04:31 +01:00
|
|
|
{
|
|
|
|
|
hwaddr done = 0;
|
|
|
|
|
hwaddr xlat;
|
|
|
|
|
MemoryRegion *this_mr;
|
|
|
|
|
|
|
|
|
|
for (;;) {
|
|
|
|
|
target_len -= len;
|
|
|
|
|
addr += len;
|
|
|
|
|
done += len;
|
|
|
|
|
if (target_len == 0) {
|
|
|
|
|
return done;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
len = target_len;
|
2017-09-21 18:50:58 +10:00
|
|
|
this_mr = flatview_translate(fv, addr, &xlat,
|
2018-05-31 14:50:52 +01:00
|
|
|
&len, is_write, attrs);
|
2016-11-22 12:04:31 +01:00
|
|
|
if (this_mr != mr || xlat != base + done) {
|
|
|
|
|
return done;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2009-01-22 16:59:11 +00:00
|
|
|
/* Map a physical memory region into a host virtual address.
|
|
|
|
|
* May map a subset of the requested range, given by and returned in *plen.
|
|
|
|
|
* May return NULL if resources needed to perform the mapping are exhausted.
|
|
|
|
|
* Use only for reads OR writes - not for read-modify-write operations.
|
2023-09-07 06:04:23 -07:00
|
|
|
* Use address_space_register_map_client() to know when retrying the map
|
|
|
|
|
* operation is likely to succeed.
|
2009-01-22 16:59:11 +00:00
|
|
|
*/
|
2012-10-03 16:22:53 +02:00
|
|
|
void *address_space_map(AddressSpace *as,
|
2012-10-23 12:30:10 +02:00
|
|
|
hwaddr addr,
|
|
|
|
|
hwaddr *plen,
|
2018-05-31 14:50:52 +01:00
|
|
|
bool is_write,
|
|
|
|
|
MemTxAttrs attrs)
|
2009-01-22 16:59:11 +00:00
|
|
|
{
|
2012-10-23 12:30:10 +02:00
|
|
|
hwaddr len = *plen;
|
2016-11-22 12:04:31 +01:00
|
|
|
hwaddr l, xlat;
|
|
|
|
|
MemoryRegion *mr;
|
2018-03-05 00:23:26 +01:00
|
|
|
FlatView *fv;
|
2009-01-22 16:59:11 +00:00
|
|
|
|
2024-07-05 09:40:10 +01:00
|
|
|
trace_address_space_map(as, addr, len, is_write, *(uint32_t *) &attrs);
|
|
|
|
|
|
2013-06-28 17:29:27 +02:00
|
|
|
if (len == 0) {
|
|
|
|
|
return NULL;
|
|
|
|
|
}
|
2011-05-19 18:35:45 +01:00
|
|
|
|
2013-06-28 17:29:27 +02:00
|
|
|
l = len;
|
2019-10-07 15:36:41 +01:00
|
|
|
RCU_READ_LOCK_GUARD();
|
2018-03-05 00:23:26 +01:00
|
|
|
fv = address_space_to_flatview(as);
|
2018-05-31 14:50:52 +01:00
|
|
|
mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs);
|
2015-03-18 14:21:43 +01:00
|
|
|
|
2025-02-10 09:46:46 +01:00
|
|
|
if (!memory_access_is_direct(mr, is_write, attrs)) {
|
2024-08-19 06:54:54 -07:00
|
|
|
size_t used = qatomic_read(&as->bounce_buffer_size);
|
|
|
|
|
for (;;) {
|
|
|
|
|
hwaddr alloc = MIN(as->max_bounce_buffer_size - used, l);
|
|
|
|
|
size_t new_size = used + alloc;
|
|
|
|
|
size_t actual =
|
|
|
|
|
qatomic_cmpxchg(&as->bounce_buffer_size, used, new_size);
|
|
|
|
|
if (actual == used) {
|
|
|
|
|
l = alloc;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
used = actual;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (l == 0) {
|
2020-05-26 16:47:43 +05:30
|
|
|
*plen = 0;
|
2013-06-28 17:29:27 +02:00
|
|
|
return NULL;
|
2009-01-22 16:59:11 +00:00
|
|
|
}
|
2013-06-28 17:33:29 +02:00
|
|
|
|
2024-08-19 06:54:54 -07:00
|
|
|
BounceBuffer *bounce = g_malloc0(l + sizeof(BounceBuffer));
|
|
|
|
|
bounce->magic = BOUNCE_BUFFER_MAGIC;
|
2013-06-28 17:33:29 +02:00
|
|
|
memory_region_ref(mr);
|
2024-08-19 06:54:54 -07:00
|
|
|
bounce->mr = mr;
|
|
|
|
|
bounce->addr = addr;
|
|
|
|
|
bounce->len = l;
|
|
|
|
|
|
2013-06-28 17:29:27 +02:00
|
|
|
if (!is_write) {
|
2024-09-12 15:04:04 +08:00
|
|
|
flatview_read(fv, addr, attrs,
|
2024-08-19 06:54:54 -07:00
|
|
|
bounce->buffer, l);
|
2011-06-27 18:26:06 +01:00
|
|
|
}
|
2009-01-22 16:59:11 +00:00
|
|
|
|
2013-06-28 17:29:27 +02:00
|
|
|
*plen = l;
|
2024-08-19 06:54:54 -07:00
|
|
|
return bounce->buffer;
|
2013-06-28 17:29:27 +02:00
|
|
|
}
|
|
|
|
|
|
2013-06-28 17:33:29 +02:00
|
|
|
memory_region_ref(mr);
|
2017-09-21 18:50:58 +10:00
|
|
|
*plen = flatview_extend_translation(fv, addr, len, mr, xlat,
|
2018-05-31 14:50:52 +01:00
|
|
|
l, is_write, attrs);
|
2021-01-20 01:02:55 -05:00
|
|
|
fuzz_dma_read_cb(addr, *plen, mr);
|
2024-04-30 18:49:35 +02:00
|
|
|
return qemu_ram_ptr_length(mr->ram_block, xlat, plen, true, is_write);
|
2009-01-22 16:59:11 +00:00
|
|
|
}
|
|
|
|
|
|
2012-10-03 16:22:53 +02:00
|
|
|
/* Unmaps a memory region previously mapped by address_space_map().
|
2020-02-19 20:12:01 +01:00
|
|
|
* Will also mark the memory as dirty if is_write is true. access_len gives
|
2009-01-22 16:59:11 +00:00
|
|
|
* the amount of memory that was actually read or written by the caller.
|
|
|
|
|
*/
|
2012-10-23 12:30:10 +02:00
|
|
|
void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len,
|
2020-02-19 20:12:01 +01:00
|
|
|
bool is_write, hwaddr access_len)
|
2009-01-22 16:59:11 +00:00
|
|
|
{
|
2024-08-19 06:54:54 -07:00
|
|
|
MemoryRegion *mr;
|
|
|
|
|
ram_addr_t addr1;
|
2013-06-28 17:33:29 +02:00
|
|
|
|
2024-08-19 06:54:54 -07:00
|
|
|
mr = memory_region_from_host(buffer, &addr1);
|
|
|
|
|
if (mr != NULL) {
|
2009-01-22 16:59:11 +00:00
|
|
|
if (is_write) {
|
2015-03-23 11:45:53 +01:00
|
|
|
invalidate_and_set_dirty(mr, addr1, access_len);
|
2009-01-22 16:59:11 +00:00
|
|
|
}
|
2011-06-21 22:59:09 +02:00
|
|
|
if (xen_enabled()) {
|
2011-06-21 22:59:08 +02:00
|
|
|
xen_invalidate_map_cache_entry(buffer);
|
2010-09-16 13:57:49 +01:00
|
|
|
}
|
2013-06-28 17:33:29 +02:00
|
|
|
memory_region_unref(mr);
|
2009-01-22 16:59:11 +00:00
|
|
|
return;
|
|
|
|
|
}
|
2024-08-19 06:54:54 -07:00
|
|
|
|
|
|
|
|
|
|
|
|
|
BounceBuffer *bounce = container_of(buffer, BounceBuffer, buffer);
|
|
|
|
|
assert(bounce->magic == BOUNCE_BUFFER_MAGIC);
|
|
|
|
|
|
2009-01-22 16:59:11 +00:00
|
|
|
if (is_write) {
|
2024-08-19 06:54:54 -07:00
|
|
|
address_space_write(as, bounce->addr, MEMTXATTRS_UNSPECIFIED,
|
|
|
|
|
bounce->buffer, access_len);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
qatomic_sub(&as->bounce_buffer_size, bounce->len);
|
|
|
|
|
bounce->magic = ~BOUNCE_BUFFER_MAGIC;
|
|
|
|
|
memory_region_unref(bounce->mr);
|
|
|
|
|
g_free(bounce);
|
|
|
|
|
/* Write bounce_buffer_size before reading map_client_list. */
|
|
|
|
|
smp_mb();
|
2023-09-07 06:04:23 -07:00
|
|
|
address_space_notify_map_clients(as);
|
2009-01-22 16:59:11 +00:00
|
|
|
}
|
2006-04-23 17:14:48 +00:00
|
|
|
|
2012-10-23 12:30:10 +02:00
|
|
|
/*
 * Convenience wrapper: address_space_map() on the global
 * address_space_memory with unspecified transaction attributes.
 */
void *cpu_physical_memory_map(hwaddr addr,
                              hwaddr *plen,
                              bool is_write)
{
    AddressSpace *sysmem = &address_space_memory;

    return address_space_map(sysmem, addr, plen, is_write,
                             MEMTXATTRS_UNSPECIFIED);
}
|
|
|
|
|
|
2012-10-23 12:30:10 +02:00
|
|
|
/*
 * Convenience wrapper: address_space_unmap() on the global
 * address_space_memory.
 *
 * @buffer: host pointer of the mapping to release
 * @len: length of the mapping
 * @is_write: true if the mapping may have been written through
 * @access_len: number of bytes actually accessed by the caller
 */
void cpu_physical_memory_unmap(void *buffer, hwaddr len,
                               bool is_write, hwaddr access_len)
{
    /* A void function must not "return <expr>;" in ISO C; just call. */
    address_space_unmap(&address_space_memory, buffer, len, is_write,
                        access_len);
}
|
|
|
|
|
|
2016-11-22 11:34:02 +01:00
|
|
|
/*
 * Parameterise and include the memory_ldst.c.inc template for a plain
 * AddressSpace argument: empty function-name suffix, translation through
 * address_space_translate(), and real RCU read-side locking.
 * NOTE(review): presumably the template #undefs these macros itself, since
 * the file defines them again further down - confirm in memory_ldst.c.inc.
 */
#define ARG1_DECL AddressSpace *as
#define ARG1 as
#define SUFFIX
#define TRANSLATE(...) address_space_translate(as, __VA_ARGS__)
#define RCU_READ_LOCK(...) rcu_read_lock()
#define RCU_READ_UNLOCK(...) rcu_read_unlock()
#include "memory_ldst.c.inc"
|
2011-07-06 09:09:23 +02:00
|
|
|
|
2016-11-22 12:04:52 +01:00
|
|
|
int64_t address_space_cache_init(MemoryRegionCache *cache,
|
|
|
|
|
AddressSpace *as,
|
|
|
|
|
hwaddr addr,
|
|
|
|
|
hwaddr len,
|
|
|
|
|
bool is_write)
|
|
|
|
|
{
|
2018-03-18 18:26:36 +01:00
|
|
|
AddressSpaceDispatch *d;
|
|
|
|
|
hwaddr l;
|
|
|
|
|
MemoryRegion *mr;
|
memory: clamp cached translation in case it points to an MMIO region
In using the address_space_translate_internal API, address_space_cache_init
forgot one piece of advice that can be found in the code for
address_space_translate_internal:
/* MMIO registers can be expected to perform full-width accesses based only
* on their address, without considering adjacent registers that could
* decode to completely different MemoryRegions. When such registers
* exist (e.g. I/O ports 0xcf8 and 0xcf9 on most PC chipsets), MMIO
* regions overlap wildly. For this reason we cannot clamp the accesses
* here.
*
* If the length is small (as is the case for address_space_ldl/stl),
* everything works fine. If the incoming length is large, however,
* the caller really has to do the clamping through memory_access_size.
*/
address_space_cache_init is exactly one such case where "the incoming length
is large", therefore we need to clamp the resulting length---not to
memory_access_size though, since we are not doing an access yet, but to
the size of the resulting section. This ensures that subsequent accesses
to the cached MemoryRegionSection will be in range.
With this patch, the enclosed testcase notices that the used ring does
not fit into the MSI-X table and prints a "qemu-system-x86_64: Cannot map used"
error.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2020-12-01 09:29:56 -05:00
|
|
|
Int128 diff;
|
2018-03-18 18:26:36 +01:00
|
|
|
|
|
|
|
|
assert(len > 0);
|
|
|
|
|
|
|
|
|
|
l = len;
|
|
|
|
|
cache->fv = address_space_get_flatview(as);
|
|
|
|
|
d = flatview_to_dispatch(cache->fv);
|
|
|
|
|
cache->mrs = *address_space_translate_internal(d, addr, &cache->xlat, &l, true);
|
|
|
|
|
|
memory: clamp cached translation in case it points to an MMIO region
In using the address_space_translate_internal API, address_space_cache_init
forgot one piece of advice that can be found in the code for
address_space_translate_internal:
/* MMIO registers can be expected to perform full-width accesses based only
* on their address, without considering adjacent registers that could
* decode to completely different MemoryRegions. When such registers
* exist (e.g. I/O ports 0xcf8 and 0xcf9 on most PC chipsets), MMIO
* regions overlap wildly. For this reason we cannot clamp the accesses
* here.
*
* If the length is small (as is the case for address_space_ldl/stl),
* everything works fine. If the incoming length is large, however,
* the caller really has to do the clamping through memory_access_size.
*/
address_space_cache_init is exactly one such case where "the incoming length
is large", therefore we need to clamp the resulting length---not to
memory_access_size though, since we are not doing an access yet, but to
the size of the resulting section. This ensures that subsequent accesses
to the cached MemoryRegionSection will be in range.
With this patch, the enclosed testcase notices that the used ring does
not fit into the MSI-X table and prints a "qemu-system-x86_64: Cannot map used"
error.
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2020-12-01 09:29:56 -05:00
|
|
|
/*
|
|
|
|
|
* cache->xlat is now relative to cache->mrs.mr, not to the section itself.
|
|
|
|
|
* Take that into account to compute how many bytes are there between
|
|
|
|
|
* cache->xlat and the end of the section.
|
|
|
|
|
*/
|
|
|
|
|
diff = int128_sub(cache->mrs.size,
|
|
|
|
|
int128_make64(cache->xlat - cache->mrs.offset_within_region));
|
|
|
|
|
l = int128_get64(int128_min(diff, int128_make64(l)));
|
|
|
|
|
|
2018-03-18 18:26:36 +01:00
|
|
|
mr = cache->mrs.mr;
|
|
|
|
|
memory_region_ref(mr);
|
2025-02-10 09:46:46 +01:00
|
|
|
if (memory_access_is_direct(mr, is_write, MEMTXATTRS_UNSPECIFIED)) {
|
2018-05-31 14:50:52 +01:00
|
|
|
/* We don't care about the memory attributes here as we're only
|
|
|
|
|
* doing this if we found actual RAM, which behaves the same
|
|
|
|
|
* regardless of attributes; so UNSPECIFIED is fine.
|
|
|
|
|
*/
|
2018-03-18 18:26:36 +01:00
|
|
|
l = flatview_extend_translation(cache->fv, addr, len, mr,
|
2018-05-31 14:50:52 +01:00
|
|
|
cache->xlat, l, is_write,
|
|
|
|
|
MEMTXATTRS_UNSPECIFIED);
|
2024-04-30 18:49:35 +02:00
|
|
|
cache->ptr = qemu_ram_ptr_length(mr->ram_block, cache->xlat, &l, true,
|
|
|
|
|
is_write);
|
2018-03-18 18:26:36 +01:00
|
|
|
} else {
|
|
|
|
|
cache->ptr = NULL;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
cache->len = l;
|
|
|
|
|
cache->is_write = is_write;
|
|
|
|
|
return l;
|
2016-11-22 12:04:52 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void address_space_cache_invalidate(MemoryRegionCache *cache,
|
|
|
|
|
hwaddr addr,
|
|
|
|
|
hwaddr access_len)
|
|
|
|
|
{
|
2018-03-18 18:26:36 +01:00
|
|
|
assert(cache->is_write);
|
|
|
|
|
if (likely(cache->ptr)) {
|
|
|
|
|
invalidate_and_set_dirty(cache->mrs.mr, addr + cache->xlat, access_len);
|
|
|
|
|
}
|
2016-11-22 12:04:52 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
 * Drop the references held by @cache and reset it.  A cache whose
 * memory region pointer is already NULL is left untouched, so calling
 * this twice on the same cache is harmless.
 */
void address_space_cache_destroy(MemoryRegionCache *cache)
{
    MemoryRegion *held_mr = cache->mrs.mr;

    if (held_mr == NULL) {
        return;
    }

    if (xen_enabled()) {
        xen_invalidate_map_cache_entry(cache->ptr);
    }
    memory_region_unref(held_mr);
    flatview_unref(cache->fv);

    cache->mrs.mr = NULL;
    cache->fv = NULL;
}
|
|
|
|
|
|
|
|
|
|
/* Called from RCU critical section. This function has the same
|
|
|
|
|
* semantics as address_space_translate, but it only works on a
|
|
|
|
|
* predefined range of a MemoryRegion that was mapped with
|
|
|
|
|
* address_space_cache_init.
|
|
|
|
|
*/
|
|
|
|
|
static inline MemoryRegion *address_space_translate_cached(
|
|
|
|
|
MemoryRegionCache *cache, hwaddr addr, hwaddr *xlat,
|
2018-05-31 14:50:52 +01:00
|
|
|
hwaddr *plen, bool is_write, MemTxAttrs attrs)
|
2018-03-18 18:26:36 +01:00
|
|
|
{
|
|
|
|
|
MemoryRegionSection section;
|
|
|
|
|
MemoryRegion *mr;
|
|
|
|
|
IOMMUMemoryRegion *iommu_mr;
|
|
|
|
|
AddressSpace *target_as;
|
|
|
|
|
|
|
|
|
|
assert(!cache->ptr);
|
|
|
|
|
*xlat = addr + cache->xlat;
|
|
|
|
|
|
|
|
|
|
mr = cache->mrs.mr;
|
|
|
|
|
iommu_mr = memory_region_get_iommu(mr);
|
|
|
|
|
if (!iommu_mr) {
|
|
|
|
|
/* MMIO region. */
|
|
|
|
|
return mr;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
section = address_space_translate_iommu(iommu_mr, xlat, plen,
|
|
|
|
|
NULL, is_write, true,
|
2018-05-31 14:50:53 +01:00
|
|
|
&target_as, attrs);
|
2018-03-18 18:26:36 +01:00
|
|
|
return section.mr;
|
|
|
|
|
}
|
|
|
|
|
|
2024-03-07 15:37:10 +00:00
|
|
|
/* Called within RCU critical section. */
|
|
|
|
|
static MemTxResult address_space_write_continue_cached(MemTxAttrs attrs,
|
|
|
|
|
const void *ptr,
|
|
|
|
|
hwaddr len,
|
|
|
|
|
hwaddr mr_addr,
|
|
|
|
|
hwaddr l,
|
|
|
|
|
MemoryRegion *mr)
|
|
|
|
|
{
|
|
|
|
|
MemTxResult result = MEMTX_OK;
|
|
|
|
|
const uint8_t *buf = ptr;
|
|
|
|
|
|
|
|
|
|
for (;;) {
|
|
|
|
|
result |= flatview_write_continue_step(attrs, buf, len, mr_addr, &l,
|
|
|
|
|
mr);
|
|
|
|
|
|
|
|
|
|
len -= l;
|
|
|
|
|
buf += l;
|
|
|
|
|
mr_addr += l;
|
|
|
|
|
|
|
|
|
|
if (!len) {
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
l = len;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return result;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Called within RCU critical section. */
|
|
|
|
|
static MemTxResult address_space_read_continue_cached(MemTxAttrs attrs,
|
|
|
|
|
void *ptr, hwaddr len,
|
|
|
|
|
hwaddr mr_addr, hwaddr l,
|
|
|
|
|
MemoryRegion *mr)
|
|
|
|
|
{
|
|
|
|
|
MemTxResult result = MEMTX_OK;
|
|
|
|
|
uint8_t *buf = ptr;
|
|
|
|
|
|
|
|
|
|
for (;;) {
|
|
|
|
|
result |= flatview_read_continue_step(attrs, buf, len, mr_addr, &l, mr);
|
|
|
|
|
len -= l;
|
|
|
|
|
buf += l;
|
|
|
|
|
mr_addr += l;
|
|
|
|
|
|
|
|
|
|
if (!len) {
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
l = len;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return result;
|
|
|
|
|
}
|
|
|
|
|
|
2018-03-18 18:26:36 +01:00
|
|
|
/* Called from RCU critical section. address_space_read_cached uses this
|
|
|
|
|
* out of line function when the target is an MMIO or IOMMU region.
|
|
|
|
|
*/
|
2020-05-18 17:53:02 +02:00
|
|
|
MemTxResult
|
2018-03-18 18:26:36 +01:00
|
|
|
address_space_read_cached_slow(MemoryRegionCache *cache, hwaddr addr,
|
2019-01-17 20:49:01 +08:00
|
|
|
void *buf, hwaddr len)
|
2018-03-18 18:26:36 +01:00
|
|
|
{
|
2024-03-07 15:37:07 +00:00
|
|
|
hwaddr mr_addr, l;
|
2018-03-18 18:26:36 +01:00
|
|
|
MemoryRegion *mr;
|
|
|
|
|
|
|
|
|
|
l = len;
|
2024-03-07 15:37:07 +00:00
|
|
|
mr = address_space_translate_cached(cache, addr, &mr_addr, &l, false,
|
2018-05-31 14:50:52 +01:00
|
|
|
MEMTXATTRS_UNSPECIFIED);
|
2024-03-07 15:37:10 +00:00
|
|
|
return address_space_read_continue_cached(MEMTXATTRS_UNSPECIFIED,
|
|
|
|
|
buf, len, mr_addr, l, mr);
|
2018-03-18 18:26:36 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Called from RCU critical section. address_space_write_cached uses this
|
|
|
|
|
* out of line function when the target is an MMIO or IOMMU region.
|
|
|
|
|
*/
|
2020-05-18 17:53:02 +02:00
|
|
|
MemTxResult
|
2018-03-18 18:26:36 +01:00
|
|
|
address_space_write_cached_slow(MemoryRegionCache *cache, hwaddr addr,
|
2019-01-17 20:49:01 +08:00
|
|
|
const void *buf, hwaddr len)
|
2018-03-18 18:26:36 +01:00
|
|
|
{
|
2024-03-07 15:37:07 +00:00
|
|
|
hwaddr mr_addr, l;
|
2018-03-18 18:26:36 +01:00
|
|
|
MemoryRegion *mr;
|
|
|
|
|
|
|
|
|
|
l = len;
|
2024-03-07 15:37:07 +00:00
|
|
|
mr = address_space_translate_cached(cache, addr, &mr_addr, &l, true,
|
2018-05-31 14:50:52 +01:00
|
|
|
MEMTXATTRS_UNSPECIFIED);
|
2024-03-07 15:37:10 +00:00
|
|
|
return address_space_write_continue_cached(MEMTXATTRS_UNSPECIFIED,
|
|
|
|
|
buf, len, mr_addr, l, mr);
|
2016-11-22 12:04:52 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
 * Parameterise and include the memory_ldst.c.inc template a second time,
 * now for a MemoryRegionCache argument: functions get the _cached_slow
 * suffix, translation goes through address_space_translate_cached(), and
 * the RCU lock/unlock hooks expand to no-ops.
 */
#define ARG1_DECL MemoryRegionCache *cache
#define ARG1 cache
#define SUFFIX _cached_slow
#define TRANSLATE(...) address_space_translate_cached(cache, __VA_ARGS__)
#define RCU_READ_LOCK() ((void)0)
#define RCU_READ_UNLOCK() ((void)0)
#include "memory_ldst.c.inc"
|
2016-11-22 12:04:52 +01:00
|
|
|
|
2009-03-28 17:51:36 +00:00
|
|
|
/* virtual memory access for debug (includes writing to ROM) */
|
2022-02-03 02:13:28 +01:00
|
|
|
int cpu_memory_rw_debug(CPUState *cpu, vaddr addr,
|
|
|
|
|
void *ptr, size_t len, bool is_write)
|
2004-01-24 15:23:36 +00:00
|
|
|
{
|
2012-10-23 12:30:10 +02:00
|
|
|
hwaddr phys_addr;
|
2022-02-03 02:13:28 +01:00
|
|
|
vaddr l, page;
|
2020-02-19 20:02:11 +01:00
|
|
|
uint8_t *buf = ptr;
|
2004-01-24 15:23:36 +00:00
|
|
|
|
2017-03-07 15:19:08 +01:00
|
|
|
cpu_synchronize_state(cpu);
|
2004-01-24 15:23:36 +00:00
|
|
|
while (len > 0) {
|
2016-01-21 14:15:06 +00:00
|
|
|
int asidx;
|
|
|
|
|
MemTxAttrs attrs;
|
2020-05-18 17:53:03 +02:00
|
|
|
MemTxResult res;
|
2016-01-21 14:15:06 +00:00
|
|
|
|
2004-01-24 15:23:36 +00:00
|
|
|
page = addr & TARGET_PAGE_MASK;
|
2016-01-21 14:15:06 +00:00
|
|
|
phys_addr = cpu_get_phys_page_attrs_debug(cpu, page, &attrs);
|
|
|
|
|
asidx = cpu_asidx_from_attrs(cpu, attrs);
|
2004-01-24 15:23:36 +00:00
|
|
|
/* if no physical page mapped, return an error */
|
|
|
|
|
if (phys_addr == -1)
|
|
|
|
|
return -1;
|
|
|
|
|
l = (page + TARGET_PAGE_SIZE) - addr;
|
|
|
|
|
if (l > len)
|
|
|
|
|
l = len;
|
2009-03-28 17:51:36 +00:00
|
|
|
phys_addr += (addr & ~TARGET_PAGE_MASK);
|
2025-02-10 09:46:48 +01:00
|
|
|
res = address_space_rw(cpu->cpu_ases[asidx].as, phys_addr, attrs, buf,
|
|
|
|
|
l, is_write);
|
2020-05-18 17:53:03 +02:00
|
|
|
if (res != MEMTX_OK) {
|
|
|
|
|
return -1;
|
2013-12-13 16:31:02 +10:00
|
|
|
}
|
2004-01-24 15:23:36 +00:00
|
|
|
len -= l;
|
|
|
|
|
buf += l;
|
|
|
|
|
addr += l;
|
|
|
|
|
}
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
2015-11-05 18:10:29 +00:00
|
|
|
|
2015-05-21 13:24:13 +01:00
|
|
|
/*
 * Call @func(block, @opaque) for each RAM block while holding the RCU
 * read lock.  Iteration stops at the first non-zero result, which is
 * returned; 0 is returned if every call returned 0.
 */
int qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque)
{
    RAMBlock *block;

    RCU_READ_LOCK_GUARD();
    RAMBLOCK_FOREACH(block) {
        int rc = func(block, opaque);

        if (rc != 0) {
            return rc;
        }
    }
    return 0;
}
|
2017-02-24 18:28:32 +00:00
|
|
|
|
|
|
|
|
/*
|
2025-09-29 16:18:18 +02:00
|
|
|
* Unmap pages of memory from offset to offset+length such that
|
2017-02-24 18:28:32 +00:00
|
|
|
* they a) read as 0, b) Trigger whatever fault mechanism
|
|
|
|
|
* the OS provides for postcopy.
|
|
|
|
|
* The pages must be unmapped by the end of the function.
|
|
|
|
|
* Returns: 0 on success, none-0 on failure
|
|
|
|
|
*
|
|
|
|
|
*/
|
2025-09-29 16:18:18 +02:00
|
|
|
int ram_block_discard_range(RAMBlock *rb, uint64_t offset, size_t length)
|
2017-02-24 18:28:32 +00:00
|
|
|
{
|
|
|
|
|
int ret = -1;
|
|
|
|
|
|
2025-09-29 16:18:18 +02:00
|
|
|
uint8_t *host_startaddr = rb->host + offset;
|
2017-02-24 18:28:32 +00:00
|
|
|
|
2020-01-03 11:39:58 +04:00
|
|
|
if (!QEMU_PTR_IS_ALIGNED(host_startaddr, rb->page_size)) {
|
2024-01-24 21:33:28 -05:00
|
|
|
error_report("%s: Unaligned start address: %p",
|
|
|
|
|
__func__, host_startaddr);
|
2017-02-24 18:28:32 +00:00
|
|
|
goto err;
|
|
|
|
|
}
|
|
|
|
|
|
2025-09-29 16:18:18 +02:00
|
|
|
if ((offset + length) <= rb->max_length) {
|
2018-03-12 17:20:56 +00:00
|
|
|
bool need_madvise, need_fallocate;
|
2020-01-03 11:39:58 +04:00
|
|
|
if (!QEMU_IS_ALIGNED(length, rb->page_size)) {
|
2024-01-24 21:33:28 -05:00
|
|
|
error_report("%s: Unaligned length: %zx", __func__, length);
|
2017-02-24 18:28:32 +00:00
|
|
|
goto err;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
errno = ENOTSUP; /* If we are missing MADVISE etc */
|
|
|
|
|
|
2018-03-12 17:20:56 +00:00
|
|
|
/* The logic here is messy;
|
|
|
|
|
* madvise DONTNEED fails for hugepages
|
|
|
|
|
* fallocate works on hugepages and shmem
|
softmmu/physmem: Fix ram_block_discard_range() to handle shared anonymous memory
We can create shared anonymous memory via
"-object memory-backend-ram,share=on,..."
which is, for example, required by PVRDMA for mremap() to work.
Shared anonymous memory is weird, though. Instead of MADV_DONTNEED, we
have to use MADV_REMOVE: MADV_DONTNEED will only remove / zap all
relevant page table entries of the current process, the backend storage
will not get removed, resulting in no reduced memory consumption and
a repopulation of previous content on next access.
Shared anonymous memory is internally really just shmem, but without a
fd exposed. As we cannot use fallocate() without the fd to discard the
backing storage, MADV_REMOVE gets the same job done without a fd as
documented in "man 2 madvise". Removing backing storage implicitly
invalidates all page table entries with relevant mappings - an additional
MADV_DONTNEED is not required.
Fixes: 06329ccecfa0 ("mem: add share parameter to memory-backend-ram")
Reviewed-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20210406080126.24010-3-david@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2021-04-06 10:01:25 +02:00
|
|
|
* shared anonymous memory requires madvise REMOVE
|
2018-03-12 17:20:56 +00:00
|
|
|
*/
|
2024-01-02 12:57:48 +11:00
|
|
|
need_madvise = (rb->page_size == qemu_real_host_page_size());
|
2018-03-12 17:20:56 +00:00
|
|
|
need_fallocate = rb->fd != -1;
|
|
|
|
|
if (need_fallocate) {
|
|
|
|
|
/* For a file, this causes the area of the file to be zero'd
|
|
|
|
|
* if read, and for hugetlbfs also causes it to be unmapped
|
|
|
|
|
* so a userfault will trigger.
|
2017-02-24 18:28:33 +00:00
|
|
|
*/
|
|
|
|
|
#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
|
2023-09-06 14:04:57 +02:00
|
|
|
/*
|
|
|
|
|
* fallocate() will fail with readonly files. Let's print a
|
|
|
|
|
* proper error message.
|
|
|
|
|
*/
|
|
|
|
|
if (rb->flags & RAM_READONLY_FD) {
|
2024-01-24 21:33:28 -05:00
|
|
|
error_report("%s: Discarding RAM with readonly files is not"
|
|
|
|
|
" supported", __func__);
|
2023-09-06 14:04:57 +02:00
|
|
|
goto err;
|
|
|
|
|
|
|
|
|
|
}
|
2023-07-06 09:56:06 +02:00
|
|
|
/*
|
|
|
|
|
* We'll discard data from the actual file, even though we only
|
|
|
|
|
* have a MAP_PRIVATE mapping, possibly messing with other
|
|
|
|
|
* MAP_PRIVATE/MAP_SHARED mappings. There is no easy way to
|
|
|
|
|
* change that behavior whithout violating the promised
|
|
|
|
|
* semantics of ram_block_discard_range().
|
|
|
|
|
*
|
|
|
|
|
* Only warn, because it works as long as nobody else uses that
|
|
|
|
|
* file.
|
|
|
|
|
*/
|
|
|
|
|
if (!qemu_ram_is_shared(rb)) {
|
2024-01-24 21:33:28 -05:00
|
|
|
warn_report_once("%s: Discarding RAM"
|
2023-07-06 09:56:06 +02:00
|
|
|
" in private file mappings is possibly"
|
|
|
|
|
" dangerous, because it will modify the"
|
|
|
|
|
" underlying file and will affect other"
|
2024-01-24 21:33:28 -05:00
|
|
|
" users of the file", __func__);
|
2023-07-06 09:56:06 +02:00
|
|
|
}
|
|
|
|
|
|
2017-02-24 18:28:33 +00:00
|
|
|
ret = fallocate(rb->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
|
2025-09-29 16:18:18 +02:00
|
|
|
offset + rb->fd_offset, length);
|
2018-03-12 17:20:56 +00:00
|
|
|
if (ret) {
|
|
|
|
|
ret = -errno;
|
2025-01-22 19:40:53 +00:00
|
|
|
error_report("%s: Failed to fallocate %s:%" PRIx64 "+%" PRIx64
|
2025-09-29 16:18:18 +02:00
|
|
|
" +%zx (%d)", __func__, rb->idstr, offset,
|
2025-01-22 19:40:53 +00:00
|
|
|
rb->fd_offset, length, ret);
|
2018-03-12 17:20:56 +00:00
|
|
|
goto err;
|
|
|
|
|
}
|
|
|
|
|
#else
|
|
|
|
|
ret = -ENOSYS;
|
2024-01-24 21:33:28 -05:00
|
|
|
error_report("%s: fallocate not available/file"
|
2025-01-22 19:40:53 +00:00
|
|
|
"%s:%" PRIx64 "+%" PRIx64 " +%zx (%d)", __func__,
|
2025-09-29 16:18:18 +02:00
|
|
|
rb->idstr, offset, rb->fd_offset, length, ret);
|
2018-03-12 17:20:56 +00:00
|
|
|
goto err;
|
2017-02-24 18:28:33 +00:00
|
|
|
#endif
|
|
|
|
|
}
|
2018-03-12 17:20:56 +00:00
|
|
|
if (need_madvise) {
|
|
|
|
|
/* For normal RAM this causes it to be unmapped,
|
|
|
|
|
* for shared memory it causes the local mapping to disappear
|
|
|
|
|
* and to fall back on the file contents (which we just
|
|
|
|
|
* fallocate'd away).
|
|
|
|
|
*/
|
|
|
|
|
#if defined(CONFIG_MADVISE)
|
softmmu/physmem: Fix ram_block_discard_range() to handle shared anonymous memory
We can create shared anonymous memory via
"-object memory-backend-ram,share=on,..."
which is, for example, required by PVRDMA for mremap() to work.
Shared anonymous memory is weird, though. Instead of MADV_DONTNEED, we
have to use MADV_REMOVE: MADV_DONTNEED will only remove / zap all
relevant page table entries of the current process, the backend storage
will not get removed, resulting in no reduced memory consumption and
a repopulation of previous content on next access.
Shared anonymous memory is internally really just shmem, but without a
fd exposed. As we cannot use fallocate() without the fd to discard the
backing storage, MADV_REMOVE gets the same job done without a fd as
documented in "man 2 madvise". Removing backing storage implicitly
invalidates all page table entries with relevant mappings - an additional
MADV_DONTNEED is not required.
Fixes: 06329ccecfa0 ("mem: add share parameter to memory-backend-ram")
Reviewed-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20210406080126.24010-3-david@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2021-04-06 10:01:25 +02:00
|
|
|
if (qemu_ram_is_shared(rb) && rb->fd < 0) {
|
|
|
|
|
ret = madvise(host_startaddr, length, QEMU_MADV_REMOVE);
|
|
|
|
|
} else {
|
|
|
|
|
ret = madvise(host_startaddr, length, QEMU_MADV_DONTNEED);
|
|
|
|
|
}
|
2018-03-12 17:20:56 +00:00
|
|
|
if (ret) {
|
|
|
|
|
ret = -errno;
|
2024-01-24 21:33:28 -05:00
|
|
|
error_report("%s: Failed to discard range "
|
2018-03-12 17:20:56 +00:00
|
|
|
"%s:%" PRIx64 " +%zx (%d)",
|
2025-09-29 16:18:18 +02:00
|
|
|
__func__, rb->idstr, offset, length, ret);
|
2018-03-12 17:20:56 +00:00
|
|
|
goto err;
|
|
|
|
|
}
|
|
|
|
|
#else
|
|
|
|
|
ret = -ENOSYS;
|
2024-01-24 21:33:28 -05:00
|
|
|
error_report("%s: MADVISE not available %s:%" PRIx64 " +%zx (%d)",
|
2025-09-29 16:18:18 +02:00
|
|
|
__func__, rb->idstr, offset, length, ret);
|
2018-03-12 17:20:56 +00:00
|
|
|
goto err;
|
|
|
|
|
#endif
|
2017-02-24 18:28:32 +00:00
|
|
|
}
|
2018-03-12 17:20:56 +00:00
|
|
|
trace_ram_block_discard_range(rb->idstr, host_startaddr, length,
|
|
|
|
|
need_madvise, need_fallocate, ret);
|
2017-02-24 18:28:32 +00:00
|
|
|
} else {
|
2024-01-24 21:33:28 -05:00
|
|
|
error_report("%s: Overrun block '%s' (%" PRIu64 "/%zx/" RAM_ADDR_FMT")",
|
2025-09-29 16:18:18 +02:00
|
|
|
__func__, rb->idstr, offset, length, rb->max_length);
|
2017-02-24 18:28:32 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
err:
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
2025-09-29 16:18:18 +02:00
|
|
|
/*
 * Punch a hole of @length bytes at @offset into the guest_memfd of @rb.
 *
 * Returns 0 on success, -errno if fallocate() fails, and -ENOSYS when
 * hole punching is not compiled in.  Failures are reported through
 * error_report().
 */
int ram_block_discard_guest_memfd_range(RAMBlock *rb, uint64_t offset,
                                        size_t length)
{
    int rc;

#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
    const int punch_mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;

    /* ignore fd_offset with guest_memfd */
    rc = fallocate(rb->guest_memfd, punch_mode, offset, length);
    if (rc != 0) {
        rc = -errno;
        error_report("%s: Failed to fallocate %s:%" PRIx64 " +%zx (%d)",
                     __func__, rb->idstr, offset, length, rc);
    }
#else
    rc = -ENOSYS;
    error_report("%s: fallocate not available %s:%" PRIx64 " +%zx (%d)",
                 __func__, rb->idstr, offset, length, rc);
#endif

    return rc;
}
|
|
|
|
|
|
2025-09-29 17:12:12 +02:00
|
|
|
/* Report whether the RAM_PMEM flag is set on @rb. */
bool ram_block_is_pmem(RAMBlock *rb)
{
    return (rb->flags & RAM_PMEM) != 0;
}
|
|
|
|
|
|
2019-04-17 21:17:56 +02:00
|
|
|
static void mtree_print_phys_entries(int start, int end, int skip, int ptr)
|
2017-09-21 18:51:06 +10:00
|
|
|
{
|
|
|
|
|
if (start == end - 1) {
|
2019-04-17 21:17:56 +02:00
|
|
|
qemu_printf("\t%3d ", start);
|
2017-09-21 18:51:06 +10:00
|
|
|
} else {
|
2019-04-17 21:17:56 +02:00
|
|
|
qemu_printf("\t%3d..%-3d ", start, end - 1);
|
2017-09-21 18:51:06 +10:00
|
|
|
}
|
2019-04-17 21:17:56 +02:00
|
|
|
qemu_printf(" skip=%d ", skip);
|
2017-09-21 18:51:06 +10:00
|
|
|
if (ptr == PHYS_MAP_NODE_NIL) {
|
2019-04-17 21:17:56 +02:00
|
|
|
qemu_printf(" ptr=NIL");
|
2017-09-21 18:51:06 +10:00
|
|
|
} else if (!skip) {
|
2019-04-17 21:17:56 +02:00
|
|
|
qemu_printf(" ptr=#%d", ptr);
|
2017-09-21 18:51:06 +10:00
|
|
|
} else {
|
2019-04-17 21:17:56 +02:00
|
|
|
qemu_printf(" ptr=[%d]", ptr);
|
2017-09-21 18:51:06 +10:00
|
|
|
}
|
2019-04-17 21:17:56 +02:00
|
|
|
qemu_printf("\n");
|
2017-09-21 18:51:06 +10:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
 * Evaluates to (size - 1) as a hwaddr for a non-zero Int128 size and to 0
 * for a zero size.  Adding the result to a start offset gives the last
 * address covered, which is how mtree_print_dispatch() uses it.
 */
#define MR_SIZE(size) (int128_nz(size) ? (hwaddr)int128_get64( \
                           int128_sub((size), int128_one())) : 0)
|
|
|
|
|
|
2019-04-17 21:17:56 +02:00
|
|
|
/*
 * Dump the dispatch structures of @d through qemu_printf(): first every
 * physical section (address range, region name and markers such as
 * [ROOT], [MRU] and [iommu]), then every node of the physical page map
 * with runs of identical entries collapsed into a single line.
 * @root is only used to tag sections that belong to the root region.
 */
void mtree_print_dispatch(AddressSpaceDispatch *d, MemoryRegion *root)
{
    /* Labels for the first few section indexes. */
    const char *names[] = { " [unassigned]", " [not dirty]",
                            " [ROM]", " [watch]" };
    int i;

    qemu_printf(" Dispatch\n");
    qemu_printf(" Physical sections\n");

    for (i = 0; i < d->map.sections_nb; ++i) {
        MemoryRegionSection *s = d->map.sections + i;
        const char *mr_name = s->mr->name ? s->mr->name : "(noname)";
        const char *index_label = i < ARRAY_SIZE(names) ? names[i] : "";

        qemu_printf(" #%d @" HWADDR_FMT_plx ".." HWADDR_FMT_plx
                    " %s%s%s%s%s",
                    i,
                    s->offset_within_address_space,
                    s->offset_within_address_space + MR_SIZE(s->size),
                    mr_name,
                    index_label,
                    s->mr == root ? " [ROOT]" : "",
                    s == d->mru_section ? " [MRU]" : "",
                    s->mr->is_iommu ? " [iommu]" : "");

        if (s->mr->alias) {
            qemu_printf(" alias=%s", s->mr->alias->name ?
                        s->mr->alias->name : "noname");
        }
        qemu_printf("\n");
    }

    qemu_printf(" Nodes (%d bits per level, %d levels) ptr=[%d] skip=%d\n",
                P_L2_BITS, P_L2_LEVELS, d->phys_map.ptr, d->phys_map.skip);
    for (i = 0; i < d->map.nodes_nb; ++i) {
        Node *n = d->map.nodes + i;
        PhysPageEntry prev;
        int run_start = 0;
        int j;

        qemu_printf(" [%d]\n", i);

        prev = *n[0];
        for (j = 0; j < ARRAY_SIZE(*n); ++j) {
            PhysPageEntry *pe = *n + j;

            /* Flush the current run whenever an entry differs from it. */
            if (pe->ptr != prev.ptr || pe->skip != prev.skip) {
                mtree_print_phys_entries(run_start, j, prev.skip, prev.ptr);
                run_start = j;
                prev = *pe;
            }
        }

        /* Emit the run that was still open when the node ended. */
        if (run_start != ARRAY_SIZE(*n)) {
            mtree_print_phys_entries(run_start, j, prev.skip, prev.ptr);
        }
    }
}
|
|
|
|
|
|
2021-04-13 11:55:29 +02:00
|
|
|
/*
 * Bookkeeping for the ram_block_discard_*() / ram_block_*_discard_*()
 * helpers below.  The "required" and "disabled" counters exclude each
 * other: a request to raise a counter of one kind fails with -EBUSY
 * while a conflicting counter of the other kind is non-zero.
 */
/* Require any discards to work. */
static unsigned int ram_block_discard_required_cnt;
/* Require only coordinated discards to work. */
static unsigned int ram_block_coordinated_discard_required_cnt;
/* Disable any discards. */
static unsigned int ram_block_discard_disabled_cnt;
/* Disable only uncoordinated discards. */
static unsigned int ram_block_uncoordinated_discard_disabled_cnt;
/* Held while any of the four counters above is updated. */
static QemuMutex ram_block_discard_disable_mutex;
|
|
|
|
|
|
|
|
|
|
static void ram_block_discard_disable_mutex_lock(void)
|
|
|
|
|
{
|
|
|
|
|
static gsize initialized;
|
|
|
|
|
|
|
|
|
|
if (g_once_init_enter(&initialized)) {
|
|
|
|
|
qemu_mutex_init(&ram_block_discard_disable_mutex);
|
|
|
|
|
g_once_init_leave(&initialized, 1);
|
|
|
|
|
}
|
|
|
|
|
qemu_mutex_lock(&ram_block_discard_disable_mutex);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void ram_block_discard_disable_mutex_unlock(void)
|
|
|
|
|
{
|
|
|
|
|
qemu_mutex_unlock(&ram_block_discard_disable_mutex);
|
|
|
|
|
}
|
2020-06-26 09:22:29 +02:00
|
|
|
|
|
|
|
|
int ram_block_discard_disable(bool state)
|
|
|
|
|
{
|
2021-04-13 11:55:28 +02:00
|
|
|
int ret = 0;
|
2020-06-26 09:22:29 +02:00
|
|
|
|
2021-04-13 11:55:28 +02:00
|
|
|
ram_block_discard_disable_mutex_lock();
|
2020-06-26 09:22:29 +02:00
|
|
|
if (!state) {
|
2021-04-13 11:55:28 +02:00
|
|
|
ram_block_discard_disabled_cnt--;
|
2021-04-13 11:55:29 +02:00
|
|
|
} else if (ram_block_discard_required_cnt ||
|
|
|
|
|
ram_block_coordinated_discard_required_cnt) {
|
|
|
|
|
ret = -EBUSY;
|
2021-04-13 11:55:28 +02:00
|
|
|
} else {
|
2021-04-13 11:55:29 +02:00
|
|
|
ram_block_discard_disabled_cnt++;
|
|
|
|
|
}
|
|
|
|
|
ram_block_discard_disable_mutex_unlock();
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
int ram_block_uncoordinated_discard_disable(bool state)
|
|
|
|
|
{
|
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
|
|
ram_block_discard_disable_mutex_lock();
|
|
|
|
|
if (!state) {
|
|
|
|
|
ram_block_uncoordinated_discard_disabled_cnt--;
|
|
|
|
|
} else if (ram_block_discard_required_cnt) {
|
2021-04-13 11:55:28 +02:00
|
|
|
ret = -EBUSY;
|
2021-04-13 11:55:29 +02:00
|
|
|
} else {
|
|
|
|
|
ram_block_uncoordinated_discard_disabled_cnt++;
|
2020-06-26 09:22:29 +02:00
|
|
|
}
|
2021-04-13 11:55:28 +02:00
|
|
|
ram_block_discard_disable_mutex_unlock();
|
|
|
|
|
return ret;
|
2020-06-26 09:22:29 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
int ram_block_discard_require(bool state)
|
|
|
|
|
{
|
2021-04-13 11:55:28 +02:00
|
|
|
int ret = 0;
|
2020-06-26 09:22:29 +02:00
|
|
|
|
2021-04-13 11:55:28 +02:00
|
|
|
ram_block_discard_disable_mutex_lock();
|
2020-06-26 09:22:29 +02:00
|
|
|
if (!state) {
|
2021-04-13 11:55:28 +02:00
|
|
|
ram_block_discard_required_cnt--;
|
2021-04-13 11:55:29 +02:00
|
|
|
} else if (ram_block_discard_disabled_cnt ||
|
|
|
|
|
ram_block_uncoordinated_discard_disabled_cnt) {
|
|
|
|
|
ret = -EBUSY;
|
2021-04-13 11:55:28 +02:00
|
|
|
} else {
|
2021-04-13 11:55:29 +02:00
|
|
|
ram_block_discard_required_cnt++;
|
|
|
|
|
}
|
|
|
|
|
ram_block_discard_disable_mutex_unlock();
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
int ram_block_coordinated_discard_require(bool state)
|
|
|
|
|
{
|
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
|
|
ram_block_discard_disable_mutex_lock();
|
|
|
|
|
if (!state) {
|
|
|
|
|
ram_block_coordinated_discard_required_cnt--;
|
|
|
|
|
} else if (ram_block_discard_disabled_cnt) {
|
2021-04-13 11:55:28 +02:00
|
|
|
ret = -EBUSY;
|
2021-04-13 11:55:29 +02:00
|
|
|
} else {
|
|
|
|
|
ram_block_coordinated_discard_required_cnt++;
|
2020-06-26 09:22:29 +02:00
|
|
|
}
|
2021-04-13 11:55:28 +02:00
|
|
|
ram_block_discard_disable_mutex_unlock();
|
|
|
|
|
return ret;
|
2020-06-26 09:22:29 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
bool ram_block_discard_is_disabled(void)
|
|
|
|
|
{
|
2021-04-13 11:55:29 +02:00
|
|
|
return qatomic_read(&ram_block_discard_disabled_cnt) ||
|
|
|
|
|
qatomic_read(&ram_block_uncoordinated_discard_disabled_cnt);
|
2020-06-26 09:22:29 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
bool ram_block_discard_is_required(void)
|
|
|
|
|
{
|
2021-04-13 11:55:29 +02:00
|
|
|
return qatomic_read(&ram_block_discard_required_cnt) ||
|
|
|
|
|
qatomic_read(&ram_block_coordinated_discard_required_cnt);
|
2020-06-26 09:22:29 +02:00
|
|
|
}
|
2025-02-27 06:48:01 -08:00
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Return true if ram is compatible with CPR. Do not exclude rom,
|
|
|
|
|
* because the rom file could change in new QEMU.
|
|
|
|
|
*/
|
|
|
|
|
static bool ram_is_cpr_compatible(RAMBlock *rb)
|
|
|
|
|
{
|
|
|
|
|
MemoryRegion *mr = rb->mr;
|
|
|
|
|
|
|
|
|
|
if (!mr || !memory_region_is_ram(mr)) {
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Ram device is remapped in new QEMU */
|
|
|
|
|
if (memory_region_is_ram_device(mr)) {
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* A file descriptor is passed to new QEMU and remapped, or its backing
|
|
|
|
|
* file is reopened and mapped. It must be shared to avoid COW.
|
|
|
|
|
*/
|
|
|
|
|
if (rb->fd >= 0 && qemu_ram_is_shared(rb)) {
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Add a blocker for each volatile ram block. This function should only be
|
|
|
|
|
* called after we know that the block is migratable. Non-migratable blocks
|
|
|
|
|
* are either re-created in new QEMU, or are handled specially, or are covered
|
|
|
|
|
* by a device-level CPR blocker.
|
|
|
|
|
*/
|
|
|
|
|
void ram_block_add_cpr_blocker(RAMBlock *rb, Error **errp)
|
|
|
|
|
{
|
|
|
|
|
assert(qemu_ram_is_migratable(rb));
|
|
|
|
|
|
|
|
|
|
if (ram_is_cpr_compatible(rb)) {
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
error_setg(&rb->cpr_blocker,
|
|
|
|
|
"Memory region %s is not compatible with CPR. share=on is "
|
|
|
|
|
"required for memory-backend objects, and aux-ram-share=on is "
|
|
|
|
|
"required.", memory_region_name(rb->mr));
|
2025-10-27 07:45:02 +01:00
|
|
|
migrate_add_blocker_modes(&rb->cpr_blocker, BIT(MIG_MODE_CPR_TRANSFER),
|
|
|
|
|
errp);
|
2025-02-27 06:48:01 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Hand the CPR blocker stored in @rb to migrate_del_blocker(). */
void ram_block_del_cpr_blocker(RAMBlock *rb)
{
    Error **blocker = &rb->cpr_blocker;

    migrate_del_blocker(blocker);
}
|