# See docs/devel/tracing.rst for syntax documentation.
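#
# Each declaration below expands into a generated trace_<event_name>() helper
# that the named source file calls with the listed arguments; the quoted
# string is the printf-style format a trace backend (e.g. "log") uses when
# the event fires.  As a minimal sketch of the mapping, a declaration such as
#     qemu_loadvm_state_section(unsigned int section_type) "%d"
# is emitted from savevm.c roughly as
#     trace_qemu_loadvm_state_section(section_type);
#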
# savevm.c
qemu_loadvm_state_section(unsigned int section_type) "%d"
qemu_loadvm_state_section_command(int ret) "%d"
qemu_loadvm_state_section_partend(uint32_t section_id) "%u"
qemu_loadvm_state_post_main(int ret) "%d"
qemu_loadvm_state_section_startfull(uint32_t section_id, const char *idstr, uint32_t instance_id, uint32_t version_id) "%u(%s) %u %u"
qemu_savevm_send_packaged(void) ""
loadvm_state_switchover_ack_needed(unsigned int switchover_ack_pending_num) "Switchover ack pending num=%u"
loadvm_state_setup(void) ""
loadvm_state_cleanup(void) ""
loadvm_handle_cmd_packaged(unsigned int length) "%u"
loadvm_handle_cmd_packaged_main(int ret) "%d"
loadvm_handle_cmd_packaged_received(int ret) "%d"
loadvm_handle_recv_bitmap(char *s) "%s"
loadvm_postcopy_handle_advise(void) ""
loadvm_postcopy_handle_listen(const char *str) "%s"
loadvm_postcopy_handle_run(void) ""
loadvm_postcopy_handle_resume(void) ""
loadvm_postcopy_ram_handle_discard(void) ""
loadvm_postcopy_ram_handle_discard_end(void) ""
loadvm_postcopy_ram_handle_discard_header(const char *ramid, uint16_t len) "%s: %u"
loadvm_process_command(const char *s, uint16_t len) "com=%s len=%d"
loadvm_process_command_ping(uint32_t val) "0x%x"
loadvm_approve_switchover(unsigned int switchover_ack_pending_num) "Switchover ack pending num=%u"
postcopy_ram_listen_thread_exit(void) ""
postcopy_ram_listen_thread_start(void) ""
qemu_savevm_send_postcopy_advise(void) ""
qemu_savevm_send_postcopy_ram_discard(const char *id, uint16_t len) "%s: %u"
savevm_command_send(uint16_t command, uint16_t len) "com=0x%x len=%d"
savevm_section_start(const char *id, unsigned int section_id) "%s, section_id %u"
savevm_section_end(const char *id, unsigned int section_id, int ret) "%s, section_id %u -> %d"
savevm_section_skip(const char *id, unsigned int section_id) "%s, section_id %u"
savevm_send_open_return_path(void) ""
savevm_send_ping(uint32_t val) "0x%x"
savevm_send_postcopy_listen(void) ""
savevm_send_postcopy_run(void) ""
savevm_send_postcopy_resume(void) ""
savevm_send_colo_enable(void) ""
savevm_send_recv_bitmap(char *name) "%s"
savevm_send_switchover_start(void) ""
savevm_state_setup(void) ""
savevm_state_resume_prepare(void) ""
savevm_state_header(void) ""
savevm_state_iterate(void) ""
savevm_state_cleanup(void) ""
vmstate_save(const char *idstr, const char *vmsd_name) "%s, %s"
vmstate_load(const char *idstr, const char *vmsd_name) "%s, %s"
vmstate_downtime_save(const char *type, const char *idstr, uint32_t instance_id, int64_t downtime) "type=%s idstr=%s instance_id=%d downtime=%"PRIi64
vmstate_downtime_load(const char *type, const char *idstr, uint32_t instance_id, int64_t downtime) "type=%s idstr=%s instance_id=%d downtime=%"PRIi64
vmstate_downtime_checkpoint(const char *checkpoint) "%s"
postcopy_pause_incoming(void) ""
postcopy_pause_incoming_continued(void) ""
postcopy_page_req_sync(void *host_addr) "sync page req %p"
# vmstate.c
vmstate_load_field_error(const char *field, int ret) "field \"%s\" load failed, ret = %d"
vmstate_load_state(const char *name, int version_id) "%s v%d"
vmstate_load_state_end(const char *name, const char *reason, int val) "%s %s/%d"
vmstate_load_state_field(const char *name, const char *field, bool exists) "%s:%s exists=%d"
vmstate_n_elems(const char *name, int n_elems) "%s: %d"
vmstate_subsection_load(const char *parent) "%s"
vmstate_subsection_load_bad(const char *parent, const char *sub, const char *sub2) "%s: %s/%s"
vmstate_subsection_load_good(const char *parent) "%s"
vmstate_save_state_pre_save_res(const char *name, int res) "%s/%d"
vmstate_save_state_loop(const char *name, const char *field, int n_elems) "%s/%s[%d]"
vmstate_save_state_top(const char *idstr) "%s"
vmstate_subsection_save_loop(const char *name, const char *sub) "%s/%s"
vmstate_subsection_save_top(const char *idstr) "%s"
vmstate_field_exists(const char *vmsd, const char *name, int field_version, int version, int result) "%s:%s field_version %d version %d result %d"
# vmstate-types.c
get_qtailq(const char *name, int version_id) "%s v%d"
get_qtailq_end(const char *name, const char *reason, int val) "%s %s/%d"
put_qtailq(const char *name, int version_id) "%s v%d"
put_qtailq_end(const char *name, const char *reason) "%s %s"
get_gtree(const char *field_name, const char *key_vmsd_name, const char *val_vmsd_name, uint32_t nnodes) "%s(%s/%s) nnodes=%d"
get_gtree_end(const char *field_name, const char *key_vmsd_name, const char *val_vmsd_name, int ret) "%s(%s/%s) %d"
put_gtree(const char *field_name, const char *key_vmsd_name, const char *val_vmsd_name, uint32_t nnodes) "%s(%s/%s) nnodes=%d"
put_gtree_end(const char *field_name, const char *key_vmsd_name, const char *val_vmsd_name, int ret) "%s(%s/%s) %d"
get_qlist(const char *field_name, const char *vmsd_name, int version_id) "%s(%s v%d)"
get_qlist_end(const char *field_name, const char *vmsd_name) "%s(%s)"
put_qlist(const char *field_name, const char *vmsd_name, int version_id) "%s(%s v%d)"
put_qlist_end(const char *field_name, const char *vmsd_name) "%s(%s)"
# qemu-file.c
qemu_file_fclose(void) ""
qemu_file_put_fd(const char *name, int fd, int ret) "ioc %s, fd %d -> status %d"
qemu_file_get_fd(const char *name, int fd) "ioc %s -> fd %d"
# ram.c
get_queued_page(const char *block_name, uint64_t tmp_offset, unsigned long page_abs) "%s/0x%" PRIx64 " page_abs=0x%lx"
get_queued_page_not_dirty(const char *block_name, uint64_t tmp_offset, unsigned long page_abs) "%s/0x%" PRIx64 " page_abs=0x%lx"
migration_bitmap_sync_start(void) ""
migration_bitmap_sync_end(uint64_t dirty_pages) "dirty_pages %" PRIu64
migration_bitmap_clear_dirty(char *str, uint64_t start, uint64_t size, unsigned long page) "rb %s start 0x%"PRIx64" size 0x%"PRIx64" page 0x%lx"
migration_throttle(void) ""
migration_dirty_limit_guest(int64_t dirtyrate) "guest dirty page rate limit %" PRIi64 " MB/s"
ram_discard_range(const char *rbname, uint64_t start, size_t len) "%s: start: 0x%" PRIx64 " len: 0x%zx"
ram_load_loop(const char *rbname, uint64_t addr, int flags, void *host) "%s: addr: 0x%" PRIx64 " flags: 0x%x host: %p"
ram_load_postcopy_loop(int channel, uint64_t addr, int flags) "chan=%d addr=0x%" PRIx64 " flags=0x%x"
ram_postcopy_send_discard_bitmap(void) ""
ram_save_page(const char *rbname, uint64_t offset, void *host) "%s: offset: 0x%" PRIx64 " host: %p"
ram_save_queue_pages(const char *rbname, size_t start, size_t len) "%s: start: 0x%zx len: 0x%zx"
ram_save_complete(uint64_t dirty_pages, int done) "dirty=%" PRIu64 ", done=%d"
ram_dirty_bitmap_request(char *str) "%s"
ram_dirty_bitmap_reload_begin(char *str) "%s"
ram_dirty_bitmap_reload_complete(char *str) "%s"
ram_dirty_bitmap_sync_start(void) ""
ram_dirty_bitmap_sync_wait(void) ""
ram_dirty_bitmap_sync_complete(void) ""
ram_state_resume_prepare(uint64_t v) "%" PRIu64
colo_flush_ram_cache_begin(uint64_t dirty_pages) "dirty_pages %" PRIu64
colo_flush_ram_cache_end(void) ""
save_xbzrle_page_skipping(void) ""
save_xbzrle_page_overflow(void) ""
ram_save_iterate_big_wait(uint64_t milliseconds, int iterations) "big wait: %" PRIu64 " milliseconds, %d iterations"
ram_load_start(void) ""
ram_load_complete(int ret, uint64_t seq_iter) "exit_code %d seq iteration %" PRIu64
ram_write_tracking_ramblock_start(const char *block_id, size_t page_size, void *addr, size_t length) "%s: page_size: %zu addr: %p length: %zu"
ram_write_tracking_ramblock_stop(const char *block_id, size_t page_size, void *addr, size_t length) "%s: page_size: %zu addr: %p length: %zu"
postcopy_preempt_triggered(char *str, unsigned long page) "during sending ramblock %s offset 0x%lx"
postcopy_preempt_restored(char *str, unsigned long page) "ramblock %s offset 0x%lx"
postcopy_preempt_hit(char *str, uint64_t offset) "ramblock %s offset 0x%"PRIx64
postcopy_preempt_send_host_page(char *str, uint64_t offset) "ramblock %s offset 0x%"PRIx64
postcopy_preempt_switch_channel(int channel) "%d"
postcopy_preempt_reset_channel(void) ""
# multifd.c
multifd_new_send_channel_async(uint8_t id) "channel %u"
multifd_new_send_channel_async_error(uint8_t id, void *err) "channel=%u err=%p"
multifd_recv_unfill(uint8_t id, uint64_t packet_num, uint32_t flags, uint32_t next_packet_size) "channel %u packet_num %" PRIu64 " flags 0x%x next packet size %u"
multifd_recv_new_channel(uint8_t id) "channel %u"
multifd_recv_sync_main(long packet_num) "packet num %ld"
multifd_recv_sync_main_signal(uint8_t id) "channel %u"
multifd_recv_sync_main_wait(uint8_t id) "iter %u"
multifd_recv_terminate_threads(bool error) "error %d"
multifd_recv_thread_end(uint8_t id, uint64_t packets) "channel %u packets %" PRIu64
multifd_recv_thread_start(uint8_t id) "%u"
multifd_send_fill(uint8_t id, uint64_t packet_num, uint32_t flags, uint32_t next_packet_size) "channel %u packet_num %" PRIu64 " flags 0x%x next packet size %u"
multifd_send_ram_fill(uint8_t id, uint32_t normal, uint32_t zero) "channel %u normal pages %u zero pages %u"
multifd_send_error(uint8_t id) "channel %u"
multifd_send_sync_main(long packet_num) "packet num %ld"
multifd_send_sync_main_signal(uint8_t id) "channel %u"
multifd_send_sync_main_wait(uint8_t id) "channel %u"
multifd_send_terminate_threads(void) ""
multifd_send_thread_end(uint8_t id, uint64_t packets) "channel %u packets %" PRIu64
multifd_send_thread_start(uint8_t id) "%u"
multifd_tls_outgoing_handshake_start(void *ioc, void *tioc) "ioc=%p tioc=%p"
multifd_tls_outgoing_handshake_error(void *ioc, const char *err) "ioc=%p err=%s"
multifd_tls_outgoing_handshake_complete(void *ioc) "ioc=%p"
multifd_set_outgoing_channel(void *ioc, const char *ioctype) "ioc=%p ioctype=%s"
# migration.c
migrate_set_state(const char *new_state) "new state %s"
migration_cleanup(void) ""
migrate_error(const char *error_desc) "error=%s"
migration_cancel(void) ""
migrate_handle_rp_req_pages(const char *rbname, size_t start, size_t len) "in %s at 0x%zx len 0x%zx"
migrate_pending_exact(uint64_t size, uint64_t pre, uint64_t post) "exact pending size %" PRIu64 " (pre=%" PRIu64 " post=%" PRIu64 ")"
migrate_pending_estimate(uint64_t size, uint64_t pre, uint64_t post) "estimate pending size %" PRIu64 " (pre=%" PRIu64 " post=%" PRIu64 ")"
migrate_send_rp_message(int msg_type, uint16_t len) "%d: len %d"
migrate_send_rp_recv_bitmap(char *name, int64_t size) "block '%s' size 0x%"PRIx64
migration_completion_file_err(void) ""
migration_completion_vm_stop(int ret) "ret %d"
migration_completion_postcopy_end(void) ""
migration_completion_postcopy_end_after_complete(void) ""
migration_rate_limit_pre(int ms) "%d ms"
migration_rate_limit_post(int urgent) "urgent: %d"
migration_return_path_end_before(void) ""
migration_return_path_end_after(void) ""
migration_thread_after_loop(void) ""
migration_thread_file_err(void) ""
migration_thread_setup_complete(void) ""
open_return_path_on_source(void) ""
open_return_path_on_source_continue(void) ""
postcopy_start(void) ""
postcopy_pause_return_path(void) ""
postcopy_pause_return_path_continued(void) ""
postcopy_pause_continued(void) ""
postcopy_start_set_run(void) ""
postcopy_page_req_add(void *addr, int count) "new page req %p total %d"
source_return_path_thread_bad_end(void) ""
source_return_path_thread_end(void) ""
source_return_path_thread_entry(void) ""
source_return_path_thread_loop_top(void) ""
source_return_path_thread_pong(uint32_t val) "0x%x"
source_return_path_thread_shut(uint32_t val) "0x%x"
source_return_path_thread_resume_ack(uint32_t v) "%"PRIu32
source_return_path_thread_switchover_acked(void) ""
source_return_path_thread_postcopy_package_loaded(void) ""
migration_thread_low_pending(uint64_t pending) "%" PRIu64
migrate_transferred(uint64_t transferred, uint64_t time_spent, uint64_t bandwidth, uint64_t avail_bw, uint64_t size) "transferred %" PRIu64 " time_spent %" PRIu64 " bandwidth %" PRIu64 " switchover_bw %" PRIu64 " max_size %" PRIu64
process_incoming_migration_co_end(int ret) "ret=%d"
process_incoming_migration_co_postcopy_end_main(void) ""
postcopy_preempt_enabled(bool value) "%d"
migration_precopy_complete(void) ""
# migration-stats.c
migration_transferred_bytes(uint64_t qemu_file, uint64_t multifd, uint64_t rdma) "qemu_file %" PRIu64 " multifd %" PRIu64 " RDMA %" PRIu64
# channel.c
migration_set_incoming_channel(void *ioc, const char *ioctype) "ioc=%p ioctype=%s"
migration_set_outgoing_channel(void *ioc, const char *ioctype) "ioc=%p ioctype=%s"
# global_state.c
migrate_state_too_big(void) ""
migrate_global_state_post_load(const char *state) "loaded state: %s"
migrate_global_state_pre_save(const char *state) "saved state: %s"
# rdma.c
qemu_rdma_accept_incoming_migration(void) ""
qemu_rdma_accept_incoming_migration_accepted(void) ""
qemu_rdma_accept_pin_state(bool pin) "%d"
qemu_rdma_accept_pin_verbsc(void *verbs) "Verbs context after listen: %p"
qemu_rdma_block_for_wrid_miss(uint64_t wcomp, uint64_t req) "Wanted wrid %" PRIu64 " but got %" PRIu64
qemu_rdma_cleanup_disconnect(void) ""
qemu_rdma_close(void) ""
qemu_rdma_connect_pin_all_requested(void) ""
qemu_rdma_connect_pin_all_outcome(bool pin) "%d"
qemu_rdma_dest_init_trying(const char *host, const char *ip) "%s => %s"
qemu_rdma_dump_id_failed(const char *who) "%s RDMA Device opened, but can't query port information"
qemu_rdma_dump_id(const char *who, const char *name, const char *dev_name, const char *dev_path, const char *ibdev_path, int transport, const char *transport_name) "%s RDMA Device opened: kernel name %s uverbs device name %s, infiniband_verbs class device path %s, infiniband class device path %s, transport: (%d) %s"
qemu_rdma_dump_gid(const char *who, const char *src, const char *dst) "%s Source GID: %s, Dest GID: %s"
qemu_rdma_exchange_get_response_start(const char *desc) "CONTROL: %s receiving..."
qemu_rdma_exchange_get_response_none(const char *desc, int type) "Surprise: got %s (%d)"
qemu_rdma_exchange_send_issue_callback(void) ""
qemu_rdma_exchange_send_waiting(const char *desc) "Waiting for response %s"
qemu_rdma_exchange_send_received(const char *desc) "Response %s received."
qemu_rdma_fill(size_t control_len, size_t size) "RDMA %zd of %zd bytes already in buffer"
qemu_rdma_init_ram_blocks(int blocks) "Allocated %d local ram block structures"
qemu_rdma_poll_recv(uint64_t comp, int64_t id, int sent) "completion %" PRIu64 " received (%" PRId64 ") left %d"
qemu_rdma_poll_write(uint64_t comp, int left, uint64_t block, uint64_t chunk, void *local, void *remote) "completions %" PRIu64 " left %d, block %" PRIu64 ", chunk: %" PRIu64 " %p %p"
qemu_rdma_poll_other(uint64_t comp, int left) "other completion %" PRIu64 " received left %d"
qemu_rdma_post_send_control(const char *desc) "CONTROL: sending %s.."
qemu_rdma_register_and_get_keys(uint64_t len, void *start) "Registering %" PRIu64 " bytes @ %p"
qemu_rdma_register_odp_mr(const char *name) "Try to register On-Demand Paging memory region: %s"
qemu_rdma_advise_mr(const char *name, uint32_t len, uint64_t addr, const char *res) "Try to advise block %s prefetch at %" PRIu32 "@0x%" PRIx64 ": %s"
qemu_rdma_resolve_host_trying(const char *host, const char *ip) "Trying %s => %s"
qemu_rdma_signal_unregister_append(uint64_t chunk, int pos) "Appending unregister chunk %" PRIu64 " at position %d"
qemu_rdma_signal_unregister_already(uint64_t chunk) "Unregister chunk %" PRIu64 " already in queue"
qemu_rdma_unregister_waiting_inflight(uint64_t chunk) "Cannot unregister inflight chunk: %" PRIu64
qemu_rdma_unregister_waiting_proc(uint64_t chunk, int pos) "Processing unregister for chunk: %" PRIu64 " at position %d"
qemu_rdma_unregister_waiting_send(uint64_t chunk) "Sending unregister for chunk: %" PRIu64
qemu_rdma_unregister_waiting_complete(uint64_t chunk) "Unregister for chunk: %" PRIu64 " complete."
qemu_rdma_write_flush(int sent) "sent total: %d"
qemu_rdma_write_one_block(int count, int block, uint64_t chunk, uint64_t current, uint64_t len, int nb_sent, int nb_chunks) "(%d) Not clobbering: block: %d chunk %" PRIu64 " current %" PRIu64 " len %" PRIu64 " %d %d"
qemu_rdma_write_one_post(uint64_t chunk, long addr, long remote, uint32_t len) "Posting chunk: %" PRIu64 ", addr: 0x%lx remote: 0x%lx, bytes %" PRIu32
qemu_rdma_write_one_queue_full(void) ""
qemu_rdma_write_one_recvregres(int mykey, int theirkey, uint64_t chunk) "Received registration result: my key: 0x%x their key 0x%x, chunk %" PRIu64
qemu_rdma_write_one_sendreg(uint64_t chunk, int len, int index, int64_t offset) "Sending registration request chunk %" PRIu64 " for %d bytes, index: %d, offset: %" PRId64
qemu_rdma_write_one_top(uint64_t chunks, uint64_t size) "Writing %" PRIu64 " chunks, (%" PRIu64 " MB)"
qemu_rdma_write_one_zero(uint64_t chunk, int len, int index, int64_t offset) "Entire chunk is zero, sending compress: %" PRIu64 " for %d bytes, index: %d, offset: %" PRId64
rdma_add_block(const char *block_name, int block, uint64_t addr, uint64_t offset, uint64_t len, uint64_t end, uint64_t bits, int chunks) "Added Block: '%s':%d, addr: %" PRIu64 ", offset: %" PRIu64 " length: %" PRIu64 " end: %" PRIu64 " bits %" PRIu64 " chunks %d"
rdma_block_notification_handle(const char *name, int index) "%s at %d"
rdma_delete_block(void *block, uint64_t addr, uint64_t offset, uint64_t len, uint64_t end, uint64_t bits, int chunks) "Deleted Block: %p, addr: %" PRIu64 ", offset: %" PRIu64 " length: %" PRIu64 " end: %" PRIu64 " bits %" PRIu64 " chunks %d"
rdma_registration_handle_compress(int64_t length, int index, int64_t offset) "Zapping zero chunk: %" PRId64 " bytes, index %d, offset %" PRId64
rdma_registration_handle_finished(void) ""
rdma_registration_handle_ram_blocks(void) ""
rdma_registration_handle_ram_blocks_loop(const char *name, uint64_t offset, uint64_t length, void *local_host_addr, unsigned int src_index) "%s: @0x%" PRIx64 "/%" PRIu64 " host:@%p src_index: %u"
rdma_registration_handle_register(int requests) "%d requests"
rdma_registration_handle_register_loop(int req, int index, uint64_t addr, uint64_t chunks) "Registration request (%d): index %d, current_addr %" PRIu64 " chunks: %" PRIu64
rdma_registration_handle_register_rkey(int rkey) "0x%x"
rdma_registration_handle_unregister(int requests) "%d requests"
rdma_registration_handle_unregister_loop(int count, int index, uint64_t chunk) "Unregistration request (%d): index %d, chunk %" PRIu64
rdma_registration_handle_unregister_success(uint64_t chunk) "%" PRIu64
rdma_registration_handle_wait(void) ""
rdma_registration_start(uint64_t flags) "%" PRIu64
rdma_registration_stop(uint64_t flags) "%" PRIu64
rdma_registration_stop_ram(void) ""
rdma_connect_incoming(void) ""
rdma_connect_incoming_after_dest_init(void) ""
rdma_connect_incoming_after_rdma_listen(void) ""
rdma_connect_outgoing_after_rdma_connect(void) ""
rdma_connect_outgoing_after_rdma_source_init(void) ""
# postcopy-ram.c
postcopy_discard_send_finish(const char *ramblock, int nwords, int ncmds) "%s mask words sent=%d in %d commands"
postcopy_discard_send_range(const char *ramblock, unsigned long start, unsigned long length) "%s:%lx/%lx"
postcopy_cleanup_range(const char *ramblock, void *host_addr, size_t offset, size_t length) "%s: %p offset=0x%zx length=0x%zx"
postcopy_init_range(const char *ramblock, void *host_addr, size_t offset, size_t length) "%s: %p offset=0x%zx length=0x%zx"
postcopy_nhp_range(const char *ramblock, void *host_addr, size_t offset, size_t length) "%s: %p offset=0x%zx length=0x%zx"
postcopy_place_page(void *host_addr) "host=%p"
postcopy_place_page_zero(void *host_addr) "host=%p"
postcopy_ram_enable_notify(void) ""
postcopy_pause_fault_thread(void) ""
postcopy_pause_fault_thread_continued(void) ""
postcopy_pause_fast_load(void) ""
postcopy_pause_fast_load_continued(void) ""
postcopy_ram_fault_thread_entry(void) ""
postcopy_ram_fault_thread_exit(void) ""
postcopy_ram_fault_thread_fds_core(int baseufd, int quitfd) "ufd: %d quitfd: %d"
postcopy_ram_fault_thread_fds_extra(size_t index, const char *name, int fd) "%zd/%s: %d"
postcopy_ram_fault_thread_quit(void) ""
postcopy_ram_fault_thread_request(uint64_t hostaddr, const char *ramblock, size_t offset, uint32_t pid) "Request for HVA=0x%" PRIx64 " rb=%s offset=0x%zx pid=%u"
postcopy_ram_incoming_cleanup_closeuf(void) ""
postcopy_ram_incoming_cleanup_entry(void) ""
postcopy_ram_incoming_cleanup_exit(void) ""
postcopy_ram_incoming_cleanup_join(void) ""
postcopy_ram_incoming_cleanup_blocktime(uint64_t total) "total blocktime %" PRIu64
postcopy_request_shared_page(const char *sharer, const char *rb, uint64_t rb_offset) "for %s in %s offset 0x%"PRIx64
postcopy_request_shared_page_present(const char *sharer, const char *rb, uint64_t rb_offset) "%s already %s offset 0x%"PRIx64
postcopy_wake_shared(uint64_t client_addr, const char *rb) "at 0x%"PRIx64" in %s"
postcopy_page_req_del(void *addr, int count) "resolved page req %p total %d"
postcopy_preempt_tls_handshake(void) ""
postcopy_preempt_new_channel(void) ""
postcopy_preempt_thread_entry(void) ""
postcopy_preempt_thread_exit(void) ""
postcopy_blocktime_tid_cpu_map(int cpu, uint32_t tid) "cpu: %d, tid: %u"
postcopy_blocktime_begin(uint64_t addr, uint64_t time, int cpu, bool exists) "addr: 0x%" PRIx64 ", time: %" PRIu64 ", cpu: %d, exists: %d"
|
migration/postcopy: blocktime allows track / report non-vCPU faults
When used to report page fault latencies, the blocktime feature can be
almost useless when KVM async page fault is enabled, because in most
cases such a remote fault kicks off an async page fault and is then no
longer trackable from the blocktime layer.
After the recent rewrites of the blocktime layer, it is finally easy to
also support tracking non-vCPU faults.  It would be even faster if we
could always index fault records by TID; unfortunately we need to keep
the blocktime API, which reports things by vCPU index.
Of course this works not only for kworkers, but for any access that may
reach a missing page, for example very likely in the QEMU main thread
too (and all other threads whenever applicable).
For those, we do not care about "how long the thread was blocked", only
about "how long the fault took to be resolved".
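As a rough illustration of that accounting split (a sketch only:
BlocktimeCtx and blocktime_fault_end() are made-up names, not the
structures or helpers used by the real code), the end of a fault could
be charged differently depending on whether a vCPU was blocked:

    #include <stdint.h>

    /* Illustrative only: mirrors the idea, not QEMU's real layout. */
    typedef struct {
        uint64_t *vcpu_blocktime;   /* per-vCPU accumulated block time */
        uint64_t non_vcpu_faults;   /* faults hit by non-vCPU threads  */
        uint64_t non_vcpu_latency;  /* total resolution latency (us)   */
    } BlocktimeCtx;

    static void blocktime_fault_end(BlocktimeCtx *c, int vcpu,
                                    uint64_t latency_us)
    {
        if (vcpu >= 0) {
            /* vCPU fault: the guest CPU was blocked for this long */
            c->vcpu_blocktime[vcpu] += latency_us;
        } else {
            /* non-vCPU fault (QEMU main thread, kworker, ...): only
             * track how long the fault took to be resolved */
            c->non_vcpu_faults++;
            c->non_vcpu_latency += latency_us;
        }
    }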
Cc: Markus Armbruster <armbru@redhat.com>
Cc: Dr. David Alan Gilbert <dave@treblig.org>
Reviewed-by: Fabiano Rosas <farosas@suse.de>
Tested-by: Mario Casquero <mcasquer@redhat.com>
Link: https://lore.kernel.org/r/20250613141217.474825-14-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Fabiano Rosas <farosas@suse.de>
2025-06-13 10:12:16 -04:00
|
|
|
postcopy_blocktime_end(uint64_t addr, uint64_t time, int affected_cpu, int affected_non_cpus) "addr: 0x%" PRIx64 ", time: %" PRIu64 ", affected_cpus: %d, affected_non_cpus: %d"
|
migration/postcopy: Optimize blocktime fault tracking with hashtable
Currently, the postcopy blocktime feature maintains vCPU fault information
using an array (vcpu_addr[]). It has two issues.
Issue 1: Performance Concern
============================
The old algorithm was almost OK and fast on inserts, except that the
lookup is slow and will not scale with many vCPUs: when a page is copied
during postcopy, mark_postcopy_blocktime_end() walks the whole array to
find which vCPUs are blocked on that address, so every page resolution
costs an O(N) walk.
Alexey (the author of postcopy blocktime) mentioned the perf issue and
how to optimize it in a comment in the page resolution path.  The
comment was (interestingly) left incomplete, but it is relatively clear
what he meant about this perf issue.
Issue 2: Wrong Accounting on re-entrancies
==========================================
One might expect each vCPU to have at most one fault outstanding at any
time, so that once the blocktime layer has captured a fault on a vCPU,
it should never see another fault message for that vCPU until the first
one is resolved.
That is almost correct, except for some extremely rare cases.
Case 1: the fault thread may process userfaultfd messages fast enough to
see more than one message for a vCPU before the previous fault has been
resolved.
Case 2: it is theoretically also possible for one vCPU to receive more
than one message for the same fault address if a fault is retried by the
kernel (e.g., handle_userfault() got interrupted before page resolution).
As this information may matter, the details are kept as code comments
rather than in the commit message, next to the new array that tracks
concurrent faults per vCPU.  Please refer to those comments for details
on both cases, especially case 1, which can be tricky.
Case 1 sounds rare, but it reproduces easily for me when running
blocktime together with the migration-test on vanilla postcopy.
New Design
==========
This patch does roughly what Alexey suggested, but slightly differently:
instead of keeping an array of vCPU fault addresses, each fault message
pushes an entry into a hash table indexed by the fault address.
The hash table replaces the two old structures: the vcpu_addr[] array
and the array that stored the start time of each fault.  Due to the
re-entrancy cases above, we still need one extra counter array to
account for concurrent faults on the same vCPU - the old code needed it
too, it was simply buggy and blindly overwrote an existing entry.  Now
everything is really tracked.
A hash should be more efficient than a tree for maintaining such
addr->(cpu, fault_time) information: the insert() and lookup() paths are
both ~O(1), and we do not need sorted order.
We do need one remove() after the lookup().  That can be slow, but only
when many vCPUs faulted on exactly the same address (so the list of cpu
entries is long), which should be unlikely.  Even then it is only a
worst-case O(N) (consider how likely it is for 400 vCPUs to fault on the
same address), rather than the unconditional O(N) walk the old code did
for every resolved page.
While at it, touch up the tracepoints to make them slightly more useful.
One tracepoint is added when walking all the fault entries.
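As a rough sketch of this layout using GLib (all names here, including
BlocktimeFault, fault_begin() and fault_end(), are illustrative
assumptions and not the identifiers used by the actual patch; it also
assumes a 64-bit host so the fault address fits in a pointer-sized key):

    #include <glib.h>
    #include <stdint.h>

    typedef struct {
        int cpu;            /* vCPU that hit the fault, -1 if none */
        uint64_t start_us;  /* when the fault was reported         */
    } BlocktimeFault;

    /* table = g_hash_table_new(g_direct_hash, g_direct_equal);
     * key: fault address; value: GSList of BlocktimeFault, since
     * several threads may fault on the same page.  vcpu_faults[]
     * counts outstanding faults per vCPU for the re-entrancy cases. */
    static void fault_begin(GHashTable *table, uint8_t *vcpu_faults,
                            uint64_t addr, int cpu, uint64_t now_us)
    {
        gpointer key = (gpointer)(uintptr_t)addr;
        BlocktimeFault *f = g_new0(BlocktimeFault, 1);
        GSList *list;

        f->cpu = cpu;
        f->start_us = now_us;
        list = g_hash_table_lookup(table, key);            /* ~O(1) */
        g_hash_table_insert(table, key, g_slist_prepend(list, f));
        if (cpu >= 0) {
            vcpu_faults[cpu]++;     /* can exceed 1, see cases above */
        }
    }

    static void fault_end(GHashTable *table, uint8_t *vcpu_faults,
                          uint64_t addr, uint64_t now_us)
    {
        gpointer key = (gpointer)(uintptr_t)addr;
        GSList *list = g_hash_table_lookup(table, key);    /* ~O(1) */

        for (GSList *it = list; it; it = it->next) {
            BlocktimeFault *f = it->data;
            /* charge (now_us - f->start_us) against f->cpu here */
            if (f->cpu >= 0) {
                vcpu_faults[f->cpu]--;
            }
            g_free(f);
        }
        g_slist_free(list);
        g_hash_table_remove(table, key);  /* one remove() per page */
    }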
Reviewed-by: Fabiano Rosas <farosas@suse.de>
Link: https://lore.kernel.org/r/20250613141217.474825-13-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Fabiano Rosas <farosas@suse.de>
2025-06-13 10:12:15 -04:00
|
|
|
postcopy_blocktime_end_one(int cpu, uint8_t left_faults) "cpu: %d, left_faults: %" PRIu8
|
2016-06-16 09:39:51 +01:00
|
|
|
|
2019-03-14 19:09:26 +01:00
|
|
|
# exec.c
|
2016-06-16 09:39:51 +01:00
|
|
|
migration_exec_outgoing(const char *cmd) "cmd=%s"
|
|
|
|
|
migration_exec_incoming(const char *cmd) "cmd=%s"
|
|
|
|
|
|
2019-03-14 19:09:26 +01:00
|
|
|
# fd.c
|
2016-06-16 09:39:51 +01:00
|
|
|
migration_fd_outgoing(int fd) "fd=%d"
|
|
|
|
|
migration_fd_incoming(int fd) "fd=%d"
|
|
|
|
|
|
2023-09-08 07:22:10 -07:00
|
|
|
# file.c
|
|
|
|
|
migration_file_outgoing(const char *filename) "filename=%s"
|
|
|
|
|
migration_file_incoming(const char *filename) "filename=%s"
|
|
|
|
|
|
2019-03-14 19:09:26 +01:00
|
|
|
# socket.c
|
2016-06-16 09:39:51 +01:00
|
|
|
migration_socket_incoming_accepted(void) ""
|
2026-01-23 11:16:34 -03:00
|
|
|
migration_socket_outgoing_connected(void) ""
|
2016-06-16 09:39:51 +01:00
|
|
|
migration_socket_outgoing_error(const char *err) "error=%s"
|
|
|
|
|
|
2019-03-14 19:09:26 +01:00
|
|
|
# tls.c
|
2026-01-23 11:16:34 -03:00
|
|
|
migration_tls_outgoing_handshake_start(void) ""
|
2016-06-16 09:39:51 +01:00
|
|
|
migration_tls_outgoing_handshake_error(const char *err) "err=%s"
|
|
|
|
|
migration_tls_outgoing_handshake_complete(void) ""
|
|
|
|
|
migration_tls_incoming_handshake_start(void) ""
|
|
|
|
|
migration_tls_incoming_handshake_error(const char *err) "err=%s"
|
|
|
|
|
migration_tls_incoming_handshake_complete(void) ""
|
2016-10-27 14:42:54 +08:00
|
|
|
|
2019-03-14 19:09:26 +01:00
|
|
|
# colo.c
|
2016-10-27 14:42:54 +08:00
|
|
|
colo_vm_state_change(const char *old, const char *new) "Change '%s' => '%s'"
|
2016-10-27 14:42:57 +08:00
|
|
|
colo_send_message(const char *msg) "Send '%s' message"
|
|
|
|
|
colo_receive_message(const char *msg) "Receive '%s' message"
|
2019-03-14 19:09:29 +01:00
|
|
|
|
|
|
|
|
# colo-failover.c
|
2016-10-27 14:43:04 +08:00
|
|
|
colo_failover_set_state(const char *new_state) "new state %s"
|
2018-03-13 15:34:01 -04:00
|
|
|
|
2025-01-15 11:00:33 -08:00
|
|
|
# cpr.c
|
|
|
|
|
cpr_save_fd(const char *name, int id, int fd) "%s, id %d, fd %d"
|
|
|
|
|
cpr_delete_fd(const char *name, int id) "%s, id %d"
|
|
|
|
|
cpr_find_fd(const char *name, int id, int fd) "%s, id %d returns %d"
|
|
|
|
|
cpr_state_save(const char *mode) "%s mode"
|
|
|
|
|
cpr_state_load(const char *mode) "%s mode"
|
2025-01-15 11:00:41 -08:00
|
|
|
cpr_transfer_input(const char *path) "%s"
|
|
|
|
|
cpr_transfer_output(const char *path) "%s"
|
migration: cpr-exec mode
Add the cpr-exec migration mode. Usage:
    qemu-system-$arch -machine aux-ram-share=on ...
    migrate_set_parameter mode cpr-exec
    migrate_set_parameter cpr-exec-command \
        <arg1> <arg2> ... -incoming <uri-1>
    migrate -d <uri-1>
The migrate command stops the VM, saves state to uri-1,
directly exec's a new version of QEMU on the same host,
replacing the original process while retaining its PID, and
loads state from uri-1. Guest RAM is preserved in place,
albeit with new virtual addresses.
The new QEMU process is started by exec'ing the command
specified by the @cpr-exec-command parameter. The first word of
the command is the binary, and the remaining words are its
arguments. The command may be a direct invocation of new QEMU,
or may be a non-QEMU command that exec's the new QEMU binary.
This mode creates a second migration channel that is not visible
to the user. At the start of migration, old QEMU saves CPR state
to the second channel, and at the end of migration, it tells the
main loop to call cpr_exec. New QEMU loads CPR state early, before
objects are created.
Because old QEMU terminates when new QEMU starts, one cannot
stream data between the two, so uri-1 must be of a type,
such as a file, that accepts all data before old QEMU exits.
Otherwise, old QEMU may quietly block writing to the channel.
Memory-backend objects must have the share=on attribute, but
memory-backend-epc is not supported. The VM must be started with
the '-machine aux-ram-share=on' option, which allows anonymous
memory to be transferred in place to the new process. The memfds
are kept open across exec by clearing the close-on-exec flag, their
values are saved in CPR state, and they are mmap'd in new QEMU.
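A standalone sketch of the fd-preservation trick mentioned above
(clearing close-on-exec on a memfd so it survives the exec into new
QEMU); this is illustrative only and does not use the actual cpr.c
helpers:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/mman.h>

    /* Clear FD_CLOEXEC so the descriptor stays open across exec(). */
    static int keep_fd_across_exec(int fd)
    {
        int flags = fcntl(fd, F_GETFD);

        if (flags < 0) {
            return -1;
        }
        return fcntl(fd, F_SETFD, flags & ~FD_CLOEXEC);
    }

    int main(void)
    {
        /* Anonymous RAM backed by a memfd, created close-on-exec. */
        int fd = memfd_create("guest-ram", MFD_CLOEXEC);

        if (fd < 0 || keep_fd_across_exec(fd) < 0) {
            perror("memfd");
            return 1;
        }
        printf("fd %d will stay open across exec\n", fd);
        /* old QEMU would record the fd in CPR state here, then exec
         * the new binary, which mmap()s the same memory in place */
        return 0;
    }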
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Acked-by: Markus Armbruster <armbru@redhat.com>
Link: https://lore.kernel.org/r/1759332851-370353-7-git-send-email-steven.sistare@oracle.com
Signed-off-by: Peter Xu <peterx@redhat.com>
2025-10-01 08:33:58 -07:00
|
|
|
cpr_exec(void) ""
|
2025-01-15 11:00:33 -08:00
|
|
|
|
2019-03-14 19:09:26 +01:00
|
|
|
# block-dirty-bitmap.c
|
2018-03-13 15:34:01 -04:00
|
|
|
send_bitmap_header_enter(void) ""
|
|
|
|
|
send_bitmap_bits(uint32_t flags, uint64_t start_sector, uint32_t nr_sectors, uint64_t data_size) "flags: 0x%x, start_sector: %" PRIu64 ", nr_sectors: %" PRIu32 ", data_size: %" PRIu64
|
|
|
|
|
dirty_bitmap_save_iterate(int in_postcopy) "in postcopy: %d"
|
|
|
|
|
dirty_bitmap_save_complete_enter(void) ""
|
|
|
|
|
dirty_bitmap_save_complete_finish(void) ""
|
2022-10-03 02:00:03 +02:00
|
|
|
dirty_bitmap_state_pending(uint64_t pending) "pending %" PRIu64
|
2018-03-13 15:34:01 -04:00
|
|
|
dirty_bitmap_load_complete(void) ""
|
|
|
|
|
dirty_bitmap_load_bits_enter(uint64_t first_sector, uint32_t nr_sectors) "chunk: %" PRIu64 " %" PRIu32
|
|
|
|
|
dirty_bitmap_load_bits_zeroes(void) ""
|
|
|
|
|
dirty_bitmap_load_header(uint32_t flags) "flags 0x%x"
|
|
|
|
|
dirty_bitmap_load_enter(void) ""
|
|
|
|
|
dirty_bitmap_load_success(void) ""
|
2020-09-16 14:22:07 +08:00
|
|
|
|
|
|
|
|
# dirtyrate.c
|
|
|
|
|
dirtyrate_set_state(const char *new_state) "new state %s"
|
|
|
|
|
query_dirty_rate_info(const char *new_state) "current state %s"
|
2023-04-27 15:42:57 +03:00
|
|
|
get_ramblock_vfn_hash(const char *idstr, uint64_t vfn, uint32_t hash) "ramblock name: %s, vfn: %"PRIu64 ", hash: %" PRIu32
|
|
|
|
|
calc_page_dirty_rate(const char *idstr, uint32_t new_hash, uint32_t old_hash) "ramblock name: %s, new hash: %" PRIu32 ", old hash: %" PRIu32
|
2020-09-16 14:22:07 +08:00
|
|
|
skip_sample_ramblock(const char *idstr, uint64_t ramblock_size) "ramblock name: %s, ramblock size: %" PRIu64
|
|
|
|
|
find_page_matched(const char *idstr) "ramblock %s addr or size changed"
|
2021-06-29 16:01:23 +00:00
|
|
|
dirtyrate_calculate(int64_t dirtyrate) "dirty rate: %" PRIi64 " MB/s"
|
|
|
|
|
dirtyrate_do_calculate_vcpu(int idx, uint64_t rate) "vcpu[%d]: %"PRIu64 " MB/s"
|
2020-10-20 15:32:56 +08:00
|
|
|
|
|
|
|
|
# block.c
|
|
|
|
|
migration_block_init_shared(const char *blk_device_name) "Start migration for %s with shared base image"
|
|
|
|
|
migration_block_init_full(const char *blk_device_name) "Start full migration for %s"
|
|
|
|
|
migration_block_save_device_dirty(int64_t sector) "Error reading sector %" PRId64
|
|
|
|
|
migration_block_flush_blks(const char *action, int submitted, int read_done, int transferred) "%s submitted %d read_done %d transferred %d"
|
|
|
|
|
migration_block_save(const char *mig_stage, int submitted, int transferred) "Enter save live %s submitted %d transferred %d"
|
|
|
|
|
migration_block_save_complete(void) "Block migration completed"
|
2022-10-03 02:00:03 +02:00
|
|
|
migration_block_state_pending(uint64_t pending) "Enter save live pending %" PRIu64
|
2023-02-15 16:35:17 +01:00
|
|
|
migration_block_progression(unsigned percent) "Completed %u%%"
|
2020-10-20 15:32:56 +08:00
|
|
|
|
|
|
|
|
# page_cache.c
|
|
|
|
|
migration_pagecache_init(int64_t max_num_items) "Setting cache buckets to %" PRId64
|
|
|
|
|
migration_pagecache_insert(void) "Error allocating page"
|
2024-10-17 14:42:52 +08:00
|
|
|
|
|
|
|
|
# cpu-throttle.c
|
|
|
|
|
cpu_throttle_set(int new_throttle_pct) "set guest CPU throttled by %d%%"
|
2024-10-17 14:42:54 +08:00
|
|
|
cpu_throttle_dirty_sync(void) ""
|
migration/block: Rewrite disk activation
This patch proposes a flag to maintain the disk activation status
globally.  It mostly rewrites disk activation management for QEMU,
including COLO and the QMP command xen_save_devices_state.
Background
==========
We have two problems with disk activation, one resolved, one not.
Problem 1: disk activation recovery (for switchover interruptions)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
When migration is cancelled or fails during switchover, especially after
the disks have been inactivated, QEMU needs to remember to re-activate
the disks before the VM starts.
This used to be done separately in two paths: one in
qmp_migrate_cancel(), the other in the failure path of
migration_completion().
It was fixed in different commits, scattered all over QEMU.  These are
the relevant changes I saw; I am not sure the list is complete:
- In 2016, commit fe904ea824 ("migration: regain control of images when
migration fails to complete")
- In 2017, commit 1d2acc3162 ("migration: re-active images while migration
been canceled after inactive them")
- In 2023, commit 6dab4c93ec ("migration: Attempt disk reactivation in
more failure scenarios")
Now that we have a slightly better picture, we can unify the
reactivation into a single path.
One side benefit of doing so is that we can move the disk operation
outside the QMP command "migrate_cancel".  In the future we may want to
make "migrate_cancel" OOB-compatible, which requires that the command
not need the BQL in the first place; this change already takes care of
that and makes the migrate_cancel command lightweight.
Problem 2: disk invalidation on top of invalidated disks
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
This is an unresolved bug in current QEMU; see the link in "Resolves:"
at the end.  It turns out that besides the source switchover phase
(problem 1 above), QEMU also needs to remember the block activation
status on the destination.
Consider two consecutive migrations where the VM stays paused the whole
time.  In that scenario, the disks are never activated, not even after
the 1st migration completes.  When the 2nd migration starts, if QEMU
does not know the status of the disks, it will try to inactivate them
again.
The issue is that the block layer API bdrv_inactivate_all() will crash
QEMU if invoked on already inactive disks during the 2nd migration.  For
details, see the bug link at the end.
Implementation
==============
This patch proposes to maintain disk activation with a global flag, so
we know:
- If we inactivated the disks for migration, but the migration was
cancelled or failed, QEMU knows it should reactivate the disks.
- On the incoming side, if the disks were never activated but another
migration is triggered, QEMU can tell that inactivation is not needed
for the 2nd migration.
We used to have disk_inactive, but it only solves the 1st issue, not the
2nd.  It is also handled in completely separate paths, so it is
extremely hard to follow how the flag changes, how long it stays valid,
and when the disks will be reactivated.
Convert the existing disk_inactive flag into that global flag (also invert
its naming), and maintain the disk activation status for the whole
lifecycle of qemu. That includes the incoming QEMU.
Put both error cases of source migration (failure, cancellation)
together into migration_iteration_finish(), which is invoked in either
scenario, so that part of QEMU behaves the same as before.  With the
disk activation status maintained globally, we not only clean up quite a
few ad-hoc paths that tried to track it (e.g. in the postcopy code), but
also fix the crash of problem 2 in one shot.
For a freshly started QEMU, the flag is initialized to TRUE, showing
that QEMU owns the disks by default.
For an incoming migrated QEMU, the flag is initialized to FALSE once and
for all, showing that the destination QEMU does not own the disks until
switchover.  That is guaranteed by the "once" variable.
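A simplified sketch of how such a global ownership flag behaves (the
names below are illustrative only; the real helpers wrap
bdrv_activate_all()/bdrv_inactivate_all() inside QEMU):

    #include <stdbool.h>
    #include <stdio.h>

    /* true when this QEMU owns (has activated) the disks.
     * Fresh QEMU: true.  Incoming QEMU: set to false once, early. */
    static bool block_active = true;

    /* Stand-ins for bdrv_activate_all()/bdrv_inactivate_all(). */
    static bool do_activate(void)   { puts("activate");   return true; }
    static bool do_inactivate(void) { puts("inactivate"); return true; }

    static bool block_activate_sketch(void)
    {
        if (block_active) {
            return true;            /* already owned: nothing to do */
        }
        if (!do_activate()) {
            return false;
        }
        block_active = true;
        return true;
    }

    static bool block_inactivate_sketch(void)
    {
        if (!block_active) {
            return true;            /* already inactive: avoids the
                                     * double-inactivate crash above */
        }
        if (!do_inactivate()) {
            return false;
        }
        block_active = false;
        return true;
    }

    int main(void)
    {
        block_inactivate_sketch();  /* inactivates                  */
        block_inactivate_sketch();  /* no-op, no crash (problem 2)  */
        block_activate_sketch();    /* reactivates (problem 1)      */
        return 0;
    }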
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/2395
Signed-off-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Fabiano Rosas <farosas@suse.de>
Message-Id: <20241206230838.1111496-7-peterx@redhat.com>
Signed-off-by: Fabiano Rosas <farosas@suse.de>
2024-12-06 18:08:38 -05:00
|
|
|
|
|
|
|
|
# block-active.c
|
|
|
|
|
migration_block_activation(const char *name) "%s"
|