qemu-qemu/migration/trace-events

# See docs/devel/tracing.rst for syntax documentation.
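#
# Each line below declares one tracepoint: a function-like name with typed
# arguments, followed by a printf-style format string (adjacent string
# literals and PRI* macros concatenate as in C).  As a rough sketch of how a
# declaration is consumed (see tracing.rst for the authoritative rules), a
# line such as:
#
#     qemu_loadvm_state_section(unsigned int section_type) "%d"
#
# causes tracetool to generate a trace_qemu_loadvm_state_section() helper
# that the corresponding C file calls at the point of interest:
#
#     /* hypothetical call site in migration/savevm.c */
#     trace_qemu_loadvm_state_section(section_type);
#
# Events can then be enabled at runtime, e.g. by pattern:
#
#     qemu-system-x86_64 -trace 'qemu_loadvm_state_*' ...
#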
# savevm.c
qemu_loadvm_state_section(unsigned int section_type) "%d"
qemu_loadvm_state_section_command(int ret) "%d"
qemu_loadvm_state_section_partend(uint32_t section_id) "%u"
qemu_loadvm_state_post_main(int ret) "%d"
qemu_loadvm_state_section_startfull(uint32_t section_id, const char *idstr, uint32_t instance_id, uint32_t version_id) "%u(%s) %u %u"
qemu_savevm_send_packaged(void) ""
loadvm_state_switchover_ack_needed(unsigned int switchover_ack_pending_num) "Switchover ack pending num=%u"
loadvm_state_setup(void) ""
loadvm_state_cleanup(void) ""
loadvm_handle_cmd_packaged(unsigned int length) "%u"
loadvm_handle_cmd_packaged_main(int ret) "%d"
loadvm_handle_cmd_packaged_received(int ret) "%d"
loadvm_handle_recv_bitmap(char *s) "%s"
loadvm_postcopy_handle_advise(void) ""
loadvm_postcopy_handle_listen(const char *str) "%s"
loadvm_postcopy_handle_run(void) ""
loadvm_postcopy_handle_resume(void) ""
loadvm_postcopy_ram_handle_discard(void) ""
loadvm_postcopy_ram_handle_discard_end(void) ""
loadvm_postcopy_ram_handle_discard_header(const char *ramid, uint16_t len) "%s: %u"
loadvm_process_command(const char *s, uint16_t len) "com=%s len=%d"
loadvm_process_command_ping(uint32_t val) "0x%x"
loadvm_approve_switchover(unsigned int switchover_ack_pending_num) "Switchover ack pending num=%u"
postcopy_ram_listen_thread_exit(void) ""
postcopy_ram_listen_thread_start(void) ""
qemu_savevm_send_postcopy_advise(void) ""
qemu_savevm_send_postcopy_ram_discard(const char *id, uint16_t len) "%s: %u"
savevm_command_send(uint16_t command, uint16_t len) "com=0x%x len=%d"
savevm_section_start(const char *id, unsigned int section_id) "%s, section_id %u"
savevm_section_end(const char *id, unsigned int section_id, int ret) "%s, section_id %u -> %d"
savevm_section_skip(const char *id, unsigned int section_id) "%s, section_id %u"
savevm_send_open_return_path(void) ""
savevm_send_ping(uint32_t val) "0x%x"
savevm_send_postcopy_listen(void) ""
savevm_send_postcopy_run(void) ""
savevm_send_postcopy_resume(void) ""
savevm_send_colo_enable(void) ""
savevm_send_recv_bitmap(char *name) "%s"
savevm_send_switchover_start(void) ""
savevm_state_setup(void) ""
savevm_state_resume_prepare(void) ""
savevm_state_header(void) ""
savevm_state_iterate(void) ""
savevm_state_cleanup(void) ""
vmstate_save(const char *idstr, const char *vmsd_name) "%s, %s"
vmstate_load(const char *idstr, const char *vmsd_name) "%s, %s"
vmstate_downtime_save(const char *type, const char *idstr, uint32_t instance_id, int64_t downtime) "type=%s idstr=%s instance_id=%d downtime=%"PRIi64
vmstate_downtime_load(const char *type, const char *idstr, uint32_t instance_id, int64_t downtime) "type=%s idstr=%s instance_id=%d downtime=%"PRIi64
vmstate_downtime_checkpoint(const char *checkpoint) "%s"
postcopy_pause_incoming(void) ""
postcopy_pause_incoming_continued(void) ""
postcopy_page_req_sync(void *host_addr) "sync page req %p"
# vmstate.c
vmstate_load_field_error(const char *field, int ret) "field \"%s\" load failed, ret = %d"
vmstate_load_state(const char *name, int version_id) "%s v%d"
vmstate_load_state_end(const char *name, const char *reason, int val) "%s %s/%d"
vmstate_load_state_field(const char *name, const char *field, bool exists) "%s:%s exists=%d"
vmstate_n_elems(const char *name, int n_elems) "%s: %d"
vmstate_subsection_load(const char *parent) "%s"
vmstate_subsection_load_bad(const char *parent, const char *sub, const char *sub2) "%s: %s/%s"
vmstate_subsection_load_good(const char *parent) "%s"
vmstate_save_state_pre_save_res(const char *name, int res) "%s/%d"
vmstate_save_state_loop(const char *name, const char *field, int n_elems) "%s/%s[%d]"
vmstate_save_state_top(const char *idstr) "%s"
vmstate_subsection_save_loop(const char *name, const char *sub) "%s/%s"
vmstate_subsection_save_top(const char *idstr) "%s"
vmstate_field_exists(const char *vmsd, const char *name, int field_version, int version, int result) "%s:%s field_version %d version %d result %d"
# vmstate-types.c
get_qtailq(const char *name, int version_id) "%s v%d"
get_qtailq_end(const char *name, const char *reason, int val) "%s %s/%d"
put_qtailq(const char *name, int version_id) "%s v%d"
put_qtailq_end(const char *name, const char *reason) "%s %s"
get_gtree(const char *field_name, const char *key_vmsd_name, const char *val_vmsd_name, uint32_t nnodes) "%s(%s/%s) nnodes=%d"
get_gtree_end(const char *field_name, const char *key_vmsd_name, const char *val_vmsd_name, int ret) "%s(%s/%s) %d"
put_gtree(const char *field_name, const char *key_vmsd_name, const char *val_vmsd_name, uint32_t nnodes) "%s(%s/%s) nnodes=%d"
put_gtree_end(const char *field_name, const char *key_vmsd_name, const char *val_vmsd_name, int ret) "%s(%s/%s) %d"
get_qlist(const char *field_name, const char *vmsd_name, int version_id) "%s(%s v%d)"
get_qlist_end(const char *field_name, const char *vmsd_name) "%s(%s)"
put_qlist(const char *field_name, const char *vmsd_name, int version_id) "%s(%s v%d)"
put_qlist_end(const char *field_name, const char *vmsd_name) "%s(%s)"
# qemu-file.c
qemu_file_fclose(void) ""
qemu_file_put_fd(const char *name, int fd, int ret) "ioc %s, fd %d -> status %d"
qemu_file_get_fd(const char *name, int fd) "ioc %s -> fd %d"
# ram.c
get_queued_page(const char *block_name, uint64_t tmp_offset, unsigned long page_abs) "%s/0x%" PRIx64 " page_abs=0x%lx"
get_queued_page_not_dirty(const char *block_name, uint64_t tmp_offset, unsigned long page_abs) "%s/0x%" PRIx64 " page_abs=0x%lx"
migration_bitmap_sync_start(void) ""
migration_bitmap_sync_end(uint64_t dirty_pages) "dirty_pages %" PRIu64
migration_bitmap_clear_dirty(char *str, uint64_t start, uint64_t size, unsigned long page) "rb %s start 0x%"PRIx64" size 0x%"PRIx64" page 0x%lx"
migration_throttle(void) ""
migration_dirty_limit_guest(int64_t dirtyrate) "guest dirty page rate limit %" PRIi64 " MB/s"
ram_discard_range(const char *rbname, uint64_t start, size_t len) "%s: start: %" PRIx64 " %zx"
ram_load_loop(const char *rbname, uint64_t addr, int flags, void *host) "%s: addr: 0x%" PRIx64 " flags: 0x%x host: %p"
ram_load_postcopy_loop(int channel, uint64_t addr, int flags) "chan=%d addr=0x%" PRIx64 " flags=0x%x"
ram_postcopy_send_discard_bitmap(void) ""
ram_save_page(const char *rbname, uint64_t offset, void *host) "%s: offset: 0x%" PRIx64 " host: %p"
ram_save_queue_pages(const char *rbname, size_t start, size_t len) "%s: start: 0x%zx len: 0x%zx"
ram_save_complete(uint64_t dirty_pages, int done) "dirty=%" PRIu64 ", done=%d"
ram_dirty_bitmap_request(char *str) "%s"
ram_dirty_bitmap_reload_begin(char *str) "%s"
ram_dirty_bitmap_reload_complete(char *str) "%s"
ram_dirty_bitmap_sync_start(void) ""
ram_dirty_bitmap_sync_wait(void) ""
ram_dirty_bitmap_sync_complete(void) ""
ram_state_resume_prepare(uint64_t v) "%" PRIu64
colo_flush_ram_cache_begin(uint64_t dirty_pages) "dirty_pages %" PRIu64
colo_flush_ram_cache_end(void) ""
save_xbzrle_page_skipping(void) ""
save_xbzrle_page_overflow(void) ""
ram_save_iterate_big_wait(uint64_t milliseconds, int iterations) "big wait: %" PRIu64 " milliseconds, %d iterations"
ram_load_start(void) ""
ram_load_complete(int ret, uint64_t seq_iter) "exit_code %d seq iteration %" PRIu64
ram_write_tracking_ramblock_start(const char *block_id, size_t page_size, void *addr, size_t length) "%s: page_size: %zu addr: %p length: %zu"
ram_write_tracking_ramblock_stop(const char *block_id, size_t page_size, void *addr, size_t length) "%s: page_size: %zu addr: %p length: %zu"
postcopy_preempt_triggered(char *str, unsigned long page) "during sending ramblock %s offset 0x%lx"
postcopy_preempt_restored(char *str, unsigned long page) "ramblock %s offset 0x%lx"
postcopy_preempt_hit(char *str, uint64_t offset) "ramblock %s offset 0x%"PRIx64
postcopy_preempt_send_host_page(char *str, uint64_t offset) "ramblock %s offset 0x%"PRIx64
postcopy_preempt_switch_channel(int channel) "%d"
postcopy_preempt_reset_channel(void) ""
# multifd.c
multifd_new_send_channel_async(uint8_t id) "channel %u"
multifd_new_send_channel_async_error(uint8_t id, void *err) "channel=%u err=%p"
multifd_recv_unfill(uint8_t id, uint64_t packet_num, uint32_t flags, uint32_t next_packet_size) "channel %u packet_num %" PRIu64 " flags 0x%x next packet size %u"
multifd_recv_new_channel(uint8_t id) "channel %u"
multifd_recv_sync_main(long packet_num) "packet num %ld"
multifd_recv_sync_main_signal(uint8_t id) "channel %u"
multifd_recv_sync_main_wait(uint8_t id) "channel %u"
multifd_recv_terminate_threads(bool error) "error %d"
multifd_recv_thread_end(uint8_t id, uint64_t packets) "channel %u packets %" PRIu64
multifd_recv_thread_start(uint8_t id) "%u"
multifd_send_fill(uint8_t id, uint64_t packet_num, uint32_t flags, uint32_t next_packet_size) "channel %u packet_num %" PRIu64 " flags 0x%x next packet size %u"
multifd_send_ram_fill(uint8_t id, uint32_t normal, uint32_t zero) "channel %u normal pages %u zero pages %u"
multifd_send_error(uint8_t id) "channel %u"
multifd_send_sync_main(long packet_num) "packet num %ld"
multifd_send_sync_main_signal(uint8_t id) "channel %u"
multifd_send_sync_main_wait(uint8_t id) "channel %u"
multifd_send_terminate_threads(void) ""
multifd_send_thread_end(uint8_t id, uint64_t packets) "channel %u packets %" PRIu64
multifd_send_thread_start(uint8_t id) "%u"
multifd_tls_outgoing_handshake_start(void *ioc, void *tioc) "ioc=%p tioc=%p"
multifd_tls_outgoing_handshake_error(void *ioc, const char *err) "ioc=%p err=%s"
multifd_tls_outgoing_handshake_complete(void *ioc) "ioc=%p"
multifd_set_outgoing_channel(void *ioc, const char *ioctype) "ioc=%p ioctype=%s"
# migration.c
migrate_set_state(const char *new_state) "new state %s"
migration_cleanup(void) ""
migrate_error(const char *error_desc) "error=%s"
migration_cancel(void) ""
migrate_handle_rp_req_pages(const char *rbname, size_t start, size_t len) "in %s at 0x%zx len 0x%zx"
migrate_pending_exact(uint64_t size, uint64_t pre, uint64_t post) "exact pending size %" PRIu64 " (pre = %" PRIu64 " post=%" PRIu64 ")"
migrate_pending_estimate(uint64_t size, uint64_t pre, uint64_t post) "estimate pending size %" PRIu64 " (pre = %" PRIu64 " post=%" PRIu64 ")"
migrate_send_rp_message(int msg_type, uint16_t len) "%d: len %d"
migrate_send_rp_recv_bitmap(char *name, int64_t size) "block '%s' size 0x%" PRIx64
migration_completion_file_err(void) ""
migration_completion_vm_stop(int ret) "ret %d"
migration_completion_postcopy_end(void) ""
migration_completion_postcopy_end_after_complete(void) ""
migration_rate_limit_pre(int ms) "%d ms"
migration_rate_limit_post(int urgent) "urgent: %d"
migration_return_path_end_before(void) ""
migration_return_path_end_after(void) ""
migration_thread_after_loop(void) ""
migration_thread_file_err(void) ""
migration_thread_setup_complete(void) ""
open_return_path_on_source(void) ""
open_return_path_on_source_continue(void) ""
postcopy_start(void) ""
postcopy_pause_return_path(void) ""
postcopy_pause_return_path_continued(void) ""
postcopy_pause_continued(void) ""
postcopy_start_set_run(void) ""
postcopy_page_req_add(void *addr, int count) "new page req %p total %d"
source_return_path_thread_bad_end(void) ""
source_return_path_thread_end(void) ""
source_return_path_thread_entry(void) ""
source_return_path_thread_loop_top(void) ""
source_return_path_thread_pong(uint32_t val) "0x%x"
source_return_path_thread_shut(uint32_t val) "0x%x"
source_return_path_thread_resume_ack(uint32_t v) "%"PRIu32
source_return_path_thread_switchover_acked(void) ""
source_return_path_thread_postcopy_package_loaded(void) ""
migration_thread_low_pending(uint64_t pending) "%" PRIu64
migrate_transferred(uint64_t transferred, uint64_t time_spent, uint64_t bandwidth, uint64_t avail_bw, uint64_t size) "transferred %" PRIu64 " time_spent %" PRIu64 " bandwidth %" PRIu64 " switchover_bw %" PRIu64 " max_size %" PRIu64
process_incoming_migration_co_end(int ret) "ret=%d"
process_incoming_migration_co_postcopy_end_main(void) ""
postcopy_preempt_enabled(bool value) "%d"
migration_precopy_complete(void) ""
# migration-stats.c
migration_transferred_bytes(uint64_t qemu_file, uint64_t multifd, uint64_t rdma) "qemu_file %" PRIu64 " multifd %" PRIu64 " RDMA %" PRIu64
# channel.c
migration_set_incoming_channel(void *ioc, const char *ioctype) "ioc=%p ioctype=%s"
migration_set_outgoing_channel(void *ioc, const char *ioctype) "ioc=%p ioctype=%s"
# global_state.c
migrate_state_too_big(void) ""
migrate_global_state_post_load(const char *state) "loaded state: %s"
migrate_global_state_pre_save(const char *state) "saved state: %s"
# rdma.c
qemu_rdma_accept_incoming_migration(void) ""
qemu_rdma_accept_incoming_migration_accepted(void) ""
qemu_rdma_accept_pin_state(bool pin) "%d"
qemu_rdma_accept_pin_verbsc(void *verbs) "Verbs context after listen: %p"
qemu_rdma_block_for_wrid_miss(uint64_t wcomp, uint64_t req) "Wanted wrid %" PRIu64 " but got %" PRIu64
qemu_rdma_cleanup_disconnect(void) ""
qemu_rdma_close(void) ""
qemu_rdma_connect_pin_all_requested(void) ""
qemu_rdma_connect_pin_all_outcome(bool pin) "%d"
qemu_rdma_dest_init_trying(const char *host, const char *ip) "%s => %s"
qemu_rdma_dump_id_failed(const char *who) "%s RDMA Device opened, but can't query port information"
qemu_rdma_dump_id(const char *who, const char *name, const char *dev_name, const char *dev_path, const char *ibdev_path, int transport, const char *transport_name) "%s RDMA Device opened: kernel name %s uverbs device name %s, infiniband_verbs class device path %s, infiniband class device path %s, transport: (%d) %s"
qemu_rdma_dump_gid(const char *who, const char *src, const char *dst) "%s Source GID: %s, Dest GID: %s"
qemu_rdma_exchange_get_response_start(const char *desc) "CONTROL: %s receiving..."
qemu_rdma_exchange_get_response_none(const char *desc, int type) "Surprise: got %s (%d)"
qemu_rdma_exchange_send_issue_callback(void) ""
qemu_rdma_exchange_send_waiting(const char *desc) "Waiting for response %s"
qemu_rdma_exchange_send_received(const char *desc) "Response %s received."
qemu_rdma_fill(size_t control_len, size_t size) "RDMA %zd of %zd bytes already in buffer"
qemu_rdma_init_ram_blocks(int blocks) "Allocated %d local ram block structures"
qemu_rdma_poll_recv(uint64_t comp, int64_t id, int sent) "completion %" PRIu64 " received (%" PRId64 ") left %d"
qemu_rdma_poll_write(uint64_t comp, int left, uint64_t block, uint64_t chunk, void *local, void *remote) "completions %" PRIu64 " left %d, block %" PRIu64 ", chunk: %" PRIu64 " %p %p"
qemu_rdma_poll_other(uint64_t comp, int left) "other completion %" PRIu64 " received left %d"
qemu_rdma_post_send_control(const char *desc) "CONTROL: sending %s.."
qemu_rdma_register_and_get_keys(uint64_t len, void *start) "Registering %" PRIu64 " bytes @ %p"
qemu_rdma_register_odp_mr(const char *name) "Try to register On-Demand Paging memory region: %s"
qemu_rdma_advise_mr(const char *name, uint32_t len, uint64_t addr, const char *res) "Try to advise block %s prefetch at %" PRIu32 "@0x%" PRIx64 ": %s"
qemu_rdma_resolve_host_trying(const char *host, const char *ip) "Trying %s => %s"
qemu_rdma_signal_unregister_append(uint64_t chunk, int pos) "Appending unregister chunk %" PRIu64 " at position %d"
qemu_rdma_signal_unregister_already(uint64_t chunk) "Unregister chunk %" PRIu64 " already in queue"
qemu_rdma_unregister_waiting_inflight(uint64_t chunk) "Cannot unregister inflight chunk: %" PRIu64
qemu_rdma_unregister_waiting_proc(uint64_t chunk, int pos) "Processing unregister for chunk: %" PRIu64 " at position %d"
qemu_rdma_unregister_waiting_send(uint64_t chunk) "Sending unregister for chunk: %" PRIu64
qemu_rdma_unregister_waiting_complete(uint64_t chunk) "Unregister for chunk: %" PRIu64 " complete."
qemu_rdma_write_flush(int sent) "sent total: %d"
qemu_rdma_write_one_block(int count, int block, uint64_t chunk, uint64_t current, uint64_t len, int nb_sent, int nb_chunks) "(%d) Not clobbering: block: %d chunk %" PRIu64 " current %" PRIu64 " len %" PRIu64 " %d %d"
qemu_rdma_write_one_post(uint64_t chunk, long addr, long remote, uint32_t len) "Posting chunk: %" PRIu64 ", addr: 0x%lx remote: 0x%lx, bytes %" PRIu32
qemu_rdma_write_one_queue_full(void) ""
qemu_rdma_write_one_recvregres(int mykey, int theirkey, uint64_t chunk) "Received registration result: my key: 0x%x their key 0x%x, chunk %" PRIu64
qemu_rdma_write_one_sendreg(uint64_t chunk, int len, int index, int64_t offset) "Sending registration request chunk %" PRIu64 " for %d bytes, index: %d, offset: %" PRId64
qemu_rdma_write_one_top(uint64_t chunks, uint64_t size) "Writing %" PRIu64 " chunks, (%" PRIu64 " MB)"
qemu_rdma_write_one_zero(uint64_t chunk, int len, int index, int64_t offset) "Entire chunk is zero, sending compress: %" PRIu64 " for %d bytes, index: %d, offset: %" PRId64
rdma_add_block(const char *block_name, int block, uint64_t addr, uint64_t offset, uint64_t len, uint64_t end, uint64_t bits, int chunks) "Added Block: '%s':%d, addr: %" PRIu64 ", offset: %" PRIu64 " length: %" PRIu64 " end: %" PRIu64 " bits %" PRIu64 " chunks %d"
rdma_block_notification_handle(const char *name, int index) "%s at %d"
rdma_delete_block(void *block, uint64_t addr, uint64_t offset, uint64_t len, uint64_t end, uint64_t bits, int chunks) "Deleted Block: %p, addr: %" PRIu64 ", offset: %" PRIu64 " length: %" PRIu64 " end: %" PRIu64 " bits %" PRIu64 " chunks %d"
rdma_registration_handle_compress(int64_t length, int index, int64_t offset) "Zapping zero chunk: %" PRId64 " bytes, index %d, offset %" PRId64
rdma_registration_handle_finished(void) ""
rdma_registration_handle_ram_blocks(void) ""
rdma_registration_handle_ram_blocks_loop(const char *name, uint64_t offset, uint64_t length, void *local_host_addr, unsigned int src_index) "%s: @0x%" PRIx64 "/%" PRIu64 " host:@%p src_index: %u"
rdma_registration_handle_register(int requests) "%d requests"
rdma_registration_handle_register_loop(int req, int index, uint64_t addr, uint64_t chunks) "Registration request (%d): index %d, current_addr %" PRIu64 " chunks: %" PRIu64
rdma_registration_handle_register_rkey(int rkey) "0x%x"
rdma_registration_handle_unregister(int requests) "%d requests"
rdma_registration_handle_unregister_loop(int count, int index, uint64_t chunk) "Unregistration request (%d): index %d, chunk %" PRIu64
rdma_registration_handle_unregister_success(uint64_t chunk) "%" PRIu64
rdma_registration_handle_wait(void) ""
rdma_registration_start(uint64_t flags) "%" PRIu64
rdma_registration_stop(uint64_t flags) "%" PRIu64
rdma_registration_stop_ram(void) ""
rdma_connect_incoming(void) ""
rdma_connect_incoming_after_dest_init(void) ""
rdma_connect_incoming_after_rdma_listen(void) ""
rdma_connect_outgoing_after_rdma_connect(void) ""
rdma_connect_outgoing_after_rdma_source_init(void) ""
# postcopy-ram.c
postcopy_discard_send_finish(const char *ramblock, int nwords, int ncmds) "%s mask words sent=%d in %d commands"
postcopy_discard_send_range(const char *ramblock, unsigned long start, unsigned long length) "%s:%lx/%lx"
postcopy_cleanup_range(const char *ramblock, void *host_addr, size_t offset, size_t length) "%s: %p offset=0x%zx length=0x%zx"
postcopy_init_range(const char *ramblock, void *host_addr, size_t offset, size_t length) "%s: %p offset=0x%zx length=0x%zx"
postcopy_nhp_range(const char *ramblock, void *host_addr, size_t offset, size_t length) "%s: %p offset=0x%zx length=0x%zx"
postcopy_place_page(void *host_addr) "host=%p"
postcopy_place_page_zero(void *host_addr) "host=%p"
postcopy_ram_enable_notify(void) ""
postcopy_pause_fault_thread(void) ""
postcopy_pause_fault_thread_continued(void) ""
postcopy_pause_fast_load(void) ""
postcopy_pause_fast_load_continued(void) ""
postcopy_ram_fault_thread_entry(void) ""
postcopy_ram_fault_thread_exit(void) ""
postcopy_ram_fault_thread_fds_core(int baseufd, int quitfd) "ufd: %d quitfd: %d"
postcopy_ram_fault_thread_fds_extra(size_t index, const char *name, int fd) "%zd/%s: %d"
postcopy_ram_fault_thread_quit(void) ""
postcopy_ram_fault_thread_request(uint64_t hostaddr, const char *ramblock, size_t offset, uint32_t pid) "Request for HVA=0x%" PRIx64 " rb=%s offset=0x%zx pid=%u"
postcopy_ram_incoming_cleanup_closeuf(void) ""
postcopy_ram_incoming_cleanup_entry(void) ""
postcopy_ram_incoming_cleanup_exit(void) ""
postcopy_ram_incoming_cleanup_join(void) ""
postcopy_ram_incoming_cleanup_blocktime(uint64_t total) "total blocktime %" PRIu64
postcopy_request_shared_page(const char *sharer, const char *rb, uint64_t rb_offset) "for %s in %s offset 0x%"PRIx64
postcopy_request_shared_page_present(const char *sharer, const char *rb, uint64_t rb_offset) "%s already %s offset 0x%"PRIx64
postcopy_wake_shared(uint64_t client_addr, const char *rb) "at 0x%"PRIx64" in %s"
postcopy_page_req_del(void *addr, int count) "resolved page req %p total %d"
postcopy_preempt_tls_handshake(void) ""
postcopy_preempt_new_channel(void) ""
postcopy_preempt_thread_entry(void) ""
postcopy_preempt_thread_exit(void) ""
postcopy_blocktime_tid_cpu_map(int cpu, uint32_t tid) "cpu: %d, tid: %u"
postcopy_blocktime_begin(uint64_t addr, uint64_t time, int cpu, bool exists) "addr: 0x%" PRIx64 ", time: %" PRIu64 ", cpu: %d, exist: %d"
postcopy_blocktime_end(uint64_t addr, uint64_t time, int affected_cpu, int affected_non_cpus) "addr: 0x%" PRIx64 ", time: %" PRIu64 ", affected_cpus: %d, affected_non_cpus: %d"
postcopy_blocktime_end_one(int cpu, uint8_t left_faults) "cpu: %d, left_faults: %" PRIu8
# exec.c
migration_exec_outgoing(const char *cmd) "cmd=%s"
migration_exec_incoming(const char *cmd) "cmd=%s"
# fd.c
migration_fd_outgoing(int fd) "fd=%d"
migration_fd_incoming(int fd) "fd=%d"
# file.c
migration_file_outgoing(const char *filename) "filename=%s"
migration_file_incoming(const char *filename) "filename=%s"
# socket.c
migration_socket_incoming_accepted(void) ""
migration_socket_outgoing_connected(void) ""
migration_socket_outgoing_error(const char *err) "error=%s"
# tls.c
migration_tls_outgoing_handshake_start(void) ""
migration_tls_outgoing_handshake_error(const char *err) "err=%s"
migration_tls_outgoing_handshake_complete(void) ""
migration_tls_incoming_handshake_start(void) ""
migration_tls_incoming_handshake_error(const char *err) "err=%s"
migration_tls_incoming_handshake_complete(void) ""
# colo.c
colo_vm_state_change(const char *old, const char *new) "Change '%s' => '%s'"
colo_send_message(const char *msg) "Send '%s' message"
colo_receive_message(const char *msg) "Receive '%s' message"
# colo-failover.c
colo_failover_set_state(const char *new_state) "new state %s"
# cpr.c
cpr_save_fd(const char *name, int id, int fd) "%s, id %d, fd %d"
cpr_delete_fd(const char *name, int id) "%s, id %d"
cpr_find_fd(const char *name, int id, int fd) "%s, id %d returns %d"
cpr_state_save(const char *mode) "%s mode"
cpr_state_load(const char *mode) "%s mode"
cpr_transfer_input(const char *path) "%s"
cpr_transfer_output(const char *path) "%s"
cpr_exec(void) ""
# block-dirty-bitmap.c
send_bitmap_header_enter(void) ""
send_bitmap_bits(uint32_t flags, uint64_t start_sector, uint32_t nr_sectors, uint64_t data_size) "flags: 0x%x, start_sector: %" PRIu64 ", nr_sectors: %" PRIu32 ", data_size: %" PRIu64
dirty_bitmap_save_iterate(int in_postcopy) "in postcopy: %d"
dirty_bitmap_save_complete_enter(void) ""
dirty_bitmap_save_complete_finish(void) ""
dirty_bitmap_state_pending(uint64_t pending) "pending %" PRIu64
dirty_bitmap_load_complete(void) ""
dirty_bitmap_load_bits_enter(uint64_t first_sector, uint32_t nr_sectors) "chunk: %" PRIu64 " %" PRIu32
dirty_bitmap_load_bits_zeroes(void) ""
dirty_bitmap_load_header(uint32_t flags) "flags 0x%x"
dirty_bitmap_load_enter(void) ""
dirty_bitmap_load_success(void) ""
# dirtyrate.c
dirtyrate_set_state(const char *new_state) "new state %s"
query_dirty_rate_info(const char *new_state) "current state %s"
get_ramblock_vfn_hash(const char *idstr, uint64_t vfn, uint32_t hash) "ramblock name: %s, vfn: %"PRIu64 ", hash: %" PRIu32
calc_page_dirty_rate(const char *idstr, uint32_t new_hash, uint32_t old_hash) "ramblock name: %s, new hash: %" PRIu32 ", old hash: %" PRIu32
skip_sample_ramblock(const char *idstr, uint64_t ramblock_size) "ramblock name: %s, ramblock size: %" PRIu64
find_page_matched(const char *idstr) "ramblock %s addr or size changed"
dirtyrate_calculate(int64_t dirtyrate) "dirty rate: %" PRIi64 " MB/s"
dirtyrate_do_calculate_vcpu(int idx, uint64_t rate) "vcpu[%d]: %"PRIu64 " MB/s"
# block.c
migration_block_init_shared(const char *blk_device_name) "Start migration for %s with shared base image"
migration_block_init_full(const char *blk_device_name) "Start full migration for %s"
migration_block_save_device_dirty(int64_t sector) "Error reading sector %" PRId64
migration_block_flush_blks(const char *action, int submitted, int read_done, int transferred) "%s submitted %d read_done %d transferred %d"
migration_block_save(const char *mig_stage, int submitted, int transferred) "Enter save live %s submitted %d transferred %d"
migration_block_save_complete(void) "Block migration completed"
migration_block_state_pending(uint64_t pending) "Enter save live pending %" PRIu64
migration_block_progression(unsigned percent) "Completed %u%%"
# page_cache.c
migration_pagecache_init(int64_t max_num_items) "Setting cache buckets to %" PRId64
migration_pagecache_insert(void) "Error allocating page"
# cpu-throttle.c
cpu_throttle_set(int new_throttle_pct) "set guest CPU throttled by %d%%"
cpu_throttle_dirty_sync(void) ""
# block-active.c
migration_block_activation(const char *name) "%s"