migration: introduce MIGRATION_STATUS_FAILING

When migration connection is broken, the QEMU and libvirtd(8)
process on the source side receive TCP connection reset
notification. QEMU sets the migration status to FAILED and
proceeds to migration_cleanup(). Meanwhile, Libvirtd(8) sends
a QMP command to migrate_set_capabilities().

The migration_cleanup() and qmp_migrate_set_capabilities()
calls race with each other. When the latter is invoked first,
since the migration is not running (FAILED), migration
capabilities are reset to false, so during migration_cleanup()
the QEMU process crashes with assertion failure.

Introduce a new migration status FAILING and use it as an
interim status when an error occurs. Once migration_cleanup()
is done, it sets the migration status to FAILED. This helps
to avoid the above race condition and ensuing failure.

Interim status FAILING is set wherever the execution moves
towards migration_cleanup():
  - postcopy_start()
  - migration_thread()
  - migration_cleanup()
  - multifd_send_setup()
  - bg_migration_thread()
  - migration_completion()
  - migration_detect_error()
  - bg_migration_completion()
  - multifd_send_error_propagate()
  - migration_connect_error_propagate()

The migration status finally moves to FAILED and reports an
appropriate error to the user.

Interim status FAILING is _NOT_ set in the following routines
because they do not follow the migration_cleanup() path to the
FAILED state:
  - cpr_exec_cb()
  - qemu_savevm_state()
  - postcopy_listen_thread()
  - process_incoming_migration_co()
  - multifd_recv_terminate_threads()
  - migration_channel_process_incoming()

Reviewed-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Prasad Pandit <pjp@fedoraproject.org>
Link: https://lore.kernel.org/qemu-devel/20260224102547.226087-1-ppandit@redhat.com
Signed-off-by: Fabiano Rosas <farosas@suse.de>
This commit is contained in:
Prasad Pandit
2026-02-24 15:55:47 +05:30
committed by Fabiano Rosas
parent 1ae4271ab8
commit 93c8a97f99
5 changed files with 32 additions and 19 deletions

View File

@@ -1016,6 +1016,7 @@ bool migration_is_running(void)
case MIGRATION_STATUS_DEVICE: case MIGRATION_STATUS_DEVICE:
case MIGRATION_STATUS_WAIT_UNPLUG: case MIGRATION_STATUS_WAIT_UNPLUG:
case MIGRATION_STATUS_CANCELLING: case MIGRATION_STATUS_CANCELLING:
case MIGRATION_STATUS_FAILING:
case MIGRATION_STATUS_COLO: case MIGRATION_STATUS_COLO:
return true; return true;
default: default:
@@ -1158,6 +1159,7 @@ static void fill_source_migration_info(MigrationInfo *info)
case MIGRATION_STATUS_POSTCOPY_PAUSED: case MIGRATION_STATUS_POSTCOPY_PAUSED:
case MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP: case MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP:
case MIGRATION_STATUS_POSTCOPY_RECOVER: case MIGRATION_STATUS_POSTCOPY_RECOVER:
case MIGRATION_STATUS_FAILING:
/* TODO add some postcopy stats */ /* TODO add some postcopy stats */
populate_time_info(info, s); populate_time_info(info, s);
populate_ram_info(info, s); populate_ram_info(info, s);
@@ -1210,6 +1212,7 @@ static void fill_destination_migration_info(MigrationInfo *info)
case MIGRATION_STATUS_POSTCOPY_PAUSED: case MIGRATION_STATUS_POSTCOPY_PAUSED:
case MIGRATION_STATUS_POSTCOPY_RECOVER: case MIGRATION_STATUS_POSTCOPY_RECOVER:
case MIGRATION_STATUS_FAILED: case MIGRATION_STATUS_FAILED:
case MIGRATION_STATUS_FAILING:
case MIGRATION_STATUS_COLO: case MIGRATION_STATUS_COLO:
info->has_status = true; info->has_status = true;
break; break;
@@ -1330,6 +1333,9 @@ static void migration_cleanup(MigrationState *s)
if (s->state == MIGRATION_STATUS_CANCELLING) { if (s->state == MIGRATION_STATUS_CANCELLING) {
migrate_set_state(&s->state, MIGRATION_STATUS_CANCELLING, migrate_set_state(&s->state, MIGRATION_STATUS_CANCELLING,
MIGRATION_STATUS_CANCELLED); MIGRATION_STATUS_CANCELLED);
} else if (s->state == MIGRATION_STATUS_FAILING) {
migrate_set_state(&s->state, MIGRATION_STATUS_FAILING,
MIGRATION_STATUS_FAILED);
} }
/* /*
@@ -1387,7 +1393,7 @@ void migration_connect_error_propagate(MigrationState *s, Error *error)
switch (current) { switch (current) {
case MIGRATION_STATUS_SETUP: case MIGRATION_STATUS_SETUP:
next = MIGRATION_STATUS_FAILED; next = MIGRATION_STATUS_FAILING;
break; break;
case MIGRATION_STATUS_POSTCOPY_PAUSED: case MIGRATION_STATUS_POSTCOPY_PAUSED:
@@ -1401,9 +1407,10 @@ void migration_connect_error_propagate(MigrationState *s, Error *error)
break; break;
case MIGRATION_STATUS_CANCELLING: case MIGRATION_STATUS_CANCELLING:
case MIGRATION_STATUS_FAILING:
/* /*
* Don't move out of CANCELLING, the only valid transition is to * Keep the current state, next transition is to be done
* CANCELLED, at migration_cleanup(). * in migration_cleanup().
*/ */
break; break;
@@ -1553,6 +1560,7 @@ bool migration_has_failed(MigrationState *s)
{ {
return (s->state == MIGRATION_STATUS_CANCELLING || return (s->state == MIGRATION_STATUS_CANCELLING ||
s->state == MIGRATION_STATUS_CANCELLED || s->state == MIGRATION_STATUS_CANCELLED ||
s->state == MIGRATION_STATUS_FAILING ||
s->state == MIGRATION_STATUS_FAILED); s->state == MIGRATION_STATUS_FAILED);
} }
@@ -2479,7 +2487,7 @@ static int postcopy_start(MigrationState *ms, Error **errp)
if (postcopy_preempt_establish_channel(ms)) { if (postcopy_preempt_establish_channel(ms)) {
if (ms->state != MIGRATION_STATUS_CANCELLING) { if (ms->state != MIGRATION_STATUS_CANCELLING) {
migrate_set_state(&ms->state, ms->state, migrate_set_state(&ms->state, ms->state,
MIGRATION_STATUS_FAILED); MIGRATION_STATUS_FAILING);
} }
error_setg(errp, "%s: Failed to establish preempt channel", error_setg(errp, "%s: Failed to establish preempt channel",
__func__); __func__);
@@ -2642,7 +2650,7 @@ fail_closefb:
qemu_fclose(fb); qemu_fclose(fb);
fail: fail:
if (ms->state != MIGRATION_STATUS_CANCELLING) { if (ms->state != MIGRATION_STATUS_CANCELLING) {
migrate_set_state(&ms->state, ms->state, MIGRATION_STATUS_FAILED); migrate_set_state(&ms->state, ms->state, MIGRATION_STATUS_FAILING);
} }
bql_unlock(); bql_unlock();
return -1; return -1;
@@ -2833,7 +2841,7 @@ fail:
} }
if (s->state != MIGRATION_STATUS_CANCELLING) { if (s->state != MIGRATION_STATUS_CANCELLING) {
migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILED); migrate_set_state(&s->state, s->state, MIGRATION_STATUS_FAILING);
} }
} }
@@ -2870,7 +2878,7 @@ static void bg_migration_completion(MigrationState *s)
fail: fail:
migrate_set_state(&s->state, current_active_state, migrate_set_state(&s->state, current_active_state,
MIGRATION_STATUS_FAILED); MIGRATION_STATUS_FAILING);
} }
typedef enum MigThrError { typedef enum MigThrError {
@@ -3071,7 +3079,7 @@ static MigThrError migration_detect_error(MigrationState *s)
* For precopy (or postcopy with error outside IO, or before dest * For precopy (or postcopy with error outside IO, or before dest
* starts), we fail with no time. * starts), we fail with no time.
*/ */
migrate_set_state(&s->state, state, MIGRATION_STATUS_FAILED); migrate_set_state(&s->state, state, MIGRATION_STATUS_FAILING);
trace_migration_thread_file_err(); trace_migration_thread_file_err();
/* Time to stop the migration, now. */ /* Time to stop the migration, now. */
@@ -3302,7 +3310,7 @@ static void migration_iteration_finish(MigrationState *s)
migrate_start_colo_process(s); migrate_start_colo_process(s);
s->vm_old_state = RUN_STATE_RUNNING; s->vm_old_state = RUN_STATE_RUNNING;
/* Fallthrough */ /* Fallthrough */
case MIGRATION_STATUS_FAILED: case MIGRATION_STATUS_FAILING:
case MIGRATION_STATUS_CANCELLED: case MIGRATION_STATUS_CANCELLED:
case MIGRATION_STATUS_CANCELLING: case MIGRATION_STATUS_CANCELLING:
if (!migration_block_activate(&local_err)) { if (!migration_block_activate(&local_err)) {
@@ -3368,7 +3376,7 @@ static void bg_migration_iteration_finish(MigrationState *s)
switch (s->state) { switch (s->state) {
case MIGRATION_STATUS_COMPLETED: case MIGRATION_STATUS_COMPLETED:
case MIGRATION_STATUS_ACTIVE: case MIGRATION_STATUS_ACTIVE:
case MIGRATION_STATUS_FAILED: case MIGRATION_STATUS_FAILING:
case MIGRATION_STATUS_CANCELLED: case MIGRATION_STATUS_CANCELLED:
case MIGRATION_STATUS_CANCELLING: case MIGRATION_STATUS_CANCELLING:
break; break;
@@ -3553,7 +3561,7 @@ static void *migration_thread(void *opaque)
if (ret) { if (ret) {
migrate_error_propagate(s, local_err); migrate_error_propagate(s, local_err);
migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE, migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE,
MIGRATION_STATUS_FAILED); MIGRATION_STATUS_FAILING);
goto out; goto out;
} }
@@ -3745,7 +3753,7 @@ fail:
/* local_err is guaranteed to be set when reaching here */ /* local_err is guaranteed to be set when reaching here */
migrate_error_propagate(s, local_err); migrate_error_propagate(s, local_err);
migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE, migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE,
MIGRATION_STATUS_FAILED); MIGRATION_STATUS_FAILING);
done: done:
bg_migration_iteration_finish(s); bg_migration_iteration_finish(s);

View File

@@ -431,7 +431,7 @@ static void multifd_send_error_propagate(Error *err)
s->state == MIGRATION_STATUS_DEVICE || s->state == MIGRATION_STATUS_DEVICE ||
s->state == MIGRATION_STATUS_ACTIVE) { s->state == MIGRATION_STATUS_ACTIVE) {
migrate_set_state(&s->state, s->state, migrate_set_state(&s->state, s->state,
MIGRATION_STATUS_FAILED); MIGRATION_STATUS_FAILING);
} }
} }
} }
@@ -986,7 +986,7 @@ bool multifd_send_setup(void)
err: err:
migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
MIGRATION_STATUS_FAILED); MIGRATION_STATUS_FAILING);
return false; return false;
} }

View File

@@ -158,7 +158,10 @@
# #
# @completed: migration is finished. # @completed: migration is finished.
# #
# @failed: some error occurred during migration process. # @failing: error occurred during migration, clean-up underway.
# (since 11.0)
#
# @failed: error occurred during migration, clean-up done.
# #
# @colo: VM is in the process of fault tolerance, VM can not get into # @colo: VM is in the process of fault tolerance, VM can not get into
# this state unless colo capability is enabled for migration. # this state unless colo capability is enabled for migration.
@@ -181,8 +184,8 @@
'data': [ 'none', 'setup', 'cancelling', 'cancelled', 'data': [ 'none', 'setup', 'cancelling', 'cancelled',
'active', 'postcopy-device', 'postcopy-active', 'active', 'postcopy-device', 'postcopy-active',
'postcopy-paused', 'postcopy-recover-setup', 'postcopy-paused', 'postcopy-recover-setup',
'postcopy-recover', 'completed', 'failed', 'colo', 'postcopy-recover', 'completed', 'failing', 'failed',
'pre-switchover', 'device', 'wait-unplug' ] } 'colo', 'pre-switchover', 'device', 'wait-unplug' ] }
## ##
# @VfioStats: # @VfioStats:

View File

@@ -241,7 +241,8 @@ void wait_for_migration_fail(QTestState *from, bool allow_active)
do { do {
status = migrate_query_status(from); status = migrate_query_status(from);
bool result = !strcmp(status, "setup") || !strcmp(status, "failed") || bool result = !strcmp(status, "setup") || !strcmp(status, "failed") ||
(allow_active && !strcmp(status, "active")); (allow_active && !strcmp(status, "active")) ||
!strcmp(status, "failing");
if (!result) { if (!result) {
fprintf(stderr, "%s: unexpected status status=%s allow_active=%d\n", fprintf(stderr, "%s: unexpected status status=%s allow_active=%d\n",
__func__, status, allow_active); __func__, status, allow_active);

View File

@@ -1247,7 +1247,7 @@ void migration_test_add_precopy(MigrationTestEnv *env)
} }
/* ensure new status don't go unnoticed */ /* ensure new status don't go unnoticed */
assert(MIGRATION_STATUS__MAX == 16); assert(MIGRATION_STATUS__MAX == 17);
for (int i = MIGRATION_STATUS_NONE; i < MIGRATION_STATUS__MAX; i++) { for (int i = MIGRATION_STATUS_NONE; i < MIGRATION_STATUS__MAX; i++) {
switch (i) { switch (i) {
@@ -1259,6 +1259,7 @@ void migration_test_add_precopy(MigrationTestEnv *env)
case MIGRATION_STATUS_POSTCOPY_PAUSED: case MIGRATION_STATUS_POSTCOPY_PAUSED:
case MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP: case MIGRATION_STATUS_POSTCOPY_RECOVER_SETUP:
case MIGRATION_STATUS_POSTCOPY_RECOVER: case MIGRATION_STATUS_POSTCOPY_RECOVER:
case MIGRATION_STATUS_FAILING:
continue; continue;
default: default:
migration_test_add_suffix("/migration/cancel/src/after/", migration_test_add_suffix("/migration/cancel/src/after/",