/*
 * Linux io_uring support.
 *
 * Copyright (C) 2009 IBM, Corp.
 * Copyright (C) 2009 Red Hat, Inc.
 * Copyright (C) 2019 Aarushi Mehta
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */
#include "qemu/osdep.h"
#include <liburing.h>
#include "qemu/aio.h"
#include "block/block.h"
#include "block/raw-aio.h"
#include "qemu/coroutine.h"
#include "system/block-backend.h"
#include "trace.h"

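/*
 * Per-request state that lives from submission in luring_co_submit() until
 * luring_cqe_handler() fills in ->ret and wakes the coroutine.
 */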
typedef struct {
    Coroutine *co;
    QEMUIOVector *qiov;
    uint64_t offset;
    ssize_t ret;
    int type;
    int fd;
    BdrvRequestFlags flags;

    /*
     * Buffered reads may require resubmission, see
     * luring_resubmit_short_read().
     */
    int total_read;
    QEMUIOVector resubmit_qiov;

    CqeHandler cqe_handler;
} LuringRequest;

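/*
 * Fill in an io_uring SQE for the request described by @opaque. This is the
 * prep callback passed to aio_add_sqe(), invoked once a free submission queue
 * entry is available.
 */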
static void luring_prep_sqe(struct io_uring_sqe *sqe, void *opaque)
{
    LuringRequest *req = opaque;
    QEMUIOVector *qiov = req->qiov;
    uint64_t offset = req->offset;
    int fd = req->fd;
    BdrvRequestFlags flags = req->flags;

    switch (req->type) {
    case QEMU_AIO_WRITE:
    {
        int luring_flags = (flags & BDRV_REQ_FUA) ? RWF_DSYNC : 0;

        if (luring_flags != 0 || qiov->niov > 1) {
#ifdef HAVE_IO_URING_PREP_WRITEV2
            io_uring_prep_writev2(sqe, fd, qiov->iov,
                                  qiov->niov, offset, luring_flags);
#else
            /*
             * FUA should only be enabled with HAVE_IO_URING_PREP_WRITEV2, see
             * luring_has_fua().
             */
            assert(luring_flags == 0);

            io_uring_prep_writev(sqe, fd, qiov->iov, qiov->niov, offset);
#endif
        } else {
            /* The man page says non-vectored is faster than vectored */
            struct iovec *iov = qiov->iov;
            io_uring_prep_write(sqe, fd, iov->iov_base, iov->iov_len, offset);
        }
        break;
    }
    case QEMU_AIO_ZONE_APPEND:
        io_uring_prep_writev(sqe, fd, qiov->iov, qiov->niov, offset);
        break;
    case QEMU_AIO_READ:
    {
        if (req->resubmit_qiov.iov != NULL) {
            qiov = &req->resubmit_qiov;
        }
        if (qiov->niov > 1) {
            io_uring_prep_readv(sqe, fd, qiov->iov, qiov->niov,
                                offset + req->total_read);
        } else {
            /* The man page says non-vectored is faster than vectored */
            struct iovec *iov = qiov->iov;
            io_uring_prep_read(sqe, fd, iov->iov_base, iov->iov_len,
                               offset + req->total_read);
        }
        break;
    }
    case QEMU_AIO_FLUSH:
        io_uring_prep_fsync(sqe, fd, IORING_FSYNC_DATASYNC);
        break;
    default:
        fprintf(stderr, "%s: invalid AIO request type, aborting 0x%x.\n",
                __func__, req->type);
        abort();
    }
}

/**
 * luring_resubmit_short_read:
 *
 * Short reads are rare but may occur. The remaining read request needs to be
 * resubmitted.
 */
static void luring_resubmit_short_read(LuringRequest *req, int nread)
{
    QEMUIOVector *resubmit_qiov;
    size_t remaining;

    trace_luring_resubmit_short_read(req, nread);

    /* Update read position */
    req->total_read += nread;
    remaining = req->qiov->size - req->total_read;

    /* Shorten qiov */
    resubmit_qiov = &req->resubmit_qiov;
    if (resubmit_qiov->iov == NULL) {
        qemu_iovec_init(resubmit_qiov, req->qiov->niov);
    } else {
        qemu_iovec_reset(resubmit_qiov);
    }
    qemu_iovec_concat(resubmit_qiov, req->qiov, req->total_read, remaining);

    aio_add_sqe(luring_prep_sqe, req, &req->cqe_handler);
}

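/*
 * Completion callback for a LuringRequest's CQE. Resubmits on spurious
 * -EINTR/-EAGAIN and on short reads, otherwise records the final result in
 * req->ret and wakes the submitting coroutine if it has already yielded.
 */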
static void luring_cqe_handler(CqeHandler *cqe_handler)
{
    LuringRequest *req = container_of(cqe_handler, LuringRequest, cqe_handler);
    int ret = cqe_handler->cqe.res;

    trace_luring_cqe_handler(req, ret);

    if (ret < 0) {
        /*
         * Only writev/readv/fsync requests on regular files or host block
         * devices are submitted. Therefore -EAGAIN is not expected but it's
         * known to happen sometimes with Linux SCSI. Submit again and hope
         * the request completes successfully.
         *
         * For more information, see:
         * https://lore.kernel.org/io-uring/20210727165811.284510-3-axboe@kernel.dk/T/#u
         *
         * If the code is changed to submit other types of requests in the
         * future, then this workaround may need to be extended to deal with
         * genuine -EAGAIN results that should not be resubmitted
         * immediately.
         */
        if (ret == -EINTR || ret == -EAGAIN) {
            aio_add_sqe(luring_prep_sqe, req, &req->cqe_handler);
            return;
        }
    } else if (req->qiov) {
        /* total_read is non-zero only for resubmitted read requests */
        int total_bytes = ret + req->total_read;

        if (total_bytes == req->qiov->size) {
            ret = 0;
        } else {
            /* Short Read/Write */
            if (req->type == QEMU_AIO_READ) {
                if (ret > 0) {
                    luring_resubmit_short_read(req, ret);
                    return;
                }

                /* Pad with zeroes */
                qemu_iovec_memset(req->qiov, total_bytes, 0,
                                  req->qiov->size - total_bytes);
                ret = 0;
            } else {
                ret = -ENOSPC;
            }
        }
    }

    req->ret = ret;
    qemu_iovec_destroy(&req->resubmit_qiov);

    /*
     * If the coroutine is already entered it must be in luring_co_submit() and
     * will notice req->ret has been filled in when it eventually runs later.
     * Coroutines cannot be entered recursively so avoid doing that!
     */
    if (!qemu_coroutine_entered(req->co)) {
        aio_co_wake(req->co);
    }
}

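/*
 * Submit an I/O request and yield until it completes. Must be called from
 * coroutine context; returns 0 on success or a negative errno value.
 *
 * A hypothetical file-posix style caller might look like:
 *
 *     return luring_co_submit(bs, s->fd, offset, qiov, QEMU_AIO_WRITE, flags);
 */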
int coroutine_fn luring_co_submit(BlockDriverState *bs, int fd,
                                  uint64_t offset, QEMUIOVector *qiov,
                                  int type, BdrvRequestFlags flags)
{
    LuringRequest req = {
        .co = qemu_coroutine_self(),
        .qiov = qiov,
        .ret = -EINPROGRESS,
        .type = type,
        .fd = fd,
        .offset = offset,
        .flags = flags,
    };

    req.cqe_handler.cb = luring_cqe_handler;

    trace_luring_co_submit(bs, &req, fd, offset, qiov ? qiov->size : 0, type);
    aio_add_sqe(luring_prep_sqe, &req, &req.cqe_handler);

    if (req.ret == -EINPROGRESS) {
        qemu_coroutine_yield();
    }
    return req.ret;
}

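/*
 * FUA writes are implemented with RWF_DSYNC, which requires
 * io_uring_prep_writev2() and therefore liburing 2.2 or newer. Report whether
 * this build can honour BDRV_REQ_FUA natively so callers can fall back to a
 * separate flush when it cannot.
 */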
bool luring_has_fua(void)
{
#ifdef HAVE_IO_URING_PREP_WRITEV2
    return true;
#else
    return false;
#endif
}