Source: Tencent Technology Engineering WeChat account
Author: draculaqian, backend development engineer at Tencent
Fixed Files Mode
Optimization Idea
Hoist non-critical logic out of the loop to simplify the critical path.
Optimization Implementation
An application can call the io_uring_register system call with the IORING_REGISTER_FILES opcode to register a set of files with the kernel, which ultimately calls io_sqe_files_register. The kernel then completes the basic per-file setup in batch at registration time: it fills in the fixed_file_data structure for this set of files, in which fixed_file_table is the file table being maintained (in kernel mode, to get at a file's information from a descriptor, fget is used to obtain the struct file corresponding to the fd number). Subsequent batched I/O no longer has to repeat this basic setup (more concretely, for example the fget/fput operations on the file). If the files being operated on are relatively fixed (for example database log files), this saves a measurable amount of I/O time.
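To see what this looks like from user space, below is a minimal sketch built on liburing helpers (io_uring_register_files and io_uring_prep_read; the file name data.log, the queue depth and the absence of error handling are illustrative assumptions, not something taken from the kernel code discussed here). The file set is registered once; on the hot path the SQE carries the index into the registered set plus the IOSQE_FIXED_FILE flag instead of a raw fd.

```c
#include <fcntl.h>
#include <unistd.h>
#include <liburing.h>

int fixed_files_example(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	char buf[4096];
	int fds[1];

	io_uring_queue_init(8, &ring, 0);

	/* one-time registration: the kernel fget()s each fd here */
	fds[0] = open("data.log", O_RDONLY);
	io_uring_register_files(&ring, fds, 1);

	/* hot path: the fd argument is the index into the registered set */
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_read(sqe, 0 /* index, not a raw fd */, buf, sizeof(buf), 0);
	sqe->flags |= IOSQE_FIXED_FILE;

	io_uring_submit(&ring);
	io_uring_wait_cqe(&ring, &cqe);
	io_uring_cqe_seen(&ring, cqe);

	io_uring_queue_exit(&ring);
	return 0;
}
```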
The fixed_file_data structure
```c
struct fixed_file_data {
	struct fixed_file_table		*table;
	struct io_ring_ctx		*ctx;

	struct fixed_file_ref_node	*node;
	struct percpu_ref		refs;
	struct completion		done;
	struct list_head		ref_list;
	spinlock_t			lock;
};
```
io_sqe_files_register implements the Fixed Files operation
```c
static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
				 unsigned nr_args)
{
	__s32 __user *fds = (__s32 __user *) arg;
	unsigned nr_tables, i;
	struct file *file;
	int fd, ret = -ENOMEM;
	struct fixed_file_ref_node *ref_node;
	struct fixed_file_data *file_data;

	if (ctx->file_data)
		return -EBUSY;
	if (!nr_args)
		return -EINVAL;
	if (nr_args > IORING_MAX_FIXED_FILES)
		return -EMFILE;

	file_data = kzalloc(sizeof(*ctx->file_data), GFP_KERNEL);
	if (!file_data)
		return -ENOMEM;
	file_data->ctx = ctx;
	init_completion(&file_data->done);
	INIT_LIST_HEAD(&file_data->ref_list);
	spin_lock_init(&file_data->lock);

	nr_tables = DIV_ROUND_UP(nr_args, IORING_MAX_FILES_TABLE);
	file_data->table = kcalloc(nr_tables, sizeof(*file_data->table),
				   GFP_KERNEL);
	if (!file_data->table)
		goto out_free;

	if (percpu_ref_init(&file_data->refs, io_file_ref_kill,
				PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
		goto out_free;

	if (io_sqe_alloc_file_tables(file_data, nr_tables, nr_args))
		goto out_ref;
	ctx->file_data = file_data;

	for (i = 0; i < nr_args; i++, ctx->nr_user_files++) {
		struct fixed_file_table *table;
		unsigned index;

		if (copy_from_user(&fd, &fds[i], sizeof(fd))) {
			ret = -EFAULT;
			goto out_fput;
		}
		/* allow sparse sets */
		if (fd == -1)
			continue;

		file = fget(fd);
		ret = -EBADF;
		if (!file)
			goto out_fput;

		/*
		 * Don't allow io_uring instances to be registered. If UNIX
		 * isn't enabled, then this causes a reference cycle and this
		 * instance can never get freed. If UNIX is enabled we'll
		 * handle it just fine, but there's still no point in allowing
		 * a ring fd as it doesn't support regular read/write anyway.
		 */
		if (file->f_op == &io_uring_fops) {
			fput(file);
			goto out_fput;
		}
		table = &file_data->table[i >> IORING_FILE_TABLE_SHIFT];
		index = i & IORING_FILE_TABLE_MASK;
		table->files[index] = file;
	}

	ret = io_sqe_files_scm(ctx);
	if (ret) {
		io_sqe_files_unregister(ctx);
		return ret;
	}

	ref_node = alloc_fixed_file_ref_node(ctx);
	if (IS_ERR(ref_node)) {
		io_sqe_files_unregister(ctx);
		return PTR_ERR(ref_node);
	}

	file_data->node = ref_node;
	spin_lock(&file_data->lock);
	list_add_tail(&ref_node->node, &file_data->ref_list);
	spin_unlock(&file_data->lock);
	percpu_ref_get(&file_data->refs);
	return ret;
out_fput:
	for (i = 0; i < ctx->nr_user_files; i++) {
		file = io_file_from_index(ctx, i);
		if (file)
			fput(file);
	}
	for (i = 0; i < nr_tables; i++)
		kfree(file_data->table[i].files);
	ctx->nr_user_files = 0;
out_ref:
	percpu_ref_exit(&file_data->refs);
out_free:
	kfree(file_data->table);
	kfree(file_data);
	ctx->file_data = NULL;
	return ret;
}
```
Fixed Buffers Mode
Optimization Idea
The idea is again to hoist non-critical logic out of the loop and simplify the critical path.
Optimization Implementation
If the virtual memory addresses the application submits to the kernel are fixed, the mapping from virtual addresses to physical pages can be completed ahead of time, pulling this non-critical path (which does not have to be done on every I/O) out of the critical I/O path and avoiding a translation on each I/O, which improves performance. After io_uring_setup, the application can call io_uring_register with the IORING_REGISTER_BUFFERS opcode to register a set of buffers with the kernel (the argument is an array of iovec describing the addresses to be mapped into the kernel), which ultimately calls io_sqe_buffer_register. The kernel then completes the basic buffer setup in batch at registration time: it reduces the get_user_pages/put_page overhead by calling get_user_pages ahead of time to obtain the physical pages backing the user-space virtual addresses, initializes the io_mapped_ubuf structure in the io_ring_ctx context that manages user-space buffers, performs the map/unmap, and records the IOV addresses and lengths. Subsequent batched I/O no longer has to repeat this kind of memory copying and basic-information checking.
When performing I/O, if the buffers involved are relatively fixed and the submitted virtual addresses have been registered before, the FIXED variants of the opcodes (IORING_OP_READ_FIXED / IORING_OP_WRITE_FIXED) can be used. The underlying call chain, io_issue_sqe -> io_read -> io_import_iovec -> __io_import_iovec -> io_import_fixed, then directly reuses the work already done, skipping the virtual-address-to-pages translation and saving a measurable amount of I/O time.
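As a rough user-space counterpart of the above, the sketch below registers one buffer with io_uring_register_buffers and then issues an IORING_OP_READ_FIXED via the liburing helper io_uring_prep_read_fixed (the buffer size, alignment and lack of error handling are illustrative assumptions):

```c
#include <stdlib.h>
#include <sys/uio.h>
#include <liburing.h>

int fixed_buffers_example(int fd)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	struct iovec iov;

	io_uring_queue_init(8, &ring, 0);

	/* one-time registration: pages get pinned via IORING_REGISTER_BUFFERS */
	iov.iov_len = 4096;
	posix_memalign(&iov.iov_base, 4096, iov.iov_len);
	io_uring_register_buffers(&ring, &iov, 1);

	/*
	 * hot path: IORING_OP_READ_FIXED reuses the pinned pages; the last
	 * argument selects which registered buffer (index 0 here) to use
	 */
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_read_fixed(sqe, fd, iov.iov_base, iov.iov_len, 0, 0);

	io_uring_submit(&ring);
	io_uring_wait_cqe(&ring, &cqe);
	io_uring_cqe_seen(&ring, cqe);

	io_uring_queue_exit(&ring);
	return 0;
}
```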
The io_mapped_ubuf structure
```c
struct io_mapped_ubuf {
	u64		ubuf;
	size_t		len;
	struct bio_vec	*bvec;
	unsigned int	nr_bvecs;
	unsigned long	acct_pages;
};
```
io_sqe_buffer_register implements the Fixed Buffers operation
```c
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
				  unsigned nr_args)
{
	struct vm_area_struct **vmas = NULL;
	struct page **pages = NULL;
	struct page *last_hpage = NULL;
	int i, j, got_pages = 0;
	int ret = -EINVAL;

	if (ctx->user_bufs)
		return -EBUSY;
	if (!nr_args || nr_args > UIO_MAXIOV)
		return -EINVAL;

	ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf),
					GFP_KERNEL);
	if (!ctx->user_bufs)
		return -ENOMEM;

	for (i = 0; i < nr_args; i++) {
		struct io_mapped_ubuf *imu = &ctx->user_bufs[i];
		unsigned long off, start, end, ubuf;
		int pret, nr_pages;
		struct iovec iov;
		size_t size;

		ret = io_copy_iov(ctx, &iov, arg, i);
		if (ret)
			goto err;

		/*
		 * Don't impose further limits on the size and buffer
		 * constraints here, we'll -EINVAL later when IO is
		 * submitted if they are wrong.
		 */
		ret = -EFAULT;
		if (!iov.iov_base || !iov.iov_len)
			goto err;

		/* arbitrary limit, but we need something */
		if (iov.iov_len > SZ_1G)
			goto err;

		ubuf = (unsigned long) iov.iov_base;
		end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
		start = ubuf >> PAGE_SHIFT;
		nr_pages = end - start;

		ret = 0;
		if (!pages || nr_pages > got_pages) {
			kvfree(vmas);
			kvfree(pages);
			pages = kvmalloc_array(nr_pages, sizeof(struct page *),
						GFP_KERNEL);
			vmas = kvmalloc_array(nr_pages,
					sizeof(struct vm_area_struct *),
					GFP_KERNEL);
			if (!pages || !vmas) {
				ret = -ENOMEM;
				goto err;
			}
			got_pages = nr_pages;
		}

		imu->bvec = kvmalloc_array(nr_pages, sizeof(struct bio_vec),
						GFP_KERNEL);
		ret = -ENOMEM;
		if (!imu->bvec)
			goto err;

		ret = 0;
		mmap_read_lock(current->mm);
		pret = pin_user_pages(ubuf, nr_pages,
				      FOLL_WRITE | FOLL_LONGTERM,
				      pages, vmas);
		if (pret == nr_pages) {
			/* don't support file backed memory */
			for (j = 0; j < nr_pages; j++) {
				struct vm_area_struct *vma = vmas[j];

				if (vma->vm_file &&
				    !is_file_hugepages(vma->vm_file)) {
					ret = -EOPNOTSUPP;
					break;
				}
			}
		} else {
			ret = pret < 0 ? pret : -EFAULT;
		}
		mmap_read_unlock(current->mm);
		if (ret) {
			/*
			 * if we did partial map, or found file backed vmas,
			 * release any pages we did get
			 */
			if (pret > 0)
				unpin_user_pages(pages, pret);
			kvfree(imu->bvec);
			goto err;
		}

		ret = io_buffer_account_pin(ctx, pages, pret, imu, &last_hpage);
		if (ret) {
			unpin_user_pages(pages, pret);
			kvfree(imu->bvec);
			goto err;
		}

		off = ubuf & ~PAGE_MASK;
		size = iov.iov_len;
		for (j = 0; j < nr_pages; j++) {
			size_t vec_len;

			vec_len = min_t(size_t, size, PAGE_SIZE - off);
			imu->bvec[j].bv_page = pages[j];
			imu->bvec[j].bv_len = vec_len;
			imu->bvec[j].bv_offset = off;
			off = 0;
			size -= vec_len;
		}
		/* store original address for later verification */
		imu->ubuf = ubuf;
		imu->len = iov.iov_len;
		imu->nr_bvecs = nr_pages;
		ctx->nr_user_bufs++;
	}
	kvfree(pages);
	kvfree(vmas);
	return 0;
err:
	kvfree(pages);
	kvfree(vmas);
	io_sqe_buffer_unregister(ctx);
	return ret;
}
```
Polled IO Mode
Optimization Idea
Spend the bulk of the CPU time on what matters and run the critical path at full speed.
When a request goes from incomplete to complete, its completion state has to be detected. In many cases an interrupt model is used: once the backend has finished processing, the kernel raises a SIGIO or an eventfd EPOLLIN notification to tell user space that data is ready to be handled. Interrupts, however, are fairly expensive; in a high-IOPS scenario they fire constantly, and the interrupt overhead outweighs the benefit.
We can be more aggressive and let the kernel reap block-layer requests in Polled IO mode. This speeds up I/O to a certain extent and is very useful for applications chasing low latency and high IOPS.
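Before diving into the kernel path below, here is a minimal user-space sketch of enabling this mode: the ring is created with IORING_SETUP_IOPOLL, and the file must be opened with O_DIRECT (the helper names are liburing's; the path, buffer alignment and missing error handling are assumptions made purely for brevity):

```c
#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>
#include <liburing.h>

int iopoll_example(const char *path, void *buf, unsigned len)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int fd;

	io_uring_queue_init(8, &ring, IORING_SETUP_IOPOLL);
	/* IOPOLL only works with O_DIRECT; buf must be suitably aligned */
	fd = open(path, O_RDONLY | O_DIRECT);

	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_read(sqe, fd, buf, len, 0);
	io_uring_submit(&ring);

	/*
	 * io_uring_wait_cqe() ends up in io_uring_enter(IORING_ENTER_GETEVENTS);
	 * with IORING_SETUP_IOPOLL set this spins in io_iopoll_check() instead
	 * of sleeping until an interrupt-driven completion arrives.
	 */
	io_uring_wait_cqe(&ring, &cqe);
	io_uring_cqe_seen(&ring, cqe);

	close(fd);
	io_uring_queue_exit(&ring);
	return 0;
}
```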
Optimization Implementation
With the flags set appropriately (IORING_ENTER_GETEVENTS, IORING_SETUP_IOPOLL and so on; in the code below IORING_SETUP_IOPOLL is set and IORING_SETUP_SQPOLL is not, i.e. no SQ thread is used), io_uring_enter calls io_iopoll_check.
```c
SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
		u32, min_complete, u32, flags, const sigset_t __user *, sig,
		size_t, sigsz)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	int submitted = 0;
	struct fd f;

	io_run_task_work();

	if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
			IORING_ENTER_SQ_WAIT))
		return -EINVAL;

	f = fdget(fd);
	if (!f.file)
		return -EBADF;

	ret = -EOPNOTSUPP;
	if (f.file->f_op != &io_uring_fops)
		goto out_fput;

	ret = -ENXIO;
	ctx = f.file->private_data;
	if (!percpu_ref_tryget(&ctx->refs))
		goto out_fput;

	ret = -EBADFD;
	if (ctx->flags & IORING_SETUP_R_DISABLED)
		goto out;

	/*
	 * For SQ polling, the thread will do all submissions and completions.
	 * Just return the requested submit count, and wake the thread if
	 * we were asked to.
	 */
	ret = 0;
	if (ctx->flags & IORING_SETUP_SQPOLL) {
		if (!list_empty_careful(&ctx->cq_overflow_list))
			io_cqring_overflow_flush(ctx, false, NULL, NULL);
		if (flags & IORING_ENTER_SQ_WAKEUP)
			wake_up(&ctx->sq_data->wait);
		if (flags & IORING_ENTER_SQ_WAIT)
			io_sqpoll_wait_sq(ctx);
		submitted = to_submit;
	} else if (to_submit) {
		ret = io_uring_add_task_file(ctx, f.file);
		if (unlikely(ret))
			goto out;
		mutex_lock(&ctx->uring_lock);
		submitted = io_submit_sqes(ctx, to_submit);
		mutex_unlock(&ctx->uring_lock);

		if (submitted != to_submit)
			goto out;
	}
	if (flags & IORING_ENTER_GETEVENTS) {
		min_complete = min(min_complete, ctx->cq_entries);

		/*
		 * When SETUP_IOPOLL and SETUP_SQPOLL are both enabled, user
		 * space applications don't need to do io completion events
		 * polling again, they can rely on io_sq_thread to do polling
		 * work, which can reduce cpu usage and uring_lock contention.
		 */
		if (ctx->flags & IORING_SETUP_IOPOLL &&
		    !(ctx->flags & IORING_SETUP_SQPOLL)) {
			ret = io_iopoll_check(ctx, min_complete);
		} else {
			ret = io_cqring_wait(ctx, min_complete, sig, sigsz);
		}
	}

out:
	percpu_ref_put(&ctx->refs);
out_fput:
	fdput(f);
	return submitted ? submitted : ret;
}
```
io_iopoll_check starts polling: user space can keep polling for the required number of completion events min_complete, and inside the loop the main call is io_iopoll_getevents.
```c
static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
{
	unsigned int nr_events = 0;
	int iters = 0, ret = 0;

	/*
	 * We disallow the app entering submit/complete with polling, but we
	 * still need to lock the ring to prevent racing with polled issue
	 * that got punted to a workqueue.
	 */
	mutex_lock(&ctx->uring_lock);
	do {
		/*
		 * Don't enter poll loop if we already have events pending.
		 * If we do, we can potentially be spinning for commands that
		 * already triggered a CQE (eg in error).
		 */
		if (io_cqring_events(ctx, false))
			break;

		/*
		 * If a submit got punted to a workqueue, we can have the
		 * application entering polling for a command before it gets
		 * issued. That app will hold the uring_lock for the duration
		 * of the poll right here, so we need to take a breather every
		 * now and then to ensure that the issue has a chance to add
		 * the poll to the issued list. Otherwise we can spin here
		 * forever, while the workqueue is stuck trying to acquire the
		 * very same mutex.
		 */
		if (!(++iters & 7)) {
			mutex_unlock(&ctx->uring_lock);
			io_run_task_work();
			mutex_lock(&ctx->uring_lock);
		}

		ret = io_iopoll_getevents(ctx, &nr_events, min);
		if (ret <= 0)
			break;
		ret = 0;
	} while (min && !nr_events && !need_resched());

	mutex_unlock(&ctx->uring_lock);
	return ret;
}
```
io_iopoll_getevents calls io_do_iopoll.
```c
/*
 * Poll for a minimum of 'min' events. Note that if min == 0 we consider that a
 * non-spinning poll check - we'll still enter the driver poll loop, but only
 * as a non-spinning completion check.
 */
static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events,
				long min)
{
	while (!list_empty(&ctx->iopoll_list) && !need_resched()) {
		int ret;

		ret = io_do_iopoll(ctx, nr_events, min);
		if (ret < 0)
			return ret;
		if (*nr_events >= min)
			return 0;
	}

	return 1;
}
```
In io_do_iopoll, kiocb->ki_filp->f_op->iopoll, i.e. blkdev_iopoll, keeps polling to check the completion state of the requests submitted to the block layer, until enough I/Os have completed.
```c
static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
			long min)
{
	struct io_kiocb *req, *tmp;
	LIST_HEAD(done);
	bool spin;
	int ret;

	/*
	 * Only spin for completions if we don't have multiple devices hanging
	 * off our complete list, and we're under the requested amount.
	 */
	spin = !ctx->poll_multi_file && *nr_events < min;

	ret = 0;
	list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) {
		struct kiocb *kiocb = &req->rw.kiocb;

		/*
		 * Move completed and retryable entries to our local lists.
		 * If we find a request that requires polling, break out
		 * and complete those lists first, if we have entries there.
		 */
		if (READ_ONCE(req->iopoll_completed)) {
			list_move_tail(&req->inflight_entry, &done);
			continue;
		}
		if (!list_empty(&done))
			break;

		ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin);
		if (ret < 0)
			break;

		/* iopoll may have completed current req */
		if (READ_ONCE(req->iopoll_completed))
			list_move_tail(&req->inflight_entry, &done);

		if (ret && spin)
			spin = false;
		ret = 0;
	}

	if (!list_empty(&done))
		io_iopoll_complete(ctx, nr_events, &done);

	return ret;
}
```
The block-layer file_operations.
```c
const struct file_operations def_blk_fops = {
	.open		= blkdev_open,
	.release	= blkdev_close,
	.llseek		= block_llseek,
	.read_iter	= blkdev_read_iter,
	.write_iter	= blkdev_write_iter,
	.iopoll		= blkdev_iopoll,
	.mmap		= generic_file_mmap,
	.fsync		= blkdev_fsync,
	.unlocked_ioctl	= block_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= compat_blkdev_ioctl,
#endif
	.splice_read	= generic_file_splice_read,
	.splice_write	= iter_file_splice_write,
	.fallocate	= blkdev_fallocate,
};
```
When Polled IO is used, most of the CPU time is spent in blkdev_iopoll; in other words, the critical path runs at full speed.
```c
static int blkdev_iopoll(struct kiocb *kiocb, bool wait)
{
	struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host);
	struct request_queue *q = bdev_get_queue(bdev);

	return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait);
}
```
Kernel Side Polling
With IORING_SETUP_SQPOLL, as soon as the application updates the SQ and fills in a new SQE, the kernel thread sq_thread submits it automatically, so the application does not have to call the io_uring_enter system call for every submission. The application can bind the thread to a specific CPU via IORING_SETUP_SQ_AFF and sq_thread_cpu.
On real machines there are not only high-IOPS workloads; in some scenarios IOPS can be very low for long stretches. To save CPU when there is no I/O, the kernel thread goes to sleep automatically after a period of idleness; when user space issues new I/O, it wakes the thread up via IORING_ENTER_SQ_WAKEUP.
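A minimal setup sketch, assuming liburing's io_uring_queue_init_params helper (the CPU number, idle timeout and queue depth below are arbitrary example values; depending on the kernel version SQPOLL may require elevated privileges):

```c
#include <string.h>
#include <liburing.h>

int sqpoll_setup_example(struct io_uring *ring)
{
	struct io_uring_params p;

	memset(&p, 0, sizeof(p));
	p.flags = IORING_SETUP_SQPOLL | IORING_SETUP_SQ_AFF;
	p.sq_thread_cpu = 1;      /* pin the sq_thread kernel thread to CPU 1 */
	p.sq_thread_idle = 2000;  /* let it sleep after ~2000 ms without work */

	/*
	 * After this, io_uring_submit() only needs io_uring_enter() with
	 * IORING_ENTER_SQ_WAKEUP when the kernel thread has gone to sleep.
	 */
	return io_uring_queue_init_params(64, ring, &p);
}
```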
Recap
As shown above, the kernel offers plenty of choices; the different schemes optimize from different angles, and they can be combined. Used sensibly, they let io_uring run at full speed.
The io_uring User-space Library: liburing
As noted earlier, simple does not necessarily mean easy to use: the io_uring interface is simple enough, but relative to that simplicity, having to mmap the rings by hand is somewhat involved. To make io_uring easier to use, its author Jens Axboe also developed the liburing library. liburing provides a set of helper functions for setup and memory mapping, so applications can use io_uring without knowing many of its details, for example without worrying about memory barriers or ring-buffer management. The advanced features mentioned above are also wrapped by liburing.
Core Data Structures
The core structures in liburing are io_uring, io_uring_sq and io_uring_cq.
```c
/*
 * Library interface to io_uring
 */
struct io_uring_sq {
	unsigned *khead;
	unsigned *ktail;
	unsigned *kring_mask;
	unsigned *kring_entries;
	unsigned *kflags;
	unsigned *kdropped;
	unsigned *array;
	struct io_uring_sqe *sqes;

	unsigned sqe_head;
	unsigned sqe_tail;

	size_t ring_sz;
};

struct io_uring_cq {
	unsigned *khead;
	unsigned *ktail;
	unsigned *kring_mask;
	unsigned *kring_entries;
	unsigned *koverflow;
	struct io_uring_cqe *cqes;

	size_t ring_sz;
};

struct io_uring {
	struct io_uring_sq sq;
	struct io_uring_cq cq;
	int ring_fd;
};
```
Core Interfaces
The interfaces are declared in the header linux/tools/io_uring/liburing.h; if liburing is installed the standard way, the header lives under /usr/include/.
```c
/*
 * System calls
 */
extern int io_uring_setup(unsigned entries, struct io_uring_params *p);
extern int io_uring_enter(int fd, unsigned to_submit, unsigned min_complete,
			  unsigned flags, sigset_t *sig);
extern int io_uring_register(int fd, unsigned int opcode, void *arg,
			     unsigned int nr_args);

/*
 * Library interface
 */
extern int io_uring_queue_init(unsigned entries, struct io_uring *ring,
			       unsigned flags);
extern int io_uring_queue_mmap(int fd, struct io_uring_params *p,
			       struct io_uring *ring);
extern void io_uring_queue_exit(struct io_uring *ring);
extern int io_uring_peek_cqe(struct io_uring *ring,
			     struct io_uring_cqe **cqe_ptr);
extern int io_uring_wait_cqe(struct io_uring *ring,
			     struct io_uring_cqe **cqe_ptr);
extern int io_uring_submit(struct io_uring *ring);
extern struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring);
```
Main Flow
- Use io_uring_queue_init to initialize the io_uring-related structures; its implementation calls mmap several times to set up the shared memory.
- Once initialized, to submit an I/O request you first need a free entry in the queue, obtained with io_uring_get_sqe.
- With a free entry in hand, use io_uring_prep_readv or io_uring_prep_writev to initialize a read or write request. The idea is similar to the preadv/pwritev interfaces mentioned earlier; both simply delegate to io_uring_prep_rw with different opcodes, and io_uring_prep_rw just initializes the io_uring_sqe (a simplified sketch follows this list).
- Once prepared, submit the request with io_uring_submit.
- After submitting, completion can be checked with the non-blocking io_uring_peek_cqe or the blocking io_uring_wait_cqe. By default a completed request stays in the internal queue until it is marked as consumed with io_uring_cqe_seen.
- When everything is done, release the resources with io_uring_queue_exit.
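As referenced in the third step above, the prep helpers are thin. The following is a simplified sketch of what io_uring_prep_rw and the readv/writev wrappers do (modeled on liburing's inline helpers and renamed sketch_* here to make clear it is illustrative; the exact set of fields that get zeroed varies between liburing versions):

```c
#include <string.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <linux/io_uring.h>

/* fill the shared io_uring_sqe fields; the real helper zeroes fields
 * individually rather than with memset */
static inline void sketch_prep_rw(int op, struct io_uring_sqe *sqe, int fd,
				  const void *addr, unsigned len, __u64 offset)
{
	memset(sqe, 0, sizeof(*sqe));
	sqe->opcode = op;
	sqe->fd = fd;
	sqe->off = offset;
	sqe->addr = (unsigned long) addr;
	sqe->len = len;
}

/* the readv/writev variants only differ in the opcode they pass down */
static inline void sketch_prep_readv(struct io_uring_sqe *sqe, int fd,
				     const struct iovec *iovecs,
				     unsigned nr_vecs, off_t offset)
{
	sketch_prep_rw(IORING_OP_READV, sqe, fd, iovecs, nr_vecs, offset);
}

static inline void sketch_prep_writev(struct io_uring_sqe *sqe, int fd,
				      const struct iovec *iovecs,
				      unsigned nr_vecs, off_t offset)
{
	sketch_prep_rw(IORING_OP_WRITEV, sqe, fd, iovecs, nr_vecs, offset);
}
```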
Core Implementation
The implementation of io_uring_queue_init was touched on earlier. It mainly consists of io_uring_setup and io_uring_queue_mmap; io_uring_setup has already been analysed, so here we look at io_uring_queue_mmap.
```c
/*
 * Returns -1 on error, or zero on success. On success, 'ring'
 * contains the necessary information to read/write to the rings.
 */
int io_uring_queue_init(unsigned entries, struct io_uring *ring, unsigned flags)
{
	struct io_uring_params p;
	int fd, ret;

	memset(&p, 0, sizeof(p));
	p.flags = flags;

	fd = io_uring_setup(entries, &p);
	if (fd < 0)
		return fd;

	ret = io_uring_queue_mmap(fd, &p, ring);
	if (ret)
		close(fd);

	return ret;
}
```
io_uring_queue_mmap initializes the io_uring structure and then mainly calls io_uring_mmap.
```c
/*
 * For users that want to specify sq_thread_cpu or sq_thread_idle, this
 * interface is a convenient helper for mmap()ing the rings.
 * Returns -1 on error, or zero on success. On success, 'ring'
 * contains the necessary information to read/write to the rings.
 */
int io_uring_queue_mmap(int fd, struct io_uring_params *p, struct io_uring *ring)
{
	int ret;

	memset(ring, 0, sizeof(*ring));
	ret = io_uring_mmap(fd, p, &ring->sq, &ring->cq);
	if (!ret)
		ring->ring_fd = fd;
	return ret;
}
```
io_uring_mmap maps the memory for the io_uring_sq and io_uring_cq structures, and additionally maps an array of io_uring_sqe structures.
```c
static int io_uring_mmap(int fd, struct io_uring_params *p,
			 struct io_uring_sq *sq, struct io_uring_cq *cq)
{
	size_t size;
	void *ptr;
	int ret;

	sq->ring_sz = p->sq_off.array + p->sq_entries * sizeof(unsigned);
	ptr = mmap(0, sq->ring_sz, PROT_READ | PROT_WRITE,
			MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);
	if (ptr == MAP_FAILED)
		return -errno;
	sq->khead = ptr + p->sq_off.head;
	sq->ktail = ptr + p->sq_off.tail;
	sq->kring_mask = ptr + p->sq_off.ring_mask;
	sq->kring_entries = ptr + p->sq_off.ring_entries;
	sq->kflags = ptr + p->sq_off.flags;
	sq->kdropped = ptr + p->sq_off.dropped;
	sq->array = ptr + p->sq_off.array;

	size = p->sq_entries * sizeof(struct io_uring_sqe);
	sq->sqes = mmap(0, size, PROT_READ | PROT_WRITE,
				MAP_SHARED | MAP_POPULATE, fd,
				IORING_OFF_SQES);
	if (sq->sqes == MAP_FAILED) {
		ret = -errno;
err:
		munmap(sq->khead, sq->ring_sz);
		return ret;
	}

	cq->ring_sz = p->cq_off.cqes + p->cq_entries * sizeof(struct io_uring_cqe);
	ptr = mmap(0, cq->ring_sz, PROT_READ | PROT_WRITE,
			MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING);
	if (ptr == MAP_FAILED) {
		ret = -errno;
		munmap(sq->sqes, p->sq_entries * sizeof(struct io_uring_sqe));
		goto err;
	}
	cq->khead = ptr + p->cq_off.head;
	cq->ktail = ptr + p->cq_off.tail;
	cq->kring_mask = ptr + p->cq_off.ring_mask;
	cq->kring_entries = ptr + p->cq_off.ring_entries;
	cq->koverflow = ptr + p->cq_off.overflow;
	cq->cqes = ptr + p->cq_off.cqes;
	return 0;
}
```
A Concrete Example
Below is a hello-world example based on liburing.
```c
#include <unistd.h>
#include <fcntl.h>
#include <string.h>
#include <stdio.h>
#include <liburing.h>

#define ENTRIES	4

int main(int argc, char *argv[])
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	struct iovec iov = {
		.iov_base = "Hello World",
		.iov_len = strlen("Hello World"),
	};
	int fd, ret;

	if (argc != 2) {
		printf("%s: <testfile>\n", argv[0]);
		return 1;
	}

	/* setup io_uring and do mmap */
	ret = io_uring_queue_init(ENTRIES, &ring, 0);
	if (ret < 0) {
		printf("io_uring_queue_init: %s\n", strerror(-ret));
		return 1;
	}

	/* O_CREAT requires an explicit mode; write to the file given on the command line */
	fd = open(argv[1], O_WRONLY | O_CREAT, 0644);
	if (fd < 0) {
		printf("open failed\n");
		ret = 1;
		goto exit;
	}

	/* get an sqe and fill in a WRITEV operation */
	sqe = io_uring_get_sqe(&ring);
	if (!sqe) {
		printf("io_uring_get_sqe failed\n");
		ret = 1;
		goto out;
	}
	io_uring_prep_writev(sqe, fd, &iov, 1, 0);

	/* tell the kernel we have an sqe ready for consumption */
	ret = io_uring_submit(&ring);
	if (ret < 0) {
		printf("io_uring_submit: %s\n", strerror(-ret));
		goto out;
	}

	/* wait for the sqe to complete */
	ret = io_uring_wait_cqe(&ring, &cqe);
	if (ret < 0) {
		printf("io_uring_wait_cqe: %s\n", strerror(-ret));
		goto out;
	}

	/* read and process cqe event */
	io_uring_cqe_seen(&ring, cqe);

out:
	close(fd);
exit:
	/* tear down */
	io_uring_queue_exit(&ring);
	return ret;
}
```
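One addition worth making to the example above (not part of the original listing): before marking the CQE as seen, the completion result in cqe->res should be checked, since it carries the byte count on success or a negative errno on failure. The snippet below shows the lines one would typically place right before io_uring_cqe_seen:

```c
	/* hypothetical addition to the example: check the completion result */
	if (cqe->res < 0)
		printf("writev failed: %s\n", strerror(-cqe->res));
	else
		printf("wrote %d bytes\n", cqe->res);
	io_uring_cqe_seen(&ring, cqe);
```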
More examples can be found at:
http://git.kernel.dk/cgit/lib...
https://git.kernel.dk/cgit/li...
Performance
Having walked through the design and implementation above, let us come back to the storage requirements: can the io_uring subsystem satisfy our extreme demand for high performance? That still has to be established by profiling.
Test Method
io_uring's original author Jens Axboe added ioengine=io_uring support to fio, so fio can be used for the tests, with the ioengine option selecting the asynchronous I/O engine.
The tests can be run over different I/O stacks:
- libaio
- kernel + io_uring
- kernel + io_uring polling mode
And on top of different hardware:
- NVMe SSD
- ...
The tests mainly cover 4k sequential read, sequential write, random read and random write, comparing the performance, QoS and other metrics of the different I/O engines.
An example of an io_uring polling-mode test:
fio -name=testname -filename=/mnt/vdd/testfilename -iodepth=64 -thread -rw=randread -ioengine=io_uring -sqthread_poll=1 -direct=1 -bs=4k -size=10G -numjobs=1 -runtime=600 -group_reporting
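fio's io_uring engine also exposes options for the other features discussed above (option names as documented by fio; availability depends on the fio version): hipri for block-layer completion polling (IORING_SETUP_IOPOLL), fixedbufs for registered buffers, and registerfiles for the fixed file set. An IOPOLL-style variant of the command above might look like:

fio -name=testname -filename=/mnt/vdd/testfilename -iodepth=64 -thread -rw=randread -ioengine=io_uring -hipri=1 -fixedbufs=1 -registerfiles=1 -direct=1 -bs=4k -size=10G -numjobs=1 -runtime=600 -group_reporting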
Test Results
A number of io_uring performance tests can be found online; some are listed here for reference:
- Improved Flash Performance Using the New Linux Kernel I/O Interface
- io_uring echo server benchmarks
- [PATCHSET v5] io_uring IO interface
- ...
The main findings are as follows:
- In non-polling mode, io_uring's performance improvement over libaio is not very significant.
- In polling mode, io_uring improves performance markedly, getting close to SPDK, and it does even better at higher queue depths.
- When the Meltdown and Spectre mitigations are not applied, io_uring's gain is not that large: although it removes a large number of user-to-kernel context switches, without those mitigations the cost of switching between user space and the kernel is relatively small to begin with, so the improvement is limited.
- In some scenarios, io_uring on top of the kernel NVMe driver can even outperform the SPDK user-space NVMe driver.
From the tests we can conclude that using io_uring instead of libaio for storage gives the application a significant performance boost.
Getting a sizeable improvement on the same hardware platform merely by switching the I/O engine is rare, and for a latency-sensitive application such as storage it is extremely valuable.
Advantages of io_uring
Putting the discussion and the tests together, io_uring's outstanding performance mainly comes from the following:
- The submission queue SQ and completion queue CQ are shared between user space and the kernel, achieving zero copy.
- I/O submission and reaping can be offloaded to the kernel, with no system call required.
- Polling mode is supported at the block layer.
- User-space memory addresses can be registered in advance, reducing address-mapping overhead.
- Unlike libaio, buffered I/O is supported.
Future Directions
The development of things is a philosophical topic. The previous sections described the fundamental driving force behind io_uring as a new thing, along with its internal and external causes; here is a brief outline of some foreseeable directions.
Adoption
Wider use in the application layer. Today io_uring is mainly used in storage scenarios, which demand not only high performance but also stability, and new things generally do not come with the "stable" attribute. io_uring, however, is also stable: although it introduces several new concepts, the new pieces, such as the eventfd notification mechanism and the SIGIO signal mechanism, have already been proven in practice, and it is broadly similar to AIO. It is a new thing that represents a qualitative leap.
At Tencent, our kernels are tlinux: tlinux3 is based on the 4.14.99 mainline and tlinux4 on the 5.4.23 mainline.
So tlinux3 can use native AIO, and from tlinux4 onward native io_uring is available.
We believe that, just as PostgreSQL adopted the then-new pread interface and Nginx adopted the then-new AIO interface, our projects can also reap substantial gains by adopting the new interface.
Optimization Directions
Reducing its own overhead
Keep driving down the system-call overhead, the copy overhead and the overhead of the framework itself.
Refactoring
"Politics are for the moment. An equation is for eternity.
——Albert Einstein
追求真理的人不可避免地追求永恒。“政治只是一时,方程却是永恒。”——爱因斯坦如是说,时值以色列的第一任总统魏兹曼于1952年逝世,继任首相古理安建议邀请爱因斯坦担任第二任总统。
我们说折衷权衡、精益求精,字里行间都是永恒,然而软件应该持续重构,这实际上并不只是io\_uring需要做的,有机会我会写一篇关于重构的文章。
Summary
This article first reviewed the history of I/O on Linux, the shortcomings of the synchronous I/O interfaces and of the native asynchronous interface AIO, and why the old approaches fall short. It then introduced the newest I/O engine, io_uring, from a design perspective. Finally, it dug into the latest kernel, linux-5.10, to examine the overall implementation of io_uring (key data structures, flows, feature implementations and so on).
About
Mistakes are inevitable and feedback is welcome; this article can be found at the address below.
The content will be updated; you are welcome to follow my public account and get in touch.
References
[PATCH 12/19] io_uring: add support for pre-mapped user IO buffers
Add pread/pwrite support bits to match the lseek bit
Toward non-blocking asynchronous I/O
A new kernel polling interface
Ringing in a new asynchronous I/O API
The current state of kernel page-table isolation
https://zhuanlan.zhihu.com/p/...
why we need io_uring? by byteisland
Computer Systems: A Programmer's Perspective, Third Edition
Advanced Programming in the UNIX Environment, Third Edition
The Linux Programming Interface: A Linux and UNIX System Programming Handbook
Understanding Nginx: Modules Development and Architecture Resolving (Second Edition)