linux/io_uring/io_uring.c

// SPDX-License-Identifier: GPL-2.0
/*
 * Shared application/kernel submission and completion ring pairs, for
 * supporting fast/efficient IO.
 *
 * A note on the read/write ordering memory barriers that are matched between
 * the application and kernel side.
 *
 * After the application reads the CQ ring tail, it must use an
 * appropriate smp_rmb() to pair with the smp_wmb() the kernel uses
 * before writing the tail (using smp_load_acquire to read the tail will
 * do). It also needs a smp_mb() before updating CQ head (ordering the
 * entry load(s) with the head store), pairing with an implicit barrier
 * through a control-dependency in io_get_cqe (smp_store_release to
 * store head will do). Failure to do so could lead to reading invalid
 * CQ entries.
 *
 * Likewise, the application must use an appropriate smp_wmb() before
 * writing the SQ tail (ordering SQ entry stores with the tail store),
 * which pairs with smp_load_acquire in io_get_sqring (smp_store_release
 * to store the tail will do). And it needs a barrier ordering the SQ
 * head load before writing new SQ entries (smp_load_acquire to read
 * head will do).
 *
 * When using the SQ poll thread (IORING_SETUP_SQPOLL), the application
 * needs to check the SQ flags for IORING_SQ_NEED_WAKEUP *after*
 * updating the SQ tail; a full memory barrier smp_mb() is needed
 * between.
 *
 * Also see the examples in the liburing library:
 *
 *	git://git.kernel.dk/liburing
 *
 * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens
 * from data shared between the kernel and application. This is done both
 * for ordering purposes and to ensure that once a value is loaded from
 * data that the application could potentially modify, it remains stable.
 *
 * Copyright (C) 2018-2019 Jens Axboe
 * Copyright (c) 2018-2019 Christoph Hellwig
 */
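/*
 * An illustrative userspace-side sketch of the CQ ordering described above.
 * It is not part of this file; it uses C11 atomics in place of the liburing
 * helpers, and handle_cqe() plus the ring pointers are hypothetical:
 *
 *	unsigned head = *cq_head;	// only the application writes head
 *	unsigned tail = atomic_load_explicit(cq_tail, memory_order_acquire);
 *
 *	while (head != tail) {
 *		struct io_uring_cqe *cqe = &cqes[head & *cq_mask];
 *		handle_cqe(cqe);	// CQE loads happen here
 *		head++;
 *	}
 *	// release store: orders the CQE loads above before the head update
 *	atomic_store_explicit(cq_head, head, memory_order_release);
 *
 * The SQ side is symmetric: fill in the SQE(s) first, then publish the new
 * SQ tail with a release store.
 */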
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <net/compat.h>
#include <linux/refcount.h>
#include <linux/uio.h>
#include <linux/bits.h>

#include <linux/sched/signal.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/bvec.h>
#include <linux/net.h>
#include <net/sock.h>
#include <linux/anon_inodes.h>
#include <linux/sched/mm.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/fsnotify.h>
#include <linux/fadvise.h>
#include <linux/task_work.h>
#include <linux/io_uring.h>
#include <linux/io_uring/cmd.h>
#include <linux/audit.h>
#include <linux/security.h>
#include <asm/shmparam.h>

#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>

#include <uapi/linux/io_uring.h>

#include "io-wq.h"

#include "io_uring.h"
#include "opdef.h"
#include "refs.h"
#include "tctx.h"
#include "register.h"
#include "sqpoll.h"
#include "fdinfo.h"
#include "kbuf.h"
#include "rsrc.h"
#include "cancel.h"
#include "net.h"
#include "notif.h"
#include "waitid.h"
#include "futex.h"
#include "napi.h"
#include "uring_cmd.h"
#include "msg_ring.h"
#include "memmap.h"

#include "timeout.h"
#include "poll.h"
#include "rw.h"
#include "alloc_cache.h"
#include "eventfd.h"

#define IORING_MAX_ENTRIES
#define IORING_MAX_CQ_ENTRIES

#define SQE_COMMON_FLAGS

#define SQE_VALID_FLAGS

#define IO_REQ_CLEAN_FLAGS

#define IO_REQ_CLEAN_SLOW_FLAGS

#define IO_TCTX_REFS_CACHE_NR

#define IO_COMPL_BATCH
#define IO_REQ_ALLOC_BATCH

struct io_defer_entry {};

/* requests with any of those set should undergo io_disarm_next() */
#define IO_DISARM_MASK
#define IO_REQ_LINK_FLAGS

/*
 * No waiters. It's larger than any valid value of the tw counter
 * so that tests against ->cq_wait_nr would fail and skip wake_up().
 */
#define IO_CQ_WAKE_INIT
/* Forced wake up if there is a waiter regardless of ->cq_wait_nr */
#define IO_CQ_WAKE_FORCE

static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
					 struct task_struct *task,
					 bool cancel_all);

static void io_queue_sqe(struct io_kiocb *req);

struct kmem_cache *req_cachep;
static struct workqueue_struct *iou_wq __ro_after_init;

static int __read_mostly sysctl_io_uring_disabled;
static int __read_mostly sysctl_io_uring_group =;

#ifdef CONFIG_SYSCTL
static struct ctl_table kernel_io_uring_disabled_table[] =;
#endif

static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
{}

static inline unsigned int __io_cqring_events_user(struct io_ring_ctx *ctx)
{}

static bool io_match_linked(struct io_kiocb *head)
{}

/*
 * As io_match_task() but protected against racing with linked timeouts.
 * User must not hold timeout_lock.
 */
bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
			bool cancel_all)
{}

static inline void req_fail_link_node(struct io_kiocb *req, int res)
{}

static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx)
{}

static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
{}

static __cold void io_fallback_req_func(struct work_struct *work)
{}

static int io_alloc_hash_table(struct io_hash_table *table, unsigned bits)
{}

static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
{}

static void io_account_cq_overflow(struct io_ring_ctx *ctx)
{}

static bool req_need_defer(struct io_kiocb *req, u32 seq)
{}

static void io_clean_op(struct io_kiocb *req)
{}

static inline void io_req_track_inflight(struct io_kiocb *req)
{}

static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
{}

static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
{}

static noinline void __io_arm_ltimeout(struct io_kiocb *req)
{}

static inline void io_arm_ltimeout(struct io_kiocb *req)
{}

static void io_prep_async_work(struct io_kiocb *req)
{}

static void io_prep_async_link(struct io_kiocb *req)
{}

static void io_queue_iowq(struct io_kiocb *req)
{}

static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
{}

void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
{}

static inline void __io_cq_lock(struct io_ring_ctx *ctx)
{}

static inline void io_cq_lock(struct io_ring_ctx *ctx)
	__acquires(ctx->completion_lock)
{}

static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx)
{}

static void io_cq_unlock_post(struct io_ring_ctx *ctx)
	__releases(ctx->completion_lock)
{}

static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool dying)
{}

static void io_cqring_overflow_kill(struct io_ring_ctx *ctx)
{}

static void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx)
{}

/* can be called by any task */
static void io_put_task_remote(struct task_struct *task)
{}

/* used by a task to put its own references */
static void io_put_task_local(struct task_struct *task)
{}

/* must be called reasonably soon after putting a request */
static inline void io_put_task(struct task_struct *task)
{}

void io_task_refs_refill(struct io_uring_task *tctx)
{}

static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
{}

static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
				     s32 res, u32 cflags, u64 extra1, u64 extra2)
{}

static void io_req_cqe_overflow(struct io_kiocb *req)
{}

/*
 * writes to the cq entry need to come after reading head; the
 * control dependency is enough as we're using WRITE_ONCE to
 * fill the cq entry
 */
bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow)
{}
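/*
 * A minimal sketch of the control-dependency pattern referred to above, with
 * hypothetical ring/field names; the branch on the head load orders the
 * WRITE_ONCE() stores inside it after that load:
 *
 *	unsigned head = READ_ONCE(r->head);
 *
 *	if (tail - head < r->cq_entries) {	// depends on the head load
 *		struct io_uring_cqe *cqe = &r->cqes[tail & r->mask];
 *
 *		WRITE_ONCE(cqe->user_data, user_data);
 *		WRITE_ONCE(cqe->res, res);
 *		WRITE_ONCE(cqe->flags, cflags);
 *	}
 */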

static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
			      u32 cflags)
{}

static bool __io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res,
			      u32 cflags)
{}

bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
{}

/*
 * Must be called from inline task_work so we know a flush will happen later,
 * and obviously with ctx->uring_lock held (tw always has that).
 */
void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
{}

/*
 * A helper for multishot requests posting additional CQEs.
 * Should only be used from a task_work including IO_URING_F_MULTISHOT.
 */
bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags)
{}

static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
{}

void io_req_defer_failed(struct io_kiocb *req, s32 res)
	__must_hold(&ctx->uring_lock)
{}

/*
 * Don't initialise the fields below on every allocation, but do that in
 * advance and keep them valid across allocations.
 */
static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
{}

/*
 * A request might get retired back into the request caches even before opcode
 * handlers and io_issue_sqe() are done with it, e.g. inline completion path.
 * Because of that, io_alloc_req() should be called only under ->uring_lock
 * and with extra caution to not get a request that is still worked on.
 */
__cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
	__must_hold(&ctx->uring_lock)
{}

__cold void io_free_req(struct io_kiocb *req)
{}

static void __io_req_find_next_prep(struct io_kiocb *req)
{}

static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
{}

static void ctx_flush_and_put(struct io_ring_ctx *ctx, struct io_tw_state *ts)
{}

/*
 * Run queued task_work, returning the number of entries processed in *count.
 * If more entries than max_entries are available, stop processing once this
 * is reached and return the rest of the list.
 */
struct llist_node *io_handle_tw_list(struct llist_node *node,
				     unsigned int *count,
				     unsigned int max_entries)
{}

/**
 * io_llist_xchg - swap all entries in a lock-less list
 * @head:	the head of the lock-less list whose current entries are detached
 * @new:	new entry to install as the head of the list
 *
 * If the list is empty, return NULL; otherwise return a pointer to the first
 * entry. The entries returned are ordered from the newest to the oldest added.
 */
static inline struct llist_node *io_llist_xchg(struct llist_head *head,
					       struct llist_node *new)
{}
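/*
 * Illustrative use, assuming a hypothetical process() callback: exchanging
 * the head with NULL atomically detaches every queued entry, which can then
 * be walked privately (newest entry first):
 *
 *	struct llist_node *node = io_llist_xchg(&ctx->work_llist, NULL);
 *
 *	while (node) {
 *		struct llist_node *next = node->next;
 *
 *		process(node);
 *		node = next;
 *	}
 */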

static __cold void io_fallback_tw(struct io_uring_task *tctx, bool sync)
{}

struct llist_node *tctx_task_work_run(struct io_uring_task *tctx,
				      unsigned int max_entries,
				      unsigned int *count)
{}

void tctx_task_work(struct callback_head *cb)
{}

static inline void io_req_local_work_add(struct io_kiocb *req,
					 struct io_ring_ctx *ctx,
					 unsigned flags)
{}

static void io_req_normal_work_add(struct io_kiocb *req)
{}

void __io_req_task_work_add(struct io_kiocb *req, unsigned flags)
{}

void io_req_task_work_add_remote(struct io_kiocb *req, struct io_ring_ctx *ctx,
				 unsigned flags)
{}

static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx)
{}

static bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events,
				       int min_events)
{}

static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts,
			       int min_events)
{}

static inline int io_run_local_work_locked(struct io_ring_ctx *ctx,
					   int min_events)
{}

static int io_run_local_work(struct io_ring_ctx *ctx, int min_events)
{}

static void io_req_task_cancel(struct io_kiocb *req, struct io_tw_state *ts)
{}

void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts)
{}

void io_req_task_queue_fail(struct io_kiocb *req, int ret)
{}

void io_req_task_queue(struct io_kiocb *req)
{}

void io_queue_next(struct io_kiocb *req)
{}

static void io_free_batch_list(struct io_ring_ctx *ctx,
			       struct io_wq_work_node *node)
	__must_hold(&ctx->uring_lock)
{}

void __io_submit_flush_completions(struct io_ring_ctx *ctx)
	__must_hold(&ctx->uring_lock)
{}

static unsigned io_cqring_events(struct io_ring_ctx *ctx)
{}

/*
 * We can't just wait for polled events to come to us; we have to actively
 * find and complete them.
 */
static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
{}

static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
{}

void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts)
{}

/*
 * After the iocb has been issued, it's safe for it to be found on the poll
 * list. Adding the kiocb to the list AFTER submission ensures that we don't
 * find it from an io_do_iopoll() thread before the issuer is done
 * accessing the kiocb cookie.
 */
static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags)
{}

io_req_flags_t io_file_get_flags(struct file *file)
{}

bool io_alloc_async_data(struct io_kiocb *req)
{}

static u32 io_get_sequence(struct io_kiocb *req)
{}

static __cold void io_drain_req(struct io_kiocb *req)
	__must_hold(&ctx->uring_lock)
{}

static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def,
			   unsigned int issue_flags)
{}

static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
{}

int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts)
{}

struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
{}

void io_wq_submit_work(struct io_wq_work *work)
{}

inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
				      unsigned int issue_flags)
{}

struct file *io_file_get_normal(struct io_kiocb *req, int fd)
{}

static void io_queue_async(struct io_kiocb *req, int ret)
	__must_hold(&req->ctx->uring_lock)
{}

static inline void io_queue_sqe(struct io_kiocb *req)
	__must_hold(&req->ctx->uring_lock)
{}

static void io_queue_sqe_fallback(struct io_kiocb *req)
	__must_hold(&req->ctx->uring_lock)
{}

/*
 * Check SQE restrictions (opcode and flags).
 *
 * Returns 'true' if SQE is allowed, 'false' otherwise.
 */
static inline bool io_check_restriction(struct io_ring_ctx *ctx,
					struct io_kiocb *req,
					unsigned int sqe_flags)
{}
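/*
 * For reference, a hedged sketch of how such restrictions are typically put
 * in place from userspace via io_uring_register(2) (the exact flow may
 * differ): the ring is created with IORING_SETUP_R_DISABLED, restrictions
 * are registered, then the ring is enabled.
 *
 *	struct io_uring_restriction res[] = {
 *		{ .opcode = IORING_RESTRICTION_SQE_OP,
 *		  .sqe_op = IORING_OP_READ },
 *		{ .opcode = IORING_RESTRICTION_SQE_FLAGS_ALLOWED,
 *		  .sqe_flags = IOSQE_FIXED_FILE },
 *	};
 *
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_REGISTER_RESTRICTIONS, res, 2);
 *	syscall(__NR_io_uring_register, ring_fd,
 *		IORING_REGISTER_ENABLE_RINGS, NULL, 0);
 */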

static void io_init_req_drain(struct io_kiocb *req)
{}

static __cold int io_init_fail_req(struct io_kiocb *req, int err)
{}

static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
		       const struct io_uring_sqe *sqe)
	__must_hold(&ctx->uring_lock)
{}

static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe,
				      struct io_kiocb *req, int ret)
{}

static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
			 const struct io_uring_sqe *sqe)
	__must_hold(&ctx->uring_lock)
{}

/*
 * Batched submission is done; ensure local IO is flushed out.
 */
static void io_submit_state_end(struct io_ring_ctx *ctx)
{}

/*
 * Start submission side cache.
 */
static void io_submit_state_start(struct io_submit_state *state,
				  unsigned int max_ios)
{}

static void io_commit_sqring(struct io_ring_ctx *ctx)
{}

/*
 * Fetch an sqe, if one is available. Note this returns a pointer to memory
 * that is mapped by userspace. This means that care needs to be taken to
 * ensure that reads are stable, as we cannot rely on userspace always
 * being a good citizen. If members of the sqe are validated and then later
 * used, it's important that those reads are done through READ_ONCE() to
 * prevent a re-load down the line.
 */
static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe)
{}
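/*
 * A minimal sketch of the pattern the comment above describes (hypothetical
 * names): snapshot the shared field once, validate the local copy, and only
 * ever use that copy afterwards:
 *
 *	u32 len = READ_ONCE(sqe->len);	// single load from shared memory
 *
 *	if (len > MAX_SUPPORTED_LEN)
 *		return -EINVAL;
 *	return do_transfer(buf, len);	// never re-reads sqe->len
 */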

int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
	__must_hold(&ctx->uring_lock)
{}

static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
			    int wake_flags, void *key)
{}

int io_run_task_work_sig(struct io_ring_ctx *ctx)
{}

static bool current_pending_io(void)
{}

/* when this returns >0, the caller should retry */
static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
					  struct io_wait_queue *iowq)
{}

/*
 * Wait until events become available, if we don't already have some. The
 * application must reap them itself, as they reside on the shared cq ring.
 */
static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
			  const sigset_t __user *sig, size_t sigsz,
			  struct __kernel_timespec __user *uts)
{}
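/*
 * From the application's point of view this is the getevents side of
 * io_uring_enter(2); an illustrative raw call (liburing wraps this):
 *
 *	// wait for at least one completion, then reap the CQ ring as in
 *	// the ordering sketch at the top of this file
 *	int ret = syscall(__NR_io_uring_enter, ring_fd, 0, 1,
 *			  IORING_ENTER_GETEVENTS, NULL, 0);
 */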

static void *io_rings_map(struct io_ring_ctx *ctx, unsigned long uaddr,
			  size_t size)
{}

static void *io_sqes_map(struct io_ring_ctx *ctx, unsigned long uaddr,
			 size_t size)
{}

static void io_rings_free(struct io_ring_ctx *ctx)
{}

static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries,
				unsigned int cq_entries, size_t *sq_offset)
{}

static void io_req_caches_free(struct io_ring_ctx *ctx)
{}

static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
{}

static __cold void io_activate_pollwq_cb(struct callback_head *cb)
{}

__cold void io_activate_pollwq(struct io_ring_ctx *ctx)
{}

static __poll_t io_uring_poll(struct file *file, poll_table *wait)
{}

struct io_tctx_exit {};

static __cold void io_tctx_exit_cb(struct callback_head *cb)
{}

static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
{}

static __cold void io_ring_exit_work(struct work_struct *work)
{}

static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
{}

static int io_uring_release(struct inode *inode, struct file *file)
{}

struct io_task_cancel {};

static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
{}

static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx,
					 struct task_struct *task,
					 bool cancel_all)
{}

static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
{}

static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
						struct task_struct *task,
						bool cancel_all)
{}

static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
{}

/*
 * Find any io_uring ctx that this task has registered or done IO on, and cancel
 * requests. @sqd should be non-NULL IFF this is an SQPOLL thread cancellation.
 */
__cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
{}

void __io_uring_cancel(bool cancel_all)
{}

static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz)
{}

static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
			  struct __kernel_timespec __user **ts,
			  const sigset_t __user **sig)
{}

SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
		u32, min_complete, u32, flags, const void __user *, argp,
		size_t, argsz)
{}

static const struct file_operations io_uring_fops =;

bool io_is_uring_fops(struct file *file)
{}

static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
					 struct io_uring_params *p)
{}

static int io_uring_install_fd(struct file *file)
{}

/*
 * Allocate an anonymous fd; this is what constitutes the application-visible
 * backing of an io_uring instance. The application mmaps this fd to gain
 * access to the SQ/CQ ring details.
 */
static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
{}
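/*
 * Illustrative userspace mappings of that fd (the sizes are derived from the
 * sq_off/cq_off data returned by io_uring_setup(2); error handling omitted):
 *
 *	void *sq_ring = mmap(NULL, sq_ring_sz, PROT_READ | PROT_WRITE,
 *			     MAP_SHARED | MAP_POPULATE, ring_fd,
 *			     IORING_OFF_SQ_RING);
 *	void *sqes = mmap(NULL, sqes_sz, PROT_READ | PROT_WRITE,
 *			  MAP_SHARED | MAP_POPULATE, ring_fd,
 *			  IORING_OFF_SQES);
 */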

static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
				  struct io_uring_params __user *params)
{}

/*
 * Sets up an io_uring context and returns the fd. The application asks for a
 * ring size; we return the actual sq/cq ring sizes (among other things) in
 * the params structure passed in.
 */
static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
{}
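/*
 * An illustrative userspace call sequence (raw syscall; liburing's
 * io_uring_queue_init() wraps this):
 *
 *	struct io_uring_params p = { 0 };
 *	int ring_fd = syscall(__NR_io_uring_setup, 256, &p);
 *
 *	// p.sq_entries/p.cq_entries now hold the sizes actually allocated,
 *	// and p.sq_off/p.cq_off the offsets used for the ring mmaps above.
 */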

static inline bool io_uring_allowed(void)
{}

SYSCALL_DEFINE2(io_uring_setup, u32, entries,
		struct io_uring_params __user *, params)
{}

static int __init io_uring_init(void)
{
#define __BUILD_BUG_VERIFY_OFFSET_SIZE(stype, eoffset, esize, ename) do { \
	BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
	BUILD_BUG_ON(sizeof_field(stype, ename) != esize); \
} while (0)

#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
	__BUILD_BUG_VERIFY_OFFSET_SIZE(struct io_uring_sqe, eoffset, sizeof(etype), ename)
#define BUILD_BUG_SQE_ELEM_SIZE(eoffset, esize, ename) \
	__BUILD_BUG_VERIFY_OFFSET_SIZE(struct io_uring_sqe, eoffset, esize, ename)
	BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
	BUILD_BUG_SQE_ELEM(0,  __u8,   opcode);
	BUILD_BUG_SQE_ELEM(1,  __u8,   flags);
	BUILD_BUG_SQE_ELEM(2,  __u16,  ioprio);
	BUILD_BUG_SQE_ELEM(4,  __s32,  fd);
	BUILD_BUG_SQE_ELEM(8,  __u64,  off);
	BUILD_BUG_SQE_ELEM(8,  __u64,  addr2);
	BUILD_BUG_SQE_ELEM(8,  __u32,  cmd_op);
	BUILD_BUG_SQE_ELEM(12, __u32, __pad1);
	BUILD_BUG_SQE_ELEM(16, __u64,  addr);
	BUILD_BUG_SQE_ELEM(16, __u64,  splice_off_in);
	BUILD_BUG_SQE_ELEM(24, __u32,  len);
	BUILD_BUG_SQE_ELEM(28,     __kernel_rwf_t, rw_flags);
	BUILD_BUG_SQE_ELEM(28, /* compat */   int, rw_flags);
	BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
	BUILD_BUG_SQE_ELEM(28, __u32,  fsync_flags);
	BUILD_BUG_SQE_ELEM(28, /* compat */ __u16,  poll_events);
	BUILD_BUG_SQE_ELEM(28, __u32,  poll32_events);
	BUILD_BUG_SQE_ELEM(28, __u32,  sync_range_flags);
	BUILD_BUG_SQE_ELEM(28, __u32,  msg_flags);
	BUILD_BUG_SQE_ELEM(28, __u32,  timeout_flags);
	BUILD_BUG_SQE_ELEM(28, __u32,  accept_flags);
	BUILD_BUG_SQE_ELEM(28, __u32,  cancel_flags);
	BUILD_BUG_SQE_ELEM(28, __u32,  open_flags);
	BUILD_BUG_SQE_ELEM(28, __u32,  statx_flags);
	BUILD_BUG_SQE_ELEM(28, __u32,  fadvise_advice);
	BUILD_BUG_SQE_ELEM(28, __u32,  splice_flags);
	BUILD_BUG_SQE_ELEM(28, __u32,  rename_flags);
	BUILD_BUG_SQE_ELEM(28, __u32,  unlink_flags);
	BUILD_BUG_SQE_ELEM(28, __u32,  hardlink_flags);
	BUILD_BUG_SQE_ELEM(28, __u32,  xattr_flags);
	BUILD_BUG_SQE_ELEM(28, __u32,  msg_ring_flags);
	BUILD_BUG_SQE_ELEM(32, __u64,  user_data);
	BUILD_BUG_SQE_ELEM(40, __u16,  buf_index);
	BUILD_BUG_SQE_ELEM(40, __u16,  buf_group);
	BUILD_BUG_SQE_ELEM(42, __u16,  personality);
	BUILD_BUG_SQE_ELEM(44, __s32,  splice_fd_in);
	BUILD_BUG_SQE_ELEM(44, __u32,  file_index);
	BUILD_BUG_SQE_ELEM(44, __u16,  addr_len);
	BUILD_BUG_SQE_ELEM(46, __u16,  __pad3[0]);
	BUILD_BUG_SQE_ELEM(48, __u64,  addr3);
	BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd);
	BUILD_BUG_SQE_ELEM(56, __u64,  __pad2);

	BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
		     sizeof(struct io_uring_rsrc_update));
	BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) >
		     sizeof(struct io_uring_rsrc_update2));

	/* ->buf_index is u16 */
	BUILD_BUG_ON(offsetof(struct io_uring_buf_ring, bufs) != 0);
	BUILD_BUG_ON(offsetof(struct io_uring_buf, resv) !=
		     offsetof(struct io_uring_buf_ring, tail));

	/* should fit into one byte */
	BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
	BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8));
	BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS);

	BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof_field(struct io_kiocb, flags));

	BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32));

	/* top 8bits are for internal use */
	BUILD_BUG_ON((IORING_URING_CMD_MASK & 0xff000000) != 0);

	io_uring_optable_init();

	/*
	 * Allow user copy in the per-command field, which starts after the
	 * file member in io_kiocb and runs up to the opcode field. The
	 * openat2 handling requires copying user memory into the io_kiocb
	 * object in that range, and HARDENED_USERCOPY will complain if we
	 * haven't correctly annotated this range.
	 */
	req_cachep = kmem_cache_create_usercopy("io_kiocb",
				sizeof(struct io_kiocb), 0,
				SLAB_HWCACHE_ALIGN | SLAB_PANIC |
				SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU,
				offsetof(struct io_kiocb, cmd.data),
				sizeof_field(struct io_kiocb, cmd.data), NULL);
	io_buf_cachep = KMEM_CACHE(io_buffer,
					  SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);

	iou_wq = alloc_workqueue("iou_exit", WQ_UNBOUND, 64);

#ifdef CONFIG_SYSCTL
	register_sysctl_init("kernel", kernel_io_uring_disabled_table);
#endif

	return 0;
}
__initcall(io_uring_init);