#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <net/compat.h>
#include <linux/refcount.h>
#include <linux/uio.h>
#include <linux/bits.h>
#include <linux/sched/signal.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/bvec.h>
#include <linux/net.h>
#include <net/sock.h>
#include <linux/anon_inodes.h>
#include <linux/sched/mm.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/fsnotify.h>
#include <linux/fadvise.h>
#include <linux/task_work.h>
#include <linux/io_uring.h>
#include <linux/io_uring/cmd.h>
#include <linux/audit.h>
#include <linux/security.h>
#include <asm/shmparam.h>
#define CREATE_TRACE_POINTS
#include <trace/events/io_uring.h>
#include <uapi/linux/io_uring.h>
#include "io-wq.h"
#include "io_uring.h"
#include "opdef.h"
#include "refs.h"
#include "tctx.h"
#include "register.h"
#include "sqpoll.h"
#include "fdinfo.h"
#include "kbuf.h"
#include "rsrc.h"
#include "cancel.h"
#include "net.h"
#include "notif.h"
#include "waitid.h"
#include "futex.h"
#include "napi.h"
#include "uring_cmd.h"
#include "msg_ring.h"
#include "memmap.h"
#include "timeout.h"
#include "poll.h"
#include "rw.h"
#include "alloc_cache.h"
#include "eventfd.h"
#define IORING_MAX_ENTRIES …
#define IORING_MAX_CQ_ENTRIES …
#define SQE_COMMON_FLAGS …
#define SQE_VALID_FLAGS …
#define IO_REQ_CLEAN_FLAGS …
#define IO_REQ_CLEAN_SLOW_FLAGS …
#define IO_TCTX_REFS_CACHE_NR …
#define IO_COMPL_BATCH …
#define IO_REQ_ALLOC_BATCH …
struct io_defer_entry { … };
#define IO_DISARM_MASK …
#define IO_REQ_LINK_FLAGS …
#define IO_CQ_WAKE_INIT …
#define IO_CQ_WAKE_FORCE …
static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
struct task_struct *task,
bool cancel_all);
static void io_queue_sqe(struct io_kiocb *req);
struct kmem_cache *req_cachep;
static struct workqueue_struct *iou_wq __ro_after_init;
static int __read_mostly sysctl_io_uring_disabled;
static int __read_mostly sysctl_io_uring_group = …;
#ifdef CONFIG_SYSCTL
static struct ctl_table kernel_io_uring_disabled_table[] = …;
#endif
static inline unsigned int __io_cqring_events(struct io_ring_ctx *ctx)
{ … }
static inline unsigned int __io_cqring_events_user(struct io_ring_ctx *ctx)
{ … }
static bool io_match_linked(struct io_kiocb *head)
{ … }
bool io_match_task_safe(struct io_kiocb *head, struct task_struct *task,
bool cancel_all)
{ … }
static inline void req_fail_link_node(struct io_kiocb *req, int res)
{ … }
static inline void io_req_add_to_cache(struct io_kiocb *req, struct io_ring_ctx *ctx)
{ … }
static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref)
{ … }
static __cold void io_fallback_req_func(struct work_struct *work)
{ … }
static int io_alloc_hash_table(struct io_hash_table *table, unsigned bits)
{ … }
static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
{ … }
static void io_account_cq_overflow(struct io_ring_ctx *ctx)
{ … }
static bool req_need_defer(struct io_kiocb *req, u32 seq)
{ … }
static void io_clean_op(struct io_kiocb *req)
{ … }
static inline void io_req_track_inflight(struct io_kiocb *req)
{ … }
static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req)
{ … }
static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
{ … }
static noinline void __io_arm_ltimeout(struct io_kiocb *req)
{ … }
static inline void io_arm_ltimeout(struct io_kiocb *req)
{ … }
static void io_prep_async_work(struct io_kiocb *req)
{ … }
static void io_prep_async_link(struct io_kiocb *req)
{ … }
static void io_queue_iowq(struct io_kiocb *req)
{ … }
static void io_req_queue_iowq_tw(struct io_kiocb *req, struct io_tw_state *ts)
{ … }
void io_req_queue_iowq(struct io_kiocb *req)
{ … }
static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
{ … }
void __io_commit_cqring_flush(struct io_ring_ctx *ctx)
{ … }
static inline void __io_cq_lock(struct io_ring_ctx *ctx)
{ … }
static inline void io_cq_lock(struct io_ring_ctx *ctx)
__acquires(ctx->completion_lock)
{ … }
static inline void __io_cq_unlock_post(struct io_ring_ctx *ctx)
{ … }
static void io_cq_unlock_post(struct io_ring_ctx *ctx)
__releases(ctx->completion_lock)
{ … }
static void __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool dying)
{ … }
static void io_cqring_overflow_kill(struct io_ring_ctx *ctx)
{ … }
static void io_cqring_do_overflow_flush(struct io_ring_ctx *ctx)
{ … }
static void io_put_task_remote(struct task_struct *task)
{ … }
static void io_put_task_local(struct task_struct *task)
{ … }
static inline void io_put_task(struct task_struct *task)
{ … }
void io_task_refs_refill(struct io_uring_task *tctx)
{ … }
static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
{ … }
static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
s32 res, u32 cflags, u64 extra1, u64 extra2)
{ … }
static void io_req_cqe_overflow(struct io_kiocb *req)
{ … }
bool io_cqe_cache_refill(struct io_ring_ctx *ctx, bool overflow)
{ … }
static bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res,
u32 cflags)
{ … }
static bool __io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res,
u32 cflags)
{ … }
bool io_post_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
{ … }
void io_add_aux_cqe(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags)
{ … }
bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags)
{ … }
static void io_req_complete_post(struct io_kiocb *req, unsigned issue_flags)
{ … }
void io_req_defer_failed(struct io_kiocb *req, s32 res)
__must_hold(&ctx->uring_lock)
{ … }
static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx)
{ … }
__cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
__must_hold(&ctx->uring_lock)
{ … }
__cold void io_free_req(struct io_kiocb *req)
{ … }
static void __io_req_find_next_prep(struct io_kiocb *req)
{ … }
static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req)
{ … }
static void ctx_flush_and_put(struct io_ring_ctx *ctx, struct io_tw_state *ts)
{ … }
struct llist_node *io_handle_tw_list(struct llist_node *node,
unsigned int *count,
unsigned int max_entries)
{ … }
static inline struct llist_node *io_llist_xchg(struct llist_head *head,
struct llist_node *new)
{ … }
static __cold void io_fallback_tw(struct io_uring_task *tctx, bool sync)
{ … }
struct llist_node *tctx_task_work_run(struct io_uring_task *tctx,
unsigned int max_entries,
unsigned int *count)
{ … }
void tctx_task_work(struct callback_head *cb)
{ … }
static inline void io_req_local_work_add(struct io_kiocb *req,
struct io_ring_ctx *ctx,
unsigned flags)
{ … }
static void io_req_normal_work_add(struct io_kiocb *req)
{ … }
void __io_req_task_work_add(struct io_kiocb *req, unsigned flags)
{ … }
void io_req_task_work_add_remote(struct io_kiocb *req, struct io_ring_ctx *ctx,
unsigned flags)
{ … }
static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx)
{ … }
static bool io_run_local_work_continue(struct io_ring_ctx *ctx, int events,
int min_events)
{ … }
static int __io_run_local_work(struct io_ring_ctx *ctx, struct io_tw_state *ts,
int min_events)
{ … }
static inline int io_run_local_work_locked(struct io_ring_ctx *ctx,
int min_events)
{ … }
static int io_run_local_work(struct io_ring_ctx *ctx, int min_events)
{ … }
static void io_req_task_cancel(struct io_kiocb *req, struct io_tw_state *ts)
{ … }
void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts)
{ … }
void io_req_task_queue_fail(struct io_kiocb *req, int ret)
{ … }
void io_req_task_queue(struct io_kiocb *req)
{ … }
void io_queue_next(struct io_kiocb *req)
{ … }
static void io_free_batch_list(struct io_ring_ctx *ctx,
struct io_wq_work_node *node)
__must_hold(&ctx->uring_lock)
{ … }
void __io_submit_flush_completions(struct io_ring_ctx *ctx)
__must_hold(&ctx->uring_lock)
{ … }
static unsigned io_cqring_events(struct io_ring_ctx *ctx)
{ … }
static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx)
{ … }
static int io_iopoll_check(struct io_ring_ctx *ctx, long min)
{ … }
void io_req_task_complete(struct io_kiocb *req, struct io_tw_state *ts)
{ … }
static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags)
{ … }
io_req_flags_t io_file_get_flags(struct file *file)
{ … }
bool io_alloc_async_data(struct io_kiocb *req)
{ … }
static u32 io_get_sequence(struct io_kiocb *req)
{ … }
static __cold void io_drain_req(struct io_kiocb *req)
__must_hold(&ctx->uring_lock)
{ … }
static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def,
unsigned int issue_flags)
{ … }
static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
{ … }
int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts)
{ … }
struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
{ … }
void io_wq_submit_work(struct io_wq_work *work)
{ … }
inline struct file *io_file_get_fixed(struct io_kiocb *req, int fd,
unsigned int issue_flags)
{ … }
struct file *io_file_get_normal(struct io_kiocb *req, int fd)
{ … }
static void io_queue_async(struct io_kiocb *req, int ret)
__must_hold(&req->ctx->uring_lock)
{ … }
static inline void io_queue_sqe(struct io_kiocb *req)
__must_hold(&req->ctx->uring_lock)
{ … }
static void io_queue_sqe_fallback(struct io_kiocb *req)
__must_hold(&req->ctx->uring_lock)
{ … }
static inline bool io_check_restriction(struct io_ring_ctx *ctx,
struct io_kiocb *req,
unsigned int sqe_flags)
{ … }
static void io_init_req_drain(struct io_kiocb *req)
{ … }
static __cold int io_init_fail_req(struct io_kiocb *req, int err)
{ … }
static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req,
const struct io_uring_sqe *sqe)
__must_hold(&ctx->uring_lock)
{ … }
static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe,
struct io_kiocb *req, int ret)
{ … }
static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
const struct io_uring_sqe *sqe)
__must_hold(&ctx->uring_lock)
{ … }
static void io_submit_state_end(struct io_ring_ctx *ctx)
{ … }
static void io_submit_state_start(struct io_submit_state *state,
unsigned int max_ios)
{ … }
static void io_commit_sqring(struct io_ring_ctx *ctx)
{ … }
static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe)
{ … }
int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
__must_hold(&ctx->uring_lock)
{ … }
static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode,
int wake_flags, void *key)
{ … }
int io_run_task_work_sig(struct io_ring_ctx *ctx)
{ … }
static bool current_pending_io(void)
{ … }
static enum hrtimer_restart io_cqring_timer_wakeup(struct hrtimer *timer)
{ … }
static enum hrtimer_restart io_cqring_min_timer_wakeup(struct hrtimer *timer)
{ … }
static int io_cqring_schedule_timeout(struct io_wait_queue *iowq,
clockid_t clock_id, ktime_t start_time)
{ … }
static int __io_cqring_wait_schedule(struct io_ring_ctx *ctx,
struct io_wait_queue *iowq,
ktime_t start_time)
{ … }
static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx,
struct io_wait_queue *iowq,
ktime_t start_time)
{ … }
struct ext_arg { … };
static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
struct ext_arg *ext_arg)
{ … }
static void *io_rings_map(struct io_ring_ctx *ctx, unsigned long uaddr,
size_t size)
{ … }
static void *io_sqes_map(struct io_ring_ctx *ctx, unsigned long uaddr,
size_t size)
{ … }
static void io_rings_free(struct io_ring_ctx *ctx)
{ … }
static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries,
unsigned int cq_entries, size_t *sq_offset)
{ … }
static void io_req_caches_free(struct io_ring_ctx *ctx)
{ … }
static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
{ … }
static __cold void io_activate_pollwq_cb(struct callback_head *cb)
{ … }
__cold void io_activate_pollwq(struct io_ring_ctx *ctx)
{ … }
static __poll_t io_uring_poll(struct file *file, poll_table *wait)
{ … }
struct io_tctx_exit { … };
static __cold void io_tctx_exit_cb(struct callback_head *cb)
{ … }
static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data)
{ … }
static __cold void io_ring_exit_work(struct work_struct *work)
{ … }
static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
{ … }
static int io_uring_release(struct inode *inode, struct file *file)
{ … }
struct io_task_cancel { … };
static bool io_cancel_task_cb(struct io_wq_work *work, void *data)
{ … }
static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx,
struct task_struct *task,
bool cancel_all)
{ … }
static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx)
{ … }
static __cold bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,
struct task_struct *task,
bool cancel_all)
{ … }
static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked)
{ … }
__cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
{ … }
void __io_uring_cancel(bool cancel_all)
{ … }
static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz)
{ … }
static int io_get_ext_arg(unsigned flags, const void __user *argp,
struct ext_arg *ext_arg)
{ … }
SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,
u32, min_complete, u32, flags, const void __user *, argp,
size_t, argsz)
{ … }
static const struct file_operations io_uring_fops = …;
bool io_is_uring_fops(struct file *file)
{ … }
static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx,
struct io_uring_params *p)
{ … }
static int io_uring_install_fd(struct file *file)
{ … }
static struct file *io_uring_get_file(struct io_ring_ctx *ctx)
{ … }
static __cold int io_uring_create(unsigned entries, struct io_uring_params *p,
struct io_uring_params __user *params)
{ … }
static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
{ … }
static inline bool io_uring_allowed(void)
{ … }
SYSCALL_DEFINE2(io_uring_setup, u32, entries,
struct io_uring_params __user *, params)
{ … }
/*
 * io_uring_init() - one-time boot initialisation for io_uring.
 *
 * Performs, in order:
 *   1. compile-time verification of the struct io_uring_sqe layout and of
 *      several size/flag invariants (these emit no runtime code),
 *   2. opcode table initialisation via io_uring_optable_init(),
 *   3. creation of the io_kiocb and io_buffer slab caches,
 *   4. allocation of the "iou_exit" workqueue,
 *   5. sysctl table registration when CONFIG_SYSCTL is enabled.
 *
 * Allocation failures are fatal: the slab caches are created with
 * SLAB_PANIC and a failed workqueue allocation trips BUG_ON().
 *
 * Return: always 0.
 */
static int __init io_uring_init(void)
{
	/*
	 * Usercopy region of the io_kiocb cache: exactly the cmd.data field
	 * of struct io_kiocb (offset and size derived from the struct itself).
	 */
	struct kmem_cache_args kmem_args = {
		.useroffset = offsetof(struct io_kiocb, cmd.data),
		.usersize = sizeof_field(struct io_kiocb, cmd.data),
	};

/* Break the build if member @ename of @stype is not at @eoffset with size @esize. */
#define __BUILD_BUG_VERIFY_OFFSET_SIZE(stype, eoffset, esize, ename) do { \
	BUILD_BUG_ON(offsetof(stype, ename) != eoffset); \
	BUILD_BUG_ON(sizeof_field(stype, ename) != esize); \
} while (0)

/* SQE member check where the expected size is given as a type. */
#define BUILD_BUG_SQE_ELEM(eoffset, etype, ename) \
	__BUILD_BUG_VERIFY_OFFSET_SIZE(struct io_uring_sqe, eoffset, sizeof(etype), ename)
/* SQE member check where the expected size is given in bytes. */
#define BUILD_BUG_SQE_ELEM_SIZE(eoffset, esize, ename) \
	__BUILD_BUG_VERIFY_OFFSET_SIZE(struct io_uring_sqe, eoffset, esize, ename)

	/* The SQE must be exactly 64 bytes, with every member pinned below. */
	BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
	BUILD_BUG_SQE_ELEM(0, __u8, opcode);
	BUILD_BUG_SQE_ELEM(1, __u8, flags);
	BUILD_BUG_SQE_ELEM(2, __u16, ioprio);
	BUILD_BUG_SQE_ELEM(4, __s32, fd);
	BUILD_BUG_SQE_ELEM(8, __u64, off);
	BUILD_BUG_SQE_ELEM(8, __u64, addr2);
	BUILD_BUG_SQE_ELEM(8, __u32, cmd_op);
	BUILD_BUG_SQE_ELEM(12, __u32, __pad1);
	BUILD_BUG_SQE_ELEM(16, __u64, addr);
	BUILD_BUG_SQE_ELEM(16, __u64, splice_off_in);
	BUILD_BUG_SQE_ELEM(24, __u32, len);
	/* rw_flags must have the same size as __kernel_rwf_t, int and __u32. */
	BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags);
	BUILD_BUG_SQE_ELEM(28, int, rw_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, rw_flags);
	/* Every per-opcode flags member shares offset 28. */
	BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags);
	BUILD_BUG_SQE_ELEM(28, __u16, poll_events);
	BUILD_BUG_SQE_ELEM(28, __u32, poll32_events);
	BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, msg_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, accept_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, cancel_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, open_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, statx_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice);
	BUILD_BUG_SQE_ELEM(28, __u32, splice_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, rename_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, unlink_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, hardlink_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, xattr_flags);
	BUILD_BUG_SQE_ELEM(28, __u32, msg_ring_flags);
	BUILD_BUG_SQE_ELEM(32, __u64, user_data);
	BUILD_BUG_SQE_ELEM(40, __u16, buf_index);
	BUILD_BUG_SQE_ELEM(40, __u16, buf_group);
	BUILD_BUG_SQE_ELEM(42, __u16, personality);
	BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in);
	BUILD_BUG_SQE_ELEM(44, __u32, file_index);
	BUILD_BUG_SQE_ELEM(44, __u16, addr_len);
	BUILD_BUG_SQE_ELEM(46, __u16, __pad3[0]);
	BUILD_BUG_SQE_ELEM(48, __u64, addr3);
	/* cmd starts at offset 48 and is expected to occupy zero bytes itself. */
	BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd);
	BUILD_BUG_SQE_ELEM(56, __u64, __pad2);

	/* files_update and rsrc_update must be the same size; update2 no smaller. */
	BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
		     sizeof(struct io_uring_rsrc_update));
	BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) >
		     sizeof(struct io_uring_rsrc_update2));

	/* The ring's tail must sit at the same offset as resv in io_uring_buf. */
	BUILD_BUG_ON(offsetof(struct io_uring_buf_ring, bufs) != 0);
	BUILD_BUG_ON(offsetof(struct io_uring_buf, resv) !=
		     offsetof(struct io_uring_buf_ring, tail));

	/* Both SQE flag masks must fit in 8 bits; COMMON must be a subset of VALID. */
	BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
	BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8));
	BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS);

	/* All request flag bits must fit in io_kiocb::flags. */
	BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof_field(struct io_kiocb, flags));

	BUILD_BUG_ON(sizeof(atomic_t) != sizeof(u32));

	/* IORING_URING_CMD_MASK must not use any of the top 8 bits. */
	BUILD_BUG_ON((IORING_URING_CMD_MASK & 0xff000000) != 0);

	io_uring_optable_init();

	/* SLAB_PANIC: failure to create either cache panics instead of returning NULL. */
	req_cachep = kmem_cache_create("io_kiocb", sizeof(struct io_kiocb), &kmem_args,
				SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT |
				SLAB_TYPESAFE_BY_RCU);
	io_buf_cachep = KMEM_CACHE(io_buffer,
				   SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);

	iou_wq = alloc_workqueue("iou_exit", WQ_UNBOUND, 64);
	/*
	 * alloc_workqueue() returns NULL on failure. Treat that as fatal, in
	 * line with SLAB_PANIC above, rather than leaving iou_wq NULL for
	 * whoever queues work on it later.
	 */
	BUG_ON(!iou_wq);

#ifdef CONFIG_SYSCTL
	register_sysctl_init("kernel", kernel_io_uring_disabled_table);
#endif

	return 0;
}
__initcall(io_uring_init);