linux/fs/bcachefs/io_write.c

// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcachefs.h"
#include "alloc_foreground.h"
#include "bkey_buf.h"
#include "bset.h"
#include "btree_update.h"
#include "buckets.h"
#include "checksum.h"
#include "clock.h"
#include "compress.h"
#include "debug.h"
#include "ec.h"
#include "error.h"
#include "extent_update.h"
#include "inode.h"
#include "io_write.h"
#include "journal.h"
#include "keylist.h"
#include "move.h"
#include "nocow_locking.h"
#include "rebalance.h"
#include "subvolume.h"
#include "super.h"
#include "super-io.h"
#include "trace.h"

#include <linux/blkdev.h>
#include <linux/prefetch.h>
#include <linux/random.h>
#include <linux/sched/mm.h>

#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT

static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency,
				       u64 now, int rw)
{
	u64 latency_capable =
		ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m;
	/* ideally we'd be taking into account the device's variance here: */
	u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3);
	s64 latency_over = io_latency - latency_threshold;

	if (latency_threshold && latency_over > 0) {
		/*
		 * bump up congested by approximately latency_over * 4 /
		 * latency_threshold - we don't need much accuracy here so don't
		 * bother with the divide:
		 */
		if (atomic_read(&ca->congested) < CONGESTED_MAX)
			atomic_add(latency_over >>
				   max_t(int, ilog2(latency_threshold) - 2, 0),
				   &ca->congested);

		ca->congested_last = now;
	} else if (atomic_read(&ca->congested) > 0) {
		atomic_dec(&ca->congested);
	}
}
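
/*
 * Worked example of the shift above (illustrative numbers, not from the
 * source): with latency_threshold == 1024, ilog2(latency_threshold) - 2 == 8,
 * so we add latency_over >> 8 == latency_over / 256 - exactly the
 * latency_over * 4 / latency_threshold (i.e. latency_over * 4 / 1024) the
 * comment describes.  When the threshold isn't a power of two, ilog2()
 * rounds down and the bump is correspondingly coarser.
 */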

void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
{
	atomic64_t *latency = &ca->cur_latency[rw];
	u64 now = local_clock();
	u64 io_latency = time_after64(now, submit_time)
		? now - submit_time
		: 0;
	u64 old, new;

	old = atomic64_read(latency);
	do {
		/*
		 * If the io latency was reasonably close to the current
		 * latency, skip doing the update and atomic operation - most of
		 * the time:
		 */
		if (abs((int) (old - io_latency)) < (old >> 1) &&
		    now & ~(~0U << 5))
			break;

		new = ewma_add(old, io_latency, 5);
	} while (!atomic64_try_cmpxchg(latency, &old, new));

	bch2_congested_acct(ca, io_latency, now, rw);

	__bch2_time_stats_update(&ca->io_latency[rw].stats, submit_time, now);
}
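
/*
 * Notes on the update above (assuming the usual bcachefs ewma_add()
 * definition): ewma_add(old, io_latency, 5) is an exponentially weighted
 * moving average with weight 2^-5, roughly new = old + (io_latency - old) / 32.
 * The "now & ~(~0U << 5)" test masks the low 5 bits of the clock, so a sample
 * that's already close to the average is only folded in when those bits
 * happen to be zero - roughly 1 call in 32 - skipping the cmpxchg the rest
 * of the time.
 */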

#endif /* CONFIG_BCACHEFS_NO_LATENCY_ACCT */

/* Allocate, free from mempool: */

void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
{}

static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool)
{}

void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
			       size_t size)
{}
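
/*
 * A sketch of the strategy the signatures above suggest (the bodies are
 * elided here, and the real implementation may differ): allocate pages
 * normally, falling back to the filesystem's bounce-page mempool under
 * memory pressure and recording the fallback in *using_mempool so the
 * rest of the bio is filled from the mempool too:
 *
 *	page = alloc_page(GFP_NOFS);
 *	if (!page) {
 *		*using_mempool = true;
 *		page = mempool_alloc(&c->bio_bounce_pages, GFP_NOFS);
 *	}
 */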

/* Extent update path: */

int bch2_sum_sector_overwrites(struct btree_trans *trans,
			       struct btree_iter *extent_iter,
			       struct bkey_i *new,
			       bool *usage_increasing,
			       s64 *i_sectors_delta,
			       s64 *disk_sectors_delta)
{}

static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
						    struct btree_iter *extent_iter,
						    u64 new_i_size,
						    s64 i_sectors_delta)
{}

int bch2_extent_update(struct btree_trans *trans,
		       subvol_inum inum,
		       struct btree_iter *iter,
		       struct bkey_i *k,
		       struct disk_reservation *disk_res,
		       u64 new_i_size,
		       s64 *i_sectors_delta_total,
		       bool check_enospc)
{}

static int bch2_write_index_default(struct bch_write_op *op)
{}

/* Writes */

void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
			       enum bch_data_type type,
			       const struct bkey_i *k,
			       bool nocow)
{}

static void __bch2_write(struct bch_write_op *);

static CLOSURE_CALLBACK(bch2_write_done)
{}

static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op)
{}

/**
 * __bch2_write_index - after a write, update index to point to new data
 * @op:		bch_write_op to process
 */
static void __bch2_write_index(struct bch_write_op *op)
{}

static inline void __wp_update_state(struct write_point *wp, enum write_point_state state)
{}

static inline void wp_update_state(struct write_point *wp, bool running)
{}

static CLOSURE_CALLBACK(bch2_write_index)
{}

static inline void bch2_write_queue(struct bch_write_op *op, struct write_point *wp)
{}

void bch2_write_point_do_index_updates(struct work_struct *work)
{}

static void bch2_write_endio(struct bio *bio)
{}

static void init_append_extent(struct bch_write_op *op,
			       struct write_point *wp,
			       struct bversion version,
			       struct bch_extent_crc_unpacked crc)
{}

static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
					struct write_point *wp,
					struct bio *src,
					bool *page_alloc_failed,
					void *buf)
{}

static int bch2_write_rechecksum(struct bch_fs *c,
				 struct bch_write_op *op,
				 unsigned new_csum_type)
{}

static int bch2_write_decrypt(struct bch_write_op *op)
{}

static enum prep_encoded_ret {}
bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp)
{}

static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
			     struct bio **_dst)
{}

static bool bch2_extent_is_writeable(struct bch_write_op *op,
				     struct bkey_s_c k)
{}

static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans,
						  struct btree_iter *iter,
						  struct bkey_i *orig,
						  struct bkey_s_c k,
						  u64 new_i_size)
{}

static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
{}

static void __bch2_nocow_write_done(struct bch_write_op *op)
{}

static CLOSURE_CALLBACK(bch2_nocow_write_done)
{}

struct bucket_to_lock {};

static void bch2_nocow_write(struct bch_write_op *op)
{}

static void __bch2_write(struct bch_write_op *op)
{}

static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
{}

/**
 * bch2_write() - handle a write to a cache device or flash only volume
 * @cl:		&bch_write_op->cl
 *
 * This is the starting point for any data to end up in a cache device; it could
 * be from a normal write, or a writeback write, or a write to a flash only
 * volume - it's also used by the moving garbage collector to compact data in
 * mostly empty buckets.
 *
 * It first writes the data to the cache, creating a list of keys to be inserted
 * (if the data won't fit in a single open bucket, there will be multiple keys);
 * after the data is written it calls bch_journal, and after the keys have been
 * added to the next journal write they're inserted into the btree.
 *
 * If op->discard is true, instead of inserting the data it invalidates the
 * region of the cache represented by op->bio and op->inode.
 */
CLOSURE_CALLBACK(bch2_write)
{}
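
/*
 * A minimal caller sketch (hypothetical - assembled from the setup callers
 * such as the direct IO path typically do; bio population and error handling
 * are elided, and my_write_done is a made-up completion hook):
 *
 *	bch2_write_op_init(&op, c, io_opts);
 *	op.write_point	= writepoint_hashed((unsigned long) current);
 *	op.nr_replicas	= io_opts.data_replicas;
 *	op.subvol	= inum.subvol;
 *	op.pos		= POS(inum.inum, bio->bi_iter.bi_sector);
 *	op.end_io	= my_write_done;
 *	bio_init(&op.wbio.bio, ...);
 *
 *	closure_call(&op.cl, bch2_write, NULL, NULL);
 */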

static const char * const bch2_write_flags[] = {};

void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op)
{}

void bch2_fs_io_write_exit(struct bch_fs *c)
{}

int bch2_fs_io_write_init(struct bch_fs *c)
{}