linux/drivers/md/dm-vdo/indexer/index-session.c

// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2023 Red Hat
 */

#include "index-session.h"

#include <linux/atomic.h>

#include "logger.h"
#include "memory-alloc.h"
#include "time-utils.h"

#include "funnel-requestqueue.h"
#include "index.h"
#include "index-layout.h"

/*
 * The index session contains a lock (the request_mutex) which ensures that only one thread can
 * change the state of its index at a time. The state field indicates the current state of the
 * index through a set of descriptive flags. Waiters on the request_cond condition variable must
 * be notified whenever a non-transient state flag is cleared. The request_mutex also protects the
 * count of requests currently in progress so that they can be drained when suspending or closing
 * the index.
 *
 * If the index session is suspended shortly after opening an index, it may have to suspend during
 * a rebuild. Depending on the size of the index, a rebuild may take a significant amount of time,
 * so UDS allows the rebuild to be paused in order to suspend the session in a timely manner. When
 * the index session is resumed, the rebuild can continue from where it left off. If the index
 * session is shut down with a suspended rebuild, the rebuild progress is abandoned and the rebuild
 * will start from the beginning the next time the index is loaded. The mutex and status fields in
 * the index_load_context are used to record the state of any interrupted rebuild.
 */

/*
 * Bit positions of the index session state flags. The first flag sits at IS_FLAG_BIT_START and
 * each later flag takes the next bit.
 */
enum index_session_flag_bit {
	IS_FLAG_BIT_START = 8,
	/* An index load has been started for the session but has not finished. */
	IS_FLAG_BIT_LOADING = IS_FLAG_BIT_START,
	/* The session holds a loaded index that is able to handle requests. */
	IS_FLAG_BIT_LOADED,
	/* The index belonging to the session has been disabled for good. */
	IS_FLAG_BIT_DISABLED,
	/* The index belonging to the session is suspended. */
	IS_FLAG_BIT_SUSPENDED,
	/* Some change of index state is in progress on the session. */
	IS_FLAG_BIT_WAITING,
	/* The index is being closed and its requests are draining. */
	IS_FLAG_BIT_CLOSING,
	/* The session itself is being destroyed and its requests are draining. */
	IS_FLAG_BIT_DESTROYING,
};

enum index_session_flag {
	IS_FLAG_LOADED = (1 << IS_FLAG_BIT_LOADED),
	IS_FLAG_LOADING = (1 << IS_FLAG_BIT_LOADING),
	IS_FLAG_DISABLED = (1 << IS_FLAG_BIT_DISABLED),
	IS_FLAG_SUSPENDED = (1 << IS_FLAG_BIT_SUSPENDED),
	IS_FLAG_WAITING = (1 << IS_FLAG_BIT_WAITING),
	IS_FLAG_CLOSING = (1 << IS_FLAG_BIT_CLOSING),
	IS_FLAG_DESTROYING = (1 << IS_FLAG_BIT_DESTROYING),
};

/*
 * Drop one reference on the index session. When the count of outstanding requests reaches zero,
 * every thread waiting on request_cond is woken.
 */
static void release_index_session(struct uds_index_session *index_session)
{
	mutex_lock(&index_session->request_mutex);
	index_session->request_count--;
	if (index_session->request_count == 0)
		uds_broadcast_cond(&index_session->request_cond);
	mutex_unlock(&index_session->request_mutex);
}

/*
 * Take a reference on the index session on behalf of an asynchronous index request.
 *
 * The reference is kept, and UDS_SUCCESS returned, only when the session state is exactly
 * IS_FLAG_LOADED; the caller must then balance it with release_index_session(). In every other
 * state the reference is dropped again here and the return value is UDS_DISABLED (index
 * disabled), -EBUSY (loading, suspended, or changing state), or UDS_NO_INDEX.
 */
static int get_index_session(struct uds_index_session *index_session)
{
	const unsigned int busy_flags =
		IS_FLAG_LOADING | IS_FLAG_SUSPENDED | IS_FLAG_WAITING;
	unsigned int snapshot;
	int status;

	/* Count the request and sample the state under the same lock hold. */
	mutex_lock(&index_session->request_mutex);
	index_session->request_count++;
	snapshot = index_session->state;
	mutex_unlock(&index_session->request_mutex);

	if (snapshot == IS_FLAG_LOADED)
		return UDS_SUCCESS;

	if (snapshot & IS_FLAG_DISABLED)
		status = UDS_DISABLED;
	else if (snapshot & busy_flags)
		status = -EBUSY;
	else
		status = UDS_NO_INDEX;

	/* The request will not proceed, so give the reference back. */
	release_index_session(index_session);
	return status;
}

/*
 * Validate a client request, reset its internal fields, take a reference on its index session,
 * and queue it at the triage stage.
 *
 * Returns UDS_SUCCESS once the request has been queued. Returns -EINVAL if the request has no
 * callback or its type is not one of the recognized request types. If the session cannot accept
 * the request, the status from get_index_session() is returned and nothing is queued.
 */
int uds_launch_request(struct uds_request *request)
{
	size_t internal_offset;
	int result;

	if (request->callback == NULL) {
		vdo_log_error("missing required callback");
		return -EINVAL;
	}

	switch (request->type) {
	case UDS_DELETE:
	case UDS_POST:
	case UDS_QUERY:
	case UDS_QUERY_NO_UPDATE:
	case UDS_UPDATE:
		break;
	default:
		/* The value rejected here is the request type, not the callback. */
		vdo_log_error("received invalid request type");
		return -EINVAL;
	}

	/*
	 * Reset all internal fields before processing: zero everything from zone_number through
	 * the end of the structure.
	 */
	// FIXME should be using struct_group for this instead
	internal_offset = offsetof(struct uds_request, zone_number);
	memset((char *) request + internal_offset, 0, sizeof(*request) - internal_offset);

	result = get_index_session(request->session);
	if (result != UDS_SUCCESS)
		return result;

	/* A session reference is now held; it is released after the request's callback runs. */
	request->found = false;
	request->unbatched = false;
	request->index = request->session->index;

	uds_enqueue_request(request, STAGE_TRIAGE);
	return UDS_SUCCESS;
}

static void enter_callback_stage(struct uds_request *request)
{
	if (request->status != UDS_SUCCESS) {
		/* All request errors are considered unrecoverable */
		mutex_lock(&request->session->request_mutex);
		request->session->state |= IS_FLAG_DISABLED;
		mutex_unlock(&request->session->request_mutex);
	}

	uds_request_queue_enqueue(request->session->callback_queue, request);
}

/* Add one to a statistics counter using a single marked read and a single marked write. */
static inline void count_once(u64 *count_ptr)
{
	u64 previous = READ_ONCE(*count_ptr);

	WRITE_ONCE(*count_ptr, previous + 1);
}

/*
 * Account for one completed request in the session statistics: bump the total request counter,
 * then the found/not-found counter matching the request type. Posts additionally record where
 * the record was located. An unrecognized type stores the VDO_ASSERT() result in the request
 * status.
 */
static void update_session_stats(struct uds_request *request)
{
	struct session_stats *counters = &request->session->stats;
	const bool hit = request->found;

	count_once(&counters->requests);

	switch (request->type) {
	case UDS_POST:
		count_once(hit ? &counters->posts_found : &counters->posts_not_found);

		/* Any other location value is not counted. */
		switch (request->location) {
		case UDS_LOCATION_IN_OPEN_CHAPTER:
			count_once(&counters->posts_found_open_chapter);
			break;
		case UDS_LOCATION_IN_DENSE:
			count_once(&counters->posts_found_dense);
			break;
		case UDS_LOCATION_IN_SPARSE:
			count_once(&counters->posts_found_sparse);
			break;
		default:
			break;
		}
		break;

	case UDS_UPDATE:
		count_once(hit ? &counters->updates_found : &counters->updates_not_found);
		break;

	case UDS_DELETE:
		count_once(hit ? &counters->deletions_found : &counters->deletions_not_found);
		break;

	case UDS_QUERY:
	case UDS_QUERY_NO_UPDATE:
		count_once(hit ? &counters->queries_found : &counters->queries_not_found);
		break;

	default:
		request->status = VDO_ASSERT(false, "unknown request type: %d",
					     request->type);
	}
}

/*
 * Processing function of the session's callback queue. Records statistics for a successful
 * request, converts the request status with uds_status_to_errno(), invokes the client callback,
 * and finally drops the session reference held for the request.
 */
static void handle_callbacks(struct uds_request *request)
{
	/* Saved up front so the request is not dereferenced again once its callback has run. */
	struct uds_index_session *session = request->session;
	const bool succeeded = (request->status == UDS_SUCCESS);

	if (succeeded)
		update_session_stats(request);

	request->status = uds_status_to_errno(request->status);
	request->callback(request);
	release_index_session(session);
}

/*
 * Allocate an index session with no index attached: initialize its locks and condition
 * variables and create its callback queue. On success the new session is stored through
 * index_session_ptr; on failure nothing is stored and the allocation is released.
 */
static int __must_check make_empty_index_session(struct uds_index_session **index_session_ptr)
{
	struct uds_index_session *session;
	int result;

	result = vdo_allocate(1, struct uds_index_session, __func__, &session);
	if (result != VDO_SUCCESS)
		return result;

	mutex_init(&session->request_mutex);
	uds_init_cond(&session->request_cond);
	mutex_init(&session->load_context.mutex);
	uds_init_cond(&session->load_context.cond);

	result = uds_make_request_queue("callbackW", &handle_callbacks,
					&session->callback_queue);
	if (result == UDS_SUCCESS) {
		*index_session_ptr = session;
		return UDS_SUCCESS;
	}

	/* Without a callback queue the session is unusable. */
	vdo_free(session);
	return result;
}

/*
 * Create a new, empty index session and store it through the supplied pointer. Returns -EINVAL
 * when the pointer is NULL; otherwise returns the creation status converted by
 * uds_status_to_errno().
 */
int uds_create_index_session(struct uds_index_session **session)
{
	int result;

	if (session == NULL) {
		vdo_log_error("missing session pointer");
		return -EINVAL;
	}

	result = make_empty_index_session(session);
	return uds_status_to_errno(result);
}

/*
 * Mark the session as loading an index. This succeeds only when no state flag at all is set;
 * a suspended session, or one in any other non-idle state, yields -EBUSY.
 */
static int __must_check start_loading_index_session(struct uds_index_session *index_session)
{
	int result = -EBUSY;

	mutex_lock(&index_session->request_mutex);
	if (index_session->state & IS_FLAG_SUSPENDED) {
		vdo_log_info("Index session is suspended");
	} else if (index_session->state != 0) {
		vdo_log_info("Index is already loaded");
	} else {
		index_session->state |= IS_FLAG_LOADING;
		result = UDS_SUCCESS;
	}
	mutex_unlock(&index_session->request_mutex);

	return result;
}

/*
 * Record the end of an index load: clear the loading flag, set the loaded flag if the load
 * succeeded, and wake every thread waiting on request_cond.
 */
static void finish_loading_index_session(struct uds_index_session *index_session,
					 int result)
{
	mutex_lock(&index_session->request_mutex);

	if (result == UDS_SUCCESS)
		index_session->state |= IS_FLAG_LOADED;
	index_session->state &= ~IS_FLAG_LOADING;

	uds_broadcast_cond(&index_session->request_cond);
	mutex_unlock(&index_session->request_mutex);
}

/*
 * Build a configuration from the session parameters, clear the session statistics, and create
 * the index. The temporary configuration is freed before returning regardless of whether index
 * creation succeeded.
 */
static int initialize_index_session(struct uds_index_session *index_session,
				    enum uds_open_index_type open_type)
{
	struct uds_configuration *configuration;
	int status;

	status = uds_make_configuration(&index_session->parameters, &configuration);
	if (status != UDS_SUCCESS) {
		vdo_log_error_strerror(status, "Failed to allocate config");
		return status;
	}

	/* Statistics start from zero for each newly opened index. */
	memset(&index_session->stats, 0, sizeof(index_session->stats));

	status = uds_make_index(configuration, open_type, &index_session->load_context,
				enter_callback_stage, &index_session->index);
	if (status == UDS_SUCCESS)
		uds_log_configuration(configuration);
	else
		vdo_log_error_strerror(status, "Failed to make index");

	uds_free_configuration(configuration);
	return status;
}

/* Return a short description of an open type for use in log messages. */
static const char *get_open_type_string(enum uds_open_index_type open_type)
{
	const char *description = "unknown open method";

	if (open_type == UDS_CREATE)
		description = "creating index";
	else if (open_type == UDS_LOAD)
		description = "loading or rebuilding index";
	else if (open_type == UDS_NO_REBUILD)
		description = "loading index";

	return description;
}

/*
 * Open an index under the given session using the supplied parameters. The operation fails if
 * the index session is suspended or if the session already has an index (or is otherwise not
 * idle). Missing parameters, block device, or session yield -EINVAL.
 */
int uds_open_index(enum uds_open_index_type open_type,
		   const struct uds_parameters *parameters,
		   struct uds_index_session *session)
{
	char device_name[BDEVNAME_SIZE];
	const char *action;
	int result;

	if (parameters == NULL) {
		vdo_log_error("missing required parameters");
		return -EINVAL;
	}
	if (parameters->bdev == NULL) {
		vdo_log_error("missing required block device");
		return -EINVAL;
	}
	if (session == NULL) {
		vdo_log_error("missing required session pointer");
		return -EINVAL;
	}

	result = start_loading_index_session(session);
	if (result != UDS_SUCCESS)
		return uds_status_to_errno(result);

	action = get_open_type_string(open_type);

	/* The session keeps its own copy of the parameters. */
	session->parameters = *parameters;
	format_dev_t(device_name, parameters->bdev->bd_dev);
	vdo_log_info("%s: %s", action, device_name);

	result = initialize_index_session(session, open_type);
	if (result != UDS_SUCCESS)
		vdo_log_error_strerror(result, "Failed %s", action);

	/* The loading flag is cleared whether or not the index could be opened. */
	finish_loading_index_session(session, result);
	return uds_status_to_errno(result);
}

/* Block until the session's count of in-progress requests has dropped to zero. */
static void wait_for_no_requests_in_progress(struct uds_index_session *index_session)
{
	struct mutex *lock = &index_session->request_mutex;

	mutex_lock(lock);
	while (index_session->request_count > 0)
		uds_wait_cond(&index_session->request_cond, lock);
	mutex_unlock(lock);
}

/* Drain all in-progress requests, then save the session's index. */
static int __must_check save_index(struct uds_index_session *index_session)
{
	int result;

	wait_for_no_requests_in_progress(index_session);
	result = uds_save_index(index_session->index);
	return result;
}

/*
 * Ask an index load that is still in progress to pause. If the load context is in the opening
 * state it is switched to suspending, and this function waits until the status becomes either
 * suspended or ready. A load context that is already ready needs nothing. Any other status is
 * unexpected and is only logged.
 */
static void suspend_rebuild(struct uds_index_session *session)
{
	mutex_lock(&session->load_context.mutex);

	if (session->load_context.status == INDEX_OPENING) {
		session->load_context.status = INDEX_SUSPENDING;

		/* Wait until the index indicates that it is not replaying. */
		while (!((session->load_context.status == INDEX_SUSPENDED) ||
			 (session->load_context.status == INDEX_READY)))
			uds_wait_cond(&session->load_context.cond,
				      &session->load_context.mutex);
	} else if (session->load_context.status != INDEX_READY) {
		/* Suspended, suspending, freeing, or unknown: these should not happen. */
		VDO_ASSERT_LOG_ONLY(false, "Bad load context state %u",
				    session->load_context.status);
	}

	mutex_unlock(&session->load_context.mutex);
}

/*
 * Suspend index operation, draining all current index requests and preventing new index requests
 * from starting. Optionally saves all index data before returning.
 *
 * The work happens in three phases: decide what to do while holding request_mutex, perform the
 * (possibly slow) suspend work with the mutex released while IS_FLAG_WAITING marks the state
 * change as in progress, then retake the mutex to publish the suspended state.
 *
 * The result is always passed through uds_status_to_errno(). It is -EBUSY when another state
 * change or a destroy is already in progress; otherwise it is the result of the save or flush,
 * or UDS_SUCCESS when no such work was needed. When a load is still in progress the load is
 * paused instead and the save argument is not consulted.
 */
int uds_suspend_index_session(struct uds_index_session *session, bool save)
{
	int result = UDS_SUCCESS;
	bool no_work = false;
	bool rebuilding = false;

	/* Wait for any current index state change to complete. */
	mutex_lock(&session->request_mutex);
	while (session->state & IS_FLAG_CLOSING)
		uds_wait_cond(&session->request_cond, &session->request_mutex);

	if ((session->state & IS_FLAG_WAITING) || (session->state & IS_FLAG_DESTROYING)) {
		no_work = true;
		vdo_log_info("Index session is already changing state");
		result = -EBUSY;
	} else if (session->state & IS_FLAG_SUSPENDED) {
		/* Already suspended: nothing to do, and result stays UDS_SUCCESS. */
		no_work = true;
	} else if (session->state & IS_FLAG_LOADING) {
		/* A load is under way; it must be paused rather than saved. */
		session->state |= IS_FLAG_WAITING;
		rebuilding = true;
	} else if (session->state & IS_FLAG_LOADED) {
		session->state |= IS_FLAG_WAITING;
	} else {
		/* Neither loading nor loaded: just mark the session suspended. */
		no_work = true;
		session->state |= IS_FLAG_SUSPENDED;
		uds_broadcast_cond(&session->request_cond);
	}
	mutex_unlock(&session->request_mutex);

	if (no_work)
		return uds_status_to_errno(result);

	/* IS_FLAG_WAITING is set from here until the final block below clears it. */
	if (rebuilding)
		suspend_rebuild(session);
	else if (save)
		result = save_index(session);
	else
		result = uds_flush_index_session(session);

	/* The session is marked suspended even if the save or flush reported an error. */
	mutex_lock(&session->request_mutex);
	session->state &= ~IS_FLAG_WAITING;
	session->state |= IS_FLAG_SUSPENDED;
	uds_broadcast_cond(&session->request_cond);
	mutex_unlock(&session->request_mutex);
	return uds_status_to_errno(result);
}

/*
 * Switch the index to a new backing block device. The device recorded in the session
 * parameters is updated only after the index has accepted the new storage.
 */
static int replace_device(struct uds_index_session *session, struct block_device *bdev)
{
	int result = uds_replace_index_storage(session->index, bdev);

	if (result == UDS_SUCCESS)
		session->parameters.bdev = bdev;

	return result;
}

/*
 * Resume index operation after being suspended. If the index is suspended and the supplied block
 * device differs from the current backing store, the index will start using the new backing store.
 *
 * Returns -EBUSY if another state change is in progress and UDS_SUCCESS if the session is not
 * suspended at all. If replacing the backing store fails, the session stays suspended and the
 * converted error is returned. Otherwise any paused index load is told to continue and the
 * suspended flag is cleared.
 */
int uds_resume_index_session(struct uds_index_session *session,
			     struct block_device *bdev)
{
	int result = UDS_SUCCESS;
	bool no_work = false;
	bool resume_replay = false;

	mutex_lock(&session->request_mutex);
	if (session->state & IS_FLAG_WAITING) {
		vdo_log_info("Index session is already changing state");
		no_work = true;
		result = -EBUSY;
	} else if (!(session->state & IS_FLAG_SUSPENDED)) {
		/* If not suspended, just succeed. */
		no_work = true;
		result = UDS_SUCCESS;
	} else {
		/* Claim the state change; a load suspended part-way must be restarted below. */
		session->state |= IS_FLAG_WAITING;
		if (session->state & IS_FLAG_LOADING)
			resume_replay = true;
	}
	mutex_unlock(&session->request_mutex);

	if (no_work)
		return result;

	/* Only swap storage when an index exists and the caller supplied a different device. */
	if ((session->index != NULL) && (bdev != session->parameters.bdev)) {
		result = replace_device(session, bdev);
		if (result != UDS_SUCCESS) {
			/* Release the state change but leave IS_FLAG_SUSPENDED set. */
			mutex_lock(&session->request_mutex);
			session->state &= ~IS_FLAG_WAITING;
			uds_broadcast_cond(&session->request_cond);
			mutex_unlock(&session->request_mutex);
			return uds_status_to_errno(result);
		}
	}

	if (resume_replay) {
		mutex_lock(&session->load_context.mutex);
		switch (session->load_context.status) {
		case INDEX_SUSPENDED:
			session->load_context.status = INDEX_OPENING;
			/* Notify the index to start replaying again. */
			uds_broadcast_cond(&session->load_context.cond);
			break;

		case INDEX_READY:
			/* There is no index rebuild to resume. */
			break;

		case INDEX_OPENING:
		case INDEX_SUSPENDING:
		case INDEX_FREEING:
		default:
			/* These cases should not happen; do nothing. */
			VDO_ASSERT_LOG_ONLY(false, "Bad load context state %u",
					    session->load_context.status);
			break;
		}
		mutex_unlock(&session->load_context.mutex);
	}

	/* Publish the resumed state and wake anyone waiting for the state change to end. */
	mutex_lock(&session->request_mutex);
	session->state &= ~IS_FLAG_WAITING;
	session->state &= ~IS_FLAG_SUSPENDED;
	uds_broadcast_cond(&session->request_cond);
	mutex_unlock(&session->request_mutex);
	return UDS_SUCCESS;
}

/*
 * Save the session's index (unless the session is suspended), free it, and reset the per-index
 * state kept in the session. A save failure is logged and returned, but the index is freed
 * regardless. Returns UDS_SUCCESS immediately when the session has no index.
 */
static int save_and_free_index(struct uds_index_session *index_session)
{
	struct uds_index *doomed_index = index_session->index;
	int save_result = UDS_SUCCESS;
	bool is_suspended;

	if (doomed_index == NULL)
		return UDS_SUCCESS;

	mutex_lock(&index_session->request_mutex);
	is_suspended = ((index_session->state & IS_FLAG_SUSPENDED) != 0);
	mutex_unlock(&index_session->request_mutex);

	if (!is_suspended) {
		save_result = uds_save_index(doomed_index);
		if (save_result != UDS_SUCCESS)
			vdo_log_warning_strerror(save_result,
						 "ignoring error from save_index");
	}

	uds_free_index(doomed_index);
	index_session->index = NULL;

	/*
	 * The load context describes the index that was just freed; put it back in its initial
	 * state so that it cannot affect an index opened later.
	 */
	mutex_lock(&index_session->load_context.mutex);
	index_session->load_context.status = INDEX_OPENING;
	mutex_unlock(&index_session->load_context.mutex);

	/* Drop every state flag except the suspend bit, which is the only one still relevant. */
	mutex_lock(&index_session->request_mutex);
	index_session->state &= IS_FLAG_SUSPENDED;
	mutex_unlock(&index_session->request_mutex);

	return save_result;
}

/*
 * Save and close the current index.
 *
 * Blocks until no other state change or close is in progress. The close is refused when the
 * session is suspended (result -EBUSY) or when there is no loaded index or the session is being
 * destroyed (result UDS_NO_INDEX). Otherwise outstanding requests are drained, the index is
 * saved and freed, and waiters on request_cond are woken. The result is passed through
 * uds_status_to_errno().
 */
int uds_close_index(struct uds_index_session *index_session)
{
	const unsigned int in_transition = IS_FLAG_WAITING | IS_FLAG_CLOSING;
	int result = UDS_SUCCESS;

	mutex_lock(&index_session->request_mutex);

	/* Let any state change that is already under way finish first. */
	while (index_session->state & in_transition)
		uds_wait_cond(&index_session->request_cond,
			      &index_session->request_mutex);

	if (index_session->state & IS_FLAG_SUSPENDED) {
		vdo_log_info("Index session is suspended");
		result = -EBUSY;
	} else if (!(index_session->state & IS_FLAG_LOADED) ||
		   (index_session->state & IS_FLAG_DESTROYING)) {
		/* The index doesn't exist, hasn't finished loading, or is being destroyed. */
		result = UDS_NO_INDEX;
	} else {
		index_session->state |= IS_FLAG_CLOSING;
	}

	mutex_unlock(&index_session->request_mutex);

	if (result != UDS_SUCCESS)
		return uds_status_to_errno(result);

	vdo_log_debug("Closing index");
	wait_for_no_requests_in_progress(index_session);
	result = save_and_free_index(index_session);
	vdo_log_debug("Closed index");

	/* The close is finished; clear the flag and wake anyone waiting on it. */
	mutex_lock(&index_session->request_mutex);
	index_session->state &= ~IS_FLAG_CLOSING;
	uds_broadcast_cond(&index_session->request_cond);
	mutex_unlock(&index_session->request_mutex);

	return uds_status_to_errno(result);
}

/*
 * This will save and close an open index before destroying the session.
 *
 * Waits for any state change or close already in progress, then marks the session as being
 * destroyed. A second concurrent destroy is refused with -EBUSY. If a load was suspended
 * part-way, the load is told to terminate and this function waits for it to exit. Finally the
 * outstanding requests are drained, the index is saved and freed, the callback queue is shut
 * down, and the session memory is released. The return value is the converted result of
 * saving and freeing the index.
 */
int uds_destroy_index_session(struct uds_index_session *index_session)
{
	int result;
	bool load_pending = false;

	vdo_log_debug("Destroying index session");

	/* Wait for any current index state change to complete. */
	mutex_lock(&index_session->request_mutex);
	while ((index_session->state & IS_FLAG_WAITING) ||
	       (index_session->state & IS_FLAG_CLOSING)) {
		uds_wait_cond(&index_session->request_cond,
			      &index_session->request_mutex);
	}

	if (index_session->state & IS_FLAG_DESTROYING) {
		mutex_unlock(&index_session->request_mutex);
		vdo_log_info("Index session is already closing");
		return -EBUSY;
	}

	index_session->state |= IS_FLAG_DESTROYING;
	/* A load is pending only if it was still loading when the session was suspended. */
	load_pending = ((index_session->state & IS_FLAG_LOADING) &&
			(index_session->state & IS_FLAG_SUSPENDED));
	mutex_unlock(&index_session->request_mutex);

	if (load_pending) {
		/* Tell the index to terminate the rebuild. */
		mutex_lock(&index_session->load_context.mutex);
		if (index_session->load_context.status == INDEX_SUSPENDED) {
			index_session->load_context.status = INDEX_FREEING;
			uds_broadcast_cond(&index_session->load_context.cond);
		}
		mutex_unlock(&index_session->load_context.mutex);

		/* Wait until the load exits before proceeding. */
		mutex_lock(&index_session->request_mutex);
		while (index_session->state & IS_FLAG_LOADING) {
			uds_wait_cond(&index_session->request_cond,
				      &index_session->request_mutex);
		}
		mutex_unlock(&index_session->request_mutex);
	}

	wait_for_no_requests_in_progress(index_session);
	result = save_and_free_index(index_session);
	uds_request_queue_finish(index_session->callback_queue);
	index_session->callback_queue = NULL;
	vdo_log_debug("Destroyed index session");
	/* The session must not be touched after this point. */
	vdo_free(index_session);
	return uds_status_to_errno(result);
}

/*
 * Wait until all callbacks for index operations are complete: first for the session's
 * in-progress request count to reach zero, then for the index itself to become idle. Always
 * returns UDS_SUCCESS.
 */
int uds_flush_index_session(struct uds_index_session *index_session)
{
	struct uds_index *index;

	wait_for_no_requests_in_progress(index_session);

	index = index_session->index;
	uds_wait_for_idle_index(index);

	return UDS_SUCCESS;
}

/*
 * Copy the session-level counters into the caller's statistics structure and stamp it with the
 * current wall-clock time. Statistics collection is intended to be thread-safe: each counter is
 * read with READ_ONCE() and no lock is taken.
 */
static void collect_stats(const struct uds_index_session *index_session,
			  struct uds_index_stats *stats)
{
	const struct session_stats *counters = &index_session->stats;

	stats->current_time = ktime_to_seconds(current_time_ns(CLOCK_REALTIME));

	/* Post counters, including where the found records were located. */
	stats->posts_found = READ_ONCE(counters->posts_found);
	stats->in_memory_posts_found = READ_ONCE(counters->posts_found_open_chapter);
	stats->dense_posts_found = READ_ONCE(counters->posts_found_dense);
	stats->sparse_posts_found = READ_ONCE(counters->posts_found_sparse);
	stats->posts_not_found = READ_ONCE(counters->posts_not_found);

	/* Update, delete, and query counters. */
	stats->updates_found = READ_ONCE(counters->updates_found);
	stats->updates_not_found = READ_ONCE(counters->updates_not_found);
	stats->deletions_found = READ_ONCE(counters->deletions_found);
	stats->deletions_not_found = READ_ONCE(counters->deletions_not_found);
	stats->queries_found = READ_ONCE(counters->queries_found);
	stats->queries_not_found = READ_ONCE(counters->queries_not_found);

	/* Total number of requests counted. */
	stats->requests = READ_ONCE(counters->requests);
}

/*
 * Fill in the statistics for an index session. Session-level counters are always reported. The
 * index-level figures come from the index when one exists and are reported as zero otherwise.
 * Returns -EINVAL if stats is NULL, else UDS_SUCCESS.
 */
int uds_get_index_session_stats(struct uds_index_session *index_session,
				struct uds_index_stats *stats)
{
	if (stats == NULL) {
		vdo_log_error("received a NULL index stats pointer");
		return -EINVAL;
	}

	collect_stats(index_session, stats);

	if (index_session->index == NULL) {
		/* No index is attached, so there is nothing index-specific to report. */
		stats->entries_indexed = 0;
		stats->memory_used = 0;
		stats->collisions = 0;
		stats->entries_discarded = 0;
		return UDS_SUCCESS;
	}

	uds_get_index_stats(index_session->index, stats);
	return UDS_SUCCESS;
}

/*
 * Wait on a condition variable. The mutex is released before the thread sleeps and is taken
 * again before this function returns, so the caller is expected to hold it on entry.
 *
 * This function does not test any predicate itself; every caller in this file invokes it from
 * a loop that re-checks its own condition after each return.
 */
void uds_wait_cond(struct cond_var *cv, struct mutex *mutex)
{
	DEFINE_WAIT(__wait);

	/*
	 * Queue this thread on the wait queue while the mutex is still held; presumably this is
	 * so that a wakeup issued right after the unlock below is not missed.
	 */
	prepare_to_wait(&cv->wait_queue, &__wait, TASK_IDLE);
	mutex_unlock(mutex);
	schedule();
	finish_wait(&cv->wait_queue, &__wait);
	mutex_lock(mutex);
}