stats.c | Explore in Territory

// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Debugging module statistics.
 *
 * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org>
 */

#include <linux/module.h>
#include <uapi/linux/module.h>
#include <linux/string.h>
#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/debugfs.h>
#include <linux/rculist.h>
#include <linux/math.h>

#include "internal.h"

/**
 * DOC: module debugging statistics overview
 *
 * Enabling CONFIG_MODULE_STATS enables module debugging statistics which
 * are useful to monitor and root cause memory pressure issues with module
 * loading. These statistics are useful to allow us to improve production
 * workloads.
 *
 * The current module debugging statistics supported help keep track of module
 * loading failures to enable improvements either for kernel module auto-loading
 * usage (request_module()) or interactions with userspace. Statistics are
 * provided to track all possible failures in the finit_module() path and memory
 * wasted in this process space.  Each of the failure counters is associated
 * with a type of module loading failure which is known to incur a certain
 * amount of memory allocation loss. In the worst case loading a module will
 * fail after a 3 step memory allocation process:
 *
 *   a) memory allocated with kernel_read_file_from_fd()
 *   b) module decompression processes the file read from
 *      kernel_read_file_from_fd(), and vmap() is used to map
 *      the decompressed module to a new local buffer which represents
 *      a copy of the decompressed module passed from userspace. The buffer
 *      from kernel_read_file_from_fd() is freed right away.
 *   c) layout_and_allocate() allocates space for the final resting
 *      place where we would keep the module if it were to be processed
 *      successfully.
 *
 * If a failure occurs after these three different allocations only one
 * counter will be incremented, by the sum of the bytes which were allocated
 * and then freed during this failure. Likewise, if module loading failed only
 * after step b) a separate counter is used and incremented by the bytes
 * allocated and then freed, unused, during both of those allocations.
 *
 * Virtual memory space can be limited, for example on x86 virtual memory size
 * defaults to 128 MiB. We should strive to limit and avoid wasting virtual
 * memory allocations when possible. These module debugging statistics help
 * to evaluate how much memory is being wasted on bootup due to module loading
 * failures.
 *
 * All counters are designed to be incremental. Atomic counters are used so as
 * to remain simple and avoid delays and deadlocks.
 */

/**
 * DOC: dup_failed_modules - tracks duplicate failed modules
 *
 * Linked list of modules which failed to be loaded because an already existing
 * module with the same name was already being processed or already loaded.
 * The finit_module() system call incurs heavy virtual memory allocations. In
 * the worst case an finit_module() system call can end up allocating virtual
 * memory 3 times:
 *
 *   1) kernel_read_file_from_fd() call uses vmalloc()
 *   2) optional module decompression uses vmap()
 *   3) layout_and_allocate() can use vzalloc() or an arch specific variation of
 *      vmalloc to deal with ELF sections requiring special permissions
 *
 * In practice on a typical boot today most finit_module() calls fail due to
 * the module with the same name already being loaded or about to be processed.
 * All virtual memory allocated to these failed modules will be freed with
 * no functional use.
 *
 * To help with this the dup_failed_modules list allows us to track modules
 * which failed to load due to the fact that a module was already loaded or
 * being processed.  There are only two points at which we can fail such calls,
 * we list them below along with the number of virtual memory allocation
 * calls:
 *
 *   a) FAIL_DUP_MOD_BECOMING: at the end of early_mod_check() before
 *	layout_and_allocate().
 *	- with module decompression: 2 virtual memory allocation calls
 *	- without module decompression: 1 virtual memory allocation call
 *   b) FAIL_DUP_MOD_LOAD: after layout_and_allocate() on add_unformed_module()
 *	- with module decompression: 3 virtual memory allocation calls
 *	- without module decompression: 2 virtual memory allocation calls
 *
 * We should strive to get this list to be as small as possible. If this list
 * is not empty it is a reflection of possible work or optimizations possible
 * either in-kernel or in userspace.
 */
/*
 * Head of the list of duplicate-load failure records; file-local (static).
 * NOTE(review): no code that walks or modifies the list is visible in this
 * view, so the locking/RCU rules for it must be checked against the users.
 */
static LIST_HEAD(dup_failed_modules);

/**
 * DOC: module statistics debugfs counters
 *
 * The total amount of wasted virtual memory allocation space during module
 * loading can be computed by adding the total from the summation:
 *
 *   * @invalid_kread_bytes +
 *     @invalid_decompress_bytes +
 *     @invalid_becoming_bytes +
 *     @invalid_mod_bytes
 *
 * The following debugfs counters are available to inspect module loading
 * failures:
 *
 *   * total_mod_size: total bytes ever used by all modules we've dealt with on
 *     this system
 *   * total_text_size: total bytes of the .text and .init.text ELF section
 *     sizes we've dealt with on this system
 *   * invalid_kread_bytes: bytes allocated and then freed on failures which
 *     happen due to the initial kernel_read_file_from_fd(). kernel_read_file_from_fd()
 *     uses vmalloc(). These should typically not happen unless your system is
 *     under memory pressure.
 *   * invalid_decompress_bytes: number of bytes allocated and freed due to
 *     memory allocations in the module decompression path that use vmap().
 *     These typically should not happen unless your system is under memory
 *     pressure.
 *   * invalid_becoming_bytes: total number of bytes allocated to read the
 *     kernel module userspace wants us to load, and then freed, before we
 *     promote it to be processed to be added to our @modules linked list. These
 *     failures can happen if we had a check in between a successful kernel_read_file_from_fd()
 *     call and right before we allocate our private memory for the module
 *     which would be kept if the module is successfully loaded. The most common
 *     reason for this failure is when userspace is racing to load a module
 *     which it does not yet see loaded. The first module to succeed in
 *     add_unformed_module() will add a module to our &modules list and
 *     subsequent loads of modules with the same name will error out at the
 *     end of early_mod_check(). The check for module_patient_check_exists()
 *     at the end of early_mod_check() prevents duplicate allocations
 *     on layout_and_allocate() for modules already being processed. These
 *     duplicate failed modules are non-fatal, however they typically are
 *     indicative of userspace not seeing a module in userspace loaded yet and
 *     unnecessarily trying to load a module before the kernel even has a chance
 *     to begin to process prior requests. Although duplicate failures can be
 *     non-fatal, we should try to reduce vmalloc() pressure proactively, so
 *     ideally after boot this will be as close to 0 as possible.  If module
 *     decompression was used we also add to this counter the cost of the
 *     initial kernel_read_file_from_fd() of the compressed module. If module
 *     decompression was not used the value represents the total allocated and
 *     freed bytes in kernel_read_file_from_fd() calls for these types of
 *     failures. These failures can occur because:
 *
 *    * module_sig_check() - module signature checks
 *    * elf_validity_cache_copy() - some ELF validation issue
 *    * early_mod_check():
 *
 *      * blacklisting
 *      * failed to rewrite section headers
 *      * version magic
 *      * live patch requirements didn't check out
 *      * the module was detected as being already present
 *
 *   * invalid_mod_bytes: these are the total number of bytes allocated and
 *     freed due to failures after we did all the sanity checks of the module
 *     which userspace passed to us and after our first check that the module
 *     is unique.  A module can still fail to load if we detect the module is
 *     loaded after we allocate space for it with layout_and_allocate(), we do
 *     this check right before processing the module as live and run its
 *     initialization routines. Note that if you have a failure of this type it
 *     also means the respective kernel_read_file_from_fd() memory space was
 *     also freed and not used, and so we increment this counter with twice
 *     the size of the module. Additionally if you used module decompression
 *     the size of the compressed module is also added to this counter.
 *
 *  * modcount: how many modules we've loaded in our kernel life time
 *  * failed_kreads: how many modules failed due to failed kernel_read_file_from_fd()
 *  * failed_decompress: how many failed module decompression attempts we've had.
 *    These really should not happen unless your compression / decompression
 *    might be broken.
 *  * failed_becoming: how many modules failed after we kernel_read_file_from_fd()
 *    it and before we allocate memory for it with layout_and_allocate(). This
 *    counter is never incremented if you manage to validate the module and
 *    call layout_and_allocate() for it.
 *  * failed_load_modules: how many modules failed once we've allocated our
 *    private space for our module using layout_and_allocate(). These failures
 *    should hopefully mostly be dealt with already. Races in theory could
 *    still exist here, but it would just mean the kernel had started processing
 *    two threads concurrently up to early_mod_check() and one thread won.
 *    These failures are good signs the kernel or userspace is doing something
 *    seriously stupid or that could be improved. We should strive to fix these,
 *    but it is perhaps not easy to fix them. A recent example is the module
 *    requests incurred for frequency modules: a separate module request was
 *    being issued for each CPU on a system.
 */

/*
 * Storage for the module statistics counters.  Byte totals are atomic_long_t,
 * event counts are atomic_t.
 *
 * NOTE(review): the counters defined without 'static' have external linkage
 * and are presumably updated from other files in kernel/module/ (declared in
 * internal.h) - confirm.  The 'static' ones are only reachable from this file.
 */

/* Total bytes ever used by all modules dealt with on this system. */
atomic_long_t total_mod_size;
/* Total bytes of the .text and .init.text ELF sections dealt with. */
atomic_long_t total_text_size;
/* Bytes allocated then freed on failed initial kernel_read_file_from_fd(). */
atomic_long_t invalid_kread_bytes;
/* Bytes allocated then freed on failures in the module decompression path. */
atomic_long_t invalid_decompress_bytes;
/*
 * Bytes allocated then freed for loads which failed after a successful
 * kernel_read_file_from_fd() but before layout_and_allocate().
 */
static atomic_long_t invalid_becoming_bytes;
/*
 * Bytes allocated then freed for loads which failed after
 * layout_and_allocate() had allocated the module's private space.
 */
static atomic_long_t invalid_mod_bytes;
/* Number of modules loaded during the kernel's life time. */
atomic_t modcount;
/* Number of loads which failed in kernel_read_file_from_fd(). */
atomic_t failed_kreads;
/* Number of failed module decompression attempts. */
atomic_t failed_decompress;
/* Number of loads which failed between the file read and layout_and_allocate(). */
static atomic_t failed_becoming;
/* Number of loads which failed after layout_and_allocate(). */
static atomic_t failed_load_modules;

/*
 * Return a constant string for the failure record @mod_fail.
 *
 * NOTE(review): the body is elided in this view.  Presumably the string names
 * the duplicate-failure reason(s) recorded in @mod_fail (the "becoming" and
 * "load" cases) - confirm against the full source.
 */
static const char *mod_fail_to_str(struct mod_fail_load *mod_fail)
{ … }

/*
 * Statistics hook taking the load information @info and the load @flags.
 *
 * NOTE(review): the body is elided in this view.  Going by the name and the
 * counter documentation in this file, this presumably accounts a failure that
 * happened after layout_and_allocate() by updating invalid_mod_bytes and
 * failed_load_modules, adding the compressed size as well when module
 * decompression was used - confirm against the full source.
 */
void mod_stat_bump_invalid(struct load_info *info, int flags)
{ … }

/*
 * Statistics hook taking the load information @info and the load @flags.
 *
 * NOTE(review): the body is elided in this view.  Going by the name and the
 * counter documentation in this file, this presumably accounts a failure that
 * happened before layout_and_allocate() by updating invalid_becoming_bytes
 * and failed_becoming - confirm against the full source.
 */
void mod_stat_bump_becoming(struct load_info *info, int flags)
{ … }

/*
 * Record a failed load of the module called @name with failure reason @reason.
 *
 * NOTE(review): the body is elided in this view.  Presumably this adds or
 * updates an entry on the dup_failed_modules list and returns 0 on success or
 * a negative errno (e.g. on allocation failure) - confirm the return
 * convention and the locking against the full source.
 */
int try_add_failed_module(const char *name, enum fail_dup_mod_reason reason)
{ … }

/*
 * Output sizing for read_file_mod_stats().
 *
 * At 64 bytes per module and assuming a 1024 bytes preamble we can fit the
 * 112 module prints within 8k.
 *
 * 1024 + (64*112) = 8k
 *
 * The three macros below are scoped to the function: they are #undef'd right
 * after it.
 */
#define MAX_PREAMBLE …
#define MAX_FAILED_MOD_PRINT …
#define MAX_BYTES_PER_MOD …
/*
 * Read handler with the file_operations ->read() signature: @user_buf is the
 * userspace destination, @count the requested length and @ppos the file
 * position.
 *
 * NOTE(review): the body is elided in this view.  Presumably it formats the
 * counters and the failed-module list into a kernel buffer sized by the
 * macros above and copies the requested window to @user_buf - confirm.
 */
static ssize_t read_file_mod_stats(struct file *file, char __user *user_buf,
				   size_t count, loff_t *ppos)
{ … }
#undef MAX_PREAMBLE
#undef MAX_FAILED_MOD_PRINT
#undef MAX_BYTES_PER_MOD

/*
 * File operations table for the statistics file.
 * NOTE(review): the initializer is elided in this view; presumably it sets
 * read_file_mod_stats() as the read handler - confirm.
 */
static const struct file_operations fops_mod_stats = …;

/*
 * Helper macros scoped to module_stats_init(): they are #undef'd right after
 * it.  NOTE(review): their expansions are elided in this view; presumably one
 * registers an atomic_long_t counter and the other an atomic_t counter with
 * debugfs - confirm.
 */
#define mod_debug_add_ulong …
#define mod_debug_add_atomic …
/*
 * Initialization routine, placed in init memory (__init) and registered
 * through module_init() below.
 *
 * NOTE(review): the body is elided in this view.  Presumably it creates the
 * debugfs entries for the counters defined in this file plus the statistics
 * file backed by fops_mod_stats - confirm, including the return value.
 */
static int __init module_stats_init(void)
{ … }
#undef mod_debug_add_ulong
#undef mod_debug_add_atomic
module_init(…) …;
linux/kernel/module/stats.c