compress.c | Explore in Territory

// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2024, SUSE LLC
 *
 * Authors: Enzo Matsumiya <[email protected]>
 *
 * This file implements I/O compression support for SMB2 messages (SMB 3.1.1 only).
 * See compress/ for implementation details of each algorithm.
 *
 * References:
 * MS-SMB2 "3.1.4.4 Compressing the Message"
 * MS-SMB2 "3.1.5.3 Decompressing the Chained Message"
 * MS-XCA - for details of the supported algorithms
 */
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/uio.h>
#include <linux/sort.h>

#include "cifsglob.h"
#include "../common/smb2pdu.h"
#include "cifsproto.h"
#include "smb2proto.h"

#include "compress/lz77.h"
#include "compress.h"

/*
 * The heuristic functions below try to determine data compressibility.
 *
 * Derived from fs/btrfs/compression.c, changing coding style, some parameters, and removing
 * unused parts.
 *
 * Read that file for a better and more detailed explanation of the calculations.
 *
 * The algorithms are run on a collected sample of the input (uncompressed) data.
 * The sample is formed of 2K reads in PAGE_SIZE intervals, with a maximum size of 4M.
 *
 * Parsing the sample goes from "low-hanging fruit" (fastest algorithms, likely compressible)
 * to "need more analysis" (likely uncompressible).
 */

/*
 * Per-sample byte statistics consumed by the heuristics below, which receive it as @bkt
 * ("byte counts of the sample").
 *
 * NOTE(review): presumably one entry per possible byte value, pairing the byte with its
 * occurrence count -- confirm against the field list.
 */
struct bucket { … };

/**
 * has_low_entropy() - Compute Shannon entropy of the sampled data.
 * @bkt:	Byte counts of the sample.
 * @slen:	Size of the sample.
 *
 * Return: true if the level (percentage of number of bits that would be required to
 *	   compress the data) is below the minimum threshold.
 *
 * Note:
 * There _is_ an entropy level here that's > 65 (minimum threshold) that would indicate a
 * possibility of compression, but compressing, or even further analysing, it would waste so
 * many resources that it's simply not worth it.
 *
 * Also, Shannon entropy is the last computed heuristic; if we got this far and ended up
 * with uncertainty, just stay on the safe side and call it uncompressible.
 */
static bool has_low_entropy(struct bucket *bkt, size_t slen)
{ … }

/* Result codes returned by calc_byte_distribution(); see its "Return:" section below. */
#define BYTE_DIST_BAD …
#define BYTE_DIST_GOOD …
#define BYTE_DIST_MAYBE …
/**
 * calc_byte_distribution() - Compute byte distribution on the sampled data.
 * @bkt:	Byte counts of the sample.
 * @slen:	Size of the sample.
 *
 * Return:
 * BYTE_DIST_BAD:	A "hard no" for compression -- a computed uniform distribution of
 *			the bytes (e.g. random or encrypted data).
 * BYTE_DIST_GOOD:	High probability (normal (Gaussian) distribution) of the data being
 *			compressible.
 * BYTE_DIST_MAYBE:	When computed byte distribution resulted in "low > n < high"
 *			grounds.  has_low_entropy() should be used for a final decision.
 *
 * NOTE(review): "low > n < high" reads like a typo for "low < n < high" (i.e. the result
 * fell between the two thresholds) -- confirm against the implementation before rewording.
 */
static int calc_byte_distribution(struct bucket *bkt, size_t slen)
{ … }

/*
 * is_mostly_ascii() - Heuristic on the sample's byte counts.
 * @bkt:	Byte counts of the sample (read-only).
 *
 * NOTE(review): going by the name, presumably returns true when the sampled bytes are
 * predominantly in the ASCII range -- confirm the exact criterion against the body.
 */
static bool is_mostly_ascii(const struct bucket *bkt)
{ … }

/*
 * has_repeated_data() - Heuristic on the raw sample bytes.
 * @sample:	Sampled data (read-only).
 * @len:	Size of @sample, presumably in bytes -- TODO confirm.
 *
 * NOTE(review): going by the name, presumably returns true when the sample contains
 * repeated content; how repetition is detected must be confirmed against the body.
 */
static bool has_repeated_data(const u8 *sample, size_t len)
{ … }

/*
 * cmp_bkt() - Comparison callback over two opaque elements.
 *
 * NOTE(review): the (const void *, const void *) -> int shape matches a sort() comparator
 * (<linux/sort.h> is included by this file) and the name suggests the elements are
 * struct bucket -- confirm both, and the resulting ordering, against the body and caller.
 */
static int cmp_bkt(const void *_a, const void *_b)
{ … }

/*
 * collect_sample() - Gather a sample of the data in @iter into @sample.
 * @iter:	Source iterator (read-only).
 * @max:	NOTE(review): presumably an upper bound on how much of @iter is sampled -- confirm.
 * @sample:	Destination buffer for the sampled bytes.
 *
 * The file-level heuristics comment describes the sample as 2K reads in PAGE_SIZE intervals,
 * with a maximum size of 4M.
 *
 * NOTE(review): the meaning of the int return value (sample length vs. 0/negative errno) is
 * not documented -- confirm against the body and document it here.
 *
 * TODO:
 * Support other iter types, if required.
 * Only ITER_XARRAY is supported for now.
 */
static int collect_sample(const struct iov_iter *iter, ssize_t max, u8 *sample)
{ … }

/**
 * is_compressible() - Determines if a chunk of data is compressible.
 * @data: Iterator containing uncompressed data.
 *
 * Return: true if @data is compressible, false otherwise.
 *
 * Tests show that this function is quite reliable in predicting data compressibility,
 * matching close to 1:1 with the behaviour of LZ77 compression success and failures.
 */
static bool is_compressible(const struct iov_iter *data)
{ … }

/*
 * should_compress() - Decide whether request @rq on @tcon should be compressed.
 * @tcon:	Tree connection the request belongs to (read-only).
 * @rq:		Request being considered (read-only).
 *
 * NOTE(review): the criteria applied (e.g. negotiated capabilities, request type, payload
 * size, is_compressible() result) are not documented -- confirm against the body and list
 * them here.
 */
bool should_compress(const struct cifs_tcon *tcon, const struct smb_rqst *rq)
{ … }

/*
 * smb_compress() - Compression entry point for an SMB2 request.
 * @server:	Server the request is destined for.
 * @rq:		Request to process.
 * @send_fn:	Send callback supplied by the caller.
 *
 * NOTE(review): presumably compresses the payload of @rq (see MS-SMB2 "3.1.4.4 Compressing
 * the Message", referenced in the file header) and hands the result to @send_fn; whether
 * @send_fn is also used as a fallback for the uncompressed request, and the return value
 * convention (0 / negative errno?), must be confirmed against the body.
 */
int smb_compress(struct TCP_Server_Info *server, struct smb_rqst *rq, compress_send_fn send_fn)
{ … }
linux/fs/smb/client/compress.c