// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2024, SUSE LLC * * Authors: Enzo Matsumiya <[email protected]> * * This file implements I/O compression support for SMB2 messages (SMB 3.1.1 only). * See compress/ for implementation details of each algorithm. * * References: * MS-SMB2 "3.1.4.4 Compressing the Message" * MS-SMB2 "3.1.5.3 Decompressing the Chained Message" * MS-XCA - for details of the supported algorithms */ #include <linux/slab.h> #include <linux/kernel.h> #include <linux/uio.h> #include <linux/sort.h> #include "cifsglob.h" #include "../common/smb2pdu.h" #include "cifsproto.h" #include "smb2proto.h" #include "compress/lz77.h" #include "compress.h" /* * The heuristic functions below try to determine data compressibility. * * Derived from fs/btrfs/compression.c, changing coding style, some parameters, and removing * unused parts. * * Read that file for a better and more detailed explanation of the calculations. * * The algorithms are run on a collected sample of the input (uncompressed) data. * The sample is formed of 2K reads in PAGE_SIZE intervals, with a maximum size of 4M. * * Parsing the sample goes from "low-hanging fruit" (fastest algorithms, likely compressible) * to "need more analysis" (likely uncompressible). */ struct bucket { … }; /** * has_low_entropy() - Compute Shannon entropy of the sampled data. * @bkt: Byte counts of the sample. * @slen: Size of the sample. * * Return: true if the level (percentage of number of bits that would be required to * compress the data) is below the minimum threshold. * * Note: * There _is_ an entropy level here that's > 65 (minimum threshold) that would indicate a * possibility of compression, but compressing, or even further analysing, it would waste so many * resources that it's simply not worth it. * * Also, Shannon entropy is the last computed heuristic; if we got this far and ended up * with uncertainty, just stay on the safe side and call it uncompressible. 
*/ static bool has_low_entropy(struct bucket *bkt, size_t slen) { … } #define BYTE_DIST_BAD … #define BYTE_DIST_GOOD … #define BYTE_DIST_MAYBE … /** * calc_byte_distribution() - Compute byte distribution on the sampled data. * @bkt: Byte counts of the sample. * @slen: Size of the sample. * * Return: * BYTE_DIST_BAD: A "hard no" for compression -- a computed uniform distribution of * the bytes (e.g. random or encrypted data). * BYTE_DIST_GOOD: High probability (normal (Gaussian) distribution) of the data being * compressible. * BYTE_DIST_MAYBE: When the computed byte distribution resulted in "low < n < high" * grounds. has_low_entropy() should be used for a final decision. */ static int calc_byte_distribution(struct bucket *bkt, size_t slen) { … } static bool is_mostly_ascii(const struct bucket *bkt) { … } static bool has_repeated_data(const u8 *sample, size_t len) { … } static int cmp_bkt(const void *_a, const void *_b) { … } /* * TODO: * Support other iter types, if required. * Only ITER_XARRAY is supported for now. */ static int collect_sample(const struct iov_iter *iter, ssize_t max, u8 *sample) { … } /** * is_compressible() - Determines if a chunk of data is compressible. * @data: Iterator containing uncompressed data. * * Return: true if @data is compressible, false otherwise. * * Tests show that this function is quite reliable in predicting data compressibility, * matching close to 1:1 with the behaviour of LZ77 compression success and failures. */ static bool is_compressible(const struct iov_iter *data) { … } bool should_compress(const struct cifs_tcon *tcon, const struct smb_rqst *rq) { … } int smb_compress(struct TCP_Server_Info *server, struct smb_rqst *rq, compress_send_fn send_fn) { … }