// Copyright 2005 Google Inc. All Rights Reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "snappy-internal.h"
#include "snappy-sinksource.h"
#include "snappy.h"

#if !defined(SNAPPY_HAVE_BMI2)
// __BMI2__ is defined by GCC and Clang. Visual Studio doesn't target BMI2
// specifically, but it does define __AVX2__ when AVX2 support is available.
// Fortunately, AVX2 was introduced in Haswell, just like BMI2.
//
// BMI2 is not defined as a subset of AVX2 (unlike SSSE3 and AVX above). So,
// GCC and Clang can build code with AVX2 enabled but BMI2 disabled, in which
// case issuing BMI2 instructions results in a compiler error.
#if defined(__BMI2__) || (defined(_MSC_VER) && defined(__AVX2__))
#define SNAPPY_HAVE_BMI2 …
#else
#define SNAPPY_HAVE_BMI2 …
#endif
#endif  // !defined(SNAPPY_HAVE_BMI2)

#if !defined(SNAPPY_HAVE_X86_CRC32)
#if defined(__SSE4_2__)
#define SNAPPY_HAVE_X86_CRC32 …
#else
#define SNAPPY_HAVE_X86_CRC32 …
#endif
#endif  // !defined(SNAPPY_HAVE_X86_CRC32)

#if !defined(SNAPPY_HAVE_NEON_CRC32)
#if SNAPPY_HAVE_NEON && defined(__ARM_FEATURE_CRC32)
#define SNAPPY_HAVE_NEON_CRC32 …
#else
#define SNAPPY_HAVE_NEON_CRC32 …
#endif
#endif  // !defined(SNAPPY_HAVE_NEON_CRC32)

#if SNAPPY_HAVE_BMI2 || SNAPPY_HAVE_X86_CRC32
// Please do not replace with <x86intrin.h> or with headers that assume more
// advanced SSE versions without checking with all the OWNERS.
#include <immintrin.h>
#elif SNAPPY_HAVE_NEON_CRC32
#include <arm_acle.h>
#endif

#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <string>
#include <utility>
#include <vector>

namespace snappy {

namespace {

// The number of slop bytes writers use for unconditional copies.
constexpr int kSlopBytes = …;

using internal::char_table;
using internal::COPY_1_BYTE_OFFSET;
using internal::COPY_2_BYTE_OFFSET;
using internal::COPY_4_BYTE_OFFSET;
using internal::kMaximumTagLength;
using internal::LITERAL;
#if SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
using internal::V128;
using internal::V128_Load;
using internal::V128_LoadU;
using internal::V128_Shuffle;
using internal::V128_StoreU;
using internal::V128_DupChar;
#endif

// We translate the information encoded in a tag through a lookup table to a
// format that requires fewer instructions to decode. Effectively we store
// the length minus the tag part of the offset. The least significant byte
// thus stores the length, while total length - offset is given by
// entry - ExtractOffset(type). The nice thing is that the subtraction
// immediately sets the flags for the necessary check that offset >= length.
// This folds the cmp with the sub. We engineer the long literals and copy-4
// tags to always fail this check, so their presence doesn't affect the fast
// path. To prevent literals from triggering the guard against offset < length
// (offset does not apply to literals) the table gives them a spurious offset
// of 256.
inline constexpr int16_t MakeEntry(int16_t len, int16_t offset) { … }

inline constexpr int16_t LengthMinusOffset(int data, int type) { … }

inline constexpr int16_t LengthMinusOffset(uint8_t tag) { … }

template <size_t... Ints>
struct index_sequence { … };

template <std::size_t N, size_t... Is>
struct make_index_sequence : make_index_sequence<N - 1, N - 1, Is...> { … };

template <size_t... Is>
struct make_index_sequence<0, Is...> : index_sequence<Is...> { … };

template <size_t... seq>
constexpr std::array<int16_t, 256> MakeTable(index_sequence<seq...>) { … }

alignas(64) const std::array<int16_t, 256> kLengthMinusOffset = …;

// Given a table of uint16_t whose size is mask / 2 + 1, return a pointer to
// the relevant entry, if any, for the given bytes. Any hash function will do,
// but a good hash function reduces the number of collisions and thus yields
// better compression for compressible input.
//
// REQUIRES: mask is 2 * (table_size - 1), and table_size is a power of two.
inline uint16_t* TableEntry(uint16_t* table, uint32_t bytes, uint32_t mask) { … }

}  // namespace

size_t MaxCompressedLength(size_t source_bytes) { … }

namespace {

void UnalignedCopy64(const void* src, void* dst) { … }

void UnalignedCopy128(const void* src, void* dst) { … }

template <bool use_16bytes_chunk>
inline void ConditionalUnalignedCopy128(const char* src, char* dst) { … }

// Copy [src, src+(op_limit-op)) to [op, op_limit) a byte at a time. Used
// for handling COPY operations where the input and output regions may overlap.
// For example, suppose:
//    src       == "ab"
//    op        == src + 2
//    op_limit  == op + 20
// After IncrementalCopySlow(src, op, op_limit), the result will have eleven
// copies of "ab":
//    ababababababababababab
// Note that this does not match the semantics of either std::memcpy() or
// std::memmove().
inline char* IncrementalCopySlow(const char* src, char* op,
                                 char* const op_limit) { … }

#if SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE

// Computes the bytes for shuffle control mask (please read comments on
// 'pattern_generation_masks' as well) for the given index_offset and
// pattern_size. For example, when the 'offset' is 6, it will generate a
// repeating pattern of size 6. So, the first 16 byte indexes will correspond
// to the pattern-bytes {0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3} and
// the next 16 byte indexes will correspond to the pattern-bytes {4, 5, 0, 1,
// 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1}.
// These byte index sequences are generated by calling
// MakePatternMaskBytes(0, 6, index_sequence<16>()) and
// MakePatternMaskBytes(16, 6, index_sequence<16>()) respectively.
template <size_t... indexes>
inline constexpr std::array<char, sizeof...(indexes)> MakePatternMaskBytes(
    int index_offset, int pattern_size, index_sequence<indexes...>) {
  return {static_cast<char>((index_offset + indexes) % pattern_size)...};
}

// Computes the shuffle control mask bytes array for given pattern-sizes and
// returns an array.
template <size_t... pattern_sizes_minus_one>
inline constexpr std::array<std::array<char, sizeof(V128)>,
                            sizeof...(pattern_sizes_minus_one)>
MakePatternMaskBytesTable(int index_offset,
                          index_sequence<pattern_sizes_minus_one...>) {
  return {
      MakePatternMaskBytes(index_offset, pattern_sizes_minus_one + 1,
                           make_index_sequence</*indexes=*/sizeof(V128)>())...};
}

// This is an array of shuffle control masks that can be used as the source
// operand for PSHUFB to permute the contents of the destination XMM register
// into a repeating byte pattern.
alignas(16) constexpr std::array<std::array<char, sizeof(V128)>, 16>
    pattern_generation_masks = MakePatternMaskBytesTable(
        /*index_offset=*/0,
        /*pattern_sizes_minus_one=*/make_index_sequence<16>());

// Similar to 'pattern_generation_masks', this table is used to "rotate" the
// pattern so that we can copy the *next 16 bytes* consistent with the pattern.
// Basically, pattern_reshuffle_masks is a continuation of
// pattern_generation_masks. It follows that pattern_reshuffle_masks is the
// same as pattern_generation_masks for offsets 1, 2, 4, 8 and 16.
alignas(16) constexpr std::array<std::array<char, sizeof(V128)>, 16>
    pattern_reshuffle_masks = MakePatternMaskBytesTable(
        /*index_offset=*/16,
        /*pattern_sizes_minus_one=*/make_index_sequence<16>());

SNAPPY_ATTRIBUTE_ALWAYS_INLINE
static inline V128 LoadPattern(const char* src, const size_t pattern_size) {
  V128 generation_mask = V128_Load(reinterpret_cast<const V128*>(
      pattern_generation_masks[pattern_size - 1].data()));
  // Uninitialized bytes are masked out by the shuffle mask.
  // TODO: remove annotation and macro defs once MSan is fixed.
  SNAPPY_ANNOTATE_MEMORY_IS_INITIALIZED(src + pattern_size, 16 - pattern_size);
  return V128_Shuffle(V128_LoadU(reinterpret_cast<const V128*>(src)),
                      generation_mask);
}

SNAPPY_ATTRIBUTE_ALWAYS_INLINE
static inline std::pair<V128 /* pattern */, V128 /* reshuffle_mask */>
LoadPatternAndReshuffleMask(const char* src, const size_t pattern_size) {
  V128 pattern = LoadPattern(src, pattern_size);

  // This mask will generate the next 16 bytes in-place. Doing so enables us
  // to write data with at most 4 V128_StoreU operations.
  //
  // For example, suppose pattern is:        abcdefabcdefabcd
  // Shuffling with this mask will generate: efabcdefabcdefab
  // Shuffling again will generate:          cdefabcdefabcdef
  V128 reshuffle_mask = V128_Load(reinterpret_cast<const V128*>(
      pattern_reshuffle_masks[pattern_size - 1].data()));
  return {pattern, reshuffle_mask};
}
#endif  // SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE

// Fallback for when we need to copy while extending the pattern, for example
// copying 10 bytes from 3 positions back: abc -> abcabcabcabca.
//
// REQUIRES: [dst - offset, dst + 64) is a valid address range.
SNAPPY_ATTRIBUTE_ALWAYS_INLINE
static inline bool Copy64BytesWithPatternExtension(char* dst, size_t offset) { … }

// Copy [src, src+(op_limit-op)) to [op, op_limit) but faster than
// IncrementalCopySlow. buf_limit is the address past the end of the writable
// region of the buffer.
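//
// As a minimal sketch of that contract (the helper name below is hypothetical
// and used only for illustration), the pattern-extension copy can be written
// as a plain byte loop:
//
//   char* IncrementalCopySketch(const char* src, char* op,
//                               char* const op_limit) {
//     // src may trail op by fewer than (op_limit - op) bytes, so a byte
//     // written by an earlier iteration can be read by a later one; that is
//     // exactly how a short pattern gets extended.
//     while (op < op_limit) *op++ = *src++;
//     return op_limit;
//   }
//
// The vector path above reaches the same result by materializing one period
// of the pattern in a V128 register and repeatedly reshuffling it, instead of
// copying one byte at a time.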
inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
                             char* const buf_limit) { … }

}  // namespace

template <bool allow_fast_path>
static inline char* EmitLiteral(char* op, const char* literal, int len) { … }

template <bool len_less_than_12>
static inline char* EmitCopyAtMost64(char* op, size_t offset, size_t len) { … }

template <bool len_less_than_12>
static inline char* EmitCopy(char* op, size_t offset, size_t len) { … }

bool GetUncompressedLength(const char* start, size_t n, size_t* result) { … }

namespace {
uint32_t CalculateTableSize(uint32_t input_size) { … }
}  // namespace

namespace internal {
WorkingMemory::WorkingMemory(size_t input_size) { … }

WorkingMemory::~WorkingMemory() { … }

uint16_t* WorkingMemory::GetHashTable(size_t fragment_size,
                                      int* table_size) const { … }
}  // end namespace internal

// Flat array compression that does not emit the "uncompressed length"
// prefix. Compresses "input" string to the "*op" buffer.
//
// REQUIRES: "input" is at most "kBlockSize" bytes long.
// REQUIRES: "op" points to an array of memory that is at least
// "MaxCompressedLength(input.size())" in size.
// REQUIRES: All elements in "table[0..table_size-1]" are initialized to zero.
// REQUIRES: "table_size" is a power of two.
//
// Returns an "end" pointer into "op" buffer.
// "end - op" is the compressed size of "input".
namespace internal {
char* CompressFragment(const char* input, size_t input_size, char* op,
                       uint16_t* table, const int table_size) { … }
}  // end namespace internal

// Called back at every compression call to trace parameters and sizes.
static inline void Report(const char *algorithm, size_t compressed_size,
                          size_t uncompressed_size) { … }

// Signature of output types needed by decompression code.
// The decompression code is templatized on a type that obeys this
// signature so that we do not pay virtual function call overhead in
// the middle of a tight decompression loop.
//
// class DecompressionWriter {
//  public:
//   // Called before decompression
//   void SetExpectedLength(size_t length);
//
//   // For performance a writer may choose to donate the cursor variable to
//   // the decompression function. The decompression will inject it in all
//   // its function calls to the writer. Keeping the important output cursor
//   // as a function local stack variable allows the compiler to keep it in
//   // register, which greatly aids performance by avoiding loads and stores
//   // of this variable in the fast path loop iterations.
//   T GetOutputPtr() const;
//
//   // At end of decompression the loop donates the ownership of the cursor
//   // variable back to the writer by calling this function.
//   void SetOutputPtr(T op);
//
//   // Called after decompression
//   bool CheckLength() const;
//
//   // Called repeatedly during decompression.
//   // Each function gets a pointer to the op (output pointer), that the
//   // writer can use and update. Note it's important that these functions
//   // get fully inlined so that no actual address of the local variable
//   // needs to be taken.
//   bool Append(const char* ip, size_t length, T* op);
//   bool AppendFromSelf(uint32_t offset, size_t length, T* op);
//
//   // The rules for how TryFastAppend differs from Append are somewhat
//   // convoluted:
//   //
//   //  - TryFastAppend is allowed to decline (return false) at any
//   //    time, for any reason -- just "return false" would be
//   //    a perfectly legal implementation of TryFastAppend.
//   //    The intention is for TryFastAppend to allow a fast path
//   //    in the common case of a small append.
//   //  - TryFastAppend is allowed to read up to <available> bytes
//   //    from the input buffer, whereas Append is allowed to read
//   //    <length>. However, if it returns true, it must leave
//   //    at least five (kMaximumTagLength) bytes in the input buffer
//   //    afterwards, so that there is always enough space to read the
//   //    next tag without checking for a refill.
//   //  - TryFastAppend must always decline (return false)
//   //    if <length> is 61 or more, as in this case the literal length is
//   //    not decoded fully. In practice, this should not be a big problem,
//   //    as it is unlikely that one would implement a fast path accepting
//   //    this much data.
//   //
//   bool TryFastAppend(const char* ip, size_t available, size_t length,
//                      T* op);
// };

static inline uint32_t ExtractLowBytes(const uint32_t& v, int n) { … }

static inline bool LeftShiftOverflows(uint8_t value, uint32_t shift) { … }

inline bool Copy64BytesWithPatternExtension(ptrdiff_t dst, size_t offset) { … }

// Copies between size bytes and 64 bytes from src to dst. size cannot exceed
// 64. More than size bytes, but never exceeding 64, might be copied if doing
// so gives better performance. [src, src + size) must not overlap with
// [dst, dst + size), but [src, src + 64) may overlap with [dst, dst + 64).
void MemCopy64(char* dst, const void* src, size_t size) { … }

void MemCopy64(ptrdiff_t dst, const void* src, size_t size) { … }

void ClearDeferred(const void** deferred_src, size_t* deferred_length,
                   uint8_t* safe_source) { … }

void DeferMemCopy(const void** deferred_src, size_t* deferred_length,
                  const void* src, size_t length) { … }

SNAPPY_ATTRIBUTE_ALWAYS_INLINE
inline size_t AdvanceToNextTagARMOptimized(const uint8_t** ip_p, size_t* tag) { … }

SNAPPY_ATTRIBUTE_ALWAYS_INLINE
inline size_t AdvanceToNextTagX86Optimized(const uint8_t** ip_p, size_t* tag) { … }

// Extracts the offset for copy-1 and copy-2; returns 0 for literals and
// copy-4.
inline uint32_t ExtractOffset(uint32_t val, size_t tag_type) {
  // For x86 non-static storage works better. For ARM static storage is better.
  // TODO: Once the array is recognized as a register, improve the
  // readability for x86.
#if defined(__x86_64__)
  constexpr uint64_t kExtractMasksCombined = 0x0000FFFF00FF0000ull;
  uint16_t result;
  memcpy(&result,
         reinterpret_cast<const char*>(&kExtractMasksCombined) + 2 * tag_type,
         sizeof(result));
  return val & result;
#elif defined(__aarch64__)
  constexpr uint64_t kExtractMasksCombined = 0x0000FFFF00FF0000ull;
  return val & static_cast<uint32_t>(
                   (kExtractMasksCombined >> (tag_type * 16)) & 0xFFFF);
#else
  static constexpr uint32_t kExtractMasks[4] = {0, 0xFF, 0xFFFF, 0};
  return val & kExtractMasks[tag_type];
#endif
}

// Core decompression loop, when there is enough data available.
// Decompresses the input buffer [ip, ip_limit) into the output buffer
// [op, op_limit_min_slop). It returns when we are too close to the end of the
// input buffer, when the output cursor exceeds op_limit_min_slop, or when an
// exceptional tag is encountered (a literal of length > 60, or a copy-4).
// Returns {ip, op} at the point it stopped decoding.
// TODO: This function probably does not need to be inlined, as it
// should decode large chunks at a time. This allows runtime dispatch to
// implementations based on CPU capability (BMI2 / perhaps 32 / 64 byte
// memcpy).
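//
// For intuition, the masks consulted by ExtractOffset above unpack from
// kExtractMasksCombined (0x0000FFFF00FF0000ull) by reading 16 bits at offset
// 16 * tag_type; this is a restatement of the values already in the code
// above, not new behavior:
//
//   tag_type 0 (literal) -> mask 0x0000  (no offset; literals are instead
//                                         guarded by their spurious offset
//                                         of 256 in kLengthMinusOffset)
//   tag_type 1 (copy-1)  -> mask 0x00FF  (one offset byte follows the tag)
//   tag_type 2 (copy-2)  -> mask 0xFFFF  (two offset bytes follow the tag)
//   tag_type 3 (copy-4)  -> mask 0x0000  (engineered to fail the fast-path
//                                         offset >= length check, see above)
//
// Combined with kLengthMinusOffset, a single subtraction then produces
// length - offset, and its flags directly give the offset >= length test that
// guards the fast path below.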
template <typename T>
std::pair<const uint8_t*, ptrdiff_t> DecompressBranchless(
    const uint8_t* ip, const uint8_t* ip_limit, ptrdiff_t op, T op_base,
    ptrdiff_t op_limit_min_slop) { … }

// Helper class for decompression
class SnappyDecompressor { … };

constexpr uint32_t CalculateNeeded(uint8_t tag) { … }

#if __cplusplus >= 201402L
constexpr bool VerifyCalculateNeeded() { … }

// Make sure CalculateNeeded is correct by verifying it against the established
// table encoding the number of added bytes needed.
static_assert …;
#endif  // c++14

bool SnappyDecompressor::RefillTag() { … }

template <typename Writer>
static bool InternalUncompress(Source* r, Writer* writer) { … }

template <typename Writer>
static bool InternalUncompressAllTags(SnappyDecompressor* decompressor,
                                      Writer* writer, uint32_t compressed_len,
                                      uint32_t uncompressed_len) { … }

bool GetUncompressedLength(Source* source, uint32_t* result) { … }

size_t Compress(Source* reader, Sink* writer) { … }

// -----------------------------------------------------------------------
// IOVec interfaces
// -----------------------------------------------------------------------

// A `Source` implementation that yields the contents of an `iovec` array. Note
// that `total_size` is the total number of bytes to be read from the elements
// of `iov` (_not_ the total number of elements in `iov`).
class SnappyIOVecReader : public Source { … };

// A type that writes to an iovec.
// Note that this is not a "ByteSink", but a type that matches the
// Writer template argument to SnappyDecompressor::DecompressAllTags().
class SnappyIOVecWriter { … };

bool RawUncompressToIOVec(const char* compressed, size_t compressed_length,
                          const struct iovec* iov, size_t iov_cnt) { … }

bool RawUncompressToIOVec(Source* compressed, const struct iovec* iov,
                          size_t iov_cnt) { … }

// -----------------------------------------------------------------------
// Flat array interfaces
// -----------------------------------------------------------------------

// A type that writes to a flat array.
// Note that this is not a "ByteSink", but a type that matches the
// Writer template argument to SnappyDecompressor::DecompressAllTags().
class SnappyArrayWriter { … };

bool RawUncompress(const char* compressed, size_t compressed_length,
                   char* uncompressed) { … }

bool RawUncompress(Source* compressed, char* uncompressed) { … }

bool Uncompress(const char* compressed, size_t compressed_length,
                std::string* uncompressed) { … }

// A Writer that drops everything on the floor and just does validation
class SnappyDecompressionValidator { … };

bool IsValidCompressedBuffer(const char* compressed,
                             size_t compressed_length) { … }

bool IsValidCompressed(Source* compressed) { … }

void RawCompress(const char* input, size_t input_length, char* compressed,
                 size_t* compressed_length) { … }

void RawCompressFromIOVec(const struct iovec* iov, size_t uncompressed_length,
                          char* compressed, size_t* compressed_length) { … }

size_t Compress(const char* input, size_t input_length,
                std::string* compressed) { … }

size_t CompressFromIOVec(const struct iovec* iov, size_t iov_cnt,
                         std::string* compressed) { … }

// -----------------------------------------------------------------------
// Sink interface
// -----------------------------------------------------------------------

// A type that decompresses into a Sink. The template parameter
// Allocator must export one method "char* Allocate(int size);", which
// allocates a buffer of "size" and appends that to the destination.
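//
// A minimal sketch of an Allocator satisfying that contract (the class name
// is hypothetical and used only for illustration; see also SnappySinkAllocator
// further below):
//
//   class ExampleBlockAllocator {
//    public:
//     // Hands out a fresh block of `size` bytes. Each block keeps a stable
//     // address for the allocator's lifetime, since the writer fills it
//     // through raw output pointers.
//     char* Allocate(int size) {
//       blocks_.push_back(new char[size]);
//       return blocks_.back();
//     }
//     ~ExampleBlockAllocator() {
//       for (char* block : blocks_) delete[] block;
//     }
//
//    private:
//     std::vector<char*> blocks_;  // stands in for "the destination" here
//   };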
template <typename Allocator>
class SnappyScatteredWriter { … };

template <typename Allocator>
bool SnappyScatteredWriter<Allocator>::SlowAppend(const char* ip,
                                                  size_t len) { … }

template <typename Allocator>
bool SnappyScatteredWriter<Allocator>::SlowAppendFromSelf(size_t offset,
                                                          size_t len) { … }

class SnappySinkAllocator { … };

size_t UncompressAsMuchAsPossible(Source* compressed, Sink* uncompressed) { … }

bool Uncompress(Source* compressed, Sink* uncompressed) { … }

}  // namespace snappy
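
// An illustrative round trip through the flat-array std::string interface
// defined above (a sketch only; assumes `original` holds the caller's input
// and omits error handling):
//
//   std::string original = ...;
//   std::string compressed, restored;
//   snappy::Compress(original.data(), original.size(), &compressed);
//   if (snappy::Uncompress(compressed.data(), compressed.size(), &restored)) {
//     assert(restored == original);
//   }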