// Copyright 2012 The Chromium Authors // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "components/sync/base/unique_position.h" #include <algorithm> #include <limits> #include "base/logging.h" #include "base/rand_util.h" #include "base/strings/string_number_conversions.h" #include "base/trace_event/memory_usage_estimator.h" #include "components/sync/protocol/unique_position.pb.h" #include "third_party/zlib/zlib.h" namespace syncer { constexpr size_t UniquePosition::kSuffixLength; constexpr size_t UniquePosition::kCompressBytesThreshold; // static. bool UniquePosition::IsValidSuffix(const std::string& suffix) { … } // static. bool UniquePosition::IsValidBytes(const std::string& bytes) { … } // static. std::string UniquePosition::RandomSuffix() { … } // static. UniquePosition UniquePosition::FromProto(const sync_pb::UniquePosition& proto) { … } // static. UniquePosition UniquePosition::FromInt64(int64_t x, const std::string& suffix) { … } // static. UniquePosition UniquePosition::InitialPosition(const std::string& suffix) { … } // static. UniquePosition UniquePosition::Before(const UniquePosition& x, const std::string& suffix) { … } // static. UniquePosition UniquePosition::After(const UniquePosition& x, const std::string& suffix) { … } // static. UniquePosition UniquePosition::Between(const UniquePosition& before, const UniquePosition& after, const std::string& suffix) { … } UniquePosition::UniquePosition() = default; bool UniquePosition::LessThan(const UniquePosition& other) const { … } bool UniquePosition::Equals(const UniquePosition& other) const { … } sync_pb::UniquePosition UniquePosition::ToProto() const { … } void UniquePosition::SerializeToString(std::string* blob) const { … } bool UniquePosition::IsValid() const { … } std::string UniquePosition::ToDebugString() const { … } std::string UniquePosition::GetSuffixForTest() const { … } std::string UniquePosition::FindSmallerWithSuffix(const std::string& reference, const std::string& suffix) { … } // static std::string UniquePosition::FindGreaterWithSuffix(const std::string& reference, const std::string& suffix) { … } // static std::string UniquePosition::FindBetweenWithSuffix(const std::string& before, const std::string& after, const std::string& suffix) { … } UniquePosition::UniquePosition(const std::string& compressed) : … { … } UniquePosition::UniquePosition(const std::string& uncompressed, const std::string& suffix) : … { … } // On custom compression: // // Let C(x) be the compression function and U(x) be the uncompression function. // // This compression scheme has a few special properties. For one, it is // order-preserving. For any two valid position strings x and y: // x < y <=> C(x) < C(y) // This allows us keep the position strings compressed as we sort them. // // The compressed format and the decode algorithm: // // The compressed string is a series of blocks, almost all of which are 8 bytes // in length. The only exception is the last block in the compressed string, // which may be a remainder block, which has length no greater than 7. The // full-length blocks are either repeated character blocks or plain data blocks. // All blocks are entirely self-contained. Their decoded values are independent // from that of their neighbours. // // A repeated character block is encoded into eight bytes and represents between // 4 and 2^31 repeated instances of a given character in the unencoded stream. // The encoding consists of a single character repeated four times, followed by // an encoded count. The encoded count is stored as a big-endian 32 bit // integer. There are 2^31 possible count values, and two encodings for each. // The high encoding is 'enc = kuint32max - count'; the low encoding is 'enc = // count'. At compression time, the algorithm will choose between the two // encodings based on which of the two will maintain the appropriate sort // ordering (by a process which will be described below). The decompression // algorithm need not concern itself with which encoding was used; it needs only // to decode it. The decoded value of this block is "count" instances of the // character that was repeated four times in the first half of this block. // // A plain data block is encoded into eight bytes and represents exactly eight // bytes of data in the unencoded stream. The plain data block must not begin // with the same character repeated four times. It is allowed to contain such a // four-character sequence, just not at the start of the block. The decoded // value of a plain data block is identical to its encoded value. // // A remainder block has length of at most seven. It is a shorter version of // the plain data block. It occurs only at the end of the encoded stream and // represents exactly as many bytes of unencoded data as its own length. Like a // plain data block, the remainder block never begins with the same character // repeated four times. The decoded value of this block is identical to its // encoded value. // // The encode algorithm: // // From the above description, it can be seen that there may be more than one // way to encode a given input string. The encoder must be careful to choose // the encoding that guarantees sort ordering. // // The rules for the encoder are as follows: // 1. Iterate through the input string and produce output blocks one at a time. // 2. Where possible (ie. where the next four bytes of input consist of the // same character repeated four times), produce a repeated data block of // maximum possible length. // 3. If there is at least 8 bytes of data remaining and it is not possible // to produce a repeated character block, produce a plain data block. // 4. If there are less than 8 bytes of data remaining and it is not possible // to produce a repeated character block, produce a remainder block. // 5. When producing a repeated character block, the count encoding must be // chosen in such a way that the sort ordering is maintained. The choice is // best illustrated by way of example: // // When comparing two strings, the first of which begins with of 8 // instances of the letter 'B' and the second with 10 instances of the // letter 'B', which of the two should compare lower? The result depends // on the 9th character of the first string, since it will be compared // against the 9th 'B' in the second string. If that character is an 'A', // then the first string will compare lower. If it is a 'C', then the // first string will compare higher. // // The key insight is that the comparison value of a repeated character block // depends on the value of the character that follows it. If the character // follows the repeated character has a value greater than the repeated // character itself, then a shorter run length should translate to a higher // comparison value. Therefore, we encode its count using the low encoding. // Similarly, if the following character is lower, we use the high encoding. namespace { // Appends an encoded run length to |output_str|. static void WriteEncodedRunLength(uint32_t length, bool high_encoding, std::string* output_str) { … } // Reads an encoded run length for |str| at position |i|. static uint32_t ReadEncodedRunLength(const std::string& str, size_t i) { … } // A series of four identical chars at the beginning of a block indicates // the beginning of a repeated character block. static bool IsRepeatedCharPrefix(const std::string& chars, size_t start_index) { … } } // namespace // static // Wraps the CompressImpl function with a bunch of DCHECKs. std::string UniquePosition::Compress(const std::string& str) { … } // static // Performs the order preserving run length compression of a given input string. std::string UniquePosition::CompressImpl(const std::string& str) { … } // static // Uncompresses strings that were compresed with UniquePosition::Compress. std::string UniquePosition::Uncompress(const std::string& str) { … } bool UniquePosition::IsValidCompressed(const std::string& str) { … } size_t UniquePosition::EstimateMemoryUsage() const { … } } // namespace syncer