unique_position.cc | Explore in Territory

// Copyright 2012 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "components/sync/base/unique_position.h"

#include <algorithm>
#include <limits>

#include "base/logging.h"
#include "base/rand_util.h"
#include "base/strings/string_number_conversions.h"
#include "base/trace_event/memory_usage_estimator.h"
#include "components/sync/protocol/unique_position.pb.h"
#include "third_party/zlib/zlib.h"

namespace syncer {

constexpr size_t UniquePosition::kSuffixLength;
constexpr size_t UniquePosition::kCompressBytesThreshold;

// static.
bool UniquePosition::IsValidSuffix(const std::string& suffix) { … }

// static.
bool UniquePosition::IsValidBytes(const std::string& bytes) { … }

// static.
std::string UniquePosition::RandomSuffix() { … }

// static.
UniquePosition UniquePosition::FromProto(const sync_pb::UniquePosition& proto) { … }

// static.
UniquePosition UniquePosition::FromInt64(int64_t x, const std::string& suffix) { … }

// static.
UniquePosition UniquePosition::InitialPosition(const std::string& suffix) { … }

// static.
UniquePosition UniquePosition::Before(const UniquePosition& x,
                                      const std::string& suffix) { … }

// static.
UniquePosition UniquePosition::After(const UniquePosition& x,
                                     const std::string& suffix) { … }

// static.
UniquePosition UniquePosition::Between(const UniquePosition& before,
                                       const UniquePosition& after,
                                       const std::string& suffix) { … }

UniquePosition::UniquePosition() = default;

bool UniquePosition::LessThan(const UniquePosition& other) const { … }

bool UniquePosition::Equals(const UniquePosition& other) const { … }

sync_pb::UniquePosition UniquePosition::ToProto() const { … }

void UniquePosition::SerializeToString(std::string* blob) const { … }

bool UniquePosition::IsValid() const { … }

std::string UniquePosition::ToDebugString() const { … }

std::string UniquePosition::GetSuffixForTest() const { … }

std::string UniquePosition::FindSmallerWithSuffix(const std::string& reference,
                                                  const std::string& suffix) { … }

// static
std::string UniquePosition::FindGreaterWithSuffix(const std::string& reference,
                                                  const std::string& suffix) { … }

// static
std::string UniquePosition::FindBetweenWithSuffix(const std::string& before,
                                                  const std::string& after,
                                                  const std::string& suffix) { … }

UniquePosition::UniquePosition(const std::string& compressed)
    : … { … }

UniquePosition::UniquePosition(const std::string& uncompressed,
                               const std::string& suffix)
    : … { … }

// On custom compression:
//
// Let C(x) be the compression function and U(x) be the uncompression function.
//
// This compression scheme has a few special properties.  For one, it is
// order-preserving.  For any two valid position strings x and y:
//   x < y <=> C(x) < C(y)
// This allows us keep the position strings compressed as we sort them.
//
// The compressed format and the decode algorithm:
//
// The compressed string is a series of blocks, almost all of which are 8 bytes
// in length.  The only exception is the last block in the compressed string,
// which may be a remainder block, which has length no greater than 7.  The
// full-length blocks are either repeated character blocks or plain data blocks.
// All blocks are entirely self-contained.  Their decoded values are independent
// from that of their neighbours.
//
// A repeated character block is encoded into eight bytes and represents between
// 4 and 2^31 repeated instances of a given character in the unencoded stream.
// The encoding consists of a single character repeated four times, followed by
// an encoded count.  The encoded count is stored as a big-endian 32 bit
// integer.  There are 2^31 possible count values, and two encodings for each.
// The high encoding is 'enc = kuint32max - count'; the low encoding is 'enc =
// count'.  At compression time, the algorithm will choose between the two
// encodings based on which of the two will maintain the appropriate sort
// ordering (by a process which will be described below).  The decompression
// algorithm need not concern itself with which encoding was used; it needs only
// to decode it.  The decoded value of this block is "count" instances of the
// character that was repeated four times in the first half of this block.
//
// A plain data block is encoded into eight bytes and represents exactly eight
// bytes of data in the unencoded stream.  The plain data block must not begin
// with the same character repeated four times.  It is allowed to contain such a
// four-character sequence, just not at the start of the block.  The decoded
// value of a plain data block is identical to its encoded value.
//
// A remainder block has length of at most seven.  It is a shorter version of
// the plain data block.  It occurs only at the end of the encoded stream and
// represents exactly as many bytes of unencoded data as its own length.  Like a
// plain data block, the remainder block never begins with the same character
// repeated four times.  The decoded value of this block is identical to its
// encoded value.
//
// The encode algorithm:
//
// From the above description, it can be seen that there may be more than one
// way to encode a given input string.  The encoder must be careful to choose
// the encoding that guarantees sort ordering.
//
// The rules for the encoder are as follows:
// 1. Iterate through the input string and produce output blocks one at a time.
// 2. Where possible (ie. where the next four bytes of input consist of the
//    same character repeated four times), produce a repeated data block of
//    maximum possible length.
// 3. If there is at least 8 bytes of data remaining and it is not possible
//    to produce a repeated character block, produce a plain data block.
// 4. If there are less than 8 bytes of data remaining and it is not possible
//    to produce a repeated character block, produce a remainder block.
// 5. When producing a repeated character block, the count encoding must be
//    chosen in such a way that the sort ordering is maintained.  The choice is
//    best illustrated by way of example:
//
//      When comparing two strings, the first of which begins with of 8
//      instances of the letter 'B' and the second with 10 instances of the
//      letter 'B', which of the two should compare lower?  The result depends
//      on the 9th character of the first string, since it will be compared
//      against the 9th 'B' in the second string.  If that character is an 'A',
//      then the first string will compare lower.  If it is a 'C', then the
//      first string will compare higher.
//
//    The key insight is that the comparison value of a repeated character block
//    depends on the value of the character that follows it.  If the character
//    follows the repeated character has a value greater than the repeated
//    character itself, then a shorter run length should translate to a higher
//    comparison value.  Therefore, we encode its count using the low encoding.
//    Similarly, if the following character is lower, we use the high encoding.

namespace {

// Appends an encoded run length to |output_str|.
static void WriteEncodedRunLength(uint32_t length,
                                  bool high_encoding,
                                  std::string* output_str) { … }

// Reads an encoded run length for |str| at position |i|.
static uint32_t ReadEncodedRunLength(const std::string& str, size_t i) { … }

// A series of four identical chars at the beginning of a block indicates
// the beginning of a repeated character block.
static bool IsRepeatedCharPrefix(const std::string& chars, size_t start_index) { … }

}  // namespace

// static
// Wraps the CompressImpl function with a bunch of DCHECKs.
std::string UniquePosition::Compress(const std::string& str) { … }

// static
// Performs the order preserving run length compression of a given input string.
std::string UniquePosition::CompressImpl(const std::string& str) { … }

// static
// Uncompresses strings that were compresed with UniquePosition::Compress.
std::string UniquePosition::Uncompress(const std::string& str) { … }

bool UniquePosition::IsValidCompressed(const std::string& str) { … }

size_t UniquePosition::EstimateMemoryUsage() const { … }

}  // namespace syncer
chromium/components/sync/base/unique_position.cc