//===--- PostingList.cpp - Symbol identifiers storage interface -----------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// #include "PostingList.h" #include "index/dex/Iterator.h" #include "index/dex/Token.h" #include "llvm/Support/MathExtras.h" #include <optional> namespace clang { namespace clangd { namespace dex { namespace { /// Implements iterator of PostingList chunks. This requires iterating over two /// levels: the first level iterator iterates over the chunks and decompresses /// them on-the-fly when the contents of chunk are to be seen. class ChunkIterator : public Iterator { … }; static constexpr size_t BitsPerEncodingByte = …; /// Writes a variable length DocID into the buffer and updates the buffer size. /// If it doesn't fit, returns false and doesn't write to the buffer. bool encodeVByte(DocID Delta, llvm::MutableArrayRef<uint8_t> &Payload) { … } /// Use Variable-length Byte (VByte) delta encoding to compress sorted list of /// DocIDs. The compression stores deltas (differences) between subsequent /// DocIDs and encodes these deltas utilizing the least possible number of /// bytes. /// /// Each encoding byte consists of two parts: the first bit (continuation bit) /// indicates whether this is the last byte (0 if this byte is the last) of /// current encoding and seven bytes a piece of DocID (payload). DocID contains /// 32 bits and therefore it takes up to 5 bytes to encode it (4 full 7-bit /// payloads and one 4-bit payload), but in practice it is expected that gaps /// (deltas) between subsequent DocIDs are not large enough to require 5 bytes. /// In very dense posting lists (with average gaps less than 128) this /// representation would be 4 times more efficient than raw DocID array. /// /// PostingList encoding example: /// /// DocIDs 42 47 7000 /// gaps 5 6958 /// Encoding (raw number) 00000101 10110110 00101110 std::vector<Chunk> encodeStream(llvm::ArrayRef<DocID> Documents) { … } /// Reads variable length DocID from the buffer and updates the buffer size. If /// the stream is terminated, return std::nullopt. std::optional<DocID> readVByte(llvm::ArrayRef<uint8_t> &Bytes) { … } } // namespace llvm::SmallVector<DocID, Chunk::PayloadSize + 1> Chunk::decompress() const { … } PostingList::PostingList(llvm::ArrayRef<DocID> Documents) : … { … } std::unique_ptr<Iterator> PostingList::iterator(const Token *Tok) const { … } } // namespace dex } // namespace clangd } // namespace clang