/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <algorithm>
#include <fstream>
#include <limits>
#include <random>
#include <string>
#include <unordered_set>
#include <vector>
#include <glog/logging.h>
#include <folly/Benchmark.h>
#include <folly/Likely.h>
#include <folly/Optional.h>
#include <folly/compression/Instructions.h>
#include <folly/portability/GTest.h>
namespace folly {
namespace compression {
template <class URNG>
std::vector<uint64_t> generateRandomList(
size_t n, uint64_t maxId, URNG&& g, bool withDuplicates = false) {
CHECK_LT(n, 2 * maxId);
std::uniform_int_distribution<uint64_t> uid(1, maxId);
std::unordered_set<uint64_t> dataset;
while (dataset.size() < n) {
uint64_t value = uid(g);
if (dataset.count(value) == 0) {
dataset.insert(value);
}
}
std::vector<uint64_t> ids(dataset.begin(), dataset.end());
if (withDuplicates && n > 0) {
// Ensure 20% of the list has at least 1 duplicate, 10% has at least 2, and
// 5% is a run of the same value.
std::copy(ids.begin() + ids.size() * 8 / 10, ids.end(), ids.begin());
std::copy(ids.begin() + ids.size() * 9 / 10, ids.end(), ids.begin());
std::fill(ids.begin(), ids.begin() + ids.size() / 20, ids[0]);
}
std::sort(ids.begin(), ids.end());
return ids;
}
inline std::vector<uint64_t> generateRandomList(
size_t n, uint64_t maxId, bool withDuplicates = false) {
std::mt19937 gen;
return generateRandomList(n, maxId, gen, withDuplicates);
}
inline std::vector<uint64_t> generateSeqList(
uint64_t minId, uint64_t maxId, uint64_t step = 1) {
CHECK_LE(minId, maxId);
CHECK_GT(step, 0);
std::vector<uint64_t> ids;
ids.reserve((maxId - minId) / step + 1);
for (uint64_t i = minId; i <= maxId; i += step) {
ids.push_back(i);
}
return ids;
}
inline std::vector<uint64_t> loadList(const std::string& filename) {
std::ifstream fin(filename);
std::vector<uint64_t> result;
uint64_t id;
while (fin >> id) {
result.push_back(id);
}
return result;
}
// Test previousValue only if Reader has it.
template <class... Args>
void maybeTestPreviousValue(Args&&...) {}
// Make all the arguments template because if the types are not exact,
// the above overload will be picked (for example i could be size_t or
// ssize_t).
template <class Vector, class Reader, class Index>
auto maybeTestPreviousValue(const Vector& data, Reader& reader, Index i)
-> decltype(reader.previousValue(), void()) {
if (i != 0) {
EXPECT_EQ(reader.previousValue(), data[i - 1]);
}
}
// Test previous only if Reader has it.
template <class... Args>
void maybeTestPrevious(Args&&...) {}
// Make all the arguments template because if the types are not exact,
// the above overload will be picked (for example i could be size_t or
// ssize_t).
template <class Vector, class Reader, class Index>
auto maybeTestPrevious(const Vector& data, Reader& reader, Index i)
-> decltype(reader.previous(), void()) {
auto r = reader.previous();
if (i != 0) {
EXPECT_TRUE(r);
EXPECT_EQ(reader.value(), data[i - 1]);
} else {
EXPECT_FALSE(r);
}
reader.next();
EXPECT_EQ(reader.value(), data[i]);
}
template <class Reader, class List>
void testNext(const std::vector<uint64_t>& data, const List& list) {
Reader reader(list);
EXPECT_FALSE(reader.valid());
for (size_t i = 0; i < data.size(); ++i) {
EXPECT_TRUE(reader.next()) << i << " " << data.size();
EXPECT_TRUE(reader.valid()) << i << " " << data.size();
EXPECT_EQ(reader.value(), data[i]) << i << " " << data.size();
EXPECT_EQ(reader.position(), i) << i << " " << data.size();
maybeTestPreviousValue(data, reader, i);
maybeTestPrevious(data, reader, i);
}
EXPECT_FALSE(reader.next()) << data.size();
EXPECT_FALSE(reader.valid()) << data.size();
EXPECT_EQ(reader.position(), reader.size());
}
template <class Reader, class List>
void testSkip(
const std::vector<uint64_t>& data, const List& list, size_t skipStep) {
CHECK_GT(skipStep, 0);
Reader reader(list);
for (size_t i = skipStep - 1; i < data.size(); i += skipStep) {
// Destination must be representable.
if (i + skipStep > std::numeric_limits<typename Reader::SizeType>::max()) {
return;
}
// Also test that skip(0) stays in place.
for (auto step : {skipStep, size_t(0)}) {
EXPECT_TRUE(reader.skip(step));
EXPECT_TRUE(reader.valid());
EXPECT_EQ(reader.value(), data[i]);
EXPECT_EQ(reader.position(), i);
}
maybeTestPreviousValue(data, reader, i);
maybeTestPrevious(data, reader, i);
}
EXPECT_FALSE(reader.skip(skipStep));
EXPECT_FALSE(reader.valid());
EXPECT_EQ(reader.position(), reader.size());
EXPECT_FALSE(reader.next());
}
template <class Reader, class List>
void testSkip(const std::vector<uint64_t>& data, const List& list) {
for (size_t skipStep = 1; skipStep < 25; ++skipStep) {
testSkip<Reader, List>(data, list, skipStep);
}
for (size_t skipStep = 25; skipStep <= 500; skipStep += 25) {
testSkip<Reader, List>(data, list, skipStep);
}
}
template <class Reader, class List>
void testSkipTo(
const std::vector<uint64_t>& data, const List& list, size_t skipToStep) {
using ValueType = typename Reader::ValueType;
CHECK_GT(skipToStep, 0);
Reader reader(list);
const uint64_t delta = std::max<uint64_t>(1, data.back() / skipToStep);
ValueType target = delta;
auto it = data.begin();
while (true) {
it = std::lower_bound(it, data.end(), target);
if (it == data.end()) {
EXPECT_FALSE(reader.skipTo(target));
break;
}
EXPECT_TRUE(reader.skipTo(target));
// Test the whole group of equal values.
for (auto it2 = it; it2 != data.end() && *it2 == *it;
++it2, reader.next()) {
ASSERT_TRUE(reader.valid());
EXPECT_EQ(reader.value(), *it2);
EXPECT_EQ(reader.position(), std::distance(data.begin(), it2));
// The reader should stay in place even if we're in the middle of a group.
EXPECT_TRUE(reader.skipTo(*it2));
EXPECT_EQ(reader.value(), *it2);
EXPECT_EQ(reader.position(), std::distance(data.begin(), it2));
maybeTestPreviousValue(data, reader, std::distance(data.begin(), it2));
maybeTestPrevious(data, reader, std::distance(data.begin(), it2));
}
// Reader is now past the group.
if (!reader.valid()) {
break;
}
target += delta;
if (target < reader.value()) {
// Value following the group is already greater than target, or delta is
// so large that target wrapped around.
target = reader.value();
}
}
EXPECT_FALSE(reader.valid());
EXPECT_EQ(reader.position(), reader.size());
EXPECT_FALSE(reader.next());
}
template <class Reader, class List>
void testSkipTo(const std::vector<uint64_t>& data, const List& list) {
for (size_t steps = 10; steps < 100; steps += 10) {
testSkipTo<Reader, List>(data, list, steps);
}
for (size_t steps = 100; steps <= 1000; steps += 100) {
testSkipTo<Reader, List>(data, list, steps);
}
testSkipTo<Reader, List>(data, list, std::numeric_limits<size_t>::max());
{
// Skip to the first element.
Reader reader(list);
EXPECT_TRUE(reader.skipTo(data[0]));
EXPECT_EQ(reader.value(), data[0]);
EXPECT_EQ(reader.position(), 0);
}
// Skip past the last element, when possible. Make sure to probe values far
// from the last element, as the reader implementation may keep an internal
// upper bound larger than that, and we need to make sure we exercise skipping
// both before and after that.
using ValueType = typename Reader::ValueType;
std::vector<ValueType> valuesPastTheEnd;
const auto lastValue = data.back();
const auto kMaxValue = std::numeric_limits<ValueType>::max();
// Keep doubling the distance from the last value until we overflow.
for (ValueType value = lastValue + 1; value > lastValue;
value += value - lastValue) {
valuesPastTheEnd.push_back(value);
}
if (kMaxValue != lastValue) {
valuesPastTheEnd.push_back(kMaxValue);
}
for (auto value : valuesPastTheEnd) {
Reader reader(list);
EXPECT_FALSE(reader.skipTo(value)) << value << " " << lastValue;
EXPECT_FALSE(reader.valid()) << value << " " << lastValue;
EXPECT_EQ(reader.position(), reader.size()) << value << " " << lastValue;
EXPECT_FALSE(reader.next()) << value << " " << lastValue;
}
}
template <class Reader, class List>
void testJump(const std::vector<uint64_t>& data, const List& list) {
std::mt19937 gen;
std::vector<size_t> is(data.size());
for (size_t i = 0; i < data.size(); ++i) {
is[i] = i;
}
std::shuffle(is.begin(), is.end(), gen);
if (Reader::EncoderType::forwardQuantum == 0) {
is.resize(std::min<size_t>(is.size(), 100));
}
Reader reader(list);
for (auto i : is) {
// Also test idempotency.
for (size_t round = 0; round < 2; ++round) {
EXPECT_TRUE(reader.jump(i)) << i << " " << data.size();
EXPECT_EQ(reader.value(), data[i]) << i << " " << data.size();
EXPECT_EQ(reader.position(), i) << i << " " << data.size();
}
maybeTestPreviousValue(data, reader, i);
maybeTestPrevious(data, reader, i);
}
EXPECT_FALSE(reader.jump(data.size()));
EXPECT_FALSE(reader.valid());
EXPECT_EQ(reader.position(), reader.size());
}
template <class Reader, class List>
void testJumpTo(const std::vector<uint64_t>& data, const List& list) {
using ValueType = typename Reader::ValueType;
CHECK(!data.empty());
Reader reader(list);
std::mt19937 gen;
std::uniform_int_distribution<ValueType> targets(0, data.back());
const size_t iters = Reader::EncoderType::skipQuantum == 0 ? 100 : 10000;
for (size_t i = 0; i < iters; ++i) {
uint64_t target;
// Force boundary targets interleaved with random targets.
if (i == 10) {
target = data.back();
} else if (i == 20) {
target = 0;
} else {
target = targets(gen);
}
auto it = std::lower_bound(data.begin(), data.end(), target);
CHECK(it != data.end());
EXPECT_TRUE(reader.jumpTo(target));
// Test the whole group of equal values.
for (auto it2 = it; it2 != data.end() && *it2 == *it;
++it2, reader.next()) {
EXPECT_EQ(reader.value(), *it2);
EXPECT_EQ(reader.position(), std::distance(data.begin(), it2));
}
// Calling jumpTo() on the current value should reposition on the beginning
// of the group.
EXPECT_TRUE(reader.jumpTo(*it));
EXPECT_EQ(reader.position(), std::distance(data.begin(), it));
}
if (data.back() != std::numeric_limits<ValueType>::max()) {
EXPECT_FALSE(reader.jumpTo(data.back() + 1));
EXPECT_FALSE(reader.valid());
EXPECT_EQ(reader.position(), reader.size());
}
}
template <class Reader, class Encoder>
void testEmpty() {
const typename Encoder::ValueType* const data = nullptr;
auto list = Encoder::encode(data, data);
{
Reader reader(list);
EXPECT_FALSE(reader.next());
EXPECT_EQ(reader.size(), 0);
}
{
Reader reader(list);
EXPECT_FALSE(reader.skip(1));
EXPECT_FALSE(reader.skip(10));
EXPECT_FALSE(reader.jump(0));
EXPECT_FALSE(reader.jump(10));
}
{
Reader reader(list);
EXPECT_FALSE(reader.skipTo(1));
EXPECT_FALSE(reader.jumpTo(1));
}
}
// `upperBoundExtension` is required to inject additional 0-blocks
// at the end of the list. This allows us to test lists with a large gap between
// last element and universe upper bound, to exercise bounds-checking when
// skipping past the last element
template <class Reader, class Encoder>
void testAll(
const std::vector<uint64_t>& data, uint64_t upperBoundExtension = 0) {
SCOPED_TRACE(__PRETTY_FUNCTION__);
Encoder encoder(data.size(), data.back() + upperBoundExtension);
for (const auto value : data) {
encoder.add(value);
}
auto list = encoder.finish();
testNext<Reader>(data, list);
testSkip<Reader>(data, list);
testSkipTo<Reader>(data, list);
testJump<Reader>(data, list);
testJumpTo<Reader>(data, list);
list.free();
}
template <class Reader, class List>
void bmNext(const List& list, const std::vector<uint64_t>& data, size_t iters) {
if (data.empty()) {
return;
}
Reader reader(list);
for (size_t i = 0; i < iters; ++i) {
if (FOLLY_LIKELY(reader.next())) {
folly::doNotOptimizeAway(reader.value());
} else {
reader.reset();
}
}
}
template <class Reader, class List>
void bmSkip(
const List& list,
const std::vector<uint64_t>& /* data */,
size_t logAvgSkip,
size_t iters) {
size_t avg = (size_t(1) << logAvgSkip);
size_t base = avg - (avg >> 2);
size_t mask = (avg > 1) ? (avg >> 1) - 1 : 0;
Reader reader(list);
for (size_t i = 0; i < iters; ++i) {
size_t skip = base + (i & mask);
if (FOLLY_LIKELY(reader.skip(skip))) {
folly::doNotOptimizeAway(reader.value());
} else {
reader.reset();
}
}
}
template <class Reader, class List>
void bmSkipTo(
const List& list,
const std::vector<uint64_t>& data,
size_t logAvgSkip,
size_t iters) {
size_t avg = (size_t(1) << logAvgSkip);
size_t base = avg - (avg >> 2);
size_t mask = (avg > 1) ? (avg >> 1) - 1 : 0;
Reader reader(list);
for (size_t i = 0, j = -1; i < iters; ++i) {
size_t skip = base + (i & mask);
j += skip;
if (j >= data.size()) {
reader.reset();
j = -1;
}
reader.skipTo(data[j]);
folly::doNotOptimizeAway(reader.value());
}
}
template <class Reader, class List>
void bmJump(
const List& list,
const std::vector<uint64_t>& data,
const std::vector<size_t>& order,
size_t iters) {
CHECK(!data.empty());
CHECK_EQ(data.size(), order.size());
Reader reader(list);
for (size_t i = 0, j = 0; i < iters; ++i, ++j) {
if (j == order.size()) {
j = 0;
}
reader.jump(order[j]);
folly::doNotOptimizeAway(reader.value());
}
}
template <class Reader, class List>
void bmJumpTo(
const List& list,
const std::vector<uint64_t>& data,
const std::vector<size_t>& order,
size_t iters) {
CHECK(!data.empty());
CHECK_EQ(data.size(), order.size());
Reader reader(list);
for (size_t i = 0, j = 0; i < iters; ++i, ++j) {
if (j == order.size()) {
j = 0;
}
reader.jumpTo(data[order[j]]);
folly::doNotOptimizeAway(reader.value());
}
}
folly::Optional<instructions::Type> instructionsOverride();
template <class F>
auto dispatchInstructions(F&& f)
-> decltype(f(std::declval<instructions::Default>())) {
if (auto type = instructionsOverride()) {
return instructions::dispatch(*type, std::forward<F>(f));
} else {
return instructions::dispatch(std::forward<F>(f));
}
}
} // namespace compression
} // namespace folly