TDigest.cpp | Explore in Territory

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <folly/stats/TDigest.h>

#include <algorithm>
#include <limits>

#include <glog/logging.h>

#include <folly/stats/detail/DoubleRadixSort.h>

namespace folly {

/*
 * A good biased scaling function has the following properties:
 *   - The value of the function k(0, delta) = 0, and k(1, delta) = delta.
 *     This is a requirement for any t-digest function.
 *   - The limit of the derivative of the function dk/dq at 0 is inf, and at
 *     1 is inf. This provides bias to improve accuracy at the tails.
 *   - For any q <= 0.5, dk/dq(q) = dk/dq(1-q). This ensures that the accuracy
 *     of upper and lower quantiles are equivalent.
 *
 * The scaling function used here is...
 *   k(q, d) = (IF q >= 0.5, d - d * sqrt(2 - 2q) / 2, d * sqrt(2q) / 2)
 *
 *   k(0, d) = 0
 *   k(1, d) = d
 *
 *   dk/dq = (IF q >= 0.5, d / sqrt(2-2q), d / sqrt(2q))
 *   limit q->1 dk/dq = inf
 *   limit q->0 dk/dq = inf
 *
 *   When plotted, the derivative function is symmetric, centered at q=0.5.
 *
 * Note that FMA has been tested here, but benchmarks have not shown it to be a
 * performance improvement.
 */

/*
 * q_to_k is unused but left here as a comment for completeness.
 * double q_to_k(double q, double d) {
 *   if (q >= 0.5) {
 *     return d - d * std::sqrt(0.5 - 0.5 * q);
 *   }
 *   return d * std::sqrt(0.5 * q);
 * }
 */

static double k_to_q(double k, double d) { … }

static double clamp(double v, double lo, double hi) { … }

TDigest::TDigest(
    std::vector<Centroid> centroids,
    double sum,
    double count,
    double max_val,
    double min_val,
    size_t maxSize)
    : … { … }

// Merge unsorted values by first sorting them.  Use radix sort if
// possible.  This implementation puts all additional memory in the
// heap, so that if called from fiber context we do not smash the
// stack.  Otherwise it is very similar to boost::spreadsort.
TDigest TDigest::merge(Range<const double*> unsortedValues) const { … }

void TDigest::internalMerge(
    TDigest& dst,
    Range<const double*> sortedValues,
    std::vector<Centroid>& workingBuffer) const { … }

TDigest TDigest::merge(
    sorted_equivalent_t, Range<const double*> sortedValues) const { … }

void TDigest::merge(
    sorted_equivalent_t,
    Range<const double*> sortedValues,
    MergeWorkingBuffer& workingBuffer) { … }

TDigest TDigest::merge(Range<const TDigest*> digests) { … }

double TDigest::estimateQuantile(double q) const { … }

double TDigest::Centroid::add(double sum, double weight) { … }

} // namespace folly
folly/folly/stats/TDigest.cpp