SkMaskBlurFilter.cpp | Explore in Territory

/*
 * Copyright 2017 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "src/core/SkMaskBlurFilter.h"

#include "include/core/SkColorPriv.h"
#include "include/private/base/SkMalloc.h"
#include "include/private/base/SkTPin.h"
#include "include/private/base/SkTemplates.h"
#include "include/private/base/SkTo.h"
#include "src/base/SkArenaAlloc.h"
#include "src/base/SkVx.h"
#include "src/core/SkGaussFilter.h"

#include <cmath>
#include <climits>

namespace {
static const double kPi = …;

class PlanGauss final { … };

} // namespace

// NB 135 is the largest sigma that will not cause a buffer full of 255 mask values to overflow
// using the Gauss filter. It also limits the size of buffers used hold intermediate values. The
// additional + 1 added to window represents adding one more leading element before subtracting the
// trailing element.
// Explanation of maximums:
//   sum0 = (window + 1) * 255
//   sum1 = (window + 1) * sum0 -> (window + 1) * (window + 1) * 255
//   sum2 = (window + 1) * sum1 -> (window + 1) * (window + 1) * (window + 1) * 255 -> window^3 * 255
//
//   The value (window + 1)^3 * 255 must fit in a uint32_t. So,
//      (window + 1)^3 * 255 < 2^32. window = 255.
//
//   window = floor(sigma * 3 * sqrt(2 * kPi) / 4)
//   For window <= 255, the largest value for sigma is 135.
SkMaskBlurFilter::SkMaskBlurFilter(double sigmaW, double sigmaH)
    : … { … }

bool SkMaskBlurFilter::hasNoBlur() const { … }

// We favor A8 masks, and if we need to work with another format, we'll convert to A8 first.
// Each of these converts width (up to 8) mask values to A8.
static void bw_to_a8(uint8_t* a8, const uint8_t* from, int width) { … }
static void lcd_to_a8(uint8_t* a8, const uint8_t* from, int width) { … }
static void argb32_to_a8(uint8_t* a8, const uint8_t* from, int width) { … }
ToA8;

fp88; // 8-wide fixed point 8.8

static fp88 load(const uint8_t* from, int width, ToA8* toA8) { … }

static void store(uint8_t* to, const fp88& v, int width) { … }

static constexpr uint16_t _____ = …;
static constexpr uint16_t kHalf = …;

// In all the blur_x_radius_N and blur_y_radius_N functions the gaussian values are encoded
// in 0.16 format, none of the values is greater than one. The incoming mask values are in 8.8
// format. The resulting multiply has a 8.24 format, by the mulhi truncates the lower 16 bits
// resulting in a 8.8 format.
//
// The blur_x_radius_N function below blur along a row of pixels using a kernel with radius N. This
// system is setup to minimize the number of multiplies needed.
//
// Explanation:
//    Blurring a specific mask value is given by the following equation where D_n is the resulting
// mask value and S_n is the source value. The example below is for a filter with a radius of 1
// and a width of 3 (radius == (width-1)/2). The indexes for the source and destination are
// aligned. The filter is given by G_n where n is the symmetric filter value.
//
//   D[n] = S[n-1]*G[1] + S[n]*G[0] + S[n+1]*G[1].
//
// We can start the source index at an offset relative to the destination separated by the
// radius. This results in a non-traditional restating of the above filter.
//
//  D[n] = S[n]*G[1] + S[n+1]*G[0] + S[n+2]*G[1]
//
// If we look at three specific consecutive destinations the following equations result:
//
//   D[5] = S[5]*G[1] + S[6]*G[0] + S[7]*G[1]
//   D[7] = S[6]*G[1] + S[7]*G[0] + S[8]*G[1]
//   D[8] = S[7]*G[1] + S[8]*G[0] + S[9]*G[1].
//
// In the above equations, notice that S[7] is used in all three. In particular, two values are
// used: S[7]*G[0] and S[7]*G[1]. So, S[7] is only multiplied twice, but used in D[5], D[6] and
// D[7].
//
// From the point of view of a source value we end up with the following three equations.
//
// Given S[7]:
//   D[5] += S[7]*G[1]
//   D[6] += S[7]*G[0]
//   D[7] += S[7]*G[1]
//
// In General:
//   D[n]   += S[n]*G[1]
//   D[n+1] += S[n]*G[0]
//   D[n+2] += S[n]*G[1]
//
// Now these equations can be ganged using SIMD to form:
//   D[n..n+7]   += S[n..n+7]*G[1]
//   D[n+1..n+8] += S[n..n+7]*G[0]
//   D[n+2..n+9] += S[n..n+7]*G[1]
// The next set of values becomes.
//   D[n+8..n+15]  += S[n+8..n+15]*G[1]
//   D[n+9..n+16]  += S[n+8..n+15]*G[0]
//   D[n+10..n+17] += S[n+8..n+15]*G[1]
// You can see that the D[n+8] and D[n+9] values overlap the two sets, using parts of both
// S[n..7] and S[n+8..n+15].
//
// Just one more transformation allows the code to maintain all working values in
// registers. I introduce the notation {0, S[n..n+7] * G[k]} to mean that the value where 0 is
// prepended to the array of values to form {0, S[n] * G[k], ..., S[n+7]*G[k]}.
//
//   D[n..n+7]  += S[n..n+7] * G[1]
//   D[n..n+8]  += {0, S[n..n+7] * G[0]}
//   D[n..n+9]  += {0, 0, S[n..n+7] * G[1]}
//
// Now we can encode D[n..n+7] in a single Sk8h register called d0, and D[n+8..n+15] in a
// register d8. In addition, S[0..n+7] becomes s0.
//
// The translation of the {0, S[n..n+7] * G[k]} is translated in the following way below.
//
// Sk8h v0 = s0*G[0]
// Sk8h v1 = s0*G[1]
// /* D[n..n+7]  += S[n..n+7] * G[1] */
// d0 += v1;
// /* D[n..n+8]  += {0, S[n..n+7] * G[0]} */
// d0 += {_____, v0[0], v0[1], v0[2], v0[3], v0[4], v0[5], v0[6]}
// d1 += {v0[7], _____, _____, _____, _____, _____, _____, _____}
// /* D[n..n+9]  += {0, 0, S[n..n+7] * G[1]} */
// d0 += {_____, _____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5]}
// d1 += {v1[6], v1[7], _____, _____, _____, _____, _____, _____}
// Where we rely on the compiler to generate efficient code for the {____, n, ....} notation.

static void blur_x_radius_1(
        const fp88& s0,
        const fp88& g0, const fp88& g1, const fp88&, const fp88&, const fp88&,
        fp88* d0, fp88* d8) { … }

static void blur_x_radius_2(
        const fp88& s0,
        const fp88& g0, const fp88& g1, const fp88& g2, const fp88&, const fp88&,
        fp88* d0, fp88* d8) { … }

static void blur_x_radius_3(
        const fp88& s0,
        const fp88& g0, const fp88& g1, const fp88& g2, const fp88& g3, const fp88&,
        fp88* d0, fp88* d8) { … }

static void blur_x_radius_4(
        const fp88& s0,
        const fp88& g0, const fp88& g1, const fp88& g2, const fp88& g3, const fp88& g4,
        fp88* d0, fp88* d8) { … }

BlurX;

// BlurX will only be one of the functions blur_x_radius_(1|2|3|4).
static void blur_row(
        BlurX blur,
        const fp88& g0, const fp88& g1, const fp88& g2, const fp88& g3, const fp88& g4,
        const uint8_t* src, int srcW,
              uint8_t* dst, int dstW) { … }

// BlurX will only be one of the functions blur_x_radius_(1|2|3|4).
static void blur_x_rect(BlurX blur,
                        uint16_t* gauss,
                        const uint8_t* src, size_t srcStride, int srcW,
                        uint8_t* dst, size_t dstStride, int dstW, int dstH) { … }

static void direct_blur_x(int radius, uint16_t* gauss,
                          const uint8_t* src, size_t srcStride, int srcW,
                          uint8_t* dst, size_t dstStride, int dstW, int dstH) { … }

// The operations of the blur_y_radius_N functions work on a theme similar to the blur_x_radius_N
// functions, but end up being simpler because there is no complicated shift of registers. We
// start with the non-traditional form of the gaussian filter. In the following r is the value
// when added generates the next value in the column.
//
//   D[n+0r] = S[n+0r]*G[1]
//           + S[n+1r]*G[0]
//           + S[n+2r]*G[1]
//
// Expanding out in a way similar to blur_x_radius_N for specific values of n.
//
//   D[n+0r] = S[n-2r]*G[1] + S[n-1r]*G[0] + S[n+0r]*G[1]
//   D[n+1r] = S[n-1r]*G[1] + S[n+0r]*G[0] + S[n+1r]*G[1]
//   D[n+2r] = S[n+0r]*G[1] + S[n+1r]*G[0] + S[n+2r]*G[1]
//
// We can see that S[n+0r] is in all three D[] equations, but is only multiplied twice. Now we
// can look at the calculation form the point of view of a source value.
//
//   Given S[n+0r]:
//   D[n+0r] += S[n+0r]*G[1];
//   /* D[n+0r] is done and can be stored now. */
//   D[n+1r] += S[n+0r]*G[0];
//   D[n+2r]  = S[n+0r]*G[1];
//
// Remember, by induction, that D[n+0r] == S[n-2r]*G[1] + S[n-1r]*G[0] before adding in
// S[n+0r]*G[1]. So, after the addition D[n+0r] has finished calculation and can be stored. Also,
// notice that D[n+2r] is receiving its first value from S[n+0r]*G[1] and is not added in. Notice
// how values flow in the following two iterations in source.
//
//   D[n+0r] += S[n+0r]*G[1]
//   D[n+1r] += S[n+0r]*G[0]
//   D[n+2r]  = S[n+0r]*G[1]
//   /* ------- */
//   D[n+1r] += S[n+1r]*G[1]
//   D[n+2r] += S[n+1r]*G[0]
//   D[n+3r]  = S[n+1r]*G[1]
//
// Instead of using memory we can introduce temporaries d01 and d12. The update step changes
// to the following.
//
//   answer = d01 + S[n+0r]*G[1]
//   d01    = d12 + S[n+0r]*G[0]
//   d12    =       S[n+0r]*G[1]
//   return answer
//
// Finally, this can be ganged into SIMD style.
//   answer[0..7] = d01[0..7] + S[n+0r..n+0r+7]*G[1]
//   d01[0..7]    = d12[0..7] + S[n+0r..n+0r+7]*G[0]
//   d12[0..7]    =             S[n+0r..n+0r+7]*G[1]
//   return answer[0..7]
static fp88 blur_y_radius_1(
        const fp88& s0,
        const fp88& g0, const fp88& g1, const fp88&, const fp88&, const fp88&,
        fp88* d01, fp88* d12, fp88*, fp88*, fp88*, fp88*, fp88*, fp88*) { … }

static fp88 blur_y_radius_2(
        const fp88& s0,
        const fp88& g0, const fp88& g1, const fp88& g2, const fp88&, const fp88&,
        fp88* d01, fp88* d12, fp88* d23, fp88* d34, fp88*, fp88*, fp88*, fp88*) { … }

static fp88 blur_y_radius_3(
        const fp88& s0,
        const fp88& g0, const fp88& g1, const fp88& g2, const fp88& g3, const fp88&,
        fp88* d01, fp88* d12, fp88* d23, fp88* d34, fp88* d45, fp88* d56, fp88*, fp88*) { … }

static fp88 blur_y_radius_4(
    const fp88& s0,
    const fp88& g0, const fp88& g1, const fp88& g2, const fp88& g3, const fp88& g4,
    fp88* d01, fp88* d12, fp88* d23, fp88* d34, fp88* d45, fp88* d56, fp88* d67, fp88* d78) { … }

using BlurY = decltype(blur_y_radius_1);

// BlurY will be one of blur_y_radius_(1|2|3|4).
static void blur_column(
        ToA8 toA8,
        BlurY blur, int radius, int width,
        const fp88& g0, const fp88& g1, const fp88& g2, const fp88& g3, const fp88& g4,
        const uint8_t* src, size_t srcRB, int srcH,
        uint8_t* dst, size_t dstRB) { … }

// BlurY will be one of blur_y_radius_(1|2|3|4).
static void blur_y_rect(ToA8 toA8, const int strideOf8,
                        BlurY blur, int radius, uint16_t *gauss,
                        const uint8_t *src, size_t srcRB, int srcW, int srcH,
                        uint8_t *dst, size_t dstRB) { … }

static void direct_blur_y(ToA8 toA8, const int strideOf8,
                          int radius, uint16_t* gauss,
                          const uint8_t* src, size_t srcRB, int srcW, int srcH,
                          uint8_t* dst, size_t dstRB) { … }

static SkIPoint small_blur(double sigmaX, double sigmaY, const SkMask& src, SkMaskBuilder* dst) { … }

// TODO: assuming sigmaW = sigmaH. Allow different sigmas. Right now the
// API forces the sigmas to be the same.
SkIPoint SkMaskBlurFilter::blur(const SkMask& src, SkMaskBuilder* dst) const { … }
chromium/third_party/skia/src/core/SkMaskBlurFilter.cpp