/* * Copyright 2017 Google Inc. * * Use of this source code is governed by a BSD-style license that can be * found in the LICENSE file. */ #include "src/core/SkMaskBlurFilter.h" #include "include/core/SkColorPriv.h" #include "include/private/base/SkMalloc.h" #include "include/private/base/SkTPin.h" #include "include/private/base/SkTemplates.h" #include "include/private/base/SkTo.h" #include "src/base/SkArenaAlloc.h" #include "src/base/SkVx.h" #include "src/core/SkGaussFilter.h" #include <cmath> #include <climits> namespace { static const double kPi = …; class PlanGauss final { … }; } // namespace // NB 135 is the largest sigma that will not cause a buffer full of 255 mask values to overflow // using the Gauss filter. It also limits the size of buffers used hold intermediate values. The // additional + 1 added to window represents adding one more leading element before subtracting the // trailing element. // Explanation of maximums: // sum0 = (window + 1) * 255 // sum1 = (window + 1) * sum0 -> (window + 1) * (window + 1) * 255 // sum2 = (window + 1) * sum1 -> (window + 1) * (window + 1) * (window + 1) * 255 -> window^3 * 255 // // The value (window + 1)^3 * 255 must fit in a uint32_t. So, // (window + 1)^3 * 255 < 2^32. window = 255. // // window = floor(sigma * 3 * sqrt(2 * kPi) / 4) // For window <= 255, the largest value for sigma is 135. SkMaskBlurFilter::SkMaskBlurFilter(double sigmaW, double sigmaH) : … { … } bool SkMaskBlurFilter::hasNoBlur() const { … } // We favor A8 masks, and if we need to work with another format, we'll convert to A8 first. // Each of these converts width (up to 8) mask values to A8. static void bw_to_a8(uint8_t* a8, const uint8_t* from, int width) { … } static void lcd_to_a8(uint8_t* a8, const uint8_t* from, int width) { … } static void argb32_to_a8(uint8_t* a8, const uint8_t* from, int width) { … } ToA8; fp88; // 8-wide fixed point 8.8 static fp88 load(const uint8_t* from, int width, ToA8* toA8) { … } static void store(uint8_t* to, const fp88& v, int width) { … } static constexpr uint16_t _____ = …; static constexpr uint16_t kHalf = …; // In all the blur_x_radius_N and blur_y_radius_N functions the gaussian values are encoded // in 0.16 format, none of the values is greater than one. The incoming mask values are in 8.8 // format. The resulting multiply has a 8.24 format, by the mulhi truncates the lower 16 bits // resulting in a 8.8 format. // // The blur_x_radius_N function below blur along a row of pixels using a kernel with radius N. This // system is setup to minimize the number of multiplies needed. // // Explanation: // Blurring a specific mask value is given by the following equation where D_n is the resulting // mask value and S_n is the source value. The example below is for a filter with a radius of 1 // and a width of 3 (radius == (width-1)/2). The indexes for the source and destination are // aligned. The filter is given by G_n where n is the symmetric filter value. // // D[n] = S[n-1]*G[1] + S[n]*G[0] + S[n+1]*G[1]. // // We can start the source index at an offset relative to the destination separated by the // radius. This results in a non-traditional restating of the above filter. // // D[n] = S[n]*G[1] + S[n+1]*G[0] + S[n+2]*G[1] // // If we look at three specific consecutive destinations the following equations result: // // D[5] = S[5]*G[1] + S[6]*G[0] + S[7]*G[1] // D[7] = S[6]*G[1] + S[7]*G[0] + S[8]*G[1] // D[8] = S[7]*G[1] + S[8]*G[0] + S[9]*G[1]. // // In the above equations, notice that S[7] is used in all three. In particular, two values are // used: S[7]*G[0] and S[7]*G[1]. So, S[7] is only multiplied twice, but used in D[5], D[6] and // D[7]. // // From the point of view of a source value we end up with the following three equations. // // Given S[7]: // D[5] += S[7]*G[1] // D[6] += S[7]*G[0] // D[7] += S[7]*G[1] // // In General: // D[n] += S[n]*G[1] // D[n+1] += S[n]*G[0] // D[n+2] += S[n]*G[1] // // Now these equations can be ganged using SIMD to form: // D[n..n+7] += S[n..n+7]*G[1] // D[n+1..n+8] += S[n..n+7]*G[0] // D[n+2..n+9] += S[n..n+7]*G[1] // The next set of values becomes. // D[n+8..n+15] += S[n+8..n+15]*G[1] // D[n+9..n+16] += S[n+8..n+15]*G[0] // D[n+10..n+17] += S[n+8..n+15]*G[1] // You can see that the D[n+8] and D[n+9] values overlap the two sets, using parts of both // S[n..7] and S[n+8..n+15]. // // Just one more transformation allows the code to maintain all working values in // registers. I introduce the notation {0, S[n..n+7] * G[k]} to mean that the value where 0 is // prepended to the array of values to form {0, S[n] * G[k], ..., S[n+7]*G[k]}. // // D[n..n+7] += S[n..n+7] * G[1] // D[n..n+8] += {0, S[n..n+7] * G[0]} // D[n..n+9] += {0, 0, S[n..n+7] * G[1]} // // Now we can encode D[n..n+7] in a single Sk8h register called d0, and D[n+8..n+15] in a // register d8. In addition, S[0..n+7] becomes s0. // // The translation of the {0, S[n..n+7] * G[k]} is translated in the following way below. // // Sk8h v0 = s0*G[0] // Sk8h v1 = s0*G[1] // /* D[n..n+7] += S[n..n+7] * G[1] */ // d0 += v1; // /* D[n..n+8] += {0, S[n..n+7] * G[0]} */ // d0 += {_____, v0[0], v0[1], v0[2], v0[3], v0[4], v0[5], v0[6]} // d1 += {v0[7], _____, _____, _____, _____, _____, _____, _____} // /* D[n..n+9] += {0, 0, S[n..n+7] * G[1]} */ // d0 += {_____, _____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5]} // d1 += {v1[6], v1[7], _____, _____, _____, _____, _____, _____} // Where we rely on the compiler to generate efficient code for the {____, n, ....} notation. static void blur_x_radius_1( const fp88& s0, const fp88& g0, const fp88& g1, const fp88&, const fp88&, const fp88&, fp88* d0, fp88* d8) { … } static void blur_x_radius_2( const fp88& s0, const fp88& g0, const fp88& g1, const fp88& g2, const fp88&, const fp88&, fp88* d0, fp88* d8) { … } static void blur_x_radius_3( const fp88& s0, const fp88& g0, const fp88& g1, const fp88& g2, const fp88& g3, const fp88&, fp88* d0, fp88* d8) { … } static void blur_x_radius_4( const fp88& s0, const fp88& g0, const fp88& g1, const fp88& g2, const fp88& g3, const fp88& g4, fp88* d0, fp88* d8) { … } BlurX; // BlurX will only be one of the functions blur_x_radius_(1|2|3|4). static void blur_row( BlurX blur, const fp88& g0, const fp88& g1, const fp88& g2, const fp88& g3, const fp88& g4, const uint8_t* src, int srcW, uint8_t* dst, int dstW) { … } // BlurX will only be one of the functions blur_x_radius_(1|2|3|4). static void blur_x_rect(BlurX blur, uint16_t* gauss, const uint8_t* src, size_t srcStride, int srcW, uint8_t* dst, size_t dstStride, int dstW, int dstH) { … } static void direct_blur_x(int radius, uint16_t* gauss, const uint8_t* src, size_t srcStride, int srcW, uint8_t* dst, size_t dstStride, int dstW, int dstH) { … } // The operations of the blur_y_radius_N functions work on a theme similar to the blur_x_radius_N // functions, but end up being simpler because there is no complicated shift of registers. We // start with the non-traditional form of the gaussian filter. In the following r is the value // when added generates the next value in the column. // // D[n+0r] = S[n+0r]*G[1] // + S[n+1r]*G[0] // + S[n+2r]*G[1] // // Expanding out in a way similar to blur_x_radius_N for specific values of n. // // D[n+0r] = S[n-2r]*G[1] + S[n-1r]*G[0] + S[n+0r]*G[1] // D[n+1r] = S[n-1r]*G[1] + S[n+0r]*G[0] + S[n+1r]*G[1] // D[n+2r] = S[n+0r]*G[1] + S[n+1r]*G[0] + S[n+2r]*G[1] // // We can see that S[n+0r] is in all three D[] equations, but is only multiplied twice. Now we // can look at the calculation form the point of view of a source value. // // Given S[n+0r]: // D[n+0r] += S[n+0r]*G[1]; // /* D[n+0r] is done and can be stored now. */ // D[n+1r] += S[n+0r]*G[0]; // D[n+2r] = S[n+0r]*G[1]; // // Remember, by induction, that D[n+0r] == S[n-2r]*G[1] + S[n-1r]*G[0] before adding in // S[n+0r]*G[1]. So, after the addition D[n+0r] has finished calculation and can be stored. Also, // notice that D[n+2r] is receiving its first value from S[n+0r]*G[1] and is not added in. Notice // how values flow in the following two iterations in source. // // D[n+0r] += S[n+0r]*G[1] // D[n+1r] += S[n+0r]*G[0] // D[n+2r] = S[n+0r]*G[1] // /* ------- */ // D[n+1r] += S[n+1r]*G[1] // D[n+2r] += S[n+1r]*G[0] // D[n+3r] = S[n+1r]*G[1] // // Instead of using memory we can introduce temporaries d01 and d12. The update step changes // to the following. // // answer = d01 + S[n+0r]*G[1] // d01 = d12 + S[n+0r]*G[0] // d12 = S[n+0r]*G[1] // return answer // // Finally, this can be ganged into SIMD style. // answer[0..7] = d01[0..7] + S[n+0r..n+0r+7]*G[1] // d01[0..7] = d12[0..7] + S[n+0r..n+0r+7]*G[0] // d12[0..7] = S[n+0r..n+0r+7]*G[1] // return answer[0..7] static fp88 blur_y_radius_1( const fp88& s0, const fp88& g0, const fp88& g1, const fp88&, const fp88&, const fp88&, fp88* d01, fp88* d12, fp88*, fp88*, fp88*, fp88*, fp88*, fp88*) { … } static fp88 blur_y_radius_2( const fp88& s0, const fp88& g0, const fp88& g1, const fp88& g2, const fp88&, const fp88&, fp88* d01, fp88* d12, fp88* d23, fp88* d34, fp88*, fp88*, fp88*, fp88*) { … } static fp88 blur_y_radius_3( const fp88& s0, const fp88& g0, const fp88& g1, const fp88& g2, const fp88& g3, const fp88&, fp88* d01, fp88* d12, fp88* d23, fp88* d34, fp88* d45, fp88* d56, fp88*, fp88*) { … } static fp88 blur_y_radius_4( const fp88& s0, const fp88& g0, const fp88& g1, const fp88& g2, const fp88& g3, const fp88& g4, fp88* d01, fp88* d12, fp88* d23, fp88* d34, fp88* d45, fp88* d56, fp88* d67, fp88* d78) { … } using BlurY = decltype(blur_y_radius_1); // BlurY will be one of blur_y_radius_(1|2|3|4). static void blur_column( ToA8 toA8, BlurY blur, int radius, int width, const fp88& g0, const fp88& g1, const fp88& g2, const fp88& g3, const fp88& g4, const uint8_t* src, size_t srcRB, int srcH, uint8_t* dst, size_t dstRB) { … } // BlurY will be one of blur_y_radius_(1|2|3|4). static void blur_y_rect(ToA8 toA8, const int strideOf8, BlurY blur, int radius, uint16_t *gauss, const uint8_t *src, size_t srcRB, int srcW, int srcH, uint8_t *dst, size_t dstRB) { … } static void direct_blur_y(ToA8 toA8, const int strideOf8, int radius, uint16_t* gauss, const uint8_t* src, size_t srcRB, int srcW, int srcH, uint8_t* dst, size_t dstRB) { … } static SkIPoint small_blur(double sigmaX, double sigmaY, const SkMask& src, SkMaskBuilder* dst) { … } // TODO: assuming sigmaW = sigmaH. Allow different sigmas. Right now the // API forces the sigmas to be the same. SkIPoint SkMaskBlurFilter::blur(const SkMask& src, SkMaskBuilder* dst) const { … }