//--------------------------------------------------------------------------------- // // Little Color Management System // Copyright (c) 1998-2023 Marti Maria Saguer // // Permission is hereby granted, free of charge, to any person obtaining // a copy of this software and associated documentation files (the "Software"), // to deal in the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the Software // is furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO // THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. // //--------------------------------------------------------------------------------- // #include "lcms2_internal.h" //---------------------------------------------------------------------------------- // Optimization for 8 bits, Shaper-CLUT (3 inputs only) Prelin8Data; // Generic optimization for 16 bits Shaper-CLUT-Shaper (any inputs) Prelin16Data; // Optimization for matrix-shaper in 8 bits. Numbers are operated in n.14 signed, tables are stored in 1.14 fixed cmsS1Fixed14Number; // Note that this may hold more than 16 bits! #define DOUBLE_TO_1FIXED14(x) … MatShaper8Data; // Curves, optimization is shared between 8 and 16 bits Curves16Data; // A simple adapter to prevent _cmsPipelineEval16Fn vs. 
_cmsInterpFn16 // confusion, which trips up UBSAN. static void Lerp16Adapter(CMSREGISTER const cmsUInt16Number in[], CMSREGISTER cmsUInt16Number out[], const void* data) { … } // Simple optimizations ---------------------------------------------------------------------------------------------------------- // Clamp a fixed point integer to signed 28 bits to avoid overflow in // calculations. Clamp is intended for use with colorants, requiring one bit // for a colorant and another two bits to avoid overflow when combining the // colors. cmsINLINE cmsS1Fixed14Number _FixedClamp(cmsS1Fixed14Number n) { … } // Perform one row of matrix multiply with translation for MatShaperEval16(). cmsINLINE cmsInt64Number _MatShaperEvaluateRow(cmsS1Fixed14Number* mat, cmsS1Fixed14Number off, cmsS1Fixed14Number r, cmsS1Fixed14Number g, cmsS1Fixed14Number b) { … } // Remove an element in linked chain static void _RemoveElement(cmsStage** head) { … } // Remove all identities in chain. Note that pt actually is a double pointer to the element that holds the pointer. static cmsBool _Remove1Op(cmsPipeline* Lut, cmsStageSignature UnaryOp) { … } // Same, but only if two adjacent elements are found static cmsBool _Remove2Op(cmsPipeline* Lut, cmsStageSignature Op1, cmsStageSignature Op2) { … } static cmsBool CloseEnoughFloat(cmsFloat64Number a, cmsFloat64Number b) { … } static cmsBool isFloatMatrixIdentity(const cmsMAT3* a) { … } // If two adjacent matrices are found, multiply them. static cmsBool _MultiplyMatrix(cmsPipeline* Lut) { … } // Preoptimize just gets rid of no-ops coming paired. Conversion from v2 to v4 followed // by a v4 to v2 and vice-versa. The elements are then discarded. 
static cmsBool PreOptimize(cmsPipeline* Lut) { … } static void Eval16nop1D(CMSREGISTER const cmsUInt16Number Input[], CMSREGISTER cmsUInt16Number Output[], CMSREGISTER const struct _cms_interp_struc* p) { … } static void PrelinEval16(CMSREGISTER const cmsUInt16Number Input[], CMSREGISTER cmsUInt16Number Output[], CMSREGISTER const void* D) { … } static void PrelinOpt16free(cmsContext ContextID, void* ptr) { … } static void* Prelin16dup(cmsContext ContextID, const void* ptr) { … } static Prelin16Data* PrelinOpt16alloc(cmsContext ContextID, const cmsInterpParams* ColorMap, cmsUInt32Number nInputs, cmsToneCurve** In, cmsUInt32Number nOutputs, cmsToneCurve** Out ) { … } // Resampling --------------------------------------------------------------------------------- #define PRELINEARIZATION_POINTS … // Sampler implemented by another LUT. This is a clean way to precalculate the devicelink 3D CLUT for // almost any transform. We use floating point precision and then convert from floating point to 16 bits. static cmsInt32Number XFormSampler16(CMSREGISTER const cmsUInt16Number In[], CMSREGISTER cmsUInt16Number Out[], CMSREGISTER void* Cargo) { … } // Check whether all the curves of a given MPE are linear static cmsBool AllCurvesAreLinear(cmsStage* mpe) { … } // This function replaces a specific node placed in "At" by the "Value" numbers. Its purpose // is to fix the scum dot on broken profiles/transforms. Works on 1, 3 and 4 channels static cmsBool PatchLUT(cmsStage* CLUT, cmsUInt16Number At[], cmsUInt16Number Value[], cmsUInt32Number nChannelsOut, cmsUInt32Number nChannelsIn) { … } // Auxiliary, to see if two values are equal or very different static cmsBool WhitesAreEqual(cmsUInt32Number n, cmsUInt16Number White1[], cmsUInt16Number White2[] ) { … } // Locate the node for the white point and fix it to pure white in order to avoid the scum dot. 
static cmsBool FixWhiteMisalignment(cmsPipeline* Lut, cmsColorSpaceSignature EntryColorSpace, cmsColorSpaceSignature ExitColorSpace) { … } // ----------------------------------------------------------------------------------------------------------------------------------------------- // This function creates a simple LUT from complex ones. The generated LUT has an optional set of // prelinearization curves, a CLUT of nGridPoints and optional postlinearization tables. // These curves have to exist in the original LUT in order to be used in the simplified output. // Caller may also use the flags to allow this feature. // LUTs with all curves will be simplified to a single curve. Parametric curves are lost. // This function should be used on 16-bits LUTs only, as floating point loses precision when simplified // ----------------------------------------------------------------------------------------------------------------------------------------------- static cmsBool OptimizeByResampling(cmsPipeline** Lut, cmsUInt32Number Intent, cmsUInt32Number* InputFormat, cmsUInt32Number* OutputFormat, cmsUInt32Number* dwFlags) { … } // ----------------------------------------------------------------------------------------------------------------------------------------------- // Fixes the gamma balancing of transform. This is described in my paper "Prelinearization Stages on // Color-Management Application-Specific Integrated Circuits (ASICs)" presented at NIP24. It only works // for RGB transforms. See the paper for more details // ----------------------------------------------------------------------------------------------------------------------------------------------- // Normalize endpoints by slope limiting max and min. This assures endpoints as well. // Descending curves are handled as well. static void SlopeLimiting(cmsToneCurve* g) { … } // Precomputes tables for 8-bit on input devicelink. 
static Prelin8Data* PrelinOpt8alloc(cmsContext ContextID, const cmsInterpParams* p, cmsToneCurve* G[3]) { … } static void Prelin8free(cmsContext ContextID, void* ptr) { … } static void* Prelin8dup(cmsContext ContextID, const void* ptr) { … } // An optimized interpolation for 8-bit input. #define DENS … static CMS_NO_SANITIZE void PrelinEval8(CMSREGISTER const cmsUInt16Number Input[], CMSREGISTER cmsUInt16Number Output[], CMSREGISTER const void* D) { … } #undef DENS // Curves that contain wide empty areas are not optimizable static cmsBool IsDegenerated(const cmsToneCurve* g) { … } // -------------------------------------------------------------------------------------------------------------- // We need throughput over here static cmsBool OptimizeByComputingLinearization(cmsPipeline** Lut, cmsUInt32Number Intent, cmsUInt32Number* InputFormat, cmsUInt32Number* OutputFormat, cmsUInt32Number* dwFlags) { … } // Curves optimizer ------------------------------------------------------------------------------------------------------------------ static void CurvesFree(cmsContext ContextID, void* ptr) { … } static void* CurvesDup(cmsContext ContextID, const void* ptr) { … } // Precomputes tables for 8-bit on input devicelink. static Curves16Data* CurvesAlloc(cmsContext ContextID, cmsUInt32Number nCurves, cmsUInt32Number nElements, cmsToneCurve** G) { … } static void FastEvaluateCurves8(CMSREGISTER const cmsUInt16Number In[], CMSREGISTER cmsUInt16Number Out[], CMSREGISTER const void* D) { … } static void FastEvaluateCurves16(CMSREGISTER const cmsUInt16Number In[], CMSREGISTER cmsUInt16Number Out[], CMSREGISTER const void* D) { … } static void FastIdentity16(CMSREGISTER const cmsUInt16Number In[], CMSREGISTER cmsUInt16Number Out[], CMSREGISTER const void* D) { … } // If the target LUT holds only curves, the optimization procedure is to join all those // curves together. That only works on curves and does not work on matrices. 
static cmsBool OptimizeByJoiningCurves(cmsPipeline** Lut, cmsUInt32Number Intent, cmsUInt32Number* InputFormat, cmsUInt32Number* OutputFormat, cmsUInt32Number* dwFlags) { … } // ------------------------------------------------------------------------------------------------------------------------------------- // LUT is Shaper - Matrix - Matrix - Shaper, which is very frequent when combining two matrix-shaper profiles static void FreeMatShaper(cmsContext ContextID, void* Data) { … } static void* DupMatShaper(cmsContext ContextID, const void* Data) { … } // A fast matrix-shaper evaluator for 8 bits. This is a bit tricky since I'm using 1.14 signed fixed point // to accomplish some performance. Actually it takes 256x3 16 bits tables and 16385 x 3 tables of 8 bits, // in total about 50K, and the performance boost is huge! static CMS_NO_SANITIZE void MatShaperEval16(CMSREGISTER const cmsUInt16Number In[], CMSREGISTER cmsUInt16Number Out[], CMSREGISTER const void* D) { … } // This table converts from 8 bits to 1.14 after applying the curve static void FillFirstShaper(cmsS1Fixed14Number* Table, cmsToneCurve* Curve) { … } // This table converts from 1.14 (being 0x4000 the last entry) to 8 bits after applying the curve static void FillSecondShaper(cmsUInt16Number* Table, cmsToneCurve* Curve, cmsBool Is8BitsOutput) { … } // Compute the matrix-shaper structure static cmsBool SetMatShaper(cmsPipeline* Dest, cmsToneCurve* Curve1[3], cmsMAT3* Mat, cmsVEC3* Off, cmsToneCurve* Curve2[3], cmsUInt32Number* OutputFormat) { … } // 8 bits on input allows matrix-shaper boot up to 25 Mpixels per second on RGB. That's fast! 
static cmsBool OptimizeMatrixShaper(cmsPipeline** Lut, cmsUInt32Number Intent, cmsUInt32Number* InputFormat, cmsUInt32Number* OutputFormat, cmsUInt32Number* dwFlags) { … } // ------------------------------------------------------------------------------------------------------------------------------------- // Optimization plug-ins // List of optimizations _cmsOptimizationCollection; // The built-in list. We currently implement 4 types of optimizations: joining of curves, matrix-shaper, linearization and resampling static _cmsOptimizationCollection DefaultOptimization[] = …; // The linked list head _cmsOptimizationPluginChunkType _cmsOptimizationPluginChunk = …; // Duplicates the zone of memory used by the plug-in in the new context static void DupPluginOptimizationList(struct _cmsContext_struct* ctx, const struct _cmsContext_struct* src) { … } void _cmsAllocOptimizationPluginChunk(struct _cmsContext_struct* ctx, const struct _cmsContext_struct* src) { … } // Register new ways to optimize cmsBool _cmsRegisterOptimizationPlugin(cmsContext ContextID, cmsPluginBase* Data) { … } // The entry point for LUT optimization cmsBool CMSEXPORT _cmsOptimizePipeline(cmsContext ContextID, cmsPipeline** PtrLut, cmsUInt32Number Intent, cmsUInt32Number* InputFormat, cmsUInt32Number* OutputFormat, cmsUInt32Number* dwFlags) { … }