chromium/third_party/google-closure-library/closure/goog/i18n/uchar.js

/**
 * @license
 * Copyright The Closure Library Authors.
 * SPDX-License-Identifier: Apache-2.0
 */

/**
 * @fileoverview Collection of utility functions for Unicode character.
 */

goog.provide('goog.i18n.uChar');


// Constants for handling Unicode supplementary characters (surrogate pairs).


/**
 * The minimum value for Supplementary code points.
 * @type {number}
 * @private
 */
goog.i18n.uChar.SUPPLEMENTARY_CODE_POINT_MIN_VALUE_ = 0x10000;


/**
 * The highest Unicode code point value (scalar value) according to the Unicode
 * Standard.
 * @type {number}
 * @private
 */
goog.i18n.uChar.CODE_POINT_MAX_VALUE_ = 0x10FFFF;


/**
 * Lead surrogate minimum value.
 * @type {number}
 * @private
 */
goog.i18n.uChar.LEAD_SURROGATE_MIN_VALUE_ = 0xD800;


/**
 * Lead surrogate maximum value.
 * @type {number}
 * @private
 */
goog.i18n.uChar.LEAD_SURROGATE_MAX_VALUE_ = 0xDBFF;


/**
 * Trail surrogate minimum value.
 * @type {number}
 * @private
 */
goog.i18n.uChar.TRAIL_SURROGATE_MIN_VALUE_ = 0xDC00;


/**
 * Trail surrogate maximum value.
 * @type {number}
 * @private
 */
goog.i18n.uChar.TRAIL_SURROGATE_MAX_VALUE_ = 0xDFFF;


/**
 * The number of least significant bits of a supplementary code point that in
 * UTF-16 become the least significant bits of the trail surrogate. The rest of
 * the in-use bits of the supplementary code point become the least significant
 * bits of the lead surrogate.
 * @type {number}
 * @private
 */
goog.i18n.uChar.TRAIL_SURROGATE_BIT_COUNT_ = 10;


/**
 * Gets the U+ notation string of a Unicode character. Ex: 'U+0041' for 'A'.
 * @param {string} ch The given character.
 * @return {string} The U+ notation of the given character.
 */
goog.i18n.uChar.toHexString = function(ch) {
  'use strict';
  const chCode = goog.i18n.uChar.toCharCode(ch);
  const chCodeStr = 'U+' +
      goog.i18n.uChar.padString_(chCode.toString(16).toUpperCase(), 4, '0');

  return chCodeStr;
};


/**
 * Gets a string padded with given character to get given size.
 * @param {string} str The given string to be padded.
 * @param {number} length The target size of the string.
 * @param {string} ch The character to be padded with.
 * @return {string} The padded string.
 * @private
 */
goog.i18n.uChar.padString_ = function(str, length, ch) {
  'use strict';
  while (str.length < length) {
    str = ch + str;
  }
  return str;
};


/**
 * Gets Unicode value of the given character.
 * @param {string} ch The given character, which in the case of a supplementary
 * character is actually a surrogate pair. The remainder of the string is
 * ignored.
 * @return {number} The Unicode value of the character.
 */
goog.i18n.uChar.toCharCode = function(ch) {
  'use strict';
  return goog.i18n.uChar.getCodePointAround(ch, 0);
};


/**
 * Gets a character from the given Unicode value. If the given code point is not
 * a valid Unicode code point, null is returned.
 * @param {number} code The Unicode value of the character.
 * @return {?string} The character corresponding to the given Unicode value.
 */
goog.i18n.uChar.fromCharCode = function(code) {
  'use strict';
  if (code == null ||
      !(code >= 0 && code <= goog.i18n.uChar.CODE_POINT_MAX_VALUE_)) {
    return null;
  }
  if (goog.i18n.uChar.isSupplementaryCodePoint(code)) {
    // First, we split the code point into the trail surrogate part (the
    // TRAIL_SURROGATE_BIT_COUNT_ least significant bits) and the lead surrogate
    // part (the rest of the bits, shifted down; note that for now this includes
    // the supplementary offset, also shifted down, to be subtracted off below).
    const leadBits = code >> goog.i18n.uChar.TRAIL_SURROGATE_BIT_COUNT_;
    const trailBits = code &
        // A bit-mask to get the TRAIL_SURROGATE_BIT_COUNT_ (i.e. 10) least
        // significant bits. 1 << 10 = 0x0400. 0x0400 - 1 = 0x03FF.
        ((1 << goog.i18n.uChar.TRAIL_SURROGATE_BIT_COUNT_) - 1);

    // Now we calculate the code point of each surrogate by adding each offset
    // to the corresponding base code point.
    const leadCodePoint = leadBits +
        (goog.i18n.uChar.LEAD_SURROGATE_MIN_VALUE_ -
         // Subtract off the supplementary offset, which had been shifted down
         // with the rest of leadBits. We do this here instead of before the
         // shift in order to save a separate subtraction step.
         (goog.i18n.uChar.SUPPLEMENTARY_CODE_POINT_MIN_VALUE_ >>
          goog.i18n.uChar.TRAIL_SURROGATE_BIT_COUNT_));
    const trailCodePoint =
        trailBits + goog.i18n.uChar.TRAIL_SURROGATE_MIN_VALUE_;

    // Convert the code points into a 2-character long string.
    return String.fromCharCode(leadCodePoint) +
        String.fromCharCode(trailCodePoint);
  }
  return String.fromCharCode(code);
};


/**
 * Returns the Unicode code point at the specified index.
 *
 * If the char value specified at the given index is in the leading-surrogate
 * range, and the following index is less than the length of `string`, and
 * the char value at the following index is in the trailing-surrogate range,
 * then the supplementary code point corresponding to this surrogate pair is
 * returned.
 *
 * If the char value specified at the given index is in the trailing-surrogate
 * range, and the preceding index is not before the start of `string`, and
 * the char value at the preceding index is in the leading-surrogate range, then
 * the negated supplementary code point corresponding to this surrogate pair is
 * returned.
 *
 * The negation allows the caller to differentiate between the case where the
 * given index is at the leading surrogate and the one where it is at the
 * trailing surrogate, and thus deduce where the next character starts and
 * preceding character ends.
 *
 * Otherwise, the char value at the given index is returned. Thus, a leading
 * surrogate is returned when it is not followed by a trailing surrogate, and a
 * trailing surrogate is returned when it is not preceded by a leading
 * surrogate.
 *
 * @param {string} string The string.
 * @param {number} index The index from which the code point is to be retrieved.
 * @return {number} The code point at the given index. If the given index is
 * that of the start (i.e. lead surrogate) of a surrogate pair, returns the code
 * point encoded by the pair. If the given index is that of the end (i.e. trail
 * surrogate) of a surrogate pair, returns the negated code pointed encoded by
 * the pair.
 */
goog.i18n.uChar.getCodePointAround = function(string, index) {
  'use strict';
  const charCode = string.charCodeAt(index);
  if (goog.i18n.uChar.isLeadSurrogateCodePoint(charCode) &&
      index + 1 < string.length) {
    const trail = string.charCodeAt(index + 1);
    if (goog.i18n.uChar.isTrailSurrogateCodePoint(trail)) {
      // Part of a surrogate pair.
      return /** @type {number} */ (
          goog.i18n.uChar.buildSupplementaryCodePoint(charCode, trail));
    }
  } else if (goog.i18n.uChar.isTrailSurrogateCodePoint(charCode) && index > 0) {
    const lead = string.charCodeAt(index - 1);
    if (goog.i18n.uChar.isLeadSurrogateCodePoint(lead)) {
      // Part of a surrogate pair.
      const codepoint = /** @type {number} */ (
          goog.i18n.uChar.buildSupplementaryCodePoint(lead, charCode));
      return -codepoint;
    }
  }
  return charCode;
};


/**
 * Determines the length of the string needed to represent the specified
 * Unicode code point.
 * @param {number} codePoint
 * @return {number} 2 if codePoint is a supplementary character, 1 otherwise.
 */
goog.i18n.uChar.charCount = function(codePoint) {
  'use strict';
  return goog.i18n.uChar.isSupplementaryCodePoint(codePoint) ? 2 : 1;
};


/**
 * Determines whether the specified Unicode code point is in the supplementary
 * Unicode characters range.
 * @param {number} codePoint
 * @return {boolean} Whether then given code point is a supplementary character.
 */
goog.i18n.uChar.isSupplementaryCodePoint = function(codePoint) {
  'use strict';
  return codePoint >= goog.i18n.uChar.SUPPLEMENTARY_CODE_POINT_MIN_VALUE_ &&
      codePoint <= goog.i18n.uChar.CODE_POINT_MAX_VALUE_;
};


/**
 * Gets whether the given code point is a leading surrogate character.
 * @param {number} codePoint
 * @return {boolean} Whether the given code point is a leading surrogate
 * character.
 */
goog.i18n.uChar.isLeadSurrogateCodePoint = function(codePoint) {
  'use strict';
  return codePoint >= goog.i18n.uChar.LEAD_SURROGATE_MIN_VALUE_ &&
      codePoint <= goog.i18n.uChar.LEAD_SURROGATE_MAX_VALUE_;
};


/**
 * Gets whether the given code point is a trailing surrogate character.
 * @param {number} codePoint
 * @return {boolean} Whether the given code point is a trailing surrogate
 * character.
 */
goog.i18n.uChar.isTrailSurrogateCodePoint = function(codePoint) {
  'use strict';
  return codePoint >= goog.i18n.uChar.TRAIL_SURROGATE_MIN_VALUE_ &&
      codePoint <= goog.i18n.uChar.TRAIL_SURROGATE_MAX_VALUE_;
};


/**
 * Composes a supplementary Unicode code point from the given UTF-16 surrogate
 * pair. If leadSurrogate isn't a leading surrogate code point or trailSurrogate
 * isn't a trailing surrogate code point, null is returned.
 * @param {number} lead The leading surrogate code point.
 * @param {number} trail The trailing surrogate code point.
 * @return {?number} The supplementary Unicode code point obtained by decoding
 * the given UTF-16 surrogate pair.
 */
goog.i18n.uChar.buildSupplementaryCodePoint = function(lead, trail) {
  'use strict';
  if (goog.i18n.uChar.isLeadSurrogateCodePoint(lead) &&
      goog.i18n.uChar.isTrailSurrogateCodePoint(trail)) {
    const shiftedLeadOffset =
        (lead << goog.i18n.uChar.TRAIL_SURROGATE_BIT_COUNT_) -
        (goog.i18n.uChar.LEAD_SURROGATE_MIN_VALUE_
         << goog.i18n.uChar.TRAIL_SURROGATE_BIT_COUNT_);
    const trailOffset = trail - goog.i18n.uChar.TRAIL_SURROGATE_MIN_VALUE_ +
        goog.i18n.uChar.SUPPLEMENTARY_CODE_POINT_MIN_VALUE_;
    return shiftedLeadOffset + trailOffset;
  }
  return null;
};