chromium/chrome/browser/resources/chromeos/accessibility/common/word_utils.ts

// Copyright 2023 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

import {ParagraphUtils} from './paragraph_utils.js';

// Utilities for processing words within strings and nodes.

export class WordUtils {
  /**
   * Finds the start boundary of the first word that begins at or after
   * |indexAfter|.
   *
   * If |nodeGroupItem| has inline text and its node has children, the
   * boundary is taken from the wordStarts of the inline text node that
   * ParagraphUtils reports for the index. Otherwise the boundary is
   * approximated with WordUtils.WORD_START_REGEXP.
   * @param text The string to search through. Only consulted by the regular
   *      expression fallback.
   * @param indexAfter The index into text at which to start searching.
   * @param nodeGroupItem The node whose name we are searching through.
   * @param ignoreStartChar When set to true, the search will only
   *      consider the index within the input node and ignore
   *      nodeGroupItem.startChar offsets. This is useful when we only search
   *      within the input nodeGroupItem, instead of the parent nodeGroup.
   * @return The index of the next word's start. When no word start is found
   *      this is the offset-adjusted length of the inline text node's name
   *      (inline text path) or |indexAfter| (regular expression path).
   */
  static getNextWordStart(
      text: string|undefined, indexAfter: number,
      nodeGroupItem: ParagraphUtils.NodeGroupItem,
      ignoreStartChar = false): number {
    if (nodeGroupItem.hasInlineText && nodeGroupItem.node.children.length > 0) {
      const startChar = ignoreStartChar ? 0 : nodeGroupItem.startChar;
      // TODO(b/314203187): Not null asserted, check these to make sure this is
      // correct.
      const node = ParagraphUtils.findInlineTextNodeByCharacterIndex(
          nodeGroupItem.node, indexAfter - startChar)!;
      // Combined offset added to the node-relative word boundaries before
      // they are compared with |indexAfter|.
      const offset =
          startChar + ParagraphUtils.getStartCharIndexInParent(node);
      // TODO(b/314203187): Not nulls asserted, check these to make sure they
      // are correct.
      for (const wordStart of node.wordStarts!) {
        if (wordStart + offset >= indexAfter) {
          return wordStart + offset;
        }
      }
      // Default: We are just off the edge of this node.
      return node.name!.length + offset;
    }

    // Try to parse using a regex, which is imperfect.
    // Fall back to the given index if we can't find a match.
    return WordUtils.nextWordHelper(
        text, indexAfter, WordUtils.WORD_START_REGEXP, indexAfter);
  }

  /**
   * Finds the end boundary of the first word that ends after |indexAfter|.
   *
   * If |nodeGroupItem| has inline text and its node has children, the
   * boundary is taken from the wordEnds of the inline text node that
   * ParagraphUtils reports for the index. Otherwise the boundary is
   * approximated with WordUtils.WORD_END_REGEXP.
   * @param text The string to search through. Undefined is treated like an
   *      empty string, so the result is 0 instead of a TypeError.
   * @param indexAfter The index into text at which to start searching.
   * @param nodeGroupItem The node whose name we are searching through.
   * @param ignoreStartChar When set to true, the search will only
   *      consider the index within the input node and ignore
   *      nodeGroupItem.startChar offsets. This is useful when we only search
   *      within the input nodeGroupItem, instead of the parent nodeGroup.
   * @return The index of the next word's end. On the inline text path the
   *      result is clamped to the length of |text|; when no word end is
   *      found on either path the length of |text| is returned.
   */
  static getNextWordEnd(
      text: string|undefined, indexAfter: number,
      nodeGroupItem: ParagraphUtils.NodeGroupItem,
      ignoreStartChar = false): number {
    // The signature allows undefined text; measure it as empty text rather
    // than dereferencing it, which matches the results produced for ''.
    const textLength = text === undefined ? 0 : text.length;

    if (nodeGroupItem.hasInlineText && nodeGroupItem.node.children.length > 0) {
      const startChar = ignoreStartChar ? 0 : nodeGroupItem.startChar;
      // TODO(b/314203187): Not null asserted, check these to make sure this is
      // correct.
      const node = ParagraphUtils.findInlineTextNodeByCharacterIndex(
          nodeGroupItem.node, indexAfter - startChar + 1)!;
      // Combined offset added to the node-relative word boundaries before
      // they are compared with |indexAfter|.
      const offset =
          startChar + ParagraphUtils.getStartCharIndexInParent(node);
      // TODO(b/314203187): Not nulls asserted, check these to make sure they
      // are correct.
      for (const wordEnd of node.wordEnds!) {
        const result = wordEnd + offset;
        if (result - 1 >= indexAfter) {
          // Never report an end beyond the text being searched.
          return textLength > result ? result : textLength;
        }
      }
      // Default.
      return textLength;
    }

    // Try to parse using a regex, which is imperfect.
    // The regex matches the last character of the word plus the whitespace
    // after it, hence the + 1. Fall back to the full length of the text if we
    // can't find a match.
    return WordUtils.nextWordHelper(
               text, indexAfter, WordUtils.WORD_END_REGEXP, textLength - 1) +
        1;
  }

  /**
   * Searches through text to find the first index of a regular expression
   * at or after a given starting index. Returns a default value if no match
   * is found.
   *
   * The search does not depend on, or modify, |re.lastIndex|, so passing a
   * global or sticky regular expression repeatedly yields the same result.
   * @param text The string to search through
   * @param indexAfter The index at which to start searching
   * @param re A regular expression to search for
   * @param defaultValue The value to return if |text| is undefined or no
   *      match is found.
   * @return The index within |text| of the first match, or |defaultValue|
   *      if there is none.
   */
  static nextWordHelper(
      text: string|undefined, indexAfter: number, re: RegExp,
      defaultValue: number): number {
    if (text === undefined) {
      return defaultValue;
    }
    // search() reports the match position relative to the sliced string and
    // leaves |re.lastIndex| as it found it, unlike exec().
    const matchIndex = text.slice(indexAfter).search(re);
    return matchIndex === -1 ? defaultValue : indexAfter + matchIndex;
  }
}

export namespace WordUtils {
  /**
   * Matches the first character of a word: a non-whitespace character that
   * directly follows a word boundary. \S is used instead of \w because \w
   * fails to match many unicode characters.
   */
  export const WORD_START_REGEXP = /\b\S/;

  /**
   * Matches the last character of a word together with the whitespace
   * character that follows it. \S is used instead of \w because \w fails to
   * match many unicode characters.
   */
  export const WORD_END_REGEXP = /\S\s/;
}