/**
* @license
* Copyright The Closure Library Authors.
* SPDX-License-Identifier: Apache-2.0
*/
/**
* @fileoverview Methods for annotating occurrences of query terms in text or
* in a DOM tree. Adapted from Gmail code.
*/
goog.provide('goog.dom.annotate');
goog.provide('goog.dom.annotate.AnnotateFn');
goog.require('goog.array');
goog.require('goog.asserts');
goog.require('goog.dom');
goog.require('goog.dom.NodeType');
goog.require('goog.dom.TagName');
goog.require('goog.dom.safe');
goog.require('goog.html.SafeHtml');
goog.require('goog.object');
/**
* A function that takes:
* (1) the number of the term that is "hit",
* (2) the HTML (search term) to be annotated,
* and returns the annotated term as an HTML.
* @typedef {function(number, !goog.html.SafeHtml): !goog.html.SafeHtml}
*/
goog.dom.annotate.AnnotateFn;
/**
* Calls `annotateFn` for each occurrence of a search term in text nodes
* under `node`. Returns the number of hits.
*
* @param {Node} node A DOM node.
* @param {Array<!Array<string|boolean>>} terms
* An array of [searchTerm, matchWholeWordOnly] tuples.
* The matchWholeWordOnly value is a per-term attribute because some terms
* may be CJK, while others are not. (For correctness, matchWholeWordOnly
* should always be false for CJK terms.).
* @param {goog.dom.annotate.AnnotateFn} annotateFn
* @param {*=} opt_ignoreCase Whether to ignore the case of the query
* terms when looking for matches.
* @param {Array<string>=} opt_classesToSkip Nodes with one of these CSS class
* names (and its descendants) will be skipped.
* @param {number=} opt_maxMs Number of milliseconds after which this function,
* if still annotating, should stop and return.
*
* @return {boolean} Whether any terms were annotated.
*/
goog.dom.annotate.annotateTerms = function(
node, terms, annotateFn, opt_ignoreCase, opt_classesToSkip, opt_maxMs) {
'use strict';
if (opt_ignoreCase) {
terms = goog.dom.annotate.lowercaseTerms_(terms);
}
var stopTime = +opt_maxMs > 0 ? Date.now() + opt_maxMs : 0;
return goog.dom.annotate.annotateTermsInNode_(
node, terms, annotateFn, opt_ignoreCase, opt_classesToSkip || [],
stopTime, 0);
};
/**
* The maximum recursion depth allowed. Any DOM nodes deeper than this are
* ignored.
* @type {number}
* @private
*/
goog.dom.annotate.MAX_RECURSION_ = 200;
/**
* The node types whose descendants should not be affected by annotation.
* @private {!Object<string, boolean>}
*/
goog.dom.annotate.NODES_TO_SKIP_ = goog.object.createSet(
goog.dom.TagName.SCRIPT, goog.dom.TagName.STYLE, goog.dom.TagName.TEXTAREA);
/**
* Recursive helper function.
*
* @param {Node} node A DOM node.
* @param {Array<!Array<string|boolean>>} terms
* An array of [searchTerm, matchWholeWordOnly] tuples.
* The matchWholeWordOnly value is a per-term attribute because some terms
* may be CJK, while others are not. (For correctness, matchWholeWordOnly
* should always be false for CJK terms.).
* @param {goog.dom.annotate.AnnotateFn} annotateFn
* @param {*} ignoreCase Whether to ignore the case of the query terms
* when looking for matches.
* @param {Array<string>} classesToSkip Nodes with one of these CSS class
* names will be skipped (as will their descendants).
* @param {number} stopTime Deadline for annotation operation (ignored if 0).
* @param {number} recursionLevel How deep this recursive call is; pass the
* value 0 in the initial call.
* @return {boolean} Whether any terms were annotated.
* @private
*/
goog.dom.annotate.annotateTermsInNode_ = function(
node, terms, annotateFn, ignoreCase, classesToSkip, stopTime,
recursionLevel) {
'use strict';
if ((stopTime > 0 && Date.now() >= stopTime) ||
recursionLevel > goog.dom.annotate.MAX_RECURSION_) {
return false;
}
var annotated = false;
if (node.nodeType == goog.dom.NodeType.TEXT) {
var html = goog.dom.annotate.helpAnnotateText_(
node.nodeValue, terms, annotateFn, ignoreCase);
if (html != null) {
// Replace the text with the annotated html. First we put the html into
// a temporary node, to get its DOM structure. To avoid adding a wrapper
// element as a side effect, we'll only actually use the temporary node's
// children.
var tempNode =
goog.dom.getDomHelper(node).createElement(goog.dom.TagName.SPAN);
goog.dom.safe.setInnerHtml(tempNode, html);
var parentNode = node.parentNode;
var nodeToInsert;
while ((nodeToInsert = tempNode.firstChild) != null) {
// Each parentNode.insertBefore call removes the inserted node from
// tempNode's list of children.
parentNode.insertBefore(/** @type {!Node} */ (nodeToInsert), node);
}
parentNode.removeChild(node);
annotated = true;
}
} else if (
node.hasChildNodes() &&
!goog.dom.annotate
.NODES_TO_SKIP_[/** @type {!Element} */ (node).tagName]) {
var classes = /** @type {!Element} */ (node).className.split(/\s+/);
var skip = goog.array.some(classes, function(className) {
'use strict';
return goog.array.contains(classesToSkip, className);
});
if (!skip) {
++recursionLevel;
var curNode = node.firstChild;
while (curNode) {
var nextNode = curNode.nextSibling;
var curNodeAnnotated = goog.dom.annotate.annotateTermsInNode_(
curNode, terms, annotateFn, ignoreCase, classesToSkip, stopTime,
recursionLevel);
annotated = annotated || curNodeAnnotated;
curNode = nextNode;
}
}
}
return annotated;
};
/**
* Regular expression that matches non-word characters.
*
* Performance note: Testing a one-character string using this regex is as fast
* as the equivalent string test ("a-zA-Z0-9_".indexOf(c) < 0), give or take a
* few percent. (The regex is about 5% faster in IE 6 and about 4% slower in
* Firefox 1.5.) If performance becomes critical, it may be better to convert
* the character to a numerical char code and check whether it falls in the
* word character ranges. A quick test suggests that could be 33% faster.
*
* @type {RegExp}
* @private
*/
goog.dom.annotate.NONWORD_RE_ = /\W/;
/**
* Annotates occurrences of query terms in plain text. This process consists of
* identifying all occurrences of all query terms, calling a provided function
* to get the appropriate replacement HTML for each occurrence, and
* HTML-escaping all the text.
*
* @param {string} text The plain text to be searched.
* @param {Array<Array<?>>} terms An array of
* [{string} searchTerm, {boolean} matchWholeWordOnly] tuples.
* The matchWholeWordOnly value is a per-term attribute because some terms
* may be CJK, while others are not. (For correctness, matchWholeWordOnly
* should always be false for CJK terms.).
* @param {goog.dom.annotate.AnnotateFn} annotateFn
* @param {*=} opt_ignoreCase Whether to ignore the case of the query
* terms when looking for matches.
* @return {goog.html.SafeHtml} The HTML equivalent of `text` with terms
* annotated, or null if the text did not contain any of the terms.
*/
goog.dom.annotate.annotateText = function(
text, terms, annotateFn, opt_ignoreCase) {
'use strict';
if (opt_ignoreCase) {
terms = goog.dom.annotate.lowercaseTerms_(terms);
}
return goog.dom.annotate.helpAnnotateText_(
text, terms, annotateFn, opt_ignoreCase);
};
/**
* Annotates occurrences of query terms in plain text. This process consists of
* identifying all occurrences of all query terms, calling a provided function
* to get the appropriate replacement HTML for each occurrence, and
* HTML-escaping all the text.
*
* @param {string} text The plain text to be searched.
* @param {Array<Array<?>>} terms An array of
* [{string} searchTerm, {boolean} matchWholeWordOnly] tuples.
* If `ignoreCase` is true, each search term must already be lowercase.
* The matchWholeWordOnly value is a per-term attribute because some terms
* may be CJK, while others are not. (For correctness, matchWholeWordOnly
* should always be false for CJK terms.).
* @param {goog.dom.annotate.AnnotateFn} annotateFn
* @param {*} ignoreCase Whether to ignore the case of the query terms
* when looking for matches.
* @return {goog.html.SafeHtml} The HTML equivalent of `text` with terms
* annotated, or null if the text did not contain any of the terms.
* @private
*/
goog.dom.annotate.helpAnnotateText_ = function(
text, terms, annotateFn, ignoreCase) {
'use strict';
var hit = false;
var textToSearch = ignoreCase ? text.toLowerCase() : text;
var textLen = textToSearch.length;
var numTerms = terms.length;
// Each element will be an array of hit positions for the term.
var termHits = new Array(numTerms);
// First collect all the hits into allHits.
for (var i = 0; i < numTerms; i++) {
var term = terms[i];
var hits = [];
var termText = term[0];
if (termText != '') {
var matchWholeWordOnly = term[1];
var termLen = termText.length;
var pos = 0;
// Find each hit for term t and append to termHits.
while (pos < textLen) {
var hitPos = textToSearch.indexOf(termText, pos);
if (hitPos == -1) {
break;
} else {
var prevCharPos = hitPos - 1;
var nextCharPos = hitPos + termLen;
if (!matchWholeWordOnly ||
((prevCharPos < 0 ||
goog.dom.annotate.NONWORD_RE_.test(
textToSearch.charAt(prevCharPos))) &&
(nextCharPos >= textLen ||
goog.dom.annotate.NONWORD_RE_.test(
textToSearch.charAt(nextCharPos))))) {
hits.push(hitPos);
hit = true;
}
pos = hitPos + termLen;
}
}
}
termHits[i] = hits;
}
if (hit) {
var html = [];
var pos = 0;
while (true) {
// First determine which of the n terms is the next hit.
var termIndexOfNextHit;
var posOfNextHit = -1;
for (var i = 0; i < numTerms; i++) {
var hits = termHits[i];
// pull off the position of the next hit of term t
// (it's always the first in the array because we're shifting
// hits off the front of the array as we process them)
// this is the next candidate to consider for the next overall hit
if (!goog.array.isEmpty(hits)) {
var hitPos = hits[0];
// Discard any hits embedded in the previous hit.
while (hitPos >= 0 && hitPos < pos) {
hits.shift();
hitPos = goog.array.isEmpty(hits) ? -1 : hits[0];
}
if (hitPos >= 0 && (posOfNextHit < 0 || hitPos < posOfNextHit)) {
termIndexOfNextHit = i;
posOfNextHit = hitPos;
}
}
}
// Quit if there are no more hits.
if (posOfNextHit < 0) break;
goog.asserts.assertNumber(termIndexOfNextHit);
// Remove the next hit from our hit list.
termHits[termIndexOfNextHit].shift();
// Append everything from the end of the last hit up to this one.
html.push(text.substr(pos, posOfNextHit - pos));
// Append the annotated term.
var termLen = terms[termIndexOfNextHit][0].length;
var termHtml =
goog.html.SafeHtml.htmlEscape(text.substr(posOfNextHit, termLen));
html.push(
annotateFn(goog.asserts.assertNumber(termIndexOfNextHit), termHtml));
pos = posOfNextHit + termLen;
}
// Append everything after the last hit.
html.push(text.substr(pos));
return goog.html.SafeHtml.concat(html);
} else {
return null;
}
};
/**
* Converts terms to lowercase.
*
* @param {Array<Array<?>>} terms An array of
* [{string} searchTerm, {boolean} matchWholeWordOnly] tuples.
* @return {!Array<Array<?>>} An array of
* [{string} searchTerm, {boolean} matchWholeWordOnly] tuples.
* @private
*/
goog.dom.annotate.lowercaseTerms_ = function(terms) {
'use strict';
var lowercaseTerms = [];
for (var i = 0; i < terms.length; ++i) {
var term = terms[i];
lowercaseTerms[i] = [term[0].toLowerCase(), term[1]];
}
return lowercaseTerms;
};