// textextractor.js | Explore in Territory

/**
 * @license
 * Copyright The Closure Library Authors.
 * SPDX-License-Identifier: Apache-2.0
 */


/**
 * @fileoverview Contains utility methods to extract text content from HTML.
 * @supported IE 10+, Chrome 26+, Firefox 22+, Safari 7.1+, Opera 15+
 */

goog.provide('goog.html.textExtractor');

goog.require('goog.dom.TagName');
goog.require('goog.html.sanitizer.HtmlSanitizer');
goog.require('goog.object');
goog.require('goog.userAgent');


/**
 * Converts an untrusted HTML string into plain text. The markup is first run
 * through the HtmlSanitizer and the resulting DOM is then walked to collect
 * its text. Unlike goog.html.utils.stripHtmlTags, newlines are emitted around
 * block-level elements, and content that would not be displayed to the user
 * (such as the bodies of SCRIPT and STYLE tags) is left out.
 *
 * Returns the empty string when isSupported() reports that the current
 * browser cannot run the extractor.
 * @param {string} html The untrusted HTML string.
 * @return {string} The extracted text, with leading and trailing whitespace
 *     trimmed.
 */
goog.html.textExtractor.extractTextContent = function(html) {
  'use strict';
  // TODO(pelizzi): consider an optional bool parameter to also extract the
  // text content of alt attributes and such.
  var extractedText = '';
  if (goog.html.textExtractor.isSupported()) {
    // Only the style attribute is let through; every other attribute is
    // dropped to protect against DOM clobbering.
    var builder = new goog.html.sanitizer.HtmlSanitizer.Builder();
    var sanitizer =
        builder.onlyAllowAttributes(['style']).allowCssStyles().build();
    // The sanitizer's default policy strips the content of tags such as
    // SCRIPT and STYLE, so their non-textual content never reaches the
    // extracted text.
    var sanitizedRoot = sanitizer.sanitizeToDomNode(html);
    // Neither textContent nor innerText spaces block elements properly, so
    // the tree is walked by our own routine, which inserts the newlines.
    extractedText =
        goog.html.textExtractor.extractTextContentFromNode_(sanitizedRoot)
            .trim();
  }
  return extractedText;
};


/**
 * Walks the given DOM node and its descendants depth-first and builds their
 * text representation:
 * - a text node yields its value with every whitespace run collapsed to one
 *   space and the result trimmed, so whitespace at either end of an
 *   individual text node is dropped;
 * - a BR element yields a single newline;
 * - any other element yields the concatenation (with no separator) of its
 *   children's text, wrapped in a leading and trailing newline when the
 *   element is a block element;
 * - every other node type yields the empty string.
 * @param {!Node} node The node to extract text from.
 * @return {string} The text extracted from the node's subtree.
 * @private
 */
goog.html.textExtractor.extractTextContentFromNode_ = function(node) {
  'use strict';
  if (node.nodeType === Node.TEXT_NODE) {
    return node.nodeValue.replace(/\s+/g, ' ').trim();
  }
  if (node.nodeType !== Node.ELEMENT_NODE) {
    // Comments, processing instructions, etc. contribute no text.
    return '';
  }

  var element = /** @type {!Element} */ (node);
  // NOTE(review): the loose comparison is kept deliberately; goog.dom.TagName
  // values may not be plain strings -- confirm before tightening to ===.
  if (element.tagName == goog.dom.TagName.BR) {
    return '\n';
  }

  var children = node.childNodes;
  var collected = '';
  for (var i = 0; i < children.length; i++) {
    collected +=
        goog.html.textExtractor.extractTextContentFromNode_(children[i]);
  }
  return goog.html.textExtractor.isBlockElement_(element) ?
      '\n' + collected + '\n' :
      collected;
};


/**
 * Tag names that are always treated as block elements, stored as the keys of
 * a set-like object so membership can be tested with hasOwnProperty.
 * @private @const {!Object<!goog.dom.TagName, boolean>}
 */
goog.html.textExtractor.BLOCK_ELEMENTS_ = goog.object.createSet(
    goog.dom.TagName.ADDRESS,
    goog.dom.TagName.BLOCKQUOTE,
    goog.dom.TagName.CENTER,
    goog.dom.TagName.DIV,
    goog.dom.TagName.DL,
    goog.dom.TagName.FIELDSET,
    goog.dom.TagName.FORM,
    goog.dom.TagName.H1,
    goog.dom.TagName.H2,
    goog.dom.TagName.H3,
    goog.dom.TagName.H4,
    goog.dom.TagName.H5,
    goog.dom.TagName.H6,
    goog.dom.TagName.HR,
    goog.dom.TagName.OL,
    goog.dom.TagName.P,
    goog.dom.TagName.PRE,
    goog.dom.TagName.TABLE,
    goog.dom.TagName.UL);


/**
 * Returns whether the element is a block element, i.e. one whose text content
 * the browser would visually separate from the text content of the previous
 * node. An element qualifies when its own style sets display to 'block' or
 * when its tag name is listed in BLOCK_ELEMENTS_.
 * @param {!Element} element The element to classify.
 * @return {boolean} True if the element is treated as a block element.
 * @private
 */
goog.html.textExtractor.isBlockElement_ = function(element) {
  'use strict';
  // An explicit display:block on the element itself wins regardless of tag.
  if (element.style.display == 'block') {
    return true;
  }
  return goog.html.textExtractor.BLOCK_ELEMENTS_.hasOwnProperty(
      element.tagName);
};


/**
 * Reports whether the text extractor can run in the current browser. The
 * extractor is built on the HTML Sanitizer, which only supports IE from
 * version 10 onwards; every non-IE browser is reported as supported.
 * Visible for testing.
 * @return {boolean} True if the extractor is supported.
 * @package
 */
goog.html.textExtractor.isSupported = function() {
  'use strict';
  if (goog.userAgent.IE) {
    // On IE the answer depends on the version.
    return goog.userAgent.isVersionOrHigher(10);
  }
  return true;
};
// chromium/third_party/google-closure-library/closure/goog/html/textextractor.js