// chromium/third_party/google-closure-library/closure/goog/html/sanitizer/safedomtreeprocessor.js

/**
 * @license
 * Copyright The Closure Library Authors.
 * SPDX-License-Identifier: Apache-2.0
 */


/**
 * @fileoverview A base class to safely parse and transform an HTML string
 * using an inert DOM, which avoids executing scripts and loading images. Note:
 * this class does *not* guarantee that the output does not contain scripts and
 * images that eventually execute once the output is inserted into an active DOM
 * document. If any subclass claims to produce SafeHtml output, it must be
 * reviewed separately.
 * @supported IE 10+, Chrome 26+, Firefox 22+, Safari 7.1+, Opera 15+
 */

goog.module('goog.html.sanitizer.SafeDomTreeProcessor');
goog.module.declareLegacyNamespace();

var Const = goog.require('goog.string.Const');
var ElementWeakMap = goog.require('goog.html.sanitizer.ElementWeakMap');
var Logger = goog.require('goog.log.Logger');
var NodeType = goog.require('goog.dom.NodeType');
var TagName = goog.require('goog.dom.TagName');
var googDom = goog.require('goog.dom');
var googLog = goog.require('goog.log');
var noclobber = goog.require('goog.html.sanitizer.noclobber');
var safe = goog.require('goog.dom.safe');
var uncheckedconversions = goog.require('goog.html.uncheckedconversions');
var userAgent = goog.require('goog.userAgent');

/**
 * Module-wide logger. In this file it is only used to report nodes that are
 * dropped because their node type is not handled.
 * @const {?Logger}
 */
var logger = googLog.getLogger('goog.html.sanitizer.SafeDomTreeProcessor');

/**
 * Whether the HTML sanitizer is supported. For now mainly exclude
 * IE9 or below, for which we know the sanitizer is insecure or broken.
 * Evaluates to true for every non-IE user agent, and for IE only when
 * `userAgent.isDocumentModeOrHigher(10)` holds.
 * @const {boolean}
 */
var SAFE_PARSING_SUPPORTED =
    !userAgent.IE || userAgent.isDocumentModeOrHigher(10);

/**
 * Whether the template tag is supported. Evaluates to true for every non-IE
 * user agent; for IE it is true only when `document.documentMode` is null or
 * undefined (the comparison is a loose `== null`).
 * @const {boolean}
 */
var HTML_SANITIZER_TEMPLATE_SUPPORTED =
    !userAgent.IE || document.documentMode == null;

/**
 * Parses unsanitized HTML into an inert container and returns a TreeWalker
 * over the resulting nodes. Parsing is inert: no active content runs and the
 * browser issues no requests. The walker visits elements and text nodes only,
 * and always yields a parent before any of its children.
 *
 * The container is the content fragment of a TEMPLATE element when templates
 * are usable, and the BODY of a freshly created HTMLDocument otherwise.
 * @param {string} html The untrusted markup to parse.
 * @return {!TreeWalker} A walker rooted at the inert container.
 */
function getDomTreeWalker(html) {
  // The markup is wrapped as SafeHtml only so that it can be handed to the
  // innerHTML setter; it never reaches an active document.
  var inertMarkup =
      uncheckedconversions.safeHtmlFromStringKnownToSatisfyTypeContract(
          Const.from('Never attached to DOM.'), html);

  var template = document.createElement('template');
  var canUseTemplate =
      HTML_SANITIZER_TEMPLATE_SUPPORTED && 'content' in template;

  var walkRoot;
  if (canUseTemplate) {
    safe.unsafeSetInnerHtmlDoNotUseOrElse(template, inertMarkup);
    walkRoot = template.content;
  } else {
    // No usable <template>: fall back to the body of an inert HTMLDocument.
    var inertBody = document.implementation.createHTMLDocument('x').body;
    walkRoot = inertBody;
    safe.unsafeSetInnerHtmlDoNotUseOrElse(inertBody, inertMarkup);
  }

  var whatToShow = NodeFilter.SHOW_ELEMENT | NodeFilter.SHOW_TEXT;
  return document.createTreeWalker(
      walkRoot, whatToShow, null /* filter */,
      false /* entityReferenceExpansion */);
}

/**
 * Constructs a {@link SafeDomTreeProcessor} object that safely parses an input
 * string into a DOM tree using an inert document, and creates a new tree based
 * on the original tree, optionally transforming it in the process. The
 * transformation is not specified in this abstract class; subclasses are
 * supposed to override its protected methods to define a transformation that
 * allows tags and attributes, drops entire subtrees, modifies tag names or
 * attributes, etc.
 *
 * The hooks left abstract in this file are processRoot, preProcessHtml,
 * createTextNode, createElementWithoutAttributes and processElementAttribute.
 * The constructor itself takes no arguments and stores no state.
 * @constructor @struct @abstract
 */
var SafeDomTreeProcessor = function() {};

/**
 * Runs the tree transformation on an HTML string and serializes the resulting
 * forest back into a string. The synthetic SPAN that wraps the forest is
 * stripped from the output, unless the processor attached attributes to it, in
 * which case it is kept. Returns the empty string when safe parsing is not
 * supported.
 * @param {string} html
 * @return {string}
 * @protected @final
 */
SafeDomTreeProcessor.prototype.processToString = function(html) {
  if (!SAFE_PARSING_SUPPORTED) {
    return '';
  }

  var processedRoot = this.processToTree(html);
  var wrapper = processedRoot;
  var rootHasAttributes =
      noclobber.getElementAttributes(processedRoot).length > 0;
  if (rootHasAttributes) {
    // The processor decorated the root SPAN, so it has to survive. Nest it in
    // a throwaway SPAN; the stripping below then removes the throwaway one.
    wrapper = googDom.createElement(TagName.SPAN);
    wrapper.appendChild(processedRoot);
  }

  // XMLSerializer adds a spurious xmlns attribute to the outermost node, which
  // is discarded together with that node's tags.
  var markup = new XMLSerializer().serializeToString(wrapper);
  var innerStart = markup.indexOf('>') + 1;
  var innerEnd = markup.lastIndexOf('</');
  return markup.slice(innerStart, innerEnd);
};

/**
 * Parses an HTML string and walks the resulting DOM forest to apply the
 * transformation function and generate a copy of the forest. Returns the forest
 * wrapped in a common SPAN parent, so that the result is always a tree.
 *
 * When safe parsing is not supported, an empty SPAN is returned and none of
 * the subclass hooks (processRoot, preProcessHtml, ...) are invoked.
 * @param {string} html
 * @return {!HTMLSpanElement}
 * @protected @final
 */
SafeDomTreeProcessor.prototype.processToTree = function(html) {
  if (!SAFE_PARSING_SUPPORTED) {
    return googDom.createElement(TagName.SPAN);
  }
  var newRoot = googDom.createElement(TagName.SPAN);
  // Allow subclasses to attach properties to the root.
  this.processRoot(newRoot);

  // Allow subclasses to pre-process the HTML string before performing the main
  // tree-based transformation.
  html = this.preProcessHtml(html);
  var originalTreeWalker = getDomTreeWalker(html);

  // Mapping from original nodes to new nodes, used to find the parent to which
  // a newly processed node should be attached.
  var elementMap = ElementWeakMap.newWeakMap();

  var originalNode;
  // nextNode() returns a falsy value once the walk is exhausted, which ends the
  // loop.
  while (originalNode = originalTreeWalker.nextNode()) {
    // Make a copy of the node, potentially dropping it or changing its content,
    // tag name, etc.
    var newNode = this.createNode_(originalNode);
    if (!newNode) {
      // The transformation function chose not to copy over the node. We delete
      // the children so that the current treeWalker will stop iterating on
      // them.
      googDom.removeChildren(originalNode);
      continue;
    }
    // Only elements are recorded: they are the only nodes looked up later as
    // parents of subsequently visited nodes.
    if (noclobber.isNodeElement(newNode)) {
      elementMap.set(originalNode, newNode);
    }

    // Finds the new parent to which newNode should be appended. The tree is
    // copied top-down, so the parent of the current node has already been
    // copied and placed into the new tree. The new parent is either the root
    // of the new tree or a node found using elementMap.
    var originalParent = noclobber.getParentNode(originalNode);
    var isParentRoot = false;
    // A node without a parent is not appended anywhere: everything below,
    // including the appendChild call, is inside this branch.
    if (originalParent) {
      var originalParentNodeType = noclobber.getNodeType(originalParent);
      var originalParentNodeName =
          noclobber.getNodeName(originalParent).toLowerCase();
      var originalGrandParent = noclobber.getParentNode(originalParent);
      // The following checks if newParent is an immediate child of the inert
      // parent template element.
      if (originalParentNodeType == NodeType.DOCUMENT_FRAGMENT &&
          !originalGrandParent) {
        isParentRoot = true;
      } else if (originalParentNodeName == 'body' && originalGrandParent) {
        // The following checks if newParent is an immediate child of the
        // inert parent HtmlDocument.
        // That is: the parent is a BODY whose grandparent exists and has no
        // parent of its own (presumably BODY -> HTML -> document).
        var dirtyGreatGrandParent =
            noclobber.getParentNode(originalGrandParent);
        if (dirtyGreatGrandParent &&
            !noclobber.getParentNode(dirtyGreatGrandParent)) {
          isParentRoot = true;
        }
      }
      var newParent = null;
      // NOTE(review): `!originalParent` can never be true here, because this
      // code only runs inside `if (originalParent)`.
      if (isParentRoot || !originalParent) {
        newParent = newRoot;
      } else if (noclobber.isNodeElement(originalParent)) {
        newParent = elementMap.get(originalParent);
      }
      // NOTE(review): if neither branch above assigned a usable value,
      // newParent is still null (or whatever elementMap.get returned) and the
      // property access below throws.
      // A new parent exposing `content` (presumably a TEMPLATE element) gets
      // its children appended to that content instead of to the element.
      if (newParent.content) {
        newParent = newParent.content;
      }
      newParent.appendChild(newNode);
    }
  }
  if (elementMap.clear) {
    // Clear the map. On browsers that don't support WeakMap, entries are not
    // automatically cleaned up.
    elementMap.clear();
  }
  return newRoot;
};

/**
 * Creates the root SPAN element for the new tree. This function can be
 * overridden to add attributes to the tag. Note that if any attributes are
 * added to the element, then {@link processToString} will not strip it from the
 * generated string to preserve the attributes.
 *
 * Called by processToTree on the freshly created root SPAN, before the input
 * HTML is pre-processed and parsed. The implementation here is an empty
 * placeholder.
 * @param {!HTMLSpanElement} newRoot
 * @protected @abstract
 */
SafeDomTreeProcessor.prototype.processRoot = function(newRoot) {};

/**
 * Pre-processes the input html before the main tree-based transformation.
 *
 * Called by processToTree after processRoot; the string returned here is the
 * one that gets parsed into the inert DOM. The implementation here is an empty
 * placeholder that returns nothing, so subclasses must override it.
 * @param {string} html
 * @return {string}
 * @protected @abstract
 */
SafeDomTreeProcessor.prototype.preProcessHtml = function(html) {};

/**
 * Produces the counterpart of an original node for the new tree. Text nodes
 * and elements are delegated to their dedicated factories; every other node
 * type is logged and dropped. A null result means that the node, together with
 * all its children, must not be copied over.
 * @param {!Node} originalNode
 * @return {?Node}
 * @private
 */
SafeDomTreeProcessor.prototype.createNode_ = function(originalNode) {
  var type = noclobber.getNodeType(originalNode);
  if (type === NodeType.TEXT) {
    return this.createTextNode(/** @type {!Text} */ (originalNode));
  }
  if (type === NodeType.ELEMENT) {
    var originalElement = noclobber.assertNodeIsElement(originalNode);
    return this.createElement_(originalElement);
  }
  googLog.warning(logger, 'Dropping unknown node type: ' + type);
  return null;
};

/**
 * Creates a new text node from the original text node, or null if the node
 * should not be copied over to the new tree.
 *
 * Invoked by createNode_ for every node whose type is NodeType.TEXT. The
 * implementation here is an empty placeholder; subclasses must override it.
 * @param {!Text} originalNode
 * @return {?Text}
 * @protected @abstract
 */
SafeDomTreeProcessor.prototype.createTextNode = function(originalNode) {};

/**
 * Builds the new-tree counterpart of an original element. The subclass decides
 * on the tag (or drops the element), after which the original attributes are
 * transformed and copied onto the new element. TEMPLATE elements are never
 * offered to the subclass: they are always dropped.
 * @param {!Element} originalElement
 * @return {?Element} The new element, or null if the element is dropped.
 * @private
 */
SafeDomTreeProcessor.prototype.createElement_ = function(originalElement) {
  var upperCaseName = noclobber.getNodeName(originalElement).toUpperCase();
  if (upperCaseName === 'TEMPLATE') {
    // Processing TEMPLATE tags is not supported, so they are discarded here.
    return null;
  }

  var elementCopy = this.createElementWithoutAttributes(originalElement);
  if (elementCopy) {
    // Transform each original attribute and attach the survivors to the copy.
    this.processElementAttributes_(originalElement, elementCopy);
    return elementCopy;
  }
  return null;
};

/**
 * Creates a new element from the original element. This function should only
 * either create a new element (optionally changing the tag name from the
 * original element) or return null to prevent the entire subtree from appearing
 * in the output. Note that TEMPLATE tags and their contents are automatically
 * dropped, and this function is not called to decide whether to keep them or
 * not.
 *
 * Attributes are handled separately: after this hook returns an element,
 * createElement_ copies them over through processElementAttribute. The
 * implementation here is an empty placeholder; subclasses must override it.
 * @param {!Element} originalElement
 * @return {?Element}
 * @protected @abstract
 */
SafeDomTreeProcessor.prototype.createElementWithoutAttributes = function(
    originalElement) {};

/**
 * Transfers the attributes of an original element onto the new element that
 * was created for it by {@link createElementWithoutAttributes}. Each specified
 * attribute is passed through {@link processElementAttribute}; attributes for
 * which that hook returns null are left out.
 * @param {!Element} originalElement
 * @param {!Element} newElement
 * @private
 */
SafeDomTreeProcessor.prototype.processElementAttributes_ = function(
    originalElement, newElement) {
  var originalAttributes = noclobber.getElementAttributes(originalElement);
  if (originalAttributes == null) {
    return;
  }
  var index = 0;
  var current;
  // The walk stops at the first falsy entry, i.e. at the end of the list.
  while ((current = originalAttributes[index++])) {
    if (!current.specified) {
      continue;
    }
    var processedValue = this.processElementAttribute(originalElement, current);
    if (processedValue === null) {
      continue;
    }
    noclobber.setElementAttribute(newElement, current.name, processedValue);
  }
};

/**
 * Returns the new value for an attribute, or null if the attribute should be
 * dropped.
 *
 * Invoked by processElementAttributes_ once per attribute of the original
 * element whose `specified` flag is truthy. Only a strict null drops the
 * attribute; any other return value is set on the new element under the
 * original attribute name. The implementation here is an empty placeholder;
 * subclasses must override it.
 * @param {!Element} element
 * @param {!Attr} attribute
 * @return {?string}
 * @protected @abstract
 */
SafeDomTreeProcessor.prototype.processElementAttribute = function(
    element, attribute) {};

/**
 * Public alias of the module-level SAFE_PARSING_SUPPORTED flag, exposed as a
 * static property of the exported constructor.
 * @const {boolean}
 */
SafeDomTreeProcessor.SAFE_PARSING_SUPPORTED = SAFE_PARSING_SUPPORTED;
// The constructor is the module's sole export.
exports = SafeDomTreeProcessor;