// htmlprettyprinter.js | Explore in Territory

/**
 * @license
 * Copyright The Closure Library Authors.
 * SPDX-License-Identifier: Apache-2.0
 */

/**
 * @fileoverview Provides functions to parse and pretty-print HTML strings.
 */

goog.provide('goog.format.HtmlPrettyPrinter');
goog.provide('goog.format.HtmlPrettyPrinter.Buffer');

goog.require('goog.dom.TagName');
goog.require('goog.object');
goog.require('goog.string.StringBuffer');



/**
 * Formatter that inserts line breaks into HTML markup so that it is easier for
 * a person to read.
 * TODO(user): Add hierarchical indentation.
 * @param {number=} opt_timeOutMillis Upper bound, in milliseconds, on the time
 *     one #format call may spend. When it is exceeded the rest of the input is
 *     returned only partially formatted. Omitted, 0 or a negative number
 *     means there is no limit.
 * @constructor
 * @final
 */
goog.format.HtmlPrettyPrinter = function(opt_timeOutMillis) {
  'use strict';
  /**
   * Max # milliseconds to spend on #format; 0 means "no timeout".
   * @type {number}
   * @private
   */
  this.timeOutMillis_ = 0;

  // Only a truthy, strictly positive value enables the timeout.
  if (opt_timeOutMillis && opt_timeOutMillis > 0) {
    this.timeOutMillis_ = opt_timeOutMillis;
  }
};


/**
 * Singleton used by the static #format helper. Stays null until
 * #getInstance_ creates it, which it does without a timeout argument.
 * @private {goog.format.HtmlPrettyPrinter?}
 */
goog.format.HtmlPrettyPrinter.instance_ = null;


/**
 * Returns the shared pretty printer, constructing it on first use.
 * @return {!goog.format.HtmlPrettyPrinter} Singleton.
 * @private
 */
goog.format.HtmlPrettyPrinter.getInstance_ = function() {
  'use strict';
  var printer = goog.format.HtmlPrettyPrinter.instance_;
  if (!printer) {
    // First call: build the instance (no timeout) and cache it.
    printer = new goog.format.HtmlPrettyPrinter();
    goog.format.HtmlPrettyPrinter.instance_ = printer;
  }
  return printer;
};


/**
 * Static convenience wrapper: formats with the shared singleton instance. See
 * prototype #format for the formatting rules.
 * @param {string} html The HTML text to pretty print.
 * @return {string} Formatted result.
 */
goog.format.HtmlPrettyPrinter.format = function(html) {
  'use strict';
  var printer = goog.format.HtmlPrettyPrinter.getInstance_();
  return printer.format(html);
};


/**
 * Pattern used to tokenize HTML for pretty printing. Each match is one token;
 * the alternatives are tried in this order:
 * comment|meta-tag|tag|text|other-less-than-characters
 *
 * Only the tag alternative has capture groups: group 1 is '/' for an end tag
 * (the empty string otherwise) and group 2 is the tag name. For all other
 * alternatives both groups are undefined in the match array.
 *
 * The '.' in the comment and meta-tag alternatives does not match line
 * terminators, so a comment or <!...> declaration spanning several lines is
 * not matched as a single token by those alternatives.
 *
 * The regex is global (/g): exec() starts at and advances lastIndex, and this
 * single instance is shared by every #format call.
 * @private {!RegExp}
 * @const
 */
goog.format.HtmlPrettyPrinter.TOKEN_REGEX_ =
    /(?:<!--.*?-->|<!.*?>|<(\/?)(\w+)[^<>]*>|[^<]+|<)/g;


/**
 * Tags whose contents we don't want pretty printed. While one of these tags is
 * open, #format copies every token through without adding line breaks; a line
 * break is only requested before the outermost start tag and after its
 * matching end tag.
 * #format upper-cases a tag name before looking it up in this set.
 * @private {!Object}
 * @const
 */
goog.format.HtmlPrettyPrinter.NON_PRETTY_PRINTED_TAGS_ = goog.object.createSet(
    goog.dom.TagName.SCRIPT, goog.dom.TagName.STYLE, goog.dom.TagName.PRE,
    'XMP');


/**
 * 'Block' tags. We should add newlines before and after these tags during
 * pretty printing: #format requests a line break before a start tag and after
 * an end tag, and both before and after a tag that is also in EMPTY_TAGS_.
 * Tags drawn mostly from HTML4 definitions for block and other non-inline
 * tags, excepting the ones in
 * #goog.format.HtmlPrettyPrinter.NON_PRETTY_PRINTED_TAGS_.
 * @private {!Object}
 * @const
 */
goog.format.HtmlPrettyPrinter.BLOCK_TAGS_ = goog.object.createSet(
    goog.dom.TagName.ADDRESS, goog.dom.TagName.APPLET, goog.dom.TagName.AREA,
    goog.dom.TagName.BASE, goog.dom.TagName.BASEFONT,
    goog.dom.TagName.BLOCKQUOTE, goog.dom.TagName.BODY,
    goog.dom.TagName.CAPTION, goog.dom.TagName.CENTER, goog.dom.TagName.COL,
    goog.dom.TagName.COLGROUP, goog.dom.TagName.DIR, goog.dom.TagName.DIV,
    goog.dom.TagName.DL, goog.dom.TagName.FIELDSET, goog.dom.TagName.FORM,
    goog.dom.TagName.FRAME, goog.dom.TagName.FRAMESET, goog.dom.TagName.H1,
    goog.dom.TagName.H2, goog.dom.TagName.H3, goog.dom.TagName.H4,
    goog.dom.TagName.H5, goog.dom.TagName.H6, goog.dom.TagName.HEAD,
    goog.dom.TagName.HR, goog.dom.TagName.HTML, goog.dom.TagName.IFRAME,
    goog.dom.TagName.ISINDEX, goog.dom.TagName.LEGEND, goog.dom.TagName.LINK,
    goog.dom.TagName.MENU, goog.dom.TagName.META, goog.dom.TagName.NOFRAMES,
    goog.dom.TagName.NOSCRIPT, goog.dom.TagName.OL, goog.dom.TagName.OPTGROUP,
    goog.dom.TagName.OPTION, goog.dom.TagName.P, goog.dom.TagName.PARAM,
    goog.dom.TagName.TABLE, goog.dom.TagName.TBODY, goog.dom.TagName.TD,
    goog.dom.TagName.TFOOT, goog.dom.TagName.TH, goog.dom.TagName.THEAD,
    goog.dom.TagName.TITLE, goog.dom.TagName.TR, goog.dom.TagName.UL);


/**
 * Non-block tags that break flow. We insert a line break after, but not before
 * these: #format requests the break after an end tag, or after any tag that
 * is also in EMPTY_TAGS_ (BR). Tags drawn from HTML4 definitions.
 * NOTE: NOFRAMES is also listed in BLOCK_TAGS_, which #format checks first,
 * so its entry here never takes effect.
 * @private {!Object}
 * @const
 */
goog.format.HtmlPrettyPrinter.BREAKS_FLOW_TAGS_ = goog.object.createSet(
    goog.dom.TagName.BR, goog.dom.TagName.DD, goog.dom.TagName.DT,
    goog.dom.TagName.LI, goog.dom.TagName.NOFRAMES);


/**
 * Empty tags. These are treated as both start and end tags, so they get the
 * line breaks of both: HR and ISINDEX (also in BLOCK_TAGS_) get a break before
 * and after, BR (also in BREAKS_FLOW_TAGS_) gets a break after. #format only
 * consults this set for tags found in one of those two sets.
 * @private {!Object}
 * @const
 */
goog.format.HtmlPrettyPrinter.EMPTY_TAGS_ = goog.object.createSet(
    goog.dom.TagName.BR, goog.dom.TagName.HR, goog.dom.TagName.ISINDEX);


/**
 * Breaks up HTML so it's easily readable by the user.
 *
 * Leading and trailing whitespace is trimmed first (the spaces directly in
 * front of the first non-whitespace character are kept). The trimmed input is
 * then tokenized with TOKEN_REGEX_ and every token is copied verbatim to the
 * output; the only characters ever added are '\n' line breaks around block,
 * flow-breaking and non-pretty-printed tags. The result always ends in a line
 * break unless the trimmed input is empty.
 *
 * If this instance has a timeout and it is exceeded, the not yet tokenized
 * remainder of the input is appended as-is.
 * @param {string} html The HTML text to pretty print.
 * @return {string} Formatted result.
 * @throws {Error} Regex error, data loss, or endless loop detected.
 */
goog.format.HtmlPrettyPrinter.prototype.format = function(html) {
  'use strict';
  // Trim leading whitespace, but preserve first indent; in other words, keep
  // any spaces immediately before the first non-whitespace character (that's
  // what $1 is), but remove all other leading whitespace. This adjustment
  // historically had been made in Docs. The motivation is that some
  // browsers prepend several line breaks in designMode.
  html = html.replace(/^\s*?( *\S)/, '$1');

  // Trim trailing whitespace.
  html = html.replace(/\s+$/, '');

  // Keep track of how much time we've used.
  var timeOutMillis = this.timeOutMillis_;
  var startMillis = timeOutMillis ? Date.now() : 0;

  // Handles concatenation of the result and required line breaks.
  var buffer = new goog.format.HtmlPrettyPrinter.Buffer();

  // Declare these for efficiency since we access them in a loop.
  var tokenRegex = goog.format.HtmlPrettyPrinter.TOKEN_REGEX_;
  var nonPpTags = goog.format.HtmlPrettyPrinter.NON_PRETTY_PRINTED_TAGS_;
  var blockTags = goog.format.HtmlPrettyPrinter.BLOCK_TAGS_;
  var breaksFlowTags = goog.format.HtmlPrettyPrinter.BREAKS_FLOW_TAGS_;
  var emptyTags = goog.format.HtmlPrettyPrinter.EMPTY_TAGS_;

  // The tokenizer is a shared global (/g) regex, so exec() resumes from its
  // lastIndex. Always start from the beginning of this input, even if an
  // earlier call was aborted and left a stale position behind.
  tokenRegex.lastIndex = 0;

  // Used to verify we're making progress through our regex tokenization.
  var lastIndex = 0;

  // Use this to track non-pretty-printed tags and children.
  var nonPpTagStack = [];

  // Loop through each matched token.
  var match;
  while (match = tokenRegex.exec(html)) {
    // Get token.
    var token = match[0];

    // Is this token a tag? The match array always has three entries (the
    // regex has two capture groups), so the array length cannot tell us. The
    // tag-name group is only set when the tag alternative matched.
    var tagName = match[2] ? match[2].toUpperCase() : '';

    if (!tagName) {
      // Non-tags (comments, <!...> declarations, text, stray '<'), no line
      // break.
      buffer.pushToken(false, token, false);
    } else {
      var isEndTag = match[1] === '/';

      if (nonPpTags.hasOwnProperty(tagName)) {
        // Non-pretty-printed tag.
        if (isEndTag) {
          // Do we have a matching start tag?
          var stackSize = nonPpTagStack.length;
          var startTagName = stackSize ? nonPpTagStack[stackSize - 1] : null;
          if (startTagName === tagName) {
            // End of non-pretty-printed block. Line break after, but only
            // once the outermost block has been closed.
            nonPpTagStack.pop();
            buffer.pushToken(false, token, !nonPpTagStack.length);
          } else {
            // Malformed HTML. No line breaks.
            buffer.pushToken(false, token, false);
          }
        } else {
          // Start of non-pretty-printed block. Line break before, unless we
          // are already inside another non-pretty-printed block.
          buffer.pushToken(!nonPpTagStack.length, token, false);
          nonPpTagStack.push(tagName);
        }
      } else if (nonPpTagStack.length) {
        // Inside non-pretty-printed block, no new line breaks.
        buffer.pushToken(false, token, false);
      } else if (blockTags.hasOwnProperty(tagName)) {
        // Put line break before start block and after end block tags. Empty
        // tags count as both.
        var isEmptyBlock = emptyTags.hasOwnProperty(tagName);
        buffer.pushToken(
            isEmptyBlock || !isEndTag, token, isEmptyBlock || isEndTag);
      } else if (breaksFlowTags.hasOwnProperty(tagName)) {
        // Put line break after end flow-breaking tags (and empty ones).
        var isEmptyFlow = emptyTags.hasOwnProperty(tagName);
        buffer.pushToken(false, token, isEndTag || isEmptyFlow);
      } else {
        // All other tags, no line break.
        buffer.pushToken(false, token, false);
      }
    }

    // Double check that we're making progress.
    var newLastIndex = tokenRegex.lastIndex;
    if (!token || newLastIndex <= lastIndex) {
      // Don't leave a stale position in the shared regex for the next call.
      tokenRegex.lastIndex = 0;
      throw new Error('Regex failed to make progress through source html.');
    }
    lastIndex = newLastIndex;

    // Out of time?
    if (timeOutMillis && Date.now() - startMillis > timeOutMillis) {
      // Push unprocessed data as one big token and reset regex object. An
      // empty remainder is skipped: pushing an empty token would make the
      // buffer forget its line state and end the output in two line breaks.
      var remainder = html.substring(newLastIndex);
      if (remainder) {
        buffer.pushToken(false, remainder, false);
      }
      tokenRegex.lastIndex = 0;
      break;
    }
  }

  // Ensure we end in a line break.
  buffer.lineBreak();

  // Construct result string.
  var result = String(buffer);

  // Length should be original length plus # line breaks added.
  var expectedLength = html.length + buffer.breakCount;
  if (result.length !== expectedLength) {
    throw new Error('Lost data pretty printing html.');
  }

  return result;
};



/**
 * This class is a buffer to which we push our output. It tracks line breaks to
 * make sure we don't add unnecessary ones, and counts the ones it does add.
 * @constructor
 * @final
 */
goog.format.HtmlPrettyPrinter.Buffer = function() {
  'use strict';
  /**
   * Accumulates the pushed tokens and inserted line breaks; its contents are
   * what #toString returns.
   * @type {goog.string.StringBuffer}
   * @private
   */
  this.out_ = new goog.string.StringBuffer();
};


/**
 * Tracks number of line breaks added. Incremented by #lineBreak each time it
 * appends a '\n'; line breaks contained in pushed tokens are not counted.
 * @type {number}
 */
goog.format.HtmlPrettyPrinter.Buffer.prototype.breakCount = 0;


/**
 * Tracks if we are at the start of a new line. Starts out true, so that
 * #lineBreak adds nothing to an empty buffer; #pushToken updates it to whether
 * the pushed token ended with a line break.
 * @type {boolean}
 * @private
 */
goog.format.HtmlPrettyPrinter.Buffer.prototype.isBeginningOfNewLine_ = true;


/**
 * Tracks if we need a new line before the next token. Set by #pushToken when
 * the previous token asked for a line break after itself and did not already
 * end with one.
 * @type {boolean}
 * @private
 */
goog.format.HtmlPrettyPrinter.Buffer.prototype.needsNewLine_ = false;


/**
 * Adds token and necessary line breaks to output buffer.
 *
 * A line break requested after a token is not written immediately; it is
 * remembered and written in front of the next token, so that it can be
 * dropped when that token brings its own leading line break.
 * @param {boolean} breakBefore If true, add line break before token if
 *     necessary.
 * @param {string} token Token to push.
 * @param {boolean} breakAfter If true, add line break after token if
 *     necessary.
 */
goog.format.HtmlPrettyPrinter.Buffer.prototype.pushToken = function(
    breakBefore, token, breakAfter) {
  'use strict';
  // A line break is owed if the previous token asked for one after itself or
  // this token asks for one before itself.
  var breakOwed = this.needsNewLine_ || breakBefore;

  // Skip the break if the token already starts with a line break.
  var startsWithBreak = /^\r?\n/.test(token);

  // Due to FF3.0 bug with lists, we don't insert a \n
  // right before </ul>. See bug 1520665.
  // Only a token that really is a </ul> end tag is exempt; testing for "/ul"
  // anywhere in the token would also swallow the break in front of unrelated
  // tokens such as <a href="/ul/x"> or text containing "/ul".
  var isUlEndTag = /^<\/ul\b/i.test(token);

  if (breakOwed && !startsWithBreak && !isUlEndTag) {
    this.lineBreak();
  }

  // Token.
  this.out_.append(token);

  // Remember if this string ended with a line break so we know we don't have to
  // insert another one before the next token.
  this.isBeginningOfNewLine_ = /\r?\n$/.test(token);

  // Remember if this token requires a line break after it. We don't insert it
  // here because we might not have to if the next token starts with a line
  // break.
  this.needsNewLine_ = breakAfter && !this.isBeginningOfNewLine_;
};


/**
 * Append line break if we need one, i.e. unless the output is already at the
 * beginning of a line. Each appended break is counted in #breakCount.
 * Calling this repeatedly without pushing a token in between appends at most
 * one line break.
 */
goog.format.HtmlPrettyPrinter.Buffer.prototype.lineBreak = function() {
  'use strict';
  if (!this.isBeginningOfNewLine_) {
    this.out_.append('\n');
    ++this.breakCount;
    // We are now at the start of a line; without recording that, a second
    // call would append another, unnecessary line break.
    this.isBeginningOfNewLine_ = true;
  }
};


/**
 * Renders everything pushed so far, including the inserted line breaks.
 * @return {string} String representation of tokens.
 * @override
 */
goog.format.HtmlPrettyPrinter.Buffer.prototype.toString = function() {
  'use strict';
  var text = this.out_.toString();
  return text;
};
// chromium/third_party/google-closure-library/closure/goog/format/htmlprettyprinter.js