idl_lexer.py | Explore in Territory

#!/usr/bin/env python3
# Copyright 2013 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

""" Lexer for Web IDL

The lexer uses the PLY library to build a tokenizer which understands
Web IDL tokens.

The Web IDL specification, including the regular expressions for its tokens,
can be found at:
   http://webidl.spec.whatwg.org/
PLY can be found at:
   http://www.dabeaz.com/ply/
"""

import os.path
import sys

# SRC_DIR is the directory two levels above the one containing this file.
# Its 'third_party' subdirectory is put at the front of sys.path before 'ply'
# is imported, so the import below must stay after the sys.path change.
SRC_DIR = os.path.join(os.path.dirname(__file__), os.pardir, os.pardir)
sys.path.insert(0, os.path.join(SRC_DIR, 'third_party'))
from ply import lex


#
# IDL Lexer
#
class IDLLexer(object):
  """Tokenizer for Web IDL built on top of PLY's lex module.

  An instance wraps a PLY lexer object (see Lexer()) and additionally records
  the lexer position at which each line starts, so that errors can be reported
  with a file/line message and a caret under the offending column.

  NOTE: PLY takes the docstring of every 't_<TYPE>' method as the regular
  expression for that token, so those docstrings must contain only the
  regex.  The relative order of the 't_<TYPE>' methods is also significant
  (e.g. t_float before t_integer, t_SPECIAL_COMMENT before t_COMMENT).
  """

  # 'literals' is a value expected by lex which specifies a list of valid
  # literal tokens, meaning the token type and token value are identical.
  literals = r'"*.(){}[],;:=+-/~|&^?<>'

  # 't_ignore' contains ignored characters (spaces and tabs)
  t_ignore = ' \t'

  # 'tokens' is a value required by lex which specifies the complete list
  # of valid token types.  This class-level list is only a template: each
  # instance builds its own 'tokens' list from it in __init__.
  tokens = [
    # Data types
      'float',
      'integer',
      'string',

    # Symbol and keywords types
      'SPECIAL_COMMENT',
      'identifier',

    # MultiChar operators
      'ELLIPSIS',
  ]

  # 'keywords' is a map of string to token type.  All tokens matching
  # KEYWORD_OR_SYMBOL are matched against keywords dictionary, to determine
  # if the token is actually a keyword.  As with 'tokens', each instance
  # builds its own copy of this map in __init__.
  keywords = {
      'any': 'ANY',
      'async': 'ASYNC',
      'attribute': 'ATTRIBUTE',
      'bigint': 'BIGINT',
      'boolean': 'BOOLEAN',
      'byte': 'BYTE',
      'ByteString': 'BYTESTRING',
      'callback': 'CALLBACK',
      'const': 'CONST',
      'constructor': 'CONSTRUCTOR',
      'deleter': 'DELETER',
      'dictionary': 'DICTIONARY',
      'DOMString': 'DOMSTRING',
      'double': 'DOUBLE',
      'enum': 'ENUM',
      'false': 'FALSE',
      'float': 'FLOAT',
      'FrozenArray': 'FROZENARRAY',
      'getter': 'GETTER',
      'includes': 'INCLUDES',
      'Infinity': 'INFINITY',
      'inherit': 'INHERIT',
      'interface': 'INTERFACE',
      'iterable': 'ITERABLE',
      'long': 'LONG',
      'maplike': 'MAPLIKE',
      'mixin': 'MIXIN',
      'namespace': 'NAMESPACE',
      'NaN': 'NAN',
      'null': 'NULL',
      'object': 'OBJECT',
      'ObservableArray': 'OBSERVABLEARRAY',
      'octet': 'OCTET',
      'optional': 'OPTIONAL',
      'or': 'OR',
      'partial': 'PARTIAL',
      'Promise': 'PROMISE',
      'readonly': 'READONLY',
      'record': 'RECORD',
      'required': 'REQUIRED',
      'sequence': 'SEQUENCE',
      'setlike': 'SETLIKE',
      'setter': 'SETTER',
      'short': 'SHORT',
      'static': 'STATIC',
      'stringifier': 'STRINGIFIER',
      'true': 'TRUE',
      'typedef': 'TYPEDEF',
      'undefined': 'UNDEFINED',
      'unrestricted': 'UNRESTRICTED',
      'unsigned': 'UNSIGNED',
      'USVString': 'USVSTRING',
      'void': 'VOID'
  }

  # Token definitions
  #
  # Lex assumes any value or function in the form of 't_<TYPE>' represents a
  # regular expression where a match will emit a token of type <TYPE>.  In the
  # case of a function, the function is called when a match is made. These
  # definitions come from WebIDL.
  #
  # These need to be methods for lexer construction, despite not using self.
  # pylint: disable=R0201
  def t_ELLIPSIS(self, t):
    r'\.\.\.'
    return t

  # Regex needs to be in the docstring
  # pylint: disable=C0301
  def t_float(self, t):
    r'-?(([0-9]+\.[0-9]*|[0-9]*\.[0-9]+)([Ee][+-]?[0-9]+)?|[0-9]+[Ee][+-]?[0-9]+)'
    return t

  def t_integer(self, t):
    r'-?([1-9][0-9]*|0[Xx][0-9A-Fa-f]+|0[0-7]*)'
    return t


  # A line ending '\n', we use this to increment the line number.
  # Nothing is returned, so no token is emitted for line endings.
  def t_LINE_END(self, t):
    r'\n+'
    self.AddLines(len(t.value))

  # We do not process escapes in the IDL strings.  Strings are exclusively
  # used for attributes and enums, and not used as typical 'C' constants.
  # The surrounding quotes are stripped from the token value, and any
  # newlines inside the string are counted so line tracking stays correct.
  def t_string(self, t):
    r'"[^"]*"'
    t.value = t.value[1:-1]
    self.AddLines(t.value.count('\n'))
    return t

  # A Javadoc style comment:  /** xxx */
  # Unlike t_COMMENT, this is NOT ignored.
  # Also note that this should be defined before t_COMMENT.
  def t_SPECIAL_COMMENT(self, t):
    r'/\*\*(.|\n)+?\*/'
    self.AddLines(t.value.count('\n'))
    return t

  # A C or C++ style comment:  /* xxx */ or //
  # This token is ignored (nothing is returned), but the newlines it spans
  # are still counted.
  def t_COMMENT(self, t):
    r'(/\*(.|\n)*?\*/)|(//.*(\n[ \t]*//.*)*)'
    self.AddLines(t.value.count('\n'))

  # A symbol or keyword.
  def t_KEYWORD_OR_SYMBOL(self, t):
    r'[_-]?[A-Za-z][A-Za-z_0-9-]*'

    # All non-keywords are assumed to be symbols.  The lookup uses the raw
    # text, so a name with a leading underscore never matches a keyword.
    t.type = self.keywords.get(t.value, 'identifier')

    # We strip leading underscores so that you can specify symbols with the same
    # value as a keywords (E.g. a dictionary named 'interface').
    if t.value[0] == '_':
      t.value = t.value[1:]
    return t

  # Error handler: writes a file/line/caret message to stderr and bumps the
  # lexer error count.  It does not skip any input itself.
  def t_ANY_error(self, t):
    msg = 'Unrecognized input'
    line = self.Lexer().lineno

    # If that line has not been accounted for, then we must have hit
    # EoF, so compute the beginning of the line that caused the problem.
    if line >= len(self.index):
      # Find the offset in the line of the first word causing the issue
      # NOTE(review): split()[0] assumes the remaining input contains at
      # least one non-whitespace character -- confirm for inputs such as a
      # stray '\r'.
      word = t.value.split()[0]
      offs = self.lines[line - 1].find(word)
      # Add the computed line's starting position
      self.index.append(self.Lexer().lexpos - offs)
      msg = 'Unexpected EoF reached after'

    pos = self.Lexer().lexpos - self.index[line]
    out = self.ErrorMessage(line, pos, msg)
    sys.stderr.write(out + '\n')
    self._lex_errors += 1


  def AddLines(self, count):
    """Advances the line number by |count| and records the new line starts."""
    # Set the lexer position for the beginning of the next line.  In the case
    # of multiple lines, tokens can not exist on any of the lines except the
    # last one, so the recorded value for previous lines are unused.  We still
    # fill the array however, to make sure the line count is correct.
    self.Lexer().lineno += count
    for _ in range(count):
      self.index.append(self.Lexer().lexpos)

  def FileLineMsg(self, line, msg):
    """Returns |msg| prefixed with the file name and |line| + 1.

    When the lexer has no (truthy) filename the prefix is '<BuiltIn>' and no
    line number is included.
    """
    filename = self.Lexer().filename
    if filename:
      return "%s(%d) : %s" % (filename, line + 1, msg)
    return "<BuiltIn> : %s" % msg

  def SourceLine(self, line, pos):
    """Returns source line |line| followed by a caret line marking |pos|."""
    # Create a source line marker
    caret = ' ' * pos + '^'
    # We decrement the line number since the array is 0 based while the
    # line numbers are 1 based.
    return "%s\n%s" % (self.lines[line - 1], caret)

  def ErrorMessage(self, line, pos, msg):
    """Returns the full error text: file/line message plus source marker."""
    return "\n%s\n%s" % (
        self.FileLineMsg(line, msg),
        self.SourceLine(line, pos))

  #
  # Tokenizer
  #
  # The token function returns the next token provided by IDLLexer for matching
  # against the leaf patterns.
  #
  def token(self):
    """Returns the next token, or a falsy value when the input is exhausted.

    The most recent real token is remembered in self.last.
    """
    tok = self.Lexer().token()
    if tok:
      self.last = tok
    return tok


  def GetTokens(self):
    """Returns a list of all remaining tokens.

    Tokens are pulled straight from the underlying lexer, so self.last is
    not updated by this method.
    """
    outlist = []
    while True:
      t = self.Lexer().token()
      if not t:
        break
      outlist.append(t)
    return outlist

  def Tokenize(self, data, filename='__no_file__'):
    """Loads |data| as the input to tokenize.

    Args:
      data: The IDL source text.
      filename: Name stored on the lexer and used in error messages.
    """
    lexer = self.Lexer()
    lexer.lineno = 1
    lexer.filename = filename
    lexer.input(data)
    # The recorded line starts are offsets into the current input.  Drop the
    # entries left over from any previously tokenized input; otherwise error
    # columns for this input would be computed from stale offsets.
    self.index = [0]
    self.lines = data.split('\n')

  def KnownTokens(self):
    """Returns this instance's list of token type names."""
    return self.tokens

  def Lexer(self):
    """Returns the underlying PLY lexer object."""
    return self._lexobj

  def _AddToken(self, token):
    """Registers one token type; raises RuntimeError if already present."""
    if token in self.tokens:
      raise RuntimeError('Same token: ' + token)
    self.tokens.append(token)

  def _AddTokens(self, tokens):
    """Registers every token type in |tokens| via _AddToken."""
    for token in tokens:
      self._AddToken(token)

  def _AddKeywords(self, keywords):
    """Registers each keyword, using its upper-cased form as the token type."""
    for key in keywords:
      value = key.upper()
      self._AddToken(value)
      self.keywords[key] = value

  def _DelKeywords(self, keywords):
    """Removes each keyword and its upper-cased token type."""
    for key in keywords:
      self.tokens.remove(key.upper())
      del self.keywords[key]

  def __init__(self, optimize=True):
    """Builds the token tables and the PLY lexer.

    Args:
      optimize: Passed through to lex.lex().
    """
    # Lexer position of the start of each line seen so far.
    self.index = [0]
    self._lex_errors = 0
    # NOTE(review): 'linex' is not read anywhere in this class and looks like
    # a typo of 'lines' (set below) -- kept in case external code reads it.
    self.linex = []
    # NOTE(review): not read in this class; FileLineMsg uses the filename
    # stored on the lexer object instead.
    self.filename = None
    # Per-instance tables shadow the class-level templates so that adding or
    # removing keywords on one instance does not affect the class.
    self.keywords = {}
    self.tokens = []
    self._AddTokens(IDLLexer.tokens)
    self._AddKeywords(IDLLexer.keywords)
    self._lexobj = lex.lex(object=self, lextab=False, optimize=optimize)
    self.last = None
    self.lines = None

# If run by itself, attempt to build the lexer.  Only construction is
# exercised here (IDLLexer.__init__ calls lex.lex()); no input is tokenized.
if __name__ == '__main__':
  lexer_object = IDLLexer()
chromium/tools/idl_parser/idl_lexer.py