chromium/mojo/public/tools/mojom/mojom/parse/lexer.py

# Copyright 2014 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import os.path
import sys

from mojom import fileutil
from mojom.error import Error

fileutil.AddLocalRepoThirdPartyDirToModulePath()
from ply.lex import LexToken, TOKEN


class LexError(Error):
  """Class for errors from the lexer."""

  def __init__(self, filename, message, lineno):
    Error.__init__(self, filename, message, lineno=lineno)


# We have methods which look like they could be functions:
# pylint: disable=R0201
class Lexer:
  def __init__(self, filename):
    self.filename = filename
    self.line_comments = []
    self.suffix_comments = []

  ######################--   PRIVATE   --######################

  ##
  ## Internal auxiliary methods
  ##
  def _error(self, msg, token):
    raise LexError(self.filename, msg, token.lineno)

  ##
  ## Reserved keywords
  ##
  keywords = (
      'HANDLE',
      'IMPORT',
      'MODULE',
      'STRUCT',
      'UNION',
      'INTERFACE',
      'ENUM',
      'CONST',
      'TRUE',
      'FALSE',
      'DEFAULT',
      'ARRAY',
      'MAP',
      'PENDING_REMOTE',
      'PENDING_RECEIVER',
      'PENDING_ASSOCIATED_REMOTE',
      'PENDING_ASSOCIATED_RECEIVER',
      'FEATURE',
  )

  keyword_map = {}
  for keyword in keywords:
    keyword_map[keyword.lower()] = keyword

  ##
  ## All the tokens recognized by the lexer
  ##
  tokens = keywords + (
      # Identifiers
      'NAME',

      # Constants
      'ORDINAL',
      'INT_CONST_DEC',
      'INT_CONST_HEX',
      'FLOAT_CONST',

      # String literals
      'STRING_LITERAL',

      # Operators
      'MINUS',
      'PLUS',
      'QSTN',

      # Assignment
      'EQUALS',

      # Request / response
      'RESPONSE',

      # Delimiters
      'LPAREN',
      'RPAREN',  # ( )
      'LBRACKET',
      'RBRACKET',  # [ ]
      'LBRACE',
      'RBRACE',  # { }
      'LANGLE',
      'RANGLE',  # < >
      'SEMI',  # ;
      'COMMA',
      'DOT'  # , .
  )

  ##
  ## Regexes for use in tokens
  ##

  # valid C identifiers (K&R2: A.2.3)
  identifier = r'[a-zA-Z_][0-9a-zA-Z_]*'

  hex_prefix = '0[xX]'
  hex_digits = '[0-9a-fA-F]+'

  # integer constants (K&R2: A.2.5.1)
  decimal_constant = '0|([1-9][0-9]*)'
  hex_constant = hex_prefix + hex_digits
  # Don't allow octal constants (even invalid octal).
  octal_constant_disallowed = '0[0-9]+'

  # character constants (K&R2: A.2.5.2)
  # Note: a-zA-Z and '.-~^_!=&;,' are allowed as escape chars to support #line
  # directives with Windows paths as filenames (..\..\dir\file)
  # For the same reason, decimal_escape allows all digit sequences. We want to
  # parse all correct code, even if it means to sometimes parse incorrect
  # code.
  #
  simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
  decimal_escape = r"""(\d+)"""
  hex_escape = r"""(x[0-9a-fA-F]+)"""
  bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])"""

  escape_sequence = \
      r"""(\\("""+simple_escape+'|'+decimal_escape+'|'+hex_escape+'))'

  # string literals (K&R2: A.2.6)
  string_char = r"""([^"\\\n]|""" + escape_sequence + ')'
  string_literal = '"' + string_char + '*"'
  bad_string_literal = '"' + string_char + '*' + bad_escape + string_char + '*"'

  # floating constants (K&R2: A.2.5.3)
  exponent_part = r"""([eE][-+]?[0-9]+)"""
  fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
  floating_constant = \
      '(((('+fractional_constant+')'+ \
      exponent_part+'?)|([0-9]+'+exponent_part+')))'

  # Ordinals
  ordinal = r'@[0-9]+'
  missing_ordinal_value = r'@'
  # Don't allow ordinal values in octal (even invalid octal, like 09) or
  # hexadecimal.
  octal_or_hex_ordinal_disallowed = (
      r'@((0[0-9]+)|(' + hex_prefix + hex_digits + '))')

  comment = r'(/\*(.|\n)*?\*/)|(//.*(\n[ \t]*//.*)*)'

  ##
  ## Rules for the normal state
  ##
  t_ignore = ' \t\r'

  # Newlines
  def t_NEWLINE(self, t):
    r'\n+'
    t.lexer.lineno += len(t.value)

  # Operators
  t_MINUS = r'-'
  t_PLUS = r'\+'
  t_QSTN = r'\?'

  # =
  t_EQUALS = r'='

  # =>
  t_RESPONSE = r'=>'

  # Delimiters
  t_LPAREN = r'\('
  t_RPAREN = r'\)'
  t_LBRACKET = r'\['
  t_RBRACKET = r'\]'
  t_LBRACE = r'\{'
  t_RBRACE = r'\}'
  t_LANGLE = r'<'
  t_RANGLE = r'>'
  t_COMMA = r','
  t_DOT = r'\.'
  t_SEMI = r';'

  t_STRING_LITERAL = string_literal

  # The following floating and integer constants are defined as
  # functions to impose a strict order (otherwise, decimal
  # is placed before the others because its regex is longer,
  # and this is bad)
  #
  @TOKEN(floating_constant)
  def t_FLOAT_CONST(self, t):
    return t

  @TOKEN(hex_constant)
  def t_INT_CONST_HEX(self, t):
    return t

  @TOKEN(octal_constant_disallowed)
  def t_OCTAL_CONSTANT_DISALLOWED(self, t):
    msg = "Octal values not allowed"
    self._error(msg, t)

  @TOKEN(decimal_constant)
  def t_INT_CONST_DEC(self, t):
    return t

  # unmatched string literals are caught by the preprocessor

  @TOKEN(bad_string_literal)
  def t_BAD_STRING_LITERAL(self, t):
    msg = "String contains invalid escape code"
    self._error(msg, t)

  # Handle ordinal-related tokens in the right order:
  @TOKEN(octal_or_hex_ordinal_disallowed)
  def t_OCTAL_OR_HEX_ORDINAL_DISALLOWED(self, t):
    msg = "Octal and hexadecimal ordinal values not allowed"
    self._error(msg, t)

  @TOKEN(ordinal)
  def t_ORDINAL(self, t):
    return t

  @TOKEN(missing_ordinal_value)
  def t_BAD_ORDINAL(self, t):
    msg = "Missing ordinal value"
    self._error(msg, t)

  @TOKEN(identifier)
  def t_NAME(self, t):
    t.type = self.keyword_map.get(t.value, "NAME")
    return t

  # Collect comments in the lexer, but don't promote them to the parser.
  @TOKEN(comment)
  def t_COMMENT(self, t):
    # Clone the LexToken object to only hold the needed data and not other
    # references.
    comment = LexToken()
    for a in ('value', 'type', 'lineno', 'lexpos'):
      setattr(comment, a, getattr(t, a))

    t.lexer.lineno += t.value.count("\n")

    # Walk back to see if this is a comment on its own line.
    pos = t.lexpos - 1
    while pos >= 0:
      c = t.lexer.lexdata[pos]
      if c in t.lexer.lexignore:
        pos -= 1
      else:
        if c == '\n':
          self.line_comments.append(comment)
        else:
          self.suffix_comments.append(comment)
        return

    # Reached the beginning of the file, so this must be a line comment.
    self.line_comments.append(comment)

  def t_error(self, t):
    msg = "Illegal character %s" % repr(t.value[0])
    self._error(msg, t)