chromium/tools/grit/grit/tool/update_resource_ids/parser.py

# Copyright 2019 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Structure-preserving parser for resource_ids files.

Naive usage of eval() destroys resource_ids structure. This module provides a
custom parser that annotates source byte ranges of "leaf values" (strings and
integers).
"""


# Single-character predicates used by Tokenize() when scanning.
def _isWhitespace(ch):
  """Returns whether |ch| is found in the blank set: space, tab, newline."""
  return ch in ' \t\n'


def _isNotNewline(ch):
  """Returns whether |ch| differs from the newline character."""
  return ch != '\n'


def _isDigit(ch):
  """Returns whether |ch| is a digit according to str.isdigit()."""
  return ch.isdigit()


def _RenderLineCol(data, pos):
  """Formats offset |pos| in text |data| as a 'line R, column C' string.

  Both numbers are 1-based. This only runs when reporting a fatal parse error,
  so a plain linear scan over |data[:pos]| is acceptable.
  """
  line = 1
  last_newline = -1  # Index of the latest '\n' seen before |pos|, if any.
  for idx in range(pos):
    if data[idx] == '\n':
      line += 1
      last_newline = idx
  # With no newline before |pos| this evaluates to |pos + 1|; otherwise it is
  # the distance from the last newline.
  column = pos - last_newline
  return 'line %d, column %d' % (line, column)


def Tokenize(data):
  """Generator to split |data| into tokens.

  Each token is specified as |(t, lo, hi)|:
  * |t|: Type, with '#' = space / comments, '0' = int, 'S' = string, 'E' = end,
         and other characters denoting themselves.
  * |lo, hi|: Token's range within |data| (as |data[lo:hi]|).

  An int is an optional '+' or '-' followed by one or more ASCII digits. A
  string starts at a single or double quote and runs to the next unescaped
  matching quote; a backslash escapes the character that follows it. A blank
  is a run of whitespace and '#' comments, each comment extending to the end
  of its line.

  Args:
    data: Text to tokenize.

  Yields:
    |(t, lo, hi)| tuples in source order, followed by a single 'E' sentinel
    whose |lo| and |hi| are both the final scan position.

  Raises:
    ValueError: On an unterminated string, a sign that is not followed by a
      digit, or a character that cannot start any token. The message carries
      the line and column of the offending position.
  """

  class ctx:  # Local context for mutable data shared across inner functions.
    pos = 0

  def _HasData():
    return ctx.pos < len(data)

  # str.isdigit() also accepts non-ASCII digits (superscripts, Arabic-Indic
  # digits, ...) that are not valid in an integer literal. Restricting to
  # '0'-'9' makes such characters fail below as "Unknown char", with a
  # position, instead of being emitted as an int token.
  def _IsAsciiDigit(ch):
    return '0' <= ch <= '9'

  # Returns True if ended by |not pred()|, or False if ended by EOF.
  def _EatWhile(pred):
    while _HasData():
      if pred(data[ctx.pos]):
        ctx.pos += 1
      else:
        return True
    return False

  # Consumes whitespace and comments. Returns the consumed |(lo, hi)| range,
  # or None if nothing was consumed.
  def _NextBlank():
    lo = ctx.pos
    while True:
      if not _EatWhile(_isWhitespace) or data[ctx.pos] != '#':
        break
      ctx.pos += 1  # Skip '#'.
      if not _EatWhile(_isNotNewline):
        break  # Comment ran to EOF.
      ctx.pos += 1  # Skip the newline ending the comment.
    return None if ctx.pos == lo else (lo, ctx.pos)

  # Consumes a quoted string, including both delimiters.
  def _EatString():
    lo = ctx.pos
    delim = data[ctx.pos]
    is_escaped = False
    ctx.pos += 1
    while _HasData():
      ch = data[ctx.pos]
      ctx.pos += 1
      if is_escaped:
        is_escaped = False
      elif ch == '\\':
        is_escaped = True
      elif ch == delim:
        return
    raise ValueError('Unterminated string at %s' % _RenderLineCol(data, lo))

  while _HasData():
    blank = _NextBlank()
    if blank is not None:
      yield ('#', blank[0], blank[1])
    if not _HasData():
      break
    lo = ctx.pos
    ch = data[ctx.pos]
    if ch in '{}[],:':
      ctx.pos += 1
      t = ch
    elif _IsAsciiDigit(ch):
      _EatWhile(_IsAsciiDigit)
      t = '0'
    elif ch in '+-':
      ctx.pos += 1
      if not _HasData() or not _IsAsciiDigit(data[ctx.pos]):
        raise ValueError('Invalid int at %s' % _RenderLineCol(data, lo))
      _EatWhile(_IsAsciiDigit)
      t = '0'
    elif ch in '"\'':
      _EatString()
      t = 'S'
    else:
      raise ValueError('Unknown char %s at %s' %
                       (repr(ch), _RenderLineCol(data, lo)))
    yield (t, lo, ctx.pos)
  yield ('E', ctx.pos, ctx.pos)  # End sentinel.


def _SkipBlanks(toks):
  """Generator that forwards tokens from |toks|, dropping '#' (blank) ones.

  Each item of |toks| is unpacked as |(t, lo, hi)|, the shape produced by
  Tokenize(), and re-yielded as a 3-tuple.
  """
  for kind, start, end in toks:
    if kind == '#':
      continue
    yield kind, start, end


class AnnotatedValue:
  """A leaf value (int or string) tagged with the range it was read from.

  Hashing and equality are delegated to the wrapped |val|, so an instance
  hashes like its raw value and compares against whatever |val| compares
  against.
  """

  def __init__(self, val, lo, hi):
    self.val, self.lo, self.hi = val, lo, hi

  def __eq__(self, other):
    return self.val == other

  def __hash__(self):
    return hash(self.val)

  def __repr__(self):
    span = '%d:%d' % (self.lo, self.hi)
    return '<%r@%s>' % (self.val, span)

  def __str__(self):
    span = '%d:%d' % (self.lo, self.hi)
    return '<%s@%s>' % (self.val, span)


class ResourceIdParser:
  """resource_ids parser that stores leaf values as AnnotatedValue.

  Algorithm: Use Tokenize() to split |data| into tokens and _SkipBlanks() to
  ignore comments and spacing, then apply a recursive parsing, using a one-token
  look-ahead for decision making.

  All parse errors are reported as ValueError whose message ends with the line
  and column of the offending token.
  """

  def __init__(self, data, tok_gen):
    """Sets up the parser.

    Args:
      data: Source text; token ranges index into it.
      tok_gen: Iterable of |(t, lo, hi)| tokens in the format produced by
        Tokenize(), ending with an 'E' token.
    """
    self.data = data
    self.state = []  # Not read by any method of this class.
    self.toks = _SkipBlanks(tok_gen)
    self.tok_look_ahead = None  # Token returned by _PeekTok(), not yet consumed.

  def _MakeErr(self, msg, pos):
    """Returns (does not raise) a ValueError for |msg| located at |pos|."""
    return ValueError(msg + ' at ' + _RenderLineCol(self.data, pos))

  def _PeekTok(self):
    """Returns the next token without consuming it."""
    if self.tok_look_ahead is None:
      self.tok_look_ahead = next(self.toks)
    return self.tok_look_ahead

  def _NextTok(self):
    """Consumes and returns the next token, draining the look-ahead first."""
    if self.tok_look_ahead is None:
      return next(self.toks)
    ret = self.tok_look_ahead
    self.tok_look_ahead = None
    return ret

  def _EatTok(self, exp_t, tok_name=None):
    """Consumes the next token and checks that its type is |exp_t|.

    Args:
      exp_t: Expected token type.
      tok_name: Optional display name for |exp_t| in the error message.

    Raises:
      ValueError: If the consumed token has a different type.
    """
    t, lo, _ = self._NextTok()
    if t != exp_t:
      raise self._MakeErr('Bad token: Expect \'%s\'' % (tok_name or exp_t), lo)

  def _NextIntOrString(self):
    """Consumes an int or string token and returns it as an AnnotatedValue.

    Raises:
      ValueError: If the next token is neither an int nor a string, or if its
        text is not a valid Python literal (e.g. an int with leading zeros or
        a string containing a raw newline).
    """
    t, lo, hi = self._NextTok()
    if t != '0' and t != 'S':
      raise self._MakeErr('Expected number or string', lo)
    # NOTE(review): eval() runs on text taken from |data|. What it sees is
    # whatever range |tok_gen| labelled '0' or 'S'; this class does not verify
    # that the range holds only a literal. Consider ast.literal_eval() if
    # |data| or |tok_gen| may come from an untrusted source.
    try:
      value = eval(self.data[lo:hi])
    except (SyntaxError, ValueError):
      # Report malformed literals like every other parse error, with a
      # position, instead of leaking eval()'s bare exception.
      raise self._MakeErr('Invalid number or string literal', lo)
    return AnnotatedValue(value, lo, hi)

  def _EatSep(self, end_ch):
    """Consumes separator ',' and returns whether |end_ch| is encountered.

    A ',' directly followed by |end_ch| (trailing comma) is accepted. The
    |end_ch| token itself is only peeked, never consumed.

    Raises:
      ValueError: If the next token is neither ',' nor |end_ch|.
    """
    t, lo, _ = self._PeekTok()
    if t == ',':
      self._EatTok(',')
      # Allow trailing ','.
      t, _, _ = self._PeekTok()
      return t == end_ch
    elif t == end_ch:
      return True
    else:
      raise self._MakeErr('Expect \',\' or \'%s\'' % end_ch, lo)

  def _NextList(self):
    """Parses '[' obj, obj, ... ']' and returns the items as a list."""
    self._EatTok('[')
    ret = []
    t, _, _ = self._PeekTok()
    if t != ']':
      while True:
        ret.append(self._NextObject())
        if self._EatSep(']'):
          break
    self._EatTok(']')
    return ret

  def _NextDict(self):
    """Parses '{' key: obj, ... '}' and returns the entries as a dict.

    Keys must be ints or strings and are stored as AnnotatedValue. A repeated
    key overwrites the value stored for the earlier occurrence.
    """
    self._EatTok('{')
    ret = {}
    t, _, _ = self._PeekTok()
    if t != '}':
      while True:
        k = self._NextIntOrString()
        self._EatTok(':')
        v = self._NextObject()
        ret[k] = v
        if self._EatSep('}'):
          break
    self._EatTok('}')
    return ret

  def _NextObject(self):
    """Parses one value, dispatching on the type of the look-ahead token.

    Returns:
      A list, a dict, or an AnnotatedValue.

    Raises:
      ValueError: If the look-ahead token cannot start a value.
    """
    t, lo, _ = self._PeekTok()
    if t == '[':
      return self._NextList()
    elif t == '{':
      return self._NextDict()
    elif t == '0' or t == 'S':
      return self._NextIntOrString()
    else:
      raise self._MakeErr('Bad token: Type = %s' % t, lo)

  def Parse(self):
    """Parses the whole token stream and returns the root object.

    Raises:
      ValueError: On any syntax error, including tokens after the root object.
    """
    root_obj = self._NextObject()
    self._EatTok('E', 'EOF')
    return root_obj