chromium/tools/grit/grit/node/message.py

# Copyright 2012 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

'''Handling of the <message> element.
'''

import re

from grit.node import base

from grit import clique
from grit import exception
from grit import lazy_re
from grit import tclib
from grit import util


# Matches exactly three dots ending a line or followed by whitespace.
_ELLIPSIS_PATTERN = lazy_re.compile(r'(?<!\.)\.\.\.(?=$|\s)')
_ELLIPSIS_SYMBOL = '\u2026'  # Ellipsis

# Finds whitespace at the start and end of a string which can be multiline.
_WHITESPACE = lazy_re.compile(r'(?P<start>\s*)(?P<body>.+?)(?P<end>\s*)\Z',
                              re.DOTALL | re.MULTILINE)

# <ph> placeholder elements should contain the special character formatters
# used to format <ph> element content.
# Android format.
_ANDROID_FORMAT = (r'%[1-9]+\$'
                   r'([-#+ 0,(]*)([0-9]+)?(\.[0-9]+)?'
                   r'([bBhHsScCdoxXeEfgGaAtT%n])')
# Chrome l10n format.
_CHROME_FORMAT = r'\$+\d'
# Windows EWT numeric and GRIT %s %d formats.
_OTHER_FORMAT = r'%[0-9sd]'

# Finds formatters that must be in a placeholder (<ph>) element.
_FORMATTERS = lazy_re.compile(
    '(%s)|(%s)|(%s)' % (_ANDROID_FORMAT, _CHROME_FORMAT, _OTHER_FORMAT))
_BAD_PLACEHOLDER_MSG = ('ERROR: Placeholder formatter found outside of <ph> '
                        'tag in message "%s" in %s.')
_INVALID_PH_CHAR_MSG = ('ERROR: Invalid format characters found in message '
                        '"%s" <ph> tag in %s.')

# Finds HTML tag tokens.
_HTMLTOKEN = lazy_re.compile(r'<[/]?[a-z][a-z0-9]*[^>]*>', re.I)

# Finds HTML entities.
_HTMLENTITY = lazy_re.compile(r'&[^\s]*;')


class MessageNode(base.ContentNode):
  '''A <message> element.'''

  # For splitting a list of things that can be separated by commas or
  # whitespace
  _SPLIT_RE = lazy_re.compile(r'\s*,\s*|\s+')

  def __init__(self):
    super().__init__()
    # Valid after EndParsing, this is the MessageClique that contains the
    # source message and any translations of it that have been loaded.
    self.clique = None

    # We don't send leading and trailing whitespace into the translation
    # console, but rather tack it onto the source message and any
    # translations when formatting them into RC files or what have you.
    self.ws_at_start = ''  # Any whitespace characters at the start of the text
    self.ws_at_end = ''  # --"-- at the end of the text

    # A list of "shortcut groups" this message is in.  We check to make sure
    # that shortcut keys (e.g. &J) within each shortcut group are unique.
    self.shortcut_groups_ = []

    # Formatter-specific data used to control the output of individual strings.
    # formatter_data is a space separated list of C preprocessor-style
    # definitions. Names without values are given the empty string value.
    # Example: "foo=5 bar baz=100"
    self.formatter_data = {}

    # Whether or not to convert ... -> U+2026 within Translate().
    self._replace_ellipsis = False

  def _IsValidChild(self, child):
    return isinstance(child, (PhNode))

  def _IsValidAttribute(self, name, value):
    if name not in [
        'name', 'offset', 'translateable', 'desc', 'meaning',
        'internal_comment', 'shortcut_groups', 'custom_type', 'validation_expr',
        'use_name_for_id', 'sub_variable', 'formatter_data',
        'is_accessibility_with_no_ui'
    ]:
      return False
    if (name in ('translateable', 'sub_variable') and
        value not in ['true', 'false']):
      return False
    return True

  def SetReplaceEllipsis(self, value):
    r'''Sets whether to replace ... with \u2026.
    '''
    self._replace_ellipsis = value

  def MandatoryAttributes(self):
    return ['name|offset']

  def DefaultAttributes(self):
    return {
        'custom_type': '',
        'desc': '',
        'formatter_data': '',
        'internal_comment': '',
        'is_accessibility_with_no_ui': 'false',
        'meaning': '',
        'shortcut_groups': '',
        'sub_variable': 'false',
        'translateable': 'true',
        'use_name_for_id': 'false',
        'validation_expr': '',
    }

  def HandleAttribute(self, attrib, value):
    base.ContentNode.HandleAttribute(self, attrib, value)
    if attrib != 'formatter_data':
      return

    # Parse value, a space-separated list of defines, into a dict.
    # Example: "foo=5 bar" -> {'foo':'5', 'bar':''}
    for item in value.split():
      name, _, val = item.partition('=')
      self.formatter_data[name] = val

  def GetTextualIds(self):
    '''
    Returns the concatenation of the parent's node first_id and
    this node's offset if it has one, otherwise just call the
    superclass' implementation
    '''
    if 'offset' not in self.attrs:
      return super().GetTextualIds()

    # we search for the first grouping node in the parents' list
    # to take care of the case where the first parent is an <if> node
    grouping_parent = self.parent
    import grit.node.empty
    while grouping_parent and not isinstance(grouping_parent,
                                             grit.node.empty.GroupingNode):
      grouping_parent = grouping_parent.parent

    assert 'first_id' in grouping_parent.attrs
    return [grouping_parent.attrs['first_id'] + '_' + self.attrs['offset']]

  def IsTranslateable(self):
    return self.attrs['translateable'] == 'true'

  def EndParsing(self):
    super().EndParsing()

    # Make the text (including placeholder references) and list of placeholders,
    # verify placeholder formats, then strip and store leading and trailing
    # whitespace and create the tclib.Message() and a clique to contain it.

    text = ''
    placeholders = []

    for item in self.mixed_content:
      if isinstance(item, str):
        # Not a <ph> element: fail if any <ph> formatters are detected.
        if _FORMATTERS.search(item):
          print(_BAD_PLACEHOLDER_MSG % (item, self.source))
          raise exception.PlaceholderNotInsidePhNode
        text += item
      else:
        # Extract the <ph> element components.
        presentation = item.attrs['name'].upper()
        text += presentation
        ex = ' '  # <ex> example element cdata if present.
        if len(item.children):
          ex = item.children[0].GetCdata()
        original = item.GetCdata()

        # Sanity check the <ph> element content.
        cdata = original
        # Replace all HTML tag tokens in cdata.
        match = _HTMLTOKEN.search(cdata)
        while match:
          cdata = cdata.replace(match.group(0), '_')
          match = _HTMLTOKEN.search(cdata)
        # Replace all HTML entities in cdata.
        match = _HTMLENTITY.search(cdata)
        while match:
          cdata = cdata.replace(match.group(0), '_')
          match = _HTMLENTITY.search(cdata)
        # Remove first matching formatter from cdata.
        match = _FORMATTERS.search(cdata)
        if match:
          cdata = cdata.replace(match.group(0), '')
        # Fail if <ph> special chars remain in cdata.
        if re.search(r'[%\$]', cdata):
          message_id = self.attrs['name'] + ' ' + original;
          print(_INVALID_PH_CHAR_MSG % (message_id, self.source))
          raise exception.InvalidCharactersInsidePhNode

        # Otherwise, accept this <ph> placeholder.
        placeholders.append(tclib.Placeholder(presentation, original, ex))

    m = _WHITESPACE.match(text)
    if m:
      self.ws_at_start = m.group('start')
      self.ws_at_end = m.group('end')
      text = m.group('body')

    self.shortcut_groups_ = self._SPLIT_RE.split(self.attrs['shortcut_groups'])
    self.shortcut_groups_ = [i for i in self.shortcut_groups_ if i != '']

    description_or_id = self.attrs['desc']
    if description_or_id == '' and 'name' in self.attrs:
      description_or_id = 'ID: %s' % self.attrs['name']

    assigned_id = None
    if self.attrs['use_name_for_id'] == 'true':
      assigned_id = self.attrs['name']
    message = tclib.Message(text=text, placeholders=placeholders,
                            description=description_or_id,
                            meaning=self.attrs['meaning'],
                            assigned_id=assigned_id)
    self.InstallMessage(message)

  def InstallMessage(self, message):
    '''Sets this node's clique from a tclib.Message instance.

    Args:
      message: A tclib.Message.
    '''
    self.clique = self.UberClique().MakeClique(message, self.IsTranslateable())
    for group in self.shortcut_groups_:
      self.clique.AddToShortcutGroup(group)
    if self.attrs['custom_type'] != '':
      self.clique.SetCustomType(util.NewClassInstance(self.attrs['custom_type'],
                                                      clique.CustomType))
    elif self.attrs['validation_expr'] != '':
      self.clique.SetCustomType(
        clique.OneOffCustomType(self.attrs['validation_expr']))

  def SubstituteMessages(self, substituter):
    '''Applies substitution to this message.

    Args:
      substituter: a grit.util.Substituter object.
    '''
    message = substituter.SubstituteMessage(self.clique.GetMessage())
    if message is not self.clique.GetMessage():
      self.InstallMessage(message)

  def GetCliques(self):
    return [self.clique] if self.clique else []

  def Translate(self, lang):
    '''Returns a translated version of this message.
    '''
    assert self.clique
    msg = self.clique.MessageForLanguage(lang,
                                         self.PseudoIsAllowed(),
                                         self.ShouldFallbackToEnglish()
                                         ).GetRealContent()
    if self._replace_ellipsis:
      msg = _ELLIPSIS_PATTERN.sub(_ELLIPSIS_SYMBOL, msg)
    # Always remove all byte order marks (\uFEFF) https://crbug.com/1033305
    msg = msg.replace('\uFEFF','')
    return msg.replace('[GRITLANGCODE]', lang)

  def NameOrOffset(self):
    key = 'name' if 'name' in self.attrs else 'offset'
    return self.attrs[key]

  def ExpandVariables(self):
    '''We always expand variables on Messages.'''
    return True

  def GetDataPackValue(self, lang, encoding):
    '''Returns a str represenation for a data_pack entry.'''
    message = self.ws_at_start + self.Translate(lang) + self.ws_at_end
    return util.Encode(message, encoding)

  def IsResourceMapSource(self):
    return True

  @staticmethod
  def Construct(parent, message, name, desc='', meaning='', translateable=True):
    '''Constructs a new message node that is a child of 'parent', with the
    name, desc, meaning and translateable attributes set using the same-named
    parameters and the text of the message and any placeholders taken from
    'message', which must be a tclib.Message() object.'''
    # Convert type to appropriate string
    translateable = 'true' if translateable else 'false'

    node = MessageNode()
    node.StartParsing('message', parent)
    node.HandleAttribute('name', name)
    node.HandleAttribute('desc', desc)
    node.HandleAttribute('meaning', meaning)
    node.HandleAttribute('translateable', translateable)

    items = message.GetContent()
    for ix, item in enumerate(items):
      if isinstance(item, str):
        # Ensure whitespace at front and back of message is correctly handled.
        if ix == 0:
          item = "'''" + item
        if ix == len(items) - 1:
          item = item + "'''"

        node.AppendContent(item)
      else:
        phnode = PhNode()
        phnode.StartParsing('ph', node)
        phnode.HandleAttribute('name', item.GetPresentation())
        phnode.AppendContent(item.GetOriginal())

        if len(item.GetExample()) and item.GetExample() != ' ':
          exnode = ExNode()
          exnode.StartParsing('ex', phnode)
          exnode.AppendContent(item.GetExample())
          exnode.EndParsing()
          phnode.AddChild(exnode)

        phnode.EndParsing()
        node.AddChild(phnode)

    node.EndParsing()
    return node


class PhNode(base.ContentNode):
  '''A <ph> element.'''

  def _IsValidChild(self, child):
    return isinstance(child, ExNode)

  def MandatoryAttributes(self):
    return ['name']

  def EndParsing(self):
    super().EndParsing()
    # We only allow a single example for each placeholder
    if len(self.children) > 1:
      raise exception.TooManyExamples()

  def GetTextualIds(self):
    # The 'name' attribute is not an ID.
    return []


class ExNode(base.ContentNode):
  '''An <ex> element.'''
  pass