bcanalyzer.py | Explore in Territory

#!/usr/bin/env python3
# Copyright 2018 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Runs bcanalyzer to extract data from LLVM Bitcode (BC) files.

IsBitcodeFile():
  Reads the magic header of a file to quickly decide whether it is a BC file.

ParseTag():
  Heuristically parses a single-line tag from bcanalyzer dump (exported for
  testing).

RunBcAnalyzerOnIntermediates():
  BulkForkAndCall() target: Given BC file [paths], runs (llvm-)bcanalyzer on
  each path, parses the output, extracts strings, and returns {path: [strings]}.

This file can also be run stand-alone in order to test out the logic on smaller
sample sizes.
"""

import argparse
import os
import re
import subprocess

import parallel
import path_util


# Maximum number of bytes per character for an array to be treated as a string.
# 4-byte / 32-bit strings are rare and are easily confused with 32-bit int
# arrays, so by default only strings with up to 2-byte / 16-bit characters are
# accepted.
_CHAR_WIDTH_LIMIT = 2

# Matches a single "=<digits>" attribute value; group 1 holds the digits.
_RE_SPLIT = re.compile(r'=(\d+)')

# Child tags of <TYPE_BLOCK_ID> that must not be counted as types:
# - <NUMENTRY> is metadata.
# - <STRUCT_NAME> is joined with the tag that follows it (<STRUCT_NAMED> or
#   other), and the pair is counted as a single type entry.
_NON_TYPE_TAGS = {'NUMENTRY', 'STRUCT_NAME'}

# Tag types are bit flags, so that a self-closing tag counts as both opening
# and closing: bit 0 (1) => Opening tag, bit 1 (2) => Closing tag.
OPENING_TAG = 1
CLOSING_TAG = 2
SELF_CLOSING_TAG = OPENING_TAG | CLOSING_TAG


def _IsOpeningTag(tag_type):
  """Returns True iff |tag_type| has the OPENING_TAG bit set.

  This holds for both OPENING_TAG and SELF_CLOSING_TAG.
  """
  return bool(tag_type & OPENING_TAG)


def _IsClosingTag(tag_type):
  """Returns True iff |tag_type| has the CLOSING_TAG bit set.

  This holds for both CLOSING_TAG and SELF_CLOSING_TAG.
  """
  return bool(tag_type & CLOSING_TAG)


def IsBitcodeFile(path):
  """Returns whether the file at |path| starts with the bitcode magic bytes.

  Only the first 4 bytes are read. A file that cannot be opened or read
  (IOError) is reported as not being a bitcode file.
  """
  magic = b'BC\xc0\xde'
  try:
    with open(path, 'rb') as bc_file:
      header = bc_file.read(len(magic))
  except IOError:
    return False
  return header == magic


def ParseTag(line):
  """Heuristically parses a single-line tag from bcanalyzer dump.

  The input is machine-generated, so "good enough" parsing logic that favors
  simplicity suffices. For example, '</FOO/>' is accepted.

  Args:
    line: Stripped line (no indent, no trailing new line) that may hold a
      single-line tag, possibly followed by trailing text.

  Returns:
    (tag_type, tag, attrib_pos) if successful, else (None, None, None).
    tag_type: One of {OPENING_TAG, CLOSING_TAG, SELF_CLOSING_TAG}.
    tag: The tag name.
    attrib_pos: Position in |line| to start parsing attributes.

  Examples:
    <TYPE_BLOCK_ID NumWords=103 BlockCodeSize=4>
        ==> (OPENING_TAG, 'TYPE_BLOCK_ID', 14).
    <ARRAY abbrevid=9 op0=1 op1=7/> Trailing text!
        ==> (SELF_CLOSING_TAG, 'ARRAY', 6).
    </TYPE_BLOCK_ID>
        ==> (CLOSING_TAG, 'TYPE_BLOCK_ID', 15).
  """
  no_tag = (None, None, None)
  if len(line) < 2 or not line.startswith('<'):
    return no_tag

  if line[1] == '/':
    kind, name_start = CLOSING_TAG, 2
  else:
    kind, name_start = OPENING_TAG, 1

  # Advance past the tag name, which consists of alphanumerics and '_'.
  line_len = len(line)
  name_end = name_start
  while name_end < line_len and (line[name_end].isalnum()
                                 or line[name_end] == '_'):
    name_end += 1

  # Reject: name runs to the end of line, name is empty, or the character that
  # ends the name is not one of the expected delimiters.
  if name_end == line_len or name_end == name_start:
    return no_tag
  if line[name_end] not in ' >/':
    return no_tag

  # The tag must be terminated on this line; "/>" marks it as self-closing,
  # regardless of whether it started with "</".
  close_pos = line.find('>', name_end)
  if close_pos < 0:
    return no_tag
  if line[close_pos - 1] == '/':
    kind = SELF_CLOSING_TAG
  return (kind, line[name_start:name_end], name_end)


def _ParseOpItems(line, pos):
  """Heuristically yields the op0=# op1=# ... values of a single-line tag.

  The search starts at |pos| and begins at the first ' op' found, so a leading
  'abbrevid=#' is skipped. Values are taken up to the tag's closing '>'. Being
  a generator, nothing is parsed (and no ValueError for a missing ' op' or '>'
  is raised) until the first value is requested.

  Examples:
    <SETTYPE abbrevid=4 op0=42/>
            ^ pos = 8
        ==> iter([42]).
    <CSTRING abbrevid=11 op0=84 op1=101 op2=115 op3=116 op4=56 op5=97/>
            ^ pos = 8
        ==> iter([84, 101, 115, 116, 56, 97]).
    <STRING abbrevid=9 op0=1 op1=0 op2=0 op3=1 op4=1 op5=0/>
           ^ pos = 7
        ==> iter([1, 0, 0, 1, 1, 0]).
    <DATA op0=8412 op1=101 op2=1150 op3=116 op4=5200 op5=98 op6=0/>
         ^ pos = 5
        ==> iter([8412, 101, 1150, 116, 5200, 98, 0]).
  """
  ops_begin = line.index(' op', pos)
  ops_end = line.index('>', ops_begin)
  yield from map(int, _RE_SPLIT.findall(line[ops_begin:ops_end]))


def _UnpackUint16ListToBytes(items):
  """Yields each value in |items| as its 2 low-order bytes, little-endian."""
  for value in items:
    # Mask first so that bits above the low 16 are dropped rather than rejected.
    yield from (value & 0xFFFF).to_bytes(2, 'little')


def _UnpackUint32ListToBytes(items):
  """Yields each value in |items| as its 4 low-order bytes, little-endian."""
  for value in items:
    # Mask first so that bits above the low 32 are dropped rather than rejected.
    yield from (value & 0xFFFFFFFF).to_bytes(4, 'little')


class _BcIntArrayType:
  """The specs of an integer array type: element count and element width."""

  # Maps element width (in bytes) to a callable that turns an iterable of ints
  # into an iterable of byte values. Width 1 needs no splitting.
  _UNPACKER_MAP = {
    1: iter,
    2: _UnpackUint16ListToBytes,
    4: _UnpackUint32ListToBytes
  }

  def __init__(self, length, width):
    self.length = length  # Number of elements in the array.
    self.width = width  # Number of bytes per element.

  def ParseOpItemsAsBytes(self, line, attrib_pos, add_null_at_end):
    """Reads op0=# op1=# ... values and returns them as bytes.

    Interprets each op0=# op1=# ... value as a |self.width|-byte integer, splits
    it into its component bytes (little-endian), and returns the concatenation
    as a bytes object.

    Args:
      line: Stripped line of single-line tag with op0=# op1=# ... data.
      attrib_pos: Position in |line| where attribute list starts.
      add_null_at_end: Whether to append |b'\x00' * self.width|.

    Raises:
      KeyError: If |self.width| has no entry in |_UNPACKER_MAP|.
      AssertionError: If the resulting size is not |length * width| bytes.
    """
    to_byte_values = self._UNPACKER_MAP[self.width]
    data = bytes(to_byte_values(_ParseOpItems(line, attrib_pos)))
    if add_null_at_end:
      data = data + b'\x00' * self.width
    # Rather stringent check to ensure exact size match.
    assert len(data) == self.length * self.width
    return data


class _BcTypeInfo:
  """Stateful parser of <TYPE_BLOCK_ID>, specialized for integer arrays."""

  # Sample input, with the type id that each child tag receives:
  #
  # <TYPE_BLOCK_ID NumWords=103 BlockCodeSize=4>
  #   <NUMENTRY op0=9/>                  # Type ids should be in [0, 8].
  #   <INTEGER op0=8/>                   # Type id = 0: int8.
  #   <POINTER abbrevid=4 op0=0 op1=0/>  # Type id = 1: Pointer to type id 0
  #                                      #    ==> int8*.
  #   <ARRAY abbrevid=9 op0=4 op1=0/>    # Type id = 2: Array with 4 elements
  #                                      # of type id 0 ==> int8[4]
  #   <STRUCT_NAME op0=115 op1=116 op2=114/>  # Joins next Tag.
  #   <STRUCT_NAMED abbrevid=8 op0=0 op1=1/>  # Type id = 3: Struct (unused).
  #   <FUNCTION abbrevid=5 op0=0 op1=12/>  # Type id = 4: Function (unused).
  #   <INTEGER op0=16/>                  # Type id = 5: int16.
  #   <POINTER abbrevid=4 op0=5 op1=0/>  # Type id = 6: Pointer to type id 5
  #                                      #    ==> int16*.
  #   <INTEGER op0=32/>                  # Type id = 7: int32.
  #   <ARRAY abbrevid=9 op0=4 op1=5/>    # Type id = 8: Array with 4 elements
  #                                      # of type id 5 ==> int16[4]
  # </TYPE_BLOCK_ID>

  def __init__(self):
    # Type id to assign to the next type tag; auto-increments.
    self.cur_type_id = 0
    # {type id of an integer type: number of bits}.
    self.int_types = {}
    # {type id of an integer array type: _BcIntArrayType}.
    self.int_array_types = {}

  def Feed(self, line, tag, attrib_pos):
    """Parses a single-line tag and stores integer and integer array types.

    Every tag outside |_NON_TYPE_TAGS| consumes one type id, whether or not
    anything is recorded for it.

    Args:
      line: Stripped line of single-line tag with op0=# op1=# ... data.
      tag: The tag type in |line| (child tag of <TYPE_BLOCK_ID>).
      attrib_pos: Position in |line| where attribute list starts.
    """
    if tag in _NON_TYPE_TAGS:
      return
    type_id = self.cur_type_id
    if tag == 'INTEGER':
      # op0 is the bit width.
      self.int_types[type_id] = next(_ParseOpItems(line, attrib_pos))
    elif tag == 'ARRAY':
      # op0 is the element count, op1 is the element type id.
      num_items, elem_type_id = list(_ParseOpItems(line, attrib_pos))
      elem_bits = self.int_types.get(elem_type_id)
      # |elem_bits| is None for arrays of non-int elements; those are ignored.
      if elem_bits is not None:
        self.int_array_types[type_id] = _BcIntArrayType(
            num_items, elem_bits // 8)
    self.cur_type_id = type_id + 1

  def GetArrayType(self, idx):
    """Returns the _BcIntArrayType for type id |idx|, or None if not stored."""
    return self.int_array_types.get(idx)


def _ParseBcAnalyzer(lines):
  """A generator to extract bytes() from bcanalyzer dump of a BC file.

  Args:
    lines: Iterable of lines of bcanalyzer dump output.

  Yields:
    (array_type, data) per accepted <CSTRING>, <STRING> or <DATA> tag, where
    array_type is the _BcIntArrayType in effect and data is a bytes object.
  """

  # Sample input:
  #
  # ...
  # <TYPE_BLOCK_ID NumWords=103 BlockCodeSize=4>
  #    ... (See _BcTypeInfo, which parses this part)
  # </TYPE_BLOCK_ID>
  # ...
  # <CONSTANTS_BLOCK NumWords=93 BlockCodeSize=4>
  #   <SETTYPE abbrevid=4 op0=1/>  # Current type id := 1 ==> int8*.
  #   <CE_INBOUNDS_GEP op0=3 op1=4 op2=1 op3=12 op4=57 op5=12 op6=57/>
  #   <SETTYPE abbrevid=4 op0=2/>  # Current type id := 2 ==> int8[4].
  #   <CSTRING abbrevid=11 op0=70 op1=111 op2=111/> record string = 'Foo'
  #   <STRING abbrevid=11 op0=70 op1=111 op2=111 op3=1/>  # {'F','o','o',1}.
  #   <SETTYPE abbrevid=6 op0=7/>    # Current type id := 7 ==> int32.
  #   <INTEGER abbrevid=5 op0=2000/> # Stores 1000.
  #   <INTEGER abbrevid=5 op0=2001/> # Stores -1000.
  #   <SETTYPE abbrevid=4 op0=8/>    # Current type id := 8 ==> int16[4].
  #   <NULL/>
  #   <DATA abbrevid=11 op0=8400 op1=10100 op2=11500 op3=0/>
  # </CONSTANTS_BLOCK>
  # ...

  # Notes:
  # - Only the first <TYPE_BLOCK_ID> and the first <CONSTANTS_BLOCK> are parsed.
  # - <CONSTANTS_BLOCK> is stateful: A "current type id" exists, and is set by
  #   <SETTYPE>, whose op0= refers to a type id.
  #   - Array lengths come from the corresponding <ARRAY> type entry.
  # - Strings / arrays are in <CSTRING>, <STRING>, and <DATA>.
  #   - abbrevid=# is redundant (repeats tag type) and unused.
  #   - Character data are stored in op0=# op1=# ..., one per character. These
  #     values should fit in the proper range, and can be fairly large.
  #   - <CSTRING> has implicit 0 at end.
  #   - Data lengths agree with the length in the matching <ARRAY> entry.
  #   - "record string" text is not very useful: It only appears if all
  #     characters are printable.
  # - Signed vs. unsigned types are undistinguished.
  #   - In <INTEGER>, the op0= value encodes x as
  #     2 * abs(x) + (1 if x < 0 else 0); see the 2000 / 2001 sample above.
  #   - In <ARRAY> of int, values are coerced to unsigned type.
  # - Strings and int arrays are undistinguished.
  #   - <CSTRING>: If an uint8 array happens to end with 0, then this gets used!
  # - Arrays (or integers) of all-0 appear as <NULL/>. Presumably this gets
  #   placed into .bss section.

  # Parser states: outside any block of interest, inside the first
  # <TYPE_BLOCK_ID>, and inside the first <CONSTANTS_BLOCK>.
  OUTSIDE, IN_TYPE_BLOCK, IN_CONST_BLOCK = range(3)
  string_tags = ('CSTRING', 'STRING', 'DATA')

  phase = OUTSIDE
  types = None  # _BcTypeInfo, created on reaching the first <TYPE_BLOCK_ID>.
  array_type = None  # _BcIntArrayType picked by the latest <SETTYPE>, or None.

  for raw_line in lines:
    text = raw_line.lstrip()
    tag_type, tag, attrib_pos = ParseTag(text)
    if tag_type is None:
      continue

    if phase == OUTSIDE:
      if not _IsOpeningTag(tag_type):
        continue
      if tag == 'TYPE_BLOCK_ID':
        # Later <TYPE_BLOCK_ID>s are ignored.
        if types is None:
          types = _BcTypeInfo()
          phase = IN_TYPE_BLOCK
      elif tag == 'CONSTANTS_BLOCK':
        # Constants are only meaningful once types are known.
        if types is not None:
          phase = IN_CONST_BLOCK
      continue

    if phase == IN_TYPE_BLOCK:
      if _IsClosingTag(tag_type) and tag == 'TYPE_BLOCK_ID':
        phase = OUTSIDE
      else:
        types.Feed(text, tag, attrib_pos)
      continue

    # From here on, phase == IN_CONST_BLOCK.
    if _IsClosingTag(tag_type) and tag == 'CONSTANTS_BLOCK':
      # Skip remaining data, including subsequent <CONSTANTS_BLOCK>s.
      break
    if tag == 'SETTYPE':
      try:
        type_id = next(_ParseOpItems(text, attrib_pos))  # op0.
      except StopIteration:
        # <SETTYPE> without a value: give up on the rest of the input.
        return
      array_type = types.GetArrayType(type_id)
      continue
    # Exclude 32-bit / 4-byte strings since they're rarely used, and are likely
    # confused with 32-bit int arrays.
    if not array_type or array_type.width > _CHAR_WIDTH_LIMIT:
      continue
    if tag in string_tags:
      data = array_type.ParseOpItemsAsBytes(text, attrib_pos, tag == 'CSTRING')
      yield (array_type, data)


class _BcAnalyzerRunner:
  """Helper to run bcanalyzer and extract output lines."""

  def __init__(self, output_directory):
    """Prepares the bcanalyzer command line.

    Args:
      output_directory: Directory used as the working directory (cwd) when
        running bcanalyzer.
    """
    # Base command; the file to analyze is appended in RunOnFile().
    self._args = [
        path_util.GetBcAnalyzerPath(), '--dump', '--disable-histogram'
    ]
    self._output_directory = output_directory

  def RunOnFile(self, obj_file):
    """Runs bcanalyzer on |obj_file| and returns its stdout as a list of lines.

    Args:
      obj_file: Path of the file to analyze, passed as the last argument.

    Returns:
      List of output lines (str), without line terminators.

    Raises:
      subprocess.CalledProcessError: If the tool exits with non-zero status.
    """
    raw_output = subprocess.check_output(
        self._args + [obj_file], cwd=self._output_directory)
    # Decode leniently: a strict 'ascii' decode raises UnicodeDecodeError on
    # any byte >= 0x80 anywhere in the output (presumably possible when the
    # tool echoes a non-ASCII file path), which would abort the whole run.
    # Pure-ASCII output decodes to exactly the same text as before.
    return raw_output.decode('utf-8', errors='replace').splitlines()


# This is a target for BulkForkAndCall().
def RunBcAnalyzerOnIntermediates(target, output_directory):
  """Calls bcanalyzer and returns encoded map from path to strings.

  Args:
    target: A list of BC file paths.
    output_directory: Directory passed to _BcAnalyzerRunner.

  Returns:
    The result of parallel.EncodeDictOfLists() on {path: [strings]}.
  """
  assert isinstance(target, list)
  bc_runner = _BcAnalyzerRunner(output_directory)

  def _ExtractStrings(bc_path):
    # Keep only the data of each (array type, data) pair.
    return [data for _, data in _ParseBcAnalyzer(bc_runner.RunOnFile(bc_path))]

  strings_by_path = {bc_path: _ExtractStrings(bc_path) for bc_path in target}
  # Escape strings by repr() so there will be no special characters to interfere
  # parallel.EncodeDictOfLists() and decoding.
  return parallel.EncodeDictOfLists(strings_by_path, value_transform=repr)


def main():
  """Stand-alone entry point: prints the strings found in each given object.

  Flags:
    --output-directory: Working directory for bcanalyzer, and the base used to
      print relative paths (default '.').
    --char-width-limit: If given, overrides |_CHAR_WIDTH_LIMIT|.
    objects: One or more files to analyze.
  """
  global _CHAR_WIDTH_LIMIT

  arg_parser = argparse.ArgumentParser()
  arg_parser.add_argument('--output-directory', default='.')
  arg_parser.add_argument('--char-width-limit', type=int)
  arg_parser.add_argument('objects', type=os.path.realpath, nargs='+')
  options = arg_parser.parse_args()

  base_path = os.path.normpath(options.output_directory)
  bc_runner = _BcAnalyzerRunner(options.output_directory)
  if options.char_width_limit is not None:
    _CHAR_WIDTH_LIMIT = options.char_width_limit

  for obj_path in options.objects:
    print('File: %s' % os.path.relpath(obj_path, base_path))
    for array_type, data in _ParseBcAnalyzer(bc_runner.RunOnFile(obj_path)):
      bits_per_char = array_type.width * 8
      print('    char%d[%d]: %r' % (bits_per_char, array_type.length, data))
    print('')


if __name__ == '__main__':
  main()
chromium/tools/binary_size/libsupersize/bcanalyzer.py