#!/usr/bin/env python3
# Copyright 2018 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Runs bcanalyzer to extract data from LLVM Bitcode (BC) files.
IsBitcodeFile():
Reads the magic header of a file to quickly decide whether it is a BC file.
ParseTag():
Heuristically parses a single-line tag from bcanalyzer dump (exported for
testing).
RunBcAnalyzerOnIntermediates():
BulkForkAndCall() target: Given BC file [paths], runs (llvm-)bcanalyzer on
each path, parses the output, extracts strings, and returns {path: [strings]}.
This file can also be run stand-alone in order to test out the logic on smaller
sample sizes.
"""
import argparse
import os
import re
import subprocess
import parallel
import path_util
# Largest accepted element size, in bytes, for arrays treated as strings.
# 4-byte / 32-bit strings are rare and are easily confused with 32-bit int
# arrays, so by default only 1-byte and 2-byte elements are accepted.
_CHAR_WIDTH_LIMIT = 2

# Captures the digits of each '=<number>' attribute value in a tag.
_RE_SPLIT = re.compile(r'=(\d+)')

# Children of <TYPE_BLOCK_ID> that do not consume a type id:
# - <NUMENTRY> is meta data.
# - <STRUCT_NAME> is joined with the tag that follows it (<STRUCT_NAMED> or
#   other), and the pair is counted as a single type entry.
_NON_TYPE_TAGS = {'NUMENTRY', 'STRUCT_NAME'}

# Tag types are bit flags: bit 1 marks an opening tag, bit 2 marks a closing
# tag, and a self-closing tag has both bits set.
OPENING_TAG = 1
CLOSING_TAG = 2
SELF_CLOSING_TAG = OPENING_TAG | CLOSING_TAG
def _IsOpeningTag(tag_type):
  """Returns a truthy int iff |tag_type| has the opening-tag bit (1) set."""
  opening_bit = 1
  return tag_type & opening_bit
def _IsClosingTag(tag_type):
  """Returns a truthy int iff |tag_type| has the closing-tag bit (2) set."""
  closing_bit = 2
  return tag_type & closing_bit
def IsBitcodeFile(path):
  """Returns whether the file at |path| starts with the 4-byte BC magic.

  Only the first 4 bytes are read. A file that cannot be opened or read is
  reported as not being a BC file.
  """
  magic = b'BC\xc0\xde'
  try:
    with open(path, 'rb') as bc_file:
      header = bc_file.read(len(magic))
  except IOError:
    return False
  return header == magic
def ParseTag(line):
  """Heuristically parses the tag that starts a line of bcanalyzer dump.

  The input is machine-generated, so the parser favors simplicity over
  strictness. For example, '</FOO/>' is accepted.

  Examples:
    <TYPE_BLOCK_ID NumWords=103 BlockCodeSize=4>
      ==> (OPENING_TAG, 'TYPE_BLOCK_ID', 14).
    <ARRAY abbrevid=9 op0=1 op1=7/> Trailing text!
      ==> (SELF_CLOSING_TAG, 'ARRAY', 6).
    </TYPE_BLOCK_ID>
      ==> (CLOSING_TAG, 'TYPE_BLOCK_ID', 15).

  Args:
    line: Stripped line (no indent, no trailing new line) that may start with
      a single-line tag, optionally followed by trailing text.

  Returns:
    (tag_type, tag, attrib_pos) if successful, else (None, None, None):
      tag_type: One of {OPENING_TAG, CLOSING_TAG, SELF_CLOSING_TAG}.
      tag: The tag name.
      attrib_pos: Position in |line| to start parsing attributes, i.e., the
        index just past the tag name.
  """
  no_match = (None, None, None)
  if len(line) < 2 or line[0] != '<':
    return no_match

  if line[1] == '/':
    tag_type, name_start = CLOSING_TAG, 2
  else:
    tag_type, name_start = OPENING_TAG, 1

  # Advance over the tag name, which consists of alphanumerics and '_'.
  name_end = name_start
  line_len = len(line)
  while name_end < line_len and (line[name_end].isalnum()
                                 or line[name_end] == '_'):
    name_end += 1

  # The name must be non-empty and be followed by one of ' ', '>', or '/'.
  if name_end == line_len or name_end == name_start:
    return no_match
  if line[name_end] not in ' >/':
    return no_match

  # The tag must be terminated by '>' on the same line.
  tag_close = line.find('>', name_end)
  if tag_close < 0:
    return no_match

  # '/>' marks a self-closing tag, regardless of how the tag started.
  if line[tag_close - 1] == '/':
    tag_type = SELF_CLOSING_TAG
  return (tag_type, line[name_start:name_end], name_end)
def _ParseOpItems(line, pos):
  """Heuristically yields the op0=# op1=# ... values of a single-line tag.

  The scan starts at the first ' op' found at or after |pos| and stops at the
  next '>', so a leading 'abbrevid=#' is skipped.

  Examples:
    <SETTYPE abbrevid=4 op0=42/>, pos = 8
      ==> iter([42]).
    <CSTRING abbrevid=11 op0=84 op1=101 op2=115 op3=116 op4=56 op5=97/>, pos = 8
      ==> iter([84, 101, 115, 116, 56, 97]).
    <STRING abbrevid=9 op0=1 op1=0 op2=0 op3=1 op4=1 op5=0/>, pos = 7
      ==> iter([1, 0, 0, 1, 1, 0]).
    <DATA op0=8412 op1=101 op2=1150 op3=116 op4=5200 op5=98 op6=0/>, pos = 5
      ==> iter([8412, 101, 1150, 116, 5200, 98, 0]).

  Args:
    line: Stripped line containing a single-line tag.
    pos: Position in |line| from which to look for the op list.

  Yields:
    Each op value, as an int, in order of appearance.

  Raises:
    ValueError: On first iteration, if |line| has no ' op' at or after |pos|,
      or no '>' after it.
  """
  ops_begin = line.index(' op', pos)
  ops_end = line.index('>', ops_begin)
  op_text = line[ops_begin:ops_end]
  yield from (int(match.group(1)) for match in _RE_SPLIT.finditer(op_text))
def _UnpackUint16ListToBytes(items):
  """Yields each value in |items| as 2 byte values, low byte first."""
  for value in items:
    for shift in (0, 8):
      yield (value >> shift) & 0xFF
def _UnpackUint32ListToBytes(items):
  """Yields each value in |items| as 4 byte values, low byte first."""
  for value in items:
    for shift in (0, 8, 16, 24):
      yield (value >> shift) & 0xFF
class _BcIntArrayType:
  """Describes an integer array type: element count and element byte width."""

  # Maps element width (in bytes) to a callable that turns an iterable of
  # ints into an iterable of byte values. Width 1 needs no splitting.
  _UNPACKER_MAP = {
      1: iter,
      2: _UnpackUint16ListToBytes,
      4: _UnpackUint32ListToBytes,
  }

  def __init__(self, length, width):
    """Stores the array specs.

    Args:
      length: Number of elements in the array.
      width: Number of bytes per element.
    """
    self.length = length
    self.width = width

  def ParseOpItemsAsBytes(self, line, attrib_pos, add_null_at_end):
    """Reads op0=# op1=# ... values and returns them as bytes.

    Each op value is interpreted as a |self.width|-byte integer and split into
    its component bytes (little-endian).

    Args:
      line: Stripped line of single-line tag with op0=# op1=# ... data.
      attrib_pos: Position in |line| where attribute list starts.
      add_null_at_end: Whether to append |b'\\x00' * self.width|.

    Returns:
      A bytes object of exactly |self.length * self.width| bytes.

    Raises:
      KeyError: If |self.width| has no entry in |_UNPACKER_MAP|.
      AssertionError: If the data size does not match the array specs.
    """
    to_byte_values = _BcIntArrayType._UNPACKER_MAP[self.width]
    raw = bytes(to_byte_values(_ParseOpItems(line, attrib_pos)))
    payload = raw + (b'\x00' * self.width if add_null_at_end else b'')
    # Rather stringent check to ensure exact size match.
    assert len(payload) == self.length * self.width
    return payload
class _BcTypeInfo:
  """Stateful parser of <TYPE_BLOCK_ID>, specialized for integer arrays.

  Children of <TYPE_BLOCK_ID> are fed one at a time, in order. Each one
  (except those in |_NON_TYPE_TAGS|) takes the next type id. Only integer
  types and arrays of integers are recorded.
  """

  # Sample input, with the type id that each entry receives:
  #
  # <TYPE_BLOCK_ID NumWords=103 BlockCodeSize=4>
  #   <NUMENTRY op0=9/>                       # Type ids should be in [0, 8].
  #   <INTEGER op0=8/>                        # Type id = 0: int8.
  #   <POINTER abbrevid=4 op0=0 op1=0/>       # Type id = 1: Pointer to type
  #                                           #   id 0 ==> int8*.
  #   <ARRAY abbrevid=9 op0=4 op1=0/>         # Type id = 2: Array with 4
  #                                           #   elements of type id 0
  #                                           #   ==> int8[4].
  #   <STRUCT_NAME op0=115 op1=116 op2=114/>  # Joins next tag.
  #   <STRUCT_NAMED abbrevid=8 op0=0 op1=1/>  # Type id = 3: Struct (unused).
  #   <FUNCTION abbrevid=5 op0=0 op1=12/>     # Type id = 4: Function (unused).
  #   <INTEGER op0=16/>                       # Type id = 5: int16.
  #   <POINTER abbrevid=4 op0=5 op1=0/>       # Type id = 6: Pointer to type
  #                                           #   id 5 ==> int16*.
  #   <INTEGER op0=32/>                       # Type id = 7: int32.
  #   <ARRAY abbrevid=9 op0=4 op1=5/>         # Type id = 8: Array with 4
  #                                           #   elements of type id 5
  #                                           #   ==> int16[4].
  # </TYPE_BLOCK_ID>

  def __init__(self):
    # Type id that the next fed type entry will receive.
    self.cur_type_id = 0
    # {type id of an integer type: number of bits}.
    self.int_types = {}
    # {type id of an integer array type: _BcIntArrayType}.
    self.int_array_types = {}

  def Feed(self, line, tag, attrib_pos):
    """Consumes one child tag of <TYPE_BLOCK_ID>, recording int (array) types.

    Args:
      line: Stripped line of single-line tag with op0=# op1=# ... data.
      tag: The tag name in |line| (child tag of <TYPE_BLOCK_ID>).
      attrib_pos: Position in |line| where attribute list starts.
    """
    if tag in _NON_TYPE_TAGS:
      return  # Does not take a type id.

    type_id = self.cur_type_id
    if tag == 'INTEGER':
      # op0 is the bit width.
      self.int_types[type_id] = next(_ParseOpItems(line, attrib_pos))
    elif tag == 'ARRAY':
      # op0 is the element count, op1 is the element type id.
      num_elements, element_type_id = tuple(_ParseOpItems(line, attrib_pos))
      element_bits = self.int_types.get(element_type_id)
      # |element_bits| is None for arrays of non-int elements; skip those.
      if element_bits is not None:
        self.int_array_types[type_id] = _BcIntArrayType(
            num_elements, element_bits // 8)
    self.cur_type_id = type_id + 1

  def GetArrayType(self, idx):
    """Returns the _BcIntArrayType with type id |idx|, or None if absent."""
    return self.int_array_types.get(idx)
def _ParseBcAnalyzer(lines):
  """A generator to extract bytes() from bcanalyzer dump of a BC file.

  Parses the *first* <TYPE_BLOCK_ID> to learn the integer array types, then
  the *first* <CONSTANTS_BLOCK> that follows to extract array data.

  Args:
    lines: Iterable of lines of bcanalyzer dump output.

  Yields:
    (array_type, data) for each accepted <CSTRING>, <STRING>, or <DATA> entry:
      array_type: The _BcIntArrayType that was current for the entry.
      data: The entry content as bytes (little-endian elements).
  """
  # Sample input:
  #
  # ...
  # <TYPE_BLOCK_ID NumWords=103 BlockCodeSize=4>
  #   ... (Parsed by _BcTypeInfo)
  # </TYPE_BLOCK_ID>
  # ...
  # <CONSTANTS_BLOCK NumWords=93 BlockCodeSize=4>
  #   <SETTYPE abbrevid=4 op0=1/>  # Current type id := 1 ==> int8*.
  #   <CE_INBOUNDS_GEP op0=3 op1=4 op2=1 op3=12 op4=57 op5=12 op6=57/>
  #   <SETTYPE abbrevid=4 op0=2/>  # Current type id := 2 ==> int8[4].
  #   <CSTRING abbrevid=11 op0=70 op1=111 op2=111/> record string = 'Foo'
  #   <STRING abbrevid=11 op0=70 op1=111 op2=111 op3=1/>  # {'F','o','o',1}.
  #   <SETTYPE abbrevid=6 op0=7/>  # Current type id := 7 ==> int32.
  #   <INTEGER abbrevid=5 op0=2000/>  # Stores 1000.
  #   <INTEGER abbrevid=5 op0=2001/>  # Stores -1000.
  #   <SETTYPE abbrevid=4 op0=8/>  # Current type id := 8 ==> int16[4].
  #   <NULL/>
  #   <DATA abbrevid=11 op0=8400 op1=10100 op2=11500 op3=0/>
  # </CONSTANTS_BLOCK>
  # ...
  #
  # Notes:
  # - Only the first <TYPE_BLOCK_ID> and first <CONSTANTS_BLOCK> are parsed.
  # - <CONSTANTS_BLOCK> is stateful: A "current type id" exists, and is set by
  #   <SETTYPE>, with op0= referring to type id.
  #   - Array lengths are found in the corresponding <ARRAY> type entry.
  # - Strings / arrays are in <CSTRING>, <STRING>, and <DATA>.
  #   - abbrevid=# is redundant (repeats tag type) and unused.
  #   - Character data are stored in op0=# op1=# ..., one per character. These
  #     values should fit in the proper range, and can be fairly large.
  #   - <CSTRING> has implicit 0 at end.
  #   - Data lengths agree with the length in the matching <ARRAY> entry.
  #   - "record string" text is not very useful: It only appears if all
  #     characters are printable.
  # - Signed vs. unsigned types are undistinguished.
  #   - In <INTEGER>, op0= stores 2 * abs(x) + (1 if x < 0 else 0).
  #   - In <ARRAY> of int, values are coerced to unsigned type.
  # - Strings and int arrays are undistinguished.
  #   - <CSTRING>: If an uint8 array happens to end with 0, then this gets used!
  # - Arrays (or integers) of all-0 appear as <NULL/>. Presumably this gets
  #   placed into .bss section.
  OUTSIDE, IN_TYPE_BLOCK, IN_CONST_BLOCK = range(3)
  mode = OUTSIDE
  known_types = None
  active_array_type = None

  for raw_line in lines:
    text = raw_line.lstrip()
    kind, name, attrs_at = ParseTag(text)
    if kind is None:
      continue  # Not a tag line.

    if mode == OUTSIDE:
      if not _IsOpeningTag(kind):
        continue
      if name == 'TYPE_BLOCK_ID':
        # Only the first type block is parsed.
        if known_types is None:
          known_types = _BcTypeInfo()
          mode = IN_TYPE_BLOCK
      elif name == 'CONSTANTS_BLOCK':
        # Constants are only meaningful once types are known.
        if known_types is not None:
          mode = IN_CONST_BLOCK
      continue

    if mode == IN_TYPE_BLOCK:
      if name == 'TYPE_BLOCK_ID' and _IsClosingTag(kind):
        mode = OUTSIDE
      else:
        known_types.Feed(text, name, attrs_at)
      continue

    # mode == IN_CONST_BLOCK.
    if name == 'CONSTANTS_BLOCK' and _IsClosingTag(kind):
      # Skip remaining data, including subsequent <CONSTANTS_BLOCK>s.
      break

    if name == 'SETTYPE':
      # op0 is the new current type id; give up if it cannot be read.
      new_type_id = next(_ParseOpItems(text, attrs_at), None)
      if new_type_id is None:
        return
      active_array_type = known_types.GetArrayType(new_type_id)
      continue

    # Only int arrays with narrow enough elements are accepted: wide (e.g.,
    # 32-bit / 4-byte) strings are rarely used, and are likely confused with
    # int arrays.
    if not active_array_type or active_array_type.width > _CHAR_WIDTH_LIMIT:
      continue
    if name in ('CSTRING', 'STRING', 'DATA'):
      # Only <CSTRING> has an implicit terminating 0.
      payload = active_array_type.ParseOpItemsAsBytes(text, attrs_at,
                                                      name == 'CSTRING')
      yield (active_array_type, payload)
class _BcAnalyzerRunner:
  """Runs bcanalyzer on files and returns its output as lines."""

  def __init__(self, output_directory):
    """Prepares the bcanalyzer command line.

    Args:
      output_directory: Directory used as cwd when running bcanalyzer.
    """
    # Command prefix; the file to analyze is appended per call.
    self._args = [
        path_util.GetBcAnalyzerPath(),
        '--dump',
        '--disable-histogram',
    ]
    self._output_directory = output_directory

  def RunOnFile(self, obj_file):
    """Runs bcanalyzer on |obj_file| and returns its stdout as a list of lines.

    The output is decoded as ASCII. subprocess.CalledProcessError propagates
    if the command exits with non-zero status.
    """
    command = [*self._args, obj_file]
    raw_output = subprocess.check_output(command, cwd=self._output_directory)
    return raw_output.decode('ascii').splitlines()
# This is a target for BulkForkAndCall().
def RunBcAnalyzerOnIntermediates(target, output_directory):
  """Calls bcanalyzer and returns encoded map from path to strings.

  Args:
    target: A list of BC file paths.
    output_directory: Directory used as cwd when running bcanalyzer.

  Returns:
    The result of parallel.EncodeDictOfLists() applied to
    {path: [extracted bytes]}, with each value transformed by repr().
  """
  assert isinstance(target, list)
  runner = _BcAnalyzerRunner(output_directory)
  strings_by_path = {
      path: [data for _, data in _ParseBcAnalyzer(runner.RunOnFile(path))]
      for path in target
  }
  # Escape strings by repr() so there will be no special characters to interfere
  # parallel.EncodeDictOfLists() and decoding.
  return parallel.EncodeDictOfLists(strings_by_path, value_transform=repr)
def main():
  """Stand-alone entry point: dumps strings found in the given BC files.

  For each object path, prints its path relative to --output-directory,
  followed by one line per extracted string / int array.
  """
  global _CHAR_WIDTH_LIMIT

  parser = argparse.ArgumentParser()
  parser.add_argument('--output-directory', default='.')
  parser.add_argument('--char-width-limit', type=int)
  parser.add_argument('objects', type=os.path.realpath, nargs='+')
  args = parser.parse_args()

  base_path = os.path.normpath(args.output_directory)
  runner = _BcAnalyzerRunner(args.output_directory)

  # The module default applies unless overridden on the command line.
  if args.char_width_limit is not None:
    _CHAR_WIDTH_LIMIT = args.char_width_limit

  for obj_path in args.objects:
    print('File: %s' % os.path.relpath(obj_path, base_path))
    for array_type, data in _ParseBcAnalyzer(runner.RunOnFile(obj_path)):
      element_bits = array_type.width * 8
      print('    char%d[%d]: %r' % (element_bits, array_type.length, data))
    print('')


if __name__ == '__main__':
  main()