chromium/tools/valgrind/asan/asan_symbolize.py

#!/usr/bin/env python3

# Copyright 2012 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

from third_party import asan_symbolize

import argparse
import base64
import json
import os
import platform
import re
import subprocess
import sys

class LineBuffered(object):
  """Stream proxy that flushes the wrapped stream on newline-bearing writes.

  Only write() is intercepted; every other attribute lookup is forwarded to
  the wrapped stream.
  """
  def __init__(self, stream):
    self.stream = stream

  def __getattr__(self, attr):
    # Only reached for attributes not found on the proxy itself.
    return getattr(self.stream, attr)

  def write(self, data):
    """Write |data| to the wrapped stream, flushing if it contains '\\n'."""
    wrapped = self.stream
    wrapped.write(data)
    if '\n' in data:
      wrapped.flush()


def disable_buffering():
  """Make stdout of this process and of child processes unbuffered.

  Does nothing when PYTHONUNBUFFERED is already set to a non-empty value.
  Otherwise sys.stdout is wrapped in LineBuffered and PYTHONUNBUFFERED is set
  in the environment.
  """
  if os.environ.get('PYTHONUNBUFFERED'):
    return
  # sys.stdout.write cannot be reassigned on the built-in stream object, so
  # the whole stream is replaced with a wrapper instead.
  sys.stdout = LineBuffered(sys.stdout)
  os.environ['PYTHONUNBUFFERED'] = 'x'


def set_symbolizer_path():
  """Point LLVM_SYMBOLIZER_PATH at llvm-symbolizer in the Chromium tree.

  Does nothing when LLVM_SYMBOLIZER_PATH is already set to a non-empty value.
  Asserts that the binary exists at the expected location.
  """
  if os.environ.get('LLVM_SYMBOLIZER_PATH'):
    return
  # This script is assumed to live three levels below src/ (i.e. in
  # src/tools/valgrind/asan/), so climb three directories to reach src/.
  here = os.path.dirname(os.path.abspath(__file__))
  src_root = os.path.join(here, "..", "..", "..")
  binary_location = ('third_party', 'llvm-build', 'Release+Asserts', 'bin',
                     'llvm-symbolizer')
  symbolizer_path = os.path.join(src_root, *binary_location)
  assert os.path.isfile(symbolizer_path)
  os.environ['LLVM_SYMBOLIZER_PATH'] = os.path.abspath(symbolizer_path)


def is_hash_name(name):
  """Return True if |name| is made up of lowercase hexadecimal digits only."""
  return re.match('[0-9a-f]+$', name) is not None


def split_path(path):
  """Split |path| into a list of its components.

  The first element is whatever os.path.split() can no longer reduce: the
  root (e.g. '/') for absolute paths and '' for relative ones.
  """
  components = []
  head, tail = os.path.split(path)
  while head != path:
    # Components are collected right-to-left and reversed at the end.
    components.append(tail)
    path = head
    head, tail = os.path.split(path)
  components.append(head)
  components.reverse()
  return components


def chrome_product_dir_path(exe_path):
  """Return the product directory that contains |exe_path|.

  If |exe_path| goes through one or more '.app' bundles, the product dir is
  the directory holding the outermost bundle. Otherwise the executable is a
  commandline binary that resides right in the product dir.

  Args:
    exe_path: path to the program executable, or None.

  Returns:
    The product dir path, or None if |exe_path| is None. The result is never
    empty: '.' is returned when |exe_path| has no directory part (e.g.
    'chrome' or 'Chromium.app/Contents/MacOS/Chromium').
  """
  if exe_path is None:
    return None
  # Default for a plain commandline binary.
  product_dir = os.path.dirname(exe_path)
  remaining = exe_path
  while True:
    head, tail = os.path.split(remaining)
    if head == remaining:
      # Reached the root ('/' or '') - nothing left to strip.
      break
    if tail.endswith('.app'):
      # Keep walking up afterwards: the last bundle seen on the way to the
      # root is the outermost one, and that is the one in the product dir.
      product_dir = head
    remaining = head
  # A relative path whose first component is the binary or the bundle has an
  # empty directory part; report the current directory instead.
  return product_dir or '.'


# Maps an inode number to the result of the first find_inode_at_path() lookup
# for it (a path, or None when nothing was found).
inode_path_cache = {}


def find_inode_at_path(inode, path):
  """Return a path below |path| that refers to |inode|, or None.

  Runs `find <path> -inum <inode>` and caches the outcome per inode, so the
  directory tree is searched at most once for each inode.

  Args:
    inode: inode number to look for.
    path: directory to search.

  Returns:
    The first path printed by `find`, or None if it printed nothing.

  Raises:
    subprocess.CalledProcessError: if `find` exits with a non-zero status.
  """
  if inode in inode_path_cache:
    return inode_path_cache[inode]
  cmd = ['find', path, '-inum', str(inode)]
  # check_output() returns bytes under Python 3; decode them the way the OS
  # layer decodes file names so the result is usable as a str path.
  output = os.fsdecode(subprocess.check_output(cmd))
  # `find` may give us several paths (e.g. 'Chromium Framework' in the
  # product dir and 'Chromium Framework' inside 'Chromium.app',
  # chrome_dsym_hints() will produce correct .dSYM path for any of them.
  matches = [line for line in output.split('\n') if line]
  ret = matches[0] if matches else None
  inode_path_cache[inode] = ret
  return ret


def chrome_dsym_hints(binary):
  """Construct the path to the .dSYM bundle for the given binary.

  There are three possible cases for binary location in Chromium:
  1. The binary is a standalone executable or dynamic library in the product
     dir, the debug info is in "binary.dSYM" in the product dir.
  2. The binary is a standalone framework or .app bundle, the debug info is
     in "Framework.framework.dSYM" or "App.app.dSYM" in the product dir.
  3. The binary is a framework or an .app bundle within another .app bundle
     (e.g. Outer.app/Contents/Versions/1.2.3.4/Inner.app), and the debug info
     is in Inner.app.dSYM in the product dir.
  The first case is handled by llvm-symbolizer, so .dSYM paths are only
  constructed for .app bundles and frameworks.

  It is assumed (and asserted) that the binary path contains no more than two
  nested bundles and that an .app bundle is not nested inside a framework.

  Returns:
    An empty list in case 1, otherwise a one-element list with the .dSYM path.
  """
  path_parts = split_path(binary)
  app_positions = [
      index for index, part in enumerate(path_parts)
      if part.endswith('.app')]
  framework_positions = [
      index for index, part in enumerate(path_parts)
      if part.endswith('.framework')]
  bundle_positions = sorted(app_positions + framework_positions)
  assert len(bundle_positions) <= 2, \
      "The path contains more than two nested bundles: %s" % binary
  if not bundle_positions:
    # Case 1: this is a standalone executable or dylib.
    return []
  app_inside_framework = (len(app_positions) == 1 and
                          len(framework_positions) == 1 and
                          app_positions[0] > framework_positions[0])
  assert not app_inside_framework, \
      "The path contains an app bundle inside a framework: %s" % binary
  # Cases 2 and 3. The outermost bundle (the only bundle in case 2) sits in
  # the product dir; the .dSYM is named after the innermost bundle (which is
  # the same bundle in case 2).
  outermost_bundle = bundle_positions[0]
  innermost_bundle = bundle_positions[-1]
  dsym_parts = path_parts[:outermost_bundle] + [path_parts[innermost_bundle]]
  return ['%s.dSYM' % os.path.join(*dsym_parts)]


class JSONTestRunSymbolizer(object):
  """Symbolizes the output snippet stored in a test run dictionary."""

  def __init__(self, symbolization_loop):
    self.symbolization_loop = symbolization_loop

  def symbolize_snippet(self, snippet):
    """Feed each line of |snippet| to the loop and rejoin what it returns."""
    process_line = self.symbolization_loop.process_line
    output_lines = [
        out_line
        for line in snippet.split('\n')
        for out_line in process_line(line)]
    return '\n'.join(output_lines)

  def symbolize(self, test_run):
    """Symbolize |test_run| in place.

    The snippet is taken from 'output_snippet_base64'. If symbolization
    changes it, the previous 'output_snippet' and 'output_snippet_base64'
    values are kept under 'original_*' keys, both are replaced with the
    symbolized text, and 'snippet_processed_by' is set. Otherwise |test_run|
    is left untouched.
    """
    encoded_original = test_run['output_snippet_base64']
    decoded = base64.b64decode(encoded_original).decode('utf-8', 'replace')

    # Every character that sorts after '~' is replaced with '?'.
    original_snippet = ''.join(
        ch if ch <= u'~' else u'?' for ch in decoded)

    symbolized_snippet = self.symbolize_snippet(original_snippet)
    if symbolized_snippet != original_snippet:
      # Preserve the unsymbolized data before overwriting it.
      test_run['original_output_snippet'] = test_run['output_snippet']
      test_run['original_output_snippet_base64'] = encoded_original

      encoded_symbolized = base64.b64encode(
          symbolized_snippet.encode('utf-8', 'replace')).decode()
      test_run['output_snippet'] = symbolized_snippet
      test_run['output_snippet_base64'] = encoded_symbolized
      test_run['snippet_processed_by'] = 'asan_symbolize.py'


def symbolize_snippets_in_json(filename, symbolization_loop):
  """Symbolize every test run in the JSON file |filename|, in place.

  The file is loaded, each test run found under 'per_iteration_data' is
  passed through JSONTestRunSymbolizer, and the result is written back to
  the same file.
  """
  with open(filename, 'r') as f:
    json_data = json.load(f)

  test_run_symbolizer = JSONTestRunSymbolizer(symbolization_loop)
  for iteration_data in json_data['per_iteration_data']:
    # Each value holds the list of runs recorded for one test.
    for test_runs in iteration_data.values():
      for test_run in test_runs:
        test_run_symbolizer.symbolize(test_run)

  with open(filename, 'w') as f:
    json.dump(json_data, f, indent=3, sort_keys=True)


class macOSBinaryNameFilterPlugin(asan_symbolize.AsanSymbolizerPlugIn):
  """Binary name filter that works around https://crbug.com/444835.

  When running tests on OSX swarming servers, ASan sometimes prints paths to
  files in cache (ending with SHA1 filenames) instead of paths to hardlinks
  to those files in the product dir. For such a path the filter returns one
  of the hardlinks to the same inode in |product_dir_path|.
  """
  def __init__(self):
    # Directory searched for hardlinks; the filter is a no-op while empty.
    self.product_dir_path = ''

  def filter_binary_path(self, binary_path):
    """Return the path to use for |binary_path|.

    Only paths whose basename looks like a hash are looked up, and only when
    |product_dir_path| is set. |binary_path| is returned unchanged whenever
    no replacement can be determined.
    """
    basename = os.path.basename(binary_path)
    if not (is_hash_name(basename) and self.product_dir_path):
      return binary_path
    try:
      inode = os.stat(binary_path).st_ino
    except OSError:
      # The file may not exist (any more) on this machine. This filter is
      # only a best-effort workaround, so keep the path as it was printed
      # instead of aborting the whole symbolization.
      return binary_path
    return find_inode_at_path(inode, self.product_dir_path) or binary_path


class CheckUTF8:
  """Line iterator over |stream| that tolerates invalid UTF-8.

  Lines are read as bytes from stream.buffer and decoded. A line that cannot
  be decoded is reported with a warning (showing its base64 encoding) and
  yielded as an empty string.
  """
  def __init__(self, stream):
    self._stream = stream

  def __iter__(self):
    return self

  def __next__(self):
    raw_line = self._stream.buffer.readline()
    if raw_line:
      try:
        return raw_line.decode()
      except UnicodeDecodeError:
        encoded = base64.b64encode(raw_line).decode()
        print("WARNING: asan_symbolize.py failed to decode %s (base64 encoded)" %
              encoded)
        return ""
    # An empty read means the underlying stream is exhausted.
    raise StopIteration


def main():
  """Parse the command line and symbolize sanitizer reports.

  With --test-summary-json-file the output snippets stored in that JSON file
  are symbolized and written back to it. Otherwise the report is read from
  standard input and processed by the asan_symbolize symbolization loop.
  """
  parser = argparse.ArgumentParser(description='Symbolize sanitizer reports.')
  parser.add_argument('--test-summary-json-file',
      help='Path to a JSON file produced by the test launcher. The script will '
           'ignore standard input and instead symbolize the output snippets '
           'inside the JSON file. The result will be written back to the JSON '
           'file.')
  parser.add_argument('strip_path_prefix', nargs='*',
      help='When printing source file names, the longest prefix ending in one '
           'of these substrings will be stripped. E.g.: "Release/../../".')
  parser.add_argument('--executable-path',
      help='Path to program executable. Used on OSX swarming bots to locate '
           'dSYM bundles for associated frameworks and bundles.')
  parser.add_argument('--sysroot', help='Root directory for symbol files')
  args = parser.parse_args()

  disable_buffering()
  set_symbolizer_path()
  asan_symbolize.demangle = True
  asan_symbolize.fix_filename_patterns = args.strip_path_prefix
  # Most source paths for Chromium binaries start with
  # /path/to/src/out/Release/../../
  asan_symbolize.fix_filename_patterns.append('Release/../../')

  with asan_symbolize.AsanSymbolizerPlugInProxy() as plugin_proxy:
    if args.sysroot:
      sysroot_filter = asan_symbolize.SysRootFilterPlugIn()
      sysroot_filter.sysroot_path = args.sysroot
      plugin_proxy.add_plugin(sysroot_filter)
    elif platform.uname()[0] == 'Darwin':
      macos_filter = macOSBinaryNameFilterPlugin()
      # The filter searches a directory tree for hardlinks, so give it the
      # product dir derived from the executable path, not the path of the
      # executable itself. This is None (filter disabled) when
      # --executable-path was not given.
      macos_filter.product_dir_path = chrome_product_dir_path(
          args.executable_path)
      plugin_proxy.add_plugin(macos_filter)

    loop = asan_symbolize.SymbolizationLoop(
        plugin_proxy=plugin_proxy, dsym_hint_producer=chrome_dsym_hints)

    if args.test_summary_json_file:
      symbolize_snippets_in_json(args.test_summary_json_file, loop)
    else:
      # Read stdin as bytes so that invalid UTF-8 produces a warning instead
      # of a decoding error.
      asan_symbolize.logfile = CheckUTF8(sys.stdin)
      loop.process_logfile()


# Run the symbolizer only when executed as a script, not when imported.
if __name__ == '__main__':
  main()