chromium/tools/resources/generate_resource_allowlist.py

#!/usr/bin/env python
# Copyright 2016 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

__doc__ = """generate_resource_allowlist.py [-o OUTPUT] INPUTS...

INPUTS are paths to unstripped binaries or PDBs containing references to
resources in their debug info.

This script generates a resource allowlist by reading debug info from
INPUTS and writes it to OUTPUT.
"""

# Allowlisted resources are identified by searching the input file for
# instantiations of the special function ui::AllowlistedResource (see
# ui/base/resource/allowlist.h).

import argparse
import os
import subprocess
import sys

import ar

llvm_bindir = os.path.join(os.path.dirname(sys.argv[0]), '..', '..',
                           'third_party', 'llvm-build', 'Release+Asserts',
                           'bin')


def ExtractAllowlistFromFile(path, resource_ids):
  with open(path, 'rb') as f:
    data = f.read()
  # When symbol_level=0, only mangled names exist.
  # E.g.: _ZN2ui19AllowlistedResourceILi22870EEEvv
  prefix = b'AllowlistedResourceILi'
  start_idx = 0
  while start_idx != -1:
    start_idx = data.find(prefix, start_idx)
    if start_idx != -1:
      end_idx = data.find(b'E', start_idx)
      resource_ids.add(int(data[start_idx + len(prefix):end_idx]))
      start_idx = end_idx


def GetResourceAllowlistELF(path):
  # Produce a resource allowlist by searching for debug info referring to
  # AllowlistedResource.
  # This used to use "readelf -p .debug_str", but it doesn't seem to work with
  # use_debug_fission=true. Reading the raw file is faster anyways.
  resource_ids = set()
  ExtractAllowlistFromFile(path, resource_ids)
  return resource_ids


def GetResourceAllowlistPDB(path):
  # Produce a resource allowlist by using llvm-pdbutil to read a PDB file's
  # publics stream, which is essentially a symbol table, and searching for
  # instantiations of AllowlistedResource. Any such instantiations are demangled
  # to extract the resource identifier.
  pdbutil = subprocess.Popen(
      [os.path.join(llvm_bindir, 'llvm-pdbutil'), 'dump', '-publics', path],
      stdout=subprocess.PIPE)
  names = ''
  for line in pdbutil.stdout:
    line = line.decode('utf8')
    # Read a line of the form
    # "733352 | S_PUB32 [size = 56] `??$AllowlistedResource@$0BFGM@@ui@@YAXXZ`".
    if '`' not in line:
      continue
    sym_name = line[line.find('`') + 1:line.rfind('`')]
    # Under certain conditions such as the GN arg `use_clang_coverage = true` it
    # is possible for the compiler to emit additional symbols that do not match
    # the standard mangled-name format.
    # Example: __profd_??$AllowlistedResource@$0BGPH@@ui@@YAXXZ
    # C++ mangled names are supposed to begin with `?`, so check for that.
    if 'AllowlistedResource' in sym_name and sym_name.startswith('?'):
      names += sym_name + '\n'
  exit_code = pdbutil.wait()
  if exit_code != 0:
    raise Exception('llvm-pdbutil exited with exit code %d' % exit_code)

  undname = subprocess.Popen([os.path.join(llvm_bindir, 'llvm-undname')],
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE)
  stdout, _ = undname.communicate(names.encode('utf8'))
  resource_ids = set()
  for line in stdout.split(b'\n'):
    line = line.decode('utf8')
    # Read a line of the form
    # "void __cdecl ui::AllowlistedResource<5484>(void)".
    prefix = ' ui::AllowlistedResource<'
    pos = line.find(prefix)
    if pos == -1:
      continue
    try:
      resource_ids.add(int(line[pos + len(prefix):line.rfind('>')]))
    except ValueError:
      continue
  exit_code = undname.wait()
  if exit_code != 0:
    raise Exception('llvm-undname exited with exit code %d' % exit_code)
  return resource_ids


def GetResourceAllowlistFileList(file_list_path):
  # Creates a list of resources given the list of linker input files.
  # Simply grep's them for AllowlistedResource<...>.
  with open(file_list_path) as f:
    paths = f.read().splitlines()

  paths = ar.ExpandThinArchives(paths)

  resource_ids = set()
  for p in paths:
    ExtractAllowlistFromFile(p, resource_ids)
  return resource_ids


def WriteResourceAllowlist(args):
  resource_ids = set()
  for input in args.inputs:
    with open(input, 'rb') as f:
      magic = f.read(4)
      chunk = f.read(60)
    if magic == b'\x7fELF':
      func = GetResourceAllowlistELF
    elif magic == b'Micr':
      func = GetResourceAllowlistPDB
    elif magic == b'obj/' or b'/obj/' in chunk:
      # For secondary toolchain, path will look like android_clang_arm/obj/...
      func = GetResourceAllowlistFileList
    else:
      raise Exception('unknown file format')

    resource_ids.update(func(input))

  # The last time this broke, exactly two resources were still being found.
  if len(resource_ids) < 100:
    raise Exception('Suspiciously few resources found. Likely an issue with '
                    'the regular expression in this script. Found: ' +
                    ','.join(sorted(resource_ids)))
  for id in sorted(resource_ids):
    args.output.write(str(id) + '\n')


def main():
  parser = argparse.ArgumentParser(usage=__doc__)
  parser.add_argument('inputs', nargs='+', help='An unstripped binary or PDB.')
  parser.add_argument('-o',
                      dest='output',
                      type=argparse.FileType('w'),
                      default=sys.stdout,
                      help='The resource list path to write (default stdout)')

  args = parser.parse_args()
  WriteResourceAllowlist(args)


if __name__ == '__main__':
  main()