analyze_includes.py | Explore in Territory

#!/usr/bin/env python3
# Copyright 2021 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""This script is used to analyze #include graphs.

It produces the .js file that accompanies include-analysis.html.

Usage:

$ gn gen --args="show_includes=true symbol_level=0 enable_precompiled_headers=false" out/Debug
$ autoninja -C out/Debug -v chrome | tee /tmp/build_log
$ analyze_includes.py --target=chrome --revision=$(git rev-parse --short HEAD) \
    --json-out=/tmp/include-analysis.js /tmp/build_log

(If you have reclient access, add use_reclient=true to the gn args, but not on
Windows due to crbug.com/1223741#c9)

The script takes roughly half an hour on a fast machine for the chrome build
target, which is considered fast enough for batch job purposes for now.

If --json-out is not provided, the script exits after printing some statistics
to stdout. This is significantly faster than generating the full JSON data. For
example:

$ autoninja -C out/Debug -v chrome | analyze_includes.py - 2>/dev/null
build_size 270237664463
"""

import argparse
import json
import os
import pathlib
import re
import sys
import unittest
from collections import defaultdict
from datetime import datetime


def parse_build(build_log, root_filter=None):
  """Parse the build_log (generated as in the Usage note above) to capture the
  include graph. Returns a (roots, includes) pair, where roots is a list of root
  nodes in the graph (the source files) and includes is a dict from filename to
  list of filenames included by that filename."""
  build_dir = '.'
  file_stack = []
  includes = {}
  roots = set()

  # Note: A file might include different files for different compiler
  # invocations depending on -D flags. For such cases, includes[file] will be
  # the union of those includes.

  # Normalize paths.
  normalized = {}

  def norm(fn):
    if fn not in normalized:
      x = fn.replace('\\\\', '\\')
      # Use Path.resolve() rather than path.realpath() to get the canonical
      # upper/lower-case version of the path on Windows.
      p = pathlib.Path(os.path.join(build_dir, x)).resolve()
      x = os.path.relpath(p)
      x = x.replace(os.path.sep, '/')
      normalized[fn] = x
    return normalized[fn]

  # ninja: Entering directory `out/foo'
  ENTER_DIR_RE = re.compile(r'ninja: Entering directory `(.*?)\'$')
  # [M/N] clang... -c foo.cc -o foo.o ...
  # [M/N] .../clang... -c foo.cc -o foo.o ...
  # [M/N] clang-cl.exe /c foo.cc /Fofoo.o ...
  # [M/N] ...\clang-cl.exe /c foo.cc /Fofoo.o ...
  COMPILE_RE = re.compile(r'\[\d+/\d+\] (.*[/\\])?clang.* [/-]c (\S*)')
  # . a.h
  # .. b.h
  # . c.h
  INCLUDE_RE = re.compile(r'(\.+) (.*)$')

  skipping_root = False

  for line in build_log:
    m = INCLUDE_RE.match(line)
    if m:
      if skipping_root:
        continue
      prev_depth = len(file_stack) - 1
      depth = len(m.group(1))
      filename = norm(m.group(2))
      includes.setdefault(filename, set())

      if depth > prev_depth:
        if sys.platform != 'win32':
          # TODO(crbug.com/40187759): Always assert.
          assert depth == prev_depth + 1
        elif depth > prev_depth + 1:
          # Until the bug is fixed, skip these includes.
          print('missing include under', file_stack[0])
          continue
      else:
        for _ in range(prev_depth - depth + 1):
          file_stack.pop()

      includes[file_stack[-1]].add(filename)
      file_stack.append(filename)
      continue

    m = COMPILE_RE.match(line)
    if m:
      skipping_root = False
      filename = norm(m.group(2))
      if root_filter and not root_filter.match(filename):
        skipping_root = True
        continue
      roots.add(filename)
      file_stack = [filename]
      includes.setdefault(filename, set())
      continue

    m = ENTER_DIR_RE.match(line)
    if m:
      build_dir = m.group(1)
      continue

  return roots, includes


class TestParseBuild(unittest.TestCase):
  def test_basic(self):
    x = [
        'ninja: Entering directory `out/foo\'',
        '[1/3] clang -c ../../a.cc -o a.o',
        '. ../../a.h',
        '[2/3] clang -c gen/c.c -o a.o',
    ]
    (roots, includes) = parse_build(x)
    self.assertEqual(roots, set(['a.cc', 'out/foo/gen/c.c']))
    self.assertEqual(set(includes.keys()),
                     set(['a.cc', 'a.h', 'out/foo/gen/c.c']))
    self.assertEqual(includes['a.cc'], set(['a.h']))
    self.assertEqual(includes['a.h'], set())
    self.assertEqual(includes['out/foo/gen/c.c'], set())

  def test_more(self):
    x = [
        'ninja: Entering directory `out/foo\'',
        '[20/99] clang -c ../../a.cc -o a.o',
        '. ../../a.h',
        '. ../../b.h',
        '.. ../../c.h',
        '... ../../d.h',
        '. ../../e.h',
    ]
    (roots, includes) = parse_build(x)
    self.assertEqual(roots, set(['a.cc']))
    self.assertEqual(includes['a.cc'], set(['a.h', 'b.h', 'e.h']))
    self.assertEqual(includes['b.h'], set(['c.h']))
    self.assertEqual(includes['c.h'], set(['d.h']))
    self.assertEqual(includes['d.h'], set())
    self.assertEqual(includes['e.h'], set())

  def test_multiple(self):
    x = [
        'ninja: Entering directory `out/foo\'',
        '[123/234] clang -c ../../a.cc -o a.o',
        '. ../../a.h',
        '[124/234] clang -c ../../b.cc -o b.o',
        '. ../../b.h',
    ]
    (roots, includes) = parse_build(x)
    self.assertEqual(roots, set(['a.cc', 'b.cc']))
    self.assertEqual(includes['a.cc'], set(['a.h']))
    self.assertEqual(includes['b.cc'], set(['b.h']))

  def test_root_filter(self):
    x = [
        'ninja: Entering directory `out/foo\'',
        '[9/100] clang -c ../../a.cc -o a.o',
        '. ../../a.h',
        '[10/100] clang -c ../../b.cc -o b.o',
        '. ../../b.h',
    ]
    (roots, includes) = parse_build(x, re.compile(r'^a.cc$'))
    self.assertEqual(roots, set(['a.cc']))
    self.assertEqual(set(includes.keys()), set(['a.cc', 'a.h']))
    self.assertEqual(includes['a.cc'], set(['a.h']))

  def test_windows(self):
    x = [
        'ninja: Entering directory `out/foo\'',
        '[1/3] path\\clang-cl.exe /c ../../a.cc /Foa.o',
        '. ../../a.h',
        '[2/3] clang-cl.exe /c gen/c.c /Foa.o',
    ]
    (roots, includes) = parse_build(x)
    self.assertEqual(roots, set(['a.cc', 'out/foo/gen/c.c']))
    self.assertEqual(set(includes.keys()),
                     set(['a.cc', 'a.h', 'out/foo/gen/c.c']))
    self.assertEqual(includes['a.cc'], set(['a.h']))
    self.assertEqual(includes['a.h'], set())
    self.assertEqual(includes['out/foo/gen/c.c'], set())


def post_order_nodes(root, child_nodes):
  """Generate the nodes reachable from root (including root itself) in
  post-order traversal order. child_nodes maps each node to its children."""
  visited = set()

  def walk(n):
    if n in visited:
      return
    visited.add(n)

    for c in child_nodes[n]:
      for x in walk(c):
        yield x
    yield n

  return walk(root)


def compute_doms(root, includes):
  """Compute the dominators for all nodes reachable from root. Node A dominates
  node B if all paths from the root to B go through A. Returns a dict from
  filename to the set of dominators of that filename (including itself).

  The implementation follows the "simple" version of Lengauer & Tarjan "A Fast
  Algorithm for Finding Dominators in a Flowgraph" (TOPLAS 1979).
  """

  parent = {}
  ancestor = {}
  vertex = []
  label = {}
  semi = {}
  pred = defaultdict(list)
  bucket = defaultdict(list)
  dom = {}

  def dfs(v):
    semi[v] = len(vertex)
    vertex.append(v)
    label[v] = v

    for w in includes[v]:
      if w not in semi:
        parent[w] = v
        dfs(w)
      pred[w].append(v)

  def compress(v):
    if ancestor[v] in ancestor:
      compress(ancestor[v])
      if semi[label[ancestor[v]]] < semi[label[v]]:
        label[v] = label[ancestor[v]]
      ancestor[v] = ancestor[ancestor[v]]

  def evaluate(v):
    if v not in ancestor:
      return v
    compress(v)
    return label[v]

  def link(v, w):
    ancestor[w] = v

  # Step 1: Initialization.
  dfs(root)

  for w in reversed(vertex[1:]):
    # Step 2: Compute semidominators.
    for v in pred[w]:
      u = evaluate(v)
      if semi[u] < semi[w]:
        semi[w] = semi[u]

    bucket[vertex[semi[w]]].append(w)
    link(parent[w], w)

    # Step 3: Implicitly define the immediate dominator for each node.
    for v in bucket[parent[w]]:
      u = evaluate(v)
      dom[v] = u if semi[u] < semi[v] else parent[w]
    bucket[parent[w]] = []

  # Step 4: Explicitly define the immediate dominator for each node.
  for w in vertex[1:]:
    if dom[w] != vertex[semi[w]]:
      dom[w] = dom[dom[w]]

  # Get the full dominator set for each node.
  all_doms = {}
  all_doms[root] = {root}

  def dom_set(node):
    if node not in all_doms:
      # node's dominators is itself and the dominators of its immediate
      # dominator.
      all_doms[node] = {node}
      all_doms[node].update(dom_set(dom[node]))

    return all_doms[node]

  return {n: dom_set(n) for n in vertex}


class TestComputeDoms(unittest.TestCase):
  def test_basic(self):
    includes = {}
    includes[1] = [2]
    includes[2] = [1]
    includes[3] = [2]
    includes[4] = [1]
    includes[5] = [4, 3]
    root = 5

    doms = compute_doms(root, includes)

    self.assertEqual(doms[1], set([5, 1]))
    self.assertEqual(doms[2], set([5, 2]))
    self.assertEqual(doms[3], set([5, 3]))
    self.assertEqual(doms[4], set([5, 4]))
    self.assertEqual(doms[5], set([5]))

  def test_larger(self):
    # Fig. 1 in the Lengauer-Tarjan paper.
    includes = {}
    includes['a'] = ['d']
    includes['b'] = ['a', 'd', 'e']
    includes['c'] = ['f', 'g']
    includes['d'] = ['l']
    includes['e'] = ['h']
    includes['f'] = ['i']
    includes['g'] = ['i', 'j']
    includes['h'] = ['k', 'e']
    includes['i'] = ['k']
    includes['j'] = ['i']
    includes['k'] = ['i', 'r']
    includes['l'] = ['h']
    includes['r'] = ['a', 'b', 'c']
    root = 'r'

    doms = compute_doms(root, includes)

    # Fig. 2 in the Lengauer-Tarjan paper.
    self.assertEqual(doms['a'], set(['a', 'r']))
    self.assertEqual(doms['b'], set(['b', 'r']))
    self.assertEqual(doms['c'], set(['c', 'r']))
    self.assertEqual(doms['d'], set(['d', 'r']))
    self.assertEqual(doms['e'], set(['e', 'r']))
    self.assertEqual(doms['f'], set(['f', 'c', 'r']))
    self.assertEqual(doms['g'], set(['g', 'c', 'r']))
    self.assertEqual(doms['h'], set(['h', 'r']))
    self.assertEqual(doms['i'], set(['i', 'r']))
    self.assertEqual(doms['j'], set(['j', 'g', 'c', 'r']))
    self.assertEqual(doms['k'], set(['k', 'r']))
    self.assertEqual(doms['l'], set(['l', 'd', 'r']))
    self.assertEqual(doms['r'], set(['r']))


def trans_size(root, includes, sizes):
  """Compute the transitive size of a file, i.e. the size of the file itself and
  all its transitive includes."""
  return sum([sizes[n] for n in post_order_nodes(root, includes)])


def log(*args, **kwargs):
  """Log output to stderr."""
  print(*args, file=sys.stderr, **kwargs)


def analyze(target, revision, build_log_file, json_file, root_filter):
  log('Parsing build log...')
  (roots, includes) = parse_build(build_log_file, root_filter)

  log('Getting file sizes...')
  sizes = {name: os.path.getsize(name) for name in includes}

  log('Computing transitive sizes...')
  trans_sizes = {n: trans_size(n, includes, sizes) for n in includes}

  build_size = sum([trans_sizes[n] for n in roots])

  print('build_size', build_size)

  if json_file is None:
    log('--json-out not set; exiting.')
    return 0

  log('Counting prevalence...')
  prevalence = {name: 0 for name in includes}
  for r in roots:
    for n in post_order_nodes(r, includes):
      prevalence[n] += 1

  # Map from file to files that include it.
  log('Building reverse include map...')
  included_by = {k: set() for k in includes}
  for k in includes:
    for i in includes[k]:
      included_by[i].add(k)

  log('Computing added sizes...')

  # Split each src -> dst edge in includes into src -> (src,dst) -> dst, so that
  # we can compute how much each include graph edge adds to the size by doing
  # dominance analysis on the (src,dst) nodes.
  augmented_includes = {}
  for src in includes:
    augmented_includes[src] = set()
    for dst in includes[src]:
      augmented_includes[src].add((src, dst))
      augmented_includes[(src, dst)] = {dst}

  added_sizes = {node: 0 for node in augmented_includes}
  for r in roots:
    doms = compute_doms(r, augmented_includes)
    for node in doms:
      if node not in sizes:
        # Skip the (src,dst) pseudo nodes.
        continue
      for dom in doms[node]:
        added_sizes[dom] += sizes[node]

  # Assign a number to each filename for tighter JSON representation.
  names = []
  name2nr = {}
  for n in sorted(includes.keys()):
    name2nr[n] = len(names)
    names.append(n)

  def nr(name):
    return name2nr[name]

  log('Writing output...')

  # Provide a JS object for convenient inclusion in the HTML file.
  # If someone really wants a proper JSON file, maybe we can reconsider this.
  json_file.write('data = ')

  json.dump(
      {
          'target': target,
          'revision': revision,
          'date': datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S UTC'),
          'files': names,
          'roots': [nr(x) for x in sorted(roots)],
          'includes': [[nr(x) for x in sorted(includes[n])] for n in names],
          'included_by': [[nr(x) for x in included_by[n]] for n in names],
          'sizes': [sizes[n] for n in names],
          'tsizes': [trans_sizes[n] for n in names],
          'asizes': [added_sizes[n] for n in names],
          'esizes': [[added_sizes[(s, d)] for d in sorted(includes[s])]
                     for s in names],
          'prevalence': [prevalence[n] for n in names],
      }, json_file)

  log('All done!')


def main():
  result = unittest.main(argv=sys.argv[:1], exit=False, verbosity=2).result
  if len(result.failures) > 0 or len(result.errors) > 0:
    return 1

  parser = argparse.ArgumentParser(description='Analyze an #include graph.')
  parser.add_argument('build_log',
                      type=argparse.FileType('r', errors='replace'),
                      help='The build log to analyze (- for stdin).')
  parser.add_argument('--target',
                      help='The target that was built (e.g. chrome).')
  parser.add_argument('--revision',
                      help='The revision that was built (e.g. 016588d4ee20).')
  parser.add_argument(
      '--json-out',
      type=argparse.FileType('w'),
      help='Write full analysis data to a JSON file (- for stdout).')
  parser.add_argument('--root-filter',
                      help='Regex to filter which root files are analyzed.')
  args = parser.parse_args()

  if args.json_out and not (args.target and args.revision):
    print('error: --json-out requires both --target and --revision to be set')
    return 1

  try:
    root_filter = re.compile(args.root_filter) if args.root_filter else None
  except Exception:
    print('error: --root-filter is not a valid regex')
    return 1

  analyze(args.target, args.revision, args.build_log, args.json_out,
          root_filter)


if __name__ == '__main__':
  sys.exit(main())
chromium/tools/clang/scripts/analyze_includes.py