chromium/tools/clang/blink_gc_plugin/process-graph.py

#!/usr/bin/env python3
# Copyright 2014 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

from __future__ import print_function
import argparse, os, sys, json, subprocess, pickle

try:
  from StringIO import StringIO  # Python 2
except:
  from io import StringIO

parser = argparse.ArgumentParser(
  description =
    "Process the Blink points-to graph generated by the Blink GC plugin.")

parser.add_argument(
  '-', dest='use_stdin', action='store_true',
  help='Read JSON graph files from stdin')

parser.add_argument(
  '-c', '--detect-cycles', action='store_true',
  help='Detect cycles containing GC roots')

parser.add_argument(
  '-s', '--print-stats', action='store_true',
  help='Statistics about ref-counted and traced objects')

parser.add_argument(
  '-v', '--verbose', action='store_true',
  help='Verbose output')

parser.add_argument(
  '--ignore-cycles', default=None, metavar='FILE',
  help='File with cycles to ignore')

parser.add_argument(
  '--ignore-classes', nargs='*', default=[], metavar='CLASS',
  help='Classes to ignore when detecting cycles')

parser.add_argument(
  '--pickle-graph', default=None, metavar='FILE',
  help='File to read/save the graph from/to')

parser.add_argument(
  'files', metavar='FILE_OR_DIR', nargs='*', default=[],
  help='JSON graph files or directories containing them')

# Command line args after parsing.
args = None

# Map from node labels to nodes.
graph = {}

# Set of root nodes.
roots = []

# List of cycles to ignore.
ignored_cycles = []

# Global flag to determine exit code.
global_reported_error = False

try:
  # Python3 remove sys.maxint.
  maxint = sys.maxint
except AttributeError:
  # Also see https://stackoverflow.com/a/13795777/4052492.
  maxint = sys.maxsize


def set_reported_error(value):
  global global_reported_error
  global_reported_error = value

def reported_error():
  return global_reported_error

def log(msg):
  if args.verbose:
    print(msg)


global_inc_copy = 0
def inc_copy():
  global global_inc_copy
  global_inc_copy += 1

def get_node(name):
  return graph.setdefault(name, Node(name))

ptr_types = ('raw', 'ref', 'mem')

def inc_ptr(dst, ptr):
  if ptr in ptr_types:
    node = graph.get(dst)
    if not node: return
    node.counts[ptr] += 1

def add_counts(s1, s2):
  for (k, v) in s2.iteritems():
    s1[k] += s2[k]

# Representation of graph nodes. Basically a map of directed edges.
class Node:
  def __init__(self, name):
    self.name = name
    self.edges = {}
    self.reset()
  def __repr__(self):
    return "%s(%s) %s" % (self.name, self.visited, self.edges)
  def update_node(self, decl):
    # Currently we don't track any node info besides its edges.
    pass
  def update_edge(self, e):
    new_edge = Edge(**e)
    edge = self.edges.get(new_edge.key)
    if edge:
      # If an edge exist, its kind is the strongest of the two.
      edge.kind = max(edge.kind, new_edge.kind)
    else:
      self.edges[new_edge.key] = new_edge
  def super_edges(self):
    return [e for e in self.edges.values() if e.is_super()]

  def subclass_edges(self):
    return [e for e in self.edges.values() if e.is_subclass()]

  def reset(self):
    self.cost = maxint
    self.visited = False
    self.path = None
    self.counts = {}
    for ptr in ptr_types:
      self.counts[ptr] = 0
  def update_counts(self):
    for e in self.edges.values():
      inc_ptr(e.dst, e.ptr)

# Representation of directed graph edges.
class Edge:
  def __init__(self, **decl):
    self.src = decl['src']
    self.dst = decl['dst']
    self.lbl = decl['lbl']
    self.ptr = decl['ptr']
    self.kind = decl['kind'] # 0 = weak, 1 = strong, 2 = root
    self.loc = decl['loc']
    # The label does not uniquely determine an edge from a node. We
    # define the semi-unique key to be the concatenation of the
    # label and dst name. This is sufficient to track the strongest
    # edge to a particular type. For example, if the field A::m_f
    # has type HashMap<WeakMember<B>, Member<B>> we will have a
    # strong edge with key m_f#B from A to B.
    self.key = '%s#%s' % (self.lbl, self.dst)
  def __repr__(self):
    return '%s (%s) => %s' % (self.src, self.lbl, self.dst)
  def is_root(self):
    return self.kind == 2
  def is_weak(self):
    return self.kind == 0
  def keeps_alive(self):
    return self.kind > 0
  def is_subclass(self):
    return self.lbl.startswith('<subclass>')
  def is_super(self):
    return self.lbl.startswith('<super>')

def parse_file(filename):
  obj = json.load(open(filename))
  return obj

def build_graphs_in_dir(dirname):
  # TODO: Use plateform independent code, eg, os.walk
  files = subprocess.check_output(
    ['find', dirname, '-name', '*.graph.json']).split('\n')
  log("Found %d files" % len(files))
  for f in files:
    f.strip()
    if len(f) < 1:
      continue
    build_graph(f)

def build_graph(filename):
  for decl in parse_file(filename):
    if 'name' in decl:
      # Add/update a node entry
      name = decl['name']
      node = get_node(name)
      node.update_node(decl)
    else:
      # Add/update an edge entry
      name = decl['src']
      node = get_node(name)
      node.update_edge(decl)

# Copy all non-weak edges from super classes to their subclasses.
# This causes all fields of a super to be considered fields of a
# derived class without tranitively relating derived classes with
# each other. For example, if B <: A, C <: A, and for some D, D => B,
# we don't want that to entail that D => C.
def copy_super_edges(edge):
  if edge.is_weak() or not edge.is_super():
    return
  inc_copy()
  # Make the super-class edge weak (prohibits processing twice).
  edge.kind = 0
  # If the super class is not in our graph exit early.
  super_node = graph.get(edge.dst)
  if super_node is None: return
  # Recursively copy all super-class edges.
  for e in super_node.super_edges():
    copy_super_edges(e)
  # Copy strong super-class edges (ignoring sub-class edges) to the sub class.
  sub_node = graph[edge.src]
  for e in super_node.edges.values():
    if e.keeps_alive() and not e.is_subclass():
      new_edge = Edge(
        src = sub_node.name,
        dst = e.dst,
        lbl = '%s <: %s' % (super_node.name, e.lbl),
        ptr = e.ptr,
        kind = e.kind,
        loc = e.loc,
      )
      sub_node.edges[new_edge.key] = new_edge
  # Add a strong sub-class edge.
  sub_edge = Edge(
    src = super_node.name,
    dst = sub_node.name,
    lbl = '<subclass>',
    ptr = edge.ptr,
    kind = 1,
    loc = edge.loc,
  )
  super_node.edges[sub_edge.key] = sub_edge

def complete_graph():
  for node in graph.values():
    for edge in node.super_edges():
      copy_super_edges(edge)
    for edge in node.edges.values():
      if edge.is_root():
        roots.append(edge)
  log("Copied edges down <super> edges for %d graph nodes" % global_inc_copy)

def reset_graph():
  for n in graph.values():
    n.reset()

def shortest_path(start, end):
  start.cost = 0
  minlist = [start]
  while len(minlist) > 0:
    minlist.sort(key=lambda n: -n.cost)
    current = minlist.pop()
    current.visited = True
    if current == end or current.cost >= end.cost + 1:
      return
    for e in current.edges.values():
      if not e.keeps_alive():
        continue
      dst = graph.get(e.dst)
      if dst is None or dst.visited:
        continue
      if current.cost < dst.cost:
        dst.cost = current.cost + 1
        dst.path = e
      minlist.append(dst)

def detect_cycles():
  for root_edge in roots:
    reset_graph()
    # Mark ignored classes as already visited
    for ignore in args.ignore_classes:
      name = ignore.find("::") > 0 and ignore or ("blink::" + ignore)
      node = graph.get(name)
      if node:
        node.visited = True
    src = graph[root_edge.src]
    dst = graph.get(root_edge.dst)
    if src.visited:
      continue
    if root_edge.dst == "WTF::String":
      continue
    if dst is None:
      print("\nPersistent root to incomplete destination object:")
      print(root_edge)
      set_reported_error(True)
      continue
    # Find the shortest path from the root target (dst) to its host (src)
    shortest_path(dst, src)
    if src.cost < maxint:
      report_cycle(root_edge)

def is_ignored_cycle(cycle):
  for block in ignored_cycles:
    if block_match(cycle, block):
      return True

def block_match(b1, b2):
  if len(b1) != len(b2):
    return False
  for (l1, l2) in zip(b1, b2):
    if l1 != l2:
      return False
  return True

def report_cycle(root_edge):
  dst = graph[root_edge.dst]
  path = []
  edge = root_edge
  dst.path = None
  while edge:
    path.append(edge)
    edge = graph[edge.src].path
  path.append(root_edge)
  path.reverse()
  # Find the max loc length for pretty printing.
  max_loc = 0
  for p in path:
    if len(p.loc) > max_loc:
      max_loc = len(p.loc)
  out = StringIO()
  for p in path[:-1]:
    print((p.loc + ':').ljust(max_loc + 1), p, file=out)
  sout = out.getvalue()
  if not is_ignored_cycle(sout):
    print("\nFound a potentially leaking cycle starting from a GC root:\n",
          sout, sep="")
    set_reported_error(True)

def load_graph():
  global graph
  global roots
  log("Reading graph from pickled file: " + args.pickle_graph)
  dump = pickle.load(open(args.pickle_graph, 'rb'))
  graph = dump[0]
  roots = dump[1]

def save_graph():
  log("Saving graph to pickle file: " + args.pickle_graph)
  dump = (graph, roots)
  pickle.dump(dump, open(args.pickle_graph, 'wb'))

def read_ignored_cycles():
  global ignored_cycles
  if not args.ignore_cycles:
    return
  log("Reading ignored cycles from file: " + args.ignore_cycles)
  block = []
  for l in open(args.ignore_cycles):
    line = l.strip()
    if not line or line.startswith('Found'):
      if len(block) > 0:
        ignored_cycles.append(block)
      block = []
    else:
      block += l
  if len(block) > 0:
    ignored_cycles.append(block)

gc_bases = (
  'cppgc::GarbageCollected',
  'cppgc::GarbageCollectedMixin',
)
ref_bases = (
  'WTF::RefCounted',
  'WTF::ThreadSafeRefCounted',
)
gcref_bases = (
  'blink::RefCountedGarbageCollected',
  'blink::ThreadSafeRefCountedGarbageCollected',
)
ref_mixins = (
  'blink::EventTarget',
  'blink::EventTargetWithInlineData',
  'blink::ActiveDOMObject',
)

def print_stats():
  gcref_managed = []
  ref_managed = []
  gc_managed = []
  hierarchies = []

  for node in graph.values():
    node.update_counts()
    for sup in node.super_edges():
      if sup.dst in gcref_bases:
        gcref_managed.append(node)
      elif sup.dst in ref_bases:
        ref_managed.append(node)
      elif sup.dst in gc_bases:
        gc_managed.append(node)

  groups = [("GC manged   ", gc_managed),
            ("ref counted ", ref_managed),
            ("in transition", gcref_managed)]
  total = sum([len(g) for (s,g) in groups])
  for (s, g) in groups:
    percent = len(g) * 100 / total
    print("%2d%% is %s (%d hierarchies)" % (percent, s, len(g)))

  for base in gcref_managed:
    stats = dict({ 'classes': 0, 'ref-mixins': 0 })
    for ptr in ptr_types: stats[ptr] = 0
    hierarchy_stats(base, stats)
    hierarchies.append((base, stats))

  print("\nHierarchies in transition (RefCountedGarbageCollected):")
  hierarchies.sort(key=lambda n: -n[1]['classes'])
  for (node, stats) in hierarchies:
    total = stats['mem'] + stats['ref'] + stats['raw']
    print(
        ("%s %3d%% of %-30s: %3d cls, %3d mem, %3d ref, %3d raw, %3d ref-mixins"
         % (
             stats['ref'] == 0 and stats['ref-mixins'] == 0 and "*" or " ",
             total == 0 and 100 or stats['mem'] * 100 / total,
             node.name.replace('blink::', '').replace(
                 'cppgc::subtle::', '').replace('cppgc::', ''),
             stats['classes'],
             stats['mem'],
             stats['ref'],
             stats['raw'],
             stats['ref-mixins'],
         )))


def hierarchy_stats(node, stats):
  if not node: return
  stats['classes'] += 1
  add_counts(stats, node.counts)
  for edge in node.super_edges():
    if edge.dst in ref_mixins:
      stats['ref-mixins'] += 1
  for edge in node.subclass_edges():
    hierarchy_stats(graph.get(edge.dst), stats)

def main():
  global args
  args = parser.parse_args()
  if not (args.detect_cycles or args.print_stats):
    print("Please select an operation to perform (eg, -c to detect cycles)")
    parser.print_help()
    return 1
  if args.pickle_graph and os.path.isfile(args.pickle_graph):
    load_graph()
  else:
    if args.use_stdin:
      log("Reading files from stdin")
      for f in sys.stdin:
        build_graph(f.strip())
    else:
      log("Reading files and directories from command line")
      if len(args.files) == 0:
        print("Please provide files or directores for building the graph")
        parser.print_help()
        return 1
      for f in args.files:
        if os.path.isdir(f):
          log("Building graph from files in directory: " + f)
          build_graphs_in_dir(f)
        else:
          log("Building graph from file: " + f)
          build_graph(f)
    log("Completing graph construction (%d graph nodes)" % len(graph))
    complete_graph()
    if args.pickle_graph:
      save_graph()
  if args.detect_cycles:
    read_ignored_cycles()
    log("Detecting cycles containg GC roots")
    detect_cycles()
  if args.print_stats:
    log("Printing statistics")
    print_stats()
  if reported_error():
    return 1
  return 0

if __name__ == '__main__':
  sys.exit(main())