llvm/utils/demangle_tree.py

# Given a path to llvm-objdump and a directory tree, spider the directory tree
# dumping every object file encountered with the options needed to demangle
# symbols in the object file, and collect statistics about failed / crashed
# demanglings.  Useful for stress testing the demangler against a large corpus
# of inputs.

from __future__ import print_function

import argparse
import functools
import multiprocessing
import os
import subprocess
import sys
import traceback
from multiprocessing import Pool

args = None


def parse_line(line):
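    """Split one line of llvm-objdump output into (mangled, demangled).

    Lines of interest look like '?<mangled>(<demangled>)': MSVC-mangled
    names begin with '?', and the demangled form follows in parentheses.
    Returns (None, None) for lines that don't match this shape."""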
    question = line.find("?")
    if question == -1:
        return None, None

    open_paren = line.find("(", question)
    if open_paren == -1:
        return None, None
    close_paren = line.rfind(")", open_paren)
    if close_paren == -1:
        return None, None
    mangled = line[question:open_paren]
    demangled = line[open_paren + 1 : close_paren]
    return mangled.strip(), demangled.strip()


class Result(object):
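    """Statistics gathered from one object file, or accumulated across many.

    crashed:  paths of files on which llvm-objdump crashed
    file:     the file (or directory) this result describes
    nsymbols: number of mangled symbols seen
    errors:   set of mangled names the demangler rejected
    nfiles:   number of object files accumulated into this result
    """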
    def __init__(self):
        self.crashed = []
        self.file = None
        self.nsymbols = 0
        self.errors = set()
        self.nfiles = 0


class MapContext(object):
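    """Bookkeeping carried across chunks of the parallel map.

    rincomplete:  partial Result for a directory whose files straddle chunks
    rcumulative:  running totals across all completed directories
    pending_objs: list of (directory, [object file paths]) not yet processed
    npending:     total number of object files in pending_objs
    """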
    def __init__(self):
        self.rincomplete = None
        self.rcumulative = Result()
        self.pending_objs = []
        self.npending = 0


def process_file(path, objdump):
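    """Run llvm-objdump on one object file and scan the demangled symbol
    table, recording every mangled name the demangler rejected.  A nonzero
    exit code from llvm-objdump is recorded as a crash on this file."""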
    r = Result()
    r.file = path

    popen_args = [objdump, "-t", "--demangle", path]
    p = subprocess.Popen(popen_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = p.communicate()
    if p.returncode != 0:
        r.crashed = [r.file]
        return r

    # Symbol names in arbitrary object files need not be valid UTF-8.
    output = stdout.decode("utf-8", errors="replace")

    for line in output.splitlines():
        mangled, demangled = parse_line(line)
        if mangled is None:
            continue
        r.nsymbols += 1
        if "invalid mangled name" in demangled:
            r.errors.add(mangled)
    return r


def add_results(r1, r2):
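    """Accumulate the statistics from r2 into r1."""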
    r1.crashed.extend(r2.crashed)
    r1.errors.update(r2.errors)
    r1.nsymbols += r2.nsymbols
    r1.nfiles += r2.nfiles


def print_result_row(directory, result):
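    """Print a one-line summary of the results for a single directory."""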
    print(
        "[{0} files, {1} crashes, {2} errors, {3} symbols]: '{4}'".format(
            result.nfiles,
            len(result.crashed),
            len(result.errors),
            result.nsymbols,
            directory,
        )
    )


def process_one_chunk(pool, chunk_size, objdump, context):
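    """Demangle up to chunk_size object files in parallel.

    Files are taken from the front of context.pending_objs, per-file results
    are folded into per-directory results, and each directory that finishes
    within this chunk is printed and added to the cumulative totals.  A
    directory whose files straddle the chunk boundary is carried over in
    context.rincomplete."""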
    objs = []

    incomplete = False
    dir_results = {}
    ordered_dirs = []
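    # Fill the chunk from the front of the pending list, creating (or
    # resuming, via rincomplete) a per-directory Result as we go.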
    while context.npending > 0 and len(objs) < chunk_size:
        this_dir = context.pending_objs[0][0]
        ordered_dirs.append(this_dir)
        dir_result = Result()
        if context.rincomplete is not None:
            dir_result = context.rincomplete
            context.rincomplete = None

        dir_results[this_dir] = dir_result
        dir_result.file = this_dir

        nneeded = chunk_size - len(objs)
        objs_this_dir = context.pending_objs[0][1]
        navail = len(objs_this_dir)
        ntaken = min(nneeded, navail)
        objs.extend(objs_this_dir[0:ntaken])
        remaining_objs_this_dir = objs_this_dir[ntaken:]
        context.pending_objs[0] = (context.pending_objs[0][0], remaining_objs_this_dir)
        context.npending -= ntaken
        if ntaken == navail:
            context.pending_objs.pop(0)
        else:
            incomplete = True

        dir_result.nfiles += ntaken

    assert len(objs) == chunk_size or context.npending == 0

    copier = functools.partial(process_file, objdump=objdump)
    mapped_results = list(pool.map(copier, objs))

    for mr in mapped_results:
        result_dir = os.path.dirname(mr.file)
        result_entry = dir_results[result_dir]
        add_results(result_entry, mr)

    # It's only possible that a single item is incomplete, and it has to be the
    # last item.
    if incomplete:
        context.rincomplete = dir_results[ordered_dirs[-1]]
        ordered_dirs.pop()

    # Now ordered_dirs contains a list of all directories which *did* complete.
    for c in ordered_dirs:
        dir_result = dir_results[c]
        add_results(context.rcumulative, dir_result)
        print_result_row(c, dir_result)


def process_pending_files(pool, chunk_size, objdump, context):
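    """Process full chunks until fewer than chunk_size files remain pending."""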
    while context.npending >= chunk_size:
        process_one_chunk(pool, chunk_size, objdump, context)


def go():
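    """Walk the directory tree rooted at args.dir, demangling every object
    file in parallel, then print per-directory rows and a final summary."""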
    global args

    obj_dir = args.dir
    extensions = args.extensions.split(",")
    extensions = [x if x.startswith(".") else "." + x for x in extensions]

    pool_size = 48
    pool = Pool(processes=pool_size)

    try:
        nfiles = 0
        context = MapContext()

        for root, dirs, files in os.walk(obj_dir):
            root = os.path.normpath(root)
            pending = []
            for f in files:
                _, ext = os.path.splitext(f)
                if ext not in extensions:
                    continue

                nfiles += 1
                full_path = os.path.join(root, f)
                full_path = os.path.normpath(full_path)
                pending.append(full_path)

            # If this directory had no object files, just print a default
            # status line and continue with the next dir
            if len(pending) == 0:
                print_result_row(root, Result())
                continue

            context.npending += len(pending)
            context.pending_objs.append((root, pending))
            # Drain the tasks, `pool_size` at a time, until we have less than
            # `pool_size` tasks remaining.
            process_pending_files(pool, pool_size, args.objdump, context)

        assert context.npending < pool_size
        process_one_chunk(pool, pool_size, args.objdump, context)

        total = context.rcumulative
        nfailed = len(total.errors)
        nsuccess = total.nsymbols - nfailed
        ncrashed = len(total.crashed)

        if nfailed > 0:
            print("Failures:")
            for m in sorted(total.errors):
                print("  " + m)
        if ncrashed > 0:
            print("Crashes:")
            for f in sorted(total.crashed):
                print("  " + f)
        print("Summary:")
        spct = float(nsuccess) / float(total.nsymbols) if total.nsymbols else 0.0
        fpct = float(nfailed) / float(total.nsymbols) if total.nsymbols else 0.0
        cpct = float(ncrashed) / float(nfiles) if nfiles else 0.0
        print("Processed {0} object files.".format(nfiles))
        print(
            "{0}/{1} symbols successfully demangled ({2:.4%})".format(
                nsuccess, total.nsymbols, spct
            )
        )
        print("{0} symbols could not be demangled ({1:.4%})".format(nfailed, fpct))
        print("{0} files crashed while demangling ({1:.4%})".format(ncrashed, cpct))

    except Exception:
        traceback.print_exc()
    finally:
        pool.close()
        pool.join()


if __name__ == "__main__":
    multiprocessing.freeze_support()

    def_obj = "obj" if sys.platform == "win32" else "o"

    parser = argparse.ArgumentParser(
        description="Demangle all symbols in a tree of object files, looking for failures."
    )
    parser.add_argument(
        "dir", type=str, help="the root directory at which to start crawling"
    )
    parser.add_argument(
        "--objdump",
        type=str,
        default="llvm-objdump",
        help="path to llvm-objdump.  If not specified "
        + "the tool is located as if by `which llvm-objdump`.",
    )
    parser.add_argument(
        "--extensions",
        type=str,
        default=def_obj,
        help="comma separated list of extensions to demangle (e.g. `o,obj`).  "
        + "By default this will be `obj` on Windows and `o` otherwise.",
    )

    args = parser.parse_args()

    go()