llvm/clang/utils/creduce-clang-crash.py

#!/usr/bin/env python3
"""Calls C-Reduce to create a minimal reproducer for clang crashes.
Unknown arguments are treated as creduce options.

Output files:
  *.reduced.sh -- crash reproducer with minimal arguments
  *.reduced.cpp -- the reduced file
  *.test.sh -- interestingness test for C-Reduce
"""

from argparse import ArgumentParser, RawTextHelpFormatter
import os
import re
import shutil
import stat
import sys
import subprocess
import shlex
import tempfile
import shutil
import multiprocessing

# Module-level settings. They start out unset here and are assigned by main()
# from the parsed command line before any reduction work begins.

# When True, verbose_print() forwards its arguments to print().
verbose = False
# Path of the creduce executable as resolved by check_cmd().
creduce_cmd = None
# Path of the clang executable as resolved by check_cmd().
clang_cmd = None


def verbose_print(*args, **kwargs):
    """Forward all arguments to print(), but only when verbose mode is on."""
    if not verbose:
        return
    print(*args, **kwargs)


def check_file(fname):
    """Return the normalized form of fname, exiting if it is not a file.

    The path is normalized with os.path.normpath before it is checked, and the
    normalized path is what appears in the error message and the return value.
    """
    normalized = os.path.normpath(fname)
    if os.path.isfile(normalized):
        return normalized
    sys.exit("ERROR: %s does not exist" % (normalized))


def check_cmd(cmd_name, cmd_dir, cmd_path=None):
    """
    Resolve an executable and return its path, exiting if it cannot be found.

    When cmd_path is given it takes precedence and is made absolute first;
    otherwise cmd_name is looked up with shutil.which using cmd_dir as the
    search path.
    """
    if not cmd_path:
        found = shutil.which(cmd_name, path=cmd_dir)
        if found:
            return found
        # Only used for the error message: name the place that was searched.
        searched = cmd_dir if cmd_dir else "$PATH"
        sys.exit("ERROR: `%s` not found in %s" % (cmd_name, searched))

    # Make the path absolute so the creduce test can be run from any directory.
    absolute = os.path.abspath(cmd_path)
    found = shutil.which(absolute)
    if not found:
        sys.exit("ERROR: executable `%s` not found" % (absolute))
    return found


def quote_cmd(cmd):
    """Render an argument list as a single shell-quoted command string."""
    quoted_args = map(shlex.quote, cmd)
    return " ".join(quoted_args)


def write_to_script(text, filename):
    """Write text to filename and add the owner-execute permission bit."""
    with open(filename, "w") as script:
        script.write(text)
    # Keep the existing permission bits and only add S_IEXEC on top.
    current_mode = os.stat(filename).st_mode
    os.chmod(filename, current_mode | stat.S_IEXEC)


class Reduce(object):
    def __init__(self, crash_script, file_to_reduce, creduce_flags):
        """Set up output file names, copy the input file, and probe the crash.

        crash_script is the script holding the crashing clang invocation,
        file_to_reduce is the source file passed to it, and creduce_flags are
        extra options forwarded to creduce (after "--tidy").
        """
        script_stem, script_suffix = os.path.splitext(crash_script)
        source_stem, source_suffix = os.path.splitext(file_to_reduce)

        # Output names are derived from the inputs: the interestingness test
        # sits next to the source file, and both the script and the source get
        # a ".reduced" infix before their extension.
        self.testfile = "%s.test.sh" % source_stem
        self.crash_script = "".join([script_stem, ".reduced", script_suffix])
        self.file_to_reduce = "".join([source_stem, ".reduced", source_suffix])

        # All later steps operate on this copy rather than on the input file.
        shutil.copy(file_to_reduce, self.file_to_reduce)

        self.clang = clang_cmd
        self.clang_args = []
        self.expected_output = []
        self.needs_stack_trace = False
        self.creduce_flags = ["--tidy"] + creduce_flags

        # Fill in clang_args from the script, then run the crash once to learn
        # which output identifies it.
        self.read_clang_args(crash_script, file_to_reduce)
        self.read_expected_output()

    def get_crash_cmd(self, cmd=None, args=None, filename=None):
        """Return the crash command as a list: [cmd, *args, filename].

        Any parameter left unset falls back to the value stored on the
        instance (self.clang, self.clang_args, self.file_to_reduce).

        An explicitly passed empty argument list is honoured as "no
        arguments"; only args=None selects self.clang_args. This matters when
        argument reduction has removed every argument.
        """
        if not cmd:
            cmd = self.clang
        # Compare against None rather than truthiness: [] is a valid request.
        if args is None:
            args = self.clang_args
        if not filename:
            filename = self.file_to_reduce

        return [cmd] + list(args) + [filename]

    def read_clang_args(self, crash_script, filename):
        """Extract the clang arguments from the crash script.

        The first line of crash_script that is neither blank nor a "#" comment
        is taken to be the clang invocation and is split with shlex. The
        leading command word and the last argument equal to filename are
        dropped; what remains is stored in self.clang_args.

        Exits if the script contains no such line.
        """
        print("\nReading arguments from crash script...")
        cmd = []
        with open(crash_script) as f:
            # Assume clang call is the first non comment line.
            for line in f:
                stripped = line.strip()
                # Blank lines carry no command; treating one as the clang call
                # would abort with "Could not find command" even though a real
                # command follows.
                if not stripped or stripped.startswith("#"):
                    continue
                cmd = shlex.split(line)
                break
        if not cmd:
            sys.exit("Could not find command in the crash script.")

        # Remove clang and filename from the command
        # Assume the last occurrence of the filename is the clang input file
        del cmd[0]
        for i in reversed(range(len(cmd))):
            if cmd[i] == filename:
                del cmd[i]
                break
        self.clang_args = cmd
        verbose_print("Clang arguments:", quote_cmd(self.clang_args))

    def read_expected_output(self):
        """Run the crash command once and record what identifies the crash.

        The combined stdout/stderr of the command is searched for the first
        matching pattern from a list of known error messages. If none matches,
        up to five function names from the stack trace are used instead and
        self.needs_stack_trace is set to True.

        The result is stored in self.expected_output. Exits with status 1 when
        neither a message nor any stack trace function is found.
        """
        print("\nGetting expected crash output...")
        p = subprocess.Popen(
            self.get_crash_cmd(), stdout=subprocess.PIPE, stderr=subprocess.STDOUT
        )
        raw_output, _ = p.communicate()
        result = []

        # The output may contain bytes that are not valid UTF-8; replace them
        # instead of aborting with UnicodeDecodeError.
        crash_output = raw_output.decode("utf-8", errors="replace")

        # Remove color codes
        ansi_escape = r"\x1b\[[0-?]*m"
        crash_output = re.sub(ansi_escape, "", crash_output)

        # Look for specific error messages
        regexes = [
            r"Assertion .+ failed",  # Linux assert()
            r"Assertion failed: .+,",  # FreeBSD/Mac assert()
            r"fatal error: error in backend: .+",
            r"LLVM ERROR: .+",
            r"UNREACHABLE executed at .+?!",
            r"LLVM IR generation of declaration '.+'",
            r"Generating code for declaration '.+'",
            r"\*\*\* Bad machine code: .+ \*\*\*",
            r"ERROR: .*Sanitizer: [^ ]+ ",
        ]
        # The list is ordered: the first pattern that matches wins.
        for msg_re in regexes:
            match = re.search(msg_re, crash_output)
            if match:
                msg = match.group(0)
                result = [msg]
                print("Found message:", msg)
                break

        # If no message was found, use the top five stack trace functions,
        # ignoring some common functions
        # Five is a somewhat arbitrary number; the goal is to get a small number
        # of identifying functions with some leeway for common functions
        if not result:
            self.needs_stack_trace = True
            stacktrace_re = r"[0-9]+\s+0[xX][0-9a-fA-F]+\s*([^(]+)\("
            filters = [
                "PrintStackTrace",
                "RunSignalHandlers",
                "CleanupOnSignal",
                "HandleCrash",
                "SignalHandler",
                "__restore_rt",
                "gsignal",
                "abort",
            ]

            def skip_function(func_name):
                # Substring match against the common-function names above.
                return any(name in func_name for name in filters)

            matches = re.findall(stacktrace_re, crash_output)
            result = [x for x in matches if x and not skip_function(x)][:5]
            for msg in result:
                print("Found stack trace function:", msg)

        if not result:
            print("ERROR: no crash was found")
            print("The crash output was:\n========\n%s========" % crash_output)
            sys.exit(1)

        self.expected_output = result

    def check_expected_output(self, args=None, filename=None):
        """Return True if the command's output contains every expected message.

        Runs clang with args (default: self.clang_args) on filename (default:
        self.file_to_reduce), captures stdout and stderr together, and checks
        that each string in self.expected_output occurs in that output.

        An explicitly passed empty args list means "run with no arguments";
        only args=None selects self.clang_args.
        """
        if args is None:
            args = self.clang_args
        if not filename:
            filename = self.file_to_reduce

        # Assemble the command here so that an explicitly empty argument list
        # is passed through unchanged instead of being replaced by a default.
        cmd = [self.clang] + list(args) + [filename]
        p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        raw_output, _ = p.communicate()

        # Decode once, tolerating bytes that are not valid UTF-8.
        crash_output = raw_output.decode("utf-8", errors="replace")
        return all(msg in crash_output for msg in self.expected_output)

    def write_interestingness_test(self):
        """Generate the bash interestingness test for C-Reduce and validate it.

        The script fails (exit 1) if the crash command succeeds or if any
        string in self.expected_output is missing from the captured log. It is
        written to self.testfile and then checked with check_interestingness().
        """
        print("\nCreating the interestingness test...")

        # Disable symbolization if it's not required to avoid slow symbolization.
        if self.needs_stack_trace:
            symbolization_line = ""
        else:
            symbolization_line = "export LLVM_DISABLE_SYMBOLIZATION=1"

        script = """#!/bin/bash
%s
if %s >& t.log ; then
  exit 1
fi
""" % (
            symbolization_line,
            quote_cmd(self.get_crash_cmd()),
        )

        # One fixed-string grep per expected message; each must be present.
        script += "".join(
            "grep -F %s t.log || exit 1\n" % shlex.quote(msg)
            for msg in self.expected_output
        )

        write_to_script(script, self.testfile)
        self.check_interestingness()

    def check_interestingness(self):
        """Sanity-check the generated interestingness test.

        Exits if the test script fails on the current file, or if an empty
        file still produces the expected crash output.
        """
        script_path = os.path.abspath(self.testfile)

        # Check that the test considers the original file interesting
        if subprocess.call(script_path, stdout=subprocess.DEVNULL) != 0:
            sys.exit("The interestingness test does not pass for the original file.")

        # Check that an empty file is not interesting
        # Instead of modifying the filename in the test file, just run the command
        with tempfile.NamedTemporaryFile() as blank:
            empty_is_interesting = self.check_expected_output(filename=blank.name)
        if empty_is_interesting:
            sys.exit("The interestingness test passes for an empty file.")

    def clang_preprocess(self):
        print("\nTrying to preprocess the source file...")
        with tempfile.NamedTemporaryFile() as tmpfile:
            cmd_preprocess = self.get_crash_cmd() + ["-E", "-o", tmpfile.name]
            cmd_preprocess_no_lines = cmd_preprocess + ["-P"]
            try:
                subprocess.check_call(cmd_preprocess_no_lines)
                if self.check_expected_output(filename=tmpfile.name):
                    print("Successfully preprocessed with line markers removed")
                    shutil.copy(tmpfile.name, self.file_to_reduce)
                else:
                    subprocess.check_call(cmd_preprocess)
                    if self.check_expected_output(filename=tmpfile.name):
                        print("Successfully preprocessed without removing line markers")
                        shutil.copy(tmpfile.name, self.file_to_reduce)
                    else:
                        print(
                            "No longer crashes after preprocessing -- "
                            "using original source"
                        )
            except subprocess.CalledProcessError:
                print("Preprocessing failed")

    @staticmethod
    def filter_args(
        args, opts_equal=[], opts_startswith=[], opts_one_arg_startswith=[]
    ):
        result = []
        skip_next = False
        for arg in args:
            if skip_next:
                skip_next = False
                continue
            if any(arg == a for a in opts_equal):
                continue
            if any(arg.startswith(a) for a in opts_startswith):
                continue
            if any(arg.startswith(a) for a in opts_one_arg_startswith):
                skip_next = True
                continue
            result.append(arg)
        return result

    def try_remove_args(self, args, msg=None, extra_arg=None, **kwargs):
        """Try a filtered argument list and keep it only if the crash persists.

        args is filtered with filter_args(**kwargs). If extra_arg is given, an
        existing occurrence is removed from the filtered list and extra_arg is
        appended at the end. The candidate is returned when it differs from
        args and still yields the expected output; otherwise args is returned
        unchanged. msg, if given, is printed in verbose mode on success.
        """
        candidate = self.filter_args(args, **kwargs)

        if extra_arg:
            # Move (or add) the extra argument to the end of the list.
            if extra_arg in candidate:
                candidate.remove(extra_arg)
            candidate.append(extra_arg)

        # Nothing changed, or the change made the crash go away: keep args.
        if candidate == args or not self.check_expected_output(args=candidate):
            return args

        if msg:
            verbose_print(msg)
        return candidate

    def try_remove_arg_by_index(self, args, index):
        """Try dropping the argument at index, plus its apparent value.

        Returns (new_args, index) when the crash still reproduces without the
        argument, so the caller re-examines the same position; otherwise
        returns (args, index + 1) to move on to the next argument.
        """
        dropped = [args[index]]
        remaining = args[:index] + args[index + 1 :]

        # Heuristic for grouping arguments:
        # remove next argument if it doesn't start with "-"
        if index < len(remaining) and not remaining[index].startswith("-"):
            dropped.append(remaining.pop(index))

        if not self.check_expected_output(args=remaining):
            return args, index + 1

        verbose_print("Removed", " ".join(dropped))
        return remaining, index

    def simplify_clang_args(self):
        """Simplify clang arguments before running C-Reduce to reduce the time the
        interestingness test takes to run.

        Each simplification is attempted in turn through try_remove_args(), so
        it is kept only if the expected crash output is still produced. The
        result is stored in self.clang_args.
        """
        print("\nSimplifying the clang command...")

        # Keyword arguments for successive try_remove_args() calls. Order
        # matters: every attempt starts from the outcome of the previous one.
        attempts = [
            # Remove the color diagnostics flag to make it easier to match
            # error text.
            {
                "msg": "Removed -fcolor-diagnostics",
                "opts_equal": ["-fcolor-diagnostics"],
            },
            # Remove some clang arguments to speed up the interestingness test
            {
                "msg": "Removed debug info options",
                "opts_startswith": [
                    "-gcodeview",
                    "-debug-info-kind=",
                    "-debugger-tuning=",
                ],
            },
            {
                "msg": "Removed --show-includes",
                "opts_startswith": ["--show-includes"],
            },
            # Not suppressing warnings (-w) sometimes prevents the crash from
            # occurring after preprocessing
            {
                "msg": "Replaced -W options with -w",
                "extra_arg": "-w",
                "opts_startswith": ["-W"],
            },
            {
                "msg": "Replaced optimization level with -O0",
                "extra_arg": "-O0",
                "opts_startswith": ["-O"],
            },
            # Try to remove compilation steps
            {"msg": "Added -emit-llvm", "extra_arg": "-emit-llvm"},
            {"msg": "Added -fsyntax-only", "extra_arg": "-fsyntax-only"},
            # Try to make implicit int an error for more sensible test output
            {
                "msg": "Added -Werror=implicit-int",
                "opts_equal": ["-w"],
                "extra_arg": "-Werror=implicit-int",
            },
        ]

        simplified = self.clang_args
        for attempt in attempts:
            simplified = self.try_remove_args(simplified, **attempt)

        self.clang_args = simplified
        verbose_print("Simplified command:", quote_cmd(self.get_crash_cmd()))

    def reduce_clang_args(self):
        """Minimize the clang arguments after running C-Reduce, to get the smallest
        command that reproduces the crash on the reduced file.

        Whole groups of common options are tried first, then every remaining
        argument is tried one position at a time. The final command is stored
        in self.clang_args, written to self.crash_script and printed.
        """
        print("\nReducing the clang crash command...")

        # Bulk removals, attempted in this order via try_remove_args().
        bulk_removals = [
            # Remove some often occurring args
            {"msg": "Removed -D options", "opts_startswith": ["-D"]},
            {"msg": "Removed -D options", "opts_one_arg_startswith": ["-D"]},
            {"msg": "Removed -I options", "opts_startswith": ["-I"]},
            {"msg": "Removed -I options", "opts_one_arg_startswith": ["-I"]},
            {"msg": "Removed -W options", "opts_startswith": ["-W"]},
            # Remove other cases that aren't covered by the heuristic
            {"msg": "Removed -mllvm", "opts_one_arg_startswith": ["-mllvm"]},
        ]

        remaining = self.clang_args
        for removal in bulk_removals:
            remaining = self.try_remove_args(remaining, **removal)

        # Walk the list; the helper returns the position to look at next, which
        # stays the same after a successful removal.
        position = 0
        while position < len(remaining):
            remaining, position = self.try_remove_arg_by_index(remaining, position)

        self.clang_args = remaining

        reduced_cmd = quote_cmd(self.get_crash_cmd())
        write_to_script(reduced_cmd, self.crash_script)
        print("Reduced command:", reduced_cmd)

    def run_creduce(self):
        """Run C-Reduce on the file to reduce and wait for it to finish.

        The command is creduce_cmd followed by self.creduce_flags, the
        interestingness test and the file to reduce. On ctrl-c the creduce
        process is killed and reaped, and the method returns normally.
        """
        full_creduce_cmd = (
            [creduce_cmd] + self.creduce_flags + [self.testfile, self.file_to_reduce]
        )
        print("\nRunning C-Reduce...")
        verbose_print(quote_cmd(full_creduce_cmd))

        # Start the process before entering the try block so that `p` is
        # always bound when the KeyboardInterrupt handler runs.
        p = subprocess.Popen(full_creduce_cmd)
        try:
            p.communicate()
        except KeyboardInterrupt:
            # Hack to kill C-Reduce because it jumps into its own pgid
            print("\n\nctrl-c detected, killed creduce")
            p.kill()
            # Collect the exit status so the killed child is not left behind
            # as a zombie while the script continues.
            p.wait()


def main():
    """Parse the command line, resolve the tools, and run the reduction.

    Arguments not recognized by the parser are passed on to creduce. The
    module-level verbose, creduce_cmd and clang_cmd settings are assigned here.
    """
    global verbose
    global creduce_cmd
    global clang_cmd

    arg_parser = ArgumentParser(
        description=__doc__, formatter_class=RawTextHelpFormatter
    )
    arg_parser.add_argument(
        "crash_script",
        type=str,
        nargs=1,
        help="Name of the script that generates the crash.",
    )
    arg_parser.add_argument(
        "file_to_reduce",
        type=str,
        nargs=1,
        help="Name of the file to be reduced.",
    )
    arg_parser.add_argument(
        "--llvm-bin",
        dest="llvm_bin",
        type=str,
        help="Path to the LLVM bin directory.",
    )
    arg_parser.add_argument(
        "--clang",
        dest="clang",
        type=str,
        help="The path to the `clang` executable. "
        "By default uses the llvm-bin directory.",
    )
    arg_parser.add_argument(
        "--creduce",
        dest="creduce",
        type=str,
        help="The path to the `creduce` executable. "
        "Required if `creduce` is not in PATH environment.",
    )
    arg_parser.add_argument("-v", "--verbose", action="store_true")

    # Anything the parser does not know is collected as creduce options.
    opts, creduce_flags = arg_parser.parse_known_args()
    verbose = opts.verbose

    llvm_bin = None
    if opts.llvm_bin:
        llvm_bin = os.path.abspath(opts.llvm_bin)
    creduce_cmd = check_cmd("creduce", None, opts.creduce)
    clang_cmd = check_cmd("clang", llvm_bin, opts.clang)

    # nargs=1 makes each positional a one-element list.
    (script_arg,) = opts.crash_script
    (source_arg,) = opts.file_to_reduce
    crash_script = check_file(script_arg)
    file_to_reduce = check_file(source_arg)

    # Pick a job count for creduce unless one was given on the command line.
    if "--n" not in creduce_flags:
        jobs = max(4, multiprocessing.cpu_count() // 2)
        creduce_flags.extend(["--n", str(jobs)])

    reducer = Reduce(crash_script, file_to_reduce, creduce_flags)

    # The pipeline stages, in the order they must run.
    for stage in (
        reducer.simplify_clang_args,
        reducer.write_interestingness_test,
        reducer.clang_preprocess,
        reducer.run_creduce,
        reducer.reduce_clang_args,
    ):
        stage()

# Run the reducer only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()