llvm/clang/tools/scan-build-py/lib/libscanbuild/compilation.py

# -*- coding: utf-8 -*-
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
""" This module is responsible for to parse a compiler invocation. """

import re
import os
import collections

# Public names of this module (what a star-import exposes).
__all__ = ["split_command", "classify_source", "compiler_language"]

# Compiler options which are dropped when a compilation database entry is
# created. `split_command` consults this map while it filters and classifies
# the parameters. (This is not the complete list of dropped parameters,
# `split_command` filters a few more on its own.)
#
# Each key is an option name; the value is the number of arguments following
# the option which shall be dropped together with it.
IGNORED_FLAGS = {}
# The "compile only" flag: whoever creates the compilation database sets it
# explicitly, so it is not kept.
IGNORED_FLAGS.update(dict.fromkeys(["-c"], 0))
# Dependency generation flags of the preprocessor. Keeping them would produce
# duplicate entries in the output (entries which differ in these flags only).
# Users reported longer execution time because of such duplicates.
IGNORED_FLAGS.update(dict.fromkeys(["-MD", "-MMD", "-MG", "-MP"], 0))
IGNORED_FLAGS.update(dict.fromkeys(["-MF", "-MT", "-MQ"], 1))
# Linker options: the compilation database contains compilation commands
# only, so the compiler would not use them anyway. Dropping them makes the
# output more readable.
IGNORED_FLAGS.update(dict.fromkeys(["-static", "-shared", "-s", "-rdynamic"], 0))
IGNORED_FLAGS.update(dict.fromkeys(["-l", "-L", "-u", "-z", "-T", "-Xlinker"], 1))

# Patterns for the executable names of the known C/C++ compilers. They are
# matched against the base name of the program.
COMPILER_PATTERNS = frozenset(
    re.compile(expression)
    for expression in (
        # cc, c++ and their intercept-/analyze- prefixed variants
        r"^(intercept-|analyze-|)c(c|\+\+)$",
        # gcc, g++, mcc, m++ with optional dash separated prefixes and an
        # optional version suffix
        r"^([^-]*-)*[mg](cc|\+\+)(-\d+(\.\d+){0,2})?$",
        # clang, clang++ with optional dash separated prefixes and an
        # optional version suffix
        r"^([^-]*-)*clang(\+\+)?(-\d+(\.\d+){0,2})?$",
        # llvm-gcc, llvm-g++
        r"^llvm-g(cc|\+\+)$",
    )
)


# Result type of `split_command`. Created once, not on every call.
_Compilation = collections.namedtuple("Compilation", ["compiler", "flags", "files"])


def split_command(command):
    """Returns a value when the command is a compilation, None otherwise.

    The value on success is a named tuple with the following attributes:

        files:    list of source files
        flags:    list of compile options
        compiler: string value of 'c' or 'c++'

    None is returned when the program is not a recognised C/C++ compiler,
    when the command requests a pass which is not a compilation (like
    preprocessing only), or when no source file is found among the arguments.

    A command which ends right after an option that expects a value (like a
    trailing '-MF' or '-D') does not raise: the missing value is treated as
    absent."""

    compiler = compiler_language(command)
    # quit right now, if the program was not a C/C++ compiler
    if not compiler:
        return None

    flags = []
    files = []
    # iterate on the compile options; an explicit iterator is used because
    # some options consume the arguments which follow them
    args = iter(command[1:])
    for arg in args:
        # quit when compilation pass is not involved
        if arg in {"-E", "-S", "-cc1", "-M", "-MM", "-###"}:
            return None
        # ignore some flags, together with the values which belong to them
        elif arg in IGNORED_FLAGS:
            for _ in range(IGNORED_FLAGS[arg]):
                # the default keeps a truncated command from raising
                # StopIteration out of this function
                next(args, None)
        # linker flags written together with their value are ignored too
        elif re.match(r"^-(l|L|Wl,).+", arg):
            pass
        # some parameters could look like filename, take as compile option
        elif arg in {"-D", "-I"}:
            flags.append(arg)
            value = next(args, None)
            if value is not None:
                flags.append(value)
        # parameter which looks source file is taken...
        elif re.match(r"^[^-].+", arg) and classify_source(arg):
            files.append(arg)
        # and consider everything else as compile option.
        else:
            flags.append(arg)

    # do extra check on number of source files
    if not files:
        return None
    return _Compilation(compiler=compiler, flags=flags, files=files)


def classify_source(filename, c_compiler=True):
    """Return the language which belongs to the file name extension.

    Only the extension of the base name is considered, and the comparison is
    case sensitive. The '.c' and '.i' extensions are classified as C inputs
    when `c_compiler` is true, and as C++ inputs otherwise. None is returned
    for an unknown extension."""

    extension = os.path.splitext(os.path.basename(filename))[1]

    # these two extensions depend on the kind of the compiler
    if extension == ".c":
        return "c" if c_compiler else "c++"
    if extension == ".i":
        return "c-cpp-output" if c_compiler else "c++-cpp-output"

    # the rest has a fixed meaning
    fixed_languages = {
        ".ii": "c++-cpp-output",
        ".m": "objective-c",
        ".mi": "objective-c-cpp-output",
        ".mm": "objective-c++",
        ".mii": "objective-c++-cpp-output",
        ".C": "c++",
        ".cc": "c++",
        ".CC": "c++",
        ".cp": "c++",
        ".cpp": "c++",
        ".cxx": "c++",
        ".c++": "c++",
        ".C++": "c++",
        ".txx": "c++",
    }
    return fixed_languages.get(extension)


def compiler_language(command):
    """Decide whether the command is a call of a known compiler.

    Only the base name of the first element of the command is examined.
    Returns 'c++' when the name is a known compiler and it has a '++' at its
    end or in front of a dash separated suffix, 'c' for the other known
    compilers. Returns None for an empty command or an unknown program."""

    if not command:
        return None

    program = os.path.basename(command[0])
    for known in COMPILER_PATTERNS:
        if known.match(program):
            break
    else:
        # none of the known compiler names matched
        return None

    looks_cplusplus = re.match(r"^(.+)(\+\+)(-.+|)$", program) is not None
    return "c++" if looks_cplusplus else "c"