llvm/libcxx/test/libcxx/transitive_includes_to_csv.py

#!/usr/bin/env python
# ===----------------------------------------------------------------------===##
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# ===----------------------------------------------------------------------===##

from dataclasses import dataclass
from typing import List  # Needed for python 3.8 compatibility.
import argparse
import pathlib
import re
import sys


@dataclass
class header:
    """
    One entry of --trace-includes output: a header path together with its nesting level.
    """
    # Path of the header, i.e. the text that follows the leading periods on a trace line.
    # Defaults to None when not provided.
    name: str = None
    # Number of leading periods on the trace line; 1 denotes a top-level header.
    # Defaults to -1 when not provided.
    level: int = -1


def parse_line(line: str) -> header:
    """
    Convert one line of --trace-includes output into a `header`.

    A valid line starts with one or more periods, followed by a single space and the
    path of the header. The script exits with an error message when the line does not
    have that shape.
    """
    parsed = re.match(r"(\.+) (.+)", line)
    if parsed is None:
        sys.exit(f"Line {line} contains invalid data.")

    # Every leading period stands for one level of include nesting, so the length
    # of the run of periods is the nesting level of the header.
    dots, path = parsed.groups()
    return header(path, len(dots))


# Matches the path of a header that lives under a `c++/vN` directory (N being
# one or more digits). Group 1 captures the part of the path that follows that
# directory.
#
# On Windows, the path separators can either be forward slash or backslash.
# If it is a backslash, Clang prints it escaped as two consecutive
# backslashes, and they need to be escaped in the RE. (Use a raw string for
# the pattern to avoid needing another level of escaping on the Python string
# literal level.)
LIBCXX_HEADER_REGEX = r".*c\+\+(?:/|\\\\)v[0-9]+(?:/|\\\\)(.+)"

def is_libcxx_header(header: str) -> bool:
    """
    Tell whether `header` is the path of a libc++ header that is not a C-compatibility header.

    A path qualifies when it is located in a c++/vN directory and is either a detail
    header (a path component starting with `__`) or does not end in `.h`.
    """
    # Anything outside of the c++/vN directory is not a libc++ header.
    located = re.match(LIBCXX_HEADER_REGEX, header)
    if located is None:
        return False

    relative = located.group(1)

    # Detail headers also end in `.h`, so they must be recognized before the
    # C-compatibility headers are filtered out by their extension.
    is_detail = (
        relative.startswith("__")
        or re.search(r"(/|\\\\)__", relative) is not None
    )
    return is_detail or not relative.endswith(".h")


def parse_file(file: pathlib.Path) -> List["header"]:
    """
    Parse a file containing --trace-include output to generate a list of the top-level C++ includes
    contained in it.

    This effectively generates the dependency graph of C++ Standard Library headers of the header
    whose --trace-include it is. In order to get the expected result of --trace-include, the
    -fshow-skipped-includes flag also needs to be passed.

    The returned `header` entries are in file order. They consist of every libc++ header at
    nesting level 1, followed by the public libc++ headers reached from there either directly
    or through detail headers only.
    """
    # Nesting level used while no public header is "in effect": every libc++ header
    # seen in that state is considered to be directly included.
    transparent = 999

    result = []
    # `level` is the nesting level of the last public header that was kept; deeper lines
    # are its transitive includes. It starts out transparent so that a libc++ header
    # appearing before any top-level libc++ header (e.g. nested under a non-libc++ file
    # passed with `-include`) is handled instead of hitting an unbound variable.
    level = transparent
    with file.open(encoding="utf-8") as f:
        for line in f:
            entry = parse_line(line)

            # Skip non-libc++ headers
            if not is_libcxx_header(entry.name):
                continue

            # Include top-level headers in the output. There's usually exactly one,
            # except if the compiler is passed a file with `-include`. Top-level
            # headers are transparent, in the sense that we want to go look at
            # transitive includes underneath.
            if entry.level == 1:
                level = transparent
                result.append(entry)
                continue

            # Skip libc++ headers included transitively.
            if entry.level > level:
                continue

            # Detail headers are transparent too: we attribute all includes of public libc++
            # headers under a detail header to the last public libc++ header that included it.
            if entry.name.startswith("__") or re.search(r"(/|\\\\)__", entry.name):
                level = transparent
                continue

            # Add the non-detail libc++ header to the list.
            level = entry.level
            result.append(entry)
    return result


def create_include_graph(trace_includes: List[pathlib.Path]) -> List[List[str]]:
    """
    Build the include graph described by a collection of --trace-includes output files.

    Each file is parsed with `parse_file`. For every file that yields at least one include,
    the result contains one row of the form `[top_level, include, ...]`, where all names are
    relative to the c++/vN directory and the includes are de-duplicated and sorted. Files
    whose top-level header has no includes do not contribute a row.

    The script exits with an error message when a file contains no top-level libc++ header.
    """

    def relative(path: str) -> str:
        # Get the filename relative to libc++'s installation directory instead of the full path.
        return re.match(LIBCXX_HEADER_REGEX, path).group(1)

    result = []
    for file in trace_includes:
        headers = parse_file(file)

        # There should be only one top-level header; the first one names the row.
        top_level_path = next((h.name for h in headers if h.level == 1), None)
        if top_level_path is None:
            # Report the offending file instead of failing with a bare StopIteration.
            sys.exit(f"File {file} does not contain any top-level libc++ header.")
        top_level = relative(top_level_path)

        # Remove duplicates in all includes, and sort them so that the row does not
        # depend on the iteration order of a set.
        includes = sorted({relative(h.name) for h in headers if h.level != 1})

        if includes:
            result.append([top_level] + includes)
    return result


def print_csv(graph: List[str]) -> None:
    """
    Print the include graph to standard output, one `<header> <include>` pair per line.

    Every entry of `graph` is a sequence whose first element is a header and whose
    remaining elements are the headers it includes. The includes of an entry are printed
    in sorted order. The script exits with an error as soon as a header is found among
    its own includes.
    """
    for row in graph:
        source = row[0]
        dependencies = sorted(row[1:])
        for dependency in dependencies:
            # A header listed among its own includes means the graph has a cycle.
            if source == dependency:
                sys.exit(f"Cycle detected: header {source} includes itself.")
            print(f"{source} {dependency}")


if __name__ == "__main__":
    # Command-line entry point: read the given --trace-includes files and print
    # the resulting include graph on standard output.
    description = (
        "Produce a dependency graph of libc++ headers, in CSV format.\n"
        "This script is normally executed by libcxx/test/libcxx/transitive_includes.gen.py"
    )
    cli = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=description,
    )
    cli.add_argument(
        "inputs",
        nargs="+",
        metavar="FILE",
        default=None,
        help="One or more files containing the result of --trace-includes on the headers one wishes to graph.",
    )
    arguments = cli.parse_args()

    trace_files = map(pathlib.Path, arguments.inputs)
    print_csv(create_include_graph(trace_files))