llvm/clang/tools/include-mapping/gen_std.py

#!/usr/bin/env python3
# ===- gen_std.py -  ------------------------------------------*- python -*--===#
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# ===------------------------------------------------------------------------===#

"""gen_std.py is a tool to generate a lookup table (from qualified names to
include headers) for C/C++ Standard Library symbols by parsing archived HTML
files from cppreference.

The generated files are located in clang/include/Tooling/Inclusions.

Caveats and FIXMEs:
  - only symbols in "std" and in the std sub-namespaces listed in main() (e.g.
    chrono, ranges) are added; std literal operators (e.g. those in
    chrono_literals) are not indexed.
  - symbols with multiple variants or defined in multiple headers aren't added,
    e.g. std::move, std::swap

Usage:
  1. Install the BeautifulSoup dependency, see instructions:
       https://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-beautiful-soup
  2. Download cppreference offline HTML files (html_book_20220730.zip in Unofficial Release) at
       https://en.cppreference.com/w/Cppreference:Archives
  3. Unzip the zip file from step 2 (e.g., to a "cppreference" directory). You should
     get a "cppreference/reference" directory.
  4. Run the command:
       // Generate C++ symbols
       python3 gen_std.py -cppreference cppreference/reference -symbols=cpp > StdSymbolMap.inc
       // Generate C symbols
       python3 gen_std.py -cppreference cppreference/reference -symbols=c > CSymbolMap.inc
"""


import cppreference_parser
import argparse
import datetime
import os
import sys
import re


# Banner printed at the top of the generated output. main() fills the two %s
# placeholders with, in order, the upper-cased -symbols value and the
# modification date (YYYY-MM-DD) of the cppreference index page.
CODE_PREFIX = """\
//===-- gen_std.py generated file -------------------------------*- C++ -*-===//
//
// Used to build a lookup table (qualified names => include headers) for %s
// Standard Library symbols.
//
// This file was generated automatically by
// clang/tools/include-mapping/gen_std.py, DO NOT EDIT!
//
// Generated from cppreference offline HTML book (modified on %s).
//===----------------------------------------------------------------------===//
"""


def ParseArg():
    """Parse the command-line arguments of gen_std.py.

    Both options are mandatory:
      -cppreference PATH  path to the cppreference offline HTML directory.
      -symbols KIND       which symbol table to generate, "cpp" or "c".

    Returns:
      The argparse.Namespace holding the `cppreference` and `symbols` values.
      argparse terminates the process with a usage error when an option is
      missing or when -symbols is not one of the supported kinds.
    """
    parser = argparse.ArgumentParser(description="Generate StdGen file")
    # Neither option carries a `default=`: both are required, so a default
    # could never take effect.
    parser.add_argument(
        "-cppreference",
        metavar="PATH",
        help="path to the cppreference offline HTML directory",
        required=True,
    )
    parser.add_argument(
        "-symbols",
        # Restrict the value to the kinds main() has a branch for, so that a
        # typo is reported as a usage error instead of failing later.
        choices=("cpp", "c"),
        help="Generate c or cpp symbols. One of {cpp, c}.",
        required=True,
    )
    return parser.parse_args()


def AdditionalHeadersForIOSymbols(symbol):
    """Return the extra headers that also provide an IO-related symbol.

    Args:
      symbol: an object with a `name` attribute and a `headers` list that
        holds exactly one header (checked with an assert).

    Returns:
      A new list containing "<iostream>" when the symbol's header is one of
      <ios>, <istream>, <ostream> or <streambuf>, followed by "<iosfwd>" when
      the symbol's name is declared in <iosfwd>. The list is empty when
      neither applies. "<iostream>" comes first because it is preferred over
      "<iosfwd>".
    """
    assert len(symbol.headers) == 1

    # Names declared in the <iosfwd> header, per C++ [iosfwd.syn 31.3.1].
    # Every stem below appears three times in that synopsis: as the class
    # template "basic_<stem>" and as the char / wchar_t typedefs "<stem>" and
    # "w<stem>".
    stems = (
        "ios",
        "streambuf",
        "istream",
        "ostream",
        "iostream",
        "stringbuf",
        "istringstream",
        "ostringstream",
        "stringstream",
        "spanbuf",
        "ispanstream",
        "ospanstream",
        "spanstream",
        "filebuf",
        "ifstream",
        "ofstream",
        "fstream",
        "syncbuf",
        "osyncstream",
    )
    # Names that do not follow the basic_/plain/w pattern.
    declared_in_iosfwd = {
        "istreambuf_iterator",
        "ostreambuf_iterator",
        "fpos",
        "streampos",
        "wstreampos",
        "u8streampos",
        "u16streampos",
        "u32streampos",
    }
    for stem in stems:
        declared_in_iosfwd.update(("basic_" + stem, stem, "w" + stem))

    extra_headers = []
    # <iostream> is an alternative of <streambuf>, <istream>, <ostream>, <ios>,
    # per C++ [iostream.syn 31.4.1].
    if symbol.headers[0] in ("<ios>", "<istream>", "<ostream>", "<streambuf>"):
        extra_headers.append("<iostream>")
    if symbol.name in declared_in_iosfwd:
        extra_headers.append("<iosfwd>")
    return extra_headers


def GetCCompatibilitySymbols(symbol):
    """Return the global-namespace entries implied by a <cname> symbol.

    Args:
      symbol: an object with `name`, `namespace` and a `headers` list that
        holds exactly one header (checked with an assert).

    Returns:
      An empty list when the symbol's header is not a C++ form of a C
      standard header, or when the symbol is one of the names the standard
      excludes from the global namespace. Otherwise a list of
      cppreference_parser.Symbol objects built with namespace None: one for
      the <cname> header (omitted when `symbol` itself already has no
      namespace, to avoid a duplicate) and one for the matching <name.h>
      header.
    """
    # C++ form of the C standard headers.
    c_compat_headers = {
        "<cassert>",
        "<cctype>",
        "<cerrno>",
        "<cfenv>",
        "<cfloat>",
        "<cinttypes>",
        "<climits>",
        "<clocale>",
        "<cmath>",
        "<csetjmp>",
        "<csignal>",
        "<cstdarg>",
        "<cstddef>",
        "<cstdint>",
        "<cstdio>",
        "<cstdlib>",
        "<cstring>",
        "<ctime>",
        "<cuchar>",
        "<cwchar>",
        "<cwctype>",
    }
    # C++ [support.c.headers.other] 17.14.7
    #    ..., behaves as if each name placed in the standard library namespace by
    #    the corresponding <cname> header is placed within the global namespace
    #    scope, except for the functions described in [sf.cmath], the
    #    std::lerp function overloads ([c.math.lerp]), the declaration of
    #    std::byte ([cstddef.syn]), and the functions and function templates
    #    described in [support.types.byteops].
    #
    # Each pattern must match the whole symbol name. "[fl]?" covers the float
    # and long double variants. sph_bessel has no _i/_j/_k suffix (unlike
    # cyl_bessel_*), so it needs its own pattern.
    exception_symbols = (
        r"(assoc_)?laguerre[fl]?",
        r"(assoc_|sph_)?legendre[fl]?",
        r"beta[fl]?",
        r"(comp_)?ellint_[1-3][fl]?",
        r"cyl_bessel_[ijk][fl]?",
        r"sph_bessel[fl]?",
        r"(cyl|sph)_neumann[fl]?",
        r"expint[fl]?",
        r"hermite[fl]?",
        r"riemann_zeta[fl]?",
        r"lerp",
        r"byte",
    )
    assert len(symbol.headers) == 1
    header = symbol.headers[0]
    if header not in c_compat_headers:
        return []
    if any(re.fullmatch(pattern, symbol.name) for pattern in exception_symbols):
        return []

    # Introduce two more entries, both in the global namespace, one using the
    # C++-compat header and another using the C header.
    results = []
    if symbol.namespace is not None:
        # avoid printing duplicated entries, for C macros!
        results.append(cppreference_parser.Symbol(symbol.name, None, [header]))
    c_header = "<" + header[2:-1] + ".h>"  # <cstdio> => <stdio.h>
    results.append(cppreference_parser.Symbol(symbol.name, None, [c_header]))
    return results


def main():
    """Generate the symbol-to-header table and print it to stdout.

    Selects the cppreference index pages for the requested -symbols kind,
    parses them with cppreference_parser.GetSymbols, prints the CODE_PREFIX
    banner and then one "SYMBOL(name, namespace, header)" line per header of
    every symbol that has exactly one header (plus its C-compatibility and
    IO-header additions). Symbols with no header or with several headers are
    reported on stderr and skipped.

    Exits with an error message when the -symbols value is unsupported or the
    symbol index directory does not exist.
    """
    args = ParseArg()
    if args.symbols == "cpp":
        page_root = os.path.join(args.cppreference, "en", "cpp")
        symbol_index_root = os.path.join(page_root, "symbol_index")
        parse_pages = [
            (page_root, "symbol_index.html", "std::"),
            # std sub-namespace symbols have separated pages.
            # We don't index std literal operators (e.g.
            # std::literals::chrono_literals::operator""d), these symbols can't be
            # accessed by std::<symbol_name>.
            #
            # std::placeholders symbols are handled manually in StdSpecialSymbolMap.inc
            (symbol_index_root, "chrono.html", "std::chrono::"),
            (symbol_index_root, "execution.html", "std::execution::"),
            (symbol_index_root, "numbers.html", "std::numbers::"),
            (symbol_index_root, "filesystem.html", "std::filesystem::"),
            (symbol_index_root, "pmr.html", "std::pmr::"),
            (symbol_index_root, "ranges.html", "std::ranges::"),
            (symbol_index_root, "views.html", "std::ranges::views::"),
            # std::ranges::views can be accessed as std::views.
            (symbol_index_root, "views.html", "std::views::"),
            (symbol_index_root, "regex_constants.html", "std::regex_constants::"),
            (symbol_index_root, "this_thread.html", "std::this_thread::"),
            # Zombie symbols that were available from the Standard Library, but are
            # removed in the following standards.
            (symbol_index_root, "zombie_names.html", "std::"),
            (symbol_index_root, "macro.html", None),
        ]
    elif args.symbols == "c":
        page_root = os.path.join(args.cppreference, "en", "c")
        symbol_index_root = page_root
        parse_pages = [(page_root, "index.html", None)]
    else:
        # Without this branch an unexpected value would leave page_root,
        # symbol_index_root and parse_pages unbound and the script would die
        # with an UnboundLocalError below.
        sys.exit(
            "Unsupported -symbols value %r, expected 'cpp' or 'c'." % args.symbols
        )

    if not os.path.exists(symbol_index_root):
        # sys.exit rather than the exit() builtin, which is only installed by
        # the site module.
        sys.exit("Path %s doesn't exist!" % symbol_index_root)

    symbols = cppreference_parser.GetSymbols(parse_pages)

    # We don't have version information from the unzipped offline HTML files,
    # so we use the modified time of the index.html page as the version.
    index_page_path = os.path.join(page_root, "index.html")
    cppreference_modified_date = datetime.datetime.fromtimestamp(
        os.stat(index_page_path).st_mtime
    ).strftime("%Y-%m-%d")
    print(CODE_PREFIX % (args.symbols.upper(), cppreference_modified_date))
    for symbol in symbols:
        if len(symbol.headers) == 1:
            augmented_symbols = [symbol]
            augmented_symbols.extend(GetCCompatibilitySymbols(symbol))
            for s in augmented_symbols:
                # Each augmented symbol still has a single header here; append
                # the alternative IO headers before printing.
                s.headers.extend(AdditionalHeadersForIOSymbols(s))
                for header in s.headers:
                    # SYMBOL(unqualified_name, namespace, header)
                    print("SYMBOL(%s, %s, %s)" % (s.name, s.namespace, header))
        elif len(symbol.headers) == 0:
            sys.stderr.write("No header found for symbol %s\n" % symbol.name)
        else:
            # FIXME: support symbols with multiple headers (e.g. std::move).
            sys.stderr.write(
                "Ambiguous header for symbol %s: %s\n"
                % (symbol.name, ", ".join(symbol.headers))
            )


# Run the generator only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()