cpython/Tools/c-analyzer/cpython/_parser.py

import os.path
import re

from c_parser.preprocessor import (
    get_preprocessor as _get_preprocessor,
)
from c_parser import (
    parse_file as _parse_file,
    parse_files as _parse_files,
)
from . import REPO_ROOT


GLOB_ALL = '**/*'


def _abs(relfile):
    return os.path.join(REPO_ROOT, relfile)


def clean_lines(text):
    """Clear out comments, blank lines, and leading/trailing whitespace."""
    lines = (line.strip() for line in text.splitlines())
    lines = (line.partition('#')[0].rstrip()
             for line in lines
             if line and not line.startswith('#'))
    glob_all = f'{GLOB_ALL} '
    lines = (re.sub(r'^[*] ', glob_all, line) for line in lines)
    lines = (_abs(line) for line in lines)
    return list(lines)


'''
@begin=sh@
./python ../c-parser/cpython.py
    --exclude '+../c-parser/EXCLUDED'
    --macros '+../c-parser/MACROS'
    --incldirs '+../c-parser/INCL_DIRS'
    --same './Include/cpython/'
    Include/*.h
    Include/internal/*.h
    Modules/**/*.c
    Objects/**/*.c
    Parser/**/*.c
    Python/**/*.c
@end=sh@
'''

# XXX Handle these.
# Tab separated:
EXCLUDED = clean_lines('''
# @begin=conf@

# OSX
Modules/_scproxy.c                # SystemConfiguration/SystemConfiguration.h

# Windows
Modules/_winapi.c               # windows.h
Modules/expat/winconfig.h
Modules/overlapped.c            # winsock.h
Python/dynload_win.c            # windows.h
Python/thread_nt.h

# other OS-dependent
Python/dynload_aix.c            # sys/ldr.h
Python/dynload_dl.c             # dl.h
Python/dynload_hpux.c           # dl.h
Python/emscripten_signal.c
Python/thread_pthread.h
Python/thread_pthread_stubs.h

# only huge constants (safe but parsing is slow)
Modules/_ssl_data_31.h
Modules/_ssl_data_300.h
Modules/_ssl_data_111.h
Modules/cjkcodecs/mappings_*.h
Modules/unicodedata_db.h
Modules/unicodename_db.h
Objects/unicodetype_db.h

# generated
Python/deepfreeze/*.c
Python/frozen_modules/*.h
Python/generated_cases.c.h
Python/executor_cases.c.h
Python/optimizer_cases.c.h

# not actually source
Python/bytecodes.c
Python/optimizer_bytecodes.c

# mimalloc
Objects/mimalloc/*.c
Include/internal/mimalloc/*.h
Include/internal/mimalloc/mimalloc/*.h

# @end=conf@
''')

# XXX Fix the parser.
EXCLUDED += clean_lines('''
# The tool should be able to parse these...

# The problem with xmlparse.c is that something
# has gone wrong where # we handle "maybe inline actual"
# in Tools/c-analyzer/c_parser/parser/_global.py.
Modules/expat/internal.h
Modules/expat/xmlparse.c
''')

INCL_DIRS = clean_lines('''
# @begin=tsv@

glob	dirname
*	.
*	./Include
*	./Include/internal
*   ./Include/internal/mimalloc

Modules/_decimal/**/*.c	Modules/_decimal/libmpdec
Modules/_elementtree.c	Modules/expat
Modules/_hacl/*.c	Modules/_hacl/include
Modules/_hacl/*.c	Modules/_hacl/
Modules/_hacl/*.h	Modules/_hacl/include
Modules/_hacl/*.h	Modules/_hacl/
Modules/md5module.c	Modules/_hacl/include
Modules/sha1module.c	Modules/_hacl/include
Modules/sha2module.c	Modules/_hacl/include
Modules/sha3module.c	Modules/_hacl/include
Modules/blake2module.c	Modules/_hacl/include
Objects/stringlib/*.h	Objects

# possible system-installed headers, just in case
Modules/_tkinter.c	/usr/include/tcl8.6
Modules/_uuidmodule.c	/usr/include/uuid
Modules/tkappinit.c	/usr/include/tcl

# @end=tsv@
''')[1:]

INCLUDES = clean_lines('''
# @begin=tsv@

glob	include

**/*.h	Python.h
Include/**/*.h	object.h

# for Py_HAVE_CONDVAR
Include/internal/pycore_gil.h	pycore_condvar.h
Python/thread_pthread.h	pycore_condvar.h

# other

Objects/stringlib/join.h	stringlib/stringdefs.h
Objects/stringlib/ctype.h	stringlib/stringdefs.h
Objects/stringlib/transmogrify.h	stringlib/stringdefs.h
#Objects/stringlib/fastsearch.h	stringlib/stringdefs.h
#Objects/stringlib/count.h	stringlib/stringdefs.h
#Objects/stringlib/find.h	stringlib/stringdefs.h
#Objects/stringlib/partition.h	stringlib/stringdefs.h
#Objects/stringlib/split.h	stringlib/stringdefs.h
Objects/stringlib/fastsearch.h	stringlib/ucs1lib.h
Objects/stringlib/count.h	stringlib/ucs1lib.h
Objects/stringlib/find.h	stringlib/ucs1lib.h
Objects/stringlib/partition.h	stringlib/ucs1lib.h
Objects/stringlib/split.h	stringlib/ucs1lib.h
Objects/stringlib/find_max_char.h	Objects/stringlib/ucs1lib.h
Objects/stringlib/count.h	Objects/stringlib/fastsearch.h
Objects/stringlib/find.h	Objects/stringlib/fastsearch.h
Objects/stringlib/partition.h	Objects/stringlib/fastsearch.h
Objects/stringlib/replace.h	Objects/stringlib/fastsearch.h
Objects/stringlib/repr.h	Objects/stringlib/fastsearch.h
Objects/stringlib/split.h	Objects/stringlib/fastsearch.h

# @end=tsv@
''')[1:]

MACROS = clean_lines('''
# @begin=tsv@

glob	name	value

Include/internal/*.h	Py_BUILD_CORE	1
Python/**/*.c	Py_BUILD_CORE	1
Python/**/*.h	Py_BUILD_CORE	1
Parser/**/*.c	Py_BUILD_CORE	1
Parser/**/*.h	Py_BUILD_CORE	1
Objects/**/*.c	Py_BUILD_CORE	1
Objects/**/*.h	Py_BUILD_CORE	1

Modules/_asynciomodule.c	Py_BUILD_CORE	1
Modules/_codecsmodule.c	Py_BUILD_CORE	1
Modules/_collectionsmodule.c	Py_BUILD_CORE	1
Modules/_ctypes/_ctypes.c	Py_BUILD_CORE	1
Modules/_ctypes/cfield.c	Py_BUILD_CORE	1
Modules/_cursesmodule.c	Py_BUILD_CORE	1
Modules/_datetimemodule.c	Py_BUILD_CORE	1
Modules/_functoolsmodule.c	Py_BUILD_CORE	1
Modules/_heapqmodule.c	Py_BUILD_CORE	1
Modules/_io/*.c	Py_BUILD_CORE	1
Modules/_io/*.h	Py_BUILD_CORE	1
Modules/_localemodule.c	Py_BUILD_CORE	1
Modules/_operator.c	Py_BUILD_CORE	1
Modules/_posixsubprocess.c	Py_BUILD_CORE	1
Modules/_sre/sre.c	Py_BUILD_CORE	1
Modules/_threadmodule.c	Py_BUILD_CORE	1
Modules/_tracemalloc.c	Py_BUILD_CORE	1
Modules/_weakref.c	Py_BUILD_CORE	1
Modules/_zoneinfo.c	Py_BUILD_CORE	1
Modules/atexitmodule.c	Py_BUILD_CORE	1
Modules/cmathmodule.c	Py_BUILD_CORE	1
Modules/faulthandler.c	Py_BUILD_CORE	1
Modules/gcmodule.c	Py_BUILD_CORE	1
Modules/getpath.c	Py_BUILD_CORE	1
Modules/getpath_noop.c	Py_BUILD_CORE	1
Modules/itertoolsmodule.c	Py_BUILD_CORE	1
Modules/main.c	Py_BUILD_CORE	1
Modules/mathmodule.c	Py_BUILD_CORE	1
Modules/posixmodule.c	Py_BUILD_CORE	1
Modules/sha256module.c	Py_BUILD_CORE	1
Modules/sha512module.c	Py_BUILD_CORE	1
Modules/signalmodule.c	Py_BUILD_CORE	1
Modules/symtablemodule.c	Py_BUILD_CORE	1
Modules/timemodule.c	Py_BUILD_CORE	1
Modules/unicodedata.c	Py_BUILD_CORE	1

Modules/_json.c	Py_BUILD_CORE_BUILTIN	1
Modules/_pickle.c	Py_BUILD_CORE_BUILTIN	1
Modules/_testinternalcapi.c	Py_BUILD_CORE_BUILTIN	1

Include/cpython/abstract.h	Py_CPYTHON_ABSTRACTOBJECT_H	1
Include/cpython/bytearrayobject.h	Py_CPYTHON_BYTEARRAYOBJECT_H	1
Include/cpython/bytesobject.h	Py_CPYTHON_BYTESOBJECT_H	1
Include/cpython/ceval.h	Py_CPYTHON_CEVAL_H	1
Include/cpython/code.h	Py_CPYTHON_CODE_H	1
Include/cpython/dictobject.h	Py_CPYTHON_DICTOBJECT_H	1
Include/cpython/fileobject.h	Py_CPYTHON_FILEOBJECT_H	1
Include/cpython/fileutils.h	Py_CPYTHON_FILEUTILS_H	1
Include/cpython/frameobject.h	Py_CPYTHON_FRAMEOBJECT_H	1
Include/cpython/import.h	Py_CPYTHON_IMPORT_H	1
Include/cpython/interpreteridobject.h	Py_CPYTHON_INTERPRETERIDOBJECT_H	1
Include/cpython/listobject.h	Py_CPYTHON_LISTOBJECT_H	1
Include/cpython/methodobject.h	Py_CPYTHON_METHODOBJECT_H	1
Include/cpython/object.h	Py_CPYTHON_OBJECT_H	1
Include/cpython/objimpl.h	Py_CPYTHON_OBJIMPL_H	1
Include/cpython/pyerrors.h	Py_CPYTHON_ERRORS_H	1
Include/cpython/pylifecycle.h	Py_CPYTHON_PYLIFECYCLE_H	1
Include/cpython/pymem.h	Py_CPYTHON_PYMEM_H	1
Include/cpython/pystate.h	Py_CPYTHON_PYSTATE_H	1
Include/cpython/sysmodule.h	Py_CPYTHON_SYSMODULE_H	1
Include/cpython/traceback.h	Py_CPYTHON_TRACEBACK_H	1
Include/cpython/tupleobject.h	Py_CPYTHON_TUPLEOBJECT_H	1
Include/cpython/unicodeobject.h	Py_CPYTHON_UNICODEOBJECT_H	1

# implied include of <unistd.h>
Include/**/*.h	_POSIX_THREADS	1
Include/**/*.h	HAVE_PTHREAD_H	1

# from pyconfig.h
Include/cpython/pthread_stubs.h	HAVE_PTHREAD_STUBS	1
Python/thread_pthread_stubs.h	HAVE_PTHREAD_STUBS	1

# from Objects/bytesobject.c
Objects/stringlib/partition.h	STRINGLIB_GET_EMPTY()	bytes_get_empty()
Objects/stringlib/join.h	STRINGLIB_MUTABLE	0
Objects/stringlib/partition.h	STRINGLIB_MUTABLE	0
Objects/stringlib/split.h	STRINGLIB_MUTABLE	0
Objects/stringlib/transmogrify.h	STRINGLIB_MUTABLE	0

# from Makefile
Modules/getpath.c	PYTHONPATH	1
Modules/getpath.c	PREFIX	...
Modules/getpath.c	EXEC_PREFIX	...
Modules/getpath.c	VERSION	...
Modules/getpath.c	VPATH	...
Modules/getpath.c	PLATLIBDIR	...
#Modules/_dbmmodule.c	USE_GDBM_COMPAT	1
Modules/_dbmmodule.c	USE_NDBM	1
#Modules/_dbmmodule.c	USE_BERKDB	1

# See: setup.py
Modules/_decimal/**/*.c	CONFIG_64	1
Modules/_decimal/**/*.c	ASM	1
Modules/expat/xmlparse.c	HAVE_EXPAT_CONFIG_H	1
Modules/expat/xmlparse.c	XML_POOR_ENTROPY	1
Modules/_dbmmodule.c	HAVE_GDBM_DASH_NDBM_H	1

# others
Modules/_sre/sre_lib.h	LOCAL(type)	static inline type
Modules/_sre/sre_lib.h	SRE(F)	sre_ucs2_##F
Objects/stringlib/codecs.h	STRINGLIB_IS_UNICODE	1

# @end=tsv@
''')[1:]

# -pthread
# -Wno-unused-result
# -Wsign-compare
# -g
# -Og
# -Wall
# -std=c99
# -Wextra
# -Wno-unused-result -Wno-unused-parameter
# -Wno-missing-field-initializers
# -Werror=implicit-function-declaration

SAME = {
    _abs('Include/*.h'): [_abs('Include/cpython/')],
    _abs('Python/ceval.c'): ['Python/generated_cases.c.h'],
}

MAX_SIZES = {
    # GLOB: (MAXTEXT, MAXLINES),
    # default: (10_000, 200)
    # First match wins.
    _abs('Modules/_ctypes/ctypes.h'): (5_000, 500),
    _abs('Modules/_datetimemodule.c'): (20_000, 300),
    _abs('Modules/_hacl/*.c'): (200_000, 500),
    _abs('Modules/posixmodule.c'): (20_000, 500),
    _abs('Modules/termios.c'): (10_000, 800),
    _abs('Modules/_testcapimodule.c'): (20_000, 400),
    _abs('Modules/expat/expat.h'): (10_000, 400),
    _abs('Objects/stringlib/unicode_format.h'): (10_000, 400),
    _abs('Objects/typeobject.c'): (35_000, 200),
    _abs('Python/compile.c'): (20_000, 500),
    _abs('Python/optimizer.c'): (100_000, 5_000),
    _abs('Python/parking_lot.c'): (40_000, 1000),
    _abs('Python/pylifecycle.c'): (500_000, 5000),
    _abs('Python/pystate.c'): (500_000, 5000),
    _abs('Python/initconfig.c'): (50_000, 500),

    # Generated files:
    _abs('Include/internal/pycore_opcode.h'): (10_000, 1000),
    _abs('Include/internal/pycore_global_strings.h'): (5_000, 1000),
    _abs('Include/internal/pycore_runtime_init_generated.h'): (5_000, 1000),
    _abs('Python/deepfreeze/*.c'): (20_000, 500),
    _abs('Python/frozen_modules/*.h'): (20_000, 500),
    _abs('Python/opcode_targets.h'): (10_000, 500),
    _abs('Python/stdlib_module_names.h'): (5_000, 500),

    # These large files are currently ignored (see above).
    _abs('Modules/_ssl_data_31.h'): (80_000, 10_000),
    _abs('Modules/_ssl_data_300.h'): (80_000, 10_000),
    _abs('Modules/_ssl_data_111.h'): (80_000, 10_000),
    _abs('Modules/cjkcodecs/mappings_*.h'): (160_000, 2_000),
    _abs('Modules/unicodedata_db.h'): (180_000, 3_000),
    _abs('Modules/unicodename_db.h'): (1_200_000, 15_000),
    _abs('Objects/unicodetype_db.h'): (240_000, 3_000),

    # Catch-alls:
    _abs('Include/**/*.h'): (5_000, 500),
}


def get_preprocessor(*,
                     file_macros=None,
                     file_includes=None,
                     file_incldirs=None,
                     file_same=None,
                     **kwargs
                     ):
    macros = tuple(MACROS)
    if file_macros:
        macros += tuple(file_macros)
    includes = tuple(INCLUDES)
    if file_includes:
        includes += tuple(file_includes)
    incldirs = tuple(INCL_DIRS)
    if file_incldirs:
        incldirs += tuple(file_incldirs)
    samefiles = dict(SAME)
    if file_same:
        samefiles.update(file_same)
    return _get_preprocessor(
        file_macros=macros,
        file_includes=includes,
        file_incldirs=incldirs,
        file_same=samefiles,
        **kwargs
    )


def parse_file(filename, *,
               match_kind=None,
               ignore_exc=None,
               log_err=None,
               ):
    get_file_preprocessor = get_preprocessor(
        ignore_exc=ignore_exc,
        log_err=log_err,
    )
    yield from _parse_file(
        filename,
        match_kind=match_kind,
        get_file_preprocessor=get_file_preprocessor,
        file_maxsizes=MAX_SIZES,
    )


def parse_files(filenames=None, *,
                match_kind=None,
                ignore_exc=None,
                log_err=None,
                get_file_preprocessor=None,
                **file_kwargs
                ):
    if get_file_preprocessor is None:
        get_file_preprocessor = get_preprocessor(
            ignore_exc=ignore_exc,
            log_err=log_err,
        )
    yield from _parse_files(
        filenames,
        match_kind=match_kind,
        get_file_preprocessor=get_file_preprocessor,
        file_maxsizes=MAX_SIZES,
        **file_kwargs
    )