# cpython/Tools/c-analyzer/c_parser/parser/_regexes.py

# Regular expression patterns for C syntax.
#
# None of these patterns has any capturing.  However, a number of them
# have capturing markers compatible with utils.set_capture_groups().

import textwrap


def _ind(text, level=1, edges='both'):
    """Indent *text* by *level* steps of four spaces, for embedding a
    pattern inside another verbose pattern.

    *edges* selects how the ends of the result are normalized:

    * 'pre' or 'both': leading whitespace is stripped and replaced by a
      newline followed by the indent.
    * 'post' or 'both': trailing whitespace is stripped and replaced by
      a newline followed by the indent of the enclosing level
      (``level - 1``).

    Any other value leaves both ends exactly as textwrap.indent() made
    them.
    """
    pad = '    ' * level
    body = textwrap.indent(text, pad)
    if edges in ('pre', 'both'):
        body = f'\n{pad}{body.lstrip()}'
    if edges in ('post', 'both'):
        outer = '    ' * (level - 1)
        body = f'{body.rstrip()}\n{outer}'
    return body


#######################################
# general

# A single hexadecimal digit (verbose-mode syntax).
# Only 0-9, a-f and A-F are hex digits; letters past "f" must not match.
HEX = r'(?: [0-9a-fA-F] )'

# Matches one C character literal or one double-quoted string literal
# (verbose-mode syntax, no capturing groups).
#
# Character literal forms, between single quotes:
#   * one character other than a single quote
#   * a backslash followed by any one character
#   * "\x" followed by two HEX characters
#   * "\0" followed by two digits
#   * "\o" followed by three digits in the range 000-255
# String literal: double quotes around any mix of backslash escapes and
# characters that are neither a double quote nor a backslash.
#
# NOTE(review): "\o" is not a standard C escape and \d also admits 8 and 9
# in the "\0dd" form -- confirm whether stricter octal matching is wanted.
STRING_LITERAL = textwrap.dedent(rf'''
    (?:
        # character literal
        (?:
            ['] [^'] [']
            |
            ['] \\ . [']
            |
            ['] \\x{HEX}{HEX} [']
            |
            ['] \\0\d\d [']
            |
            (?:
                ['] \\o[01]\d\d [']
                |
                ['] \\o2[0-4]\d [']
                |
                ['] \\o25[0-5] [']
             )
         )
        |
        # string literal
        (?:
            ["] (?: [^"\\]* \\ . )* [^"\\]* ["]
         )
        # end string literal
     )
    ''')

# The C keywords recognized by this module, as one word-bounded
# alternation: storage classes, type qualifiers, basic type names,
# struct/union/enum, and the statement keywords listed below.
_KEYWORD = textwrap.dedent(r'''
    (?:
        \b
        (?:
            auto |
            extern |
            register |
            static |
            _Thread_local |
            typedef |

            const |
            volatile |

            signed |
            unsigned |
            char |
            short |
            int |
            long |
            float |
            double |
            _Complex |
            void |

            struct |
            union |
            enum |

            goto |
            return |
            sizeof |
            break |
            continue |
            if |
            else |
            for |
            do |
            while |
            switch |
            case |
            default |
            entry
         )
        \b
     )
    ''')
# The public form: the multi-line keyword pattern wrapped in
# "# keyword" / "# end keyword" marker comments.
KEYWORD = rf'''
    # keyword
    {_KEYWORD}
    # end keyword
    '''
# Strip all whitespace so the pattern can be embedded inline on a single
# line (see STRICT_IDENTIFIER).  This must come after KEYWORD is built,
# because KEYWORD embeds the multi-line form.
_KEYWORD = ''.join(_KEYWORD.split())

# Any C identifier: a letter or underscore followed by any number of
# letters, digits, or underscores.
IDENTIFIER = r'(?: [a-zA-Z_][a-zA-Z0-9_]* )'
# We use a negative lookahead to filter out keywords.
# STRICT_IDENTIFIER is a whole-word identifier that is not a keyword.
STRICT_IDENTIFIER = rf'(?: (?! {_KEYWORD} ) \b {IDENTIFIER} \b )'
# Like STRICT_IDENTIFIER, but also accepts an optional "-<digits>" suffix.
# NOTE(review): presumably the suffixed form names anonymous types
# generated elsewhere in the parser -- confirm against the callers.
ANON_IDENTIFIER = rf'(?: (?! {_KEYWORD} ) \b {IDENTIFIER} (?: - \d+ )? \b )'


#######################################
# types

# Matches a built-in C type name as a whole word.  The alternatives are
# tried in this order:
#   * "void"
#   * a bare "signed" or "unsigned"
#   * "_Complex", optionally preceded by float / double / long double
#   * "_Complex", optionally followed by float / double / long double
#   * char / short / int / long / float / double, optionally preceded by
#     signed/unsigned and then by long/short
SIMPLE_TYPE = textwrap.dedent(rf'''
    # simple type
    (?:
        \b
        (?:
            void
            |
            (?: signed | unsigned )  # implies int
            |
            (?:
                (?: (?: float | double | long\s+double ) \s+ )?
                _Complex
            )
            |
            (?:
                _Complex
                (?: \s+ (?: float | double | long\s+double ) )?
            )
            |
            (?:
                (?: (?: signed | unsigned ) \s+ )?
                (?: (?: long | short ) \s+ )?
                (?: char | short | int | long | float | double )
             )
         )
        \b
     )
    # end simple type
    ''')

# The keyword that introduces a compound type: struct, union, or enum
# (matched as a whole word).
COMPOUND_TYPE_KIND = r'(?: \b (?: struct | union | enum ) \b )'


#######################################
# variable declarations

# The storage-class keywords, in the order they appear in STORAGE_CLASS.
_STORAGE = 'auto register static extern _Thread_local'.split()
# Any one storage-class keyword, matched as a whole word.
STORAGE_CLASS = rf'(?: \b (?: {" | ".join(_STORAGE)} ) \b )'
# A single type qualifier (const or volatile), matched as a whole word.
TYPE_QUALIFIER = r'(?: \b (?: const | volatile ) \b )'
# One pointer level: "*" optionally followed by a type qualifier.
PTR_QUALIFIER = rf'(?: [*] (?: \s* {TYPE_QUALIFIER} )? )'

# A type specifier.  The alternatives are tried in this order:
#   * a SIMPLE_TYPE
#   * a typeof(...) expression (with any number of leading/trailing
#     underscores on "typeof") around a single non-keyword identifier,
#     which may be prefixed by "*" or "&" characters
#   * struct/union/enum, optionally followed by a name (ANON_IDENTIFIER)
#   * a non-keyword identifier, treated as a typedef name
TYPE_SPEC = textwrap.dedent(rf'''
    # type spec
    (?:
        {_ind(SIMPLE_TYPE, 2)}
        |
        (?:
            [_]*typeof[_]*
            \s* [(]
            (?: \s* [*&] )*
            \s* {STRICT_IDENTIFIER}
            \s* [)]
         )
        |
        # reference to a compound type
        (?:
            {COMPOUND_TYPE_KIND}
            (?: \s* {ANON_IDENTIFIER} )?
         )
        |
        # reference to a typedef
        {STRICT_IDENTIFIER}
     )
    # end type spec
    ''')

# A declarator: any number of pointer levels (PTR_QUALIFIER) followed by
# one of three forms:
#   * a name with optional "[...]" array suffixes
#   * the same, wrapped in one pair of parentheses
#   * a function pointer: "(" optional "*" name optional arrays ")"
#     followed by a parenthesized parameter list
#
# The "# <NAME>" comments inside the pattern are the capturing markers
# mentioned in the module header (for utils.set_capture_groups()).
DECLARATOR = textwrap.dedent(rf'''
    # declarator  (possibly abstract)
    (?:
        (?: {PTR_QUALIFIER} \s* )*
        (?:
            (?:
                (?:  # <IDENTIFIER>
                    {STRICT_IDENTIFIER}
                )
                # Inside the brackets is actually a "constant expression".
                (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )*  # arrays
             )
            |
            (?:
                [(] \s*
                (?:  # <WRAPPED_IDENTIFIER>
                    {STRICT_IDENTIFIER}
                )
                # Inside the brackets is actually a "constant expression".
                (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )*  # arrays
                \s* [)]
             )
            |
            # func ptr
            (?:
                [(] (?: \s* {PTR_QUALIFIER} )? \s*
                (?:  # <FUNC_IDENTIFIER>
                    {STRICT_IDENTIFIER}
                )
                # Inside the brackets is actually a "constant expression".
                (?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )*  # arrays
                \s* [)]
                # We allow for a single level of paren nesting in parameters.
                \s* [(] (?: [^()]* [(] [^)]* [)] )* [^)]* [)]
             )
         )
     )
    # end declarator
    ''')

# A variable declaration without initializer or terminator: an optional
# storage class, an optional type qualifier, a TYPE_SPEC, and then a
# DECLARATOR.  Per the in-pattern comment it is also used for typedefs
# and for function return types.
VAR_DECL = textwrap.dedent(rf'''
    # var decl (and typedef and func return type)
    (?:
        (?:
            (?:  # <STORAGE>
                {STORAGE_CLASS}
            )
            \s*
        )?
        (?:
            (?:  # <TYPE_QUAL>
                {TYPE_QUALIFIER}
            )
            \s*
         )?
        (?:
            (?:  # <TYPE_SPEC>
                {_ind(TYPE_SPEC, 4)}
            )
         )
        \s*
        (?:
            (?:  # <DECLARATOR>
                {_ind(DECLARATOR, 4)}
            )
         )
     )
    # end var decl
    ''')

# The right-hand side of an initialization.  It starts with an optional
# parenthesized group (no nested parentheses), followed by one of:
#   * one or more adjacent string/character literals
#   * a "simple" initializer: text that contains no comma, semicolon, or
#     open brace outside of string/character literals (may be empty)
#   * a brace-enclosed literal with no semicolon outside of literals; the
#     closing brace must be followed by ";" (checked via lookahead, so the
#     ";" itself is not consumed)
#
# Literal braces are doubled ("{{" / "}}") because this is an f-string.
INITIALIZER = textwrap.dedent(rf'''
    # initializer
    (?:
        (?:
            [(]
            # no nested parens (e.g. func ptr)
            [^)]*
            [)]
            \s*
         )?
        (?:
            # a string literal
            (?:
                (?: {_ind(STRING_LITERAL, 4)} \s* )*
                {_ind(STRING_LITERAL, 4)}
             )
            |

            # a simple initializer
            (?:
                (?:
                    [^'",;{{]*
                    {_ind(STRING_LITERAL, 4)}
                 )*
                [^'",;{{]*
             )
            |

            # a struct/array literal
            (?:
                # We only expect compound initializers with
                # single-variable declarations.
                {{
                (?:
                    [^'";]*?
                    {_ind(STRING_LITERAL, 5)}
                 )*
                [^'";]*?
                }}
                (?= \s* ; )  # Note this lookahead.
             )
         )
     )
    # end initializer
    ''')


#######################################
# compound type declarations

# One item inside a struct/union body.  The alternatives are:
#   * the opening of an inline compound type: struct/union/enum, an
#     optional name, and "{"
#   * a member: optional type qualifier and TYPE_SPEC, an optional
#     DECLARATOR, an optional ": <size>" suffix, then "," or ";"
#   * the closing "}" of the body
#
# Literal braces are doubled ("{{" / "}}") because this is an f-string.
STRUCT_MEMBER_DECL = textwrap.dedent(rf'''
    (?:
        # inline compound type decl
        (?:
            (?:  # <COMPOUND_TYPE_KIND>
                {COMPOUND_TYPE_KIND}
             )
            (?:
                \s+
                (?:  # <COMPOUND_TYPE_NAME>
                    {STRICT_IDENTIFIER}
                 )
             )?
            \s* {{
         )
        |
        (?:
            # typed member
            (?:
                # Technically it doesn't have to have a type...
                (?:  # <SPECIFIER_QUALIFIER>
                    (?: {TYPE_QUALIFIER} \s* )?
                    {_ind(TYPE_SPEC, 5)}
                 )
                (?:
                    # If it doesn't have a declarator then it will have
                    # a size and vice versa.
                    \s*
                    (?:  # <DECLARATOR>
                        {_ind(DECLARATOR, 6)}
                     )
                 )?
            )

            # sized member
            (?:
                \s* [:] \s*
                (?:  # <SIZE>
                    # This is actually a "constant expression".
                    \d+
                    |
                    [^'",}}]+
                 )
             )?
            \s*
            (?:  # <ENDING>
                [,;]
             )
         )
        |
        (?:
            \s*
            (?:  # <CLOSE>
                }}
             )
         )
     )
    ''')

# One item inside an enum body.  The alternatives are:
#   * the closing "}" of the body
#   * an enumerator: a name (plain IDENTIFIER), an optional "= <init>"
#     where the init is a string/character literal or a run of characters
#     excluding quotes, commas, and "}", then "," or "}"
#
# Literal braces are doubled ("}}") because this is an f-string.
ENUM_MEMBER_DECL = textwrap.dedent(rf'''
    (?:
        (?:
            \s*
            (?:  # <CLOSE>
                }}
             )
         )
        |
        (?:
            \s*
            (?:  # <NAME>
                {IDENTIFIER}
             )
            (?:
                \s* = \s*
                (?:  # <INIT>
                    # This is actually a "constant expression".
                    {_ind(STRING_LITERAL, 4)}
                    |
                    [^'",}}]+
                 )
             )?
            \s*
            (?:  # <ENDING>
                , | }}
             )
         )
     )
    ''')


#######################################
# statements

# The text of a simple statement, without its terminator: any run of
# characters that contains no brace or semicolon outside of
# string/character literals.  It can match the empty string.
# The lookahead line inside the pattern is commented out, so it is not
# part of the compiled regex.
SIMPLE_STMT_BODY = textwrap.dedent(rf'''
    # simple statement body
    (?:
        (?:
            [^'"{{}};]*
            {_ind(STRING_LITERAL, 3)}
         )*
        [^'"{{}};]*
        #(?= [;{{] )  # Note this lookahead.
     )
    # end simple statement body
    ''')
# A single simple (non-compound) statement, through its terminating ";".
# The statement text is tried against these alternatives, in order:
#   * "return" optionally followed by an INITIALIZER
#   * a variable assignment: an optional leading "*", a chain of names
#     joined by "." or "->", an optional "[<digits>]" index, "=", and an
#     INITIALIZER
#   * a catch-all "return" followed by anything without a ";" outside of
#     string/character literals
#   * any SIMPLE_STMT_BODY
#
# The member-access dot is written as the character class "[.]": a bare
# "." in a regex matches any character, which would let arbitrary
# single characters join the names in the assignment chain.
SIMPLE_STMT = textwrap.dedent(rf'''
    # simple statement
    (?:
        (?:  # <SIMPLE_STMT>
            # stmt-inline "initializer"
            (?:
                return \b
                (?:
                    \s*
                    {_ind(INITIALIZER, 5)}
                )?
             )
            |
            # variable assignment
            (?:
                (?: [*] \s* )?
                (?:
                    {STRICT_IDENTIFIER} \s*
                    (?: [.] | -> ) \s*
                 )*
                {STRICT_IDENTIFIER}
                (?: \s* \[ \s* \d+ \s* \] )?
                \s* = \s*
                {_ind(INITIALIZER, 4)}
             )
            |
            # catchall return statement
            (?:
                return \b
                (?:
                    (?:
                        [^'";]*
                        {_ind(STRING_LITERAL, 6)}
                     )*
                    \s* [^'";]*
                 )?
             )
            |
            # simple statement
            (?:
                {_ind(SIMPLE_STMT_BODY, 4)}
             )
         )
        \s*
        (?:  # <SIMPLE_ENDING>
            ;
         )
     )
    # end simple statement
    ''')
# The head of a compound statement (not its body).  The alternatives are:
#   * a bare "else" or "do"
#   * a label followed by ":" -- "case <expr>", "default", or a
#     non-keyword identifier
#   * "for", "while", "if", or "switch"; a lookahead requires a following
#     "(" but does not consume it
# Trailing whitespace after the head is consumed.
COMPOUND_STMT = textwrap.dedent(rf'''
    # compound statement
    (?:
        \b
        (?:
            (?:
                (?:  # <COMPOUND_BARE>
                    else | do
                 )
                \b
             )
            |
            (?:
                (?:  # <COMPOUND_LABELED>
                    (?:
                        case \b
                        (?:
                            [^'":]*
                            {_ind(STRING_LITERAL, 7)}
                         )*
                        \s* [^'":]*
                     )
                    |
                    default
                    |
                    {STRICT_IDENTIFIER}
                 )
                \s* [:]
             )
            |
            (?:
                (?:  # <COMPOUND_PAREN>
                    for | while | if | switch
                 )
                \s* (?= [(] )  # Note this lookahead.
             )
         )
        \s*
     )
    # end compound statement
    ''')


#######################################
# function bodies

# One item inside a function body.  The alternatives are tried in this
# order, so earlier ones win when several could match:
#   1. an empty statement (";")
#   2. the opening of an inline struct/union/enum declaration, with
#      optional leading text, storage class, qualifier, and name, up to "{"
#   3. a variable declaration (VAR_DECL) with an optional "= INITIALIZER",
#      ending in "," or ";"
#   4. the head of a compound statement (COMPOUND_STMT)
#   5. the start of a block: optional leading text followed by "{"
#   6. a simple statement (SIMPLE_STMT)
#   7. the end of a block ("}")
#
# Literal braces are doubled ("{{" / "}}") because this is an f-string.
LOCAL = textwrap.dedent(rf'''
    (?:
        # an empty statement
        (?:  # <EMPTY>
            ;
         )
        |
        # inline type decl
        (?:
            (?:
                (?:  # <INLINE_LEADING>
                    [^;{{}}]+?
                 )
                \s*
             )?
            (?:  # <INLINE_PRE>
                (?: {STORAGE_CLASS} \s* )?
                (?: {TYPE_QUALIFIER} \s* )?
             )?  # </INLINE_PRE>
            (?:  # <INLINE_KIND>
                {COMPOUND_TYPE_KIND}
             )
            (?:
                \s+
                (?:  # <INLINE_NAME>
                    {STRICT_IDENTIFIER}
                 )
             )?
            \s* {{
         )
        |
        # var decl
        (?:
            (?:  # <STORAGE>
                {STORAGE_CLASS}
             )?  # </STORAGE>
            (?:
                \s*
                (?:  # <VAR_DECL>
                    {_ind(VAR_DECL, 5)}
                 )
             )
            (?:
                (?:
                    # initializer
                    # We expect only basic initializers.
                    \s* = \s*
                    (?:  # <VAR_INIT>
                        {_ind(INITIALIZER, 6)}
                     )
                 )?
                (?:
                    \s*
                    (?:  # <VAR_ENDING>
                        [,;]
                     )
                 )
             )
         )
        |
        {_ind(COMPOUND_STMT, 2)}
        |
        # start-of-block
        (?:
            (?:  # <BLOCK_LEADING>
                (?:
                    [^'"{{}};]*
                    {_ind(STRING_LITERAL, 5)}
                 )*
                [^'"{{}};]*
                # Presumably we will not see "== {{".
                [^\s='"{{}});]
                \s*
             )?  # </BLOCK_LEADING>
            (?:  # <BLOCK_OPEN>
                {{
             )
         )
        |
        {_ind(SIMPLE_STMT, 2)}
        |
        # end-of-block
        (?:  # <BLOCK_CLOSE>
            }}
         )
     )
    ''')

# One item inside a function body, for a scan that only looks for static
# variables.  The alternatives are tried in this order:
#   1. the opening of an inline struct/union/enum declaration, up to "{"
#   2. a "static" variable declaration (optional qualifier, TYPE_SPEC,
#      DECLARATOR), followed either by "= INITIALIZER" and one of
#      "," ";" "{", or directly by "," or ";"
#   3. everything else: any text without braces or semicolons outside of
#      string/character literals, up to and including the next "{", "}",
#      or ";"
#
# Literal braces are doubled ("{{" / "}}") because this is an f-string.
LOCAL_STATICS = textwrap.dedent(rf'''
    (?:
        # inline type decl
        (?:
            (?:
                (?:  # <INLINE_LEADING>
                    [^;{{}}]+?
                 )
                \s*
             )?
            (?:  # <INLINE_PRE>
                (?: {STORAGE_CLASS} \s* )?
                (?: {TYPE_QUALIFIER} \s* )?
             )?
            (?:  # <INLINE_KIND>
                {COMPOUND_TYPE_KIND}
             )
            (?:
                \s+
                (?:  # <INLINE_NAME>
                    {STRICT_IDENTIFIER}
                 )
             )?
            \s* {{
         )
        |
        # var decl
        (?:
            # We only look for static variables.
            (?:  # <STATIC_DECL>
                static \b
                (?: \s* {TYPE_QUALIFIER} )?
                \s* {_ind(TYPE_SPEC, 4)}
                \s* {_ind(DECLARATOR, 4)}
             )
            \s*
            (?:
                (?:  # <STATIC_INIT>
                    = \s*
                    {_ind(INITIALIZER, 4)}
                    \s*
                    [,;{{]
                 )
                |
                (?:  # <STATIC_ENDING>
                    [,;]
                 )
             )
         )
        |
        # everything else
        (?:
            (?:  # <DELIM_LEADING>
                (?:
                    [^'"{{}};]*
                    {_ind(STRING_LITERAL, 4)}
                 )*
                \s* [^'"{{}};]*
             )
            (?:
                (?:  # <BLOCK_OPEN>
                    {{
                 )
                |
                (?:  # <BLOCK_CLOSE>
                    }}
                 )
                |
                (?:  # <STMT_END>
                    ;
                 )
             )
         )
     )
    ''')


#######################################
# global declarations

# One top-level (file scope) item.  The alternatives are tried in this
# order, so earlier ones win when several could match:
#   1. an empty statement (";")
#   2. the opening of a struct/union/enum declaration, with optional
#      leading text and optional name, up to "{"
#   3. a struct/union/enum keyword plus a name (ANON_IDENTIFIER), then text
#      up to and including the first of "=", ",", ";", "(", or "{"
#   4. a typedef: "typedef", a VAR_DECL, an optional parenthesized
#      parameter list, and ";"
#   5. a function or variable: optional storage class, optional "inline",
#      and a VAR_DECL, followed by one of
#        - a parameter list and then "{" or ";"
#        - an old-style parameter list (bare names), one or more
#          "type declarator;" parameter declarations, and "{"
#        - an optional "= INITIALIZER" and then "," or ";"
#
# Literal braces are doubled ("{{" / "}}") because this is an f-string.
GLOBAL = textwrap.dedent(rf'''
    (?:
        # an empty statement
        (?:  # <EMPTY>
            ;
         )
        |

        # compound type decl (maybe inline)
        (?:
            (?:
                (?:  # <COMPOUND_LEADING>
                    [^;{{}}]+?
                 )
                 \s*
             )?
            (?:  # <COMPOUND_KIND>
                {COMPOUND_TYPE_KIND}
             )
            (?:
                \s+
                (?:  # <COMPOUND_NAME>
                    {STRICT_IDENTIFIER}
                 )
             )?
            \s* {{
         )
        |
        # bogus inline decl artifact
        # This simplifies resolving the relative syntactic ambiguity of
        # inline structs.
        (?:
            (?:  # <FORWARD_KIND>
                {COMPOUND_TYPE_KIND}
             )
            \s*
            (?:  # <FORWARD_NAME>
                {ANON_IDENTIFIER}
             )
            (?:  # <MAYBE_INLINE_ACTUAL>
                [^=,;({{[*\]]*
                [=,;({{]
             )
         )
        |

        # typedef
        (?:
            \b typedef \b \s*
            (?:  # <TYPEDEF_DECL>
                {_ind(VAR_DECL, 4)}
             )
            (?:
                # We expect no inline type definitions in the parameters.
                \s* [(] \s*
                (?:  # <TYPEDEF_FUNC_PARAMS>
                    [^{{;]*
                 )
                \s* [)]
             )?
            \s* ;
         )
        |

        # func decl/definition & var decls
        # XXX dedicated pattern for funcs (more restricted)?
        (?:
            (?:
                (?:  # <VAR_STORAGE>
                    {STORAGE_CLASS}
                 )
                \s*
             )?
            (?:
                (?:  # <FUNC_INLINE>
                    \b inline \b
                 )
                \s*
             )?
            (?:  # <VAR_DECL>
                {_ind(VAR_DECL, 4)}
             )
            (?:
                # func decl / definition
                (?:
                    (?:
                        # We expect no inline type definitions in the parameters.
                        \s* [(] \s*
                        (?:  # <FUNC_PARAMS>
                            [^{{;]*
                         )
                        \s* [)] \s*
                        (?:  # <FUNC_DELIM>
                            [{{;]
                         )
                     )
                    |
                    (?:
                        # This is some old-school syntax!
                        \s* [(] \s*
                        # We throw away the bare names:
                        {STRICT_IDENTIFIER}
                        (?: \s* , \s* {STRICT_IDENTIFIER} )*
                        \s* [)] \s*

                        # We keep the trailing param declarations:
                        (?:  # <FUNC_LEGACY_PARAMS>
                            # There's at least one!
                            (?: {TYPE_QUALIFIER} \s* )?
                            {_ind(TYPE_SPEC, 7)}
                            \s*
                            {_ind(DECLARATOR, 7)}
                            \s* ;
                            (?:
                                \s*
                                (?: {TYPE_QUALIFIER} \s* )?
                                {_ind(TYPE_SPEC, 8)}
                                \s*
                                {_ind(DECLARATOR, 8)}
                                \s* ;
                             )*
                         )
                        \s* {{
                     )
                 )
                |
                # var / typedef
                (?:
                    (?:
                        # initializer
                        # We expect only basic initializers.
                        \s* = \s*
                        (?:  # <VAR_INIT>
                            {_ind(INITIALIZER, 6)}
                         )
                     )?
                    \s*
                    (?:  # <VAR_ENDING>
                        [,;]
                     )
                 )
             )
         )
     )
    ''')