# Regular expression patterns for C syntax.
#
# None of these patterns contains any capturing groups. However, a number
# of them have capturing markers compatible with utils.set_capture_groups().
import textwrap
def _ind(text, level=1, edges='both'):
    """Indent *text* so it can be embedded inside a larger pattern.

    Every non-blank line of *text* is prefixed with *level* indent steps.

    *edges* selects how the ends of the result are normalized:

    * 'pre'  - leading whitespace is replaced by a newline plus the indent
    * 'post' - trailing whitespace is replaced by a newline plus the
               indent for one level less (so a closing paren lines up)
    * 'both' - apply both of the above (the default)

    Any other value leaves both ends exactly as textwrap.indent() made them.
    """
    step = ' '
    prefix = step * level
    body = textwrap.indent(text, prefix)
    if edges in ('pre', 'both'):
        body = f'\n{prefix}{body.lstrip()}'
    if edges in ('post', 'both'):
        body = f'{body.rstrip()}\n{step * (level - 1)}'
    return body
#######################################
# general
# A single hexadecimal digit.
# Only 0-9, a-f and A-F are hex digits; the letters g-z must not be
# accepted, otherwise something like '\xZZ' would pass as a hex escape.
HEX = r'(?: [0-9a-fA-F] )'
# Pattern for a C character literal or a C string literal.
#
# Character-literal alternatives, in order: one non-quote character, a
# backslash followed by any one character, "\x" plus two HEX characters,
# "\0" plus two digits, or "\o" plus a three-digit value in 000-255.
# A string literal is a double-quoted run in which a backslash escapes
# the character that follows it.
#
# NOTE(review): the pattern text relies on free-spacing layout and inline
# "#" comments, so it presumably has to be compiled with re.VERBOSE --
# confirm at the call sites.
# NOTE(review): "\d" also accepts 8 and 9, which are not octal digits.
STRING_LITERAL = textwrap.dedent(rf'''
(?:
# character literal
(?:
['] [^'] [']
|
['] \\ . [']
|
['] \\x{HEX}{HEX} [']
|
['] \\0\d\d [']
|
(?:
['] \\o[01]\d\d [']
|
['] \\o2[0-4]\d [']
|
['] \\o25[0-5] [']
)
)
|
# string literal
(?:
["] (?: [^"\\]* \\ . )* [^"\\]* ["]
)
# end string literal
)
''')
# Word-bounded alternation of the reserved words that are excluded from
# identifiers: storage classes, type qualifiers, basic type names,
# compound-type kinds and statement keywords (plus "entry").
# This is the multi-line form; it is embedded in KEYWORD and then
# collapsed to a single line further down.
_KEYWORD = textwrap.dedent(r'''
(?:
\b
(?:
auto |
extern |
register |
static |
_Thread_local |
typedef |
const |
volatile |
signed |
unsigned |
char |
short |
int |
long |
float |
double |
_Complex |
void |
struct |
union |
enum |
goto |
return |
sizeof |
break |
continue |
if |
else |
for |
do |
while |
switch |
case |
default |
entry
)
\b
)
''')
# Public keyword pattern: the multi-line _KEYWORD text wrapped in
# "keyword" / "end keyword" marker comments.
KEYWORD = rf'''
# keyword
{_KEYWORD}
# end keyword
'''
# Remove all whitespace from the private form so that it can be
# interpolated into the single-line patterns below.
_KEYWORD = ''.join(_KEYWORD.split())
# Any C identifier: a letter or underscore followed by letters, digits
# or underscores.  Keywords are NOT excluded here.
IDENTIFIER = r'(?: [a-zA-Z_][a-zA-Z0-9_]* )'
# We use a negative lookahead to filter out keywords.
# STRICT_IDENTIFIER: a word-bounded identifier that is not a keyword.
STRICT_IDENTIFIER = rf'(?: (?! {_KEYWORD} ) \b {IDENTIFIER} \b )'
# ANON_IDENTIFIER: like STRICT_IDENTIFIER, but additionally allows an
# optional "-<digits>" suffix after the identifier.
# NOTE(review): the suffix presumably names anonymous compound types --
# confirm against the code that generates such names.
ANON_IDENTIFIER = rf'(?: (?! {_KEYWORD} ) \b {IDENTIFIER} (?: - \d+ )? \b )'
#######################################
# types
# Pattern for a built-in (non-compound, non-typedef) type.
#
# Alternatives, in order: "void"; a bare "signed"/"unsigned"; "_Complex"
# with an optional floating type before it; "_Complex" with an optional
# floating type after it; and an arithmetic type with optional
# signed/unsigned and long/short prefixes.
SIMPLE_TYPE = textwrap.dedent(rf'''
# simple type
(?:
\b
(?:
void
|
(?: signed | unsigned ) # implies int
|
(?:
(?: (?: float | double | long\s+double ) \s+ )?
_Complex
)
|
(?:
_Complex
(?: \s+ (?: float | double | long\s+double ) )?
)
|
(?:
(?: (?: signed | unsigned ) \s+ )?
(?: (?: long | short ) \s+ )?
(?: char | short | int | long | float | double )
)
)
\b
)
# end simple type
''')
# The keyword that introduces a compound type: struct, union or enum.
COMPOUND_TYPE_KIND = r'(?: \b (?: struct | union | enum ) \b )'
#######################################
# variable declarations
# Storage-class specifiers, in the order they appear in the alternation.
_STORAGE = ['auto', 'register', 'static', 'extern', '_Thread_local']
# Any one storage-class specifier, as a whole word.
STORAGE_CLASS = r'(?: \b (?: ' + ' | '.join(_STORAGE) + r' ) \b )'
# A type qualifier, as a whole word.
TYPE_QUALIFIER = r'(?: \b (?: const | volatile ) \b )'
# A pointer star, optionally followed by a type qualifier.
PTR_QUALIFIER = r'(?: [*] (?: \s* ' + TYPE_QUALIFIER + r' )? )'
# Pattern for a type specifier.
#
# Alternatives, in order: a SIMPLE_TYPE; a typeof(...) expression (with
# any number of leading/trailing underscores on "typeof") wrapping a
# non-keyword identifier that may be preceded by "*" or "&"; a
# struct/union/enum reference with an optional name; or any non-keyword
# identifier, which is taken to be a typedef name.
TYPE_SPEC = textwrap.dedent(rf'''
# type spec
(?:
{_ind(SIMPLE_TYPE, 2)}
|
(?:
[_]*typeof[_]*
\s* [(]
(?: \s* [*&] )*
\s* {STRICT_IDENTIFIER}
\s* [)]
)
|
# reference to a compound type
(?:
{COMPOUND_TYPE_KIND}
(?: \s* {ANON_IDENTIFIER} )?
)
|
# reference to a typedef
{STRICT_IDENTIFIER}
)
# end type spec
''')
# Pattern for a declarator: zero or more pointer qualifiers followed by
# one of three forms:
#   * <IDENTIFIER>: a plain name with optional "[...]" array suffixes
#   * <WRAPPED_IDENTIFIER>: the same, wrapped in parentheses
#   * <FUNC_IDENTIFIER>: a function pointer, "( *name )( params )", where
#     the parameter list may contain one level of nested parentheses
# The "# <NAME>" comments inside the pattern are the capturing markers
# described in the file header.
# NOTE(review): the in-pattern comment says "possibly abstract", but every
# alternative requires an identifier -- confirm the intent.
DECLARATOR = textwrap.dedent(rf'''
# declarator (possibly abstract)
(?:
(?: {PTR_QUALIFIER} \s* )*
(?:
(?:
(?: # <IDENTIFIER>
{STRICT_IDENTIFIER}
)
# Inside the brackets is actually a "constant expression".
(?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )* # arrays
)
|
(?:
[(] \s*
(?: # <WRAPPED_IDENTIFIER>
{STRICT_IDENTIFIER}
)
# Inside the brackets is actually a "constant expression".
(?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )* # arrays
\s* [)]
)
|
# func ptr
(?:
[(] (?: \s* {PTR_QUALIFIER} )? \s*
(?: # <FUNC_IDENTIFIER>
{STRICT_IDENTIFIER}
)
# Inside the brackets is actually a "constant expression".
(?: \s* \[ (?: \s* [^\]]+ \s* )? [\]] )* # arrays
\s* [)]
# We allow for a single level of paren nesting in parameters.
\s* [(] (?: [^()]* [(] [^)]* [)] )* [^)]* [)]
)
)
)
# end declarator
''')
# Pattern for the head of a variable declaration (also used for typedefs
# and function return types): an optional storage class, an optional type
# qualifier, a TYPE_SPEC and a DECLARATOR, in that order.
# It does not cover the initializer or the terminating "," / ";".
VAR_DECL = textwrap.dedent(rf'''
# var decl (and typedef and func return type)
(?:
(?:
(?: # <STORAGE>
{STORAGE_CLASS}
)
\s*
)?
(?:
(?: # <TYPE_QUAL>
{TYPE_QUALIFIER}
)
\s*
)?
(?:
(?: # <TYPE_SPEC>
{_ind(TYPE_SPEC, 4)}
)
)
\s*
(?:
(?: # <DECLARATOR>
{_ind(DECLARATOR, 4)}
)
)
)
# end var decl
''')
# Pattern for the right-hand side of an initialization.
#
# An optional leading parenthesized group (without nested parentheses)
# is followed by one of:
#   * one or more adjacent string literals
#   * a "simple" initializer: text that contains no comma, semicolon or
#     opening brace outside of string literals
#   * a brace-enclosed struct/array literal, accepted only when a ";"
#     follows it (checked with a lookahead, so the ";" is not consumed)
# Doubled braces in the source are f-string escapes for literal braces.
INITIALIZER = textwrap.dedent(rf'''
# initializer
(?:
(?:
[(]
# no nested parens (e.g. func ptr)
[^)]*
[)]
\s*
)?
(?:
# a string literal
(?:
(?: {_ind(STRING_LITERAL, 4)} \s* )*
{_ind(STRING_LITERAL, 4)}
)
|
# a simple initializer
(?:
(?:
[^'",;{{]*
{_ind(STRING_LITERAL, 4)}
)*
[^'",;{{]*
)
|
# a struct/array literal
(?:
# We only expect compound initializers with
# single-variable declarations.
{{
(?:
[^'";]*?
{_ind(STRING_LITERAL, 5)}
)*
[^'";]*?
}}
(?= \s* ; ) # Note this lookahead.
)
)
)
# end initializer
''')
#######################################
# compound type declarations
# Pattern for one item inside a struct/union body.
#
# Alternatives, in order:
#   * the opening of an inline compound type: kind, optional name, "{"
#   * a member: optional qualifier plus TYPE_SPEC, an optional
#     DECLARATOR, an optional ": <size>" bit-field width, then "," or ";"
#   * the closing "}" of the body
# The "# <NAME>" comments inside the pattern are the capturing markers
# described in the file header.
STRUCT_MEMBER_DECL = textwrap.dedent(rf'''
(?:
# inline compound type decl
(?:
(?: # <COMPOUND_TYPE_KIND>
{COMPOUND_TYPE_KIND}
)
(?:
\s+
(?: # <COMPOUND_TYPE_NAME>
{STRICT_IDENTIFIER}
)
)?
\s* {{
)
|
(?:
# typed member
(?:
# Technically it doesn't have to have a type...
(?: # <SPECIFIER_QUALIFIER>
(?: {TYPE_QUALIFIER} \s* )?
{_ind(TYPE_SPEC, 5)}
)
(?:
# If it doesn't have a declarator then it will have
# a size and vice versa.
\s*
(?: # <DECLARATOR>
{_ind(DECLARATOR, 6)}
)
)?
)
# sized member
(?:
\s* [:] \s*
(?: # <SIZE>
# This is actually a "constant expression".
\d+
|
[^'",}}]+
)
)?
\s*
(?: # <ENDING>
[,;]
)
)
|
(?:
\s*
(?: # <CLOSE>
}}
)
)
)
''')
# Pattern for one item inside an enum body.
#
# Either the closing "}" of the body, or an enumerator: a name, an
# optional "= <init>" (a string/character literal or any text without
# quotes, commas or "}"), and an ending that is "," or "}".
# Note that the name uses IDENTIFIER, so keywords are not filtered out.
ENUM_MEMBER_DECL = textwrap.dedent(rf'''
(?:
(?:
\s*
(?: # <CLOSE>
}}
)
)
|
(?:
\s*
(?: # <NAME>
{IDENTIFIER}
)
(?:
\s* = \s*
(?: # <INIT>
# This is actually a "constant expression".
{_ind(STRING_LITERAL, 4)}
|
[^'",}}]+
)
)?
\s*
(?: # <ENDING>
, | }}
)
)
)
''')
#######################################
# statements
# Pattern for the body of a simple statement: any text that contains no
# braces or semicolons outside of string/character literals.  It can
# match the empty string.  The trailing lookahead is disabled (it is
# commented out inside the pattern text).
SIMPLE_STMT_BODY = textwrap.dedent(rf'''
# simple statement body
(?:
(?:
[^'"{{}};]*
{_ind(STRING_LITERAL, 3)}
)*
[^'"{{}};]*
#(?= [;{{] ) # Note this lookahead.
)
# end simple statement body
''')
# Pattern for a complete simple statement, including its terminating ";".
#
# The <SIMPLE_STMT> part tries these alternatives, in order:
#   * "return" followed by an optional INITIALIZER
#   * a variable assignment: optional "*", an optional chain of member
#     accesses ("a.b->c"), an optional "[<digits>]" index, "=", and an
#     INITIALIZER
#   * a catch-all "return" with arbitrary text up to the ";"
#   * a generic SIMPLE_STMT_BODY
#
# The member-access separator is written "[.]" so that only a literal dot
# (or "->") joins the names; a bare "." would match any character.
SIMPLE_STMT = textwrap.dedent(rf'''
# simple statement
(?:
(?: # <SIMPLE_STMT>
# stmt-inline "initializer"
(?:
return \b
(?:
\s*
{_ind(INITIALIZER, 5)}
)?
)
|
# variable assignment
(?:
(?: [*] \s* )?
(?:
{STRICT_IDENTIFIER} \s*
(?: [.] | -> ) \s*
)*
{STRICT_IDENTIFIER}
(?: \s* \[ \s* \d+ \s* \] )?
\s* = \s*
{_ind(INITIALIZER, 4)}
)
|
# catchall return statement
(?:
return \b
(?:
(?:
[^'";]*
{_ind(STRING_LITERAL, 6)}
)*
\s* [^'";]*
)?
)
|
# simple statement
(?:
{_ind(SIMPLE_STMT_BODY, 4)}
)
)
\s*
(?: # <SIMPLE_ENDING>
;
)
)
# end simple statement
''')
# Pattern for the header of a compound statement (not its body).
#
# Alternatives, in order:
#   * <COMPOUND_BARE>: "else" or "do"
#   * <COMPOUND_LABELED>: "case <expr>", "default" or a non-keyword
#     identifier, each followed by ":"
#   * <COMPOUND_PAREN>: "for", "while", "if" or "switch", accepted only
#     when "(" follows (checked with a lookahead, so it is not consumed)
# Trailing whitespace after the header is consumed.
COMPOUND_STMT = textwrap.dedent(rf'''
# compound statement
(?:
\b
(?:
(?:
(?: # <COMPOUND_BARE>
else | do
)
\b
)
|
(?:
(?: # <COMPOUND_LABELED>
(?:
case \b
(?:
[^'":]*
{_ind(STRING_LITERAL, 7)}
)*
\s* [^'":]*
)
|
default
|
{STRICT_IDENTIFIER}
)
\s* [:]
)
|
(?:
(?: # <COMPOUND_PAREN>
for | while | if | switch
)
\s* (?= [(] ) # Note this lookahead.
)
)
\s*
)
# end compound statement
''')
#######################################
# function bodies
# Pattern for one item inside a function body.
#
# Alternatives, in order:
#   * <EMPTY>: a lone ";"
#   * an inline compound type declaration: optional leading text,
#     optional storage class / qualifier, kind, optional name, "{"
#   * a variable declaration: VAR_DECL, optional "= <VAR_INIT>", then
#     "," or ";"
#   * a COMPOUND_STMT header
#   * the start of a block: optional leading text followed by "{"
#   * a SIMPLE_STMT
#   * <BLOCK_CLOSE>: "}"
# The order matters because the regex engine takes the first alternative
# that matches.  The "# <NAME>" comments inside the pattern are the
# capturing markers described in the file header.
LOCAL = textwrap.dedent(rf'''
(?:
# an empty statement
(?: # <EMPTY>
;
)
|
# inline type decl
(?:
(?:
(?: # <INLINE_LEADING>
[^;{{}}]+?
)
\s*
)?
(?: # <INLINE_PRE>
(?: {STORAGE_CLASS} \s* )?
(?: {TYPE_QUALIFIER} \s* )?
)? # </INLINE_PRE>
(?: # <INLINE_KIND>
{COMPOUND_TYPE_KIND}
)
(?:
\s+
(?: # <INLINE_NAME>
{STRICT_IDENTIFIER}
)
)?
\s* {{
)
|
# var decl
(?:
(?: # <STORAGE>
{STORAGE_CLASS}
)? # </STORAGE>
(?:
\s*
(?: # <VAR_DECL>
{_ind(VAR_DECL, 5)}
)
)
(?:
(?:
# initializer
# We expect only basic initializers.
\s* = \s*
(?: # <VAR_INIT>
{_ind(INITIALIZER, 6)}
)
)?
(?:
\s*
(?: # <VAR_ENDING>
[,;]
)
)
)
)
|
{_ind(COMPOUND_STMT, 2)}
|
# start-of-block
(?:
(?: # <BLOCK_LEADING>
(?:
[^'"{{}};]*
{_ind(STRING_LITERAL, 5)}
)*
[^'"{{}};]*
# Presumably we will not see "== {{".
[^\s='"{{}});]
\s*
)? # </BLOCK_LEADING>
(?: # <BLOCK_OPEN>
{{
)
)
|
{_ind(SIMPLE_STMT, 2)}
|
# end-of-block
(?: # <BLOCK_CLOSE>
}}
)
)
''')
# Pattern for scanning a function body when only static variables matter.
#
# Alternatives, in order:
#   * an inline compound type declaration (same shape as in LOCAL)
#   * <STATIC_DECL>: "static", optional qualifier, TYPE_SPEC and
#     DECLARATOR, followed either by <STATIC_INIT> ("=", an INITIALIZER
#     and one of "," ";" "{") or by <STATIC_ENDING> ("," or ";")
#   * everything else: leading text (<DELIM_LEADING>) up to the next
#     "{" (<BLOCK_OPEN>), "}" (<BLOCK_CLOSE>) or ";" (<STMT_END>)
# The "# <NAME>" comments inside the pattern are the capturing markers
# described in the file header.
LOCAL_STATICS = textwrap.dedent(rf'''
(?:
# inline type decl
(?:
(?:
(?: # <INLINE_LEADING>
[^;{{}}]+?
)
\s*
)?
(?: # <INLINE_PRE>
(?: {STORAGE_CLASS} \s* )?
(?: {TYPE_QUALIFIER} \s* )?
)?
(?: # <INLINE_KIND>
{COMPOUND_TYPE_KIND}
)
(?:
\s+
(?: # <INLINE_NAME>
{STRICT_IDENTIFIER}
)
)?
\s* {{
)
|
# var decl
(?:
# We only look for static variables.
(?: # <STATIC_DECL>
static \b
(?: \s* {TYPE_QUALIFIER} )?
\s* {_ind(TYPE_SPEC, 4)}
\s* {_ind(DECLARATOR, 4)}
)
\s*
(?:
(?: # <STATIC_INIT>
= \s*
{_ind(INITIALIZER, 4)}
\s*
[,;{{]
)
|
(?: # <STATIC_ENDING>
[,;]
)
)
)
|
# everything else
(?:
(?: # <DELIM_LEADING>
(?:
[^'"{{}};]*
{_ind(STRING_LITERAL, 4)}
)*
\s* [^'"{{}};]*
)
(?:
(?: # <BLOCK_OPEN>
{{
)
|
(?: # <BLOCK_CLOSE>
}}
)
|
(?: # <STMT_END>
;
)
)
)
)
''')
#######################################
# global declarations
# Pattern for one top-level (file-scope) item.
#
# Alternatives, in order:
#   * <EMPTY>: a lone ";"
#   * a compound type declaration: optional leading text, kind,
#     optional name, "{"
#   * a "bogus inline decl artifact": kind plus an ANON_IDENTIFIER name,
#     followed by text up to one of "=" "," ";" "(" "{"
#   * a typedef: "typedef", a VAR_DECL, optional parenthesized function
#     parameters, ";"
#   * a function declaration/definition or variable declaration:
#     optional storage class, optional "inline", a VAR_DECL, and then
#     either a parameter list ending in "{" or ";", an old-style
#     (K&R) parameter list with trailing parameter declarations and "{",
#     or an optional "= <VAR_INIT>" ending in "," or ";"
# The order matters because the regex engine takes the first alternative
# that matches.  The "# <NAME>" comments inside the pattern are the
# capturing markers described in the file header.
GLOBAL = textwrap.dedent(rf'''
(?:
# an empty statement
(?: # <EMPTY>
;
)
|
# compound type decl (maybe inline)
(?:
(?:
(?: # <COMPOUND_LEADING>
[^;{{}}]+?
)
\s*
)?
(?: # <COMPOUND_KIND>
{COMPOUND_TYPE_KIND}
)
(?:
\s+
(?: # <COMPOUND_NAME>
{STRICT_IDENTIFIER}
)
)?
\s* {{
)
|
# bogus inline decl artifact
# This simplifies resolving the relative syntactic ambiguity of
# inline structs.
(?:
(?: # <FORWARD_KIND>
{COMPOUND_TYPE_KIND}
)
\s*
(?: # <FORWARD_NAME>
{ANON_IDENTIFIER}
)
(?: # <MAYBE_INLINE_ACTUAL>
[^=,;({{[*\]]*
[=,;({{]
)
)
|
# typedef
(?:
\b typedef \b \s*
(?: # <TYPEDEF_DECL>
{_ind(VAR_DECL, 4)}
)
(?:
# We expect no inline type definitions in the parameters.
\s* [(] \s*
(?: # <TYPEDEF_FUNC_PARAMS>
[^{{;]*
)
\s* [)]
)?
\s* ;
)
|
# func decl/definition & var decls
# XXX dedicated pattern for funcs (more restricted)?
(?:
(?:
(?: # <VAR_STORAGE>
{STORAGE_CLASS}
)
\s*
)?
(?:
(?: # <FUNC_INLINE>
\b inline \b
)
\s*
)?
(?: # <VAR_DECL>
{_ind(VAR_DECL, 4)}
)
(?:
# func decl / definition
(?:
(?:
# We expect no inline type definitions in the parameters.
\s* [(] \s*
(?: # <FUNC_PARAMS>
[^{{;]*
)
\s* [)] \s*
(?: # <FUNC_DELIM>
[{{;]
)
)
|
(?:
# This is some old-school syntax!
\s* [(] \s*
# We throw away the bare names:
{STRICT_IDENTIFIER}
(?: \s* , \s* {STRICT_IDENTIFIER} )*
\s* [)] \s*
# We keep the trailing param declarations:
(?: # <FUNC_LEGACY_PARAMS>
# There's at least one!
(?: {TYPE_QUALIFIER} \s* )?
{_ind(TYPE_SPEC, 7)}
\s*
{_ind(DECLARATOR, 7)}
\s* ;
(?:
\s*
(?: {TYPE_QUALIFIER} \s* )?
{_ind(TYPE_SPEC, 8)}
\s*
{_ind(DECLARATOR, 8)}
\s* ;
)*
)
\s* {{
)
)
|
# var / typedef
(?:
(?:
# initializer
# We expect only basic initializers.
\s* = \s*
(?: # <VAR_INIT>
{_ind(INITIALIZER, 6)}
)
)?
\s*
(?: # <VAR_ENDING>
[,;]
)
)
)
)
)
''')