# extract_sqlite_api.py | Explore in Territory

#!/usr/bin/env python3
#
# Copyright 2018 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
'''
Parses SQLite source code and produces renaming macros for its exported symbols.

Usage:
    extract_sqlite_api.py sqlite.h rename_exports.h

For example, the following renaming macro is produced for sqlite3_initialize().

    #define sqlite3_initialize chrome_sqlite3_initialize
'''

from datetime import datetime
import re
import sys


class ExtractError(ValueError):
    '''Raised when a C statement cannot be understood by the extractor.

    Attributes:
        message: Human-readable description of the parsing failure.
    '''

    def __init__(self, message):
        # Forward the message to ValueError so that str(error) and error.args
        # carry it no matter how the exception was constructed (positionally
        # or with message=...).
        super().__init__(message)
        self.message = message


def ExtractLineTuples(string):
    '''Splits text into numbered lines with surrounding whitespace removed.

    The text is split on '\\n'. The result is a list of
    (line number, stripped line) tuples; line numbers start at 1.
    '''
    return [
        (number, text.strip())
        for number, text in enumerate(string.split('\n'), start=1)
    ]


def ExtractPreprocessorDirectives(lines):
    '''Extracts preprocessor directives from lines of C code.

    Each input line should be a tuple of (line number, string).

    A directive starts on a line whose first character is '#'. While a
    directive line ends with a backslash, the following line is treated as
    part of the same directive. The trailing backslash is dropped and the
    pieces of one directive are joined with newlines.

    Returns a list of preprocessor directives, and a list of C code lines with
    the preprocessor directives removed. The returned code lines are a subset
    of the input tuples.
    '''
    code_lines = []
    directives = []
    in_directive = False
    last_directive = []
    for line_tuple in lines:
        line = line_tuple[1]
        # Preprocessor directives start with #.
        if not in_directive and line.startswith('#'):
            in_directive = True
            last_directive = []

        if not in_directive:
            code_lines.append(line_tuple)
            continue

        # Preprocessor directives use \ as a line continuation character.
        # endswith() is used instead of indexing so that an empty line inside
        # a continued directive ends the directive instead of raising
        # IndexError.
        if line.endswith('\\'):
            line = line[:-1]
        else:
            in_directive = False
        last_directive.append(line)

        if not in_directive:
            directives.append('\n'.join(last_directive))

    # The input ended while a directive was still being continued. Keep what
    # was collected rather than silently dropping the directive.
    if in_directive:
        directives.append('\n'.join(last_directive))

    return directives, code_lines


# Matches "#define NAME" when NAME is followed by whitespace or the end of the
# text. Group 1 captures NAME. A name followed directly by "(" (a
# function-like macro) does not match.
DEFINITION_RE = re.compile(r'^\#\s*define\s+(\w+)(\s|$)')


def ExtractDefineMacroName(line):
    '''Returns the name defined by a non-function #define directive.

    The result is None when the given preprocessor line is not a macro
    definition. Function-like macros are not treated as definitions, so they
    also produce None.
    '''
    found = DEFINITION_RE.match(line)
    return found.group(1) if found is not None else None


# Matches a C++-style // comment running to the end of the text.
SINGLE_LINE_COMMENT_RE = re.compile(r'//.*$')
# Matches a complete C-style /* ... */ comment (shortest match).
MULTI_LINE_COMMENT_RE = re.compile(
    r'/\*.*?\*/', flags=re.MULTILINE | re.DOTALL)


def RemoveLineComments(line):
    '''Strips comments that are fully contained in one line of C code.

    Complete /* ... */ comments are removed first, then any // comment. A
    C-style comment that is opened but not closed within the given text is
    left in place; callers must deal with comments spanning several lines.
    '''
    without_block_comments = MULTI_LINE_COMMENT_RE.sub('', line)
    return SINGLE_LINE_COMMENT_RE.sub('', without_block_comments)


def RemoveComments(code_tuples):
    '''Returns the given C code tuples with all comments removed.

    The input is (line number, line) tuples. Lines that lie entirely inside a
    multi-line /* ... */ comment are dropped from the output; every other line
    is kept, with its original line number, minus its comment text.
    '''
    cleaned = []
    inside_block_comment = False
    for number, text in code_tuples:
        if inside_block_comment:
            closer = text.find('*/')
            if closer < 0:
                # The whole line belongs to the open comment.
                continue
            # Keep only what follows the first comment terminator.
            text = text[closer + len('*/'):]
            inside_block_comment = False

        text = RemoveLineComments(text)
        opener = text.find('/*')
        if opener >= 0:
            # A comment opens here and continues on the following lines.
            text = text[:opener]
            inside_block_comment = True
        cleaned.append((number, text))
    return cleaned


# Characters that end a statement piece within a line of C code.
STATEMENT_BREAK_RE = re.compile(r'[;{}]')


def ToStatementTuples(code_tuples):
    '''Groups C code lines into statements.

    The input is (line number, code string) tuples. The output is a list of
    (first line, last line, statement text) tuples, where the statement text
    has its surrounding whitespace stripped.

    The characters ; { and } are all treated as statement terminators, which
    is accurate enough for locating exported declarations. Text after the last
    terminator of the input is never emitted.
    '''
    results = []
    pending = ''  # Text of a statement that has not been terminated yet.
    first_line = 0

    for number, text in code_tuples:
        # Everything before the final split piece was terminated on this line.
        *finished, tail = STATEMENT_BREAK_RE.split(text)
        for fragment in finished:
            if pending == '':
                results.append((number, number, fragment.strip()))
                continue
            joined = pending + '\n' + fragment
            results.append((first_line, number, joined.strip()))
            pending = ''

        if pending == '':
            first_line = number
        if tail != '':
            pending = pending + '\n' + tail

    return results


# Separator used to split a statement into words.
WHITESPACE_RE = re.compile(r'\s+')

# Keywords the extractor refuses to handle.
#
# Struct and enum literals are not parsed because sqlite typedefs them before
# using them in exported symbols. Since enum, struct, and union are rejected,
# curly braces never need to be matched, and only declarations built from
# typedef names and primitive types have to be considered.
UNSUPPORTED_KEYWORDS = {'enum', 'struct', 'union', 'typedef'}

# Storage-class specifiers and type qualifiers.
#
# These are skipped: they are not needed to find where the declaration
# specifiers end. Skipping them also discards pointer type qualifiers.
QUALIFIER_KEYWORDS = {
    'extern',
    'static',
    'auto',
    'register',
    'const',
    'volatile',
}

# Keywords that make up primitive types.
#
# A single type may use several of them, e.g. "long long int".
COMPOSITE_TYPE_SPECIFIERS = {
    'char',
    'short',
    'int',
    'long',
    'float',
    'double',
    'signed',
    'unsigned',
}

# Matches a word made only of identifier characters.
IDENTIFIER_RE = re.compile(r'^[a-zA-Z_0-9]+$')


def _RequireIdentifier(word, error_template):
    'Raises ExtractError built from error_template if word is no identifier.'
    if IDENTIFIER_RE.match(word) is None:
        raise ExtractError(error_template % word)


def ExtractApiExport(macro_names, api_export_macro, statement):
    '''Finds the symbol declared by a statement that exports a function.

    Args:
        macro_names: Names of known preprocessor macros; these words are
            skipped when they appear in the statement.
        api_export_macro: The word that marks a statement as an export.
        statement: Text of one C statement.

    Returns:
        The declared symbol name, or None if the statement does not contain
        api_export_macro as a word.

    Raises:
        ExtractError: The statement is an export but cannot be parsed.
    '''
    # Background: C99 (n1256.pdf on open-std.org), section 6.7. A declaration
    # is a run of declaration-specifiers followed by a list of declarators.
    # Several declarators in one declaration are possible in principle, e.g.
    #
    # int foo(int), bar(int, int);
    #
    # but nobody declares functions that way, so exactly one declarator is
    # assumed. Per section 6.7.5 a declarator may start with pointer
    # specifiers, possibly carrying qualifiers ('int * const * const foo'),
    # yet the declared name is always the first identifier that is not a type
    # qualifier. The job is therefore to step over the declaration specifiers.

    # Pointer stars carry no information here; parentheses and brackets are
    # padded so that each opening one becomes its own word.
    flattened = (statement.replace('*', ' ')
                 .replace('(', ' ( ')
                 .replace('[', ' [ '))
    tokens = WHITESPACE_RE.split(flattened)

    # Statements without the export marker are of no interest.
    if api_export_macro not in tokens:
        return None

    found_builtin_type = False
    found_named_type = False
    for token in tokens:
        if token in UNSUPPORTED_KEYWORDS:
            raise ExtractError("Unsupported keyword %s" % token)

        if token in QUALIFIER_KEYWORDS:
            continue

        # Section 6.7.2: there is at least one type specifier. A typedef name
        # stands alone, while keywords such as 'int' may come in groups.
        if token in COMPOSITE_TYPE_SPECIFIERS:
            if found_named_type:
                raise ExtractError(
                    'Mixed simple (struct_name) and composite (int) types')
            found_builtin_type = True
        elif token in macro_names or token == api_export_macro:
            # Macros are assumed to stand for qualifiers only, so skip them.
            pass
        elif found_builtin_type or found_named_type:
            # The type has been consumed already; this word is the name.
            _RequireIdentifier(
                token, "%s parsed as symbol name, which doesn't make sense")
            return token
        else:
            # No type seen so far, so this word has to be a typedef name.
            found_named_type = True
            _RequireIdentifier(
                token, "%s parsed as type name, which doesn't make sense")

    raise ExtractError('Failed to find symbol name')


def ExportedSymbolLine(symbol_prefix, symbol, statement_tuple):
    '''Returns the renaming macro line for one exported symbol.

    The macro maps symbol to symbol_prefix + symbol. A trailing comment gives
    the source line, or line range, taken from the first two items of
    statement_tuple.
    '''
    first_line, last_line = statement_tuple[0], statement_tuple[1]
    if first_line != last_line:
        location = 'Lines %d-%d' % (first_line, last_line)
    else:
        location = 'Line %d' % first_line
    return '#define %s %s%s  // %s' % (
        symbol, symbol_prefix, symbol, location)


def ExportedExceptionLine(exception, statement_tuple):
    '''Returns the output line that reports a parsing failure.

    The line is a TODO comment carrying the statement's line range and the
    exception's message attribute.
    '''
    first_line, last_line = statement_tuple[0], statement_tuple[1]
    detail = exception.message

    # The TODO deliberately names no symbol, so that a broken parsing result
    # does not get checked in by accident.
    return '// TODO: Lines %d-%d -- %s' % (first_line, last_line, detail)


def ProcessSource(api_export_macro, symbol_prefix, header_line, footer_line,
                  file_content):
    '''Returns the lines that rename the exported symbols of a C program.

    file_content is the text of the C source. Comments are removed,
    preprocessor directives are separated from the code, and the remaining
    code is split into statements. Each statement containing api_export_macro
    contributes either a renaming macro line or, when it cannot be parsed, a
    TODO line.

    The generated lines are sorted and returned between header_line and
    footer_line.
    '''
    comment_free = RemoveComments(ExtractLineTuples(file_content))
    directives, code_tuples = ExtractPreprocessorDirectives(comment_free)
    macro_names = {
        name for name in map(ExtractDefineMacroName, directives)
        if name is not None
    }

    body = []
    for statement_tuple in ToStatementTuples(code_tuples):
        try:
            symbol_name = ExtractApiExport(macro_names, api_export_macro,
                                           statement_tuple[2])
        except ExtractError as exception:
            body.append(ExportedExceptionLine(exception, statement_tuple))
        else:
            if symbol_name:
                body.append(ExportedSymbolLine(symbol_prefix, symbol_name,
                                               statement_tuple))

    return [header_line] + sorted(body) + [footer_line]


def ProcessSourceFile(api_export_macro, symbol_prefix, header_line,
                      footer_line, input_file, output_file):
    '''Reads a C program file and writes macros renaming its exported symbols.

    input_file is read in full and handed to ProcessSource(). The resulting
    lines are joined with newlines, followed by one final newline, and written
    to output_file.
    '''
    with open(input_file, 'r') as source:
        file_content = source.read()

    generated = ProcessSource(api_export_macro, symbol_prefix, header_line,
                              footer_line, file_content)
    # The extra empty item makes the joined text end with a newline.
    rendered = '\n'.join(generated + [''])

    with open(output_file, 'w') as destination:
        destination.write(rendered)


# Text placed at the top of the generated header: the license banner, a
# "generated by" notice, and the opening include guard. The copyright year is
# filled in with the current year at the time this module is loaded.
header_line = '''// Copyright %s The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// This file is generated by extract_sqlite_api.py.

#ifndef THIRD_PARTY_SQLITE_AMALGAMATION_RENAME_EXPORTS_H_
#define THIRD_PARTY_SQLITE_AMALGAMATION_RENAME_EXPORTS_H_
''' % datetime.now().strftime('%Y')

# Text placed at the bottom of the generated header; it closes the include
# guard opened by header_line.
footer_line = '''
#endif  // THIRD_PARTY_SQLITE_AMALGAMATION_RENAME_EXPORTS_H_
'''

# Command-line entry point. The first argument is the C source file to scan
# and the second is the path of the header file to write.
if __name__ == '__main__':
    ProcessSourceFile(
        api_export_macro='SQLITE_API',
        symbol_prefix='chrome_',
        header_line=header_line,
        footer_line=footer_line,
        input_file=sys.argv[1],
        output_file=sys.argv[2])
# chromium/third_party/sqlite/scripts/extract_sqlite_api.py