# parser.py | Explore in Territory

"""
A direct translation of the webvtt file parsing algorithm.

See https://w3c.github.io/webvtt/#file-parsing for documentation
"""
import re
import string

# The characters treated as whitespace by the parsing steps in this file:
# space, tab, line feed, form feed and carriage return.
SPACE_CHARACTERS = [' ', '\t', '\n', '\f', '\r']
# Separator pattern passed to re.split() by the settings parsers.  It must be
# `+` (one or more): a pattern that can match the empty string makes re.split()
# on Python 3.7+ split between every character, so no "name:value" setting
# would ever survive the split.
SPACE_SPLIT_PATTERN = r"[{}]+".format(''.join(SPACE_CHARACTERS))
# ASCII digits only ("0123456789").
DIGITS = string.digits

class DictInit:
    """Minimal record type: every keyword argument becomes an instance attribute."""

    def __init__(self, **attributes):
        # Named `attributes` rather than `dict` so the builtin is not shadowed.
        self.__dict__.update(attributes)

    def __repr__(self):
        # Show the stored fields instead of the default object address; the
        # fields are sorted by name so the text is stable across runs.
        fields = ', '.join(
            '{}={!r}'.format(name, value)
            for name, value in sorted(vars(self).items())
        )
        return '{}({})'.format(self.__class__.__name__, fields)

class VTTCue(DictInit):
    """A WebVTT cue; its fields are the keyword arguments it is built with."""
class VTTRegion(DictInit):
    """A WebVTT region; its fields are the keyword arguments it is built with."""
class Stylesheet(DictInit):
    """A style sheet record; its fields are the keyword arguments it is built with."""

class W3CParser:
    """Cursor helpers shared by the parsers in this file.

    Subclasses assign ``input`` (the string being parsed) and ``position``
    (the index of the next unread character) before using the helpers.
    """

    input = None
    position = None

    def collect_characters(self, condition):
        """Consume and return the run of characters accepted by ``condition``.

        Starting at ``self.position``, the cursor moves forward while it is
        inside ``self.input`` and ``condition(character)`` is truthy.

        Args:
            condition: callable taking one character and returning a truth value.

        Returns:
            The consumed characters as a string (empty if none were accepted).
        """
        start = self.position
        end = len(self.input)
        while self.position < end and condition(self.input[self.position]):
            self.position += 1
        # One slice instead of growing a string one character at a time.
        return self.input[start:self.position]

    def skip_whitespace(self):
        """Advance the cursor past any characters listed in SPACE_CHARACTERS."""
        self.collect_characters(lambda c: c in SPACE_CHARACTERS)

    def parse_percentage_string(self, input):
        """Parse a percentage string such as ``"12.5%"``.

        Args:
            input: the candidate string.

        Returns:
            The percentage as a float in the range 0..100, or ``None`` when
            ``input`` is not a valid percentage string.
        """
        # 1. (input is used as passed)

        # 2. ASCII digits only, and \Z rather than $ so that a trailing
        #    newline is rejected instead of reaching float() below.
        if not re.match(r'[0-9]+(\.[0-9]+)?%\Z', input):
            return None

        # 3.
        percentage = float(input[:-1])

        # 4.
        if percentage < 0 or percentage > 100:
            return None

        # 5.
        return percentage

class VTTParser(W3CParser):
    def __init__(self, input):
        """Set up a parser over ``input`` with empty result collections."""
        # Cursor state used by the inherited character-collecting helpers.
        self.input, self.position = input, 0
        # Flag consulted when deciding whether STYLE/REGION blocks are accepted.
        self.seen_cue = False

        # Result collections filled in while parsing.
        self.text_tracks, self.stylesheets = [], []
        self.regions, self.errors = [], []

    def parse(self):
        """Run the WebVTT parser algorithm over ``self.input``.

        Cues are appended to ``self.text_tracks``, style sheets to
        ``self.stylesheets`` and regions to ``self.regions``.  When the file
        signature is rejected a message is appended to ``self.errors`` and
        parsing stops.  Always returns ``None``.
        """
        def is_line_feed(char):
            return char == '\n'

        def is_not_line_feed(char):
            return char != '\n'

        # 1. NUL becomes U+FFFD; CRLF pairs and lone CRs become LF.
        normalized = self.input.replace('\0', '\ufffd')
        normalized = normalized.replace('\r\n', '\n')
        self.input = normalized.replace('\r', '\n')

        # 2. and 3.
        self.position = 0
        self.seen_cue = False

        # 4.
        total = len(self.input)
        if total < 6:
            self.errors.append('input too small for webvtt')
            return

        # 5. and 6. Exactly "WEBVTT", or "WEBVTT" followed by space/tab/LF.
        if total == 6:
            signature_ok = self.input == 'WEBVTT'
        else:
            signature_ok = (
                self.input[0:6] == 'WEBVTT'
                and self.input[6] in ['\u0020', '\u0009', '\u000A']
            )
        if not signature_ok:
            self.errors.append('invalid webvtt header')
            return

        # 7. Skip the rest of the signature line.
        self.collect_characters(is_not_line_feed)

        # 8.
        if self.position >= len(self.input):
            return

        # 9.
        if is_line_feed(self.input[self.position]):
            self.position += 1

        # 10.
        if self.position >= len(self.input):
            return

        # 11. Anything directly below the signature line is a header block.
        if is_line_feed(self.input[self.position]):
            self.position += 1
        else:
            self.collect_block(in_header=True)

        # 12.
        self.collect_characters(is_line_feed)

        # 13.
        self.regions = []

        # 14. Route each collected block (steps 1.-4.) to its result list.
        destinations = (
            (VTTCue, self.text_tracks),
            (Stylesheet, self.stylesheets),
            (VTTRegion, self.regions),
        )
        while self.position < len(self.input):
            block = self.collect_block()
            for block_type, destination in destinations:
                if isinstance(block, block_type):
                    destination.append(block)
                    break

            # 5.
            self.collect_characters(is_line_feed)

        # 15. (falls off the end, returning None)

    def collect_block(self, in_header=False):
        """Collect a WebVTT block starting at ``self.position``.

        Lines are read until an empty line, the end of input, or a line
        containing ``-->`` in a place where cue timings are not accepted.

        Args:
            in_header: when true, cue timings and STYLE/REGION headers are not
                recognised; a line containing ``-->`` ends the block, records
                an error and rewinds to the last accepted line.

        Returns:
            A ``VTTCue``, ``Stylesheet`` or ``VTTRegion`` built from the block,
            or ``None`` when the block is none of those.
        """
        # 1. (done by class)

        line_count = 0                    # 2.
        previous_position = self.position # 3.
        line = ""                         # 4.
        buffer = ""                       # 5.
        seen_eof = False                  # 6.
        seen_arrow = False                # 7.
        cue = None                        # 8.
        stylesheet = None                 # 9.
        region = None                     # 10.

        # 11.
        while True:
            # 1.
            line = self.collect_characters(lambda c: c != '\n')

            # 2.
            line_count += 1

            # 3. Either note the end of input or step over the line feed.
            if self.position >= len(self.input):
                seen_eof = True
            else:
                self.position += 1

            # 4.
            if '-->' in line:
                # 1. Timings are accepted on line 1, or on line 2 when line 1
                #    was an identifier rather than a timings line.
                if not in_header and (line_count == 1 or (line_count == 2 and not seen_arrow)):
                    # 1.
                    seen_arrow = True

                    # 2.
                    previous_position = self.position

                    # 3.
                    cue = VTTCue(
                        id = buffer,
                        pause_on_exit = False,
                        region = None,
                        writing_direction = 'horizontal',
                        snap_to_lines = True,
                        line = 'auto',
                        line_alignment = 'start alignment',
                        position = 'auto',
                        position_alignment = 'auto',
                        cue_size = 100,
                        text_alignment = 'center',
                        text = '',
                    )

                    # 4.
                    if not VTTCueParser(self, line, cue).collect_cue_timings_and_settings():
                        cue = None
                    else:
                        buffer = ''
                        self.seen_cue = True # DIFFERENCE

                else:
                    # A stray "-->" line belongs to the next block: rewind so
                    # the caller reads it again as the start of a new block.
                    self.errors.append('invalid webvtt cue block')
                    self.position = previous_position
                    break

            # 5.
            elif line == '':
                break

            # 6.
            else:
                # 1. On the second line, check whether the first line was a
                #    STYLE or REGION header.  Only ASCII whitespace may follow
                #    the keyword, so an explicit character class is used
                #    instead of \s (which also matches non-ASCII whitespace).
                if not in_header and line_count == 2:
                    # 1.
                    if not self.seen_cue and re.match(r'STYLE[ \t\n\f\r]*\Z', buffer):
                        stylesheet = Stylesheet(
                            location = None,
                            parent = None,
                            owner_node = None,
                            owner_rule = None,
                            media = None,
                            title = None,
                            alternate = False,
                            origin_clean = True,
                            source = None,
                        )
                        buffer = ''
                    # 2.
                    elif not self.seen_cue and re.match(r'REGION[ \t\n\f\r]*\Z', buffer):
                        region = VTTRegion(
                            id = '',
                            width = 100,
                            lines = 3,
                            anchor_point = (0, 100),
                            viewport_anchor_point = (0, 100),
                            scroll_value = None,
                        )
                        buffer = ''

                # 2.
                if buffer != '':
                    buffer += '\n'

                # 3.
                buffer += line

                # 4.
                previous_position = self.position

            # 7.
            if seen_eof:
                break

        # 12.
        if cue is not None:
            cue.text = buffer
            return cue

        # 13.
        elif stylesheet is not None:
            stylesheet.source = buffer
            return stylesheet

        # 14.
        elif region is not None:
            self.collect_region_settings(region, buffer)
            return region

        # 15.
        return None

    def collect_region_settings(self, region, input):
        """Collect WebVTT region settings from ``input`` into ``region``.

        ``input`` is split on SPACE_SPLIT_PATTERN and each ``name:value`` piece
        is applied to ``region``.  Pieces with an unknown name or an invalid
        value are ignored.

        Args:
            region: the object whose attributes are updated in place.
            input: the text of the region block.
        """
        # 1.
        settings = re.split(SPACE_SPLIT_PATTERN, input)

        # 2.
        for setting in settings:
            # 1. Needs a colon that is neither the first nor the last character.
            # 2. and 3. name is the text before the first colon, value the rest.
            name, colon, value = setting.partition(':')
            if not colon or not name or not value:
                continue

            # 4.
            if name == "id":
                region.id = value

            elif name == "width":
                percentage = self.parse_percentage_string(value)
                if percentage is not None:
                    region.width = percentage

            elif name == "lines":
                # 1. ASCII digits only (\d would also accept other scripts).
                if not re.match(r'[0-9]+\Z', value):
                    continue

                # 2. and 3.
                region.lines = int(value)

            elif name == "regionanchor":
                # 1.-5.
                anchor = self._parse_anchor(value)
                if anchor is not None:
                    region.anchor_point = anchor

            elif name == "viewportanchor":
                # 1.-5.
                anchor = self._parse_anchor(value)
                if anchor is not None:
                    region.viewport_anchor_point = anchor

            elif name == "scroll":
                # 1.
                if value == "up":
                    region.scroll_value = "up"

            # 5. (continue with the next setting)

    def _parse_anchor(self, value):
        """Parse ``"X%,Y%"`` into an ``(x, y)`` tuple of percentages, or ``None``."""
        # 1.-3. Split at the first comma; without a comma the value is invalid.
        x_text, comma, y_text = value.partition(',')
        if not comma:
            return None

        # 4. Both halves must be valid percentage strings.
        x = self.parse_percentage_string(x_text)
        y = self.parse_percentage_string(y_text)
        if x is None or y is None:
            return None

        # 5.
        return (x, y)


class VTTCueParser(W3CParser):
    def __init__(self, parent, input, cue):
        """Set up a parser for one cue timings line.

        ``parent`` supplies the shared ``errors`` list (and is kept for later
        lookups), ``input`` is the line to parse and ``cue`` is the object
        whose attributes get filled in.
        """
        self.cue = cue
        self.input = input
        self.position = 0
        self.parent = parent
        # Same list object as the parent's, so errors show up there.
        self.errors = parent.errors

    def collect_cue_timings_and_settings(self):
        """Collect WebVTT cue timings and settings from ``self.input``.

        Expects ``<start> --> <end> [settings]``.  On success the cue's
        ``start_time`` and ``end_time`` are set and the remainder of the line
        is handed to ``parse_settings``.

        Returns:
            ``True`` on success, ``False`` when the line is not a valid timings
            line.  An invalid timestamp also appends a message to
            ``self.errors``.
        """
        # 1. (handled by class)

        # 2.
        self.position = 0

        # 3.
        self.skip_whitespace()

        # 4.
        start_time = self.collect_timestamp()
        if start_time is None:
            self.errors.append('invalid start time for VTTCue')
            return False
        self.cue.start_time = start_time

        # 5.
        self.skip_whitespace()

        # 6., 7. and 8. Expect "-", "-", ">" in turn.  Running off the end of
        # the input is a failure, not an IndexError.
        for expected in '-->':
            if self.position >= len(self.input) or self.input[self.position] != expected:
                return False
            self.position += 1

        # 9.
        self.skip_whitespace()

        # 10.
        end_time = self.collect_timestamp()
        if end_time is None:
            self.errors.append('invalid end time for VTTCue')
            return False
        self.cue.end_time = end_time

        # 11.
        remainder = self.input[self.position:]

        # 12.
        self.parse_settings(remainder)

        # Extra
        return True

    def parse_settings(self, input):
        """Parse the WebVTT cue settings in ``input`` and apply them to ``self.cue``.

        ``input`` is split on SPACE_SPLIT_PATTERN and each ``name:value`` piece
        is applied to the cue.  Pieces with an unknown name or an invalid value
        are ignored.

        Args:
            input: the part of the timings line that follows the end timestamp.
        """
        # 1.
        settings = re.split(SPACE_SPLIT_PATTERN, input)

        # 2.
        for setting in settings:
            # 1. Needs a colon that is neither the first nor the last character.
            # 2. and 3. name is the text before the first colon, value the rest.
            name, colon, value = setting.partition(':')
            if not colon or not name or not value:
                continue

            # 4.
            if name == 'region':
                # 1. The most recently defined region with this id, if any.
                matching = (region for region in reversed(self.parent.regions) if region.id == value)
                self.cue.region = next(matching, None)

            elif name == 'vertical':
                # 1. and 2.
                if value in ('rl', 'lr'):
                    self.cue.writing_direction = value

            elif name == 'line':
                # 1. and 2. Optional ",alignment" suffix.
                linepos, comma, linealign = value.partition(',')
                if not comma:
                    linealign = None

                # 3. At least one ASCII digit (this also rules out an empty linepos).
                if not re.search(r'[0-9]', linepos):
                    continue

                # 4.
                is_percentage = linepos[-1] == '%'
                if is_percentage:
                    number = self.parse_percentage_string(linepos)
                    if number is None:
                        continue
                else:
                    # 1. Only "-", "." and ASCII digits.
                    if not re.match(r'[-.0-9]*\Z', linepos):
                        continue

                    # 2. A minus sign is only allowed as the first character.
                    if '-' in linepos[1:]:
                        continue

                    # 3.
                    if linepos.count('.') > 1:
                        continue

                    # 4. A dot needs a digit on both sides.
                    if '.' in linepos and not re.search(r'[0-9]\.[0-9]', linepos):
                        continue

                    # 5.
                    number = float(linepos)

                # 5., 6. and 7.
                if linealign in ('start', 'center', 'end'):
                    self.cue.line_alignment = linealign

                # 8.
                elif linealign is not None:
                    continue

                # 9.
                self.cue.line = number

                # 10. A percentage is an absolute position, not a line number.
                self.cue.snap_to_lines = not is_percentage

            elif name == 'position':
                # 1. and 2. Optional ",alignment" suffix.
                colpos, comma, colalign = value.partition(',')
                if not comma:
                    colalign = None

                # 3.
                number = self.parse_percentage_string(colpos)
                if number is None:
                    continue

                # 4., 5. and 6. This is the *position* alignment; the line
                # alignment belongs to the "line" setting and is left alone.
                if colalign in ('line-left', 'center', 'line-right'):
                    self.cue.position_alignment = colalign

                # 7.
                elif colalign is not None:
                    continue

                # 8.
                self.cue.position = number

            elif name == 'size':
                # 1.
                number = self.parse_percentage_string(value)
                if number is None:
                    continue

                # 2.
                self.cue.cue_size = number

            elif name == 'align':
                # 1.-5.
                if value in ('start', 'center', 'end', 'left', 'right'):
                    self.cue.text_alignment = value

            # 5. (continue with the next setting)

    def collect_timestamp(self):
        """Collect a WebVTT timestamp starting at ``self.position``.

        Accepts ``mm:ss.ttt`` and ``h...:mm:ss.ttt``.  The first field is read
        as hours when it is not exactly two digits, when it exceeds 59, or
        when a third ``:``-separated field follows.

        Returns:
            The timestamp in seconds as a float, or ``None`` when the text at
            the cursor is not a valid timestamp.  The cursor is left wherever
            parsing stopped.
        """
        def is_digit(char):
            return char in DIGITS

        # 1. (handled by class)

        # 2.
        most_significant_units = 'minutes'

        # 3.
        if self.position >= len(self.input):
            return None

        # 4.
        if self.input[self.position] not in DIGITS:
            return None

        # 5.
        digits = self.collect_characters(is_digit)

        # 6.
        value_1 = int(digits)

        # 7.
        if len(digits) != 2 or value_1 > 59:
            most_significant_units = 'hours'

        # 8.
        if self.position >= len(self.input) or self.input[self.position] != ':':
            return None
        self.position += 1

        # 9.
        digits = self.collect_characters(is_digit)

        # 10.
        if len(digits) != 2:
            return None

        # 11.
        value_2 = int(digits)

        # 12.
        if most_significant_units == 'hours' or (
                self.position < len(self.input) and self.input[self.position] == ':'):
            # 1.
            if self.position >= len(self.input) or self.input[self.position] != ':':
                return None
            self.position += 1

            # 2.
            digits = self.collect_characters(is_digit)

            # 3.
            if len(digits) != 2:
                return None

            # 4.
            value_3 = int(digits)
        else:
            # Only two fields were given: shift them down to minutes:seconds.
            value_3 = value_2
            value_2 = value_1
            value_1 = 0

        # 13.
        if self.position >= len(self.input) or self.input[self.position] != '.':
            return None
        self.position += 1

        # 14.
        digits = self.collect_characters(is_digit)

        # 15.
        if len(digits) != 3:
            return None

        # 16.
        value_4 = int(digits)

        # 17. Minutes and seconds may be at most 59 (59 itself is valid).
        if value_2 > 59 or value_3 > 59:
            return None

        # 18. 1000.0 keeps this a float division on every interpreter.
        result = value_1 * 60 * 60 + value_2 * 60 + value_3 + value_4 / 1000.0

        # 19.
        return result


def main(argv):
    """Parse each WebVTT file named in ``argv[1:]`` and print what was found.

    Args:
        argv: command-line style argument list; ``argv[0]`` is ignored.
    """
    for path in argv[1:]:
        # One file at a time, so no handle is left open if a later path
        # cannot be opened or parsing raises.
        with open(path, 'r') as file:
            parser = VTTParser(file.read())
            parser.parse()

            print("Results: {}".format(file))
            print("  Cues: {}".format(parser.text_tracks))
            print("  StyleSheets: {}".format(parser.stylesheets))
            print("  Regions: {}".format(parser.regions))
            print("  Errors: {}".format(parser.errors))

# Command-line entry point: hand the process arguments to main().
if __name__ == '__main__':
    from sys import argv as cli_arguments
    main(cli_arguments)
# chromium/third_party/blink/web_tests/external/wpt/webvtt/parsing/file-parsing/tools/parser.py