build_utf8_validator_tables.cc | Explore in Territory

// Copyright 2014 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifdef UNSAFE_BUFFERS_BUILD
// TODO(crbug.com/40284755): Remove this and spanify to fix the errors.
#pragma allow_unsafe_buffers
#endif

// Create a state machine for validating UTF-8. The algorithm in brief:
// 1. Convert the complete unicode range of code points, except for the
//    surrogate code points, to an ordered array of sequences of bytes in
//    UTF-8.
// 2. Convert individual bytes to ranges, starting from the right of each byte
//    sequence. For each range, ensure the bytes on the left and the ranges
//    on the right are the identical.
// 3. Convert the resulting list of ranges into a state machine, collapsing
//    identical states.
// 4. Convert the state machine to an array of bytes.
// 5. Output as a C++ file.
//
// To use:
//  $ ninja -C out/Release build_utf8_validator_tables
//  $ out/Release/build_utf8_validator_tables
//                                   --output=base/i18n/utf8_validator_tables.cc
//  $ git add base/i18n/utf8_validator_tables.cc
//
// Because the table is not expected to ever change, it is checked into the
// repository rather than being regenerated at build time.
//
// This code uses type uint8_t throughout to represent bytes, to avoid
// signed/unsigned char confusion.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <algorithm>
#include <map>
#include <string>
#include <vector>

#include "base/command_line.h"
#include "base/files/file_path.h"
#include "base/files/file_util.h"
#include "base/logging.h"
#include "base/memory/raw_ptr.h"
#include "base/numerics/safe_conversions.h"
#include "base/strings/stringprintf.h"
#include "third_party/icu/source/common/unicode/utf8.h"

namespace {

const char kHelpText[] = …;

const char kProlog[] = …;

const char kEpilog[] = …;

// Ranges are inclusive at both ends--they represent [from, to]
class Range { … };

// A vector of Ranges is like a simple regular expression--it corresponds to
// a set of strings of the same length that have bytes in each position in
// the appropriate range.
StringSet;

// A UTF-8 "character" is represented by a sequence of bytes.
Character;

// In the second stage of the algorithm, we want to convert a large list of
// Characters into a small list of StringSets.
struct Pair { … };

PairVector;

// A class to print a table of numbers in the same style as clang-format.
class TablePrinter { … };

// Start by filling a PairVector with characters. The resulting vector goes from
// "\x00" to "\xf4\x8f\xbf\xbf".
PairVector InitializeCharacters() { … }

// Construct a new Pair from |character| and the concatenation of |new_range|
// and |existing_set|, and append it to |pairs|.
void ConstructPairAndAppend(const Character& character,
                            const Range& new_range,
                            const StringSet& existing_set,
                            PairVector* pairs) { … }

// Each pass over the PairVector strips one byte off the right-hand-side of the
// characters and adds a range to the set on the right. For example, the first
// pass converts the range from "\xe0\xa0\x80" to "\xe0\xa0\xbf" to ("\xe0\xa0",
// [\x80-\xbf]), then the second pass converts the range from ("\xe0\xa0",
// [\x80-\xbf]) to ("\xe0\xbf", [\x80-\xbf]) to ("\xe0",
// [\xa0-\xbf][\x80-\xbf]).
void MoveRightMostCharToSet(PairVector* pairs) { … }

void MoveAllCharsToSets(PairVector* pairs) { … }

// Logs the generated string sets in regular-expression style, ie. [\x00-\x7f],
// [\xc2-\xdf][\x80-\xbf], etc. This can be a useful sanity-check that the
// algorithm is working. Use the command-line option
// --vmodule=build_utf8_validator_tables=1 to see this output.
void LogStringSets(const PairVector& pairs) { … }

// A single state in the state machine is represented by a sorted vector of
// start bytes and target states. All input bytes in the range between the start
// byte and the next entry in the vector (or 0xFF) result in a transition to the
// target state.
struct StateRange { … };

State;

// Generates a state where all bytes go to state 1 (invalid). This is also used
// as an initialiser for other states (since bytes from outside the desired
// range are invalid).
State GenerateInvalidState() { … }

// A map from a state (ie. a set of strings which will match from this state) to
// a number (which is an index into the array of states).
StateMap;

// Create a new state corresponding to |set|, add it |states| and |state_map|
// and return the index it was given in |states|.
uint8_t MakeState(const StringSet& set,
                  std::vector<State>* states,
                  StateMap* state_map) { … }

std::vector<State> GenerateStates(const PairVector& pairs) { … }

// Output the generated states as a C++ table. Two tricks are used to compact
// the table: each state in the table starts with a shift value which indicates
// how many bits we can discard from the right-hand-side of the byte before
// doing the table lookup. Secondly, only the state-transitions for bytes
// with the top-bit set are included in the table; bytes without the top-bit set
// are just ASCII and are handled directly by the code.
void PrintStates(const std::vector<State>& states, FILE* stream) { … }

}  // namespace

int main(int argc, char* argv[]) { … }
chromium/base/i18n/build_utf8_validator_tables.cc