// Copyright 2014 The Chromium Authors // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #ifdef UNSAFE_BUFFERS_BUILD // TODO(crbug.com/40284755): Remove this and spanify to fix the errors. #pragma allow_unsafe_buffers #endif // Create a state machine for validating UTF-8. The algorithm in brief: // 1. Convert the complete unicode range of code points, except for the // surrogate code points, to an ordered array of sequences of bytes in // UTF-8. // 2. Convert individual bytes to ranges, starting from the right of each byte // sequence. For each range, ensure the bytes on the left and the ranges // on the right are the identical. // 3. Convert the resulting list of ranges into a state machine, collapsing // identical states. // 4. Convert the state machine to an array of bytes. // 5. Output as a C++ file. // // To use: // $ ninja -C out/Release build_utf8_validator_tables // $ out/Release/build_utf8_validator_tables // --output=base/i18n/utf8_validator_tables.cc // $ git add base/i18n/utf8_validator_tables.cc // // Because the table is not expected to ever change, it is checked into the // repository rather than being regenerated at build time. // // This code uses type uint8_t throughout to represent bytes, to avoid // signed/unsigned char confusion. #include <stddef.h> #include <stdint.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <algorithm> #include <map> #include <string> #include <vector> #include "base/command_line.h" #include "base/files/file_path.h" #include "base/files/file_util.h" #include "base/logging.h" #include "base/memory/raw_ptr.h" #include "base/numerics/safe_conversions.h" #include "base/strings/stringprintf.h" #include "third_party/icu/source/common/unicode/utf8.h" namespace { const char kHelpText[] = …; const char kProlog[] = …; const char kEpilog[] = …; // Ranges are inclusive at both ends--they represent [from, to] class Range { … }; // A vector of Ranges is like a simple regular expression--it corresponds to // a set of strings of the same length that have bytes in each position in // the appropriate range. StringSet; // A UTF-8 "character" is represented by a sequence of bytes. Character; // In the second stage of the algorithm, we want to convert a large list of // Characters into a small list of StringSets. struct Pair { … }; PairVector; // A class to print a table of numbers in the same style as clang-format. class TablePrinter { … }; // Start by filling a PairVector with characters. The resulting vector goes from // "\x00" to "\xf4\x8f\xbf\xbf". PairVector InitializeCharacters() { … } // Construct a new Pair from |character| and the concatenation of |new_range| // and |existing_set|, and append it to |pairs|. void ConstructPairAndAppend(const Character& character, const Range& new_range, const StringSet& existing_set, PairVector* pairs) { … } // Each pass over the PairVector strips one byte off the right-hand-side of the // characters and adds a range to the set on the right. For example, the first // pass converts the range from "\xe0\xa0\x80" to "\xe0\xa0\xbf" to ("\xe0\xa0", // [\x80-\xbf]), then the second pass converts the range from ("\xe0\xa0", // [\x80-\xbf]) to ("\xe0\xbf", [\x80-\xbf]) to ("\xe0", // [\xa0-\xbf][\x80-\xbf]). void MoveRightMostCharToSet(PairVector* pairs) { … } void MoveAllCharsToSets(PairVector* pairs) { … } // Logs the generated string sets in regular-expression style, ie. [\x00-\x7f], // [\xc2-\xdf][\x80-\xbf], etc. This can be a useful sanity-check that the // algorithm is working. Use the command-line option // --vmodule=build_utf8_validator_tables=1 to see this output. void LogStringSets(const PairVector& pairs) { … } // A single state in the state machine is represented by a sorted vector of // start bytes and target states. All input bytes in the range between the start // byte and the next entry in the vector (or 0xFF) result in a transition to the // target state. struct StateRange { … }; State; // Generates a state where all bytes go to state 1 (invalid). This is also used // as an initialiser for other states (since bytes from outside the desired // range are invalid). State GenerateInvalidState() { … } // A map from a state (ie. a set of strings which will match from this state) to // a number (which is an index into the array of states). StateMap; // Create a new state corresponding to |set|, add it |states| and |state_map| // and return the index it was given in |states|. uint8_t MakeState(const StringSet& set, std::vector<State>* states, StateMap* state_map) { … } std::vector<State> GenerateStates(const PairVector& pairs) { … } // Output the generated states as a C++ table. Two tricks are used to compact // the table: each state in the table starts with a shift value which indicates // how many bits we can discard from the right-hand-side of the byte before // doing the table lookup. Secondly, only the state-transitions for bytes // with the top-bit set are included in the table; bytes without the top-bit set // are just ASCII and are handled directly by the code. void PrintStates(const std::vector<State>& states, FILE* stream) { … } } // namespace int main(int argc, char* argv[]) { … }