// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc.  All rights reserved.
// https://developers.google.com/protocol-buffers/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// Author: kenton@google.com (Kenton Varda)
//  Based on original Protocol Buffers design by
//  Sanjay Ghemawat, Jeff Dean, and others.
//
// Here we have a hand-written lexer.  At first you might ask yourself,
// "Hand-written text processing?  Is Kenton crazy?!"  Well, first of all,
// yes I am crazy, but that's beside the point.  There are actually reasons
// why I ended up writing it this way.
//
// The traditional approach to lexing is to use lex to generate a lexer for
// you.  Unfortunately, lex's output is ridiculously ugly and difficult to
// integrate cleanly with C++ code, especially abstract code or code meant
// as a library.  Better parser-generators exist but would add dependencies
// which most users won't already have, which we'd like to avoid.  (GNU flex
// has a C++ output option, but it's still ridiculously ugly, non-abstract,
// and not library-friendly.)
//
// The next approach that any good software engineer should look at is to
// use regular expressions.  And, indeed, I did.  I have code which
// implements this same class using regular expressions.  It's about 200
// lines shorter.  However:
// - Rather than error messages telling you "This string has an invalid
//   escape sequence at line 5, column 45", you get error messages like
//   "Parse error on line 5".  Giving more precise errors requires adding
//   a lot of code that ends up basically as complex as the hand-coded
//   version anyway.
// - The regular expression to match a string literal looks like this:
//     kString = new RE("(\"([^\"\\\\]|"              // non-escaped
//                      "\\\\[abfnrtv?\"'\\\\0-7]|"   // normal escape
//                      "\\\\x[0-9a-fA-F])*\"|"       // hex escape
//                      "\'([^\'\\\\]|"               // Also support single-quotes.
//                      "\\\\[abfnrtv?\"'\\\\0-7]|"
//                      "\\\\x[0-9a-fA-F])*\')");
//   Verifying the correctness of this line noise is actually harder than
//   verifying the correctness of ConsumeString(), defined below.
//   I'm not even confident that the above is correct, after staring at it
//   for some time.
// - PCRE is fast, but there's still more overhead involved than the code
//   below.
// - Sadly, regular expressions are not part of the C++ standard library, so
//   using them would require depending on some other library.  For the
//   open source release, this could be really annoying.  Nobody likes
//   downloading one piece of software just to find that they need to
//   download something else to make it work, and in all likelihood
//   people downloading Protocol Buffers will already be doing so just
//   to make something else work.  We could include a copy of PCRE with
//   our code, but that obligates us to keep it up-to-date, and it seems
//   like a big waste just to save 200 lines of code.
//
// On a similar note, I'm even scared to use ctype.h.  Apparently functions
// like isalpha() are locale-dependent.  So, if we used those, then if this
// code were called from a program that doesn't have its locale set to "C",
// it would behave strangely.  We can't just set the locale to "C" ourselves
// since we might break the calling program that way, particularly if it is
// multi-threaded.  WTF?  Someone please let me (Kenton) know if I'm missing
// something here...
//
// I'd love to hear about other alternatives, though, as this code isn't
// exactly pretty.

#include <google/protobuf/io/tokenizer.h>

#include <google/protobuf/stubs/common.h>
#include <google/protobuf/stubs/logging.h>
#include <google/protobuf/stubs/strutil.h>
#include <google/protobuf/stubs/stringprintf.h>
#include <google/protobuf/io/strtod.h>
#include <google/protobuf/io/zero_copy_stream.h>
#include <google/protobuf/stubs/stl_util.h>

// Must be included last.
#include <google/protobuf/port_def.inc>

namespace google {
namespace protobuf {
namespace io {
namespace {
…
}  // anonymous namespace

ErrorCollector::~ErrorCollector() { … }

// ===================================================================

Tokenizer::Tokenizer(ZeroCopyInputStream* input,
                     ErrorCollector* error_collector)
    : … {
  …
}

Tokenizer::~Tokenizer() { … }

bool Tokenizer::report_whitespace() const { … }
// Note: `set_report_whitespace(false)` implies `set_report_newlines(false)`.
void Tokenizer::set_report_whitespace(bool report) { … }

// If true, newline tokens are reported by Next().
bool Tokenizer::report_newlines() const { … }
// Note: `set_report_newlines(true)` implies `set_report_whitespace(true)`.
void Tokenizer::set_report_newlines(bool report) { … }

// -------------------------------------------------------------------
// Internal helpers.

void Tokenizer::NextChar() { … }

void Tokenizer::Refresh() { … }

inline void Tokenizer::RecordTo(std::string* target) { … }

inline void Tokenizer::StopRecording() { … }

inline void Tokenizer::StartToken() { … }

inline void Tokenizer::EndToken() { … }

// -------------------------------------------------------------------
// Helper methods that consume characters.

template <typename CharacterClass>
inline bool Tokenizer::LookingAt() { … }

template <typename CharacterClass>
inline bool Tokenizer::TryConsumeOne() { … }

inline bool Tokenizer::TryConsume(char c) { … }

template <typename CharacterClass>
inline void Tokenizer::ConsumeZeroOrMore() { … }

template <typename CharacterClass>
inline void Tokenizer::ConsumeOneOrMore(const char* error) { … }
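// Hedged sketch: the CharacterClass types that parameterize the helpers
// above are defined in the elided anonymous namespace, so everything in
// this block is an illustrative assumption rather than the shipped
// definitions.  The idea is that each class is a tiny type exposing a
// static InClass(char) predicate over explicit ASCII ranges, which is
// precisely what lets the lexer avoid the locale-dependent <ctype.h>
// functions complained about in the header comment.

#define SKETCH_CHARACTER_CLASS(NAME, EXPRESSION)              \
  class NAME {                                                \
   public:                                                    \
    static inline bool InClass(char c) { return EXPRESSION; } \
  }

SKETCH_CHARACTER_CLASS(SketchWhitespace, c == ' ' || c == '\t' ||
                                             c == '\f' || c == '\v' ||
                                             c == '\r');
SKETCH_CHARACTER_CLASS(SketchDigit, '0' <= c && c <= '9');
SKETCH_CHARACTER_CLASS(SketchLetter, ('a' <= c && c <= 'z') ||
                                         ('A' <= c && c <= 'Z') ||
                                         (c == '_'));

#undef SKETCH_CHARACTER_CLASS

// With classes of this shape, TryConsumeOne<SketchDigit>() reads as "if
// the current character is a digit, consume it and return true", and
// ConsumeZeroOrMore<SketchWhitespace>() skips a run of blanks.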
// -------------------------------------------------------------------
// Methods that read whole patterns matching certain kinds of tokens
// or comments.

void Tokenizer::ConsumeString(char delimiter) { … }

Tokenizer::TokenType Tokenizer::ConsumeNumber(bool started_with_zero,
                                              bool started_with_dot) { … }

void Tokenizer::ConsumeLineComment(std::string* content) { … }

void Tokenizer::ConsumeBlockComment(std::string* content) { … }

Tokenizer::NextCommentStatus Tokenizer::TryConsumeCommentStart() { … }

bool Tokenizer::TryConsumeWhitespace() { … }

bool Tokenizer::TryConsumeNewline() { … }

// -------------------------------------------------------------------

bool Tokenizer::Next() { … }

namespace {

// Helper class for collecting comments and putting them in the right places.
//
// This basically just buffers the most recent comment until it can be decided
// exactly where that comment should be placed.  When Flush() is called, the
// current comment goes into either prev_trailing_comments or
// detached_comments.  When the CommentCollector is destroyed, the last
// buffered comment goes into next_leading_comments.
class CommentCollector { … };

}  // namespace

bool Tokenizer::NextWithComments(std::string* prev_trailing_comments,
                                 std::vector<std::string>* detached_comments,
                                 std::string* next_leading_comments) { … }

// -------------------------------------------------------------------
// Token-parsing helpers.  Remember that these don't need to report
// errors since any errors should already have been reported while
// tokenizing.  Also, these can assume that whatever text they
// are given is text that the tokenizer actually parsed as a token
// of the given type.

bool Tokenizer::ParseInteger(const std::string& text, uint64_t max_value,
                             uint64_t* output) { … }

double Tokenizer::ParseFloat(const std::string& text) { … }

// Helper to append a Unicode code point to a string as UTF8, without bringing
// in any external dependencies.
static void AppendUTF8(uint32_t code_point, std::string* output) { … }

// Try to read <len> hex digits from ptr, and stuff the numeric result into
// *result.  Returns true if that many digits were successfully consumed.
static bool ReadHexDigits(const char* ptr, int len, uint32_t* result) { … }

// Handling UTF-16 surrogate pairs.  UTF-16 encodes code points in the range
// 0x10000...0x10ffff as a pair of numbers: a head surrogate followed by a
// trail surrogate.  These numbers are in a reserved range of Unicode code
// points, so if we encounter such a pair we know how to parse it and convert
// it into a single code point.
static const uint32_t kMinHeadSurrogate = …;
static const uint32_t kMaxHeadSurrogate = …;
static const uint32_t kMinTrailSurrogate = …;
static const uint32_t kMaxTrailSurrogate = …;

static inline bool IsHeadSurrogate(uint32_t code_point) { … }

static inline bool IsTrailSurrogate(uint32_t code_point) { … }

// Combine a head and trail surrogate into a single Unicode code point.
static uint32_t AssembleUTF16(uint32_t head_surrogate,
                              uint32_t trail_surrogate) { … }

// Convert the escape sequence parameter to a number of expected hex digits.
static inline int UnicodeLength(char key) { … }

// Given a pointer to the 'u' or 'U' starting a Unicode escape sequence,
// attempt to parse that sequence.  On success, returns a pointer to the first
// char beyond that sequence, and fills in *code_point.  On failure, returns
// ptr itself.
static const char* FetchUnicodePoint(const char* ptr, uint32_t* code_point) { … }
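// Hedged sketch of the surrogate-pair arithmetic described above.  The
// actual constant values and the AssembleUTF16() body are elided in this
// skeleton; the numbers below follow the Unicode definition of UTF-16 and
// are given for illustration only.  A head surrogate carries the high ten
// bits, and a trail surrogate the low ten bits, of the code point's offset
// from 0x10000:
//
//   head  is in [0xd800, 0xdc00)
//   trail is in [0xdc00, 0xe000)
//   code_point = 0x10000 + (((head - 0xd800) << 10) | (trail - 0xdc00))
//
// For example, the escape pair "\ud83d\ude00" assembles to
//   0x10000 + ((0x003d << 10) | 0x0200) = 0x1f600  (U+1F600).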
// The text string must begin and end with single or double quote
// characters.
void Tokenizer::ParseStringAppend(const std::string& text,
                                  std::string* output) { … }

template <typename CharacterClass>
static bool AllInClass(const std::string& s) { … }

bool Tokenizer::IsIdentifier(const std::string& text) { … }

}  // namespace io
}  // namespace protobuf
}  // namespace google

#include <google/protobuf/port_undef.inc>
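// Example usage (a hedged sketch; tokenizer.h is the authoritative
// reference, and the ErrorCollector virtual method signature has varied
// across protobuf releases).  A caller wraps its input in a
// ZeroCopyInputStream, such as ArrayInputStream from
// zero_copy_stream_impl_lite.h, and pulls tokens until Next() returns
// false:
//
//   ArrayInputStream input(text.data(), text.size());
//   MyErrorCollector errors;  // hypothetical ErrorCollector subclass
//   Tokenizer tokenizer(&input, &errors);
//   while (tokenizer.Next()) {
//     const Tokenizer::Token& token = tokenizer.current();
//     // token.type, token.text, token.line, and token.column describe
//     // the token just consumed.
//   }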