// Copyright 2011 The Chromium Authors // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. // Defines an iterator class that enumerates words supported by our spellchecker // from multi-language text. This class is used for filtering out characters // not supported by our spellchecker. #ifndef COMPONENTS_SPELLCHECK_RENDERER_SPELLCHECK_WORDITERATOR_H_ #define COMPONENTS_SPELLCHECK_RENDERER_SPELLCHECK_WORDITERATOR_H_ #include <stddef.h> #include <memory> #include <string> #include <string_view> #include "base/memory/raw_ptr.h" #include "third_party/icu/source/common/unicode/uscript.h" namespace base { namespace i18n { class BreakIterator; } // namespace i18n } // namespace base // A class which encapsulates language-specific operations used by // SpellcheckWordIterator. When we set the spellchecker language, this class // creates rule sets that filter out the characters not supported by the // spellchecker. (Please read the comment in the SpellcheckWordIterator class // about how to use this class.) class SpellcheckCharAttribute { … }; // A class which extracts words that can be checked for spelling from a // multi-language string. The ICU word-break iterator does not discard some // punctuation characters attached to a word. For example, when we pass the word // "_hello_" to a word-break iterator, it just returns "_hello_". Neither does // it discard characters not used by the language. For example, it returns // Russian words even though we need English words only. To extract only the // words whose spellings our spellchecker can check, this class uses custom // rule sets created by the SpellcheckCharAttribute class. Also, this class // normalizes extracted words so our spellchecker can check the spellings of // words that include ligatures, combined characters, full-width characters, // etc. 
This class uses UTF-16 strings as its input and output strings since // UTF-16 is the native encoding of ICU and this avoids unnecessary conversions // when changing the encoding of these strings for our spellchecker. (Chrome can // use two or more spellcheckers and we cannot assume their encodings.) // The following snippet is an example that extracts words with this class. // // // Creates the language-specific attributes for US English. // SpellcheckCharAttribute attribute; // attribute.SetDefaultLanguage("en-US"); // // // Set up a SpellcheckWordIterator object which extracts English words, // // and retrieve them. // SpellcheckWordIterator iterator; // std::u16string text(u"this is a test."); // iterator.Initialize(&attribute, true); // iterator.SetText(text); // // std::u16string word; // int offset; // int length; // while (iterator.GetNextWord(&word, &offset, &length)) { // ... // } // class SpellcheckWordIterator { … }; #endif // COMPONENTS_SPELLCHECK_RENDERER_SPELLCHECK_WORDITERATOR_H_