chromium/components/spellcheck/renderer/spellcheck_worditerator.h

// Copyright 2011 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// Defines an iterator class that enumerates words supported by our spellchecker
// from multi-language text. This class is used for filtering out characters
// not supported by our spellchecker.

#ifndef COMPONENTS_SPELLCHECK_RENDERER_SPELLCHECK_WORDITERATOR_H_
#define COMPONENTS_SPELLCHECK_RENDERER_SPELLCHECK_WORDITERATOR_H_

#include <stddef.h>

#include <memory>
#include <string>
#include <string_view>

#include "base/memory/raw_ptr.h"
#include "third_party/icu/source/common/unicode/uscript.h"

namespace base {
namespace i18n {
class BreakIterator;
} // namespace i18n
} // namespace base

// A class which encapsulates language-specific operations used by
// SpellcheckWordIterator. When we set the spellchecker language, this class
// creates rule sets that filter out the characters not supported by the
// spellchecker. (Please read the comment in the SpellcheckWordIterator class
// about how to use this class.)
class SpellcheckCharAttribute {};

// A class which extracts words that can be checked for spelling from a
// multi-language string. The ICU word-break iterator does not discard some
// punctuation characters attached to a word. For example, when we set a word
// "_hello_" to a word-break iterator, it just returns "_hello_". Neither does
// it discard characters not used by the language. For example, it returns
// Russian words even though we need English words only. To extract only the
// words that our spellchecker can check their spellings, this class uses custom
// rule-sets created by the SpellcheckCharAttribute class. Also, this class
// normalizes extracted words so our spellchecker can check the spellings of
// words that include ligatures, combined characters, full-width characters,
// etc. This class uses UTF-16 strings as its input and output strings since
// UTF-16 is the native encoding of ICU and avoid unnecessary conversions
// when changing the encoding of this string for our spellchecker. (Chrome can
// use two or more spellcheckers and we cannot assume their encodings.)
// The following snippet is an example that extracts words with this class.
//
//   // Creates the language-specific attributes for US English.
//   SpellcheckCharAttribute attribute;
//   attribute.SetDefaultLanguage("en-US");
//
//   // Set up a SpellcheckWordIterator object which extracts English words,
//   // and retrieve them.
//   SpellcheckWordIterator iterator;
//   std::u16string text(u"this is a test.");
//   iterator.Initialize(&attribute, true);
//   iterator.SetText(text);
//
//   std::u16string word;
//   int offset;
//   int length;
//   while (iterator.GetNextWord(&word, &offset, &length)) {
//     ...
//   }
//
class SpellcheckWordIterator {};

#endif  // COMPONENTS_SPELLCHECK_RENDERER_SPELLCHECK_WORDITERATOR_H_