dictbe.cpp | Explore in Territory

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/**
 *******************************************************************************
 * Copyright (C) 2006-2016, International Business Machines Corporation
 * and others. All Rights Reserved.
 *******************************************************************************
 */

#include <utility>

#include "unicode/utypes.h"

#if !UCONFIG_NO_BREAK_ITERATION

#include "brkeng.h"
#include "dictbe.h"
#include "unicode/uniset.h"
#include "unicode/chariter.h"
#include "unicode/resbund.h"
#include "unicode/ubrk.h"
#include "unicode/usetiter.h"
#include "ubrkimpl.h"
#include "utracimp.h"
#include "uvectr32.h"
#include "uvector.h"
#include "uassert.h"
#include "unicode/normlzr.h"
#include "cmemory.h"
#include "dictionarydata.h"

U_NAMESPACE_BEGIN

/*
 ******************************************************************
 */

DictionaryBreakEngine::DictionaryBreakEngine() { … }

DictionaryBreakEngine::~DictionaryBreakEngine() { … }

UBool
DictionaryBreakEngine::handles(UChar32 c, const char*) const { … }

int32_t
DictionaryBreakEngine::findBreaks( UText *text,
                                 int32_t startPos,
                                 int32_t endPos,
                                 UVector32 &foundBreaks,
                                 UBool isPhraseBreaking,
                                 UErrorCode& status) const { … }

void
DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) { … }

/*
 ******************************************************************
 * PossibleWord
 */

// Helper class for improving readability of the Thai/Lao/Khmer word break
// algorithm. The implementation is completely inline.

// List size, limited by the maximum number of words in the dictionary
// that form a nested sequence.
static const int32_t POSSIBLE_WORD_LIST_MAX = …;

class PossibleWord { … };


int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) { … }

int32_t
PossibleWord::acceptMarked( UText *text ) { … }


UBool
PossibleWord::backUp( UText *text ) { … }

/*
 ******************************************************************
 * ThaiBreakEngine
 */

// How many words in a row are "good enough"?
static const int32_t THAI_LOOKAHEAD = …;

// Will not combine a non-word with a preceding dictionary word longer than this
static const int32_t THAI_ROOT_COMBINE_THRESHOLD = …;

// Will not combine a non-word that shares at least this much prefix with a
// dictionary word, with a preceding word
static const int32_t THAI_PREFIX_COMBINE_THRESHOLD = …;

// Elision character
static const int32_t THAI_PAIYANNOI = …;

// Repeat character
static const int32_t THAI_MAIYAMOK = …;

// Minimum word size
static const int32_t THAI_MIN_WORD = …;

// Minimum number of characters for two words
static const int32_t THAI_MIN_WORD_SPAN = …;

ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    : … { … }

ThaiBreakEngine::~ThaiBreakEngine() { … }

int32_t
ThaiBreakEngine::divideUpDictionaryRange( UText *text,
                                                int32_t rangeStart,
                                                int32_t rangeEnd,
                                                UVector32 &foundBreaks,
                                                UBool /* isPhraseBreaking */,
                                                UErrorCode& status) const { … }

/*
 ******************************************************************
 * LaoBreakEngine
 */

// How many words in a row are "good enough"?
static const int32_t LAO_LOOKAHEAD = …;

// Will not combine a non-word with a preceding dictionary word longer than this
static const int32_t LAO_ROOT_COMBINE_THRESHOLD = …;

// Will not combine a non-word that shares at least this much prefix with a
// dictionary word, with a preceding word
static const int32_t LAO_PREFIX_COMBINE_THRESHOLD = …;

// Minimum word size
static const int32_t LAO_MIN_WORD = …;

// Minimum number of characters for two words
static const int32_t LAO_MIN_WORD_SPAN = …;

LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    : … { … }

LaoBreakEngine::~LaoBreakEngine() { … }

int32_t
LaoBreakEngine::divideUpDictionaryRange( UText *text,
                                                int32_t rangeStart,
                                                int32_t rangeEnd,
                                                UVector32 &foundBreaks,
                                                UBool /* isPhraseBreaking */,
                                                UErrorCode& status) const { … }

/*
 ******************************************************************
 * BurmeseBreakEngine
 */

// How many words in a row are "good enough"?
static const int32_t BURMESE_LOOKAHEAD = …;

// Will not combine a non-word with a preceding dictionary word longer than this
static const int32_t BURMESE_ROOT_COMBINE_THRESHOLD = …;

// Will not combine a non-word that shares at least this much prefix with a
// dictionary word, with a preceding word
static const int32_t BURMESE_PREFIX_COMBINE_THRESHOLD = …;

// Minimum word size
static const int32_t BURMESE_MIN_WORD = …;

// Minimum number of characters for two words
static const int32_t BURMESE_MIN_WORD_SPAN = …;

BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    : … { … }

BurmeseBreakEngine::~BurmeseBreakEngine() { … }

int32_t
BurmeseBreakEngine::divideUpDictionaryRange( UText *text,
                                                int32_t rangeStart,
                                                int32_t rangeEnd,
                                                UVector32 &foundBreaks,
                                                UBool /* isPhraseBreaking */,
                                                UErrorCode& status ) const { … }

/*
 ******************************************************************
 * KhmerBreakEngine
 */

// How many words in a row are "good enough"?
static const int32_t KHMER_LOOKAHEAD = …;

// Will not combine a non-word with a preceding dictionary word longer than this
static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = …;

// Will not combine a non-word that shares at least this much prefix with a
// dictionary word, with a preceding word
static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = …;

// Minimum word size
static const int32_t KHMER_MIN_WORD = …;

// Minimum number of characters for two words
static const int32_t KHMER_MIN_WORD_SPAN = …;

KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
    : … { … }

KhmerBreakEngine::~KhmerBreakEngine() { … }

int32_t
KhmerBreakEngine::divideUpDictionaryRange( UText *text,
                                                int32_t rangeStart,
                                                int32_t rangeEnd,
                                                UVector32 &foundBreaks,
                                                UBool /* isPhraseBreaking */,
                                                UErrorCode& status ) const { … }

#if !UCONFIG_NO_NORMALIZATION
/*
 ******************************************************************
 * CjkBreakEngine
 */
static const uint32_t kuint32max = …;
CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
: … { … }

CjkBreakEngine::~CjkBreakEngine(){ … }

// The katakanaCost values below are based on the length frequencies of all
// katakana phrases in the dictionary
static const int32_t kMaxKatakanaLength = …;
static const int32_t kMaxKatakanaGroupLength = …;
static const uint32_t maxSnlp = …;

static inline uint32_t getKatakanaCost(int32_t wordLength){ … }

static inline bool isKatakana(UChar32 value) { … }

// Function for accessing internal utext flags.
//   Replicates an internal UText function.

static inline int32_t utext_i32_flag(int32_t bitIndex) { … }
       
/*
 * @param text A UText representing the text
 * @param rangeStart The start of the range of dictionary characters
 * @param rangeEnd The end of the range of dictionary characters
 * @param foundBreaks vector<int32> to receive the break positions
 * @return The number of breaks found
 */
int32_t 
CjkBreakEngine::divideUpDictionaryRange( UText *inText,
        int32_t rangeStart,
        int32_t rangeEnd,
        UVector32 &foundBreaks,
        UBool isPhraseBreaking,
        UErrorCode& status) const { … }

void CjkBreakEngine::initJapanesePhraseParameter(UErrorCode& error) { … }

void CjkBreakEngine::loadJapaneseExtensions(UErrorCode& error) { … }

void CjkBreakEngine::loadHiragana(UErrorCode& error) { … }
#endif

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
chromium/third_party/icu/source/common/dictbe.cpp