// chromium/third_party/cld_3/src/src/language_identifier_features.h

/* Copyright 2016 Google Inc. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef LANGUAGE_IDENTIFIER_FEATURES_H_
#define LANGUAGE_IDENTIFIER_FEATURES_H_

#include <string>

#include "feature_extractor.h"
#include "feature_types.h"
#include "script_span/generated_ulscript.h"
#include "cld_3/protos/sentence.pb.h"
#include "sentence_features.h"
#include "task_context.h"
#include "workspace.h"

namespace chrome_lang_id {

// Feature type for numeric features.
//
// Derives publicly from FeatureType and adds no members of its own in this
// header, so it serves as a distinct type name for numeric-valued features.
class NumericFeatureType : public FeatureType {};

// Whole-sentence feature that computes continuous (weighted) char ngram
// features. Derives from WholeSentenceFeature.
//
// Feature function descriptor parameters, listed as name(type, default):
//   include_terminators(bool, false):
//     If 'true', splits the text on spaces to get tokens, then adds "^" to the
//     beginning and "$" to the end of each token (e.g. "ab" becomes "^ab$").
//   include_spaces(bool, false):
//     If 'true', includes char ngrams that contain spaces.
//   use_equal_weight(bool, false):
//     If 'true', weights each unique ngram by
//       1.0 / (number of unique ngrams in the input).
//     Otherwise, weights each unique ngram by
//       (ngram count) / (total number of ngrams).
//   id_dim(int, 10000):
//     Modulus used to map a char ngram to its integer id:
//       Hash32WithDefaultSeed(char ngram) % id_dim.
//   size(int, 3):
//     Only ngrams of exactly this size are extracted.
class ContinuousBagOfNgramsFunction : public WholeSentenceFeature {};

// Whole-sentence feature that detects the script of a piece of text. Derives
// from WholeSentenceFeature.
//
// The supported scripts are the values of chrome_lang_id::CLD2::ULScript, and
// detection uses the script recognition code ported from CLD2.
//
// Special case: ULScript_Hani is split into two outcomes.
//   - Non-Korean script: the function emits ULScript_Hani.
//   - Korean script (Hangul): the function emits NUM_ULSCRIPTS.
//
// Preconditions assumed by this class:
//   (1) the input is interchange valid UTF8, and
//   (2) the input contains only one chrome_lang_id::CLD2::ULScript.
class ScriptFeature : public WholeSentenceFeature {};

}  // namespace chrome_lang_id

#endif  // LANGUAGE_IDENTIFIER_FEATURES_H_